/*-
 * Copyright 1998, 2000 Marshall Kirk McKusick.
 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
 * All rights reserved.
 *
 * The soft updates code is derived from the appendix of a University
 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
 * "Soft Updates: A Solution to the Metadata Update Problem in File
 * Systems", CSE-TR-254-95, August 1995).
 *
 * Further information about soft updates can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ffs.h"
#include "opt_quota.h"
#include "opt_ddb.h"

/*
 * For now we want the safety net that the DEBUG flag provides.
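 * When DEBUG is defined, the WORKLIST_INSERT()/WORKLIST_REMOVE() macros
 * below expand to the checking worklist_insert()/worklist_remove()
 * routines, which assert that the softdep lock is held and panic on a
 * double insert or remove rather than silently corrupting a list.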
51 */ 52 #ifndef DEBUG 53 #define DEBUG 54 #endif 55 56 #include <sys/param.h> 57 #include <sys/kernel.h> 58 #include <sys/systm.h> 59 #include <sys/bio.h> 60 #include <sys/buf.h> 61 #include <sys/kdb.h> 62 #include <sys/kthread.h> 63 #include <sys/limits.h> 64 #include <sys/lock.h> 65 #include <sys/malloc.h> 66 #include <sys/mount.h> 67 #include <sys/mutex.h> 68 #include <sys/namei.h> 69 #include <sys/priv.h> 70 #include <sys/proc.h> 71 #include <sys/stat.h> 72 #include <sys/sysctl.h> 73 #include <sys/syslog.h> 74 #include <sys/vnode.h> 75 #include <sys/conf.h> 76 77 #include <ufs/ufs/dir.h> 78 #include <ufs/ufs/extattr.h> 79 #include <ufs/ufs/quota.h> 80 #include <ufs/ufs/inode.h> 81 #include <ufs/ufs/ufsmount.h> 82 #include <ufs/ffs/fs.h> 83 #include <ufs/ffs/softdep.h> 84 #include <ufs/ffs/ffs_extern.h> 85 #include <ufs/ufs/ufs_extern.h> 86 87 #include <vm/vm.h> 88 #include <vm/vm_extern.h> 89 #include <vm/vm_object.h> 90 91 #include <ddb/ddb.h> 92 93 #ifndef SOFTUPDATES 94 95 int 96 softdep_flushfiles(oldmnt, flags, td) 97 struct mount *oldmnt; 98 int flags; 99 struct thread *td; 100 { 101 102 panic("softdep_flushfiles called"); 103 } 104 105 int 106 softdep_mount(devvp, mp, fs, cred) 107 struct vnode *devvp; 108 struct mount *mp; 109 struct fs *fs; 110 struct ucred *cred; 111 { 112 113 return (0); 114 } 115 116 void 117 softdep_initialize() 118 { 119 120 return; 121 } 122 123 void 124 softdep_uninitialize() 125 { 126 127 return; 128 } 129 130 void 131 softdep_unmount(mp) 132 struct mount *mp; 133 { 134 135 } 136 137 void 138 softdep_setup_sbupdate(ump, fs, bp) 139 struct ufsmount *ump; 140 struct fs *fs; 141 struct buf *bp; 142 { 143 } 144 145 void 146 softdep_setup_inomapdep(bp, ip, newinum, mode) 147 struct buf *bp; 148 struct inode *ip; 149 ino_t newinum; 150 int mode; 151 { 152 153 panic("softdep_setup_inomapdep called"); 154 } 155 156 void 157 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 158 struct buf *bp; 159 struct mount *mp; 160 ufs2_daddr_t newblkno; 161 int frags; 162 int oldfrags; 163 { 164 165 panic("softdep_setup_blkmapdep called"); 166 } 167 168 void 169 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 170 struct inode *ip; 171 ufs_lbn_t lbn; 172 ufs2_daddr_t newblkno; 173 ufs2_daddr_t oldblkno; 174 long newsize; 175 long oldsize; 176 struct buf *bp; 177 { 178 179 panic("softdep_setup_allocdirect called"); 180 } 181 182 void 183 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 184 struct inode *ip; 185 ufs_lbn_t lbn; 186 ufs2_daddr_t newblkno; 187 ufs2_daddr_t oldblkno; 188 long newsize; 189 long oldsize; 190 struct buf *bp; 191 { 192 193 panic("softdep_setup_allocext called"); 194 } 195 196 void 197 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 198 struct inode *ip; 199 ufs_lbn_t lbn; 200 struct buf *bp; 201 int ptrno; 202 ufs2_daddr_t newblkno; 203 ufs2_daddr_t oldblkno; 204 struct buf *nbp; 205 { 206 207 panic("softdep_setup_allocindir_page called"); 208 } 209 210 void 211 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 212 struct buf *nbp; 213 struct inode *ip; 214 struct buf *bp; 215 int ptrno; 216 ufs2_daddr_t newblkno; 217 { 218 219 panic("softdep_setup_allocindir_meta called"); 220 } 221 222 void 223 softdep_journal_freeblocks(ip, cred, length, flags) 224 struct inode *ip; 225 struct ucred *cred; 226 off_t length; 227 int flags; 228 { 229 230 panic("softdep_journal_freeblocks called"); 231 } 232 233 void 234 softdep_journal_fsync(ip) 235 struct inode *ip; 
236 { 237 238 panic("softdep_journal_fsync called"); 239 } 240 241 void 242 softdep_setup_freeblocks(ip, length, flags) 243 struct inode *ip; 244 off_t length; 245 int flags; 246 { 247 248 panic("softdep_setup_freeblocks called"); 249 } 250 251 void 252 softdep_freefile(pvp, ino, mode) 253 struct vnode *pvp; 254 ino_t ino; 255 int mode; 256 { 257 258 panic("softdep_freefile called"); 259 } 260 261 int 262 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 263 struct buf *bp; 264 struct inode *dp; 265 off_t diroffset; 266 ino_t newinum; 267 struct buf *newdirbp; 268 int isnewblk; 269 { 270 271 panic("softdep_setup_directory_add called"); 272 } 273 274 void 275 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 276 struct buf *bp; 277 struct inode *dp; 278 caddr_t base; 279 caddr_t oldloc; 280 caddr_t newloc; 281 int entrysize; 282 { 283 284 panic("softdep_change_directoryentry_offset called"); 285 } 286 287 void 288 softdep_setup_remove(bp, dp, ip, isrmdir) 289 struct buf *bp; 290 struct inode *dp; 291 struct inode *ip; 292 int isrmdir; 293 { 294 295 panic("softdep_setup_remove called"); 296 } 297 298 void 299 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 300 struct buf *bp; 301 struct inode *dp; 302 struct inode *ip; 303 ino_t newinum; 304 int isrmdir; 305 { 306 307 panic("softdep_setup_directory_change called"); 308 } 309 310 void 311 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 312 struct mount *mp; 313 struct buf *bp; 314 ufs2_daddr_t blkno; 315 int frags; 316 struct workhead *wkhd; 317 { 318 319 panic("%s called", __FUNCTION__); 320 } 321 322 void 323 softdep_setup_inofree(mp, bp, ino, wkhd) 324 struct mount *mp; 325 struct buf *bp; 326 ino_t ino; 327 struct workhead *wkhd; 328 { 329 330 panic("%s called", __FUNCTION__); 331 } 332 333 void 334 softdep_setup_unlink(dp, ip) 335 struct inode *dp; 336 struct inode *ip; 337 { 338 339 panic("%s called", __FUNCTION__); 340 } 341 342 void 343 softdep_setup_link(dp, ip) 344 struct inode *dp; 345 struct inode *ip; 346 { 347 348 panic("%s called", __FUNCTION__); 349 } 350 351 void 352 softdep_revert_link(dp, ip) 353 struct inode *dp; 354 struct inode *ip; 355 { 356 357 panic("%s called", __FUNCTION__); 358 } 359 360 void 361 softdep_setup_rmdir(dp, ip) 362 struct inode *dp; 363 struct inode *ip; 364 { 365 366 panic("%s called", __FUNCTION__); 367 } 368 369 void 370 softdep_revert_rmdir(dp, ip) 371 struct inode *dp; 372 struct inode *ip; 373 { 374 375 panic("%s called", __FUNCTION__); 376 } 377 378 void 379 softdep_setup_create(dp, ip) 380 struct inode *dp; 381 struct inode *ip; 382 { 383 384 panic("%s called", __FUNCTION__); 385 } 386 387 void 388 softdep_revert_create(dp, ip) 389 struct inode *dp; 390 struct inode *ip; 391 { 392 393 panic("%s called", __FUNCTION__); 394 } 395 396 void 397 softdep_setup_mkdir(dp, ip) 398 struct inode *dp; 399 struct inode *ip; 400 { 401 402 panic("%s called", __FUNCTION__); 403 } 404 405 void 406 softdep_revert_mkdir(dp, ip) 407 struct inode *dp; 408 struct inode *ip; 409 { 410 411 panic("%s called", __FUNCTION__); 412 } 413 414 void 415 softdep_setup_dotdot_link(dp, ip) 416 struct inode *dp; 417 struct inode *ip; 418 { 419 420 panic("%s called", __FUNCTION__); 421 } 422 423 int 424 softdep_prealloc(vp, waitok) 425 struct vnode *vp; 426 int waitok; 427 { 428 429 panic("%s called", __FUNCTION__); 430 431 return (0); 432 } 433 434 int 435 softdep_journal_lookup(mp, vpp) 436 struct mount *mp; 437 struct vnode **vpp; 438 { 439 440 return 
(ENOENT); 441 } 442 443 void 444 softdep_change_linkcnt(ip) 445 struct inode *ip; 446 { 447 448 panic("softdep_change_linkcnt called"); 449 } 450 451 void 452 softdep_load_inodeblock(ip) 453 struct inode *ip; 454 { 455 456 panic("softdep_load_inodeblock called"); 457 } 458 459 void 460 softdep_update_inodeblock(ip, bp, waitfor) 461 struct inode *ip; 462 struct buf *bp; 463 int waitfor; 464 { 465 466 panic("softdep_update_inodeblock called"); 467 } 468 469 int 470 softdep_fsync(vp) 471 struct vnode *vp; /* the "in_core" copy of the inode */ 472 { 473 474 return (0); 475 } 476 477 void 478 softdep_fsync_mountdev(vp) 479 struct vnode *vp; 480 { 481 482 return; 483 } 484 485 int 486 softdep_flushworklist(oldmnt, countp, td) 487 struct mount *oldmnt; 488 int *countp; 489 struct thread *td; 490 { 491 492 *countp = 0; 493 return (0); 494 } 495 496 int 497 softdep_sync_metadata(struct vnode *vp) 498 { 499 500 return (0); 501 } 502 503 int 504 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) 505 { 506 507 return (0); 508 } 509 510 int 511 softdep_slowdown(vp) 512 struct vnode *vp; 513 { 514 515 panic("softdep_slowdown called"); 516 } 517 518 void 519 softdep_releasefile(ip) 520 struct inode *ip; /* inode with the zero effective link count */ 521 { 522 523 panic("softdep_releasefile called"); 524 } 525 526 int 527 softdep_request_cleanup(fs, vp, cred, resource) 528 struct fs *fs; 529 struct vnode *vp; 530 struct ucred *cred; 531 int resource; 532 { 533 534 return (0); 535 } 536 537 int 538 softdep_check_suspend(struct mount *mp, 539 struct vnode *devvp, 540 int softdep_deps, 541 int softdep_accdeps, 542 int secondary_writes, 543 int secondary_accwrites) 544 { 545 struct bufobj *bo; 546 int error; 547 548 (void) softdep_deps, 549 (void) softdep_accdeps; 550 551 bo = &devvp->v_bufobj; 552 ASSERT_BO_LOCKED(bo); 553 554 MNT_ILOCK(mp); 555 while (mp->mnt_secondary_writes != 0) { 556 BO_UNLOCK(bo); 557 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), 558 (PUSER - 1) | PDROP, "secwr", 0); 559 BO_LOCK(bo); 560 MNT_ILOCK(mp); 561 } 562 563 /* 564 * Reasons for needing more work before suspend: 565 * - Dirty buffers on devvp. 566 * - Secondary writes occurred after start of vnode sync loop 567 */ 568 error = 0; 569 if (bo->bo_numoutput > 0 || 570 bo->bo_dirty.bv_cnt > 0 || 571 secondary_writes != 0 || 572 mp->mnt_secondary_writes != 0 || 573 secondary_accwrites != mp->mnt_secondary_accwrites) 574 error = EAGAIN; 575 BO_UNLOCK(bo); 576 return (error); 577 } 578 579 void 580 softdep_get_depcounts(struct mount *mp, 581 int *softdepactivep, 582 int *softdepactiveaccp) 583 { 584 (void) mp; 585 *softdepactivep = 0; 586 *softdepactiveaccp = 0; 587 } 588 589 void 590 softdep_buf_append(bp, wkhd) 591 struct buf *bp; 592 struct workhead *wkhd; 593 { 594 595 panic("softdep_buf_appendwork called"); 596 } 597 598 void 599 softdep_inode_append(ip, cred, wkhd) 600 struct inode *ip; 601 struct ucred *cred; 602 struct workhead *wkhd; 603 { 604 605 panic("softdep_inode_appendwork called"); 606 } 607 608 void 609 softdep_freework(wkhd) 610 struct workhead *wkhd; 611 { 612 613 panic("softdep_freework called"); 614 } 615 616 #else 617 618 FEATURE(softupdates, "FFS soft-updates support"); 619 620 /* 621 * These definitions need to be adapted to the system to which 622 * this file is being ported. 
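 *
 * Note that the D_* constants below serve both as workitem types and as
 * indices into the dep_current[], dep_total[] and dep_write[] statistics
 * arrays and into the memtype[] table, so their numbering must stay in
 * lock step with the corresponding M_* malloc types (memtype[D_XXX] must
 * map to M_XXX).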
623 */ 624 625 #define M_SOFTDEP_FLAGS (M_WAITOK) 626 627 #define D_PAGEDEP 0 628 #define D_INODEDEP 1 629 #define D_BMSAFEMAP 2 630 #define D_NEWBLK 3 631 #define D_ALLOCDIRECT 4 632 #define D_INDIRDEP 5 633 #define D_ALLOCINDIR 6 634 #define D_FREEFRAG 7 635 #define D_FREEBLKS 8 636 #define D_FREEFILE 9 637 #define D_DIRADD 10 638 #define D_MKDIR 11 639 #define D_DIRREM 12 640 #define D_NEWDIRBLK 13 641 #define D_FREEWORK 14 642 #define D_FREEDEP 15 643 #define D_JADDREF 16 644 #define D_JREMREF 17 645 #define D_JMVREF 18 646 #define D_JNEWBLK 19 647 #define D_JFREEBLK 20 648 #define D_JFREEFRAG 21 649 #define D_JSEG 22 650 #define D_JSEGDEP 23 651 #define D_SBDEP 24 652 #define D_JTRUNC 25 653 #define D_JFSYNC 26 654 #define D_SENTINAL 27 655 #define D_LAST D_SENTINAL 656 657 unsigned long dep_current[D_LAST + 1]; 658 unsigned long dep_total[D_LAST + 1]; 659 unsigned long dep_write[D_LAST + 1]; 660 661 662 static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, 663 "soft updates stats"); 664 static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, 665 "total dependencies allocated"); 666 static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, 667 "current dependencies allocated"); 668 static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0, 669 "current dependencies written"); 670 671 #define SOFTDEP_TYPE(type, str, long) \ 672 static MALLOC_DEFINE(M_ ## type, #str, long); \ 673 SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ 674 &dep_total[D_ ## type], 0, ""); \ 675 SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ 676 &dep_current[D_ ## type], 0, ""); \ 677 SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, \ 678 &dep_write[D_ ## type], 0, ""); 679 680 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); 681 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); 682 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, 683 "Block or frag allocated from cyl group map"); 684 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); 685 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); 686 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); 687 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); 688 SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); 689 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); 690 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); 691 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); 692 SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); 693 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); 694 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); 695 SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); 696 SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); 697 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); 698 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); 699 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); 700 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); 701 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); 702 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); 703 SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); 704 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); 705 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); 706 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); 707 SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete"); 708 709 static MALLOC_DEFINE(M_SAVEDINO, 
"savedino", "Saved inodes"); 710 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); 711 712 /* 713 * translate from workitem type to memory type 714 * MUST match the defines above, such that memtype[D_XXX] == M_XXX 715 */ 716 static struct malloc_type *memtype[] = { 717 M_PAGEDEP, 718 M_INODEDEP, 719 M_BMSAFEMAP, 720 M_NEWBLK, 721 M_ALLOCDIRECT, 722 M_INDIRDEP, 723 M_ALLOCINDIR, 724 M_FREEFRAG, 725 M_FREEBLKS, 726 M_FREEFILE, 727 M_DIRADD, 728 M_MKDIR, 729 M_DIRREM, 730 M_NEWDIRBLK, 731 M_FREEWORK, 732 M_FREEDEP, 733 M_JADDREF, 734 M_JREMREF, 735 M_JMVREF, 736 M_JNEWBLK, 737 M_JFREEBLK, 738 M_JFREEFRAG, 739 M_JSEG, 740 M_JSEGDEP, 741 M_SBDEP, 742 M_JTRUNC, 743 M_JFSYNC 744 }; 745 746 static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd; 747 748 #define DtoM(type) (memtype[type]) 749 750 /* 751 * Names of malloc types. 752 */ 753 #define TYPENAME(type) \ 754 ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???") 755 /* 756 * End system adaptation definitions. 757 */ 758 759 #define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) 760 #define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) 761 762 /* 763 * Forward declarations. 764 */ 765 struct inodedep_hashhead; 766 struct newblk_hashhead; 767 struct pagedep_hashhead; 768 struct bmsafemap_hashhead; 769 770 /* 771 * Internal function prototypes. 772 */ 773 static void softdep_error(char *, int); 774 static void drain_output(struct vnode *); 775 static struct buf *getdirtybuf(struct buf *, struct mtx *, int); 776 static void clear_remove(struct thread *); 777 static void clear_inodedeps(struct thread *); 778 static void unlinked_inodedep(struct mount *, struct inodedep *); 779 static void clear_unlinked_inodedep(struct inodedep *); 780 static struct inodedep *first_unlinked_inodedep(struct ufsmount *); 781 static int flush_pagedep_deps(struct vnode *, struct mount *, 782 struct diraddhd *); 783 static int free_pagedep(struct pagedep *); 784 static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); 785 static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t); 786 static int flush_deplist(struct allocdirectlst *, int, int *); 787 static int sync_cgs(struct mount *, int); 788 static int handle_written_filepage(struct pagedep *, struct buf *); 789 static int handle_written_sbdep(struct sbdep *, struct buf *); 790 static void initiate_write_sbdep(struct sbdep *); 791 static void diradd_inode_written(struct diradd *, struct inodedep *); 792 static int handle_written_indirdep(struct indirdep *, struct buf *, 793 struct buf**); 794 static int handle_written_inodeblock(struct inodedep *, struct buf *); 795 static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *, 796 uint8_t *); 797 static int handle_written_bmsafemap(struct bmsafemap *, struct buf *); 798 static void handle_written_jaddref(struct jaddref *); 799 static void handle_written_jremref(struct jremref *); 800 static void handle_written_jseg(struct jseg *, struct buf *); 801 static void handle_written_jnewblk(struct jnewblk *); 802 static void handle_written_jblkdep(struct jblkdep *); 803 static void handle_written_jfreefrag(struct jfreefrag *); 804 static void complete_jseg(struct jseg *); 805 static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *); 806 static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); 807 static void jremref_write(struct jremref *, struct jseg *, uint8_t *); 808 static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *); 809 static void 
jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *); 810 static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data); 811 static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *); 812 static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *); 813 static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *); 814 static inline void inoref_write(struct inoref *, struct jseg *, 815 struct jrefrec *); 816 static void handle_allocdirect_partdone(struct allocdirect *, 817 struct workhead *); 818 static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *, 819 struct workhead *); 820 static void indirdep_complete(struct indirdep *); 821 static int indirblk_lookup(struct mount *, ufs2_daddr_t); 822 static void indirblk_insert(struct freework *); 823 static void indirblk_remove(struct freework *); 824 static void handle_allocindir_partdone(struct allocindir *); 825 static void initiate_write_filepage(struct pagedep *, struct buf *); 826 static void initiate_write_indirdep(struct indirdep*, struct buf *); 827 static void handle_written_mkdir(struct mkdir *, int); 828 static int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *, 829 uint8_t *); 830 static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); 831 static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); 832 static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); 833 static void handle_workitem_freefile(struct freefile *); 834 static int handle_workitem_remove(struct dirrem *, int); 835 static struct dirrem *newdirrem(struct buf *, struct inode *, 836 struct inode *, int, struct dirrem **); 837 static struct indirdep *indirdep_lookup(struct mount *, struct inode *, 838 struct buf *); 839 static void cancel_indirdep(struct indirdep *, struct buf *, 840 struct freeblks *); 841 static void free_indirdep(struct indirdep *); 842 static void free_diradd(struct diradd *, struct workhead *); 843 static void merge_diradd(struct inodedep *, struct diradd *); 844 static void complete_diradd(struct diradd *); 845 static struct diradd *diradd_lookup(struct pagedep *, int); 846 static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, 847 struct jremref *); 848 static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, 849 struct jremref *); 850 static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, 851 struct jremref *, struct jremref *); 852 static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, 853 struct jremref *); 854 static void cancel_allocindir(struct allocindir *, struct buf *bp, 855 struct freeblks *, int); 856 static int setup_trunc_indir(struct freeblks *, struct inode *, 857 ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t); 858 static void complete_trunc_indir(struct freework *); 859 static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *, 860 int); 861 static void complete_mkdir(struct mkdir *); 862 static void free_newdirblk(struct newdirblk *); 863 static void free_jremref(struct jremref *); 864 static void free_jaddref(struct jaddref *); 865 static void free_jsegdep(struct jsegdep *); 866 static void free_jsegs(struct jblocks *); 867 static void rele_jseg(struct jseg *); 868 static void free_jseg(struct jseg *, struct jblocks *); 869 static void free_jnewblk(struct jnewblk *); 870 static void free_jblkdep(struct jblkdep *); 871 static void free_jfreefrag(struct jfreefrag *); 872 static void free_freedep(struct 
freedep *); 873 static void journal_jremref(struct dirrem *, struct jremref *, 874 struct inodedep *); 875 static void cancel_jnewblk(struct jnewblk *, struct workhead *); 876 static int cancel_jaddref(struct jaddref *, struct inodedep *, 877 struct workhead *); 878 static void cancel_jfreefrag(struct jfreefrag *); 879 static inline void setup_freedirect(struct freeblks *, struct inode *, 880 int, int); 881 static inline void setup_freeext(struct freeblks *, struct inode *, int, int); 882 static inline void setup_freeindir(struct freeblks *, struct inode *, int, 883 ufs_lbn_t, int); 884 static inline struct freeblks *newfreeblks(struct mount *, struct inode *); 885 static void freeblks_free(struct ufsmount *, struct freeblks *, int); 886 static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); 887 ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t); 888 static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int); 889 static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t, 890 int, int); 891 static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int); 892 static int cancel_pagedep(struct pagedep *, struct freeblks *, int); 893 static int deallocate_dependencies(struct buf *, struct freeblks *, int); 894 static void newblk_freefrag(struct newblk*); 895 static void free_newblk(struct newblk *); 896 static void cancel_allocdirect(struct allocdirectlst *, 897 struct allocdirect *, struct freeblks *); 898 static int check_inode_unwritten(struct inodedep *); 899 static int free_inodedep(struct inodedep *); 900 static void freework_freeblock(struct freework *); 901 static void freework_enqueue(struct freework *); 902 static int handle_workitem_freeblocks(struct freeblks *, int); 903 static int handle_complete_freeblocks(struct freeblks *, int); 904 static void handle_workitem_indirblk(struct freework *); 905 static void handle_written_freework(struct freework *); 906 static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); 907 static struct worklist *jnewblk_merge(struct worklist *, struct worklist *, 908 struct workhead *); 909 static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *, 910 struct inodedep *, struct allocindir *, ufs_lbn_t); 911 static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, 912 ufs2_daddr_t, ufs_lbn_t); 913 static void handle_workitem_freefrag(struct freefrag *); 914 static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, 915 ufs_lbn_t); 916 static void allocdirect_merge(struct allocdirectlst *, 917 struct allocdirect *, struct allocdirect *); 918 static struct freefrag *allocindir_merge(struct allocindir *, 919 struct allocindir *); 920 static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int, 921 struct bmsafemap **); 922 static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, 923 int cg); 924 static int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t, 925 int, struct newblk **); 926 static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); 927 static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, 928 struct inodedep **); 929 static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); 930 static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t, 931 int, struct pagedep **); 932 static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, 933 struct mount *mp, int, struct pagedep **); 934 static void 
pause_timer(void *); 935 static int request_cleanup(struct mount *, int); 936 static int process_worklist_item(struct mount *, int, int); 937 static void process_removes(struct vnode *); 938 static void process_truncates(struct vnode *); 939 static void jwork_move(struct workhead *, struct workhead *); 940 static void jwork_insert(struct workhead *, struct jsegdep *); 941 static void add_to_worklist(struct worklist *, int); 942 static void wake_worklist(struct worklist *); 943 static void wait_worklist(struct worklist *, char *); 944 static void remove_from_worklist(struct worklist *); 945 static void softdep_flush(void); 946 static void softdep_flushjournal(struct mount *); 947 static int softdep_speedup(void); 948 static void worklist_speedup(void); 949 static int journal_mount(struct mount *, struct fs *, struct ucred *); 950 static void journal_unmount(struct mount *); 951 static int journal_space(struct ufsmount *, int); 952 static void journal_suspend(struct ufsmount *); 953 static int journal_unsuspend(struct ufsmount *ump); 954 static void softdep_prelink(struct vnode *, struct vnode *); 955 static void add_to_journal(struct worklist *); 956 static void remove_from_journal(struct worklist *); 957 static void softdep_process_journal(struct mount *, struct worklist *, int); 958 static struct jremref *newjremref(struct dirrem *, struct inode *, 959 struct inode *ip, off_t, nlink_t); 960 static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, 961 uint16_t); 962 static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, 963 uint16_t); 964 static inline struct jsegdep *inoref_jseg(struct inoref *); 965 static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); 966 static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, 967 ufs2_daddr_t, int); 968 static struct jtrunc *newjtrunc(struct freeblks *, off_t, int); 969 static void move_newblock_dep(struct jaddref *, struct inodedep *); 970 static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t); 971 static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, 972 ufs2_daddr_t, long, ufs_lbn_t); 973 static struct freework *newfreework(struct ufsmount *, struct freeblks *, 974 struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int); 975 static int jwait(struct worklist *, int); 976 static struct inodedep *inodedep_lookup_ip(struct inode *); 977 static int bmsafemap_rollbacks(struct bmsafemap *); 978 static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); 979 static void handle_jwork(struct workhead *); 980 static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, 981 struct mkdir **); 982 static struct jblocks *jblocks_create(void); 983 static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); 984 static void jblocks_free(struct jblocks *, struct mount *, int); 985 static void jblocks_destroy(struct jblocks *); 986 static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); 987 988 /* 989 * Exported softdep operations. 
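 *
 * These four routines are the entry points through which the buffer cache
 * notifies the soft updates code of I/O events; they are wired into the
 * bioops vector when soft updates is initialized, so they run when a
 * buffer carrying dependencies is about to be written, has finished
 * writing, or is being discarded.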
 */
static void softdep_disk_io_initiation(struct buf *);
static void softdep_disk_write_complete(struct buf *);
static void softdep_deallocate_dependencies(struct buf *);
static int softdep_count_dependencies(struct buf *bp, int);

static struct mtx lk;
MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);

#define TRY_ACQUIRE_LOCK(lk)		mtx_trylock(lk)
#define ACQUIRE_LOCK(lk)		mtx_lock(lk)
#define FREE_LOCK(lk)			mtx_unlock(lk)

#define	BUF_AREC(bp)			lockallowrecurse(&(bp)->b_lock)
#define	BUF_NOREC(bp)			lockdisablerecurse(&(bp)->b_lock)

/*
 * Worklist queue management.
 * These routines require that the lock be held.
 */
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do {	\
	(item)->wk_state |= ONWORKLIST;		\
	LIST_INSERT_HEAD(head, item, wk_list);	\
} while (0)
#define WORKLIST_REMOVE(item) do {		\
	(item)->wk_state &= ~ONWORKLIST;	\
	LIST_REMOVE(item, wk_list);		\
} while (0)
#define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT
#define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE

#else /* DEBUG */
static	void worklist_insert(struct workhead *, struct worklist *, int);
static	void worklist_remove(struct worklist *, int);

#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)

static void
worklist_insert(head, item, locked)
	struct workhead *head;
	struct worklist *item;
	int locked;
{

	if (locked)
		mtx_assert(&lk, MA_OWNED);
	if (item->wk_state & ONWORKLIST)
		panic("worklist_insert: %p %s(0x%X) already on list",
		    item, TYPENAME(item->wk_type), item->wk_state);
	item->wk_state |= ONWORKLIST;
	LIST_INSERT_HEAD(head, item, wk_list);
}

static void
worklist_remove(item, locked)
	struct worklist *item;
	int locked;
{

	if (locked)
		mtx_assert(&lk, MA_OWNED);
	if ((item->wk_state & ONWORKLIST) == 0)
		panic("worklist_remove: %p %s(0x%X) not on list",
		    item, TYPENAME(item->wk_type), item->wk_state);
	item->wk_state &= ~ONWORKLIST;
	LIST_REMOVE(item, wk_list);
}
#endif /* DEBUG */

/*
 * Merge two jsegdeps keeping only the oldest one as newer references
 * can't be discarded until after older references.
 */
static inline struct jsegdep *
jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
{
	struct jsegdep *swp;

	if (two == NULL)
		return (one);

	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
		swp = one;
		one = two;
		two = swp;
	}
	WORKLIST_REMOVE(&two->jd_list);
	free_jsegdep(two);

	return (one);
}

/*
 * If two freedeps are compatible free one to reduce list size.
 */
static inline struct freedep *
freedep_merge(struct freedep *one, struct freedep *two)
{
	if (two == NULL)
		return (one);

	if (one->fd_freework == two->fd_freework) {
		WORKLIST_REMOVE(&two->fd_list);
		free_freedep(two);
	}
	return (one);
}

/*
 * Move journal work from one list to another.  Duplicate freedeps and
 * jsegdeps are coalesced to keep the lists as small as possible.
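 *
 * For example, if both lists contain a jsegdep, only the one referring to
 * the oldest journal segment is kept (see jsegdep_merge() above), since
 * that reference is sufficient to keep all newer segments from being
 * reclaimed.  Likewise, two freedeps pointing at the same freework
 * collapse into one.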
1105 */ 1106 static void 1107 jwork_move(dst, src) 1108 struct workhead *dst; 1109 struct workhead *src; 1110 { 1111 struct freedep *freedep; 1112 struct jsegdep *jsegdep; 1113 struct worklist *wkn; 1114 struct worklist *wk; 1115 1116 KASSERT(dst != src, 1117 ("jwork_move: dst == src")); 1118 freedep = NULL; 1119 jsegdep = NULL; 1120 LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { 1121 if (wk->wk_type == D_JSEGDEP) 1122 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); 1123 if (wk->wk_type == D_FREEDEP) 1124 freedep = freedep_merge(WK_FREEDEP(wk), freedep); 1125 } 1126 1127 mtx_assert(&lk, MA_OWNED); 1128 while ((wk = LIST_FIRST(src)) != NULL) { 1129 WORKLIST_REMOVE(wk); 1130 WORKLIST_INSERT(dst, wk); 1131 if (wk->wk_type == D_JSEGDEP) { 1132 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); 1133 continue; 1134 } 1135 if (wk->wk_type == D_FREEDEP) 1136 freedep = freedep_merge(WK_FREEDEP(wk), freedep); 1137 } 1138 } 1139 1140 static void 1141 jwork_insert(dst, jsegdep) 1142 struct workhead *dst; 1143 struct jsegdep *jsegdep; 1144 { 1145 struct jsegdep *jsegdepn; 1146 struct worklist *wk; 1147 1148 LIST_FOREACH(wk, dst, wk_list) 1149 if (wk->wk_type == D_JSEGDEP) 1150 break; 1151 if (wk == NULL) { 1152 WORKLIST_INSERT(dst, &jsegdep->jd_list); 1153 return; 1154 } 1155 jsegdepn = WK_JSEGDEP(wk); 1156 if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) { 1157 WORKLIST_REMOVE(wk); 1158 free_jsegdep(jsegdepn); 1159 WORKLIST_INSERT(dst, &jsegdep->jd_list); 1160 } else 1161 free_jsegdep(jsegdep); 1162 } 1163 1164 /* 1165 * Routines for tracking and managing workitems. 1166 */ 1167 static void workitem_free(struct worklist *, int); 1168 static void workitem_alloc(struct worklist *, int, struct mount *); 1169 1170 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type)) 1171 1172 static void 1173 workitem_free(item, type) 1174 struct worklist *item; 1175 int type; 1176 { 1177 struct ufsmount *ump; 1178 mtx_assert(&lk, MA_OWNED); 1179 1180 #ifdef DEBUG 1181 if (item->wk_state & ONWORKLIST) 1182 panic("workitem_free: %s(0x%X) still on list", 1183 TYPENAME(item->wk_type), item->wk_state); 1184 if (item->wk_type != type) 1185 panic("workitem_free: type mismatch %s != %s", 1186 TYPENAME(item->wk_type), TYPENAME(type)); 1187 #endif 1188 if (item->wk_state & IOWAITING) 1189 wakeup(item); 1190 ump = VFSTOUFS(item->wk_mp); 1191 if (--ump->softdep_deps == 0 && ump->softdep_req) 1192 wakeup(&ump->softdep_deps); 1193 dep_current[type]--; 1194 free(item, DtoM(type)); 1195 } 1196 1197 static void 1198 workitem_alloc(item, type, mp) 1199 struct worklist *item; 1200 int type; 1201 struct mount *mp; 1202 { 1203 struct ufsmount *ump; 1204 1205 item->wk_type = type; 1206 item->wk_mp = mp; 1207 item->wk_state = 0; 1208 1209 ump = VFSTOUFS(mp); 1210 ACQUIRE_LOCK(&lk); 1211 dep_current[type]++; 1212 dep_total[type]++; 1213 ump->softdep_deps++; 1214 ump->softdep_accdeps++; 1215 FREE_LOCK(&lk); 1216 } 1217 1218 /* 1219 * Workitem queue management 1220 */ 1221 static int max_softdeps; /* maximum number of structs before slowdown */ 1222 static int maxindirdeps = 50; /* max number of indirdeps before slowdown */ 1223 static int tickdelay = 2; /* number of ticks to pause during slowdown */ 1224 static int proc_waiting; /* tracks whether we have a timeout posted */ 1225 static int *stat_countp; /* statistic to count in proc_waiting timeout */ 1226 static struct callout softdep_callout; 1227 static int req_pending; 1228 static int req_clear_inodedeps; /* syncer process flush some inodedeps */ 1229 static 
int req_clear_remove; /* syncer process flush some freeblks */ 1230 1231 /* 1232 * runtime statistics 1233 */ 1234 static int stat_worklist_push; /* number of worklist cleanups */ 1235 static int stat_blk_limit_push; /* number of times block limit neared */ 1236 static int stat_ino_limit_push; /* number of times inode limit neared */ 1237 static int stat_blk_limit_hit; /* number of times block slowdown imposed */ 1238 static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ 1239 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ 1240 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ 1241 static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ 1242 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ 1243 static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ 1244 static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */ 1245 static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */ 1246 static int stat_journal_min; /* Times hit journal min threshold */ 1247 static int stat_journal_low; /* Times hit journal low threshold */ 1248 static int stat_journal_wait; /* Times blocked in jwait(). */ 1249 static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */ 1250 static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */ 1251 static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */ 1252 static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */ 1253 static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */ 1254 static int stat_cleanup_blkrequests; /* Number of block cleanup requests */ 1255 static int stat_cleanup_inorequests; /* Number of inode cleanup requests */ 1256 static int stat_cleanup_retries; /* Number of cleanups that needed to flush */ 1257 static int stat_cleanup_failures; /* Number of cleanup requests that failed */ 1258 1259 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, 1260 &max_softdeps, 0, ""); 1261 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, 1262 &tickdelay, 0, ""); 1263 SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW, 1264 &maxindirdeps, 0, ""); 1265 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW, 1266 &stat_worklist_push, 0,""); 1267 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW, 1268 &stat_blk_limit_push, 0,""); 1269 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW, 1270 &stat_ino_limit_push, 0,""); 1271 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW, 1272 &stat_blk_limit_hit, 0, ""); 1273 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW, 1274 &stat_ino_limit_hit, 0, ""); 1275 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW, 1276 &stat_sync_limit_hit, 0, ""); 1277 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, 1278 &stat_indir_blk_ptrs, 0, ""); 1279 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW, 1280 &stat_inode_bitmap, 0, ""); 1281 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, 1282 &stat_direct_blk_ptrs, 0, ""); 1283 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW, 1284 &stat_dir_entry, 0, ""); 1285 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW, 1286 &stat_jaddref, 0, ""); 1287 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW, 1288 &stat_jnewblk, 0, ""); 1289 
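/*
 * The statistics above and below are exported under the debug.softdep
 * sysctl tree; for example `sysctl debug.softdep` from userland dumps
 * them, with the per-type allocation counts appearing under
 * debug.softdep.total, debug.softdep.current and debug.softdep.write.
 */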
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW, 1290 &stat_journal_low, 0, ""); 1291 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW, 1292 &stat_journal_min, 0, ""); 1293 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW, 1294 &stat_journal_wait, 0, ""); 1295 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW, 1296 &stat_jwait_filepage, 0, ""); 1297 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW, 1298 &stat_jwait_freeblks, 0, ""); 1299 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW, 1300 &stat_jwait_inode, 0, ""); 1301 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW, 1302 &stat_jwait_newblk, 0, ""); 1303 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW, 1304 &stat_cleanup_blkrequests, 0, ""); 1305 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW, 1306 &stat_cleanup_inorequests, 0, ""); 1307 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW, 1308 &stat_cleanup_high_delay, 0, ""); 1309 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW, 1310 &stat_cleanup_retries, 0, ""); 1311 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW, 1312 &stat_cleanup_failures, 0, ""); 1313 1314 SYSCTL_DECL(_vfs_ffs); 1315 1316 LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl; 1317 static u_long bmsafemap_hash; /* size of hash table - 1 */ 1318 1319 static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */ 1320 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, 1321 &compute_summary_at_mount, 0, "Recompute summary at mount"); 1322 1323 static struct proc *softdepproc; 1324 static struct kproc_desc softdep_kp = { 1325 "softdepflush", 1326 softdep_flush, 1327 &softdepproc 1328 }; 1329 SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, 1330 &softdep_kp); 1331 1332 static void 1333 softdep_flush(void) 1334 { 1335 struct mount *nmp; 1336 struct mount *mp; 1337 struct ufsmount *ump; 1338 struct thread *td; 1339 int remaining; 1340 int progress; 1341 int vfslocked; 1342 1343 td = curthread; 1344 td->td_pflags |= TDP_NORUNNINGBUF; 1345 1346 for (;;) { 1347 kproc_suspend_check(softdepproc); 1348 vfslocked = VFS_LOCK_GIANT((struct mount *)NULL); 1349 ACQUIRE_LOCK(&lk); 1350 /* 1351 * If requested, try removing inode or removal dependencies. 
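		 * These requests are posted by request_cleanup() when soft
		 * dependency resources run short; the wakeup_one() calls
		 * below release the thread sleeping on proc_waiting once
		 * some work has been cleared.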
1352 */ 1353 if (req_clear_inodedeps) { 1354 clear_inodedeps(td); 1355 req_clear_inodedeps -= 1; 1356 wakeup_one(&proc_waiting); 1357 } 1358 if (req_clear_remove) { 1359 clear_remove(td); 1360 req_clear_remove -= 1; 1361 wakeup_one(&proc_waiting); 1362 } 1363 FREE_LOCK(&lk); 1364 VFS_UNLOCK_GIANT(vfslocked); 1365 remaining = progress = 0; 1366 mtx_lock(&mountlist_mtx); 1367 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 1368 nmp = TAILQ_NEXT(mp, mnt_list); 1369 if (MOUNTEDSOFTDEP(mp) == 0) 1370 continue; 1371 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 1372 continue; 1373 vfslocked = VFS_LOCK_GIANT(mp); 1374 progress += softdep_process_worklist(mp, 0); 1375 ump = VFSTOUFS(mp); 1376 remaining += ump->softdep_on_worklist; 1377 VFS_UNLOCK_GIANT(vfslocked); 1378 mtx_lock(&mountlist_mtx); 1379 nmp = TAILQ_NEXT(mp, mnt_list); 1380 vfs_unbusy(mp); 1381 } 1382 mtx_unlock(&mountlist_mtx); 1383 if (remaining && progress) 1384 continue; 1385 ACQUIRE_LOCK(&lk); 1386 if (!req_pending) 1387 msleep(&req_pending, &lk, PVM, "sdflush", hz); 1388 req_pending = 0; 1389 FREE_LOCK(&lk); 1390 } 1391 } 1392 1393 static void 1394 worklist_speedup(void) 1395 { 1396 mtx_assert(&lk, MA_OWNED); 1397 if (req_pending == 0) { 1398 req_pending = 1; 1399 wakeup(&req_pending); 1400 } 1401 } 1402 1403 static int 1404 softdep_speedup(void) 1405 { 1406 1407 worklist_speedup(); 1408 bd_speedup(); 1409 return speedup_syncer(); 1410 } 1411 1412 /* 1413 * Add an item to the end of the work queue. 1414 * This routine requires that the lock be held. 1415 * This is the only routine that adds items to the list. 1416 * The following routine is the only one that removes items 1417 * and does so in order from first to last. 1418 */ 1419 1420 #define WK_HEAD 0x0001 /* Add to HEAD. */ 1421 #define WK_NODELAY 0x0002 /* Process immediately. */ 1422 1423 static void 1424 add_to_worklist(wk, flags) 1425 struct worklist *wk; 1426 int flags; 1427 { 1428 struct ufsmount *ump; 1429 1430 mtx_assert(&lk, MA_OWNED); 1431 ump = VFSTOUFS(wk->wk_mp); 1432 if (wk->wk_state & ONWORKLIST) 1433 panic("add_to_worklist: %s(0x%X) already on list", 1434 TYPENAME(wk->wk_type), wk->wk_state); 1435 wk->wk_state |= ONWORKLIST; 1436 if (ump->softdep_on_worklist == 0) { 1437 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); 1438 ump->softdep_worklist_tail = wk; 1439 } else if (flags & WK_HEAD) { 1440 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); 1441 } else { 1442 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); 1443 ump->softdep_worklist_tail = wk; 1444 } 1445 ump->softdep_on_worklist += 1; 1446 if (flags & WK_NODELAY) 1447 worklist_speedup(); 1448 } 1449 1450 /* 1451 * Remove the item to be processed. If we are removing the last 1452 * item on the list, we need to recalculate the tail pointer. 
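 *
 * Recovering the tail relies on the list layout: wk_list.le_prev points
 * at the previous element's le_next field, and since wk_list is the first
 * member of struct worklist that address can be cast back to the previous
 * worklist item (or to the list head when the list has become empty, in
 * which case softdep_on_worklist is zero and the tail is never consulted).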
1453 */ 1454 static void 1455 remove_from_worklist(wk) 1456 struct worklist *wk; 1457 { 1458 struct ufsmount *ump; 1459 1460 ump = VFSTOUFS(wk->wk_mp); 1461 WORKLIST_REMOVE(wk); 1462 if (ump->softdep_worklist_tail == wk) 1463 ump->softdep_worklist_tail = 1464 (struct worklist *)wk->wk_list.le_prev; 1465 ump->softdep_on_worklist -= 1; 1466 } 1467 1468 static void 1469 wake_worklist(wk) 1470 struct worklist *wk; 1471 { 1472 if (wk->wk_state & IOWAITING) { 1473 wk->wk_state &= ~IOWAITING; 1474 wakeup(wk); 1475 } 1476 } 1477 1478 static void 1479 wait_worklist(wk, wmesg) 1480 struct worklist *wk; 1481 char *wmesg; 1482 { 1483 1484 wk->wk_state |= IOWAITING; 1485 msleep(wk, &lk, PVM, wmesg, 0); 1486 } 1487 1488 /* 1489 * Process that runs once per second to handle items in the background queue. 1490 * 1491 * Note that we ensure that everything is done in the order in which they 1492 * appear in the queue. The code below depends on this property to ensure 1493 * that blocks of a file are freed before the inode itself is freed. This 1494 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated 1495 * until all the old ones have been purged from the dependency lists. 1496 */ 1497 int 1498 softdep_process_worklist(mp, full) 1499 struct mount *mp; 1500 int full; 1501 { 1502 struct thread *td = curthread; 1503 int cnt, matchcnt; 1504 struct ufsmount *ump; 1505 long starttime; 1506 1507 KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); 1508 /* 1509 * Record the process identifier of our caller so that we can give 1510 * this process preferential treatment in request_cleanup below. 1511 */ 1512 matchcnt = 0; 1513 ump = VFSTOUFS(mp); 1514 ACQUIRE_LOCK(&lk); 1515 starttime = time_second; 1516 softdep_process_journal(mp, NULL, full?MNT_WAIT:0); 1517 while (ump->softdep_on_worklist > 0) { 1518 if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0) 1519 break; 1520 else 1521 matchcnt += cnt; 1522 /* 1523 * If requested, try removing inode or removal dependencies. 1524 */ 1525 if (req_clear_inodedeps) { 1526 clear_inodedeps(td); 1527 req_clear_inodedeps -= 1; 1528 wakeup_one(&proc_waiting); 1529 } 1530 if (req_clear_remove) { 1531 clear_remove(td); 1532 req_clear_remove -= 1; 1533 wakeup_one(&proc_waiting); 1534 } 1535 /* 1536 * We do not generally want to stop for buffer space, but if 1537 * we are really being a buffer hog, we will stop and wait. 1538 */ 1539 if (should_yield()) { 1540 FREE_LOCK(&lk); 1541 kern_yield(PRI_UNCHANGED); 1542 bwillwrite(); 1543 ACQUIRE_LOCK(&lk); 1544 } 1545 /* 1546 * Never allow processing to run for more than one 1547 * second. Otherwise the other mountpoints may get 1548 * excessively backlogged. 1549 */ 1550 if (!full && starttime != time_second) 1551 break; 1552 } 1553 if (full == 0) 1554 journal_unsuspend(ump); 1555 FREE_LOCK(&lk); 1556 return (matchcnt); 1557 } 1558 1559 /* 1560 * Process all removes associated with a vnode if we are running out of 1561 * journal space. Any other process which attempts to flush these will 1562 * be unable as we have the vnodes locked. 
 */
static void
process_removes(vp)
	struct vnode *vp;
{
	struct inodedep *inodedep;
	struct dirrem *dirrem;
	struct mount *mp;
	ino_t inum;

	mtx_assert(&lk, MA_OWNED);

	mp = vp->v_mount;
	inum = VTOI(vp)->i_number;
	for (;;) {
top:
		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
			return;
		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
			/*
			 * If another thread is trying to lock this vnode
			 * it will fail but we must wait for it to do so
			 * before we can proceed.
			 */
			if (dirrem->dm_state & INPROGRESS) {
				wait_worklist(&dirrem->dm_list, "pwrwait");
				goto top;
			}
			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
			    (COMPLETE | ONWORKLIST))
				break;
		}
		if (dirrem == NULL)
			return;
		remove_from_worklist(&dirrem->dm_list);
		FREE_LOCK(&lk);
		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
			panic("process_removes: suspended filesystem");
		handle_workitem_remove(dirrem, 0);
		vn_finished_secondary_write(mp);
		ACQUIRE_LOCK(&lk);
	}
}

/*
 * Process all truncations associated with a vnode if we are running out
 * of journal space.  This is called when the vnode lock is already held
 * and no other process can clear the truncation.
 */
static void
process_truncates(vp)
	struct vnode *vp;
{
	struct inodedep *inodedep;
	struct freeblks *freeblks;
	struct mount *mp;
	ino_t inum;
	int cgwait;

	mtx_assert(&lk, MA_OWNED);

	mp = vp->v_mount;
	inum = VTOI(vp)->i_number;
	for (;;) {
		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
			return;
		cgwait = 0;
		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
			/* Journal entries not yet written. */
			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
				jwait(&LIST_FIRST(
				    &freeblks->fb_jblkdephd)->jb_list,
				    MNT_WAIT);
				break;
			}
			/* Another thread is executing this item. */
			if (freeblks->fb_state & INPROGRESS) {
				wait_worklist(&freeblks->fb_list, "ptrwait");
				break;
			}
			/* Freeblks is waiting on an inode write. */
			if ((freeblks->fb_state & COMPLETE) == 0) {
				FREE_LOCK(&lk);
				ffs_update(vp, 1);
				ACQUIRE_LOCK(&lk);
				break;
			}
			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
			    (ALLCOMPLETE | ONWORKLIST)) {
				remove_from_worklist(&freeblks->fb_list);
				freeblks->fb_state |= INPROGRESS;
				FREE_LOCK(&lk);
				if (vn_start_secondary_write(NULL, &mp,
				    V_NOWAIT))
					panic("process_truncates: "
					    "suspended filesystem");
				handle_workitem_freeblocks(freeblks, 0);
				vn_finished_secondary_write(mp);
				ACQUIRE_LOCK(&lk);
				break;
			}
			if (freeblks->fb_cgwait)
				cgwait++;
		}
		if (cgwait) {
			FREE_LOCK(&lk);
			sync_cgs(mp, MNT_WAIT);
			ffs_sync_snap(mp, MNT_WAIT);
			ACQUIRE_LOCK(&lk);
			continue;
		}
		if (freeblks == NULL)
			break;
	}
	return;
}

/*
 * Process one item on the worklist.
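 *
 * A sentinel work item is linked into the per-mount worklist so that our
 * position is not lost while the lock is dropped to process an item: each
 * iteration restarts from the sentinel, and when another thread's
 * sentinel is encountered ours is simply moved past it.  Items that fail
 * are returned to the head of the list and their waiters are woken.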
 */
static int
process_worklist_item(mp, target, flags)
	struct mount *mp;
	int target;
	int flags;
{
	struct worklist sentinel;
	struct worklist *wk;
	struct ufsmount *ump;
	int matchcnt;
	int error;

	mtx_assert(&lk, MA_OWNED);
	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
	/*
	 * If we are being called because of a process doing a
	 * copy-on-write, then it is not safe to write as we may
	 * recurse into the copy-on-write routine.
	 */
	if (curthread->td_pflags & TDP_COWINPROGRESS)
		return (-1);
	PHOLD(curproc);	/* Don't let the stack go away. */
	ump = VFSTOUFS(mp);
	matchcnt = 0;
	sentinel.wk_mp = NULL;
	sentinel.wk_type = D_SENTINAL;
	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
	for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
	    wk = LIST_NEXT(&sentinel, wk_list)) {
		if (wk->wk_type == D_SENTINAL) {
			LIST_REMOVE(&sentinel, wk_list);
			LIST_INSERT_AFTER(wk, &sentinel, wk_list);
			continue;
		}
		if (wk->wk_state & INPROGRESS)
			panic("process_worklist_item: %p already in progress.",
			    wk);
		wk->wk_state |= INPROGRESS;
		remove_from_worklist(wk);
		FREE_LOCK(&lk);
		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
			panic("process_worklist_item: suspended filesystem");
		switch (wk->wk_type) {
		case D_DIRREM:
			/* removal of a directory entry */
			error = handle_workitem_remove(WK_DIRREM(wk), flags);
			break;

		case D_FREEBLKS:
			/* releasing blocks and/or fragments from a file */
			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
			    flags);
			break;

		case D_FREEFRAG:
			/* releasing a fragment when replaced as a file grows */
			handle_workitem_freefrag(WK_FREEFRAG(wk));
			error = 0;
			break;

		case D_FREEFILE:
			/* releasing an inode when its link count drops to 0 */
			handle_workitem_freefile(WK_FREEFILE(wk));
			error = 0;
			break;

		default:
			panic("%s_process_worklist: Unknown type %s",
			    "softdep", TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
		vn_finished_secondary_write(mp);
		ACQUIRE_LOCK(&lk);
		if (error == 0) {
			if (++matchcnt == target)
				break;
			continue;
		}
		/*
		 * We have to retry the worklist item later.  Wake up any
		 * waiters who may be able to complete it immediately and
		 * add the item back to the head so we don't try to execute
		 * it again.
		 */
		wk->wk_state &= ~INPROGRESS;
		wake_worklist(wk);
		add_to_worklist(wk, WK_HEAD);
	}
	LIST_REMOVE(&sentinel, wk_list);
	/* The sentinel could've become the tail from remove_from_worklist. */
	if (ump->softdep_worklist_tail == &sentinel)
		ump->softdep_worklist_tail =
		    (struct worklist *)sentinel.wk_list.le_prev;
	PRELE(curproc);
	return (matchcnt);
}

/*
 * Move dependencies from one buffer to another.
1783 */ 1784 int 1785 softdep_move_dependencies(oldbp, newbp) 1786 struct buf *oldbp; 1787 struct buf *newbp; 1788 { 1789 struct worklist *wk, *wktail; 1790 int dirty; 1791 1792 dirty = 0; 1793 wktail = NULL; 1794 ACQUIRE_LOCK(&lk); 1795 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { 1796 LIST_REMOVE(wk, wk_list); 1797 if (wk->wk_type == D_BMSAFEMAP && 1798 bmsafemap_rollbacks(WK_BMSAFEMAP(wk))) 1799 dirty = 1; 1800 if (wktail == 0) 1801 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); 1802 else 1803 LIST_INSERT_AFTER(wktail, wk, wk_list); 1804 wktail = wk; 1805 } 1806 FREE_LOCK(&lk); 1807 1808 return (dirty); 1809 } 1810 1811 /* 1812 * Purge the work list of all items associated with a particular mount point. 1813 */ 1814 int 1815 softdep_flushworklist(oldmnt, countp, td) 1816 struct mount *oldmnt; 1817 int *countp; 1818 struct thread *td; 1819 { 1820 struct vnode *devvp; 1821 int count, error = 0; 1822 struct ufsmount *ump; 1823 1824 /* 1825 * Alternately flush the block device associated with the mount 1826 * point and process any dependencies that the flushing 1827 * creates. We continue until no more worklist dependencies 1828 * are found. 1829 */ 1830 *countp = 0; 1831 ump = VFSTOUFS(oldmnt); 1832 devvp = ump->um_devvp; 1833 while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { 1834 *countp += count; 1835 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1836 error = VOP_FSYNC(devvp, MNT_WAIT, td); 1837 VOP_UNLOCK(devvp, 0); 1838 if (error) 1839 break; 1840 } 1841 return (error); 1842 } 1843 1844 int 1845 softdep_waitidle(struct mount *mp) 1846 { 1847 struct ufsmount *ump; 1848 int error; 1849 int i; 1850 1851 ump = VFSTOUFS(mp); 1852 ACQUIRE_LOCK(&lk); 1853 for (i = 0; i < 10 && ump->softdep_deps; i++) { 1854 ump->softdep_req = 1; 1855 if (ump->softdep_on_worklist) 1856 panic("softdep_waitidle: work added after flush."); 1857 msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1); 1858 } 1859 ump->softdep_req = 0; 1860 FREE_LOCK(&lk); 1861 error = 0; 1862 if (i == 10) { 1863 error = EBUSY; 1864 printf("softdep_waitidle: Failed to flush worklist for %p\n", 1865 mp); 1866 } 1867 1868 return (error); 1869 } 1870 1871 /* 1872 * Flush all vnodes and worklist items associated with a specified mount point. 1873 */ 1874 int 1875 softdep_flushfiles(oldmnt, flags, td) 1876 struct mount *oldmnt; 1877 int flags; 1878 struct thread *td; 1879 { 1880 int error, depcount, loopcnt, retry_flush_count, retry; 1881 1882 loopcnt = 10; 1883 retry_flush_count = 3; 1884 retry_flush: 1885 error = 0; 1886 1887 /* 1888 * Alternately flush the vnodes associated with the mount 1889 * point and process any dependencies that the flushing 1890 * creates. In theory, this loop can happen at most twice, 1891 * but we give it a few extra just to be sure. 1892 */ 1893 for (; loopcnt > 0; loopcnt--) { 1894 /* 1895 * Do another flush in case any vnodes were brought in 1896 * as part of the cleanup operations. 1897 */ 1898 if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0) 1899 break; 1900 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 || 1901 depcount == 0) 1902 break; 1903 } 1904 /* 1905 * If we are unmounting then it is an error to fail. If we 1906 * are simply trying to downgrade to read-only, then filesystem 1907 * activity can keep us busy forever, so we just fail with EBUSY. 
1908 */ 1909 if (loopcnt == 0) { 1910 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) 1911 panic("softdep_flushfiles: looping"); 1912 error = EBUSY; 1913 } 1914 if (!error) 1915 error = softdep_waitidle(oldmnt); 1916 if (!error) { 1917 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) { 1918 retry = 0; 1919 MNT_ILOCK(oldmnt); 1920 KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0, 1921 ("softdep_flushfiles: !MNTK_NOINSMNTQ")); 1922 if (oldmnt->mnt_nvnodelistsize > 0) { 1923 if (--retry_flush_count > 0) { 1924 retry = 1; 1925 loopcnt = 3; 1926 } else 1927 error = EBUSY; 1928 } 1929 MNT_IUNLOCK(oldmnt); 1930 if (retry) 1931 goto retry_flush; 1932 } 1933 } 1934 return (error); 1935 } 1936 1937 /* 1938 * Structure hashing. 1939 * 1940 * There are three types of structures that can be looked up: 1941 * 1) pagedep structures identified by mount point, inode number, 1942 * and logical block. 1943 * 2) inodedep structures identified by mount point and inode number. 1944 * 3) newblk structures identified by mount point and 1945 * physical block number. 1946 * 1947 * The "pagedep" and "inodedep" dependency structures are hashed 1948 * separately from the file blocks and inodes to which they correspond. 1949 * This separation helps when the in-memory copy of an inode or 1950 * file block must be replaced. It also obviates the need to access 1951 * an inode or file page when simply updating (or de-allocating) 1952 * dependency structures. Lookup of newblk structures is needed to 1953 * find newly allocated blocks when trying to associate them with 1954 * their allocdirect or allocindir structure. 1955 * 1956 * The lookup routines optionally create and hash a new instance when 1957 * an existing entry is not found. 1958 */ 1959 #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ 1960 #define NODELAY 0x0002 /* cannot do background work */ 1961 1962 /* 1963 * Structures and routines associated with pagedep caching. 1964 */ 1965 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; 1966 u_long pagedep_hash; /* size of hash table - 1 */ 1967 #define PAGEDEP_HASH(mp, inum, lbn) \ 1968 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \ 1969 pagedep_hash]) 1970 1971 static int 1972 pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp) 1973 struct pagedep_hashhead *pagedephd; 1974 ino_t ino; 1975 ufs_lbn_t lbn; 1976 struct mount *mp; 1977 int flags; 1978 struct pagedep **pagedeppp; 1979 { 1980 struct pagedep *pagedep; 1981 1982 LIST_FOREACH(pagedep, pagedephd, pd_hash) { 1983 if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn && 1984 mp == pagedep->pd_list.wk_mp) { 1985 *pagedeppp = pagedep; 1986 return (1); 1987 } 1988 } 1989 *pagedeppp = NULL; 1990 return (0); 1991 } 1992 /* 1993 * Look up a pagedep. Return 1 if found, 0 otherwise. 1994 * If not found, allocate if DEPALLOC flag is passed. 1995 * Found or allocated entry is returned in pagedeppp. 1996 * This routine must be called with splbio interrupts blocked. 
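 *
 * Illustrative call (hypothetical caller, lk held): create the pagedep
 * for directory block lbn of inode dp if none exists yet and attach it
 * to the directory block's buffer bp:
 *
 *	if (pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC,
 *	    &pagedep) == 0)
 *		... pagedep was newly allocated and initialized ...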
1997 */ 1998 static int 1999 pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp) 2000 struct mount *mp; 2001 struct buf *bp; 2002 ino_t ino; 2003 ufs_lbn_t lbn; 2004 int flags; 2005 struct pagedep **pagedeppp; 2006 { 2007 struct pagedep *pagedep; 2008 struct pagedep_hashhead *pagedephd; 2009 struct worklist *wk; 2010 int ret; 2011 int i; 2012 2013 mtx_assert(&lk, MA_OWNED); 2014 if (bp) { 2015 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 2016 if (wk->wk_type == D_PAGEDEP) { 2017 *pagedeppp = WK_PAGEDEP(wk); 2018 return (1); 2019 } 2020 } 2021 } 2022 pagedephd = PAGEDEP_HASH(mp, ino, lbn); 2023 ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); 2024 if (ret) { 2025 if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp) 2026 WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list); 2027 return (1); 2028 } 2029 if ((flags & DEPALLOC) == 0) 2030 return (0); 2031 FREE_LOCK(&lk); 2032 pagedep = malloc(sizeof(struct pagedep), 2033 M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); 2034 workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); 2035 ACQUIRE_LOCK(&lk); 2036 ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); 2037 if (*pagedeppp) { 2038 /* 2039 * This should never happen since we only create pagedeps 2040 * with the vnode lock held. Could be an assert. 2041 */ 2042 WORKITEM_FREE(pagedep, D_PAGEDEP); 2043 return (ret); 2044 } 2045 pagedep->pd_ino = ino; 2046 pagedep->pd_lbn = lbn; 2047 LIST_INIT(&pagedep->pd_dirremhd); 2048 LIST_INIT(&pagedep->pd_pendinghd); 2049 for (i = 0; i < DAHASHSZ; i++) 2050 LIST_INIT(&pagedep->pd_diraddhd[i]); 2051 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); 2052 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 2053 *pagedeppp = pagedep; 2054 return (0); 2055 } 2056 2057 /* 2058 * Structures and routines associated with inodedep caching. 2059 */ 2060 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; 2061 static u_long inodedep_hash; /* size of hash table - 1 */ 2062 #define INODEDEP_HASH(fs, inum) \ 2063 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) 2064 2065 static int 2066 inodedep_find(inodedephd, fs, inum, inodedeppp) 2067 struct inodedep_hashhead *inodedephd; 2068 struct fs *fs; 2069 ino_t inum; 2070 struct inodedep **inodedeppp; 2071 { 2072 struct inodedep *inodedep; 2073 2074 LIST_FOREACH(inodedep, inodedephd, id_hash) 2075 if (inum == inodedep->id_ino && fs == inodedep->id_fs) 2076 break; 2077 if (inodedep) { 2078 *inodedeppp = inodedep; 2079 return (1); 2080 } 2081 *inodedeppp = NULL; 2082 2083 return (0); 2084 } 2085 /* 2086 * Look up an inodedep. Return 1 if found, 0 if not found. 2087 * If not found, allocate if DEPALLOC flag is passed. 2088 * Found or allocated entry is returned in inodedeppp. 2089 * This routine must be called with splbio interrupts blocked. 2090 */ 2091 static int 2092 inodedep_lookup(mp, inum, flags, inodedeppp) 2093 struct mount *mp; 2094 ino_t inum; 2095 int flags; 2096 struct inodedep **inodedeppp; 2097 { 2098 struct inodedep *inodedep; 2099 struct inodedep_hashhead *inodedephd; 2100 struct fs *fs; 2101 2102 mtx_assert(&lk, MA_OWNED); 2103 fs = VFSTOUFS(mp)->um_fs; 2104 inodedephd = INODEDEP_HASH(fs, inum); 2105 2106 if (inodedep_find(inodedephd, fs, inum, inodedeppp)) 2107 return (1); 2108 if ((flags & DEPALLOC) == 0) 2109 return (0); 2110 /* 2111 * If we are over our limit, try to improve the situation. 
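 * (The limit tested below is max_softdeps, which softdep_initialize()
 * sets to four times desiredvnodes; FLUSH_INODES asks the cleanup code
 * to start pushing inode dependencies out.)  Note also that the
 * allocation further down deliberately drops lk around malloc(), which
 * may sleep, and therefore repeats the lookup afterwards in case
 * another thread created the inodedep while the lock was released.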
2112 */ 2113 if (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0) 2114 request_cleanup(mp, FLUSH_INODES); 2115 FREE_LOCK(&lk); 2116 inodedep = malloc(sizeof(struct inodedep), 2117 M_INODEDEP, M_SOFTDEP_FLAGS); 2118 workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); 2119 ACQUIRE_LOCK(&lk); 2120 if (inodedep_find(inodedephd, fs, inum, inodedeppp)) { 2121 WORKITEM_FREE(inodedep, D_INODEDEP); 2122 return (1); 2123 } 2124 inodedep->id_fs = fs; 2125 inodedep->id_ino = inum; 2126 inodedep->id_state = ALLCOMPLETE; 2127 inodedep->id_nlinkdelta = 0; 2128 inodedep->id_savedino1 = NULL; 2129 inodedep->id_savedsize = -1; 2130 inodedep->id_savedextsize = -1; 2131 inodedep->id_savednlink = -1; 2132 inodedep->id_bmsafemap = NULL; 2133 inodedep->id_mkdiradd = NULL; 2134 LIST_INIT(&inodedep->id_dirremhd); 2135 LIST_INIT(&inodedep->id_pendinghd); 2136 LIST_INIT(&inodedep->id_inowait); 2137 LIST_INIT(&inodedep->id_bufwait); 2138 TAILQ_INIT(&inodedep->id_inoreflst); 2139 TAILQ_INIT(&inodedep->id_inoupdt); 2140 TAILQ_INIT(&inodedep->id_newinoupdt); 2141 TAILQ_INIT(&inodedep->id_extupdt); 2142 TAILQ_INIT(&inodedep->id_newextupdt); 2143 TAILQ_INIT(&inodedep->id_freeblklst); 2144 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); 2145 *inodedeppp = inodedep; 2146 return (0); 2147 } 2148 2149 /* 2150 * Structures and routines associated with newblk caching. 2151 */ 2152 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; 2153 u_long newblk_hash; /* size of hash table - 1 */ 2154 #define NEWBLK_HASH(fs, inum) \ 2155 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) 2156 2157 static int 2158 newblk_find(newblkhd, mp, newblkno, flags, newblkpp) 2159 struct newblk_hashhead *newblkhd; 2160 struct mount *mp; 2161 ufs2_daddr_t newblkno; 2162 int flags; 2163 struct newblk **newblkpp; 2164 { 2165 struct newblk *newblk; 2166 2167 LIST_FOREACH(newblk, newblkhd, nb_hash) { 2168 if (newblkno != newblk->nb_newblkno) 2169 continue; 2170 if (mp != newblk->nb_list.wk_mp) 2171 continue; 2172 /* 2173 * If we're creating a new dependency don't match those that 2174 * have already been converted to allocdirects. This is for 2175 * a frag extend. 2176 */ 2177 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) 2178 continue; 2179 break; 2180 } 2181 if (newblk) { 2182 *newblkpp = newblk; 2183 return (1); 2184 } 2185 *newblkpp = NULL; 2186 return (0); 2187 } 2188 2189 /* 2190 * Look up a newblk. Return 1 if found, 0 if not found. 2191 * If not found, allocate if DEPALLOC flag is passed. 2192 * Found or allocated entry is returned in newblkpp. 
2193 */ 2194 static int 2195 newblk_lookup(mp, newblkno, flags, newblkpp) 2196 struct mount *mp; 2197 ufs2_daddr_t newblkno; 2198 int flags; 2199 struct newblk **newblkpp; 2200 { 2201 struct newblk *newblk; 2202 struct newblk_hashhead *newblkhd; 2203 2204 newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno); 2205 if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) 2206 return (1); 2207 if ((flags & DEPALLOC) == 0) 2208 return (0); 2209 FREE_LOCK(&lk); 2210 newblk = malloc(sizeof(union allblk), M_NEWBLK, 2211 M_SOFTDEP_FLAGS | M_ZERO); 2212 workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); 2213 ACQUIRE_LOCK(&lk); 2214 if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) { 2215 WORKITEM_FREE(newblk, D_NEWBLK); 2216 return (1); 2217 } 2218 newblk->nb_freefrag = NULL; 2219 LIST_INIT(&newblk->nb_indirdeps); 2220 LIST_INIT(&newblk->nb_newdirblk); 2221 LIST_INIT(&newblk->nb_jwork); 2222 newblk->nb_state = ATTACHED; 2223 newblk->nb_newblkno = newblkno; 2224 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); 2225 *newblkpp = newblk; 2226 return (0); 2227 } 2228 2229 /* 2230 * Structures and routines associated with freed indirect block caching. 2231 */ 2232 struct freeworklst *indir_hashtbl; 2233 u_long indir_hash; /* size of hash table - 1 */ 2234 #define INDIR_HASH(mp, blkno) \ 2235 (&indir_hashtbl[((((register_t)(mp)) >> 13) + (blkno)) & indir_hash]) 2236 2237 /* 2238 * Lookup an indirect block in the indir hash table. The freework is 2239 * removed and potentially freed. The caller must do a blocking journal 2240 * write before writing to the blkno. 2241 */ 2242 static int 2243 indirblk_lookup(mp, blkno) 2244 struct mount *mp; 2245 ufs2_daddr_t blkno; 2246 { 2247 struct freework *freework; 2248 struct freeworklst *wkhd; 2249 2250 wkhd = INDIR_HASH(mp, blkno); 2251 TAILQ_FOREACH(freework, wkhd, fw_next) { 2252 if (freework->fw_blkno != blkno) 2253 continue; 2254 if (freework->fw_list.wk_mp != mp) 2255 continue; 2256 indirblk_remove(freework); 2257 return (1); 2258 } 2259 return (0); 2260 } 2261 2262 /* 2263 * Insert an indirect block represented by freework into the indirblk 2264 * hash table so that it may prevent the block from being re-used prior 2265 * to the journal being written. 2266 */ 2267 static void 2268 indirblk_insert(freework) 2269 struct freework *freework; 2270 { 2271 struct freeblks *freeblks; 2272 struct jsegdep *jsegdep; 2273 struct worklist *wk; 2274 2275 freeblks = freework->fw_freeblks; 2276 LIST_FOREACH(wk, &freeblks->fb_jwork, wk_list) 2277 if (wk->wk_type == D_JSEGDEP) 2278 break; 2279 if (wk == NULL) 2280 return; 2281 2282 jsegdep = WK_JSEGDEP(wk); 2283 LIST_INSERT_HEAD(&jsegdep->jd_seg->js_indirs, freework, fw_segs); 2284 TAILQ_INSERT_HEAD(INDIR_HASH(freework->fw_list.wk_mp, 2285 freework->fw_blkno), freework, fw_next); 2286 freework->fw_state &= ~DEPCOMPLETE; 2287 } 2288 2289 static void 2290 indirblk_remove(freework) 2291 struct freework *freework; 2292 { 2293 2294 LIST_REMOVE(freework, fw_segs); 2295 TAILQ_REMOVE(INDIR_HASH(freework->fw_list.wk_mp, 2296 freework->fw_blkno), freework, fw_next); 2297 freework->fw_state |= DEPCOMPLETE; 2298 if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) 2299 WORKITEM_FREE(freework, D_FREEWORK); 2300 } 2301 2302 /* 2303 * Executed during filesystem system initialization before 2304 * mounting any filesystems. 
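 *
 * A note on the sizing done below: the pagedep, inodedep, newblk and
 * bmsafemap tables come from hashinit(), which sizes each table to a
 * power of two and returns (size - 1) in the corresponding *_hash
 * variable, so bucket selection is a simple mask, e.g.
 *
 *	hd = &inodedep_hashtbl[hash & inodedep_hash];
 *
 * The separately allocated indir_hashtbl is likewise forced to a power
 * of two so that indir_hash can be used the same way.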
2305 */ 2306 void 2307 softdep_initialize() 2308 { 2309 int i; 2310 2311 LIST_INIT(&mkdirlisthd); 2312 max_softdeps = desiredvnodes * 4; 2313 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); 2314 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); 2315 newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash); 2316 bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash); 2317 i = 1 << (ffs(desiredvnodes / 10) - 1); 2318 indir_hashtbl = malloc(i * sizeof(indir_hashtbl[0]), M_FREEWORK, 2319 M_WAITOK); 2320 indir_hash = i - 1; 2321 for (i = 0; i <= indir_hash; i++) 2322 TAILQ_INIT(&indir_hashtbl[i]); 2323 2324 /* initialise bioops hack */ 2325 bioops.io_start = softdep_disk_io_initiation; 2326 bioops.io_complete = softdep_disk_write_complete; 2327 bioops.io_deallocate = softdep_deallocate_dependencies; 2328 bioops.io_countdeps = softdep_count_dependencies; 2329 2330 /* Initialize the callout with an mtx. */ 2331 callout_init_mtx(&softdep_callout, &lk, 0); 2332 } 2333 2334 /* 2335 * Executed after all filesystems have been unmounted during 2336 * filesystem module unload. 2337 */ 2338 void 2339 softdep_uninitialize() 2340 { 2341 2342 callout_drain(&softdep_callout); 2343 hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); 2344 hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); 2345 hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); 2346 hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash); 2347 free(indir_hashtbl, M_FREEWORK); 2348 } 2349 2350 /* 2351 * Called at mount time to notify the dependency code that a 2352 * filesystem wishes to use it. 2353 */ 2354 int 2355 softdep_mount(devvp, mp, fs, cred) 2356 struct vnode *devvp; 2357 struct mount *mp; 2358 struct fs *fs; 2359 struct ucred *cred; 2360 { 2361 struct csum_total cstotal; 2362 struct ufsmount *ump; 2363 struct cg *cgp; 2364 struct buf *bp; 2365 int error, cyl; 2366 2367 MNT_ILOCK(mp); 2368 mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; 2369 if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { 2370 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 2371 MNTK_SOFTDEP; 2372 mp->mnt_noasync++; 2373 } 2374 MNT_IUNLOCK(mp); 2375 ump = VFSTOUFS(mp); 2376 LIST_INIT(&ump->softdep_workitem_pending); 2377 LIST_INIT(&ump->softdep_journal_pending); 2378 TAILQ_INIT(&ump->softdep_unlinked); 2379 LIST_INIT(&ump->softdep_dirtycg); 2380 ump->softdep_worklist_tail = NULL; 2381 ump->softdep_on_worklist = 0; 2382 ump->softdep_deps = 0; 2383 if ((fs->fs_flags & FS_SUJ) && 2384 (error = journal_mount(mp, fs, cred)) != 0) { 2385 printf("Failed to start journal: %d\n", error); 2386 return (error); 2387 } 2388 /* 2389 * When doing soft updates, the counters in the 2390 * superblock may have gotten out of sync. Recomputation 2391 * can take a long time and can be deferred for background 2392 * fsck. However, the old behavior of scanning the cylinder 2393 * groups and recalculating them at mount time is available 2394 * by setting vfs.ffs.compute_summary_at_mount to one. 
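 * For example, the old scan-at-mount behavior can be requested with the
 * tunable named above (assuming the usual sysctl spelling):
 *
 *	sysctl vfs.ffs.compute_summary_at_mount=1
 *
 * The scan below is also skipped when the filesystem is marked clean,
 * since the on-disk summary is then assumed to be consistent.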
2395 */ 2396 if (compute_summary_at_mount == 0 || fs->fs_clean != 0) 2397 return (0); 2398 bzero(&cstotal, sizeof cstotal); 2399 for (cyl = 0; cyl < fs->fs_ncg; cyl++) { 2400 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), 2401 fs->fs_cgsize, cred, &bp)) != 0) { 2402 brelse(bp); 2403 return (error); 2404 } 2405 cgp = (struct cg *)bp->b_data; 2406 cstotal.cs_nffree += cgp->cg_cs.cs_nffree; 2407 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; 2408 cstotal.cs_nifree += cgp->cg_cs.cs_nifree; 2409 cstotal.cs_ndir += cgp->cg_cs.cs_ndir; 2410 fs->fs_cs(fs, cyl) = cgp->cg_cs; 2411 brelse(bp); 2412 } 2413 #ifdef DEBUG 2414 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) 2415 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); 2416 #endif 2417 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); 2418 return (0); 2419 } 2420 2421 void 2422 softdep_unmount(mp) 2423 struct mount *mp; 2424 { 2425 2426 MNT_ILOCK(mp); 2427 mp->mnt_flag &= ~MNT_SOFTDEP; 2428 if (MOUNTEDSUJ(mp) == 0) { 2429 MNT_IUNLOCK(mp); 2430 return; 2431 } 2432 mp->mnt_flag &= ~MNT_SUJ; 2433 MNT_IUNLOCK(mp); 2434 journal_unmount(mp); 2435 } 2436 2437 struct jblocks { 2438 struct jseglst jb_segs; /* TAILQ of current segments. */ 2439 struct jseg *jb_writeseg; /* Next write to complete. */ 2440 struct jseg *jb_oldestseg; /* Oldest segment with valid entries. */ 2441 struct jextent *jb_extent; /* Extent array. */ 2442 uint64_t jb_nextseq; /* Next sequence number. */ 2443 uint64_t jb_oldestwrseq; /* Oldest written sequence number. */ 2444 uint8_t jb_needseg; /* Need a forced segment. */ 2445 uint8_t jb_suspended; /* Did journal suspend writes? */ 2446 int jb_avail; /* Available extents. */ 2447 int jb_used; /* Last used extent. */ 2448 int jb_head; /* Allocator head. */ 2449 int jb_off; /* Allocator extent offset. */ 2450 int jb_blocks; /* Total disk blocks covered. */ 2451 int jb_free; /* Total disk blocks free. */ 2452 int jb_min; /* Minimum free space. */ 2453 int jb_low; /* Low on space. */ 2454 int jb_age; /* Insertion time of oldest rec. */ 2455 }; 2456 2457 struct jextent { 2458 ufs2_daddr_t je_daddr; /* Disk block address. */ 2459 int je_blocks; /* Disk block count. 
*/ 2460 }; 2461 2462 static struct jblocks * 2463 jblocks_create(void) 2464 { 2465 struct jblocks *jblocks; 2466 2467 jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); 2468 TAILQ_INIT(&jblocks->jb_segs); 2469 jblocks->jb_avail = 10; 2470 jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, 2471 M_JBLOCKS, M_WAITOK | M_ZERO); 2472 2473 return (jblocks); 2474 } 2475 2476 static ufs2_daddr_t 2477 jblocks_alloc(jblocks, bytes, actual) 2478 struct jblocks *jblocks; 2479 int bytes; 2480 int *actual; 2481 { 2482 ufs2_daddr_t daddr; 2483 struct jextent *jext; 2484 int freecnt; 2485 int blocks; 2486 2487 blocks = bytes / DEV_BSIZE; 2488 jext = &jblocks->jb_extent[jblocks->jb_head]; 2489 freecnt = jext->je_blocks - jblocks->jb_off; 2490 if (freecnt == 0) { 2491 jblocks->jb_off = 0; 2492 if (++jblocks->jb_head > jblocks->jb_used) 2493 jblocks->jb_head = 0; 2494 jext = &jblocks->jb_extent[jblocks->jb_head]; 2495 freecnt = jext->je_blocks; 2496 } 2497 if (freecnt > blocks) 2498 freecnt = blocks; 2499 *actual = freecnt * DEV_BSIZE; 2500 daddr = jext->je_daddr + jblocks->jb_off; 2501 jblocks->jb_off += freecnt; 2502 jblocks->jb_free -= freecnt; 2503 2504 return (daddr); 2505 } 2506 2507 static void 2508 jblocks_free(jblocks, mp, bytes) 2509 struct jblocks *jblocks; 2510 struct mount *mp; 2511 int bytes; 2512 { 2513 2514 jblocks->jb_free += bytes / DEV_BSIZE; 2515 if (jblocks->jb_suspended) 2516 worklist_speedup(); 2517 wakeup(jblocks); 2518 } 2519 2520 static void 2521 jblocks_destroy(jblocks) 2522 struct jblocks *jblocks; 2523 { 2524 2525 if (jblocks->jb_extent) 2526 free(jblocks->jb_extent, M_JBLOCKS); 2527 free(jblocks, M_JBLOCKS); 2528 } 2529 2530 static void 2531 jblocks_add(jblocks, daddr, blocks) 2532 struct jblocks *jblocks; 2533 ufs2_daddr_t daddr; 2534 int blocks; 2535 { 2536 struct jextent *jext; 2537 2538 jblocks->jb_blocks += blocks; 2539 jblocks->jb_free += blocks; 2540 jext = &jblocks->jb_extent[jblocks->jb_used]; 2541 /* Adding the first block. */ 2542 if (jext->je_daddr == 0) { 2543 jext->je_daddr = daddr; 2544 jext->je_blocks = blocks; 2545 return; 2546 } 2547 /* Extending the last extent. */ 2548 if (jext->je_daddr + jext->je_blocks == daddr) { 2549 jext->je_blocks += blocks; 2550 return; 2551 } 2552 /* Adding a new extent. 
*/ 2553 if (++jblocks->jb_used == jblocks->jb_avail) { 2554 jblocks->jb_avail *= 2; 2555 jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, 2556 M_JBLOCKS, M_WAITOK | M_ZERO); 2557 memcpy(jext, jblocks->jb_extent, 2558 sizeof(struct jextent) * jblocks->jb_used); 2559 free(jblocks->jb_extent, M_JBLOCKS); 2560 jblocks->jb_extent = jext; 2561 } 2562 jext = &jblocks->jb_extent[jblocks->jb_used]; 2563 jext->je_daddr = daddr; 2564 jext->je_blocks = blocks; 2565 return; 2566 } 2567 2568 int 2569 softdep_journal_lookup(mp, vpp) 2570 struct mount *mp; 2571 struct vnode **vpp; 2572 { 2573 struct componentname cnp; 2574 struct vnode *dvp; 2575 ino_t sujournal; 2576 int error; 2577 2578 error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp); 2579 if (error) 2580 return (error); 2581 bzero(&cnp, sizeof(cnp)); 2582 cnp.cn_nameiop = LOOKUP; 2583 cnp.cn_flags = ISLASTCN; 2584 cnp.cn_thread = curthread; 2585 cnp.cn_cred = curthread->td_ucred; 2586 cnp.cn_pnbuf = SUJ_FILE; 2587 cnp.cn_nameptr = SUJ_FILE; 2588 cnp.cn_namelen = strlen(SUJ_FILE); 2589 error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal); 2590 vput(dvp); 2591 if (error != 0) 2592 return (error); 2593 error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp); 2594 return (error); 2595 } 2596 2597 /* 2598 * Open and verify the journal file. 2599 */ 2600 static int 2601 journal_mount(mp, fs, cred) 2602 struct mount *mp; 2603 struct fs *fs; 2604 struct ucred *cred; 2605 { 2606 struct jblocks *jblocks; 2607 struct vnode *vp; 2608 struct inode *ip; 2609 ufs2_daddr_t blkno; 2610 int bcount; 2611 int error; 2612 int i; 2613 2614 error = softdep_journal_lookup(mp, &vp); 2615 if (error != 0) { 2616 printf("Failed to find journal. Use tunefs to create one\n"); 2617 return (error); 2618 } 2619 ip = VTOI(vp); 2620 if (ip->i_size < SUJ_MIN) { 2621 error = ENOSPC; 2622 goto out; 2623 } 2624 bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ 2625 jblocks = jblocks_create(); 2626 for (i = 0; i < bcount; i++) { 2627 error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); 2628 if (error) 2629 break; 2630 jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); 2631 } 2632 if (error) { 2633 jblocks_destroy(jblocks); 2634 goto out; 2635 } 2636 jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */ 2637 jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ 2638 VFSTOUFS(mp)->softdep_jblocks = jblocks; 2639 out: 2640 if (error == 0) { 2641 MNT_ILOCK(mp); 2642 mp->mnt_flag |= MNT_SUJ; 2643 mp->mnt_flag &= ~MNT_SOFTDEP; 2644 MNT_IUNLOCK(mp); 2645 /* 2646 * Only validate the journal contents if the 2647 * filesystem is clean, otherwise we write the logs 2648 * but they'll never be used. If the filesystem was 2649 * still dirty when we mounted it the journal is 2650 * invalid and a new journal can only be valid if it 2651 * starts from a clean mount. 2652 */ 2653 if (fs->fs_clean) { 2654 DIP_SET(ip, i_modrev, fs->fs_mtime); 2655 ip->i_flags |= IN_MODIFIED; 2656 ffs_update(vp, 1); 2657 } 2658 } 2659 vput(vp); 2660 return (error); 2661 } 2662 2663 static void 2664 journal_unmount(mp) 2665 struct mount *mp; 2666 { 2667 struct ufsmount *ump; 2668 2669 ump = VFSTOUFS(mp); 2670 if (ump->softdep_jblocks) 2671 jblocks_destroy(ump->softdep_jblocks); 2672 ump->softdep_jblocks = NULL; 2673 } 2674 2675 /* 2676 * Called when a journal record is ready to be written. Space is allocated 2677 * and the journal entry is created when the journal is flushed to stable 2678 * store. 
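 *
 * A rough sketch of the producer side (hypothetical, condensed from the
 * softdep_setup_*() callers; exact arguments vary by call site): the
 * journal dependency is allocated, queued with add_to_journal() while
 * lk is held, and only turned into an on-disk record later by
 * softdep_process_journal():
 *
 *	jaddref = newjaddref(dp, ip->i_number, diroff, nlink, mode);
 *	ACQUIRE_LOCK(&lk);
 *	add_to_journal(&jaddref->ja_list);
 *	FREE_LOCK(&lk);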
2679 */ 2680 static void 2681 add_to_journal(wk) 2682 struct worklist *wk; 2683 { 2684 struct ufsmount *ump; 2685 2686 mtx_assert(&lk, MA_OWNED); 2687 ump = VFSTOUFS(wk->wk_mp); 2688 if (wk->wk_state & ONWORKLIST) 2689 panic("add_to_journal: %s(0x%X) already on list", 2690 TYPENAME(wk->wk_type), wk->wk_state); 2691 wk->wk_state |= ONWORKLIST | DEPCOMPLETE; 2692 if (LIST_EMPTY(&ump->softdep_journal_pending)) { 2693 ump->softdep_jblocks->jb_age = ticks; 2694 LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list); 2695 } else 2696 LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list); 2697 ump->softdep_journal_tail = wk; 2698 ump->softdep_on_journal += 1; 2699 } 2700 2701 /* 2702 * Remove an arbitrary item from the journal worklist while maintaining 2703 * the tail pointer. This happens when a new operation obviates the need to 2704 * journal an old operation. 2705 */ 2706 static void 2707 remove_from_journal(wk) 2708 struct worklist *wk; 2709 { 2710 struct ufsmount *ump; 2711 2712 mtx_assert(&lk, MA_OWNED); 2713 ump = VFSTOUFS(wk->wk_mp); 2714 #ifdef SUJ_DEBUG 2715 { 2716 struct worklist *wkn; 2717 2718 LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list) 2719 if (wkn == wk) 2720 break; 2721 if (wkn == NULL) 2722 panic("remove_from_journal: %p is not in journal", wk); 2723 } 2724 #endif 2725 /* 2726 * We emulate a TAILQ to save space in most structures which do not 2727 * require TAILQ semantics. Here we must update the tail position 2728 * when removing the tail which is not the final entry. This works 2729 * only if the worklist linkage is at the beginning of the structure. 2730 */ 2731 if (ump->softdep_journal_tail == wk) 2732 ump->softdep_journal_tail = 2733 (struct worklist *)wk->wk_list.le_prev; 2734 2735 WORKLIST_REMOVE(wk); 2736 ump->softdep_on_journal -= 1; 2737 } 2738 2739 /* 2740 * Check for journal space as well as dependency limits so the prelink 2741 * code can throttle both journaled and non-journaled filesystems. 2742 * Threshold is 0 for low and 1 for min. 2743 */ 2744 static int 2745 journal_space(ump, thresh) 2746 struct ufsmount *ump; 2747 int thresh; 2748 { 2749 struct jblocks *jblocks; 2750 int avail; 2751 2752 jblocks = ump->softdep_jblocks; 2753 if (jblocks == NULL) 2754 return (1); 2755 /* 2756 * We use a tighter restriction here to prevent request_cleanup() 2757 * running in threads from running into locks we currently hold.
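 * As a worked example of the space check below, assume the 32-byte
 * record size implied by the softdep_prealloc() comment further down
 * and a journal of 8192 DEV_BSIZE blocks (4MB): journal_mount() then
 * sets jb_low to roughly 2730 blocks and jb_min to roughly 819.  With
 * 10000 records pending, those records cover 10000 * 32 / 512 = 625
 * blocks, so avail = jb_free - 625 and new records are allowed as long
 * as that stays above the requested threshold.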
2758 */ 2759 if (dep_current[D_INODEDEP] > (max_softdeps / 10) * 9) 2760 return (0); 2761 if (thresh) 2762 thresh = jblocks->jb_min; 2763 else 2764 thresh = jblocks->jb_low; 2765 avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; 2766 avail = jblocks->jb_free - avail; 2767 2768 return (avail > thresh); 2769 } 2770 2771 static void 2772 journal_suspend(ump) 2773 struct ufsmount *ump; 2774 { 2775 struct jblocks *jblocks; 2776 struct mount *mp; 2777 2778 mp = UFSTOVFS(ump); 2779 jblocks = ump->softdep_jblocks; 2780 MNT_ILOCK(mp); 2781 if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 2782 stat_journal_min++; 2783 mp->mnt_kern_flag |= MNTK_SUSPEND; 2784 mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc); 2785 } 2786 jblocks->jb_suspended = 1; 2787 MNT_IUNLOCK(mp); 2788 } 2789 2790 static int 2791 journal_unsuspend(struct ufsmount *ump) 2792 { 2793 struct jblocks *jblocks; 2794 struct mount *mp; 2795 2796 mp = UFSTOVFS(ump); 2797 jblocks = ump->softdep_jblocks; 2798 2799 if (jblocks != NULL && jblocks->jb_suspended && 2800 journal_space(ump, jblocks->jb_min)) { 2801 jblocks->jb_suspended = 0; 2802 FREE_LOCK(&lk); 2803 mp->mnt_susp_owner = curthread; 2804 vfs_write_resume(mp); 2805 ACQUIRE_LOCK(&lk); 2806 return (1); 2807 } 2808 return (0); 2809 } 2810 2811 /* 2812 * Called before any allocation function to be certain that there is 2813 * sufficient space in the journal prior to creating any new records. 2814 * Since in the case of block allocation we may have multiple locked 2815 * buffers at the time of the actual allocation we can not block 2816 * when the journal records are created. Doing so would create a deadlock 2817 * if any of these buffers needed to be flushed to reclaim space. Instead 2818 * we require a sufficiently large amount of available space such that 2819 * each thread in the system could have passed this allocation check and 2820 * still have sufficient free space. With 20% of a minimum journal size 2821 * of 1MB we have 6553 records available. 2822 */ 2823 int 2824 softdep_prealloc(vp, waitok) 2825 struct vnode *vp; 2826 int waitok; 2827 { 2828 struct ufsmount *ump; 2829 2830 if (DOINGSUJ(vp) == 0) 2831 return (0); 2832 ump = VFSTOUFS(vp->v_mount); 2833 ACQUIRE_LOCK(&lk); 2834 if (journal_space(ump, 0)) { 2835 FREE_LOCK(&lk); 2836 return (0); 2837 } 2838 stat_journal_low++; 2839 FREE_LOCK(&lk); 2840 if (waitok == MNT_NOWAIT) 2841 return (ENOSPC); 2842 /* 2843 * Attempt to sync this vnode once to flush any journal 2844 * work attached to it. 2845 */ 2846 if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0) 2847 ffs_syncvnode(vp, waitok); 2848 ACQUIRE_LOCK(&lk); 2849 process_removes(vp); 2850 process_truncates(vp); 2851 if (journal_space(ump, 0) == 0) { 2852 softdep_speedup(); 2853 if (journal_space(ump, 1) == 0) 2854 journal_suspend(ump); 2855 } 2856 FREE_LOCK(&lk); 2857 2858 return (0); 2859 } 2860 2861 /* 2862 * Before adjusting a link count on a vnode verify that we have sufficient 2863 * journal space. If not, process operations that depend on the currently 2864 * locked pair of vnodes to try to flush space as the syncer, buf daemon, 2865 * and softdep flush threads can not acquire these locks to reclaim space. 
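 *
 * An illustrative (hypothetical) call site, roughly the shape used by
 * the softdep_setup_*() entry points for directory operations, with
 * both vnodes locked and lk held:
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (DOINGSUJ(dvp))
 *		softdep_prelink(dvp, vp);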
2866 */ 2867 static void 2868 softdep_prelink(dvp, vp) 2869 struct vnode *dvp; 2870 struct vnode *vp; 2871 { 2872 struct ufsmount *ump; 2873 2874 ump = VFSTOUFS(dvp->v_mount); 2875 mtx_assert(&lk, MA_OWNED); 2876 if (journal_space(ump, 0)) 2877 return; 2878 stat_journal_low++; 2879 FREE_LOCK(&lk); 2880 if (vp) 2881 ffs_syncvnode(vp, MNT_NOWAIT); 2882 ffs_syncvnode(dvp, MNT_WAIT); 2883 ACQUIRE_LOCK(&lk); 2884 /* Process vp before dvp as it may create .. removes. */ 2885 if (vp) { 2886 process_removes(vp); 2887 process_truncates(vp); 2888 } 2889 process_removes(dvp); 2890 process_truncates(dvp); 2891 softdep_speedup(); 2892 process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT); 2893 if (journal_space(ump, 0) == 0) { 2894 softdep_speedup(); 2895 if (journal_space(ump, 1) == 0) 2896 journal_suspend(ump); 2897 } 2898 } 2899 2900 static void 2901 jseg_write(ump, jseg, data) 2902 struct ufsmount *ump; 2903 struct jseg *jseg; 2904 uint8_t *data; 2905 { 2906 struct jsegrec *rec; 2907 2908 rec = (struct jsegrec *)data; 2909 rec->jsr_seq = jseg->js_seq; 2910 rec->jsr_oldest = jseg->js_oldseq; 2911 rec->jsr_cnt = jseg->js_cnt; 2912 rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize; 2913 rec->jsr_crc = 0; 2914 rec->jsr_time = ump->um_fs->fs_mtime; 2915 } 2916 2917 static inline void 2918 inoref_write(inoref, jseg, rec) 2919 struct inoref *inoref; 2920 struct jseg *jseg; 2921 struct jrefrec *rec; 2922 { 2923 2924 inoref->if_jsegdep->jd_seg = jseg; 2925 rec->jr_ino = inoref->if_ino; 2926 rec->jr_parent = inoref->if_parent; 2927 rec->jr_nlink = inoref->if_nlink; 2928 rec->jr_mode = inoref->if_mode; 2929 rec->jr_diroff = inoref->if_diroff; 2930 } 2931 2932 static void 2933 jaddref_write(jaddref, jseg, data) 2934 struct jaddref *jaddref; 2935 struct jseg *jseg; 2936 uint8_t *data; 2937 { 2938 struct jrefrec *rec; 2939 2940 rec = (struct jrefrec *)data; 2941 rec->jr_op = JOP_ADDREF; 2942 inoref_write(&jaddref->ja_ref, jseg, rec); 2943 } 2944 2945 static void 2946 jremref_write(jremref, jseg, data) 2947 struct jremref *jremref; 2948 struct jseg *jseg; 2949 uint8_t *data; 2950 { 2951 struct jrefrec *rec; 2952 2953 rec = (struct jrefrec *)data; 2954 rec->jr_op = JOP_REMREF; 2955 inoref_write(&jremref->jr_ref, jseg, rec); 2956 } 2957 2958 static void 2959 jmvref_write(jmvref, jseg, data) 2960 struct jmvref *jmvref; 2961 struct jseg *jseg; 2962 uint8_t *data; 2963 { 2964 struct jmvrec *rec; 2965 2966 rec = (struct jmvrec *)data; 2967 rec->jm_op = JOP_MVREF; 2968 rec->jm_ino = jmvref->jm_ino; 2969 rec->jm_parent = jmvref->jm_parent; 2970 rec->jm_oldoff = jmvref->jm_oldoff; 2971 rec->jm_newoff = jmvref->jm_newoff; 2972 } 2973 2974 static void 2975 jnewblk_write(jnewblk, jseg, data) 2976 struct jnewblk *jnewblk; 2977 struct jseg *jseg; 2978 uint8_t *data; 2979 { 2980 struct jblkrec *rec; 2981 2982 jnewblk->jn_jsegdep->jd_seg = jseg; 2983 rec = (struct jblkrec *)data; 2984 rec->jb_op = JOP_NEWBLK; 2985 rec->jb_ino = jnewblk->jn_ino; 2986 rec->jb_blkno = jnewblk->jn_blkno; 2987 rec->jb_lbn = jnewblk->jn_lbn; 2988 rec->jb_frags = jnewblk->jn_frags; 2989 rec->jb_oldfrags = jnewblk->jn_oldfrags; 2990 } 2991 2992 static void 2993 jfreeblk_write(jfreeblk, jseg, data) 2994 struct jfreeblk *jfreeblk; 2995 struct jseg *jseg; 2996 uint8_t *data; 2997 { 2998 struct jblkrec *rec; 2999 3000 jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg; 3001 rec = (struct jblkrec *)data; 3002 rec->jb_op = JOP_FREEBLK; 3003 rec->jb_ino = jfreeblk->jf_ino; 3004 rec->jb_blkno = jfreeblk->jf_blkno; 3005 rec->jb_lbn = jfreeblk->jf_lbn; 3006 
rec->jb_frags = jfreeblk->jf_frags; 3007 rec->jb_oldfrags = 0; 3008 } 3009 3010 static void 3011 jfreefrag_write(jfreefrag, jseg, data) 3012 struct jfreefrag *jfreefrag; 3013 struct jseg *jseg; 3014 uint8_t *data; 3015 { 3016 struct jblkrec *rec; 3017 3018 jfreefrag->fr_jsegdep->jd_seg = jseg; 3019 rec = (struct jblkrec *)data; 3020 rec->jb_op = JOP_FREEBLK; 3021 rec->jb_ino = jfreefrag->fr_ino; 3022 rec->jb_blkno = jfreefrag->fr_blkno; 3023 rec->jb_lbn = jfreefrag->fr_lbn; 3024 rec->jb_frags = jfreefrag->fr_frags; 3025 rec->jb_oldfrags = 0; 3026 } 3027 3028 static void 3029 jtrunc_write(jtrunc, jseg, data) 3030 struct jtrunc *jtrunc; 3031 struct jseg *jseg; 3032 uint8_t *data; 3033 { 3034 struct jtrncrec *rec; 3035 3036 jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg; 3037 rec = (struct jtrncrec *)data; 3038 rec->jt_op = JOP_TRUNC; 3039 rec->jt_ino = jtrunc->jt_ino; 3040 rec->jt_size = jtrunc->jt_size; 3041 rec->jt_extsize = jtrunc->jt_extsize; 3042 } 3043 3044 static void 3045 jfsync_write(jfsync, jseg, data) 3046 struct jfsync *jfsync; 3047 struct jseg *jseg; 3048 uint8_t *data; 3049 { 3050 struct jtrncrec *rec; 3051 3052 rec = (struct jtrncrec *)data; 3053 rec->jt_op = JOP_SYNC; 3054 rec->jt_ino = jfsync->jfs_ino; 3055 rec->jt_size = jfsync->jfs_size; 3056 rec->jt_extsize = jfsync->jfs_extsize; 3057 } 3058 3059 static void 3060 softdep_flushjournal(mp) 3061 struct mount *mp; 3062 { 3063 struct jblocks *jblocks; 3064 struct ufsmount *ump; 3065 3066 if (MOUNTEDSUJ(mp) == 0) 3067 return; 3068 ump = VFSTOUFS(mp); 3069 jblocks = ump->softdep_jblocks; 3070 ACQUIRE_LOCK(&lk); 3071 while (ump->softdep_on_journal) { 3072 jblocks->jb_needseg = 1; 3073 softdep_process_journal(mp, NULL, MNT_WAIT); 3074 } 3075 FREE_LOCK(&lk); 3076 } 3077 3078 /* 3079 * Flush some journal records to disk. 3080 */ 3081 static void 3082 softdep_process_journal(mp, needwk, flags) 3083 struct mount *mp; 3084 struct worklist *needwk; 3085 int flags; 3086 { 3087 struct jblocks *jblocks; 3088 struct ufsmount *ump; 3089 struct worklist *wk; 3090 struct jseg *jseg; 3091 struct buf *bp; 3092 uint8_t *data; 3093 struct fs *fs; 3094 int segwritten; 3095 int jrecmin; /* Minimum records per block. */ 3096 int jrecmax; /* Maximum records per block. */ 3097 int size; 3098 int cnt; 3099 int off; 3100 int devbsize; 3101 3102 if (MOUNTEDSUJ(mp) == 0) 3103 return; 3104 ump = VFSTOUFS(mp); 3105 fs = ump->um_fs; 3106 jblocks = ump->softdep_jblocks; 3107 devbsize = ump->um_devvp->v_bufobj.bo_bsize; 3108 /* 3109 * We write anywhere between a disk block and fs block. The upper 3110 * bound is picked to prevent buffer cache fragmentation and limit 3111 * processing time per I/O. 3112 */ 3113 jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */ 3114 jrecmax = (fs->fs_bsize / devbsize) * jrecmin; 3115 segwritten = 0; 3116 for (;;) { 3117 cnt = ump->softdep_on_journal; 3118 /* 3119 * Criteria for writing a segment: 3120 * 1) We have a full block. 3121 * 2) We're called from jwait() and haven't found the 3122 * journal item yet. 3123 * 3) Always write if needseg is set. 3124 * 4) If we are called from process_worklist and have 3125 * not yet written anything we write a partial block 3126 * to enforce a 1 second maximum latency on journal 3127 * entries. 3128 */ 3129 if (cnt < (jrecmax - 1) && needwk == NULL && 3130 jblocks->jb_needseg == 0 && (segwritten || cnt == 0)) 3131 break; 3132 cnt++; 3133 /* 3134 * Verify some free journal space. 
softdep_prealloc() should 3135 * guarantee that we don't run out so this is indicative of 3136 * a problem with the flow control. Try to recover 3137 * gracefully in any event. 3138 */ 3139 while (jblocks->jb_free == 0) { 3140 if (flags != MNT_WAIT) 3141 break; 3142 printf("softdep: Out of journal space!\n"); 3143 softdep_speedup(); 3144 msleep(jblocks, &lk, PRIBIO, "jblocks", hz); 3145 } 3146 FREE_LOCK(&lk); 3147 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS); 3148 workitem_alloc(&jseg->js_list, D_JSEG, mp); 3149 LIST_INIT(&jseg->js_entries); 3150 LIST_INIT(&jseg->js_indirs); 3151 jseg->js_state = ATTACHED; 3152 jseg->js_jblocks = jblocks; 3153 bp = geteblk(fs->fs_bsize, 0); 3154 ACQUIRE_LOCK(&lk); 3155 /* 3156 * If there was a race while we were allocating the block 3157 * and jseg the entry we care about was likely written. 3158 * We bail out in both the WAIT and NOWAIT case and assume 3159 * the caller will loop if the entry it cares about is 3160 * not written. 3161 */ 3162 cnt = ump->softdep_on_journal; 3163 if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) { 3164 bp->b_flags |= B_INVAL | B_NOCACHE; 3165 WORKITEM_FREE(jseg, D_JSEG); 3166 FREE_LOCK(&lk); 3167 brelse(bp); 3168 ACQUIRE_LOCK(&lk); 3169 break; 3170 } 3171 /* 3172 * Calculate the disk block size required for the available 3173 * records rounded to the min size. 3174 */ 3175 if (cnt == 0) 3176 size = devbsize; 3177 else if (cnt < jrecmax) 3178 size = howmany(cnt, jrecmin) * devbsize; 3179 else 3180 size = fs->fs_bsize; 3181 /* 3182 * Allocate a disk block for this journal data and account 3183 * for truncation of the requested size if enough contiguous 3184 * space was not available. 3185 */ 3186 bp->b_blkno = jblocks_alloc(jblocks, size, &size); 3187 bp->b_lblkno = bp->b_blkno; 3188 bp->b_offset = bp->b_blkno * DEV_BSIZE; 3189 bp->b_bcount = size; 3190 bp->b_bufobj = &ump->um_devvp->v_bufobj; 3191 bp->b_flags &= ~B_INVAL; 3192 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY; 3193 /* 3194 * Initialize our jseg with cnt records. Assign the next 3195 * sequence number to it and link it in-order. 3196 */ 3197 cnt = MIN(cnt, (size / devbsize) * jrecmin); 3198 jseg->js_buf = bp; 3199 jseg->js_cnt = cnt; 3200 jseg->js_refs = cnt + 1; /* Self ref. */ 3201 jseg->js_size = size; 3202 jseg->js_seq = jblocks->jb_nextseq++; 3203 if (jblocks->jb_oldestseg == NULL) 3204 jblocks->jb_oldestseg = jseg; 3205 jseg->js_oldseq = jblocks->jb_oldestseg->js_seq; 3206 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); 3207 if (jblocks->jb_writeseg == NULL) 3208 jblocks->jb_writeseg = jseg; 3209 /* 3210 * Start filling in records from the pending list. 3211 */ 3212 data = bp->b_data; 3213 off = 0; 3214 while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) 3215 != NULL) { 3216 if (cnt == 0) 3217 break; 3218 /* Place a segment header on every device block. 
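 * The header written by jseg_write() repeats the segment's sequence
 * information, so each DEV_BSIZE unit of the segment is self-describing
 * and an incompletely written multi-block segment can be recognized
 * when the journal is later scanned.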
*/ 3219 if ((off % devbsize) == 0) { 3220 jseg_write(ump, jseg, data); 3221 off += JREC_SIZE; 3222 data = bp->b_data + off; 3223 } 3224 if (wk == needwk) 3225 needwk = NULL; 3226 remove_from_journal(wk); 3227 wk->wk_state |= INPROGRESS; 3228 WORKLIST_INSERT(&jseg->js_entries, wk); 3229 switch (wk->wk_type) { 3230 case D_JADDREF: 3231 jaddref_write(WK_JADDREF(wk), jseg, data); 3232 break; 3233 case D_JREMREF: 3234 jremref_write(WK_JREMREF(wk), jseg, data); 3235 break; 3236 case D_JMVREF: 3237 jmvref_write(WK_JMVREF(wk), jseg, data); 3238 break; 3239 case D_JNEWBLK: 3240 jnewblk_write(WK_JNEWBLK(wk), jseg, data); 3241 break; 3242 case D_JFREEBLK: 3243 jfreeblk_write(WK_JFREEBLK(wk), jseg, data); 3244 break; 3245 case D_JFREEFRAG: 3246 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data); 3247 break; 3248 case D_JTRUNC: 3249 jtrunc_write(WK_JTRUNC(wk), jseg, data); 3250 break; 3251 case D_JFSYNC: 3252 jfsync_write(WK_JFSYNC(wk), jseg, data); 3253 break; 3254 default: 3255 panic("process_journal: Unknown type %s", 3256 TYPENAME(wk->wk_type)); 3257 /* NOTREACHED */ 3258 } 3259 off += JREC_SIZE; 3260 data = bp->b_data + off; 3261 cnt--; 3262 } 3263 /* 3264 * Write this one buffer and continue. 3265 */ 3266 segwritten = 1; 3267 jblocks->jb_needseg = 0; 3268 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); 3269 FREE_LOCK(&lk); 3270 BO_LOCK(bp->b_bufobj); 3271 bgetvp(ump->um_devvp, bp); 3272 BO_UNLOCK(bp->b_bufobj); 3273 /* 3274 * We only do the blocking wait once we find the journal 3275 * entry we're looking for. 3276 */ 3277 if (needwk == NULL && flags == MNT_WAIT) 3278 bwrite(bp); 3279 else 3280 bawrite(bp); 3281 ACQUIRE_LOCK(&lk); 3282 } 3283 /* 3284 * If we've suspended the filesystem because we ran out of journal 3285 * space either try to sync it here to make some progress or 3286 * unsuspend it if we already have. 3287 */ 3288 if (flags == 0 && jblocks->jb_suspended) { 3289 if (journal_unsuspend(ump)) 3290 return; 3291 FREE_LOCK(&lk); 3292 VFS_SYNC(mp, MNT_NOWAIT); 3293 ffs_sbupdate(ump, MNT_WAIT, 0); 3294 ACQUIRE_LOCK(&lk); 3295 } 3296 } 3297 3298 /* 3299 * Complete a jseg, allowing all dependencies awaiting journal writes 3300 * to proceed. Each journal dependency also attaches a jsegdep to dependent 3301 * structures so that the journal segment can be freed to reclaim space. 3302 */ 3303 static void 3304 complete_jseg(jseg) 3305 struct jseg *jseg; 3306 { 3307 struct worklist *wk; 3308 struct jmvref *jmvref; 3309 int waiting; 3310 #ifdef INVARIANTS 3311 int i = 0; 3312 #endif 3313 3314 while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { 3315 WORKLIST_REMOVE(wk); 3316 waiting = wk->wk_state & IOWAITING; 3317 wk->wk_state &= ~(INPROGRESS | IOWAITING); 3318 wk->wk_state |= COMPLETE; 3319 KASSERT(i++ < jseg->js_cnt, 3320 ("handle_written_jseg: overflow %d >= %d", 3321 i - 1, jseg->js_cnt)); 3322 switch (wk->wk_type) { 3323 case D_JADDREF: 3324 handle_written_jaddref(WK_JADDREF(wk)); 3325 break; 3326 case D_JREMREF: 3327 handle_written_jremref(WK_JREMREF(wk)); 3328 break; 3329 case D_JMVREF: 3330 rele_jseg(jseg); /* No jsegdep. 
*/ 3331 jmvref = WK_JMVREF(wk); 3332 LIST_REMOVE(jmvref, jm_deps); 3333 if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0) 3334 free_pagedep(jmvref->jm_pagedep); 3335 WORKITEM_FREE(jmvref, D_JMVREF); 3336 break; 3337 case D_JNEWBLK: 3338 handle_written_jnewblk(WK_JNEWBLK(wk)); 3339 break; 3340 case D_JFREEBLK: 3341 handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep); 3342 break; 3343 case D_JTRUNC: 3344 handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep); 3345 break; 3346 case D_JFSYNC: 3347 rele_jseg(jseg); /* No jsegdep. */ 3348 WORKITEM_FREE(wk, D_JFSYNC); 3349 break; 3350 case D_JFREEFRAG: 3351 handle_written_jfreefrag(WK_JFREEFRAG(wk)); 3352 break; 3353 default: 3354 panic("handle_written_jseg: Unknown type %s", 3355 TYPENAME(wk->wk_type)); 3356 /* NOTREACHED */ 3357 } 3358 if (waiting) 3359 wakeup(wk); 3360 } 3361 /* Release the self reference so the structure may be freed. */ 3362 rele_jseg(jseg); 3363 } 3364 3365 /* 3366 * Mark a jseg as DEPCOMPLETE and throw away the buffer. Handle jseg 3367 * completions in order only. 3368 */ 3369 static void 3370 handle_written_jseg(jseg, bp) 3371 struct jseg *jseg; 3372 struct buf *bp; 3373 { 3374 struct jblocks *jblocks; 3375 struct jseg *jsegn; 3376 3377 if (jseg->js_refs == 0) 3378 panic("handle_written_jseg: No self-reference on %p", jseg); 3379 jseg->js_state |= DEPCOMPLETE; 3380 /* 3381 * We'll never need this buffer again, set flags so it will be 3382 * discarded. 3383 */ 3384 bp->b_flags |= B_INVAL | B_NOCACHE; 3385 jblocks = jseg->js_jblocks; 3386 /* 3387 * Don't allow out of order completions. If this isn't the first 3388 * block wait for it to write before we're done. 3389 */ 3390 if (jseg != jblocks->jb_writeseg) 3391 return; 3392 /* Iterate through available jsegs processing their entries. */ 3393 do { 3394 jblocks->jb_oldestwrseq = jseg->js_oldseq; 3395 jsegn = TAILQ_NEXT(jseg, js_next); 3396 complete_jseg(jseg); 3397 jseg = jsegn; 3398 } while (jseg && jseg->js_state & DEPCOMPLETE); 3399 jblocks->jb_writeseg = jseg; 3400 /* 3401 * Attempt to free jsegs now that oldestwrseq may have advanced. 3402 */ 3403 free_jsegs(jblocks); 3404 } 3405 3406 static inline struct jsegdep * 3407 inoref_jseg(inoref) 3408 struct inoref *inoref; 3409 { 3410 struct jsegdep *jsegdep; 3411 3412 jsegdep = inoref->if_jsegdep; 3413 inoref->if_jsegdep = NULL; 3414 3415 return (jsegdep); 3416 } 3417 3418 /* 3419 * Called once a jremref has made it to stable store. The jremref is marked 3420 * complete and we attempt to free it. Any pagedeps writes sleeping waiting 3421 * for the jremref to complete will be awoken by free_jremref. 3422 */ 3423 static void 3424 handle_written_jremref(jremref) 3425 struct jremref *jremref; 3426 { 3427 struct inodedep *inodedep; 3428 struct jsegdep *jsegdep; 3429 struct dirrem *dirrem; 3430 3431 /* Grab the jsegdep. */ 3432 jsegdep = inoref_jseg(&jremref->jr_ref); 3433 /* 3434 * Remove us from the inoref list. 3435 */ 3436 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 3437 0, &inodedep) == 0) 3438 panic("handle_written_jremref: Lost inodedep"); 3439 TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 3440 /* 3441 * Complete the dirrem. 
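 * The dirrem only moves to the work queue once its last pending
 * jremref has been written (dm_jremrefhd empty) and it has otherwise
 * been marked COMPLETE, which normally happens when the directory
 * block write finishes; the jsegdep is parked on dm_jwork so the
 * journal space is not reclaimed before the removal itself commits.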
3442 */ 3443 dirrem = jremref->jr_dirrem; 3444 jremref->jr_dirrem = NULL; 3445 LIST_REMOVE(jremref, jr_deps); 3446 jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; 3447 jwork_insert(&dirrem->dm_jwork, jsegdep); 3448 if (LIST_EMPTY(&dirrem->dm_jremrefhd) && 3449 (dirrem->dm_state & COMPLETE) != 0) 3450 add_to_worklist(&dirrem->dm_list, 0); 3451 free_jremref(jremref); 3452 } 3453 3454 /* 3455 * Called once a jaddref has made it to stable store. The dependency is 3456 * marked complete and any dependent structures are added to the inode 3457 * bufwait list to be completed as soon as it is written. If a bitmap write 3458 * depends on this entry we move the inode into the inodedephd of the 3459 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. 3460 */ 3461 static void 3462 handle_written_jaddref(jaddref) 3463 struct jaddref *jaddref; 3464 { 3465 struct jsegdep *jsegdep; 3466 struct inodedep *inodedep; 3467 struct diradd *diradd; 3468 struct mkdir *mkdir; 3469 3470 /* Grab the jsegdep. */ 3471 jsegdep = inoref_jseg(&jaddref->ja_ref); 3472 mkdir = NULL; 3473 diradd = NULL; 3474 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 3475 0, &inodedep) == 0) 3476 panic("handle_written_jaddref: Lost inodedep."); 3477 if (jaddref->ja_diradd == NULL) 3478 panic("handle_written_jaddref: No dependency"); 3479 if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { 3480 diradd = jaddref->ja_diradd; 3481 WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); 3482 } else if (jaddref->ja_state & MKDIR_PARENT) { 3483 mkdir = jaddref->ja_mkdir; 3484 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); 3485 } else if (jaddref->ja_state & MKDIR_BODY) 3486 mkdir = jaddref->ja_mkdir; 3487 else 3488 panic("handle_written_jaddref: Unknown dependency %p", 3489 jaddref->ja_diradd); 3490 jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ 3491 /* 3492 * Remove us from the inode list. 3493 */ 3494 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); 3495 /* 3496 * The mkdir may be waiting on the jaddref to clear before freeing. 3497 */ 3498 if (mkdir) { 3499 KASSERT(mkdir->md_list.wk_type == D_MKDIR, 3500 ("handle_written_jaddref: Incorrect type for mkdir %s", 3501 TYPENAME(mkdir->md_list.wk_type))); 3502 mkdir->md_jaddref = NULL; 3503 diradd = mkdir->md_diradd; 3504 mkdir->md_state |= DEPCOMPLETE; 3505 complete_mkdir(mkdir); 3506 } 3507 jwork_insert(&diradd->da_jwork, jsegdep); 3508 if (jaddref->ja_state & NEWBLOCK) { 3509 inodedep->id_state |= ONDEPLIST; 3510 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, 3511 inodedep, id_deps); 3512 } 3513 free_jaddref(jaddref); 3514 } 3515 3516 /* 3517 * Called once a jnewblk journal is written. The allocdirect or allocindir 3518 * is placed in the bmsafemap to await notification of a written bitmap. If 3519 * the operation was canceled we add the segdep to the appropriate 3520 * dependency to free the journal space once the canceling operation 3521 * completes. 3522 */ 3523 static void 3524 handle_written_jnewblk(jnewblk) 3525 struct jnewblk *jnewblk; 3526 { 3527 struct bmsafemap *bmsafemap; 3528 struct freefrag *freefrag; 3529 struct freework *freework; 3530 struct jsegdep *jsegdep; 3531 struct newblk *newblk; 3532 3533 /* Grab the jsegdep. 
*/ 3534 jsegdep = jnewblk->jn_jsegdep; 3535 jnewblk->jn_jsegdep = NULL; 3536 if (jnewblk->jn_dep == NULL) 3537 panic("handle_written_jnewblk: No dependency for the segdep."); 3538 switch (jnewblk->jn_dep->wk_type) { 3539 case D_NEWBLK: 3540 case D_ALLOCDIRECT: 3541 case D_ALLOCINDIR: 3542 /* 3543 * Add the written block to the bmsafemap so it can 3544 * be notified when the bitmap is on disk. 3545 */ 3546 newblk = WK_NEWBLK(jnewblk->jn_dep); 3547 newblk->nb_jnewblk = NULL; 3548 if ((newblk->nb_state & GOINGAWAY) == 0) { 3549 bmsafemap = newblk->nb_bmsafemap; 3550 newblk->nb_state |= ONDEPLIST; 3551 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, 3552 nb_deps); 3553 } 3554 jwork_insert(&newblk->nb_jwork, jsegdep); 3555 break; 3556 case D_FREEFRAG: 3557 /* 3558 * A newblock being removed by a freefrag when replaced by 3559 * frag extension. 3560 */ 3561 freefrag = WK_FREEFRAG(jnewblk->jn_dep); 3562 freefrag->ff_jdep = NULL; 3563 WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list); 3564 break; 3565 case D_FREEWORK: 3566 /* 3567 * A direct block was removed by truncate. 3568 */ 3569 freework = WK_FREEWORK(jnewblk->jn_dep); 3570 freework->fw_jnewblk = NULL; 3571 WORKLIST_INSERT(&freework->fw_freeblks->fb_jwork, 3572 &jsegdep->jd_list); 3573 break; 3574 default: 3575 panic("handle_written_jnewblk: Unknown type %d.", 3576 jnewblk->jn_dep->wk_type); 3577 } 3578 jnewblk->jn_dep = NULL; 3579 free_jnewblk(jnewblk); 3580 } 3581 3582 /* 3583 * Cancel a jfreefrag that won't be needed, probably due to colliding with 3584 * an in-flight allocation that has not yet been committed. Divorce us 3585 * from the freefrag and mark it DEPCOMPLETE so that it may be added 3586 * to the worklist. 3587 */ 3588 static void 3589 cancel_jfreefrag(jfreefrag) 3590 struct jfreefrag *jfreefrag; 3591 { 3592 struct freefrag *freefrag; 3593 3594 if (jfreefrag->fr_jsegdep) { 3595 free_jsegdep(jfreefrag->fr_jsegdep); 3596 jfreefrag->fr_jsegdep = NULL; 3597 } 3598 freefrag = jfreefrag->fr_freefrag; 3599 jfreefrag->fr_freefrag = NULL; 3600 free_jfreefrag(jfreefrag); 3601 freefrag->ff_state |= DEPCOMPLETE; 3602 } 3603 3604 /* 3605 * Free a jfreefrag when the parent freefrag is rendered obsolete. 3606 */ 3607 static void 3608 free_jfreefrag(jfreefrag) 3609 struct jfreefrag *jfreefrag; 3610 { 3611 3612 if (jfreefrag->fr_state & INPROGRESS) 3613 WORKLIST_REMOVE(&jfreefrag->fr_list); 3614 else if (jfreefrag->fr_state & ONWORKLIST) 3615 remove_from_journal(&jfreefrag->fr_list); 3616 if (jfreefrag->fr_freefrag != NULL) 3617 panic("free_jfreefrag: Still attached to a freefrag."); 3618 WORKITEM_FREE(jfreefrag, D_JFREEFRAG); 3619 } 3620 3621 /* 3622 * Called when the journal write for a jfreefrag completes. The parent 3623 * freefrag is added to the worklist if this completes its dependencies. 3624 */ 3625 static void 3626 handle_written_jfreefrag(jfreefrag) 3627 struct jfreefrag *jfreefrag; 3628 { 3629 struct jsegdep *jsegdep; 3630 struct freefrag *freefrag; 3631 3632 /* Grab the jsegdep. 
*/ 3633 jsegdep = jfreefrag->fr_jsegdep; 3634 jfreefrag->fr_jsegdep = NULL; 3635 freefrag = jfreefrag->fr_freefrag; 3636 if (freefrag == NULL) 3637 panic("handle_written_jfreefrag: No freefrag."); 3638 freefrag->ff_state |= DEPCOMPLETE; 3639 freefrag->ff_jdep = NULL; 3640 jwork_insert(&freefrag->ff_jwork, jsegdep); 3641 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 3642 add_to_worklist(&freefrag->ff_list, 0); 3643 jfreefrag->fr_freefrag = NULL; 3644 free_jfreefrag(jfreefrag); 3645 } 3646 3647 /* 3648 * Called when the journal write for a jfreeblk completes. The jfreeblk 3649 * is removed from the freeblks list of pending journal writes and the 3650 * jsegdep is moved to the freeblks jwork to be completed when all blocks 3651 * have been reclaimed. 3652 */ 3653 static void 3654 handle_written_jblkdep(jblkdep) 3655 struct jblkdep *jblkdep; 3656 { 3657 struct freeblks *freeblks; 3658 struct jsegdep *jsegdep; 3659 3660 /* Grab the jsegdep. */ 3661 jsegdep = jblkdep->jb_jsegdep; 3662 jblkdep->jb_jsegdep = NULL; 3663 freeblks = jblkdep->jb_freeblks; 3664 LIST_REMOVE(jblkdep, jb_deps); 3665 WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list); 3666 /* 3667 * If the freeblks is all journaled, we can add it to the worklist. 3668 */ 3669 if (LIST_EMPTY(&freeblks->fb_jblkdephd) && 3670 (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 3671 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 3672 3673 free_jblkdep(jblkdep); 3674 } 3675 3676 static struct jsegdep * 3677 newjsegdep(struct worklist *wk) 3678 { 3679 struct jsegdep *jsegdep; 3680 3681 jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS); 3682 workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp); 3683 jsegdep->jd_seg = NULL; 3684 3685 return (jsegdep); 3686 } 3687 3688 static struct jmvref * 3689 newjmvref(dp, ino, oldoff, newoff) 3690 struct inode *dp; 3691 ino_t ino; 3692 off_t oldoff; 3693 off_t newoff; 3694 { 3695 struct jmvref *jmvref; 3696 3697 jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS); 3698 workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump)); 3699 jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE; 3700 jmvref->jm_parent = dp->i_number; 3701 jmvref->jm_ino = ino; 3702 jmvref->jm_oldoff = oldoff; 3703 jmvref->jm_newoff = newoff; 3704 3705 return (jmvref); 3706 } 3707 3708 /* 3709 * Allocate a new jremref that tracks the removal of ip from dp with the 3710 * directory entry offset of diroff. Mark the entry as ATTACHED and 3711 * DEPCOMPLETE as we have all the information required for the journal write 3712 * and the directory has already been removed from the buffer. The caller 3713 * is responsible for linking the jremref into the pagedep and adding it 3714 * to the journal to write. The MKDIR_PARENT flag is set if we're doing 3715 * a DOTDOT addition so handle_workitem_remove() can properly assign 3716 * the jsegdep when we're done. 
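 *
 * A condensed (hypothetical) sketch of the caller's side of that
 * contract:
 *
 *	jremref = newjremref(dirrem, dp, ip, diroff, nlink);
 *	... link jremref into the pagedep ...
 *	add_to_journal(&jremref->jr_list);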
3717 */ 3718 static struct jremref * 3719 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip, 3720 off_t diroff, nlink_t nlink) 3721 { 3722 struct jremref *jremref; 3723 3724 jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS); 3725 workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump)); 3726 jremref->jr_state = ATTACHED; 3727 newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff, 3728 nlink, ip->i_mode); 3729 jremref->jr_dirrem = dirrem; 3730 3731 return (jremref); 3732 } 3733 3734 static inline void 3735 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff, 3736 nlink_t nlink, uint16_t mode) 3737 { 3738 3739 inoref->if_jsegdep = newjsegdep(&inoref->if_list); 3740 inoref->if_diroff = diroff; 3741 inoref->if_ino = ino; 3742 inoref->if_parent = parent; 3743 inoref->if_nlink = nlink; 3744 inoref->if_mode = mode; 3745 } 3746 3747 /* 3748 * Allocate a new jaddref to track the addition of ino to dp at diroff. The 3749 * directory offset may not be known until later. The caller is responsible 3750 * adding the entry to the journal when this information is available. nlink 3751 * should be the link count prior to the addition and mode is only required 3752 * to have the correct FMT. 3753 */ 3754 static struct jaddref * 3755 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink, 3756 uint16_t mode) 3757 { 3758 struct jaddref *jaddref; 3759 3760 jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS); 3761 workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump)); 3762 jaddref->ja_state = ATTACHED; 3763 jaddref->ja_mkdir = NULL; 3764 newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode); 3765 3766 return (jaddref); 3767 } 3768 3769 /* 3770 * Create a new free dependency for a freework. The caller is responsible 3771 * for adjusting the reference count when it has the lock held. The freedep 3772 * will track an outstanding bitmap write that will ultimately clear the 3773 * freework to continue. 3774 */ 3775 static struct freedep * 3776 newfreedep(struct freework *freework) 3777 { 3778 struct freedep *freedep; 3779 3780 freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); 3781 workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); 3782 freedep->fd_freework = freework; 3783 3784 return (freedep); 3785 } 3786 3787 /* 3788 * Free a freedep structure once the buffer it is linked to is written. If 3789 * this is the last reference to the freework schedule it for completion. 3790 */ 3791 static void 3792 free_freedep(freedep) 3793 struct freedep *freedep; 3794 { 3795 struct freework *freework; 3796 3797 freework = freedep->fd_freework; 3798 freework->fw_freeblks->fb_cgwait--; 3799 if (--freework->fw_ref == 0) 3800 freework_enqueue(freework); 3801 WORKITEM_FREE(freedep, D_FREEDEP); 3802 } 3803 3804 /* 3805 * Allocate a new freework structure that may be a level in an indirect 3806 * when parent is not NULL or a top level block when it is. The top level 3807 * freework structures are allocated without lk held and before the freeblks 3808 * is visible outside of softdep_setup_freeblocks(). 
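 *
 * Note on the reference count set below: on a journaled mount an
 * indirect level (lbn below -NXADDR) starts with NINDIR(fs) + 1
 * references, in effect one per block pointer it holds plus one for
 * the freework itself, so it is only torn down after every child has
 * been handled; direct blocks and non-journaled mounts start at zero.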
3809 */ 3810 static struct freework * 3811 newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal) 3812 struct ufsmount *ump; 3813 struct freeblks *freeblks; 3814 struct freework *parent; 3815 ufs_lbn_t lbn; 3816 ufs2_daddr_t nb; 3817 int frags; 3818 int off; 3819 int journal; 3820 { 3821 struct freework *freework; 3822 3823 freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); 3824 workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); 3825 freework->fw_state = ATTACHED; 3826 freework->fw_jnewblk = NULL; 3827 freework->fw_freeblks = freeblks; 3828 freework->fw_parent = parent; 3829 freework->fw_lbn = lbn; 3830 freework->fw_blkno = nb; 3831 freework->fw_frags = frags; 3832 freework->fw_indir = NULL; 3833 freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR) 3834 ? 0 : NINDIR(ump->um_fs) + 1; 3835 freework->fw_start = freework->fw_off = off; 3836 if (journal) 3837 newjfreeblk(freeblks, lbn, nb, frags); 3838 if (parent == NULL) { 3839 ACQUIRE_LOCK(&lk); 3840 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); 3841 freeblks->fb_ref++; 3842 FREE_LOCK(&lk); 3843 } 3844 3845 return (freework); 3846 } 3847 3848 /* 3849 * Eliminate a jfreeblk for a block that does not need journaling. 3850 */ 3851 static void 3852 cancel_jfreeblk(freeblks, blkno) 3853 struct freeblks *freeblks; 3854 ufs2_daddr_t blkno; 3855 { 3856 struct jfreeblk *jfreeblk; 3857 struct jblkdep *jblkdep; 3858 3859 LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) { 3860 if (jblkdep->jb_list.wk_type != D_JFREEBLK) 3861 continue; 3862 jfreeblk = WK_JFREEBLK(&jblkdep->jb_list); 3863 if (jfreeblk->jf_blkno == blkno) 3864 break; 3865 } 3866 if (jblkdep == NULL) 3867 return; 3868 free_jsegdep(jblkdep->jb_jsegdep); 3869 LIST_REMOVE(jblkdep, jb_deps); 3870 WORKITEM_FREE(jfreeblk, D_JFREEBLK); 3871 } 3872 3873 /* 3874 * Allocate a new jfreeblk to journal top level block pointer when truncating 3875 * a file. The caller must add this to the worklist when lk is held. 3876 */ 3877 static struct jfreeblk * 3878 newjfreeblk(freeblks, lbn, blkno, frags) 3879 struct freeblks *freeblks; 3880 ufs_lbn_t lbn; 3881 ufs2_daddr_t blkno; 3882 int frags; 3883 { 3884 struct jfreeblk *jfreeblk; 3885 3886 jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); 3887 workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK, 3888 freeblks->fb_list.wk_mp); 3889 jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list); 3890 jfreeblk->jf_dep.jb_freeblks = freeblks; 3891 jfreeblk->jf_ino = freeblks->fb_inum; 3892 jfreeblk->jf_lbn = lbn; 3893 jfreeblk->jf_blkno = blkno; 3894 jfreeblk->jf_frags = frags; 3895 LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps); 3896 3897 return (jfreeblk); 3898 } 3899 3900 /* 3901 * Allocate a new jtrunc to track a partial truncation. 
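 * The record captures the inode number and the target data and external
 * attribute sizes; like a jfreeblk it is linked onto the freeblks'
 * fb_jblkdephd with a jsegdep that is handed over to the freeblks once
 * the journal write completes (see handle_written_jblkdep() above).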
3902 */ 3903 static struct jtrunc * 3904 newjtrunc(freeblks, size, extsize) 3905 struct freeblks *freeblks; 3906 off_t size; 3907 int extsize; 3908 { 3909 struct jtrunc *jtrunc; 3910 3911 jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS); 3912 workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC, 3913 freeblks->fb_list.wk_mp); 3914 jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list); 3915 jtrunc->jt_dep.jb_freeblks = freeblks; 3916 jtrunc->jt_ino = freeblks->fb_inum; 3917 jtrunc->jt_size = size; 3918 jtrunc->jt_extsize = extsize; 3919 LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps); 3920 3921 return (jtrunc); 3922 } 3923 3924 /* 3925 * If we're canceling a new bitmap we have to search for another ref 3926 * to move into the bmsafemap dep. This might be better expressed 3927 * with another structure. 3928 */ 3929 static void 3930 move_newblock_dep(jaddref, inodedep) 3931 struct jaddref *jaddref; 3932 struct inodedep *inodedep; 3933 { 3934 struct inoref *inoref; 3935 struct jaddref *jaddrefn; 3936 3937 jaddrefn = NULL; 3938 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; 3939 inoref = TAILQ_NEXT(inoref, if_deps)) { 3940 if ((jaddref->ja_state & NEWBLOCK) && 3941 inoref->if_list.wk_type == D_JADDREF) { 3942 jaddrefn = (struct jaddref *)inoref; 3943 break; 3944 } 3945 } 3946 if (jaddrefn == NULL) 3947 return; 3948 jaddrefn->ja_state &= ~(ATTACHED | UNDONE); 3949 jaddrefn->ja_state |= jaddref->ja_state & 3950 (ATTACHED | UNDONE | NEWBLOCK); 3951 jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); 3952 jaddref->ja_state |= ATTACHED; 3953 LIST_REMOVE(jaddref, ja_bmdeps); 3954 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, 3955 ja_bmdeps); 3956 } 3957 3958 /* 3959 * Cancel a jaddref either before it has been written or while it is being 3960 * written. This happens when a link is removed before the add reaches 3961 * the disk. The jaddref dependency is kept linked into the bmsafemap 3962 * and inode to prevent the link count or bitmap from reaching the disk 3963 * until handle_workitem_remove() re-adjusts the counts and bitmaps as 3964 * required. 3965 * 3966 * Returns 1 if the canceled addref requires journaling of the remove and 3967 * 0 otherwise. 3968 */ 3969 static int 3970 cancel_jaddref(jaddref, inodedep, wkhd) 3971 struct jaddref *jaddref; 3972 struct inodedep *inodedep; 3973 struct workhead *wkhd; 3974 { 3975 struct inoref *inoref; 3976 struct jsegdep *jsegdep; 3977 int needsj; 3978 3979 KASSERT((jaddref->ja_state & COMPLETE) == 0, 3980 ("cancel_jaddref: Canceling complete jaddref")); 3981 if (jaddref->ja_state & (INPROGRESS | COMPLETE)) 3982 needsj = 1; 3983 else 3984 needsj = 0; 3985 if (inodedep == NULL) 3986 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 3987 0, &inodedep) == 0) 3988 panic("cancel_jaddref: Lost inodedep"); 3989 /* 3990 * We must adjust the nlink of any reference operation that follows 3991 * us so that it is consistent with the in-memory reference. This 3992 * ensures that inode nlink rollbacks always have the correct link. 
3993 */ 3994 if (needsj == 0) { 3995 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; 3996 inoref = TAILQ_NEXT(inoref, if_deps)) { 3997 if (inoref->if_state & GOINGAWAY) 3998 break; 3999 inoref->if_nlink--; 4000 } 4001 } 4002 jsegdep = inoref_jseg(&jaddref->ja_ref); 4003 if (jaddref->ja_state & NEWBLOCK) 4004 move_newblock_dep(jaddref, inodedep); 4005 wake_worklist(&jaddref->ja_list); 4006 jaddref->ja_mkdir = NULL; 4007 if (jaddref->ja_state & INPROGRESS) { 4008 jaddref->ja_state &= ~INPROGRESS; 4009 WORKLIST_REMOVE(&jaddref->ja_list); 4010 jwork_insert(wkhd, jsegdep); 4011 } else { 4012 free_jsegdep(jsegdep); 4013 if (jaddref->ja_state & DEPCOMPLETE) 4014 remove_from_journal(&jaddref->ja_list); 4015 } 4016 jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE); 4017 /* 4018 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove 4019 * can arrange for them to be freed with the bitmap. Otherwise we 4020 * no longer need this addref attached to the inoreflst and it 4021 * will incorrectly adjust nlink if we leave it. 4022 */ 4023 if ((jaddref->ja_state & NEWBLOCK) == 0) { 4024 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, 4025 if_deps); 4026 jaddref->ja_state |= COMPLETE; 4027 free_jaddref(jaddref); 4028 return (needsj); 4029 } 4030 /* 4031 * Leave the head of the list for jsegdeps for fast merging. 4032 */ 4033 if (LIST_FIRST(wkhd) != NULL) { 4034 jaddref->ja_state |= ONWORKLIST; 4035 LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list); 4036 } else 4037 WORKLIST_INSERT(wkhd, &jaddref->ja_list); 4038 4039 return (needsj); 4040 } 4041 4042 /* 4043 * Attempt to free a jaddref structure when some work completes. This 4044 * should only succeed once the entry is written and all dependencies have 4045 * been notified. 4046 */ 4047 static void 4048 free_jaddref(jaddref) 4049 struct jaddref *jaddref; 4050 { 4051 4052 if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE) 4053 return; 4054 if (jaddref->ja_ref.if_jsegdep) 4055 panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n", 4056 jaddref, jaddref->ja_state); 4057 if (jaddref->ja_state & NEWBLOCK) 4058 LIST_REMOVE(jaddref, ja_bmdeps); 4059 if (jaddref->ja_state & (INPROGRESS | ONWORKLIST)) 4060 panic("free_jaddref: Bad state %p(0x%X)", 4061 jaddref, jaddref->ja_state); 4062 if (jaddref->ja_mkdir != NULL) 4063 panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state); 4064 WORKITEM_FREE(jaddref, D_JADDREF); 4065 } 4066 4067 /* 4068 * Free a jremref structure once it has been written or discarded. 4069 */ 4070 static void 4071 free_jremref(jremref) 4072 struct jremref *jremref; 4073 { 4074 4075 if (jremref->jr_ref.if_jsegdep) 4076 free_jsegdep(jremref->jr_ref.if_jsegdep); 4077 if (jremref->jr_state & INPROGRESS) 4078 panic("free_jremref: IO still pending"); 4079 WORKITEM_FREE(jremref, D_JREMREF); 4080 } 4081 4082 /* 4083 * Free a jnewblk structure. 4084 */ 4085 static void 4086 free_jnewblk(jnewblk) 4087 struct jnewblk *jnewblk; 4088 { 4089 4090 if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE) 4091 return; 4092 LIST_REMOVE(jnewblk, jn_deps); 4093 if (jnewblk->jn_dep != NULL) 4094 panic("free_jnewblk: Dependency still attached."); 4095 WORKITEM_FREE(jnewblk, D_JNEWBLK); 4096 } 4097 4098 /* 4099 * Cancel a jnewblk which has been been made redundant by frag extension. 
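 * The canceled jnewblk is not freed here; it is placed on the caller's
 * work head, and its jsegdep is either moved there as well (when the
 * journal write is still in progress) or released outright.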
4100 */ 4101 static void 4102 cancel_jnewblk(jnewblk, wkhd) 4103 struct jnewblk *jnewblk; 4104 struct workhead *wkhd; 4105 { 4106 struct jsegdep *jsegdep; 4107 4108 jsegdep = jnewblk->jn_jsegdep; 4109 if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL) 4110 panic("cancel_jnewblk: Invalid state"); 4111 jnewblk->jn_jsegdep = NULL; 4112 jnewblk->jn_dep = NULL; 4113 jnewblk->jn_state |= GOINGAWAY; 4114 if (jnewblk->jn_state & INPROGRESS) { 4115 jnewblk->jn_state &= ~INPROGRESS; 4116 WORKLIST_REMOVE(&jnewblk->jn_list); 4117 jwork_insert(wkhd, jsegdep); 4118 } else { 4119 free_jsegdep(jsegdep); 4120 remove_from_journal(&jnewblk->jn_list); 4121 } 4122 wake_worklist(&jnewblk->jn_list); 4123 WORKLIST_INSERT(wkhd, &jnewblk->jn_list); 4124 } 4125 4126 static void 4127 free_jblkdep(jblkdep) 4128 struct jblkdep *jblkdep; 4129 { 4130 4131 if (jblkdep->jb_list.wk_type == D_JFREEBLK) 4132 WORKITEM_FREE(jblkdep, D_JFREEBLK); 4133 else if (jblkdep->jb_list.wk_type == D_JTRUNC) 4134 WORKITEM_FREE(jblkdep, D_JTRUNC); 4135 else 4136 panic("free_jblkdep: Unexpected type %s", 4137 TYPENAME(jblkdep->jb_list.wk_type)); 4138 } 4139 4140 /* 4141 * Free a single jseg once it is no longer referenced in memory or on 4142 * disk. Reclaim journal blocks and dependencies waiting for the segment 4143 * to disappear. 4144 */ 4145 static void 4146 free_jseg(jseg, jblocks) 4147 struct jseg *jseg; 4148 struct jblocks *jblocks; 4149 { 4150 struct freework *freework; 4151 4152 /* 4153 * Free freework structures that were lingering to indicate freed 4154 * indirect blocks that forced journal write ordering on reallocate. 4155 */ 4156 while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL) 4157 indirblk_remove(freework); 4158 if (jblocks->jb_oldestseg == jseg) 4159 jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next); 4160 TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); 4161 jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size); 4162 KASSERT(LIST_EMPTY(&jseg->js_entries), 4163 ("free_jseg: Freed jseg has valid entries.")); 4164 WORKITEM_FREE(jseg, D_JSEG); 4165 } 4166 4167 /* 4168 * Free all jsegs that meet the criteria for being reclaimed and update 4169 * oldestseg. 4170 */ 4171 static void 4172 free_jsegs(jblocks) 4173 struct jblocks *jblocks; 4174 { 4175 struct jseg *jseg; 4176 4177 /* 4178 * Free only those jsegs which have none allocated before them to 4179 * preserve the journal space ordering. 4180 */ 4181 while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { 4182 /* 4183 * Only reclaim space when nothing depends on this journal 4184 * set and another set has written that it is no longer 4185 * valid. 4186 */ 4187 if (jseg->js_refs != 0) { 4188 jblocks->jb_oldestseg = jseg; 4189 return; 4190 } 4191 if (!LIST_EMPTY(&jseg->js_indirs) && 4192 jseg->js_seq >= jblocks->jb_oldestwrseq) 4193 break; 4194 free_jseg(jseg, jblocks); 4195 } 4196 /* 4197 * If we exited the loop above we still must discover the 4198 * oldest valid segment. 4199 */ 4200 if (jseg) 4201 for (jseg = jblocks->jb_oldestseg; jseg != NULL; 4202 jseg = TAILQ_NEXT(jseg, js_next)) 4203 if (jseg->js_refs != 0) 4204 break; 4205 jblocks->jb_oldestseg = jseg; 4206 /* 4207 * The journal has no valid records but some jsegs may still be 4208 * waiting on oldestwrseq to advance. We force a small record 4209 * out to permit these lingering records to be reclaimed. 
4210 */ 4211 if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs)) 4212 jblocks->jb_needseg = 1; 4213 } 4214 4215 /* 4216 * Release one reference to a jseg and free it if the count reaches 0. This 4217 * should eventually reclaim journal space as well. 4218 */ 4219 static void 4220 rele_jseg(jseg) 4221 struct jseg *jseg; 4222 { 4223 4224 KASSERT(jseg->js_refs > 0, 4225 ("free_jseg: Invalid refcnt %d", jseg->js_refs)); 4226 if (--jseg->js_refs != 0) 4227 return; 4228 free_jsegs(jseg->js_jblocks); 4229 } 4230 4231 /* 4232 * Release a jsegdep and decrement the jseg count. 4233 */ 4234 static void 4235 free_jsegdep(jsegdep) 4236 struct jsegdep *jsegdep; 4237 { 4238 4239 if (jsegdep->jd_seg) 4240 rele_jseg(jsegdep->jd_seg); 4241 WORKITEM_FREE(jsegdep, D_JSEGDEP); 4242 } 4243 4244 /* 4245 * Wait for a journal item to make it to disk. Initiate journal processing 4246 * if required. 4247 */ 4248 static int 4249 jwait(wk, waitfor) 4250 struct worklist *wk; 4251 int waitfor; 4252 { 4253 4254 /* 4255 * Blocking journal waits cause slow synchronous behavior. Record 4256 * stats on the frequency of these blocking operations. 4257 */ 4258 if (waitfor == MNT_WAIT) { 4259 stat_journal_wait++; 4260 switch (wk->wk_type) { 4261 case D_JREMREF: 4262 case D_JMVREF: 4263 stat_jwait_filepage++; 4264 break; 4265 case D_JTRUNC: 4266 case D_JFREEBLK: 4267 stat_jwait_freeblks++; 4268 break; 4269 case D_JNEWBLK: 4270 stat_jwait_newblk++; 4271 break; 4272 case D_JADDREF: 4273 stat_jwait_inode++; 4274 break; 4275 default: 4276 break; 4277 } 4278 } 4279 /* 4280 * If IO has not started we process the journal. We can't mark the 4281 * worklist item as IOWAITING because we drop the lock while 4282 * processing the journal and the worklist entry may be freed after 4283 * this point. The caller may call back in and re-issue the request. 4284 */ 4285 if ((wk->wk_state & INPROGRESS) == 0) { 4286 softdep_process_journal(wk->wk_mp, wk, waitfor); 4287 if (waitfor != MNT_WAIT) 4288 return (EBUSY); 4289 return (0); 4290 } 4291 if (waitfor != MNT_WAIT) 4292 return (EBUSY); 4293 wait_worklist(wk, "jwait"); 4294 return (0); 4295 } 4296 4297 /* 4298 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as 4299 * appropriate. This is a convenience function to reduce duplicate code 4300 * for the setup and revert functions below. 4301 */ 4302 static struct inodedep * 4303 inodedep_lookup_ip(ip) 4304 struct inode *ip; 4305 { 4306 struct inodedep *inodedep; 4307 4308 KASSERT(ip->i_nlink >= ip->i_effnlink, 4309 ("inodedep_lookup_ip: bad delta")); 4310 (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 4311 DEPALLOC, &inodedep); 4312 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 4313 4314 return (inodedep); 4315 } 4316 4317 /* 4318 * Called prior to creating a new inode and linking it to a directory. The 4319 * jaddref structure must already be allocated by softdep_setup_inomapdep 4320 * and it is discovered here so we can initialize the mode and update 4321 * nlinkdelta. 
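 * For a typical create the inode bitmap update calls
 * softdep_setup_inomapdep(), which allocates the incomplete jaddref found
 * here; the directory entry offset is filled in later, by
 * softdep_setup_directory_add() or softdep_setup_directory_change().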
4322 */ 4323 void 4324 softdep_setup_create(dp, ip) 4325 struct inode *dp; 4326 struct inode *ip; 4327 { 4328 struct inodedep *inodedep; 4329 struct jaddref *jaddref; 4330 struct vnode *dvp; 4331 4332 KASSERT(ip->i_nlink == 1, 4333 ("softdep_setup_create: Invalid link count.")); 4334 dvp = ITOV(dp); 4335 ACQUIRE_LOCK(&lk); 4336 inodedep = inodedep_lookup_ip(ip); 4337 if (DOINGSUJ(dvp)) { 4338 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4339 inoreflst); 4340 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 4341 ("softdep_setup_create: No addref structure present.")); 4342 } 4343 softdep_prelink(dvp, NULL); 4344 FREE_LOCK(&lk); 4345 } 4346 4347 /* 4348 * Create a jaddref structure to track the addition of a DOTDOT link when 4349 * we are reparenting an inode as part of a rename. This jaddref will be 4350 * found by softdep_setup_directory_change. Adjusts nlinkdelta for 4351 * non-journaling softdep. 4352 */ 4353 void 4354 softdep_setup_dotdot_link(dp, ip) 4355 struct inode *dp; 4356 struct inode *ip; 4357 { 4358 struct inodedep *inodedep; 4359 struct jaddref *jaddref; 4360 struct vnode *dvp; 4361 struct vnode *vp; 4362 4363 dvp = ITOV(dp); 4364 vp = ITOV(ip); 4365 jaddref = NULL; 4366 /* 4367 * We don't set MKDIR_PARENT as this is not tied to a mkdir and 4368 * is used as a normal link would be. 4369 */ 4370 if (DOINGSUJ(dvp)) 4371 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 4372 dp->i_effnlink - 1, dp->i_mode); 4373 ACQUIRE_LOCK(&lk); 4374 inodedep = inodedep_lookup_ip(dp); 4375 if (jaddref) 4376 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4377 if_deps); 4378 softdep_prelink(dvp, ITOV(ip)); 4379 FREE_LOCK(&lk); 4380 } 4381 4382 /* 4383 * Create a jaddref structure to track a new link to an inode. The directory 4384 * offset is not known until softdep_setup_directory_add or 4385 * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling 4386 * softdep. 4387 */ 4388 void 4389 softdep_setup_link(dp, ip) 4390 struct inode *dp; 4391 struct inode *ip; 4392 { 4393 struct inodedep *inodedep; 4394 struct jaddref *jaddref; 4395 struct vnode *dvp; 4396 4397 dvp = ITOV(dp); 4398 jaddref = NULL; 4399 if (DOINGSUJ(dvp)) 4400 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, 4401 ip->i_mode); 4402 ACQUIRE_LOCK(&lk); 4403 inodedep = inodedep_lookup_ip(ip); 4404 if (jaddref) 4405 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4406 if_deps); 4407 softdep_prelink(dvp, ITOV(ip)); 4408 FREE_LOCK(&lk); 4409 } 4410 4411 /* 4412 * Called to create the jaddref structures to track . and .. references as 4413 * well as lookup and further initialize the incomplete jaddref created 4414 * by softdep_setup_inomapdep when the inode was allocated. Adjusts 4415 * nlinkdelta for non-journaling softdep. 
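 * The "." jaddref is tagged MKDIR_BODY and the ".." jaddref MKDIR_PARENT
 * below, distinguishing the two implicit links created by a mkdir from
 * ordinary directory entry additions.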
4416 */ 4417 void 4418 softdep_setup_mkdir(dp, ip) 4419 struct inode *dp; 4420 struct inode *ip; 4421 { 4422 struct inodedep *inodedep; 4423 struct jaddref *dotdotaddref; 4424 struct jaddref *dotaddref; 4425 struct jaddref *jaddref; 4426 struct vnode *dvp; 4427 4428 dvp = ITOV(dp); 4429 dotaddref = dotdotaddref = NULL; 4430 if (DOINGSUJ(dvp)) { 4431 dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, 4432 ip->i_mode); 4433 dotaddref->ja_state |= MKDIR_BODY; 4434 dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 4435 dp->i_effnlink - 1, dp->i_mode); 4436 dotdotaddref->ja_state |= MKDIR_PARENT; 4437 } 4438 ACQUIRE_LOCK(&lk); 4439 inodedep = inodedep_lookup_ip(ip); 4440 if (DOINGSUJ(dvp)) { 4441 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4442 inoreflst); 4443 KASSERT(jaddref != NULL, 4444 ("softdep_setup_mkdir: No addref structure present.")); 4445 KASSERT(jaddref->ja_parent == dp->i_number, 4446 ("softdep_setup_mkdir: bad parent %d", 4447 jaddref->ja_parent)); 4448 TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, 4449 if_deps); 4450 } 4451 inodedep = inodedep_lookup_ip(dp); 4452 if (DOINGSUJ(dvp)) 4453 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, 4454 &dotdotaddref->ja_ref, if_deps); 4455 softdep_prelink(ITOV(dp), NULL); 4456 FREE_LOCK(&lk); 4457 } 4458 4459 /* 4460 * Called to track nlinkdelta of the inode and parent directories prior to 4461 * unlinking a directory. 4462 */ 4463 void 4464 softdep_setup_rmdir(dp, ip) 4465 struct inode *dp; 4466 struct inode *ip; 4467 { 4468 struct vnode *dvp; 4469 4470 dvp = ITOV(dp); 4471 ACQUIRE_LOCK(&lk); 4472 (void) inodedep_lookup_ip(ip); 4473 (void) inodedep_lookup_ip(dp); 4474 softdep_prelink(dvp, ITOV(ip)); 4475 FREE_LOCK(&lk); 4476 } 4477 4478 /* 4479 * Called to track nlinkdelta of the inode and parent directories prior to 4480 * unlink. 4481 */ 4482 void 4483 softdep_setup_unlink(dp, ip) 4484 struct inode *dp; 4485 struct inode *ip; 4486 { 4487 struct vnode *dvp; 4488 4489 dvp = ITOV(dp); 4490 ACQUIRE_LOCK(&lk); 4491 (void) inodedep_lookup_ip(ip); 4492 (void) inodedep_lookup_ip(dp); 4493 softdep_prelink(dvp, ITOV(ip)); 4494 FREE_LOCK(&lk); 4495 } 4496 4497 /* 4498 * Called to release the journal structures created by a failed non-directory 4499 * creation. Adjusts nlinkdelta for non-journaling softdep. 4500 */ 4501 void 4502 softdep_revert_create(dp, ip) 4503 struct inode *dp; 4504 struct inode *ip; 4505 { 4506 struct inodedep *inodedep; 4507 struct jaddref *jaddref; 4508 struct vnode *dvp; 4509 4510 dvp = ITOV(dp); 4511 ACQUIRE_LOCK(&lk); 4512 inodedep = inodedep_lookup_ip(ip); 4513 if (DOINGSUJ(dvp)) { 4514 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4515 inoreflst); 4516 KASSERT(jaddref->ja_parent == dp->i_number, 4517 ("softdep_revert_create: addref parent mismatch")); 4518 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4519 } 4520 FREE_LOCK(&lk); 4521 } 4522 4523 /* 4524 * Called to release the journal structures created by a failed dotdot link 4525 * creation. Adjusts nlinkdelta for non-journaling softdep. 
4526 */ 4527 void 4528 softdep_revert_dotdot_link(dp, ip) 4529 struct inode *dp; 4530 struct inode *ip; 4531 { 4532 struct inodedep *inodedep; 4533 struct jaddref *jaddref; 4534 struct vnode *dvp; 4535 4536 dvp = ITOV(dp); 4537 ACQUIRE_LOCK(&lk); 4538 inodedep = inodedep_lookup_ip(dp); 4539 if (DOINGSUJ(dvp)) { 4540 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4541 inoreflst); 4542 KASSERT(jaddref->ja_parent == ip->i_number, 4543 ("softdep_revert_dotdot_link: addref parent mismatch")); 4544 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4545 } 4546 FREE_LOCK(&lk); 4547 } 4548 4549 /* 4550 * Called to release the journal structures created by a failed link 4551 * addition. Adjusts nlinkdelta for non-journaling softdep. 4552 */ 4553 void 4554 softdep_revert_link(dp, ip) 4555 struct inode *dp; 4556 struct inode *ip; 4557 { 4558 struct inodedep *inodedep; 4559 struct jaddref *jaddref; 4560 struct vnode *dvp; 4561 4562 dvp = ITOV(dp); 4563 ACQUIRE_LOCK(&lk); 4564 inodedep = inodedep_lookup_ip(ip); 4565 if (DOINGSUJ(dvp)) { 4566 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4567 inoreflst); 4568 KASSERT(jaddref->ja_parent == dp->i_number, 4569 ("softdep_revert_link: addref parent mismatch")); 4570 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4571 } 4572 FREE_LOCK(&lk); 4573 } 4574 4575 /* 4576 * Called to release the journal structures created by a failed mkdir 4577 * attempt. Adjusts nlinkdelta for non-journaling softdep. 4578 */ 4579 void 4580 softdep_revert_mkdir(dp, ip) 4581 struct inode *dp; 4582 struct inode *ip; 4583 { 4584 struct inodedep *inodedep; 4585 struct jaddref *jaddref; 4586 struct jaddref *dotaddref; 4587 struct vnode *dvp; 4588 4589 dvp = ITOV(dp); 4590 4591 ACQUIRE_LOCK(&lk); 4592 inodedep = inodedep_lookup_ip(dp); 4593 if (DOINGSUJ(dvp)) { 4594 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4595 inoreflst); 4596 KASSERT(jaddref->ja_parent == ip->i_number, 4597 ("softdep_revert_mkdir: dotdot addref parent mismatch")); 4598 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4599 } 4600 inodedep = inodedep_lookup_ip(ip); 4601 if (DOINGSUJ(dvp)) { 4602 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4603 inoreflst); 4604 KASSERT(jaddref->ja_parent == dp->i_number, 4605 ("softdep_revert_mkdir: addref parent mismatch")); 4606 dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, 4607 inoreflst, if_deps); 4608 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4609 KASSERT(dotaddref->ja_parent == ip->i_number, 4610 ("softdep_revert_mkdir: dot addref parent mismatch")); 4611 cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait); 4612 } 4613 FREE_LOCK(&lk); 4614 } 4615 4616 /* 4617 * Called to correct nlinkdelta after a failed rmdir. 4618 */ 4619 void 4620 softdep_revert_rmdir(dp, ip) 4621 struct inode *dp; 4622 struct inode *ip; 4623 { 4624 4625 ACQUIRE_LOCK(&lk); 4626 (void) inodedep_lookup_ip(ip); 4627 (void) inodedep_lookup_ip(dp); 4628 FREE_LOCK(&lk); 4629 } 4630 4631 /* 4632 * Protecting the freemaps (or bitmaps). 4633 * 4634 * To eliminate the need to execute fsck before mounting a filesystem 4635 * after a power failure, one must (conservatively) guarantee that the 4636 * on-disk copy of the bitmaps never indicate that a live inode or block is 4637 * free. So, when a block or inode is allocated, the bitmap should be 4638 * updated (on disk) before any new pointers. 
When a block or inode is
4639 * freed, the bitmap should not be updated until all pointers have been
4640 * reset. The latter dependency is handled by the delayed de-allocation
4641 * approach described below for block and inode de-allocation. The former
4642 * dependency is handled by calling the following procedure when a block or
4643 * inode is allocated. When an inode is allocated an "inodedep" is created
4644 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4645 * Each "inodedep" is also inserted into the hash indexing structure so
4646 * that any additional link additions can be made dependent on the inode
4647 * allocation.
4648 *
4649 * The ufs filesystem maintains a number of free block counts (e.g., per
4650 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4651 * in addition to the bitmaps. These counts are used to improve efficiency
4652 * during allocation and therefore must be consistent with the bitmaps.
4653 * There is no convenient way to guarantee post-crash consistency of these
4654 * counts with simple update ordering, for two main reasons: (1) The counts
4655 * and bitmaps for a single cylinder group block are not in the same disk
4656 * sector. If a disk write is interrupted (e.g., by power failure), one may
4657 * be written and the other not. (2) Some of the counts are located in the
4658 * superblock rather than the cylinder group block. So, we focus our soft
4659 * updates implementation on protecting the bitmaps. When mounting a
4660 * filesystem, we recompute the auxiliary counts from the bitmaps.
4661 */
4662
4663 /*
4664 * Called just after updating the cylinder group block to allocate an inode.
4665 */
4666 void
4667 softdep_setup_inomapdep(bp, ip, newinum, mode)
4668 struct buf *bp; /* buffer for cylgroup block with inode map */
4669 struct inode *ip; /* inode related to allocation */
4670 ino_t newinum; /* new inode number being allocated */
4671 int mode;
4672 {
4673 struct inodedep *inodedep;
4674 struct bmsafemap *bmsafemap;
4675 struct jaddref *jaddref;
4676 struct mount *mp;
4677 struct fs *fs;
4678
4679 mp = UFSTOVFS(ip->i_ump);
4680 fs = ip->i_ump->um_fs;
4681 jaddref = NULL;
4682
4683 /*
4684 * Allocate the journal reference add structure so that the bitmap
4685 * can be dependent on it.
4686 */
4687 if (MOUNTEDSUJ(mp)) {
4688 jaddref = newjaddref(ip, newinum, 0, 0, mode);
4689 jaddref->ja_state |= NEWBLOCK;
4690 }
4691
4692 /*
4693 * Create a dependency for the newly allocated inode.
4694 * Panic if it already exists as something is seriously wrong.
4695 * Otherwise add it to the dependency list for the buffer holding
4696 * the cylinder group map from which it was allocated.
4697 */
4698 ACQUIRE_LOCK(&lk);
4699 if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep)))
4700 panic("softdep_setup_inomapdep: dependency %p for new "
4701 "inode already exists", inodedep);
4702 bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
4703 if (jaddref) {
4704 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
4705 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4706 if_deps);
4707 } else {
4708 inodedep->id_state |= ONDEPLIST;
4709 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
4710 }
4711 inodedep->id_bmsafemap = bmsafemap;
4712 inodedep->id_state &= ~DEPCOMPLETE;
4713 FREE_LOCK(&lk);
4714 }
4715
4716 /*
4717 * Called just after updating the cylinder group block to
4718 * allocate block or fragment.
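 * As with inode allocation above, on a journaled (SUJ) mount the
 * allocation is first recorded in a jnewblk that is linked onto the
 * cylinder group's bmsafemap; on a non-journaled mount the newblk itself
 * is placed on the bmsafemap's dependency list instead.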
4719 */ 4720 void 4721 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 4722 struct buf *bp; /* buffer for cylgroup block with block map */ 4723 struct mount *mp; /* filesystem doing allocation */ 4724 ufs2_daddr_t newblkno; /* number of newly allocated block */ 4725 int frags; /* Number of fragments. */ 4726 int oldfrags; /* Previous number of fragments for extend. */ 4727 { 4728 struct newblk *newblk; 4729 struct bmsafemap *bmsafemap; 4730 struct jnewblk *jnewblk; 4731 struct fs *fs; 4732 4733 fs = VFSTOUFS(mp)->um_fs; 4734 jnewblk = NULL; 4735 /* 4736 * Create a dependency for the newly allocated block. 4737 * Add it to the dependency list for the buffer holding 4738 * the cylinder group map from which it was allocated. 4739 */ 4740 if (MOUNTEDSUJ(mp)) { 4741 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); 4742 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); 4743 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); 4744 jnewblk->jn_state = ATTACHED; 4745 jnewblk->jn_blkno = newblkno; 4746 jnewblk->jn_frags = frags; 4747 jnewblk->jn_oldfrags = oldfrags; 4748 #ifdef SUJ_DEBUG 4749 { 4750 struct cg *cgp; 4751 uint8_t *blksfree; 4752 long bno; 4753 int i; 4754 4755 cgp = (struct cg *)bp->b_data; 4756 blksfree = cg_blksfree(cgp); 4757 bno = dtogd(fs, jnewblk->jn_blkno); 4758 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 4759 i++) { 4760 if (isset(blksfree, bno + i)) 4761 panic("softdep_setup_blkmapdep: " 4762 "free fragment %d from %d-%d " 4763 "state 0x%X dep %p", i, 4764 jnewblk->jn_oldfrags, 4765 jnewblk->jn_frags, 4766 jnewblk->jn_state, 4767 jnewblk->jn_dep); 4768 } 4769 } 4770 #endif 4771 } 4772 ACQUIRE_LOCK(&lk); 4773 if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) 4774 panic("softdep_setup_blkmapdep: found block"); 4775 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, 4776 dtog(fs, newblkno)); 4777 if (jnewblk) { 4778 jnewblk->jn_dep = (struct worklist *)newblk; 4779 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); 4780 } else { 4781 newblk->nb_state |= ONDEPLIST; 4782 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 4783 } 4784 newblk->nb_bmsafemap = bmsafemap; 4785 newblk->nb_jnewblk = jnewblk; 4786 FREE_LOCK(&lk); 4787 } 4788 4789 #define BMSAFEMAP_HASH(fs, cg) \ 4790 (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash]) 4791 4792 static int 4793 bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp) 4794 struct bmsafemap_hashhead *bmsafemaphd; 4795 struct mount *mp; 4796 int cg; 4797 struct bmsafemap **bmsafemapp; 4798 { 4799 struct bmsafemap *bmsafemap; 4800 4801 LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) 4802 if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg) 4803 break; 4804 if (bmsafemap) { 4805 *bmsafemapp = bmsafemap; 4806 return (1); 4807 } 4808 *bmsafemapp = NULL; 4809 4810 return (0); 4811 } 4812 4813 /* 4814 * Find the bmsafemap associated with a cylinder group buffer. 4815 * If none exists, create one. The buffer must be locked when 4816 * this routine is called and this routine must be called with 4817 * splbio interrupts blocked. 
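 * Note that the lock is dropped around the allocation below, so another
 * thread may install a bmsafemap for the same cylinder group first; the
 * second bmsafemap_find() call detects that collision and the local copy
 * is discarded.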
4818 */ 4819 static struct bmsafemap * 4820 bmsafemap_lookup(mp, bp, cg) 4821 struct mount *mp; 4822 struct buf *bp; 4823 int cg; 4824 { 4825 struct bmsafemap_hashhead *bmsafemaphd; 4826 struct bmsafemap *bmsafemap, *collision; 4827 struct worklist *wk; 4828 struct fs *fs; 4829 4830 mtx_assert(&lk, MA_OWNED); 4831 if (bp) 4832 LIST_FOREACH(wk, &bp->b_dep, wk_list) 4833 if (wk->wk_type == D_BMSAFEMAP) 4834 return (WK_BMSAFEMAP(wk)); 4835 fs = VFSTOUFS(mp)->um_fs; 4836 bmsafemaphd = BMSAFEMAP_HASH(fs, cg); 4837 if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) 4838 return (bmsafemap); 4839 FREE_LOCK(&lk); 4840 bmsafemap = malloc(sizeof(struct bmsafemap), 4841 M_BMSAFEMAP, M_SOFTDEP_FLAGS); 4842 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); 4843 bmsafemap->sm_buf = bp; 4844 LIST_INIT(&bmsafemap->sm_inodedephd); 4845 LIST_INIT(&bmsafemap->sm_inodedepwr); 4846 LIST_INIT(&bmsafemap->sm_newblkhd); 4847 LIST_INIT(&bmsafemap->sm_newblkwr); 4848 LIST_INIT(&bmsafemap->sm_jaddrefhd); 4849 LIST_INIT(&bmsafemap->sm_jnewblkhd); 4850 LIST_INIT(&bmsafemap->sm_freehd); 4851 LIST_INIT(&bmsafemap->sm_freewr); 4852 ACQUIRE_LOCK(&lk); 4853 if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) { 4854 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 4855 return (collision); 4856 } 4857 bmsafemap->sm_cg = cg; 4858 LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); 4859 LIST_INSERT_HEAD(&VFSTOUFS(mp)->softdep_dirtycg, bmsafemap, sm_next); 4860 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 4861 return (bmsafemap); 4862 } 4863 4864 /* 4865 * Direct block allocation dependencies. 4866 * 4867 * When a new block is allocated, the corresponding disk locations must be 4868 * initialized (with zeros or new data) before the on-disk inode points to 4869 * them. Also, the freemap from which the block was allocated must be 4870 * updated (on disk) before the inode's pointer. These two dependencies are 4871 * independent of each other and are needed for all file blocks and indirect 4872 * blocks that are pointed to directly by the inode. Just before the 4873 * "in-core" version of the inode is updated with a newly allocated block 4874 * number, a procedure (below) is called to setup allocation dependency 4875 * structures. These structures are removed when the corresponding 4876 * dependencies are satisfied or when the block allocation becomes obsolete 4877 * (i.e., the file is deleted, the block is de-allocated, or the block is a 4878 * fragment that gets upgraded). All of these cases are handled in 4879 * procedures described later. 4880 * 4881 * When a file extension causes a fragment to be upgraded, either to a larger 4882 * fragment or to a full block, the on-disk location may change (if the 4883 * previous fragment could not simply be extended). In this case, the old 4884 * fragment must be de-allocated, but not until after the inode's pointer has 4885 * been updated. In most cases, this is handled by later procedures, which 4886 * will construct a "freefrag" structure to be added to the workitem queue 4887 * when the inode update is complete (or obsolete). The main exception to 4888 * this is when an allocation occurs while a pending allocation dependency 4889 * (for the same block pointer) remains. This case is handled in the main 4890 * allocation dependency setup procedure by immediately freeing the 4891 * unreferenced fragments. 
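 *
 * To restate the ordering rules above: a newly allocated block's contents
 * and the bitmap marking it in use must both reach the disk before the
 * on-disk inode is allowed to point at it, and a replaced fragment may
 * not be freed until the pointer update that obsoletes it has been
 * committed.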
4892 */
4893 void
4894 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4895 struct inode *ip; /* inode to which block is being added */
4896 ufs_lbn_t off; /* block pointer within inode */
4897 ufs2_daddr_t newblkno; /* disk block number being added */
4898 ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */
4899 long newsize; /* size of new block */
4900 long oldsize; /* size of old block */
4901 struct buf *bp; /* bp for allocated block */
4902 {
4903 struct allocdirect *adp, *oldadp;
4904 struct allocdirectlst *adphead;
4905 struct freefrag *freefrag;
4906 struct inodedep *inodedep;
4907 struct pagedep *pagedep;
4908 struct jnewblk *jnewblk;
4909 struct newblk *newblk;
4910 struct mount *mp;
4911 ufs_lbn_t lbn;
4912
4913 lbn = bp->b_lblkno;
4914 mp = UFSTOVFS(ip->i_ump);
4915 if (oldblkno && oldblkno != newblkno)
4916 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4917 else
4918 freefrag = NULL;
4919
4920 ACQUIRE_LOCK(&lk);
4921 if (off >= NDADDR) {
4922 if (lbn > 0)
4923 panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
4924 lbn, off);
4925 /* allocating an indirect block */
4926 if (oldblkno != 0)
4927 panic("softdep_setup_allocdirect: non-zero indir");
4928 } else {
4929 if (off != lbn)
4930 panic("softdep_setup_allocdirect: lbn %jd != off %jd",
4931 lbn, off);
4932 /*
4933 * Allocating a direct block.
4934 *
4935 * If we are allocating a directory block, then we must
4936 * allocate an associated pagedep to track additions and
4937 * deletions.
4938 */
4939 if ((ip->i_mode & IFMT) == IFDIR)
4940 pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
4941 &pagedep);
4942 }
4943 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4944 panic("softdep_setup_allocdirect: lost block");
4945 KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4946 ("softdep_setup_allocdirect: newblk already initialized"));
4947 /*
4948 * Convert the newblk to an allocdirect.
4949 */
4950 newblk->nb_list.wk_type = D_ALLOCDIRECT;
4951 adp = (struct allocdirect *)newblk;
4952 newblk->nb_freefrag = freefrag;
4953 adp->ad_offset = off;
4954 adp->ad_oldblkno = oldblkno;
4955 adp->ad_newsize = newsize;
4956 adp->ad_oldsize = oldsize;
4957
4958 /*
4959 * Finish initializing the journal.
4960 */
4961 if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4962 jnewblk->jn_ino = ip->i_number;
4963 jnewblk->jn_lbn = lbn;
4964 add_to_journal(&jnewblk->jn_list);
4965 }
4966 if (freefrag && freefrag->ff_jdep != NULL &&
4967 freefrag->ff_jdep->wk_type == D_JFREEFRAG)
4968 add_to_journal(freefrag->ff_jdep);
4969 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
4970 adp->ad_inodedep = inodedep;
4971
4972 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
4973 /*
4974 * The list of allocdirects must be kept in sorted and ascending
4975 * order so that the rollback routines can quickly determine the
4976 * first uncommitted block (the size of the file stored on disk
4977 * ends at the end of the lowest committed fragment, or if there
4978 * are no fragments, at the end of the highest committed block).
4979 * Since files generally grow, the typical case is that the new
4980 * block is to be added at the end of the list. We speed this
4981 * special case by checking against the last allocdirect in the
4982 * list before laboriously traversing the list looking for the
4983 * insertion point.
4984 */ 4985 adphead = &inodedep->id_newinoupdt; 4986 oldadp = TAILQ_LAST(adphead, allocdirectlst); 4987 if (oldadp == NULL || oldadp->ad_offset <= off) { 4988 /* insert at end of list */ 4989 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 4990 if (oldadp != NULL && oldadp->ad_offset == off) 4991 allocdirect_merge(adphead, adp, oldadp); 4992 FREE_LOCK(&lk); 4993 return; 4994 } 4995 TAILQ_FOREACH(oldadp, adphead, ad_next) { 4996 if (oldadp->ad_offset >= off) 4997 break; 4998 } 4999 if (oldadp == NULL) 5000 panic("softdep_setup_allocdirect: lost entry"); 5001 /* insert in middle of list */ 5002 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 5003 if (oldadp->ad_offset == off) 5004 allocdirect_merge(adphead, adp, oldadp); 5005 5006 FREE_LOCK(&lk); 5007 } 5008 5009 /* 5010 * Merge a newer and older journal record to be stored either in a 5011 * newblock or freefrag. This handles aggregating journal records for 5012 * fragment allocation into a second record as well as replacing a 5013 * journal free with an aborted journal allocation. A segment for the 5014 * oldest record will be placed on wkhd if it has been written. If not 5015 * the segment for the newer record will suffice. 5016 */ 5017 static struct worklist * 5018 jnewblk_merge(new, old, wkhd) 5019 struct worklist *new; 5020 struct worklist *old; 5021 struct workhead *wkhd; 5022 { 5023 struct jnewblk *njnewblk; 5024 struct jnewblk *jnewblk; 5025 5026 /* Handle NULLs to simplify callers. */ 5027 if (new == NULL) 5028 return (old); 5029 if (old == NULL) 5030 return (new); 5031 /* Replace a jfreefrag with a jnewblk. */ 5032 if (new->wk_type == D_JFREEFRAG) { 5033 cancel_jfreefrag(WK_JFREEFRAG(new)); 5034 return (old); 5035 } 5036 /* 5037 * Handle merging of two jnewblk records that describe 5038 * different sets of fragments in the same block. 5039 */ 5040 jnewblk = WK_JNEWBLK(old); 5041 njnewblk = WK_JNEWBLK(new); 5042 if (jnewblk->jn_blkno != njnewblk->jn_blkno) 5043 panic("jnewblk_merge: Merging disparate blocks."); 5044 /* 5045 * The record may be rolled back in the cg. 5046 */ 5047 if (jnewblk->jn_state & UNDONE) { 5048 jnewblk->jn_state &= ~UNDONE; 5049 njnewblk->jn_state |= UNDONE; 5050 njnewblk->jn_state &= ~ATTACHED; 5051 } 5052 /* 5053 * We modify the newer addref and free the older so that if neither 5054 * has been written the most up-to-date copy will be on disk. If 5055 * both have been written but rolled back we only temporarily need 5056 * one of them to fix the bits when the cg write completes. 5057 */ 5058 jnewblk->jn_state |= ATTACHED | COMPLETE; 5059 njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; 5060 cancel_jnewblk(jnewblk, wkhd); 5061 WORKLIST_REMOVE(&jnewblk->jn_list); 5062 free_jnewblk(jnewblk); 5063 return (new); 5064 } 5065 5066 /* 5067 * Replace an old allocdirect dependency with a newer one. 5068 * This routine must be called with splbio interrupts blocked. 
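 * Merging occurs when a second allocation is recorded for the same
 * logical block before the first one's dependencies have completed, as
 * when a fragment is extended or reallocated. The new allocdirect
 * inherits the old one's rollback state (ad_oldblkno, ad_oldsize) so that
 * rollbacks continue to be computed against the block actually claimed
 * on disk.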
5069 */ 5070 static void 5071 allocdirect_merge(adphead, newadp, oldadp) 5072 struct allocdirectlst *adphead; /* head of list holding allocdirects */ 5073 struct allocdirect *newadp; /* allocdirect being added */ 5074 struct allocdirect *oldadp; /* existing allocdirect being checked */ 5075 { 5076 struct worklist *wk; 5077 struct freefrag *freefrag; 5078 5079 freefrag = NULL; 5080 mtx_assert(&lk, MA_OWNED); 5081 if (newadp->ad_oldblkno != oldadp->ad_newblkno || 5082 newadp->ad_oldsize != oldadp->ad_newsize || 5083 newadp->ad_offset >= NDADDR) 5084 panic("%s %jd != new %jd || old size %ld != new %ld", 5085 "allocdirect_merge: old blkno", 5086 (intmax_t)newadp->ad_oldblkno, 5087 (intmax_t)oldadp->ad_newblkno, 5088 newadp->ad_oldsize, oldadp->ad_newsize); 5089 newadp->ad_oldblkno = oldadp->ad_oldblkno; 5090 newadp->ad_oldsize = oldadp->ad_oldsize; 5091 /* 5092 * If the old dependency had a fragment to free or had never 5093 * previously had a block allocated, then the new dependency 5094 * can immediately post its freefrag and adopt the old freefrag. 5095 * This action is done by swapping the freefrag dependencies. 5096 * The new dependency gains the old one's freefrag, and the 5097 * old one gets the new one and then immediately puts it on 5098 * the worklist when it is freed by free_newblk. It is 5099 * not possible to do this swap when the old dependency had a 5100 * non-zero size but no previous fragment to free. This condition 5101 * arises when the new block is an extension of the old block. 5102 * Here, the first part of the fragment allocated to the new 5103 * dependency is part of the block currently claimed on disk by 5104 * the old dependency, so cannot legitimately be freed until the 5105 * conditions for the new dependency are fulfilled. 5106 */ 5107 freefrag = newadp->ad_freefrag; 5108 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { 5109 newadp->ad_freefrag = oldadp->ad_freefrag; 5110 oldadp->ad_freefrag = freefrag; 5111 } 5112 /* 5113 * If we are tracking a new directory-block allocation, 5114 * move it from the old allocdirect to the new allocdirect. 5115 */ 5116 if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { 5117 WORKLIST_REMOVE(wk); 5118 if (!LIST_EMPTY(&oldadp->ad_newdirblk)) 5119 panic("allocdirect_merge: extra newdirblk"); 5120 WORKLIST_INSERT(&newadp->ad_newdirblk, wk); 5121 } 5122 TAILQ_REMOVE(adphead, oldadp, ad_next); 5123 /* 5124 * We need to move any journal dependencies over to the freefrag 5125 * that releases this block if it exists. Otherwise we are 5126 * extending an existing block and we'll wait until that is 5127 * complete to release the journal space and extend the 5128 * new journal to cover this old space as well. 
5129 */ 5130 if (freefrag == NULL) { 5131 if (oldadp->ad_newblkno != newadp->ad_newblkno) 5132 panic("allocdirect_merge: %jd != %jd", 5133 oldadp->ad_newblkno, newadp->ad_newblkno); 5134 newadp->ad_block.nb_jnewblk = (struct jnewblk *) 5135 jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list, 5136 &oldadp->ad_block.nb_jnewblk->jn_list, 5137 &newadp->ad_block.nb_jwork); 5138 oldadp->ad_block.nb_jnewblk = NULL; 5139 cancel_newblk(&oldadp->ad_block, NULL, 5140 &newadp->ad_block.nb_jwork); 5141 } else { 5142 wk = (struct worklist *) cancel_newblk(&oldadp->ad_block, 5143 &freefrag->ff_list, &freefrag->ff_jwork); 5144 freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk, 5145 &freefrag->ff_jwork); 5146 } 5147 free_newblk(&oldadp->ad_block); 5148 } 5149 5150 /* 5151 * Allocate a jfreefrag structure to journal a single block free. 5152 */ 5153 static struct jfreefrag * 5154 newjfreefrag(freefrag, ip, blkno, size, lbn) 5155 struct freefrag *freefrag; 5156 struct inode *ip; 5157 ufs2_daddr_t blkno; 5158 long size; 5159 ufs_lbn_t lbn; 5160 { 5161 struct jfreefrag *jfreefrag; 5162 struct fs *fs; 5163 5164 fs = ip->i_fs; 5165 jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, 5166 M_SOFTDEP_FLAGS); 5167 workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump)); 5168 jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); 5169 jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; 5170 jfreefrag->fr_ino = ip->i_number; 5171 jfreefrag->fr_lbn = lbn; 5172 jfreefrag->fr_blkno = blkno; 5173 jfreefrag->fr_frags = numfrags(fs, size); 5174 jfreefrag->fr_freefrag = freefrag; 5175 5176 return (jfreefrag); 5177 } 5178 5179 /* 5180 * Allocate a new freefrag structure. 5181 */ 5182 static struct freefrag * 5183 newfreefrag(ip, blkno, size, lbn) 5184 struct inode *ip; 5185 ufs2_daddr_t blkno; 5186 long size; 5187 ufs_lbn_t lbn; 5188 { 5189 struct freefrag *freefrag; 5190 struct fs *fs; 5191 5192 fs = ip->i_fs; 5193 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 5194 panic("newfreefrag: frag size"); 5195 freefrag = malloc(sizeof(struct freefrag), 5196 M_FREEFRAG, M_SOFTDEP_FLAGS); 5197 workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump)); 5198 freefrag->ff_state = ATTACHED; 5199 LIST_INIT(&freefrag->ff_jwork); 5200 freefrag->ff_inum = ip->i_number; 5201 freefrag->ff_vtype = ITOV(ip)->v_type; 5202 freefrag->ff_blkno = blkno; 5203 freefrag->ff_fragsize = size; 5204 5205 if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) { 5206 freefrag->ff_jdep = (struct worklist *) 5207 newjfreefrag(freefrag, ip, blkno, size, lbn); 5208 } else { 5209 freefrag->ff_state |= DEPCOMPLETE; 5210 freefrag->ff_jdep = NULL; 5211 } 5212 5213 return (freefrag); 5214 } 5215 5216 /* 5217 * This workitem de-allocates fragments that were replaced during 5218 * file block allocation. 5219 */ 5220 static void 5221 handle_workitem_freefrag(freefrag) 5222 struct freefrag *freefrag; 5223 { 5224 struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); 5225 struct workhead wkhd; 5226 5227 /* 5228 * It would be illegal to add new completion items to the 5229 * freefrag after it was schedule to be done so it must be 5230 * safe to modify the list head here. 5231 */ 5232 LIST_INIT(&wkhd); 5233 ACQUIRE_LOCK(&lk); 5234 LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); 5235 /* 5236 * If the journal has not been written we must cancel it here. 
5237 */ 5238 if (freefrag->ff_jdep) { 5239 if (freefrag->ff_jdep->wk_type != D_JNEWBLK) 5240 panic("handle_workitem_freefrag: Unexpected type %d\n", 5241 freefrag->ff_jdep->wk_type); 5242 cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd); 5243 } 5244 FREE_LOCK(&lk); 5245 ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, 5246 freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd); 5247 ACQUIRE_LOCK(&lk); 5248 WORKITEM_FREE(freefrag, D_FREEFRAG); 5249 FREE_LOCK(&lk); 5250 } 5251 5252 /* 5253 * Set up a dependency structure for an external attributes data block. 5254 * This routine follows much of the structure of softdep_setup_allocdirect. 5255 * See the description of softdep_setup_allocdirect above for details. 5256 */ 5257 void 5258 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) 5259 struct inode *ip; 5260 ufs_lbn_t off; 5261 ufs2_daddr_t newblkno; 5262 ufs2_daddr_t oldblkno; 5263 long newsize; 5264 long oldsize; 5265 struct buf *bp; 5266 { 5267 struct allocdirect *adp, *oldadp; 5268 struct allocdirectlst *adphead; 5269 struct freefrag *freefrag; 5270 struct inodedep *inodedep; 5271 struct jnewblk *jnewblk; 5272 struct newblk *newblk; 5273 struct mount *mp; 5274 ufs_lbn_t lbn; 5275 5276 if (off >= NXADDR) 5277 panic("softdep_setup_allocext: lbn %lld > NXADDR", 5278 (long long)off); 5279 5280 lbn = bp->b_lblkno; 5281 mp = UFSTOVFS(ip->i_ump); 5282 if (oldblkno && oldblkno != newblkno) 5283 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); 5284 else 5285 freefrag = NULL; 5286 5287 ACQUIRE_LOCK(&lk); 5288 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) 5289 panic("softdep_setup_allocext: lost block"); 5290 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 5291 ("softdep_setup_allocext: newblk already initialized")); 5292 /* 5293 * Convert the newblk to an allocdirect. 5294 */ 5295 newblk->nb_list.wk_type = D_ALLOCDIRECT; 5296 adp = (struct allocdirect *)newblk; 5297 newblk->nb_freefrag = freefrag; 5298 adp->ad_offset = off; 5299 adp->ad_oldblkno = oldblkno; 5300 adp->ad_newsize = newsize; 5301 adp->ad_oldsize = oldsize; 5302 adp->ad_state |= EXTDATA; 5303 5304 /* 5305 * Finish initializing the journal. 5306 */ 5307 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 5308 jnewblk->jn_ino = ip->i_number; 5309 jnewblk->jn_lbn = lbn; 5310 add_to_journal(&jnewblk->jn_list); 5311 } 5312 if (freefrag && freefrag->ff_jdep != NULL && 5313 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 5314 add_to_journal(freefrag->ff_jdep); 5315 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); 5316 adp->ad_inodedep = inodedep; 5317 5318 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); 5319 /* 5320 * The list of allocdirects must be kept in sorted and ascending 5321 * order so that the rollback routines can quickly determine the 5322 * first uncommitted block (the size of the file stored on disk 5323 * ends at the end of the lowest committed fragment, or if there 5324 * are no fragments, at the end of the highest committed block). 5325 * Since files generally grow, the typical case is that the new 5326 * block is to be added at the end of the list. We speed this 5327 * special case by checking against the last allocdirect in the 5328 * list before laboriously traversing the list looking for the 5329 * insertion point. 
5330 */ 5331 adphead = &inodedep->id_newextupdt; 5332 oldadp = TAILQ_LAST(adphead, allocdirectlst); 5333 if (oldadp == NULL || oldadp->ad_offset <= off) { 5334 /* insert at end of list */ 5335 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 5336 if (oldadp != NULL && oldadp->ad_offset == off) 5337 allocdirect_merge(adphead, adp, oldadp); 5338 FREE_LOCK(&lk); 5339 return; 5340 } 5341 TAILQ_FOREACH(oldadp, adphead, ad_next) { 5342 if (oldadp->ad_offset >= off) 5343 break; 5344 } 5345 if (oldadp == NULL) 5346 panic("softdep_setup_allocext: lost entry"); 5347 /* insert in middle of list */ 5348 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 5349 if (oldadp->ad_offset == off) 5350 allocdirect_merge(adphead, adp, oldadp); 5351 FREE_LOCK(&lk); 5352 } 5353 5354 /* 5355 * Indirect block allocation dependencies. 5356 * 5357 * The same dependencies that exist for a direct block also exist when 5358 * a new block is allocated and pointed to by an entry in a block of 5359 * indirect pointers. The undo/redo states described above are also 5360 * used here. Because an indirect block contains many pointers that 5361 * may have dependencies, a second copy of the entire in-memory indirect 5362 * block is kept. The buffer cache copy is always completely up-to-date. 5363 * The second copy, which is used only as a source for disk writes, 5364 * contains only the safe pointers (i.e., those that have no remaining 5365 * update dependencies). The second copy is freed when all pointers 5366 * are safe. The cache is not allowed to replace indirect blocks with 5367 * pending update dependencies. If a buffer containing an indirect 5368 * block with dependencies is written, these routines will mark it 5369 * dirty again. It can only be successfully written once all the 5370 * dependencies are removed. The ffs_fsync routine in conjunction with 5371 * softdep_sync_metadata work together to get all the dependencies 5372 * removed so that a file can be successfully written to disk. Three 5373 * procedures are used when setting up indirect block pointer 5374 * dependencies. The division is necessary because of the organization 5375 * of the "balloc" routine and because of the distinction between file 5376 * pages and file metadata blocks. 5377 */ 5378 5379 /* 5380 * Allocate a new allocindir structure. 
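 * As in softdep_setup_allocdirect(), the newblk created when the block
 * was allocated in the bitmap is looked up and converted in place, here
 * to a D_ALLOCINDIR, inheriting any pending jnewblk and picking up a
 * freefrag when an old block is being replaced.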
5381 */ 5382 static struct allocindir * 5383 newallocindir(ip, ptrno, newblkno, oldblkno, lbn) 5384 struct inode *ip; /* inode for file being extended */ 5385 int ptrno; /* offset of pointer in indirect block */ 5386 ufs2_daddr_t newblkno; /* disk block number being added */ 5387 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 5388 ufs_lbn_t lbn; 5389 { 5390 struct newblk *newblk; 5391 struct allocindir *aip; 5392 struct freefrag *freefrag; 5393 struct jnewblk *jnewblk; 5394 5395 if (oldblkno) 5396 freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn); 5397 else 5398 freefrag = NULL; 5399 ACQUIRE_LOCK(&lk); 5400 if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0) 5401 panic("new_allocindir: lost block"); 5402 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 5403 ("newallocindir: newblk already initialized")); 5404 newblk->nb_list.wk_type = D_ALLOCINDIR; 5405 newblk->nb_freefrag = freefrag; 5406 aip = (struct allocindir *)newblk; 5407 aip->ai_offset = ptrno; 5408 aip->ai_oldblkno = oldblkno; 5409 aip->ai_lbn = lbn; 5410 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 5411 jnewblk->jn_ino = ip->i_number; 5412 jnewblk->jn_lbn = lbn; 5413 add_to_journal(&jnewblk->jn_list); 5414 } 5415 if (freefrag && freefrag->ff_jdep != NULL && 5416 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 5417 add_to_journal(freefrag->ff_jdep); 5418 return (aip); 5419 } 5420 5421 /* 5422 * Called just before setting an indirect block pointer 5423 * to a newly allocated file page. 5424 */ 5425 void 5426 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 5427 struct inode *ip; /* inode for file being extended */ 5428 ufs_lbn_t lbn; /* allocated block number within file */ 5429 struct buf *bp; /* buffer with indirect blk referencing page */ 5430 int ptrno; /* offset of pointer in indirect block */ 5431 ufs2_daddr_t newblkno; /* disk block number being added */ 5432 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 5433 struct buf *nbp; /* buffer holding allocated page */ 5434 { 5435 struct inodedep *inodedep; 5436 struct freefrag *freefrag; 5437 struct allocindir *aip; 5438 struct pagedep *pagedep; 5439 struct mount *mp; 5440 5441 if (lbn != nbp->b_lblkno) 5442 panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", 5443 lbn, bp->b_lblkno); 5444 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); 5445 mp = UFSTOVFS(ip->i_ump); 5446 aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); 5447 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 5448 /* 5449 * If we are allocating a directory page, then we must 5450 * allocate an associated pagedep to track additions and 5451 * deletions. 5452 */ 5453 if ((ip->i_mode & IFMT) == IFDIR) 5454 pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep); 5455 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 5456 freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); 5457 FREE_LOCK(&lk); 5458 if (freefrag) 5459 handle_workitem_freefrag(freefrag); 5460 } 5461 5462 /* 5463 * Called just before setting an indirect block pointer to a 5464 * newly allocated indirect block. 
5465 */ 5466 void 5467 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 5468 struct buf *nbp; /* newly allocated indirect block */ 5469 struct inode *ip; /* inode for file being extended */ 5470 struct buf *bp; /* indirect block referencing allocated block */ 5471 int ptrno; /* offset of pointer in indirect block */ 5472 ufs2_daddr_t newblkno; /* disk block number being added */ 5473 { 5474 struct inodedep *inodedep; 5475 struct allocindir *aip; 5476 ufs_lbn_t lbn; 5477 5478 lbn = nbp->b_lblkno; 5479 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); 5480 aip = newallocindir(ip, ptrno, newblkno, 0, lbn); 5481 inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); 5482 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 5483 if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)) 5484 panic("softdep_setup_allocindir_meta: Block already existed"); 5485 FREE_LOCK(&lk); 5486 } 5487 5488 static void 5489 indirdep_complete(indirdep) 5490 struct indirdep *indirdep; 5491 { 5492 struct allocindir *aip; 5493 5494 LIST_REMOVE(indirdep, ir_next); 5495 indirdep->ir_state |= DEPCOMPLETE; 5496 5497 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { 5498 LIST_REMOVE(aip, ai_next); 5499 free_newblk(&aip->ai_block); 5500 } 5501 /* 5502 * If this indirdep is not attached to a buf it was simply waiting 5503 * on completion to clear completehd. free_indirdep() asserts 5504 * that nothing is dangling. 5505 */ 5506 if ((indirdep->ir_state & ONWORKLIST) == 0) 5507 free_indirdep(indirdep); 5508 } 5509 5510 static struct indirdep * 5511 indirdep_lookup(mp, ip, bp) 5512 struct mount *mp; 5513 struct inode *ip; 5514 struct buf *bp; 5515 { 5516 struct indirdep *indirdep, *newindirdep; 5517 struct newblk *newblk; 5518 struct worklist *wk; 5519 struct fs *fs; 5520 ufs2_daddr_t blkno; 5521 5522 mtx_assert(&lk, MA_OWNED); 5523 indirdep = NULL; 5524 newindirdep = NULL; 5525 fs = ip->i_fs; 5526 for (;;) { 5527 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 5528 if (wk->wk_type != D_INDIRDEP) 5529 continue; 5530 indirdep = WK_INDIRDEP(wk); 5531 break; 5532 } 5533 /* Found on the buffer worklist, no new structure to free. */ 5534 if (indirdep != NULL && newindirdep == NULL) 5535 return (indirdep); 5536 if (indirdep != NULL && newindirdep != NULL) 5537 panic("indirdep_lookup: simultaneous create"); 5538 /* None found on the buffer and a new structure is ready. */ 5539 if (indirdep == NULL && newindirdep != NULL) 5540 break; 5541 /* None found and no new structure available. 
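 * Drop the lock and allocate one; the loop then re-checks the buffer's
 * dependency list (an indirdep appearing in the meantime is treated as a
 * "simultaneous create" panic rather than a benign race). The ir_savebp
 * acquired below is the second, "safe" copy of the indirect block
 * described in the indirect block dependencies comment above.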
*/ 5542 FREE_LOCK(&lk); 5543 newindirdep = malloc(sizeof(struct indirdep), 5544 M_INDIRDEP, M_SOFTDEP_FLAGS); 5545 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); 5546 newindirdep->ir_state = ATTACHED; 5547 if (ip->i_ump->um_fstype == UFS1) 5548 newindirdep->ir_state |= UFS1FMT; 5549 TAILQ_INIT(&newindirdep->ir_trunc); 5550 newindirdep->ir_saveddata = NULL; 5551 LIST_INIT(&newindirdep->ir_deplisthd); 5552 LIST_INIT(&newindirdep->ir_donehd); 5553 LIST_INIT(&newindirdep->ir_writehd); 5554 LIST_INIT(&newindirdep->ir_completehd); 5555 if (bp->b_blkno == bp->b_lblkno) { 5556 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, 5557 NULL, NULL); 5558 bp->b_blkno = blkno; 5559 } 5560 newindirdep->ir_freeblks = NULL; 5561 newindirdep->ir_savebp = 5562 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); 5563 newindirdep->ir_bp = bp; 5564 BUF_KERNPROC(newindirdep->ir_savebp); 5565 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); 5566 ACQUIRE_LOCK(&lk); 5567 } 5568 indirdep = newindirdep; 5569 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); 5570 /* 5571 * If the block is not yet allocated we don't set DEPCOMPLETE so 5572 * that we don't free dependencies until the pointers are valid. 5573 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather 5574 * than using the hash. 5575 */ 5576 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)) 5577 LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next); 5578 else 5579 indirdep->ir_state |= DEPCOMPLETE; 5580 return (indirdep); 5581 } 5582 5583 /* 5584 * Called to finish the allocation of the "aip" allocated 5585 * by one of the two routines above. 5586 */ 5587 static struct freefrag * 5588 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) 5589 struct buf *bp; /* in-memory copy of the indirect block */ 5590 struct inode *ip; /* inode for file being extended */ 5591 struct inodedep *inodedep; /* Inodedep for ip */ 5592 struct allocindir *aip; /* allocindir allocated by the above routines */ 5593 ufs_lbn_t lbn; /* Logical block number for this block. */ 5594 { 5595 struct fs *fs; 5596 struct indirdep *indirdep; 5597 struct allocindir *oldaip; 5598 struct freefrag *freefrag; 5599 struct mount *mp; 5600 5601 mtx_assert(&lk, MA_OWNED); 5602 mp = UFSTOVFS(ip->i_ump); 5603 fs = ip->i_fs; 5604 if (bp->b_lblkno >= 0) 5605 panic("setup_allocindir_phase2: not indir blk"); 5606 KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs), 5607 ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset)); 5608 indirdep = indirdep_lookup(mp, ip, bp); 5609 KASSERT(indirdep->ir_savebp != NULL, 5610 ("setup_allocindir_phase2 NULL ir_savebp")); 5611 aip->ai_indirdep = indirdep; 5612 /* 5613 * Check for an unwritten dependency for this indirect offset. If 5614 * there is, merge the old dependency into the new one. This happens 5615 * as a result of reallocblk only. 5616 */ 5617 freefrag = NULL; 5618 if (aip->ai_oldblkno != 0) { 5619 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) { 5620 if (oldaip->ai_offset == aip->ai_offset) { 5621 freefrag = allocindir_merge(aip, oldaip); 5622 goto done; 5623 } 5624 } 5625 LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) { 5626 if (oldaip->ai_offset == aip->ai_offset) { 5627 freefrag = allocindir_merge(aip, oldaip); 5628 goto done; 5629 } 5630 } 5631 } 5632 done: 5633 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); 5634 return (freefrag); 5635 } 5636 5637 /* 5638 * Merge two allocindirs which refer to the same block. 
Move newblock 5639 * dependencies and setup the freefrags appropriately. 5640 */ 5641 static struct freefrag * 5642 allocindir_merge(aip, oldaip) 5643 struct allocindir *aip; 5644 struct allocindir *oldaip; 5645 { 5646 struct freefrag *freefrag; 5647 struct worklist *wk; 5648 5649 if (oldaip->ai_newblkno != aip->ai_oldblkno) 5650 panic("allocindir_merge: blkno"); 5651 aip->ai_oldblkno = oldaip->ai_oldblkno; 5652 freefrag = aip->ai_freefrag; 5653 aip->ai_freefrag = oldaip->ai_freefrag; 5654 oldaip->ai_freefrag = NULL; 5655 KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); 5656 /* 5657 * If we are tracking a new directory-block allocation, 5658 * move it from the old allocindir to the new allocindir. 5659 */ 5660 if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { 5661 WORKLIST_REMOVE(wk); 5662 if (!LIST_EMPTY(&oldaip->ai_newdirblk)) 5663 panic("allocindir_merge: extra newdirblk"); 5664 WORKLIST_INSERT(&aip->ai_newdirblk, wk); 5665 } 5666 /* 5667 * We can skip journaling for this freefrag and just complete 5668 * any pending journal work for the allocindir that is being 5669 * removed after the freefrag completes. 5670 */ 5671 if (freefrag->ff_jdep) 5672 cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep)); 5673 LIST_REMOVE(oldaip, ai_next); 5674 freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block, 5675 &freefrag->ff_list, &freefrag->ff_jwork); 5676 free_newblk(&oldaip->ai_block); 5677 5678 return (freefrag); 5679 } 5680 5681 static inline void 5682 setup_freedirect(freeblks, ip, i, needj) 5683 struct freeblks *freeblks; 5684 struct inode *ip; 5685 int i; 5686 int needj; 5687 { 5688 ufs2_daddr_t blkno; 5689 int frags; 5690 5691 blkno = DIP(ip, i_db[i]); 5692 if (blkno == 0) 5693 return; 5694 DIP_SET(ip, i_db[i], 0); 5695 frags = sblksize(ip->i_fs, ip->i_size, i); 5696 frags = numfrags(ip->i_fs, frags); 5697 newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj); 5698 } 5699 5700 static inline void 5701 setup_freeext(freeblks, ip, i, needj) 5702 struct freeblks *freeblks; 5703 struct inode *ip; 5704 int i; 5705 int needj; 5706 { 5707 ufs2_daddr_t blkno; 5708 int frags; 5709 5710 blkno = ip->i_din2->di_extb[i]; 5711 if (blkno == 0) 5712 return; 5713 ip->i_din2->di_extb[i] = 0; 5714 frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i); 5715 frags = numfrags(ip->i_fs, frags); 5716 newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj); 5717 } 5718 5719 static inline void 5720 setup_freeindir(freeblks, ip, i, lbn, needj) 5721 struct freeblks *freeblks; 5722 struct inode *ip; 5723 int i; 5724 ufs_lbn_t lbn; 5725 int needj; 5726 { 5727 ufs2_daddr_t blkno; 5728 5729 blkno = DIP(ip, i_ib[i]); 5730 if (blkno == 0) 5731 return; 5732 DIP_SET(ip, i_ib[i], 0); 5733 newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag, 5734 0, needj); 5735 } 5736 5737 static inline struct freeblks * 5738 newfreeblks(mp, ip) 5739 struct mount *mp; 5740 struct inode *ip; 5741 { 5742 struct freeblks *freeblks; 5743 5744 freeblks = malloc(sizeof(struct freeblks), 5745 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); 5746 workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); 5747 LIST_INIT(&freeblks->fb_jblkdephd); 5748 LIST_INIT(&freeblks->fb_jwork); 5749 freeblks->fb_ref = 0; 5750 freeblks->fb_cgwait = 0; 5751 freeblks->fb_state = ATTACHED; 5752 freeblks->fb_uid = ip->i_uid; 5753 freeblks->fb_inum = ip->i_number; 5754 freeblks->fb_vtype = ITOV(ip)->v_type; 5755 freeblks->fb_modrev = DIP(ip, i_modrev); 5756 freeblks->fb_devvp = ip->i_devvp; 5757 
freeblks->fb_chkcnt = 0; 5758 freeblks->fb_len = 0; 5759 5760 return (freeblks); 5761 } 5762 5763 static void 5764 trunc_indirdep(indirdep, freeblks, bp, off) 5765 struct indirdep *indirdep; 5766 struct freeblks *freeblks; 5767 struct buf *bp; 5768 int off; 5769 { 5770 struct allocindir *aip, *aipn; 5771 5772 /* 5773 * The first set of allocindirs won't be in savedbp. 5774 */ 5775 LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn) 5776 if (aip->ai_offset > off) 5777 cancel_allocindir(aip, bp, freeblks, 1); 5778 LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn) 5779 if (aip->ai_offset > off) 5780 cancel_allocindir(aip, bp, freeblks, 1); 5781 /* 5782 * These will exist in savedbp. 5783 */ 5784 LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn) 5785 if (aip->ai_offset > off) 5786 cancel_allocindir(aip, NULL, freeblks, 0); 5787 LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn) 5788 if (aip->ai_offset > off) 5789 cancel_allocindir(aip, NULL, freeblks, 0); 5790 } 5791 5792 /* 5793 * Follow the chain of indirects down to lastlbn creating a freework 5794 * structure for each. This will be used to start indir_trunc() at 5795 * the right offset and create the journal records for the parrtial 5796 * truncation. A second step will handle the truncated dependencies. 5797 */ 5798 static int 5799 setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno) 5800 struct freeblks *freeblks; 5801 struct inode *ip; 5802 ufs_lbn_t lbn; 5803 ufs_lbn_t lastlbn; 5804 ufs2_daddr_t blkno; 5805 { 5806 struct indirdep *indirdep; 5807 struct indirdep *indirn; 5808 struct freework *freework; 5809 struct newblk *newblk; 5810 struct mount *mp; 5811 struct buf *bp; 5812 uint8_t *start; 5813 uint8_t *end; 5814 ufs_lbn_t lbnadd; 5815 int level; 5816 int error; 5817 int off; 5818 5819 5820 freework = NULL; 5821 if (blkno == 0) 5822 return (0); 5823 mp = freeblks->fb_list.wk_mp; 5824 bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0); 5825 if ((bp->b_flags & B_CACHE) == 0) { 5826 bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno); 5827 bp->b_iocmd = BIO_READ; 5828 bp->b_flags &= ~B_INVAL; 5829 bp->b_ioflags &= ~BIO_ERROR; 5830 vfs_busy_pages(bp, 0); 5831 bp->b_iooffset = dbtob(bp->b_blkno); 5832 bstrategy(bp); 5833 curthread->td_ru.ru_inblock++; 5834 error = bufwait(bp); 5835 if (error) { 5836 brelse(bp); 5837 return (error); 5838 } 5839 } 5840 level = lbn_level(lbn); 5841 lbnadd = lbn_offset(ip->i_fs, level); 5842 /* 5843 * Compute the offset of the last block we want to keep. Store 5844 * in the freework the first block we want to completely free. 5845 */ 5846 off = (lastlbn - -(lbn + level)) / lbnadd; 5847 if (off + 1 == NINDIR(ip->i_fs)) 5848 goto nowork; 5849 freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1, 5850 0); 5851 /* 5852 * Link the freework into the indirdep. This will prevent any new 5853 * allocations from proceeding until we are finished with the 5854 * truncate and the block is written. 5855 */ 5856 ACQUIRE_LOCK(&lk); 5857 indirdep = indirdep_lookup(mp, ip, bp); 5858 if (indirdep->ir_freeblks) 5859 panic("setup_trunc_indir: indirdep already truncated."); 5860 TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next); 5861 freework->fw_indir = indirdep; 5862 /* 5863 * Cancel any allocindirs that will not make it to disk. 5864 * We have to do this for all copies of the indirdep that 5865 * live on this newblk. 
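 * If DEPCOMPLETE is clear the pointer to this block has not yet been
 * written, so every indirdep hanging off the newblk is trimmed below;
 * otherwise only the indirdep found above needs it.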
5866 */ 5867 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 5868 newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk); 5869 LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next) 5870 trunc_indirdep(indirn, freeblks, bp, off); 5871 } else 5872 trunc_indirdep(indirdep, freeblks, bp, off); 5873 FREE_LOCK(&lk); 5874 /* 5875 * Creation is protected by the buf lock. The saveddata is only 5876 * needed if a full truncation follows a partial truncation but it 5877 * is difficult to allocate in that case so we fetch it anyway. 5878 */ 5879 if (indirdep->ir_saveddata == NULL) 5880 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, 5881 M_SOFTDEP_FLAGS); 5882 nowork: 5883 /* Fetch the blkno of the child and the zero start offset. */ 5884 if (ip->i_ump->um_fstype == UFS1) { 5885 blkno = ((ufs1_daddr_t *)bp->b_data)[off]; 5886 start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1]; 5887 } else { 5888 blkno = ((ufs2_daddr_t *)bp->b_data)[off]; 5889 start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1]; 5890 } 5891 if (freework) { 5892 /* Zero the truncated pointers. */ 5893 end = bp->b_data + bp->b_bcount; 5894 bzero(start, end - start); 5895 bdwrite(bp); 5896 } else 5897 bqrelse(bp); 5898 if (level == 0) 5899 return (0); 5900 lbn++; /* adjust level */ 5901 lbn -= (off * lbnadd); 5902 return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno); 5903 } 5904 5905 /* 5906 * Complete the partial truncation of an indirect block setup by 5907 * setup_trunc_indir(). This zeros the truncated pointers in the saved 5908 * copy and writes them to disk before the freeblks is allowed to complete. 5909 */ 5910 static void 5911 complete_trunc_indir(freework) 5912 struct freework *freework; 5913 { 5914 struct freework *fwn; 5915 struct indirdep *indirdep; 5916 struct buf *bp; 5917 uintptr_t start; 5918 int count; 5919 5920 indirdep = freework->fw_indir; 5921 for (;;) { 5922 bp = indirdep->ir_bp; 5923 /* See if the block was discarded. */ 5924 if (bp == NULL) 5925 break; 5926 /* Inline part of getdirtybuf(). We dont want bremfree. */ 5927 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) 5928 break; 5929 if (BUF_LOCK(bp, 5930 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, &lk) == 0) 5931 BUF_UNLOCK(bp); 5932 ACQUIRE_LOCK(&lk); 5933 } 5934 mtx_assert(&lk, MA_OWNED); 5935 freework->fw_state |= DEPCOMPLETE; 5936 TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next); 5937 /* 5938 * Zero the pointers in the saved copy. 5939 */ 5940 if (indirdep->ir_state & UFS1FMT) 5941 start = sizeof(ufs1_daddr_t); 5942 else 5943 start = sizeof(ufs2_daddr_t); 5944 start *= freework->fw_start; 5945 count = indirdep->ir_savebp->b_bcount - start; 5946 start += (uintptr_t)indirdep->ir_savebp->b_data; 5947 bzero((char *)start, count); 5948 /* 5949 * We need to start the next truncation in the list if it has not 5950 * been started yet. 5951 */ 5952 fwn = TAILQ_FIRST(&indirdep->ir_trunc); 5953 if (fwn != NULL) { 5954 if (fwn->fw_freeblks == indirdep->ir_freeblks) 5955 TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next); 5956 if ((fwn->fw_state & ONWORKLIST) == 0) 5957 freework_enqueue(fwn); 5958 } 5959 /* 5960 * If bp is NULL the block was fully truncated, restore 5961 * the saved block list otherwise free it if it is no 5962 * longer needed. 
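 * Either way, the saved data is released only once no further
 * truncations remain queued on this indirdep.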
5963 */ 5964 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 5965 if (bp == NULL) 5966 bcopy(indirdep->ir_saveddata, 5967 indirdep->ir_savebp->b_data, 5968 indirdep->ir_savebp->b_bcount); 5969 free(indirdep->ir_saveddata, M_INDIRDEP); 5970 indirdep->ir_saveddata = NULL; 5971 } 5972 /* 5973 * When bp is NULL there is a full truncation pending. We 5974 * must wait for this full truncation to be journaled before 5975 * we can release this freework because the disk pointers will 5976 * never be written as zero. 5977 */ 5978 if (bp == NULL) { 5979 if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd)) 5980 handle_written_freework(freework); 5981 else 5982 WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd, 5983 &freework->fw_list); 5984 } else { 5985 /* Complete when the real copy is written. */ 5986 WORKLIST_INSERT(&bp->b_dep, &freework->fw_list); 5987 BUF_UNLOCK(bp); 5988 } 5989 } 5990 5991 /* 5992 * Calculate the number of blocks we are going to release where datablocks 5993 * is the current total and length is the new file size. 5994 */ 5995 ufs2_daddr_t 5996 blkcount(fs, datablocks, length) 5997 struct fs *fs; 5998 ufs2_daddr_t datablocks; 5999 off_t length; 6000 { 6001 off_t totblks, numblks; 6002 6003 totblks = 0; 6004 numblks = howmany(length, fs->fs_bsize); 6005 if (numblks <= NDADDR) { 6006 totblks = howmany(length, fs->fs_fsize); 6007 goto out; 6008 } 6009 totblks = blkstofrags(fs, numblks); 6010 numblks -= NDADDR; 6011 /* 6012 * Count all single, then double, then triple indirects required. 6013 * Subtracting one indirects worth of blocks for each pass 6014 * acknowledges one of each pointed to by the inode. 6015 */ 6016 for (;;) { 6017 totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs))); 6018 numblks -= NINDIR(fs); 6019 if (numblks <= 0) 6020 break; 6021 numblks = howmany(numblks, NINDIR(fs)); 6022 } 6023 out: 6024 totblks = fsbtodb(fs, totblks); 6025 /* 6026 * Handle sparse files. We can't reclaim more blocks than the inode 6027 * references. We will correct it later in handle_complete_freeblks() 6028 * when we know the real count. 6029 */ 6030 if (totblks > datablocks) 6031 return (0); 6032 return (datablocks - totblks); 6033 } 6034 6035 /* 6036 * Handle freeblocks for journaled softupdate filesystems. 6037 * 6038 * Contrary to normal softupdates, we must preserve the block pointers in 6039 * indirects until their subordinates are free. This is to avoid journaling 6040 * every block that is freed which may consume more space than the journal 6041 * itself. The recovery program will see the free block journals at the 6042 * base of the truncated area and traverse them to reclaim space. The 6043 * pointers in the inode may be cleared immediately after the journal 6044 * records are written because each direct and indirect pointer in the 6045 * inode is recorded in a journal. This permits full truncation to proceed 6046 * asynchronously. The write order is journal -> inode -> cgs -> indirects. 6047 * 6048 * The algorithm is as follows: 6049 * 1) Traverse the in-memory state and create journal entries to release 6050 * the relevant blocks and full indirect trees. 6051 * 2) Traverse the indirect block chain adding partial truncation freework 6052 * records to indirects in the path to lastlbn. The freework will 6053 * prevent new allocation dependencies from being satisfied in this 6054 * indirect until the truncation completes. 6055 * 3) Read and lock the inode block, performing an update with the new size 6056 * and pointers. 
This prevents truncated data from becoming valid on
 *    disk through step 4.
 * 4) Reap unsatisfied dependencies that are beyond the truncated area,
 *    eliminate journal work for those records that do not require it.
 * 5) Schedule the journal records to be written followed by the inode block.
 * 6) Allocate any necessary frags for the end of file.
 * 7) Zero any partially truncated blocks.
 *
 * From this point truncation proceeds asynchronously using the freework and
 * indir_trunc machinery.  The file will not be extended again into a
 * partially truncated indirect block until all work is completed but
 * the normal dependency mechanism ensures that it is rolled back/forward
 * as appropriate.  Further truncation may occur without delay and is
 * serialized in indir_trunc().
 */
void
softdep_journal_freeblocks(ip, cred, length, flags)
        struct inode *ip;       /* The inode whose length is to be reduced */
        struct ucred *cred;
        off_t length;           /* The new length for the file */
        int flags;              /* IO_EXT and/or IO_NORMAL */
{
        struct freeblks *freeblks, *fbn;
        struct inodedep *inodedep;
        struct jblkdep *jblkdep;
        struct allocdirect *adp, *adpn;
        struct fs *fs;
        struct buf *bp;
        struct vnode *vp;
        struct mount *mp;
        ufs2_daddr_t extblocks, datablocks;
        ufs_lbn_t tmpval, lbn, lastlbn;
        int frags;
        int lastoff, iboff;
        int allocblock;
        int error, i;
        int needj;

        fs = ip->i_fs;
        mp = UFSTOVFS(ip->i_ump);
        vp = ITOV(ip);
        needj = 1;
        iboff = -1;
        allocblock = 0;
        extblocks = 0;
        datablocks = 0;
        frags = 0;
        freeblks = newfreeblks(mp, ip);
        ACQUIRE_LOCK(&lk);
        /*
         * If we're truncating a removed file that will never be written
         * we don't need to journal the block frees.  The canceled journals
         * for the allocations will suffice.
         */
        inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
        if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
            length == 0)
                needj = 0;
        FREE_LOCK(&lk);
        /*
         * Calculate the lbn that we are truncating to.  This results in -1
         * if we're truncating to 0 bytes.  So it is the last lbn we want
         * to keep, not the first lbn we want to truncate.
         */
        lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
        lastoff = blkoff(fs, length);
        /*
         * Compute frags we are keeping in lastlbn.  0 means all.
         */
        if (lastlbn >= 0 && lastlbn < NDADDR) {
                frags = fragroundup(fs, lastoff);
                /* adp offset of last valid allocdirect. */
                iboff = lastlbn;
        } else if (lastlbn > 0)
                iboff = NDADDR;
        if (fs->fs_magic == FS_UFS2_MAGIC)
                extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
        /*
         * Handle normal data blocks and indirects.  This section saves
         * values used after the inode update to complete frag and indirect
         * truncation.
         */
        if ((flags & IO_NORMAL) != 0) {
                /*
                 * Handle truncation of whole direct and indirect blocks.
                 */
                for (i = iboff + 1; i < NDADDR; i++)
                        setup_freedirect(freeblks, ip, i, needj);
                for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
                    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
                        /* Release a whole indirect tree.
*/ 6147 if (lbn > lastlbn) { 6148 setup_freeindir(freeblks, ip, i, -lbn -i, 6149 needj); 6150 continue; 6151 } 6152 iboff = i + NDADDR; 6153 /* 6154 * Traverse partially truncated indirect tree. 6155 */ 6156 if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn) 6157 setup_trunc_indir(freeblks, ip, -lbn - i, 6158 lastlbn, DIP(ip, i_ib[i])); 6159 } 6160 /* 6161 * Handle partial truncation to a frag boundary. 6162 */ 6163 if (frags) { 6164 ufs2_daddr_t blkno; 6165 long oldfrags; 6166 6167 oldfrags = blksize(fs, ip, lastlbn); 6168 blkno = DIP(ip, i_db[lastlbn]); 6169 if (blkno && oldfrags != frags) { 6170 oldfrags -= frags; 6171 oldfrags = numfrags(ip->i_fs, oldfrags); 6172 blkno += numfrags(ip->i_fs, frags); 6173 newfreework(ip->i_ump, freeblks, NULL, lastlbn, 6174 blkno, oldfrags, 0, needj); 6175 } else if (blkno == 0) 6176 allocblock = 1; 6177 } 6178 /* 6179 * Add a journal record for partial truncate if we are 6180 * handling indirect blocks. Non-indirects need no extra 6181 * journaling. 6182 */ 6183 if (length != 0 && lastlbn >= NDADDR) { 6184 ip->i_flag |= IN_TRUNCATED; 6185 newjtrunc(freeblks, length, 0); 6186 } 6187 ip->i_size = length; 6188 DIP_SET(ip, i_size, ip->i_size); 6189 datablocks = DIP(ip, i_blocks) - extblocks; 6190 if (length != 0) 6191 datablocks = blkcount(ip->i_fs, datablocks, length); 6192 freeblks->fb_len = length; 6193 } 6194 if ((flags & IO_EXT) != 0) { 6195 for (i = 0; i < NXADDR; i++) 6196 setup_freeext(freeblks, ip, i, needj); 6197 ip->i_din2->di_extsize = 0; 6198 datablocks += extblocks; 6199 } 6200 #ifdef QUOTA 6201 /* Reference the quotas in case the block count is wrong in the end. */ 6202 quotaref(vp, freeblks->fb_quota); 6203 (void) chkdq(ip, -datablocks, NOCRED, 0); 6204 #endif 6205 freeblks->fb_chkcnt = -datablocks; 6206 UFS_LOCK(ip->i_ump); 6207 fs->fs_pendingblocks += datablocks; 6208 UFS_UNLOCK(ip->i_ump); 6209 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); 6210 /* 6211 * Handle truncation of incomplete alloc direct dependencies. We 6212 * hold the inode block locked to prevent incomplete dependencies 6213 * from reaching the disk while we are eliminating those that 6214 * have been truncated. This is a partially inlined ffs_update(). 6215 */ 6216 ufs_itimes(vp); 6217 ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); 6218 error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 6219 (int)fs->fs_bsize, cred, &bp); 6220 if (error) { 6221 brelse(bp); 6222 softdep_error("softdep_journal_freeblocks", error); 6223 return; 6224 } 6225 if (bp->b_bufsize == fs->fs_bsize) 6226 bp->b_flags |= B_CLUSTEROK; 6227 softdep_update_inodeblock(ip, bp, 0); 6228 if (ip->i_ump->um_fstype == UFS1) 6229 *((struct ufs1_dinode *)bp->b_data + 6230 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; 6231 else 6232 *((struct ufs2_dinode *)bp->b_data + 6233 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; 6234 ACQUIRE_LOCK(&lk); 6235 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 6236 if ((inodedep->id_state & IOSTARTED) != 0) 6237 panic("softdep_setup_freeblocks: inode busy"); 6238 /* 6239 * Add the freeblks structure to the list of operations that 6240 * must await the zero'ed inode being written to disk. If we 6241 * still have a bitmap dependency (needj), then the inode 6242 * has never been written to disk, so we can process the 6243 * freeblks below once we have deleted the dependencies. 
6244 */ 6245 if (needj) 6246 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); 6247 else 6248 freeblks->fb_state |= COMPLETE; 6249 if ((flags & IO_NORMAL) != 0) { 6250 TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) { 6251 if (adp->ad_offset > iboff) 6252 cancel_allocdirect(&inodedep->id_inoupdt, adp, 6253 freeblks); 6254 /* 6255 * Truncate the allocdirect. We could eliminate 6256 * or modify journal records as well. 6257 */ 6258 else if (adp->ad_offset == iboff && frags) 6259 adp->ad_newsize = frags; 6260 } 6261 } 6262 if ((flags & IO_EXT) != 0) 6263 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) 6264 cancel_allocdirect(&inodedep->id_extupdt, adp, 6265 freeblks); 6266 /* 6267 * Add journal work. 6268 */ 6269 LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) 6270 add_to_journal(&jblkdep->jb_list); 6271 FREE_LOCK(&lk); 6272 bdwrite(bp); 6273 /* 6274 * Truncate dependency structures beyond length. 6275 */ 6276 trunc_dependencies(ip, freeblks, lastlbn, frags, flags); 6277 /* 6278 * This is only set when we need to allocate a fragment because 6279 * none existed at the end of a frag-sized file. It handles only 6280 * allocating a new, zero filled block. 6281 */ 6282 if (allocblock) { 6283 ip->i_size = length - lastoff; 6284 DIP_SET(ip, i_size, ip->i_size); 6285 error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp); 6286 if (error != 0) { 6287 softdep_error("softdep_journal_freeblks", error); 6288 return; 6289 } 6290 ip->i_size = length; 6291 DIP_SET(ip, i_size, length); 6292 ip->i_flag |= IN_CHANGE | IN_UPDATE; 6293 allocbuf(bp, frags); 6294 ffs_update(vp, MNT_NOWAIT); 6295 bawrite(bp); 6296 } else if (lastoff != 0 && vp->v_type != VDIR) { 6297 int size; 6298 6299 /* 6300 * Zero the end of a truncated frag or block. 6301 */ 6302 size = sblksize(fs, length, lastlbn); 6303 error = bread(vp, lastlbn, size, cred, &bp); 6304 if (error) { 6305 softdep_error("softdep_journal_freeblks", error); 6306 return; 6307 } 6308 bzero((char *)bp->b_data + lastoff, size - lastoff); 6309 bawrite(bp); 6310 6311 } 6312 ACQUIRE_LOCK(&lk); 6313 inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 6314 TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next); 6315 freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST; 6316 /* 6317 * We zero earlier truncations so they don't erroneously 6318 * update i_blocks. 6319 */ 6320 if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0) 6321 TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next) 6322 fbn->fb_len = 0; 6323 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE && 6324 LIST_EMPTY(&freeblks->fb_jblkdephd)) 6325 freeblks->fb_state |= INPROGRESS; 6326 else 6327 freeblks = NULL; 6328 FREE_LOCK(&lk); 6329 if (freeblks) 6330 handle_workitem_freeblocks(freeblks, 0); 6331 trunc_pages(ip, length, extblocks, flags); 6332 6333 } 6334 6335 /* 6336 * Flush a JOP_SYNC to the journal. 6337 */ 6338 void 6339 softdep_journal_fsync(ip) 6340 struct inode *ip; 6341 { 6342 struct jfsync *jfsync; 6343 6344 if ((ip->i_flag & IN_TRUNCATED) == 0) 6345 return; 6346 ip->i_flag &= ~IN_TRUNCATED; 6347 jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO); 6348 workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump)); 6349 jfsync->jfs_size = ip->i_size; 6350 jfsync->jfs_ino = ip->i_number; 6351 ACQUIRE_LOCK(&lk); 6352 add_to_journal(&jfsync->jfs_list); 6353 jwait(&jfsync->jfs_list, MNT_WAIT); 6354 FREE_LOCK(&lk); 6355 } 6356 6357 /* 6358 * Block de-allocation dependencies. 
6359 * 6360 * When blocks are de-allocated, the on-disk pointers must be nullified before 6361 * the blocks are made available for use by other files. (The true 6362 * requirement is that old pointers must be nullified before new on-disk 6363 * pointers are set. We chose this slightly more stringent requirement to 6364 * reduce complexity.) Our implementation handles this dependency by updating 6365 * the inode (or indirect block) appropriately but delaying the actual block 6366 * de-allocation (i.e., freemap and free space count manipulation) until 6367 * after the updated versions reach stable storage. After the disk is 6368 * updated, the blocks can be safely de-allocated whenever it is convenient. 6369 * This implementation handles only the common case of reducing a file's 6370 * length to zero. Other cases are handled by the conventional synchronous 6371 * write approach. 6372 * 6373 * The ffs implementation with which we worked double-checks 6374 * the state of the block pointers and file size as it reduces 6375 * a file's length. Some of this code is replicated here in our 6376 * soft updates implementation. The freeblks->fb_chkcnt field is 6377 * used to transfer a part of this information to the procedure 6378 * that eventually de-allocates the blocks. 6379 * 6380 * This routine should be called from the routine that shortens 6381 * a file's length, before the inode's size or block pointers 6382 * are modified. It will save the block pointer information for 6383 * later release and zero the inode so that the calling routine 6384 * can release it. 6385 */ 6386 void 6387 softdep_setup_freeblocks(ip, length, flags) 6388 struct inode *ip; /* The inode whose length is to be reduced */ 6389 off_t length; /* The new length for the file */ 6390 int flags; /* IO_EXT and/or IO_NORMAL */ 6391 { 6392 struct ufs1_dinode *dp1; 6393 struct ufs2_dinode *dp2; 6394 struct freeblks *freeblks; 6395 struct inodedep *inodedep; 6396 struct allocdirect *adp; 6397 struct buf *bp; 6398 struct fs *fs; 6399 ufs2_daddr_t extblocks, datablocks; 6400 struct mount *mp; 6401 int i, delay, error; 6402 ufs_lbn_t tmpval; 6403 ufs_lbn_t lbn; 6404 6405 fs = ip->i_fs; 6406 mp = UFSTOVFS(ip->i_ump); 6407 if (length != 0) 6408 panic("softdep_setup_freeblocks: non-zero length"); 6409 freeblks = newfreeblks(mp, ip); 6410 extblocks = 0; 6411 datablocks = 0; 6412 if (fs->fs_magic == FS_UFS2_MAGIC) 6413 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); 6414 if ((flags & IO_NORMAL) != 0) { 6415 for (i = 0; i < NDADDR; i++) 6416 setup_freedirect(freeblks, ip, i, 0); 6417 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; 6418 i++, lbn += tmpval, tmpval *= NINDIR(fs)) 6419 setup_freeindir(freeblks, ip, i, -lbn -i, 0); 6420 ip->i_size = 0; 6421 DIP_SET(ip, i_size, 0); 6422 datablocks = DIP(ip, i_blocks) - extblocks; 6423 } 6424 if ((flags & IO_EXT) != 0) { 6425 for (i = 0; i < NXADDR; i++) 6426 setup_freeext(freeblks, ip, i, 0); 6427 ip->i_din2->di_extsize = 0; 6428 datablocks += extblocks; 6429 } 6430 #ifdef QUOTA 6431 /* Reference the quotas in case the block count is wrong in the end. */ 6432 quotaref(ITOV(ip), freeblks->fb_quota); 6433 (void) chkdq(ip, -datablocks, NOCRED, 0); 6434 #endif 6435 freeblks->fb_chkcnt = -datablocks; 6436 UFS_LOCK(ip->i_ump); 6437 fs->fs_pendingblocks += datablocks; 6438 UFS_UNLOCK(ip->i_ump); 6439 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); 6440 /* 6441 * Push the zero'ed inode to to its disk buffer so that we are free 6442 * to delete its dependencies below. 
Once the dependencies are gone 6443 * the buffer can be safely released. 6444 */ 6445 if ((error = bread(ip->i_devvp, 6446 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 6447 (int)fs->fs_bsize, NOCRED, &bp)) != 0) { 6448 brelse(bp); 6449 softdep_error("softdep_setup_freeblocks", error); 6450 } 6451 if (ip->i_ump->um_fstype == UFS1) { 6452 dp1 = ((struct ufs1_dinode *)bp->b_data + 6453 ino_to_fsbo(fs, ip->i_number)); 6454 ip->i_din1->di_freelink = dp1->di_freelink; 6455 *dp1 = *ip->i_din1; 6456 } else { 6457 dp2 = ((struct ufs2_dinode *)bp->b_data + 6458 ino_to_fsbo(fs, ip->i_number)); 6459 ip->i_din2->di_freelink = dp2->di_freelink; 6460 *dp2 = *ip->i_din2; 6461 } 6462 /* 6463 * Find and eliminate any inode dependencies. 6464 */ 6465 ACQUIRE_LOCK(&lk); 6466 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 6467 if ((inodedep->id_state & IOSTARTED) != 0) 6468 panic("softdep_setup_freeblocks: inode busy"); 6469 /* 6470 * Add the freeblks structure to the list of operations that 6471 * must await the zero'ed inode being written to disk. If we 6472 * still have a bitmap dependency (delay == 0), then the inode 6473 * has never been written to disk, so we can process the 6474 * freeblks below once we have deleted the dependencies. 6475 */ 6476 delay = (inodedep->id_state & DEPCOMPLETE); 6477 if (delay) 6478 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); 6479 else 6480 freeblks->fb_state |= COMPLETE; 6481 /* 6482 * Because the file length has been truncated to zero, any 6483 * pending block allocation dependency structures associated 6484 * with this inode are obsolete and can simply be de-allocated. 6485 * We must first merge the two dependency lists to get rid of 6486 * any duplicate freefrag structures, then purge the merged list. 6487 * If we still have a bitmap dependency, then the inode has never 6488 * been written to disk, so we can free any fragments without delay. 6489 */ 6490 if (flags & IO_NORMAL) { 6491 merge_inode_lists(&inodedep->id_newinoupdt, 6492 &inodedep->id_inoupdt); 6493 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) 6494 cancel_allocdirect(&inodedep->id_inoupdt, adp, 6495 freeblks); 6496 } 6497 if (flags & IO_EXT) { 6498 merge_inode_lists(&inodedep->id_newextupdt, 6499 &inodedep->id_extupdt); 6500 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) 6501 cancel_allocdirect(&inodedep->id_extupdt, adp, 6502 freeblks); 6503 } 6504 FREE_LOCK(&lk); 6505 bdwrite(bp); 6506 trunc_dependencies(ip, freeblks, -1, 0, flags); 6507 ACQUIRE_LOCK(&lk); 6508 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 6509 (void) free_inodedep(inodedep); 6510 freeblks->fb_state |= DEPCOMPLETE; 6511 /* 6512 * If the inode with zeroed block pointers is now on disk 6513 * we can start freeing blocks. 6514 */ 6515 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 6516 freeblks->fb_state |= INPROGRESS; 6517 else 6518 freeblks = NULL; 6519 FREE_LOCK(&lk); 6520 if (freeblks) 6521 handle_workitem_freeblocks(freeblks, 0); 6522 trunc_pages(ip, length, extblocks, flags); 6523 } 6524 6525 /* 6526 * Eliminate pages from the page cache that back parts of this inode and 6527 * adjust the vnode pager's idea of our size. This prevents stale data 6528 * from hanging around in the page cache. 
6529 */ 6530 static void 6531 trunc_pages(ip, length, extblocks, flags) 6532 struct inode *ip; 6533 off_t length; 6534 ufs2_daddr_t extblocks; 6535 int flags; 6536 { 6537 struct vnode *vp; 6538 struct fs *fs; 6539 ufs_lbn_t lbn; 6540 off_t end, extend; 6541 6542 vp = ITOV(ip); 6543 fs = ip->i_fs; 6544 extend = OFF_TO_IDX(lblktosize(fs, -extblocks)); 6545 if ((flags & IO_EXT) != 0) 6546 vn_pages_remove(vp, extend, 0); 6547 if ((flags & IO_NORMAL) == 0) 6548 return; 6549 BO_LOCK(&vp->v_bufobj); 6550 drain_output(vp); 6551 BO_UNLOCK(&vp->v_bufobj); 6552 /* 6553 * The vnode pager eliminates file pages we eliminate indirects 6554 * below. 6555 */ 6556 vnode_pager_setsize(vp, length); 6557 /* 6558 * Calculate the end based on the last indirect we want to keep. If 6559 * the block extends into indirects we can just use the negative of 6560 * its lbn. Doubles and triples exist at lower numbers so we must 6561 * be careful not to remove those, if they exist. double and triple 6562 * indirect lbns do not overlap with others so it is not important 6563 * to verify how many levels are required. 6564 */ 6565 lbn = lblkno(fs, length); 6566 if (lbn >= NDADDR) { 6567 /* Calculate the virtual lbn of the triple indirect. */ 6568 lbn = -lbn - (NIADDR - 1); 6569 end = OFF_TO_IDX(lblktosize(fs, lbn)); 6570 } else 6571 end = extend; 6572 vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end); 6573 } 6574 6575 /* 6576 * See if the buf bp is in the range eliminated by truncation. 6577 */ 6578 static int 6579 trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags) 6580 struct buf *bp; 6581 int *blkoffp; 6582 ufs_lbn_t lastlbn; 6583 int lastoff; 6584 int flags; 6585 { 6586 ufs_lbn_t lbn; 6587 6588 *blkoffp = 0; 6589 /* Only match ext/normal blocks as appropriate. */ 6590 if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || 6591 ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0)) 6592 return (0); 6593 /* ALTDATA is always a full truncation. */ 6594 if ((bp->b_xflags & BX_ALTDATA) != 0) 6595 return (1); 6596 /* -1 is full truncation. */ 6597 if (lastlbn == -1) 6598 return (1); 6599 /* 6600 * If this is a partial truncate we only want those 6601 * blocks and indirect blocks that cover the range 6602 * we're after. 6603 */ 6604 lbn = bp->b_lblkno; 6605 if (lbn < 0) 6606 lbn = -(lbn + lbn_level(lbn)); 6607 if (lbn < lastlbn) 6608 return (0); 6609 /* Here we only truncate lblkno if it's partial. */ 6610 if (lbn == lastlbn) { 6611 if (lastoff == 0) 6612 return (0); 6613 *blkoffp = lastoff; 6614 } 6615 return (1); 6616 } 6617 6618 /* 6619 * Eliminate any dependencies that exist in memory beyond lblkno:off 6620 */ 6621 static void 6622 trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags) 6623 struct inode *ip; 6624 struct freeblks *freeblks; 6625 ufs_lbn_t lastlbn; 6626 int lastoff; 6627 int flags; 6628 { 6629 struct bufobj *bo; 6630 struct vnode *vp; 6631 struct buf *bp; 6632 struct fs *fs; 6633 int blkoff; 6634 6635 /* 6636 * We must wait for any I/O in progress to finish so that 6637 * all potential buffers on the dirty list will be visible. 6638 * Once they are all there, walk the list and get rid of 6639 * any dependencies. 
6640 */ 6641 fs = ip->i_fs; 6642 vp = ITOV(ip); 6643 bo = &vp->v_bufobj; 6644 BO_LOCK(bo); 6645 drain_output(vp); 6646 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) 6647 bp->b_vflags &= ~BV_SCANNED; 6648 restart: 6649 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { 6650 if (bp->b_vflags & BV_SCANNED) 6651 continue; 6652 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { 6653 bp->b_vflags |= BV_SCANNED; 6654 continue; 6655 } 6656 if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL) 6657 goto restart; 6658 BO_UNLOCK(bo); 6659 if (deallocate_dependencies(bp, freeblks, blkoff)) 6660 bqrelse(bp); 6661 else 6662 brelse(bp); 6663 BO_LOCK(bo); 6664 goto restart; 6665 } 6666 /* 6667 * Now do the work of vtruncbuf while also matching indirect blocks. 6668 */ 6669 TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) 6670 bp->b_vflags &= ~BV_SCANNED; 6671 cleanrestart: 6672 TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) { 6673 if (bp->b_vflags & BV_SCANNED) 6674 continue; 6675 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { 6676 bp->b_vflags |= BV_SCANNED; 6677 continue; 6678 } 6679 if (BUF_LOCK(bp, 6680 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 6681 BO_MTX(bo)) == ENOLCK) { 6682 BO_LOCK(bo); 6683 goto cleanrestart; 6684 } 6685 bp->b_vflags |= BV_SCANNED; 6686 BO_LOCK(bo); 6687 bremfree(bp); 6688 BO_UNLOCK(bo); 6689 if (blkoff != 0) { 6690 allocbuf(bp, blkoff); 6691 bqrelse(bp); 6692 } else { 6693 bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF; 6694 brelse(bp); 6695 } 6696 BO_LOCK(bo); 6697 goto cleanrestart; 6698 } 6699 drain_output(vp); 6700 BO_UNLOCK(bo); 6701 } 6702 6703 static int 6704 cancel_pagedep(pagedep, freeblks, blkoff) 6705 struct pagedep *pagedep; 6706 struct freeblks *freeblks; 6707 int blkoff; 6708 { 6709 struct jremref *jremref; 6710 struct jmvref *jmvref; 6711 struct dirrem *dirrem, *tmp; 6712 int i; 6713 6714 /* 6715 * Copy any directory remove dependencies to the list 6716 * to be processed after the freeblks proceeds. If 6717 * directory entry never made it to disk they 6718 * can be dumped directly onto the work list. 6719 */ 6720 LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) { 6721 /* Skip this directory removal if it is intended to remain. */ 6722 if (dirrem->dm_offset < blkoff) 6723 continue; 6724 /* 6725 * If there are any dirrems we wait for the journal write 6726 * to complete and then restart the buf scan as the lock 6727 * has been dropped. 6728 */ 6729 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) { 6730 jwait(&jremref->jr_list, MNT_WAIT); 6731 return (ERESTART); 6732 } 6733 LIST_REMOVE(dirrem, dm_next); 6734 dirrem->dm_dirinum = pagedep->pd_ino; 6735 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list); 6736 } 6737 while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) { 6738 jwait(&jmvref->jm_list, MNT_WAIT); 6739 return (ERESTART); 6740 } 6741 /* 6742 * When we're partially truncating a pagedep we just want to flush 6743 * journal entries and return. There can not be any adds in the 6744 * truncated portion of the directory and newblk must remain if 6745 * part of the block remains. 
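 * The diradd scans below are sanity checks: an add found past the
 * truncation point indicates corrupted dependency state and triggers
 * a panic.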
6746 */ 6747 if (blkoff != 0) { 6748 struct diradd *dap; 6749 6750 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 6751 if (dap->da_offset > blkoff) 6752 panic("cancel_pagedep: diradd %p off %d > %d", 6753 dap, dap->da_offset, blkoff); 6754 for (i = 0; i < DAHASHSZ; i++) 6755 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) 6756 if (dap->da_offset > blkoff) 6757 panic("cancel_pagedep: diradd %p off %d > %d", 6758 dap, dap->da_offset, blkoff); 6759 return (0); 6760 } 6761 /* 6762 * There should be no directory add dependencies present 6763 * as the directory could not be truncated until all 6764 * children were removed. 6765 */ 6766 KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, 6767 ("deallocate_dependencies: pendinghd != NULL")); 6768 for (i = 0; i < DAHASHSZ; i++) 6769 KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, 6770 ("deallocate_dependencies: diraddhd != NULL")); 6771 if ((pagedep->pd_state & NEWBLOCK) != 0) 6772 free_newdirblk(pagedep->pd_newdirblk); 6773 if (free_pagedep(pagedep) == 0) 6774 panic("Failed to free pagedep %p", pagedep); 6775 return (0); 6776 } 6777 6778 /* 6779 * Reclaim any dependency structures from a buffer that is about to 6780 * be reallocated to a new vnode. The buffer must be locked, thus, 6781 * no I/O completion operations can occur while we are manipulating 6782 * its associated dependencies. The mutex is held so that other I/O's 6783 * associated with related dependencies do not occur. 6784 */ 6785 static int 6786 deallocate_dependencies(bp, freeblks, off) 6787 struct buf *bp; 6788 struct freeblks *freeblks; 6789 int off; 6790 { 6791 struct indirdep *indirdep; 6792 struct pagedep *pagedep; 6793 struct allocdirect *adp; 6794 struct worklist *wk, *wkn; 6795 6796 ACQUIRE_LOCK(&lk); 6797 LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) { 6798 switch (wk->wk_type) { 6799 case D_INDIRDEP: 6800 indirdep = WK_INDIRDEP(wk); 6801 if (bp->b_lblkno >= 0 || 6802 bp->b_blkno != indirdep->ir_savebp->b_lblkno) 6803 panic("deallocate_dependencies: not indir"); 6804 cancel_indirdep(indirdep, bp, freeblks); 6805 continue; 6806 6807 case D_PAGEDEP: 6808 pagedep = WK_PAGEDEP(wk); 6809 if (cancel_pagedep(pagedep, freeblks, off)) { 6810 FREE_LOCK(&lk); 6811 return (ERESTART); 6812 } 6813 continue; 6814 6815 case D_ALLOCINDIR: 6816 /* 6817 * Simply remove the allocindir, we'll find it via 6818 * the indirdep where we can clear pointers if 6819 * needed. 6820 */ 6821 WORKLIST_REMOVE(wk); 6822 continue; 6823 6824 case D_FREEWORK: 6825 /* 6826 * A truncation is waiting for the zero'd pointers 6827 * to be written. It can be freed when the freeblks 6828 * is journaled. 6829 */ 6830 WORKLIST_REMOVE(wk); 6831 wk->wk_state |= ONDEPLIST; 6832 WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); 6833 break; 6834 6835 case D_ALLOCDIRECT: 6836 adp = WK_ALLOCDIRECT(wk); 6837 if (off != 0) 6838 continue; 6839 /* FALLTHROUGH */ 6840 default: 6841 panic("deallocate_dependencies: Unexpected type %s", 6842 TYPENAME(wk->wk_type)); 6843 /* NOTREACHED */ 6844 } 6845 } 6846 FREE_LOCK(&lk); 6847 /* 6848 * Don't throw away this buf, we were partially truncating and 6849 * some deps may always remain. 6850 */ 6851 if (off) { 6852 allocbuf(bp, off); 6853 bp->b_vflags |= BV_SCANNED; 6854 return (EBUSY); 6855 } 6856 bp->b_flags |= B_INVAL | B_NOCACHE; 6857 6858 return (0); 6859 } 6860 6861 /* 6862 * An allocdirect is being canceled due to a truncate. We must make sure 6863 * the journal entry is released in concert with the blkfree that releases 6864 * the storage. 
Completed journal entries must not be released until the 6865 * space is no longer pointed to by the inode or in the bitmap. 6866 */ 6867 static void 6868 cancel_allocdirect(adphead, adp, freeblks) 6869 struct allocdirectlst *adphead; 6870 struct allocdirect *adp; 6871 struct freeblks *freeblks; 6872 { 6873 struct freework *freework; 6874 struct newblk *newblk; 6875 struct worklist *wk; 6876 6877 TAILQ_REMOVE(adphead, adp, ad_next); 6878 newblk = (struct newblk *)adp; 6879 freework = NULL; 6880 /* 6881 * Find the correct freework structure. 6882 */ 6883 LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { 6884 if (wk->wk_type != D_FREEWORK) 6885 continue; 6886 freework = WK_FREEWORK(wk); 6887 if (freework->fw_blkno == newblk->nb_newblkno) 6888 break; 6889 } 6890 if (freework == NULL) 6891 panic("cancel_allocdirect: Freework not found"); 6892 /* 6893 * If a newblk exists at all we still have the journal entry that 6894 * initiated the allocation so we do not need to journal the free. 6895 */ 6896 cancel_jfreeblk(freeblks, freework->fw_blkno); 6897 /* 6898 * If the journal hasn't been written the jnewblk must be passed 6899 * to the call to ffs_blkfree that reclaims the space. We accomplish 6900 * this by linking the journal dependency into the freework to be 6901 * freed when freework_freeblock() is called. If the journal has 6902 * been written we can simply reclaim the journal space when the 6903 * freeblks work is complete. 6904 */ 6905 freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list, 6906 &freeblks->fb_jwork); 6907 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); 6908 } 6909 6910 6911 /* 6912 * Cancel a new block allocation. May be an indirect or direct block. We 6913 * remove it from various lists and return any journal record that needs to 6914 * be resolved by the caller. 6915 * 6916 * A special consideration is made for indirects which were never pointed 6917 * at on disk and will never be found once this block is released. 6918 */ 6919 static struct jnewblk * 6920 cancel_newblk(newblk, wk, wkhd) 6921 struct newblk *newblk; 6922 struct worklist *wk; 6923 struct workhead *wkhd; 6924 { 6925 struct jnewblk *jnewblk; 6926 6927 newblk->nb_state |= GOINGAWAY; 6928 /* 6929 * Previously we traversed the completedhd on each indirdep 6930 * attached to this newblk to cancel them and gather journal 6931 * work. Since we need only the oldest journal segment and 6932 * the lowest point on the tree will always have the oldest 6933 * journal segment we are free to release the segments 6934 * of any subordinates and may leave the indirdep list to 6935 * indirdep_complete() when this newblk is freed. 6936 */ 6937 if (newblk->nb_state & ONDEPLIST) { 6938 newblk->nb_state &= ~ONDEPLIST; 6939 LIST_REMOVE(newblk, nb_deps); 6940 } 6941 if (newblk->nb_state & ONWORKLIST) 6942 WORKLIST_REMOVE(&newblk->nb_list); 6943 /* 6944 * If the journal entry hasn't been written we save a pointer to 6945 * the dependency that frees it until it is written or the 6946 * superseding operation completes. 6947 */ 6948 jnewblk = newblk->nb_jnewblk; 6949 if (jnewblk != NULL && wk != NULL) { 6950 newblk->nb_jnewblk = NULL; 6951 jnewblk->jn_dep = wk; 6952 } 6953 if (!LIST_EMPTY(&newblk->nb_jwork)) 6954 jwork_move(wkhd, &newblk->nb_jwork); 6955 /* 6956 * When truncating we must free the newdirblk early to remove 6957 * the pagedep from the hash before returning. 
6958 */ 6959 if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) 6960 free_newdirblk(WK_NEWDIRBLK(wk)); 6961 if (!LIST_EMPTY(&newblk->nb_newdirblk)) 6962 panic("cancel_newblk: extra newdirblk"); 6963 6964 return (jnewblk); 6965 } 6966 6967 /* 6968 * Schedule the freefrag associated with a newblk to be released once 6969 * the pointers are written and the previous block is no longer needed. 6970 */ 6971 static void 6972 newblk_freefrag(newblk) 6973 struct newblk *newblk; 6974 { 6975 struct freefrag *freefrag; 6976 6977 if (newblk->nb_freefrag == NULL) 6978 return; 6979 freefrag = newblk->nb_freefrag; 6980 newblk->nb_freefrag = NULL; 6981 freefrag->ff_state |= COMPLETE; 6982 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 6983 add_to_worklist(&freefrag->ff_list, 0); 6984 } 6985 6986 /* 6987 * Free a newblk. Generate a new freefrag work request if appropriate. 6988 * This must be called after the inode pointer and any direct block pointers 6989 * are valid or fully removed via truncate or frag extension. 6990 */ 6991 static void 6992 free_newblk(newblk) 6993 struct newblk *newblk; 6994 { 6995 struct indirdep *indirdep; 6996 struct worklist *wk; 6997 6998 KASSERT(newblk->nb_jnewblk == NULL, 6999 ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk)); 7000 mtx_assert(&lk, MA_OWNED); 7001 newblk_freefrag(newblk); 7002 if (newblk->nb_state & ONDEPLIST) 7003 LIST_REMOVE(newblk, nb_deps); 7004 if (newblk->nb_state & ONWORKLIST) 7005 WORKLIST_REMOVE(&newblk->nb_list); 7006 LIST_REMOVE(newblk, nb_hash); 7007 if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) 7008 free_newdirblk(WK_NEWDIRBLK(wk)); 7009 if (!LIST_EMPTY(&newblk->nb_newdirblk)) 7010 panic("free_newblk: extra newdirblk"); 7011 while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) 7012 indirdep_complete(indirdep); 7013 handle_jwork(&newblk->nb_jwork); 7014 newblk->nb_list.wk_type = D_NEWBLK; 7015 WORKITEM_FREE(newblk, D_NEWBLK); 7016 } 7017 7018 /* 7019 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. 7020 * This routine must be called with splbio interrupts blocked. 7021 */ 7022 static void 7023 free_newdirblk(newdirblk) 7024 struct newdirblk *newdirblk; 7025 { 7026 struct pagedep *pagedep; 7027 struct diradd *dap; 7028 struct worklist *wk; 7029 7030 mtx_assert(&lk, MA_OWNED); 7031 WORKLIST_REMOVE(&newdirblk->db_list); 7032 /* 7033 * If the pagedep is still linked onto the directory buffer 7034 * dependency chain, then some of the entries on the 7035 * pd_pendinghd list may not be committed to disk yet. In 7036 * this case, we will simply clear the NEWBLOCK flag and 7037 * let the pd_pendinghd list be processed when the pagedep 7038 * is next written. If the pagedep is no longer on the buffer 7039 * dependency chain, then all the entries on the pd_pending 7040 * list are committed to disk and we can free them here. 7041 */ 7042 pagedep = newdirblk->db_pagedep; 7043 pagedep->pd_state &= ~NEWBLOCK; 7044 if ((pagedep->pd_state & ONWORKLIST) == 0) { 7045 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 7046 free_diradd(dap, NULL); 7047 /* 7048 * If no dependencies remain, the pagedep will be freed. 7049 */ 7050 free_pagedep(pagedep); 7051 } 7052 /* Should only ever be one item in the list. */ 7053 while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { 7054 WORKLIST_REMOVE(wk); 7055 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 7056 } 7057 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 7058 } 7059 7060 /* 7061 * Prepare an inode to be freed. 
The actual free operation is not
 * done until the zero'ed inode has been written to disk.
 */
void
softdep_freefile(pvp, ino, mode)
        struct vnode *pvp;
        ino_t ino;
        int mode;
{
        struct inode *ip = VTOI(pvp);
        struct inodedep *inodedep;
        struct freefile *freefile;
        struct freeblks *freeblks;

        /*
         * This sets up the inode de-allocation dependency.
         */
        freefile = malloc(sizeof(struct freefile),
                M_FREEFILE, M_SOFTDEP_FLAGS);
        workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
        freefile->fx_mode = mode;
        freefile->fx_oldinum = ino;
        freefile->fx_devvp = ip->i_devvp;
        LIST_INIT(&freefile->fx_jwork);
        UFS_LOCK(ip->i_ump);
        ip->i_fs->fs_pendinginodes += 1;
        UFS_UNLOCK(ip->i_ump);

        /*
         * If the inodedep does not exist, then the zero'ed inode has
         * been written to disk. If the allocated inode has never been
         * written to disk, then the on-disk inode is zero'ed. In either
         * case we can free the file immediately. If the journal was
         * canceled before being written the inode will never make it to
         * disk and we must send the canceled journal entries to
         * ffs_freefile() to be cleared in conjunction with the bitmap.
         * Any blocks waiting on the inode to write can be safely freed
         * here as it will never be written.
         */
        ACQUIRE_LOCK(&lk);
        inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
        if (inodedep) {
                /*
                 * Clear out freeblks that no longer need to reference
                 * this inode.
                 */
                while ((freeblks =
                    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
                        TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
                            fb_next);
                        freeblks->fb_state &= ~ONDEPLIST;
                }
                /*
                 * Remove this inode from the unlinked list.
                 */
                if (inodedep->id_state & UNLINKED) {
                        /*
                         * Save the journal work to be freed with the bitmap
                         * before we clear UNLINKED.  Otherwise it can be lost
                         * if the inode block is written.
                         */
                        handle_bufwait(inodedep, &freefile->fx_jwork);
                        clear_unlinked_inodedep(inodedep);
                        /* Re-acquire inodedep as we've dropped lk. */
                        inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
                }
        }
        if (inodedep == NULL || check_inode_unwritten(inodedep)) {
                FREE_LOCK(&lk);
                handle_workitem_freefile(freefile);
                return;
        }
        if ((inodedep->id_state & DEPCOMPLETE) == 0)
                inodedep->id_state |= GOINGAWAY;
        WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
        FREE_LOCK(&lk);
        if (ip->i_number == ino)
                ip->i_flag |= IN_MODIFIED;
}

/*
 * Check to see if an inode has never been written to disk.  If
 * so free the inodedep and return success, otherwise return failure.
 * This routine must be called with splbio interrupts blocked.
 *
 * If we still have a bitmap dependency, then the inode has never
 * been written to disk.  Drop the dependency as it is no longer
 * necessary since the inode is being deallocated.  We set the
 * ALLCOMPLETE flags since the bitmap now properly shows that the
 * inode is not allocated.  Even if the inode is actively being
 * written, it has been rolled back to its zero'ed state, so we
 * are ensured that a zero inode is what is on the disk.  For short
 * lived files, this change will usually result in removing all the
 * dependencies from the inode so that it can be freed immediately.
7155 */ 7156 static int 7157 check_inode_unwritten(inodedep) 7158 struct inodedep *inodedep; 7159 { 7160 7161 mtx_assert(&lk, MA_OWNED); 7162 7163 if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || 7164 !LIST_EMPTY(&inodedep->id_dirremhd) || 7165 !LIST_EMPTY(&inodedep->id_pendinghd) || 7166 !LIST_EMPTY(&inodedep->id_bufwait) || 7167 !LIST_EMPTY(&inodedep->id_inowait) || 7168 !TAILQ_EMPTY(&inodedep->id_inoreflst) || 7169 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 7170 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 7171 !TAILQ_EMPTY(&inodedep->id_extupdt) || 7172 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 7173 !TAILQ_EMPTY(&inodedep->id_freeblklst) || 7174 inodedep->id_mkdiradd != NULL || 7175 inodedep->id_nlinkdelta != 0) 7176 return (0); 7177 /* 7178 * Another process might be in initiate_write_inodeblock_ufs[12] 7179 * trying to allocate memory without holding "Softdep Lock". 7180 */ 7181 if ((inodedep->id_state & IOSTARTED) != 0 && 7182 inodedep->id_savedino1 == NULL) 7183 return (0); 7184 7185 if (inodedep->id_state & ONDEPLIST) 7186 LIST_REMOVE(inodedep, id_deps); 7187 inodedep->id_state &= ~ONDEPLIST; 7188 inodedep->id_state |= ALLCOMPLETE; 7189 inodedep->id_bmsafemap = NULL; 7190 if (inodedep->id_state & ONWORKLIST) 7191 WORKLIST_REMOVE(&inodedep->id_list); 7192 if (inodedep->id_savedino1 != NULL) { 7193 free(inodedep->id_savedino1, M_SAVEDINO); 7194 inodedep->id_savedino1 = NULL; 7195 } 7196 if (free_inodedep(inodedep) == 0) 7197 panic("check_inode_unwritten: busy inode"); 7198 return (1); 7199 } 7200 7201 /* 7202 * Try to free an inodedep structure. Return 1 if it could be freed. 7203 */ 7204 static int 7205 free_inodedep(inodedep) 7206 struct inodedep *inodedep; 7207 { 7208 7209 mtx_assert(&lk, MA_OWNED); 7210 if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 || 7211 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || 7212 !LIST_EMPTY(&inodedep->id_dirremhd) || 7213 !LIST_EMPTY(&inodedep->id_pendinghd) || 7214 !LIST_EMPTY(&inodedep->id_bufwait) || 7215 !LIST_EMPTY(&inodedep->id_inowait) || 7216 !TAILQ_EMPTY(&inodedep->id_inoreflst) || 7217 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 7218 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 7219 !TAILQ_EMPTY(&inodedep->id_extupdt) || 7220 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 7221 !TAILQ_EMPTY(&inodedep->id_freeblklst) || 7222 inodedep->id_mkdiradd != NULL || 7223 inodedep->id_nlinkdelta != 0 || 7224 inodedep->id_savedino1 != NULL) 7225 return (0); 7226 if (inodedep->id_state & ONDEPLIST) 7227 LIST_REMOVE(inodedep, id_deps); 7228 LIST_REMOVE(inodedep, id_hash); 7229 WORKITEM_FREE(inodedep, D_INODEDEP); 7230 return (1); 7231 } 7232 7233 /* 7234 * Free the block referenced by a freework structure. The parent freeblks 7235 * structure is released and completed when the final cg bitmap reaches 7236 * the disk. This routine may be freeing a jnewblk which never made it to 7237 * disk in which case we do not have to wait as the operation is undone 7238 * in memory immediately. 7239 */ 7240 static void 7241 freework_freeblock(freework) 7242 struct freework *freework; 7243 { 7244 struct freeblks *freeblks; 7245 struct jnewblk *jnewblk; 7246 struct ufsmount *ump; 7247 struct workhead wkhd; 7248 struct fs *fs; 7249 int bsize; 7250 int needj; 7251 7252 mtx_assert(&lk, MA_OWNED); 7253 /* 7254 * Handle partial truncate separately. 
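 * A freework with a non-NULL fw_indir was created by setup_trunc_indir().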
7255 */ 7256 if (freework->fw_indir) { 7257 complete_trunc_indir(freework); 7258 return; 7259 } 7260 freeblks = freework->fw_freeblks; 7261 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7262 fs = ump->um_fs; 7263 needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0; 7264 bsize = lfragtosize(fs, freework->fw_frags); 7265 LIST_INIT(&wkhd); 7266 /* 7267 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives 7268 * on the indirblk hashtable and prevents premature freeing. 7269 */ 7270 freework->fw_state |= DEPCOMPLETE; 7271 /* 7272 * SUJ needs to wait for the segment referencing freed indirect 7273 * blocks to expire so that we know the checker will not confuse 7274 * a re-allocated indirect block with its old contents. 7275 */ 7276 if (needj && freework->fw_lbn <= -NDADDR) 7277 indirblk_insert(freework); 7278 /* 7279 * If we are canceling an existing jnewblk pass it to the free 7280 * routine, otherwise pass the freeblk which will ultimately 7281 * release the freeblks. If we're not journaling, we can just 7282 * free the freeblks immediately. 7283 */ 7284 jnewblk = freework->fw_jnewblk; 7285 if (jnewblk != NULL) { 7286 cancel_jnewblk(jnewblk, &wkhd); 7287 needj = 0; 7288 } else if (needj) { 7289 freework->fw_state |= DELAYEDFREE; 7290 freeblks->fb_cgwait++; 7291 WORKLIST_INSERT(&wkhd, &freework->fw_list); 7292 } 7293 FREE_LOCK(&lk); 7294 freeblks_free(ump, freeblks, btodb(bsize)); 7295 ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize, 7296 freeblks->fb_inum, freeblks->fb_vtype, &wkhd); 7297 ACQUIRE_LOCK(&lk); 7298 /* 7299 * The jnewblk will be discarded and the bits in the map never 7300 * made it to disk. We can immediately free the freeblk. 7301 */ 7302 if (needj == 0) 7303 handle_written_freework(freework); 7304 } 7305 7306 /* 7307 * We enqueue freework items that need processing back on the freeblks and 7308 * add the freeblks to the worklist. This makes it easier to find all work 7309 * required to flush a truncation in process_truncates(). 7310 */ 7311 static void 7312 freework_enqueue(freework) 7313 struct freework *freework; 7314 { 7315 struct freeblks *freeblks; 7316 7317 freeblks = freework->fw_freeblks; 7318 if ((freework->fw_state & INPROGRESS) == 0) 7319 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); 7320 if ((freeblks->fb_state & 7321 (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE && 7322 LIST_EMPTY(&freeblks->fb_jblkdephd)) 7323 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 7324 } 7325 7326 /* 7327 * Start, continue, or finish the process of freeing an indirect block tree. 7328 * The free operation may be paused at any point with fw_off containing the 7329 * offset to restart from. This enables us to implement some flow control 7330 * for large truncates which may fan out and generate a huge number of 7331 * dependencies. 
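 *
 * In effect fw_off is the index of the next pointer slot to visit in the
 * indirect block, so a restarted scan in indir_trunc() resumes at
 * bap1[fw_off]/bap2[fw_off] rather than at the first pointer.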
7332 */ 7333 static void 7334 handle_workitem_indirblk(freework) 7335 struct freework *freework; 7336 { 7337 struct freeblks *freeblks; 7338 struct ufsmount *ump; 7339 struct fs *fs; 7340 7341 freeblks = freework->fw_freeblks; 7342 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7343 fs = ump->um_fs; 7344 if (freework->fw_state & DEPCOMPLETE) { 7345 handle_written_freework(freework); 7346 return; 7347 } 7348 if (freework->fw_off == NINDIR(fs)) { 7349 freework_freeblock(freework); 7350 return; 7351 } 7352 freework->fw_state |= INPROGRESS; 7353 FREE_LOCK(&lk); 7354 indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), 7355 freework->fw_lbn); 7356 ACQUIRE_LOCK(&lk); 7357 } 7358 7359 /* 7360 * Called when a freework structure attached to a cg buf is written. The 7361 * ref on either the parent or the freeblks structure is released and 7362 * the freeblks is added back to the worklist if there is more work to do. 7363 */ 7364 static void 7365 handle_written_freework(freework) 7366 struct freework *freework; 7367 { 7368 struct freeblks *freeblks; 7369 struct freework *parent; 7370 7371 freeblks = freework->fw_freeblks; 7372 parent = freework->fw_parent; 7373 if (freework->fw_state & DELAYEDFREE) 7374 freeblks->fb_cgwait--; 7375 freework->fw_state |= COMPLETE; 7376 if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) 7377 WORKITEM_FREE(freework, D_FREEWORK); 7378 if (parent) { 7379 if (--parent->fw_ref == 0) 7380 freework_enqueue(parent); 7381 return; 7382 } 7383 if (--freeblks->fb_ref != 0) 7384 return; 7385 if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) == 7386 ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) 7387 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 7388 } 7389 7390 /* 7391 * This workitem routine performs the block de-allocation. 7392 * The workitem is added to the pending list after the updated 7393 * inode block has been written to disk. As mentioned above, 7394 * checks regarding the number of blocks de-allocated (compared 7395 * to the number of blocks allocated for the file) are also 7396 * performed in this function. 
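 *
 * The routine drains fb_freeworkhd and dispatches each item by type:
 * dirrems are marked complete and queued on the worklist, allocdirect and
 * allocindir records release their newblk, and freework items free the
 * underlying blocks, going through handle_workitem_indirblk() for
 * indirects.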
7397 */ 7398 static int 7399 handle_workitem_freeblocks(freeblks, flags) 7400 struct freeblks *freeblks; 7401 int flags; 7402 { 7403 struct freework *freework; 7404 struct newblk *newblk; 7405 struct allocindir *aip; 7406 struct ufsmount *ump; 7407 struct worklist *wk; 7408 7409 KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd), 7410 ("handle_workitem_freeblocks: Journal entries not written.")); 7411 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7412 ACQUIRE_LOCK(&lk); 7413 while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { 7414 WORKLIST_REMOVE(wk); 7415 switch (wk->wk_type) { 7416 case D_DIRREM: 7417 wk->wk_state |= COMPLETE; 7418 add_to_worklist(wk, 0); 7419 continue; 7420 7421 case D_ALLOCDIRECT: 7422 free_newblk(WK_NEWBLK(wk)); 7423 continue; 7424 7425 case D_ALLOCINDIR: 7426 aip = WK_ALLOCINDIR(wk); 7427 freework = NULL; 7428 if (aip->ai_state & DELAYEDFREE) { 7429 FREE_LOCK(&lk); 7430 freework = newfreework(ump, freeblks, NULL, 7431 aip->ai_lbn, aip->ai_newblkno, 7432 ump->um_fs->fs_frag, 0, 0); 7433 ACQUIRE_LOCK(&lk); 7434 } 7435 newblk = WK_NEWBLK(wk); 7436 if (newblk->nb_jnewblk) { 7437 freework->fw_jnewblk = newblk->nb_jnewblk; 7438 newblk->nb_jnewblk->jn_dep = &freework->fw_list; 7439 newblk->nb_jnewblk = NULL; 7440 } 7441 free_newblk(newblk); 7442 continue; 7443 7444 case D_FREEWORK: 7445 freework = WK_FREEWORK(wk); 7446 if (freework->fw_lbn <= -NDADDR) 7447 handle_workitem_indirblk(freework); 7448 else 7449 freework_freeblock(freework); 7450 continue; 7451 default: 7452 panic("handle_workitem_freeblocks: Unknown type %s", 7453 TYPENAME(wk->wk_type)); 7454 } 7455 } 7456 if (freeblks->fb_ref != 0) { 7457 freeblks->fb_state &= ~INPROGRESS; 7458 wake_worklist(&freeblks->fb_list); 7459 freeblks = NULL; 7460 } 7461 FREE_LOCK(&lk); 7462 if (freeblks) 7463 return handle_complete_freeblocks(freeblks, flags); 7464 return (0); 7465 } 7466 7467 /* 7468 * Handle completion of block free via truncate. This allows fs_pending 7469 * to track the actual free block count more closely than if we only updated 7470 * it at the end. We must be careful to handle cases where the block count 7471 * on free was incorrect. 7472 */ 7473 static void 7474 freeblks_free(ump, freeblks, blocks) 7475 struct ufsmount *ump; 7476 struct freeblks *freeblks; 7477 int blocks; 7478 { 7479 struct fs *fs; 7480 ufs2_daddr_t remain; 7481 7482 UFS_LOCK(ump); 7483 remain = -freeblks->fb_chkcnt; 7484 freeblks->fb_chkcnt += blocks; 7485 if (remain > 0) { 7486 if (remain < blocks) 7487 blocks = remain; 7488 fs = ump->um_fs; 7489 fs->fs_pendingblocks -= blocks; 7490 } 7491 UFS_UNLOCK(ump); 7492 } 7493 7494 /* 7495 * Once all of the freework workitems are complete we can retire the 7496 * freeblocks dependency and any journal work awaiting completion. This 7497 * can not be called until all other dependencies are stable on disk. 7498 */ 7499 static int 7500 handle_complete_freeblocks(freeblks, flags) 7501 struct freeblks *freeblks; 7502 int flags; 7503 { 7504 struct inodedep *inodedep; 7505 struct inode *ip; 7506 struct vnode *vp; 7507 struct fs *fs; 7508 struct ufsmount *ump; 7509 ufs2_daddr_t spare; 7510 7511 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7512 fs = ump->um_fs; 7513 flags = LK_EXCLUSIVE | flags; 7514 spare = freeblks->fb_chkcnt; 7515 7516 /* 7517 * If we did not release the expected number of blocks we may have 7518 * to adjust the inode block count here. Only do so if it wasn't 7519 * a truncation to zero and the modrev still matches. 
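 * The residual lives in fb_chkcnt: when it is non-zero and this was not
 * a truncation to zero length (fb_len != 0), the vnode is fetched and
 * DIP(ip, i_blocks) is adjusted by that amount, provided i_modrev still
 * matches the value recorded when the freeblks was set up.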
7520 */ 7521 if (spare && freeblks->fb_len != 0) { 7522 if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum, 7523 flags, &vp, FFSV_FORCEINSMQ) != 0) 7524 return (EBUSY); 7525 ip = VTOI(vp); 7526 if (DIP(ip, i_modrev) == freeblks->fb_modrev) { 7527 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare); 7528 ip->i_flag |= IN_CHANGE; 7529 /* 7530 * We must wait so this happens before the 7531 * journal is reclaimed. 7532 */ 7533 ffs_update(vp, 1); 7534 } 7535 vput(vp); 7536 } 7537 if (spare < 0) { 7538 UFS_LOCK(ump); 7539 fs->fs_pendingblocks += spare; 7540 UFS_UNLOCK(ump); 7541 } 7542 #ifdef QUOTA 7543 /* Handle spare. */ 7544 if (spare) 7545 quotaadj(freeblks->fb_quota, ump, -spare); 7546 quotarele(freeblks->fb_quota); 7547 #endif 7548 ACQUIRE_LOCK(&lk); 7549 if (freeblks->fb_state & ONDEPLIST) { 7550 inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum, 7551 0, &inodedep); 7552 TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next); 7553 freeblks->fb_state &= ~ONDEPLIST; 7554 if (TAILQ_EMPTY(&inodedep->id_freeblklst)) 7555 free_inodedep(inodedep); 7556 } 7557 /* 7558 * All of the freeblock deps must be complete prior to this call 7559 * so it's now safe to complete earlier outstanding journal entries. 7560 */ 7561 handle_jwork(&freeblks->fb_jwork); 7562 WORKITEM_FREE(freeblks, D_FREEBLKS); 7563 FREE_LOCK(&lk); 7564 return (0); 7565 } 7566 7567 /* 7568 * Release blocks associated with the freeblks and stored in the indirect 7569 * block dbn. If level is greater than SINGLE, the block is an indirect block 7570 * and recursive calls to indirtrunc must be used to cleanse other indirect 7571 * blocks. 7572 * 7573 * This handles partial and complete truncation of blocks. Partial is noted 7574 * with goingaway == 0. In this case the freework is completed after the 7575 * zero'd indirects are written to disk. For full truncation the freework 7576 * is completed after the block is freed. 7577 */ 7578 static void 7579 indir_trunc(freework, dbn, lbn) 7580 struct freework *freework; 7581 ufs2_daddr_t dbn; 7582 ufs_lbn_t lbn; 7583 { 7584 struct freework *nfreework; 7585 struct workhead wkhd; 7586 struct freeblks *freeblks; 7587 struct buf *bp; 7588 struct fs *fs; 7589 struct indirdep *indirdep; 7590 struct ufsmount *ump; 7591 ufs1_daddr_t *bap1 = 0; 7592 ufs2_daddr_t nb, nnb, *bap2 = 0; 7593 ufs_lbn_t lbnadd, nlbn; 7594 int i, nblocks, ufs1fmt; 7595 int freedblocks; 7596 int goingaway; 7597 int freedeps; 7598 int needj; 7599 int level; 7600 int cnt; 7601 7602 freeblks = freework->fw_freeblks; 7603 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7604 fs = ump->um_fs; 7605 /* 7606 * Get buffer of block pointers to be freed. There are three cases: 7607 * 7608 * 1) Partial truncate caches the indirdep pointer in the freework 7609 * which provides us a back copy to the save bp which holds the 7610 * pointers we want to clear. When this completes the zero 7611 * pointers are written to the real copy. 7612 * 2) The indirect is being completely truncated, cancel_indirdep() 7613 * eliminated the real copy and placed the indirdep on the saved 7614 * copy. The indirdep and buf are discarded when this completes. 7615 * 3) The indirect was not in memory, we read a copy off of the disk 7616 * using the devvp and drop and invalidate the buffer when we're 7617 * done. 
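 *
 * These three cases correspond, in order, to the fw_indir != NULL branch,
 * the incore() hit on the device vnode, and the final bread() fallback
 * below.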
7618 */ 7619 goingaway = 1; 7620 indirdep = NULL; 7621 if (freework->fw_indir != NULL) { 7622 goingaway = 0; 7623 indirdep = freework->fw_indir; 7624 bp = indirdep->ir_savebp; 7625 if (bp == NULL || bp->b_blkno != dbn) 7626 panic("indir_trunc: Bad saved buf %p blkno %jd", 7627 bp, (intmax_t)dbn); 7628 } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) { 7629 /* 7630 * The lock prevents the buf dep list from changing and 7631 * indirects on devvp should only ever have one dependency. 7632 */ 7633 indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep)); 7634 if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0) 7635 panic("indir_trunc: Bad indirdep %p from buf %p", 7636 indirdep, bp); 7637 } else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 7638 NOCRED, &bp) != 0) { 7639 brelse(bp); 7640 return; 7641 } 7642 ACQUIRE_LOCK(&lk); 7643 /* Protects against a race with complete_trunc_indir(). */ 7644 freework->fw_state &= ~INPROGRESS; 7645 /* 7646 * If we have an indirdep we need to enforce the truncation order 7647 * and discard it when it is complete. 7648 */ 7649 if (indirdep) { 7650 if (freework != TAILQ_FIRST(&indirdep->ir_trunc) && 7651 !TAILQ_EMPTY(&indirdep->ir_trunc)) { 7652 /* 7653 * Add the complete truncate to the list on the 7654 * indirdep to enforce in-order processing. 7655 */ 7656 if (freework->fw_indir == NULL) 7657 TAILQ_INSERT_TAIL(&indirdep->ir_trunc, 7658 freework, fw_next); 7659 FREE_LOCK(&lk); 7660 return; 7661 } 7662 /* 7663 * If we're goingaway, free the indirdep. Otherwise it will 7664 * linger until the write completes. 7665 */ 7666 if (goingaway) { 7667 free_indirdep(indirdep); 7668 ump->um_numindirdeps -= 1; 7669 } 7670 } 7671 FREE_LOCK(&lk); 7672 /* Initialize pointers depending on block size. */ 7673 if (ump->um_fstype == UFS1) { 7674 bap1 = (ufs1_daddr_t *)bp->b_data; 7675 nb = bap1[freework->fw_off]; 7676 ufs1fmt = 1; 7677 } else { 7678 bap2 = (ufs2_daddr_t *)bp->b_data; 7679 nb = bap2[freework->fw_off]; 7680 ufs1fmt = 0; 7681 } 7682 level = lbn_level(lbn); 7683 needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0; 7684 lbnadd = lbn_offset(fs, level); 7685 nblocks = btodb(fs->fs_bsize); 7686 nfreework = freework; 7687 freedeps = 0; 7688 cnt = 0; 7689 /* 7690 * Reclaim blocks. Traverses into nested indirect levels and 7691 * arranges for the current level to be freed when subordinates 7692 * are free when journaling. 7693 */ 7694 for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { 7695 if (i != NINDIR(fs) - 1) { 7696 if (ufs1fmt) 7697 nnb = bap1[i+1]; 7698 else 7699 nnb = bap2[i+1]; 7700 } else 7701 nnb = 0; 7702 if (nb == 0) 7703 continue; 7704 cnt++; 7705 if (level != 0) { 7706 nlbn = (lbn + 1) - (i * lbnadd); 7707 if (needj != 0) { 7708 nfreework = newfreework(ump, freeblks, freework, 7709 nlbn, nb, fs->fs_frag, 0, 0); 7710 freedeps++; 7711 } 7712 indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); 7713 } else { 7714 struct freedep *freedep; 7715 7716 /* 7717 * Attempt to aggregate freedep dependencies for 7718 * all blocks being released to the same CG. 
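 * A freedep is only allocated when a run ends, i.e. when the next block
 * pointer is zero or falls in a different cylinder group (the dtog()
 * comparison below), so one freedep can cover a whole run of blocks
 * released within the same cg.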
7719 */ 7720 LIST_INIT(&wkhd); 7721 if (needj != 0 && 7722 (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) { 7723 freedep = newfreedep(freework); 7724 WORKLIST_INSERT_UNLOCKED(&wkhd, 7725 &freedep->fd_list); 7726 freedeps++; 7727 } 7728 ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, 7729 fs->fs_bsize, freeblks->fb_inum, 7730 freeblks->fb_vtype, &wkhd); 7731 } 7732 } 7733 if (goingaway) { 7734 bp->b_flags |= B_INVAL | B_NOCACHE; 7735 brelse(bp); 7736 } 7737 freedblocks = 0; 7738 if (level == 0) 7739 freedblocks = (nblocks * cnt); 7740 if (needj == 0) 7741 freedblocks += nblocks; 7742 freeblks_free(ump, freeblks, freedblocks); 7743 /* 7744 * If we are journaling set up the ref counts and offset so this 7745 * indirect can be completed when its children are free. 7746 */ 7747 if (needj) { 7748 ACQUIRE_LOCK(&lk); 7749 freework->fw_off = i; 7750 freework->fw_ref += freedeps; 7751 freework->fw_ref -= NINDIR(fs) + 1; 7752 if (level == 0) 7753 freeblks->fb_cgwait += freedeps; 7754 if (freework->fw_ref == 0) 7755 freework_freeblock(freework); 7756 FREE_LOCK(&lk); 7757 return; 7758 } 7759 /* 7760 * If we're not journaling we can free the indirect now. 7761 */ 7762 dbn = dbtofsb(fs, dbn); 7763 ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, 7764 freeblks->fb_inum, freeblks->fb_vtype, NULL); 7765 /* Non SUJ softdep does single-threaded truncations. */ 7766 if (freework->fw_blkno == dbn) { 7767 freework->fw_state |= ALLCOMPLETE; 7768 ACQUIRE_LOCK(&lk); 7769 handle_written_freework(freework); 7770 FREE_LOCK(&lk); 7771 } 7772 return; 7773 } 7774 7775 /* 7776 * Cancel an allocindir when it is removed via truncation. When bp is not 7777 * NULL the indirect never appeared on disk and is scheduled to be freed 7778 * independently of the indir so we can more easily track journal work. 7779 */ 7780 static void 7781 cancel_allocindir(aip, bp, freeblks, trunc) 7782 struct allocindir *aip; 7783 struct buf *bp; 7784 struct freeblks *freeblks; 7785 int trunc; 7786 { 7787 struct indirdep *indirdep; 7788 struct freefrag *freefrag; 7789 struct newblk *newblk; 7790 7791 newblk = (struct newblk *)aip; 7792 LIST_REMOVE(aip, ai_next); 7793 /* 7794 * We must eliminate the pointer in bp if it must be freed on its 7795 * own due to partial truncate or pending journal work. 7796 */ 7797 if (bp && (trunc || newblk->nb_jnewblk)) { 7798 /* 7799 * Clear the pointer and mark the aip to be freed 7800 * directly if it never existed on disk. 7801 */ 7802 aip->ai_state |= DELAYEDFREE; 7803 indirdep = aip->ai_indirdep; 7804 if (indirdep->ir_state & UFS1FMT) 7805 ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0; 7806 else 7807 ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0; 7808 } 7809 /* 7810 * When truncating the previous pointer will be freed via 7811 * savedbp. Eliminate the freefrag which would dup free. 7812 */ 7813 if (trunc && (freefrag = newblk->nb_freefrag) != NULL) { 7814 newblk->nb_freefrag = NULL; 7815 if (freefrag->ff_jdep) 7816 cancel_jfreefrag( 7817 WK_JFREEFRAG(freefrag->ff_jdep)); 7818 jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork); 7819 WORKITEM_FREE(freefrag, D_FREEFRAG); 7820 } 7821 /* 7822 * If the journal hasn't been written the jnewblk must be passed 7823 * to the call to ffs_blkfree that reclaims the space. We accomplish 7824 * this by leaving the journal dependency on the newblk to be freed 7825 * when a freework is created in handle_workitem_freeblocks(). 
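 * See the D_ALLOCINDIR case there, where nb_jnewblk is transferred to the
 * new freework's fw_jnewblk before the newblk is released.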
7826 */ 7827 cancel_newblk(newblk, NULL, &freeblks->fb_jwork); 7828 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); 7829 } 7830 7831 /* 7832 * Create the mkdir dependencies for . and .. in a new directory. Link them 7833 * in to a newdirblk so any subsequent additions are tracked properly. The 7834 * caller is responsible for adding the mkdir1 dependency to the journal 7835 * and updating id_mkdiradd. This function returns with lk held. 7836 */ 7837 static struct mkdir * 7838 setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) 7839 struct diradd *dap; 7840 ino_t newinum; 7841 ino_t dinum; 7842 struct buf *newdirbp; 7843 struct mkdir **mkdirp; 7844 { 7845 struct newblk *newblk; 7846 struct pagedep *pagedep; 7847 struct inodedep *inodedep; 7848 struct newdirblk *newdirblk = 0; 7849 struct mkdir *mkdir1, *mkdir2; 7850 struct worklist *wk; 7851 struct jaddref *jaddref; 7852 struct mount *mp; 7853 7854 mp = dap->da_list.wk_mp; 7855 newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, 7856 M_SOFTDEP_FLAGS); 7857 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 7858 LIST_INIT(&newdirblk->db_mkdir); 7859 mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); 7860 workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); 7861 mkdir1->md_state = ATTACHED | MKDIR_BODY; 7862 mkdir1->md_diradd = dap; 7863 mkdir1->md_jaddref = NULL; 7864 mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); 7865 workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); 7866 mkdir2->md_state = ATTACHED | MKDIR_PARENT; 7867 mkdir2->md_diradd = dap; 7868 mkdir2->md_jaddref = NULL; 7869 if (MOUNTEDSUJ(mp) == 0) { 7870 mkdir1->md_state |= DEPCOMPLETE; 7871 mkdir2->md_state |= DEPCOMPLETE; 7872 } 7873 /* 7874 * Dependency on "." and ".." being written to disk. 7875 */ 7876 mkdir1->md_buf = newdirbp; 7877 ACQUIRE_LOCK(&lk); 7878 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); 7879 /* 7880 * We must link the pagedep, allocdirect, and newdirblk for 7881 * the initial file page so the pointer to the new directory 7882 * is not written until the directory contents are live and 7883 * any subsequent additions are not marked live until the 7884 * block is reachable via the inode. 7885 */ 7886 if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0) 7887 panic("setup_newdir: lost pagedep"); 7888 LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) 7889 if (wk->wk_type == D_ALLOCDIRECT) 7890 break; 7891 if (wk == NULL) 7892 panic("setup_newdir: lost allocdirect"); 7893 if (pagedep->pd_state & NEWBLOCK) 7894 panic("setup_newdir: NEWBLOCK already set"); 7895 newblk = WK_NEWBLK(wk); 7896 pagedep->pd_state |= NEWBLOCK; 7897 pagedep->pd_newdirblk = newdirblk; 7898 newdirblk->db_pagedep = pagedep; 7899 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); 7900 WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); 7901 /* 7902 * Look up the inodedep for the parent directory so that we 7903 * can link mkdir2 into the pending dotdot jaddref or 7904 * the inode write if there is none. If the inode is 7905 * ALLCOMPLETE and no jaddref is present all dependencies have 7906 * been satisfied and mkdir2 can be freed. 
7907 */ 7908 inodedep_lookup(mp, dinum, 0, &inodedep); 7909 if (MOUNTEDSUJ(mp)) { 7910 if (inodedep == NULL) 7911 panic("setup_newdir: Lost parent."); 7912 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 7913 inoreflst); 7914 KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && 7915 (jaddref->ja_state & MKDIR_PARENT), 7916 ("setup_newdir: bad dotdot jaddref %p", jaddref)); 7917 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 7918 mkdir2->md_jaddref = jaddref; 7919 jaddref->ja_mkdir = mkdir2; 7920 } else if (inodedep == NULL || 7921 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 7922 dap->da_state &= ~MKDIR_PARENT; 7923 WORKITEM_FREE(mkdir2, D_MKDIR); 7924 } else { 7925 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 7926 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list); 7927 } 7928 *mkdirp = mkdir2; 7929 7930 return (mkdir1); 7931 } 7932 7933 /* 7934 * Directory entry addition dependencies. 7935 * 7936 * When adding a new directory entry, the inode (with its incremented link 7937 * count) must be written to disk before the directory entry's pointer to it. 7938 * Also, if the inode is newly allocated, the corresponding freemap must be 7939 * updated (on disk) before the directory entry's pointer. These requirements 7940 * are met via undo/redo on the directory entry's pointer, which consists 7941 * simply of the inode number. 7942 * 7943 * As directory entries are added and deleted, the free space within a 7944 * directory block can become fragmented. The ufs filesystem will compact 7945 * a fragmented directory block to make space for a new entry. When this 7946 * occurs, the offsets of previously added entries change. Any "diradd" 7947 * dependency structures corresponding to these entries must be updated with 7948 * the new offsets. 7949 */ 7950 7951 /* 7952 * This routine is called after the in-memory inode's link 7953 * count has been incremented, but before the directory entry's 7954 * pointer to the inode has been set. 7955 */ 7956 int 7957 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 7958 struct buf *bp; /* buffer containing directory block */ 7959 struct inode *dp; /* inode for directory */ 7960 off_t diroffset; /* offset of new entry in directory */ 7961 ino_t newinum; /* inode referenced by new directory entry */ 7962 struct buf *newdirbp; /* non-NULL => contents of new mkdir */ 7963 int isnewblk; /* entry is in a newly allocated block */ 7964 { 7965 int offset; /* offset of new entry within directory block */ 7966 ufs_lbn_t lbn; /* block in directory containing new entry */ 7967 struct fs *fs; 7968 struct diradd *dap; 7969 struct newblk *newblk; 7970 struct pagedep *pagedep; 7971 struct inodedep *inodedep; 7972 struct newdirblk *newdirblk = 0; 7973 struct mkdir *mkdir1, *mkdir2; 7974 struct jaddref *jaddref; 7975 struct mount *mp; 7976 int isindir; 7977 7978 /* 7979 * Whiteouts have no dependencies. 7980 */ 7981 if (newinum == WINO) { 7982 if (newdirbp != NULL) 7983 bdwrite(newdirbp); 7984 return (0); 7985 } 7986 jaddref = NULL; 7987 mkdir1 = mkdir2 = NULL; 7988 mp = UFSTOVFS(dp->i_ump); 7989 fs = dp->i_fs; 7990 lbn = lblkno(fs, diroffset); 7991 offset = blkoff(fs, diroffset); 7992 dap = malloc(sizeof(struct diradd), M_DIRADD, 7993 M_SOFTDEP_FLAGS|M_ZERO); 7994 workitem_alloc(&dap->da_list, D_DIRADD, mp); 7995 dap->da_offset = offset; 7996 dap->da_newinum = newinum; 7997 dap->da_state = ATTACHED; 7998 LIST_INIT(&dap->da_jwork); 7999 isindir = bp->b_lblkno >= NDADDR; 8000 if (isnewblk && 8001 (isindir ? 
blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { 8002 newdirblk = malloc(sizeof(struct newdirblk), 8003 M_NEWDIRBLK, M_SOFTDEP_FLAGS); 8004 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 8005 LIST_INIT(&newdirblk->db_mkdir); 8006 } 8007 /* 8008 * If we're creating a new directory setup the dependencies and set 8009 * the dap state to wait for them. Otherwise it's COMPLETE and 8010 * we can move on. 8011 */ 8012 if (newdirbp == NULL) { 8013 dap->da_state |= DEPCOMPLETE; 8014 ACQUIRE_LOCK(&lk); 8015 } else { 8016 dap->da_state |= MKDIR_BODY | MKDIR_PARENT; 8017 mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, 8018 &mkdir2); 8019 } 8020 /* 8021 * Link into parent directory pagedep to await its being written. 8022 */ 8023 pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep); 8024 #ifdef DEBUG 8025 if (diradd_lookup(pagedep, offset) != NULL) 8026 panic("softdep_setup_directory_add: %p already at off %d\n", 8027 diradd_lookup(pagedep, offset), offset); 8028 #endif 8029 dap->da_pagedep = pagedep; 8030 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, 8031 da_pdlist); 8032 inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); 8033 /* 8034 * If we're journaling, link the diradd into the jaddref so it 8035 * may be completed after the journal entry is written. Otherwise, 8036 * link the diradd into its inodedep. If the inode is not yet 8037 * written place it on the bufwait list, otherwise do the post-inode 8038 * write processing to put it on the id_pendinghd list. 8039 */ 8040 if (MOUNTEDSUJ(mp)) { 8041 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 8042 inoreflst); 8043 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 8044 ("softdep_setup_directory_add: bad jaddref %p", jaddref)); 8045 jaddref->ja_diroff = diroffset; 8046 jaddref->ja_diradd = dap; 8047 add_to_journal(&jaddref->ja_list); 8048 } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) 8049 diradd_inode_written(dap, inodedep); 8050 else 8051 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 8052 /* 8053 * Add the journal entries for . and .. links now that the primary 8054 * link is written. 8055 */ 8056 if (mkdir1 != NULL && MOUNTEDSUJ(mp)) { 8057 jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, 8058 inoreflst, if_deps); 8059 KASSERT(jaddref != NULL && 8060 jaddref->ja_ino == jaddref->ja_parent && 8061 (jaddref->ja_state & MKDIR_BODY), 8062 ("softdep_setup_directory_add: bad dot jaddref %p", 8063 jaddref)); 8064 mkdir1->md_jaddref = jaddref; 8065 jaddref->ja_mkdir = mkdir1; 8066 /* 8067 * It is important that the dotdot journal entry 8068 * is added prior to the dot entry since dot writes 8069 * both the dot and dotdot links. These both must 8070 * be added after the primary link for the journal 8071 * to remain consistent. 8072 */ 8073 add_to_journal(&mkdir2->md_jaddref->ja_list); 8074 add_to_journal(&jaddref->ja_list); 8075 } 8076 /* 8077 * If we are adding a new directory remember this diradd so that if 8078 * we rename it we can keep the dot and dotdot dependencies. If 8079 * we are adding a new name for an inode that has a mkdiradd we 8080 * must be in rename and we have to move the dot and dotdot 8081 * dependencies to this new name. The old name is being orphaned 8082 * soon. 
8083 */ 8084 if (mkdir1 != NULL) { 8085 if (inodedep->id_mkdiradd != NULL) 8086 panic("softdep_setup_directory_add: Existing mkdir"); 8087 inodedep->id_mkdiradd = dap; 8088 } else if (inodedep->id_mkdiradd) 8089 merge_diradd(inodedep, dap); 8090 if (newdirblk) { 8091 /* 8092 * There is nothing to do if we are already tracking 8093 * this block. 8094 */ 8095 if ((pagedep->pd_state & NEWBLOCK) != 0) { 8096 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 8097 FREE_LOCK(&lk); 8098 return (0); 8099 } 8100 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk) 8101 == 0) 8102 panic("softdep_setup_directory_add: lost entry"); 8103 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); 8104 pagedep->pd_state |= NEWBLOCK; 8105 pagedep->pd_newdirblk = newdirblk; 8106 newdirblk->db_pagedep = pagedep; 8107 FREE_LOCK(&lk); 8108 /* 8109 * If we extended into an indirect signal direnter to sync. 8110 */ 8111 if (isindir) 8112 return (1); 8113 return (0); 8114 } 8115 FREE_LOCK(&lk); 8116 return (0); 8117 } 8118 8119 /* 8120 * This procedure is called to change the offset of a directory 8121 * entry when compacting a directory block which must be owned 8122 * exclusively by the caller. Note that the actual entry movement 8123 * must be done in this procedure to ensure that no I/O completions 8124 * occur while the move is in progress. 8125 */ 8126 void 8127 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 8128 struct buf *bp; /* Buffer holding directory block. */ 8129 struct inode *dp; /* inode for directory */ 8130 caddr_t base; /* address of dp->i_offset */ 8131 caddr_t oldloc; /* address of old directory location */ 8132 caddr_t newloc; /* address of new directory location */ 8133 int entrysize; /* size of directory entry */ 8134 { 8135 int offset, oldoffset, newoffset; 8136 struct pagedep *pagedep; 8137 struct jmvref *jmvref; 8138 struct diradd *dap; 8139 struct direct *de; 8140 struct mount *mp; 8141 ufs_lbn_t lbn; 8142 int flags; 8143 8144 mp = UFSTOVFS(dp->i_ump); 8145 de = (struct direct *)oldloc; 8146 jmvref = NULL; 8147 flags = 0; 8148 /* 8149 * Moves are always journaled as it would be too complex to 8150 * determine if any affected adds or removes are present in the 8151 * journal. 8152 */ 8153 if (MOUNTEDSUJ(mp)) { 8154 flags = DEPALLOC; 8155 jmvref = newjmvref(dp, de->d_ino, 8156 dp->i_offset + (oldloc - base), 8157 dp->i_offset + (newloc - base)); 8158 } 8159 lbn = lblkno(dp->i_fs, dp->i_offset); 8160 offset = blkoff(dp->i_fs, dp->i_offset); 8161 oldoffset = offset + (oldloc - base); 8162 newoffset = offset + (newloc - base); 8163 ACQUIRE_LOCK(&lk); 8164 if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0) 8165 goto done; 8166 dap = diradd_lookup(pagedep, oldoffset); 8167 if (dap) { 8168 dap->da_offset = newoffset; 8169 newoffset = DIRADDHASH(newoffset); 8170 oldoffset = DIRADDHASH(oldoffset); 8171 if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && 8172 newoffset != oldoffset) { 8173 LIST_REMOVE(dap, da_pdlist); 8174 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], 8175 dap, da_pdlist); 8176 } 8177 } 8178 done: 8179 if (jmvref) { 8180 jmvref->jm_pagedep = pagedep; 8181 LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); 8182 add_to_journal(&jmvref->jm_list); 8183 } 8184 bcopy(oldloc, newloc, entrysize); 8185 FREE_LOCK(&lk); 8186 } 8187 8188 /* 8189 * Move the mkdir dependencies and journal work from one diradd to another 8190 * when renaming a directory. The new name must depend on the mkdir deps 8191 * completing as the old name did. 
Directories can only have one valid link
8192 * at a time so one must be canonical.
8193 */
8194 static void
8195 merge_diradd(inodedep, newdap)
8196 struct inodedep *inodedep;
8197 struct diradd *newdap;
8198 {
8199 struct diradd *olddap;
8200 struct mkdir *mkdir, *nextmd;
8201 short state;
8202
8203 olddap = inodedep->id_mkdiradd;
8204 inodedep->id_mkdiradd = newdap;
8205 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8206 newdap->da_state &= ~DEPCOMPLETE;
8207 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
8208 nextmd = LIST_NEXT(mkdir, md_mkdirs);
8209 if (mkdir->md_diradd != olddap)
8210 continue;
8211 mkdir->md_diradd = newdap;
8212 state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8213 newdap->da_state |= state;
8214 olddap->da_state &= ~state;
8215 if ((olddap->da_state &
8216 (MKDIR_PARENT | MKDIR_BODY)) == 0)
8217 break;
8218 }
8219 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8220 panic("merge_diradd: unfound ref");
8221 }
8222 /*
8223 * Any mkdir related journal items are not safe to be freed until
8224 * the new name is stable.
8225 */
8226 jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8227 olddap->da_state |= DEPCOMPLETE;
8228 complete_diradd(olddap);
8229 }
8230
8231 /*
8232 * Move the diradd to the pending list when all diradd dependencies are
8233 * complete.
8234 */
8235 static void
8236 complete_diradd(dap)
8237 struct diradd *dap;
8238 {
8239 struct pagedep *pagedep;
8240
8241 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8242 if (dap->da_state & DIRCHG)
8243 pagedep = dap->da_previous->dm_pagedep;
8244 else
8245 pagedep = dap->da_pagedep;
8246 LIST_REMOVE(dap, da_pdlist);
8247 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8248 }
8249 }
8250
8251 /*
8252 * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal
8253 * add entries and conditionally journal the remove.
8254 */
8255 static void
8256 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
8257 struct diradd *dap;
8258 struct dirrem *dirrem;
8259 struct jremref *jremref;
8260 struct jremref *dotremref;
8261 struct jremref *dotdotremref;
8262 {
8263 struct inodedep *inodedep;
8264 struct jaddref *jaddref;
8265 struct inoref *inoref;
8266 struct mkdir *mkdir;
8267
8268 /*
8269 * If no remove references were allocated we're on a non-journaled
8270 * filesystem and can skip the cancel step.
8271 */
8272 if (jremref == NULL) {
8273 free_diradd(dap, NULL);
8274 return;
8275 }
8276 /*
8277 * Cancel the primary name and free it if it does not require
8278 * journaling.
8279 */
8280 if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
8281 0, &inodedep) != 0) {
8282 /* Abort the addref that references this diradd. */
8283 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
8284 if (inoref->if_list.wk_type != D_JADDREF)
8285 continue;
8286 jaddref = (struct jaddref *)inoref;
8287 if (jaddref->ja_diradd != dap)
8288 continue;
8289 if (cancel_jaddref(jaddref, inodedep,
8290 &dirrem->dm_jwork) == 0) {
8291 free_jremref(jremref);
8292 jremref = NULL;
8293 }
8294 break;
8295 }
8296 }
8297 /*
8298 * Cancel subordinate names and free them if they do not require
8299 * journaling.
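 * The subordinate names are the "." and ".." entries tracked by mkdir
 * structures: MKDIR_PARENT marks the dotdot reference and MKDIR_BODY the
 * dot reference, matching dotdotremref and dotremref below.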
8300 */ 8301 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 8302 LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { 8303 if (mkdir->md_diradd != dap) 8304 continue; 8305 if ((jaddref = mkdir->md_jaddref) == NULL) 8306 continue; 8307 mkdir->md_jaddref = NULL; 8308 if (mkdir->md_state & MKDIR_PARENT) { 8309 if (cancel_jaddref(jaddref, NULL, 8310 &dirrem->dm_jwork) == 0) { 8311 free_jremref(dotdotremref); 8312 dotdotremref = NULL; 8313 } 8314 } else { 8315 if (cancel_jaddref(jaddref, inodedep, 8316 &dirrem->dm_jwork) == 0) { 8317 free_jremref(dotremref); 8318 dotremref = NULL; 8319 } 8320 } 8321 } 8322 } 8323 8324 if (jremref) 8325 journal_jremref(dirrem, jremref, inodedep); 8326 if (dotremref) 8327 journal_jremref(dirrem, dotremref, inodedep); 8328 if (dotdotremref) 8329 journal_jremref(dirrem, dotdotremref, NULL); 8330 jwork_move(&dirrem->dm_jwork, &dap->da_jwork); 8331 free_diradd(dap, &dirrem->dm_jwork); 8332 } 8333 8334 /* 8335 * Free a diradd dependency structure. This routine must be called 8336 * with splbio interrupts blocked. 8337 */ 8338 static void 8339 free_diradd(dap, wkhd) 8340 struct diradd *dap; 8341 struct workhead *wkhd; 8342 { 8343 struct dirrem *dirrem; 8344 struct pagedep *pagedep; 8345 struct inodedep *inodedep; 8346 struct mkdir *mkdir, *nextmd; 8347 8348 mtx_assert(&lk, MA_OWNED); 8349 LIST_REMOVE(dap, da_pdlist); 8350 if (dap->da_state & ONWORKLIST) 8351 WORKLIST_REMOVE(&dap->da_list); 8352 if ((dap->da_state & DIRCHG) == 0) { 8353 pagedep = dap->da_pagedep; 8354 } else { 8355 dirrem = dap->da_previous; 8356 pagedep = dirrem->dm_pagedep; 8357 dirrem->dm_dirinum = pagedep->pd_ino; 8358 dirrem->dm_state |= COMPLETE; 8359 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 8360 add_to_worklist(&dirrem->dm_list, 0); 8361 } 8362 if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 8363 0, &inodedep) != 0) 8364 if (inodedep->id_mkdiradd == dap) 8365 inodedep->id_mkdiradd = NULL; 8366 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 8367 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 8368 nextmd = LIST_NEXT(mkdir, md_mkdirs); 8369 if (mkdir->md_diradd != dap) 8370 continue; 8371 dap->da_state &= 8372 ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 8373 LIST_REMOVE(mkdir, md_mkdirs); 8374 if (mkdir->md_state & ONWORKLIST) 8375 WORKLIST_REMOVE(&mkdir->md_list); 8376 if (mkdir->md_jaddref != NULL) 8377 panic("free_diradd: Unexpected jaddref"); 8378 WORKITEM_FREE(mkdir, D_MKDIR); 8379 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) 8380 break; 8381 } 8382 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 8383 panic("free_diradd: unfound ref"); 8384 } 8385 if (inodedep) 8386 free_inodedep(inodedep); 8387 /* 8388 * Free any journal segments waiting for the directory write. 8389 */ 8390 handle_jwork(&dap->da_jwork); 8391 WORKITEM_FREE(dap, D_DIRADD); 8392 } 8393 8394 /* 8395 * Directory entry removal dependencies. 8396 * 8397 * When removing a directory entry, the entry's inode pointer must be 8398 * zero'ed on disk before the corresponding inode's link count is decremented 8399 * (possibly freeing the inode for re-use). This dependency is handled by 8400 * updating the directory entry but delaying the inode count reduction until 8401 * after the directory block has been written to disk. After this point, the 8402 * inode count can be decremented whenever it is convenient. 8403 */ 8404 8405 /* 8406 * This routine should be called immediately after removing 8407 * a directory entry. 
The inode's link count should not be 8408 * decremented by the calling procedure -- the soft updates 8409 * code will do this task when it is safe. 8410 */ 8411 void 8412 softdep_setup_remove(bp, dp, ip, isrmdir) 8413 struct buf *bp; /* buffer containing directory block */ 8414 struct inode *dp; /* inode for the directory being modified */ 8415 struct inode *ip; /* inode for directory entry being removed */ 8416 int isrmdir; /* indicates if doing RMDIR */ 8417 { 8418 struct dirrem *dirrem, *prevdirrem; 8419 struct inodedep *inodedep; 8420 int direct; 8421 8422 /* 8423 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want 8424 * newdirrem() to setup the full directory remove which requires 8425 * isrmdir > 1. 8426 */ 8427 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 8428 /* 8429 * Add the dirrem to the inodedep's pending remove list for quick 8430 * discovery later. 8431 */ 8432 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 8433 &inodedep) == 0) 8434 panic("softdep_setup_remove: Lost inodedep."); 8435 dirrem->dm_state |= ONDEPLIST; 8436 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 8437 8438 /* 8439 * If the COMPLETE flag is clear, then there were no active 8440 * entries and we want to roll back to a zeroed entry until 8441 * the new inode is committed to disk. If the COMPLETE flag is 8442 * set then we have deleted an entry that never made it to 8443 * disk. If the entry we deleted resulted from a name change, 8444 * then the old name still resides on disk. We cannot delete 8445 * its inode (returned to us in prevdirrem) until the zeroed 8446 * directory entry gets to disk. The new inode has never been 8447 * referenced on the disk, so can be deleted immediately. 8448 */ 8449 if ((dirrem->dm_state & COMPLETE) == 0) { 8450 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, 8451 dm_next); 8452 FREE_LOCK(&lk); 8453 } else { 8454 if (prevdirrem != NULL) 8455 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, 8456 prevdirrem, dm_next); 8457 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; 8458 direct = LIST_EMPTY(&dirrem->dm_jremrefhd); 8459 FREE_LOCK(&lk); 8460 if (direct) 8461 handle_workitem_remove(dirrem, 0); 8462 } 8463 } 8464 8465 /* 8466 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the 8467 * pd_pendinghd list of a pagedep. 8468 */ 8469 static struct diradd * 8470 diradd_lookup(pagedep, offset) 8471 struct pagedep *pagedep; 8472 int offset; 8473 { 8474 struct diradd *dap; 8475 8476 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) 8477 if (dap->da_offset == offset) 8478 return (dap); 8479 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 8480 if (dap->da_offset == offset) 8481 return (dap); 8482 return (NULL); 8483 } 8484 8485 /* 8486 * Search for a .. diradd dependency in a directory that is being removed. 8487 * If the directory was renamed to a new parent we have a diradd rather 8488 * than a mkdir for the .. entry. We need to cancel it now before 8489 * it is found in truncate(). 
8490 */ 8491 static struct jremref * 8492 cancel_diradd_dotdot(ip, dirrem, jremref) 8493 struct inode *ip; 8494 struct dirrem *dirrem; 8495 struct jremref *jremref; 8496 { 8497 struct pagedep *pagedep; 8498 struct diradd *dap; 8499 struct worklist *wk; 8500 8501 if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0, 8502 &pagedep) == 0) 8503 return (jremref); 8504 dap = diradd_lookup(pagedep, DOTDOT_OFFSET); 8505 if (dap == NULL) 8506 return (jremref); 8507 cancel_diradd(dap, dirrem, jremref, NULL, NULL); 8508 /* 8509 * Mark any journal work as belonging to the parent so it is freed 8510 * with the .. reference. 8511 */ 8512 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 8513 wk->wk_state |= MKDIR_PARENT; 8514 return (NULL); 8515 } 8516 8517 /* 8518 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to 8519 * replace it with a dirrem/diradd pair as a result of re-parenting a 8520 * directory. This ensures that we don't simultaneously have a mkdir and 8521 * a diradd for the same .. entry. 8522 */ 8523 static struct jremref * 8524 cancel_mkdir_dotdot(ip, dirrem, jremref) 8525 struct inode *ip; 8526 struct dirrem *dirrem; 8527 struct jremref *jremref; 8528 { 8529 struct inodedep *inodedep; 8530 struct jaddref *jaddref; 8531 struct mkdir *mkdir; 8532 struct diradd *dap; 8533 8534 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 8535 &inodedep) == 0) 8536 panic("cancel_mkdir_dotdot: Lost inodedep"); 8537 dap = inodedep->id_mkdiradd; 8538 if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) 8539 return (jremref); 8540 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; 8541 mkdir = LIST_NEXT(mkdir, md_mkdirs)) 8542 if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) 8543 break; 8544 if (mkdir == NULL) 8545 panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); 8546 if ((jaddref = mkdir->md_jaddref) != NULL) { 8547 mkdir->md_jaddref = NULL; 8548 jaddref->ja_state &= ~MKDIR_PARENT; 8549 if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0, 8550 &inodedep) == 0) 8551 panic("cancel_mkdir_dotdot: Lost parent inodedep"); 8552 if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { 8553 journal_jremref(dirrem, jremref, inodedep); 8554 jremref = NULL; 8555 } 8556 } 8557 if (mkdir->md_state & ONWORKLIST) 8558 WORKLIST_REMOVE(&mkdir->md_list); 8559 mkdir->md_state |= ALLCOMPLETE; 8560 complete_mkdir(mkdir); 8561 return (jremref); 8562 } 8563 8564 static void 8565 journal_jremref(dirrem, jremref, inodedep) 8566 struct dirrem *dirrem; 8567 struct jremref *jremref; 8568 struct inodedep *inodedep; 8569 { 8570 8571 if (inodedep == NULL) 8572 if (inodedep_lookup(jremref->jr_list.wk_mp, 8573 jremref->jr_ref.if_ino, 0, &inodedep) == 0) 8574 panic("journal_jremref: Lost inodedep"); 8575 LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); 8576 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 8577 add_to_journal(&jremref->jr_list); 8578 } 8579 8580 static void 8581 dirrem_journal(dirrem, jremref, dotremref, dotdotremref) 8582 struct dirrem *dirrem; 8583 struct jremref *jremref; 8584 struct jremref *dotremref; 8585 struct jremref *dotdotremref; 8586 { 8587 struct inodedep *inodedep; 8588 8589 8590 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, 8591 &inodedep) == 0) 8592 panic("dirrem_journal: Lost inodedep"); 8593 journal_jremref(dirrem, jremref, inodedep); 8594 if (dotremref) 8595 journal_jremref(dirrem, dotremref, inodedep); 8596 if (dotdotremref) 8597 journal_jremref(dirrem, dotdotremref, NULL); 8598 } 
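
/*
 * Taken together, journal_jremref() hangs one remove reference off both the
 * dirrem (dm_jremrefhd) and the inodedep whose link count is dropping
 * (id_inoreflst) before queueing it for the journal, and dirrem_journal()
 * does that for the name itself plus, for directories, the "." and ".."
 * references.  Removing a directory therefore journals, in outline:
 *
 *	journal_jremref(dirrem, jremref, inodedep);	name in the parent
 *	journal_jremref(dirrem, dotremref, inodedep);	"." in the directory
 *	journal_jremref(dirrem, dotdotremref, NULL);	".." naming the parent
 */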
8599 8600 /* 8601 * Allocate a new dirrem if appropriate and return it along with 8602 * its associated pagedep. Called without a lock, returns with lock. 8603 */ 8604 static struct dirrem * 8605 newdirrem(bp, dp, ip, isrmdir, prevdirremp) 8606 struct buf *bp; /* buffer containing directory block */ 8607 struct inode *dp; /* inode for the directory being modified */ 8608 struct inode *ip; /* inode for directory entry being removed */ 8609 int isrmdir; /* indicates if doing RMDIR */ 8610 struct dirrem **prevdirremp; /* previously referenced inode, if any */ 8611 { 8612 int offset; 8613 ufs_lbn_t lbn; 8614 struct diradd *dap; 8615 struct dirrem *dirrem; 8616 struct pagedep *pagedep; 8617 struct jremref *jremref; 8618 struct jremref *dotremref; 8619 struct jremref *dotdotremref; 8620 struct vnode *dvp; 8621 8622 /* 8623 * Whiteouts have no deletion dependencies. 8624 */ 8625 if (ip == NULL) 8626 panic("newdirrem: whiteout"); 8627 dvp = ITOV(dp); 8628 /* 8629 * If we are over our limit, try to improve the situation. 8630 * Limiting the number of dirrem structures will also limit 8631 * the number of freefile and freeblks structures. 8632 */ 8633 ACQUIRE_LOCK(&lk); 8634 if (!(ip->i_flags & SF_SNAPSHOT) && 8635 dep_current[D_DIRREM] > max_softdeps / 2) 8636 (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS); 8637 FREE_LOCK(&lk); 8638 dirrem = malloc(sizeof(struct dirrem), 8639 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); 8640 workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); 8641 LIST_INIT(&dirrem->dm_jremrefhd); 8642 LIST_INIT(&dirrem->dm_jwork); 8643 dirrem->dm_state = isrmdir ? RMDIR : 0; 8644 dirrem->dm_oldinum = ip->i_number; 8645 *prevdirremp = NULL; 8646 /* 8647 * Allocate remove reference structures to track journal write 8648 * dependencies. We will always have one for the link and 8649 * when doing directories we will always have one more for dot. 8650 * When renaming a directory we skip the dotdot link change so 8651 * this is not needed. 8652 */ 8653 jremref = dotremref = dotdotremref = NULL; 8654 if (DOINGSUJ(dvp)) { 8655 if (isrmdir) { 8656 jremref = newjremref(dirrem, dp, ip, dp->i_offset, 8657 ip->i_effnlink + 2); 8658 dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, 8659 ip->i_effnlink + 1); 8660 dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, 8661 dp->i_effnlink + 1); 8662 dotdotremref->jr_state |= MKDIR_PARENT; 8663 } else 8664 jremref = newjremref(dirrem, dp, ip, dp->i_offset, 8665 ip->i_effnlink + 1); 8666 } 8667 ACQUIRE_LOCK(&lk); 8668 lbn = lblkno(dp->i_fs, dp->i_offset); 8669 offset = blkoff(dp->i_fs, dp->i_offset); 8670 pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC, 8671 &pagedep); 8672 dirrem->dm_pagedep = pagedep; 8673 dirrem->dm_offset = offset; 8674 /* 8675 * If we're renaming a .. link to a new directory, cancel any 8676 * existing MKDIR_PARENT mkdir. If it has already been canceled 8677 * the jremref is preserved for any potential diradd in this 8678 * location. This can not coincide with a rmdir. 8679 */ 8680 if (dp->i_offset == DOTDOT_OFFSET) { 8681 if (isrmdir) 8682 panic("newdirrem: .. directory change during remove?"); 8683 jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); 8684 } 8685 /* 8686 * If we're removing a directory search for the .. dependency now and 8687 * cancel it. Any pending journal work will be added to the dirrem 8688 * to be completed when the workitem remove completes. 
8689 */ 8690 if (isrmdir) 8691 dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); 8692 /* 8693 * Check for a diradd dependency for the same directory entry. 8694 * If present, then both dependencies become obsolete and can 8695 * be de-allocated. 8696 */ 8697 dap = diradd_lookup(pagedep, offset); 8698 if (dap == NULL) { 8699 /* 8700 * Link the jremref structures into the dirrem so they are 8701 * written prior to the pagedep. 8702 */ 8703 if (jremref) 8704 dirrem_journal(dirrem, jremref, dotremref, 8705 dotdotremref); 8706 return (dirrem); 8707 } 8708 /* 8709 * Must be ATTACHED at this point. 8710 */ 8711 if ((dap->da_state & ATTACHED) == 0) 8712 panic("newdirrem: not ATTACHED"); 8713 if (dap->da_newinum != ip->i_number) 8714 panic("newdirrem: inum %d should be %d", 8715 ip->i_number, dap->da_newinum); 8716 /* 8717 * If we are deleting a changed name that never made it to disk, 8718 * then return the dirrem describing the previous inode (which 8719 * represents the inode currently referenced from this entry on disk). 8720 */ 8721 if ((dap->da_state & DIRCHG) != 0) { 8722 *prevdirremp = dap->da_previous; 8723 dap->da_state &= ~DIRCHG; 8724 dap->da_pagedep = pagedep; 8725 } 8726 /* 8727 * We are deleting an entry that never made it to disk. 8728 * Mark it COMPLETE so we can delete its inode immediately. 8729 */ 8730 dirrem->dm_state |= COMPLETE; 8731 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); 8732 #ifdef SUJ_DEBUG 8733 if (isrmdir == 0) { 8734 struct worklist *wk; 8735 8736 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 8737 if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) 8738 panic("bad wk %p (0x%X)\n", wk, wk->wk_state); 8739 } 8740 #endif 8741 8742 return (dirrem); 8743 } 8744 8745 /* 8746 * Directory entry change dependencies. 8747 * 8748 * Changing an existing directory entry requires that an add operation 8749 * be completed first followed by a deletion. The semantics for the addition 8750 * are identical to the description of adding a new entry above except 8751 * that the rollback is to the old inode number rather than zero. Once 8752 * the addition dependency is completed, the removal is done as described 8753 * in the removal routine above. 8754 */ 8755 8756 /* 8757 * This routine should be called immediately after changing 8758 * a directory entry. The inode's link count should not be 8759 * decremented by the calling procedure -- the soft updates 8760 * code will perform this task when it is safe. 8761 */ 8762 void 8763 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 8764 struct buf *bp; /* buffer containing directory block */ 8765 struct inode *dp; /* inode for the directory being modified */ 8766 struct inode *ip; /* inode for directory entry being removed */ 8767 ino_t newinum; /* new inode number for changed entry */ 8768 int isrmdir; /* indicates if doing RMDIR */ 8769 { 8770 int offset; 8771 struct diradd *dap = NULL; 8772 struct dirrem *dirrem, *prevdirrem; 8773 struct pagedep *pagedep; 8774 struct inodedep *inodedep; 8775 struct jaddref *jaddref; 8776 struct mount *mp; 8777 8778 offset = blkoff(dp->i_fs, dp->i_offset); 8779 mp = UFSTOVFS(dp->i_ump); 8780 8781 /* 8782 * Whiteouts do not need diradd dependencies. 
8783 */ 8784 if (newinum != WINO) { 8785 dap = malloc(sizeof(struct diradd), 8786 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); 8787 workitem_alloc(&dap->da_list, D_DIRADD, mp); 8788 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; 8789 dap->da_offset = offset; 8790 dap->da_newinum = newinum; 8791 LIST_INIT(&dap->da_jwork); 8792 } 8793 8794 /* 8795 * Allocate a new dirrem and ACQUIRE_LOCK. 8796 */ 8797 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 8798 pagedep = dirrem->dm_pagedep; 8799 /* 8800 * The possible values for isrmdir: 8801 * 0 - non-directory file rename 8802 * 1 - directory rename within same directory 8803 * inum - directory rename to new directory of given inode number 8804 * When renaming to a new directory, we are both deleting and 8805 * creating a new directory entry, so the link count on the new 8806 * directory should not change. Thus we do not need the followup 8807 * dirrem which is usually done in handle_workitem_remove. We set 8808 * the DIRCHG flag to tell handle_workitem_remove to skip the 8809 * followup dirrem. 8810 */ 8811 if (isrmdir > 1) 8812 dirrem->dm_state |= DIRCHG; 8813 8814 /* 8815 * Whiteouts have no additional dependencies, 8816 * so just put the dirrem on the correct list. 8817 */ 8818 if (newinum == WINO) { 8819 if ((dirrem->dm_state & COMPLETE) == 0) { 8820 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, 8821 dm_next); 8822 } else { 8823 dirrem->dm_dirinum = pagedep->pd_ino; 8824 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 8825 add_to_worklist(&dirrem->dm_list, 0); 8826 } 8827 FREE_LOCK(&lk); 8828 return; 8829 } 8830 /* 8831 * Add the dirrem to the inodedep's pending remove list for quick 8832 * discovery later. A valid nlinkdelta ensures that this lookup 8833 * will not fail. 8834 */ 8835 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 8836 panic("softdep_setup_directory_change: Lost inodedep."); 8837 dirrem->dm_state |= ONDEPLIST; 8838 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 8839 8840 /* 8841 * If the COMPLETE flag is clear, then there were no active 8842 * entries and we want to roll back to the previous inode until 8843 * the new inode is committed to disk. If the COMPLETE flag is 8844 * set, then we have deleted an entry that never made it to disk. 8845 * If the entry we deleted resulted from a name change, then the old 8846 * inode reference still resides on disk. Any rollback that we do 8847 * needs to be to that old inode (returned to us in prevdirrem). If 8848 * the entry we deleted resulted from a create, then there is 8849 * no entry on the disk, so we want to roll back to zero rather 8850 * than the uncommitted inode. In either of the COMPLETE cases we 8851 * want to immediately free the unwritten and unreferenced inode. 8852 */ 8853 if ((dirrem->dm_state & COMPLETE) == 0) { 8854 dap->da_previous = dirrem; 8855 } else { 8856 if (prevdirrem != NULL) { 8857 dap->da_previous = prevdirrem; 8858 } else { 8859 dap->da_state &= ~DIRCHG; 8860 dap->da_pagedep = pagedep; 8861 } 8862 dirrem->dm_dirinum = pagedep->pd_ino; 8863 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 8864 add_to_worklist(&dirrem->dm_list, 0); 8865 } 8866 /* 8867 * Lookup the jaddref for this journal entry. We must finish 8868 * initializing it and make the diradd write dependent on it. 8869 * If we're not journaling Put it on the id_bufwait list if the inode 8870 * is not yet written. If it is written, do the post-inode write 8871 * processing to put it on the id_pendinghd list. 
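 * The three branches below implement exactly that: the MOUNTEDSUJ() case
 * links the diradd to the jaddref, the ALLCOMPLETE case does the
 * post-inode-write processing immediately, and the remaining case parks
 * the diradd on id_bufwait.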
8872 */ 8873 inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); 8874 if (MOUNTEDSUJ(mp)) { 8875 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 8876 inoreflst); 8877 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 8878 ("softdep_setup_directory_change: bad jaddref %p", 8879 jaddref)); 8880 jaddref->ja_diroff = dp->i_offset; 8881 jaddref->ja_diradd = dap; 8882 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], 8883 dap, da_pdlist); 8884 add_to_journal(&jaddref->ja_list); 8885 } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 8886 dap->da_state |= COMPLETE; 8887 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 8888 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 8889 } else { 8890 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], 8891 dap, da_pdlist); 8892 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 8893 } 8894 /* 8895 * If we're making a new name for a directory that has not been 8896 * committed when need to move the dot and dotdot references to 8897 * this new name. 8898 */ 8899 if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) 8900 merge_diradd(inodedep, dap); 8901 FREE_LOCK(&lk); 8902 } 8903 8904 /* 8905 * Called whenever the link count on an inode is changed. 8906 * It creates an inode dependency so that the new reference(s) 8907 * to the inode cannot be committed to disk until the updated 8908 * inode has been written. 8909 */ 8910 void 8911 softdep_change_linkcnt(ip) 8912 struct inode *ip; /* the inode with the increased link count */ 8913 { 8914 struct inodedep *inodedep; 8915 8916 ACQUIRE_LOCK(&lk); 8917 inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); 8918 if (ip->i_nlink < ip->i_effnlink) 8919 panic("softdep_change_linkcnt: bad delta"); 8920 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 8921 FREE_LOCK(&lk); 8922 } 8923 8924 /* 8925 * Attach a sbdep dependency to the superblock buf so that we can keep 8926 * track of the head of the linked list of referenced but unlinked inodes. 8927 */ 8928 void 8929 softdep_setup_sbupdate(ump, fs, bp) 8930 struct ufsmount *ump; 8931 struct fs *fs; 8932 struct buf *bp; 8933 { 8934 struct sbdep *sbdep; 8935 struct worklist *wk; 8936 8937 if (MOUNTEDSUJ(UFSTOVFS(ump)) == 0) 8938 return; 8939 LIST_FOREACH(wk, &bp->b_dep, wk_list) 8940 if (wk->wk_type == D_SBDEP) 8941 break; 8942 if (wk != NULL) 8943 return; 8944 sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS); 8945 workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump)); 8946 sbdep->sb_fs = fs; 8947 sbdep->sb_ump = ump; 8948 ACQUIRE_LOCK(&lk); 8949 WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list); 8950 FREE_LOCK(&lk); 8951 } 8952 8953 /* 8954 * Return the first unlinked inodedep which is ready to be the head of the 8955 * list. The inodedep and all those after it must have valid next pointers. 
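 *
 * The walk starts at the tail of softdep_unlinked and moves backwards,
 * stopping at the first element whose predecessor (if any) does not have
 * UNLINKNEXT set; if the tail itself lacks UNLINKNEXT there is no usable
 * head and NULL is returned.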
8956 */ 8957 static struct inodedep * 8958 first_unlinked_inodedep(ump) 8959 struct ufsmount *ump; 8960 { 8961 struct inodedep *inodedep; 8962 struct inodedep *idp; 8963 8964 for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); 8965 inodedep; inodedep = idp) { 8966 if ((inodedep->id_state & UNLINKNEXT) == 0) 8967 return (NULL); 8968 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 8969 if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) 8970 break; 8971 if ((inodedep->id_state & UNLINKPREV) == 0) 8972 panic("first_unlinked_inodedep: prev != next"); 8973 } 8974 if (inodedep == NULL) 8975 return (NULL); 8976 8977 return (inodedep); 8978 } 8979 8980 /* 8981 * Set the sujfree unlinked head pointer prior to writing a superblock. 8982 */ 8983 static void 8984 initiate_write_sbdep(sbdep) 8985 struct sbdep *sbdep; 8986 { 8987 struct inodedep *inodedep; 8988 struct fs *bpfs; 8989 struct fs *fs; 8990 8991 bpfs = sbdep->sb_fs; 8992 fs = sbdep->sb_ump->um_fs; 8993 inodedep = first_unlinked_inodedep(sbdep->sb_ump); 8994 if (inodedep) { 8995 fs->fs_sujfree = inodedep->id_ino; 8996 inodedep->id_state |= UNLINKPREV; 8997 } else 8998 fs->fs_sujfree = 0; 8999 bpfs->fs_sujfree = fs->fs_sujfree; 9000 } 9001 9002 /* 9003 * After a superblock is written determine whether it must be written again 9004 * due to a changing unlinked list head. 9005 */ 9006 static int 9007 handle_written_sbdep(sbdep, bp) 9008 struct sbdep *sbdep; 9009 struct buf *bp; 9010 { 9011 struct inodedep *inodedep; 9012 struct mount *mp; 9013 struct fs *fs; 9014 9015 fs = sbdep->sb_fs; 9016 mp = UFSTOVFS(sbdep->sb_ump); 9017 inodedep = first_unlinked_inodedep(sbdep->sb_ump); 9018 if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || 9019 (inodedep == NULL && fs->fs_sujfree != 0)) { 9020 bdirty(bp); 9021 return (1); 9022 } 9023 WORKITEM_FREE(sbdep, D_SBDEP); 9024 if (fs->fs_sujfree == 0) 9025 return (0); 9026 if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0) 9027 panic("handle_written_sbdep: lost inodedep"); 9028 /* 9029 * Now that we have a record of this inode in stable store allow it 9030 * to be written to free up pending work. Inodes may see a lot of 9031 * write activity after they are unlinked which we must not hold up. 9032 */ 9033 for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { 9034 if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) 9035 panic("handle_written_sbdep: Bad inodedep %p (0x%X)", 9036 inodedep, inodedep->id_state); 9037 if (inodedep->id_state & UNLINKONLIST) 9038 break; 9039 inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST; 9040 } 9041 9042 return (0); 9043 } 9044 9045 /* 9046 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list. 9047 */ 9048 static void 9049 unlinked_inodedep(mp, inodedep) 9050 struct mount *mp; 9051 struct inodedep *inodedep; 9052 { 9053 struct ufsmount *ump; 9054 9055 if (MOUNTEDSUJ(mp) == 0) 9056 return; 9057 ump = VFSTOUFS(mp); 9058 ump->um_fs->fs_fmod = 1; 9059 inodedep->id_state |= UNLINKED; 9060 TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); 9061 } 9062 9063 /* 9064 * Remove an inodedep from the unlinked inodedep list. This may require 9065 * disk writes if the inode has made it that far. 
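 * The on-disk list is singly linked: fs_sujfree names the first unlinked
 * inode and each unlinked inode's di_freelink names the next, e.g.
 *
 *	fs_sujfree -> ino 17 -> ino 42 -> ino 99 -> 0
 *
 * so unhooking an inode that has already reached the disk means rewriting
 * its predecessor (the superblock or another inode block) to point past it.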
9066 */ 9067 static void 9068 clear_unlinked_inodedep(inodedep) 9069 struct inodedep *inodedep; 9070 { 9071 struct ufsmount *ump; 9072 struct inodedep *idp; 9073 struct inodedep *idn; 9074 struct fs *fs; 9075 struct buf *bp; 9076 ino_t ino; 9077 ino_t nino; 9078 ino_t pino; 9079 int error; 9080 9081 ump = VFSTOUFS(inodedep->id_list.wk_mp); 9082 fs = ump->um_fs; 9083 ino = inodedep->id_ino; 9084 error = 0; 9085 for (;;) { 9086 /* 9087 * If nothing has yet been written simply remove us from 9088 * the in memory list and return. This is the most common 9089 * case where handle_workitem_remove() loses the final 9090 * reference. 9091 */ 9092 if ((inodedep->id_state & UNLINKLINKS) == 0) 9093 break; 9094 /* 9095 * If we have a NEXT pointer and no PREV pointer we can simply 9096 * clear NEXT's PREV and remove ourselves from the list. Be 9097 * careful not to clear PREV if the superblock points at 9098 * next as well. 9099 */ 9100 idn = TAILQ_NEXT(inodedep, id_unlinked); 9101 if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { 9102 if (idn && fs->fs_sujfree != idn->id_ino) 9103 idn->id_state &= ~UNLINKPREV; 9104 break; 9105 } 9106 /* 9107 * Here we have an inodedep which is actually linked into 9108 * the list. We must remove it by forcing a write to the 9109 * link before us, whether it be the superblock or an inode. 9110 * Unfortunately the list may change while we're waiting 9111 * on the buf lock for either resource so we must loop until 9112 * we lock the right one. If both the superblock and an 9113 * inode point to this inode we must clear the inode first 9114 * followed by the superblock. 9115 */ 9116 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 9117 pino = 0; 9118 if (idp && (idp->id_state & UNLINKNEXT)) 9119 pino = idp->id_ino; 9120 FREE_LOCK(&lk); 9121 if (pino == 0) 9122 bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), 9123 (int)fs->fs_sbsize, 0, 0, 0); 9124 else 9125 error = bread(ump->um_devvp, 9126 fsbtodb(fs, ino_to_fsba(fs, pino)), 9127 (int)fs->fs_bsize, NOCRED, &bp); 9128 ACQUIRE_LOCK(&lk); 9129 if (error) 9130 break; 9131 /* If the list has changed restart the loop. */ 9132 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 9133 nino = 0; 9134 if (idp && (idp->id_state & UNLINKNEXT)) 9135 nino = idp->id_ino; 9136 if (nino != pino || 9137 (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { 9138 FREE_LOCK(&lk); 9139 brelse(bp); 9140 ACQUIRE_LOCK(&lk); 9141 continue; 9142 } 9143 /* 9144 * Remove us from the in memory list. After this we cannot 9145 * access the inodedep. 9146 */ 9147 idn = TAILQ_NEXT(inodedep, id_unlinked); 9148 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); 9149 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); 9150 /* 9151 * Determine the next inode number. 9152 */ 9153 nino = 0; 9154 if (idn) { 9155 /* 9156 * If next isn't on the list we can just clear prev's 9157 * state and schedule it to be fixed later. No need 9158 * to synchronously write if we're not in the real 9159 * list. 9160 */ 9161 if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) { 9162 idp->id_state &= ~UNLINKNEXT; 9163 if ((idp->id_state & ONWORKLIST) == 0) 9164 WORKLIST_INSERT(&bp->b_dep, 9165 &idp->id_list); 9166 FREE_LOCK(&lk); 9167 bawrite(bp); 9168 ACQUIRE_LOCK(&lk); 9169 return; 9170 } 9171 nino = idn->id_ino; 9172 } 9173 FREE_LOCK(&lk); 9174 /* 9175 * The predecessor's next pointer is manually updated here 9176 * so that the NEXT flag is never cleared for an element 9177 * that is in the list. 
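 * For example, if the on-disk chain is 17 -> 42 -> 99 and inode 42 is
 * being removed, then either inode 17's di_freelink (pino != 0) or the
 * superblock's fs_sujfree (pino == 0) is rewritten below to name 99
 * (nino) directly.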
9178 */ 9179 if (pino == 0) { 9180 bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); 9181 ffs_oldfscompat_write((struct fs *)bp->b_data, ump); 9182 softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, 9183 bp); 9184 } else if (fs->fs_magic == FS_UFS1_MAGIC) 9185 ((struct ufs1_dinode *)bp->b_data + 9186 ino_to_fsbo(fs, pino))->di_freelink = nino; 9187 else 9188 ((struct ufs2_dinode *)bp->b_data + 9189 ino_to_fsbo(fs, pino))->di_freelink = nino; 9190 /* 9191 * If the bwrite fails we have no recourse to recover. The 9192 * filesystem is corrupted already. 9193 */ 9194 bwrite(bp); 9195 ACQUIRE_LOCK(&lk); 9196 /* 9197 * If the superblock pointer still needs to be cleared force 9198 * a write here. 9199 */ 9200 if (fs->fs_sujfree == ino) { 9201 FREE_LOCK(&lk); 9202 bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), 9203 (int)fs->fs_sbsize, 0, 0, 0); 9204 bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); 9205 ffs_oldfscompat_write((struct fs *)bp->b_data, ump); 9206 softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, 9207 bp); 9208 bwrite(bp); 9209 ACQUIRE_LOCK(&lk); 9210 } 9211 if (fs->fs_sujfree != ino) 9212 return; 9213 panic("clear_unlinked_inodedep: Failed to clear free head"); 9214 } 9215 if (inodedep->id_ino == fs->fs_sujfree) 9216 panic("clear_unlinked_inodedep: Freeing head of free list"); 9217 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); 9218 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); 9219 return; 9220 } 9221 9222 /* 9223 * This workitem decrements the inode's link count. 9224 * If the link count reaches zero, the file is removed. 9225 */ 9226 static int 9227 handle_workitem_remove(dirrem, flags) 9228 struct dirrem *dirrem; 9229 int flags; 9230 { 9231 struct inodedep *inodedep; 9232 struct workhead dotdotwk; 9233 struct worklist *wk; 9234 struct ufsmount *ump; 9235 struct mount *mp; 9236 struct vnode *vp; 9237 struct inode *ip; 9238 ino_t oldinum; 9239 9240 if (dirrem->dm_state & ONWORKLIST) 9241 panic("handle_workitem_remove: dirrem %p still on worklist", 9242 dirrem); 9243 oldinum = dirrem->dm_oldinum; 9244 mp = dirrem->dm_list.wk_mp; 9245 ump = VFSTOUFS(mp); 9246 flags |= LK_EXCLUSIVE; 9247 if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0) 9248 return (EBUSY); 9249 ip = VTOI(vp); 9250 ACQUIRE_LOCK(&lk); 9251 if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) 9252 panic("handle_workitem_remove: lost inodedep"); 9253 if (dirrem->dm_state & ONDEPLIST) 9254 LIST_REMOVE(dirrem, dm_inonext); 9255 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), 9256 ("handle_workitem_remove: Journal entries not written.")); 9257 9258 /* 9259 * Move all dependencies waiting on the remove to complete 9260 * from the dirrem to the inode inowait list to be completed 9261 * after the inode has been updated and written to disk. Any 9262 * marked MKDIR_PARENT are saved to be completed when the .. ref 9263 * is removed. 9264 */ 9265 LIST_INIT(&dotdotwk); 9266 while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { 9267 WORKLIST_REMOVE(wk); 9268 if (wk->wk_state & MKDIR_PARENT) { 9269 wk->wk_state &= ~MKDIR_PARENT; 9270 WORKLIST_INSERT(&dotdotwk, wk); 9271 continue; 9272 } 9273 WORKLIST_INSERT(&inodedep->id_inowait, wk); 9274 } 9275 LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); 9276 /* 9277 * Normal file deletion. 
9278 */ 9279 if ((dirrem->dm_state & RMDIR) == 0) { 9280 ip->i_nlink--; 9281 DIP_SET(ip, i_nlink, ip->i_nlink); 9282 ip->i_flag |= IN_CHANGE; 9283 if (ip->i_nlink < ip->i_effnlink) 9284 panic("handle_workitem_remove: bad file delta"); 9285 if (ip->i_nlink == 0) 9286 unlinked_inodedep(mp, inodedep); 9287 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 9288 KASSERT(LIST_EMPTY(&dirrem->dm_jwork), 9289 ("handle_workitem_remove: worklist not empty. %s", 9290 TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); 9291 WORKITEM_FREE(dirrem, D_DIRREM); 9292 FREE_LOCK(&lk); 9293 goto out; 9294 } 9295 /* 9296 * Directory deletion. Decrement reference count for both the 9297 * just deleted parent directory entry and the reference for ".". 9298 * Arrange to have the reference count on the parent decremented 9299 * to account for the loss of "..". 9300 */ 9301 ip->i_nlink -= 2; 9302 DIP_SET(ip, i_nlink, ip->i_nlink); 9303 ip->i_flag |= IN_CHANGE; 9304 if (ip->i_nlink < ip->i_effnlink) 9305 panic("handle_workitem_remove: bad dir delta"); 9306 if (ip->i_nlink == 0) 9307 unlinked_inodedep(mp, inodedep); 9308 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 9309 /* 9310 * Rename a directory to a new parent. Since, we are both deleting 9311 * and creating a new directory entry, the link count on the new 9312 * directory should not change. Thus we skip the followup dirrem. 9313 */ 9314 if (dirrem->dm_state & DIRCHG) { 9315 KASSERT(LIST_EMPTY(&dirrem->dm_jwork), 9316 ("handle_workitem_remove: DIRCHG and worklist not empty.")); 9317 WORKITEM_FREE(dirrem, D_DIRREM); 9318 FREE_LOCK(&lk); 9319 goto out; 9320 } 9321 dirrem->dm_state = ONDEPLIST; 9322 dirrem->dm_oldinum = dirrem->dm_dirinum; 9323 /* 9324 * Place the dirrem on the parent's diremhd list. 9325 */ 9326 if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) 9327 panic("handle_workitem_remove: lost dir inodedep"); 9328 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 9329 /* 9330 * If the allocated inode has never been written to disk, then 9331 * the on-disk inode is zero'ed and we can remove the file 9332 * immediately. When journaling if the inode has been marked 9333 * unlinked and not DEPCOMPLETE we know it can never be written. 9334 */ 9335 inodedep_lookup(mp, oldinum, 0, &inodedep); 9336 if (inodedep == NULL || 9337 (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || 9338 check_inode_unwritten(inodedep)) { 9339 FREE_LOCK(&lk); 9340 vput(vp); 9341 return handle_workitem_remove(dirrem, flags); 9342 } 9343 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); 9344 FREE_LOCK(&lk); 9345 ip->i_flag |= IN_CHANGE; 9346 out: 9347 ffs_update(vp, 0); 9348 vput(vp); 9349 return (0); 9350 } 9351 9352 /* 9353 * Inode de-allocation dependencies. 9354 * 9355 * When an inode's link count is reduced to zero, it can be de-allocated. We 9356 * found it convenient to postpone de-allocation until after the inode is 9357 * written to disk with its new link count (zero). At this point, all of the 9358 * on-disk inode's block pointers are nullified and, with careful dependency 9359 * list ordering, all dependencies related to the inode will be satisfied and 9360 * the corresponding dependency structures de-allocated. So, if/when the 9361 * inode is reused, there will be no mixing of old dependencies with new 9362 * ones. This artificial dependency is set up by the block de-allocation 9363 * procedure above (softdep_setup_freeblocks) and completed by the 9364 * following procedure. 
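 * In short, for a remove the ordering enforced is roughly:
 *
 *	1) the directory entry is cleared on disk (dirrem),
 *	2) the inode is written with its reduced (zero) link count,
 *	3) the freed blocks are released (freeblks/freework),
 *	4) the inode itself is returned to the cylinder group map by
 *	   handle_workitem_freefile() below.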
9365 */ 9366 static void 9367 handle_workitem_freefile(freefile) 9368 struct freefile *freefile; 9369 { 9370 struct workhead wkhd; 9371 struct fs *fs; 9372 struct inodedep *idp; 9373 struct ufsmount *ump; 9374 int error; 9375 9376 ump = VFSTOUFS(freefile->fx_list.wk_mp); 9377 fs = ump->um_fs; 9378 #ifdef DEBUG 9379 ACQUIRE_LOCK(&lk); 9380 error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); 9381 FREE_LOCK(&lk); 9382 if (error) 9383 panic("handle_workitem_freefile: inodedep %p survived", idp); 9384 #endif 9385 UFS_LOCK(ump); 9386 fs->fs_pendinginodes -= 1; 9387 UFS_UNLOCK(ump); 9388 LIST_INIT(&wkhd); 9389 LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); 9390 if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, 9391 freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) 9392 softdep_error("handle_workitem_freefile", error); 9393 ACQUIRE_LOCK(&lk); 9394 WORKITEM_FREE(freefile, D_FREEFILE); 9395 FREE_LOCK(&lk); 9396 } 9397 9398 9399 /* 9400 * Helper function which unlinks marker element from work list and returns 9401 * the next element on the list. 9402 */ 9403 static __inline struct worklist * 9404 markernext(struct worklist *marker) 9405 { 9406 struct worklist *next; 9407 9408 next = LIST_NEXT(marker, wk_list); 9409 LIST_REMOVE(marker, wk_list); 9410 return next; 9411 } 9412 9413 /* 9414 * Disk writes. 9415 * 9416 * The dependency structures constructed above are most actively used when file 9417 * system blocks are written to disk. No constraints are placed on when a 9418 * block can be written, but unsatisfied update dependencies are made safe by 9419 * modifying (or replacing) the source memory for the duration of the disk 9420 * write. When the disk write completes, the memory block is again brought 9421 * up-to-date. 9422 * 9423 * In-core inode structure reclamation. 9424 * 9425 * Because there are a finite number of "in-core" inode structures, they are 9426 * reused regularly. By transferring all inode-related dependencies to the 9427 * in-memory inode block and indexing them separately (via "inodedep"s), we 9428 * can allow "in-core" inode structures to be reused at any time and avoid 9429 * any increase in contention. 9430 * 9431 * Called just before entering the device driver to initiate a new disk I/O. 9432 * The buffer must be locked, thus, no I/O completion operations can occur 9433 * while we are manipulating its associated dependencies. 9434 */ 9435 static void 9436 softdep_disk_io_initiation(bp) 9437 struct buf *bp; /* structure describing disk write to occur */ 9438 { 9439 struct worklist *wk; 9440 struct worklist marker; 9441 struct inodedep *inodedep; 9442 struct freeblks *freeblks; 9443 struct jblkdep *jblkdep; 9444 struct newblk *newblk; 9445 9446 /* 9447 * We only care about write operations. There should never 9448 * be dependencies for reads. 9449 */ 9450 if (bp->b_iocmd != BIO_WRITE) 9451 panic("softdep_disk_io_initiation: not write"); 9452 9453 if (bp->b_vflags & BV_BKGRDINPROG) 9454 panic("softdep_disk_io_initiation: Writing buffer with " 9455 "background write in progress: %p", bp); 9456 9457 marker.wk_type = D_LAST + 1; /* Not a normal workitem */ 9458 PHOLD(curproc); /* Don't swap out kernel stack */ 9459 9460 ACQUIRE_LOCK(&lk); 9461 /* 9462 * Do any necessary pre-I/O processing. 
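 * The marker work item is threaded through bp->b_dep so the list can be
 * re-walked safely: jwait() below may drop lk, so each iteration resumes
 * at markernext(&marker) rather than at a possibly stale wk pointer.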
9463 */ 9464 for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; 9465 wk = markernext(&marker)) { 9466 LIST_INSERT_AFTER(wk, &marker, wk_list); 9467 switch (wk->wk_type) { 9468 9469 case D_PAGEDEP: 9470 initiate_write_filepage(WK_PAGEDEP(wk), bp); 9471 continue; 9472 9473 case D_INODEDEP: 9474 inodedep = WK_INODEDEP(wk); 9475 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) 9476 initiate_write_inodeblock_ufs1(inodedep, bp); 9477 else 9478 initiate_write_inodeblock_ufs2(inodedep, bp); 9479 continue; 9480 9481 case D_INDIRDEP: 9482 initiate_write_indirdep(WK_INDIRDEP(wk), bp); 9483 continue; 9484 9485 case D_BMSAFEMAP: 9486 initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); 9487 continue; 9488 9489 case D_JSEG: 9490 WK_JSEG(wk)->js_buf = NULL; 9491 continue; 9492 9493 case D_FREEBLKS: 9494 freeblks = WK_FREEBLKS(wk); 9495 jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd); 9496 /* 9497 * We have to wait for the freeblks to be journaled 9498 * before we can write an inodeblock with updated 9499 * pointers. Be careful to arrange the marker so 9500 * we revisit the freeblks if it's not removed by 9501 * the first jwait(). 9502 */ 9503 if (jblkdep != NULL) { 9504 LIST_REMOVE(&marker, wk_list); 9505 LIST_INSERT_BEFORE(wk, &marker, wk_list); 9506 jwait(&jblkdep->jb_list, MNT_WAIT); 9507 } 9508 continue; 9509 case D_ALLOCDIRECT: 9510 case D_ALLOCINDIR: 9511 /* 9512 * We have to wait for the jnewblk to be journaled 9513 * before we can write to a block if the contents 9514 * may be confused with an earlier file's indirect 9515 * at recovery time. Handle the marker as described 9516 * above. 9517 */ 9518 newblk = WK_NEWBLK(wk); 9519 if (newblk->nb_jnewblk != NULL && 9520 indirblk_lookup(newblk->nb_list.wk_mp, 9521 newblk->nb_newblkno)) { 9522 LIST_REMOVE(&marker, wk_list); 9523 LIST_INSERT_BEFORE(wk, &marker, wk_list); 9524 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 9525 } 9526 continue; 9527 9528 case D_SBDEP: 9529 initiate_write_sbdep(WK_SBDEP(wk)); 9530 continue; 9531 9532 case D_MKDIR: 9533 case D_FREEWORK: 9534 case D_FREEDEP: 9535 case D_JSEGDEP: 9536 continue; 9537 9538 default: 9539 panic("handle_disk_io_initiation: Unexpected type %s", 9540 TYPENAME(wk->wk_type)); 9541 /* NOTREACHED */ 9542 } 9543 } 9544 FREE_LOCK(&lk); 9545 PRELE(curproc); /* Allow swapout of kernel stack */ 9546 } 9547 9548 /* 9549 * Called from within the procedure above to deal with unsatisfied 9550 * allocation dependencies in a directory. The buffer must be locked, 9551 * thus, no I/O completion operations can occur while we are 9552 * manipulating its associated dependencies. 9553 */ 9554 static void 9555 initiate_write_filepage(pagedep, bp) 9556 struct pagedep *pagedep; 9557 struct buf *bp; 9558 { 9559 struct jremref *jremref; 9560 struct jmvref *jmvref; 9561 struct dirrem *dirrem; 9562 struct diradd *dap; 9563 struct direct *ep; 9564 int i; 9565 9566 if (pagedep->pd_state & IOSTARTED) { 9567 /* 9568 * This can only happen if there is a driver that does not 9569 * understand chaining. Here biodone will reissue the call 9570 * to strategy for the incomplete buffers. 9571 */ 9572 printf("initiate_write_filepage: already started\n"); 9573 return; 9574 } 9575 pagedep->pd_state |= IOSTARTED; 9576 /* 9577 * Wait for all journal remove dependencies to hit the disk. 9578 * We can not allow any potentially conflicting directory adds 9579 * to be visible before removes and rollback is too difficult. 9580 * lk may be dropped and re-acquired, however we hold the buf 9581 * locked so the dependency can not go away. 
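 * Once those journal writes are complete, the loop over pd_diraddhd
 * below rolls back any still-incomplete new entries before the write:
 * d_ino is restored to the previous inode number for a DIRCHG rename,
 * or to zero for a plain create, and the diradd is marked UNDONE so it
 * can be redone after the write completes.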
9582 */ 9583 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) 9584 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) 9585 jwait(&jremref->jr_list, MNT_WAIT); 9586 while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) 9587 jwait(&jmvref->jm_list, MNT_WAIT); 9588 for (i = 0; i < DAHASHSZ; i++) { 9589 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { 9590 ep = (struct direct *) 9591 ((char *)bp->b_data + dap->da_offset); 9592 if (ep->d_ino != dap->da_newinum) 9593 panic("%s: dir inum %d != new %d", 9594 "initiate_write_filepage", 9595 ep->d_ino, dap->da_newinum); 9596 if (dap->da_state & DIRCHG) 9597 ep->d_ino = dap->da_previous->dm_oldinum; 9598 else 9599 ep->d_ino = 0; 9600 dap->da_state &= ~ATTACHED; 9601 dap->da_state |= UNDONE; 9602 } 9603 } 9604 } 9605 9606 /* 9607 * Version of initiate_write_inodeblock that handles UFS1 dinodes. 9608 * Note that any bug fixes made to this routine must be done in the 9609 * version found below. 9610 * 9611 * Called from within the procedure above to deal with unsatisfied 9612 * allocation dependencies in an inodeblock. The buffer must be 9613 * locked, thus, no I/O completion operations can occur while we 9614 * are manipulating its associated dependencies. 9615 */ 9616 static void 9617 initiate_write_inodeblock_ufs1(inodedep, bp) 9618 struct inodedep *inodedep; 9619 struct buf *bp; /* The inode block */ 9620 { 9621 struct allocdirect *adp, *lastadp; 9622 struct ufs1_dinode *dp; 9623 struct ufs1_dinode *sip; 9624 struct inoref *inoref; 9625 struct fs *fs; 9626 ufs_lbn_t i; 9627 #ifdef INVARIANTS 9628 ufs_lbn_t prevlbn = 0; 9629 #endif 9630 int deplist; 9631 9632 if (inodedep->id_state & IOSTARTED) 9633 panic("initiate_write_inodeblock_ufs1: already started"); 9634 inodedep->id_state |= IOSTARTED; 9635 fs = inodedep->id_fs; 9636 dp = (struct ufs1_dinode *)bp->b_data + 9637 ino_to_fsbo(fs, inodedep->id_ino); 9638 9639 /* 9640 * If we're on the unlinked list but have not yet written our 9641 * next pointer initialize it here. 9642 */ 9643 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 9644 struct inodedep *inon; 9645 9646 inon = TAILQ_NEXT(inodedep, id_unlinked); 9647 dp->di_freelink = inon ? inon->id_ino : 0; 9648 } 9649 /* 9650 * If the bitmap is not yet written, then the allocated 9651 * inode cannot be written to disk. 9652 */ 9653 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 9654 if (inodedep->id_savedino1 != NULL) 9655 panic("initiate_write_inodeblock_ufs1: I/O underway"); 9656 FREE_LOCK(&lk); 9657 sip = malloc(sizeof(struct ufs1_dinode), 9658 M_SAVEDINO, M_SOFTDEP_FLAGS); 9659 ACQUIRE_LOCK(&lk); 9660 inodedep->id_savedino1 = sip; 9661 *inodedep->id_savedino1 = *dp; 9662 bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); 9663 dp->di_gen = inodedep->id_savedino1->di_gen; 9664 dp->di_freelink = inodedep->id_savedino1->di_freelink; 9665 return; 9666 } 9667 /* 9668 * If no dependencies, then there is nothing to roll back. 9669 */ 9670 inodedep->id_savedsize = dp->di_size; 9671 inodedep->id_savedextsize = 0; 9672 inodedep->id_savednlink = dp->di_nlink; 9673 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 9674 TAILQ_EMPTY(&inodedep->id_inoreflst)) 9675 return; 9676 /* 9677 * Revert the link count to that of the first unwritten journal entry. 9678 */ 9679 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 9680 if (inoref) 9681 dp->di_nlink = inoref->if_nlink; 9682 /* 9683 * Set the dependencies to busy. 
9684 */ 9685 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9686 adp = TAILQ_NEXT(adp, ad_next)) { 9687 #ifdef INVARIANTS 9688 if (deplist != 0 && prevlbn >= adp->ad_offset) 9689 panic("softdep_write_inodeblock: lbn order"); 9690 prevlbn = adp->ad_offset; 9691 if (adp->ad_offset < NDADDR && 9692 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 9693 panic("%s: direct pointer #%jd mismatch %d != %jd", 9694 "softdep_write_inodeblock", 9695 (intmax_t)adp->ad_offset, 9696 dp->di_db[adp->ad_offset], 9697 (intmax_t)adp->ad_newblkno); 9698 if (adp->ad_offset >= NDADDR && 9699 dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) 9700 panic("%s: indirect pointer #%jd mismatch %d != %jd", 9701 "softdep_write_inodeblock", 9702 (intmax_t)adp->ad_offset - NDADDR, 9703 dp->di_ib[adp->ad_offset - NDADDR], 9704 (intmax_t)adp->ad_newblkno); 9705 deplist |= 1 << adp->ad_offset; 9706 if ((adp->ad_state & ATTACHED) == 0) 9707 panic("softdep_write_inodeblock: Unknown state 0x%x", 9708 adp->ad_state); 9709 #endif /* INVARIANTS */ 9710 adp->ad_state &= ~ATTACHED; 9711 adp->ad_state |= UNDONE; 9712 } 9713 /* 9714 * The on-disk inode cannot claim to be any larger than the last 9715 * fragment that has been written. Otherwise, the on-disk inode 9716 * might have fragments that were not the last block in the file 9717 * which would corrupt the filesystem. 9718 */ 9719 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9720 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 9721 if (adp->ad_offset >= NDADDR) 9722 break; 9723 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 9724 /* keep going until hitting a rollback to a frag */ 9725 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 9726 continue; 9727 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 9728 for (i = adp->ad_offset + 1; i < NDADDR; i++) { 9729 #ifdef INVARIANTS 9730 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 9731 panic("softdep_write_inodeblock: lost dep1"); 9732 #endif /* INVARIANTS */ 9733 dp->di_db[i] = 0; 9734 } 9735 for (i = 0; i < NIADDR; i++) { 9736 #ifdef INVARIANTS 9737 if (dp->di_ib[i] != 0 && 9738 (deplist & ((1 << NDADDR) << i)) == 0) 9739 panic("softdep_write_inodeblock: lost dep2"); 9740 #endif /* INVARIANTS */ 9741 dp->di_ib[i] = 0; 9742 } 9743 return; 9744 } 9745 /* 9746 * If we have zero'ed out the last allocated block of the file, 9747 * roll back the size to the last currently allocated block. 9748 * We know that this last allocated block is a full-sized as 9749 * we already checked for fragments in the loop above. 9750 */ 9751 if (lastadp != NULL && 9752 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 9753 for (i = lastadp->ad_offset; i >= 0; i--) 9754 if (dp->di_db[i] != 0) 9755 break; 9756 dp->di_size = (i + 1) * fs->fs_bsize; 9757 } 9758 /* 9759 * The only dependencies are for indirect blocks. 9760 * 9761 * The file size for indirect block additions is not guaranteed. 9762 * Such a guarantee would be non-trivial to achieve. The conventional 9763 * synchronous write implementation also does not make this guarantee. 9764 * Fsck should catch and fix discrepancies. Arguably, the file size 9765 * can be over-estimated without destroying integrity when the file 9766 * moves into the indirect blocks (i.e., is large). If we want to 9767 * postpone fsck, we are stuck with this argument. 
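 * Accordingly, the loop below simply rolls the remaining indirect
 * pointers (di_ib[]) back to zero for any allocdirect that is still
 * outstanding, and leaves di_size alone for the reasons given above.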
9768 */ 9769 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 9770 dp->di_ib[adp->ad_offset - NDADDR] = 0; 9771 } 9772 9773 /* 9774 * Version of initiate_write_inodeblock that handles UFS2 dinodes. 9775 * Note that any bug fixes made to this routine must be done in the 9776 * version found above. 9777 * 9778 * Called from within the procedure above to deal with unsatisfied 9779 * allocation dependencies in an inodeblock. The buffer must be 9780 * locked, thus, no I/O completion operations can occur while we 9781 * are manipulating its associated dependencies. 9782 */ 9783 static void 9784 initiate_write_inodeblock_ufs2(inodedep, bp) 9785 struct inodedep *inodedep; 9786 struct buf *bp; /* The inode block */ 9787 { 9788 struct allocdirect *adp, *lastadp; 9789 struct ufs2_dinode *dp; 9790 struct ufs2_dinode *sip; 9791 struct inoref *inoref; 9792 struct fs *fs; 9793 ufs_lbn_t i; 9794 #ifdef INVARIANTS 9795 ufs_lbn_t prevlbn = 0; 9796 #endif 9797 int deplist; 9798 9799 if (inodedep->id_state & IOSTARTED) 9800 panic("initiate_write_inodeblock_ufs2: already started"); 9801 inodedep->id_state |= IOSTARTED; 9802 fs = inodedep->id_fs; 9803 dp = (struct ufs2_dinode *)bp->b_data + 9804 ino_to_fsbo(fs, inodedep->id_ino); 9805 9806 /* 9807 * If we're on the unlinked list but have not yet written our 9808 * next pointer initialize it here. 9809 */ 9810 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 9811 struct inodedep *inon; 9812 9813 inon = TAILQ_NEXT(inodedep, id_unlinked); 9814 dp->di_freelink = inon ? inon->id_ino : 0; 9815 } 9816 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == 9817 (UNLINKED | UNLINKNEXT)) { 9818 struct inodedep *inon; 9819 ino_t freelink; 9820 9821 inon = TAILQ_NEXT(inodedep, id_unlinked); 9822 freelink = inon ? inon->id_ino : 0; 9823 if (freelink != dp->di_freelink) 9824 panic("ino %p(0x%X) %d, %d != %d", 9825 inodedep, inodedep->id_state, inodedep->id_ino, 9826 freelink, dp->di_freelink); 9827 } 9828 /* 9829 * If the bitmap is not yet written, then the allocated 9830 * inode cannot be written to disk. 9831 */ 9832 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 9833 if (inodedep->id_savedino2 != NULL) 9834 panic("initiate_write_inodeblock_ufs2: I/O underway"); 9835 FREE_LOCK(&lk); 9836 sip = malloc(sizeof(struct ufs2_dinode), 9837 M_SAVEDINO, M_SOFTDEP_FLAGS); 9838 ACQUIRE_LOCK(&lk); 9839 inodedep->id_savedino2 = sip; 9840 *inodedep->id_savedino2 = *dp; 9841 bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); 9842 dp->di_gen = inodedep->id_savedino2->di_gen; 9843 dp->di_freelink = inodedep->id_savedino2->di_freelink; 9844 return; 9845 } 9846 /* 9847 * If no dependencies, then there is nothing to roll back. 9848 */ 9849 inodedep->id_savedsize = dp->di_size; 9850 inodedep->id_savedextsize = dp->di_extsize; 9851 inodedep->id_savednlink = dp->di_nlink; 9852 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 9853 TAILQ_EMPTY(&inodedep->id_extupdt) && 9854 TAILQ_EMPTY(&inodedep->id_inoreflst)) 9855 return; 9856 /* 9857 * Revert the link count to that of the first unwritten journal entry. 9858 */ 9859 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 9860 if (inoref) 9861 dp->di_nlink = inoref->if_nlink; 9862 9863 /* 9864 * Set the ext data dependencies to busy. 
9865 */ 9866 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 9867 adp = TAILQ_NEXT(adp, ad_next)) { 9868 #ifdef INVARIANTS 9869 if (deplist != 0 && prevlbn >= adp->ad_offset) 9870 panic("softdep_write_inodeblock: lbn order"); 9871 prevlbn = adp->ad_offset; 9872 if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) 9873 panic("%s: direct pointer #%jd mismatch %jd != %jd", 9874 "softdep_write_inodeblock", 9875 (intmax_t)adp->ad_offset, 9876 (intmax_t)dp->di_extb[adp->ad_offset], 9877 (intmax_t)adp->ad_newblkno); 9878 deplist |= 1 << adp->ad_offset; 9879 if ((adp->ad_state & ATTACHED) == 0) 9880 panic("softdep_write_inodeblock: Unknown state 0x%x", 9881 adp->ad_state); 9882 #endif /* INVARIANTS */ 9883 adp->ad_state &= ~ATTACHED; 9884 adp->ad_state |= UNDONE; 9885 } 9886 /* 9887 * The on-disk inode cannot claim to be any larger than the last 9888 * fragment that has been written. Otherwise, the on-disk inode 9889 * might have fragments that were not the last block in the ext 9890 * data which would corrupt the filesystem. 9891 */ 9892 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 9893 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 9894 dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; 9895 /* keep going until hitting a rollback to a frag */ 9896 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 9897 continue; 9898 dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 9899 for (i = adp->ad_offset + 1; i < NXADDR; i++) { 9900 #ifdef INVARIANTS 9901 if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) 9902 panic("softdep_write_inodeblock: lost dep1"); 9903 #endif /* INVARIANTS */ 9904 dp->di_extb[i] = 0; 9905 } 9906 lastadp = NULL; 9907 break; 9908 } 9909 /* 9910 * If we have zero'ed out the last allocated block of the ext 9911 * data, roll back the size to the last currently allocated block. 9912 * We know that this last allocated block is a full-sized as 9913 * we already checked for fragments in the loop above. 9914 */ 9915 if (lastadp != NULL && 9916 dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 9917 for (i = lastadp->ad_offset; i >= 0; i--) 9918 if (dp->di_extb[i] != 0) 9919 break; 9920 dp->di_extsize = (i + 1) * fs->fs_bsize; 9921 } 9922 /* 9923 * Set the file data dependencies to busy. 
9924 */ 9925 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9926 adp = TAILQ_NEXT(adp, ad_next)) { 9927 #ifdef INVARIANTS 9928 if (deplist != 0 && prevlbn >= adp->ad_offset) 9929 panic("softdep_write_inodeblock: lbn order"); 9930 if ((adp->ad_state & ATTACHED) == 0) 9931 panic("inodedep %p and adp %p not attached", inodedep, adp); 9932 prevlbn = adp->ad_offset; 9933 if (adp->ad_offset < NDADDR && 9934 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 9935 panic("%s: direct pointer #%jd mismatch %jd != %jd", 9936 "softdep_write_inodeblock", 9937 (intmax_t)adp->ad_offset, 9938 (intmax_t)dp->di_db[adp->ad_offset], 9939 (intmax_t)adp->ad_newblkno); 9940 if (adp->ad_offset >= NDADDR && 9941 dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) 9942 panic("%s indirect pointer #%jd mismatch %jd != %jd", 9943 "softdep_write_inodeblock:", 9944 (intmax_t)adp->ad_offset - NDADDR, 9945 (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], 9946 (intmax_t)adp->ad_newblkno); 9947 deplist |= 1 << adp->ad_offset; 9948 if ((adp->ad_state & ATTACHED) == 0) 9949 panic("softdep_write_inodeblock: Unknown state 0x%x", 9950 adp->ad_state); 9951 #endif /* INVARIANTS */ 9952 adp->ad_state &= ~ATTACHED; 9953 adp->ad_state |= UNDONE; 9954 } 9955 /* 9956 * The on-disk inode cannot claim to be any larger than the last 9957 * fragment that has been written. Otherwise, the on-disk inode 9958 * might have fragments that were not the last block in the file 9959 * which would corrupt the filesystem. 9960 */ 9961 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9962 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 9963 if (adp->ad_offset >= NDADDR) 9964 break; 9965 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 9966 /* keep going until hitting a rollback to a frag */ 9967 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 9968 continue; 9969 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 9970 for (i = adp->ad_offset + 1; i < NDADDR; i++) { 9971 #ifdef INVARIANTS 9972 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 9973 panic("softdep_write_inodeblock: lost dep2"); 9974 #endif /* INVARIANTS */ 9975 dp->di_db[i] = 0; 9976 } 9977 for (i = 0; i < NIADDR; i++) { 9978 #ifdef INVARIANTS 9979 if (dp->di_ib[i] != 0 && 9980 (deplist & ((1 << NDADDR) << i)) == 0) 9981 panic("softdep_write_inodeblock: lost dep3"); 9982 #endif /* INVARIANTS */ 9983 dp->di_ib[i] = 0; 9984 } 9985 return; 9986 } 9987 /* 9988 * If we have zero'ed out the last allocated block of the file, 9989 * roll back the size to the last currently allocated block. 9990 * We know that this last allocated block is a full-sized as 9991 * we already checked for fragments in the loop above. 9992 */ 9993 if (lastadp != NULL && 9994 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 9995 for (i = lastadp->ad_offset; i >= 0; i--) 9996 if (dp->di_db[i] != 0) 9997 break; 9998 dp->di_size = (i + 1) * fs->fs_bsize; 9999 } 10000 /* 10001 * The only dependencies are for indirect blocks. 10002 * 10003 * The file size for indirect block additions is not guaranteed. 10004 * Such a guarantee would be non-trivial to achieve. The conventional 10005 * synchronous write implementation also does not make this guarantee. 10006 * Fsck should catch and fix discrepancies. Arguably, the file size 10007 * can be over-estimated without destroying integrity when the file 10008 * moves into the indirect blocks (i.e., is large). If we want to 10009 * postpone fsck, we are stuck with this argument. 
10010 */ 10011 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 10012 dp->di_ib[adp->ad_offset - NDADDR] = 0; 10013 } 10014 10015 /* 10016 * Cancel an indirdep as a result of truncation. Release all of the 10017 * children allocindirs and place their journal work on the appropriate 10018 * list. 10019 */ 10020 static void 10021 cancel_indirdep(indirdep, bp, freeblks) 10022 struct indirdep *indirdep; 10023 struct buf *bp; 10024 struct freeblks *freeblks; 10025 { 10026 struct allocindir *aip; 10027 10028 /* 10029 * None of the indirect pointers will ever be visible, 10030 * so they can simply be tossed. GOINGAWAY ensures 10031 * that allocated pointers will be saved in the buffer 10032 * cache until they are freed. Note that they will 10033 * only be able to be found by their physical address 10034 * since the inode mapping the logical address will 10035 * be gone. The save buffer used for the safe copy 10036 * was allocated in setup_allocindir_phase2 using 10037 * the physical address so it could be used for this 10038 * purpose. Hence we swap the safe copy with the real 10039 * copy, allowing the safe copy to be freed and holding 10040 * on to the real copy for later use in indir_trunc. 10041 */ 10042 if (indirdep->ir_state & GOINGAWAY) 10043 panic("cancel_indirdep: already gone"); 10044 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 10045 indirdep->ir_state |= DEPCOMPLETE; 10046 LIST_REMOVE(indirdep, ir_next); 10047 } 10048 indirdep->ir_state |= GOINGAWAY; 10049 VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1; 10050 /* 10051 * Pass in bp for blocks still have journal writes 10052 * pending so we can cancel them on their own. 10053 */ 10054 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) 10055 cancel_allocindir(aip, bp, freeblks, 0); 10056 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) 10057 cancel_allocindir(aip, NULL, freeblks, 0); 10058 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) 10059 cancel_allocindir(aip, NULL, freeblks, 0); 10060 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0) 10061 cancel_allocindir(aip, NULL, freeblks, 0); 10062 /* 10063 * If there are pending partial truncations we need to keep the 10064 * old block copy around until they complete. This is because 10065 * the current b_data is not a perfect superset of the available 10066 * blocks. 10067 */ 10068 if (TAILQ_EMPTY(&indirdep->ir_trunc)) 10069 bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); 10070 else 10071 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 10072 WORKLIST_REMOVE(&indirdep->ir_list); 10073 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); 10074 indirdep->ir_bp = NULL; 10075 indirdep->ir_freeblks = freeblks; 10076 } 10077 10078 /* 10079 * Free an indirdep once it no longer has new pointers to track. 
10080 */ 10081 static void 10082 free_indirdep(indirdep) 10083 struct indirdep *indirdep; 10084 { 10085 10086 KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc), 10087 ("free_indirdep: Indir trunc list not empty.")); 10088 KASSERT(LIST_EMPTY(&indirdep->ir_completehd), 10089 ("free_indirdep: Complete head not empty.")); 10090 KASSERT(LIST_EMPTY(&indirdep->ir_writehd), 10091 ("free_indirdep: write head not empty.")); 10092 KASSERT(LIST_EMPTY(&indirdep->ir_donehd), 10093 ("free_indirdep: done head not empty.")); 10094 KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), 10095 ("free_indirdep: deplist head not empty.")); 10096 KASSERT((indirdep->ir_state & DEPCOMPLETE), 10097 ("free_indirdep: %p still on newblk list.", indirdep)); 10098 KASSERT(indirdep->ir_saveddata == NULL, 10099 ("free_indirdep: %p still has saved data.", indirdep)); 10100 if (indirdep->ir_state & ONWORKLIST) 10101 WORKLIST_REMOVE(&indirdep->ir_list); 10102 WORKITEM_FREE(indirdep, D_INDIRDEP); 10103 } 10104 10105 /* 10106 * Called before a write to an indirdep. This routine is responsible for 10107 * rolling back pointers to a safe state which includes only those 10108 * allocindirs which have been completed. 10109 */ 10110 static void 10111 initiate_write_indirdep(indirdep, bp) 10112 struct indirdep *indirdep; 10113 struct buf *bp; 10114 { 10115 10116 indirdep->ir_state |= IOSTARTED; 10117 if (indirdep->ir_state & GOINGAWAY) 10118 panic("disk_io_initiation: indirdep gone"); 10119 /* 10120 * If there are no remaining dependencies, this will be writing 10121 * the real pointers. 10122 */ 10123 if (LIST_EMPTY(&indirdep->ir_deplisthd) && 10124 TAILQ_EMPTY(&indirdep->ir_trunc)) 10125 return; 10126 /* 10127 * Replace up-to-date version with safe version. 10128 */ 10129 if (indirdep->ir_saveddata == NULL) { 10130 FREE_LOCK(&lk); 10131 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, 10132 M_SOFTDEP_FLAGS); 10133 ACQUIRE_LOCK(&lk); 10134 } 10135 indirdep->ir_state &= ~ATTACHED; 10136 indirdep->ir_state |= UNDONE; 10137 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 10138 bcopy(indirdep->ir_savebp->b_data, bp->b_data, 10139 bp->b_bcount); 10140 } 10141 10142 /* 10143 * Called when an inode has been cleared in a cg bitmap. This finally 10144 * eliminates any canceled jaddrefs 10145 */ 10146 void 10147 softdep_setup_inofree(mp, bp, ino, wkhd) 10148 struct mount *mp; 10149 struct buf *bp; 10150 ino_t ino; 10151 struct workhead *wkhd; 10152 { 10153 struct worklist *wk, *wkn; 10154 struct inodedep *inodedep; 10155 uint8_t *inosused; 10156 struct cg *cgp; 10157 struct fs *fs; 10158 10159 ACQUIRE_LOCK(&lk); 10160 fs = VFSTOUFS(mp)->um_fs; 10161 cgp = (struct cg *)bp->b_data; 10162 inosused = cg_inosused(cgp); 10163 if (isset(inosused, ino % fs->fs_ipg)) 10164 panic("softdep_setup_inofree: inode %d not freed.", ino); 10165 if (inodedep_lookup(mp, ino, 0, &inodedep)) 10166 panic("softdep_setup_inofree: ino %d has existing inodedep %p", 10167 ino, inodedep); 10168 if (wkhd) { 10169 LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { 10170 if (wk->wk_type != D_JADDREF) 10171 continue; 10172 WORKLIST_REMOVE(wk); 10173 /* 10174 * We can free immediately even if the jaddref 10175 * isn't attached in a background write as now 10176 * the bitmaps are reconciled. 10177 */ 10178 wk->wk_state |= COMPLETE | ATTACHED; 10179 free_jaddref(WK_JADDREF(wk)); 10180 } 10181 jwork_move(&bp->b_dep, wkhd); 10182 } 10183 FREE_LOCK(&lk); 10184 } 10185 10186 10187 /* 10188 * Called via ffs_blkfree() after a set of frags has been cleared from a cg 10189 * map. 
Any dependencies waiting for the write to clear are added to the 10190 * buf's list and any jnewblks that are being canceled are discarded 10191 * immediately. 10192 */ 10193 void 10194 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 10195 struct mount *mp; 10196 struct buf *bp; 10197 ufs2_daddr_t blkno; 10198 int frags; 10199 struct workhead *wkhd; 10200 { 10201 struct bmsafemap *bmsafemap; 10202 struct jnewblk *jnewblk; 10203 struct worklist *wk; 10204 struct fs *fs; 10205 #ifdef SUJ_DEBUG 10206 uint8_t *blksfree; 10207 struct cg *cgp; 10208 ufs2_daddr_t jstart; 10209 ufs2_daddr_t jend; 10210 ufs2_daddr_t end; 10211 long bno; 10212 int i; 10213 #endif 10214 10215 ACQUIRE_LOCK(&lk); 10216 /* Lookup the bmsafemap so we track when it is dirty. */ 10217 fs = VFSTOUFS(mp)->um_fs; 10218 bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno)); 10219 /* 10220 * Detach any jnewblks which have been canceled. They must linger 10221 * until the bitmap is cleared again by ffs_blkfree() to prevent 10222 * an unjournaled allocation from hitting the disk. 10223 */ 10224 if (wkhd) { 10225 while ((wk = LIST_FIRST(wkhd)) != NULL) { 10226 WORKLIST_REMOVE(wk); 10227 if (wk->wk_type != D_JNEWBLK) { 10228 WORKLIST_INSERT(&bmsafemap->sm_freehd, wk); 10229 continue; 10230 } 10231 jnewblk = WK_JNEWBLK(wk); 10232 KASSERT(jnewblk->jn_state & GOINGAWAY, 10233 ("softdep_setup_blkfree: jnewblk not canceled.")); 10234 #ifdef SUJ_DEBUG 10235 /* 10236 * Assert that this block is free in the bitmap 10237 * before we discard the jnewblk. 10238 */ 10239 cgp = (struct cg *)bp->b_data; 10240 blksfree = cg_blksfree(cgp); 10241 bno = dtogd(fs, jnewblk->jn_blkno); 10242 for (i = jnewblk->jn_oldfrags; 10243 i < jnewblk->jn_frags; i++) { 10244 if (isset(blksfree, bno + i)) 10245 continue; 10246 panic("softdep_setup_blkfree: not free"); 10247 } 10248 #endif 10249 /* 10250 * Even if it's not attached we can free immediately 10251 * as the new bitmap is correct. 10252 */ 10253 wk->wk_state |= COMPLETE | ATTACHED; 10254 free_jnewblk(jnewblk); 10255 } 10256 } 10257 10258 #ifdef SUJ_DEBUG 10259 /* 10260 * Assert that we are not freeing a block which has an outstanding 10261 * allocation dependency. 10262 */ 10263 fs = VFSTOUFS(mp)->um_fs; 10264 bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno)); 10265 end = blkno + frags; 10266 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { 10267 /* 10268 * Don't match against blocks that will be freed when the 10269 * background write is done. 10270 */ 10271 if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == 10272 (COMPLETE | DEPCOMPLETE)) 10273 continue; 10274 jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; 10275 jend = jnewblk->jn_blkno + jnewblk->jn_frags; 10276 if ((blkno >= jstart && blkno < jend) || 10277 (end > jstart && end <= jend)) { 10278 printf("state 0x%X %jd - %d %d dep %p\n", 10279 jnewblk->jn_state, jnewblk->jn_blkno, 10280 jnewblk->jn_oldfrags, jnewblk->jn_frags, 10281 jnewblk->jn_dep); 10282 panic("softdep_setup_blkfree: " 10283 "%jd-%jd(%d) overlaps with %jd-%jd", 10284 blkno, end, frags, jstart, jend); 10285 } 10286 } 10287 #endif 10288 FREE_LOCK(&lk); 10289 } 10290 10291 /* 10292 * Revert a block allocation when the journal record that describes it 10293 * is not yet written. 
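 * "Reverting" means marking the newly allocated fragments free again in
 * the cg bitmap about to be written: e.g. with jn_oldfrags == 2 and
 * jn_frags == 8, fragments 2..7 of the block are set in blksfree and the
 * cg_cs/cg_frsum accounting is adjusted, so an allocation whose journal
 * record has not been written is never visible on disk.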
10294 */ 10295 int 10296 jnewblk_rollback(jnewblk, fs, cgp, blksfree) 10297 struct jnewblk *jnewblk; 10298 struct fs *fs; 10299 struct cg *cgp; 10300 uint8_t *blksfree; 10301 { 10302 ufs1_daddr_t fragno; 10303 long cgbno, bbase; 10304 int frags, blk; 10305 int i; 10306 10307 frags = 0; 10308 cgbno = dtogd(fs, jnewblk->jn_blkno); 10309 /* 10310 * We have to test which frags need to be rolled back. We may 10311 * be operating on a stale copy when doing background writes. 10312 */ 10313 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) 10314 if (isclr(blksfree, cgbno + i)) 10315 frags++; 10316 if (frags == 0) 10317 return (0); 10318 /* 10319 * This is mostly ffs_blkfree() sans some validation and 10320 * superblock updates. 10321 */ 10322 if (frags == fs->fs_frag) { 10323 fragno = fragstoblks(fs, cgbno); 10324 ffs_setblock(fs, blksfree, fragno); 10325 ffs_clusteracct(fs, cgp, fragno, 1); 10326 cgp->cg_cs.cs_nbfree++; 10327 } else { 10328 cgbno += jnewblk->jn_oldfrags; 10329 bbase = cgbno - fragnum(fs, cgbno); 10330 /* Decrement the old frags. */ 10331 blk = blkmap(fs, blksfree, bbase); 10332 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 10333 /* Deallocate the fragment */ 10334 for (i = 0; i < frags; i++) 10335 setbit(blksfree, cgbno + i); 10336 cgp->cg_cs.cs_nffree += frags; 10337 /* Add back in counts associated with the new frags */ 10338 blk = blkmap(fs, blksfree, bbase); 10339 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 10340 /* If a complete block has been reassembled, account for it. */ 10341 fragno = fragstoblks(fs, bbase); 10342 if (ffs_isblock(fs, blksfree, fragno)) { 10343 cgp->cg_cs.cs_nffree -= fs->fs_frag; 10344 ffs_clusteracct(fs, cgp, fragno, 1); 10345 cgp->cg_cs.cs_nbfree++; 10346 } 10347 } 10348 stat_jnewblk++; 10349 jnewblk->jn_state &= ~ATTACHED; 10350 jnewblk->jn_state |= UNDONE; 10351 10352 return (frags); 10353 } 10354 10355 static void 10356 initiate_write_bmsafemap(bmsafemap, bp) 10357 struct bmsafemap *bmsafemap; 10358 struct buf *bp; /* The cg block. */ 10359 { 10360 struct jaddref *jaddref; 10361 struct jnewblk *jnewblk; 10362 uint8_t *inosused; 10363 uint8_t *blksfree; 10364 struct cg *cgp; 10365 struct fs *fs; 10366 ino_t ino; 10367 10368 if (bmsafemap->sm_state & IOSTARTED) 10369 panic("initiate_write_bmsafemap: Already started\n"); 10370 bmsafemap->sm_state |= IOSTARTED; 10371 /* 10372 * Clear any inode allocations which are pending journal writes. 10373 */ 10374 if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { 10375 cgp = (struct cg *)bp->b_data; 10376 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 10377 inosused = cg_inosused(cgp); 10378 LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { 10379 ino = jaddref->ja_ino % fs->fs_ipg; 10380 /* 10381 * If this is a background copy the inode may not 10382 * be marked used yet. 10383 */ 10384 if (isset(inosused, ino)) { 10385 if ((jaddref->ja_mode & IFMT) == IFDIR) 10386 cgp->cg_cs.cs_ndir--; 10387 cgp->cg_cs.cs_nifree++; 10388 clrbit(inosused, ino); 10389 jaddref->ja_state &= ~ATTACHED; 10390 jaddref->ja_state |= UNDONE; 10391 stat_jaddref++; 10392 } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) 10393 panic("initiate_write_bmsafemap: inode %d " 10394 "marked free", jaddref->ja_ino); 10395 } 10396 } 10397 /* 10398 * Clear any block allocations which are pending journal writes. 
10399 */ 10400 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 10401 cgp = (struct cg *)bp->b_data; 10402 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 10403 blksfree = cg_blksfree(cgp); 10404 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { 10405 if (jnewblk_rollback(jnewblk, fs, cgp, blksfree)) 10406 continue; 10407 if ((bp->b_xflags & BX_BKGRDMARKER) == 0) 10408 panic("initiate_write_bmsafemap: block %jd " 10409 "marked free", jnewblk->jn_blkno); 10410 } 10411 } 10412 /* 10413 * Move allocation lists to the written lists so they can be 10414 * cleared once the block write is complete. 10415 */ 10416 LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, 10417 inodedep, id_deps); 10418 LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, 10419 newblk, nb_deps); 10420 LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, 10421 wk_list); 10422 } 10423 10424 /* 10425 * This routine is called during the completion interrupt 10426 * service routine for a disk write (from the procedure called 10427 * by the device driver to inform the filesystem caches of 10428 * a request completion). It should be called early in this 10429 * procedure, before the block is made available to other 10430 * processes or other routines are called. 10431 * 10432 */ 10433 static void 10434 softdep_disk_write_complete(bp) 10435 struct buf *bp; /* describes the completed disk write */ 10436 { 10437 struct worklist *wk; 10438 struct worklist *owk; 10439 struct workhead reattach; 10440 struct freeblks *freeblks; 10441 struct buf *sbp; 10442 10443 /* 10444 * If an error occurred while doing the write, then the data 10445 * has not hit the disk and the dependencies cannot be unrolled. 10446 */ 10447 if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) 10448 return; 10449 LIST_INIT(&reattach); 10450 /* 10451 * This lock must not be released anywhere in this code segment. 
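 * Dropping lk here would allow bp->b_dep and the per-dependency state
 * inspected below to change while the LIST_FIRST() loop is consuming
 * the list and building the reattach list.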
10452 */ 10453 sbp = NULL; 10454 owk = NULL; 10455 ACQUIRE_LOCK(&lk); 10456 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 10457 WORKLIST_REMOVE(wk); 10458 dep_write[wk->wk_type]++; 10459 if (wk == owk) 10460 panic("duplicate worklist: %p\n", wk); 10461 owk = wk; 10462 switch (wk->wk_type) { 10463 10464 case D_PAGEDEP: 10465 if (handle_written_filepage(WK_PAGEDEP(wk), bp)) 10466 WORKLIST_INSERT(&reattach, wk); 10467 continue; 10468 10469 case D_INODEDEP: 10470 if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) 10471 WORKLIST_INSERT(&reattach, wk); 10472 continue; 10473 10474 case D_BMSAFEMAP: 10475 if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp)) 10476 WORKLIST_INSERT(&reattach, wk); 10477 continue; 10478 10479 case D_MKDIR: 10480 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 10481 continue; 10482 10483 case D_ALLOCDIRECT: 10484 wk->wk_state |= COMPLETE; 10485 handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); 10486 continue; 10487 10488 case D_ALLOCINDIR: 10489 wk->wk_state |= COMPLETE; 10490 handle_allocindir_partdone(WK_ALLOCINDIR(wk)); 10491 continue; 10492 10493 case D_INDIRDEP: 10494 if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp)) 10495 WORKLIST_INSERT(&reattach, wk); 10496 continue; 10497 10498 case D_FREEBLKS: 10499 wk->wk_state |= COMPLETE; 10500 freeblks = WK_FREEBLKS(wk); 10501 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE && 10502 LIST_EMPTY(&freeblks->fb_jblkdephd)) 10503 add_to_worklist(wk, WK_NODELAY); 10504 continue; 10505 10506 case D_FREEWORK: 10507 handle_written_freework(WK_FREEWORK(wk)); 10508 break; 10509 10510 case D_JSEGDEP: 10511 free_jsegdep(WK_JSEGDEP(wk)); 10512 continue; 10513 10514 case D_JSEG: 10515 handle_written_jseg(WK_JSEG(wk), bp); 10516 continue; 10517 10518 case D_SBDEP: 10519 if (handle_written_sbdep(WK_SBDEP(wk), bp)) 10520 WORKLIST_INSERT(&reattach, wk); 10521 continue; 10522 10523 case D_FREEDEP: 10524 free_freedep(WK_FREEDEP(wk)); 10525 continue; 10526 10527 default: 10528 panic("handle_disk_write_complete: Unknown type %s", 10529 TYPENAME(wk->wk_type)); 10530 /* NOTREACHED */ 10531 } 10532 } 10533 /* 10534 * Reattach any requests that must be redone. 10535 */ 10536 while ((wk = LIST_FIRST(&reattach)) != NULL) { 10537 WORKLIST_REMOVE(wk); 10538 WORKLIST_INSERT(&bp->b_dep, wk); 10539 } 10540 FREE_LOCK(&lk); 10541 if (sbp) 10542 brelse(sbp); 10543 } 10544 10545 /* 10546 * Called from within softdep_disk_write_complete above. Note that 10547 * this routine is always called from interrupt level with further 10548 * splbio interrupts blocked. 10549 */ 10550 static void 10551 handle_allocdirect_partdone(adp, wkhd) 10552 struct allocdirect *adp; /* the completed allocdirect */ 10553 struct workhead *wkhd; /* Work to do when inode is writtne. */ 10554 { 10555 struct allocdirectlst *listhead; 10556 struct allocdirect *listadp; 10557 struct inodedep *inodedep; 10558 long bsize; 10559 10560 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 10561 return; 10562 /* 10563 * The on-disk inode cannot claim to be any larger than the last 10564 * fragment that has been written. Otherwise, the on-disk inode 10565 * might have fragments that were not the last block in the file 10566 * which would corrupt the filesystem. Thus, we cannot free any 10567 * allocdirects after one whose ad_oldblkno claims a fragment as 10568 * these blocks must be rolled back to zero before writing the inode. 10569 * We check the currently active set of allocdirects in id_inoupdt 10570 * or id_extupdt as appropriate. 
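 * For example, if id_inoupdt holds allocdirects for lbns 3, 4 and 5 and
 * lbn 4's ad_oldsize is a fragment, a completed write for lbn 5 cannot
 * be processed yet: lbn 4's rollback forces the later block pointers to
 * be zeroed in the on-disk inode, so lbn 5's allocdirect stays queued
 * until lbn 4 completes as well.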
10571 */ 10572 inodedep = adp->ad_inodedep; 10573 bsize = inodedep->id_fs->fs_bsize; 10574 if (adp->ad_state & EXTDATA) 10575 listhead = &inodedep->id_extupdt; 10576 else 10577 listhead = &inodedep->id_inoupdt; 10578 TAILQ_FOREACH(listadp, listhead, ad_next) { 10579 /* found our block */ 10580 if (listadp == adp) 10581 break; 10582 /* continue if ad_oldlbn is not a fragment */ 10583 if (listadp->ad_oldsize == 0 || 10584 listadp->ad_oldsize == bsize) 10585 continue; 10586 /* hit a fragment */ 10587 return; 10588 } 10589 /* 10590 * If we have reached the end of the current list without 10591 * finding the just finished dependency, then it must be 10592 * on the future dependency list. Future dependencies cannot 10593 * be freed until they are moved to the current list. 10594 */ 10595 if (listadp == NULL) { 10596 #ifdef DEBUG 10597 if (adp->ad_state & EXTDATA) 10598 listhead = &inodedep->id_newextupdt; 10599 else 10600 listhead = &inodedep->id_newinoupdt; 10601 TAILQ_FOREACH(listadp, listhead, ad_next) 10602 /* found our block */ 10603 if (listadp == adp) 10604 break; 10605 if (listadp == NULL) 10606 panic("handle_allocdirect_partdone: lost dep"); 10607 #endif /* DEBUG */ 10608 return; 10609 } 10610 /* 10611 * If we have found the just finished dependency, then queue 10612 * it along with anything that follows it that is complete. 10613 * Since the pointer has not yet been written in the inode 10614 * as the dependency prevents it, place the allocdirect on the 10615 * bufwait list where it will be freed once the pointer is 10616 * valid. 10617 */ 10618 if (wkhd == NULL) 10619 wkhd = &inodedep->id_bufwait; 10620 for (; adp; adp = listadp) { 10621 listadp = TAILQ_NEXT(adp, ad_next); 10622 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 10623 return; 10624 TAILQ_REMOVE(listhead, adp, ad_next); 10625 WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list); 10626 } 10627 } 10628 10629 /* 10630 * Called from within softdep_disk_write_complete above. This routine 10631 * completes successfully written allocindirs. 10632 */ 10633 static void 10634 handle_allocindir_partdone(aip) 10635 struct allocindir *aip; /* the completed allocindir */ 10636 { 10637 struct indirdep *indirdep; 10638 10639 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) 10640 return; 10641 indirdep = aip->ai_indirdep; 10642 LIST_REMOVE(aip, ai_next); 10643 /* 10644 * Don't set a pointer while the buffer is undergoing IO or while 10645 * we have active truncations. 10646 */ 10647 if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) { 10648 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); 10649 return; 10650 } 10651 if (indirdep->ir_state & UFS1FMT) 10652 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 10653 aip->ai_newblkno; 10654 else 10655 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 10656 aip->ai_newblkno; 10657 /* 10658 * Await the pointer write before freeing the allocindir. 10659 */ 10660 LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); 10661 } 10662 10663 /* 10664 * Release segments held on a jwork list. 
10665 */ 10666 static void 10667 handle_jwork(wkhd) 10668 struct workhead *wkhd; 10669 { 10670 struct worklist *wk; 10671 10672 while ((wk = LIST_FIRST(wkhd)) != NULL) { 10673 WORKLIST_REMOVE(wk); 10674 switch (wk->wk_type) { 10675 case D_JSEGDEP: 10676 free_jsegdep(WK_JSEGDEP(wk)); 10677 continue; 10678 case D_FREEDEP: 10679 free_freedep(WK_FREEDEP(wk)); 10680 continue; 10681 case D_FREEFRAG: 10682 rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep)); 10683 WORKITEM_FREE(wk, D_FREEFRAG); continue; 10684 case D_FREEWORK: 10685 handle_written_freework(WK_FREEWORK(wk)); 10686 continue; 10687 default: 10688 panic("handle_jwork: Unknown type %s\n", 10689 TYPENAME(wk->wk_type)); 10690 } 10691 } 10692 } 10693 10694 /* 10695 * Handle the bufwait list on an inode when it is safe to release items 10696 * held there. This normally happens after an inode block is written but 10697 * may be delayed and handled later if there are pending journal items that 10698 * are not yet safe to be released. 10699 */ 10700 static struct freefile * 10701 handle_bufwait(inodedep, refhd) 10702 struct inodedep *inodedep; 10703 struct workhead *refhd; 10704 { 10705 struct jaddref *jaddref; 10706 struct freefile *freefile; 10707 struct worklist *wk; 10708 10709 freefile = NULL; 10710 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { 10711 WORKLIST_REMOVE(wk); 10712 switch (wk->wk_type) { 10713 case D_FREEFILE: 10714 /* 10715 * We defer adding freefile to the worklist 10716 * until all other additions have been made to 10717 * ensure that it will be done after all the 10718 * old blocks have been freed. 10719 */ 10720 if (freefile != NULL) 10721 panic("handle_bufwait: freefile"); 10722 freefile = WK_FREEFILE(wk); 10723 continue; 10724 10725 case D_MKDIR: 10726 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); 10727 continue; 10728 10729 case D_DIRADD: 10730 diradd_inode_written(WK_DIRADD(wk), inodedep); 10731 continue; 10732 10733 case D_FREEFRAG: 10734 wk->wk_state |= COMPLETE; 10735 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) 10736 add_to_worklist(wk, 0); 10737 continue; 10738 10739 case D_DIRREM: 10740 wk->wk_state |= COMPLETE; 10741 add_to_worklist(wk, 0); 10742 continue; 10743 10744 case D_ALLOCDIRECT: 10745 case D_ALLOCINDIR: 10746 free_newblk(WK_NEWBLK(wk)); 10747 continue; 10748 10749 case D_JNEWBLK: 10750 wk->wk_state |= COMPLETE; 10751 free_jnewblk(WK_JNEWBLK(wk)); 10752 continue; 10753 10754 /* 10755 * Save freed journal segments and add references on 10756 * the supplied list which will delay their release 10757 * until the cg bitmap is cleared on disk. 10758 */ 10759 case D_JSEGDEP: 10760 if (refhd == NULL) 10761 free_jsegdep(WK_JSEGDEP(wk)); 10762 else 10763 WORKLIST_INSERT(refhd, wk); 10764 continue; 10765 10766 case D_JADDREF: 10767 jaddref = WK_JADDREF(wk); 10768 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, 10769 if_deps); 10770 /* 10771 * Transfer any jaddrefs to the list to be freed with 10772 * the bitmap if we're handling a removed file. 10773 */ 10774 if (refhd == NULL) { 10775 wk->wk_state |= COMPLETE; 10776 free_jaddref(jaddref); 10777 } else 10778 WORKLIST_INSERT(refhd, wk); 10779 continue; 10780 10781 default: 10782 panic("handle_bufwait: Unknown type %p(%s)", 10783 wk, TYPENAME(wk->wk_type)); 10784 /* NOTREACHED */ 10785 } 10786 } 10787 return (freefile); 10788 } 10789 /* 10790 * Called from within softdep_disk_write_complete above to restore 10791 * in-memory inode block contents to their most up-to-date state.
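 *
 * The first step is simply to find the on-disk inode inside the buffer:
 * an inode block holds INOPB(fs) dinodes, so the one of interest sits at
 * index ino_to_fsbo(fs, ino), i.e. ino % INOPB(fs).  For example (numbers
 * are illustrative), with 16K blocks and 256-byte UFS2 dinodes INOPB(fs)
 * is 64, so inode 1234 occupies slot 1234 % 64 == 18 of its block:
 *
 *	dp2 = (struct ufs2_dinode *)bp->b_data + 18;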
Note 10792 * that this routine is always called from interrupt level with further 10793 * splbio interrupts blocked. 10794 */ 10795 static int 10796 handle_written_inodeblock(inodedep, bp) 10797 struct inodedep *inodedep; 10798 struct buf *bp; /* buffer containing the inode block */ 10799 { 10800 struct freefile *freefile; 10801 struct allocdirect *adp, *nextadp; 10802 struct ufs1_dinode *dp1 = NULL; 10803 struct ufs2_dinode *dp2 = NULL; 10804 struct workhead wkhd; 10805 int hadchanges, fstype; 10806 ino_t freelink; 10807 10808 LIST_INIT(&wkhd); 10809 hadchanges = 0; 10810 freefile = NULL; 10811 if ((inodedep->id_state & IOSTARTED) == 0) 10812 panic("handle_written_inodeblock: not started"); 10813 inodedep->id_state &= ~IOSTARTED; 10814 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { 10815 fstype = UFS1; 10816 dp1 = (struct ufs1_dinode *)bp->b_data + 10817 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 10818 freelink = dp1->di_freelink; 10819 } else { 10820 fstype = UFS2; 10821 dp2 = (struct ufs2_dinode *)bp->b_data + 10822 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 10823 freelink = dp2->di_freelink; 10824 } 10825 /* 10826 * If we wrote a valid freelink pointer during the last write 10827 * record it here. 10828 */ 10829 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 10830 struct inodedep *inon; 10831 10832 inon = TAILQ_NEXT(inodedep, id_unlinked); 10833 if ((inon == NULL && freelink == 0) || 10834 (inon && inon->id_ino == freelink)) { 10835 if (inon) 10836 inon->id_state |= UNLINKPREV; 10837 inodedep->id_state |= UNLINKNEXT; 10838 } else 10839 hadchanges = 1; 10840 } 10841 /* Leave this inodeblock dirty until it's in the list. */ 10842 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) 10843 hadchanges = 1; 10844 /* 10845 * If we had to rollback the inode allocation because of 10846 * bitmaps being incomplete, then simply restore it. 10847 * Keep the block dirty so that it will not be reclaimed until 10848 * all associated dependencies have been cleared and the 10849 * corresponding updates written to disk. 10850 */ 10851 if (inodedep->id_savedino1 != NULL) { 10852 hadchanges = 1; 10853 if (fstype == UFS1) 10854 *dp1 = *inodedep->id_savedino1; 10855 else 10856 *dp2 = *inodedep->id_savedino2; 10857 free(inodedep->id_savedino1, M_SAVEDINO); 10858 inodedep->id_savedino1 = NULL; 10859 if ((bp->b_flags & B_DELWRI) == 0) 10860 stat_inode_bitmap++; 10861 bdirty(bp); 10862 /* 10863 * If the inode is clear here and GOINGAWAY it will never 10864 * be written. Process the bufwait and clear any pending 10865 * work which may include the freefile. 10866 */ 10867 if (inodedep->id_state & GOINGAWAY) 10868 goto bufwait; 10869 return (1); 10870 } 10871 inodedep->id_state |= COMPLETE; 10872 /* 10873 * Roll forward anything that had to be rolled back before 10874 * the inode could be updated. 
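 *
 * Each allocdirect carries both block numbers, so the rollback done when
 * the write was started and the roll-forward done below are mirror images
 * of one another (direct pointers shown; indirect and ext pointers are
 * handled the same way):
 *
 *	dp->di_db[adp->ad_offset] = adp->ad_oldblkno;	// at write initiation
 *	dp->di_db[adp->ad_offset] = adp->ad_newblkno;	// here, at completion
 *
 * which is why the loops below insist that the buffer still contains
 * ad_oldblkno before installing ad_newblkno.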
10875 */ 10876 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { 10877 nextadp = TAILQ_NEXT(adp, ad_next); 10878 if (adp->ad_state & ATTACHED) 10879 panic("handle_written_inodeblock: new entry"); 10880 if (fstype == UFS1) { 10881 if (adp->ad_offset < NDADDR) { 10882 if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) 10883 panic("%s %s #%jd mismatch %d != %jd", 10884 "handle_written_inodeblock:", 10885 "direct pointer", 10886 (intmax_t)adp->ad_offset, 10887 dp1->di_db[adp->ad_offset], 10888 (intmax_t)adp->ad_oldblkno); 10889 dp1->di_db[adp->ad_offset] = adp->ad_newblkno; 10890 } else { 10891 if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) 10892 panic("%s: %s #%jd allocated as %d", 10893 "handle_written_inodeblock", 10894 "indirect pointer", 10895 (intmax_t)adp->ad_offset - NDADDR, 10896 dp1->di_ib[adp->ad_offset - NDADDR]); 10897 dp1->di_ib[adp->ad_offset - NDADDR] = 10898 adp->ad_newblkno; 10899 } 10900 } else { 10901 if (adp->ad_offset < NDADDR) { 10902 if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) 10903 panic("%s: %s #%jd %s %jd != %jd", 10904 "handle_written_inodeblock", 10905 "direct pointer", 10906 (intmax_t)adp->ad_offset, "mismatch", 10907 (intmax_t)dp2->di_db[adp->ad_offset], 10908 (intmax_t)adp->ad_oldblkno); 10909 dp2->di_db[adp->ad_offset] = adp->ad_newblkno; 10910 } else { 10911 if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) 10912 panic("%s: %s #%jd allocated as %jd", 10913 "handle_written_inodeblock", 10914 "indirect pointer", 10915 (intmax_t)adp->ad_offset - NDADDR, 10916 (intmax_t) 10917 dp2->di_ib[adp->ad_offset - NDADDR]); 10918 dp2->di_ib[adp->ad_offset - NDADDR] = 10919 adp->ad_newblkno; 10920 } 10921 } 10922 adp->ad_state &= ~UNDONE; 10923 adp->ad_state |= ATTACHED; 10924 hadchanges = 1; 10925 } 10926 for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { 10927 nextadp = TAILQ_NEXT(adp, ad_next); 10928 if (adp->ad_state & ATTACHED) 10929 panic("handle_written_inodeblock: new entry"); 10930 if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) 10931 panic("%s: direct pointers #%jd %s %jd != %jd", 10932 "handle_written_inodeblock", 10933 (intmax_t)adp->ad_offset, "mismatch", 10934 (intmax_t)dp2->di_extb[adp->ad_offset], 10935 (intmax_t)adp->ad_oldblkno); 10936 dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; 10937 adp->ad_state &= ~UNDONE; 10938 adp->ad_state |= ATTACHED; 10939 hadchanges = 1; 10940 } 10941 if (hadchanges && (bp->b_flags & B_DELWRI) == 0) 10942 stat_direct_blk_ptrs++; 10943 /* 10944 * Reset the file size to its most up-to-date value. 
10945 */ 10946 if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) 10947 panic("handle_written_inodeblock: bad size"); 10948 if (inodedep->id_savednlink > LINK_MAX) 10949 panic("handle_written_inodeblock: Invalid link count " 10950 "%d for inodedep %p", inodedep->id_savednlink, inodedep); 10951 if (fstype == UFS1) { 10952 if (dp1->di_nlink != inodedep->id_savednlink) { 10953 dp1->di_nlink = inodedep->id_savednlink; 10954 hadchanges = 1; 10955 } 10956 if (dp1->di_size != inodedep->id_savedsize) { 10957 dp1->di_size = inodedep->id_savedsize; 10958 hadchanges = 1; 10959 } 10960 } else { 10961 if (dp2->di_nlink != inodedep->id_savednlink) { 10962 dp2->di_nlink = inodedep->id_savednlink; 10963 hadchanges = 1; 10964 } 10965 if (dp2->di_size != inodedep->id_savedsize) { 10966 dp2->di_size = inodedep->id_savedsize; 10967 hadchanges = 1; 10968 } 10969 if (dp2->di_extsize != inodedep->id_savedextsize) { 10970 dp2->di_extsize = inodedep->id_savedextsize; 10971 hadchanges = 1; 10972 } 10973 } 10974 inodedep->id_savedsize = -1; 10975 inodedep->id_savedextsize = -1; 10976 inodedep->id_savednlink = -1; 10977 /* 10978 * If there were any rollbacks in the inode block, then it must be 10979 * marked dirty so that its will eventually get written back in 10980 * its correct form. 10981 */ 10982 if (hadchanges) 10983 bdirty(bp); 10984 bufwait: 10985 /* 10986 * Process any allocdirects that completed during the update. 10987 */ 10988 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) 10989 handle_allocdirect_partdone(adp, &wkhd); 10990 if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) 10991 handle_allocdirect_partdone(adp, &wkhd); 10992 /* 10993 * Process deallocations that were held pending until the 10994 * inode had been written to disk. Freeing of the inode 10995 * is delayed until after all blocks have been freed to 10996 * avoid creation of new <vfsid, inum, lbn> triples 10997 * before the old ones have been deleted. Completely 10998 * unlinked inodes are not processed until the unlinked 10999 * inode list is written or the last reference is removed. 11000 */ 11001 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) { 11002 freefile = handle_bufwait(inodedep, NULL); 11003 if (freefile && !LIST_EMPTY(&wkhd)) { 11004 WORKLIST_INSERT(&wkhd, &freefile->fx_list); 11005 freefile = NULL; 11006 } 11007 } 11008 /* 11009 * Move rolled forward dependency completions to the bufwait list 11010 * now that those that were already written have been processed. 11011 */ 11012 if (!LIST_EMPTY(&wkhd) && hadchanges == 0) 11013 panic("handle_written_inodeblock: bufwait but no changes"); 11014 jwork_move(&inodedep->id_bufwait, &wkhd); 11015 11016 if (freefile != NULL) { 11017 /* 11018 * If the inode is goingaway it was never written. Fake up 11019 * the state here so free_inodedep() can succeed. 11020 */ 11021 if (inodedep->id_state & GOINGAWAY) 11022 inodedep->id_state |= COMPLETE | DEPCOMPLETE; 11023 if (free_inodedep(inodedep) == 0) 11024 panic("handle_written_inodeblock: live inodedep %p", 11025 inodedep); 11026 add_to_worklist(&freefile->fx_list, 0); 11027 return (0); 11028 } 11029 11030 /* 11031 * If no outstanding dependencies, free it. 
11032 */ 11033 if (free_inodedep(inodedep) || 11034 (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 && 11035 TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && 11036 TAILQ_FIRST(&inodedep->id_extupdt) == 0 && 11037 LIST_FIRST(&inodedep->id_bufwait) == 0)) 11038 return (0); 11039 return (hadchanges); 11040 } 11041 11042 static int 11043 handle_written_indirdep(indirdep, bp, bpp) 11044 struct indirdep *indirdep; 11045 struct buf *bp; 11046 struct buf **bpp; 11047 { 11048 struct allocindir *aip; 11049 struct buf *sbp; 11050 int chgs; 11051 11052 if (indirdep->ir_state & GOINGAWAY) 11053 panic("handle_written_indirdep: indirdep gone"); 11054 if ((indirdep->ir_state & IOSTARTED) == 0) 11055 panic("handle_written_indirdep: IO not started"); 11056 chgs = 0; 11057 /* 11058 * If there were rollbacks revert them here. 11059 */ 11060 if (indirdep->ir_saveddata) { 11061 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); 11062 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 11063 free(indirdep->ir_saveddata, M_INDIRDEP); 11064 indirdep->ir_saveddata = NULL; 11065 } 11066 chgs = 1; 11067 } 11068 indirdep->ir_state &= ~(UNDONE | IOSTARTED); 11069 indirdep->ir_state |= ATTACHED; 11070 /* 11071 * Move allocindirs with written pointers to the completehd if 11072 * the indirdep's pointer is not yet written. Otherwise 11073 * free them here. 11074 */ 11075 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) { 11076 LIST_REMOVE(aip, ai_next); 11077 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 11078 LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, 11079 ai_next); 11080 newblk_freefrag(&aip->ai_block); 11081 continue; 11082 } 11083 free_newblk(&aip->ai_block); 11084 } 11085 /* 11086 * Move allocindirs that have finished dependency processing from 11087 * the done list to the write list after updating the pointers. 11088 */ 11089 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 11090 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { 11091 handle_allocindir_partdone(aip); 11092 if (aip == LIST_FIRST(&indirdep->ir_donehd)) 11093 panic("disk_write_complete: not gone"); 11094 chgs = 1; 11095 } 11096 } 11097 /* 11098 * Preserve the indirdep if there were any changes or if it is not 11099 * yet valid on disk. 11100 */ 11101 if (chgs) { 11102 stat_indir_blk_ptrs++; 11103 bdirty(bp); 11104 return (1); 11105 } 11106 /* 11107 * If there were no changes we can discard the savedbp and detach 11108 * ourselves from the buf. We are only carrying completed pointers 11109 * in this case. 11110 */ 11111 sbp = indirdep->ir_savebp; 11112 sbp->b_flags |= B_INVAL | B_NOCACHE; 11113 indirdep->ir_savebp = NULL; 11114 indirdep->ir_bp = NULL; 11115 if (*bpp != NULL) 11116 panic("handle_written_indirdep: bp already exists."); 11117 *bpp = sbp; 11118 /* 11119 * The indirdep may not be freed until its parent points at it. 11120 */ 11121 if (indirdep->ir_state & DEPCOMPLETE) 11122 free_indirdep(indirdep); 11123 11124 return (0); 11125 } 11126 11127 /* 11128 * Process a diradd entry after its dependent inode has been written. 11129 * This routine must be called with splbio interrupts blocked. 11130 */ 11131 static void 11132 diradd_inode_written(dap, inodedep) 11133 struct diradd *dap; 11134 struct inodedep *inodedep; 11135 { 11136 11137 dap->da_state |= COMPLETE; 11138 complete_diradd(dap); 11139 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 11140 } 11141 11142 /* 11143 * Returns true if the bmsafemap will have rollbacks when written. Must 11144 * only be called with lk and the buf lock on the cg held. 
11145 */ 11146 static int 11147 bmsafemap_rollbacks(bmsafemap) 11148 struct bmsafemap *bmsafemap; 11149 { 11150 11151 return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | 11152 !LIST_EMPTY(&bmsafemap->sm_jnewblkhd)); 11153 } 11154 11155 /* 11156 * Re-apply an allocation when a cg write is complete. 11157 */ 11158 static int 11159 jnewblk_rollforward(jnewblk, fs, cgp, blksfree) 11160 struct jnewblk *jnewblk; 11161 struct fs *fs; 11162 struct cg *cgp; 11163 uint8_t *blksfree; 11164 { 11165 ufs1_daddr_t fragno; 11166 ufs2_daddr_t blkno; 11167 long cgbno, bbase; 11168 int frags, blk; 11169 int i; 11170 11171 frags = 0; 11172 cgbno = dtogd(fs, jnewblk->jn_blkno); 11173 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { 11174 if (isclr(blksfree, cgbno + i)) 11175 panic("jnewblk_rollforward: re-allocated fragment"); 11176 frags++; 11177 } 11178 if (frags == fs->fs_frag) { 11179 blkno = fragstoblks(fs, cgbno); 11180 ffs_clrblock(fs, blksfree, (long)blkno); 11181 ffs_clusteracct(fs, cgp, blkno, -1); 11182 cgp->cg_cs.cs_nbfree--; 11183 } else { 11184 bbase = cgbno - fragnum(fs, cgbno); 11185 cgbno += jnewblk->jn_oldfrags; 11186 /* If a complete block had been reassembled, account for it. */ 11187 fragno = fragstoblks(fs, bbase); 11188 if (ffs_isblock(fs, blksfree, fragno)) { 11189 cgp->cg_cs.cs_nffree += fs->fs_frag; 11190 ffs_clusteracct(fs, cgp, fragno, -1); 11191 cgp->cg_cs.cs_nbfree--; 11192 } 11193 /* Decrement the old frags. */ 11194 blk = blkmap(fs, blksfree, bbase); 11195 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 11196 /* Allocate the fragment */ 11197 for (i = 0; i < frags; i++) 11198 clrbit(blksfree, cgbno + i); 11199 cgp->cg_cs.cs_nffree -= frags; 11200 /* Add back in counts associated with the new frags */ 11201 blk = blkmap(fs, blksfree, bbase); 11202 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 11203 } 11204 return (frags); 11205 } 11206 11207 /* 11208 * Complete a write to a bmsafemap structure. Roll forward any bitmap 11209 * changes if it's not a background write. Set all written dependencies 11210 * to DEPCOMPLETE and free the structure if possible. 11211 */ 11212 static int 11213 handle_written_bmsafemap(bmsafemap, bp) 11214 struct bmsafemap *bmsafemap; 11215 struct buf *bp; 11216 { 11217 struct newblk *newblk; 11218 struct inodedep *inodedep; 11219 struct jaddref *jaddref, *jatmp; 11220 struct jnewblk *jnewblk, *jntmp; 11221 struct ufsmount *ump; 11222 uint8_t *inosused; 11223 uint8_t *blksfree; 11224 struct cg *cgp; 11225 struct fs *fs; 11226 ino_t ino; 11227 int chgs; 11228 11229 if ((bmsafemap->sm_state & IOSTARTED) == 0) 11230 panic("initiate_write_bmsafemap: Not started\n"); 11231 ump = VFSTOUFS(bmsafemap->sm_list.wk_mp); 11232 chgs = 0; 11233 bmsafemap->sm_state &= ~IOSTARTED; 11234 /* 11235 * Release journal work that was waiting on the write. 11236 */ 11237 handle_jwork(&bmsafemap->sm_freewr); 11238 11239 /* 11240 * Restore unwritten inode allocation pending jaddref writes. 
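 *
 * The in-use bitmap is indexed by the inode number relative to its
 * cylinder group, so the restore is just (values illustrative):
 *
 *	// with fs_ipg 32768, inode 70000 lives in cg 2, bit 70000 % 32768 == 4464
 *	setbit(inosused, jaddref->ja_ino % fs->fs_ipg);
 *	cgp->cg_cs.cs_nifree--;
 *
 * together with the cs_ndir adjustment for directories, exactly as the
 * loop below does for each undone jaddref.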
11241 */ 11242 if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { 11243 cgp = (struct cg *)bp->b_data; 11244 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 11245 inosused = cg_inosused(cgp); 11246 LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, 11247 ja_bmdeps, jatmp) { 11248 if ((jaddref->ja_state & UNDONE) == 0) 11249 continue; 11250 ino = jaddref->ja_ino % fs->fs_ipg; 11251 if (isset(inosused, ino)) 11252 panic("handle_written_bmsafemap: " 11253 "re-allocated inode"); 11254 if ((bp->b_xflags & BX_BKGRDMARKER) == 0) { 11255 if ((jaddref->ja_mode & IFMT) == IFDIR) 11256 cgp->cg_cs.cs_ndir++; 11257 cgp->cg_cs.cs_nifree--; 11258 setbit(inosused, ino); 11259 chgs = 1; 11260 } 11261 jaddref->ja_state &= ~UNDONE; 11262 jaddref->ja_state |= ATTACHED; 11263 free_jaddref(jaddref); 11264 } 11265 } 11266 /* 11267 * Restore any block allocations which are pending journal writes. 11268 */ 11269 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 11270 cgp = (struct cg *)bp->b_data; 11271 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 11272 blksfree = cg_blksfree(cgp); 11273 LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, 11274 jntmp) { 11275 if ((jnewblk->jn_state & UNDONE) == 0) 11276 continue; 11277 if ((bp->b_xflags & BX_BKGRDMARKER) == 0 && 11278 jnewblk_rollforward(jnewblk, fs, cgp, blksfree)) 11279 chgs = 1; 11280 jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); 11281 jnewblk->jn_state |= ATTACHED; 11282 free_jnewblk(jnewblk); 11283 } 11284 } 11285 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { 11286 newblk->nb_state |= DEPCOMPLETE; 11287 newblk->nb_state &= ~ONDEPLIST; 11288 newblk->nb_bmsafemap = NULL; 11289 LIST_REMOVE(newblk, nb_deps); 11290 if (newblk->nb_list.wk_type == D_ALLOCDIRECT) 11291 handle_allocdirect_partdone( 11292 WK_ALLOCDIRECT(&newblk->nb_list), NULL); 11293 else if (newblk->nb_list.wk_type == D_ALLOCINDIR) 11294 handle_allocindir_partdone( 11295 WK_ALLOCINDIR(&newblk->nb_list)); 11296 else if (newblk->nb_list.wk_type != D_NEWBLK) 11297 panic("handle_written_bmsafemap: Unexpected type: %s", 11298 TYPENAME(newblk->nb_list.wk_type)); 11299 } 11300 while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { 11301 inodedep->id_state |= DEPCOMPLETE; 11302 inodedep->id_state &= ~ONDEPLIST; 11303 LIST_REMOVE(inodedep, id_deps); 11304 inodedep->id_bmsafemap = NULL; 11305 } 11306 LIST_REMOVE(bmsafemap, sm_next); 11307 if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && 11308 LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && 11309 LIST_EMPTY(&bmsafemap->sm_newblkhd) && 11310 LIST_EMPTY(&bmsafemap->sm_inodedephd) && 11311 LIST_EMPTY(&bmsafemap->sm_freehd)) { 11312 LIST_REMOVE(bmsafemap, sm_hash); 11313 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 11314 return (0); 11315 } 11316 LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); 11317 bdirty(bp); 11318 return (1); 11319 } 11320 11321 /* 11322 * Try to free a mkdir dependency. 11323 */ 11324 static void 11325 complete_mkdir(mkdir) 11326 struct mkdir *mkdir; 11327 { 11328 struct diradd *dap; 11329 11330 if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) 11331 return; 11332 LIST_REMOVE(mkdir, md_mkdirs); 11333 dap = mkdir->md_diradd; 11334 dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 11335 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { 11336 dap->da_state |= DEPCOMPLETE; 11337 complete_diradd(dap); 11338 } 11339 WORKITEM_FREE(mkdir, D_MKDIR); 11340 } 11341 11342 /* 11343 * Handle the completion of a mkdir dependency. 
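 *
 * A new directory completes in two independent halves, each tracked by its
 * own mkdir structure: MKDIR_BODY is cleared when the directory's first
 * data block (holding "." and "..") reaches the disk (see
 * softdep_disk_write_complete above), and MKDIR_PARENT is cleared when the
 * parent's inode block, carrying its increased link count, is written (see
 * handle_bufwait above).  Only after complete_mkdir() has removed both bits
 * from the diradd does the new name become eligible to go to disk.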
11344 */ 11345 static void 11346 handle_written_mkdir(mkdir, type) 11347 struct mkdir *mkdir; 11348 int type; 11349 { 11350 11351 if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) 11352 panic("handle_written_mkdir: bad type"); 11353 mkdir->md_state |= COMPLETE; 11354 complete_mkdir(mkdir); 11355 } 11356 11357 static int 11358 free_pagedep(pagedep) 11359 struct pagedep *pagedep; 11360 { 11361 int i; 11362 11363 if (pagedep->pd_state & NEWBLOCK) 11364 return (0); 11365 if (!LIST_EMPTY(&pagedep->pd_dirremhd)) 11366 return (0); 11367 for (i = 0; i < DAHASHSZ; i++) 11368 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) 11369 return (0); 11370 if (!LIST_EMPTY(&pagedep->pd_pendinghd)) 11371 return (0); 11372 if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) 11373 return (0); 11374 if (pagedep->pd_state & ONWORKLIST) 11375 WORKLIST_REMOVE(&pagedep->pd_list); 11376 LIST_REMOVE(pagedep, pd_hash); 11377 WORKITEM_FREE(pagedep, D_PAGEDEP); 11378 11379 return (1); 11380 } 11381 11382 /* 11383 * Called from within softdep_disk_write_complete above. 11384 * A write operation was just completed. Removed inodes can 11385 * now be freed and associated block pointers may be committed. 11386 * Note that this routine is always called from interrupt level 11387 * with further splbio interrupts blocked. 11388 */ 11389 static int 11390 handle_written_filepage(pagedep, bp) 11391 struct pagedep *pagedep; 11392 struct buf *bp; /* buffer containing the written page */ 11393 { 11394 struct dirrem *dirrem; 11395 struct diradd *dap, *nextdap; 11396 struct direct *ep; 11397 int i, chgs; 11398 11399 if ((pagedep->pd_state & IOSTARTED) == 0) 11400 panic("handle_written_filepage: not started"); 11401 pagedep->pd_state &= ~IOSTARTED; 11402 /* 11403 * Process any directory removals that have been committed. 11404 */ 11405 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { 11406 LIST_REMOVE(dirrem, dm_next); 11407 dirrem->dm_state |= COMPLETE; 11408 dirrem->dm_dirinum = pagedep->pd_ino; 11409 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), 11410 ("handle_written_filepage: Journal entries not written.")); 11411 add_to_worklist(&dirrem->dm_list, 0); 11412 } 11413 /* 11414 * Free any directory additions that have been committed. 11415 * If it is a newly allocated block, we have to wait until 11416 * the on-disk directory inode claims the new block. 11417 */ 11418 if ((pagedep->pd_state & NEWBLOCK) == 0) 11419 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 11420 free_diradd(dap, NULL); 11421 /* 11422 * Uncommitted directory entries must be restored. 11423 */ 11424 for (chgs = 0, i = 0; i < DAHASHSZ; i++) { 11425 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; 11426 dap = nextdap) { 11427 nextdap = LIST_NEXT(dap, da_pdlist); 11428 if (dap->da_state & ATTACHED) 11429 panic("handle_written_filepage: attached"); 11430 ep = (struct direct *) 11431 ((char *)bp->b_data + dap->da_offset); 11432 ep->d_ino = dap->da_newinum; 11433 dap->da_state &= ~UNDONE; 11434 dap->da_state |= ATTACHED; 11435 chgs = 1; 11436 /* 11437 * If the inode referenced by the directory has 11438 * been written out, then the dependency can be 11439 * moved to the pending list. 11440 */ 11441 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 11442 LIST_REMOVE(dap, da_pdlist); 11443 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, 11444 da_pdlist); 11445 } 11446 } 11447 } 11448 /* 11449 * If there were any rollbacks in the directory, then it must be 11450 * marked dirty so that its will eventually get written back in 11451 * its correct form. 
11452 */ 11453 if (chgs) { 11454 if ((bp->b_flags & B_DELWRI) == 0) 11455 stat_dir_entry++; 11456 bdirty(bp); 11457 return (1); 11458 } 11459 /* 11460 * If we are not waiting for a new directory block to be 11461 * claimed by its inode, then the pagedep will be freed. 11462 * Otherwise it will remain to track any new entries on 11463 * the page in case they are fsync'ed. 11464 */ 11465 free_pagedep(pagedep); 11466 return (0); 11467 } 11468 11469 /* 11470 * Writing back in-core inode structures. 11471 * 11472 * The filesystem only accesses an inode's contents when it occupies an 11473 * "in-core" inode structure. These "in-core" structures are separate from 11474 * the page frames used to cache inode blocks. Only the latter are 11475 * transferred to/from the disk. So, when the updated contents of the 11476 * "in-core" inode structure are copied to the corresponding in-memory inode 11477 * block, the dependencies are also transferred. The following procedure is 11478 * called when copying a dirty "in-core" inode to a cached inode block. 11479 */ 11480 11481 /* 11482 * Called when an inode is loaded from disk. If the effective link count 11483 * differed from the actual link count when it was last flushed, then we 11484 * need to ensure that the correct effective link count is put back. 11485 */ 11486 void 11487 softdep_load_inodeblock(ip) 11488 struct inode *ip; /* the "in_core" copy of the inode */ 11489 { 11490 struct inodedep *inodedep; 11491 11492 /* 11493 * Check for alternate nlink count. 11494 */ 11495 ip->i_effnlink = ip->i_nlink; 11496 ACQUIRE_LOCK(&lk); 11497 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 11498 &inodedep) == 0) { 11499 FREE_LOCK(&lk); 11500 return; 11501 } 11502 ip->i_effnlink -= inodedep->id_nlinkdelta; 11503 FREE_LOCK(&lk); 11504 } 11505 11506 /* 11507 * This routine is called just before the "in-core" inode 11508 * information is to be copied to the in-memory inode block. 11509 * Recall that an inode block contains several inodes. If 11510 * the force flag is set, then the dependencies will be 11511 * cleared so that the update can always be made. Note that 11512 * the buffer is locked when this routine is called, so we 11513 * will never be in the middle of writing the inode block 11514 * to disk. 11515 */ 11516 void 11517 softdep_update_inodeblock(ip, bp, waitfor) 11518 struct inode *ip; /* the "in_core" copy of the inode */ 11519 struct buf *bp; /* the buffer containing the inode block */ 11520 int waitfor; /* nonzero => update must be allowed */ 11521 { 11522 struct inodedep *inodedep; 11523 struct inoref *inoref; 11524 struct worklist *wk; 11525 struct mount *mp; 11526 struct buf *ibp; 11527 struct fs *fs; 11528 int error; 11529 11530 mp = UFSTOVFS(ip->i_ump); 11531 fs = ip->i_fs; 11532 /* 11533 * Preserve the freelink that is on disk. clear_unlinked_inodedep() 11534 * does not have access to the in-core ip so must write directly into 11535 * the inode block buffer when setting freelink. 11536 */ 11537 if (fs->fs_magic == FS_UFS1_MAGIC) 11538 DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data + 11539 ino_to_fsbo(fs, ip->i_number))->di_freelink); 11540 else 11541 DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data + 11542 ino_to_fsbo(fs, ip->i_number))->di_freelink); 11543 /* 11544 * If the effective link count is not equal to the actual link 11545 * count, then we must track the difference in an inodedep while 11546 * the inode is (potentially) tossed out of the cache. 
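 * The value recorded is id_nlinkdelta == i_nlink - i_effnlink; for example
 * (illustrative), a file whose on-disk link count is still 2 while one
 * directory entry removal is pending has i_effnlink 1 and id_nlinkdelta 1,
 * and softdep_load_inodeblock() above re-derives i_effnlink = 2 - 1 = 1 if
 * the inode is later re-read from disk.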
Otherwise, 11547 * if there is no existing inodedep, then there are no dependencies 11548 * to track. 11549 */ 11550 ACQUIRE_LOCK(&lk); 11551 again: 11552 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 11553 FREE_LOCK(&lk); 11554 if (ip->i_effnlink != ip->i_nlink) 11555 panic("softdep_update_inodeblock: bad link count"); 11556 return; 11557 } 11558 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) 11559 panic("softdep_update_inodeblock: bad delta"); 11560 /* 11561 * If we're flushing all dependencies we must also move any waiting 11562 * for journal writes onto the bufwait list prior to I/O. 11563 */ 11564 if (waitfor) { 11565 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 11566 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 11567 == DEPCOMPLETE) { 11568 jwait(&inoref->if_list, MNT_WAIT); 11569 goto again; 11570 } 11571 } 11572 } 11573 /* 11574 * Changes have been initiated. Anything depending on these 11575 * changes cannot occur until this inode has been written. 11576 */ 11577 inodedep->id_state &= ~COMPLETE; 11578 if ((inodedep->id_state & ONWORKLIST) == 0) 11579 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); 11580 /* 11581 * Any new dependencies associated with the incore inode must 11582 * now be moved to the list associated with the buffer holding 11583 * the in-memory copy of the inode. Once merged process any 11584 * allocdirects that are completed by the merger. 11585 */ 11586 merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); 11587 if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) 11588 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), 11589 NULL); 11590 merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); 11591 if (!TAILQ_EMPTY(&inodedep->id_extupdt)) 11592 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), 11593 NULL); 11594 /* 11595 * Now that the inode has been pushed into the buffer, the 11596 * operations dependent on the inode being written to disk 11597 * can be moved to the id_bufwait so that they will be 11598 * processed when the buffer I/O completes. 11599 */ 11600 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { 11601 WORKLIST_REMOVE(wk); 11602 WORKLIST_INSERT(&inodedep->id_bufwait, wk); 11603 } 11604 /* 11605 * Newly allocated inodes cannot be written until the bitmap 11606 * that allocates them have been written (indicated by 11607 * DEPCOMPLETE being set in id_state). If we are doing a 11608 * forced sync (e.g., an fsync on a file), we force the bitmap 11609 * to be written so that the update can be done. 11610 */ 11611 if (waitfor == 0) { 11612 FREE_LOCK(&lk); 11613 return; 11614 } 11615 retry: 11616 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) { 11617 FREE_LOCK(&lk); 11618 return; 11619 } 11620 ibp = inodedep->id_bmsafemap->sm_buf; 11621 ibp = getdirtybuf(ibp, &lk, MNT_WAIT); 11622 if (ibp == NULL) { 11623 /* 11624 * If ibp came back as NULL, the dependency could have been 11625 * freed while we slept. Look it up again, and check to see 11626 * that it has completed. 11627 */ 11628 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 11629 goto retry; 11630 FREE_LOCK(&lk); 11631 return; 11632 } 11633 FREE_LOCK(&lk); 11634 if ((error = bwrite(ibp)) != 0) 11635 softdep_error("softdep_update_inodeblock: bwrite", error); 11636 } 11637 11638 /* 11639 * Merge the a new inode dependency list (such as id_newinoupdt) into an 11640 * old inode dependency list (such as id_inoupdt). This routine must be 11641 * called with splbio interrupts blocked. 
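 *
 * Both lists are kept sorted by ad_offset, so this is an ordinary merge of
 * two sorted lists; when the same offset appears on both, the two entries
 * are collapsed by allocdirect_merge().  A simplified userland sketch of
 * the shape of the merge (hypothetical element type, no merging of equal
 * keys):
 *
 *	#include <sys/queue.h>
 *
 *	struct ent {
 *		int key;
 *		TAILQ_ENTRY(ent) link;
 *	};
 *	TAILQ_HEAD(entlist, ent);
 *
 *	static void
 *	merge(struct entlist *newl, struct entlist *oldl)
 *	{
 *		struct ent *o, *n;
 *
 *		while ((n = TAILQ_FIRST(newl)) != NULL) {
 *			TAILQ_FOREACH(o, oldl, link)
 *				if (o->key >= n->key)
 *					break;
 *			TAILQ_REMOVE(newl, n, link);
 *			if (o == NULL)
 *				TAILQ_INSERT_TAIL(oldl, n, link);
 *			else
 *				TAILQ_INSERT_BEFORE(o, n, link);
 *		}
 *	}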
11642 */ 11643 static void 11644 merge_inode_lists(newlisthead, oldlisthead) 11645 struct allocdirectlst *newlisthead; 11646 struct allocdirectlst *oldlisthead; 11647 { 11648 struct allocdirect *listadp, *newadp; 11649 11650 newadp = TAILQ_FIRST(newlisthead); 11651 for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { 11652 if (listadp->ad_offset < newadp->ad_offset) { 11653 listadp = TAILQ_NEXT(listadp, ad_next); 11654 continue; 11655 } 11656 TAILQ_REMOVE(newlisthead, newadp, ad_next); 11657 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); 11658 if (listadp->ad_offset == newadp->ad_offset) { 11659 allocdirect_merge(oldlisthead, newadp, 11660 listadp); 11661 listadp = newadp; 11662 } 11663 newadp = TAILQ_FIRST(newlisthead); 11664 } 11665 while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { 11666 TAILQ_REMOVE(newlisthead, newadp, ad_next); 11667 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); 11668 } 11669 } 11670 11671 /* 11672 * If we are doing an fsync, then we must ensure that any directory 11673 * entries for the inode have been written after the inode gets to disk. 11674 */ 11675 int 11676 softdep_fsync(vp) 11677 struct vnode *vp; /* the "in_core" copy of the inode */ 11678 { 11679 struct inodedep *inodedep; 11680 struct pagedep *pagedep; 11681 struct inoref *inoref; 11682 struct worklist *wk; 11683 struct diradd *dap; 11684 struct mount *mp; 11685 struct vnode *pvp; 11686 struct inode *ip; 11687 struct buf *bp; 11688 struct fs *fs; 11689 struct thread *td = curthread; 11690 int error, flushparent, pagedep_new_block; 11691 ino_t parentino; 11692 ufs_lbn_t lbn; 11693 11694 ip = VTOI(vp); 11695 fs = ip->i_fs; 11696 mp = vp->v_mount; 11697 ACQUIRE_LOCK(&lk); 11698 restart: 11699 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 11700 FREE_LOCK(&lk); 11701 return (0); 11702 } 11703 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 11704 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 11705 == DEPCOMPLETE) { 11706 jwait(&inoref->if_list, MNT_WAIT); 11707 goto restart; 11708 } 11709 } 11710 if (!LIST_EMPTY(&inodedep->id_inowait) || 11711 !TAILQ_EMPTY(&inodedep->id_extupdt) || 11712 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 11713 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 11714 !TAILQ_EMPTY(&inodedep->id_newinoupdt)) 11715 panic("softdep_fsync: pending ops %p", inodedep); 11716 for (error = 0, flushparent = 0; ; ) { 11717 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) 11718 break; 11719 if (wk->wk_type != D_DIRADD) 11720 panic("softdep_fsync: Unexpected type %s", 11721 TYPENAME(wk->wk_type)); 11722 dap = WK_DIRADD(wk); 11723 /* 11724 * Flush our parent if this directory entry has a MKDIR_PARENT 11725 * dependency or is contained in a newly allocated block. 11726 */ 11727 if (dap->da_state & DIRCHG) 11728 pagedep = dap->da_previous->dm_pagedep; 11729 else 11730 pagedep = dap->da_pagedep; 11731 parentino = pagedep->pd_ino; 11732 lbn = pagedep->pd_lbn; 11733 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) 11734 panic("softdep_fsync: dirty"); 11735 if ((dap->da_state & MKDIR_PARENT) || 11736 (pagedep->pd_state & NEWBLOCK)) 11737 flushparent = 1; 11738 else 11739 flushparent = 0; 11740 /* 11741 * If we are being fsync'ed as part of vgone'ing this vnode, 11742 * then we will not be able to release and recover the 11743 * vnode below, so we just have to give up on writing its 11744 * directory entry out. It will eventually be written, just 11745 * not now, but then the user was not asking to have it 11746 * written, so we are not breaking any promises. 
11747 */ 11748 if (vp->v_iflag & VI_DOOMED) 11749 break; 11750 /* 11751 * We prevent deadlock by always fetching inodes from the 11752 * root, moving down the directory tree. Thus, when fetching 11753 * our parent directory, we first try to get the lock. If 11754 * that fails, we must unlock ourselves before requesting 11755 * the lock on our parent. See the comment in ufs_lookup 11756 * for details on possible races. 11757 */ 11758 FREE_LOCK(&lk); 11759 if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, 11760 FFSV_FORCEINSMQ)) { 11761 error = vfs_busy(mp, MBF_NOWAIT); 11762 if (error != 0) { 11763 vfs_ref(mp); 11764 VOP_UNLOCK(vp, 0); 11765 error = vfs_busy(mp, 0); 11766 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 11767 vfs_rel(mp); 11768 if (error != 0) 11769 return (ENOENT); 11770 if (vp->v_iflag & VI_DOOMED) { 11771 vfs_unbusy(mp); 11772 return (ENOENT); 11773 } 11774 } 11775 VOP_UNLOCK(vp, 0); 11776 error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, 11777 &pvp, FFSV_FORCEINSMQ); 11778 vfs_unbusy(mp); 11779 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 11780 if (vp->v_iflag & VI_DOOMED) { 11781 if (error == 0) 11782 vput(pvp); 11783 error = ENOENT; 11784 } 11785 if (error != 0) 11786 return (error); 11787 } 11788 /* 11789 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps 11790 * that are contained in direct blocks will be resolved by 11791 * doing a ffs_update. Pagedeps contained in indirect blocks 11792 * may require a complete sync'ing of the directory. So, we 11793 * try the cheap and fast ffs_update first, and if that fails, 11794 * then we do the slower ffs_syncvnode of the directory. 11795 */ 11796 if (flushparent) { 11797 int locked; 11798 11799 if ((error = ffs_update(pvp, 1)) != 0) { 11800 vput(pvp); 11801 return (error); 11802 } 11803 ACQUIRE_LOCK(&lk); 11804 locked = 1; 11805 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { 11806 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { 11807 if (wk->wk_type != D_DIRADD) 11808 panic("softdep_fsync: Unexpected type %s", 11809 TYPENAME(wk->wk_type)); 11810 dap = WK_DIRADD(wk); 11811 if (dap->da_state & DIRCHG) 11812 pagedep = dap->da_previous->dm_pagedep; 11813 else 11814 pagedep = dap->da_pagedep; 11815 pagedep_new_block = pagedep->pd_state & NEWBLOCK; 11816 FREE_LOCK(&lk); 11817 locked = 0; 11818 if (pagedep_new_block && 11819 (error = ffs_syncvnode(pvp, MNT_WAIT))) { 11820 vput(pvp); 11821 return (error); 11822 } 11823 } 11824 } 11825 if (locked) 11826 FREE_LOCK(&lk); 11827 } 11828 /* 11829 * Flush directory page containing the inode's name. 11830 */ 11831 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, 11832 &bp); 11833 if (error == 0) 11834 error = bwrite(bp); 11835 else 11836 brelse(bp); 11837 vput(pvp); 11838 if (error != 0) 11839 return (error); 11840 ACQUIRE_LOCK(&lk); 11841 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 11842 break; 11843 } 11844 FREE_LOCK(&lk); 11845 return (0); 11846 } 11847 11848 /* 11849 * Flush all the dirty bitmaps associated with the block device 11850 * before flushing the rest of the dirty blocks so as to reduce 11851 * the number of dependencies that will have to be rolled back. 11852 * 11853 * XXX Unused? 
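 *
 * Both lk and the bufobj lock have to be dropped around the bawrite()
 * below, and once they are dropped the saved next-buffer pointer from
 * TAILQ_FOREACH_SAFE can no longer be trusted, so the scan simply restarts
 * from the head of bo_dirty after every write; each buffer it writes
 * leaves the dirty list, so the restart loop still terminates.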
11854 */ 11855 void 11856 softdep_fsync_mountdev(vp) 11857 struct vnode *vp; 11858 { 11859 struct buf *bp, *nbp; 11860 struct worklist *wk; 11861 struct bufobj *bo; 11862 11863 if (!vn_isdisk(vp, NULL)) 11864 panic("softdep_fsync_mountdev: vnode not a disk"); 11865 bo = &vp->v_bufobj; 11866 restart: 11867 BO_LOCK(bo); 11868 ACQUIRE_LOCK(&lk); 11869 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 11870 /* 11871 * If it is already scheduled, skip to the next buffer. 11872 */ 11873 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) 11874 continue; 11875 11876 if ((bp->b_flags & B_DELWRI) == 0) 11877 panic("softdep_fsync_mountdev: not dirty"); 11878 /* 11879 * We are only interested in bitmaps with outstanding 11880 * dependencies. 11881 */ 11882 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || 11883 wk->wk_type != D_BMSAFEMAP || 11884 (bp->b_vflags & BV_BKGRDINPROG)) { 11885 BUF_UNLOCK(bp); 11886 continue; 11887 } 11888 FREE_LOCK(&lk); 11889 BO_UNLOCK(bo); 11890 bremfree(bp); 11891 (void) bawrite(bp); 11892 goto restart; 11893 } 11894 FREE_LOCK(&lk); 11895 drain_output(vp); 11896 BO_UNLOCK(bo); 11897 } 11898 11899 /* 11900 * Sync all cylinder groups that were dirty at the time this function is 11901 * called. Newly dirtied cgs will be inserted before the sintenel. This 11902 * is used to flush freedep activity that may be holding up writes to a 11903 * indirect block. 11904 */ 11905 static int 11906 sync_cgs(mp, waitfor) 11907 struct mount *mp; 11908 int waitfor; 11909 { 11910 struct bmsafemap *bmsafemap; 11911 struct bmsafemap *sintenel; 11912 struct ufsmount *ump; 11913 struct buf *bp; 11914 int error; 11915 11916 sintenel = malloc(sizeof(*sintenel), M_BMSAFEMAP, M_ZERO | M_WAITOK); 11917 sintenel->sm_cg = -1; 11918 ump = VFSTOUFS(mp); 11919 error = 0; 11920 ACQUIRE_LOCK(&lk); 11921 LIST_INSERT_HEAD(&ump->softdep_dirtycg, sintenel, sm_next); 11922 for (bmsafemap = LIST_NEXT(sintenel, sm_next); bmsafemap != NULL; 11923 bmsafemap = LIST_NEXT(sintenel, sm_next)) { 11924 /* Skip sintenels and cgs with no work to release. */ 11925 if (bmsafemap->sm_cg == -1 || 11926 (LIST_EMPTY(&bmsafemap->sm_freehd) && 11927 LIST_EMPTY(&bmsafemap->sm_freewr))) { 11928 LIST_REMOVE(sintenel, sm_next); 11929 LIST_INSERT_AFTER(bmsafemap, sintenel, sm_next); 11930 continue; 11931 } 11932 /* 11933 * If we don't get the lock and we're waiting try again, if 11934 * not move on to the next buf and try to sync it. 11935 */ 11936 bp = getdirtybuf(bmsafemap->sm_buf, &lk, waitfor); 11937 if (bp == NULL && waitfor == MNT_WAIT) 11938 continue; 11939 LIST_REMOVE(sintenel, sm_next); 11940 LIST_INSERT_AFTER(bmsafemap, sintenel, sm_next); 11941 if (bp == NULL) 11942 continue; 11943 FREE_LOCK(&lk); 11944 if (waitfor == MNT_NOWAIT) 11945 bawrite(bp); 11946 else 11947 error = bwrite(bp); 11948 ACQUIRE_LOCK(&lk); 11949 if (error) 11950 break; 11951 } 11952 LIST_REMOVE(sintenel, sm_next); 11953 FREE_LOCK(&lk); 11954 free(sintenel, M_BMSAFEMAP); 11955 return (error); 11956 } 11957 11958 /* 11959 * This routine is called when we are trying to synchronously flush a 11960 * file. This routine must eliminate any filesystem metadata dependencies 11961 * so that the syncing routine can succeed. 11962 */ 11963 int 11964 softdep_sync_metadata(struct vnode *vp) 11965 { 11966 int error; 11967 11968 /* 11969 * Ensure that any direct block dependencies have been cleared, 11970 * truncations are started, and inode references are journaled. 
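 *
 * The sentinel is the usual marker trick for walking a lock-protected list
 * while the lock must be dropped for I/O: a dummy element (sm_cg == -1,
 * which every other scanner skips) is threaded onto the list and, as each
 * real element is handled, the marker is moved just past it so the walk
 * can resume there after the lock is retaken.  A compressed sketch of the
 * idiom (illustrative, using the same <sys/queue.h> LIST macros):
 *
 *	LIST_INSERT_HEAD(head, marker, link);
 *	while ((elm = LIST_NEXT(marker, link)) != NULL) {
 *		LIST_REMOVE(marker, link);
 *		LIST_INSERT_AFTER(elm, marker, link);
 *		// drop the lock, write elm, retake the lock;
 *		// the marker still records where to resume
 *	}
 *	LIST_REMOVE(marker, link);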
11971 */ 11972 ACQUIRE_LOCK(&lk); 11973 /* 11974 * Write all journal records to prevent rollbacks on devvp. 11975 */ 11976 if (vp->v_type == VCHR) 11977 softdep_flushjournal(vp->v_mount); 11978 error = flush_inodedep_deps(vp, vp->v_mount, VTOI(vp)->i_number); 11979 /* 11980 * Ensure that all truncates are written so we won't find deps on 11981 * indirect blocks. 11982 */ 11983 process_truncates(vp); 11984 FREE_LOCK(&lk); 11985 11986 return (error); 11987 } 11988 11989 /* 11990 * This routine is called when we are attempting to sync a buf with 11991 * dependencies. If waitfor is MNT_NOWAIT it attempts to schedule any 11992 * other IO it can but returns EBUSY if the buffer is not yet able to 11993 * be written. Dependencies which will not cause rollbacks will always 11994 * return 0. 11995 */ 11996 int 11997 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) 11998 { 11999 struct indirdep *indirdep; 12000 struct pagedep *pagedep; 12001 struct allocindir *aip; 12002 struct newblk *newblk; 12003 struct buf *nbp; 12004 struct worklist *wk; 12005 int i, error; 12006 12007 /* 12008 * For VCHR we just don't want to force flush any dependencies that 12009 * will cause rollbacks. 12010 */ 12011 if (vp->v_type == VCHR) { 12012 if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0)) 12013 return (EBUSY); 12014 return (0); 12015 } 12016 ACQUIRE_LOCK(&lk); 12017 /* 12018 * As we hold the buffer locked, none of its dependencies 12019 * will disappear. 12020 */ 12021 error = 0; 12022 top: 12023 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 12024 switch (wk->wk_type) { 12025 12026 case D_ALLOCDIRECT: 12027 case D_ALLOCINDIR: 12028 newblk = WK_NEWBLK(wk); 12029 if (newblk->nb_jnewblk != NULL) { 12030 if (waitfor == MNT_NOWAIT) { 12031 error = EBUSY; 12032 goto out_unlock; 12033 } 12034 jwait(&newblk->nb_jnewblk->jn_list, waitfor); 12035 goto top; 12036 } 12037 if (newblk->nb_state & DEPCOMPLETE || 12038 waitfor == MNT_NOWAIT) 12039 continue; 12040 nbp = newblk->nb_bmsafemap->sm_buf; 12041 nbp = getdirtybuf(nbp, &lk, waitfor); 12042 if (nbp == NULL) 12043 goto top; 12044 FREE_LOCK(&lk); 12045 if ((error = bwrite(nbp)) != 0) 12046 goto out; 12047 ACQUIRE_LOCK(&lk); 12048 continue; 12049 12050 case D_INDIRDEP: 12051 indirdep = WK_INDIRDEP(wk); 12052 if (waitfor == MNT_NOWAIT) { 12053 if (!TAILQ_EMPTY(&indirdep->ir_trunc) || 12054 !LIST_EMPTY(&indirdep->ir_deplisthd)) { 12055 error = EBUSY; 12056 goto out_unlock; 12057 } 12058 } 12059 if (!TAILQ_EMPTY(&indirdep->ir_trunc)) 12060 panic("softdep_sync_buf: truncation pending."); 12061 restart: 12062 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { 12063 newblk = (struct newblk *)aip; 12064 if (newblk->nb_jnewblk != NULL) { 12065 jwait(&newblk->nb_jnewblk->jn_list, 12066 waitfor); 12067 goto restart; 12068 } 12069 if (newblk->nb_state & DEPCOMPLETE) 12070 continue; 12071 nbp = newblk->nb_bmsafemap->sm_buf; 12072 nbp = getdirtybuf(nbp, &lk, waitfor); 12073 if (nbp == NULL) 12074 goto restart; 12075 FREE_LOCK(&lk); 12076 if ((error = bwrite(nbp)) != 0) 12077 goto out; 12078 ACQUIRE_LOCK(&lk); 12079 goto restart; 12080 } 12081 continue; 12082 12083 case D_PAGEDEP: 12084 /* 12085 * Only flush directory entries in synchronous passes. 12086 */ 12087 if (waitfor != MNT_WAIT) { 12088 error = EBUSY; 12089 goto out_unlock; 12090 } 12091 /* 12092 * While syncing snapshots, we must allow recursive 12093 * lookups. 
12094 */ 12095 BUF_AREC(bp); 12096 /* 12097 * We are trying to sync a directory that may 12098 * have dependencies on both its own metadata 12099 * and/or dependencies on the inodes of any 12100 * recently allocated files. We walk its diradd 12101 * lists pushing out the associated inode. 12102 */ 12103 pagedep = WK_PAGEDEP(wk); 12104 for (i = 0; i < DAHASHSZ; i++) { 12105 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) 12106 continue; 12107 if ((error = flush_pagedep_deps(vp, wk->wk_mp, 12108 &pagedep->pd_diraddhd[i]))) { 12109 BUF_NOREC(bp); 12110 goto out_unlock; 12111 } 12112 } 12113 BUF_NOREC(bp); 12114 continue; 12115 12116 case D_FREEWORK: 12117 case D_FREEDEP: 12118 case D_JSEGDEP: 12119 case D_JNEWBLK: 12120 continue; 12121 12122 default: 12123 panic("softdep_sync_buf: Unknown type %s", 12124 TYPENAME(wk->wk_type)); 12125 /* NOTREACHED */ 12126 } 12127 } 12128 out_unlock: 12129 FREE_LOCK(&lk); 12130 out: 12131 return (error); 12132 } 12133 12134 /* 12135 * Flush the dependencies associated with an inodedep. 12136 * Called with splbio blocked. 12137 */ 12138 static int 12139 flush_inodedep_deps(vp, mp, ino) 12140 struct vnode *vp; 12141 struct mount *mp; 12142 ino_t ino; 12143 { 12144 struct inodedep *inodedep; 12145 struct inoref *inoref; 12146 int error, waitfor; 12147 12148 /* 12149 * This work is done in two passes. The first pass grabs most 12150 * of the buffers and begins asynchronously writing them. The 12151 * only way to wait for these asynchronous writes is to sleep 12152 * on the filesystem vnode which may stay busy for a long time 12153 * if the filesystem is active. So, instead, we make a second 12154 * pass over the dependencies blocking on each write. In the 12155 * usual case we will be blocking against a write that we 12156 * initiated, so when it is done the dependency will have been 12157 * resolved. Thus the second pass is expected to end quickly. 12158 * We give a brief window at the top of the loop to allow 12159 * any pending I/O to complete. 12160 */ 12161 for (error = 0, waitfor = MNT_NOWAIT; ; ) { 12162 if (error) 12163 return (error); 12164 FREE_LOCK(&lk); 12165 ACQUIRE_LOCK(&lk); 12166 restart: 12167 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) 12168 return (0); 12169 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 12170 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 12171 == DEPCOMPLETE) { 12172 jwait(&inoref->if_list, MNT_WAIT); 12173 goto restart; 12174 } 12175 } 12176 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || 12177 flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || 12178 flush_deplist(&inodedep->id_extupdt, waitfor, &error) || 12179 flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) 12180 continue; 12181 /* 12182 * If pass2, we are done, otherwise do pass 2. 12183 */ 12184 if (waitfor == MNT_WAIT) 12185 break; 12186 waitfor = MNT_WAIT; 12187 } 12188 /* 12189 * Try freeing inodedep in case all dependencies have been removed. 12190 */ 12191 if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) 12192 (void) free_inodedep(inodedep); 12193 return (0); 12194 } 12195 12196 /* 12197 * Flush an inode dependency list. 12198 * Called with splbio blocked. 
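 *
 * The return value is a hint to the caller: 1 means the routine found
 * something to do (a journal wait or a bitmap buffer write) and may have
 * dropped lk along the way, so the lists should be rescanned; 0 means the
 * whole list was walked and nothing on it still needs attention.  That is
 * why flush_inodedep_deps() above simply repeats its pass whenever any of
 * the four lists returns nonzero.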
12199 */ 12200 static int 12201 flush_deplist(listhead, waitfor, errorp) 12202 struct allocdirectlst *listhead; 12203 int waitfor; 12204 int *errorp; 12205 { 12206 struct allocdirect *adp; 12207 struct newblk *newblk; 12208 struct buf *bp; 12209 12210 mtx_assert(&lk, MA_OWNED); 12211 TAILQ_FOREACH(adp, listhead, ad_next) { 12212 newblk = (struct newblk *)adp; 12213 if (newblk->nb_jnewblk != NULL) { 12214 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 12215 return (1); 12216 } 12217 if (newblk->nb_state & DEPCOMPLETE) 12218 continue; 12219 bp = newblk->nb_bmsafemap->sm_buf; 12220 bp = getdirtybuf(bp, &lk, waitfor); 12221 if (bp == NULL) { 12222 if (waitfor == MNT_NOWAIT) 12223 continue; 12224 return (1); 12225 } 12226 FREE_LOCK(&lk); 12227 if (waitfor == MNT_NOWAIT) 12228 bawrite(bp); 12229 else 12230 *errorp = bwrite(bp); 12231 ACQUIRE_LOCK(&lk); 12232 return (1); 12233 } 12234 return (0); 12235 } 12236 12237 /* 12238 * Flush dependencies associated with an allocdirect block. 12239 */ 12240 static int 12241 flush_newblk_dep(vp, mp, lbn) 12242 struct vnode *vp; 12243 struct mount *mp; 12244 ufs_lbn_t lbn; 12245 { 12246 struct newblk *newblk; 12247 struct bufobj *bo; 12248 struct inode *ip; 12249 struct buf *bp; 12250 ufs2_daddr_t blkno; 12251 int error; 12252 12253 error = 0; 12254 bo = &vp->v_bufobj; 12255 ip = VTOI(vp); 12256 blkno = DIP(ip, i_db[lbn]); 12257 if (blkno == 0) 12258 panic("flush_newblk_dep: Missing block"); 12259 ACQUIRE_LOCK(&lk); 12260 /* 12261 * Loop until all dependencies related to this block are satisfied. 12262 * We must be careful to restart after each sleep in case a write 12263 * completes some part of this process for us. 12264 */ 12265 for (;;) { 12266 if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { 12267 FREE_LOCK(&lk); 12268 break; 12269 } 12270 if (newblk->nb_list.wk_type != D_ALLOCDIRECT) 12271 panic("flush_newblk_deps: Bad newblk %p", newblk); 12272 /* 12273 * Flush the journal. 12274 */ 12275 if (newblk->nb_jnewblk != NULL) { 12276 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 12277 continue; 12278 } 12279 /* 12280 * Write the bitmap dependency. 12281 */ 12282 if ((newblk->nb_state & DEPCOMPLETE) == 0) { 12283 bp = newblk->nb_bmsafemap->sm_buf; 12284 bp = getdirtybuf(bp, &lk, MNT_WAIT); 12285 if (bp == NULL) 12286 continue; 12287 FREE_LOCK(&lk); 12288 error = bwrite(bp); 12289 if (error) 12290 break; 12291 ACQUIRE_LOCK(&lk); 12292 continue; 12293 } 12294 /* 12295 * Write the buffer. 12296 */ 12297 FREE_LOCK(&lk); 12298 BO_LOCK(bo); 12299 bp = gbincore(bo, lbn); 12300 if (bp != NULL) { 12301 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 12302 LK_INTERLOCK, BO_MTX(bo)); 12303 if (error == ENOLCK) { 12304 ACQUIRE_LOCK(&lk); 12305 continue; /* Slept, retry */ 12306 } 12307 if (error != 0) 12308 break; /* Failed */ 12309 if (bp->b_flags & B_DELWRI) { 12310 bremfree(bp); 12311 error = bwrite(bp); 12312 if (error) 12313 break; 12314 } else 12315 BUF_UNLOCK(bp); 12316 } else 12317 BO_UNLOCK(bo); 12318 /* 12319 * We have to wait for the direct pointers to 12320 * point at the newdirblk before the dependency 12321 * will go away. 12322 */ 12323 error = ffs_update(vp, MNT_WAIT); 12324 if (error) 12325 break; 12326 ACQUIRE_LOCK(&lk); 12327 } 12328 return (error); 12329 } 12330 12331 /* 12332 * Eliminate a pagedep dependency by flushing out all its diradd dependencies. 12333 * Called with splbio blocked. 
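 *
 * Because lk is dropped around every piece of I/O below, the loop never
 * trusts its earlier view of the list after sleeping; instead it re-checks
 * whether the diradd it was working on is still at the head, and if the
 * I/O already cleared the dependency it just moves on:
 *
 *	FREE_LOCK(&lk);
 *	// ... write the inode, the bitmap, or the child directory ...
 *	ACQUIRE_LOCK(&lk);
 *	if (dap != LIST_FIRST(diraddhdp))
 *		continue;	// our entry was already cleaned up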
12334 */ 12335 static int 12336 flush_pagedep_deps(pvp, mp, diraddhdp) 12337 struct vnode *pvp; 12338 struct mount *mp; 12339 struct diraddhd *diraddhdp; 12340 { 12341 struct inodedep *inodedep; 12342 struct inoref *inoref; 12343 struct ufsmount *ump; 12344 struct diradd *dap; 12345 struct vnode *vp; 12346 int error = 0; 12347 struct buf *bp; 12348 ino_t inum; 12349 12350 ump = VFSTOUFS(mp); 12351 restart: 12352 while ((dap = LIST_FIRST(diraddhdp)) != NULL) { 12353 /* 12354 * Flush ourselves if this directory entry 12355 * has a MKDIR_PARENT dependency. 12356 */ 12357 if (dap->da_state & MKDIR_PARENT) { 12358 FREE_LOCK(&lk); 12359 if ((error = ffs_update(pvp, MNT_WAIT)) != 0) 12360 break; 12361 ACQUIRE_LOCK(&lk); 12362 /* 12363 * If that cleared dependencies, go on to next. 12364 */ 12365 if (dap != LIST_FIRST(diraddhdp)) 12366 continue; 12367 if (dap->da_state & MKDIR_PARENT) 12368 panic("flush_pagedep_deps: MKDIR_PARENT"); 12369 } 12370 /* 12371 * A newly allocated directory must have its "." and 12372 * ".." entries written out before its name can be 12373 * committed in its parent. 12374 */ 12375 inum = dap->da_newinum; 12376 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 12377 panic("flush_pagedep_deps: lost inode1"); 12378 /* 12379 * Wait for any pending journal adds to complete so we don't 12380 * cause rollbacks while syncing. 12381 */ 12382 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 12383 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 12384 == DEPCOMPLETE) { 12385 jwait(&inoref->if_list, MNT_WAIT); 12386 goto restart; 12387 } 12388 } 12389 if (dap->da_state & MKDIR_BODY) { 12390 FREE_LOCK(&lk); 12391 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 12392 FFSV_FORCEINSMQ))) 12393 break; 12394 error = flush_newblk_dep(vp, mp, 0); 12395 /* 12396 * If we still have the dependency we might need to 12397 * update the vnode to sync the new link count to 12398 * disk. 12399 */ 12400 if (error == 0 && dap == LIST_FIRST(diraddhdp)) 12401 error = ffs_update(vp, MNT_WAIT); 12402 vput(vp); 12403 if (error != 0) 12404 break; 12405 ACQUIRE_LOCK(&lk); 12406 /* 12407 * If that cleared dependencies, go on to next. 12408 */ 12409 if (dap != LIST_FIRST(diraddhdp)) 12410 continue; 12411 if (dap->da_state & MKDIR_BODY) { 12412 inodedep_lookup(UFSTOVFS(ump), inum, 0, 12413 &inodedep); 12414 panic("flush_pagedep_deps: MKDIR_BODY " 12415 "inodedep %p dap %p vp %p", 12416 inodedep, dap, vp); 12417 } 12418 } 12419 /* 12420 * Flush the inode on which the directory entry depends. 12421 * Having accounted for MKDIR_PARENT and MKDIR_BODY above, 12422 * the only remaining dependency is that the updated inode 12423 * count must get pushed to disk. The inode has already 12424 * been pushed into its inode buffer (via VOP_UPDATE) at 12425 * the time of the reference count change. So we need only 12426 * locate that buffer, ensure that there will be no rollback 12427 * caused by a bitmap dependency, then write the inode buffer. 12428 */ 12429 retry: 12430 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 12431 panic("flush_pagedep_deps: lost inode"); 12432 /* 12433 * If the inode still has bitmap dependencies, 12434 * push them to disk. 
12435 */ 12436 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { 12437 bp = inodedep->id_bmsafemap->sm_buf; 12438 bp = getdirtybuf(bp, &lk, MNT_WAIT); 12439 if (bp == NULL) 12440 goto retry; 12441 FREE_LOCK(&lk); 12442 if ((error = bwrite(bp)) != 0) 12443 break; 12444 ACQUIRE_LOCK(&lk); 12445 if (dap != LIST_FIRST(diraddhdp)) 12446 continue; 12447 } 12448 /* 12449 * If the inode is still sitting in a buffer waiting 12450 * to be written or waiting for the link count to be 12451 * adjusted update it here to flush it to disk. 12452 */ 12453 if (dap == LIST_FIRST(diraddhdp)) { 12454 FREE_LOCK(&lk); 12455 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 12456 FFSV_FORCEINSMQ))) 12457 break; 12458 error = ffs_update(vp, MNT_WAIT); 12459 vput(vp); 12460 if (error) 12461 break; 12462 ACQUIRE_LOCK(&lk); 12463 } 12464 /* 12465 * If we have failed to get rid of all the dependencies 12466 * then something is seriously wrong. 12467 */ 12468 if (dap == LIST_FIRST(diraddhdp)) { 12469 inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); 12470 panic("flush_pagedep_deps: failed to flush " 12471 "inodedep %p ino %d dap %p", inodedep, inum, dap); 12472 } 12473 } 12474 if (error) 12475 ACQUIRE_LOCK(&lk); 12476 return (error); 12477 } 12478 12479 /* 12480 * A large burst of file addition or deletion activity can drive the 12481 * memory load excessively high. First attempt to slow things down 12482 * using the techniques below. If that fails, this routine requests 12483 * the offending operations to fall back to running synchronously 12484 * until the memory load returns to a reasonable level. 12485 */ 12486 int 12487 softdep_slowdown(vp) 12488 struct vnode *vp; 12489 { 12490 struct ufsmount *ump; 12491 int jlow; 12492 int max_softdeps_hard; 12493 12494 ACQUIRE_LOCK(&lk); 12495 jlow = 0; 12496 /* 12497 * Check for journal space if needed. 12498 */ 12499 if (DOINGSUJ(vp)) { 12500 ump = VFSTOUFS(vp->v_mount); 12501 if (journal_space(ump, 0) == 0) 12502 jlow = 1; 12503 } 12504 max_softdeps_hard = max_softdeps * 11 / 10; 12505 if (dep_current[D_DIRREM] < max_softdeps_hard / 2 && 12506 dep_current[D_INODEDEP] < max_softdeps_hard && 12507 VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps && 12508 dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0) { 12509 FREE_LOCK(&lk); 12510 return (0); 12511 } 12512 if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow) 12513 softdep_speedup(); 12514 stat_sync_limit_hit += 1; 12515 FREE_LOCK(&lk); 12516 if (DOINGSUJ(vp)) 12517 return (0); 12518 return (1); 12519 } 12520 12521 /* 12522 * Called by the allocation routines when they are about to fail 12523 * in the hope that we can free up the requested resource (inodes 12524 * or disk space). 12525 * 12526 * First check to see if the work list has anything on it. If it has, 12527 * clean up entries until we successfully free the requested resource. 12528 * Because this process holds inodes locked, we cannot handle any remove 12529 * requests that might block on a locked inode as that could lead to 12530 * deadlock. If the worklist yields none of the requested resource, 12531 * start syncing out vnodes to free up the needed space. 
12532 */
12533 int
12534 softdep_request_cleanup(fs, vp, cred, resource)
12535 struct fs *fs;
12536 struct vnode *vp;
12537 struct ucred *cred;
12538 int resource;
12539 {
12540 struct ufsmount *ump;
12541 struct mount *mp;
12542 struct vnode *lvp, *mvp;
12543 long starttime;
12544 ufs2_daddr_t needed;
12545 int error;
12546
12547 mp = vp->v_mount;
12548 ump = VFSTOUFS(mp);
12549 mtx_assert(UFS_MTX(ump), MA_OWNED);
12550 if (resource == FLUSH_BLOCKS_WAIT)
12551 stat_cleanup_blkrequests += 1;
12552 else
12553 stat_cleanup_inorequests += 1;
12554
12555 /*
12556 * If we are being called because of a process doing a
12557 * copy-on-write, then it is not safe to process any
12558 * worklist items as we will recurse into the copyonwrite
12559 * routine. This will result in an incoherent snapshot.
12560 */
12561 if (curthread->td_pflags & TDP_COWINPROGRESS)
12562 return (0);
12563 UFS_UNLOCK(ump);
12564 error = ffs_update(vp, 1);
12565 if (error != 0) {
12566 UFS_LOCK(ump);
12567 return (0);
12568 }
12569 /*
12570 * If we are in need of resources, consider pausing for
12571 * tickdelay to give ourselves some breathing room.
12572 */
12573 ACQUIRE_LOCK(&lk);
12574 process_removes(vp);
12575 process_truncates(vp);
12576 request_cleanup(UFSTOVFS(ump), resource);
12577 FREE_LOCK(&lk);
12578 /*
12579 * Now clean up at least as many resources as we will need.
12580 *
12581 * When requested to clean up inodes, the number that are needed
12582 * is set by the number of simultaneous writers (mnt_writeopcount)
12583 * plus a bit of slop (2) in case some more writers show up while
12584 * we are cleaning.
12585 *
12586 * When requested to free up space, the amount of space that
12587 * we need is enough blocks to allocate a full-sized segment
12588 * (fs_contigsumsize). The number of such segments that will
12589 * be needed is set by the number of simultaneous writers
12590 * (mnt_writeopcount) plus a bit of slop (2) in case some more
12591 * writers show up while we are cleaning.
12592 *
12593 * Additionally, if we are unprivileged and allocating space,
12594 * we need to ensure that we clean up enough blocks to get the
12595 * needed number of blocks over the threshold of the minimum
12596 * number of blocks required to be kept free by the filesystem
12597 * (fs_minfree).
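 *
 * As a purely illustrative example: with 8 simultaneous writers and an
 * fs_contigsumsize of 16, a FLUSH_BLOCKS_WAIT request aims to recover
 * needed = (8 + 2) * 16 = 160 free blocks, and an unprivileged caller
 * additionally adds however many blocks the filesystem currently falls
 * short of its fs_minfree reserve.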
12598 */
12599 if (resource == FLUSH_INODES_WAIT) {
12600 needed = vp->v_mount->mnt_writeopcount + 2;
12601 } else if (resource == FLUSH_BLOCKS_WAIT) {
12602 needed = (vp->v_mount->mnt_writeopcount + 2) *
12603 fs->fs_contigsumsize;
12604 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
12605 needed += fragstoblks(fs,
12606 roundup((fs->fs_dsize * fs->fs_minfree / 100) -
12607 fs->fs_cstotal.cs_nffree, fs->fs_frag));
12608 } else {
12609 UFS_LOCK(ump);
12610 printf("softdep_request_cleanup: Unknown resource type %d\n",
12611 resource);
12612 return (0);
12613 }
12614 starttime = time_second;
12615 retry:
12616 if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
12617 fs->fs_cstotal.cs_nbfree <= needed) ||
12618 (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
12619 fs->fs_cstotal.cs_nifree <= needed)) {
12620 ACQUIRE_LOCK(&lk);
12621 if (ump->softdep_on_worklist > 0 &&
12622 process_worklist_item(UFSTOVFS(ump),
12623 ump->softdep_on_worklist, LK_NOWAIT) != 0)
12624 stat_worklist_push += 1;
12625 FREE_LOCK(&lk);
12626 }
12627 /*
12628 * If we still need resources and there are no more worklist
12629 * entries to process to obtain them, we have to start flushing
12630 * the dirty vnodes to force the release of additional requests
12631 * to the worklist that we can then process to reap additional
12632 * resources. We walk the vnodes associated with the mount point
12633 * until we get the needed worklist requests that we can reap.
12634 */
12635 if ((resource == FLUSH_BLOCKS_WAIT &&
12636 fs->fs_cstotal.cs_nbfree <= needed) ||
12637 (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
12638 fs->fs_cstotal.cs_nifree <= needed)) {
12639 MNT_ILOCK(mp);
12640 MNT_VNODE_FOREACH(lvp, mp, mvp) {
12641 VI_LOCK(lvp);
12642 if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
12643 VI_UNLOCK(lvp);
12644 continue;
12645 }
12646 MNT_IUNLOCK(mp);
12647 if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
12648 curthread)) {
12649 MNT_ILOCK(mp);
12650 continue;
12651 }
12652 if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */
12653 vput(lvp);
12654 MNT_ILOCK(mp);
12655 continue;
12656 }
12657 (void) ffs_syncvnode(lvp, MNT_NOWAIT);
12658 vput(lvp);
12659 MNT_ILOCK(mp);
12660 }
12661 MNT_IUNLOCK(mp);
12662 lvp = ump->um_devvp;
12663 if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
12664 VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
12665 VOP_UNLOCK(lvp, 0);
12666 }
12667 if (ump->softdep_on_worklist > 0) {
12668 stat_cleanup_retries += 1;
12669 goto retry;
12670 }
12671 stat_cleanup_failures += 1;
12672 }
12673 if (time_second - starttime > stat_cleanup_high_delay)
12674 stat_cleanup_high_delay = time_second - starttime;
12675 UFS_LOCK(ump);
12676 return (1);
12677 }
12678
12679 /*
12680 * If memory utilization has gotten too high, deliberately slow things
12681 * down and speed up the I/O processing.
12682 */
12683 extern struct thread *syncertd;
12684 static int
12685 request_cleanup(mp, resource)
12686 struct mount *mp;
12687 int resource;
12688 {
12689 struct thread *td = curthread;
12690 struct ufsmount *ump;
12691
12692 mtx_assert(&lk, MA_OWNED);
12693 /*
12694 * We never hold up the filesystem syncer or buf daemon.
12695 */
12696 if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
12697 return (0);
12698 ump = VFSTOUFS(mp);
12699 /*
12700 * First check to see if the work list has gotten backlogged.
12701 * If it has, co-opt this process to help clean up two entries.
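 * (That work is done inline below by calling process_worklist_item()
 * on the caller's own thread with a target of two items.)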
12702 * Because this process may hold inodes locked, we cannot 12703 * handle any remove requests that might block on a locked 12704 * inode as that could lead to deadlock. We set TDP_SOFTDEP 12705 * to avoid recursively processing the worklist. 12706 */ 12707 if (ump->softdep_on_worklist > max_softdeps / 10) { 12708 td->td_pflags |= TDP_SOFTDEP; 12709 process_worklist_item(mp, 2, LK_NOWAIT); 12710 td->td_pflags &= ~TDP_SOFTDEP; 12711 stat_worklist_push += 2; 12712 return(1); 12713 } 12714 /* 12715 * Next, we attempt to speed up the syncer process. If that 12716 * is successful, then we allow the process to continue. 12717 */ 12718 if (softdep_speedup() && 12719 resource != FLUSH_BLOCKS_WAIT && 12720 resource != FLUSH_INODES_WAIT) 12721 return(0); 12722 /* 12723 * If we are resource constrained on inode dependencies, try 12724 * flushing some dirty inodes. Otherwise, we are constrained 12725 * by file deletions, so try accelerating flushes of directories 12726 * with removal dependencies. We would like to do the cleanup 12727 * here, but we probably hold an inode locked at this point and 12728 * that might deadlock against one that we try to clean. So, 12729 * the best that we can do is request the syncer daemon to do 12730 * the cleanup for us. 12731 */ 12732 switch (resource) { 12733 12734 case FLUSH_INODES: 12735 case FLUSH_INODES_WAIT: 12736 stat_ino_limit_push += 1; 12737 req_clear_inodedeps += 1; 12738 stat_countp = &stat_ino_limit_hit; 12739 break; 12740 12741 case FLUSH_BLOCKS: 12742 case FLUSH_BLOCKS_WAIT: 12743 stat_blk_limit_push += 1; 12744 req_clear_remove += 1; 12745 stat_countp = &stat_blk_limit_hit; 12746 break; 12747 12748 default: 12749 panic("request_cleanup: unknown type"); 12750 } 12751 /* 12752 * Hopefully the syncer daemon will catch up and awaken us. 12753 * We wait at most tickdelay before proceeding in any case. 12754 */ 12755 proc_waiting += 1; 12756 if (callout_pending(&softdep_callout) == FALSE) 12757 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, 12758 pause_timer, 0); 12759 12760 msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); 12761 proc_waiting -= 1; 12762 return (1); 12763 } 12764 12765 /* 12766 * Awaken processes pausing in request_cleanup and clear proc_waiting 12767 * to indicate that there is no longer a timer running. 12768 */ 12769 static void 12770 pause_timer(arg) 12771 void *arg; 12772 { 12773 12774 /* 12775 * The callout_ API has acquired mtx and will hold it around this 12776 * function call. 12777 */ 12778 *stat_countp += 1; 12779 wakeup_one(&proc_waiting); 12780 if (proc_waiting > 0) 12781 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, 12782 pause_timer, 0); 12783 } 12784 12785 /* 12786 * Flush out a directory with at least one removal dependency in an effort to 12787 * reduce the number of dirrem, freefile, and freeblks dependency structures. 
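 * The scan resumes from a static cursor into the pagedep hash table so
 * that successive calls work their way around the whole table rather
 * than repeatedly hitting the same buckets, and it returns after
 * flushing a single directory.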
12788 */ 12789 static void 12790 clear_remove(td) 12791 struct thread *td; 12792 { 12793 struct pagedep_hashhead *pagedephd; 12794 struct pagedep *pagedep; 12795 static int next = 0; 12796 struct mount *mp; 12797 struct vnode *vp; 12798 struct bufobj *bo; 12799 int error, cnt; 12800 ino_t ino; 12801 12802 mtx_assert(&lk, MA_OWNED); 12803 12804 for (cnt = 0; cnt < pagedep_hash; cnt++) { 12805 pagedephd = &pagedep_hashtbl[next++]; 12806 if (next >= pagedep_hash) 12807 next = 0; 12808 LIST_FOREACH(pagedep, pagedephd, pd_hash) { 12809 if (LIST_EMPTY(&pagedep->pd_dirremhd)) 12810 continue; 12811 mp = pagedep->pd_list.wk_mp; 12812 ino = pagedep->pd_ino; 12813 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) 12814 continue; 12815 FREE_LOCK(&lk); 12816 12817 /* 12818 * Let unmount clear deps 12819 */ 12820 error = vfs_busy(mp, MBF_NOWAIT); 12821 if (error != 0) 12822 goto finish_write; 12823 error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, 12824 FFSV_FORCEINSMQ); 12825 vfs_unbusy(mp); 12826 if (error != 0) { 12827 softdep_error("clear_remove: vget", error); 12828 goto finish_write; 12829 } 12830 if ((error = ffs_syncvnode(vp, MNT_NOWAIT))) 12831 softdep_error("clear_remove: fsync", error); 12832 bo = &vp->v_bufobj; 12833 BO_LOCK(bo); 12834 drain_output(vp); 12835 BO_UNLOCK(bo); 12836 vput(vp); 12837 finish_write: 12838 vn_finished_write(mp); 12839 ACQUIRE_LOCK(&lk); 12840 return; 12841 } 12842 } 12843 } 12844 12845 /* 12846 * Clear out a block of dirty inodes in an effort to reduce 12847 * the number of inodedep dependency structures. 12848 */ 12849 static void 12850 clear_inodedeps(td) 12851 struct thread *td; 12852 { 12853 struct inodedep_hashhead *inodedephd; 12854 struct inodedep *inodedep; 12855 static int next = 0; 12856 struct mount *mp; 12857 struct vnode *vp; 12858 struct fs *fs; 12859 int error, cnt; 12860 ino_t firstino, lastino, ino; 12861 12862 mtx_assert(&lk, MA_OWNED); 12863 /* 12864 * Pick a random inode dependency to be cleared. 12865 * We will then gather up all the inodes in its block 12866 * that have dependencies and flush them out. 12867 */ 12868 for (cnt = 0; cnt < inodedep_hash; cnt++) { 12869 inodedephd = &inodedep_hashtbl[next++]; 12870 if (next >= inodedep_hash) 12871 next = 0; 12872 if ((inodedep = LIST_FIRST(inodedephd)) != NULL) 12873 break; 12874 } 12875 if (inodedep == NULL) 12876 return; 12877 fs = inodedep->id_fs; 12878 mp = inodedep->id_list.wk_mp; 12879 /* 12880 * Find the last inode in the block with dependencies. 12881 */ 12882 firstino = inodedep->id_ino & ~(INOPB(fs) - 1); 12883 for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) 12884 if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0) 12885 break; 12886 /* 12887 * Asynchronously push all but the last inode with dependencies. 12888 * Synchronously push the last inode with dependencies to ensure 12889 * that the inode block gets written to free up the inodedeps. 
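 * The inodes pushed here are exactly those sharing the selected inode's
 * on-disk inode block: firstino is the inode number rounded down to an
 * INOPB(fs) boundary and lastino is the last inode in that block that
 * still has an inodedep.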
12890 */ 12891 for (ino = firstino; ino <= lastino; ino++) { 12892 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) 12893 continue; 12894 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) 12895 continue; 12896 FREE_LOCK(&lk); 12897 error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */ 12898 if (error != 0) { 12899 vn_finished_write(mp); 12900 ACQUIRE_LOCK(&lk); 12901 return; 12902 } 12903 if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, 12904 FFSV_FORCEINSMQ)) != 0) { 12905 softdep_error("clear_inodedeps: vget", error); 12906 vfs_unbusy(mp); 12907 vn_finished_write(mp); 12908 ACQUIRE_LOCK(&lk); 12909 return; 12910 } 12911 vfs_unbusy(mp); 12912 if (ino == lastino) { 12913 if ((error = ffs_syncvnode(vp, MNT_WAIT))) 12914 softdep_error("clear_inodedeps: fsync1", error); 12915 } else { 12916 if ((error = ffs_syncvnode(vp, MNT_NOWAIT))) 12917 softdep_error("clear_inodedeps: fsync2", error); 12918 BO_LOCK(&vp->v_bufobj); 12919 drain_output(vp); 12920 BO_UNLOCK(&vp->v_bufobj); 12921 } 12922 vput(vp); 12923 vn_finished_write(mp); 12924 ACQUIRE_LOCK(&lk); 12925 } 12926 } 12927 12928 void 12929 softdep_buf_append(bp, wkhd) 12930 struct buf *bp; 12931 struct workhead *wkhd; 12932 { 12933 struct worklist *wk; 12934 12935 ACQUIRE_LOCK(&lk); 12936 while ((wk = LIST_FIRST(wkhd)) != NULL) { 12937 WORKLIST_REMOVE(wk); 12938 WORKLIST_INSERT(&bp->b_dep, wk); 12939 } 12940 FREE_LOCK(&lk); 12941 12942 } 12943 12944 void 12945 softdep_inode_append(ip, cred, wkhd) 12946 struct inode *ip; 12947 struct ucred *cred; 12948 struct workhead *wkhd; 12949 { 12950 struct buf *bp; 12951 struct fs *fs; 12952 int error; 12953 12954 fs = ip->i_fs; 12955 error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 12956 (int)fs->fs_bsize, cred, &bp); 12957 if (error) { 12958 softdep_freework(wkhd); 12959 return; 12960 } 12961 softdep_buf_append(bp, wkhd); 12962 bqrelse(bp); 12963 } 12964 12965 void 12966 softdep_freework(wkhd) 12967 struct workhead *wkhd; 12968 { 12969 12970 ACQUIRE_LOCK(&lk); 12971 handle_jwork(wkhd); 12972 FREE_LOCK(&lk); 12973 } 12974 12975 /* 12976 * Function to determine if the buffer has outstanding dependencies 12977 * that will cause a roll-back if the buffer is written. If wantcount 12978 * is set, return number of dependencies, otherwise just yes or no. 
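 *
 * When wantcount is zero the scan is a cheap boolean test: it stops at
 * the first dependency found (the early "goto out" below). The routine
 * is typically reached through the bioops io_countdeps hook
 * (buf_countdeps()), letting the buffer flushing code skip buffers
 * whose write would immediately be rolled back.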
12979 */ 12980 static int 12981 softdep_count_dependencies(bp, wantcount) 12982 struct buf *bp; 12983 int wantcount; 12984 { 12985 struct worklist *wk; 12986 struct bmsafemap *bmsafemap; 12987 struct freework *freework; 12988 struct inodedep *inodedep; 12989 struct indirdep *indirdep; 12990 struct freeblks *freeblks; 12991 struct allocindir *aip; 12992 struct pagedep *pagedep; 12993 struct dirrem *dirrem; 12994 struct newblk *newblk; 12995 struct mkdir *mkdir; 12996 struct diradd *dap; 12997 int i, retval; 12998 12999 retval = 0; 13000 ACQUIRE_LOCK(&lk); 13001 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 13002 switch (wk->wk_type) { 13003 13004 case D_INODEDEP: 13005 inodedep = WK_INODEDEP(wk); 13006 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 13007 /* bitmap allocation dependency */ 13008 retval += 1; 13009 if (!wantcount) 13010 goto out; 13011 } 13012 if (TAILQ_FIRST(&inodedep->id_inoupdt)) { 13013 /* direct block pointer dependency */ 13014 retval += 1; 13015 if (!wantcount) 13016 goto out; 13017 } 13018 if (TAILQ_FIRST(&inodedep->id_extupdt)) { 13019 /* direct block pointer dependency */ 13020 retval += 1; 13021 if (!wantcount) 13022 goto out; 13023 } 13024 if (TAILQ_FIRST(&inodedep->id_inoreflst)) { 13025 /* Add reference dependency. */ 13026 retval += 1; 13027 if (!wantcount) 13028 goto out; 13029 } 13030 continue; 13031 13032 case D_INDIRDEP: 13033 indirdep = WK_INDIRDEP(wk); 13034 13035 TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) { 13036 /* indirect truncation dependency */ 13037 retval += 1; 13038 if (!wantcount) 13039 goto out; 13040 } 13041 13042 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { 13043 /* indirect block pointer dependency */ 13044 retval += 1; 13045 if (!wantcount) 13046 goto out; 13047 } 13048 continue; 13049 13050 case D_PAGEDEP: 13051 pagedep = WK_PAGEDEP(wk); 13052 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { 13053 if (LIST_FIRST(&dirrem->dm_jremrefhd)) { 13054 /* Journal remove ref dependency. */ 13055 retval += 1; 13056 if (!wantcount) 13057 goto out; 13058 } 13059 } 13060 for (i = 0; i < DAHASHSZ; i++) { 13061 13062 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { 13063 /* directory entry dependency */ 13064 retval += 1; 13065 if (!wantcount) 13066 goto out; 13067 } 13068 } 13069 continue; 13070 13071 case D_BMSAFEMAP: 13072 bmsafemap = WK_BMSAFEMAP(wk); 13073 if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) { 13074 /* Add reference dependency. */ 13075 retval += 1; 13076 if (!wantcount) 13077 goto out; 13078 } 13079 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) { 13080 /* Allocate block dependency. */ 13081 retval += 1; 13082 if (!wantcount) 13083 goto out; 13084 } 13085 continue; 13086 13087 case D_FREEBLKS: 13088 freeblks = WK_FREEBLKS(wk); 13089 if (LIST_FIRST(&freeblks->fb_jblkdephd)) { 13090 /* Freeblk journal dependency. */ 13091 retval += 1; 13092 if (!wantcount) 13093 goto out; 13094 } 13095 continue; 13096 13097 case D_ALLOCDIRECT: 13098 case D_ALLOCINDIR: 13099 newblk = WK_NEWBLK(wk); 13100 if (newblk->nb_jnewblk) { 13101 /* Journal allocate dependency. */ 13102 retval += 1; 13103 if (!wantcount) 13104 goto out; 13105 } 13106 continue; 13107 13108 case D_MKDIR: 13109 mkdir = WK_MKDIR(wk); 13110 if (mkdir->md_jaddref) { 13111 /* Journal reference dependency. 
*/
13112 retval += 1;
13113 if (!wantcount)
13114 goto out;
13115 }
13116 continue;
13117
13118 case D_FREEWORK:
13119 case D_FREEDEP:
13120 case D_JSEGDEP:
13121 case D_JSEG:
13122 case D_SBDEP:
13123 /* never a dependency on these blocks */
13124 continue;
13125
13126 default:
13127 panic("softdep_count_dependencies: Unexpected type %s",
13128 TYPENAME(wk->wk_type));
13129 /* NOTREACHED */
13130 }
13131 }
13132 out:
13133 FREE_LOCK(&lk);
13134 return (retval);
13135 }
13136
13137 /*
13138 * Acquire exclusive access to a buffer.
13139 * Must be called with a locked mtx parameter.
13140 * Return acquired buffer or NULL on failure.
13141 */
13142 static struct buf *
13143 getdirtybuf(bp, mtx, waitfor)
13144 struct buf *bp;
13145 struct mtx *mtx;
13146 int waitfor;
13147 {
13148 int error;
13149
13150 mtx_assert(mtx, MA_OWNED);
13151 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
13152 if (waitfor != MNT_WAIT)
13153 return (NULL);
13154 error = BUF_LOCK(bp,
13155 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
13156 /*
13157 * Even if we successfully acquire bp here, we have dropped
13158 * mtx, which may violate our guarantee.
13159 */
13160 if (error == 0)
13161 BUF_UNLOCK(bp);
13162 else if (error != ENOLCK)
13163 panic("getdirtybuf: inconsistent lock: %d", error);
13164 mtx_lock(mtx);
13165 return (NULL);
13166 }
13167 if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13168 if (mtx == &lk && waitfor == MNT_WAIT) {
13169 mtx_unlock(mtx);
13170 BO_LOCK(bp->b_bufobj);
13171 BUF_UNLOCK(bp);
13172 if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13173 bp->b_vflags |= BV_BKGRDWAIT;
13174 msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
13175 PRIBIO | PDROP, "getbuf", 0);
13176 } else
13177 BO_UNLOCK(bp->b_bufobj);
13178 mtx_lock(mtx);
13179 return (NULL);
13180 }
13181 BUF_UNLOCK(bp);
13182 if (waitfor != MNT_WAIT)
13183 return (NULL);
13184 /*
13185 * The mtx argument must be bp->b_vp's mutex in
13186 * this case.
13187 */
13188 #ifdef DEBUG_VFS_LOCKS
13189 if (bp->b_vp->v_type != VCHR)
13190 ASSERT_BO_LOCKED(bp->b_bufobj);
13191 #endif
13192 bp->b_vflags |= BV_BKGRDWAIT;
13193 msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
13194 return (NULL);
13195 }
13196 if ((bp->b_flags & B_DELWRI) == 0) {
13197 BUF_UNLOCK(bp);
13198 return (NULL);
13199 }
13200 bremfree(bp);
13201 return (bp);
13202 }
13203
13204
13205 /*
13206 * Check if it is safe to suspend the file system now. On entry,
13207 * the vnode interlock for devvp should be held. Return 0 with
13208 * the mount interlock held if the file system can be suspended now,
13209 * otherwise return EAGAIN with the mount interlock held.
13210 */
13211 int
13212 softdep_check_suspend(struct mount *mp,
13213 struct vnode *devvp,
13214 int softdep_deps,
13215 int softdep_accdeps,
13216 int secondary_writes,
13217 int secondary_accwrites)
13218 {
13219 struct bufobj *bo;
13220 struct ufsmount *ump;
13221 int error;
13222
13223 ump = VFSTOUFS(mp);
13224 bo = &devvp->v_bufobj;
13225 ASSERT_BO_LOCKED(bo);
13226
13227 for (;;) {
13228 if (!TRY_ACQUIRE_LOCK(&lk)) {
13229 BO_UNLOCK(bo);
13230 ACQUIRE_LOCK(&lk);
13231 FREE_LOCK(&lk);
13232 BO_LOCK(bo);
13233 continue;
13234 }
13235 MNT_ILOCK(mp);
13236 if (mp->mnt_secondary_writes != 0) {
13237 FREE_LOCK(&lk);
13238 BO_UNLOCK(bo);
13239 msleep(&mp->mnt_secondary_writes,
13240 MNT_MTX(mp),
13241 (PUSER - 1) | PDROP, "secwr", 0);
13242 BO_LOCK(bo);
13243 continue;
13244 }
13245 break;
13246 }
13247
13248 /*
13249 * Reasons for needing more work before suspend:
13250 * - Dirty buffers on devvp.
13251 * - Softdep activity occurred after start of vnode sync loop 13252 * - Secondary writes occurred after start of vnode sync loop 13253 */ 13254 error = 0; 13255 if (bo->bo_numoutput > 0 || 13256 bo->bo_dirty.bv_cnt > 0 || 13257 softdep_deps != 0 || 13258 ump->softdep_deps != 0 || 13259 softdep_accdeps != ump->softdep_accdeps || 13260 secondary_writes != 0 || 13261 mp->mnt_secondary_writes != 0 || 13262 secondary_accwrites != mp->mnt_secondary_accwrites) 13263 error = EAGAIN; 13264 FREE_LOCK(&lk); 13265 BO_UNLOCK(bo); 13266 return (error); 13267 } 13268 13269 13270 /* 13271 * Get the number of dependency structures for the file system, both 13272 * the current number and the total number allocated. These will 13273 * later be used to detect that softdep processing has occurred. 13274 */ 13275 void 13276 softdep_get_depcounts(struct mount *mp, 13277 int *softdep_depsp, 13278 int *softdep_accdepsp) 13279 { 13280 struct ufsmount *ump; 13281 13282 ump = VFSTOUFS(mp); 13283 ACQUIRE_LOCK(&lk); 13284 *softdep_depsp = ump->softdep_deps; 13285 *softdep_accdepsp = ump->softdep_accdeps; 13286 FREE_LOCK(&lk); 13287 } 13288 13289 /* 13290 * Wait for pending output on a vnode to complete. 13291 * Must be called with vnode lock and interlock locked. 13292 * 13293 * XXX: Should just be a call to bufobj_wwait(). 13294 */ 13295 static void 13296 drain_output(vp) 13297 struct vnode *vp; 13298 { 13299 struct bufobj *bo; 13300 13301 bo = &vp->v_bufobj; 13302 ASSERT_VOP_LOCKED(vp, "drain_output"); 13303 ASSERT_BO_LOCKED(bo); 13304 13305 while (bo->bo_numoutput) { 13306 bo->bo_flag |= BO_WWAIT; 13307 msleep((caddr_t)&bo->bo_numoutput, 13308 BO_MTX(bo), PRIBIO + 1, "drainvp", 0); 13309 } 13310 } 13311 13312 /* 13313 * Called whenever a buffer that is being invalidated or reallocated 13314 * contains dependencies. This should only happen if an I/O error has 13315 * occurred. The routine is called with the buffer locked. 13316 */ 13317 static void 13318 softdep_deallocate_dependencies(bp) 13319 struct buf *bp; 13320 { 13321 13322 if ((bp->b_ioflags & BIO_ERROR) == 0) 13323 panic("softdep_deallocate_dependencies: dangling deps"); 13324 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); 13325 panic("softdep_deallocate_dependencies: unrecovered I/O error"); 13326 } 13327 13328 /* 13329 * Function to handle asynchronous write errors in the filesystem. 13330 */ 13331 static void 13332 softdep_error(func, error) 13333 char *func; 13334 int error; 13335 { 13336 13337 /* XXX should do something better! 
*/
13338 printf("%s: got error %d while accessing filesystem\n", func, error);
13339 }
13340
13341 #ifdef DDB
13342
13343 static void
13344 inodedep_print(struct inodedep *inodedep, int verbose)
13345 {
13346 db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
13347 " saveino %p\n",
13348 inodedep, inodedep->id_fs, inodedep->id_state,
13349 (intmax_t)inodedep->id_ino,
13350 (intmax_t)fsbtodb(inodedep->id_fs,
13351 ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
13352 inodedep->id_nlinkdelta, inodedep->id_savednlink,
13353 inodedep->id_savedino1);
13354
13355 if (verbose == 0)
13356 return;
13357
13358 db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
13359 "mkdiradd %p\n",
13360 LIST_FIRST(&inodedep->id_pendinghd),
13361 LIST_FIRST(&inodedep->id_bufwait),
13362 LIST_FIRST(&inodedep->id_inowait),
13363 TAILQ_FIRST(&inodedep->id_inoreflst),
13364 inodedep->id_mkdiradd);
13365 db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
13366 TAILQ_FIRST(&inodedep->id_inoupdt),
13367 TAILQ_FIRST(&inodedep->id_newinoupdt),
13368 TAILQ_FIRST(&inodedep->id_extupdt),
13369 TAILQ_FIRST(&inodedep->id_newextupdt));
13370 }
13371
13372 DB_SHOW_COMMAND(inodedep, db_show_inodedep)
13373 {
13374
13375 if (have_addr == 0) {
13376 db_printf("Address required\n");
13377 return;
13378 }
13379 inodedep_print((struct inodedep*)addr, 1);
13380 }
13381
13382 DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
13383 {
13384 struct inodedep_hashhead *inodedephd;
13385 struct inodedep *inodedep;
13386 struct fs *fs;
13387 int cnt;
13388
13389 fs = have_addr ? (struct fs *)addr : NULL;
13390 for (cnt = 0; cnt < inodedep_hash; cnt++) {
13391 inodedephd = &inodedep_hashtbl[cnt];
13392 LIST_FOREACH(inodedep, inodedephd, id_hash) {
13393 if (fs != NULL && fs != inodedep->id_fs)
13394 continue;
13395 inodedep_print(inodedep, 0);
13396 }
13397 }
13398 }
13399
13400 DB_SHOW_COMMAND(worklist, db_show_worklist)
13401 {
13402 struct worklist *wk;
13403
13404 if (have_addr == 0) {
13405 db_printf("Address required\n");
13406 return;
13407 }
13408 wk = (struct worklist *)addr;
13409 db_printf("worklist: %p type %s state 0x%X\n",
13410 wk, TYPENAME(wk->wk_type), wk->wk_state);
13411 }
13412
13413 DB_SHOW_COMMAND(workhead, db_show_workhead)
13414 {
13415 struct workhead *wkhd;
13416 struct worklist *wk;
13417 int i;
13418
13419 if (have_addr == 0) {
13420 db_printf("Address required\n");
13421 return;
13422 }
13423 wkhd = (struct workhead *)addr;
13424 wk = LIST_FIRST(wkhd);
13425 for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
13426 db_printf("worklist: %p type %s state 0x%X",
13427 wk, TYPENAME(wk->wk_type), wk->wk_state);
13428 if (i == 100)
13429 db_printf("workhead overflow");
13430 db_printf("\n");
13431 }
13432
13433
13434 DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
13435 {
13436 struct jaddref *jaddref;
13437 struct diradd *diradd;
13438 struct mkdir *mkdir;
13439
13440 LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
13441 diradd = mkdir->md_diradd;
13442 db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
13443 mkdir, mkdir->md_state, diradd, diradd->da_state);
13444 if ((jaddref = mkdir->md_jaddref) != NULL)
13445 db_printf(" jaddref %p jaddref state 0x%X",
13446 jaddref, jaddref->ja_state);
13447 db_printf("\n");
13448 }
13449 }
13450
13451 #endif /* DDB */
13452
13453 #endif /* SOFTUPDATES */
13454