1 /*- 2 * Copyright 1998, 2000 Marshall Kirk McKusick. 3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org> 4 * All rights reserved. 5 * 6 * The soft updates code is derived from the appendix of a University 7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, 8 * "Soft Updates: A Solution to the Metadata Update Problem in File 9 * Systems", CSE-TR-254-95, August 1995). 10 * 11 * Further information about soft updates can be obtained from: 12 * 13 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 14 * 1614 Oxford Street mckusick@mckusick.com 15 * Berkeley, CA 94709-1608 +1-510-843-9542 16 * USA 17 * 18 * Redistribution and use in source and binary forms, with or without 19 * modification, are permitted provided that the following conditions 20 * are met: 21 * 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, 32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 * 39 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 40 */ 41 42 #include <sys/cdefs.h> 43 __FBSDID("$FreeBSD$"); 44 45 #include "opt_ffs.h" 46 #include "opt_quota.h" 47 #include "opt_ddb.h" 48 49 /* 50 * For now we want the safety net that the DEBUG flag provides. 
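 * With DEBUG defined, the WORKLIST_INSERT/WORKLIST_REMOVE macros below
 * expand to the checked worklist_insert()/worklist_remove() routines,
 * which assert lock ownership and panic if an item's ONWORKLIST state
 * is inconsistent instead of silently corrupting the lists.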
51 */ 52 #ifndef DEBUG 53 #define DEBUG 54 #endif 55 56 #include <sys/param.h> 57 #include <sys/kernel.h> 58 #include <sys/systm.h> 59 #include <sys/bio.h> 60 #include <sys/buf.h> 61 #include <sys/kdb.h> 62 #include <sys/kthread.h> 63 #include <sys/limits.h> 64 #include <sys/lock.h> 65 #include <sys/malloc.h> 66 #include <sys/mount.h> 67 #include <sys/mutex.h> 68 #include <sys/namei.h> 69 #include <sys/priv.h> 70 #include <sys/proc.h> 71 #include <sys/stat.h> 72 #include <sys/sysctl.h> 73 #include <sys/syslog.h> 74 #include <sys/vnode.h> 75 #include <sys/conf.h> 76 77 #include <ufs/ufs/dir.h> 78 #include <ufs/ufs/extattr.h> 79 #include <ufs/ufs/quota.h> 80 #include <ufs/ufs/inode.h> 81 #include <ufs/ufs/ufsmount.h> 82 #include <ufs/ffs/fs.h> 83 #include <ufs/ffs/softdep.h> 84 #include <ufs/ffs/ffs_extern.h> 85 #include <ufs/ufs/ufs_extern.h> 86 87 #include <vm/vm.h> 88 #include <vm/vm_extern.h> 89 #include <vm/vm_object.h> 90 91 #include <ddb/ddb.h> 92 93 #ifndef SOFTUPDATES 94 95 int 96 softdep_flushfiles(oldmnt, flags, td) 97 struct mount *oldmnt; 98 int flags; 99 struct thread *td; 100 { 101 102 panic("softdep_flushfiles called"); 103 } 104 105 int 106 softdep_mount(devvp, mp, fs, cred) 107 struct vnode *devvp; 108 struct mount *mp; 109 struct fs *fs; 110 struct ucred *cred; 111 { 112 113 return (0); 114 } 115 116 void 117 softdep_initialize() 118 { 119 120 return; 121 } 122 123 void 124 softdep_uninitialize() 125 { 126 127 return; 128 } 129 130 void 131 softdep_unmount(mp) 132 struct mount *mp; 133 { 134 135 } 136 137 void 138 softdep_setup_sbupdate(ump, fs, bp) 139 struct ufsmount *ump; 140 struct fs *fs; 141 struct buf *bp; 142 { 143 } 144 145 void 146 softdep_setup_inomapdep(bp, ip, newinum, mode) 147 struct buf *bp; 148 struct inode *ip; 149 ino_t newinum; 150 int mode; 151 { 152 153 panic("softdep_setup_inomapdep called"); 154 } 155 156 void 157 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 158 struct buf *bp; 159 struct mount *mp; 160 ufs2_daddr_t newblkno; 161 int frags; 162 int oldfrags; 163 { 164 165 panic("softdep_setup_blkmapdep called"); 166 } 167 168 void 169 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 170 struct inode *ip; 171 ufs_lbn_t lbn; 172 ufs2_daddr_t newblkno; 173 ufs2_daddr_t oldblkno; 174 long newsize; 175 long oldsize; 176 struct buf *bp; 177 { 178 179 panic("softdep_setup_allocdirect called"); 180 } 181 182 void 183 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 184 struct inode *ip; 185 ufs_lbn_t lbn; 186 ufs2_daddr_t newblkno; 187 ufs2_daddr_t oldblkno; 188 long newsize; 189 long oldsize; 190 struct buf *bp; 191 { 192 193 panic("softdep_setup_allocext called"); 194 } 195 196 void 197 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 198 struct inode *ip; 199 ufs_lbn_t lbn; 200 struct buf *bp; 201 int ptrno; 202 ufs2_daddr_t newblkno; 203 ufs2_daddr_t oldblkno; 204 struct buf *nbp; 205 { 206 207 panic("softdep_setup_allocindir_page called"); 208 } 209 210 void 211 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 212 struct buf *nbp; 213 struct inode *ip; 214 struct buf *bp; 215 int ptrno; 216 ufs2_daddr_t newblkno; 217 { 218 219 panic("softdep_setup_allocindir_meta called"); 220 } 221 222 void 223 softdep_journal_freeblocks(ip, cred, length, flags) 224 struct inode *ip; 225 struct ucred *cred; 226 off_t length; 227 int flags; 228 { 229 230 panic("softdep_journal_freeblocks called"); 231 } 232 233 void 234 softdep_journal_fsync(ip) 235 struct inode *ip; 
236 { 237 238 panic("softdep_journal_fsync called"); 239 } 240 241 void 242 softdep_setup_freeblocks(ip, length, flags) 243 struct inode *ip; 244 off_t length; 245 int flags; 246 { 247 248 panic("softdep_setup_freeblocks called"); 249 } 250 251 void 252 softdep_freefile(pvp, ino, mode) 253 struct vnode *pvp; 254 ino_t ino; 255 int mode; 256 { 257 258 panic("softdep_freefile called"); 259 } 260 261 int 262 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 263 struct buf *bp; 264 struct inode *dp; 265 off_t diroffset; 266 ino_t newinum; 267 struct buf *newdirbp; 268 int isnewblk; 269 { 270 271 panic("softdep_setup_directory_add called"); 272 } 273 274 void 275 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 276 struct buf *bp; 277 struct inode *dp; 278 caddr_t base; 279 caddr_t oldloc; 280 caddr_t newloc; 281 int entrysize; 282 { 283 284 panic("softdep_change_directoryentry_offset called"); 285 } 286 287 void 288 softdep_setup_remove(bp, dp, ip, isrmdir) 289 struct buf *bp; 290 struct inode *dp; 291 struct inode *ip; 292 int isrmdir; 293 { 294 295 panic("softdep_setup_remove called"); 296 } 297 298 void 299 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 300 struct buf *bp; 301 struct inode *dp; 302 struct inode *ip; 303 ino_t newinum; 304 int isrmdir; 305 { 306 307 panic("softdep_setup_directory_change called"); 308 } 309 310 void 311 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 312 struct mount *mp; 313 struct buf *bp; 314 ufs2_daddr_t blkno; 315 int frags; 316 struct workhead *wkhd; 317 { 318 319 panic("%s called", __FUNCTION__); 320 } 321 322 void 323 softdep_setup_inofree(mp, bp, ino, wkhd) 324 struct mount *mp; 325 struct buf *bp; 326 ino_t ino; 327 struct workhead *wkhd; 328 { 329 330 panic("%s called", __FUNCTION__); 331 } 332 333 void 334 softdep_setup_unlink(dp, ip) 335 struct inode *dp; 336 struct inode *ip; 337 { 338 339 panic("%s called", __FUNCTION__); 340 } 341 342 void 343 softdep_setup_link(dp, ip) 344 struct inode *dp; 345 struct inode *ip; 346 { 347 348 panic("%s called", __FUNCTION__); 349 } 350 351 void 352 softdep_revert_link(dp, ip) 353 struct inode *dp; 354 struct inode *ip; 355 { 356 357 panic("%s called", __FUNCTION__); 358 } 359 360 void 361 softdep_setup_rmdir(dp, ip) 362 struct inode *dp; 363 struct inode *ip; 364 { 365 366 panic("%s called", __FUNCTION__); 367 } 368 369 void 370 softdep_revert_rmdir(dp, ip) 371 struct inode *dp; 372 struct inode *ip; 373 { 374 375 panic("%s called", __FUNCTION__); 376 } 377 378 void 379 softdep_setup_create(dp, ip) 380 struct inode *dp; 381 struct inode *ip; 382 { 383 384 panic("%s called", __FUNCTION__); 385 } 386 387 void 388 softdep_revert_create(dp, ip) 389 struct inode *dp; 390 struct inode *ip; 391 { 392 393 panic("%s called", __FUNCTION__); 394 } 395 396 void 397 softdep_setup_mkdir(dp, ip) 398 struct inode *dp; 399 struct inode *ip; 400 { 401 402 panic("%s called", __FUNCTION__); 403 } 404 405 void 406 softdep_revert_mkdir(dp, ip) 407 struct inode *dp; 408 struct inode *ip; 409 { 410 411 panic("%s called", __FUNCTION__); 412 } 413 414 void 415 softdep_setup_dotdot_link(dp, ip) 416 struct inode *dp; 417 struct inode *ip; 418 { 419 420 panic("%s called", __FUNCTION__); 421 } 422 423 int 424 softdep_prealloc(vp, waitok) 425 struct vnode *vp; 426 int waitok; 427 { 428 429 panic("%s called", __FUNCTION__); 430 431 return (0); 432 } 433 434 int 435 softdep_journal_lookup(mp, vpp) 436 struct mount *mp; 437 struct vnode **vpp; 438 { 439 440 return 
(ENOENT); 441 } 442 443 void 444 softdep_change_linkcnt(ip) 445 struct inode *ip; 446 { 447 448 panic("softdep_change_linkcnt called"); 449 } 450 451 void 452 softdep_load_inodeblock(ip) 453 struct inode *ip; 454 { 455 456 panic("softdep_load_inodeblock called"); 457 } 458 459 void 460 softdep_update_inodeblock(ip, bp, waitfor) 461 struct inode *ip; 462 struct buf *bp; 463 int waitfor; 464 { 465 466 panic("softdep_update_inodeblock called"); 467 } 468 469 int 470 softdep_fsync(vp) 471 struct vnode *vp; /* the "in_core" copy of the inode */ 472 { 473 474 return (0); 475 } 476 477 void 478 softdep_fsync_mountdev(vp) 479 struct vnode *vp; 480 { 481 482 return; 483 } 484 485 int 486 softdep_flushworklist(oldmnt, countp, td) 487 struct mount *oldmnt; 488 int *countp; 489 struct thread *td; 490 { 491 492 *countp = 0; 493 return (0); 494 } 495 496 int 497 softdep_sync_metadata(struct vnode *vp) 498 { 499 500 return (0); 501 } 502 503 int 504 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) 505 { 506 507 return (0); 508 } 509 510 int 511 softdep_slowdown(vp) 512 struct vnode *vp; 513 { 514 515 panic("softdep_slowdown called"); 516 } 517 518 void 519 softdep_releasefile(ip) 520 struct inode *ip; /* inode with the zero effective link count */ 521 { 522 523 panic("softdep_releasefile called"); 524 } 525 526 int 527 softdep_request_cleanup(fs, vp, cred, resource) 528 struct fs *fs; 529 struct vnode *vp; 530 struct ucred *cred; 531 int resource; 532 { 533 534 return (0); 535 } 536 537 int 538 softdep_check_suspend(struct mount *mp, 539 struct vnode *devvp, 540 int softdep_deps, 541 int softdep_accdeps, 542 int secondary_writes, 543 int secondary_accwrites) 544 { 545 struct bufobj *bo; 546 int error; 547 548 (void) softdep_deps, 549 (void) softdep_accdeps; 550 551 bo = &devvp->v_bufobj; 552 ASSERT_BO_LOCKED(bo); 553 554 MNT_ILOCK(mp); 555 while (mp->mnt_secondary_writes != 0) { 556 BO_UNLOCK(bo); 557 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), 558 (PUSER - 1) | PDROP, "secwr", 0); 559 BO_LOCK(bo); 560 MNT_ILOCK(mp); 561 } 562 563 /* 564 * Reasons for needing more work before suspend: 565 * - Dirty buffers on devvp. 566 * - Secondary writes occurred after start of vnode sync loop 567 */ 568 error = 0; 569 if (bo->bo_numoutput > 0 || 570 bo->bo_dirty.bv_cnt > 0 || 571 secondary_writes != 0 || 572 mp->mnt_secondary_writes != 0 || 573 secondary_accwrites != mp->mnt_secondary_accwrites) 574 error = EAGAIN; 575 BO_UNLOCK(bo); 576 return (error); 577 } 578 579 void 580 softdep_get_depcounts(struct mount *mp, 581 int *softdepactivep, 582 int *softdepactiveaccp) 583 { 584 (void) mp; 585 *softdepactivep = 0; 586 *softdepactiveaccp = 0; 587 } 588 589 void 590 softdep_buf_append(bp, wkhd) 591 struct buf *bp; 592 struct workhead *wkhd; 593 { 594 595 panic("softdep_buf_appendwork called"); 596 } 597 598 void 599 softdep_inode_append(ip, cred, wkhd) 600 struct inode *ip; 601 struct ucred *cred; 602 struct workhead *wkhd; 603 { 604 605 panic("softdep_inode_appendwork called"); 606 } 607 608 void 609 softdep_freework(wkhd) 610 struct workhead *wkhd; 611 { 612 613 panic("softdep_freework called"); 614 } 615 616 #else 617 618 FEATURE(softupdates, "FFS soft-updates support"); 619 620 /* 621 * These definitions need to be adapted to the system to which 622 * this file is being ported. 
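 * Note that the D_* workitem type numbers defined below index the
 * dep_current[], dep_total[] and dep_write[] statistics arrays as well
 * as the memtype[] table, so all of them must be kept in the same order.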
623 */ 624 625 #define M_SOFTDEP_FLAGS (M_WAITOK) 626 627 #define D_PAGEDEP 0 628 #define D_INODEDEP 1 629 #define D_BMSAFEMAP 2 630 #define D_NEWBLK 3 631 #define D_ALLOCDIRECT 4 632 #define D_INDIRDEP 5 633 #define D_ALLOCINDIR 6 634 #define D_FREEFRAG 7 635 #define D_FREEBLKS 8 636 #define D_FREEFILE 9 637 #define D_DIRADD 10 638 #define D_MKDIR 11 639 #define D_DIRREM 12 640 #define D_NEWDIRBLK 13 641 #define D_FREEWORK 14 642 #define D_FREEDEP 15 643 #define D_JADDREF 16 644 #define D_JREMREF 17 645 #define D_JMVREF 18 646 #define D_JNEWBLK 19 647 #define D_JFREEBLK 20 648 #define D_JFREEFRAG 21 649 #define D_JSEG 22 650 #define D_JSEGDEP 23 651 #define D_SBDEP 24 652 #define D_JTRUNC 25 653 #define D_JFSYNC 26 654 #define D_SENTINAL 27 655 #define D_LAST D_SENTINAL 656 657 unsigned long dep_current[D_LAST + 1]; 658 unsigned long dep_total[D_LAST + 1]; 659 unsigned long dep_write[D_LAST + 1]; 660 661 662 static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, 663 "soft updates stats"); 664 static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, 665 "total dependencies allocated"); 666 static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, 667 "current dependencies allocated"); 668 static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0, 669 "current dependencies written"); 670 671 #define SOFTDEP_TYPE(type, str, long) \ 672 static MALLOC_DEFINE(M_ ## type, #str, long); \ 673 SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ 674 &dep_total[D_ ## type], 0, ""); \ 675 SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ 676 &dep_current[D_ ## type], 0, ""); \ 677 SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, \ 678 &dep_write[D_ ## type], 0, ""); 679 680 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); 681 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); 682 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, 683 "Block or frag allocated from cyl group map"); 684 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); 685 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); 686 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); 687 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); 688 SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); 689 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); 690 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); 691 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); 692 SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); 693 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); 694 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); 695 SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); 696 SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); 697 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); 698 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); 699 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); 700 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); 701 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); 702 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); 703 SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); 704 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); 705 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); 706 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); 707 SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete"); 708 709 static MALLOC_DEFINE(M_SAVEDINO, 
"savedino", "Saved inodes"); 710 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); 711 712 /* 713 * translate from workitem type to memory type 714 * MUST match the defines above, such that memtype[D_XXX] == M_XXX 715 */ 716 static struct malloc_type *memtype[] = { 717 M_PAGEDEP, 718 M_INODEDEP, 719 M_BMSAFEMAP, 720 M_NEWBLK, 721 M_ALLOCDIRECT, 722 M_INDIRDEP, 723 M_ALLOCINDIR, 724 M_FREEFRAG, 725 M_FREEBLKS, 726 M_FREEFILE, 727 M_DIRADD, 728 M_MKDIR, 729 M_DIRREM, 730 M_NEWDIRBLK, 731 M_FREEWORK, 732 M_FREEDEP, 733 M_JADDREF, 734 M_JREMREF, 735 M_JMVREF, 736 M_JNEWBLK, 737 M_JFREEBLK, 738 M_JFREEFRAG, 739 M_JSEG, 740 M_JSEGDEP, 741 M_SBDEP, 742 M_JTRUNC, 743 M_JFSYNC 744 }; 745 746 static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd; 747 748 #define DtoM(type) (memtype[type]) 749 750 /* 751 * Names of malloc types. 752 */ 753 #define TYPENAME(type) \ 754 ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???") 755 /* 756 * End system adaptation definitions. 757 */ 758 759 #define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) 760 #define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) 761 762 /* 763 * Forward declarations. 764 */ 765 struct inodedep_hashhead; 766 struct newblk_hashhead; 767 struct pagedep_hashhead; 768 struct bmsafemap_hashhead; 769 770 /* 771 * Internal function prototypes. 772 */ 773 static void softdep_error(char *, int); 774 static void drain_output(struct vnode *); 775 static struct buf *getdirtybuf(struct buf *, struct mtx *, int); 776 static void clear_remove(struct thread *); 777 static void clear_inodedeps(struct thread *); 778 static void unlinked_inodedep(struct mount *, struct inodedep *); 779 static void clear_unlinked_inodedep(struct inodedep *); 780 static struct inodedep *first_unlinked_inodedep(struct ufsmount *); 781 static int flush_pagedep_deps(struct vnode *, struct mount *, 782 struct diraddhd *); 783 static int free_pagedep(struct pagedep *); 784 static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); 785 static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t); 786 static int flush_deplist(struct allocdirectlst *, int, int *); 787 static int sync_cgs(struct mount *, int); 788 static int handle_written_filepage(struct pagedep *, struct buf *); 789 static int handle_written_sbdep(struct sbdep *, struct buf *); 790 static void initiate_write_sbdep(struct sbdep *); 791 static void diradd_inode_written(struct diradd *, struct inodedep *); 792 static int handle_written_indirdep(struct indirdep *, struct buf *, 793 struct buf**); 794 static int handle_written_inodeblock(struct inodedep *, struct buf *); 795 static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *, 796 uint8_t *); 797 static int handle_written_bmsafemap(struct bmsafemap *, struct buf *); 798 static void handle_written_jaddref(struct jaddref *); 799 static void handle_written_jremref(struct jremref *); 800 static void handle_written_jseg(struct jseg *, struct buf *); 801 static void handle_written_jnewblk(struct jnewblk *); 802 static void handle_written_jblkdep(struct jblkdep *); 803 static void handle_written_jfreefrag(struct jfreefrag *); 804 static void complete_jseg(struct jseg *); 805 static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *); 806 static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); 807 static void jremref_write(struct jremref *, struct jseg *, uint8_t *); 808 static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *); 809 static void 
jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *); 810 static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data); 811 static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *); 812 static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *); 813 static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *); 814 static inline void inoref_write(struct inoref *, struct jseg *, 815 struct jrefrec *); 816 static void handle_allocdirect_partdone(struct allocdirect *, 817 struct workhead *); 818 static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *, 819 struct workhead *); 820 static void indirdep_complete(struct indirdep *); 821 static int indirblk_lookup(struct mount *, ufs2_daddr_t); 822 static void indirblk_insert(struct freework *); 823 static void indirblk_remove(struct freework *); 824 static void handle_allocindir_partdone(struct allocindir *); 825 static void initiate_write_filepage(struct pagedep *, struct buf *); 826 static void initiate_write_indirdep(struct indirdep*, struct buf *); 827 static void handle_written_mkdir(struct mkdir *, int); 828 static int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *, 829 uint8_t *); 830 static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); 831 static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); 832 static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); 833 static void handle_workitem_freefile(struct freefile *); 834 static int handle_workitem_remove(struct dirrem *, int); 835 static struct dirrem *newdirrem(struct buf *, struct inode *, 836 struct inode *, int, struct dirrem **); 837 static struct indirdep *indirdep_lookup(struct mount *, struct inode *, 838 struct buf *); 839 static void cancel_indirdep(struct indirdep *, struct buf *, 840 struct freeblks *); 841 static void free_indirdep(struct indirdep *); 842 static void free_diradd(struct diradd *, struct workhead *); 843 static void merge_diradd(struct inodedep *, struct diradd *); 844 static void complete_diradd(struct diradd *); 845 static struct diradd *diradd_lookup(struct pagedep *, int); 846 static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, 847 struct jremref *); 848 static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, 849 struct jremref *); 850 static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, 851 struct jremref *, struct jremref *); 852 static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, 853 struct jremref *); 854 static void cancel_allocindir(struct allocindir *, struct buf *bp, 855 struct freeblks *, int); 856 static int setup_trunc_indir(struct freeblks *, struct inode *, 857 ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t); 858 static void complete_trunc_indir(struct freework *); 859 static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *, 860 int); 861 static void complete_mkdir(struct mkdir *); 862 static void free_newdirblk(struct newdirblk *); 863 static void free_jremref(struct jremref *); 864 static void free_jaddref(struct jaddref *); 865 static void free_jsegdep(struct jsegdep *); 866 static void free_jsegs(struct jblocks *); 867 static void rele_jseg(struct jseg *); 868 static void free_jseg(struct jseg *, struct jblocks *); 869 static void free_jnewblk(struct jnewblk *); 870 static void free_jblkdep(struct jblkdep *); 871 static void free_jfreefrag(struct jfreefrag *); 872 static void free_freedep(struct 
freedep *); 873 static void journal_jremref(struct dirrem *, struct jremref *, 874 struct inodedep *); 875 static void cancel_jnewblk(struct jnewblk *, struct workhead *); 876 static int cancel_jaddref(struct jaddref *, struct inodedep *, 877 struct workhead *); 878 static void cancel_jfreefrag(struct jfreefrag *); 879 static inline void setup_freedirect(struct freeblks *, struct inode *, 880 int, int); 881 static inline void setup_freeext(struct freeblks *, struct inode *, int, int); 882 static inline void setup_freeindir(struct freeblks *, struct inode *, int, 883 ufs_lbn_t, int); 884 static inline struct freeblks *newfreeblks(struct mount *, struct inode *); 885 static void freeblks_free(struct ufsmount *, struct freeblks *, int); 886 static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); 887 ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t); 888 static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int); 889 static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t, 890 int, int); 891 static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int); 892 static int cancel_pagedep(struct pagedep *, struct freeblks *, int); 893 static int deallocate_dependencies(struct buf *, struct freeblks *, int); 894 static void newblk_freefrag(struct newblk*); 895 static void free_newblk(struct newblk *); 896 static void cancel_allocdirect(struct allocdirectlst *, 897 struct allocdirect *, struct freeblks *); 898 static int check_inode_unwritten(struct inodedep *); 899 static int free_inodedep(struct inodedep *); 900 static void freework_freeblock(struct freework *); 901 static void freework_enqueue(struct freework *); 902 static int handle_workitem_freeblocks(struct freeblks *, int); 903 static int handle_complete_freeblocks(struct freeblks *, int); 904 static void handle_workitem_indirblk(struct freework *); 905 static void handle_written_freework(struct freework *); 906 static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); 907 static struct worklist *jnewblk_merge(struct worklist *, struct worklist *, 908 struct workhead *); 909 static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *, 910 struct inodedep *, struct allocindir *, ufs_lbn_t); 911 static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, 912 ufs2_daddr_t, ufs_lbn_t); 913 static void handle_workitem_freefrag(struct freefrag *); 914 static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, 915 ufs_lbn_t); 916 static void allocdirect_merge(struct allocdirectlst *, 917 struct allocdirect *, struct allocdirect *); 918 static struct freefrag *allocindir_merge(struct allocindir *, 919 struct allocindir *); 920 static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int, 921 struct bmsafemap **); 922 static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, 923 int cg); 924 static int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t, 925 int, struct newblk **); 926 static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); 927 static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, 928 struct inodedep **); 929 static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); 930 static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t, 931 int, struct pagedep **); 932 static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, 933 struct mount *mp, int, struct pagedep **); 934 static void 
pause_timer(void *); 935 static int request_cleanup(struct mount *, int); 936 static int process_worklist_item(struct mount *, int, int); 937 static void process_removes(struct vnode *); 938 static void process_truncates(struct vnode *); 939 static void jwork_move(struct workhead *, struct workhead *); 940 static void jwork_insert(struct workhead *, struct jsegdep *); 941 static void add_to_worklist(struct worklist *, int); 942 static void wake_worklist(struct worklist *); 943 static void wait_worklist(struct worklist *, char *); 944 static void remove_from_worklist(struct worklist *); 945 static void softdep_flush(void); 946 static void softdep_flushjournal(struct mount *); 947 static int softdep_speedup(void); 948 static void worklist_speedup(void); 949 static int journal_mount(struct mount *, struct fs *, struct ucred *); 950 static void journal_unmount(struct mount *); 951 static int journal_space(struct ufsmount *, int); 952 static void journal_suspend(struct ufsmount *); 953 static int journal_unsuspend(struct ufsmount *ump); 954 static void softdep_prelink(struct vnode *, struct vnode *); 955 static void add_to_journal(struct worklist *); 956 static void remove_from_journal(struct worklist *); 957 static void softdep_process_journal(struct mount *, struct worklist *, int); 958 static struct jremref *newjremref(struct dirrem *, struct inode *, 959 struct inode *ip, off_t, nlink_t); 960 static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, 961 uint16_t); 962 static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, 963 uint16_t); 964 static inline struct jsegdep *inoref_jseg(struct inoref *); 965 static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); 966 static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, 967 ufs2_daddr_t, int); 968 static struct jtrunc *newjtrunc(struct freeblks *, off_t, int); 969 static void move_newblock_dep(struct jaddref *, struct inodedep *); 970 static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t); 971 static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, 972 ufs2_daddr_t, long, ufs_lbn_t); 973 static struct freework *newfreework(struct ufsmount *, struct freeblks *, 974 struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int); 975 static int jwait(struct worklist *, int); 976 static struct inodedep *inodedep_lookup_ip(struct inode *); 977 static int bmsafemap_rollbacks(struct bmsafemap *); 978 static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); 979 static void handle_jwork(struct workhead *); 980 static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, 981 struct mkdir **); 982 static struct jblocks *jblocks_create(void); 983 static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); 984 static void jblocks_free(struct jblocks *, struct mount *, int); 985 static void jblocks_destroy(struct jblocks *); 986 static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); 987 988 /* 989 * Exported softdep operations. 
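 * These four routines are the buffer cache callbacks used for buffers
 * that carry soft dependency work items on their b_dep list: they are
 * invoked when a write is initiated, when it completes, when the
 * dependencies on a buffer must be discarded, and when the number of
 * dependencies on a buffer must be counted.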
990 */ 991 static void softdep_disk_io_initiation(struct buf *); 992 static void softdep_disk_write_complete(struct buf *); 993 static void softdep_deallocate_dependencies(struct buf *); 994 static int softdep_count_dependencies(struct buf *bp, int); 995 996 static struct mtx lk; 997 MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF); 998 999 #define TRY_ACQUIRE_LOCK(lk) mtx_trylock(lk) 1000 #define ACQUIRE_LOCK(lk) mtx_lock(lk) 1001 #define FREE_LOCK(lk) mtx_unlock(lk) 1002 1003 #define BUF_AREC(bp) lockallowrecurse(&(bp)->b_lock) 1004 #define BUF_NOREC(bp) lockdisablerecurse(&(bp)->b_lock) 1005 1006 /* 1007 * Worklist queue management. 1008 * These routines require that the lock be held. 1009 */ 1010 #ifndef /* NOT */ DEBUG 1011 #define WORKLIST_INSERT(head, item) do { \ 1012 (item)->wk_state |= ONWORKLIST; \ 1013 LIST_INSERT_HEAD(head, item, wk_list); \ 1014 } while (0) 1015 #define WORKLIST_REMOVE(item) do { \ 1016 (item)->wk_state &= ~ONWORKLIST; \ 1017 LIST_REMOVE(item, wk_list); \ 1018 } while (0) 1019 #define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT 1020 #define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE 1021 1022 #else /* DEBUG */ 1023 static void worklist_insert(struct workhead *, struct worklist *, int); 1024 static void worklist_remove(struct worklist *, int); 1025 1026 #define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1) 1027 #define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0) 1028 #define WORKLIST_REMOVE(item) worklist_remove(item, 1) 1029 #define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0) 1030 1031 static void 1032 worklist_insert(head, item, locked) 1033 struct workhead *head; 1034 struct worklist *item; 1035 int locked; 1036 { 1037 1038 if (locked) 1039 mtx_assert(&lk, MA_OWNED); 1040 if (item->wk_state & ONWORKLIST) 1041 panic("worklist_insert: %p %s(0x%X) already on list", 1042 item, TYPENAME(item->wk_type), item->wk_state); 1043 item->wk_state |= ONWORKLIST; 1044 LIST_INSERT_HEAD(head, item, wk_list); 1045 } 1046 1047 static void 1048 worklist_remove(item, locked) 1049 struct worklist *item; 1050 int locked; 1051 { 1052 1053 if (locked) 1054 mtx_assert(&lk, MA_OWNED); 1055 if ((item->wk_state & ONWORKLIST) == 0) 1056 panic("worklist_remove: %p %s(0x%X) not on list", 1057 item, TYPENAME(item->wk_type), item->wk_state); 1058 item->wk_state &= ~ONWORKLIST; 1059 LIST_REMOVE(item, wk_list); 1060 } 1061 #endif /* DEBUG */ 1062 1063 /* 1064 * Merge two jsegdeps keeping only the oldest one as newer references 1065 * can't be discarded until after older references. 1066 */ 1067 static inline struct jsegdep * 1068 jsegdep_merge(struct jsegdep *one, struct jsegdep *two) 1069 { 1070 struct jsegdep *swp; 1071 1072 if (two == NULL) 1073 return (one); 1074 1075 if (one->jd_seg->js_seq > two->jd_seg->js_seq) { 1076 swp = one; 1077 one = two; 1078 two = swp; 1079 } 1080 WORKLIST_REMOVE(&two->jd_list); 1081 free_jsegdep(two); 1082 1083 return (one); 1084 } 1085 1086 /* 1087 * If two freedeps are compatible free one to reduce list size. 1088 */ 1089 static inline struct freedep * 1090 freedep_merge(struct freedep *one, struct freedep *two) 1091 { 1092 if (two == NULL) 1093 return (one); 1094 1095 if (one->fd_freework == two->fd_freework) { 1096 WORKLIST_REMOVE(&two->fd_list); 1097 free_freedep(two); 1098 } 1099 return (one); 1100 } 1101 1102 /* 1103 * Move journal work from one list to another. Duplicate freedeps and 1104 * jsegdeps are coalesced to keep the lists as small as possible. 
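 * Only the oldest jsegdep on a list needs to be retained, since newer
 * journal references cannot be released before older ones (see
 * jsegdep_merge() above); freedeps that refer to the same freework are
 * likewise collapsed into one (see freedep_merge()).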
1105 */ 1106 static void 1107 jwork_move(dst, src) 1108 struct workhead *dst; 1109 struct workhead *src; 1110 { 1111 struct freedep *freedep; 1112 struct jsegdep *jsegdep; 1113 struct worklist *wkn; 1114 struct worklist *wk; 1115 1116 KASSERT(dst != src, 1117 ("jwork_move: dst == src")); 1118 freedep = NULL; 1119 jsegdep = NULL; 1120 LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { 1121 if (wk->wk_type == D_JSEGDEP) 1122 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); 1123 if (wk->wk_type == D_FREEDEP) 1124 freedep = freedep_merge(WK_FREEDEP(wk), freedep); 1125 } 1126 1127 mtx_assert(&lk, MA_OWNED); 1128 while ((wk = LIST_FIRST(src)) != NULL) { 1129 WORKLIST_REMOVE(wk); 1130 WORKLIST_INSERT(dst, wk); 1131 if (wk->wk_type == D_JSEGDEP) { 1132 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); 1133 continue; 1134 } 1135 if (wk->wk_type == D_FREEDEP) 1136 freedep = freedep_merge(WK_FREEDEP(wk), freedep); 1137 } 1138 } 1139 1140 static void 1141 jwork_insert(dst, jsegdep) 1142 struct workhead *dst; 1143 struct jsegdep *jsegdep; 1144 { 1145 struct jsegdep *jsegdepn; 1146 struct worklist *wk; 1147 1148 LIST_FOREACH(wk, dst, wk_list) 1149 if (wk->wk_type == D_JSEGDEP) 1150 break; 1151 if (wk == NULL) { 1152 WORKLIST_INSERT(dst, &jsegdep->jd_list); 1153 return; 1154 } 1155 jsegdepn = WK_JSEGDEP(wk); 1156 if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) { 1157 WORKLIST_REMOVE(wk); 1158 free_jsegdep(jsegdepn); 1159 WORKLIST_INSERT(dst, &jsegdep->jd_list); 1160 } else 1161 free_jsegdep(jsegdep); 1162 } 1163 1164 /* 1165 * Routines for tracking and managing workitems. 1166 */ 1167 static void workitem_free(struct worklist *, int); 1168 static void workitem_alloc(struct worklist *, int, struct mount *); 1169 1170 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type)) 1171 1172 static void 1173 workitem_free(item, type) 1174 struct worklist *item; 1175 int type; 1176 { 1177 struct ufsmount *ump; 1178 mtx_assert(&lk, MA_OWNED); 1179 1180 #ifdef DEBUG 1181 if (item->wk_state & ONWORKLIST) 1182 panic("workitem_free: %s(0x%X) still on list", 1183 TYPENAME(item->wk_type), item->wk_state); 1184 if (item->wk_type != type) 1185 panic("workitem_free: type mismatch %s != %s", 1186 TYPENAME(item->wk_type), TYPENAME(type)); 1187 #endif 1188 if (item->wk_state & IOWAITING) 1189 wakeup(item); 1190 ump = VFSTOUFS(item->wk_mp); 1191 if (--ump->softdep_deps == 0 && ump->softdep_req) 1192 wakeup(&ump->softdep_deps); 1193 dep_current[type]--; 1194 free(item, DtoM(type)); 1195 } 1196 1197 static void 1198 workitem_alloc(item, type, mp) 1199 struct worklist *item; 1200 int type; 1201 struct mount *mp; 1202 { 1203 struct ufsmount *ump; 1204 1205 item->wk_type = type; 1206 item->wk_mp = mp; 1207 item->wk_state = 0; 1208 1209 ump = VFSTOUFS(mp); 1210 ACQUIRE_LOCK(&lk); 1211 dep_current[type]++; 1212 dep_total[type]++; 1213 ump->softdep_deps++; 1214 ump->softdep_accdeps++; 1215 FREE_LOCK(&lk); 1216 } 1217 1218 /* 1219 * Workitem queue management 1220 */ 1221 static int max_softdeps; /* maximum number of structs before slowdown */ 1222 static int maxindirdeps = 50; /* max number of indirdeps before slowdown */ 1223 static int tickdelay = 2; /* number of ticks to pause during slowdown */ 1224 static int proc_waiting; /* tracks whether we have a timeout posted */ 1225 static int *stat_countp; /* statistic to count in proc_waiting timeout */ 1226 static struct callout softdep_callout; 1227 static int req_pending; 1228 static int req_clear_inodedeps; /* syncer process flush some inodedeps */ 1229 static 
int req_clear_remove; /* syncer process flush some freeblks */ 1230 1231 /* 1232 * runtime statistics 1233 */ 1234 static int stat_worklist_push; /* number of worklist cleanups */ 1235 static int stat_blk_limit_push; /* number of times block limit neared */ 1236 static int stat_ino_limit_push; /* number of times inode limit neared */ 1237 static int stat_blk_limit_hit; /* number of times block slowdown imposed */ 1238 static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ 1239 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ 1240 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ 1241 static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ 1242 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ 1243 static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ 1244 static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */ 1245 static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */ 1246 static int stat_journal_min; /* Times hit journal min threshold */ 1247 static int stat_journal_low; /* Times hit journal low threshold */ 1248 static int stat_journal_wait; /* Times blocked in jwait(). */ 1249 static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */ 1250 static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */ 1251 static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */ 1252 static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */ 1253 static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */ 1254 static int stat_cleanup_blkrequests; /* Number of block cleanup requests */ 1255 static int stat_cleanup_inorequests; /* Number of inode cleanup requests */ 1256 static int stat_cleanup_retries; /* Number of cleanups that needed to flush */ 1257 static int stat_cleanup_failures; /* Number of cleanup requests that failed */ 1258 1259 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, 1260 &max_softdeps, 0, ""); 1261 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, 1262 &tickdelay, 0, ""); 1263 SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW, 1264 &maxindirdeps, 0, ""); 1265 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW, 1266 &stat_worklist_push, 0,""); 1267 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW, 1268 &stat_blk_limit_push, 0,""); 1269 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW, 1270 &stat_ino_limit_push, 0,""); 1271 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW, 1272 &stat_blk_limit_hit, 0, ""); 1273 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW, 1274 &stat_ino_limit_hit, 0, ""); 1275 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW, 1276 &stat_sync_limit_hit, 0, ""); 1277 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, 1278 &stat_indir_blk_ptrs, 0, ""); 1279 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW, 1280 &stat_inode_bitmap, 0, ""); 1281 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, 1282 &stat_direct_blk_ptrs, 0, ""); 1283 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW, 1284 &stat_dir_entry, 0, ""); 1285 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW, 1286 &stat_jaddref, 0, ""); 1287 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW, 1288 &stat_jnewblk, 0, ""); 1289 
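/* Journal space, jwait() blocking and cleanup request statistics. */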
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW, 1290 &stat_journal_low, 0, ""); 1291 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW, 1292 &stat_journal_min, 0, ""); 1293 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW, 1294 &stat_journal_wait, 0, ""); 1295 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW, 1296 &stat_jwait_filepage, 0, ""); 1297 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW, 1298 &stat_jwait_freeblks, 0, ""); 1299 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW, 1300 &stat_jwait_inode, 0, ""); 1301 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW, 1302 &stat_jwait_newblk, 0, ""); 1303 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW, 1304 &stat_cleanup_blkrequests, 0, ""); 1305 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW, 1306 &stat_cleanup_inorequests, 0, ""); 1307 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW, 1308 &stat_cleanup_high_delay, 0, ""); 1309 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW, 1310 &stat_cleanup_retries, 0, ""); 1311 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW, 1312 &stat_cleanup_failures, 0, ""); 1313 1314 SYSCTL_DECL(_vfs_ffs); 1315 1316 LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl; 1317 static u_long bmsafemap_hash; /* size of hash table - 1 */ 1318 1319 static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */ 1320 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, 1321 &compute_summary_at_mount, 0, "Recompute summary at mount"); 1322 1323 static struct proc *softdepproc; 1324 static struct kproc_desc softdep_kp = { 1325 "softdepflush", 1326 softdep_flush, 1327 &softdepproc 1328 }; 1329 SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, 1330 &softdep_kp); 1331 1332 static void 1333 softdep_flush(void) 1334 { 1335 struct mount *nmp; 1336 struct mount *mp; 1337 struct ufsmount *ump; 1338 struct thread *td; 1339 int remaining; 1340 int progress; 1341 int vfslocked; 1342 1343 td = curthread; 1344 td->td_pflags |= TDP_NORUNNINGBUF; 1345 1346 for (;;) { 1347 kproc_suspend_check(softdepproc); 1348 vfslocked = VFS_LOCK_GIANT((struct mount *)NULL); 1349 ACQUIRE_LOCK(&lk); 1350 /* 1351 * If requested, try removing inode or removal dependencies. 
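 * The req_clear_inodedeps and req_clear_remove requests are raised when
 * the number of dependencies grows too large (see request_cleanup());
 * each request is acknowledged here and any process sleeping on
 * proc_waiting is woken up.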
1352 */ 1353 if (req_clear_inodedeps) { 1354 clear_inodedeps(td); 1355 req_clear_inodedeps -= 1; 1356 wakeup_one(&proc_waiting); 1357 } 1358 if (req_clear_remove) { 1359 clear_remove(td); 1360 req_clear_remove -= 1; 1361 wakeup_one(&proc_waiting); 1362 } 1363 FREE_LOCK(&lk); 1364 VFS_UNLOCK_GIANT(vfslocked); 1365 remaining = progress = 0; 1366 mtx_lock(&mountlist_mtx); 1367 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 1368 nmp = TAILQ_NEXT(mp, mnt_list); 1369 if (MOUNTEDSOFTDEP(mp) == 0) 1370 continue; 1371 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 1372 continue; 1373 vfslocked = VFS_LOCK_GIANT(mp); 1374 progress += softdep_process_worklist(mp, 0); 1375 ump = VFSTOUFS(mp); 1376 remaining += ump->softdep_on_worklist; 1377 VFS_UNLOCK_GIANT(vfslocked); 1378 mtx_lock(&mountlist_mtx); 1379 nmp = TAILQ_NEXT(mp, mnt_list); 1380 vfs_unbusy(mp); 1381 } 1382 mtx_unlock(&mountlist_mtx); 1383 if (remaining && progress) 1384 continue; 1385 ACQUIRE_LOCK(&lk); 1386 if (!req_pending) 1387 msleep(&req_pending, &lk, PVM, "sdflush", hz); 1388 req_pending = 0; 1389 FREE_LOCK(&lk); 1390 } 1391 } 1392 1393 static void 1394 worklist_speedup(void) 1395 { 1396 mtx_assert(&lk, MA_OWNED); 1397 if (req_pending == 0) { 1398 req_pending = 1; 1399 wakeup(&req_pending); 1400 } 1401 } 1402 1403 static int 1404 softdep_speedup(void) 1405 { 1406 1407 worklist_speedup(); 1408 bd_speedup(); 1409 return speedup_syncer(); 1410 } 1411 1412 /* 1413 * Add an item to the end of the work queue. 1414 * This routine requires that the lock be held. 1415 * This is the only routine that adds items to the list. 1416 * The following routine is the only one that removes items 1417 * and does so in order from first to last. 1418 */ 1419 1420 #define WK_HEAD 0x0001 /* Add to HEAD. */ 1421 #define WK_NODELAY 0x0002 /* Process immediately. */ 1422 1423 static void 1424 add_to_worklist(wk, flags) 1425 struct worklist *wk; 1426 int flags; 1427 { 1428 struct ufsmount *ump; 1429 1430 mtx_assert(&lk, MA_OWNED); 1431 ump = VFSTOUFS(wk->wk_mp); 1432 if (wk->wk_state & ONWORKLIST) 1433 panic("add_to_worklist: %s(0x%X) already on list", 1434 TYPENAME(wk->wk_type), wk->wk_state); 1435 wk->wk_state |= ONWORKLIST; 1436 if (ump->softdep_on_worklist == 0) { 1437 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); 1438 ump->softdep_worklist_tail = wk; 1439 } else if (flags & WK_HEAD) { 1440 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); 1441 } else { 1442 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); 1443 ump->softdep_worklist_tail = wk; 1444 } 1445 ump->softdep_on_worklist += 1; 1446 if (flags & WK_NODELAY) 1447 worklist_speedup(); 1448 } 1449 1450 /* 1451 * Remove the item to be processed. If we are removing the last 1452 * item on the list, we need to recalculate the tail pointer. 
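 * The new tail is recovered by casting the le_prev back-pointer of the
 * removed item to a worklist pointer, which relies on wk_list being the
 * first member of struct worklist.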
1453 */ 1454 static void 1455 remove_from_worklist(wk) 1456 struct worklist *wk; 1457 { 1458 struct ufsmount *ump; 1459 1460 ump = VFSTOUFS(wk->wk_mp); 1461 WORKLIST_REMOVE(wk); 1462 if (ump->softdep_worklist_tail == wk) 1463 ump->softdep_worklist_tail = 1464 (struct worklist *)wk->wk_list.le_prev; 1465 ump->softdep_on_worklist -= 1; 1466 } 1467 1468 static void 1469 wake_worklist(wk) 1470 struct worklist *wk; 1471 { 1472 if (wk->wk_state & IOWAITING) { 1473 wk->wk_state &= ~IOWAITING; 1474 wakeup(wk); 1475 } 1476 } 1477 1478 static void 1479 wait_worklist(wk, wmesg) 1480 struct worklist *wk; 1481 char *wmesg; 1482 { 1483 1484 wk->wk_state |= IOWAITING; 1485 msleep(wk, &lk, PVM, wmesg, 0); 1486 } 1487 1488 /* 1489 * Process that runs once per second to handle items in the background queue. 1490 * 1491 * Note that we ensure that everything is done in the order in which they 1492 * appear in the queue. The code below depends on this property to ensure 1493 * that blocks of a file are freed before the inode itself is freed. This 1494 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated 1495 * until all the old ones have been purged from the dependency lists. 1496 */ 1497 int 1498 softdep_process_worklist(mp, full) 1499 struct mount *mp; 1500 int full; 1501 { 1502 struct thread *td = curthread; 1503 int cnt, matchcnt; 1504 struct ufsmount *ump; 1505 long starttime; 1506 1507 KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); 1508 /* 1509 * Record the process identifier of our caller so that we can give 1510 * this process preferential treatment in request_cleanup below. 1511 */ 1512 matchcnt = 0; 1513 ump = VFSTOUFS(mp); 1514 ACQUIRE_LOCK(&lk); 1515 starttime = time_second; 1516 softdep_process_journal(mp, NULL, full?MNT_WAIT:0); 1517 while (ump->softdep_on_worklist > 0) { 1518 if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0) 1519 break; 1520 else 1521 matchcnt += cnt; 1522 /* 1523 * If requested, try removing inode or removal dependencies. 1524 */ 1525 if (req_clear_inodedeps) { 1526 clear_inodedeps(td); 1527 req_clear_inodedeps -= 1; 1528 wakeup_one(&proc_waiting); 1529 } 1530 if (req_clear_remove) { 1531 clear_remove(td); 1532 req_clear_remove -= 1; 1533 wakeup_one(&proc_waiting); 1534 } 1535 /* 1536 * We do not generally want to stop for buffer space, but if 1537 * we are really being a buffer hog, we will stop and wait. 1538 */ 1539 if (should_yield()) { 1540 FREE_LOCK(&lk); 1541 kern_yield(PRI_UNCHANGED); 1542 bwillwrite(); 1543 ACQUIRE_LOCK(&lk); 1544 } 1545 /* 1546 * Never allow processing to run for more than one 1547 * second. Otherwise the other mountpoints may get 1548 * excessively backlogged. 1549 */ 1550 if (!full && starttime != time_second) 1551 break; 1552 } 1553 if (full == 0) 1554 journal_unsuspend(ump); 1555 FREE_LOCK(&lk); 1556 return (matchcnt); 1557 } 1558 1559 /* 1560 * Process all removes associated with a vnode if we are running out of 1561 * journal space. Any other process which attempts to flush these will 1562 * be unable as we have the vnodes locked. 
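 * Each dirrem that is COMPLETE and still on the worklist is pulled off
 * and run directly through handle_workitem_remove(); a dirrem already
 * marked INPROGRESS belongs to another thread, so we wait for it to
 * finish and then rescan the list.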
1563 */ 1564 static void 1565 process_removes(vp) 1566 struct vnode *vp; 1567 { 1568 struct inodedep *inodedep; 1569 struct dirrem *dirrem; 1570 struct mount *mp; 1571 ino_t inum; 1572 1573 mtx_assert(&lk, MA_OWNED); 1574 1575 mp = vp->v_mount; 1576 inum = VTOI(vp)->i_number; 1577 for (;;) { 1578 top: 1579 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) 1580 return; 1581 LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) { 1582 /* 1583 * If another thread is trying to lock this vnode 1584 * it will fail but we must wait for it to do so 1585 * before we can proceed. 1586 */ 1587 if (dirrem->dm_state & INPROGRESS) { 1588 wait_worklist(&dirrem->dm_list, "pwrwait"); 1589 goto top; 1590 } 1591 if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == 1592 (COMPLETE | ONWORKLIST)) 1593 break; 1594 } 1595 if (dirrem == NULL) 1596 return; 1597 remove_from_worklist(&dirrem->dm_list); 1598 FREE_LOCK(&lk); 1599 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) 1600 panic("process_removes: suspended filesystem"); 1601 handle_workitem_remove(dirrem, 0); 1602 vn_finished_secondary_write(mp); 1603 ACQUIRE_LOCK(&lk); 1604 } 1605 } 1606 1607 /* 1608 * Process all truncations associated with a vnode if we are running out 1609 * of journal space. This is called when the vnode lock is already held 1610 * and no other process can clear the truncation. This function returns 1611 * a value greater than zero if it did any work. 1612 */ 1613 static void 1614 process_truncates(vp) 1615 struct vnode *vp; 1616 { 1617 struct inodedep *inodedep; 1618 struct freeblks *freeblks; 1619 struct mount *mp; 1620 ino_t inum; 1621 int cgwait; 1622 1623 mtx_assert(&lk, MA_OWNED); 1624 1625 mp = vp->v_mount; 1626 inum = VTOI(vp)->i_number; 1627 for (;;) { 1628 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) 1629 return; 1630 cgwait = 0; 1631 TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) { 1632 /* Journal entries not yet written. */ 1633 if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) { 1634 jwait(&LIST_FIRST( 1635 &freeblks->fb_jblkdephd)->jb_list, 1636 MNT_WAIT); 1637 break; 1638 } 1639 /* Another thread is executing this item. */ 1640 if (freeblks->fb_state & INPROGRESS) { 1641 wait_worklist(&freeblks->fb_list, "ptrwait"); 1642 break; 1643 } 1644 /* Freeblks is waiting on a inode write. */ 1645 if ((freeblks->fb_state & COMPLETE) == 0) { 1646 FREE_LOCK(&lk); 1647 ffs_update(vp, 1); 1648 ACQUIRE_LOCK(&lk); 1649 break; 1650 } 1651 if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) == 1652 (ALLCOMPLETE | ONWORKLIST)) { 1653 remove_from_worklist(&freeblks->fb_list); 1654 freeblks->fb_state |= INPROGRESS; 1655 FREE_LOCK(&lk); 1656 if (vn_start_secondary_write(NULL, &mp, 1657 V_NOWAIT)) 1658 panic("process_truncates: " 1659 "suspended filesystem"); 1660 handle_workitem_freeblocks(freeblks, 0); 1661 vn_finished_secondary_write(mp); 1662 ACQUIRE_LOCK(&lk); 1663 break; 1664 } 1665 if (freeblks->fb_cgwait) 1666 cgwait++; 1667 } 1668 if (cgwait) { 1669 FREE_LOCK(&lk); 1670 sync_cgs(mp, MNT_WAIT); 1671 ffs_sync_snap(mp, MNT_WAIT); 1672 ACQUIRE_LOCK(&lk); 1673 continue; 1674 } 1675 if (freeblks == NULL) 1676 break; 1677 } 1678 return; 1679 } 1680 1681 /* 1682 * Process one item on the worklist. 
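 * A sentinel work item is inserted into the pending list so that the
 * scan can drop the softdep lock while an item is being processed and
 * still resume from the correct position afterwards.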
1683 */ 1684 static int 1685 process_worklist_item(mp, target, flags) 1686 struct mount *mp; 1687 int target; 1688 int flags; 1689 { 1690 struct worklist sintenel; 1691 struct worklist *wk; 1692 struct ufsmount *ump; 1693 int matchcnt; 1694 int error; 1695 1696 mtx_assert(&lk, MA_OWNED); 1697 KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); 1698 /* 1699 * If we are being called because of a process doing a 1700 * copy-on-write, then it is not safe to write as we may 1701 * recurse into the copy-on-write routine. 1702 */ 1703 if (curthread->td_pflags & TDP_COWINPROGRESS) 1704 return (-1); 1705 PHOLD(curproc); /* Don't let the stack go away. */ 1706 ump = VFSTOUFS(mp); 1707 matchcnt = 0; 1708 sintenel.wk_mp = NULL; 1709 sintenel.wk_type = D_SENTINAL; 1710 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sintenel, wk_list); 1711 for (wk = LIST_NEXT(&sintenel, wk_list); wk != NULL; 1712 wk = LIST_NEXT(&sintenel, wk_list)) { 1713 if (wk->wk_type == D_SENTINAL) { 1714 LIST_REMOVE(&sintenel, wk_list); 1715 LIST_INSERT_AFTER(wk, &sintenel, wk_list); 1716 continue; 1717 } 1718 if (wk->wk_state & INPROGRESS) 1719 panic("process_worklist_item: %p already in progress.", 1720 wk); 1721 wk->wk_state |= INPROGRESS; 1722 remove_from_worklist(wk); 1723 FREE_LOCK(&lk); 1724 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) 1725 panic("process_worklist_item: suspended filesystem"); 1726 switch (wk->wk_type) { 1727 case D_DIRREM: 1728 /* removal of a directory entry */ 1729 error = handle_workitem_remove(WK_DIRREM(wk), flags); 1730 break; 1731 1732 case D_FREEBLKS: 1733 /* releasing blocks and/or fragments from a file */ 1734 error = handle_workitem_freeblocks(WK_FREEBLKS(wk), 1735 flags); 1736 break; 1737 1738 case D_FREEFRAG: 1739 /* releasing a fragment when replaced as a file grows */ 1740 handle_workitem_freefrag(WK_FREEFRAG(wk)); 1741 error = 0; 1742 break; 1743 1744 case D_FREEFILE: 1745 /* releasing an inode when its link count drops to 0 */ 1746 handle_workitem_freefile(WK_FREEFILE(wk)); 1747 error = 0; 1748 break; 1749 1750 default: 1751 panic("%s_process_worklist: Unknown type %s", 1752 "softdep", TYPENAME(wk->wk_type)); 1753 /* NOTREACHED */ 1754 } 1755 vn_finished_secondary_write(mp); 1756 ACQUIRE_LOCK(&lk); 1757 if (error == 0) { 1758 if (++matchcnt == target) 1759 break; 1760 continue; 1761 } 1762 /* 1763 * We have to retry the worklist item later. Wake up any 1764 * waiters who may be able to complete it immediately and 1765 * add the item back to the head so we don't try to execute 1766 * it again. 1767 */ 1768 wk->wk_state &= ~INPROGRESS; 1769 wake_worklist(wk); 1770 add_to_worklist(wk, WK_HEAD); 1771 } 1772 LIST_REMOVE(&sintenel, wk_list); 1773 /* Sentinal could've become the tail from remove_from_worklist. */ 1774 if (ump->softdep_worklist_tail == &sintenel) 1775 ump->softdep_worklist_tail = 1776 (struct worklist *)sintenel.wk_list.le_prev; 1777 PRELE(curproc); 1778 return (matchcnt); 1779 } 1780 1781 /* 1782 * Move dependencies from one buffer to another. 
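 * Returns non-zero if a moved bmsafemap dependency still has rollbacks
 * outstanding, in which case the caller should keep the new buffer
 * dirty.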
1783 */ 1784 int 1785 softdep_move_dependencies(oldbp, newbp) 1786 struct buf *oldbp; 1787 struct buf *newbp; 1788 { 1789 struct worklist *wk, *wktail; 1790 int dirty; 1791 1792 dirty = 0; 1793 wktail = NULL; 1794 ACQUIRE_LOCK(&lk); 1795 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { 1796 LIST_REMOVE(wk, wk_list); 1797 if (wk->wk_type == D_BMSAFEMAP && 1798 bmsafemap_rollbacks(WK_BMSAFEMAP(wk))) 1799 dirty = 1; 1800 if (wktail == 0) 1801 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); 1802 else 1803 LIST_INSERT_AFTER(wktail, wk, wk_list); 1804 wktail = wk; 1805 } 1806 FREE_LOCK(&lk); 1807 1808 return (dirty); 1809 } 1810 1811 /* 1812 * Purge the work list of all items associated with a particular mount point. 1813 */ 1814 int 1815 softdep_flushworklist(oldmnt, countp, td) 1816 struct mount *oldmnt; 1817 int *countp; 1818 struct thread *td; 1819 { 1820 struct vnode *devvp; 1821 int count, error = 0; 1822 struct ufsmount *ump; 1823 1824 /* 1825 * Alternately flush the block device associated with the mount 1826 * point and process any dependencies that the flushing 1827 * creates. We continue until no more worklist dependencies 1828 * are found. 1829 */ 1830 *countp = 0; 1831 ump = VFSTOUFS(oldmnt); 1832 devvp = ump->um_devvp; 1833 while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { 1834 *countp += count; 1835 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1836 error = VOP_FSYNC(devvp, MNT_WAIT, td); 1837 VOP_UNLOCK(devvp, 0); 1838 if (error) 1839 break; 1840 } 1841 return (error); 1842 } 1843 1844 int 1845 softdep_waitidle(struct mount *mp) 1846 { 1847 struct ufsmount *ump; 1848 int error; 1849 int i; 1850 1851 ump = VFSTOUFS(mp); 1852 ACQUIRE_LOCK(&lk); 1853 for (i = 0; i < 10 && ump->softdep_deps; i++) { 1854 ump->softdep_req = 1; 1855 if (ump->softdep_on_worklist) 1856 panic("softdep_waitidle: work added after flush."); 1857 msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1); 1858 } 1859 ump->softdep_req = 0; 1860 FREE_LOCK(&lk); 1861 error = 0; 1862 if (i == 10) { 1863 error = EBUSY; 1864 printf("softdep_waitidle: Failed to flush worklist for %p\n", 1865 mp); 1866 } 1867 1868 return (error); 1869 } 1870 1871 /* 1872 * Flush all vnodes and worklist items associated with a specified mount point. 1873 */ 1874 int 1875 softdep_flushfiles(oldmnt, flags, td) 1876 struct mount *oldmnt; 1877 int flags; 1878 struct thread *td; 1879 { 1880 int error, depcount, loopcnt, retry_flush_count, retry; 1881 1882 loopcnt = 10; 1883 retry_flush_count = 3; 1884 retry_flush: 1885 error = 0; 1886 1887 /* 1888 * Alternately flush the vnodes associated with the mount 1889 * point and process any dependencies that the flushing 1890 * creates. In theory, this loop can happen at most twice, 1891 * but we give it a few extra just to be sure. 1892 */ 1893 for (; loopcnt > 0; loopcnt--) { 1894 /* 1895 * Do another flush in case any vnodes were brought in 1896 * as part of the cleanup operations. 1897 */ 1898 if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0) 1899 break; 1900 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 || 1901 depcount == 0) 1902 break; 1903 } 1904 /* 1905 * If we are unmounting then it is an error to fail. If we 1906 * are simply trying to downgrade to read-only, then filesystem 1907 * activity can keep us busy forever, so we just fail with EBUSY. 
1908 */ 1909 if (loopcnt == 0) { 1910 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) 1911 panic("softdep_flushfiles: looping"); 1912 error = EBUSY; 1913 } 1914 if (!error) 1915 error = softdep_waitidle(oldmnt); 1916 if (!error) { 1917 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) { 1918 retry = 0; 1919 MNT_ILOCK(oldmnt); 1920 KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0, 1921 ("softdep_flushfiles: !MNTK_NOINSMNTQ")); 1922 if (oldmnt->mnt_nvnodelistsize > 0) { 1923 if (--retry_flush_count > 0) { 1924 retry = 1; 1925 loopcnt = 3; 1926 } else 1927 error = EBUSY; 1928 } 1929 MNT_IUNLOCK(oldmnt); 1930 if (retry) 1931 goto retry_flush; 1932 } 1933 } 1934 return (error); 1935 } 1936 1937 /* 1938 * Structure hashing. 1939 * 1940 * There are three types of structures that can be looked up: 1941 * 1) pagedep structures identified by mount point, inode number, 1942 * and logical block. 1943 * 2) inodedep structures identified by mount point and inode number. 1944 * 3) newblk structures identified by mount point and 1945 * physical block number. 1946 * 1947 * The "pagedep" and "inodedep" dependency structures are hashed 1948 * separately from the file blocks and inodes to which they correspond. 1949 * This separation helps when the in-memory copy of an inode or 1950 * file block must be replaced. It also obviates the need to access 1951 * an inode or file page when simply updating (or de-allocating) 1952 * dependency structures. Lookup of newblk structures is needed to 1953 * find newly allocated blocks when trying to associate them with 1954 * their allocdirect or allocindir structure. 1955 * 1956 * The lookup routines optionally create and hash a new instance when 1957 * an existing entry is not found. 1958 */ 1959 #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ 1960 #define NODELAY 0x0002 /* cannot do background work */ 1961 1962 /* 1963 * Structures and routines associated with pagedep caching. 1964 */ 1965 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; 1966 u_long pagedep_hash; /* size of hash table - 1 */ 1967 #define PAGEDEP_HASH(mp, inum, lbn) \ 1968 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \ 1969 pagedep_hash]) 1970 1971 static int 1972 pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp) 1973 struct pagedep_hashhead *pagedephd; 1974 ino_t ino; 1975 ufs_lbn_t lbn; 1976 struct mount *mp; 1977 int flags; 1978 struct pagedep **pagedeppp; 1979 { 1980 struct pagedep *pagedep; 1981 1982 LIST_FOREACH(pagedep, pagedephd, pd_hash) { 1983 if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn && 1984 mp == pagedep->pd_list.wk_mp) { 1985 *pagedeppp = pagedep; 1986 return (1); 1987 } 1988 } 1989 *pagedeppp = NULL; 1990 return (0); 1991 } 1992 /* 1993 * Look up a pagedep. Return 1 if found, 0 otherwise. 1994 * If not found, allocate if DEPALLOC flag is passed. 1995 * Found or allocated entry is returned in pagedeppp. 1996 * This routine must be called with splbio interrupts blocked. 
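 *
 * A minimal calling sketch, with illustrative caller-side names (mp,
 * bp, ip and lbn are whatever the caller has at hand) and assuming the
 * softdep lock is already held:
 *
 *	struct pagedep *pagedep;
 *
 *	if (pagedep_lookup(mp, bp, ip->i_number, lbn, DEPALLOC,
 *	    &pagedep) == 0) {
 *		... a new pagedep was allocated, hashed and linked
 *		    onto bp->b_dep ...
 *	} else {
 *		... an existing pagedep was found ...
 *	}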
1997 */ 1998 static int 1999 pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp) 2000 struct mount *mp; 2001 struct buf *bp; 2002 ino_t ino; 2003 ufs_lbn_t lbn; 2004 int flags; 2005 struct pagedep **pagedeppp; 2006 { 2007 struct pagedep *pagedep; 2008 struct pagedep_hashhead *pagedephd; 2009 struct worklist *wk; 2010 int ret; 2011 int i; 2012 2013 mtx_assert(&lk, MA_OWNED); 2014 if (bp) { 2015 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 2016 if (wk->wk_type == D_PAGEDEP) { 2017 *pagedeppp = WK_PAGEDEP(wk); 2018 return (1); 2019 } 2020 } 2021 } 2022 pagedephd = PAGEDEP_HASH(mp, ino, lbn); 2023 ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); 2024 if (ret) { 2025 if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp) 2026 WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list); 2027 return (1); 2028 } 2029 if ((flags & DEPALLOC) == 0) 2030 return (0); 2031 FREE_LOCK(&lk); 2032 pagedep = malloc(sizeof(struct pagedep), 2033 M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); 2034 workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); 2035 ACQUIRE_LOCK(&lk); 2036 ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); 2037 if (*pagedeppp) { 2038 /* 2039 * This should never happen since we only create pagedeps 2040 * with the vnode lock held. Could be an assert. 2041 */ 2042 WORKITEM_FREE(pagedep, D_PAGEDEP); 2043 return (ret); 2044 } 2045 pagedep->pd_ino = ino; 2046 pagedep->pd_lbn = lbn; 2047 LIST_INIT(&pagedep->pd_dirremhd); 2048 LIST_INIT(&pagedep->pd_pendinghd); 2049 for (i = 0; i < DAHASHSZ; i++) 2050 LIST_INIT(&pagedep->pd_diraddhd[i]); 2051 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); 2052 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 2053 *pagedeppp = pagedep; 2054 return (0); 2055 } 2056 2057 /* 2058 * Structures and routines associated with inodedep caching. 2059 */ 2060 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; 2061 static u_long inodedep_hash; /* size of hash table - 1 */ 2062 #define INODEDEP_HASH(fs, inum) \ 2063 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) 2064 2065 static int 2066 inodedep_find(inodedephd, fs, inum, inodedeppp) 2067 struct inodedep_hashhead *inodedephd; 2068 struct fs *fs; 2069 ino_t inum; 2070 struct inodedep **inodedeppp; 2071 { 2072 struct inodedep *inodedep; 2073 2074 LIST_FOREACH(inodedep, inodedephd, id_hash) 2075 if (inum == inodedep->id_ino && fs == inodedep->id_fs) 2076 break; 2077 if (inodedep) { 2078 *inodedeppp = inodedep; 2079 return (1); 2080 } 2081 *inodedeppp = NULL; 2082 2083 return (0); 2084 } 2085 /* 2086 * Look up an inodedep. Return 1 if found, 0 if not found. 2087 * If not found, allocate if DEPALLOC flag is passed. 2088 * Found or allocated entry is returned in inodedeppp. 2089 * This routine must be called with splbio interrupts blocked. 2090 */ 2091 static int 2092 inodedep_lookup(mp, inum, flags, inodedeppp) 2093 struct mount *mp; 2094 ino_t inum; 2095 int flags; 2096 struct inodedep **inodedeppp; 2097 { 2098 struct inodedep *inodedep; 2099 struct inodedep_hashhead *inodedephd; 2100 struct fs *fs; 2101 2102 mtx_assert(&lk, MA_OWNED); 2103 fs = VFSTOUFS(mp)->um_fs; 2104 inodedephd = INODEDEP_HASH(fs, inum); 2105 2106 if (inodedep_find(inodedephd, fs, inum, inodedeppp)) 2107 return (1); 2108 if ((flags & DEPALLOC) == 0) 2109 return (0); 2110 /* 2111 * If we are over our limit, try to improve the situation. 
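 * Requesting FLUSH_INODES asks for help reducing the number of
 * inodedeps before we allocate another one; max_softdeps is sized in
 * softdep_initialize() as four times desiredvnodes.  Callers that
 * cannot do background work pass NODELAY and skip this step.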
2112 */ 2113 if (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0) 2114 request_cleanup(mp, FLUSH_INODES); 2115 FREE_LOCK(&lk); 2116 inodedep = malloc(sizeof(struct inodedep), 2117 M_INODEDEP, M_SOFTDEP_FLAGS); 2118 workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); 2119 ACQUIRE_LOCK(&lk); 2120 if (inodedep_find(inodedephd, fs, inum, inodedeppp)) { 2121 WORKITEM_FREE(inodedep, D_INODEDEP); 2122 return (1); 2123 } 2124 inodedep->id_fs = fs; 2125 inodedep->id_ino = inum; 2126 inodedep->id_state = ALLCOMPLETE; 2127 inodedep->id_nlinkdelta = 0; 2128 inodedep->id_savedino1 = NULL; 2129 inodedep->id_savedsize = -1; 2130 inodedep->id_savedextsize = -1; 2131 inodedep->id_savednlink = -1; 2132 inodedep->id_bmsafemap = NULL; 2133 inodedep->id_mkdiradd = NULL; 2134 LIST_INIT(&inodedep->id_dirremhd); 2135 LIST_INIT(&inodedep->id_pendinghd); 2136 LIST_INIT(&inodedep->id_inowait); 2137 LIST_INIT(&inodedep->id_bufwait); 2138 TAILQ_INIT(&inodedep->id_inoreflst); 2139 TAILQ_INIT(&inodedep->id_inoupdt); 2140 TAILQ_INIT(&inodedep->id_newinoupdt); 2141 TAILQ_INIT(&inodedep->id_extupdt); 2142 TAILQ_INIT(&inodedep->id_newextupdt); 2143 TAILQ_INIT(&inodedep->id_freeblklst); 2144 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); 2145 *inodedeppp = inodedep; 2146 return (0); 2147 } 2148 2149 /* 2150 * Structures and routines associated with newblk caching. 2151 */ 2152 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; 2153 u_long newblk_hash; /* size of hash table - 1 */ 2154 #define NEWBLK_HASH(fs, inum) \ 2155 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) 2156 2157 static int 2158 newblk_find(newblkhd, mp, newblkno, flags, newblkpp) 2159 struct newblk_hashhead *newblkhd; 2160 struct mount *mp; 2161 ufs2_daddr_t newblkno; 2162 int flags; 2163 struct newblk **newblkpp; 2164 { 2165 struct newblk *newblk; 2166 2167 LIST_FOREACH(newblk, newblkhd, nb_hash) { 2168 if (newblkno != newblk->nb_newblkno) 2169 continue; 2170 if (mp != newblk->nb_list.wk_mp) 2171 continue; 2172 /* 2173 * If we're creating a new dependency don't match those that 2174 * have already been converted to allocdirects. This is for 2175 * a frag extend. 2176 */ 2177 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) 2178 continue; 2179 break; 2180 } 2181 if (newblk) { 2182 *newblkpp = newblk; 2183 return (1); 2184 } 2185 *newblkpp = NULL; 2186 return (0); 2187 } 2188 2189 /* 2190 * Look up a newblk. Return 1 if found, 0 if not found. 2191 * If not found, allocate if DEPALLOC flag is passed. 2192 * Found or allocated entry is returned in newblkpp. 
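 * The entry is allocated as a full union allblk so that it can later
 * be converted into an allocdirect or allocindir without being
 * reallocated once the block's use is known.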
2193 */ 2194 static int 2195 newblk_lookup(mp, newblkno, flags, newblkpp) 2196 struct mount *mp; 2197 ufs2_daddr_t newblkno; 2198 int flags; 2199 struct newblk **newblkpp; 2200 { 2201 struct newblk *newblk; 2202 struct newblk_hashhead *newblkhd; 2203 2204 newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno); 2205 if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) 2206 return (1); 2207 if ((flags & DEPALLOC) == 0) 2208 return (0); 2209 FREE_LOCK(&lk); 2210 newblk = malloc(sizeof(union allblk), M_NEWBLK, 2211 M_SOFTDEP_FLAGS | M_ZERO); 2212 workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); 2213 ACQUIRE_LOCK(&lk); 2214 if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) { 2215 WORKITEM_FREE(newblk, D_NEWBLK); 2216 return (1); 2217 } 2218 newblk->nb_freefrag = NULL; 2219 LIST_INIT(&newblk->nb_indirdeps); 2220 LIST_INIT(&newblk->nb_newdirblk); 2221 LIST_INIT(&newblk->nb_jwork); 2222 newblk->nb_state = ATTACHED; 2223 newblk->nb_newblkno = newblkno; 2224 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); 2225 *newblkpp = newblk; 2226 return (0); 2227 } 2228 2229 /* 2230 * Structures and routines associated with freed indirect block caching. 2231 */ 2232 struct freeworklst *indir_hashtbl; 2233 u_long indir_hash; /* size of hash table - 1 */ 2234 #define INDIR_HASH(mp, blkno) \ 2235 (&indir_hashtbl[((((register_t)(mp)) >> 13) + (blkno)) & indir_hash]) 2236 2237 /* 2238 * Lookup an indirect block in the indir hash table. The freework is 2239 * removed and potentially freed. The caller must do a blocking journal 2240 * write before writing to the blkno. 2241 */ 2242 static int 2243 indirblk_lookup(mp, blkno) 2244 struct mount *mp; 2245 ufs2_daddr_t blkno; 2246 { 2247 struct freework *freework; 2248 struct freeworklst *wkhd; 2249 2250 wkhd = INDIR_HASH(mp, blkno); 2251 TAILQ_FOREACH(freework, wkhd, fw_next) { 2252 if (freework->fw_blkno != blkno) 2253 continue; 2254 if (freework->fw_list.wk_mp != mp) 2255 continue; 2256 indirblk_remove(freework); 2257 return (1); 2258 } 2259 return (0); 2260 } 2261 2262 /* 2263 * Insert an indirect block represented by freework into the indirblk 2264 * hash table so that it may prevent the block from being re-used prior 2265 * to the journal being written. 2266 */ 2267 static void 2268 indirblk_insert(freework) 2269 struct freework *freework; 2270 { 2271 struct freeblks *freeblks; 2272 struct jsegdep *jsegdep; 2273 struct worklist *wk; 2274 2275 freeblks = freework->fw_freeblks; 2276 LIST_FOREACH(wk, &freeblks->fb_jwork, wk_list) 2277 if (wk->wk_type == D_JSEGDEP) 2278 break; 2279 if (wk == NULL) 2280 return; 2281 2282 jsegdep = WK_JSEGDEP(wk); 2283 LIST_INSERT_HEAD(&jsegdep->jd_seg->js_indirs, freework, fw_segs); 2284 TAILQ_INSERT_HEAD(INDIR_HASH(freework->fw_list.wk_mp, 2285 freework->fw_blkno), freework, fw_next); 2286 freework->fw_state &= ~DEPCOMPLETE; 2287 } 2288 2289 static void 2290 indirblk_remove(freework) 2291 struct freework *freework; 2292 { 2293 2294 LIST_REMOVE(freework, fw_segs); 2295 TAILQ_REMOVE(INDIR_HASH(freework->fw_list.wk_mp, 2296 freework->fw_blkno), freework, fw_next); 2297 freework->fw_state |= DEPCOMPLETE; 2298 if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) 2299 WORKITEM_FREE(freework, D_FREEWORK); 2300 } 2301 2302 /* 2303 * Executed during filesystem system initialization before 2304 * mounting any filesystems. 
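 * The dependency hash tables are sized from desiredvnodes: the
 * inodedep table is sized for desiredvnodes entries, the pagedep and
 * newblk tables for a fifth of that, and the indirect-block table is
 * rounded down to a power of two (from desiredvnodes / 10) so that
 * INDIR_HASH() can mask instead of taking a modulus.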
2305 */ 2306 void 2307 softdep_initialize() 2308 { 2309 int i; 2310 2311 LIST_INIT(&mkdirlisthd); 2312 max_softdeps = desiredvnodes * 4; 2313 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); 2314 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); 2315 newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash); 2316 bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash); 2317 i = 1 << (ffs(desiredvnodes / 10) - 1); 2318 indir_hashtbl = malloc(i * sizeof(indir_hashtbl[0]), M_FREEWORK, 2319 M_WAITOK); 2320 indir_hash = i - 1; 2321 for (i = 0; i <= indir_hash; i++) 2322 TAILQ_INIT(&indir_hashtbl[i]); 2323 2324 /* initialise bioops hack */ 2325 bioops.io_start = softdep_disk_io_initiation; 2326 bioops.io_complete = softdep_disk_write_complete; 2327 bioops.io_deallocate = softdep_deallocate_dependencies; 2328 bioops.io_countdeps = softdep_count_dependencies; 2329 2330 /* Initialize the callout with an mtx. */ 2331 callout_init_mtx(&softdep_callout, &lk, 0); 2332 } 2333 2334 /* 2335 * Executed after all filesystems have been unmounted during 2336 * filesystem module unload. 2337 */ 2338 void 2339 softdep_uninitialize() 2340 { 2341 2342 callout_drain(&softdep_callout); 2343 hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); 2344 hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); 2345 hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); 2346 hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash); 2347 free(indir_hashtbl, M_FREEWORK); 2348 } 2349 2350 /* 2351 * Called at mount time to notify the dependency code that a 2352 * filesystem wishes to use it. 2353 */ 2354 int 2355 softdep_mount(devvp, mp, fs, cred) 2356 struct vnode *devvp; 2357 struct mount *mp; 2358 struct fs *fs; 2359 struct ucred *cred; 2360 { 2361 struct csum_total cstotal; 2362 struct ufsmount *ump; 2363 struct cg *cgp; 2364 struct buf *bp; 2365 int error, cyl; 2366 2367 MNT_ILOCK(mp); 2368 mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; 2369 if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { 2370 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 2371 MNTK_SOFTDEP | MNTK_NOASYNC; 2372 } 2373 MNT_IUNLOCK(mp); 2374 ump = VFSTOUFS(mp); 2375 LIST_INIT(&ump->softdep_workitem_pending); 2376 LIST_INIT(&ump->softdep_journal_pending); 2377 TAILQ_INIT(&ump->softdep_unlinked); 2378 LIST_INIT(&ump->softdep_dirtycg); 2379 ump->softdep_worklist_tail = NULL; 2380 ump->softdep_on_worklist = 0; 2381 ump->softdep_deps = 0; 2382 if ((fs->fs_flags & FS_SUJ) && 2383 (error = journal_mount(mp, fs, cred)) != 0) { 2384 printf("Failed to start journal: %d\n", error); 2385 return (error); 2386 } 2387 /* 2388 * When doing soft updates, the counters in the 2389 * superblock may have gotten out of sync. Recomputation 2390 * can take a long time and can be deferred for background 2391 * fsck. However, the old behavior of scanning the cylinder 2392 * groups and recalculating them at mount time is available 2393 * by setting vfs.ffs.compute_summary_at_mount to one. 
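 * For example, an administrator who wants the cylinder groups scanned
 * at mount time could set the tunable before mounting, e.g.
 * "sysctl vfs.ffs.compute_summary_at_mount=1".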
2394 */ 2395 if (compute_summary_at_mount == 0 || fs->fs_clean != 0) 2396 return (0); 2397 bzero(&cstotal, sizeof cstotal); 2398 for (cyl = 0; cyl < fs->fs_ncg; cyl++) { 2399 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), 2400 fs->fs_cgsize, cred, &bp)) != 0) { 2401 brelse(bp); 2402 return (error); 2403 } 2404 cgp = (struct cg *)bp->b_data; 2405 cstotal.cs_nffree += cgp->cg_cs.cs_nffree; 2406 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; 2407 cstotal.cs_nifree += cgp->cg_cs.cs_nifree; 2408 cstotal.cs_ndir += cgp->cg_cs.cs_ndir; 2409 fs->fs_cs(fs, cyl) = cgp->cg_cs; 2410 brelse(bp); 2411 } 2412 #ifdef DEBUG 2413 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) 2414 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); 2415 #endif 2416 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); 2417 return (0); 2418 } 2419 2420 void 2421 softdep_unmount(mp) 2422 struct mount *mp; 2423 { 2424 2425 MNT_ILOCK(mp); 2426 mp->mnt_flag &= ~MNT_SOFTDEP; 2427 if (MOUNTEDSUJ(mp) == 0) { 2428 MNT_IUNLOCK(mp); 2429 return; 2430 } 2431 mp->mnt_flag &= ~MNT_SUJ; 2432 MNT_IUNLOCK(mp); 2433 journal_unmount(mp); 2434 } 2435 2436 struct jblocks { 2437 struct jseglst jb_segs; /* TAILQ of current segments. */ 2438 struct jseg *jb_writeseg; /* Next write to complete. */ 2439 struct jseg *jb_oldestseg; /* Oldest segment with valid entries. */ 2440 struct jextent *jb_extent; /* Extent array. */ 2441 uint64_t jb_nextseq; /* Next sequence number. */ 2442 uint64_t jb_oldestwrseq; /* Oldest written sequence number. */ 2443 uint8_t jb_needseg; /* Need a forced segment. */ 2444 uint8_t jb_suspended; /* Did journal suspend writes? */ 2445 int jb_avail; /* Available extents. */ 2446 int jb_used; /* Last used extent. */ 2447 int jb_head; /* Allocator head. */ 2448 int jb_off; /* Allocator extent offset. */ 2449 int jb_blocks; /* Total disk blocks covered. */ 2450 int jb_free; /* Total disk blocks free. */ 2451 int jb_min; /* Minimum free space. */ 2452 int jb_low; /* Low on space. */ 2453 int jb_age; /* Insertion time of oldest rec. */ 2454 }; 2455 2456 struct jextent { 2457 ufs2_daddr_t je_daddr; /* Disk block address. */ 2458 int je_blocks; /* Disk block count. 
*/ 2459 }; 2460 2461 static struct jblocks * 2462 jblocks_create(void) 2463 { 2464 struct jblocks *jblocks; 2465 2466 jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); 2467 TAILQ_INIT(&jblocks->jb_segs); 2468 jblocks->jb_avail = 10; 2469 jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, 2470 M_JBLOCKS, M_WAITOK | M_ZERO); 2471 2472 return (jblocks); 2473 } 2474 2475 static ufs2_daddr_t 2476 jblocks_alloc(jblocks, bytes, actual) 2477 struct jblocks *jblocks; 2478 int bytes; 2479 int *actual; 2480 { 2481 ufs2_daddr_t daddr; 2482 struct jextent *jext; 2483 int freecnt; 2484 int blocks; 2485 2486 blocks = bytes / DEV_BSIZE; 2487 jext = &jblocks->jb_extent[jblocks->jb_head]; 2488 freecnt = jext->je_blocks - jblocks->jb_off; 2489 if (freecnt == 0) { 2490 jblocks->jb_off = 0; 2491 if (++jblocks->jb_head > jblocks->jb_used) 2492 jblocks->jb_head = 0; 2493 jext = &jblocks->jb_extent[jblocks->jb_head]; 2494 freecnt = jext->je_blocks; 2495 } 2496 if (freecnt > blocks) 2497 freecnt = blocks; 2498 *actual = freecnt * DEV_BSIZE; 2499 daddr = jext->je_daddr + jblocks->jb_off; 2500 jblocks->jb_off += freecnt; 2501 jblocks->jb_free -= freecnt; 2502 2503 return (daddr); 2504 } 2505 2506 static void 2507 jblocks_free(jblocks, mp, bytes) 2508 struct jblocks *jblocks; 2509 struct mount *mp; 2510 int bytes; 2511 { 2512 2513 jblocks->jb_free += bytes / DEV_BSIZE; 2514 if (jblocks->jb_suspended) 2515 worklist_speedup(); 2516 wakeup(jblocks); 2517 } 2518 2519 static void 2520 jblocks_destroy(jblocks) 2521 struct jblocks *jblocks; 2522 { 2523 2524 if (jblocks->jb_extent) 2525 free(jblocks->jb_extent, M_JBLOCKS); 2526 free(jblocks, M_JBLOCKS); 2527 } 2528 2529 static void 2530 jblocks_add(jblocks, daddr, blocks) 2531 struct jblocks *jblocks; 2532 ufs2_daddr_t daddr; 2533 int blocks; 2534 { 2535 struct jextent *jext; 2536 2537 jblocks->jb_blocks += blocks; 2538 jblocks->jb_free += blocks; 2539 jext = &jblocks->jb_extent[jblocks->jb_used]; 2540 /* Adding the first block. */ 2541 if (jext->je_daddr == 0) { 2542 jext->je_daddr = daddr; 2543 jext->je_blocks = blocks; 2544 return; 2545 } 2546 /* Extending the last extent. */ 2547 if (jext->je_daddr + jext->je_blocks == daddr) { 2548 jext->je_blocks += blocks; 2549 return; 2550 } 2551 /* Adding a new extent. 
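 * When the extent array is full it is grown by doubling: a new array
 * is allocated, the existing entries are copied over and the old array
 * is freed, so jb_used always stays below jb_avail.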
*/ 2552 if (++jblocks->jb_used == jblocks->jb_avail) { 2553 jblocks->jb_avail *= 2; 2554 jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, 2555 M_JBLOCKS, M_WAITOK | M_ZERO); 2556 memcpy(jext, jblocks->jb_extent, 2557 sizeof(struct jextent) * jblocks->jb_used); 2558 free(jblocks->jb_extent, M_JBLOCKS); 2559 jblocks->jb_extent = jext; 2560 } 2561 jext = &jblocks->jb_extent[jblocks->jb_used]; 2562 jext->je_daddr = daddr; 2563 jext->je_blocks = blocks; 2564 return; 2565 } 2566 2567 int 2568 softdep_journal_lookup(mp, vpp) 2569 struct mount *mp; 2570 struct vnode **vpp; 2571 { 2572 struct componentname cnp; 2573 struct vnode *dvp; 2574 ino_t sujournal; 2575 int error; 2576 2577 error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp); 2578 if (error) 2579 return (error); 2580 bzero(&cnp, sizeof(cnp)); 2581 cnp.cn_nameiop = LOOKUP; 2582 cnp.cn_flags = ISLASTCN; 2583 cnp.cn_thread = curthread; 2584 cnp.cn_cred = curthread->td_ucred; 2585 cnp.cn_pnbuf = SUJ_FILE; 2586 cnp.cn_nameptr = SUJ_FILE; 2587 cnp.cn_namelen = strlen(SUJ_FILE); 2588 error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal); 2589 vput(dvp); 2590 if (error != 0) 2591 return (error); 2592 error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp); 2593 return (error); 2594 } 2595 2596 /* 2597 * Open and verify the journal file. 2598 */ 2599 static int 2600 journal_mount(mp, fs, cred) 2601 struct mount *mp; 2602 struct fs *fs; 2603 struct ucred *cred; 2604 { 2605 struct jblocks *jblocks; 2606 struct vnode *vp; 2607 struct inode *ip; 2608 ufs2_daddr_t blkno; 2609 int bcount; 2610 int error; 2611 int i; 2612 2613 error = softdep_journal_lookup(mp, &vp); 2614 if (error != 0) { 2615 printf("Failed to find journal. Use tunefs to create one\n"); 2616 return (error); 2617 } 2618 ip = VTOI(vp); 2619 if (ip->i_size < SUJ_MIN) { 2620 error = ENOSPC; 2621 goto out; 2622 } 2623 bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ 2624 jblocks = jblocks_create(); 2625 for (i = 0; i < bcount; i++) { 2626 error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); 2627 if (error) 2628 break; 2629 jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); 2630 } 2631 if (error) { 2632 jblocks_destroy(jblocks); 2633 goto out; 2634 } 2635 jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */ 2636 jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ 2637 VFSTOUFS(mp)->softdep_jblocks = jblocks; 2638 out: 2639 if (error == 0) { 2640 MNT_ILOCK(mp); 2641 mp->mnt_flag |= MNT_SUJ; 2642 mp->mnt_flag &= ~MNT_SOFTDEP; 2643 MNT_IUNLOCK(mp); 2644 /* 2645 * Only validate the journal contents if the 2646 * filesystem is clean, otherwise we write the logs 2647 * but they'll never be used. If the filesystem was 2648 * still dirty when we mounted it the journal is 2649 * invalid and a new journal can only be valid if it 2650 * starts from a clean mount. 2651 */ 2652 if (fs->fs_clean) { 2653 DIP_SET(ip, i_modrev, fs->fs_mtime); 2654 ip->i_flags |= IN_MODIFIED; 2655 ffs_update(vp, 1); 2656 } 2657 } 2658 vput(vp); 2659 return (error); 2660 } 2661 2662 static void 2663 journal_unmount(mp) 2664 struct mount *mp; 2665 { 2666 struct ufsmount *ump; 2667 2668 ump = VFSTOUFS(mp); 2669 if (ump->softdep_jblocks) 2670 jblocks_destroy(ump->softdep_jblocks); 2671 ump->softdep_jblocks = NULL; 2672 } 2673 2674 /* 2675 * Called when a journal record is ready to be written. Space is allocated 2676 * and the journal entry is created when the journal is flushed to stable 2677 * store. 
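 * The pending records live on a list that emulates a TAILQ: new items
 * are appended after softdep_journal_tail and counted in
 * softdep_on_journal, which softdep_process_journal() later drains.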
2678 */
2679 static void
2680 add_to_journal(wk)
2681 	struct worklist *wk;
2682 {
2683 	struct ufsmount *ump;
2684 
2685 	mtx_assert(&lk, MA_OWNED);
2686 	ump = VFSTOUFS(wk->wk_mp);
2687 	if (wk->wk_state & ONWORKLIST)
2688 		panic("add_to_journal: %s(0x%X) already on list",
2689 		    TYPENAME(wk->wk_type), wk->wk_state);
2690 	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2691 	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2692 		ump->softdep_jblocks->jb_age = ticks;
2693 		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2694 	} else
2695 		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2696 	ump->softdep_journal_tail = wk;
2697 	ump->softdep_on_journal += 1;
2698 }
2699 
2700 /*
2701  * Remove an arbitrary item from the journal worklist while maintaining
2702  * the tail pointer.  This happens when a new operation obviates the
2703  * need to journal an old operation.
2704  */
2705 static void
2706 remove_from_journal(wk)
2707 	struct worklist *wk;
2708 {
2709 	struct ufsmount *ump;
2710 
2711 	mtx_assert(&lk, MA_OWNED);
2712 	ump = VFSTOUFS(wk->wk_mp);
2713 #ifdef SUJ_DEBUG
2714 	{
2715 		struct worklist *wkn;
2716 
2717 		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2718 			if (wkn == wk)
2719 				break;
2720 		if (wkn == NULL)
2721 			panic("remove_from_journal: %p is not in journal", wk);
2722 	}
2723 #endif
2724 	/*
2725 	 * We emulate a TAILQ to save space in most structures which do not
2726 	 * require TAILQ semantics.  Here we must update the tail position by
2727 	 * hand when the entry being removed is the current tail.  This works
2728 	 * only if the worklist linkage is at the beginning of the structure.
2729 	 */
2730 	if (ump->softdep_journal_tail == wk)
2731 		ump->softdep_journal_tail =
2732 		    (struct worklist *)wk->wk_list.le_prev;
2733 
2734 	WORKLIST_REMOVE(wk);
2735 	ump->softdep_on_journal -= 1;
2736 }
2737 
2738 /*
2739  * Check for journal space as well as dependency limits so the prelink
2740  * code can throttle both journaled and non-journaled filesystems.
2741  * Threshold is 0 for low and 1 for min.
2742  */
2743 static int
2744 journal_space(ump, thresh)
2745 	struct ufsmount *ump;
2746 	int thresh;
2747 {
2748 	struct jblocks *jblocks;
2749 	int avail;
2750 
2751 	jblocks = ump->softdep_jblocks;
2752 	if (jblocks == NULL)
2753 		return (1);
2754 	/*
2755 	 * We use a tighter restriction here to prevent request_cleanup()
2756 	 * running in threads from running into locks we currently hold.
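	 * Journal space itself is the journal's free blocks minus the space
	 * the pending records would consume (softdep_on_journal * JREC_SIZE
	 * converted to DEV_BSIZE blocks), compared against jb_low (roughly a
	 * third of the journal) when thresh is 0 or jb_min (a tenth) when
	 * thresh is 1.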
2757 */ 2758 if (dep_current[D_INODEDEP] > (max_softdeps / 10) * 9) 2759 return (0); 2760 if (thresh) 2761 thresh = jblocks->jb_min; 2762 else 2763 thresh = jblocks->jb_low; 2764 avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; 2765 avail = jblocks->jb_free - avail; 2766 2767 return (avail > thresh); 2768 } 2769 2770 static void 2771 journal_suspend(ump) 2772 struct ufsmount *ump; 2773 { 2774 struct jblocks *jblocks; 2775 struct mount *mp; 2776 2777 mp = UFSTOVFS(ump); 2778 jblocks = ump->softdep_jblocks; 2779 MNT_ILOCK(mp); 2780 if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 2781 stat_journal_min++; 2782 mp->mnt_kern_flag |= MNTK_SUSPEND; 2783 mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc); 2784 } 2785 jblocks->jb_suspended = 1; 2786 MNT_IUNLOCK(mp); 2787 } 2788 2789 static int 2790 journal_unsuspend(struct ufsmount *ump) 2791 { 2792 struct jblocks *jblocks; 2793 struct mount *mp; 2794 2795 mp = UFSTOVFS(ump); 2796 jblocks = ump->softdep_jblocks; 2797 2798 if (jblocks != NULL && jblocks->jb_suspended && 2799 journal_space(ump, jblocks->jb_min)) { 2800 jblocks->jb_suspended = 0; 2801 FREE_LOCK(&lk); 2802 mp->mnt_susp_owner = curthread; 2803 vfs_write_resume(mp); 2804 ACQUIRE_LOCK(&lk); 2805 return (1); 2806 } 2807 return (0); 2808 } 2809 2810 /* 2811 * Called before any allocation function to be certain that there is 2812 * sufficient space in the journal prior to creating any new records. 2813 * Since in the case of block allocation we may have multiple locked 2814 * buffers at the time of the actual allocation we can not block 2815 * when the journal records are created. Doing so would create a deadlock 2816 * if any of these buffers needed to be flushed to reclaim space. Instead 2817 * we require a sufficiently large amount of available space such that 2818 * each thread in the system could have passed this allocation check and 2819 * still have sufficient free space. With 20% of a minimum journal size 2820 * of 1MB we have 6553 records available. 2821 */ 2822 int 2823 softdep_prealloc(vp, waitok) 2824 struct vnode *vp; 2825 int waitok; 2826 { 2827 struct ufsmount *ump; 2828 2829 /* 2830 * Nothing to do if we are not running journaled soft updates. 2831 * If we currently hold the snapshot lock, we must avoid handling 2832 * other resources that could cause deadlock. 2833 */ 2834 if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp))) 2835 return (0); 2836 ump = VFSTOUFS(vp->v_mount); 2837 ACQUIRE_LOCK(&lk); 2838 if (journal_space(ump, 0)) { 2839 FREE_LOCK(&lk); 2840 return (0); 2841 } 2842 stat_journal_low++; 2843 FREE_LOCK(&lk); 2844 if (waitok == MNT_NOWAIT) 2845 return (ENOSPC); 2846 /* 2847 * Attempt to sync this vnode once to flush any journal 2848 * work attached to it. 2849 */ 2850 if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0) 2851 ffs_syncvnode(vp, waitok, 0); 2852 ACQUIRE_LOCK(&lk); 2853 process_removes(vp); 2854 process_truncates(vp); 2855 if (journal_space(ump, 0) == 0) { 2856 softdep_speedup(); 2857 if (journal_space(ump, 1) == 0) 2858 journal_suspend(ump); 2859 } 2860 FREE_LOCK(&lk); 2861 2862 return (0); 2863 } 2864 2865 /* 2866 * Before adjusting a link count on a vnode verify that we have sufficient 2867 * journal space. If not, process operations that depend on the currently 2868 * locked pair of vnodes to try to flush space as the syncer, buf daemon, 2869 * and softdep flush threads can not acquire these locks to reclaim space. 
2870 */ 2871 static void 2872 softdep_prelink(dvp, vp) 2873 struct vnode *dvp; 2874 struct vnode *vp; 2875 { 2876 struct ufsmount *ump; 2877 2878 ump = VFSTOUFS(dvp->v_mount); 2879 mtx_assert(&lk, MA_OWNED); 2880 /* 2881 * Nothing to do if we have sufficient journal space. 2882 * If we currently hold the snapshot lock, we must avoid 2883 * handling other resources that could cause deadlock. 2884 */ 2885 if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp)))) 2886 return; 2887 stat_journal_low++; 2888 FREE_LOCK(&lk); 2889 if (vp) 2890 ffs_syncvnode(vp, MNT_NOWAIT, 0); 2891 ffs_syncvnode(dvp, MNT_WAIT, 0); 2892 ACQUIRE_LOCK(&lk); 2893 /* Process vp before dvp as it may create .. removes. */ 2894 if (vp) { 2895 process_removes(vp); 2896 process_truncates(vp); 2897 } 2898 process_removes(dvp); 2899 process_truncates(dvp); 2900 softdep_speedup(); 2901 process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT); 2902 if (journal_space(ump, 0) == 0) { 2903 softdep_speedup(); 2904 if (journal_space(ump, 1) == 0) 2905 journal_suspend(ump); 2906 } 2907 } 2908 2909 static void 2910 jseg_write(ump, jseg, data) 2911 struct ufsmount *ump; 2912 struct jseg *jseg; 2913 uint8_t *data; 2914 { 2915 struct jsegrec *rec; 2916 2917 rec = (struct jsegrec *)data; 2918 rec->jsr_seq = jseg->js_seq; 2919 rec->jsr_oldest = jseg->js_oldseq; 2920 rec->jsr_cnt = jseg->js_cnt; 2921 rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize; 2922 rec->jsr_crc = 0; 2923 rec->jsr_time = ump->um_fs->fs_mtime; 2924 } 2925 2926 static inline void 2927 inoref_write(inoref, jseg, rec) 2928 struct inoref *inoref; 2929 struct jseg *jseg; 2930 struct jrefrec *rec; 2931 { 2932 2933 inoref->if_jsegdep->jd_seg = jseg; 2934 rec->jr_ino = inoref->if_ino; 2935 rec->jr_parent = inoref->if_parent; 2936 rec->jr_nlink = inoref->if_nlink; 2937 rec->jr_mode = inoref->if_mode; 2938 rec->jr_diroff = inoref->if_diroff; 2939 } 2940 2941 static void 2942 jaddref_write(jaddref, jseg, data) 2943 struct jaddref *jaddref; 2944 struct jseg *jseg; 2945 uint8_t *data; 2946 { 2947 struct jrefrec *rec; 2948 2949 rec = (struct jrefrec *)data; 2950 rec->jr_op = JOP_ADDREF; 2951 inoref_write(&jaddref->ja_ref, jseg, rec); 2952 } 2953 2954 static void 2955 jremref_write(jremref, jseg, data) 2956 struct jremref *jremref; 2957 struct jseg *jseg; 2958 uint8_t *data; 2959 { 2960 struct jrefrec *rec; 2961 2962 rec = (struct jrefrec *)data; 2963 rec->jr_op = JOP_REMREF; 2964 inoref_write(&jremref->jr_ref, jseg, rec); 2965 } 2966 2967 static void 2968 jmvref_write(jmvref, jseg, data) 2969 struct jmvref *jmvref; 2970 struct jseg *jseg; 2971 uint8_t *data; 2972 { 2973 struct jmvrec *rec; 2974 2975 rec = (struct jmvrec *)data; 2976 rec->jm_op = JOP_MVREF; 2977 rec->jm_ino = jmvref->jm_ino; 2978 rec->jm_parent = jmvref->jm_parent; 2979 rec->jm_oldoff = jmvref->jm_oldoff; 2980 rec->jm_newoff = jmvref->jm_newoff; 2981 } 2982 2983 static void 2984 jnewblk_write(jnewblk, jseg, data) 2985 struct jnewblk *jnewblk; 2986 struct jseg *jseg; 2987 uint8_t *data; 2988 { 2989 struct jblkrec *rec; 2990 2991 jnewblk->jn_jsegdep->jd_seg = jseg; 2992 rec = (struct jblkrec *)data; 2993 rec->jb_op = JOP_NEWBLK; 2994 rec->jb_ino = jnewblk->jn_ino; 2995 rec->jb_blkno = jnewblk->jn_blkno; 2996 rec->jb_lbn = jnewblk->jn_lbn; 2997 rec->jb_frags = jnewblk->jn_frags; 2998 rec->jb_oldfrags = jnewblk->jn_oldfrags; 2999 } 3000 3001 static void 3002 jfreeblk_write(jfreeblk, jseg, data) 3003 struct jfreeblk *jfreeblk; 3004 struct jseg *jseg; 3005 uint8_t *data; 3006 { 3007 struct jblkrec *rec; 3008 
3009 jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg; 3010 rec = (struct jblkrec *)data; 3011 rec->jb_op = JOP_FREEBLK; 3012 rec->jb_ino = jfreeblk->jf_ino; 3013 rec->jb_blkno = jfreeblk->jf_blkno; 3014 rec->jb_lbn = jfreeblk->jf_lbn; 3015 rec->jb_frags = jfreeblk->jf_frags; 3016 rec->jb_oldfrags = 0; 3017 } 3018 3019 static void 3020 jfreefrag_write(jfreefrag, jseg, data) 3021 struct jfreefrag *jfreefrag; 3022 struct jseg *jseg; 3023 uint8_t *data; 3024 { 3025 struct jblkrec *rec; 3026 3027 jfreefrag->fr_jsegdep->jd_seg = jseg; 3028 rec = (struct jblkrec *)data; 3029 rec->jb_op = JOP_FREEBLK; 3030 rec->jb_ino = jfreefrag->fr_ino; 3031 rec->jb_blkno = jfreefrag->fr_blkno; 3032 rec->jb_lbn = jfreefrag->fr_lbn; 3033 rec->jb_frags = jfreefrag->fr_frags; 3034 rec->jb_oldfrags = 0; 3035 } 3036 3037 static void 3038 jtrunc_write(jtrunc, jseg, data) 3039 struct jtrunc *jtrunc; 3040 struct jseg *jseg; 3041 uint8_t *data; 3042 { 3043 struct jtrncrec *rec; 3044 3045 jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg; 3046 rec = (struct jtrncrec *)data; 3047 rec->jt_op = JOP_TRUNC; 3048 rec->jt_ino = jtrunc->jt_ino; 3049 rec->jt_size = jtrunc->jt_size; 3050 rec->jt_extsize = jtrunc->jt_extsize; 3051 } 3052 3053 static void 3054 jfsync_write(jfsync, jseg, data) 3055 struct jfsync *jfsync; 3056 struct jseg *jseg; 3057 uint8_t *data; 3058 { 3059 struct jtrncrec *rec; 3060 3061 rec = (struct jtrncrec *)data; 3062 rec->jt_op = JOP_SYNC; 3063 rec->jt_ino = jfsync->jfs_ino; 3064 rec->jt_size = jfsync->jfs_size; 3065 rec->jt_extsize = jfsync->jfs_extsize; 3066 } 3067 3068 static void 3069 softdep_flushjournal(mp) 3070 struct mount *mp; 3071 { 3072 struct jblocks *jblocks; 3073 struct ufsmount *ump; 3074 3075 if (MOUNTEDSUJ(mp) == 0) 3076 return; 3077 ump = VFSTOUFS(mp); 3078 jblocks = ump->softdep_jblocks; 3079 ACQUIRE_LOCK(&lk); 3080 while (ump->softdep_on_journal) { 3081 jblocks->jb_needseg = 1; 3082 softdep_process_journal(mp, NULL, MNT_WAIT); 3083 } 3084 FREE_LOCK(&lk); 3085 } 3086 3087 /* 3088 * Flush some journal records to disk. 3089 */ 3090 static void 3091 softdep_process_journal(mp, needwk, flags) 3092 struct mount *mp; 3093 struct worklist *needwk; 3094 int flags; 3095 { 3096 struct jblocks *jblocks; 3097 struct ufsmount *ump; 3098 struct worklist *wk; 3099 struct jseg *jseg; 3100 struct buf *bp; 3101 uint8_t *data; 3102 struct fs *fs; 3103 int segwritten; 3104 int jrecmin; /* Minimum records per block. */ 3105 int jrecmax; /* Maximum records per block. */ 3106 int size; 3107 int cnt; 3108 int off; 3109 int devbsize; 3110 3111 if (MOUNTEDSUJ(mp) == 0) 3112 return; 3113 ump = VFSTOUFS(mp); 3114 fs = ump->um_fs; 3115 jblocks = ump->softdep_jblocks; 3116 devbsize = ump->um_devvp->v_bufobj.bo_bsize; 3117 /* 3118 * We write anywhere between a disk block and fs block. The upper 3119 * bound is picked to prevent buffer cache fragmentation and limit 3120 * processing time per I/O. 3121 */ 3122 jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */ 3123 jrecmax = (fs->fs_bsize / devbsize) * jrecmin; 3124 segwritten = 0; 3125 for (;;) { 3126 cnt = ump->softdep_on_journal; 3127 /* 3128 * Criteria for writing a segment: 3129 * 1) We have a full block. 3130 * 2) We're called from jwait() and haven't found the 3131 * journal item yet. 3132 * 3) Always write if needseg is set. 3133 * 4) If we are called from process_worklist and have 3134 * not yet written anything we write a partial block 3135 * to enforce a 1 second maximum latency on journal 3136 * entries. 
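 *
 * To illustrate the sizing above: with a 512-byte device block, a 16K
 * filesystem block and 32-byte journal records, for example, jrecmin
 * is 15 records per device block (one slot is reserved for the
 * segment header) and jrecmax is 480 records per filesystem block.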
3137 */ 3138 if (cnt < (jrecmax - 1) && needwk == NULL && 3139 jblocks->jb_needseg == 0 && (segwritten || cnt == 0)) 3140 break; 3141 cnt++; 3142 /* 3143 * Verify some free journal space. softdep_prealloc() should 3144 * guarantee that we don't run out so this is indicative of 3145 * a problem with the flow control. Try to recover 3146 * gracefully in any event. 3147 */ 3148 while (jblocks->jb_free == 0) { 3149 if (flags != MNT_WAIT) 3150 break; 3151 printf("softdep: Out of journal space!\n"); 3152 softdep_speedup(); 3153 msleep(jblocks, &lk, PRIBIO, "jblocks", hz); 3154 } 3155 FREE_LOCK(&lk); 3156 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS); 3157 workitem_alloc(&jseg->js_list, D_JSEG, mp); 3158 LIST_INIT(&jseg->js_entries); 3159 LIST_INIT(&jseg->js_indirs); 3160 jseg->js_state = ATTACHED; 3161 jseg->js_jblocks = jblocks; 3162 bp = geteblk(fs->fs_bsize, 0); 3163 ACQUIRE_LOCK(&lk); 3164 /* 3165 * If there was a race while we were allocating the block 3166 * and jseg the entry we care about was likely written. 3167 * We bail out in both the WAIT and NOWAIT case and assume 3168 * the caller will loop if the entry it cares about is 3169 * not written. 3170 */ 3171 cnt = ump->softdep_on_journal; 3172 if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) { 3173 bp->b_flags |= B_INVAL | B_NOCACHE; 3174 WORKITEM_FREE(jseg, D_JSEG); 3175 FREE_LOCK(&lk); 3176 brelse(bp); 3177 ACQUIRE_LOCK(&lk); 3178 break; 3179 } 3180 /* 3181 * Calculate the disk block size required for the available 3182 * records rounded to the min size. 3183 */ 3184 if (cnt == 0) 3185 size = devbsize; 3186 else if (cnt < jrecmax) 3187 size = howmany(cnt, jrecmin) * devbsize; 3188 else 3189 size = fs->fs_bsize; 3190 /* 3191 * Allocate a disk block for this journal data and account 3192 * for truncation of the requested size if enough contiguous 3193 * space was not available. 3194 */ 3195 bp->b_blkno = jblocks_alloc(jblocks, size, &size); 3196 bp->b_lblkno = bp->b_blkno; 3197 bp->b_offset = bp->b_blkno * DEV_BSIZE; 3198 bp->b_bcount = size; 3199 bp->b_bufobj = &ump->um_devvp->v_bufobj; 3200 bp->b_flags &= ~B_INVAL; 3201 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY; 3202 /* 3203 * Initialize our jseg with cnt records. Assign the next 3204 * sequence number to it and link it in-order. 3205 */ 3206 cnt = MIN(cnt, (size / devbsize) * jrecmin); 3207 jseg->js_buf = bp; 3208 jseg->js_cnt = cnt; 3209 jseg->js_refs = cnt + 1; /* Self ref. */ 3210 jseg->js_size = size; 3211 jseg->js_seq = jblocks->jb_nextseq++; 3212 if (jblocks->jb_oldestseg == NULL) 3213 jblocks->jb_oldestseg = jseg; 3214 jseg->js_oldseq = jblocks->jb_oldestseg->js_seq; 3215 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); 3216 if (jblocks->jb_writeseg == NULL) 3217 jblocks->jb_writeseg = jseg; 3218 /* 3219 * Start filling in records from the pending list. 3220 */ 3221 data = bp->b_data; 3222 off = 0; 3223 while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) 3224 != NULL) { 3225 if (cnt == 0) 3226 break; 3227 /* Place a segment header on every device block. 
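 * The header occupies one record slot in each device block, which is
 * why jrecmin allows for one record fewer than would otherwise fit.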
*/ 3228 if ((off % devbsize) == 0) { 3229 jseg_write(ump, jseg, data); 3230 off += JREC_SIZE; 3231 data = bp->b_data + off; 3232 } 3233 if (wk == needwk) 3234 needwk = NULL; 3235 remove_from_journal(wk); 3236 wk->wk_state |= INPROGRESS; 3237 WORKLIST_INSERT(&jseg->js_entries, wk); 3238 switch (wk->wk_type) { 3239 case D_JADDREF: 3240 jaddref_write(WK_JADDREF(wk), jseg, data); 3241 break; 3242 case D_JREMREF: 3243 jremref_write(WK_JREMREF(wk), jseg, data); 3244 break; 3245 case D_JMVREF: 3246 jmvref_write(WK_JMVREF(wk), jseg, data); 3247 break; 3248 case D_JNEWBLK: 3249 jnewblk_write(WK_JNEWBLK(wk), jseg, data); 3250 break; 3251 case D_JFREEBLK: 3252 jfreeblk_write(WK_JFREEBLK(wk), jseg, data); 3253 break; 3254 case D_JFREEFRAG: 3255 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data); 3256 break; 3257 case D_JTRUNC: 3258 jtrunc_write(WK_JTRUNC(wk), jseg, data); 3259 break; 3260 case D_JFSYNC: 3261 jfsync_write(WK_JFSYNC(wk), jseg, data); 3262 break; 3263 default: 3264 panic("process_journal: Unknown type %s", 3265 TYPENAME(wk->wk_type)); 3266 /* NOTREACHED */ 3267 } 3268 off += JREC_SIZE; 3269 data = bp->b_data + off; 3270 cnt--; 3271 } 3272 /* 3273 * Write this one buffer and continue. 3274 */ 3275 segwritten = 1; 3276 jblocks->jb_needseg = 0; 3277 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); 3278 FREE_LOCK(&lk); 3279 BO_LOCK(bp->b_bufobj); 3280 bgetvp(ump->um_devvp, bp); 3281 BO_UNLOCK(bp->b_bufobj); 3282 /* 3283 * We only do the blocking wait once we find the journal 3284 * entry we're looking for. 3285 */ 3286 if (needwk == NULL && flags == MNT_WAIT) 3287 bwrite(bp); 3288 else 3289 bawrite(bp); 3290 ACQUIRE_LOCK(&lk); 3291 } 3292 /* 3293 * If we've suspended the filesystem because we ran out of journal 3294 * space either try to sync it here to make some progress or 3295 * unsuspend it if we already have. 3296 */ 3297 if (flags == 0 && jblocks->jb_suspended) { 3298 if (journal_unsuspend(ump)) 3299 return; 3300 FREE_LOCK(&lk); 3301 VFS_SYNC(mp, MNT_NOWAIT); 3302 ffs_sbupdate(ump, MNT_WAIT, 0); 3303 ACQUIRE_LOCK(&lk); 3304 } 3305 } 3306 3307 /* 3308 * Complete a jseg, allowing all dependencies awaiting journal writes 3309 * to proceed. Each journal dependency also attaches a jsegdep to dependent 3310 * structures so that the journal segment can be freed to reclaim space. 3311 */ 3312 static void 3313 complete_jseg(jseg) 3314 struct jseg *jseg; 3315 { 3316 struct worklist *wk; 3317 struct jmvref *jmvref; 3318 int waiting; 3319 #ifdef INVARIANTS 3320 int i = 0; 3321 #endif 3322 3323 while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { 3324 WORKLIST_REMOVE(wk); 3325 waiting = wk->wk_state & IOWAITING; 3326 wk->wk_state &= ~(INPROGRESS | IOWAITING); 3327 wk->wk_state |= COMPLETE; 3328 KASSERT(i++ < jseg->js_cnt, 3329 ("handle_written_jseg: overflow %d >= %d", 3330 i - 1, jseg->js_cnt)); 3331 switch (wk->wk_type) { 3332 case D_JADDREF: 3333 handle_written_jaddref(WK_JADDREF(wk)); 3334 break; 3335 case D_JREMREF: 3336 handle_written_jremref(WK_JREMREF(wk)); 3337 break; 3338 case D_JMVREF: 3339 rele_jseg(jseg); /* No jsegdep. 
*/ 3340 jmvref = WK_JMVREF(wk); 3341 LIST_REMOVE(jmvref, jm_deps); 3342 if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0) 3343 free_pagedep(jmvref->jm_pagedep); 3344 WORKITEM_FREE(jmvref, D_JMVREF); 3345 break; 3346 case D_JNEWBLK: 3347 handle_written_jnewblk(WK_JNEWBLK(wk)); 3348 break; 3349 case D_JFREEBLK: 3350 handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep); 3351 break; 3352 case D_JTRUNC: 3353 handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep); 3354 break; 3355 case D_JFSYNC: 3356 rele_jseg(jseg); /* No jsegdep. */ 3357 WORKITEM_FREE(wk, D_JFSYNC); 3358 break; 3359 case D_JFREEFRAG: 3360 handle_written_jfreefrag(WK_JFREEFRAG(wk)); 3361 break; 3362 default: 3363 panic("handle_written_jseg: Unknown type %s", 3364 TYPENAME(wk->wk_type)); 3365 /* NOTREACHED */ 3366 } 3367 if (waiting) 3368 wakeup(wk); 3369 } 3370 /* Release the self reference so the structure may be freed. */ 3371 rele_jseg(jseg); 3372 } 3373 3374 /* 3375 * Mark a jseg as DEPCOMPLETE and throw away the buffer. Handle jseg 3376 * completions in order only. 3377 */ 3378 static void 3379 handle_written_jseg(jseg, bp) 3380 struct jseg *jseg; 3381 struct buf *bp; 3382 { 3383 struct jblocks *jblocks; 3384 struct jseg *jsegn; 3385 3386 if (jseg->js_refs == 0) 3387 panic("handle_written_jseg: No self-reference on %p", jseg); 3388 jseg->js_state |= DEPCOMPLETE; 3389 /* 3390 * We'll never need this buffer again, set flags so it will be 3391 * discarded. 3392 */ 3393 bp->b_flags |= B_INVAL | B_NOCACHE; 3394 jblocks = jseg->js_jblocks; 3395 /* 3396 * Don't allow out of order completions. If this isn't the first 3397 * block wait for it to write before we're done. 3398 */ 3399 if (jseg != jblocks->jb_writeseg) 3400 return; 3401 /* Iterate through available jsegs processing their entries. */ 3402 do { 3403 jblocks->jb_oldestwrseq = jseg->js_oldseq; 3404 jsegn = TAILQ_NEXT(jseg, js_next); 3405 complete_jseg(jseg); 3406 jseg = jsegn; 3407 } while (jseg && jseg->js_state & DEPCOMPLETE); 3408 jblocks->jb_writeseg = jseg; 3409 /* 3410 * Attempt to free jsegs now that oldestwrseq may have advanced. 3411 */ 3412 free_jsegs(jblocks); 3413 } 3414 3415 static inline struct jsegdep * 3416 inoref_jseg(inoref) 3417 struct inoref *inoref; 3418 { 3419 struct jsegdep *jsegdep; 3420 3421 jsegdep = inoref->if_jsegdep; 3422 inoref->if_jsegdep = NULL; 3423 3424 return (jsegdep); 3425 } 3426 3427 /* 3428 * Called once a jremref has made it to stable store. The jremref is marked 3429 * complete and we attempt to free it. Any pagedeps writes sleeping waiting 3430 * for the jremref to complete will be awoken by free_jremref. 3431 */ 3432 static void 3433 handle_written_jremref(jremref) 3434 struct jremref *jremref; 3435 { 3436 struct inodedep *inodedep; 3437 struct jsegdep *jsegdep; 3438 struct dirrem *dirrem; 3439 3440 /* Grab the jsegdep. */ 3441 jsegdep = inoref_jseg(&jremref->jr_ref); 3442 /* 3443 * Remove us from the inoref list. 3444 */ 3445 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 3446 0, &inodedep) == 0) 3447 panic("handle_written_jremref: Lost inodedep"); 3448 TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 3449 /* 3450 * Complete the dirrem. 
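 * Once the last pending jremref for the dirrem has been written and
 * the dirrem is otherwise COMPLETE it is queued on the worklist; the
 * jsegdep collected above rides along in the dirrem's journal work
 * list so the segment is not reclaimed until the removal finishes.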
3451 */ 3452 dirrem = jremref->jr_dirrem; 3453 jremref->jr_dirrem = NULL; 3454 LIST_REMOVE(jremref, jr_deps); 3455 jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; 3456 jwork_insert(&dirrem->dm_jwork, jsegdep); 3457 if (LIST_EMPTY(&dirrem->dm_jremrefhd) && 3458 (dirrem->dm_state & COMPLETE) != 0) 3459 add_to_worklist(&dirrem->dm_list, 0); 3460 free_jremref(jremref); 3461 } 3462 3463 /* 3464 * Called once a jaddref has made it to stable store. The dependency is 3465 * marked complete and any dependent structures are added to the inode 3466 * bufwait list to be completed as soon as it is written. If a bitmap write 3467 * depends on this entry we move the inode into the inodedephd of the 3468 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. 3469 */ 3470 static void 3471 handle_written_jaddref(jaddref) 3472 struct jaddref *jaddref; 3473 { 3474 struct jsegdep *jsegdep; 3475 struct inodedep *inodedep; 3476 struct diradd *diradd; 3477 struct mkdir *mkdir; 3478 3479 /* Grab the jsegdep. */ 3480 jsegdep = inoref_jseg(&jaddref->ja_ref); 3481 mkdir = NULL; 3482 diradd = NULL; 3483 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 3484 0, &inodedep) == 0) 3485 panic("handle_written_jaddref: Lost inodedep."); 3486 if (jaddref->ja_diradd == NULL) 3487 panic("handle_written_jaddref: No dependency"); 3488 if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { 3489 diradd = jaddref->ja_diradd; 3490 WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); 3491 } else if (jaddref->ja_state & MKDIR_PARENT) { 3492 mkdir = jaddref->ja_mkdir; 3493 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); 3494 } else if (jaddref->ja_state & MKDIR_BODY) 3495 mkdir = jaddref->ja_mkdir; 3496 else 3497 panic("handle_written_jaddref: Unknown dependency %p", 3498 jaddref->ja_diradd); 3499 jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ 3500 /* 3501 * Remove us from the inode list. 3502 */ 3503 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); 3504 /* 3505 * The mkdir may be waiting on the jaddref to clear before freeing. 3506 */ 3507 if (mkdir) { 3508 KASSERT(mkdir->md_list.wk_type == D_MKDIR, 3509 ("handle_written_jaddref: Incorrect type for mkdir %s", 3510 TYPENAME(mkdir->md_list.wk_type))); 3511 mkdir->md_jaddref = NULL; 3512 diradd = mkdir->md_diradd; 3513 mkdir->md_state |= DEPCOMPLETE; 3514 complete_mkdir(mkdir); 3515 } 3516 jwork_insert(&diradd->da_jwork, jsegdep); 3517 if (jaddref->ja_state & NEWBLOCK) { 3518 inodedep->id_state |= ONDEPLIST; 3519 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, 3520 inodedep, id_deps); 3521 } 3522 free_jaddref(jaddref); 3523 } 3524 3525 /* 3526 * Called once a jnewblk journal is written. The allocdirect or allocindir 3527 * is placed in the bmsafemap to await notification of a written bitmap. If 3528 * the operation was canceled we add the segdep to the appropriate 3529 * dependency to free the journal space once the canceling operation 3530 * completes. 3531 */ 3532 static void 3533 handle_written_jnewblk(jnewblk) 3534 struct jnewblk *jnewblk; 3535 { 3536 struct bmsafemap *bmsafemap; 3537 struct freefrag *freefrag; 3538 struct freework *freework; 3539 struct jsegdep *jsegdep; 3540 struct newblk *newblk; 3541 3542 /* Grab the jsegdep. 
*/ 3543 jsegdep = jnewblk->jn_jsegdep; 3544 jnewblk->jn_jsegdep = NULL; 3545 if (jnewblk->jn_dep == NULL) 3546 panic("handle_written_jnewblk: No dependency for the segdep."); 3547 switch (jnewblk->jn_dep->wk_type) { 3548 case D_NEWBLK: 3549 case D_ALLOCDIRECT: 3550 case D_ALLOCINDIR: 3551 /* 3552 * Add the written block to the bmsafemap so it can 3553 * be notified when the bitmap is on disk. 3554 */ 3555 newblk = WK_NEWBLK(jnewblk->jn_dep); 3556 newblk->nb_jnewblk = NULL; 3557 if ((newblk->nb_state & GOINGAWAY) == 0) { 3558 bmsafemap = newblk->nb_bmsafemap; 3559 newblk->nb_state |= ONDEPLIST; 3560 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, 3561 nb_deps); 3562 } 3563 jwork_insert(&newblk->nb_jwork, jsegdep); 3564 break; 3565 case D_FREEFRAG: 3566 /* 3567 * A newblock being removed by a freefrag when replaced by 3568 * frag extension. 3569 */ 3570 freefrag = WK_FREEFRAG(jnewblk->jn_dep); 3571 freefrag->ff_jdep = NULL; 3572 WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list); 3573 break; 3574 case D_FREEWORK: 3575 /* 3576 * A direct block was removed by truncate. 3577 */ 3578 freework = WK_FREEWORK(jnewblk->jn_dep); 3579 freework->fw_jnewblk = NULL; 3580 WORKLIST_INSERT(&freework->fw_freeblks->fb_jwork, 3581 &jsegdep->jd_list); 3582 break; 3583 default: 3584 panic("handle_written_jnewblk: Unknown type %d.", 3585 jnewblk->jn_dep->wk_type); 3586 } 3587 jnewblk->jn_dep = NULL; 3588 free_jnewblk(jnewblk); 3589 } 3590 3591 /* 3592 * Cancel a jfreefrag that won't be needed, probably due to colliding with 3593 * an in-flight allocation that has not yet been committed. Divorce us 3594 * from the freefrag and mark it DEPCOMPLETE so that it may be added 3595 * to the worklist. 3596 */ 3597 static void 3598 cancel_jfreefrag(jfreefrag) 3599 struct jfreefrag *jfreefrag; 3600 { 3601 struct freefrag *freefrag; 3602 3603 if (jfreefrag->fr_jsegdep) { 3604 free_jsegdep(jfreefrag->fr_jsegdep); 3605 jfreefrag->fr_jsegdep = NULL; 3606 } 3607 freefrag = jfreefrag->fr_freefrag; 3608 jfreefrag->fr_freefrag = NULL; 3609 free_jfreefrag(jfreefrag); 3610 freefrag->ff_state |= DEPCOMPLETE; 3611 } 3612 3613 /* 3614 * Free a jfreefrag when the parent freefrag is rendered obsolete. 3615 */ 3616 static void 3617 free_jfreefrag(jfreefrag) 3618 struct jfreefrag *jfreefrag; 3619 { 3620 3621 if (jfreefrag->fr_state & INPROGRESS) 3622 WORKLIST_REMOVE(&jfreefrag->fr_list); 3623 else if (jfreefrag->fr_state & ONWORKLIST) 3624 remove_from_journal(&jfreefrag->fr_list); 3625 if (jfreefrag->fr_freefrag != NULL) 3626 panic("free_jfreefrag: Still attached to a freefrag."); 3627 WORKITEM_FREE(jfreefrag, D_JFREEFRAG); 3628 } 3629 3630 /* 3631 * Called when the journal write for a jfreefrag completes. The parent 3632 * freefrag is added to the worklist if this completes its dependencies. 3633 */ 3634 static void 3635 handle_written_jfreefrag(jfreefrag) 3636 struct jfreefrag *jfreefrag; 3637 { 3638 struct jsegdep *jsegdep; 3639 struct freefrag *freefrag; 3640 3641 /* Grab the jsegdep. 
*/ 3642 jsegdep = jfreefrag->fr_jsegdep; 3643 jfreefrag->fr_jsegdep = NULL; 3644 freefrag = jfreefrag->fr_freefrag; 3645 if (freefrag == NULL) 3646 panic("handle_written_jfreefrag: No freefrag."); 3647 freefrag->ff_state |= DEPCOMPLETE; 3648 freefrag->ff_jdep = NULL; 3649 jwork_insert(&freefrag->ff_jwork, jsegdep); 3650 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 3651 add_to_worklist(&freefrag->ff_list, 0); 3652 jfreefrag->fr_freefrag = NULL; 3653 free_jfreefrag(jfreefrag); 3654 } 3655 3656 /* 3657 * Called when the journal write for a jfreeblk completes. The jfreeblk 3658 * is removed from the freeblks list of pending journal writes and the 3659 * jsegdep is moved to the freeblks jwork to be completed when all blocks 3660 * have been reclaimed. 3661 */ 3662 static void 3663 handle_written_jblkdep(jblkdep) 3664 struct jblkdep *jblkdep; 3665 { 3666 struct freeblks *freeblks; 3667 struct jsegdep *jsegdep; 3668 3669 /* Grab the jsegdep. */ 3670 jsegdep = jblkdep->jb_jsegdep; 3671 jblkdep->jb_jsegdep = NULL; 3672 freeblks = jblkdep->jb_freeblks; 3673 LIST_REMOVE(jblkdep, jb_deps); 3674 WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list); 3675 /* 3676 * If the freeblks is all journaled, we can add it to the worklist. 3677 */ 3678 if (LIST_EMPTY(&freeblks->fb_jblkdephd) && 3679 (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 3680 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 3681 3682 free_jblkdep(jblkdep); 3683 } 3684 3685 static struct jsegdep * 3686 newjsegdep(struct worklist *wk) 3687 { 3688 struct jsegdep *jsegdep; 3689 3690 jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS); 3691 workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp); 3692 jsegdep->jd_seg = NULL; 3693 3694 return (jsegdep); 3695 } 3696 3697 static struct jmvref * 3698 newjmvref(dp, ino, oldoff, newoff) 3699 struct inode *dp; 3700 ino_t ino; 3701 off_t oldoff; 3702 off_t newoff; 3703 { 3704 struct jmvref *jmvref; 3705 3706 jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS); 3707 workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump)); 3708 jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE; 3709 jmvref->jm_parent = dp->i_number; 3710 jmvref->jm_ino = ino; 3711 jmvref->jm_oldoff = oldoff; 3712 jmvref->jm_newoff = newoff; 3713 3714 return (jmvref); 3715 } 3716 3717 /* 3718 * Allocate a new jremref that tracks the removal of ip from dp with the 3719 * directory entry offset of diroff. Mark the entry as ATTACHED and 3720 * DEPCOMPLETE as we have all the information required for the journal write 3721 * and the directory has already been removed from the buffer. The caller 3722 * is responsible for linking the jremref into the pagedep and adding it 3723 * to the journal to write. The MKDIR_PARENT flag is set if we're doing 3724 * a DOTDOT addition so handle_workitem_remove() can properly assign 3725 * the jsegdep when we're done. 
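 * The jsegdep that will eventually accompany the removal is allocated
 * up front by newinoref(), together with the rest of the inoref state.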
3726 */ 3727 static struct jremref * 3728 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip, 3729 off_t diroff, nlink_t nlink) 3730 { 3731 struct jremref *jremref; 3732 3733 jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS); 3734 workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump)); 3735 jremref->jr_state = ATTACHED; 3736 newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff, 3737 nlink, ip->i_mode); 3738 jremref->jr_dirrem = dirrem; 3739 3740 return (jremref); 3741 } 3742 3743 static inline void 3744 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff, 3745 nlink_t nlink, uint16_t mode) 3746 { 3747 3748 inoref->if_jsegdep = newjsegdep(&inoref->if_list); 3749 inoref->if_diroff = diroff; 3750 inoref->if_ino = ino; 3751 inoref->if_parent = parent; 3752 inoref->if_nlink = nlink; 3753 inoref->if_mode = mode; 3754 } 3755 3756 /* 3757 * Allocate a new jaddref to track the addition of ino to dp at diroff. The 3758 * directory offset may not be known until later. The caller is responsible 3759 * adding the entry to the journal when this information is available. nlink 3760 * should be the link count prior to the addition and mode is only required 3761 * to have the correct FMT. 3762 */ 3763 static struct jaddref * 3764 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink, 3765 uint16_t mode) 3766 { 3767 struct jaddref *jaddref; 3768 3769 jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS); 3770 workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump)); 3771 jaddref->ja_state = ATTACHED; 3772 jaddref->ja_mkdir = NULL; 3773 newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode); 3774 3775 return (jaddref); 3776 } 3777 3778 /* 3779 * Create a new free dependency for a freework. The caller is responsible 3780 * for adjusting the reference count when it has the lock held. The freedep 3781 * will track an outstanding bitmap write that will ultimately clear the 3782 * freework to continue. 3783 */ 3784 static struct freedep * 3785 newfreedep(struct freework *freework) 3786 { 3787 struct freedep *freedep; 3788 3789 freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); 3790 workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); 3791 freedep->fd_freework = freework; 3792 3793 return (freedep); 3794 } 3795 3796 /* 3797 * Free a freedep structure once the buffer it is linked to is written. If 3798 * this is the last reference to the freework schedule it for completion. 3799 */ 3800 static void 3801 free_freedep(freedep) 3802 struct freedep *freedep; 3803 { 3804 struct freework *freework; 3805 3806 freework = freedep->fd_freework; 3807 freework->fw_freeblks->fb_cgwait--; 3808 if (--freework->fw_ref == 0) 3809 freework_enqueue(freework); 3810 WORKITEM_FREE(freedep, D_FREEDEP); 3811 } 3812 3813 /* 3814 * Allocate a new freework structure that may be a level in an indirect 3815 * when parent is not NULL or a top level block when it is. The top level 3816 * freework structures are allocated without lk held and before the freeblks 3817 * is visible outside of softdep_setup_freeblocks(). 
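 * For journaled (SUJ) indirect blocks the initial reference count is
 * NINDIR + 1, roughly one reference per child pointer plus one for the
 * indirect block itself; direct blocks and non-journaled mounts start
 * at zero.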
3818 */ 3819 static struct freework * 3820 newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal) 3821 struct ufsmount *ump; 3822 struct freeblks *freeblks; 3823 struct freework *parent; 3824 ufs_lbn_t lbn; 3825 ufs2_daddr_t nb; 3826 int frags; 3827 int off; 3828 int journal; 3829 { 3830 struct freework *freework; 3831 3832 freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); 3833 workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); 3834 freework->fw_state = ATTACHED; 3835 freework->fw_jnewblk = NULL; 3836 freework->fw_freeblks = freeblks; 3837 freework->fw_parent = parent; 3838 freework->fw_lbn = lbn; 3839 freework->fw_blkno = nb; 3840 freework->fw_frags = frags; 3841 freework->fw_indir = NULL; 3842 freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR) 3843 ? 0 : NINDIR(ump->um_fs) + 1; 3844 freework->fw_start = freework->fw_off = off; 3845 if (journal) 3846 newjfreeblk(freeblks, lbn, nb, frags); 3847 if (parent == NULL) { 3848 ACQUIRE_LOCK(&lk); 3849 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); 3850 freeblks->fb_ref++; 3851 FREE_LOCK(&lk); 3852 } 3853 3854 return (freework); 3855 } 3856 3857 /* 3858 * Eliminate a jfreeblk for a block that does not need journaling. 3859 */ 3860 static void 3861 cancel_jfreeblk(freeblks, blkno) 3862 struct freeblks *freeblks; 3863 ufs2_daddr_t blkno; 3864 { 3865 struct jfreeblk *jfreeblk; 3866 struct jblkdep *jblkdep; 3867 3868 LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) { 3869 if (jblkdep->jb_list.wk_type != D_JFREEBLK) 3870 continue; 3871 jfreeblk = WK_JFREEBLK(&jblkdep->jb_list); 3872 if (jfreeblk->jf_blkno == blkno) 3873 break; 3874 } 3875 if (jblkdep == NULL) 3876 return; 3877 free_jsegdep(jblkdep->jb_jsegdep); 3878 LIST_REMOVE(jblkdep, jb_deps); 3879 WORKITEM_FREE(jfreeblk, D_JFREEBLK); 3880 } 3881 3882 /* 3883 * Allocate a new jfreeblk to journal top level block pointer when truncating 3884 * a file. The caller must add this to the worklist when lk is held. 3885 */ 3886 static struct jfreeblk * 3887 newjfreeblk(freeblks, lbn, blkno, frags) 3888 struct freeblks *freeblks; 3889 ufs_lbn_t lbn; 3890 ufs2_daddr_t blkno; 3891 int frags; 3892 { 3893 struct jfreeblk *jfreeblk; 3894 3895 jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); 3896 workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK, 3897 freeblks->fb_list.wk_mp); 3898 jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list); 3899 jfreeblk->jf_dep.jb_freeblks = freeblks; 3900 jfreeblk->jf_ino = freeblks->fb_inum; 3901 jfreeblk->jf_lbn = lbn; 3902 jfreeblk->jf_blkno = blkno; 3903 jfreeblk->jf_frags = frags; 3904 LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps); 3905 3906 return (jfreeblk); 3907 } 3908 3909 /* 3910 * Allocate a new jtrunc to track a partial truncation. 
3911 */ 3912 static struct jtrunc * 3913 newjtrunc(freeblks, size, extsize) 3914 struct freeblks *freeblks; 3915 off_t size; 3916 int extsize; 3917 { 3918 struct jtrunc *jtrunc; 3919 3920 jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS); 3921 workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC, 3922 freeblks->fb_list.wk_mp); 3923 jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list); 3924 jtrunc->jt_dep.jb_freeblks = freeblks; 3925 jtrunc->jt_ino = freeblks->fb_inum; 3926 jtrunc->jt_size = size; 3927 jtrunc->jt_extsize = extsize; 3928 LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps); 3929 3930 return (jtrunc); 3931 } 3932 3933 /* 3934 * If we're canceling a new bitmap we have to search for another ref 3935 * to move into the bmsafemap dep. This might be better expressed 3936 * with another structure. 3937 */ 3938 static void 3939 move_newblock_dep(jaddref, inodedep) 3940 struct jaddref *jaddref; 3941 struct inodedep *inodedep; 3942 { 3943 struct inoref *inoref; 3944 struct jaddref *jaddrefn; 3945 3946 jaddrefn = NULL; 3947 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; 3948 inoref = TAILQ_NEXT(inoref, if_deps)) { 3949 if ((jaddref->ja_state & NEWBLOCK) && 3950 inoref->if_list.wk_type == D_JADDREF) { 3951 jaddrefn = (struct jaddref *)inoref; 3952 break; 3953 } 3954 } 3955 if (jaddrefn == NULL) 3956 return; 3957 jaddrefn->ja_state &= ~(ATTACHED | UNDONE); 3958 jaddrefn->ja_state |= jaddref->ja_state & 3959 (ATTACHED | UNDONE | NEWBLOCK); 3960 jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); 3961 jaddref->ja_state |= ATTACHED; 3962 LIST_REMOVE(jaddref, ja_bmdeps); 3963 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, 3964 ja_bmdeps); 3965 } 3966 3967 /* 3968 * Cancel a jaddref either before it has been written or while it is being 3969 * written. This happens when a link is removed before the add reaches 3970 * the disk. The jaddref dependency is kept linked into the bmsafemap 3971 * and inode to prevent the link count or bitmap from reaching the disk 3972 * until handle_workitem_remove() re-adjusts the counts and bitmaps as 3973 * required. 3974 * 3975 * Returns 1 if the canceled addref requires journaling of the remove and 3976 * 0 otherwise. 3977 */ 3978 static int 3979 cancel_jaddref(jaddref, inodedep, wkhd) 3980 struct jaddref *jaddref; 3981 struct inodedep *inodedep; 3982 struct workhead *wkhd; 3983 { 3984 struct inoref *inoref; 3985 struct jsegdep *jsegdep; 3986 int needsj; 3987 3988 KASSERT((jaddref->ja_state & COMPLETE) == 0, 3989 ("cancel_jaddref: Canceling complete jaddref")); 3990 if (jaddref->ja_state & (INPROGRESS | COMPLETE)) 3991 needsj = 1; 3992 else 3993 needsj = 0; 3994 if (inodedep == NULL) 3995 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 3996 0, &inodedep) == 0) 3997 panic("cancel_jaddref: Lost inodedep"); 3998 /* 3999 * We must adjust the nlink of any reference operation that follows 4000 * us so that it is consistent with the in-memory reference. This 4001 * ensures that inode nlink rollbacks always have the correct link. 
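	 *
	 * For example, if two link additions are pending with recorded
	 * link counts N and N + 1 and the first is canceled before it was
	 * ever journaled, the loop below lowers the survivor to N, exactly
	 * as if the canceled addition had never been set up.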
4002 	 */
4003 	if (needsj == 0) {
4004 		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4005 		    inoref = TAILQ_NEXT(inoref, if_deps)) {
4006 			if (inoref->if_state & GOINGAWAY)
4007 				break;
4008 			inoref->if_nlink--;
4009 		}
4010 	}
4011 	jsegdep = inoref_jseg(&jaddref->ja_ref);
4012 	if (jaddref->ja_state & NEWBLOCK)
4013 		move_newblock_dep(jaddref, inodedep);
4014 	wake_worklist(&jaddref->ja_list);
4015 	jaddref->ja_mkdir = NULL;
4016 	if (jaddref->ja_state & INPROGRESS) {
4017 		jaddref->ja_state &= ~INPROGRESS;
4018 		WORKLIST_REMOVE(&jaddref->ja_list);
4019 		jwork_insert(wkhd, jsegdep);
4020 	} else {
4021 		free_jsegdep(jsegdep);
4022 		if (jaddref->ja_state & DEPCOMPLETE)
4023 			remove_from_journal(&jaddref->ja_list);
4024 	}
4025 	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4026 	/*
4027 	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4028 	 * can arrange for them to be freed with the bitmap. Otherwise we
4029 	 * no longer need this addref attached to the inoreflst and it
4030 	 * will incorrectly adjust nlink if we leave it.
4031 	 */
4032 	if ((jaddref->ja_state & NEWBLOCK) == 0) {
4033 		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4034 		    if_deps);
4035 		jaddref->ja_state |= COMPLETE;
4036 		free_jaddref(jaddref);
4037 		return (needsj);
4038 	}
4039 	/*
4040 	 * Leave the head of the list for jsegdeps for fast merging.
4041 	 */
4042 	if (LIST_FIRST(wkhd) != NULL) {
4043 		jaddref->ja_state |= ONWORKLIST;
4044 		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4045 	} else
4046 		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4047 
4048 	return (needsj);
4049 }
4050 
4051 /*
4052  * Attempt to free a jaddref structure when some work completes. This
4053  * should only succeed once the entry is written and all dependencies have
4054  * been notified.
4055  */
4056 static void
4057 free_jaddref(jaddref)
4058 	struct jaddref *jaddref;
4059 {
4060 
4061 	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4062 		return;
4063 	if (jaddref->ja_ref.if_jsegdep)
4064 		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4065 		    jaddref, jaddref->ja_state);
4066 	if (jaddref->ja_state & NEWBLOCK)
4067 		LIST_REMOVE(jaddref, ja_bmdeps);
4068 	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4069 		panic("free_jaddref: Bad state %p(0x%X)",
4070 		    jaddref, jaddref->ja_state);
4071 	if (jaddref->ja_mkdir != NULL)
4072 		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4073 	WORKITEM_FREE(jaddref, D_JADDREF);
4074 }
4075 
4076 /*
4077  * Free a jremref structure once it has been written or discarded.
4078  */
4079 static void
4080 free_jremref(jremref)
4081 	struct jremref *jremref;
4082 {
4083 
4084 	if (jremref->jr_ref.if_jsegdep)
4085 		free_jsegdep(jremref->jr_ref.if_jsegdep);
4086 	if (jremref->jr_state & INPROGRESS)
4087 		panic("free_jremref: IO still pending");
4088 	WORKITEM_FREE(jremref, D_JREMREF);
4089 }
4090 
4091 /*
4092  * Free a jnewblk structure.
4093  */
4094 static void
4095 free_jnewblk(jnewblk)
4096 	struct jnewblk *jnewblk;
4097 {
4098 
4099 	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4100 		return;
4101 	LIST_REMOVE(jnewblk, jn_deps);
4102 	if (jnewblk->jn_dep != NULL)
4103 		panic("free_jnewblk: Dependency still attached.");
4104 	WORKITEM_FREE(jnewblk, D_JNEWBLK);
4105 }
4106 
4107 /*
4108  * Cancel a jnewblk which has been made redundant by frag extension.
4109 */ 4110 static void 4111 cancel_jnewblk(jnewblk, wkhd) 4112 struct jnewblk *jnewblk; 4113 struct workhead *wkhd; 4114 { 4115 struct jsegdep *jsegdep; 4116 4117 jsegdep = jnewblk->jn_jsegdep; 4118 if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL) 4119 panic("cancel_jnewblk: Invalid state"); 4120 jnewblk->jn_jsegdep = NULL; 4121 jnewblk->jn_dep = NULL; 4122 jnewblk->jn_state |= GOINGAWAY; 4123 if (jnewblk->jn_state & INPROGRESS) { 4124 jnewblk->jn_state &= ~INPROGRESS; 4125 WORKLIST_REMOVE(&jnewblk->jn_list); 4126 jwork_insert(wkhd, jsegdep); 4127 } else { 4128 free_jsegdep(jsegdep); 4129 remove_from_journal(&jnewblk->jn_list); 4130 } 4131 wake_worklist(&jnewblk->jn_list); 4132 WORKLIST_INSERT(wkhd, &jnewblk->jn_list); 4133 } 4134 4135 static void 4136 free_jblkdep(jblkdep) 4137 struct jblkdep *jblkdep; 4138 { 4139 4140 if (jblkdep->jb_list.wk_type == D_JFREEBLK) 4141 WORKITEM_FREE(jblkdep, D_JFREEBLK); 4142 else if (jblkdep->jb_list.wk_type == D_JTRUNC) 4143 WORKITEM_FREE(jblkdep, D_JTRUNC); 4144 else 4145 panic("free_jblkdep: Unexpected type %s", 4146 TYPENAME(jblkdep->jb_list.wk_type)); 4147 } 4148 4149 /* 4150 * Free a single jseg once it is no longer referenced in memory or on 4151 * disk. Reclaim journal blocks and dependencies waiting for the segment 4152 * to disappear. 4153 */ 4154 static void 4155 free_jseg(jseg, jblocks) 4156 struct jseg *jseg; 4157 struct jblocks *jblocks; 4158 { 4159 struct freework *freework; 4160 4161 /* 4162 * Free freework structures that were lingering to indicate freed 4163 * indirect blocks that forced journal write ordering on reallocate. 4164 */ 4165 while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL) 4166 indirblk_remove(freework); 4167 if (jblocks->jb_oldestseg == jseg) 4168 jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next); 4169 TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); 4170 jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size); 4171 KASSERT(LIST_EMPTY(&jseg->js_entries), 4172 ("free_jseg: Freed jseg has valid entries.")); 4173 WORKITEM_FREE(jseg, D_JSEG); 4174 } 4175 4176 /* 4177 * Free all jsegs that meet the criteria for being reclaimed and update 4178 * oldestseg. 4179 */ 4180 static void 4181 free_jsegs(jblocks) 4182 struct jblocks *jblocks; 4183 { 4184 struct jseg *jseg; 4185 4186 /* 4187 * Free only those jsegs which have none allocated before them to 4188 * preserve the journal space ordering. 4189 */ 4190 while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { 4191 /* 4192 * Only reclaim space when nothing depends on this journal 4193 * set and another set has written that it is no longer 4194 * valid. 4195 */ 4196 if (jseg->js_refs != 0) { 4197 jblocks->jb_oldestseg = jseg; 4198 return; 4199 } 4200 if (!LIST_EMPTY(&jseg->js_indirs) && 4201 jseg->js_seq >= jblocks->jb_oldestwrseq) 4202 break; 4203 free_jseg(jseg, jblocks); 4204 } 4205 /* 4206 * If we exited the loop above we still must discover the 4207 * oldest valid segment. 4208 */ 4209 if (jseg) 4210 for (jseg = jblocks->jb_oldestseg; jseg != NULL; 4211 jseg = TAILQ_NEXT(jseg, js_next)) 4212 if (jseg->js_refs != 0) 4213 break; 4214 jblocks->jb_oldestseg = jseg; 4215 /* 4216 * The journal has no valid records but some jsegs may still be 4217 * waiting on oldestwrseq to advance. We force a small record 4218 * out to permit these lingering records to be reclaimed. 
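	 * (Setting jb_needseg below is a request to the journal-writing
	 * code to emit such a record the next time a segment is built;
	 * nothing is written from this context.)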
4219 */ 4220 if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs)) 4221 jblocks->jb_needseg = 1; 4222 } 4223 4224 /* 4225 * Release one reference to a jseg and free it if the count reaches 0. This 4226 * should eventually reclaim journal space as well. 4227 */ 4228 static void 4229 rele_jseg(jseg) 4230 struct jseg *jseg; 4231 { 4232 4233 KASSERT(jseg->js_refs > 0, 4234 ("free_jseg: Invalid refcnt %d", jseg->js_refs)); 4235 if (--jseg->js_refs != 0) 4236 return; 4237 free_jsegs(jseg->js_jblocks); 4238 } 4239 4240 /* 4241 * Release a jsegdep and decrement the jseg count. 4242 */ 4243 static void 4244 free_jsegdep(jsegdep) 4245 struct jsegdep *jsegdep; 4246 { 4247 4248 if (jsegdep->jd_seg) 4249 rele_jseg(jsegdep->jd_seg); 4250 WORKITEM_FREE(jsegdep, D_JSEGDEP); 4251 } 4252 4253 /* 4254 * Wait for a journal item to make it to disk. Initiate journal processing 4255 * if required. 4256 */ 4257 static int 4258 jwait(wk, waitfor) 4259 struct worklist *wk; 4260 int waitfor; 4261 { 4262 4263 /* 4264 * Blocking journal waits cause slow synchronous behavior. Record 4265 * stats on the frequency of these blocking operations. 4266 */ 4267 if (waitfor == MNT_WAIT) { 4268 stat_journal_wait++; 4269 switch (wk->wk_type) { 4270 case D_JREMREF: 4271 case D_JMVREF: 4272 stat_jwait_filepage++; 4273 break; 4274 case D_JTRUNC: 4275 case D_JFREEBLK: 4276 stat_jwait_freeblks++; 4277 break; 4278 case D_JNEWBLK: 4279 stat_jwait_newblk++; 4280 break; 4281 case D_JADDREF: 4282 stat_jwait_inode++; 4283 break; 4284 default: 4285 break; 4286 } 4287 } 4288 /* 4289 * If IO has not started we process the journal. We can't mark the 4290 * worklist item as IOWAITING because we drop the lock while 4291 * processing the journal and the worklist entry may be freed after 4292 * this point. The caller may call back in and re-issue the request. 4293 */ 4294 if ((wk->wk_state & INPROGRESS) == 0) { 4295 softdep_process_journal(wk->wk_mp, wk, waitfor); 4296 if (waitfor != MNT_WAIT) 4297 return (EBUSY); 4298 return (0); 4299 } 4300 if (waitfor != MNT_WAIT) 4301 return (EBUSY); 4302 wait_worklist(wk, "jwait"); 4303 return (0); 4304 } 4305 4306 /* 4307 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as 4308 * appropriate. This is a convenience function to reduce duplicate code 4309 * for the setup and revert functions below. 4310 */ 4311 static struct inodedep * 4312 inodedep_lookup_ip(ip) 4313 struct inode *ip; 4314 { 4315 struct inodedep *inodedep; 4316 int dflags; 4317 4318 KASSERT(ip->i_nlink >= ip->i_effnlink, 4319 ("inodedep_lookup_ip: bad delta")); 4320 dflags = DEPALLOC; 4321 if (IS_SNAPSHOT(ip)) 4322 dflags |= NODELAY; 4323 (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, 4324 &inodedep); 4325 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 4326 KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); 4327 4328 return (inodedep); 4329 } 4330 4331 /* 4332 * Called prior to creating a new inode and linking it to a directory. The 4333 * jaddref structure must already be allocated by softdep_setup_inomapdep 4334 * and it is discovered here so we can initialize the mode and update 4335 * nlinkdelta. 
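 *
 * As a rough sketch of the expected ordering for a simple create (the
 * exact call sites live in the ffs allocation and ufs name-creation
 * code, so treat this as illustrative rather than exhaustive):
 *
 *	softdep_setup_inomapdep()	inode bitmap updated, jaddref made
 *	softdep_setup_create()		this routine; finds that jaddref
 *	softdep_setup_directory_add()	fills in the new entry's offset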
4336 */ 4337 void 4338 softdep_setup_create(dp, ip) 4339 struct inode *dp; 4340 struct inode *ip; 4341 { 4342 struct inodedep *inodedep; 4343 struct jaddref *jaddref; 4344 struct vnode *dvp; 4345 4346 KASSERT(ip->i_nlink == 1, 4347 ("softdep_setup_create: Invalid link count.")); 4348 dvp = ITOV(dp); 4349 ACQUIRE_LOCK(&lk); 4350 inodedep = inodedep_lookup_ip(ip); 4351 if (DOINGSUJ(dvp)) { 4352 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4353 inoreflst); 4354 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 4355 ("softdep_setup_create: No addref structure present.")); 4356 } 4357 softdep_prelink(dvp, NULL); 4358 FREE_LOCK(&lk); 4359 } 4360 4361 /* 4362 * Create a jaddref structure to track the addition of a DOTDOT link when 4363 * we are reparenting an inode as part of a rename. This jaddref will be 4364 * found by softdep_setup_directory_change. Adjusts nlinkdelta for 4365 * non-journaling softdep. 4366 */ 4367 void 4368 softdep_setup_dotdot_link(dp, ip) 4369 struct inode *dp; 4370 struct inode *ip; 4371 { 4372 struct inodedep *inodedep; 4373 struct jaddref *jaddref; 4374 struct vnode *dvp; 4375 struct vnode *vp; 4376 4377 dvp = ITOV(dp); 4378 vp = ITOV(ip); 4379 jaddref = NULL; 4380 /* 4381 * We don't set MKDIR_PARENT as this is not tied to a mkdir and 4382 * is used as a normal link would be. 4383 */ 4384 if (DOINGSUJ(dvp)) 4385 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 4386 dp->i_effnlink - 1, dp->i_mode); 4387 ACQUIRE_LOCK(&lk); 4388 inodedep = inodedep_lookup_ip(dp); 4389 if (jaddref) 4390 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4391 if_deps); 4392 softdep_prelink(dvp, ITOV(ip)); 4393 FREE_LOCK(&lk); 4394 } 4395 4396 /* 4397 * Create a jaddref structure to track a new link to an inode. The directory 4398 * offset is not known until softdep_setup_directory_add or 4399 * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling 4400 * softdep. 4401 */ 4402 void 4403 softdep_setup_link(dp, ip) 4404 struct inode *dp; 4405 struct inode *ip; 4406 { 4407 struct inodedep *inodedep; 4408 struct jaddref *jaddref; 4409 struct vnode *dvp; 4410 4411 dvp = ITOV(dp); 4412 jaddref = NULL; 4413 if (DOINGSUJ(dvp)) 4414 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, 4415 ip->i_mode); 4416 ACQUIRE_LOCK(&lk); 4417 inodedep = inodedep_lookup_ip(ip); 4418 if (jaddref) 4419 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4420 if_deps); 4421 softdep_prelink(dvp, ITOV(ip)); 4422 FREE_LOCK(&lk); 4423 } 4424 4425 /* 4426 * Called to create the jaddref structures to track . and .. references as 4427 * well as lookup and further initialize the incomplete jaddref created 4428 * by softdep_setup_inomapdep when the inode was allocated. Adjusts 4429 * nlinkdelta for non-journaling softdep. 
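 *
 * Put differently, a mkdir leaves three reference additions pending: the
 * "." entry in the new directory (MKDIR_BODY), the ".." entry that adds
 * a link to the parent (MKDIR_PARENT), and the entry for the new
 * directory in its parent, whose jaddref was created earlier by
 * softdep_setup_inomapdep and is found here via the inodedep.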
4430 */ 4431 void 4432 softdep_setup_mkdir(dp, ip) 4433 struct inode *dp; 4434 struct inode *ip; 4435 { 4436 struct inodedep *inodedep; 4437 struct jaddref *dotdotaddref; 4438 struct jaddref *dotaddref; 4439 struct jaddref *jaddref; 4440 struct vnode *dvp; 4441 4442 dvp = ITOV(dp); 4443 dotaddref = dotdotaddref = NULL; 4444 if (DOINGSUJ(dvp)) { 4445 dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, 4446 ip->i_mode); 4447 dotaddref->ja_state |= MKDIR_BODY; 4448 dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 4449 dp->i_effnlink - 1, dp->i_mode); 4450 dotdotaddref->ja_state |= MKDIR_PARENT; 4451 } 4452 ACQUIRE_LOCK(&lk); 4453 inodedep = inodedep_lookup_ip(ip); 4454 if (DOINGSUJ(dvp)) { 4455 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4456 inoreflst); 4457 KASSERT(jaddref != NULL, 4458 ("softdep_setup_mkdir: No addref structure present.")); 4459 KASSERT(jaddref->ja_parent == dp->i_number, 4460 ("softdep_setup_mkdir: bad parent %d", 4461 jaddref->ja_parent)); 4462 TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, 4463 if_deps); 4464 } 4465 inodedep = inodedep_lookup_ip(dp); 4466 if (DOINGSUJ(dvp)) 4467 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, 4468 &dotdotaddref->ja_ref, if_deps); 4469 softdep_prelink(ITOV(dp), NULL); 4470 FREE_LOCK(&lk); 4471 } 4472 4473 /* 4474 * Called to track nlinkdelta of the inode and parent directories prior to 4475 * unlinking a directory. 4476 */ 4477 void 4478 softdep_setup_rmdir(dp, ip) 4479 struct inode *dp; 4480 struct inode *ip; 4481 { 4482 struct vnode *dvp; 4483 4484 dvp = ITOV(dp); 4485 ACQUIRE_LOCK(&lk); 4486 (void) inodedep_lookup_ip(ip); 4487 (void) inodedep_lookup_ip(dp); 4488 softdep_prelink(dvp, ITOV(ip)); 4489 FREE_LOCK(&lk); 4490 } 4491 4492 /* 4493 * Called to track nlinkdelta of the inode and parent directories prior to 4494 * unlink. 4495 */ 4496 void 4497 softdep_setup_unlink(dp, ip) 4498 struct inode *dp; 4499 struct inode *ip; 4500 { 4501 struct vnode *dvp; 4502 4503 dvp = ITOV(dp); 4504 ACQUIRE_LOCK(&lk); 4505 (void) inodedep_lookup_ip(ip); 4506 (void) inodedep_lookup_ip(dp); 4507 softdep_prelink(dvp, ITOV(ip)); 4508 FREE_LOCK(&lk); 4509 } 4510 4511 /* 4512 * Called to release the journal structures created by a failed non-directory 4513 * creation. Adjusts nlinkdelta for non-journaling softdep. 4514 */ 4515 void 4516 softdep_revert_create(dp, ip) 4517 struct inode *dp; 4518 struct inode *ip; 4519 { 4520 struct inodedep *inodedep; 4521 struct jaddref *jaddref; 4522 struct vnode *dvp; 4523 4524 dvp = ITOV(dp); 4525 ACQUIRE_LOCK(&lk); 4526 inodedep = inodedep_lookup_ip(ip); 4527 if (DOINGSUJ(dvp)) { 4528 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4529 inoreflst); 4530 KASSERT(jaddref->ja_parent == dp->i_number, 4531 ("softdep_revert_create: addref parent mismatch")); 4532 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4533 } 4534 FREE_LOCK(&lk); 4535 } 4536 4537 /* 4538 * Called to release the journal structures created by a failed dotdot link 4539 * creation. Adjusts nlinkdelta for non-journaling softdep. 
4540 */ 4541 void 4542 softdep_revert_dotdot_link(dp, ip) 4543 struct inode *dp; 4544 struct inode *ip; 4545 { 4546 struct inodedep *inodedep; 4547 struct jaddref *jaddref; 4548 struct vnode *dvp; 4549 4550 dvp = ITOV(dp); 4551 ACQUIRE_LOCK(&lk); 4552 inodedep = inodedep_lookup_ip(dp); 4553 if (DOINGSUJ(dvp)) { 4554 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4555 inoreflst); 4556 KASSERT(jaddref->ja_parent == ip->i_number, 4557 ("softdep_revert_dotdot_link: addref parent mismatch")); 4558 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4559 } 4560 FREE_LOCK(&lk); 4561 } 4562 4563 /* 4564 * Called to release the journal structures created by a failed link 4565 * addition. Adjusts nlinkdelta for non-journaling softdep. 4566 */ 4567 void 4568 softdep_revert_link(dp, ip) 4569 struct inode *dp; 4570 struct inode *ip; 4571 { 4572 struct inodedep *inodedep; 4573 struct jaddref *jaddref; 4574 struct vnode *dvp; 4575 4576 dvp = ITOV(dp); 4577 ACQUIRE_LOCK(&lk); 4578 inodedep = inodedep_lookup_ip(ip); 4579 if (DOINGSUJ(dvp)) { 4580 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4581 inoreflst); 4582 KASSERT(jaddref->ja_parent == dp->i_number, 4583 ("softdep_revert_link: addref parent mismatch")); 4584 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4585 } 4586 FREE_LOCK(&lk); 4587 } 4588 4589 /* 4590 * Called to release the journal structures created by a failed mkdir 4591 * attempt. Adjusts nlinkdelta for non-journaling softdep. 4592 */ 4593 void 4594 softdep_revert_mkdir(dp, ip) 4595 struct inode *dp; 4596 struct inode *ip; 4597 { 4598 struct inodedep *inodedep; 4599 struct jaddref *jaddref; 4600 struct jaddref *dotaddref; 4601 struct vnode *dvp; 4602 4603 dvp = ITOV(dp); 4604 4605 ACQUIRE_LOCK(&lk); 4606 inodedep = inodedep_lookup_ip(dp); 4607 if (DOINGSUJ(dvp)) { 4608 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4609 inoreflst); 4610 KASSERT(jaddref->ja_parent == ip->i_number, 4611 ("softdep_revert_mkdir: dotdot addref parent mismatch")); 4612 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4613 } 4614 inodedep = inodedep_lookup_ip(ip); 4615 if (DOINGSUJ(dvp)) { 4616 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4617 inoreflst); 4618 KASSERT(jaddref->ja_parent == dp->i_number, 4619 ("softdep_revert_mkdir: addref parent mismatch")); 4620 dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, 4621 inoreflst, if_deps); 4622 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4623 KASSERT(dotaddref->ja_parent == ip->i_number, 4624 ("softdep_revert_mkdir: dot addref parent mismatch")); 4625 cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait); 4626 } 4627 FREE_LOCK(&lk); 4628 } 4629 4630 /* 4631 * Called to correct nlinkdelta after a failed rmdir. 4632 */ 4633 void 4634 softdep_revert_rmdir(dp, ip) 4635 struct inode *dp; 4636 struct inode *ip; 4637 { 4638 4639 ACQUIRE_LOCK(&lk); 4640 (void) inodedep_lookup_ip(ip); 4641 (void) inodedep_lookup_ip(dp); 4642 FREE_LOCK(&lk); 4643 } 4644 4645 /* 4646 * Protecting the freemaps (or bitmaps). 4647 * 4648 * To eliminate the need to execute fsck before mounting a filesystem 4649 * after a power failure, one must (conservatively) guarantee that the 4650 * on-disk copy of the bitmaps never indicate that a live inode or block is 4651 * free. So, when a block or inode is allocated, the bitmap should be 4652 * updated (on disk) before any new pointers. 
When a block or inode is 4653 * freed, the bitmap should not be updated until all pointers have been 4654 * reset. The latter dependency is handled by the delayed de-allocation 4655 * approach described below for block and inode de-allocation. The former 4656 * dependency is handled by calling the following procedure when a block or 4657 * inode is allocated. When an inode is allocated an "inodedep" is created 4658 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. 4659 * Each "inodedep" is also inserted into the hash indexing structure so 4660 * that any additional link additions can be made dependent on the inode 4661 * allocation. 4662 * 4663 * The ufs filesystem maintains a number of free block counts (e.g., per 4664 * cylinder group, per cylinder and per <cylinder, rotational position> pair) 4665 * in addition to the bitmaps. These counts are used to improve efficiency 4666 * during allocation and therefore must be consistent with the bitmaps. 4667 * There is no convenient way to guarantee post-crash consistency of these 4668 * counts with simple update ordering, for two main reasons: (1) The counts 4669 * and bitmaps for a single cylinder group block are not in the same disk 4670 * sector. If a disk write is interrupted (e.g., by power failure), one may 4671 * be written and the other not. (2) Some of the counts are located in the 4672 * superblock rather than the cylinder group block. So, we focus our soft 4673 * updates implementation on protecting the bitmaps. When mounting a 4674 * filesystem, we recompute the auxiliary counts from the bitmaps. 4675 */ 4676 4677 /* 4678 * Called just after updating the cylinder group block to allocate an inode. 4679 */ 4680 void 4681 softdep_setup_inomapdep(bp, ip, newinum, mode) 4682 struct buf *bp; /* buffer for cylgroup block with inode map */ 4683 struct inode *ip; /* inode related to allocation */ 4684 ino_t newinum; /* new inode number being allocated */ 4685 int mode; 4686 { 4687 struct inodedep *inodedep; 4688 struct bmsafemap *bmsafemap; 4689 struct jaddref *jaddref; 4690 struct mount *mp; 4691 struct fs *fs; 4692 4693 mp = UFSTOVFS(ip->i_ump); 4694 fs = ip->i_ump->um_fs; 4695 jaddref = NULL; 4696 4697 /* 4698 * Allocate the journal reference add structure so that the bitmap 4699 * can be dependent on it. 4700 */ 4701 if (MOUNTEDSUJ(mp)) { 4702 jaddref = newjaddref(ip, newinum, 0, 0, mode); 4703 jaddref->ja_state |= NEWBLOCK; 4704 } 4705 4706 /* 4707 * Create a dependency for the newly allocated inode. 4708 * Panic if it already exists as something is seriously wrong. 4709 * Otherwise add it to the dependency list for the buffer holding 4710 * the cylinder group map from which it was allocated. 4711 */ 4712 ACQUIRE_LOCK(&lk); 4713 if ((inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep))) 4714 panic("softdep_setup_inomapdep: dependency %p for new" 4715 "inode already exists", inodedep); 4716 bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum)); 4717 if (jaddref) { 4718 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps); 4719 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4720 if_deps); 4721 } else { 4722 inodedep->id_state |= ONDEPLIST; 4723 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); 4724 } 4725 inodedep->id_bmsafemap = bmsafemap; 4726 inodedep->id_state &= ~DEPCOMPLETE; 4727 FREE_LOCK(&lk); 4728 } 4729 4730 /* 4731 * Called just after updating the cylinder group block to 4732 * allocate block or fragment. 
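 *
 * The dependency created here ties the new block to the bmsafemap for
 * its cylinder group buffer so that, per the ordering rule above, no
 * pointer to the block can reach the disk before the bitmap write that
 * marks it in use; with journaling enabled a jnewblk record is allocated
 * as well. For example, after allocating block B the cg bitmap showing B
 * as busy must be written before any inode or indirect block naming B.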
4733 */ 4734 void 4735 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 4736 struct buf *bp; /* buffer for cylgroup block with block map */ 4737 struct mount *mp; /* filesystem doing allocation */ 4738 ufs2_daddr_t newblkno; /* number of newly allocated block */ 4739 int frags; /* Number of fragments. */ 4740 int oldfrags; /* Previous number of fragments for extend. */ 4741 { 4742 struct newblk *newblk; 4743 struct bmsafemap *bmsafemap; 4744 struct jnewblk *jnewblk; 4745 struct fs *fs; 4746 4747 fs = VFSTOUFS(mp)->um_fs; 4748 jnewblk = NULL; 4749 /* 4750 * Create a dependency for the newly allocated block. 4751 * Add it to the dependency list for the buffer holding 4752 * the cylinder group map from which it was allocated. 4753 */ 4754 if (MOUNTEDSUJ(mp)) { 4755 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); 4756 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); 4757 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); 4758 jnewblk->jn_state = ATTACHED; 4759 jnewblk->jn_blkno = newblkno; 4760 jnewblk->jn_frags = frags; 4761 jnewblk->jn_oldfrags = oldfrags; 4762 #ifdef SUJ_DEBUG 4763 { 4764 struct cg *cgp; 4765 uint8_t *blksfree; 4766 long bno; 4767 int i; 4768 4769 cgp = (struct cg *)bp->b_data; 4770 blksfree = cg_blksfree(cgp); 4771 bno = dtogd(fs, jnewblk->jn_blkno); 4772 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 4773 i++) { 4774 if (isset(blksfree, bno + i)) 4775 panic("softdep_setup_blkmapdep: " 4776 "free fragment %d from %d-%d " 4777 "state 0x%X dep %p", i, 4778 jnewblk->jn_oldfrags, 4779 jnewblk->jn_frags, 4780 jnewblk->jn_state, 4781 jnewblk->jn_dep); 4782 } 4783 } 4784 #endif 4785 } 4786 ACQUIRE_LOCK(&lk); 4787 if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) 4788 panic("softdep_setup_blkmapdep: found block"); 4789 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, 4790 dtog(fs, newblkno)); 4791 if (jnewblk) { 4792 jnewblk->jn_dep = (struct worklist *)newblk; 4793 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); 4794 } else { 4795 newblk->nb_state |= ONDEPLIST; 4796 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 4797 } 4798 newblk->nb_bmsafemap = bmsafemap; 4799 newblk->nb_jnewblk = jnewblk; 4800 FREE_LOCK(&lk); 4801 } 4802 4803 #define BMSAFEMAP_HASH(fs, cg) \ 4804 (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash]) 4805 4806 static int 4807 bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp) 4808 struct bmsafemap_hashhead *bmsafemaphd; 4809 struct mount *mp; 4810 int cg; 4811 struct bmsafemap **bmsafemapp; 4812 { 4813 struct bmsafemap *bmsafemap; 4814 4815 LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) 4816 if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg) 4817 break; 4818 if (bmsafemap) { 4819 *bmsafemapp = bmsafemap; 4820 return (1); 4821 } 4822 *bmsafemapp = NULL; 4823 4824 return (0); 4825 } 4826 4827 /* 4828 * Find the bmsafemap associated with a cylinder group buffer. 4829 * If none exists, create one. The buffer must be locked when 4830 * this routine is called and this routine must be called with 4831 * splbio interrupts blocked. 
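 *
 * ("splbio interrupts blocked" is historical wording; in this
 * implementation it means the soft dependency mutex "lk" is held, which
 * the mtx_assert() below checks. The lock may be dropped and re-taken
 * around the allocation of a new bmsafemap, which is why the routine
 * re-checks for a concurrently created entry before inserting its own.)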
4832 */ 4833 static struct bmsafemap * 4834 bmsafemap_lookup(mp, bp, cg) 4835 struct mount *mp; 4836 struct buf *bp; 4837 int cg; 4838 { 4839 struct bmsafemap_hashhead *bmsafemaphd; 4840 struct bmsafemap *bmsafemap, *collision; 4841 struct worklist *wk; 4842 struct fs *fs; 4843 4844 mtx_assert(&lk, MA_OWNED); 4845 if (bp) 4846 LIST_FOREACH(wk, &bp->b_dep, wk_list) 4847 if (wk->wk_type == D_BMSAFEMAP) 4848 return (WK_BMSAFEMAP(wk)); 4849 fs = VFSTOUFS(mp)->um_fs; 4850 bmsafemaphd = BMSAFEMAP_HASH(fs, cg); 4851 if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) 4852 return (bmsafemap); 4853 FREE_LOCK(&lk); 4854 bmsafemap = malloc(sizeof(struct bmsafemap), 4855 M_BMSAFEMAP, M_SOFTDEP_FLAGS); 4856 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); 4857 bmsafemap->sm_buf = bp; 4858 LIST_INIT(&bmsafemap->sm_inodedephd); 4859 LIST_INIT(&bmsafemap->sm_inodedepwr); 4860 LIST_INIT(&bmsafemap->sm_newblkhd); 4861 LIST_INIT(&bmsafemap->sm_newblkwr); 4862 LIST_INIT(&bmsafemap->sm_jaddrefhd); 4863 LIST_INIT(&bmsafemap->sm_jnewblkhd); 4864 LIST_INIT(&bmsafemap->sm_freehd); 4865 LIST_INIT(&bmsafemap->sm_freewr); 4866 ACQUIRE_LOCK(&lk); 4867 if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) { 4868 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 4869 return (collision); 4870 } 4871 bmsafemap->sm_cg = cg; 4872 LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); 4873 LIST_INSERT_HEAD(&VFSTOUFS(mp)->softdep_dirtycg, bmsafemap, sm_next); 4874 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 4875 return (bmsafemap); 4876 } 4877 4878 /* 4879 * Direct block allocation dependencies. 4880 * 4881 * When a new block is allocated, the corresponding disk locations must be 4882 * initialized (with zeros or new data) before the on-disk inode points to 4883 * them. Also, the freemap from which the block was allocated must be 4884 * updated (on disk) before the inode's pointer. These two dependencies are 4885 * independent of each other and are needed for all file blocks and indirect 4886 * blocks that are pointed to directly by the inode. Just before the 4887 * "in-core" version of the inode is updated with a newly allocated block 4888 * number, a procedure (below) is called to setup allocation dependency 4889 * structures. These structures are removed when the corresponding 4890 * dependencies are satisfied or when the block allocation becomes obsolete 4891 * (i.e., the file is deleted, the block is de-allocated, or the block is a 4892 * fragment that gets upgraded). All of these cases are handled in 4893 * procedures described later. 4894 * 4895 * When a file extension causes a fragment to be upgraded, either to a larger 4896 * fragment or to a full block, the on-disk location may change (if the 4897 * previous fragment could not simply be extended). In this case, the old 4898 * fragment must be de-allocated, but not until after the inode's pointer has 4899 * been updated. In most cases, this is handled by later procedures, which 4900 * will construct a "freefrag" structure to be added to the workitem queue 4901 * when the inode update is complete (or obsolete). The main exception to 4902 * this is when an allocation occurs while a pending allocation dependency 4903 * (for the same block pointer) remains. This case is handled in the main 4904 * allocation dependency setup procedure by immediately freeing the 4905 * unreferenced fragments. 
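 *
 * A compact way to picture the common case for a newly allocated direct
 * block B of inode I, where "X before Y" means X must be on disk before
 * Y may be written:
 *
 *	cg bitmap marking B in use	before	inode block holding I
 *	contents of B (zeros or data)	before	inode block holding I
 *
 * The two left-hand writes are independent and may complete in either
 * order; only the inode (or indirect block) pointer write is held back.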
4906 */ 4907 void 4908 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) 4909 struct inode *ip; /* inode to which block is being added */ 4910 ufs_lbn_t off; /* block pointer within inode */ 4911 ufs2_daddr_t newblkno; /* disk block number being added */ 4912 ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ 4913 long newsize; /* size of new block */ 4914 long oldsize; /* size of new block */ 4915 struct buf *bp; /* bp for allocated block */ 4916 { 4917 struct allocdirect *adp, *oldadp; 4918 struct allocdirectlst *adphead; 4919 struct freefrag *freefrag; 4920 struct inodedep *inodedep; 4921 struct pagedep *pagedep; 4922 struct jnewblk *jnewblk; 4923 struct newblk *newblk; 4924 struct mount *mp; 4925 ufs_lbn_t lbn; 4926 4927 lbn = bp->b_lblkno; 4928 mp = UFSTOVFS(ip->i_ump); 4929 if (oldblkno && oldblkno != newblkno) 4930 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); 4931 else 4932 freefrag = NULL; 4933 4934 ACQUIRE_LOCK(&lk); 4935 if (off >= NDADDR) { 4936 if (lbn > 0) 4937 panic("softdep_setup_allocdirect: bad lbn %jd, off %jd", 4938 lbn, off); 4939 /* allocating an indirect block */ 4940 if (oldblkno != 0) 4941 panic("softdep_setup_allocdirect: non-zero indir"); 4942 } else { 4943 if (off != lbn) 4944 panic("softdep_setup_allocdirect: lbn %jd != off %jd", 4945 lbn, off); 4946 /* 4947 * Allocating a direct block. 4948 * 4949 * If we are allocating a directory block, then we must 4950 * allocate an associated pagedep to track additions and 4951 * deletions. 4952 */ 4953 if ((ip->i_mode & IFMT) == IFDIR) 4954 pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC, 4955 &pagedep); 4956 } 4957 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) 4958 panic("softdep_setup_allocdirect: lost block"); 4959 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 4960 ("softdep_setup_allocdirect: newblk already initialized")); 4961 /* 4962 * Convert the newblk to an allocdirect. 4963 */ 4964 newblk->nb_list.wk_type = D_ALLOCDIRECT; 4965 adp = (struct allocdirect *)newblk; 4966 newblk->nb_freefrag = freefrag; 4967 adp->ad_offset = off; 4968 adp->ad_oldblkno = oldblkno; 4969 adp->ad_newsize = newsize; 4970 adp->ad_oldsize = oldsize; 4971 4972 /* 4973 * Finish initializing the journal. 4974 */ 4975 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 4976 jnewblk->jn_ino = ip->i_number; 4977 jnewblk->jn_lbn = lbn; 4978 add_to_journal(&jnewblk->jn_list); 4979 } 4980 if (freefrag && freefrag->ff_jdep != NULL && 4981 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 4982 add_to_journal(freefrag->ff_jdep); 4983 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); 4984 adp->ad_inodedep = inodedep; 4985 4986 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); 4987 /* 4988 * The list of allocdirects must be kept in sorted and ascending 4989 * order so that the rollback routines can quickly determine the 4990 * first uncommitted block (the size of the file stored on disk 4991 * ends at the end of the lowest committed fragment, or if there 4992 * are no fragments, at the end of the highest committed block). 4993 * Since files generally grow, the typical case is that the new 4994 * block is to be added at the end of the list. We speed this 4995 * special case by checking against the last allocdirect in the 4996 * list before laboriously traversing the list looking for the 4997 * insertion point. 
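	 *
	 * For example, with allocdirects already queued for offsets
	 * 0, 1, 2 and 5, a new entry for offset 6 is appended directly
	 * (the common growth case), one for offset 3 is inserted ahead of
	 * the entry for 5 by the scan below, and one for an offset already
	 * present is folded into the existing entry by allocdirect_merge().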
4998 */ 4999 adphead = &inodedep->id_newinoupdt; 5000 oldadp = TAILQ_LAST(adphead, allocdirectlst); 5001 if (oldadp == NULL || oldadp->ad_offset <= off) { 5002 /* insert at end of list */ 5003 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 5004 if (oldadp != NULL && oldadp->ad_offset == off) 5005 allocdirect_merge(adphead, adp, oldadp); 5006 FREE_LOCK(&lk); 5007 return; 5008 } 5009 TAILQ_FOREACH(oldadp, adphead, ad_next) { 5010 if (oldadp->ad_offset >= off) 5011 break; 5012 } 5013 if (oldadp == NULL) 5014 panic("softdep_setup_allocdirect: lost entry"); 5015 /* insert in middle of list */ 5016 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 5017 if (oldadp->ad_offset == off) 5018 allocdirect_merge(adphead, adp, oldadp); 5019 5020 FREE_LOCK(&lk); 5021 } 5022 5023 /* 5024 * Merge a newer and older journal record to be stored either in a 5025 * newblock or freefrag. This handles aggregating journal records for 5026 * fragment allocation into a second record as well as replacing a 5027 * journal free with an aborted journal allocation. A segment for the 5028 * oldest record will be placed on wkhd if it has been written. If not 5029 * the segment for the newer record will suffice. 5030 */ 5031 static struct worklist * 5032 jnewblk_merge(new, old, wkhd) 5033 struct worklist *new; 5034 struct worklist *old; 5035 struct workhead *wkhd; 5036 { 5037 struct jnewblk *njnewblk; 5038 struct jnewblk *jnewblk; 5039 5040 /* Handle NULLs to simplify callers. */ 5041 if (new == NULL) 5042 return (old); 5043 if (old == NULL) 5044 return (new); 5045 /* Replace a jfreefrag with a jnewblk. */ 5046 if (new->wk_type == D_JFREEFRAG) { 5047 cancel_jfreefrag(WK_JFREEFRAG(new)); 5048 return (old); 5049 } 5050 /* 5051 * Handle merging of two jnewblk records that describe 5052 * different sets of fragments in the same block. 5053 */ 5054 jnewblk = WK_JNEWBLK(old); 5055 njnewblk = WK_JNEWBLK(new); 5056 if (jnewblk->jn_blkno != njnewblk->jn_blkno) 5057 panic("jnewblk_merge: Merging disparate blocks."); 5058 /* 5059 * The record may be rolled back in the cg. 5060 */ 5061 if (jnewblk->jn_state & UNDONE) { 5062 jnewblk->jn_state &= ~UNDONE; 5063 njnewblk->jn_state |= UNDONE; 5064 njnewblk->jn_state &= ~ATTACHED; 5065 } 5066 /* 5067 * We modify the newer addref and free the older so that if neither 5068 * has been written the most up-to-date copy will be on disk. If 5069 * both have been written but rolled back we only temporarily need 5070 * one of them to fix the bits when the cg write completes. 5071 */ 5072 jnewblk->jn_state |= ATTACHED | COMPLETE; 5073 njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; 5074 cancel_jnewblk(jnewblk, wkhd); 5075 WORKLIST_REMOVE(&jnewblk->jn_list); 5076 free_jnewblk(jnewblk); 5077 return (new); 5078 } 5079 5080 /* 5081 * Replace an old allocdirect dependency with a newer one. 5082 * This routine must be called with splbio interrupts blocked. 
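 *
 * (Merges arise when the same logical block is allocated again while the
 * dependency for its previous allocation is still outstanding, e.g. when
 * a fragment is extended or reallocated; the old entry's rollback
 * information, freefrag and journal work are folded into the new entry.)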
5083 */ 5084 static void 5085 allocdirect_merge(adphead, newadp, oldadp) 5086 struct allocdirectlst *adphead; /* head of list holding allocdirects */ 5087 struct allocdirect *newadp; /* allocdirect being added */ 5088 struct allocdirect *oldadp; /* existing allocdirect being checked */ 5089 { 5090 struct worklist *wk; 5091 struct freefrag *freefrag; 5092 5093 freefrag = NULL; 5094 mtx_assert(&lk, MA_OWNED); 5095 if (newadp->ad_oldblkno != oldadp->ad_newblkno || 5096 newadp->ad_oldsize != oldadp->ad_newsize || 5097 newadp->ad_offset >= NDADDR) 5098 panic("%s %jd != new %jd || old size %ld != new %ld", 5099 "allocdirect_merge: old blkno", 5100 (intmax_t)newadp->ad_oldblkno, 5101 (intmax_t)oldadp->ad_newblkno, 5102 newadp->ad_oldsize, oldadp->ad_newsize); 5103 newadp->ad_oldblkno = oldadp->ad_oldblkno; 5104 newadp->ad_oldsize = oldadp->ad_oldsize; 5105 /* 5106 * If the old dependency had a fragment to free or had never 5107 * previously had a block allocated, then the new dependency 5108 * can immediately post its freefrag and adopt the old freefrag. 5109 * This action is done by swapping the freefrag dependencies. 5110 * The new dependency gains the old one's freefrag, and the 5111 * old one gets the new one and then immediately puts it on 5112 * the worklist when it is freed by free_newblk. It is 5113 * not possible to do this swap when the old dependency had a 5114 * non-zero size but no previous fragment to free. This condition 5115 * arises when the new block is an extension of the old block. 5116 * Here, the first part of the fragment allocated to the new 5117 * dependency is part of the block currently claimed on disk by 5118 * the old dependency, so cannot legitimately be freed until the 5119 * conditions for the new dependency are fulfilled. 5120 */ 5121 freefrag = newadp->ad_freefrag; 5122 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { 5123 newadp->ad_freefrag = oldadp->ad_freefrag; 5124 oldadp->ad_freefrag = freefrag; 5125 } 5126 /* 5127 * If we are tracking a new directory-block allocation, 5128 * move it from the old allocdirect to the new allocdirect. 5129 */ 5130 if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { 5131 WORKLIST_REMOVE(wk); 5132 if (!LIST_EMPTY(&oldadp->ad_newdirblk)) 5133 panic("allocdirect_merge: extra newdirblk"); 5134 WORKLIST_INSERT(&newadp->ad_newdirblk, wk); 5135 } 5136 TAILQ_REMOVE(adphead, oldadp, ad_next); 5137 /* 5138 * We need to move any journal dependencies over to the freefrag 5139 * that releases this block if it exists. Otherwise we are 5140 * extending an existing block and we'll wait until that is 5141 * complete to release the journal space and extend the 5142 * new journal to cover this old space as well. 
5143 */ 5144 if (freefrag == NULL) { 5145 if (oldadp->ad_newblkno != newadp->ad_newblkno) 5146 panic("allocdirect_merge: %jd != %jd", 5147 oldadp->ad_newblkno, newadp->ad_newblkno); 5148 newadp->ad_block.nb_jnewblk = (struct jnewblk *) 5149 jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list, 5150 &oldadp->ad_block.nb_jnewblk->jn_list, 5151 &newadp->ad_block.nb_jwork); 5152 oldadp->ad_block.nb_jnewblk = NULL; 5153 cancel_newblk(&oldadp->ad_block, NULL, 5154 &newadp->ad_block.nb_jwork); 5155 } else { 5156 wk = (struct worklist *) cancel_newblk(&oldadp->ad_block, 5157 &freefrag->ff_list, &freefrag->ff_jwork); 5158 freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk, 5159 &freefrag->ff_jwork); 5160 } 5161 free_newblk(&oldadp->ad_block); 5162 } 5163 5164 /* 5165 * Allocate a jfreefrag structure to journal a single block free. 5166 */ 5167 static struct jfreefrag * 5168 newjfreefrag(freefrag, ip, blkno, size, lbn) 5169 struct freefrag *freefrag; 5170 struct inode *ip; 5171 ufs2_daddr_t blkno; 5172 long size; 5173 ufs_lbn_t lbn; 5174 { 5175 struct jfreefrag *jfreefrag; 5176 struct fs *fs; 5177 5178 fs = ip->i_fs; 5179 jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, 5180 M_SOFTDEP_FLAGS); 5181 workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump)); 5182 jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); 5183 jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; 5184 jfreefrag->fr_ino = ip->i_number; 5185 jfreefrag->fr_lbn = lbn; 5186 jfreefrag->fr_blkno = blkno; 5187 jfreefrag->fr_frags = numfrags(fs, size); 5188 jfreefrag->fr_freefrag = freefrag; 5189 5190 return (jfreefrag); 5191 } 5192 5193 /* 5194 * Allocate a new freefrag structure. 5195 */ 5196 static struct freefrag * 5197 newfreefrag(ip, blkno, size, lbn) 5198 struct inode *ip; 5199 ufs2_daddr_t blkno; 5200 long size; 5201 ufs_lbn_t lbn; 5202 { 5203 struct freefrag *freefrag; 5204 struct fs *fs; 5205 5206 fs = ip->i_fs; 5207 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 5208 panic("newfreefrag: frag size"); 5209 freefrag = malloc(sizeof(struct freefrag), 5210 M_FREEFRAG, M_SOFTDEP_FLAGS); 5211 workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump)); 5212 freefrag->ff_state = ATTACHED; 5213 LIST_INIT(&freefrag->ff_jwork); 5214 freefrag->ff_inum = ip->i_number; 5215 freefrag->ff_vtype = ITOV(ip)->v_type; 5216 freefrag->ff_blkno = blkno; 5217 freefrag->ff_fragsize = size; 5218 5219 if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) { 5220 freefrag->ff_jdep = (struct worklist *) 5221 newjfreefrag(freefrag, ip, blkno, size, lbn); 5222 } else { 5223 freefrag->ff_state |= DEPCOMPLETE; 5224 freefrag->ff_jdep = NULL; 5225 } 5226 5227 return (freefrag); 5228 } 5229 5230 /* 5231 * This workitem de-allocates fragments that were replaced during 5232 * file block allocation. 5233 */ 5234 static void 5235 handle_workitem_freefrag(freefrag) 5236 struct freefrag *freefrag; 5237 { 5238 struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); 5239 struct workhead wkhd; 5240 5241 /* 5242 * It would be illegal to add new completion items to the 5243 * freefrag after it was schedule to be done so it must be 5244 * safe to modify the list head here. 5245 */ 5246 LIST_INIT(&wkhd); 5247 ACQUIRE_LOCK(&lk); 5248 LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); 5249 /* 5250 * If the journal has not been written we must cancel it here. 
5251 */ 5252 if (freefrag->ff_jdep) { 5253 if (freefrag->ff_jdep->wk_type != D_JNEWBLK) 5254 panic("handle_workitem_freefrag: Unexpected type %d\n", 5255 freefrag->ff_jdep->wk_type); 5256 cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd); 5257 } 5258 FREE_LOCK(&lk); 5259 ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, 5260 freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd); 5261 ACQUIRE_LOCK(&lk); 5262 WORKITEM_FREE(freefrag, D_FREEFRAG); 5263 FREE_LOCK(&lk); 5264 } 5265 5266 /* 5267 * Set up a dependency structure for an external attributes data block. 5268 * This routine follows much of the structure of softdep_setup_allocdirect. 5269 * See the description of softdep_setup_allocdirect above for details. 5270 */ 5271 void 5272 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) 5273 struct inode *ip; 5274 ufs_lbn_t off; 5275 ufs2_daddr_t newblkno; 5276 ufs2_daddr_t oldblkno; 5277 long newsize; 5278 long oldsize; 5279 struct buf *bp; 5280 { 5281 struct allocdirect *adp, *oldadp; 5282 struct allocdirectlst *adphead; 5283 struct freefrag *freefrag; 5284 struct inodedep *inodedep; 5285 struct jnewblk *jnewblk; 5286 struct newblk *newblk; 5287 struct mount *mp; 5288 ufs_lbn_t lbn; 5289 5290 if (off >= NXADDR) 5291 panic("softdep_setup_allocext: lbn %lld > NXADDR", 5292 (long long)off); 5293 5294 lbn = bp->b_lblkno; 5295 mp = UFSTOVFS(ip->i_ump); 5296 if (oldblkno && oldblkno != newblkno) 5297 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); 5298 else 5299 freefrag = NULL; 5300 5301 ACQUIRE_LOCK(&lk); 5302 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) 5303 panic("softdep_setup_allocext: lost block"); 5304 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 5305 ("softdep_setup_allocext: newblk already initialized")); 5306 /* 5307 * Convert the newblk to an allocdirect. 5308 */ 5309 newblk->nb_list.wk_type = D_ALLOCDIRECT; 5310 adp = (struct allocdirect *)newblk; 5311 newblk->nb_freefrag = freefrag; 5312 adp->ad_offset = off; 5313 adp->ad_oldblkno = oldblkno; 5314 adp->ad_newsize = newsize; 5315 adp->ad_oldsize = oldsize; 5316 adp->ad_state |= EXTDATA; 5317 5318 /* 5319 * Finish initializing the journal. 5320 */ 5321 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 5322 jnewblk->jn_ino = ip->i_number; 5323 jnewblk->jn_lbn = lbn; 5324 add_to_journal(&jnewblk->jn_list); 5325 } 5326 if (freefrag && freefrag->ff_jdep != NULL && 5327 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 5328 add_to_journal(freefrag->ff_jdep); 5329 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); 5330 adp->ad_inodedep = inodedep; 5331 5332 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); 5333 /* 5334 * The list of allocdirects must be kept in sorted and ascending 5335 * order so that the rollback routines can quickly determine the 5336 * first uncommitted block (the size of the file stored on disk 5337 * ends at the end of the lowest committed fragment, or if there 5338 * are no fragments, at the end of the highest committed block). 5339 * Since files generally grow, the typical case is that the new 5340 * block is to be added at the end of the list. We speed this 5341 * special case by checking against the last allocdirect in the 5342 * list before laboriously traversing the list looking for the 5343 * insertion point. 
5344 */ 5345 adphead = &inodedep->id_newextupdt; 5346 oldadp = TAILQ_LAST(adphead, allocdirectlst); 5347 if (oldadp == NULL || oldadp->ad_offset <= off) { 5348 /* insert at end of list */ 5349 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 5350 if (oldadp != NULL && oldadp->ad_offset == off) 5351 allocdirect_merge(adphead, adp, oldadp); 5352 FREE_LOCK(&lk); 5353 return; 5354 } 5355 TAILQ_FOREACH(oldadp, adphead, ad_next) { 5356 if (oldadp->ad_offset >= off) 5357 break; 5358 } 5359 if (oldadp == NULL) 5360 panic("softdep_setup_allocext: lost entry"); 5361 /* insert in middle of list */ 5362 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 5363 if (oldadp->ad_offset == off) 5364 allocdirect_merge(adphead, adp, oldadp); 5365 FREE_LOCK(&lk); 5366 } 5367 5368 /* 5369 * Indirect block allocation dependencies. 5370 * 5371 * The same dependencies that exist for a direct block also exist when 5372 * a new block is allocated and pointed to by an entry in a block of 5373 * indirect pointers. The undo/redo states described above are also 5374 * used here. Because an indirect block contains many pointers that 5375 * may have dependencies, a second copy of the entire in-memory indirect 5376 * block is kept. The buffer cache copy is always completely up-to-date. 5377 * The second copy, which is used only as a source for disk writes, 5378 * contains only the safe pointers (i.e., those that have no remaining 5379 * update dependencies). The second copy is freed when all pointers 5380 * are safe. The cache is not allowed to replace indirect blocks with 5381 * pending update dependencies. If a buffer containing an indirect 5382 * block with dependencies is written, these routines will mark it 5383 * dirty again. It can only be successfully written once all the 5384 * dependencies are removed. The ffs_fsync routine in conjunction with 5385 * softdep_sync_metadata work together to get all the dependencies 5386 * removed so that a file can be successfully written to disk. Three 5387 * procedures are used when setting up indirect block pointer 5388 * dependencies. The division is necessary because of the organization 5389 * of the "balloc" routine and because of the distinction between file 5390 * pages and file metadata blocks. 5391 */ 5392 5393 /* 5394 * Allocate a new allocindir structure. 
5395 */ 5396 static struct allocindir * 5397 newallocindir(ip, ptrno, newblkno, oldblkno, lbn) 5398 struct inode *ip; /* inode for file being extended */ 5399 int ptrno; /* offset of pointer in indirect block */ 5400 ufs2_daddr_t newblkno; /* disk block number being added */ 5401 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 5402 ufs_lbn_t lbn; 5403 { 5404 struct newblk *newblk; 5405 struct allocindir *aip; 5406 struct freefrag *freefrag; 5407 struct jnewblk *jnewblk; 5408 5409 if (oldblkno) 5410 freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn); 5411 else 5412 freefrag = NULL; 5413 ACQUIRE_LOCK(&lk); 5414 if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0) 5415 panic("new_allocindir: lost block"); 5416 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 5417 ("newallocindir: newblk already initialized")); 5418 newblk->nb_list.wk_type = D_ALLOCINDIR; 5419 newblk->nb_freefrag = freefrag; 5420 aip = (struct allocindir *)newblk; 5421 aip->ai_offset = ptrno; 5422 aip->ai_oldblkno = oldblkno; 5423 aip->ai_lbn = lbn; 5424 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 5425 jnewblk->jn_ino = ip->i_number; 5426 jnewblk->jn_lbn = lbn; 5427 add_to_journal(&jnewblk->jn_list); 5428 } 5429 if (freefrag && freefrag->ff_jdep != NULL && 5430 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 5431 add_to_journal(freefrag->ff_jdep); 5432 return (aip); 5433 } 5434 5435 /* 5436 * Called just before setting an indirect block pointer 5437 * to a newly allocated file page. 5438 */ 5439 void 5440 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 5441 struct inode *ip; /* inode for file being extended */ 5442 ufs_lbn_t lbn; /* allocated block number within file */ 5443 struct buf *bp; /* buffer with indirect blk referencing page */ 5444 int ptrno; /* offset of pointer in indirect block */ 5445 ufs2_daddr_t newblkno; /* disk block number being added */ 5446 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 5447 struct buf *nbp; /* buffer holding allocated page */ 5448 { 5449 struct inodedep *inodedep; 5450 struct freefrag *freefrag; 5451 struct allocindir *aip; 5452 struct pagedep *pagedep; 5453 struct mount *mp; 5454 int dflags; 5455 5456 if (lbn != nbp->b_lblkno) 5457 panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", 5458 lbn, bp->b_lblkno); 5459 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); 5460 mp = UFSTOVFS(ip->i_ump); 5461 aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); 5462 dflags = DEPALLOC; 5463 if (IS_SNAPSHOT(ip)) 5464 dflags |= NODELAY; 5465 (void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep); 5466 /* 5467 * If we are allocating a directory page, then we must 5468 * allocate an associated pagedep to track additions and 5469 * deletions. 5470 */ 5471 if ((ip->i_mode & IFMT) == IFDIR) 5472 pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep); 5473 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 5474 freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); 5475 FREE_LOCK(&lk); 5476 if (freefrag) 5477 handle_workitem_freefrag(freefrag); 5478 } 5479 5480 /* 5481 * Called just before setting an indirect block pointer to a 5482 * newly allocated indirect block. 
5483 */ 5484 void 5485 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 5486 struct buf *nbp; /* newly allocated indirect block */ 5487 struct inode *ip; /* inode for file being extended */ 5488 struct buf *bp; /* indirect block referencing allocated block */ 5489 int ptrno; /* offset of pointer in indirect block */ 5490 ufs2_daddr_t newblkno; /* disk block number being added */ 5491 { 5492 struct inodedep *inodedep; 5493 struct allocindir *aip; 5494 ufs_lbn_t lbn; 5495 int dflags; 5496 5497 lbn = nbp->b_lblkno; 5498 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); 5499 aip = newallocindir(ip, ptrno, newblkno, 0, lbn); 5500 dflags = DEPALLOC; 5501 if (IS_SNAPSHOT(ip)) 5502 dflags |= NODELAY; 5503 inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep); 5504 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 5505 if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)) 5506 panic("softdep_setup_allocindir_meta: Block already existed"); 5507 FREE_LOCK(&lk); 5508 } 5509 5510 static void 5511 indirdep_complete(indirdep) 5512 struct indirdep *indirdep; 5513 { 5514 struct allocindir *aip; 5515 5516 LIST_REMOVE(indirdep, ir_next); 5517 indirdep->ir_state |= DEPCOMPLETE; 5518 5519 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { 5520 LIST_REMOVE(aip, ai_next); 5521 free_newblk(&aip->ai_block); 5522 } 5523 /* 5524 * If this indirdep is not attached to a buf it was simply waiting 5525 * on completion to clear completehd. free_indirdep() asserts 5526 * that nothing is dangling. 5527 */ 5528 if ((indirdep->ir_state & ONWORKLIST) == 0) 5529 free_indirdep(indirdep); 5530 } 5531 5532 static struct indirdep * 5533 indirdep_lookup(mp, ip, bp) 5534 struct mount *mp; 5535 struct inode *ip; 5536 struct buf *bp; 5537 { 5538 struct indirdep *indirdep, *newindirdep; 5539 struct newblk *newblk; 5540 struct worklist *wk; 5541 struct fs *fs; 5542 ufs2_daddr_t blkno; 5543 5544 mtx_assert(&lk, MA_OWNED); 5545 indirdep = NULL; 5546 newindirdep = NULL; 5547 fs = ip->i_fs; 5548 for (;;) { 5549 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 5550 if (wk->wk_type != D_INDIRDEP) 5551 continue; 5552 indirdep = WK_INDIRDEP(wk); 5553 break; 5554 } 5555 /* Found on the buffer worklist, no new structure to free. */ 5556 if (indirdep != NULL && newindirdep == NULL) 5557 return (indirdep); 5558 if (indirdep != NULL && newindirdep != NULL) 5559 panic("indirdep_lookup: simultaneous create"); 5560 /* None found on the buffer and a new structure is ready. */ 5561 if (indirdep == NULL && newindirdep != NULL) 5562 break; 5563 /* None found and no new structure available. 
*/ 5564 FREE_LOCK(&lk); 5565 newindirdep = malloc(sizeof(struct indirdep), 5566 M_INDIRDEP, M_SOFTDEP_FLAGS); 5567 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); 5568 newindirdep->ir_state = ATTACHED; 5569 if (ip->i_ump->um_fstype == UFS1) 5570 newindirdep->ir_state |= UFS1FMT; 5571 TAILQ_INIT(&newindirdep->ir_trunc); 5572 newindirdep->ir_saveddata = NULL; 5573 LIST_INIT(&newindirdep->ir_deplisthd); 5574 LIST_INIT(&newindirdep->ir_donehd); 5575 LIST_INIT(&newindirdep->ir_writehd); 5576 LIST_INIT(&newindirdep->ir_completehd); 5577 if (bp->b_blkno == bp->b_lblkno) { 5578 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, 5579 NULL, NULL); 5580 bp->b_blkno = blkno; 5581 } 5582 newindirdep->ir_freeblks = NULL; 5583 newindirdep->ir_savebp = 5584 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); 5585 newindirdep->ir_bp = bp; 5586 BUF_KERNPROC(newindirdep->ir_savebp); 5587 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); 5588 ACQUIRE_LOCK(&lk); 5589 } 5590 indirdep = newindirdep; 5591 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); 5592 /* 5593 * If the block is not yet allocated we don't set DEPCOMPLETE so 5594 * that we don't free dependencies until the pointers are valid. 5595 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather 5596 * than using the hash. 5597 */ 5598 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)) 5599 LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next); 5600 else 5601 indirdep->ir_state |= DEPCOMPLETE; 5602 return (indirdep); 5603 } 5604 5605 /* 5606 * Called to finish the allocation of the "aip" allocated 5607 * by one of the two routines above. 5608 */ 5609 static struct freefrag * 5610 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) 5611 struct buf *bp; /* in-memory copy of the indirect block */ 5612 struct inode *ip; /* inode for file being extended */ 5613 struct inodedep *inodedep; /* Inodedep for ip */ 5614 struct allocindir *aip; /* allocindir allocated by the above routines */ 5615 ufs_lbn_t lbn; /* Logical block number for this block. */ 5616 { 5617 struct fs *fs; 5618 struct indirdep *indirdep; 5619 struct allocindir *oldaip; 5620 struct freefrag *freefrag; 5621 struct mount *mp; 5622 5623 mtx_assert(&lk, MA_OWNED); 5624 mp = UFSTOVFS(ip->i_ump); 5625 fs = ip->i_fs; 5626 if (bp->b_lblkno >= 0) 5627 panic("setup_allocindir_phase2: not indir blk"); 5628 KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs), 5629 ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset)); 5630 indirdep = indirdep_lookup(mp, ip, bp); 5631 KASSERT(indirdep->ir_savebp != NULL, 5632 ("setup_allocindir_phase2 NULL ir_savebp")); 5633 aip->ai_indirdep = indirdep; 5634 /* 5635 * Check for an unwritten dependency for this indirect offset. If 5636 * there is, merge the old dependency into the new one. This happens 5637 * as a result of reallocblk only. 5638 */ 5639 freefrag = NULL; 5640 if (aip->ai_oldblkno != 0) { 5641 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) { 5642 if (oldaip->ai_offset == aip->ai_offset) { 5643 freefrag = allocindir_merge(aip, oldaip); 5644 goto done; 5645 } 5646 } 5647 LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) { 5648 if (oldaip->ai_offset == aip->ai_offset) { 5649 freefrag = allocindir_merge(aip, oldaip); 5650 goto done; 5651 } 5652 } 5653 } 5654 done: 5655 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); 5656 return (freefrag); 5657 } 5658 5659 /* 5660 * Merge two allocindirs which refer to the same block. 
Move newblock 5661 * dependencies and setup the freefrags appropriately. 5662 */ 5663 static struct freefrag * 5664 allocindir_merge(aip, oldaip) 5665 struct allocindir *aip; 5666 struct allocindir *oldaip; 5667 { 5668 struct freefrag *freefrag; 5669 struct worklist *wk; 5670 5671 if (oldaip->ai_newblkno != aip->ai_oldblkno) 5672 panic("allocindir_merge: blkno"); 5673 aip->ai_oldblkno = oldaip->ai_oldblkno; 5674 freefrag = aip->ai_freefrag; 5675 aip->ai_freefrag = oldaip->ai_freefrag; 5676 oldaip->ai_freefrag = NULL; 5677 KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); 5678 /* 5679 * If we are tracking a new directory-block allocation, 5680 * move it from the old allocindir to the new allocindir. 5681 */ 5682 if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { 5683 WORKLIST_REMOVE(wk); 5684 if (!LIST_EMPTY(&oldaip->ai_newdirblk)) 5685 panic("allocindir_merge: extra newdirblk"); 5686 WORKLIST_INSERT(&aip->ai_newdirblk, wk); 5687 } 5688 /* 5689 * We can skip journaling for this freefrag and just complete 5690 * any pending journal work for the allocindir that is being 5691 * removed after the freefrag completes. 5692 */ 5693 if (freefrag->ff_jdep) 5694 cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep)); 5695 LIST_REMOVE(oldaip, ai_next); 5696 freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block, 5697 &freefrag->ff_list, &freefrag->ff_jwork); 5698 free_newblk(&oldaip->ai_block); 5699 5700 return (freefrag); 5701 } 5702 5703 static inline void 5704 setup_freedirect(freeblks, ip, i, needj) 5705 struct freeblks *freeblks; 5706 struct inode *ip; 5707 int i; 5708 int needj; 5709 { 5710 ufs2_daddr_t blkno; 5711 int frags; 5712 5713 blkno = DIP(ip, i_db[i]); 5714 if (blkno == 0) 5715 return; 5716 DIP_SET(ip, i_db[i], 0); 5717 frags = sblksize(ip->i_fs, ip->i_size, i); 5718 frags = numfrags(ip->i_fs, frags); 5719 newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj); 5720 } 5721 5722 static inline void 5723 setup_freeext(freeblks, ip, i, needj) 5724 struct freeblks *freeblks; 5725 struct inode *ip; 5726 int i; 5727 int needj; 5728 { 5729 ufs2_daddr_t blkno; 5730 int frags; 5731 5732 blkno = ip->i_din2->di_extb[i]; 5733 if (blkno == 0) 5734 return; 5735 ip->i_din2->di_extb[i] = 0; 5736 frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i); 5737 frags = numfrags(ip->i_fs, frags); 5738 newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj); 5739 } 5740 5741 static inline void 5742 setup_freeindir(freeblks, ip, i, lbn, needj) 5743 struct freeblks *freeblks; 5744 struct inode *ip; 5745 int i; 5746 ufs_lbn_t lbn; 5747 int needj; 5748 { 5749 ufs2_daddr_t blkno; 5750 5751 blkno = DIP(ip, i_ib[i]); 5752 if (blkno == 0) 5753 return; 5754 DIP_SET(ip, i_ib[i], 0); 5755 newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag, 5756 0, needj); 5757 } 5758 5759 static inline struct freeblks * 5760 newfreeblks(mp, ip) 5761 struct mount *mp; 5762 struct inode *ip; 5763 { 5764 struct freeblks *freeblks; 5765 5766 freeblks = malloc(sizeof(struct freeblks), 5767 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); 5768 workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); 5769 LIST_INIT(&freeblks->fb_jblkdephd); 5770 LIST_INIT(&freeblks->fb_jwork); 5771 freeblks->fb_ref = 0; 5772 freeblks->fb_cgwait = 0; 5773 freeblks->fb_state = ATTACHED; 5774 freeblks->fb_uid = ip->i_uid; 5775 freeblks->fb_inum = ip->i_number; 5776 freeblks->fb_vtype = ITOV(ip)->v_type; 5777 freeblks->fb_modrev = DIP(ip, i_modrev); 5778 freeblks->fb_devvp = ip->i_devvp; 5779 
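	/* fb_chkcnt and fb_len are left zero here; the truncation routines below fill them in once the block counts and new length are known. */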
freeblks->fb_chkcnt = 0; 5780 freeblks->fb_len = 0; 5781 5782 return (freeblks); 5783 } 5784 5785 static void 5786 trunc_indirdep(indirdep, freeblks, bp, off) 5787 struct indirdep *indirdep; 5788 struct freeblks *freeblks; 5789 struct buf *bp; 5790 int off; 5791 { 5792 struct allocindir *aip, *aipn; 5793 5794 /* 5795 * The first set of allocindirs won't be in savedbp. 5796 */ 5797 LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn) 5798 if (aip->ai_offset > off) 5799 cancel_allocindir(aip, bp, freeblks, 1); 5800 LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn) 5801 if (aip->ai_offset > off) 5802 cancel_allocindir(aip, bp, freeblks, 1); 5803 /* 5804 * These will exist in savedbp. 5805 */ 5806 LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn) 5807 if (aip->ai_offset > off) 5808 cancel_allocindir(aip, NULL, freeblks, 0); 5809 LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn) 5810 if (aip->ai_offset > off) 5811 cancel_allocindir(aip, NULL, freeblks, 0); 5812 } 5813 5814 /* 5815 * Follow the chain of indirects down to lastlbn creating a freework 5816 * structure for each. This will be used to start indir_trunc() at 5817 * the right offset and create the journal records for the parrtial 5818 * truncation. A second step will handle the truncated dependencies. 5819 */ 5820 static int 5821 setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno) 5822 struct freeblks *freeblks; 5823 struct inode *ip; 5824 ufs_lbn_t lbn; 5825 ufs_lbn_t lastlbn; 5826 ufs2_daddr_t blkno; 5827 { 5828 struct indirdep *indirdep; 5829 struct indirdep *indirn; 5830 struct freework *freework; 5831 struct newblk *newblk; 5832 struct mount *mp; 5833 struct buf *bp; 5834 uint8_t *start; 5835 uint8_t *end; 5836 ufs_lbn_t lbnadd; 5837 int level; 5838 int error; 5839 int off; 5840 5841 5842 freework = NULL; 5843 if (blkno == 0) 5844 return (0); 5845 mp = freeblks->fb_list.wk_mp; 5846 bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0); 5847 if ((bp->b_flags & B_CACHE) == 0) { 5848 bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno); 5849 bp->b_iocmd = BIO_READ; 5850 bp->b_flags &= ~B_INVAL; 5851 bp->b_ioflags &= ~BIO_ERROR; 5852 vfs_busy_pages(bp, 0); 5853 bp->b_iooffset = dbtob(bp->b_blkno); 5854 bstrategy(bp); 5855 curthread->td_ru.ru_inblock++; 5856 error = bufwait(bp); 5857 if (error) { 5858 brelse(bp); 5859 return (error); 5860 } 5861 } 5862 level = lbn_level(lbn); 5863 lbnadd = lbn_offset(ip->i_fs, level); 5864 /* 5865 * Compute the offset of the last block we want to keep. Store 5866 * in the freework the first block we want to completely free. 5867 */ 5868 off = (lastlbn - -(lbn + level)) / lbnadd; 5869 if (off + 1 == NINDIR(ip->i_fs)) 5870 goto nowork; 5871 freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1, 5872 0); 5873 /* 5874 * Link the freework into the indirdep. This will prevent any new 5875 * allocations from proceeding until we are finished with the 5876 * truncate and the block is written. 5877 */ 5878 ACQUIRE_LOCK(&lk); 5879 indirdep = indirdep_lookup(mp, ip, bp); 5880 if (indirdep->ir_freeblks) 5881 panic("setup_trunc_indir: indirdep already truncated."); 5882 TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next); 5883 freework->fw_indir = indirdep; 5884 /* 5885 * Cancel any allocindirs that will not make it to disk. 5886 * We have to do this for all copies of the indirdep that 5887 * live on this newblk. 
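 * If the indirect block has not yet made it to disk (DEPCOMPLETE clear),
 * several indirdeps may hang off the same newblk and each of them is
 * truncated; otherwise only this indirdep needs the treatment.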
5888 */ 5889 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 5890 newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk); 5891 LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next) 5892 trunc_indirdep(indirn, freeblks, bp, off); 5893 } else 5894 trunc_indirdep(indirdep, freeblks, bp, off); 5895 FREE_LOCK(&lk); 5896 /* 5897 * Creation is protected by the buf lock. The saveddata is only 5898 * needed if a full truncation follows a partial truncation but it 5899 * is difficult to allocate in that case so we fetch it anyway. 5900 */ 5901 if (indirdep->ir_saveddata == NULL) 5902 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, 5903 M_SOFTDEP_FLAGS); 5904 nowork: 5905 /* Fetch the blkno of the child and the zero start offset. */ 5906 if (ip->i_ump->um_fstype == UFS1) { 5907 blkno = ((ufs1_daddr_t *)bp->b_data)[off]; 5908 start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1]; 5909 } else { 5910 blkno = ((ufs2_daddr_t *)bp->b_data)[off]; 5911 start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1]; 5912 } 5913 if (freework) { 5914 /* Zero the truncated pointers. */ 5915 end = bp->b_data + bp->b_bcount; 5916 bzero(start, end - start); 5917 bdwrite(bp); 5918 } else 5919 bqrelse(bp); 5920 if (level == 0) 5921 return (0); 5922 lbn++; /* adjust level */ 5923 lbn -= (off * lbnadd); 5924 return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno); 5925 } 5926 5927 /* 5928 * Complete the partial truncation of an indirect block setup by 5929 * setup_trunc_indir(). This zeros the truncated pointers in the saved 5930 * copy and writes them to disk before the freeblks is allowed to complete. 5931 */ 5932 static void 5933 complete_trunc_indir(freework) 5934 struct freework *freework; 5935 { 5936 struct freework *fwn; 5937 struct indirdep *indirdep; 5938 struct buf *bp; 5939 uintptr_t start; 5940 int count; 5941 5942 indirdep = freework->fw_indir; 5943 for (;;) { 5944 bp = indirdep->ir_bp; 5945 /* See if the block was discarded. */ 5946 if (bp == NULL) 5947 break; 5948 /* Inline part of getdirtybuf(). We dont want bremfree. */ 5949 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) 5950 break; 5951 if (BUF_LOCK(bp, 5952 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, &lk) == 0) 5953 BUF_UNLOCK(bp); 5954 ACQUIRE_LOCK(&lk); 5955 } 5956 mtx_assert(&lk, MA_OWNED); 5957 freework->fw_state |= DEPCOMPLETE; 5958 TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next); 5959 /* 5960 * Zero the pointers in the saved copy. 5961 */ 5962 if (indirdep->ir_state & UFS1FMT) 5963 start = sizeof(ufs1_daddr_t); 5964 else 5965 start = sizeof(ufs2_daddr_t); 5966 start *= freework->fw_start; 5967 count = indirdep->ir_savebp->b_bcount - start; 5968 start += (uintptr_t)indirdep->ir_savebp->b_data; 5969 bzero((char *)start, count); 5970 /* 5971 * We need to start the next truncation in the list if it has not 5972 * been started yet. 5973 */ 5974 fwn = TAILQ_FIRST(&indirdep->ir_trunc); 5975 if (fwn != NULL) { 5976 if (fwn->fw_freeblks == indirdep->ir_freeblks) 5977 TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next); 5978 if ((fwn->fw_state & ONWORKLIST) == 0) 5979 freework_enqueue(fwn); 5980 } 5981 /* 5982 * If bp is NULL the block was fully truncated, restore 5983 * the saved block list otherwise free it if it is no 5984 * longer needed. 
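 * In either case ir_saveddata is released once no partial truncations
 * remain queued on ir_trunc.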
5985 */ 5986 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 5987 if (bp == NULL) 5988 bcopy(indirdep->ir_saveddata, 5989 indirdep->ir_savebp->b_data, 5990 indirdep->ir_savebp->b_bcount); 5991 free(indirdep->ir_saveddata, M_INDIRDEP); 5992 indirdep->ir_saveddata = NULL; 5993 } 5994 /* 5995 * When bp is NULL there is a full truncation pending. We 5996 * must wait for this full truncation to be journaled before 5997 * we can release this freework because the disk pointers will 5998 * never be written as zero. 5999 */ 6000 if (bp == NULL) { 6001 if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd)) 6002 handle_written_freework(freework); 6003 else 6004 WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd, 6005 &freework->fw_list); 6006 } else { 6007 /* Complete when the real copy is written. */ 6008 WORKLIST_INSERT(&bp->b_dep, &freework->fw_list); 6009 BUF_UNLOCK(bp); 6010 } 6011 } 6012 6013 /* 6014 * Calculate the number of blocks we are going to release where datablocks 6015 * is the current total and length is the new file size. 6016 */ 6017 ufs2_daddr_t 6018 blkcount(fs, datablocks, length) 6019 struct fs *fs; 6020 ufs2_daddr_t datablocks; 6021 off_t length; 6022 { 6023 off_t totblks, numblks; 6024 6025 totblks = 0; 6026 numblks = howmany(length, fs->fs_bsize); 6027 if (numblks <= NDADDR) { 6028 totblks = howmany(length, fs->fs_fsize); 6029 goto out; 6030 } 6031 totblks = blkstofrags(fs, numblks); 6032 numblks -= NDADDR; 6033 /* 6034 * Count all single, then double, then triple indirects required. 6035 * Subtracting one indirects worth of blocks for each pass 6036 * acknowledges one of each pointed to by the inode. 6037 */ 6038 for (;;) { 6039 totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs))); 6040 numblks -= NINDIR(fs); 6041 if (numblks <= 0) 6042 break; 6043 numblks = howmany(numblks, NINDIR(fs)); 6044 } 6045 out: 6046 totblks = fsbtodb(fs, totblks); 6047 /* 6048 * Handle sparse files. We can't reclaim more blocks than the inode 6049 * references. We will correct it later in handle_complete_freeblks() 6050 * when we know the real count. 6051 */ 6052 if (totblks > datablocks) 6053 return (0); 6054 return (datablocks - totblks); 6055 } 6056 6057 /* 6058 * Handle freeblocks for journaled softupdate filesystems. 6059 * 6060 * Contrary to normal softupdates, we must preserve the block pointers in 6061 * indirects until their subordinates are free. This is to avoid journaling 6062 * every block that is freed which may consume more space than the journal 6063 * itself. The recovery program will see the free block journals at the 6064 * base of the truncated area and traverse them to reclaim space. The 6065 * pointers in the inode may be cleared immediately after the journal 6066 * records are written because each direct and indirect pointer in the 6067 * inode is recorded in a journal. This permits full truncation to proceed 6068 * asynchronously. The write order is journal -> inode -> cgs -> indirects. 6069 * 6070 * The algorithm is as follows: 6071 * 1) Traverse the in-memory state and create journal entries to release 6072 * the relevant blocks and full indirect trees. 6073 * 2) Traverse the indirect block chain adding partial truncation freework 6074 * records to indirects in the path to lastlbn. The freework will 6075 * prevent new allocation dependencies from being satisfied in this 6076 * indirect until the truncation completes. 6077 * 3) Read and lock the inode block, performing an update with the new size 6078 * and pointers. 
This prevents truncated data from becoming valid on 6079 * disk through step 4. 6080 * 4) Reap unsatisfied dependencies that are beyond the truncated area, 6081 * eliminate journal work for those records that do not require it. 6082 * 5) Schedule the journal records to be written followed by the inode block. 6083 * 6) Allocate any necessary frags for the end of file. 6084 * 7) Zero any partially truncated blocks. 6085 * 6086 * From this truncation proceeds asynchronously using the freework and 6087 * indir_trunc machinery. The file will not be extended again into a 6088 * partially truncated indirect block until all work is completed but 6089 * the normal dependency mechanism ensures that it is rolled back/forward 6090 * as appropriate. Further truncation may occur without delay and is 6091 * serialized in indir_trunc(). 6092 */ 6093 void 6094 softdep_journal_freeblocks(ip, cred, length, flags) 6095 struct inode *ip; /* The inode whose length is to be reduced */ 6096 struct ucred *cred; 6097 off_t length; /* The new length for the file */ 6098 int flags; /* IO_EXT and/or IO_NORMAL */ 6099 { 6100 struct freeblks *freeblks, *fbn; 6101 struct inodedep *inodedep; 6102 struct jblkdep *jblkdep; 6103 struct allocdirect *adp, *adpn; 6104 struct fs *fs; 6105 struct buf *bp; 6106 struct vnode *vp; 6107 struct mount *mp; 6108 ufs2_daddr_t extblocks, datablocks; 6109 ufs_lbn_t tmpval, lbn, lastlbn; 6110 int frags, lastoff, iboff, allocblock, needj, dflags, error, i; 6111 6112 fs = ip->i_fs; 6113 mp = UFSTOVFS(ip->i_ump); 6114 vp = ITOV(ip); 6115 needj = 1; 6116 iboff = -1; 6117 allocblock = 0; 6118 extblocks = 0; 6119 datablocks = 0; 6120 frags = 0; 6121 freeblks = newfreeblks(mp, ip); 6122 ACQUIRE_LOCK(&lk); 6123 /* 6124 * If we're truncating a removed file that will never be written 6125 * we don't need to journal the block frees. The canceled journals 6126 * for the allocations will suffice. 6127 */ 6128 dflags = DEPALLOC; 6129 if (IS_SNAPSHOT(ip)) 6130 dflags |= NODELAY; 6131 inodedep_lookup(mp, ip->i_number, dflags, &inodedep); 6132 if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED && 6133 length == 0) 6134 needj = 0; 6135 FREE_LOCK(&lk); 6136 /* 6137 * Calculate the lbn that we are truncating to. This results in -1 6138 * if we're truncating the 0 bytes. So it is the last lbn we want 6139 * to keep, not the first lbn we want to truncate. 6140 */ 6141 lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1; 6142 lastoff = blkoff(fs, length); 6143 /* 6144 * Compute frags we are keeping in lastlbn. 0 means all. 6145 */ 6146 if (lastlbn >= 0 && lastlbn < NDADDR) { 6147 frags = fragroundup(fs, lastoff); 6148 /* adp offset of last valid allocdirect. */ 6149 iboff = lastlbn; 6150 } else if (lastlbn > 0) 6151 iboff = NDADDR; 6152 if (fs->fs_magic == FS_UFS2_MAGIC) 6153 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); 6154 /* 6155 * Handle normal data blocks and indirects. This section saves 6156 * values used after the inode update to complete frag and indirect 6157 * truncation. 6158 */ 6159 if ((flags & IO_NORMAL) != 0) { 6160 /* 6161 * Handle truncation of whole direct and indirect blocks. 6162 */ 6163 for (i = iboff + 1; i < NDADDR; i++) 6164 setup_freedirect(freeblks, ip, i, needj); 6165 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; 6166 i++, lbn += tmpval, tmpval *= NINDIR(fs)) { 6167 /* Release a whole indirect tree. 
*/ 6168 if (lbn > lastlbn) { 6169 setup_freeindir(freeblks, ip, i, -lbn -i, 6170 needj); 6171 continue; 6172 } 6173 iboff = i + NDADDR; 6174 /* 6175 * Traverse partially truncated indirect tree. 6176 */ 6177 if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn) 6178 setup_trunc_indir(freeblks, ip, -lbn - i, 6179 lastlbn, DIP(ip, i_ib[i])); 6180 } 6181 /* 6182 * Handle partial truncation to a frag boundary. 6183 */ 6184 if (frags) { 6185 ufs2_daddr_t blkno; 6186 long oldfrags; 6187 6188 oldfrags = blksize(fs, ip, lastlbn); 6189 blkno = DIP(ip, i_db[lastlbn]); 6190 if (blkno && oldfrags != frags) { 6191 oldfrags -= frags; 6192 oldfrags = numfrags(ip->i_fs, oldfrags); 6193 blkno += numfrags(ip->i_fs, frags); 6194 newfreework(ip->i_ump, freeblks, NULL, lastlbn, 6195 blkno, oldfrags, 0, needj); 6196 } else if (blkno == 0) 6197 allocblock = 1; 6198 } 6199 /* 6200 * Add a journal record for partial truncate if we are 6201 * handling indirect blocks. Non-indirects need no extra 6202 * journaling. 6203 */ 6204 if (length != 0 && lastlbn >= NDADDR) { 6205 ip->i_flag |= IN_TRUNCATED; 6206 newjtrunc(freeblks, length, 0); 6207 } 6208 ip->i_size = length; 6209 DIP_SET(ip, i_size, ip->i_size); 6210 datablocks = DIP(ip, i_blocks) - extblocks; 6211 if (length != 0) 6212 datablocks = blkcount(ip->i_fs, datablocks, length); 6213 freeblks->fb_len = length; 6214 } 6215 if ((flags & IO_EXT) != 0) { 6216 for (i = 0; i < NXADDR; i++) 6217 setup_freeext(freeblks, ip, i, needj); 6218 ip->i_din2->di_extsize = 0; 6219 datablocks += extblocks; 6220 } 6221 #ifdef QUOTA 6222 /* Reference the quotas in case the block count is wrong in the end. */ 6223 quotaref(vp, freeblks->fb_quota); 6224 (void) chkdq(ip, -datablocks, NOCRED, 0); 6225 #endif 6226 freeblks->fb_chkcnt = -datablocks; 6227 UFS_LOCK(ip->i_ump); 6228 fs->fs_pendingblocks += datablocks; 6229 UFS_UNLOCK(ip->i_ump); 6230 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); 6231 /* 6232 * Handle truncation of incomplete alloc direct dependencies. We 6233 * hold the inode block locked to prevent incomplete dependencies 6234 * from reaching the disk while we are eliminating those that 6235 * have been truncated. This is a partially inlined ffs_update(). 6236 */ 6237 ufs_itimes(vp); 6238 ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); 6239 error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 6240 (int)fs->fs_bsize, cred, &bp); 6241 if (error) { 6242 brelse(bp); 6243 softdep_error("softdep_journal_freeblocks", error); 6244 return; 6245 } 6246 if (bp->b_bufsize == fs->fs_bsize) 6247 bp->b_flags |= B_CLUSTEROK; 6248 softdep_update_inodeblock(ip, bp, 0); 6249 if (ip->i_ump->um_fstype == UFS1) 6250 *((struct ufs1_dinode *)bp->b_data + 6251 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; 6252 else 6253 *((struct ufs2_dinode *)bp->b_data + 6254 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; 6255 ACQUIRE_LOCK(&lk); 6256 (void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep); 6257 if ((inodedep->id_state & IOSTARTED) != 0) 6258 panic("softdep_setup_freeblocks: inode busy"); 6259 /* 6260 * Add the freeblks structure to the list of operations that 6261 * must await the zero'ed inode being written to disk. If we 6262 * still have a bitmap dependency (needj), then the inode 6263 * has never been written to disk, so we can process the 6264 * freeblks below once we have deleted the dependencies. 
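 * When needj is clear (a removed file being truncated to zero that will
 * never be written), no block-free journaling is required and the freeblks
 * is marked COMPLETE right away.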
6265 */ 6266 if (needj) 6267 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); 6268 else 6269 freeblks->fb_state |= COMPLETE; 6270 if ((flags & IO_NORMAL) != 0) { 6271 TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) { 6272 if (adp->ad_offset > iboff) 6273 cancel_allocdirect(&inodedep->id_inoupdt, adp, 6274 freeblks); 6275 /* 6276 * Truncate the allocdirect. We could eliminate 6277 * or modify journal records as well. 6278 */ 6279 else if (adp->ad_offset == iboff && frags) 6280 adp->ad_newsize = frags; 6281 } 6282 } 6283 if ((flags & IO_EXT) != 0) 6284 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) 6285 cancel_allocdirect(&inodedep->id_extupdt, adp, 6286 freeblks); 6287 /* 6288 * Add journal work. 6289 */ 6290 LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) 6291 add_to_journal(&jblkdep->jb_list); 6292 FREE_LOCK(&lk); 6293 bdwrite(bp); 6294 /* 6295 * Truncate dependency structures beyond length. 6296 */ 6297 trunc_dependencies(ip, freeblks, lastlbn, frags, flags); 6298 /* 6299 * This is only set when we need to allocate a fragment because 6300 * none existed at the end of a frag-sized file. It handles only 6301 * allocating a new, zero filled block. 6302 */ 6303 if (allocblock) { 6304 ip->i_size = length - lastoff; 6305 DIP_SET(ip, i_size, ip->i_size); 6306 error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp); 6307 if (error != 0) { 6308 softdep_error("softdep_journal_freeblks", error); 6309 return; 6310 } 6311 ip->i_size = length; 6312 DIP_SET(ip, i_size, length); 6313 ip->i_flag |= IN_CHANGE | IN_UPDATE; 6314 allocbuf(bp, frags); 6315 ffs_update(vp, 0); 6316 bawrite(bp); 6317 } else if (lastoff != 0 && vp->v_type != VDIR) { 6318 int size; 6319 6320 /* 6321 * Zero the end of a truncated frag or block. 6322 */ 6323 size = sblksize(fs, length, lastlbn); 6324 error = bread(vp, lastlbn, size, cred, &bp); 6325 if (error) { 6326 softdep_error("softdep_journal_freeblks", error); 6327 return; 6328 } 6329 bzero((char *)bp->b_data + lastoff, size - lastoff); 6330 bawrite(bp); 6331 6332 } 6333 ACQUIRE_LOCK(&lk); 6334 inodedep_lookup(mp, ip->i_number, dflags, &inodedep); 6335 TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next); 6336 freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST; 6337 /* 6338 * We zero earlier truncations so they don't erroneously 6339 * update i_blocks. 6340 */ 6341 if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0) 6342 TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next) 6343 fbn->fb_len = 0; 6344 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE && 6345 LIST_EMPTY(&freeblks->fb_jblkdephd)) 6346 freeblks->fb_state |= INPROGRESS; 6347 else 6348 freeblks = NULL; 6349 FREE_LOCK(&lk); 6350 if (freeblks) 6351 handle_workitem_freeblocks(freeblks, 0); 6352 trunc_pages(ip, length, extblocks, flags); 6353 6354 } 6355 6356 /* 6357 * Flush a JOP_SYNC to the journal. 6358 */ 6359 void 6360 softdep_journal_fsync(ip) 6361 struct inode *ip; 6362 { 6363 struct jfsync *jfsync; 6364 6365 if ((ip->i_flag & IN_TRUNCATED) == 0) 6366 return; 6367 ip->i_flag &= ~IN_TRUNCATED; 6368 jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO); 6369 workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump)); 6370 jfsync->jfs_size = ip->i_size; 6371 jfsync->jfs_ino = ip->i_number; 6372 ACQUIRE_LOCK(&lk); 6373 add_to_journal(&jfsync->jfs_list); 6374 jwait(&jfsync->jfs_list, MNT_WAIT); 6375 FREE_LOCK(&lk); 6376 } 6377 6378 /* 6379 * Block de-allocation dependencies. 
6380 * 6381 * When blocks are de-allocated, the on-disk pointers must be nullified before 6382 * the blocks are made available for use by other files. (The true 6383 * requirement is that old pointers must be nullified before new on-disk 6384 * pointers are set. We chose this slightly more stringent requirement to 6385 * reduce complexity.) Our implementation handles this dependency by updating 6386 * the inode (or indirect block) appropriately but delaying the actual block 6387 * de-allocation (i.e., freemap and free space count manipulation) until 6388 * after the updated versions reach stable storage. After the disk is 6389 * updated, the blocks can be safely de-allocated whenever it is convenient. 6390 * This implementation handles only the common case of reducing a file's 6391 * length to zero. Other cases are handled by the conventional synchronous 6392 * write approach. 6393 * 6394 * The ffs implementation with which we worked double-checks 6395 * the state of the block pointers and file size as it reduces 6396 * a file's length. Some of this code is replicated here in our 6397 * soft updates implementation. The freeblks->fb_chkcnt field is 6398 * used to transfer a part of this information to the procedure 6399 * that eventually de-allocates the blocks. 6400 * 6401 * This routine should be called from the routine that shortens 6402 * a file's length, before the inode's size or block pointers 6403 * are modified. It will save the block pointer information for 6404 * later release and zero the inode so that the calling routine 6405 * can release it. 6406 */ 6407 void 6408 softdep_setup_freeblocks(ip, length, flags) 6409 struct inode *ip; /* The inode whose length is to be reduced */ 6410 off_t length; /* The new length for the file */ 6411 int flags; /* IO_EXT and/or IO_NORMAL */ 6412 { 6413 struct ufs1_dinode *dp1; 6414 struct ufs2_dinode *dp2; 6415 struct freeblks *freeblks; 6416 struct inodedep *inodedep; 6417 struct allocdirect *adp; 6418 struct buf *bp; 6419 struct fs *fs; 6420 ufs2_daddr_t extblocks, datablocks; 6421 struct mount *mp; 6422 int i, delay, error, dflags; 6423 ufs_lbn_t tmpval; 6424 ufs_lbn_t lbn; 6425 6426 fs = ip->i_fs; 6427 mp = UFSTOVFS(ip->i_ump); 6428 if (length != 0) 6429 panic("softdep_setup_freeblocks: non-zero length"); 6430 freeblks = newfreeblks(mp, ip); 6431 extblocks = 0; 6432 datablocks = 0; 6433 if (fs->fs_magic == FS_UFS2_MAGIC) 6434 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); 6435 if ((flags & IO_NORMAL) != 0) { 6436 for (i = 0; i < NDADDR; i++) 6437 setup_freedirect(freeblks, ip, i, 0); 6438 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; 6439 i++, lbn += tmpval, tmpval *= NINDIR(fs)) 6440 setup_freeindir(freeblks, ip, i, -lbn -i, 0); 6441 ip->i_size = 0; 6442 DIP_SET(ip, i_size, 0); 6443 datablocks = DIP(ip, i_blocks) - extblocks; 6444 } 6445 if ((flags & IO_EXT) != 0) { 6446 for (i = 0; i < NXADDR; i++) 6447 setup_freeext(freeblks, ip, i, 0); 6448 ip->i_din2->di_extsize = 0; 6449 datablocks += extblocks; 6450 } 6451 #ifdef QUOTA 6452 /* Reference the quotas in case the block count is wrong in the end. */ 6453 quotaref(ITOV(ip), freeblks->fb_quota); 6454 (void) chkdq(ip, -datablocks, NOCRED, 0); 6455 #endif 6456 freeblks->fb_chkcnt = -datablocks; 6457 UFS_LOCK(ip->i_ump); 6458 fs->fs_pendingblocks += datablocks; 6459 UFS_UNLOCK(ip->i_ump); 6460 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); 6461 /* 6462 * Push the zero'ed inode to to its disk buffer so that we are free 6463 * to delete its dependencies below. 
Once the dependencies are gone 6464 * the buffer can be safely released. 6465 */ 6466 if ((error = bread(ip->i_devvp, 6467 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 6468 (int)fs->fs_bsize, NOCRED, &bp)) != 0) { 6469 brelse(bp); 6470 softdep_error("softdep_setup_freeblocks", error); 6471 } 6472 if (ip->i_ump->um_fstype == UFS1) { 6473 dp1 = ((struct ufs1_dinode *)bp->b_data + 6474 ino_to_fsbo(fs, ip->i_number)); 6475 ip->i_din1->di_freelink = dp1->di_freelink; 6476 *dp1 = *ip->i_din1; 6477 } else { 6478 dp2 = ((struct ufs2_dinode *)bp->b_data + 6479 ino_to_fsbo(fs, ip->i_number)); 6480 ip->i_din2->di_freelink = dp2->di_freelink; 6481 *dp2 = *ip->i_din2; 6482 } 6483 /* 6484 * Find and eliminate any inode dependencies. 6485 */ 6486 ACQUIRE_LOCK(&lk); 6487 dflags = DEPALLOC; 6488 if (IS_SNAPSHOT(ip)) 6489 dflags |= NODELAY; 6490 (void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep); 6491 if ((inodedep->id_state & IOSTARTED) != 0) 6492 panic("softdep_setup_freeblocks: inode busy"); 6493 /* 6494 * Add the freeblks structure to the list of operations that 6495 * must await the zero'ed inode being written to disk. If we 6496 * still have a bitmap dependency (delay == 0), then the inode 6497 * has never been written to disk, so we can process the 6498 * freeblks below once we have deleted the dependencies. 6499 */ 6500 delay = (inodedep->id_state & DEPCOMPLETE); 6501 if (delay) 6502 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); 6503 else 6504 freeblks->fb_state |= COMPLETE; 6505 /* 6506 * Because the file length has been truncated to zero, any 6507 * pending block allocation dependency structures associated 6508 * with this inode are obsolete and can simply be de-allocated. 6509 * We must first merge the two dependency lists to get rid of 6510 * any duplicate freefrag structures, then purge the merged list. 6511 * If we still have a bitmap dependency, then the inode has never 6512 * been written to disk, so we can free any fragments without delay. 6513 */ 6514 if (flags & IO_NORMAL) { 6515 merge_inode_lists(&inodedep->id_newinoupdt, 6516 &inodedep->id_inoupdt); 6517 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) 6518 cancel_allocdirect(&inodedep->id_inoupdt, adp, 6519 freeblks); 6520 } 6521 if (flags & IO_EXT) { 6522 merge_inode_lists(&inodedep->id_newextupdt, 6523 &inodedep->id_extupdt); 6524 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) 6525 cancel_allocdirect(&inodedep->id_extupdt, adp, 6526 freeblks); 6527 } 6528 FREE_LOCK(&lk); 6529 bdwrite(bp); 6530 trunc_dependencies(ip, freeblks, -1, 0, flags); 6531 ACQUIRE_LOCK(&lk); 6532 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 6533 (void) free_inodedep(inodedep); 6534 freeblks->fb_state |= DEPCOMPLETE; 6535 /* 6536 * If the inode with zeroed block pointers is now on disk 6537 * we can start freeing blocks. 6538 */ 6539 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 6540 freeblks->fb_state |= INPROGRESS; 6541 else 6542 freeblks = NULL; 6543 FREE_LOCK(&lk); 6544 if (freeblks) 6545 handle_workitem_freeblocks(freeblks, 0); 6546 trunc_pages(ip, length, extblocks, flags); 6547 } 6548 6549 /* 6550 * Eliminate pages from the page cache that back parts of this inode and 6551 * adjust the vnode pager's idea of our size. This prevents stale data 6552 * from hanging around in the page cache. 
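 * Extended attribute pages (IO_EXT) and regular data pages (IO_NORMAL)
 * are removed according to the flags argument.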
6553 */ 6554 static void 6555 trunc_pages(ip, length, extblocks, flags) 6556 struct inode *ip; 6557 off_t length; 6558 ufs2_daddr_t extblocks; 6559 int flags; 6560 { 6561 struct vnode *vp; 6562 struct fs *fs; 6563 ufs_lbn_t lbn; 6564 off_t end, extend; 6565 6566 vp = ITOV(ip); 6567 fs = ip->i_fs; 6568 extend = OFF_TO_IDX(lblktosize(fs, -extblocks)); 6569 if ((flags & IO_EXT) != 0) 6570 vn_pages_remove(vp, extend, 0); 6571 if ((flags & IO_NORMAL) == 0) 6572 return; 6573 BO_LOCK(&vp->v_bufobj); 6574 drain_output(vp); 6575 BO_UNLOCK(&vp->v_bufobj); 6576 /* 6577 * The vnode pager eliminates file pages we eliminate indirects 6578 * below. 6579 */ 6580 vnode_pager_setsize(vp, length); 6581 /* 6582 * Calculate the end based on the last indirect we want to keep. If 6583 * the block extends into indirects we can just use the negative of 6584 * its lbn. Doubles and triples exist at lower numbers so we must 6585 * be careful not to remove those, if they exist. double and triple 6586 * indirect lbns do not overlap with others so it is not important 6587 * to verify how many levels are required. 6588 */ 6589 lbn = lblkno(fs, length); 6590 if (lbn >= NDADDR) { 6591 /* Calculate the virtual lbn of the triple indirect. */ 6592 lbn = -lbn - (NIADDR - 1); 6593 end = OFF_TO_IDX(lblktosize(fs, lbn)); 6594 } else 6595 end = extend; 6596 vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end); 6597 } 6598 6599 /* 6600 * See if the buf bp is in the range eliminated by truncation. 6601 */ 6602 static int 6603 trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags) 6604 struct buf *bp; 6605 int *blkoffp; 6606 ufs_lbn_t lastlbn; 6607 int lastoff; 6608 int flags; 6609 { 6610 ufs_lbn_t lbn; 6611 6612 *blkoffp = 0; 6613 /* Only match ext/normal blocks as appropriate. */ 6614 if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || 6615 ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0)) 6616 return (0); 6617 /* ALTDATA is always a full truncation. */ 6618 if ((bp->b_xflags & BX_ALTDATA) != 0) 6619 return (1); 6620 /* -1 is full truncation. */ 6621 if (lastlbn == -1) 6622 return (1); 6623 /* 6624 * If this is a partial truncate we only want those 6625 * blocks and indirect blocks that cover the range 6626 * we're after. 6627 */ 6628 lbn = bp->b_lblkno; 6629 if (lbn < 0) 6630 lbn = -(lbn + lbn_level(lbn)); 6631 if (lbn < lastlbn) 6632 return (0); 6633 /* Here we only truncate lblkno if it's partial. */ 6634 if (lbn == lastlbn) { 6635 if (lastoff == 0) 6636 return (0); 6637 *blkoffp = lastoff; 6638 } 6639 return (1); 6640 } 6641 6642 /* 6643 * Eliminate any dependencies that exist in memory beyond lblkno:off 6644 */ 6645 static void 6646 trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags) 6647 struct inode *ip; 6648 struct freeblks *freeblks; 6649 ufs_lbn_t lastlbn; 6650 int lastoff; 6651 int flags; 6652 { 6653 struct bufobj *bo; 6654 struct vnode *vp; 6655 struct buf *bp; 6656 struct fs *fs; 6657 int blkoff; 6658 6659 /* 6660 * We must wait for any I/O in progress to finish so that 6661 * all potential buffers on the dirty list will be visible. 6662 * Once they are all there, walk the list and get rid of 6663 * any dependencies. 
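 * The scan restarts from the top of the list whenever the buf-object lock
 * is dropped, with BV_SCANNED marking buffers already examined.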
6664 */ 6665 fs = ip->i_fs; 6666 vp = ITOV(ip); 6667 bo = &vp->v_bufobj; 6668 BO_LOCK(bo); 6669 drain_output(vp); 6670 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) 6671 bp->b_vflags &= ~BV_SCANNED; 6672 restart: 6673 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { 6674 if (bp->b_vflags & BV_SCANNED) 6675 continue; 6676 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { 6677 bp->b_vflags |= BV_SCANNED; 6678 continue; 6679 } 6680 if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL) 6681 goto restart; 6682 BO_UNLOCK(bo); 6683 if (deallocate_dependencies(bp, freeblks, blkoff)) 6684 bqrelse(bp); 6685 else 6686 brelse(bp); 6687 BO_LOCK(bo); 6688 goto restart; 6689 } 6690 /* 6691 * Now do the work of vtruncbuf while also matching indirect blocks. 6692 */ 6693 TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) 6694 bp->b_vflags &= ~BV_SCANNED; 6695 cleanrestart: 6696 TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) { 6697 if (bp->b_vflags & BV_SCANNED) 6698 continue; 6699 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { 6700 bp->b_vflags |= BV_SCANNED; 6701 continue; 6702 } 6703 if (BUF_LOCK(bp, 6704 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 6705 BO_MTX(bo)) == ENOLCK) { 6706 BO_LOCK(bo); 6707 goto cleanrestart; 6708 } 6709 bp->b_vflags |= BV_SCANNED; 6710 BO_LOCK(bo); 6711 bremfree(bp); 6712 BO_UNLOCK(bo); 6713 if (blkoff != 0) { 6714 allocbuf(bp, blkoff); 6715 bqrelse(bp); 6716 } else { 6717 bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF; 6718 brelse(bp); 6719 } 6720 BO_LOCK(bo); 6721 goto cleanrestart; 6722 } 6723 drain_output(vp); 6724 BO_UNLOCK(bo); 6725 } 6726 6727 static int 6728 cancel_pagedep(pagedep, freeblks, blkoff) 6729 struct pagedep *pagedep; 6730 struct freeblks *freeblks; 6731 int blkoff; 6732 { 6733 struct jremref *jremref; 6734 struct jmvref *jmvref; 6735 struct dirrem *dirrem, *tmp; 6736 int i; 6737 6738 /* 6739 * Copy any directory remove dependencies to the list 6740 * to be processed after the freeblks proceeds. If 6741 * directory entry never made it to disk they 6742 * can be dumped directly onto the work list. 6743 */ 6744 LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) { 6745 /* Skip this directory removal if it is intended to remain. */ 6746 if (dirrem->dm_offset < blkoff) 6747 continue; 6748 /* 6749 * If there are any dirrems we wait for the journal write 6750 * to complete and then restart the buf scan as the lock 6751 * has been dropped. 6752 */ 6753 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) { 6754 jwait(&jremref->jr_list, MNT_WAIT); 6755 return (ERESTART); 6756 } 6757 LIST_REMOVE(dirrem, dm_next); 6758 dirrem->dm_dirinum = pagedep->pd_ino; 6759 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list); 6760 } 6761 while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) { 6762 jwait(&jmvref->jm_list, MNT_WAIT); 6763 return (ERESTART); 6764 } 6765 /* 6766 * When we're partially truncating a pagedep we just want to flush 6767 * journal entries and return. There can not be any adds in the 6768 * truncated portion of the directory and newblk must remain if 6769 * part of the block remains. 
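 * The loops below panic if any diradd lies beyond the retained offset,
 * since that would indicate an add within the truncated range.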
6770 */ 6771 if (blkoff != 0) { 6772 struct diradd *dap; 6773 6774 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 6775 if (dap->da_offset > blkoff) 6776 panic("cancel_pagedep: diradd %p off %d > %d", 6777 dap, dap->da_offset, blkoff); 6778 for (i = 0; i < DAHASHSZ; i++) 6779 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) 6780 if (dap->da_offset > blkoff) 6781 panic("cancel_pagedep: diradd %p off %d > %d", 6782 dap, dap->da_offset, blkoff); 6783 return (0); 6784 } 6785 /* 6786 * There should be no directory add dependencies present 6787 * as the directory could not be truncated until all 6788 * children were removed. 6789 */ 6790 KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, 6791 ("deallocate_dependencies: pendinghd != NULL")); 6792 for (i = 0; i < DAHASHSZ; i++) 6793 KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, 6794 ("deallocate_dependencies: diraddhd != NULL")); 6795 if ((pagedep->pd_state & NEWBLOCK) != 0) 6796 free_newdirblk(pagedep->pd_newdirblk); 6797 if (free_pagedep(pagedep) == 0) 6798 panic("Failed to free pagedep %p", pagedep); 6799 return (0); 6800 } 6801 6802 /* 6803 * Reclaim any dependency structures from a buffer that is about to 6804 * be reallocated to a new vnode. The buffer must be locked, thus, 6805 * no I/O completion operations can occur while we are manipulating 6806 * its associated dependencies. The mutex is held so that other I/O's 6807 * associated with related dependencies do not occur. 6808 */ 6809 static int 6810 deallocate_dependencies(bp, freeblks, off) 6811 struct buf *bp; 6812 struct freeblks *freeblks; 6813 int off; 6814 { 6815 struct indirdep *indirdep; 6816 struct pagedep *pagedep; 6817 struct allocdirect *adp; 6818 struct worklist *wk, *wkn; 6819 6820 ACQUIRE_LOCK(&lk); 6821 LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) { 6822 switch (wk->wk_type) { 6823 case D_INDIRDEP: 6824 indirdep = WK_INDIRDEP(wk); 6825 if (bp->b_lblkno >= 0 || 6826 bp->b_blkno != indirdep->ir_savebp->b_lblkno) 6827 panic("deallocate_dependencies: not indir"); 6828 cancel_indirdep(indirdep, bp, freeblks); 6829 continue; 6830 6831 case D_PAGEDEP: 6832 pagedep = WK_PAGEDEP(wk); 6833 if (cancel_pagedep(pagedep, freeblks, off)) { 6834 FREE_LOCK(&lk); 6835 return (ERESTART); 6836 } 6837 continue; 6838 6839 case D_ALLOCINDIR: 6840 /* 6841 * Simply remove the allocindir, we'll find it via 6842 * the indirdep where we can clear pointers if 6843 * needed. 6844 */ 6845 WORKLIST_REMOVE(wk); 6846 continue; 6847 6848 case D_FREEWORK: 6849 /* 6850 * A truncation is waiting for the zero'd pointers 6851 * to be written. It can be freed when the freeblks 6852 * is journaled. 6853 */ 6854 WORKLIST_REMOVE(wk); 6855 wk->wk_state |= ONDEPLIST; 6856 WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); 6857 break; 6858 6859 case D_ALLOCDIRECT: 6860 adp = WK_ALLOCDIRECT(wk); 6861 if (off != 0) 6862 continue; 6863 /* FALLTHROUGH */ 6864 default: 6865 panic("deallocate_dependencies: Unexpected type %s", 6866 TYPENAME(wk->wk_type)); 6867 /* NOTREACHED */ 6868 } 6869 } 6870 FREE_LOCK(&lk); 6871 /* 6872 * Don't throw away this buf, we were partially truncating and 6873 * some deps may always remain. 6874 */ 6875 if (off) { 6876 allocbuf(bp, off); 6877 bp->b_vflags |= BV_SCANNED; 6878 return (EBUSY); 6879 } 6880 bp->b_flags |= B_INVAL | B_NOCACHE; 6881 6882 return (0); 6883 } 6884 6885 /* 6886 * An allocdirect is being canceled due to a truncate. We must make sure 6887 * the journal entry is released in concert with the blkfree that releases 6888 * the storage. 
Completed journal entries must not be released until the 6889 * space is no longer pointed to by the inode or in the bitmap. 6890 */ 6891 static void 6892 cancel_allocdirect(adphead, adp, freeblks) 6893 struct allocdirectlst *adphead; 6894 struct allocdirect *adp; 6895 struct freeblks *freeblks; 6896 { 6897 struct freework *freework; 6898 struct newblk *newblk; 6899 struct worklist *wk; 6900 6901 TAILQ_REMOVE(adphead, adp, ad_next); 6902 newblk = (struct newblk *)adp; 6903 freework = NULL; 6904 /* 6905 * Find the correct freework structure. 6906 */ 6907 LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { 6908 if (wk->wk_type != D_FREEWORK) 6909 continue; 6910 freework = WK_FREEWORK(wk); 6911 if (freework->fw_blkno == newblk->nb_newblkno) 6912 break; 6913 } 6914 if (freework == NULL) 6915 panic("cancel_allocdirect: Freework not found"); 6916 /* 6917 * If a newblk exists at all we still have the journal entry that 6918 * initiated the allocation so we do not need to journal the free. 6919 */ 6920 cancel_jfreeblk(freeblks, freework->fw_blkno); 6921 /* 6922 * If the journal hasn't been written the jnewblk must be passed 6923 * to the call to ffs_blkfree that reclaims the space. We accomplish 6924 * this by linking the journal dependency into the freework to be 6925 * freed when freework_freeblock() is called. If the journal has 6926 * been written we can simply reclaim the journal space when the 6927 * freeblks work is complete. 6928 */ 6929 freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list, 6930 &freeblks->fb_jwork); 6931 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); 6932 } 6933 6934 6935 /* 6936 * Cancel a new block allocation. May be an indirect or direct block. We 6937 * remove it from various lists and return any journal record that needs to 6938 * be resolved by the caller. 6939 * 6940 * A special consideration is made for indirects which were never pointed 6941 * at on disk and will never be found once this block is released. 6942 */ 6943 static struct jnewblk * 6944 cancel_newblk(newblk, wk, wkhd) 6945 struct newblk *newblk; 6946 struct worklist *wk; 6947 struct workhead *wkhd; 6948 { 6949 struct jnewblk *jnewblk; 6950 6951 newblk->nb_state |= GOINGAWAY; 6952 /* 6953 * Previously we traversed the completedhd on each indirdep 6954 * attached to this newblk to cancel them and gather journal 6955 * work. Since we need only the oldest journal segment and 6956 * the lowest point on the tree will always have the oldest 6957 * journal segment we are free to release the segments 6958 * of any subordinates and may leave the indirdep list to 6959 * indirdep_complete() when this newblk is freed. 6960 */ 6961 if (newblk->nb_state & ONDEPLIST) { 6962 newblk->nb_state &= ~ONDEPLIST; 6963 LIST_REMOVE(newblk, nb_deps); 6964 } 6965 if (newblk->nb_state & ONWORKLIST) 6966 WORKLIST_REMOVE(&newblk->nb_list); 6967 /* 6968 * If the journal entry hasn't been written we save a pointer to 6969 * the dependency that frees it until it is written or the 6970 * superseding operation completes. 6971 */ 6972 jnewblk = newblk->nb_jnewblk; 6973 if (jnewblk != NULL && wk != NULL) { 6974 newblk->nb_jnewblk = NULL; 6975 jnewblk->jn_dep = wk; 6976 } 6977 if (!LIST_EMPTY(&newblk->nb_jwork)) 6978 jwork_move(wkhd, &newblk->nb_jwork); 6979 /* 6980 * When truncating we must free the newdirblk early to remove 6981 * the pagedep from the hash before returning. 
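 * free_newdirblk() clears the NEWBLOCK state on the associated pagedep and
 * frees the pagedep when no other dependencies remain.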
6982 */ 6983 if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) 6984 free_newdirblk(WK_NEWDIRBLK(wk)); 6985 if (!LIST_EMPTY(&newblk->nb_newdirblk)) 6986 panic("cancel_newblk: extra newdirblk"); 6987 6988 return (jnewblk); 6989 } 6990 6991 /* 6992 * Schedule the freefrag associated with a newblk to be released once 6993 * the pointers are written and the previous block is no longer needed. 6994 */ 6995 static void 6996 newblk_freefrag(newblk) 6997 struct newblk *newblk; 6998 { 6999 struct freefrag *freefrag; 7000 7001 if (newblk->nb_freefrag == NULL) 7002 return; 7003 freefrag = newblk->nb_freefrag; 7004 newblk->nb_freefrag = NULL; 7005 freefrag->ff_state |= COMPLETE; 7006 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 7007 add_to_worklist(&freefrag->ff_list, 0); 7008 } 7009 7010 /* 7011 * Free a newblk. Generate a new freefrag work request if appropriate. 7012 * This must be called after the inode pointer and any direct block pointers 7013 * are valid or fully removed via truncate or frag extension. 7014 */ 7015 static void 7016 free_newblk(newblk) 7017 struct newblk *newblk; 7018 { 7019 struct indirdep *indirdep; 7020 struct worklist *wk; 7021 7022 KASSERT(newblk->nb_jnewblk == NULL, 7023 ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk)); 7024 mtx_assert(&lk, MA_OWNED); 7025 newblk_freefrag(newblk); 7026 if (newblk->nb_state & ONDEPLIST) 7027 LIST_REMOVE(newblk, nb_deps); 7028 if (newblk->nb_state & ONWORKLIST) 7029 WORKLIST_REMOVE(&newblk->nb_list); 7030 LIST_REMOVE(newblk, nb_hash); 7031 if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) 7032 free_newdirblk(WK_NEWDIRBLK(wk)); 7033 if (!LIST_EMPTY(&newblk->nb_newdirblk)) 7034 panic("free_newblk: extra newdirblk"); 7035 while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) 7036 indirdep_complete(indirdep); 7037 handle_jwork(&newblk->nb_jwork); 7038 newblk->nb_list.wk_type = D_NEWBLK; 7039 WORKITEM_FREE(newblk, D_NEWBLK); 7040 } 7041 7042 /* 7043 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. 7044 * This routine must be called with splbio interrupts blocked. 7045 */ 7046 static void 7047 free_newdirblk(newdirblk) 7048 struct newdirblk *newdirblk; 7049 { 7050 struct pagedep *pagedep; 7051 struct diradd *dap; 7052 struct worklist *wk; 7053 7054 mtx_assert(&lk, MA_OWNED); 7055 WORKLIST_REMOVE(&newdirblk->db_list); 7056 /* 7057 * If the pagedep is still linked onto the directory buffer 7058 * dependency chain, then some of the entries on the 7059 * pd_pendinghd list may not be committed to disk yet. In 7060 * this case, we will simply clear the NEWBLOCK flag and 7061 * let the pd_pendinghd list be processed when the pagedep 7062 * is next written. If the pagedep is no longer on the buffer 7063 * dependency chain, then all the entries on the pd_pending 7064 * list are committed to disk and we can free them here. 7065 */ 7066 pagedep = newdirblk->db_pagedep; 7067 pagedep->pd_state &= ~NEWBLOCK; 7068 if ((pagedep->pd_state & ONWORKLIST) == 0) { 7069 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 7070 free_diradd(dap, NULL); 7071 /* 7072 * If no dependencies remain, the pagedep will be freed. 7073 */ 7074 free_pagedep(pagedep); 7075 } 7076 /* Should only ever be one item in the list. */ 7077 while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { 7078 WORKLIST_REMOVE(wk); 7079 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 7080 } 7081 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 7082 } 7083 7084 /* 7085 * Prepare an inode to be freed. 
The actual free operation is not 7086 * done until the zero'ed inode has been written to disk. 7087 */ 7088 void 7089 softdep_freefile(pvp, ino, mode) 7090 struct vnode *pvp; 7091 ino_t ino; 7092 int mode; 7093 { 7094 struct inode *ip = VTOI(pvp); 7095 struct inodedep *inodedep; 7096 struct freefile *freefile; 7097 struct freeblks *freeblks; 7098 7099 /* 7100 * This sets up the inode de-allocation dependency. 7101 */ 7102 freefile = malloc(sizeof(struct freefile), 7103 M_FREEFILE, M_SOFTDEP_FLAGS); 7104 workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount); 7105 freefile->fx_mode = mode; 7106 freefile->fx_oldinum = ino; 7107 freefile->fx_devvp = ip->i_devvp; 7108 LIST_INIT(&freefile->fx_jwork); 7109 UFS_LOCK(ip->i_ump); 7110 ip->i_fs->fs_pendinginodes += 1; 7111 UFS_UNLOCK(ip->i_ump); 7112 7113 /* 7114 * If the inodedep does not exist, then the zero'ed inode has 7115 * been written to disk. If the allocated inode has never been 7116 * written to disk, then the on-disk inode is zero'ed. In either 7117 * case we can free the file immediately. If the journal was 7118 * canceled before being written the inode will never make it to 7119 * disk and we must send the canceled journal entrys to 7120 * ffs_freefile() to be cleared in conjunction with the bitmap. 7121 * Any blocks waiting on the inode to write can be safely freed 7122 * here as it will never been written. 7123 */ 7124 ACQUIRE_LOCK(&lk); 7125 inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); 7126 if (inodedep) { 7127 /* 7128 * Clear out freeblks that no longer need to reference 7129 * this inode. 7130 */ 7131 while ((freeblks = 7132 TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) { 7133 TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, 7134 fb_next); 7135 freeblks->fb_state &= ~ONDEPLIST; 7136 } 7137 /* 7138 * Remove this inode from the unlinked list. 7139 */ 7140 if (inodedep->id_state & UNLINKED) { 7141 /* 7142 * Save the journal work to be freed with the bitmap 7143 * before we clear UNLINKED. Otherwise it can be lost 7144 * if the inode block is written. 7145 */ 7146 handle_bufwait(inodedep, &freefile->fx_jwork); 7147 clear_unlinked_inodedep(inodedep); 7148 /* Re-acquire inodedep as we've dropped lk. */ 7149 inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); 7150 } 7151 } 7152 if (inodedep == NULL || check_inode_unwritten(inodedep)) { 7153 FREE_LOCK(&lk); 7154 handle_workitem_freefile(freefile); 7155 return; 7156 } 7157 if ((inodedep->id_state & DEPCOMPLETE) == 0) 7158 inodedep->id_state |= GOINGAWAY; 7159 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); 7160 FREE_LOCK(&lk); 7161 if (ip->i_number == ino) 7162 ip->i_flag |= IN_MODIFIED; 7163 } 7164 7165 /* 7166 * Check to see if an inode has never been written to disk. If 7167 * so free the inodedep and return success, otherwise return failure. 7168 * This routine must be called with splbio interrupts blocked. 7169 * 7170 * If we still have a bitmap dependency, then the inode has never 7171 * been written to disk. Drop the dependency as it is no longer 7172 * necessary since the inode is being deallocated. We set the 7173 * ALLCOMPLETE flags since the bitmap now properly shows that the 7174 * inode is not allocated. Even if the inode is actively being 7175 * written, it has been rolled back to its zero'ed state, so we 7176 * are ensured that a zero inode is what is on the disk. For short 7177 * lived files, this change will usually result in removing all the 7178 * dependencies from the inode so that it can be freed immediately. 
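 * softdep_freefile() uses the result to decide whether the freefile work
 * item can be handled immediately.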
7179 */ 7180 static int 7181 check_inode_unwritten(inodedep) 7182 struct inodedep *inodedep; 7183 { 7184 7185 mtx_assert(&lk, MA_OWNED); 7186 7187 if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || 7188 !LIST_EMPTY(&inodedep->id_dirremhd) || 7189 !LIST_EMPTY(&inodedep->id_pendinghd) || 7190 !LIST_EMPTY(&inodedep->id_bufwait) || 7191 !LIST_EMPTY(&inodedep->id_inowait) || 7192 !TAILQ_EMPTY(&inodedep->id_inoreflst) || 7193 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 7194 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 7195 !TAILQ_EMPTY(&inodedep->id_extupdt) || 7196 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 7197 !TAILQ_EMPTY(&inodedep->id_freeblklst) || 7198 inodedep->id_mkdiradd != NULL || 7199 inodedep->id_nlinkdelta != 0) 7200 return (0); 7201 /* 7202 * Another process might be in initiate_write_inodeblock_ufs[12] 7203 * trying to allocate memory without holding "Softdep Lock". 7204 */ 7205 if ((inodedep->id_state & IOSTARTED) != 0 && 7206 inodedep->id_savedino1 == NULL) 7207 return (0); 7208 7209 if (inodedep->id_state & ONDEPLIST) 7210 LIST_REMOVE(inodedep, id_deps); 7211 inodedep->id_state &= ~ONDEPLIST; 7212 inodedep->id_state |= ALLCOMPLETE; 7213 inodedep->id_bmsafemap = NULL; 7214 if (inodedep->id_state & ONWORKLIST) 7215 WORKLIST_REMOVE(&inodedep->id_list); 7216 if (inodedep->id_savedino1 != NULL) { 7217 free(inodedep->id_savedino1, M_SAVEDINO); 7218 inodedep->id_savedino1 = NULL; 7219 } 7220 if (free_inodedep(inodedep) == 0) 7221 panic("check_inode_unwritten: busy inode"); 7222 return (1); 7223 } 7224 7225 /* 7226 * Try to free an inodedep structure. Return 1 if it could be freed. 7227 */ 7228 static int 7229 free_inodedep(inodedep) 7230 struct inodedep *inodedep; 7231 { 7232 7233 mtx_assert(&lk, MA_OWNED); 7234 if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 || 7235 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || 7236 !LIST_EMPTY(&inodedep->id_dirremhd) || 7237 !LIST_EMPTY(&inodedep->id_pendinghd) || 7238 !LIST_EMPTY(&inodedep->id_bufwait) || 7239 !LIST_EMPTY(&inodedep->id_inowait) || 7240 !TAILQ_EMPTY(&inodedep->id_inoreflst) || 7241 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 7242 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 7243 !TAILQ_EMPTY(&inodedep->id_extupdt) || 7244 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 7245 !TAILQ_EMPTY(&inodedep->id_freeblklst) || 7246 inodedep->id_mkdiradd != NULL || 7247 inodedep->id_nlinkdelta != 0 || 7248 inodedep->id_savedino1 != NULL) 7249 return (0); 7250 if (inodedep->id_state & ONDEPLIST) 7251 LIST_REMOVE(inodedep, id_deps); 7252 LIST_REMOVE(inodedep, id_hash); 7253 WORKITEM_FREE(inodedep, D_INODEDEP); 7254 return (1); 7255 } 7256 7257 /* 7258 * Free the block referenced by a freework structure. The parent freeblks 7259 * structure is released and completed when the final cg bitmap reaches 7260 * the disk. This routine may be freeing a jnewblk which never made it to 7261 * disk in which case we do not have to wait as the operation is undone 7262 * in memory immediately. 7263 */ 7264 static void 7265 freework_freeblock(freework) 7266 struct freework *freework; 7267 { 7268 struct freeblks *freeblks; 7269 struct jnewblk *jnewblk; 7270 struct ufsmount *ump; 7271 struct workhead wkhd; 7272 struct fs *fs; 7273 int bsize; 7274 int needj; 7275 7276 mtx_assert(&lk, MA_OWNED); 7277 /* 7278 * Handle partial truncate separately. 
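 * complete_trunc_indir() finishes zeroing the saved copy of the indirect
 * block before the underlying block is released.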
7279 */ 7280 if (freework->fw_indir) { 7281 complete_trunc_indir(freework); 7282 return; 7283 } 7284 freeblks = freework->fw_freeblks; 7285 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7286 fs = ump->um_fs; 7287 needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0; 7288 bsize = lfragtosize(fs, freework->fw_frags); 7289 LIST_INIT(&wkhd); 7290 /* 7291 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives 7292 * on the indirblk hashtable and prevents premature freeing. 7293 */ 7294 freework->fw_state |= DEPCOMPLETE; 7295 /* 7296 * SUJ needs to wait for the segment referencing freed indirect 7297 * blocks to expire so that we know the checker will not confuse 7298 * a re-allocated indirect block with its old contents. 7299 */ 7300 if (needj && freework->fw_lbn <= -NDADDR) 7301 indirblk_insert(freework); 7302 /* 7303 * If we are canceling an existing jnewblk pass it to the free 7304 * routine, otherwise pass the freeblk which will ultimately 7305 * release the freeblks. If we're not journaling, we can just 7306 * free the freeblks immediately. 7307 */ 7308 jnewblk = freework->fw_jnewblk; 7309 if (jnewblk != NULL) { 7310 cancel_jnewblk(jnewblk, &wkhd); 7311 needj = 0; 7312 } else if (needj) { 7313 freework->fw_state |= DELAYEDFREE; 7314 freeblks->fb_cgwait++; 7315 WORKLIST_INSERT(&wkhd, &freework->fw_list); 7316 } 7317 FREE_LOCK(&lk); 7318 freeblks_free(ump, freeblks, btodb(bsize)); 7319 ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize, 7320 freeblks->fb_inum, freeblks->fb_vtype, &wkhd); 7321 ACQUIRE_LOCK(&lk); 7322 /* 7323 * The jnewblk will be discarded and the bits in the map never 7324 * made it to disk. We can immediately free the freeblk. 7325 */ 7326 if (needj == 0) 7327 handle_written_freework(freework); 7328 } 7329 7330 /* 7331 * We enqueue freework items that need processing back on the freeblks and 7332 * add the freeblks to the worklist. This makes it easier to find all work 7333 * required to flush a truncation in process_truncates(). 7334 */ 7335 static void 7336 freework_enqueue(freework) 7337 struct freework *freework; 7338 { 7339 struct freeblks *freeblks; 7340 7341 freeblks = freework->fw_freeblks; 7342 if ((freework->fw_state & INPROGRESS) == 0) 7343 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); 7344 if ((freeblks->fb_state & 7345 (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE && 7346 LIST_EMPTY(&freeblks->fb_jblkdephd)) 7347 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 7348 } 7349 7350 /* 7351 * Start, continue, or finish the process of freeing an indirect block tree. 7352 * The free operation may be paused at any point with fw_off containing the 7353 * offset to restart from. This enables us to implement some flow control 7354 * for large truncates which may fan out and generate a huge number of 7355 * dependencies. 
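 *
 * Reader's summary of the dispatch below (no new behaviour implied):
 *
 *	fw_state & DEPCOMPLETE	-> handle_written_freework(), all done
 *	fw_off == NINDIR(fs)	-> freework_freeblock(), no pointers left
 *	otherwise		-> indir_trunc() resumes at index fw_off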
7356 */ 7357 static void 7358 handle_workitem_indirblk(freework) 7359 struct freework *freework; 7360 { 7361 struct freeblks *freeblks; 7362 struct ufsmount *ump; 7363 struct fs *fs; 7364 7365 freeblks = freework->fw_freeblks; 7366 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7367 fs = ump->um_fs; 7368 if (freework->fw_state & DEPCOMPLETE) { 7369 handle_written_freework(freework); 7370 return; 7371 } 7372 if (freework->fw_off == NINDIR(fs)) { 7373 freework_freeblock(freework); 7374 return; 7375 } 7376 freework->fw_state |= INPROGRESS; 7377 FREE_LOCK(&lk); 7378 indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), 7379 freework->fw_lbn); 7380 ACQUIRE_LOCK(&lk); 7381 } 7382 7383 /* 7384 * Called when a freework structure attached to a cg buf is written. The 7385 * ref on either the parent or the freeblks structure is released and 7386 * the freeblks is added back to the worklist if there is more work to do. 7387 */ 7388 static void 7389 handle_written_freework(freework) 7390 struct freework *freework; 7391 { 7392 struct freeblks *freeblks; 7393 struct freework *parent; 7394 7395 freeblks = freework->fw_freeblks; 7396 parent = freework->fw_parent; 7397 if (freework->fw_state & DELAYEDFREE) 7398 freeblks->fb_cgwait--; 7399 freework->fw_state |= COMPLETE; 7400 if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) 7401 WORKITEM_FREE(freework, D_FREEWORK); 7402 if (parent) { 7403 if (--parent->fw_ref == 0) 7404 freework_enqueue(parent); 7405 return; 7406 } 7407 if (--freeblks->fb_ref != 0) 7408 return; 7409 if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) == 7410 ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) 7411 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 7412 } 7413 7414 /* 7415 * This workitem routine performs the block de-allocation. 7416 * The workitem is added to the pending list after the updated 7417 * inode block has been written to disk. As mentioned above, 7418 * checks regarding the number of blocks de-allocated (compared 7419 * to the number of blocks allocated for the file) are also 7420 * performed in this function. 
7421 */ 7422 static int 7423 handle_workitem_freeblocks(freeblks, flags) 7424 struct freeblks *freeblks; 7425 int flags; 7426 { 7427 struct freework *freework; 7428 struct newblk *newblk; 7429 struct allocindir *aip; 7430 struct ufsmount *ump; 7431 struct worklist *wk; 7432 7433 KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd), 7434 ("handle_workitem_freeblocks: Journal entries not written.")); 7435 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7436 ACQUIRE_LOCK(&lk); 7437 while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { 7438 WORKLIST_REMOVE(wk); 7439 switch (wk->wk_type) { 7440 case D_DIRREM: 7441 wk->wk_state |= COMPLETE; 7442 add_to_worklist(wk, 0); 7443 continue; 7444 7445 case D_ALLOCDIRECT: 7446 free_newblk(WK_NEWBLK(wk)); 7447 continue; 7448 7449 case D_ALLOCINDIR: 7450 aip = WK_ALLOCINDIR(wk); 7451 freework = NULL; 7452 if (aip->ai_state & DELAYEDFREE) { 7453 FREE_LOCK(&lk); 7454 freework = newfreework(ump, freeblks, NULL, 7455 aip->ai_lbn, aip->ai_newblkno, 7456 ump->um_fs->fs_frag, 0, 0); 7457 ACQUIRE_LOCK(&lk); 7458 } 7459 newblk = WK_NEWBLK(wk); 7460 if (newblk->nb_jnewblk) { 7461 freework->fw_jnewblk = newblk->nb_jnewblk; 7462 newblk->nb_jnewblk->jn_dep = &freework->fw_list; 7463 newblk->nb_jnewblk = NULL; 7464 } 7465 free_newblk(newblk); 7466 continue; 7467 7468 case D_FREEWORK: 7469 freework = WK_FREEWORK(wk); 7470 if (freework->fw_lbn <= -NDADDR) 7471 handle_workitem_indirblk(freework); 7472 else 7473 freework_freeblock(freework); 7474 continue; 7475 default: 7476 panic("handle_workitem_freeblocks: Unknown type %s", 7477 TYPENAME(wk->wk_type)); 7478 } 7479 } 7480 if (freeblks->fb_ref != 0) { 7481 freeblks->fb_state &= ~INPROGRESS; 7482 wake_worklist(&freeblks->fb_list); 7483 freeblks = NULL; 7484 } 7485 FREE_LOCK(&lk); 7486 if (freeblks) 7487 return handle_complete_freeblocks(freeblks, flags); 7488 return (0); 7489 } 7490 7491 /* 7492 * Handle completion of block free via truncate. This allows fs_pending 7493 * to track the actual free block count more closely than if we only updated 7494 * it at the end. We must be careful to handle cases where the block count 7495 * on free was incorrect. 7496 */ 7497 static void 7498 freeblks_free(ump, freeblks, blocks) 7499 struct ufsmount *ump; 7500 struct freeblks *freeblks; 7501 int blocks; 7502 { 7503 struct fs *fs; 7504 ufs2_daddr_t remain; 7505 7506 UFS_LOCK(ump); 7507 remain = -freeblks->fb_chkcnt; 7508 freeblks->fb_chkcnt += blocks; 7509 if (remain > 0) { 7510 if (remain < blocks) 7511 blocks = remain; 7512 fs = ump->um_fs; 7513 fs->fs_pendingblocks -= blocks; 7514 } 7515 UFS_UNLOCK(ump); 7516 } 7517 7518 /* 7519 * Once all of the freework workitems are complete we can retire the 7520 * freeblocks dependency and any journal work awaiting completion. This 7521 * can not be called until all other dependencies are stable on disk. 7522 */ 7523 static int 7524 handle_complete_freeblocks(freeblks, flags) 7525 struct freeblks *freeblks; 7526 int flags; 7527 { 7528 struct inodedep *inodedep; 7529 struct inode *ip; 7530 struct vnode *vp; 7531 struct fs *fs; 7532 struct ufsmount *ump; 7533 ufs2_daddr_t spare; 7534 7535 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7536 fs = ump->um_fs; 7537 flags = LK_EXCLUSIVE | flags; 7538 spare = freeblks->fb_chkcnt; 7539 7540 /* 7541 * If we did not release the expected number of blocks we may have 7542 * to adjust the inode block count here. Only do so if it wasn't 7543 * a truncation to zero and the modrev still matches. 
7544 */ 7545 if (spare && freeblks->fb_len != 0) { 7546 if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum, 7547 flags, &vp, FFSV_FORCEINSMQ) != 0) 7548 return (EBUSY); 7549 ip = VTOI(vp); 7550 if (DIP(ip, i_modrev) == freeblks->fb_modrev) { 7551 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare); 7552 ip->i_flag |= IN_CHANGE; 7553 /* 7554 * We must wait so this happens before the 7555 * journal is reclaimed. 7556 */ 7557 ffs_update(vp, 1); 7558 } 7559 vput(vp); 7560 } 7561 if (spare < 0) { 7562 UFS_LOCK(ump); 7563 fs->fs_pendingblocks += spare; 7564 UFS_UNLOCK(ump); 7565 } 7566 #ifdef QUOTA 7567 /* Handle spare. */ 7568 if (spare) 7569 quotaadj(freeblks->fb_quota, ump, -spare); 7570 quotarele(freeblks->fb_quota); 7571 #endif 7572 ACQUIRE_LOCK(&lk); 7573 if (freeblks->fb_state & ONDEPLIST) { 7574 inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum, 7575 0, &inodedep); 7576 TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next); 7577 freeblks->fb_state &= ~ONDEPLIST; 7578 if (TAILQ_EMPTY(&inodedep->id_freeblklst)) 7579 free_inodedep(inodedep); 7580 } 7581 /* 7582 * All of the freeblock deps must be complete prior to this call 7583 * so it's now safe to complete earlier outstanding journal entries. 7584 */ 7585 handle_jwork(&freeblks->fb_jwork); 7586 WORKITEM_FREE(freeblks, D_FREEBLKS); 7587 FREE_LOCK(&lk); 7588 return (0); 7589 } 7590 7591 /* 7592 * Release blocks associated with the freeblks and stored in the indirect 7593 * block dbn. If level is greater than SINGLE, the block is an indirect block 7594 * and recursive calls to indirtrunc must be used to cleanse other indirect 7595 * blocks. 7596 * 7597 * This handles partial and complete truncation of blocks. Partial is noted 7598 * with goingaway == 0. In this case the freework is completed after the 7599 * zero'd indirects are written to disk. For full truncation the freework 7600 * is completed after the block is freed. 7601 */ 7602 static void 7603 indir_trunc(freework, dbn, lbn) 7604 struct freework *freework; 7605 ufs2_daddr_t dbn; 7606 ufs_lbn_t lbn; 7607 { 7608 struct freework *nfreework; 7609 struct workhead wkhd; 7610 struct freeblks *freeblks; 7611 struct buf *bp; 7612 struct fs *fs; 7613 struct indirdep *indirdep; 7614 struct ufsmount *ump; 7615 ufs1_daddr_t *bap1 = 0; 7616 ufs2_daddr_t nb, nnb, *bap2 = 0; 7617 ufs_lbn_t lbnadd, nlbn; 7618 int i, nblocks, ufs1fmt; 7619 int freedblocks; 7620 int goingaway; 7621 int freedeps; 7622 int needj; 7623 int level; 7624 int cnt; 7625 7626 freeblks = freework->fw_freeblks; 7627 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7628 fs = ump->um_fs; 7629 /* 7630 * Get buffer of block pointers to be freed. There are three cases: 7631 * 7632 * 1) Partial truncate caches the indirdep pointer in the freework 7633 * which provides us a back copy to the save bp which holds the 7634 * pointers we want to clear. When this completes the zero 7635 * pointers are written to the real copy. 7636 * 2) The indirect is being completely truncated, cancel_indirdep() 7637 * eliminated the real copy and placed the indirdep on the saved 7638 * copy. The indirdep and buf are discarded when this completes. 7639 * 3) The indirect was not in memory, we read a copy off of the disk 7640 * using the devvp and drop and invalidate the buffer when we're 7641 * done. 
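	 *
	 * (Reader's note: the three cases above correspond, in order, to
	 * the fw_indir != NULL branch, the incore() branch that expects a
	 * GOINGAWAY indirdep, and the final bread() branch below; in the
	 * last two cases goingaway remains set and the buffer is
	 * invalidated once its pointers have been freed.)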
7642 */ 7643 goingaway = 1; 7644 indirdep = NULL; 7645 if (freework->fw_indir != NULL) { 7646 goingaway = 0; 7647 indirdep = freework->fw_indir; 7648 bp = indirdep->ir_savebp; 7649 if (bp == NULL || bp->b_blkno != dbn) 7650 panic("indir_trunc: Bad saved buf %p blkno %jd", 7651 bp, (intmax_t)dbn); 7652 } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) { 7653 /* 7654 * The lock prevents the buf dep list from changing and 7655 * indirects on devvp should only ever have one dependency. 7656 */ 7657 indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep)); 7658 if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0) 7659 panic("indir_trunc: Bad indirdep %p from buf %p", 7660 indirdep, bp); 7661 } else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 7662 NOCRED, &bp) != 0) { 7663 brelse(bp); 7664 return; 7665 } 7666 ACQUIRE_LOCK(&lk); 7667 /* Protects against a race with complete_trunc_indir(). */ 7668 freework->fw_state &= ~INPROGRESS; 7669 /* 7670 * If we have an indirdep we need to enforce the truncation order 7671 * and discard it when it is complete. 7672 */ 7673 if (indirdep) { 7674 if (freework != TAILQ_FIRST(&indirdep->ir_trunc) && 7675 !TAILQ_EMPTY(&indirdep->ir_trunc)) { 7676 /* 7677 * Add the complete truncate to the list on the 7678 * indirdep to enforce in-order processing. 7679 */ 7680 if (freework->fw_indir == NULL) 7681 TAILQ_INSERT_TAIL(&indirdep->ir_trunc, 7682 freework, fw_next); 7683 FREE_LOCK(&lk); 7684 return; 7685 } 7686 /* 7687 * If we're goingaway, free the indirdep. Otherwise it will 7688 * linger until the write completes. 7689 */ 7690 if (goingaway) { 7691 free_indirdep(indirdep); 7692 ump->um_numindirdeps -= 1; 7693 } 7694 } 7695 FREE_LOCK(&lk); 7696 /* Initialize pointers depending on block size. */ 7697 if (ump->um_fstype == UFS1) { 7698 bap1 = (ufs1_daddr_t *)bp->b_data; 7699 nb = bap1[freework->fw_off]; 7700 ufs1fmt = 1; 7701 } else { 7702 bap2 = (ufs2_daddr_t *)bp->b_data; 7703 nb = bap2[freework->fw_off]; 7704 ufs1fmt = 0; 7705 } 7706 level = lbn_level(lbn); 7707 needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0; 7708 lbnadd = lbn_offset(fs, level); 7709 nblocks = btodb(fs->fs_bsize); 7710 nfreework = freework; 7711 freedeps = 0; 7712 cnt = 0; 7713 /* 7714 * Reclaim blocks. Traverses into nested indirect levels and 7715 * arranges for the current level to be freed when subordinates 7716 * are free when journaling. 7717 */ 7718 for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { 7719 if (i != NINDIR(fs) - 1) { 7720 if (ufs1fmt) 7721 nnb = bap1[i+1]; 7722 else 7723 nnb = bap2[i+1]; 7724 } else 7725 nnb = 0; 7726 if (nb == 0) 7727 continue; 7728 cnt++; 7729 if (level != 0) { 7730 nlbn = (lbn + 1) - (i * lbnadd); 7731 if (needj != 0) { 7732 nfreework = newfreework(ump, freeblks, freework, 7733 nlbn, nb, fs->fs_frag, 0, 0); 7734 freedeps++; 7735 } 7736 indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); 7737 } else { 7738 struct freedep *freedep; 7739 7740 /* 7741 * Attempt to aggregate freedep dependencies for 7742 * all blocks being released to the same CG. 
7743 */ 7744 LIST_INIT(&wkhd); 7745 if (needj != 0 && 7746 (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) { 7747 freedep = newfreedep(freework); 7748 WORKLIST_INSERT_UNLOCKED(&wkhd, 7749 &freedep->fd_list); 7750 freedeps++; 7751 } 7752 ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, 7753 fs->fs_bsize, freeblks->fb_inum, 7754 freeblks->fb_vtype, &wkhd); 7755 } 7756 } 7757 if (goingaway) { 7758 bp->b_flags |= B_INVAL | B_NOCACHE; 7759 brelse(bp); 7760 } 7761 freedblocks = 0; 7762 if (level == 0) 7763 freedblocks = (nblocks * cnt); 7764 if (needj == 0) 7765 freedblocks += nblocks; 7766 freeblks_free(ump, freeblks, freedblocks); 7767 /* 7768 * If we are journaling set up the ref counts and offset so this 7769 * indirect can be completed when its children are free. 7770 */ 7771 if (needj) { 7772 ACQUIRE_LOCK(&lk); 7773 freework->fw_off = i; 7774 freework->fw_ref += freedeps; 7775 freework->fw_ref -= NINDIR(fs) + 1; 7776 if (level == 0) 7777 freeblks->fb_cgwait += freedeps; 7778 if (freework->fw_ref == 0) 7779 freework_freeblock(freework); 7780 FREE_LOCK(&lk); 7781 return; 7782 } 7783 /* 7784 * If we're not journaling we can free the indirect now. 7785 */ 7786 dbn = dbtofsb(fs, dbn); 7787 ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, 7788 freeblks->fb_inum, freeblks->fb_vtype, NULL); 7789 /* Non SUJ softdep does single-threaded truncations. */ 7790 if (freework->fw_blkno == dbn) { 7791 freework->fw_state |= ALLCOMPLETE; 7792 ACQUIRE_LOCK(&lk); 7793 handle_written_freework(freework); 7794 FREE_LOCK(&lk); 7795 } 7796 return; 7797 } 7798 7799 /* 7800 * Cancel an allocindir when it is removed via truncation. When bp is not 7801 * NULL the indirect never appeared on disk and is scheduled to be freed 7802 * independently of the indir so we can more easily track journal work. 7803 */ 7804 static void 7805 cancel_allocindir(aip, bp, freeblks, trunc) 7806 struct allocindir *aip; 7807 struct buf *bp; 7808 struct freeblks *freeblks; 7809 int trunc; 7810 { 7811 struct indirdep *indirdep; 7812 struct freefrag *freefrag; 7813 struct newblk *newblk; 7814 7815 newblk = (struct newblk *)aip; 7816 LIST_REMOVE(aip, ai_next); 7817 /* 7818 * We must eliminate the pointer in bp if it must be freed on its 7819 * own due to partial truncate or pending journal work. 7820 */ 7821 if (bp && (trunc || newblk->nb_jnewblk)) { 7822 /* 7823 * Clear the pointer and mark the aip to be freed 7824 * directly if it never existed on disk. 7825 */ 7826 aip->ai_state |= DELAYEDFREE; 7827 indirdep = aip->ai_indirdep; 7828 if (indirdep->ir_state & UFS1FMT) 7829 ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0; 7830 else 7831 ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0; 7832 } 7833 /* 7834 * When truncating the previous pointer will be freed via 7835 * savedbp. Eliminate the freefrag which would dup free. 7836 */ 7837 if (trunc && (freefrag = newblk->nb_freefrag) != NULL) { 7838 newblk->nb_freefrag = NULL; 7839 if (freefrag->ff_jdep) 7840 cancel_jfreefrag( 7841 WK_JFREEFRAG(freefrag->ff_jdep)); 7842 jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork); 7843 WORKITEM_FREE(freefrag, D_FREEFRAG); 7844 } 7845 /* 7846 * If the journal hasn't been written the jnewblk must be passed 7847 * to the call to ffs_blkfree that reclaims the space. We accomplish 7848 * this by leaving the journal dependency on the newblk to be freed 7849 * when a freework is created in handle_workitem_freeblocks(). 
7850 */ 7851 cancel_newblk(newblk, NULL, &freeblks->fb_jwork); 7852 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); 7853 } 7854 7855 /* 7856 * Create the mkdir dependencies for . and .. in a new directory. Link them 7857 * in to a newdirblk so any subsequent additions are tracked properly. The 7858 * caller is responsible for adding the mkdir1 dependency to the journal 7859 * and updating id_mkdiradd. This function returns with lk held. 7860 */ 7861 static struct mkdir * 7862 setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) 7863 struct diradd *dap; 7864 ino_t newinum; 7865 ino_t dinum; 7866 struct buf *newdirbp; 7867 struct mkdir **mkdirp; 7868 { 7869 struct newblk *newblk; 7870 struct pagedep *pagedep; 7871 struct inodedep *inodedep; 7872 struct newdirblk *newdirblk = 0; 7873 struct mkdir *mkdir1, *mkdir2; 7874 struct worklist *wk; 7875 struct jaddref *jaddref; 7876 struct mount *mp; 7877 7878 mp = dap->da_list.wk_mp; 7879 newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, 7880 M_SOFTDEP_FLAGS); 7881 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 7882 LIST_INIT(&newdirblk->db_mkdir); 7883 mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); 7884 workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); 7885 mkdir1->md_state = ATTACHED | MKDIR_BODY; 7886 mkdir1->md_diradd = dap; 7887 mkdir1->md_jaddref = NULL; 7888 mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); 7889 workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); 7890 mkdir2->md_state = ATTACHED | MKDIR_PARENT; 7891 mkdir2->md_diradd = dap; 7892 mkdir2->md_jaddref = NULL; 7893 if (MOUNTEDSUJ(mp) == 0) { 7894 mkdir1->md_state |= DEPCOMPLETE; 7895 mkdir2->md_state |= DEPCOMPLETE; 7896 } 7897 /* 7898 * Dependency on "." and ".." being written to disk. 7899 */ 7900 mkdir1->md_buf = newdirbp; 7901 ACQUIRE_LOCK(&lk); 7902 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); 7903 /* 7904 * We must link the pagedep, allocdirect, and newdirblk for 7905 * the initial file page so the pointer to the new directory 7906 * is not written until the directory contents are live and 7907 * any subsequent additions are not marked live until the 7908 * block is reachable via the inode. 7909 */ 7910 if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0) 7911 panic("setup_newdir: lost pagedep"); 7912 LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) 7913 if (wk->wk_type == D_ALLOCDIRECT) 7914 break; 7915 if (wk == NULL) 7916 panic("setup_newdir: lost allocdirect"); 7917 if (pagedep->pd_state & NEWBLOCK) 7918 panic("setup_newdir: NEWBLOCK already set"); 7919 newblk = WK_NEWBLK(wk); 7920 pagedep->pd_state |= NEWBLOCK; 7921 pagedep->pd_newdirblk = newdirblk; 7922 newdirblk->db_pagedep = pagedep; 7923 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); 7924 WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); 7925 /* 7926 * Look up the inodedep for the parent directory so that we 7927 * can link mkdir2 into the pending dotdot jaddref or 7928 * the inode write if there is none. If the inode is 7929 * ALLCOMPLETE and no jaddref is present all dependencies have 7930 * been satisfied and mkdir2 can be freed. 
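	 *
	 * In outline (a reader's aid describing the structures built above):
	 *
	 *	mkdir1 (MKDIR_BODY)   - satisfied when the new directory's
	 *				first block, holding "." and "..",
	 *				is written (newdirbp).
	 *	mkdir2 (MKDIR_PARENT) - satisfied when the parent's inode,
	 *				carrying the extra link from "..",
	 *				is written; under SUJ it is tied to
	 *				the dotdot jaddref instead.
	 *
	 * The new name's diradd remains incomplete until both are retired.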
7931 */ 7932 inodedep_lookup(mp, dinum, 0, &inodedep); 7933 if (MOUNTEDSUJ(mp)) { 7934 if (inodedep == NULL) 7935 panic("setup_newdir: Lost parent."); 7936 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 7937 inoreflst); 7938 KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && 7939 (jaddref->ja_state & MKDIR_PARENT), 7940 ("setup_newdir: bad dotdot jaddref %p", jaddref)); 7941 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 7942 mkdir2->md_jaddref = jaddref; 7943 jaddref->ja_mkdir = mkdir2; 7944 } else if (inodedep == NULL || 7945 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 7946 dap->da_state &= ~MKDIR_PARENT; 7947 WORKITEM_FREE(mkdir2, D_MKDIR); 7948 } else { 7949 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 7950 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list); 7951 } 7952 *mkdirp = mkdir2; 7953 7954 return (mkdir1); 7955 } 7956 7957 /* 7958 * Directory entry addition dependencies. 7959 * 7960 * When adding a new directory entry, the inode (with its incremented link 7961 * count) must be written to disk before the directory entry's pointer to it. 7962 * Also, if the inode is newly allocated, the corresponding freemap must be 7963 * updated (on disk) before the directory entry's pointer. These requirements 7964 * are met via undo/redo on the directory entry's pointer, which consists 7965 * simply of the inode number. 7966 * 7967 * As directory entries are added and deleted, the free space within a 7968 * directory block can become fragmented. The ufs filesystem will compact 7969 * a fragmented directory block to make space for a new entry. When this 7970 * occurs, the offsets of previously added entries change. Any "diradd" 7971 * dependency structures corresponding to these entries must be updated with 7972 * the new offsets. 7973 */ 7974 7975 /* 7976 * This routine is called after the in-memory inode's link 7977 * count has been incremented, but before the directory entry's 7978 * pointer to the inode has been set. 7979 */ 7980 int 7981 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 7982 struct buf *bp; /* buffer containing directory block */ 7983 struct inode *dp; /* inode for directory */ 7984 off_t diroffset; /* offset of new entry in directory */ 7985 ino_t newinum; /* inode referenced by new directory entry */ 7986 struct buf *newdirbp; /* non-NULL => contents of new mkdir */ 7987 int isnewblk; /* entry is in a newly allocated block */ 7988 { 7989 int offset; /* offset of new entry within directory block */ 7990 ufs_lbn_t lbn; /* block in directory containing new entry */ 7991 struct fs *fs; 7992 struct diradd *dap; 7993 struct newblk *newblk; 7994 struct pagedep *pagedep; 7995 struct inodedep *inodedep; 7996 struct newdirblk *newdirblk = 0; 7997 struct mkdir *mkdir1, *mkdir2; 7998 struct jaddref *jaddref; 7999 struct mount *mp; 8000 int isindir; 8001 8002 /* 8003 * Whiteouts have no dependencies. 8004 */ 8005 if (newinum == WINO) { 8006 if (newdirbp != NULL) 8007 bdwrite(newdirbp); 8008 return (0); 8009 } 8010 jaddref = NULL; 8011 mkdir1 = mkdir2 = NULL; 8012 mp = UFSTOVFS(dp->i_ump); 8013 fs = dp->i_fs; 8014 lbn = lblkno(fs, diroffset); 8015 offset = blkoff(fs, diroffset); 8016 dap = malloc(sizeof(struct diradd), M_DIRADD, 8017 M_SOFTDEP_FLAGS|M_ZERO); 8018 workitem_alloc(&dap->da_list, D_DIRADD, mp); 8019 dap->da_offset = offset; 8020 dap->da_newinum = newinum; 8021 dap->da_state = ATTACHED; 8022 LIST_INIT(&dap->da_jwork); 8023 isindir = bp->b_lblkno >= NDADDR; 8024 if (isnewblk && 8025 (isindir ? 
blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { 8026 newdirblk = malloc(sizeof(struct newdirblk), 8027 M_NEWDIRBLK, M_SOFTDEP_FLAGS); 8028 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 8029 LIST_INIT(&newdirblk->db_mkdir); 8030 } 8031 /* 8032 * If we're creating a new directory setup the dependencies and set 8033 * the dap state to wait for them. Otherwise it's COMPLETE and 8034 * we can move on. 8035 */ 8036 if (newdirbp == NULL) { 8037 dap->da_state |= DEPCOMPLETE; 8038 ACQUIRE_LOCK(&lk); 8039 } else { 8040 dap->da_state |= MKDIR_BODY | MKDIR_PARENT; 8041 mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, 8042 &mkdir2); 8043 } 8044 /* 8045 * Link into parent directory pagedep to await its being written. 8046 */ 8047 pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep); 8048 #ifdef DEBUG 8049 if (diradd_lookup(pagedep, offset) != NULL) 8050 panic("softdep_setup_directory_add: %p already at off %d\n", 8051 diradd_lookup(pagedep, offset), offset); 8052 #endif 8053 dap->da_pagedep = pagedep; 8054 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, 8055 da_pdlist); 8056 inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep); 8057 /* 8058 * If we're journaling, link the diradd into the jaddref so it 8059 * may be completed after the journal entry is written. Otherwise, 8060 * link the diradd into its inodedep. If the inode is not yet 8061 * written place it on the bufwait list, otherwise do the post-inode 8062 * write processing to put it on the id_pendinghd list. 8063 */ 8064 if (MOUNTEDSUJ(mp)) { 8065 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 8066 inoreflst); 8067 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 8068 ("softdep_setup_directory_add: bad jaddref %p", jaddref)); 8069 jaddref->ja_diroff = diroffset; 8070 jaddref->ja_diradd = dap; 8071 add_to_journal(&jaddref->ja_list); 8072 } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) 8073 diradd_inode_written(dap, inodedep); 8074 else 8075 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 8076 /* 8077 * Add the journal entries for . and .. links now that the primary 8078 * link is written. 8079 */ 8080 if (mkdir1 != NULL && MOUNTEDSUJ(mp)) { 8081 jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, 8082 inoreflst, if_deps); 8083 KASSERT(jaddref != NULL && 8084 jaddref->ja_ino == jaddref->ja_parent && 8085 (jaddref->ja_state & MKDIR_BODY), 8086 ("softdep_setup_directory_add: bad dot jaddref %p", 8087 jaddref)); 8088 mkdir1->md_jaddref = jaddref; 8089 jaddref->ja_mkdir = mkdir1; 8090 /* 8091 * It is important that the dotdot journal entry 8092 * is added prior to the dot entry since dot writes 8093 * both the dot and dotdot links. These both must 8094 * be added after the primary link for the journal 8095 * to remain consistent. 8096 */ 8097 add_to_journal(&mkdir2->md_jaddref->ja_list); 8098 add_to_journal(&jaddref->ja_list); 8099 } 8100 /* 8101 * If we are adding a new directory remember this diradd so that if 8102 * we rename it we can keep the dot and dotdot dependencies. If 8103 * we are adding a new name for an inode that has a mkdiradd we 8104 * must be in rename and we have to move the dot and dotdot 8105 * dependencies to this new name. The old name is being orphaned 8106 * soon. 
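	 *
	 * (Illustration with made-up names: if a directory created as
	 * "a/d" is renamed to "b/d" before its mkdir dependencies
	 * complete, the diradd for "b/d" arrives here while id_mkdiradd
	 * still points at the "a/d" diradd, and merge_diradd() below moves
	 * the MKDIR_PARENT/MKDIR_BODY state and journal work to the new
	 * name.)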
8107 */ 8108 if (mkdir1 != NULL) { 8109 if (inodedep->id_mkdiradd != NULL) 8110 panic("softdep_setup_directory_add: Existing mkdir"); 8111 inodedep->id_mkdiradd = dap; 8112 } else if (inodedep->id_mkdiradd) 8113 merge_diradd(inodedep, dap); 8114 if (newdirblk) { 8115 /* 8116 * There is nothing to do if we are already tracking 8117 * this block. 8118 */ 8119 if ((pagedep->pd_state & NEWBLOCK) != 0) { 8120 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 8121 FREE_LOCK(&lk); 8122 return (0); 8123 } 8124 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk) 8125 == 0) 8126 panic("softdep_setup_directory_add: lost entry"); 8127 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); 8128 pagedep->pd_state |= NEWBLOCK; 8129 pagedep->pd_newdirblk = newdirblk; 8130 newdirblk->db_pagedep = pagedep; 8131 FREE_LOCK(&lk); 8132 /* 8133 * If we extended into an indirect signal direnter to sync. 8134 */ 8135 if (isindir) 8136 return (1); 8137 return (0); 8138 } 8139 FREE_LOCK(&lk); 8140 return (0); 8141 } 8142 8143 /* 8144 * This procedure is called to change the offset of a directory 8145 * entry when compacting a directory block which must be owned 8146 * exclusively by the caller. Note that the actual entry movement 8147 * must be done in this procedure to ensure that no I/O completions 8148 * occur while the move is in progress. 8149 */ 8150 void 8151 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 8152 struct buf *bp; /* Buffer holding directory block. */ 8153 struct inode *dp; /* inode for directory */ 8154 caddr_t base; /* address of dp->i_offset */ 8155 caddr_t oldloc; /* address of old directory location */ 8156 caddr_t newloc; /* address of new directory location */ 8157 int entrysize; /* size of directory entry */ 8158 { 8159 int offset, oldoffset, newoffset; 8160 struct pagedep *pagedep; 8161 struct jmvref *jmvref; 8162 struct diradd *dap; 8163 struct direct *de; 8164 struct mount *mp; 8165 ufs_lbn_t lbn; 8166 int flags; 8167 8168 mp = UFSTOVFS(dp->i_ump); 8169 de = (struct direct *)oldloc; 8170 jmvref = NULL; 8171 flags = 0; 8172 /* 8173 * Moves are always journaled as it would be too complex to 8174 * determine if any affected adds or removes are present in the 8175 * journal. 8176 */ 8177 if (MOUNTEDSUJ(mp)) { 8178 flags = DEPALLOC; 8179 jmvref = newjmvref(dp, de->d_ino, 8180 dp->i_offset + (oldloc - base), 8181 dp->i_offset + (newloc - base)); 8182 } 8183 lbn = lblkno(dp->i_fs, dp->i_offset); 8184 offset = blkoff(dp->i_fs, dp->i_offset); 8185 oldoffset = offset + (oldloc - base); 8186 newoffset = offset + (newloc - base); 8187 ACQUIRE_LOCK(&lk); 8188 if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0) 8189 goto done; 8190 dap = diradd_lookup(pagedep, oldoffset); 8191 if (dap) { 8192 dap->da_offset = newoffset; 8193 newoffset = DIRADDHASH(newoffset); 8194 oldoffset = DIRADDHASH(oldoffset); 8195 if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && 8196 newoffset != oldoffset) { 8197 LIST_REMOVE(dap, da_pdlist); 8198 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], 8199 dap, da_pdlist); 8200 } 8201 } 8202 done: 8203 if (jmvref) { 8204 jmvref->jm_pagedep = pagedep; 8205 LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); 8206 add_to_journal(&jmvref->jm_list); 8207 } 8208 bcopy(oldloc, newloc, entrysize); 8209 FREE_LOCK(&lk); 8210 } 8211 8212 /* 8213 * Move the mkdir dependencies and journal work from one diradd to another 8214 * when renaming a directory. The new name must depend on the mkdir deps 8215 * completing as the old name did. 
Directories can only have one valid link 8216 * at a time so one must be canonical. 8217 */ 8218 static void 8219 merge_diradd(inodedep, newdap) 8220 struct inodedep *inodedep; 8221 struct diradd *newdap; 8222 { 8223 struct diradd *olddap; 8224 struct mkdir *mkdir, *nextmd; 8225 short state; 8226 8227 olddap = inodedep->id_mkdiradd; 8228 inodedep->id_mkdiradd = newdap; 8229 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 8230 newdap->da_state &= ~DEPCOMPLETE; 8231 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 8232 nextmd = LIST_NEXT(mkdir, md_mkdirs); 8233 if (mkdir->md_diradd != olddap) 8234 continue; 8235 mkdir->md_diradd = newdap; 8236 state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY); 8237 newdap->da_state |= state; 8238 olddap->da_state &= ~state; 8239 if ((olddap->da_state & 8240 (MKDIR_PARENT | MKDIR_BODY)) == 0) 8241 break; 8242 } 8243 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 8244 panic("merge_diradd: unfound ref"); 8245 } 8246 /* 8247 * Any mkdir related journal items are not safe to be freed until 8248 * the new name is stable. 8249 */ 8250 jwork_move(&newdap->da_jwork, &olddap->da_jwork); 8251 olddap->da_state |= DEPCOMPLETE; 8252 complete_diradd(olddap); 8253 } 8254 8255 /* 8256 * Move the diradd to the pending list when all diradd dependencies are 8257 * complete. 8258 */ 8259 static void 8260 complete_diradd(dap) 8261 struct diradd *dap; 8262 { 8263 struct pagedep *pagedep; 8264 8265 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 8266 if (dap->da_state & DIRCHG) 8267 pagedep = dap->da_previous->dm_pagedep; 8268 else 8269 pagedep = dap->da_pagedep; 8270 LIST_REMOVE(dap, da_pdlist); 8271 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 8272 } 8273 } 8274 8275 /* 8276 * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal 8277 * add entries and conditionally journal the remove. 8278 */ 8279 static void 8280 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref) 8281 struct diradd *dap; 8282 struct dirrem *dirrem; 8283 struct jremref *jremref; 8284 struct jremref *dotremref; 8285 struct jremref *dotdotremref; 8286 { 8287 struct inodedep *inodedep; 8288 struct jaddref *jaddref; 8289 struct inoref *inoref; 8290 struct mkdir *mkdir; 8291 8292 /* 8293 * If no remove references were allocated we're on a non-journaled 8294 * filesystem and can skip the cancel step. 8295 */ 8296 if (jremref == NULL) { 8297 free_diradd(dap, NULL); 8298 return; 8299 } 8300 /* 8301 * Cancel the primary name and free it if it does not require 8302 * journaling. 8303 */ 8304 if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum, 8305 0, &inodedep) != 0) { 8306 /* Abort the addref that references this diradd. */ 8307 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 8308 if (inoref->if_list.wk_type != D_JADDREF) 8309 continue; 8310 jaddref = (struct jaddref *)inoref; 8311 if (jaddref->ja_diradd != dap) 8312 continue; 8313 if (cancel_jaddref(jaddref, inodedep, 8314 &dirrem->dm_jwork) == 0) { 8315 free_jremref(jremref); 8316 jremref = NULL; 8317 } 8318 break; 8319 } 8320 } 8321 /* 8322 * Cancel subordinate names and free them if they do not require 8323 * journaling.
8324 */ 8325 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 8326 LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { 8327 if (mkdir->md_diradd != dap) 8328 continue; 8329 if ((jaddref = mkdir->md_jaddref) == NULL) 8330 continue; 8331 mkdir->md_jaddref = NULL; 8332 if (mkdir->md_state & MKDIR_PARENT) { 8333 if (cancel_jaddref(jaddref, NULL, 8334 &dirrem->dm_jwork) == 0) { 8335 free_jremref(dotdotremref); 8336 dotdotremref = NULL; 8337 } 8338 } else { 8339 if (cancel_jaddref(jaddref, inodedep, 8340 &dirrem->dm_jwork) == 0) { 8341 free_jremref(dotremref); 8342 dotremref = NULL; 8343 } 8344 } 8345 } 8346 } 8347 8348 if (jremref) 8349 journal_jremref(dirrem, jremref, inodedep); 8350 if (dotremref) 8351 journal_jremref(dirrem, dotremref, inodedep); 8352 if (dotdotremref) 8353 journal_jremref(dirrem, dotdotremref, NULL); 8354 jwork_move(&dirrem->dm_jwork, &dap->da_jwork); 8355 free_diradd(dap, &dirrem->dm_jwork); 8356 } 8357 8358 /* 8359 * Free a diradd dependency structure. This routine must be called 8360 * with splbio interrupts blocked. 8361 */ 8362 static void 8363 free_diradd(dap, wkhd) 8364 struct diradd *dap; 8365 struct workhead *wkhd; 8366 { 8367 struct dirrem *dirrem; 8368 struct pagedep *pagedep; 8369 struct inodedep *inodedep; 8370 struct mkdir *mkdir, *nextmd; 8371 8372 mtx_assert(&lk, MA_OWNED); 8373 LIST_REMOVE(dap, da_pdlist); 8374 if (dap->da_state & ONWORKLIST) 8375 WORKLIST_REMOVE(&dap->da_list); 8376 if ((dap->da_state & DIRCHG) == 0) { 8377 pagedep = dap->da_pagedep; 8378 } else { 8379 dirrem = dap->da_previous; 8380 pagedep = dirrem->dm_pagedep; 8381 dirrem->dm_dirinum = pagedep->pd_ino; 8382 dirrem->dm_state |= COMPLETE; 8383 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 8384 add_to_worklist(&dirrem->dm_list, 0); 8385 } 8386 if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 8387 0, &inodedep) != 0) 8388 if (inodedep->id_mkdiradd == dap) 8389 inodedep->id_mkdiradd = NULL; 8390 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 8391 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 8392 nextmd = LIST_NEXT(mkdir, md_mkdirs); 8393 if (mkdir->md_diradd != dap) 8394 continue; 8395 dap->da_state &= 8396 ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 8397 LIST_REMOVE(mkdir, md_mkdirs); 8398 if (mkdir->md_state & ONWORKLIST) 8399 WORKLIST_REMOVE(&mkdir->md_list); 8400 if (mkdir->md_jaddref != NULL) 8401 panic("free_diradd: Unexpected jaddref"); 8402 WORKITEM_FREE(mkdir, D_MKDIR); 8403 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) 8404 break; 8405 } 8406 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 8407 panic("free_diradd: unfound ref"); 8408 } 8409 if (inodedep) 8410 free_inodedep(inodedep); 8411 /* 8412 * Free any journal segments waiting for the directory write. 8413 */ 8414 handle_jwork(&dap->da_jwork); 8415 WORKITEM_FREE(dap, D_DIRADD); 8416 } 8417 8418 /* 8419 * Directory entry removal dependencies. 8420 * 8421 * When removing a directory entry, the entry's inode pointer must be 8422 * zero'ed on disk before the corresponding inode's link count is decremented 8423 * (possibly freeing the inode for re-use). This dependency is handled by 8424 * updating the directory entry but delaying the inode count reduction until 8425 * after the directory block has been written to disk. After this point, the 8426 * inode count can be decremented whenever it is convenient. 8427 */ 8428 8429 /* 8430 * This routine should be called immediately after removing 8431 * a directory entry. 
The inode's link count should not be 8432 * decremented by the calling procedure -- the soft updates 8433 * code will do this task when it is safe. 8434 */ 8435 void 8436 softdep_setup_remove(bp, dp, ip, isrmdir) 8437 struct buf *bp; /* buffer containing directory block */ 8438 struct inode *dp; /* inode for the directory being modified */ 8439 struct inode *ip; /* inode for directory entry being removed */ 8440 int isrmdir; /* indicates if doing RMDIR */ 8441 { 8442 struct dirrem *dirrem, *prevdirrem; 8443 struct inodedep *inodedep; 8444 int direct; 8445 8446 /* 8447 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want 8448 * newdirrem() to setup the full directory remove which requires 8449 * isrmdir > 1. 8450 */ 8451 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 8452 /* 8453 * Add the dirrem to the inodedep's pending remove list for quick 8454 * discovery later. 8455 */ 8456 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 8457 &inodedep) == 0) 8458 panic("softdep_setup_remove: Lost inodedep."); 8459 KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); 8460 dirrem->dm_state |= ONDEPLIST; 8461 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 8462 8463 /* 8464 * If the COMPLETE flag is clear, then there were no active 8465 * entries and we want to roll back to a zeroed entry until 8466 * the new inode is committed to disk. If the COMPLETE flag is 8467 * set then we have deleted an entry that never made it to 8468 * disk. If the entry we deleted resulted from a name change, 8469 * then the old name still resides on disk. We cannot delete 8470 * its inode (returned to us in prevdirrem) until the zeroed 8471 * directory entry gets to disk. The new inode has never been 8472 * referenced on the disk, so can be deleted immediately. 8473 */ 8474 if ((dirrem->dm_state & COMPLETE) == 0) { 8475 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, 8476 dm_next); 8477 FREE_LOCK(&lk); 8478 } else { 8479 if (prevdirrem != NULL) 8480 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, 8481 prevdirrem, dm_next); 8482 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; 8483 direct = LIST_EMPTY(&dirrem->dm_jremrefhd); 8484 FREE_LOCK(&lk); 8485 if (direct) 8486 handle_workitem_remove(dirrem, 0); 8487 } 8488 } 8489 8490 /* 8491 * Check for an entry matching 'offset' on both the pd_diraddhd list and the 8492 * pd_pendinghd list of a pagedep. 8493 */ 8494 static struct diradd * 8495 diradd_lookup(pagedep, offset) 8496 struct pagedep *pagedep; 8497 int offset; 8498 { 8499 struct diradd *dap; 8500 8501 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) 8502 if (dap->da_offset == offset) 8503 return (dap); 8504 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 8505 if (dap->da_offset == offset) 8506 return (dap); 8507 return (NULL); 8508 } 8509 8510 /* 8511 * Search for a .. diradd dependency in a directory that is being removed. 8512 * If the directory was renamed to a new parent we have a diradd rather 8513 * than a mkdir for the .. entry. We need to cancel it now before 8514 * it is found in truncate.
8515 */ 8516 static struct jremref * 8517 cancel_diradd_dotdot(ip, dirrem, jremref) 8518 struct inode *ip; 8519 struct dirrem *dirrem; 8520 struct jremref *jremref; 8521 { 8522 struct pagedep *pagedep; 8523 struct diradd *dap; 8524 struct worklist *wk; 8525 8526 if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0, 8527 &pagedep) == 0) 8528 return (jremref); 8529 dap = diradd_lookup(pagedep, DOTDOT_OFFSET); 8530 if (dap == NULL) 8531 return (jremref); 8532 cancel_diradd(dap, dirrem, jremref, NULL, NULL); 8533 /* 8534 * Mark any journal work as belonging to the parent so it is freed 8535 * with the .. reference. 8536 */ 8537 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 8538 wk->wk_state |= MKDIR_PARENT; 8539 return (NULL); 8540 } 8541 8542 /* 8543 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to 8544 * replace it with a dirrem/diradd pair as a result of re-parenting a 8545 * directory. This ensures that we don't simultaneously have a mkdir and 8546 * a diradd for the same .. entry. 8547 */ 8548 static struct jremref * 8549 cancel_mkdir_dotdot(ip, dirrem, jremref) 8550 struct inode *ip; 8551 struct dirrem *dirrem; 8552 struct jremref *jremref; 8553 { 8554 struct inodedep *inodedep; 8555 struct jaddref *jaddref; 8556 struct mkdir *mkdir; 8557 struct diradd *dap; 8558 8559 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 8560 &inodedep) == 0) 8561 panic("cancel_mkdir_dotdot: Lost inodedep"); 8562 dap = inodedep->id_mkdiradd; 8563 if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) 8564 return (jremref); 8565 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; 8566 mkdir = LIST_NEXT(mkdir, md_mkdirs)) 8567 if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) 8568 break; 8569 if (mkdir == NULL) 8570 panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); 8571 if ((jaddref = mkdir->md_jaddref) != NULL) { 8572 mkdir->md_jaddref = NULL; 8573 jaddref->ja_state &= ~MKDIR_PARENT; 8574 if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0, 8575 &inodedep) == 0) 8576 panic("cancel_mkdir_dotdot: Lost parent inodedep"); 8577 if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { 8578 journal_jremref(dirrem, jremref, inodedep); 8579 jremref = NULL; 8580 } 8581 } 8582 if (mkdir->md_state & ONWORKLIST) 8583 WORKLIST_REMOVE(&mkdir->md_list); 8584 mkdir->md_state |= ALLCOMPLETE; 8585 complete_mkdir(mkdir); 8586 return (jremref); 8587 } 8588 8589 static void 8590 journal_jremref(dirrem, jremref, inodedep) 8591 struct dirrem *dirrem; 8592 struct jremref *jremref; 8593 struct inodedep *inodedep; 8594 { 8595 8596 if (inodedep == NULL) 8597 if (inodedep_lookup(jremref->jr_list.wk_mp, 8598 jremref->jr_ref.if_ino, 0, &inodedep) == 0) 8599 panic("journal_jremref: Lost inodedep"); 8600 LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); 8601 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 8602 add_to_journal(&jremref->jr_list); 8603 } 8604 8605 static void 8606 dirrem_journal(dirrem, jremref, dotremref, dotdotremref) 8607 struct dirrem *dirrem; 8608 struct jremref *jremref; 8609 struct jremref *dotremref; 8610 struct jremref *dotdotremref; 8611 { 8612 struct inodedep *inodedep; 8613 8614 8615 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, 8616 &inodedep) == 0) 8617 panic("dirrem_journal: Lost inodedep"); 8618 journal_jremref(dirrem, jremref, inodedep); 8619 if (dotremref) 8620 journal_jremref(dirrem, dotremref, inodedep); 8621 if (dotdotremref) 8622 journal_jremref(dirrem, dotdotremref, NULL); 8623 } 
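/*
 * Reader's sketch (illustration only, using the helpers above): for a
 * journaled rmdir, newdirrem() below allocates three remove references
 * and, when no conflicting diradd is found, records them with
 *
 *	dirrem_journal(dirrem, jremref, dotremref, dotdotremref);
 *
 * where jremref covers the directory's own name in its parent, dotremref
 * the "." entry and dotdotremref the ".." entry.  A plain unlink
 * allocates only the single jremref for the removed name.
 */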
8624 8625 /* 8626 * Allocate a new dirrem if appropriate and return it along with 8627 * its associated pagedep. Called without a lock, returns with lock. 8628 */ 8629 static struct dirrem * 8630 newdirrem(bp, dp, ip, isrmdir, prevdirremp) 8631 struct buf *bp; /* buffer containing directory block */ 8632 struct inode *dp; /* inode for the directory being modified */ 8633 struct inode *ip; /* inode for directory entry being removed */ 8634 int isrmdir; /* indicates if doing RMDIR */ 8635 struct dirrem **prevdirremp; /* previously referenced inode, if any */ 8636 { 8637 int offset; 8638 ufs_lbn_t lbn; 8639 struct diradd *dap; 8640 struct dirrem *dirrem; 8641 struct pagedep *pagedep; 8642 struct jremref *jremref; 8643 struct jremref *dotremref; 8644 struct jremref *dotdotremref; 8645 struct vnode *dvp; 8646 8647 /* 8648 * Whiteouts have no deletion dependencies. 8649 */ 8650 if (ip == NULL) 8651 panic("newdirrem: whiteout"); 8652 dvp = ITOV(dp); 8653 /* 8654 * If we are over our limit, try to improve the situation. 8655 * Limiting the number of dirrem structures will also limit 8656 * the number of freefile and freeblks structures. 8657 */ 8658 ACQUIRE_LOCK(&lk); 8659 if (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2) 8660 (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS); 8661 FREE_LOCK(&lk); 8662 dirrem = malloc(sizeof(struct dirrem), 8663 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); 8664 workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); 8665 LIST_INIT(&dirrem->dm_jremrefhd); 8666 LIST_INIT(&dirrem->dm_jwork); 8667 dirrem->dm_state = isrmdir ? RMDIR : 0; 8668 dirrem->dm_oldinum = ip->i_number; 8669 *prevdirremp = NULL; 8670 /* 8671 * Allocate remove reference structures to track journal write 8672 * dependencies. We will always have one for the link and 8673 * when doing directories we will always have one more for dot. 8674 * When renaming a directory we skip the dotdot link change so 8675 * this is not needed. 8676 */ 8677 jremref = dotremref = dotdotremref = NULL; 8678 if (DOINGSUJ(dvp)) { 8679 if (isrmdir) { 8680 jremref = newjremref(dirrem, dp, ip, dp->i_offset, 8681 ip->i_effnlink + 2); 8682 dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, 8683 ip->i_effnlink + 1); 8684 dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, 8685 dp->i_effnlink + 1); 8686 dotdotremref->jr_state |= MKDIR_PARENT; 8687 } else 8688 jremref = newjremref(dirrem, dp, ip, dp->i_offset, 8689 ip->i_effnlink + 1); 8690 } 8691 ACQUIRE_LOCK(&lk); 8692 lbn = lblkno(dp->i_fs, dp->i_offset); 8693 offset = blkoff(dp->i_fs, dp->i_offset); 8694 pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC, 8695 &pagedep); 8696 dirrem->dm_pagedep = pagedep; 8697 dirrem->dm_offset = offset; 8698 /* 8699 * If we're renaming a .. link to a new directory, cancel any 8700 * existing MKDIR_PARENT mkdir. If it has already been canceled 8701 * the jremref is preserved for any potential diradd in this 8702 * location. This can not coincide with a rmdir. 8703 */ 8704 if (dp->i_offset == DOTDOT_OFFSET) { 8705 if (isrmdir) 8706 panic("newdirrem: .. directory change during remove?"); 8707 jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); 8708 } 8709 /* 8710 * If we're removing a directory search for the .. dependency now and 8711 * cancel it. Any pending journal work will be added to the dirrem 8712 * to be completed when the workitem remove completes. 
8713 */ 8714 if (isrmdir) 8715 dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); 8716 /* 8717 * Check for a diradd dependency for the same directory entry. 8718 * If present, then both dependencies become obsolete and can 8719 * be de-allocated. 8720 */ 8721 dap = diradd_lookup(pagedep, offset); 8722 if (dap == NULL) { 8723 /* 8724 * Link the jremref structures into the dirrem so they are 8725 * written prior to the pagedep. 8726 */ 8727 if (jremref) 8728 dirrem_journal(dirrem, jremref, dotremref, 8729 dotdotremref); 8730 return (dirrem); 8731 } 8732 /* 8733 * Must be ATTACHED at this point. 8734 */ 8735 if ((dap->da_state & ATTACHED) == 0) 8736 panic("newdirrem: not ATTACHED"); 8737 if (dap->da_newinum != ip->i_number) 8738 panic("newdirrem: inum %d should be %d", 8739 ip->i_number, dap->da_newinum); 8740 /* 8741 * If we are deleting a changed name that never made it to disk, 8742 * then return the dirrem describing the previous inode (which 8743 * represents the inode currently referenced from this entry on disk). 8744 */ 8745 if ((dap->da_state & DIRCHG) != 0) { 8746 *prevdirremp = dap->da_previous; 8747 dap->da_state &= ~DIRCHG; 8748 dap->da_pagedep = pagedep; 8749 } 8750 /* 8751 * We are deleting an entry that never made it to disk. 8752 * Mark it COMPLETE so we can delete its inode immediately. 8753 */ 8754 dirrem->dm_state |= COMPLETE; 8755 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); 8756 #ifdef SUJ_DEBUG 8757 if (isrmdir == 0) { 8758 struct worklist *wk; 8759 8760 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 8761 if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) 8762 panic("bad wk %p (0x%X)\n", wk, wk->wk_state); 8763 } 8764 #endif 8765 8766 return (dirrem); 8767 } 8768 8769 /* 8770 * Directory entry change dependencies. 8771 * 8772 * Changing an existing directory entry requires that an add operation 8773 * be completed first followed by a deletion. The semantics for the addition 8774 * are identical to the description of adding a new entry above except 8775 * that the rollback is to the old inode number rather than zero. Once 8776 * the addition dependency is completed, the removal is done as described 8777 * in the removal routine above. 8778 */ 8779 8780 /* 8781 * This routine should be called immediately after changing 8782 * a directory entry. The inode's link count should not be 8783 * decremented by the calling procedure -- the soft updates 8784 * code will perform this task when it is safe. 8785 */ 8786 void 8787 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 8788 struct buf *bp; /* buffer containing directory block */ 8789 struct inode *dp; /* inode for the directory being modified */ 8790 struct inode *ip; /* inode for directory entry being removed */ 8791 ino_t newinum; /* new inode number for changed entry */ 8792 int isrmdir; /* indicates if doing RMDIR */ 8793 { 8794 int offset; 8795 struct diradd *dap = NULL; 8796 struct dirrem *dirrem, *prevdirrem; 8797 struct pagedep *pagedep; 8798 struct inodedep *inodedep; 8799 struct jaddref *jaddref; 8800 struct mount *mp; 8801 8802 offset = blkoff(dp->i_fs, dp->i_offset); 8803 mp = UFSTOVFS(dp->i_ump); 8804 8805 /* 8806 * Whiteouts do not need diradd dependencies. 
8807 */ 8808 if (newinum != WINO) { 8809 dap = malloc(sizeof(struct diradd), 8810 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); 8811 workitem_alloc(&dap->da_list, D_DIRADD, mp); 8812 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; 8813 dap->da_offset = offset; 8814 dap->da_newinum = newinum; 8815 LIST_INIT(&dap->da_jwork); 8816 } 8817 8818 /* 8819 * Allocate a new dirrem and ACQUIRE_LOCK. 8820 */ 8821 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 8822 pagedep = dirrem->dm_pagedep; 8823 /* 8824 * The possible values for isrmdir: 8825 * 0 - non-directory file rename 8826 * 1 - directory rename within same directory 8827 * inum - directory rename to new directory of given inode number 8828 * When renaming to a new directory, we are both deleting and 8829 * creating a new directory entry, so the link count on the new 8830 * directory should not change. Thus we do not need the followup 8831 * dirrem which is usually done in handle_workitem_remove. We set 8832 * the DIRCHG flag to tell handle_workitem_remove to skip the 8833 * followup dirrem. 8834 */ 8835 if (isrmdir > 1) 8836 dirrem->dm_state |= DIRCHG; 8837 8838 /* 8839 * Whiteouts have no additional dependencies, 8840 * so just put the dirrem on the correct list. 8841 */ 8842 if (newinum == WINO) { 8843 if ((dirrem->dm_state & COMPLETE) == 0) { 8844 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, 8845 dm_next); 8846 } else { 8847 dirrem->dm_dirinum = pagedep->pd_ino; 8848 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 8849 add_to_worklist(&dirrem->dm_list, 0); 8850 } 8851 FREE_LOCK(&lk); 8852 return; 8853 } 8854 /* 8855 * Add the dirrem to the inodedep's pending remove list for quick 8856 * discovery later. A valid nlinkdelta ensures that this lookup 8857 * will not fail. 8858 */ 8859 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 8860 panic("softdep_setup_directory_change: Lost inodedep."); 8861 dirrem->dm_state |= ONDEPLIST; 8862 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 8863 8864 /* 8865 * If the COMPLETE flag is clear, then there were no active 8866 * entries and we want to roll back to the previous inode until 8867 * the new inode is committed to disk. If the COMPLETE flag is 8868 * set, then we have deleted an entry that never made it to disk. 8869 * If the entry we deleted resulted from a name change, then the old 8870 * inode reference still resides on disk. Any rollback that we do 8871 * needs to be to that old inode (returned to us in prevdirrem). If 8872 * the entry we deleted resulted from a create, then there is 8873 * no entry on the disk, so we want to roll back to zero rather 8874 * than the uncommitted inode. In either of the COMPLETE cases we 8875 * want to immediately free the unwritten and unreferenced inode. 8876 */ 8877 if ((dirrem->dm_state & COMPLETE) == 0) { 8878 dap->da_previous = dirrem; 8879 } else { 8880 if (prevdirrem != NULL) { 8881 dap->da_previous = prevdirrem; 8882 } else { 8883 dap->da_state &= ~DIRCHG; 8884 dap->da_pagedep = pagedep; 8885 } 8886 dirrem->dm_dirinum = pagedep->pd_ino; 8887 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 8888 add_to_worklist(&dirrem->dm_list, 0); 8889 } 8890 /* 8891 * Lookup the jaddref for this journal entry. We must finish 8892 * initializing it and make the diradd write dependent on it. 8893 * If we're not journaling, put it on the id_bufwait list if the 8894 * inode is not yet written. If it is written, do the post-inode 8895 * write processing to put it on the id_pendinghd list. 
8896 */ 8897 inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep); 8898 if (MOUNTEDSUJ(mp)) { 8899 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 8900 inoreflst); 8901 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 8902 ("softdep_setup_directory_change: bad jaddref %p", 8903 jaddref)); 8904 jaddref->ja_diroff = dp->i_offset; 8905 jaddref->ja_diradd = dap; 8906 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], 8907 dap, da_pdlist); 8908 add_to_journal(&jaddref->ja_list); 8909 } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 8910 dap->da_state |= COMPLETE; 8911 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 8912 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 8913 } else { 8914 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], 8915 dap, da_pdlist); 8916 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 8917 } 8918 /* 8919 * If we're making a new name for a directory that has not been 8920 * committed, we need to move the dot and dotdot references to 8921 * this new name. 8922 */ 8923 if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) 8924 merge_diradd(inodedep, dap); 8925 FREE_LOCK(&lk); 8926 } 8927 8928 /* 8929 * Called whenever the link count on an inode is changed. 8930 * It creates an inode dependency so that the new reference(s) 8931 * to the inode cannot be committed to disk until the updated 8932 * inode has been written. 8933 */ 8934 void 8935 softdep_change_linkcnt(ip) 8936 struct inode *ip; /* the inode with the increased link count */ 8937 { 8938 struct inodedep *inodedep; 8939 int dflags; 8940 8941 ACQUIRE_LOCK(&lk); 8942 dflags = DEPALLOC; 8943 if (IS_SNAPSHOT(ip)) 8944 dflags |= NODELAY; 8945 inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep); 8946 if (ip->i_nlink < ip->i_effnlink) 8947 panic("softdep_change_linkcnt: bad delta"); 8948 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 8949 FREE_LOCK(&lk); 8950 } 8951 8952 /* 8953 * Attach a sbdep dependency to the superblock buf so that we can keep 8954 * track of the head of the linked list of referenced but unlinked inodes. 8955 */ 8956 void 8957 softdep_setup_sbupdate(ump, fs, bp) 8958 struct ufsmount *ump; 8959 struct fs *fs; 8960 struct buf *bp; 8961 { 8962 struct sbdep *sbdep; 8963 struct worklist *wk; 8964 8965 if (MOUNTEDSUJ(UFSTOVFS(ump)) == 0) 8966 return; 8967 LIST_FOREACH(wk, &bp->b_dep, wk_list) 8968 if (wk->wk_type == D_SBDEP) 8969 break; 8970 if (wk != NULL) 8971 return; 8972 sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS); 8973 workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump)); 8974 sbdep->sb_fs = fs; 8975 sbdep->sb_ump = ump; 8976 ACQUIRE_LOCK(&lk); 8977 WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list); 8978 FREE_LOCK(&lk); 8979 } 8980 8981 /* 8982 * Return the first unlinked inodedep which is ready to be the head of the 8983 * list. The inodedep and all those after it must have valid next pointers.
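 *
 * Reader's illustration (inode numbers made up): the superblock's
 * fs_sujfree names the first unlinked but still referenced inode and
 * each such inode names the next, e.g.
 *
 *	fs_sujfree -> ino 17 -> ino 93 -> ino 204 -> 0
 *
 * The UNLINKNEXT/UNLINKPREV bits record which of these on-disk pointers
 * have been written; the routine below walks back from the tail and
 * returns the earliest inodedep such that it and everything after it
 * have valid next pointers, i.e. the one that may become the new head.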
8984 */ 8985 static struct inodedep * 8986 first_unlinked_inodedep(ump) 8987 struct ufsmount *ump; 8988 { 8989 struct inodedep *inodedep; 8990 struct inodedep *idp; 8991 8992 mtx_assert(&lk, MA_OWNED); 8993 for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); 8994 inodedep; inodedep = idp) { 8995 if ((inodedep->id_state & UNLINKNEXT) == 0) 8996 return (NULL); 8997 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 8998 if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) 8999 break; 9000 if ((inodedep->id_state & UNLINKPREV) == 0) 9001 break; 9002 } 9003 return (inodedep); 9004 } 9005 9006 /* 9007 * Set the sujfree unlinked head pointer prior to writing a superblock. 9008 */ 9009 static void 9010 initiate_write_sbdep(sbdep) 9011 struct sbdep *sbdep; 9012 { 9013 struct inodedep *inodedep; 9014 struct fs *bpfs; 9015 struct fs *fs; 9016 9017 bpfs = sbdep->sb_fs; 9018 fs = sbdep->sb_ump->um_fs; 9019 inodedep = first_unlinked_inodedep(sbdep->sb_ump); 9020 if (inodedep) { 9021 fs->fs_sujfree = inodedep->id_ino; 9022 inodedep->id_state |= UNLINKPREV; 9023 } else 9024 fs->fs_sujfree = 0; 9025 bpfs->fs_sujfree = fs->fs_sujfree; 9026 } 9027 9028 /* 9029 * After a superblock is written determine whether it must be written again 9030 * due to a changing unlinked list head. 9031 */ 9032 static int 9033 handle_written_sbdep(sbdep, bp) 9034 struct sbdep *sbdep; 9035 struct buf *bp; 9036 { 9037 struct inodedep *inodedep; 9038 struct mount *mp; 9039 struct fs *fs; 9040 9041 mtx_assert(&lk, MA_OWNED); 9042 fs = sbdep->sb_fs; 9043 mp = UFSTOVFS(sbdep->sb_ump); 9044 /* 9045 * If the superblock doesn't match the in-memory list start over. 9046 */ 9047 inodedep = first_unlinked_inodedep(sbdep->sb_ump); 9048 if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || 9049 (inodedep == NULL && fs->fs_sujfree != 0)) { 9050 bdirty(bp); 9051 return (1); 9052 } 9053 WORKITEM_FREE(sbdep, D_SBDEP); 9054 if (fs->fs_sujfree == 0) 9055 return (0); 9056 /* 9057 * Now that we have a record of this inode in stable store allow it 9058 * to be written to free up pending work. Inodes may see a lot of 9059 * write activity after they are unlinked which we must not hold up. 9060 */ 9061 for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { 9062 if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) 9063 panic("handle_written_sbdep: Bad inodedep %p (0x%X)", 9064 inodedep, inodedep->id_state); 9065 if (inodedep->id_state & UNLINKONLIST) 9066 break; 9067 inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST; 9068 } 9069 9070 return (0); 9071 } 9072 9073 /* 9074 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list. 9075 */ 9076 static void 9077 unlinked_inodedep(mp, inodedep) 9078 struct mount *mp; 9079 struct inodedep *inodedep; 9080 { 9081 struct ufsmount *ump; 9082 9083 mtx_assert(&lk, MA_OWNED); 9084 if (MOUNTEDSUJ(mp) == 0) 9085 return; 9086 ump = VFSTOUFS(mp); 9087 ump->um_fs->fs_fmod = 1; 9088 if (inodedep->id_state & UNLINKED) 9089 panic("unlinked_inodedep: %p already unlinked\n", inodedep); 9090 inodedep->id_state |= UNLINKED; 9091 TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); 9092 } 9093 9094 /* 9095 * Remove an inodedep from the unlinked inodedep list. This may require 9096 * disk writes if the inode has made it that far. 
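 *
 * In outline (an illustrative paraphrase of the loop below): while the
 * entry still has on-disk links, find whoever names it on disk -- the
 * predecessor inode's di_freelink or, failing that, the superblock's
 * fs_sujfree -- rewrite that pointer to skip this inode, and re-validate
 * the list after re-acquiring the lock, since it may have changed while
 * the buffer was being locked and written.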
9097 */ 9098 static void 9099 clear_unlinked_inodedep(inodedep) 9100 struct inodedep *inodedep; 9101 { 9102 struct ufsmount *ump; 9103 struct inodedep *idp; 9104 struct inodedep *idn; 9105 struct fs *fs; 9106 struct buf *bp; 9107 ino_t ino; 9108 ino_t nino; 9109 ino_t pino; 9110 int error; 9111 9112 ump = VFSTOUFS(inodedep->id_list.wk_mp); 9113 fs = ump->um_fs; 9114 ino = inodedep->id_ino; 9115 error = 0; 9116 for (;;) { 9117 mtx_assert(&lk, MA_OWNED); 9118 KASSERT((inodedep->id_state & UNLINKED) != 0, 9119 ("clear_unlinked_inodedep: inodedep %p not unlinked", 9120 inodedep)); 9121 /* 9122 * If nothing has yet been written simply remove us from 9123 * the in memory list and return. This is the most common 9124 * case where handle_workitem_remove() loses the final 9125 * reference. 9126 */ 9127 if ((inodedep->id_state & UNLINKLINKS) == 0) 9128 break; 9129 /* 9130 * If we have a NEXT pointer and no PREV pointer we can simply 9131 * clear NEXT's PREV and remove ourselves from the list. Be 9132 * careful not to clear PREV if the superblock points at 9133 * next as well. 9134 */ 9135 idn = TAILQ_NEXT(inodedep, id_unlinked); 9136 if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { 9137 if (idn && fs->fs_sujfree != idn->id_ino) 9138 idn->id_state &= ~UNLINKPREV; 9139 break; 9140 } 9141 /* 9142 * Here we have an inodedep which is actually linked into 9143 * the list. We must remove it by forcing a write to the 9144 * link before us, whether it be the superblock or an inode. 9145 * Unfortunately the list may change while we're waiting 9146 * on the buf lock for either resource so we must loop until 9147 * we lock the right one. If both the superblock and an 9148 * inode point to this inode we must clear the inode first 9149 * followed by the superblock. 9150 */ 9151 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 9152 pino = 0; 9153 if (idp && (idp->id_state & UNLINKNEXT)) 9154 pino = idp->id_ino; 9155 FREE_LOCK(&lk); 9156 if (pino == 0) 9157 bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), 9158 (int)fs->fs_sbsize, 0, 0, 0); 9159 else 9160 error = bread(ump->um_devvp, 9161 fsbtodb(fs, ino_to_fsba(fs, pino)), 9162 (int)fs->fs_bsize, NOCRED, &bp); 9163 ACQUIRE_LOCK(&lk); 9164 if (error) 9165 break; 9166 /* If the list has changed restart the loop. */ 9167 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 9168 nino = 0; 9169 if (idp && (idp->id_state & UNLINKNEXT)) 9170 nino = idp->id_ino; 9171 if (nino != pino || 9172 (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { 9173 FREE_LOCK(&lk); 9174 brelse(bp); 9175 ACQUIRE_LOCK(&lk); 9176 continue; 9177 } 9178 nino = 0; 9179 idn = TAILQ_NEXT(inodedep, id_unlinked); 9180 if (idn) 9181 nino = idn->id_ino; 9182 /* 9183 * Remove us from the in memory list. After this we cannot 9184 * access the inodedep. 9185 */ 9186 KASSERT((inodedep->id_state & UNLINKED) != 0, 9187 ("clear_unlinked_inodedep: inodedep %p not unlinked", 9188 inodedep)); 9189 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST); 9190 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); 9191 FREE_LOCK(&lk); 9192 /* 9193 * The predecessor's next pointer is manually updated here 9194 * so that the NEXT flag is never cleared for an element 9195 * that is in the list. 
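 *
 * Concretely (matching the code below): when the predecessor is the
 * superblock, the in-core fs is copied into the buffer and a new sbdep is
 * attached via softdep_setup_sbupdate(); otherwise the predecessor's
 * on-disk di_freelink is pointed past the departing inode, at nino.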
9196 */ 9197 if (pino == 0) { 9198 bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); 9199 ffs_oldfscompat_write((struct fs *)bp->b_data, ump); 9200 softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, 9201 bp); 9202 } else if (fs->fs_magic == FS_UFS1_MAGIC) 9203 ((struct ufs1_dinode *)bp->b_data + 9204 ino_to_fsbo(fs, pino))->di_freelink = nino; 9205 else 9206 ((struct ufs2_dinode *)bp->b_data + 9207 ino_to_fsbo(fs, pino))->di_freelink = nino; 9208 /* 9209 * If the bwrite fails we have no recourse to recover. The 9210 * filesystem is corrupted already. 9211 */ 9212 bwrite(bp); 9213 ACQUIRE_LOCK(&lk); 9214 /* 9215 * If the superblock pointer still needs to be cleared force 9216 * a write here. 9217 */ 9218 if (fs->fs_sujfree == ino) { 9219 FREE_LOCK(&lk); 9220 bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), 9221 (int)fs->fs_sbsize, 0, 0, 0); 9222 bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); 9223 ffs_oldfscompat_write((struct fs *)bp->b_data, ump); 9224 softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, 9225 bp); 9226 bwrite(bp); 9227 ACQUIRE_LOCK(&lk); 9228 } 9229 9230 if (fs->fs_sujfree != ino) 9231 return; 9232 panic("clear_unlinked_inodedep: Failed to clear free head"); 9233 } 9234 if (inodedep->id_ino == fs->fs_sujfree) 9235 panic("clear_unlinked_inodedep: Freeing head of free list"); 9236 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST); 9237 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); 9238 return; 9239 } 9240 9241 /* 9242 * This workitem decrements the inode's link count. 9243 * If the link count reaches zero, the file is removed. 9244 */ 9245 static int 9246 handle_workitem_remove(dirrem, flags) 9247 struct dirrem *dirrem; 9248 int flags; 9249 { 9250 struct inodedep *inodedep; 9251 struct workhead dotdotwk; 9252 struct worklist *wk; 9253 struct ufsmount *ump; 9254 struct mount *mp; 9255 struct vnode *vp; 9256 struct inode *ip; 9257 ino_t oldinum; 9258 9259 if (dirrem->dm_state & ONWORKLIST) 9260 panic("handle_workitem_remove: dirrem %p still on worklist", 9261 dirrem); 9262 oldinum = dirrem->dm_oldinum; 9263 mp = dirrem->dm_list.wk_mp; 9264 ump = VFSTOUFS(mp); 9265 flags |= LK_EXCLUSIVE; 9266 if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0) 9267 return (EBUSY); 9268 ip = VTOI(vp); 9269 ACQUIRE_LOCK(&lk); 9270 if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) 9271 panic("handle_workitem_remove: lost inodedep"); 9272 if (dirrem->dm_state & ONDEPLIST) 9273 LIST_REMOVE(dirrem, dm_inonext); 9274 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), 9275 ("handle_workitem_remove: Journal entries not written.")); 9276 9277 /* 9278 * Move all dependencies waiting on the remove to complete 9279 * from the dirrem to the inode inowait list to be completed 9280 * after the inode has been updated and written to disk. Any 9281 * marked MKDIR_PARENT are saved to be completed when the .. ref 9282 * is removed. 9283 */ 9284 LIST_INIT(&dotdotwk); 9285 while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { 9286 WORKLIST_REMOVE(wk); 9287 if (wk->wk_state & MKDIR_PARENT) { 9288 wk->wk_state &= ~MKDIR_PARENT; 9289 WORKLIST_INSERT(&dotdotwk, wk); 9290 continue; 9291 } 9292 WORKLIST_INSERT(&inodedep->id_inowait, wk); 9293 } 9294 LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); 9295 /* 9296 * Normal file deletion. 
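 * In outline (illustrative): i_nlink is dropped by one, the inode is
 * flagged IN_CHANGE, and if the count reaches zero the inodedep is placed
 * on the unlinked list before the dirrem is freed and the inode updated.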
9297 */ 9298 if ((dirrem->dm_state & RMDIR) == 0) { 9299 ip->i_nlink--; 9300 DIP_SET(ip, i_nlink, ip->i_nlink); 9301 ip->i_flag |= IN_CHANGE; 9302 if (ip->i_nlink < ip->i_effnlink) 9303 panic("handle_workitem_remove: bad file delta"); 9304 if (ip->i_nlink == 0) 9305 unlinked_inodedep(mp, inodedep); 9306 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 9307 KASSERT(LIST_EMPTY(&dirrem->dm_jwork), 9308 ("handle_workitem_remove: worklist not empty. %s", 9309 TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); 9310 WORKITEM_FREE(dirrem, D_DIRREM); 9311 FREE_LOCK(&lk); 9312 goto out; 9313 } 9314 /* 9315 * Directory deletion. Decrement reference count for both the 9316 * just deleted parent directory entry and the reference for ".". 9317 * Arrange to have the reference count on the parent decremented 9318 * to account for the loss of "..". 9319 */ 9320 ip->i_nlink -= 2; 9321 DIP_SET(ip, i_nlink, ip->i_nlink); 9322 ip->i_flag |= IN_CHANGE; 9323 if (ip->i_nlink < ip->i_effnlink) 9324 panic("handle_workitem_remove: bad dir delta"); 9325 if (ip->i_nlink == 0) 9326 unlinked_inodedep(mp, inodedep); 9327 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 9328 /* 9329 * Rename a directory to a new parent. Since, we are both deleting 9330 * and creating a new directory entry, the link count on the new 9331 * directory should not change. Thus we skip the followup dirrem. 9332 */ 9333 if (dirrem->dm_state & DIRCHG) { 9334 KASSERT(LIST_EMPTY(&dirrem->dm_jwork), 9335 ("handle_workitem_remove: DIRCHG and worklist not empty.")); 9336 WORKITEM_FREE(dirrem, D_DIRREM); 9337 FREE_LOCK(&lk); 9338 goto out; 9339 } 9340 dirrem->dm_state = ONDEPLIST; 9341 dirrem->dm_oldinum = dirrem->dm_dirinum; 9342 /* 9343 * Place the dirrem on the parent's diremhd list. 9344 */ 9345 if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) 9346 panic("handle_workitem_remove: lost dir inodedep"); 9347 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 9348 /* 9349 * If the allocated inode has never been written to disk, then 9350 * the on-disk inode is zero'ed and we can remove the file 9351 * immediately. When journaling if the inode has been marked 9352 * unlinked and not DEPCOMPLETE we know it can never be written. 9353 */ 9354 inodedep_lookup(mp, oldinum, 0, &inodedep); 9355 if (inodedep == NULL || 9356 (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || 9357 check_inode_unwritten(inodedep)) { 9358 FREE_LOCK(&lk); 9359 vput(vp); 9360 return handle_workitem_remove(dirrem, flags); 9361 } 9362 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); 9363 FREE_LOCK(&lk); 9364 ip->i_flag |= IN_CHANGE; 9365 out: 9366 ffs_update(vp, 0); 9367 vput(vp); 9368 return (0); 9369 } 9370 9371 /* 9372 * Inode de-allocation dependencies. 9373 * 9374 * When an inode's link count is reduced to zero, it can be de-allocated. We 9375 * found it convenient to postpone de-allocation until after the inode is 9376 * written to disk with its new link count (zero). At this point, all of the 9377 * on-disk inode's block pointers are nullified and, with careful dependency 9378 * list ordering, all dependencies related to the inode will be satisfied and 9379 * the corresponding dependency structures de-allocated. So, if/when the 9380 * inode is reused, there will be no mixing of old dependencies with new 9381 * ones. This artificial dependency is set up by the block de-allocation 9382 * procedure above (softdep_setup_freeblocks) and completed by the 9383 * following procedure. 
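 *
 * A minimal timeline (illustrative only):
 *
 *	1. the link count reaches zero and softdep_setup_freeblocks() nulls
 *	   the block pointers, queueing a freefile work item;
 *	2. the inode block is written with its zero link count;
 *	3. handle_workitem_freefile() below finally calls ffs_freefile() to
 *	   return the inode to the cylinder-group map.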
9384 */ 9385 static void 9386 handle_workitem_freefile(freefile) 9387 struct freefile *freefile; 9388 { 9389 struct workhead wkhd; 9390 struct fs *fs; 9391 struct inodedep *idp; 9392 struct ufsmount *ump; 9393 int error; 9394 9395 ump = VFSTOUFS(freefile->fx_list.wk_mp); 9396 fs = ump->um_fs; 9397 #ifdef DEBUG 9398 ACQUIRE_LOCK(&lk); 9399 error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); 9400 FREE_LOCK(&lk); 9401 if (error) 9402 panic("handle_workitem_freefile: inodedep %p survived", idp); 9403 #endif 9404 UFS_LOCK(ump); 9405 fs->fs_pendinginodes -= 1; 9406 UFS_UNLOCK(ump); 9407 LIST_INIT(&wkhd); 9408 LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); 9409 if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, 9410 freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) 9411 softdep_error("handle_workitem_freefile", error); 9412 ACQUIRE_LOCK(&lk); 9413 WORKITEM_FREE(freefile, D_FREEFILE); 9414 FREE_LOCK(&lk); 9415 } 9416 9417 9418 /* 9419 * Helper function which unlinks marker element from work list and returns 9420 * the next element on the list. 9421 */ 9422 static __inline struct worklist * 9423 markernext(struct worklist *marker) 9424 { 9425 struct worklist *next; 9426 9427 next = LIST_NEXT(marker, wk_list); 9428 LIST_REMOVE(marker, wk_list); 9429 return next; 9430 } 9431 9432 /* 9433 * Disk writes. 9434 * 9435 * The dependency structures constructed above are most actively used when file 9436 * system blocks are written to disk. No constraints are placed on when a 9437 * block can be written, but unsatisfied update dependencies are made safe by 9438 * modifying (or replacing) the source memory for the duration of the disk 9439 * write. When the disk write completes, the memory block is again brought 9440 * up-to-date. 9441 * 9442 * In-core inode structure reclamation. 9443 * 9444 * Because there are a finite number of "in-core" inode structures, they are 9445 * reused regularly. By transferring all inode-related dependencies to the 9446 * in-memory inode block and indexing them separately (via "inodedep"s), we 9447 * can allow "in-core" inode structures to be reused at any time and avoid 9448 * any increase in contention. 9449 * 9450 * Called just before entering the device driver to initiate a new disk I/O. 9451 * The buffer must be locked, thus, no I/O completion operations can occur 9452 * while we are manipulating its associated dependencies. 9453 */ 9454 static void 9455 softdep_disk_io_initiation(bp) 9456 struct buf *bp; /* structure describing disk write to occur */ 9457 { 9458 struct worklist *wk; 9459 struct worklist marker; 9460 struct inodedep *inodedep; 9461 struct freeblks *freeblks; 9462 struct jblkdep *jblkdep; 9463 struct newblk *newblk; 9464 9465 /* 9466 * We only care about write operations. There should never 9467 * be dependencies for reads. 9468 */ 9469 if (bp->b_iocmd != BIO_WRITE) 9470 panic("softdep_disk_io_initiation: not write"); 9471 9472 if (bp->b_vflags & BV_BKGRDINPROG) 9473 panic("softdep_disk_io_initiation: Writing buffer with " 9474 "background write in progress: %p", bp); 9475 9476 marker.wk_type = D_LAST + 1; /* Not a normal workitem */ 9477 PHOLD(curproc); /* Don't swap out kernel stack */ 9478 9479 ACQUIRE_LOCK(&lk); 9480 /* 9481 * Do any necessary pre-I/O processing. 
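 * The marker work item is threaded through b_dep so that the walk survives
 * entries disappearing while lk is dropped in jwait(); in sketch form:
 *
 *	for (wk = first; wk != NULL; wk = markernext(&marker))
 *		re-insert the marker after (or, when waiting, before) wk
 *		and dispatch on wk->wk_type.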
9482 */ 9483 for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; 9484 wk = markernext(&marker)) { 9485 LIST_INSERT_AFTER(wk, &marker, wk_list); 9486 switch (wk->wk_type) { 9487 9488 case D_PAGEDEP: 9489 initiate_write_filepage(WK_PAGEDEP(wk), bp); 9490 continue; 9491 9492 case D_INODEDEP: 9493 inodedep = WK_INODEDEP(wk); 9494 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) 9495 initiate_write_inodeblock_ufs1(inodedep, bp); 9496 else 9497 initiate_write_inodeblock_ufs2(inodedep, bp); 9498 continue; 9499 9500 case D_INDIRDEP: 9501 initiate_write_indirdep(WK_INDIRDEP(wk), bp); 9502 continue; 9503 9504 case D_BMSAFEMAP: 9505 initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); 9506 continue; 9507 9508 case D_JSEG: 9509 WK_JSEG(wk)->js_buf = NULL; 9510 continue; 9511 9512 case D_FREEBLKS: 9513 freeblks = WK_FREEBLKS(wk); 9514 jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd); 9515 /* 9516 * We have to wait for the freeblks to be journaled 9517 * before we can write an inodeblock with updated 9518 * pointers. Be careful to arrange the marker so 9519 * we revisit the freeblks if it's not removed by 9520 * the first jwait(). 9521 */ 9522 if (jblkdep != NULL) { 9523 LIST_REMOVE(&marker, wk_list); 9524 LIST_INSERT_BEFORE(wk, &marker, wk_list); 9525 jwait(&jblkdep->jb_list, MNT_WAIT); 9526 } 9527 continue; 9528 case D_ALLOCDIRECT: 9529 case D_ALLOCINDIR: 9530 /* 9531 * We have to wait for the jnewblk to be journaled 9532 * before we can write to a block if the contents 9533 * may be confused with an earlier file's indirect 9534 * at recovery time. Handle the marker as described 9535 * above. 9536 */ 9537 newblk = WK_NEWBLK(wk); 9538 if (newblk->nb_jnewblk != NULL && 9539 indirblk_lookup(newblk->nb_list.wk_mp, 9540 newblk->nb_newblkno)) { 9541 LIST_REMOVE(&marker, wk_list); 9542 LIST_INSERT_BEFORE(wk, &marker, wk_list); 9543 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 9544 } 9545 continue; 9546 9547 case D_SBDEP: 9548 initiate_write_sbdep(WK_SBDEP(wk)); 9549 continue; 9550 9551 case D_MKDIR: 9552 case D_FREEWORK: 9553 case D_FREEDEP: 9554 case D_JSEGDEP: 9555 continue; 9556 9557 default: 9558 panic("handle_disk_io_initiation: Unexpected type %s", 9559 TYPENAME(wk->wk_type)); 9560 /* NOTREACHED */ 9561 } 9562 } 9563 FREE_LOCK(&lk); 9564 PRELE(curproc); /* Allow swapout of kernel stack */ 9565 } 9566 9567 /* 9568 * Called from within the procedure above to deal with unsatisfied 9569 * allocation dependencies in a directory. The buffer must be locked, 9570 * thus, no I/O completion operations can occur while we are 9571 * manipulating its associated dependencies. 9572 */ 9573 static void 9574 initiate_write_filepage(pagedep, bp) 9575 struct pagedep *pagedep; 9576 struct buf *bp; 9577 { 9578 struct jremref *jremref; 9579 struct jmvref *jmvref; 9580 struct dirrem *dirrem; 9581 struct diradd *dap; 9582 struct direct *ep; 9583 int i; 9584 9585 if (pagedep->pd_state & IOSTARTED) { 9586 /* 9587 * This can only happen if there is a driver that does not 9588 * understand chaining. Here biodone will reissue the call 9589 * to strategy for the incomplete buffers. 9590 */ 9591 printf("initiate_write_filepage: already started\n"); 9592 return; 9593 } 9594 pagedep->pd_state |= IOSTARTED; 9595 /* 9596 * Wait for all journal remove dependencies to hit the disk. 9597 * We can not allow any potentially conflicting directory adds 9598 * to be visible before removes and rollback is too difficult. 9599 * lk may be dropped and re-acquired, however we hold the buf 9600 * locked so the dependency can not go away. 
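 *
 * The rollback that follows is, in outline (illustrative): every pending
 * diradd on this page has its on-disk d_ino undone -- back to the old
 * inode number for a DIRCHG rename, or to zero for a plain create -- and
 * the diradd is marked UNDONE until the write completes.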
9601 */ 9602 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) 9603 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) 9604 jwait(&jremref->jr_list, MNT_WAIT); 9605 while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) 9606 jwait(&jmvref->jm_list, MNT_WAIT); 9607 for (i = 0; i < DAHASHSZ; i++) { 9608 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { 9609 ep = (struct direct *) 9610 ((char *)bp->b_data + dap->da_offset); 9611 if (ep->d_ino != dap->da_newinum) 9612 panic("%s: dir inum %d != new %d", 9613 "initiate_write_filepage", 9614 ep->d_ino, dap->da_newinum); 9615 if (dap->da_state & DIRCHG) 9616 ep->d_ino = dap->da_previous->dm_oldinum; 9617 else 9618 ep->d_ino = 0; 9619 dap->da_state &= ~ATTACHED; 9620 dap->da_state |= UNDONE; 9621 } 9622 } 9623 } 9624 9625 /* 9626 * Version of initiate_write_inodeblock that handles UFS1 dinodes. 9627 * Note that any bug fixes made to this routine must be done in the 9628 * version found below. 9629 * 9630 * Called from within the procedure above to deal with unsatisfied 9631 * allocation dependencies in an inodeblock. The buffer must be 9632 * locked, thus, no I/O completion operations can occur while we 9633 * are manipulating its associated dependencies. 9634 */ 9635 static void 9636 initiate_write_inodeblock_ufs1(inodedep, bp) 9637 struct inodedep *inodedep; 9638 struct buf *bp; /* The inode block */ 9639 { 9640 struct allocdirect *adp, *lastadp; 9641 struct ufs1_dinode *dp; 9642 struct ufs1_dinode *sip; 9643 struct inoref *inoref; 9644 struct fs *fs; 9645 ufs_lbn_t i; 9646 #ifdef INVARIANTS 9647 ufs_lbn_t prevlbn = 0; 9648 #endif 9649 int deplist; 9650 9651 if (inodedep->id_state & IOSTARTED) 9652 panic("initiate_write_inodeblock_ufs1: already started"); 9653 inodedep->id_state |= IOSTARTED; 9654 fs = inodedep->id_fs; 9655 dp = (struct ufs1_dinode *)bp->b_data + 9656 ino_to_fsbo(fs, inodedep->id_ino); 9657 9658 /* 9659 * If we're on the unlinked list but have not yet written our 9660 * next pointer initialize it here. 9661 */ 9662 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 9663 struct inodedep *inon; 9664 9665 inon = TAILQ_NEXT(inodedep, id_unlinked); 9666 dp->di_freelink = inon ? inon->id_ino : 0; 9667 } 9668 /* 9669 * If the bitmap is not yet written, then the allocated 9670 * inode cannot be written to disk. 9671 */ 9672 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 9673 if (inodedep->id_savedino1 != NULL) 9674 panic("initiate_write_inodeblock_ufs1: I/O underway"); 9675 FREE_LOCK(&lk); 9676 sip = malloc(sizeof(struct ufs1_dinode), 9677 M_SAVEDINO, M_SOFTDEP_FLAGS); 9678 ACQUIRE_LOCK(&lk); 9679 inodedep->id_savedino1 = sip; 9680 *inodedep->id_savedino1 = *dp; 9681 bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); 9682 dp->di_gen = inodedep->id_savedino1->di_gen; 9683 dp->di_freelink = inodedep->id_savedino1->di_freelink; 9684 return; 9685 } 9686 /* 9687 * If no dependencies, then there is nothing to roll back. 9688 */ 9689 inodedep->id_savedsize = dp->di_size; 9690 inodedep->id_savedextsize = 0; 9691 inodedep->id_savednlink = dp->di_nlink; 9692 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 9693 TAILQ_EMPTY(&inodedep->id_inoreflst)) 9694 return; 9695 /* 9696 * Revert the link count to that of the first unwritten journal entry. 9697 */ 9698 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 9699 if (inoref) 9700 dp->di_nlink = inoref->if_nlink; 9701 /* 9702 * Set the dependencies to busy. 
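 * "Busy" here means each allocdirect flips from ATTACHED to UNDONE (an
 * illustrative gloss); the loops that follow then roll the direct block
 * pointers and di_size back to the ad_oldblkno values so the on-disk
 * inode never claims an unwritten block.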
9703 */ 9704 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9705 adp = TAILQ_NEXT(adp, ad_next)) { 9706 #ifdef INVARIANTS 9707 if (deplist != 0 && prevlbn >= adp->ad_offset) 9708 panic("softdep_write_inodeblock: lbn order"); 9709 prevlbn = adp->ad_offset; 9710 if (adp->ad_offset < NDADDR && 9711 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 9712 panic("%s: direct pointer #%jd mismatch %d != %jd", 9713 "softdep_write_inodeblock", 9714 (intmax_t)adp->ad_offset, 9715 dp->di_db[adp->ad_offset], 9716 (intmax_t)adp->ad_newblkno); 9717 if (adp->ad_offset >= NDADDR && 9718 dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) 9719 panic("%s: indirect pointer #%jd mismatch %d != %jd", 9720 "softdep_write_inodeblock", 9721 (intmax_t)adp->ad_offset - NDADDR, 9722 dp->di_ib[adp->ad_offset - NDADDR], 9723 (intmax_t)adp->ad_newblkno); 9724 deplist |= 1 << adp->ad_offset; 9725 if ((adp->ad_state & ATTACHED) == 0) 9726 panic("softdep_write_inodeblock: Unknown state 0x%x", 9727 adp->ad_state); 9728 #endif /* INVARIANTS */ 9729 adp->ad_state &= ~ATTACHED; 9730 adp->ad_state |= UNDONE; 9731 } 9732 /* 9733 * The on-disk inode cannot claim to be any larger than the last 9734 * fragment that has been written. Otherwise, the on-disk inode 9735 * might have fragments that were not the last block in the file 9736 * which would corrupt the filesystem. 9737 */ 9738 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9739 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 9740 if (adp->ad_offset >= NDADDR) 9741 break; 9742 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 9743 /* keep going until hitting a rollback to a frag */ 9744 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 9745 continue; 9746 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 9747 for (i = adp->ad_offset + 1; i < NDADDR; i++) { 9748 #ifdef INVARIANTS 9749 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 9750 panic("softdep_write_inodeblock: lost dep1"); 9751 #endif /* INVARIANTS */ 9752 dp->di_db[i] = 0; 9753 } 9754 for (i = 0; i < NIADDR; i++) { 9755 #ifdef INVARIANTS 9756 if (dp->di_ib[i] != 0 && 9757 (deplist & ((1 << NDADDR) << i)) == 0) 9758 panic("softdep_write_inodeblock: lost dep2"); 9759 #endif /* INVARIANTS */ 9760 dp->di_ib[i] = 0; 9761 } 9762 return; 9763 } 9764 /* 9765 * If we have zero'ed out the last allocated block of the file, 9766 * roll back the size to the last currently allocated block. 9767 * We know that this last allocated block is a full-sized as 9768 * we already checked for fragments in the loop above. 9769 */ 9770 if (lastadp != NULL && 9771 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 9772 for (i = lastadp->ad_offset; i >= 0; i--) 9773 if (dp->di_db[i] != 0) 9774 break; 9775 dp->di_size = (i + 1) * fs->fs_bsize; 9776 } 9777 /* 9778 * The only dependencies are for indirect blocks. 9779 * 9780 * The file size for indirect block additions is not guaranteed. 9781 * Such a guarantee would be non-trivial to achieve. The conventional 9782 * synchronous write implementation also does not make this guarantee. 9783 * Fsck should catch and fix discrepancies. Arguably, the file size 9784 * can be over-estimated without destroying integrity when the file 9785 * moves into the indirect blocks (i.e., is large). If we want to 9786 * postpone fsck, we are stuck with this argument. 
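 * Accordingly (illustrative note), the loop below simply zeroes the di_ib
 * slots for any remaining allocdirect dependencies; the true indirect
 * pointers are reinstated only after their dependencies complete.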
9787 */ 9788 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 9789 dp->di_ib[adp->ad_offset - NDADDR] = 0; 9790 } 9791 9792 /* 9793 * Version of initiate_write_inodeblock that handles UFS2 dinodes. 9794 * Note that any bug fixes made to this routine must be done in the 9795 * version found above. 9796 * 9797 * Called from within the procedure above to deal with unsatisfied 9798 * allocation dependencies in an inodeblock. The buffer must be 9799 * locked, thus, no I/O completion operations can occur while we 9800 * are manipulating its associated dependencies. 9801 */ 9802 static void 9803 initiate_write_inodeblock_ufs2(inodedep, bp) 9804 struct inodedep *inodedep; 9805 struct buf *bp; /* The inode block */ 9806 { 9807 struct allocdirect *adp, *lastadp; 9808 struct ufs2_dinode *dp; 9809 struct ufs2_dinode *sip; 9810 struct inoref *inoref; 9811 struct fs *fs; 9812 ufs_lbn_t i; 9813 #ifdef INVARIANTS 9814 ufs_lbn_t prevlbn = 0; 9815 #endif 9816 int deplist; 9817 9818 if (inodedep->id_state & IOSTARTED) 9819 panic("initiate_write_inodeblock_ufs2: already started"); 9820 inodedep->id_state |= IOSTARTED; 9821 fs = inodedep->id_fs; 9822 dp = (struct ufs2_dinode *)bp->b_data + 9823 ino_to_fsbo(fs, inodedep->id_ino); 9824 9825 /* 9826 * If we're on the unlinked list but have not yet written our 9827 * next pointer initialize it here. 9828 */ 9829 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 9830 struct inodedep *inon; 9831 9832 inon = TAILQ_NEXT(inodedep, id_unlinked); 9833 dp->di_freelink = inon ? inon->id_ino : 0; 9834 } 9835 /* 9836 * If the bitmap is not yet written, then the allocated 9837 * inode cannot be written to disk. 9838 */ 9839 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 9840 if (inodedep->id_savedino2 != NULL) 9841 panic("initiate_write_inodeblock_ufs2: I/O underway"); 9842 FREE_LOCK(&lk); 9843 sip = malloc(sizeof(struct ufs2_dinode), 9844 M_SAVEDINO, M_SOFTDEP_FLAGS); 9845 ACQUIRE_LOCK(&lk); 9846 inodedep->id_savedino2 = sip; 9847 *inodedep->id_savedino2 = *dp; 9848 bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); 9849 dp->di_gen = inodedep->id_savedino2->di_gen; 9850 dp->di_freelink = inodedep->id_savedino2->di_freelink; 9851 return; 9852 } 9853 /* 9854 * If no dependencies, then there is nothing to roll back. 9855 */ 9856 inodedep->id_savedsize = dp->di_size; 9857 inodedep->id_savedextsize = dp->di_extsize; 9858 inodedep->id_savednlink = dp->di_nlink; 9859 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 9860 TAILQ_EMPTY(&inodedep->id_extupdt) && 9861 TAILQ_EMPTY(&inodedep->id_inoreflst)) 9862 return; 9863 /* 9864 * Revert the link count to that of the first unwritten journal entry. 9865 */ 9866 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 9867 if (inoref) 9868 dp->di_nlink = inoref->if_nlink; 9869 9870 /* 9871 * Set the ext data dependencies to busy. 
9872 */ 9873 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 9874 adp = TAILQ_NEXT(adp, ad_next)) { 9875 #ifdef INVARIANTS 9876 if (deplist != 0 && prevlbn >= adp->ad_offset) 9877 panic("softdep_write_inodeblock: lbn order"); 9878 prevlbn = adp->ad_offset; 9879 if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) 9880 panic("%s: direct pointer #%jd mismatch %jd != %jd", 9881 "softdep_write_inodeblock", 9882 (intmax_t)adp->ad_offset, 9883 (intmax_t)dp->di_extb[adp->ad_offset], 9884 (intmax_t)adp->ad_newblkno); 9885 deplist |= 1 << adp->ad_offset; 9886 if ((adp->ad_state & ATTACHED) == 0) 9887 panic("softdep_write_inodeblock: Unknown state 0x%x", 9888 adp->ad_state); 9889 #endif /* INVARIANTS */ 9890 adp->ad_state &= ~ATTACHED; 9891 adp->ad_state |= UNDONE; 9892 } 9893 /* 9894 * The on-disk inode cannot claim to be any larger than the last 9895 * fragment that has been written. Otherwise, the on-disk inode 9896 * might have fragments that were not the last block in the ext 9897 * data which would corrupt the filesystem. 9898 */ 9899 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 9900 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 9901 dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; 9902 /* keep going until hitting a rollback to a frag */ 9903 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 9904 continue; 9905 dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 9906 for (i = adp->ad_offset + 1; i < NXADDR; i++) { 9907 #ifdef INVARIANTS 9908 if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) 9909 panic("softdep_write_inodeblock: lost dep1"); 9910 #endif /* INVARIANTS */ 9911 dp->di_extb[i] = 0; 9912 } 9913 lastadp = NULL; 9914 break; 9915 } 9916 /* 9917 * If we have zero'ed out the last allocated block of the ext 9918 * data, roll back the size to the last currently allocated block. 9919 * We know that this last allocated block is a full-sized as 9920 * we already checked for fragments in the loop above. 9921 */ 9922 if (lastadp != NULL && 9923 dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 9924 for (i = lastadp->ad_offset; i >= 0; i--) 9925 if (dp->di_extb[i] != 0) 9926 break; 9927 dp->di_extsize = (i + 1) * fs->fs_bsize; 9928 } 9929 /* 9930 * Set the file data dependencies to busy. 
9931 */ 9932 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9933 adp = TAILQ_NEXT(adp, ad_next)) { 9934 #ifdef INVARIANTS 9935 if (deplist != 0 && prevlbn >= adp->ad_offset) 9936 panic("softdep_write_inodeblock: lbn order"); 9937 if ((adp->ad_state & ATTACHED) == 0) 9938 panic("inodedep %p and adp %p not attached", inodedep, adp); 9939 prevlbn = adp->ad_offset; 9940 if (adp->ad_offset < NDADDR && 9941 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 9942 panic("%s: direct pointer #%jd mismatch %jd != %jd", 9943 "softdep_write_inodeblock", 9944 (intmax_t)adp->ad_offset, 9945 (intmax_t)dp->di_db[adp->ad_offset], 9946 (intmax_t)adp->ad_newblkno); 9947 if (adp->ad_offset >= NDADDR && 9948 dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) 9949 panic("%s indirect pointer #%jd mismatch %jd != %jd", 9950 "softdep_write_inodeblock:", 9951 (intmax_t)adp->ad_offset - NDADDR, 9952 (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], 9953 (intmax_t)adp->ad_newblkno); 9954 deplist |= 1 << adp->ad_offset; 9955 if ((adp->ad_state & ATTACHED) == 0) 9956 panic("softdep_write_inodeblock: Unknown state 0x%x", 9957 adp->ad_state); 9958 #endif /* INVARIANTS */ 9959 adp->ad_state &= ~ATTACHED; 9960 adp->ad_state |= UNDONE; 9961 } 9962 /* 9963 * The on-disk inode cannot claim to be any larger than the last 9964 * fragment that has been written. Otherwise, the on-disk inode 9965 * might have fragments that were not the last block in the file 9966 * which would corrupt the filesystem. 9967 */ 9968 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9969 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 9970 if (adp->ad_offset >= NDADDR) 9971 break; 9972 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 9973 /* keep going until hitting a rollback to a frag */ 9974 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 9975 continue; 9976 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 9977 for (i = adp->ad_offset + 1; i < NDADDR; i++) { 9978 #ifdef INVARIANTS 9979 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 9980 panic("softdep_write_inodeblock: lost dep2"); 9981 #endif /* INVARIANTS */ 9982 dp->di_db[i] = 0; 9983 } 9984 for (i = 0; i < NIADDR; i++) { 9985 #ifdef INVARIANTS 9986 if (dp->di_ib[i] != 0 && 9987 (deplist & ((1 << NDADDR) << i)) == 0) 9988 panic("softdep_write_inodeblock: lost dep3"); 9989 #endif /* INVARIANTS */ 9990 dp->di_ib[i] = 0; 9991 } 9992 return; 9993 } 9994 /* 9995 * If we have zero'ed out the last allocated block of the file, 9996 * roll back the size to the last currently allocated block. 9997 * We know that this last allocated block is a full-sized as 9998 * we already checked for fragments in the loop above. 9999 */ 10000 if (lastadp != NULL && 10001 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 10002 for (i = lastadp->ad_offset; i >= 0; i--) 10003 if (dp->di_db[i] != 0) 10004 break; 10005 dp->di_size = (i + 1) * fs->fs_bsize; 10006 } 10007 /* 10008 * The only dependencies are for indirect blocks. 10009 * 10010 * The file size for indirect block additions is not guaranteed. 10011 * Such a guarantee would be non-trivial to achieve. The conventional 10012 * synchronous write implementation also does not make this guarantee. 10013 * Fsck should catch and fix discrepancies. Arguably, the file size 10014 * can be over-estimated without destroying integrity when the file 10015 * moves into the indirect blocks (i.e., is large). If we want to 10016 * postpone fsck, we are stuck with this argument. 
10017 */ 10018 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 10019 dp->di_ib[adp->ad_offset - NDADDR] = 0; 10020 } 10021 10022 /* 10023 * Cancel an indirdep as a result of truncation. Release all of the 10024 * children allocindirs and place their journal work on the appropriate 10025 * list. 10026 */ 10027 static void 10028 cancel_indirdep(indirdep, bp, freeblks) 10029 struct indirdep *indirdep; 10030 struct buf *bp; 10031 struct freeblks *freeblks; 10032 { 10033 struct allocindir *aip; 10034 10035 /* 10036 * None of the indirect pointers will ever be visible, 10037 * so they can simply be tossed. GOINGAWAY ensures 10038 * that allocated pointers will be saved in the buffer 10039 * cache until they are freed. Note that they will 10040 * only be able to be found by their physical address 10041 * since the inode mapping the logical address will 10042 * be gone. The save buffer used for the safe copy 10043 * was allocated in setup_allocindir_phase2 using 10044 * the physical address so it could be used for this 10045 * purpose. Hence we swap the safe copy with the real 10046 * copy, allowing the safe copy to be freed and holding 10047 * on to the real copy for later use in indir_trunc. 10048 */ 10049 if (indirdep->ir_state & GOINGAWAY) 10050 panic("cancel_indirdep: already gone"); 10051 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 10052 indirdep->ir_state |= DEPCOMPLETE; 10053 LIST_REMOVE(indirdep, ir_next); 10054 } 10055 indirdep->ir_state |= GOINGAWAY; 10056 VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1; 10057 /* 10058 * Pass in bp for blocks still have journal writes 10059 * pending so we can cancel them on their own. 10060 */ 10061 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) 10062 cancel_allocindir(aip, bp, freeblks, 0); 10063 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) 10064 cancel_allocindir(aip, NULL, freeblks, 0); 10065 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) 10066 cancel_allocindir(aip, NULL, freeblks, 0); 10067 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0) 10068 cancel_allocindir(aip, NULL, freeblks, 0); 10069 /* 10070 * If there are pending partial truncations we need to keep the 10071 * old block copy around until they complete. This is because 10072 * the current b_data is not a perfect superset of the available 10073 * blocks. 10074 */ 10075 if (TAILQ_EMPTY(&indirdep->ir_trunc)) 10076 bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); 10077 else 10078 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 10079 WORKLIST_REMOVE(&indirdep->ir_list); 10080 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); 10081 indirdep->ir_bp = NULL; 10082 indirdep->ir_freeblks = freeblks; 10083 } 10084 10085 /* 10086 * Free an indirdep once it no longer has new pointers to track. 
10087 */ 10088 static void 10089 free_indirdep(indirdep) 10090 struct indirdep *indirdep; 10091 { 10092 10093 KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc), 10094 ("free_indirdep: Indir trunc list not empty.")); 10095 KASSERT(LIST_EMPTY(&indirdep->ir_completehd), 10096 ("free_indirdep: Complete head not empty.")); 10097 KASSERT(LIST_EMPTY(&indirdep->ir_writehd), 10098 ("free_indirdep: write head not empty.")); 10099 KASSERT(LIST_EMPTY(&indirdep->ir_donehd), 10100 ("free_indirdep: done head not empty.")); 10101 KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), 10102 ("free_indirdep: deplist head not empty.")); 10103 KASSERT((indirdep->ir_state & DEPCOMPLETE), 10104 ("free_indirdep: %p still on newblk list.", indirdep)); 10105 KASSERT(indirdep->ir_saveddata == NULL, 10106 ("free_indirdep: %p still has saved data.", indirdep)); 10107 if (indirdep->ir_state & ONWORKLIST) 10108 WORKLIST_REMOVE(&indirdep->ir_list); 10109 WORKITEM_FREE(indirdep, D_INDIRDEP); 10110 } 10111 10112 /* 10113 * Called before a write to an indirdep. This routine is responsible for 10114 * rolling back pointers to a safe state which includes only those 10115 * allocindirs which have been completed. 10116 */ 10117 static void 10118 initiate_write_indirdep(indirdep, bp) 10119 struct indirdep *indirdep; 10120 struct buf *bp; 10121 { 10122 10123 indirdep->ir_state |= IOSTARTED; 10124 if (indirdep->ir_state & GOINGAWAY) 10125 panic("disk_io_initiation: indirdep gone"); 10126 /* 10127 * If there are no remaining dependencies, this will be writing 10128 * the real pointers. 10129 */ 10130 if (LIST_EMPTY(&indirdep->ir_deplisthd) && 10131 TAILQ_EMPTY(&indirdep->ir_trunc)) 10132 return; 10133 /* 10134 * Replace up-to-date version with safe version. 10135 */ 10136 if (indirdep->ir_saveddata == NULL) { 10137 FREE_LOCK(&lk); 10138 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, 10139 M_SOFTDEP_FLAGS); 10140 ACQUIRE_LOCK(&lk); 10141 } 10142 indirdep->ir_state &= ~ATTACHED; 10143 indirdep->ir_state |= UNDONE; 10144 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 10145 bcopy(indirdep->ir_savebp->b_data, bp->b_data, 10146 bp->b_bcount); 10147 } 10148 10149 /* 10150 * Called when an inode has been cleared in a cg bitmap. This finally 10151 * eliminates any canceled jaddrefs 10152 */ 10153 void 10154 softdep_setup_inofree(mp, bp, ino, wkhd) 10155 struct mount *mp; 10156 struct buf *bp; 10157 ino_t ino; 10158 struct workhead *wkhd; 10159 { 10160 struct worklist *wk, *wkn; 10161 struct inodedep *inodedep; 10162 uint8_t *inosused; 10163 struct cg *cgp; 10164 struct fs *fs; 10165 10166 ACQUIRE_LOCK(&lk); 10167 fs = VFSTOUFS(mp)->um_fs; 10168 cgp = (struct cg *)bp->b_data; 10169 inosused = cg_inosused(cgp); 10170 if (isset(inosused, ino % fs->fs_ipg)) 10171 panic("softdep_setup_inofree: inode %d not freed.", ino); 10172 if (inodedep_lookup(mp, ino, 0, &inodedep)) 10173 panic("softdep_setup_inofree: ino %d has existing inodedep %p", 10174 ino, inodedep); 10175 if (wkhd) { 10176 LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { 10177 if (wk->wk_type != D_JADDREF) 10178 continue; 10179 WORKLIST_REMOVE(wk); 10180 /* 10181 * We can free immediately even if the jaddref 10182 * isn't attached in a background write as now 10183 * the bitmaps are reconciled. 10184 */ 10185 wk->wk_state |= COMPLETE | ATTACHED; 10186 free_jaddref(WK_JADDREF(wk)); 10187 } 10188 jwork_move(&bp->b_dep, wkhd); 10189 } 10190 FREE_LOCK(&lk); 10191 } 10192 10193 10194 /* 10195 * Called via ffs_blkfree() after a set of frags has been cleared from a cg 10196 * map. 
Any dependencies waiting for the write to clear are added to the 10197 * buf's list and any jnewblks that are being canceled are discarded 10198 * immediately. 10199 */ 10200 void 10201 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 10202 struct mount *mp; 10203 struct buf *bp; 10204 ufs2_daddr_t blkno; 10205 int frags; 10206 struct workhead *wkhd; 10207 { 10208 struct bmsafemap *bmsafemap; 10209 struct jnewblk *jnewblk; 10210 struct worklist *wk; 10211 struct fs *fs; 10212 #ifdef SUJ_DEBUG 10213 uint8_t *blksfree; 10214 struct cg *cgp; 10215 ufs2_daddr_t jstart; 10216 ufs2_daddr_t jend; 10217 ufs2_daddr_t end; 10218 long bno; 10219 int i; 10220 #endif 10221 10222 ACQUIRE_LOCK(&lk); 10223 /* Lookup the bmsafemap so we track when it is dirty. */ 10224 fs = VFSTOUFS(mp)->um_fs; 10225 bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno)); 10226 /* 10227 * Detach any jnewblks which have been canceled. They must linger 10228 * until the bitmap is cleared again by ffs_blkfree() to prevent 10229 * an unjournaled allocation from hitting the disk. 10230 */ 10231 if (wkhd) { 10232 while ((wk = LIST_FIRST(wkhd)) != NULL) { 10233 WORKLIST_REMOVE(wk); 10234 if (wk->wk_type != D_JNEWBLK) { 10235 WORKLIST_INSERT(&bmsafemap->sm_freehd, wk); 10236 continue; 10237 } 10238 jnewblk = WK_JNEWBLK(wk); 10239 KASSERT(jnewblk->jn_state & GOINGAWAY, 10240 ("softdep_setup_blkfree: jnewblk not canceled.")); 10241 #ifdef SUJ_DEBUG 10242 /* 10243 * Assert that this block is free in the bitmap 10244 * before we discard the jnewblk. 10245 */ 10246 cgp = (struct cg *)bp->b_data; 10247 blksfree = cg_blksfree(cgp); 10248 bno = dtogd(fs, jnewblk->jn_blkno); 10249 for (i = jnewblk->jn_oldfrags; 10250 i < jnewblk->jn_frags; i++) { 10251 if (isset(blksfree, bno + i)) 10252 continue; 10253 panic("softdep_setup_blkfree: not free"); 10254 } 10255 #endif 10256 /* 10257 * Even if it's not attached we can free immediately 10258 * as the new bitmap is correct. 10259 */ 10260 wk->wk_state |= COMPLETE | ATTACHED; 10261 free_jnewblk(jnewblk); 10262 } 10263 } 10264 10265 #ifdef SUJ_DEBUG 10266 /* 10267 * Assert that we are not freeing a block which has an outstanding 10268 * allocation dependency. 10269 */ 10270 fs = VFSTOUFS(mp)->um_fs; 10271 bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno)); 10272 end = blkno + frags; 10273 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { 10274 /* 10275 * Don't match against blocks that will be freed when the 10276 * background write is done. 10277 */ 10278 if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == 10279 (COMPLETE | DEPCOMPLETE)) 10280 continue; 10281 jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; 10282 jend = jnewblk->jn_blkno + jnewblk->jn_frags; 10283 if ((blkno >= jstart && blkno < jend) || 10284 (end > jstart && end <= jend)) { 10285 printf("state 0x%X %jd - %d %d dep %p\n", 10286 jnewblk->jn_state, jnewblk->jn_blkno, 10287 jnewblk->jn_oldfrags, jnewblk->jn_frags, 10288 jnewblk->jn_dep); 10289 panic("softdep_setup_blkfree: " 10290 "%jd-%jd(%d) overlaps with %jd-%jd", 10291 blkno, end, frags, jstart, jend); 10292 } 10293 } 10294 #endif 10295 FREE_LOCK(&lk); 10296 } 10297 10298 /* 10299 * Revert a block allocation when the journal record that describes it 10300 * is not yet written. 
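 *
 * Conceptually (an illustrative summary of the body below): every fragment
 * in the range jn_oldfrags to jn_frags that the bitmap still shows as
 * allocated is marked free again, the fragment and cluster summaries are
 * adjusted much as ffs_blkfree() would, and the jnewblk flips from
 * ATTACHED to UNDONE. The return value is the number of fragments rolled
 * back.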
10301 */ 10302 int 10303 jnewblk_rollback(jnewblk, fs, cgp, blksfree) 10304 struct jnewblk *jnewblk; 10305 struct fs *fs; 10306 struct cg *cgp; 10307 uint8_t *blksfree; 10308 { 10309 ufs1_daddr_t fragno; 10310 long cgbno, bbase; 10311 int frags, blk; 10312 int i; 10313 10314 frags = 0; 10315 cgbno = dtogd(fs, jnewblk->jn_blkno); 10316 /* 10317 * We have to test which frags need to be rolled back. We may 10318 * be operating on a stale copy when doing background writes. 10319 */ 10320 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) 10321 if (isclr(blksfree, cgbno + i)) 10322 frags++; 10323 if (frags == 0) 10324 return (0); 10325 /* 10326 * This is mostly ffs_blkfree() sans some validation and 10327 * superblock updates. 10328 */ 10329 if (frags == fs->fs_frag) { 10330 fragno = fragstoblks(fs, cgbno); 10331 ffs_setblock(fs, blksfree, fragno); 10332 ffs_clusteracct(fs, cgp, fragno, 1); 10333 cgp->cg_cs.cs_nbfree++; 10334 } else { 10335 cgbno += jnewblk->jn_oldfrags; 10336 bbase = cgbno - fragnum(fs, cgbno); 10337 /* Decrement the old frags. */ 10338 blk = blkmap(fs, blksfree, bbase); 10339 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 10340 /* Deallocate the fragment */ 10341 for (i = 0; i < frags; i++) 10342 setbit(blksfree, cgbno + i); 10343 cgp->cg_cs.cs_nffree += frags; 10344 /* Add back in counts associated with the new frags */ 10345 blk = blkmap(fs, blksfree, bbase); 10346 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 10347 /* If a complete block has been reassembled, account for it. */ 10348 fragno = fragstoblks(fs, bbase); 10349 if (ffs_isblock(fs, blksfree, fragno)) { 10350 cgp->cg_cs.cs_nffree -= fs->fs_frag; 10351 ffs_clusteracct(fs, cgp, fragno, 1); 10352 cgp->cg_cs.cs_nbfree++; 10353 } 10354 } 10355 stat_jnewblk++; 10356 jnewblk->jn_state &= ~ATTACHED; 10357 jnewblk->jn_state |= UNDONE; 10358 10359 return (frags); 10360 } 10361 10362 static void 10363 initiate_write_bmsafemap(bmsafemap, bp) 10364 struct bmsafemap *bmsafemap; 10365 struct buf *bp; /* The cg block. */ 10366 { 10367 struct jaddref *jaddref; 10368 struct jnewblk *jnewblk; 10369 uint8_t *inosused; 10370 uint8_t *blksfree; 10371 struct cg *cgp; 10372 struct fs *fs; 10373 ino_t ino; 10374 10375 if (bmsafemap->sm_state & IOSTARTED) 10376 panic("initiate_write_bmsafemap: Already started\n"); 10377 bmsafemap->sm_state |= IOSTARTED; 10378 /* 10379 * Clear any inode allocations which are pending journal writes. 10380 */ 10381 if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { 10382 cgp = (struct cg *)bp->b_data; 10383 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 10384 inosused = cg_inosused(cgp); 10385 LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { 10386 ino = jaddref->ja_ino % fs->fs_ipg; 10387 /* 10388 * If this is a background copy the inode may not 10389 * be marked used yet. 10390 */ 10391 if (isset(inosused, ino)) { 10392 if ((jaddref->ja_mode & IFMT) == IFDIR) 10393 cgp->cg_cs.cs_ndir--; 10394 cgp->cg_cs.cs_nifree++; 10395 clrbit(inosused, ino); 10396 jaddref->ja_state &= ~ATTACHED; 10397 jaddref->ja_state |= UNDONE; 10398 stat_jaddref++; 10399 } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) 10400 panic("initiate_write_bmsafemap: inode %d " 10401 "marked free", jaddref->ja_ino); 10402 } 10403 } 10404 /* 10405 * Clear any block allocations which are pending journal writes. 
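 * Each jnewblk whose journal record has not yet been written is undone in
 * this copy of the cg via jnewblk_rollback(); finding the block already
 * free is tolerated only for background (BX_BKGRDMARKER) copies (an
 * illustrative gloss on the loop below).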
10406 */ 10407 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 10408 cgp = (struct cg *)bp->b_data; 10409 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 10410 blksfree = cg_blksfree(cgp); 10411 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { 10412 if (jnewblk_rollback(jnewblk, fs, cgp, blksfree)) 10413 continue; 10414 if ((bp->b_xflags & BX_BKGRDMARKER) == 0) 10415 panic("initiate_write_bmsafemap: block %jd " 10416 "marked free", jnewblk->jn_blkno); 10417 } 10418 } 10419 /* 10420 * Move allocation lists to the written lists so they can be 10421 * cleared once the block write is complete. 10422 */ 10423 LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, 10424 inodedep, id_deps); 10425 LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, 10426 newblk, nb_deps); 10427 LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, 10428 wk_list); 10429 } 10430 10431 /* 10432 * This routine is called during the completion interrupt 10433 * service routine for a disk write (from the procedure called 10434 * by the device driver to inform the filesystem caches of 10435 * a request completion). It should be called early in this 10436 * procedure, before the block is made available to other 10437 * processes or other routines are called. 10438 * 10439 */ 10440 static void 10441 softdep_disk_write_complete(bp) 10442 struct buf *bp; /* describes the completed disk write */ 10443 { 10444 struct worklist *wk; 10445 struct worklist *owk; 10446 struct workhead reattach; 10447 struct freeblks *freeblks; 10448 struct buf *sbp; 10449 10450 /* 10451 * If an error occurred while doing the write, then the data 10452 * has not hit the disk and the dependencies cannot be unrolled. 10453 */ 10454 if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) 10455 return; 10456 LIST_INIT(&reattach); 10457 /* 10458 * This lock must not be released anywhere in this code segment. 
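 * The loop below drains bp->b_dep one work item at a time; handlers that
 * report unfinished work (the handle_written_* routines returning nonzero)
 * have their items parked on the local "reattach" list and are re-queued
 * on b_dep once the walk completes (an illustrative summary).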
10459 */ 10460 sbp = NULL; 10461 owk = NULL; 10462 ACQUIRE_LOCK(&lk); 10463 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 10464 WORKLIST_REMOVE(wk); 10465 dep_write[wk->wk_type]++; 10466 if (wk == owk) 10467 panic("duplicate worklist: %p\n", wk); 10468 owk = wk; 10469 switch (wk->wk_type) { 10470 10471 case D_PAGEDEP: 10472 if (handle_written_filepage(WK_PAGEDEP(wk), bp)) 10473 WORKLIST_INSERT(&reattach, wk); 10474 continue; 10475 10476 case D_INODEDEP: 10477 if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) 10478 WORKLIST_INSERT(&reattach, wk); 10479 continue; 10480 10481 case D_BMSAFEMAP: 10482 if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp)) 10483 WORKLIST_INSERT(&reattach, wk); 10484 continue; 10485 10486 case D_MKDIR: 10487 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 10488 continue; 10489 10490 case D_ALLOCDIRECT: 10491 wk->wk_state |= COMPLETE; 10492 handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); 10493 continue; 10494 10495 case D_ALLOCINDIR: 10496 wk->wk_state |= COMPLETE; 10497 handle_allocindir_partdone(WK_ALLOCINDIR(wk)); 10498 continue; 10499 10500 case D_INDIRDEP: 10501 if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp)) 10502 WORKLIST_INSERT(&reattach, wk); 10503 continue; 10504 10505 case D_FREEBLKS: 10506 wk->wk_state |= COMPLETE; 10507 freeblks = WK_FREEBLKS(wk); 10508 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE && 10509 LIST_EMPTY(&freeblks->fb_jblkdephd)) 10510 add_to_worklist(wk, WK_NODELAY); 10511 continue; 10512 10513 case D_FREEWORK: 10514 handle_written_freework(WK_FREEWORK(wk)); 10515 break; 10516 10517 case D_JSEGDEP: 10518 free_jsegdep(WK_JSEGDEP(wk)); 10519 continue; 10520 10521 case D_JSEG: 10522 handle_written_jseg(WK_JSEG(wk), bp); 10523 continue; 10524 10525 case D_SBDEP: 10526 if (handle_written_sbdep(WK_SBDEP(wk), bp)) 10527 WORKLIST_INSERT(&reattach, wk); 10528 continue; 10529 10530 case D_FREEDEP: 10531 free_freedep(WK_FREEDEP(wk)); 10532 continue; 10533 10534 default: 10535 panic("handle_disk_write_complete: Unknown type %s", 10536 TYPENAME(wk->wk_type)); 10537 /* NOTREACHED */ 10538 } 10539 } 10540 /* 10541 * Reattach any requests that must be redone. 10542 */ 10543 while ((wk = LIST_FIRST(&reattach)) != NULL) { 10544 WORKLIST_REMOVE(wk); 10545 WORKLIST_INSERT(&bp->b_dep, wk); 10546 } 10547 FREE_LOCK(&lk); 10548 if (sbp) 10549 brelse(sbp); 10550 } 10551 10552 /* 10553 * Called from within softdep_disk_write_complete above. Note that 10554 * this routine is always called from interrupt level with further 10555 * splbio interrupts blocked. 10556 */ 10557 static void 10558 handle_allocdirect_partdone(adp, wkhd) 10559 struct allocdirect *adp; /* the completed allocdirect */ 10560 struct workhead *wkhd; /* Work to do when inode is writtne. */ 10561 { 10562 struct allocdirectlst *listhead; 10563 struct allocdirect *listadp; 10564 struct inodedep *inodedep; 10565 long bsize; 10566 10567 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 10568 return; 10569 /* 10570 * The on-disk inode cannot claim to be any larger than the last 10571 * fragment that has been written. Otherwise, the on-disk inode 10572 * might have fragments that were not the last block in the file 10573 * which would corrupt the filesystem. Thus, we cannot free any 10574 * allocdirects after one whose ad_oldblkno claims a fragment as 10575 * these blocks must be rolled back to zero before writing the inode. 10576 * We check the currently active set of allocdirects in id_inoupdt 10577 * or id_extupdt as appropriate. 
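 *
 * For example (illustrative): if an earlier allocdirect on the list still
 * rolls back to a fragment-sized ad_oldblkno, this adp must remain queued;
 * freeing it now could let the on-disk inode claim a size beyond that
 * unwritten fragment.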
10578 */ 10579 inodedep = adp->ad_inodedep; 10580 bsize = inodedep->id_fs->fs_bsize; 10581 if (adp->ad_state & EXTDATA) 10582 listhead = &inodedep->id_extupdt; 10583 else 10584 listhead = &inodedep->id_inoupdt; 10585 TAILQ_FOREACH(listadp, listhead, ad_next) { 10586 /* found our block */ 10587 if (listadp == adp) 10588 break; 10589 /* continue if ad_oldlbn is not a fragment */ 10590 if (listadp->ad_oldsize == 0 || 10591 listadp->ad_oldsize == bsize) 10592 continue; 10593 /* hit a fragment */ 10594 return; 10595 } 10596 /* 10597 * If we have reached the end of the current list without 10598 * finding the just finished dependency, then it must be 10599 * on the future dependency list. Future dependencies cannot 10600 * be freed until they are moved to the current list. 10601 */ 10602 if (listadp == NULL) { 10603 #ifdef DEBUG 10604 if (adp->ad_state & EXTDATA) 10605 listhead = &inodedep->id_newextupdt; 10606 else 10607 listhead = &inodedep->id_newinoupdt; 10608 TAILQ_FOREACH(listadp, listhead, ad_next) 10609 /* found our block */ 10610 if (listadp == adp) 10611 break; 10612 if (listadp == NULL) 10613 panic("handle_allocdirect_partdone: lost dep"); 10614 #endif /* DEBUG */ 10615 return; 10616 } 10617 /* 10618 * If we have found the just finished dependency, then queue 10619 * it along with anything that follows it that is complete. 10620 * Since the pointer has not yet been written in the inode 10621 * as the dependency prevents it, place the allocdirect on the 10622 * bufwait list where it will be freed once the pointer is 10623 * valid. 10624 */ 10625 if (wkhd == NULL) 10626 wkhd = &inodedep->id_bufwait; 10627 for (; adp; adp = listadp) { 10628 listadp = TAILQ_NEXT(adp, ad_next); 10629 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 10630 return; 10631 TAILQ_REMOVE(listhead, adp, ad_next); 10632 WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list); 10633 } 10634 } 10635 10636 /* 10637 * Called from within softdep_disk_write_complete above. This routine 10638 * completes successfully written allocindirs. 10639 */ 10640 static void 10641 handle_allocindir_partdone(aip) 10642 struct allocindir *aip; /* the completed allocindir */ 10643 { 10644 struct indirdep *indirdep; 10645 10646 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) 10647 return; 10648 indirdep = aip->ai_indirdep; 10649 LIST_REMOVE(aip, ai_next); 10650 /* 10651 * Don't set a pointer while the buffer is undergoing IO or while 10652 * we have active truncations. 10653 */ 10654 if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) { 10655 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); 10656 return; 10657 } 10658 if (indirdep->ir_state & UFS1FMT) 10659 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 10660 aip->ai_newblkno; 10661 else 10662 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 10663 aip->ai_newblkno; 10664 /* 10665 * Await the pointer write before freeing the allocindir. 10666 */ 10667 LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); 10668 } 10669 10670 /* 10671 * Release segments held on a jwork list. 
10672 */ 10673 static void 10674 handle_jwork(wkhd) 10675 struct workhead *wkhd; 10676 { 10677 struct worklist *wk; 10678 10679 while ((wk = LIST_FIRST(wkhd)) != NULL) { 10680 WORKLIST_REMOVE(wk); 10681 switch (wk->wk_type) { 10682 case D_JSEGDEP: 10683 free_jsegdep(WK_JSEGDEP(wk)); 10684 continue; 10685 case D_FREEDEP: 10686 free_freedep(WK_FREEDEP(wk)); 10687 continue; 10688 case D_FREEFRAG: 10689 rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep)); 10690 WORKITEM_FREE(wk, D_FREEFRAG); continue; 10691 case D_FREEWORK: 10692 handle_written_freework(WK_FREEWORK(wk)); 10693 continue; 10694 default: 10695 panic("handle_jwork: Unknown type %s\n", 10696 TYPENAME(wk->wk_type)); 10697 } 10698 } 10699 } 10700 10701 /* 10702 * Handle the bufwait list on an inode when it is safe to release items 10703 * held there. This normally happens after an inode block is written but 10704 * may be delayed and handled later if there are pending journal items that 10705 * are not yet safe to be released. 10706 */ 10707 static struct freefile * 10708 handle_bufwait(inodedep, refhd) 10709 struct inodedep *inodedep; 10710 struct workhead *refhd; 10711 { 10712 struct jaddref *jaddref; 10713 struct freefile *freefile; 10714 struct worklist *wk; 10715 10716 freefile = NULL; 10717 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { 10718 WORKLIST_REMOVE(wk); 10719 switch (wk->wk_type) { 10720 case D_FREEFILE: 10721 /* 10722 * We defer adding freefile to the worklist 10723 * until all other additions have been made to 10724 * ensure that it will be done after all the 10725 * old blocks have been freed. 10726 */ 10727 if (freefile != NULL) 10728 panic("handle_bufwait: freefile"); 10729 freefile = WK_FREEFILE(wk); 10730 continue; 10731 10732 case D_MKDIR: 10733 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); 10734 continue; 10735 10736 case D_DIRADD: 10737 diradd_inode_written(WK_DIRADD(wk), inodedep); 10738 continue; 10739 10740 case D_FREEFRAG: 10741 wk->wk_state |= COMPLETE; 10742 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) 10743 add_to_worklist(wk, 0); 10744 continue; 10745 10746 case D_DIRREM: 10747 wk->wk_state |= COMPLETE; 10748 add_to_worklist(wk, 0); 10749 continue; 10750 10751 case D_ALLOCDIRECT: 10752 case D_ALLOCINDIR: 10753 free_newblk(WK_NEWBLK(wk)); 10754 continue; 10755 10756 case D_JNEWBLK: 10757 wk->wk_state |= COMPLETE; 10758 free_jnewblk(WK_JNEWBLK(wk)); 10759 continue; 10760 10761 /* 10762 * Save freed journal segments and add references on 10763 * the supplied list which will delay their release 10764 * until the cg bitmap is cleared on disk. 10765 */ 10766 case D_JSEGDEP: 10767 if (refhd == NULL) 10768 free_jsegdep(WK_JSEGDEP(wk)); 10769 else 10770 WORKLIST_INSERT(refhd, wk); 10771 continue; 10772 10773 case D_JADDREF: 10774 jaddref = WK_JADDREF(wk); 10775 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, 10776 if_deps); 10777 /* 10778 * Transfer any jaddrefs to the list to be freed with 10779 * the bitmap if we're handling a removed file. 10780 */ 10781 if (refhd == NULL) { 10782 wk->wk_state |= COMPLETE; 10783 free_jaddref(jaddref); 10784 } else 10785 WORKLIST_INSERT(refhd, wk); 10786 continue; 10787 10788 default: 10789 panic("handle_bufwait: Unknown type %p(%s)", 10790 wk, TYPENAME(wk->wk_type)); 10791 /* NOTREACHED */ 10792 } 10793 } 10794 return (freefile); 10795 } 10796 /* 10797 * Called from within softdep_disk_write_complete above to restore 10798 * in-memory inode block contents to their most up-to-date state.
Note 10799 * that this routine is always called from interrupt level with further 10800 * splbio interrupts blocked. 10801 */ 10802 static int 10803 handle_written_inodeblock(inodedep, bp) 10804 struct inodedep *inodedep; 10805 struct buf *bp; /* buffer containing the inode block */ 10806 { 10807 struct freefile *freefile; 10808 struct allocdirect *adp, *nextadp; 10809 struct ufs1_dinode *dp1 = NULL; 10810 struct ufs2_dinode *dp2 = NULL; 10811 struct workhead wkhd; 10812 int hadchanges, fstype; 10813 ino_t freelink; 10814 10815 LIST_INIT(&wkhd); 10816 hadchanges = 0; 10817 freefile = NULL; 10818 if ((inodedep->id_state & IOSTARTED) == 0) 10819 panic("handle_written_inodeblock: not started"); 10820 inodedep->id_state &= ~IOSTARTED; 10821 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { 10822 fstype = UFS1; 10823 dp1 = (struct ufs1_dinode *)bp->b_data + 10824 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 10825 freelink = dp1->di_freelink; 10826 } else { 10827 fstype = UFS2; 10828 dp2 = (struct ufs2_dinode *)bp->b_data + 10829 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 10830 freelink = dp2->di_freelink; 10831 } 10832 /* 10833 * Leave this inodeblock dirty until it's in the list. 10834 */ 10835 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) { 10836 struct inodedep *inon; 10837 10838 inon = TAILQ_NEXT(inodedep, id_unlinked); 10839 if ((inon == NULL && freelink == 0) || 10840 (inon && inon->id_ino == freelink)) { 10841 if (inon) 10842 inon->id_state |= UNLINKPREV; 10843 inodedep->id_state |= UNLINKNEXT; 10844 } 10845 hadchanges = 1; 10846 } 10847 /* 10848 * If we had to rollback the inode allocation because of 10849 * bitmaps being incomplete, then simply restore it. 10850 * Keep the block dirty so that it will not be reclaimed until 10851 * all associated dependencies have been cleared and the 10852 * corresponding updates written to disk. 10853 */ 10854 if (inodedep->id_savedino1 != NULL) { 10855 hadchanges = 1; 10856 if (fstype == UFS1) 10857 *dp1 = *inodedep->id_savedino1; 10858 else 10859 *dp2 = *inodedep->id_savedino2; 10860 free(inodedep->id_savedino1, M_SAVEDINO); 10861 inodedep->id_savedino1 = NULL; 10862 if ((bp->b_flags & B_DELWRI) == 0) 10863 stat_inode_bitmap++; 10864 bdirty(bp); 10865 /* 10866 * If the inode is clear here and GOINGAWAY it will never 10867 * be written. Process the bufwait and clear any pending 10868 * work which may include the freefile. 10869 */ 10870 if (inodedep->id_state & GOINGAWAY) 10871 goto bufwait; 10872 return (1); 10873 } 10874 inodedep->id_state |= COMPLETE; 10875 /* 10876 * Roll forward anything that had to be rolled back before 10877 * the inode could be updated. 
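 * That is, each direct pointer that was rolled back to its old value and each indirect pointer that was rolled back to zero is now set to the new block number (ad_newblkno) and its allocdirect is marked ATTACHED.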
10878 */ 10879 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { 10880 nextadp = TAILQ_NEXT(adp, ad_next); 10881 if (adp->ad_state & ATTACHED) 10882 panic("handle_written_inodeblock: new entry"); 10883 if (fstype == UFS1) { 10884 if (adp->ad_offset < NDADDR) { 10885 if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) 10886 panic("%s %s #%jd mismatch %d != %jd", 10887 "handle_written_inodeblock:", 10888 "direct pointer", 10889 (intmax_t)adp->ad_offset, 10890 dp1->di_db[adp->ad_offset], 10891 (intmax_t)adp->ad_oldblkno); 10892 dp1->di_db[adp->ad_offset] = adp->ad_newblkno; 10893 } else { 10894 if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) 10895 panic("%s: %s #%jd allocated as %d", 10896 "handle_written_inodeblock", 10897 "indirect pointer", 10898 (intmax_t)adp->ad_offset - NDADDR, 10899 dp1->di_ib[adp->ad_offset - NDADDR]); 10900 dp1->di_ib[adp->ad_offset - NDADDR] = 10901 adp->ad_newblkno; 10902 } 10903 } else { 10904 if (adp->ad_offset < NDADDR) { 10905 if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) 10906 panic("%s: %s #%jd %s %jd != %jd", 10907 "handle_written_inodeblock", 10908 "direct pointer", 10909 (intmax_t)adp->ad_offset, "mismatch", 10910 (intmax_t)dp2->di_db[adp->ad_offset], 10911 (intmax_t)adp->ad_oldblkno); 10912 dp2->di_db[adp->ad_offset] = adp->ad_newblkno; 10913 } else { 10914 if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) 10915 panic("%s: %s #%jd allocated as %jd", 10916 "handle_written_inodeblock", 10917 "indirect pointer", 10918 (intmax_t)adp->ad_offset - NDADDR, 10919 (intmax_t) 10920 dp2->di_ib[adp->ad_offset - NDADDR]); 10921 dp2->di_ib[adp->ad_offset - NDADDR] = 10922 adp->ad_newblkno; 10923 } 10924 } 10925 adp->ad_state &= ~UNDONE; 10926 adp->ad_state |= ATTACHED; 10927 hadchanges = 1; 10928 } 10929 for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { 10930 nextadp = TAILQ_NEXT(adp, ad_next); 10931 if (adp->ad_state & ATTACHED) 10932 panic("handle_written_inodeblock: new entry"); 10933 if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) 10934 panic("%s: direct pointers #%jd %s %jd != %jd", 10935 "handle_written_inodeblock", 10936 (intmax_t)adp->ad_offset, "mismatch", 10937 (intmax_t)dp2->di_extb[adp->ad_offset], 10938 (intmax_t)adp->ad_oldblkno); 10939 dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; 10940 adp->ad_state &= ~UNDONE; 10941 adp->ad_state |= ATTACHED; 10942 hadchanges = 1; 10943 } 10944 if (hadchanges && (bp->b_flags & B_DELWRI) == 0) 10945 stat_direct_blk_ptrs++; 10946 /* 10947 * Reset the file size to its most up-to-date value. 
10948 */ 10949 if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) 10950 panic("handle_written_inodeblock: bad size"); 10951 if (inodedep->id_savednlink > LINK_MAX) 10952 panic("handle_written_inodeblock: Invalid link count " 10953 "%d for inodedep %p", inodedep->id_savednlink, inodedep); 10954 if (fstype == UFS1) { 10955 if (dp1->di_nlink != inodedep->id_savednlink) { 10956 dp1->di_nlink = inodedep->id_savednlink; 10957 hadchanges = 1; 10958 } 10959 if (dp1->di_size != inodedep->id_savedsize) { 10960 dp1->di_size = inodedep->id_savedsize; 10961 hadchanges = 1; 10962 } 10963 } else { 10964 if (dp2->di_nlink != inodedep->id_savednlink) { 10965 dp2->di_nlink = inodedep->id_savednlink; 10966 hadchanges = 1; 10967 } 10968 if (dp2->di_size != inodedep->id_savedsize) { 10969 dp2->di_size = inodedep->id_savedsize; 10970 hadchanges = 1; 10971 } 10972 if (dp2->di_extsize != inodedep->id_savedextsize) { 10973 dp2->di_extsize = inodedep->id_savedextsize; 10974 hadchanges = 1; 10975 } 10976 } 10977 inodedep->id_savedsize = -1; 10978 inodedep->id_savedextsize = -1; 10979 inodedep->id_savednlink = -1; 10980 /* 10981 * If there were any rollbacks in the inode block, then it must be 10982 * marked dirty so that it will eventually get written back in 10983 * its correct form. 10984 */ 10985 if (hadchanges) 10986 bdirty(bp); 10987 bufwait: 10988 /* 10989 * Process any allocdirects that completed during the update. 10990 */ 10991 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) 10992 handle_allocdirect_partdone(adp, &wkhd); 10993 if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) 10994 handle_allocdirect_partdone(adp, &wkhd); 10995 /* 10996 * Process deallocations that were held pending until the 10997 * inode had been written to disk. Freeing of the inode 10998 * is delayed until after all blocks have been freed to 10999 * avoid creation of new <vfsid, inum, lbn> triples 11000 * before the old ones have been deleted. Completely 11001 * unlinked inodes are not processed until the unlinked 11002 * inode list is written or the last reference is removed. 11003 */ 11004 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) { 11005 freefile = handle_bufwait(inodedep, NULL); 11006 if (freefile && !LIST_EMPTY(&wkhd)) { 11007 WORKLIST_INSERT(&wkhd, &freefile->fx_list); 11008 freefile = NULL; 11009 } 11010 } 11011 /* 11012 * Move rolled forward dependency completions to the bufwait list 11013 * now that those that were already written have been processed. 11014 */ 11015 if (!LIST_EMPTY(&wkhd) && hadchanges == 0) 11016 panic("handle_written_inodeblock: bufwait but no changes"); 11017 jwork_move(&inodedep->id_bufwait, &wkhd); 11018 11019 if (freefile != NULL) { 11020 /* 11021 * If the inode is goingaway it was never written. Fake up 11022 * the state here so free_inodedep() can succeed. 11023 */ 11024 if (inodedep->id_state & GOINGAWAY) 11025 inodedep->id_state |= COMPLETE | DEPCOMPLETE; 11026 if (free_inodedep(inodedep) == 0) 11027 panic("handle_written_inodeblock: live inodedep %p", 11028 inodedep); 11029 add_to_worklist(&freefile->fx_list, 0); 11030 return (0); 11031 } 11032 11033 /* 11034 * If no outstanding dependencies, free it.
11035 */ 11036 if (free_inodedep(inodedep) || 11037 (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 && 11038 TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && 11039 TAILQ_FIRST(&inodedep->id_extupdt) == 0 && 11040 LIST_FIRST(&inodedep->id_bufwait) == 0)) 11041 return (0); 11042 return (hadchanges); 11043 } 11044 11045 static int 11046 handle_written_indirdep(indirdep, bp, bpp) 11047 struct indirdep *indirdep; 11048 struct buf *bp; 11049 struct buf **bpp; 11050 { 11051 struct allocindir *aip; 11052 struct buf *sbp; 11053 int chgs; 11054 11055 if (indirdep->ir_state & GOINGAWAY) 11056 panic("handle_written_indirdep: indirdep gone"); 11057 if ((indirdep->ir_state & IOSTARTED) == 0) 11058 panic("handle_written_indirdep: IO not started"); 11059 chgs = 0; 11060 /* 11061 * If there were rollbacks revert them here. 11062 */ 11063 if (indirdep->ir_saveddata) { 11064 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); 11065 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 11066 free(indirdep->ir_saveddata, M_INDIRDEP); 11067 indirdep->ir_saveddata = NULL; 11068 } 11069 chgs = 1; 11070 } 11071 indirdep->ir_state &= ~(UNDONE | IOSTARTED); 11072 indirdep->ir_state |= ATTACHED; 11073 /* 11074 * Move allocindirs with written pointers to the completehd if 11075 * the indirdep's pointer is not yet written. Otherwise 11076 * free them here. 11077 */ 11078 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) { 11079 LIST_REMOVE(aip, ai_next); 11080 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 11081 LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, 11082 ai_next); 11083 newblk_freefrag(&aip->ai_block); 11084 continue; 11085 } 11086 free_newblk(&aip->ai_block); 11087 } 11088 /* 11089 * Move allocindirs that have finished dependency processing from 11090 * the done list to the write list after updating the pointers. 11091 */ 11092 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 11093 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { 11094 handle_allocindir_partdone(aip); 11095 if (aip == LIST_FIRST(&indirdep->ir_donehd)) 11096 panic("disk_write_complete: not gone"); 11097 chgs = 1; 11098 } 11099 } 11100 /* 11101 * Preserve the indirdep if there were any changes or if it is not 11102 * yet valid on disk. 11103 */ 11104 if (chgs) { 11105 stat_indir_blk_ptrs++; 11106 bdirty(bp); 11107 return (1); 11108 } 11109 /* 11110 * If there were no changes we can discard the savedbp and detach 11111 * ourselves from the buf. We are only carrying completed pointers 11112 * in this case. 11113 */ 11114 sbp = indirdep->ir_savebp; 11115 sbp->b_flags |= B_INVAL | B_NOCACHE; 11116 indirdep->ir_savebp = NULL; 11117 indirdep->ir_bp = NULL; 11118 if (*bpp != NULL) 11119 panic("handle_written_indirdep: bp already exists."); 11120 *bpp = sbp; 11121 /* 11122 * The indirdep may not be freed until its parent points at it. 11123 */ 11124 if (indirdep->ir_state & DEPCOMPLETE) 11125 free_indirdep(indirdep); 11126 11127 return (0); 11128 } 11129 11130 /* 11131 * Process a diradd entry after its dependent inode has been written. 11132 * This routine must be called with splbio interrupts blocked. 11133 */ 11134 static void 11135 diradd_inode_written(dap, inodedep) 11136 struct diradd *dap; 11137 struct inodedep *inodedep; 11138 { 11139 11140 dap->da_state |= COMPLETE; 11141 complete_diradd(dap); 11142 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 11143 } 11144 11145 /* 11146 * Returns true if the bmsafemap will have rollbacks when written. Must 11147 * only be called with lk and the buf lock on the cg held. 
11148 */ 11149 static int 11150 bmsafemap_rollbacks(bmsafemap) 11151 struct bmsafemap *bmsafemap; 11152 { 11153 11154 return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | 11155 !LIST_EMPTY(&bmsafemap->sm_jnewblkhd)); 11156 } 11157 11158 /* 11159 * Re-apply an allocation when a cg write is complete. 11160 */ 11161 static int 11162 jnewblk_rollforward(jnewblk, fs, cgp, blksfree) 11163 struct jnewblk *jnewblk; 11164 struct fs *fs; 11165 struct cg *cgp; 11166 uint8_t *blksfree; 11167 { 11168 ufs1_daddr_t fragno; 11169 ufs2_daddr_t blkno; 11170 long cgbno, bbase; 11171 int frags, blk; 11172 int i; 11173 11174 frags = 0; 11175 cgbno = dtogd(fs, jnewblk->jn_blkno); 11176 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { 11177 if (isclr(blksfree, cgbno + i)) 11178 panic("jnewblk_rollforward: re-allocated fragment"); 11179 frags++; 11180 } 11181 if (frags == fs->fs_frag) { 11182 blkno = fragstoblks(fs, cgbno); 11183 ffs_clrblock(fs, blksfree, (long)blkno); 11184 ffs_clusteracct(fs, cgp, blkno, -1); 11185 cgp->cg_cs.cs_nbfree--; 11186 } else { 11187 bbase = cgbno - fragnum(fs, cgbno); 11188 cgbno += jnewblk->jn_oldfrags; 11189 /* If a complete block had been reassembled, account for it. */ 11190 fragno = fragstoblks(fs, bbase); 11191 if (ffs_isblock(fs, blksfree, fragno)) { 11192 cgp->cg_cs.cs_nffree += fs->fs_frag; 11193 ffs_clusteracct(fs, cgp, fragno, -1); 11194 cgp->cg_cs.cs_nbfree--; 11195 } 11196 /* Decrement the old frags. */ 11197 blk = blkmap(fs, blksfree, bbase); 11198 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 11199 /* Allocate the fragment */ 11200 for (i = 0; i < frags; i++) 11201 clrbit(blksfree, cgbno + i); 11202 cgp->cg_cs.cs_nffree -= frags; 11203 /* Add back in counts associated with the new frags */ 11204 blk = blkmap(fs, blksfree, bbase); 11205 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 11206 } 11207 return (frags); 11208 } 11209 11210 /* 11211 * Complete a write to a bmsafemap structure. Roll forward any bitmap 11212 * changes if it's not a background write. Set all written dependencies 11213 * to DEPCOMPLETE and free the structure if possible. 11214 */ 11215 static int 11216 handle_written_bmsafemap(bmsafemap, bp) 11217 struct bmsafemap *bmsafemap; 11218 struct buf *bp; 11219 { 11220 struct newblk *newblk; 11221 struct inodedep *inodedep; 11222 struct jaddref *jaddref, *jatmp; 11223 struct jnewblk *jnewblk, *jntmp; 11224 struct ufsmount *ump; 11225 uint8_t *inosused; 11226 uint8_t *blksfree; 11227 struct cg *cgp; 11228 struct fs *fs; 11229 ino_t ino; 11230 int chgs; 11231 11232 if ((bmsafemap->sm_state & IOSTARTED) == 0) 11233 panic("handle_written_bmsafemap: Not started\n"); 11234 ump = VFSTOUFS(bmsafemap->sm_list.wk_mp); 11235 chgs = 0; 11236 bmsafemap->sm_state &= ~IOSTARTED; 11237 /* 11238 * Release journal work that was waiting on the write. 11239 */ 11240 handle_jwork(&bmsafemap->sm_freewr); 11241 11242 /* 11243 * Restore unwritten inode allocation pending jaddref writes.
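 * That is, for each jaddref that is still UNDONE the inode bit is set again in the cg's inosused map and the free counts are adjusted, unless this is only the background copy of the buffer.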
11244 */ 11245 if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { 11246 cgp = (struct cg *)bp->b_data; 11247 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 11248 inosused = cg_inosused(cgp); 11249 LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, 11250 ja_bmdeps, jatmp) { 11251 if ((jaddref->ja_state & UNDONE) == 0) 11252 continue; 11253 ino = jaddref->ja_ino % fs->fs_ipg; 11254 if (isset(inosused, ino)) 11255 panic("handle_written_bmsafemap: " 11256 "re-allocated inode"); 11257 if ((bp->b_xflags & BX_BKGRDMARKER) == 0) { 11258 if ((jaddref->ja_mode & IFMT) == IFDIR) 11259 cgp->cg_cs.cs_ndir++; 11260 cgp->cg_cs.cs_nifree--; 11261 setbit(inosused, ino); 11262 chgs = 1; 11263 } 11264 jaddref->ja_state &= ~UNDONE; 11265 jaddref->ja_state |= ATTACHED; 11266 free_jaddref(jaddref); 11267 } 11268 } 11269 /* 11270 * Restore any block allocations which are pending journal writes. 11271 */ 11272 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 11273 cgp = (struct cg *)bp->b_data; 11274 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 11275 blksfree = cg_blksfree(cgp); 11276 LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, 11277 jntmp) { 11278 if ((jnewblk->jn_state & UNDONE) == 0) 11279 continue; 11280 if ((bp->b_xflags & BX_BKGRDMARKER) == 0 && 11281 jnewblk_rollforward(jnewblk, fs, cgp, blksfree)) 11282 chgs = 1; 11283 jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); 11284 jnewblk->jn_state |= ATTACHED; 11285 free_jnewblk(jnewblk); 11286 } 11287 } 11288 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { 11289 newblk->nb_state |= DEPCOMPLETE; 11290 newblk->nb_state &= ~ONDEPLIST; 11291 newblk->nb_bmsafemap = NULL; 11292 LIST_REMOVE(newblk, nb_deps); 11293 if (newblk->nb_list.wk_type == D_ALLOCDIRECT) 11294 handle_allocdirect_partdone( 11295 WK_ALLOCDIRECT(&newblk->nb_list), NULL); 11296 else if (newblk->nb_list.wk_type == D_ALLOCINDIR) 11297 handle_allocindir_partdone( 11298 WK_ALLOCINDIR(&newblk->nb_list)); 11299 else if (newblk->nb_list.wk_type != D_NEWBLK) 11300 panic("handle_written_bmsafemap: Unexpected type: %s", 11301 TYPENAME(newblk->nb_list.wk_type)); 11302 } 11303 while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { 11304 inodedep->id_state |= DEPCOMPLETE; 11305 inodedep->id_state &= ~ONDEPLIST; 11306 LIST_REMOVE(inodedep, id_deps); 11307 inodedep->id_bmsafemap = NULL; 11308 } 11309 LIST_REMOVE(bmsafemap, sm_next); 11310 if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && 11311 LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && 11312 LIST_EMPTY(&bmsafemap->sm_newblkhd) && 11313 LIST_EMPTY(&bmsafemap->sm_inodedephd) && 11314 LIST_EMPTY(&bmsafemap->sm_freehd)) { 11315 LIST_REMOVE(bmsafemap, sm_hash); 11316 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 11317 return (0); 11318 } 11319 LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); 11320 bdirty(bp); 11321 return (1); 11322 } 11323 11324 /* 11325 * Try to free a mkdir dependency. 11326 */ 11327 static void 11328 complete_mkdir(mkdir) 11329 struct mkdir *mkdir; 11330 { 11331 struct diradd *dap; 11332 11333 if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) 11334 return; 11335 LIST_REMOVE(mkdir, md_mkdirs); 11336 dap = mkdir->md_diradd; 11337 dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 11338 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { 11339 dap->da_state |= DEPCOMPLETE; 11340 complete_diradd(dap); 11341 } 11342 WORKITEM_FREE(mkdir, D_MKDIR); 11343 } 11344 11345 /* 11346 * Handle the completion of a mkdir dependency. 
11347 */ 11348 static void 11349 handle_written_mkdir(mkdir, type) 11350 struct mkdir *mkdir; 11351 int type; 11352 { 11353 11354 if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) 11355 panic("handle_written_mkdir: bad type"); 11356 mkdir->md_state |= COMPLETE; 11357 complete_mkdir(mkdir); 11358 } 11359 11360 static int 11361 free_pagedep(pagedep) 11362 struct pagedep *pagedep; 11363 { 11364 int i; 11365 11366 if (pagedep->pd_state & NEWBLOCK) 11367 return (0); 11368 if (!LIST_EMPTY(&pagedep->pd_dirremhd)) 11369 return (0); 11370 for (i = 0; i < DAHASHSZ; i++) 11371 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) 11372 return (0); 11373 if (!LIST_EMPTY(&pagedep->pd_pendinghd)) 11374 return (0); 11375 if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) 11376 return (0); 11377 if (pagedep->pd_state & ONWORKLIST) 11378 WORKLIST_REMOVE(&pagedep->pd_list); 11379 LIST_REMOVE(pagedep, pd_hash); 11380 WORKITEM_FREE(pagedep, D_PAGEDEP); 11381 11382 return (1); 11383 } 11384 11385 /* 11386 * Called from within softdep_disk_write_complete above. 11387 * A write operation was just completed. Removed inodes can 11388 * now be freed and associated block pointers may be committed. 11389 * Note that this routine is always called from interrupt level 11390 * with further splbio interrupts blocked. 11391 */ 11392 static int 11393 handle_written_filepage(pagedep, bp) 11394 struct pagedep *pagedep; 11395 struct buf *bp; /* buffer containing the written page */ 11396 { 11397 struct dirrem *dirrem; 11398 struct diradd *dap, *nextdap; 11399 struct direct *ep; 11400 int i, chgs; 11401 11402 if ((pagedep->pd_state & IOSTARTED) == 0) 11403 panic("handle_written_filepage: not started"); 11404 pagedep->pd_state &= ~IOSTARTED; 11405 /* 11406 * Process any directory removals that have been committed. 11407 */ 11408 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { 11409 LIST_REMOVE(dirrem, dm_next); 11410 dirrem->dm_state |= COMPLETE; 11411 dirrem->dm_dirinum = pagedep->pd_ino; 11412 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), 11413 ("handle_written_filepage: Journal entries not written.")); 11414 add_to_worklist(&dirrem->dm_list, 0); 11415 } 11416 /* 11417 * Free any directory additions that have been committed. 11418 * If it is a newly allocated block, we have to wait until 11419 * the on-disk directory inode claims the new block. 11420 */ 11421 if ((pagedep->pd_state & NEWBLOCK) == 0) 11422 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 11423 free_diradd(dap, NULL); 11424 /* 11425 * Uncommitted directory entries must be restored. 11426 */ 11427 for (chgs = 0, i = 0; i < DAHASHSZ; i++) { 11428 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; 11429 dap = nextdap) { 11430 nextdap = LIST_NEXT(dap, da_pdlist); 11431 if (dap->da_state & ATTACHED) 11432 panic("handle_written_filepage: attached"); 11433 ep = (struct direct *) 11434 ((char *)bp->b_data + dap->da_offset); 11435 ep->d_ino = dap->da_newinum; 11436 dap->da_state &= ~UNDONE; 11437 dap->da_state |= ATTACHED; 11438 chgs = 1; 11439 /* 11440 * If the inode referenced by the directory has 11441 * been written out, then the dependency can be 11442 * moved to the pending list. 11443 */ 11444 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 11445 LIST_REMOVE(dap, da_pdlist); 11446 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, 11447 da_pdlist); 11448 } 11449 } 11450 } 11451 /* 11452 * If there were any rollbacks in the directory, then it must be 11453 * marked dirty so that it will eventually get written back in 11454 * its correct form.
11455 */ 11456 if (chgs) { 11457 if ((bp->b_flags & B_DELWRI) == 0) 11458 stat_dir_entry++; 11459 bdirty(bp); 11460 return (1); 11461 } 11462 /* 11463 * If we are not waiting for a new directory block to be 11464 * claimed by its inode, then the pagedep will be freed. 11465 * Otherwise it will remain to track any new entries on 11466 * the page in case they are fsync'ed. 11467 */ 11468 free_pagedep(pagedep); 11469 return (0); 11470 } 11471 11472 /* 11473 * Writing back in-core inode structures. 11474 * 11475 * The filesystem only accesses an inode's contents when it occupies an 11476 * "in-core" inode structure. These "in-core" structures are separate from 11477 * the page frames used to cache inode blocks. Only the latter are 11478 * transferred to/from the disk. So, when the updated contents of the 11479 * "in-core" inode structure are copied to the corresponding in-memory inode 11480 * block, the dependencies are also transferred. The following procedure is 11481 * called when copying a dirty "in-core" inode to a cached inode block. 11482 */ 11483 11484 /* 11485 * Called when an inode is loaded from disk. If the effective link count 11486 * differed from the actual link count when it was last flushed, then we 11487 * need to ensure that the correct effective link count is put back. 11488 */ 11489 void 11490 softdep_load_inodeblock(ip) 11491 struct inode *ip; /* the "in_core" copy of the inode */ 11492 { 11493 struct inodedep *inodedep; 11494 11495 /* 11496 * Check for alternate nlink count. 11497 */ 11498 ip->i_effnlink = ip->i_nlink; 11499 ACQUIRE_LOCK(&lk); 11500 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 11501 &inodedep) == 0) { 11502 FREE_LOCK(&lk); 11503 return; 11504 } 11505 ip->i_effnlink -= inodedep->id_nlinkdelta; 11506 FREE_LOCK(&lk); 11507 } 11508 11509 /* 11510 * This routine is called just before the "in-core" inode 11511 * information is to be copied to the in-memory inode block. 11512 * Recall that an inode block contains several inodes. If 11513 * the force flag is set, then the dependencies will be 11514 * cleared so that the update can always be made. Note that 11515 * the buffer is locked when this routine is called, so we 11516 * will never be in the middle of writing the inode block 11517 * to disk. 11518 */ 11519 void 11520 softdep_update_inodeblock(ip, bp, waitfor) 11521 struct inode *ip; /* the "in_core" copy of the inode */ 11522 struct buf *bp; /* the buffer containing the inode block */ 11523 int waitfor; /* nonzero => update must be allowed */ 11524 { 11525 struct inodedep *inodedep; 11526 struct inoref *inoref; 11527 struct worklist *wk; 11528 struct mount *mp; 11529 struct buf *ibp; 11530 struct fs *fs; 11531 int error; 11532 11533 mp = UFSTOVFS(ip->i_ump); 11534 fs = ip->i_fs; 11535 /* 11536 * Preserve the freelink that is on disk. clear_unlinked_inodedep() 11537 * does not have access to the in-core ip so must write directly into 11538 * the inode block buffer when setting freelink. 11539 */ 11540 if (fs->fs_magic == FS_UFS1_MAGIC) 11541 DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data + 11542 ino_to_fsbo(fs, ip->i_number))->di_freelink); 11543 else 11544 DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data + 11545 ino_to_fsbo(fs, ip->i_number))->di_freelink); 11546 /* 11547 * If the effective link count is not equal to the actual link 11548 * count, then we must track the difference in an inodedep while 11549 * the inode is (potentially) tossed out of the cache. 
Otherwise, 11550 * if there is no existing inodedep, then there are no dependencies 11551 * to track. 11552 */ 11553 ACQUIRE_LOCK(&lk); 11554 again: 11555 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 11556 FREE_LOCK(&lk); 11557 if (ip->i_effnlink != ip->i_nlink) 11558 panic("softdep_update_inodeblock: bad link count"); 11559 return; 11560 } 11561 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) 11562 panic("softdep_update_inodeblock: bad delta"); 11563 /* 11564 * If we're flushing all dependencies we must also move any waiting 11565 * for journal writes onto the bufwait list prior to I/O. 11566 */ 11567 if (waitfor) { 11568 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 11569 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 11570 == DEPCOMPLETE) { 11571 jwait(&inoref->if_list, MNT_WAIT); 11572 goto again; 11573 } 11574 } 11575 } 11576 /* 11577 * Changes have been initiated. Anything depending on these 11578 * changes cannot occur until this inode has been written. 11579 */ 11580 inodedep->id_state &= ~COMPLETE; 11581 if ((inodedep->id_state & ONWORKLIST) == 0) 11582 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); 11583 /* 11584 * Any new dependencies associated with the incore inode must 11585 * now be moved to the list associated with the buffer holding 11586 * the in-memory copy of the inode. Once merged process any 11587 * allocdirects that are completed by the merger. 11588 */ 11589 merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); 11590 if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) 11591 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), 11592 NULL); 11593 merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); 11594 if (!TAILQ_EMPTY(&inodedep->id_extupdt)) 11595 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), 11596 NULL); 11597 /* 11598 * Now that the inode has been pushed into the buffer, the 11599 * operations dependent on the inode being written to disk 11600 * can be moved to the id_bufwait so that they will be 11601 * processed when the buffer I/O completes. 11602 */ 11603 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { 11604 WORKLIST_REMOVE(wk); 11605 WORKLIST_INSERT(&inodedep->id_bufwait, wk); 11606 } 11607 /* 11608 * Newly allocated inodes cannot be written until the bitmap 11609 * that allocates them has been written (indicated by 11610 * DEPCOMPLETE being set in id_state). If we are doing a 11611 * forced sync (e.g., an fsync on a file), we force the bitmap 11612 * to be written so that the update can be done. 11613 */ 11614 if (waitfor == 0) { 11615 FREE_LOCK(&lk); 11616 return; 11617 } 11618 retry: 11619 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) { 11620 FREE_LOCK(&lk); 11621 return; 11622 } 11623 ibp = inodedep->id_bmsafemap->sm_buf; 11624 ibp = getdirtybuf(ibp, &lk, MNT_WAIT); 11625 if (ibp == NULL) { 11626 /* 11627 * If ibp came back as NULL, the dependency could have been 11628 * freed while we slept. Look it up again, and check to see 11629 * that it has completed. 11630 */ 11631 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 11632 goto retry; 11633 FREE_LOCK(&lk); 11634 return; 11635 } 11636 FREE_LOCK(&lk); 11637 if ((error = bwrite(ibp)) != 0) 11638 softdep_error("softdep_update_inodeblock: bwrite", error); 11639 } 11640 11641 /* 11642 * Merge a new inode dependency list (such as id_newinoupdt) into an 11643 * old inode dependency list (such as id_inoupdt). This routine must be 11644 * called with splbio interrupts blocked.
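 * For example (illustrative offsets only): merging a new list with entries at offsets {0, 2} into an old list {1, 2, 5} leaves the old list ordered as {0, 1, 2, 5}, with the two offset-2 entries combined by allocdirect_merge().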
11645 */ 11646 static void 11647 merge_inode_lists(newlisthead, oldlisthead) 11648 struct allocdirectlst *newlisthead; 11649 struct allocdirectlst *oldlisthead; 11650 { 11651 struct allocdirect *listadp, *newadp; 11652 11653 newadp = TAILQ_FIRST(newlisthead); 11654 for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { 11655 if (listadp->ad_offset < newadp->ad_offset) { 11656 listadp = TAILQ_NEXT(listadp, ad_next); 11657 continue; 11658 } 11659 TAILQ_REMOVE(newlisthead, newadp, ad_next); 11660 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); 11661 if (listadp->ad_offset == newadp->ad_offset) { 11662 allocdirect_merge(oldlisthead, newadp, 11663 listadp); 11664 listadp = newadp; 11665 } 11666 newadp = TAILQ_FIRST(newlisthead); 11667 } 11668 while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { 11669 TAILQ_REMOVE(newlisthead, newadp, ad_next); 11670 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); 11671 } 11672 } 11673 11674 /* 11675 * If we are doing an fsync, then we must ensure that any directory 11676 * entries for the inode have been written after the inode gets to disk. 11677 */ 11678 int 11679 softdep_fsync(vp) 11680 struct vnode *vp; /* the "in_core" copy of the inode */ 11681 { 11682 struct inodedep *inodedep; 11683 struct pagedep *pagedep; 11684 struct inoref *inoref; 11685 struct worklist *wk; 11686 struct diradd *dap; 11687 struct mount *mp; 11688 struct vnode *pvp; 11689 struct inode *ip; 11690 struct buf *bp; 11691 struct fs *fs; 11692 struct thread *td = curthread; 11693 int error, flushparent, pagedep_new_block; 11694 ino_t parentino; 11695 ufs_lbn_t lbn; 11696 11697 ip = VTOI(vp); 11698 fs = ip->i_fs; 11699 mp = vp->v_mount; 11700 ACQUIRE_LOCK(&lk); 11701 restart: 11702 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 11703 FREE_LOCK(&lk); 11704 return (0); 11705 } 11706 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 11707 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 11708 == DEPCOMPLETE) { 11709 jwait(&inoref->if_list, MNT_WAIT); 11710 goto restart; 11711 } 11712 } 11713 if (!LIST_EMPTY(&inodedep->id_inowait) || 11714 !TAILQ_EMPTY(&inodedep->id_extupdt) || 11715 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 11716 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 11717 !TAILQ_EMPTY(&inodedep->id_newinoupdt)) 11718 panic("softdep_fsync: pending ops %p", inodedep); 11719 for (error = 0, flushparent = 0; ; ) { 11720 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) 11721 break; 11722 if (wk->wk_type != D_DIRADD) 11723 panic("softdep_fsync: Unexpected type %s", 11724 TYPENAME(wk->wk_type)); 11725 dap = WK_DIRADD(wk); 11726 /* 11727 * Flush our parent if this directory entry has a MKDIR_PARENT 11728 * dependency or is contained in a newly allocated block. 11729 */ 11730 if (dap->da_state & DIRCHG) 11731 pagedep = dap->da_previous->dm_pagedep; 11732 else 11733 pagedep = dap->da_pagedep; 11734 parentino = pagedep->pd_ino; 11735 lbn = pagedep->pd_lbn; 11736 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) 11737 panic("softdep_fsync: dirty"); 11738 if ((dap->da_state & MKDIR_PARENT) || 11739 (pagedep->pd_state & NEWBLOCK)) 11740 flushparent = 1; 11741 else 11742 flushparent = 0; 11743 /* 11744 * If we are being fsync'ed as part of vgone'ing this vnode, 11745 * then we will not be able to release and recover the 11746 * vnode below, so we just have to give up on writing its 11747 * directory entry out. It will eventually be written, just 11748 * not now, but then the user was not asking to have it 11749 * written, so we are not breaking any promises. 
11750 */ 11751 if (vp->v_iflag & VI_DOOMED) 11752 break; 11753 /* 11754 * We prevent deadlock by always fetching inodes from the 11755 * root, moving down the directory tree. Thus, when fetching 11756 * our parent directory, we first try to get the lock. If 11757 * that fails, we must unlock ourselves before requesting 11758 * the lock on our parent. See the comment in ufs_lookup 11759 * for details on possible races. 11760 */ 11761 FREE_LOCK(&lk); 11762 if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, 11763 FFSV_FORCEINSMQ)) { 11764 error = vfs_busy(mp, MBF_NOWAIT); 11765 if (error != 0) { 11766 vfs_ref(mp); 11767 VOP_UNLOCK(vp, 0); 11768 error = vfs_busy(mp, 0); 11769 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 11770 vfs_rel(mp); 11771 if (error != 0) 11772 return (ENOENT); 11773 if (vp->v_iflag & VI_DOOMED) { 11774 vfs_unbusy(mp); 11775 return (ENOENT); 11776 } 11777 } 11778 VOP_UNLOCK(vp, 0); 11779 error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, 11780 &pvp, FFSV_FORCEINSMQ); 11781 vfs_unbusy(mp); 11782 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 11783 if (vp->v_iflag & VI_DOOMED) { 11784 if (error == 0) 11785 vput(pvp); 11786 error = ENOENT; 11787 } 11788 if (error != 0) 11789 return (error); 11790 } 11791 /* 11792 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps 11793 * that are contained in direct blocks will be resolved by 11794 * doing a ffs_update. Pagedeps contained in indirect blocks 11795 * may require a complete sync'ing of the directory. So, we 11796 * try the cheap and fast ffs_update first, and if that fails, 11797 * then we do the slower ffs_syncvnode of the directory. 11798 */ 11799 if (flushparent) { 11800 int locked; 11801 11802 if ((error = ffs_update(pvp, 1)) != 0) { 11803 vput(pvp); 11804 return (error); 11805 } 11806 ACQUIRE_LOCK(&lk); 11807 locked = 1; 11808 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { 11809 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { 11810 if (wk->wk_type != D_DIRADD) 11811 panic("softdep_fsync: Unexpected type %s", 11812 TYPENAME(wk->wk_type)); 11813 dap = WK_DIRADD(wk); 11814 if (dap->da_state & DIRCHG) 11815 pagedep = dap->da_previous->dm_pagedep; 11816 else 11817 pagedep = dap->da_pagedep; 11818 pagedep_new_block = pagedep->pd_state & NEWBLOCK; 11819 FREE_LOCK(&lk); 11820 locked = 0; 11821 if (pagedep_new_block && (error = 11822 ffs_syncvnode(pvp, MNT_WAIT, 0))) { 11823 vput(pvp); 11824 return (error); 11825 } 11826 } 11827 } 11828 if (locked) 11829 FREE_LOCK(&lk); 11830 } 11831 /* 11832 * Flush directory page containing the inode's name. 11833 */ 11834 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, 11835 &bp); 11836 if (error == 0) 11837 error = bwrite(bp); 11838 else 11839 brelse(bp); 11840 vput(pvp); 11841 if (error != 0) 11842 return (error); 11843 ACQUIRE_LOCK(&lk); 11844 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 11845 break; 11846 } 11847 FREE_LOCK(&lk); 11848 return (0); 11849 } 11850 11851 /* 11852 * Flush all the dirty bitmaps associated with the block device 11853 * before flushing the rest of the dirty blocks so as to reduce 11854 * the number of dependencies that will have to be rolled back. 11855 * 11856 * XXX Unused? 
11857 */ 11858 void 11859 softdep_fsync_mountdev(vp) 11860 struct vnode *vp; 11861 { 11862 struct buf *bp, *nbp; 11863 struct worklist *wk; 11864 struct bufobj *bo; 11865 11866 if (!vn_isdisk(vp, NULL)) 11867 panic("softdep_fsync_mountdev: vnode not a disk"); 11868 bo = &vp->v_bufobj; 11869 restart: 11870 BO_LOCK(bo); 11871 ACQUIRE_LOCK(&lk); 11872 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 11873 /* 11874 * If it is already scheduled, skip to the next buffer. 11875 */ 11876 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) 11877 continue; 11878 11879 if ((bp->b_flags & B_DELWRI) == 0) 11880 panic("softdep_fsync_mountdev: not dirty"); 11881 /* 11882 * We are only interested in bitmaps with outstanding 11883 * dependencies. 11884 */ 11885 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || 11886 wk->wk_type != D_BMSAFEMAP || 11887 (bp->b_vflags & BV_BKGRDINPROG)) { 11888 BUF_UNLOCK(bp); 11889 continue; 11890 } 11891 FREE_LOCK(&lk); 11892 BO_UNLOCK(bo); 11893 bremfree(bp); 11894 (void) bawrite(bp); 11895 goto restart; 11896 } 11897 FREE_LOCK(&lk); 11898 drain_output(vp); 11899 BO_UNLOCK(bo); 11900 } 11901 11902 /* 11903 * Sync all cylinder groups that were dirty at the time this function is 11904 * called. Newly dirtied cgs will be inserted before the sentinel. This 11905 * is used to flush freedep activity that may be holding up writes to an 11906 * indirect block. 11907 */ 11908 static int 11909 sync_cgs(mp, waitfor) 11910 struct mount *mp; 11911 int waitfor; 11912 { 11913 struct bmsafemap *bmsafemap; 11914 struct bmsafemap *sentinel; 11915 struct ufsmount *ump; 11916 struct buf *bp; 11917 int error; 11918 11919 sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK); 11920 sentinel->sm_cg = -1; 11921 ump = VFSTOUFS(mp); 11922 error = 0; 11923 ACQUIRE_LOCK(&lk); 11924 LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next); 11925 for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL; 11926 bmsafemap = LIST_NEXT(sentinel, sm_next)) { 11927 /* Skip sentinels and cgs with no work to release. */ 11928 if (bmsafemap->sm_cg == -1 || 11929 (LIST_EMPTY(&bmsafemap->sm_freehd) && 11930 LIST_EMPTY(&bmsafemap->sm_freewr))) { 11931 LIST_REMOVE(sentinel, sm_next); 11932 LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next); 11933 continue; 11934 } 11935 /* 11936 * If we don't get the lock and we're waiting try again, if 11937 * not, move on to the next buf and try to sync it. 11938 */ 11939 bp = getdirtybuf(bmsafemap->sm_buf, &lk, waitfor); 11940 if (bp == NULL && waitfor == MNT_WAIT) 11941 continue; 11942 LIST_REMOVE(sentinel, sm_next); 11943 LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next); 11944 if (bp == NULL) 11945 continue; 11946 FREE_LOCK(&lk); 11947 if (waitfor == MNT_NOWAIT) 11948 bawrite(bp); 11949 else 11950 error = bwrite(bp); 11951 ACQUIRE_LOCK(&lk); 11952 if (error) 11953 break; 11954 } 11955 LIST_REMOVE(sentinel, sm_next); 11956 FREE_LOCK(&lk); 11957 free(sentinel, M_BMSAFEMAP); 11958 return (error); 11959 } 11960 11961 /* 11962 * This routine is called when we are trying to synchronously flush a 11963 * file. This routine must eliminate any filesystem metadata dependencies 11964 * so that the syncing routine can succeed. 11965 */ 11966 int 11967 softdep_sync_metadata(struct vnode *vp) 11968 { 11969 int error; 11970 11971 /* 11972 * Ensure that any direct block dependencies have been cleared, 11973 * truncations are started, and inode references are journaled.
11974 */ 11975 ACQUIRE_LOCK(&lk); 11976 /* 11977 * Write all journal records to prevent rollbacks on devvp. 11978 */ 11979 if (vp->v_type == VCHR) 11980 softdep_flushjournal(vp->v_mount); 11981 error = flush_inodedep_deps(vp, vp->v_mount, VTOI(vp)->i_number); 11982 /* 11983 * Ensure that all truncates are written so we won't find deps on 11984 * indirect blocks. 11985 */ 11986 process_truncates(vp); 11987 FREE_LOCK(&lk); 11988 11989 return (error); 11990 } 11991 11992 /* 11993 * This routine is called when we are attempting to sync a buf with 11994 * dependencies. If waitfor is MNT_NOWAIT it attempts to schedule any 11995 * other IO it can but returns EBUSY if the buffer is not yet able to 11996 * be written. Dependencies which will not cause rollbacks will always 11997 * return 0. 11998 */ 11999 int 12000 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) 12001 { 12002 struct indirdep *indirdep; 12003 struct pagedep *pagedep; 12004 struct allocindir *aip; 12005 struct newblk *newblk; 12006 struct buf *nbp; 12007 struct worklist *wk; 12008 int i, error; 12009 12010 /* 12011 * For VCHR we just don't want to force flush any dependencies that 12012 * will cause rollbacks. 12013 */ 12014 if (vp->v_type == VCHR) { 12015 if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0)) 12016 return (EBUSY); 12017 return (0); 12018 } 12019 ACQUIRE_LOCK(&lk); 12020 /* 12021 * As we hold the buffer locked, none of its dependencies 12022 * will disappear. 12023 */ 12024 error = 0; 12025 top: 12026 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 12027 switch (wk->wk_type) { 12028 12029 case D_ALLOCDIRECT: 12030 case D_ALLOCINDIR: 12031 newblk = WK_NEWBLK(wk); 12032 if (newblk->nb_jnewblk != NULL) { 12033 if (waitfor == MNT_NOWAIT) { 12034 error = EBUSY; 12035 goto out_unlock; 12036 } 12037 jwait(&newblk->nb_jnewblk->jn_list, waitfor); 12038 goto top; 12039 } 12040 if (newblk->nb_state & DEPCOMPLETE || 12041 waitfor == MNT_NOWAIT) 12042 continue; 12043 nbp = newblk->nb_bmsafemap->sm_buf; 12044 nbp = getdirtybuf(nbp, &lk, waitfor); 12045 if (nbp == NULL) 12046 goto top; 12047 FREE_LOCK(&lk); 12048 if ((error = bwrite(nbp)) != 0) 12049 goto out; 12050 ACQUIRE_LOCK(&lk); 12051 continue; 12052 12053 case D_INDIRDEP: 12054 indirdep = WK_INDIRDEP(wk); 12055 if (waitfor == MNT_NOWAIT) { 12056 if (!TAILQ_EMPTY(&indirdep->ir_trunc) || 12057 !LIST_EMPTY(&indirdep->ir_deplisthd)) { 12058 error = EBUSY; 12059 goto out_unlock; 12060 } 12061 } 12062 if (!TAILQ_EMPTY(&indirdep->ir_trunc)) 12063 panic("softdep_sync_buf: truncation pending."); 12064 restart: 12065 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { 12066 newblk = (struct newblk *)aip; 12067 if (newblk->nb_jnewblk != NULL) { 12068 jwait(&newblk->nb_jnewblk->jn_list, 12069 waitfor); 12070 goto restart; 12071 } 12072 if (newblk->nb_state & DEPCOMPLETE) 12073 continue; 12074 nbp = newblk->nb_bmsafemap->sm_buf; 12075 nbp = getdirtybuf(nbp, &lk, waitfor); 12076 if (nbp == NULL) 12077 goto restart; 12078 FREE_LOCK(&lk); 12079 if ((error = bwrite(nbp)) != 0) 12080 goto out; 12081 ACQUIRE_LOCK(&lk); 12082 goto restart; 12083 } 12084 continue; 12085 12086 case D_PAGEDEP: 12087 /* 12088 * Only flush directory entries in synchronous passes. 12089 */ 12090 if (waitfor != MNT_WAIT) { 12091 error = EBUSY; 12092 goto out_unlock; 12093 } 12094 /* 12095 * While syncing snapshots, we must allow recursive 12096 * lookups. 
12097 */ 12098 BUF_AREC(bp); 12099 /* 12100 * We are trying to sync a directory that may 12101 * have dependencies on both its own metadata 12102 * and/or dependencies on the inodes of any 12103 * recently allocated files. We walk its diradd 12104 * lists pushing out the associated inode. 12105 */ 12106 pagedep = WK_PAGEDEP(wk); 12107 for (i = 0; i < DAHASHSZ; i++) { 12108 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) 12109 continue; 12110 if ((error = flush_pagedep_deps(vp, wk->wk_mp, 12111 &pagedep->pd_diraddhd[i]))) { 12112 BUF_NOREC(bp); 12113 goto out_unlock; 12114 } 12115 } 12116 BUF_NOREC(bp); 12117 continue; 12118 12119 case D_FREEWORK: 12120 case D_FREEDEP: 12121 case D_JSEGDEP: 12122 case D_JNEWBLK: 12123 continue; 12124 12125 default: 12126 panic("softdep_sync_buf: Unknown type %s", 12127 TYPENAME(wk->wk_type)); 12128 /* NOTREACHED */ 12129 } 12130 } 12131 out_unlock: 12132 FREE_LOCK(&lk); 12133 out: 12134 return (error); 12135 } 12136 12137 /* 12138 * Flush the dependencies associated with an inodedep. 12139 * Called with splbio blocked. 12140 */ 12141 static int 12142 flush_inodedep_deps(vp, mp, ino) 12143 struct vnode *vp; 12144 struct mount *mp; 12145 ino_t ino; 12146 { 12147 struct inodedep *inodedep; 12148 struct inoref *inoref; 12149 int error, waitfor; 12150 12151 /* 12152 * This work is done in two passes. The first pass grabs most 12153 * of the buffers and begins asynchronously writing them. The 12154 * only way to wait for these asynchronous writes is to sleep 12155 * on the filesystem vnode which may stay busy for a long time 12156 * if the filesystem is active. So, instead, we make a second 12157 * pass over the dependencies blocking on each write. In the 12158 * usual case we will be blocking against a write that we 12159 * initiated, so when it is done the dependency will have been 12160 * resolved. Thus the second pass is expected to end quickly. 12161 * We give a brief window at the top of the loop to allow 12162 * any pending I/O to complete. 12163 */ 12164 for (error = 0, waitfor = MNT_NOWAIT; ; ) { 12165 if (error) 12166 return (error); 12167 FREE_LOCK(&lk); 12168 ACQUIRE_LOCK(&lk); 12169 restart: 12170 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) 12171 return (0); 12172 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 12173 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 12174 == DEPCOMPLETE) { 12175 jwait(&inoref->if_list, MNT_WAIT); 12176 goto restart; 12177 } 12178 } 12179 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || 12180 flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || 12181 flush_deplist(&inodedep->id_extupdt, waitfor, &error) || 12182 flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) 12183 continue; 12184 /* 12185 * If pass2, we are done, otherwise do pass 2. 12186 */ 12187 if (waitfor == MNT_WAIT) 12188 break; 12189 waitfor = MNT_WAIT; 12190 } 12191 /* 12192 * Try freeing inodedep in case all dependencies have been removed. 12193 */ 12194 if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) 12195 (void) free_inodedep(inodedep); 12196 return (0); 12197 } 12198 12199 /* 12200 * Flush an inode dependency list. 12201 * Called with splbio blocked. 
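 * Roughly: it returns nonzero if it had to sleep or start a write, in which case the caller should rescan the list from the beginning; it returns zero when it found nothing left that it could flush.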
12202 */ 12203 static int 12204 flush_deplist(listhead, waitfor, errorp) 12205 struct allocdirectlst *listhead; 12206 int waitfor; 12207 int *errorp; 12208 { 12209 struct allocdirect *adp; 12210 struct newblk *newblk; 12211 struct buf *bp; 12212 12213 mtx_assert(&lk, MA_OWNED); 12214 TAILQ_FOREACH(adp, listhead, ad_next) { 12215 newblk = (struct newblk *)adp; 12216 if (newblk->nb_jnewblk != NULL) { 12217 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 12218 return (1); 12219 } 12220 if (newblk->nb_state & DEPCOMPLETE) 12221 continue; 12222 bp = newblk->nb_bmsafemap->sm_buf; 12223 bp = getdirtybuf(bp, &lk, waitfor); 12224 if (bp == NULL) { 12225 if (waitfor == MNT_NOWAIT) 12226 continue; 12227 return (1); 12228 } 12229 FREE_LOCK(&lk); 12230 if (waitfor == MNT_NOWAIT) 12231 bawrite(bp); 12232 else 12233 *errorp = bwrite(bp); 12234 ACQUIRE_LOCK(&lk); 12235 return (1); 12236 } 12237 return (0); 12238 } 12239 12240 /* 12241 * Flush dependencies associated with an allocdirect block. 12242 */ 12243 static int 12244 flush_newblk_dep(vp, mp, lbn) 12245 struct vnode *vp; 12246 struct mount *mp; 12247 ufs_lbn_t lbn; 12248 { 12249 struct newblk *newblk; 12250 struct bufobj *bo; 12251 struct inode *ip; 12252 struct buf *bp; 12253 ufs2_daddr_t blkno; 12254 int error; 12255 12256 error = 0; 12257 bo = &vp->v_bufobj; 12258 ip = VTOI(vp); 12259 blkno = DIP(ip, i_db[lbn]); 12260 if (blkno == 0) 12261 panic("flush_newblk_dep: Missing block"); 12262 ACQUIRE_LOCK(&lk); 12263 /* 12264 * Loop until all dependencies related to this block are satisfied. 12265 * We must be careful to restart after each sleep in case a write 12266 * completes some part of this process for us. 12267 */ 12268 for (;;) { 12269 if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { 12270 FREE_LOCK(&lk); 12271 break; 12272 } 12273 if (newblk->nb_list.wk_type != D_ALLOCDIRECT) 12274 panic("flush_newblk_deps: Bad newblk %p", newblk); 12275 /* 12276 * Flush the journal. 12277 */ 12278 if (newblk->nb_jnewblk != NULL) { 12279 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 12280 continue; 12281 } 12282 /* 12283 * Write the bitmap dependency. 12284 */ 12285 if ((newblk->nb_state & DEPCOMPLETE) == 0) { 12286 bp = newblk->nb_bmsafemap->sm_buf; 12287 bp = getdirtybuf(bp, &lk, MNT_WAIT); 12288 if (bp == NULL) 12289 continue; 12290 FREE_LOCK(&lk); 12291 error = bwrite(bp); 12292 if (error) 12293 break; 12294 ACQUIRE_LOCK(&lk); 12295 continue; 12296 } 12297 /* 12298 * Write the buffer. 12299 */ 12300 FREE_LOCK(&lk); 12301 BO_LOCK(bo); 12302 bp = gbincore(bo, lbn); 12303 if (bp != NULL) { 12304 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 12305 LK_INTERLOCK, BO_MTX(bo)); 12306 if (error == ENOLCK) { 12307 ACQUIRE_LOCK(&lk); 12308 continue; /* Slept, retry */ 12309 } 12310 if (error != 0) 12311 break; /* Failed */ 12312 if (bp->b_flags & B_DELWRI) { 12313 bremfree(bp); 12314 error = bwrite(bp); 12315 if (error) 12316 break; 12317 } else 12318 BUF_UNLOCK(bp); 12319 } else 12320 BO_UNLOCK(bo); 12321 /* 12322 * We have to wait for the direct pointers to 12323 * point at the newdirblk before the dependency 12324 * will go away. 12325 */ 12326 error = ffs_update(vp, 1); 12327 if (error) 12328 break; 12329 ACQUIRE_LOCK(&lk); 12330 } 12331 return (error); 12332 } 12333 12334 /* 12335 * Eliminate a pagedep dependency by flushing out all its diradd dependencies. 12336 * Called with splbio blocked. 
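 * For each diradd this may involve flushing the parent directory itself (MKDIR_PARENT), flushing the new child directory so its "." and ".." entries are on disk (MKDIR_BODY), and finally writing the bitmap and inode buffers so the referenced inode's link count is stable on disk.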
12337 */ 12338 static int 12339 flush_pagedep_deps(pvp, mp, diraddhdp) 12340 struct vnode *pvp; 12341 struct mount *mp; 12342 struct diraddhd *diraddhdp; 12343 { 12344 struct inodedep *inodedep; 12345 struct inoref *inoref; 12346 struct ufsmount *ump; 12347 struct diradd *dap; 12348 struct vnode *vp; 12349 int error = 0; 12350 struct buf *bp; 12351 ino_t inum; 12352 12353 ump = VFSTOUFS(mp); 12354 restart: 12355 while ((dap = LIST_FIRST(diraddhdp)) != NULL) { 12356 /* 12357 * Flush ourselves if this directory entry 12358 * has a MKDIR_PARENT dependency. 12359 */ 12360 if (dap->da_state & MKDIR_PARENT) { 12361 FREE_LOCK(&lk); 12362 if ((error = ffs_update(pvp, 1)) != 0) 12363 break; 12364 ACQUIRE_LOCK(&lk); 12365 /* 12366 * If that cleared dependencies, go on to next. 12367 */ 12368 if (dap != LIST_FIRST(diraddhdp)) 12369 continue; 12370 if (dap->da_state & MKDIR_PARENT) 12371 panic("flush_pagedep_deps: MKDIR_PARENT"); 12372 } 12373 /* 12374 * A newly allocated directory must have its "." and 12375 * ".." entries written out before its name can be 12376 * committed in its parent. 12377 */ 12378 inum = dap->da_newinum; 12379 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 12380 panic("flush_pagedep_deps: lost inode1"); 12381 /* 12382 * Wait for any pending journal adds to complete so we don't 12383 * cause rollbacks while syncing. 12384 */ 12385 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 12386 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 12387 == DEPCOMPLETE) { 12388 jwait(&inoref->if_list, MNT_WAIT); 12389 goto restart; 12390 } 12391 } 12392 if (dap->da_state & MKDIR_BODY) { 12393 FREE_LOCK(&lk); 12394 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 12395 FFSV_FORCEINSMQ))) 12396 break; 12397 error = flush_newblk_dep(vp, mp, 0); 12398 /* 12399 * If we still have the dependency we might need to 12400 * update the vnode to sync the new link count to 12401 * disk. 12402 */ 12403 if (error == 0 && dap == LIST_FIRST(diraddhdp)) 12404 error = ffs_update(vp, 1); 12405 vput(vp); 12406 if (error != 0) 12407 break; 12408 ACQUIRE_LOCK(&lk); 12409 /* 12410 * If that cleared dependencies, go on to next. 12411 */ 12412 if (dap != LIST_FIRST(diraddhdp)) 12413 continue; 12414 if (dap->da_state & MKDIR_BODY) { 12415 inodedep_lookup(UFSTOVFS(ump), inum, 0, 12416 &inodedep); 12417 panic("flush_pagedep_deps: MKDIR_BODY " 12418 "inodedep %p dap %p vp %p", 12419 inodedep, dap, vp); 12420 } 12421 } 12422 /* 12423 * Flush the inode on which the directory entry depends. 12424 * Having accounted for MKDIR_PARENT and MKDIR_BODY above, 12425 * the only remaining dependency is that the updated inode 12426 * count must get pushed to disk. The inode has already 12427 * been pushed into its inode buffer (via VOP_UPDATE) at 12428 * the time of the reference count change. So we need only 12429 * locate that buffer, ensure that there will be no rollback 12430 * caused by a bitmap dependency, then write the inode buffer. 12431 */ 12432 retry: 12433 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 12434 panic("flush_pagedep_deps: lost inode"); 12435 /* 12436 * If the inode still has bitmap dependencies, 12437 * push them to disk. 
12438 */ 12439 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { 12440 bp = inodedep->id_bmsafemap->sm_buf; 12441 bp = getdirtybuf(bp, &lk, MNT_WAIT); 12442 if (bp == NULL) 12443 goto retry; 12444 FREE_LOCK(&lk); 12445 if ((error = bwrite(bp)) != 0) 12446 break; 12447 ACQUIRE_LOCK(&lk); 12448 if (dap != LIST_FIRST(diraddhdp)) 12449 continue; 12450 } 12451 /* 12452 * If the inode is still sitting in a buffer waiting 12453 * to be written or waiting for the link count to be 12454 * adjusted update it here to flush it to disk. 12455 */ 12456 if (dap == LIST_FIRST(diraddhdp)) { 12457 FREE_LOCK(&lk); 12458 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 12459 FFSV_FORCEINSMQ))) 12460 break; 12461 error = ffs_update(vp, 1); 12462 vput(vp); 12463 if (error) 12464 break; 12465 ACQUIRE_LOCK(&lk); 12466 } 12467 /* 12468 * If we have failed to get rid of all the dependencies 12469 * then something is seriously wrong. 12470 */ 12471 if (dap == LIST_FIRST(diraddhdp)) { 12472 inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); 12473 panic("flush_pagedep_deps: failed to flush " 12474 "inodedep %p ino %d dap %p", inodedep, inum, dap); 12475 } 12476 } 12477 if (error) 12478 ACQUIRE_LOCK(&lk); 12479 return (error); 12480 } 12481 12482 /* 12483 * A large burst of file addition or deletion activity can drive the 12484 * memory load excessively high. First attempt to slow things down 12485 * using the techniques below. If that fails, this routine requests 12486 * the offending operations to fall back to running synchronously 12487 * until the memory load returns to a reasonable level. 12488 */ 12489 int 12490 softdep_slowdown(vp) 12491 struct vnode *vp; 12492 { 12493 struct ufsmount *ump; 12494 int jlow; 12495 int max_softdeps_hard; 12496 12497 ACQUIRE_LOCK(&lk); 12498 jlow = 0; 12499 /* 12500 * Check for journal space if needed. 12501 */ 12502 if (DOINGSUJ(vp)) { 12503 ump = VFSTOUFS(vp->v_mount); 12504 if (journal_space(ump, 0) == 0) 12505 jlow = 1; 12506 } 12507 max_softdeps_hard = max_softdeps * 11 / 10; 12508 if (dep_current[D_DIRREM] < max_softdeps_hard / 2 && 12509 dep_current[D_INODEDEP] < max_softdeps_hard && 12510 VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps && 12511 dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0) { 12512 FREE_LOCK(&lk); 12513 return (0); 12514 } 12515 if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow) 12516 softdep_speedup(); 12517 stat_sync_limit_hit += 1; 12518 FREE_LOCK(&lk); 12519 if (DOINGSUJ(vp)) 12520 return (0); 12521 return (1); 12522 } 12523 12524 /* 12525 * Called by the allocation routines when they are about to fail 12526 * in the hope that we can free up the requested resource (inodes 12527 * or disk space). 12528 * 12529 * First check to see if the work list has anything on it. If it has, 12530 * clean up entries until we successfully free the requested resource. 12531 * Because this process holds inodes locked, we cannot handle any remove 12532 * requests that might block on a locked inode as that could lead to 12533 * deadlock. If the worklist yields none of the requested resource, 12534 * start syncing out vnodes to free up the needed space. 
 */
int
softdep_request_cleanup(fs, vp, cred, resource)
	struct fs *fs;
	struct vnode *vp;
	struct ucred *cred;
	int resource;
{
	struct ufsmount *ump;
	struct mount *mp;
	struct vnode *lvp, *mvp;
	long starttime;
	ufs2_daddr_t needed;
	int error;

	/*
	 * If we are being called because of a process doing a
	 * copy-on-write, then it is not safe to process any
	 * worklist items as we will recurse into the copyonwrite
	 * routine.  This will result in an incoherent snapshot.
	 * If the vnode that we hold is a snapshot, we must avoid
	 * handling other resources that could cause deadlock.
	 */
	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
		return (0);

	if (resource == FLUSH_BLOCKS_WAIT)
		stat_cleanup_blkrequests += 1;
	else
		stat_cleanup_inorequests += 1;

	mp = vp->v_mount;
	ump = VFSTOUFS(mp);
	mtx_assert(UFS_MTX(ump), MA_OWNED);
	UFS_UNLOCK(ump);
	error = ffs_update(vp, 1);
	if (error != 0) {
		UFS_LOCK(ump);
		return (0);
	}
	/*
	 * If we are in need of resources, consider pausing for
	 * tickdelay to give ourselves some breathing room.
	 */
	ACQUIRE_LOCK(&lk);
	process_removes(vp);
	process_truncates(vp);
	request_cleanup(UFSTOVFS(ump), resource);
	FREE_LOCK(&lk);
	/*
	 * Now clean up at least as many resources as we will need.
	 *
	 * When requested to clean up inodes, the number that are needed
	 * is set by the number of simultaneous writers (mnt_writeopcount)
	 * plus a bit of slop (2) in case some more writers show up while
	 * we are cleaning.
	 *
	 * When requested to free up space, the amount of space that
	 * we need is enough blocks to allocate a full-sized segment
	 * (fs_contigsumsize). The number of such segments that will
	 * be needed is set by the number of simultaneous writers
	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
	 * writers show up while we are cleaning.
	 *
	 * Additionally, if we are unprivileged and allocating space,
	 * we need to ensure that we clean up enough blocks to get the
	 * needed number of blocks over the threshold of the minimum
	 * number of blocks required to be kept free by the filesystem
	 * (fs_minfree).
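	 *
	 * For example (illustrative numbers only): with 4 concurrent
	 * writers and fs_contigsumsize of 16, a FLUSH_BLOCKS_WAIT request
	 * aims for (4 + 2) * 16 = 96 free blocks, and an unprivileged
	 * caller additionally needs enough blocks to cover any shortfall
	 * below the fs_minfree reserve.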
	 */
	if (resource == FLUSH_INODES_WAIT) {
		needed = vp->v_mount->mnt_writeopcount + 2;
	} else if (resource == FLUSH_BLOCKS_WAIT) {
		needed = (vp->v_mount->mnt_writeopcount + 2) *
		    fs->fs_contigsumsize;
		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
			needed += fragstoblks(fs,
			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
	} else {
		UFS_LOCK(ump);
		printf("softdep_request_cleanup: Unknown resource type %d\n",
		    resource);
		return (0);
	}
	starttime = time_second;
retry:
	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
	    fs->fs_cstotal.cs_nbfree <= needed) ||
	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
	    fs->fs_cstotal.cs_nifree <= needed)) {
		ACQUIRE_LOCK(&lk);
		if (ump->softdep_on_worklist > 0 &&
		    process_worklist_item(UFSTOVFS(ump),
		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
			stat_worklist_push += 1;
		FREE_LOCK(&lk);
	}
	/*
	 * If we still need resources and there are no more worklist
	 * entries to process to obtain them, we have to start flushing
	 * the dirty vnodes to force the release of additional requests
	 * to the worklist that we can then process to reap additional
	 * resources. We walk the vnodes associated with the mount point
	 * until we get the needed worklist requests that we can reap.
	 */
	if ((resource == FLUSH_BLOCKS_WAIT &&
	    fs->fs_cstotal.cs_nbfree <= needed) ||
	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
	    fs->fs_cstotal.cs_nifree <= needed)) {
		MNT_ILOCK(mp);
		MNT_VNODE_FOREACH(lvp, mp, mvp) {
			VI_LOCK(lvp);
			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
				VI_UNLOCK(lvp);
				continue;
			}
			MNT_IUNLOCK(mp);
			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
			    curthread)) {
				MNT_ILOCK(mp);
				continue;
			}
			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
				vput(lvp);
				MNT_ILOCK(mp);
				continue;
			}
			(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
			vput(lvp);
			MNT_ILOCK(mp);
		}
		MNT_IUNLOCK(mp);
		lvp = ump->um_devvp;
		if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
			VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
			VOP_UNLOCK(lvp, 0);
		}
		if (ump->softdep_on_worklist > 0) {
			stat_cleanup_retries += 1;
			goto retry;
		}
		stat_cleanup_failures += 1;
	}
	if (time_second - starttime > stat_cleanup_high_delay)
		stat_cleanup_high_delay = time_second - starttime;
	UFS_LOCK(ump);
	return (1);
}

/*
 * If memory utilization has gotten too high, deliberately slow things
 * down and speed up the I/O processing.
 */
extern struct thread *syncertd;
static int
request_cleanup(mp, resource)
	struct mount *mp;
	int resource;
{
	struct thread *td = curthread;
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * We never hold up the filesystem syncer or buf daemon.
	 */
	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
		return (0);
	ump = VFSTOUFS(mp);
	/*
	 * First check to see if the work list has gotten backlogged.
	 * If it has, co-opt this process to help clean up two entries.
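	 * (As an illustration, with purely arbitrary numbers: if
	 * max_softdeps were tuned to 10000, a worklist deeper than 1000
	 * entries would draft the requesting thread to process two items
	 * itself before letting it continue.)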
	 * Because this process may hold inodes locked, we cannot
	 * handle any remove requests that might block on a locked
	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
	 * to avoid recursively processing the worklist.
	 */
	if (ump->softdep_on_worklist > max_softdeps / 10) {
		td->td_pflags |= TDP_SOFTDEP;
		process_worklist_item(mp, 2, LK_NOWAIT);
		td->td_pflags &= ~TDP_SOFTDEP;
		stat_worklist_push += 2;
		return(1);
	}
	/*
	 * Next, we attempt to speed up the syncer process. If that
	 * is successful, then we allow the process to continue.
	 */
	if (softdep_speedup() &&
	    resource != FLUSH_BLOCKS_WAIT &&
	    resource != FLUSH_INODES_WAIT)
		return(0);
	/*
	 * If we are resource constrained on inode dependencies, try
	 * flushing some dirty inodes. Otherwise, we are constrained
	 * by file deletions, so try accelerating flushes of directories
	 * with removal dependencies. We would like to do the cleanup
	 * here, but we probably hold an inode locked at this point and
	 * that might deadlock against one that we try to clean. So,
	 * the best that we can do is request the syncer daemon to do
	 * the cleanup for us.
	 */
	switch (resource) {

	case FLUSH_INODES:
	case FLUSH_INODES_WAIT:
		stat_ino_limit_push += 1;
		req_clear_inodedeps += 1;
		stat_countp = &stat_ino_limit_hit;
		break;

	case FLUSH_BLOCKS:
	case FLUSH_BLOCKS_WAIT:
		stat_blk_limit_push += 1;
		req_clear_remove += 1;
		stat_countp = &stat_blk_limit_hit;
		break;

	default:
		panic("request_cleanup: unknown type");
	}
	/*
	 * Hopefully the syncer daemon will catch up and awaken us.
	 * We wait at most tickdelay before proceeding in any case.
	 */
	proc_waiting += 1;
	if (callout_pending(&softdep_callout) == FALSE)
		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
		    pause_timer, 0);

	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
	proc_waiting -= 1;
	return (1);
}

/*
 * Awaken processes pausing in request_cleanup and clear proc_waiting
 * to indicate that there is no longer a timer running.
 */
static void
pause_timer(arg)
	void *arg;
{

	/*
	 * The callout_ API has acquired mtx and will hold it around this
	 * function call.
	 */
	*stat_countp += 1;
	wakeup_one(&proc_waiting);
	if (proc_waiting > 0)
		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
		    pause_timer, 0);
}

/*
 * Flush out a directory with at least one removal dependency in an effort to
 * reduce the number of dirrem, freefile, and freeblks dependency structures.
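 *
 * The scan below is round-robin: a static cursor ("next") remembers the
 * pagedep hash bucket where the previous call stopped, so repeated calls
 * spread the flushing work across the whole hash table rather than always
 * flushing the same directory.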
 */
static void
clear_remove(td)
	struct thread *td;
{
	struct pagedep_hashhead *pagedephd;
	struct pagedep *pagedep;
	static int next = 0;
	struct mount *mp;
	struct vnode *vp;
	struct bufobj *bo;
	int error, cnt;
	ino_t ino;

	mtx_assert(&lk, MA_OWNED);

	for (cnt = 0; cnt < pagedep_hash; cnt++) {
		pagedephd = &pagedep_hashtbl[next++];
		if (next >= pagedep_hash)
			next = 0;
		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
			if (LIST_EMPTY(&pagedep->pd_dirremhd))
				continue;
			mp = pagedep->pd_list.wk_mp;
			ino = pagedep->pd_ino;
			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
				continue;
			FREE_LOCK(&lk);

			/*
			 * Let unmount clear deps
			 */
			error = vfs_busy(mp, MBF_NOWAIT);
			if (error != 0)
				goto finish_write;
			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
			    FFSV_FORCEINSMQ);
			vfs_unbusy(mp);
			if (error != 0) {
				softdep_error("clear_remove: vget", error);
				goto finish_write;
			}
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
				softdep_error("clear_remove: fsync", error);
			bo = &vp->v_bufobj;
			BO_LOCK(bo);
			drain_output(vp);
			BO_UNLOCK(bo);
			vput(vp);
		finish_write:
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
	}
}

/*
 * Clear out a block of dirty inodes in an effort to reduce
 * the number of inodedep dependency structures.
 */
static void
clear_inodedeps(td)
	struct thread *td;
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	static int next = 0;
	struct mount *mp;
	struct vnode *vp;
	struct fs *fs;
	int error, cnt;
	ino_t firstino, lastino, ino;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * Pick a random inode dependency to be cleared.
	 * We will then gather up all the inodes in its block
	 * that have dependencies and flush them out.
	 */
	for (cnt = 0; cnt < inodedep_hash; cnt++) {
		inodedephd = &inodedep_hashtbl[next++];
		if (next >= inodedep_hash)
			next = 0;
		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
			break;
	}
	if (inodedep == NULL)
		return;
	fs = inodedep->id_fs;
	mp = inodedep->id_list.wk_mp;
	/*
	 * Find the last inode in the block with dependencies.
	 */
	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
			break;
	/*
	 * Asynchronously push all but the last inode with dependencies.
	 * Synchronously push the last inode with dependencies to ensure
	 * that the inode block gets written to free up the inodedeps.
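	 *
	 * For example (illustrative numbers only): with INOPB(fs) == 128
	 * and a starting inode number of 1000, firstino is
	 * 1000 & ~127 == 896 and lastino is scanned from 1023 back toward
	 * 896, so the whole inode block containing the dependency is
	 * covered by the loop below.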
	 */
	for (ino = firstino; ino <= lastino; ino++) {
		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
			continue;
		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
			continue;
		FREE_LOCK(&lk);
		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
		if (error != 0) {
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
		    FFSV_FORCEINSMQ)) != 0) {
			softdep_error("clear_inodedeps: vget", error);
			vfs_unbusy(mp);
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
		vfs_unbusy(mp);
		if (ino == lastino) {
			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
				softdep_error("clear_inodedeps: fsync1", error);
		} else {
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
				softdep_error("clear_inodedeps: fsync2", error);
			BO_LOCK(&vp->v_bufobj);
			drain_output(vp);
			BO_UNLOCK(&vp->v_bufobj);
		}
		vput(vp);
		vn_finished_write(mp);
		ACQUIRE_LOCK(&lk);
	}
}

void
softdep_buf_append(bp, wkhd)
	struct buf *bp;
	struct workhead *wkhd;
{
	struct worklist *wk;

	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(wkhd)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(&bp->b_dep, wk);
	}
	FREE_LOCK(&lk);

}

void
softdep_inode_append(ip, cred, wkhd)
	struct inode *ip;
	struct ucred *cred;
	struct workhead *wkhd;
{
	struct buf *bp;
	struct fs *fs;
	int error;

	fs = ip->i_fs;
	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
	    (int)fs->fs_bsize, cred, &bp);
	if (error) {
		softdep_freework(wkhd);
		return;
	}
	softdep_buf_append(bp, wkhd);
	bqrelse(bp);
}

void
softdep_freework(wkhd)
	struct workhead *wkhd;
{

	ACQUIRE_LOCK(&lk);
	handle_jwork(wkhd);
	FREE_LOCK(&lk);
}

/*
 * Function to determine if the buffer has outstanding dependencies
 * that will cause a roll-back if the buffer is written. If wantcount
 * is set, return number of dependencies, otherwise just yes or no.
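 *
 * A typical caller (assumed here for illustration) only needs a yes/no
 * answer, e.g. buffer-flushing code deciding whether writing a buffer now
 * would trigger rollbacks, and so passes wantcount == 0 so that the scan
 * returns as soon as the first dependency is found.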
 */
static int
softdep_count_dependencies(bp, wantcount)
	struct buf *bp;
	int wantcount;
{
	struct worklist *wk;
	struct bmsafemap *bmsafemap;
	struct freework *freework;
	struct inodedep *inodedep;
	struct indirdep *indirdep;
	struct freeblks *freeblks;
	struct allocindir *aip;
	struct pagedep *pagedep;
	struct dirrem *dirrem;
	struct newblk *newblk;
	struct mkdir *mkdir;
	struct diradd *dap;
	int i, retval;

	retval = 0;
	ACQUIRE_LOCK(&lk);
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
		switch (wk->wk_type) {

		case D_INODEDEP:
			inodedep = WK_INODEDEP(wk);
			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
				/* bitmap allocation dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
				/* direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
				/* direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
				/* Add reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);

			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
				/* indirect truncation dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}

			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
				/* indirect block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_PAGEDEP:
			pagedep = WK_PAGEDEP(wk);
			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
					/* Journal remove ref dependency. */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			for (i = 0; i < DAHASHSZ; i++) {

				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
					/* directory entry dependency */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			continue;

		case D_BMSAFEMAP:
			bmsafemap = WK_BMSAFEMAP(wk);
			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
				/* Add reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
				/* Allocate block dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEBLKS:
			freeblks = WK_FREEBLKS(wk);
			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
				/* Freeblk journal dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			newblk = WK_NEWBLK(wk);
			if (newblk->nb_jnewblk) {
				/* Journal allocate dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_MKDIR:
			mkdir = WK_MKDIR(wk);
			if (mkdir->md_jaddref) {
				/* Journal reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEWORK:
		case D_FREEDEP:
		case D_JSEGDEP:
		case D_JSEG:
		case D_SBDEP:
			/* never a dependency on these blocks */
			continue;

		default:
			panic("softdep_count_dependencies: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
out:
	FREE_LOCK(&lk);
	return retval;
}

/*
 * Acquire exclusive access to a buffer.
 * Must be called with a locked mtx parameter.
 * Return acquired buffer or NULL on failure.
 */
static struct buf *
getdirtybuf(bp, mtx, waitfor)
	struct buf *bp;
	struct mtx *mtx;
	int waitfor;
{
	int error;

	mtx_assert(mtx, MA_OWNED);
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
		if (waitfor != MNT_WAIT)
			return (NULL);
		error = BUF_LOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
		/*
		 * Even if we successfully acquire bp here, we have dropped
		 * mtx, which may violate our guarantee.
		 */
		if (error == 0)
			BUF_UNLOCK(bp);
		else if (error != ENOLCK)
			panic("getdirtybuf: inconsistent lock: %d", error);
		mtx_lock(mtx);
		return (NULL);
	}
	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
		if (mtx == &lk && waitfor == MNT_WAIT) {
			mtx_unlock(mtx);
			BO_LOCK(bp->b_bufobj);
			BUF_UNLOCK(bp);
			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
				bp->b_vflags |= BV_BKGRDWAIT;
				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
				    PRIBIO | PDROP, "getbuf", 0);
			} else
				BO_UNLOCK(bp->b_bufobj);
			mtx_lock(mtx);
			return (NULL);
		}
		BUF_UNLOCK(bp);
		if (waitfor != MNT_WAIT)
			return (NULL);
		/*
		 * The mtx argument must be bp->b_vp's mutex in
		 * this case.
		 */
#ifdef	DEBUG_VFS_LOCKS
		if (bp->b_vp->v_type != VCHR)
			ASSERT_BO_LOCKED(bp->b_bufobj);
#endif
		bp->b_vflags |= BV_BKGRDWAIT;
		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
		return (NULL);
	}
	if ((bp->b_flags & B_DELWRI) == 0) {
		BUF_UNLOCK(bp);
		return (NULL);
	}
	bremfree(bp);
	return (bp);
}


/*
 * Check if it is safe to suspend the file system now. On entry,
 * the vnode interlock for devvp should be held. Return 0 with
 * the mount interlock held if the file system can be suspended now,
 * otherwise return EAGAIN with the mount interlock held.
 */
int
softdep_check_suspend(struct mount *mp,
		      struct vnode *devvp,
		      int softdep_deps,
		      int softdep_accdeps,
		      int secondary_writes,
		      int secondary_accwrites)
{
	struct bufobj *bo;
	struct ufsmount *ump;
	int error;

	ump = VFSTOUFS(mp);
	bo = &devvp->v_bufobj;
	ASSERT_BO_LOCKED(bo);

	for (;;) {
		if (!TRY_ACQUIRE_LOCK(&lk)) {
			BO_UNLOCK(bo);
			ACQUIRE_LOCK(&lk);
			FREE_LOCK(&lk);
			BO_LOCK(bo);
			continue;
		}
		MNT_ILOCK(mp);
		if (mp->mnt_secondary_writes != 0) {
			FREE_LOCK(&lk);
			BO_UNLOCK(bo);
			msleep(&mp->mnt_secondary_writes,
			       MNT_MTX(mp),
			       (PUSER - 1) | PDROP, "secwr", 0);
			BO_LOCK(bo);
			continue;
		}
		break;
	}

	/*
	 * Reasons for needing more work before suspend:
	 * - Dirty buffers on devvp.
	 * - Softdep activity occurred after start of vnode sync loop
	 * - Secondary writes occurred after start of vnode sync loop
	 */
	error = 0;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    softdep_deps != 0 ||
	    ump->softdep_deps != 0 ||
	    softdep_accdeps != ump->softdep_accdeps ||
	    secondary_writes != 0 ||
	    mp->mnt_secondary_writes != 0 ||
	    secondary_accwrites != mp->mnt_secondary_accwrites)
		error = EAGAIN;
	FREE_LOCK(&lk);
	BO_UNLOCK(bo);
	return (error);
}


/*
 * Get the number of dependency structures for the file system, both
 * the current number and the total number allocated. These will
 * later be used to detect that softdep processing has occurred.
 */
void
softdep_get_depcounts(struct mount *mp,
		      int *softdep_depsp,
		      int *softdep_accdepsp)
{
	struct ufsmount *ump;

	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(&lk);
	*softdep_depsp = ump->softdep_deps;
	*softdep_accdepsp = ump->softdep_accdeps;
	FREE_LOCK(&lk);
}

/*
 * Wait for pending output on a vnode to complete.
 * Must be called with vnode lock and interlock locked.
 *
 * XXX: Should just be a call to bufobj_wwait().
 */
static void
drain_output(vp)
	struct vnode *vp;
{
	struct bufobj *bo;

	bo = &vp->v_bufobj;
	ASSERT_VOP_LOCKED(vp, "drain_output");
	ASSERT_BO_LOCKED(bo);

	while (bo->bo_numoutput) {
		bo->bo_flag |= BO_WWAIT;
		msleep((caddr_t)&bo->bo_numoutput,
		    BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
	}
}

/*
 * Called whenever a buffer that is being invalidated or reallocated
 * contains dependencies. This should only happen if an I/O error has
 * occurred. The routine is called with the buffer locked.
 */
static void
softdep_deallocate_dependencies(bp)
	struct buf *bp;
{

	if ((bp->b_ioflags & BIO_ERROR) == 0)
		panic("softdep_deallocate_dependencies: dangling deps");
	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
	panic("softdep_deallocate_dependencies: unrecovered I/O error");
}

/*
 * Function to handle asynchronous write errors in the filesystem.
 */
static void
softdep_error(func, error)
	char *func;
	int error;
{

	/* XXX should do something better! */
	printf("%s: got error %d while accessing filesystem\n", func, error);
}

#ifdef DDB

static void
inodedep_print(struct inodedep *inodedep, int verbose)
{
	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
	    " saveino %p\n",
	    inodedep, inodedep->id_fs, inodedep->id_state,
	    (intmax_t)inodedep->id_ino,
	    (intmax_t)fsbtodb(inodedep->id_fs,
	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
	    inodedep->id_savedino1);

	if (verbose == 0)
		return;

	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
	    "mkdiradd %p\n",
	    LIST_FIRST(&inodedep->id_pendinghd),
	    LIST_FIRST(&inodedep->id_bufwait),
	    LIST_FIRST(&inodedep->id_inowait),
	    TAILQ_FIRST(&inodedep->id_inoreflst),
	    inodedep->id_mkdiradd);
	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
	    TAILQ_FIRST(&inodedep->id_inoupdt),
	    TAILQ_FIRST(&inodedep->id_newinoupdt),
	    TAILQ_FIRST(&inodedep->id_extupdt),
	    TAILQ_FIRST(&inodedep->id_newextupdt));
}

DB_SHOW_COMMAND(inodedep, db_show_inodedep)
{

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	inodedep_print((struct inodedep*)addr, 1);
}

DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	struct fs *fs;
	int cnt;

	fs = have_addr ? (struct fs *)addr : NULL;
	for (cnt = 0; cnt < inodedep_hash; cnt++) {
		inodedephd = &inodedep_hashtbl[cnt];
		LIST_FOREACH(inodedep, inodedephd, id_hash) {
			if (fs != NULL && fs != inodedep->id_fs)
				continue;
			inodedep_print(inodedep, 0);
		}
	}
}

DB_SHOW_COMMAND(worklist, db_show_worklist)
{
	struct worklist *wk;

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	wk = (struct worklist *)addr;
	printf("worklist: %p type %s state 0x%X\n",
	    wk, TYPENAME(wk->wk_type), wk->wk_state);
}

DB_SHOW_COMMAND(workhead, db_show_workhead)
{
	struct workhead *wkhd;
	struct worklist *wk;
	int i;

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	wkhd = (struct workhead *)addr;
	wk = LIST_FIRST(wkhd);
	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
		db_printf("worklist: %p type %s state 0x%X",
		    wk, TYPENAME(wk->wk_type), wk->wk_state);
	if (i == 100)
		db_printf("workhead overflow");
	printf("\n");
}


DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
{
	struct jaddref *jaddref;
	struct diradd *diradd;
	struct mkdir *mkdir;

	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
		diradd = mkdir->md_diradd;
		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
		    mkdir, mkdir->md_state, diradd, diradd->da_state);
		if ((jaddref = mkdir->md_jaddref) != NULL)
			db_printf(" jaddref %p jaddref state 0x%X",
			    jaddref, jaddref->ja_state);
		db_printf("\n");
	}
}

#endif /* DDB */

#endif /* SOFTUPDATES */