1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright 1998, 2000 Marshall Kirk McKusick. 5 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org> 6 * All rights reserved. 7 * 8 * The soft updates code is derived from the appendix of a University 9 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, 10 * "Soft Updates: A Solution to the Metadata Update Problem in File 11 * Systems", CSE-TR-254-95, August 1995). 12 * 13 * Further information about soft updates can be obtained from: 14 * 15 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 16 * 1614 Oxford Street mckusick@mckusick.com 17 * Berkeley, CA 94709-1608 +1-510-843-9542 18 * USA 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 24 * 1. Redistributions of source code must retain the above copyright 25 * notice, this list of conditions and the following disclaimer. 26 * 2. Redistributions in binary form must reproduce the above copyright 27 * notice, this list of conditions and the following disclaimer in the 28 * documentation and/or other materials provided with the distribution. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 31 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 32 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 33 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, 34 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 35 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 36 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 37 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 38 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 39 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 40 * 41 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 42 */ 43 44 #include <sys/cdefs.h> 45 __FBSDID("$FreeBSD$"); 46 47 #include "opt_ffs.h" 48 #include "opt_quota.h" 49 #include "opt_ddb.h" 50 51 #include <sys/param.h> 52 #include <sys/kernel.h> 53 #include <sys/systm.h> 54 #include <sys/bio.h> 55 #include <sys/buf.h> 56 #include <sys/kdb.h> 57 #include <sys/kthread.h> 58 #include <sys/ktr.h> 59 #include <sys/limits.h> 60 #include <sys/lock.h> 61 #include <sys/malloc.h> 62 #include <sys/mount.h> 63 #include <sys/mutex.h> 64 #include <sys/namei.h> 65 #include <sys/priv.h> 66 #include <sys/proc.h> 67 #include <sys/racct.h> 68 #include <sys/rwlock.h> 69 #include <sys/stat.h> 70 #include <sys/sysctl.h> 71 #include <sys/syslog.h> 72 #include <sys/vnode.h> 73 #include <sys/conf.h> 74 75 #include <ufs/ufs/dir.h> 76 #include <ufs/ufs/extattr.h> 77 #include <ufs/ufs/quota.h> 78 #include <ufs/ufs/inode.h> 79 #include <ufs/ufs/ufsmount.h> 80 #include <ufs/ffs/fs.h> 81 #include <ufs/ffs/softdep.h> 82 #include <ufs/ffs/ffs_extern.h> 83 #include <ufs/ufs/ufs_extern.h> 84 85 #include <vm/vm.h> 86 #include <vm/vm_extern.h> 87 #include <vm/vm_object.h> 88 89 #include <geom/geom.h> 90 #include <geom/geom_vfs.h> 91 92 #include <ddb/ddb.h> 93 94 #define KTR_SUJ 0 /* Define to KTR_SPARE. 
*/ 95 96 #ifndef SOFTUPDATES 97 98 int 99 softdep_flushfiles(oldmnt, flags, td) 100 struct mount *oldmnt; 101 int flags; 102 struct thread *td; 103 { 104 105 panic("softdep_flushfiles called"); 106 } 107 108 int 109 softdep_mount(devvp, mp, fs, cred) 110 struct vnode *devvp; 111 struct mount *mp; 112 struct fs *fs; 113 struct ucred *cred; 114 { 115 116 return (0); 117 } 118 119 void 120 softdep_initialize() 121 { 122 123 return; 124 } 125 126 void 127 softdep_uninitialize() 128 { 129 130 return; 131 } 132 133 void 134 softdep_unmount(mp) 135 struct mount *mp; 136 { 137 138 panic("softdep_unmount called"); 139 } 140 141 void 142 softdep_setup_sbupdate(ump, fs, bp) 143 struct ufsmount *ump; 144 struct fs *fs; 145 struct buf *bp; 146 { 147 148 panic("softdep_setup_sbupdate called"); 149 } 150 151 void 152 softdep_setup_inomapdep(bp, ip, newinum, mode) 153 struct buf *bp; 154 struct inode *ip; 155 ino_t newinum; 156 int mode; 157 { 158 159 panic("softdep_setup_inomapdep called"); 160 } 161 162 void 163 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 164 struct buf *bp; 165 struct mount *mp; 166 ufs2_daddr_t newblkno; 167 int frags; 168 int oldfrags; 169 { 170 171 panic("softdep_setup_blkmapdep called"); 172 } 173 174 void 175 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 176 struct inode *ip; 177 ufs_lbn_t lbn; 178 ufs2_daddr_t newblkno; 179 ufs2_daddr_t oldblkno; 180 long newsize; 181 long oldsize; 182 struct buf *bp; 183 { 184 185 panic("softdep_setup_allocdirect called"); 186 } 187 188 void 189 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 190 struct inode *ip; 191 ufs_lbn_t lbn; 192 ufs2_daddr_t newblkno; 193 ufs2_daddr_t oldblkno; 194 long newsize; 195 long oldsize; 196 struct buf *bp; 197 { 198 199 panic("softdep_setup_allocext called"); 200 } 201 202 void 203 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 204 struct inode *ip; 205 ufs_lbn_t lbn; 206 struct buf *bp; 207 int ptrno; 208 ufs2_daddr_t newblkno; 209 ufs2_daddr_t oldblkno; 210 struct buf *nbp; 211 { 212 213 panic("softdep_setup_allocindir_page called"); 214 } 215 216 void 217 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 218 struct buf *nbp; 219 struct inode *ip; 220 struct buf *bp; 221 int ptrno; 222 ufs2_daddr_t newblkno; 223 { 224 225 panic("softdep_setup_allocindir_meta called"); 226 } 227 228 void 229 softdep_journal_freeblocks(ip, cred, length, flags) 230 struct inode *ip; 231 struct ucred *cred; 232 off_t length; 233 int flags; 234 { 235 236 panic("softdep_journal_freeblocks called"); 237 } 238 239 void 240 softdep_journal_fsync(ip) 241 struct inode *ip; 242 { 243 244 panic("softdep_journal_fsync called"); 245 } 246 247 void 248 softdep_setup_freeblocks(ip, length, flags) 249 struct inode *ip; 250 off_t length; 251 int flags; 252 { 253 254 panic("softdep_setup_freeblocks called"); 255 } 256 257 void 258 softdep_freefile(pvp, ino, mode) 259 struct vnode *pvp; 260 ino_t ino; 261 int mode; 262 { 263 264 panic("softdep_freefile called"); 265 } 266 267 int 268 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 269 struct buf *bp; 270 struct inode *dp; 271 off_t diroffset; 272 ino_t newinum; 273 struct buf *newdirbp; 274 int isnewblk; 275 { 276 277 panic("softdep_setup_directory_add called"); 278 } 279 280 void 281 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 282 struct buf *bp; 283 struct inode *dp; 284 caddr_t base; 285 caddr_t oldloc; 286 caddr_t newloc; 
287 int entrysize; 288 { 289 290 panic("softdep_change_directoryentry_offset called"); 291 } 292 293 void 294 softdep_setup_remove(bp, dp, ip, isrmdir) 295 struct buf *bp; 296 struct inode *dp; 297 struct inode *ip; 298 int isrmdir; 299 { 300 301 panic("softdep_setup_remove called"); 302 } 303 304 void 305 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 306 struct buf *bp; 307 struct inode *dp; 308 struct inode *ip; 309 ino_t newinum; 310 int isrmdir; 311 { 312 313 panic("softdep_setup_directory_change called"); 314 } 315 316 void 317 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 318 struct mount *mp; 319 struct buf *bp; 320 ufs2_daddr_t blkno; 321 int frags; 322 struct workhead *wkhd; 323 { 324 325 panic("%s called", __FUNCTION__); 326 } 327 328 void 329 softdep_setup_inofree(mp, bp, ino, wkhd) 330 struct mount *mp; 331 struct buf *bp; 332 ino_t ino; 333 struct workhead *wkhd; 334 { 335 336 panic("%s called", __FUNCTION__); 337 } 338 339 void 340 softdep_setup_unlink(dp, ip) 341 struct inode *dp; 342 struct inode *ip; 343 { 344 345 panic("%s called", __FUNCTION__); 346 } 347 348 void 349 softdep_setup_link(dp, ip) 350 struct inode *dp; 351 struct inode *ip; 352 { 353 354 panic("%s called", __FUNCTION__); 355 } 356 357 void 358 softdep_revert_link(dp, ip) 359 struct inode *dp; 360 struct inode *ip; 361 { 362 363 panic("%s called", __FUNCTION__); 364 } 365 366 void 367 softdep_setup_rmdir(dp, ip) 368 struct inode *dp; 369 struct inode *ip; 370 { 371 372 panic("%s called", __FUNCTION__); 373 } 374 375 void 376 softdep_revert_rmdir(dp, ip) 377 struct inode *dp; 378 struct inode *ip; 379 { 380 381 panic("%s called", __FUNCTION__); 382 } 383 384 void 385 softdep_setup_create(dp, ip) 386 struct inode *dp; 387 struct inode *ip; 388 { 389 390 panic("%s called", __FUNCTION__); 391 } 392 393 void 394 softdep_revert_create(dp, ip) 395 struct inode *dp; 396 struct inode *ip; 397 { 398 399 panic("%s called", __FUNCTION__); 400 } 401 402 void 403 softdep_setup_mkdir(dp, ip) 404 struct inode *dp; 405 struct inode *ip; 406 { 407 408 panic("%s called", __FUNCTION__); 409 } 410 411 void 412 softdep_revert_mkdir(dp, ip) 413 struct inode *dp; 414 struct inode *ip; 415 { 416 417 panic("%s called", __FUNCTION__); 418 } 419 420 void 421 softdep_setup_dotdot_link(dp, ip) 422 struct inode *dp; 423 struct inode *ip; 424 { 425 426 panic("%s called", __FUNCTION__); 427 } 428 429 int 430 softdep_prealloc(vp, waitok) 431 struct vnode *vp; 432 int waitok; 433 { 434 435 panic("%s called", __FUNCTION__); 436 } 437 438 int 439 softdep_journal_lookup(mp, vpp) 440 struct mount *mp; 441 struct vnode **vpp; 442 { 443 444 return (ENOENT); 445 } 446 447 void 448 softdep_change_linkcnt(ip) 449 struct inode *ip; 450 { 451 452 panic("softdep_change_linkcnt called"); 453 } 454 455 void 456 softdep_load_inodeblock(ip) 457 struct inode *ip; 458 { 459 460 panic("softdep_load_inodeblock called"); 461 } 462 463 void 464 softdep_update_inodeblock(ip, bp, waitfor) 465 struct inode *ip; 466 struct buf *bp; 467 int waitfor; 468 { 469 470 panic("softdep_update_inodeblock called"); 471 } 472 473 int 474 softdep_fsync(vp) 475 struct vnode *vp; /* the "in_core" copy of the inode */ 476 { 477 478 return (0); 479 } 480 481 void 482 softdep_fsync_mountdev(vp) 483 struct vnode *vp; 484 { 485 486 return; 487 } 488 489 int 490 softdep_flushworklist(oldmnt, countp, td) 491 struct mount *oldmnt; 492 int *countp; 493 struct thread *td; 494 { 495 496 *countp = 0; 497 return (0); 498 } 499 500 int 501 softdep_sync_metadata(struct vnode 
*vp) 502 { 503 504 panic("softdep_sync_metadata called"); 505 } 506 507 int 508 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) 509 { 510 511 panic("softdep_sync_buf called"); 512 } 513 514 int 515 softdep_slowdown(vp) 516 struct vnode *vp; 517 { 518 519 panic("softdep_slowdown called"); 520 } 521 522 int 523 softdep_request_cleanup(fs, vp, cred, resource) 524 struct fs *fs; 525 struct vnode *vp; 526 struct ucred *cred; 527 int resource; 528 { 529 530 return (0); 531 } 532 533 int 534 softdep_check_suspend(struct mount *mp, 535 struct vnode *devvp, 536 int softdep_depcnt, 537 int softdep_accdepcnt, 538 int secondary_writes, 539 int secondary_accwrites) 540 { 541 struct bufobj *bo; 542 int error; 543 544 (void) softdep_depcnt, 545 (void) softdep_accdepcnt; 546 547 bo = &devvp->v_bufobj; 548 ASSERT_BO_WLOCKED(bo); 549 550 MNT_ILOCK(mp); 551 while (mp->mnt_secondary_writes != 0) { 552 BO_UNLOCK(bo); 553 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), 554 (PUSER - 1) | PDROP, "secwr", 0); 555 BO_LOCK(bo); 556 MNT_ILOCK(mp); 557 } 558 559 /* 560 * Reasons for needing more work before suspend: 561 * - Dirty buffers on devvp. 562 * - Secondary writes occurred after start of vnode sync loop 563 */ 564 error = 0; 565 if (bo->bo_numoutput > 0 || 566 bo->bo_dirty.bv_cnt > 0 || 567 secondary_writes != 0 || 568 mp->mnt_secondary_writes != 0 || 569 secondary_accwrites != mp->mnt_secondary_accwrites) 570 error = EAGAIN; 571 BO_UNLOCK(bo); 572 return (error); 573 } 574 575 void 576 softdep_get_depcounts(struct mount *mp, 577 int *softdepactivep, 578 int *softdepactiveaccp) 579 { 580 (void) mp; 581 *softdepactivep = 0; 582 *softdepactiveaccp = 0; 583 } 584 585 void 586 softdep_buf_append(bp, wkhd) 587 struct buf *bp; 588 struct workhead *wkhd; 589 { 590 591 panic("softdep_buf_appendwork called"); 592 } 593 594 void 595 softdep_inode_append(ip, cred, wkhd) 596 struct inode *ip; 597 struct ucred *cred; 598 struct workhead *wkhd; 599 { 600 601 panic("softdep_inode_appendwork called"); 602 } 603 604 void 605 softdep_freework(wkhd) 606 struct workhead *wkhd; 607 { 608 609 panic("softdep_freework called"); 610 } 611 612 int 613 softdep_prerename(fdvp, fvp, tdvp, tvp) 614 struct vnode *fdvp; 615 struct vnode *fvp; 616 struct vnode *tdvp; 617 struct vnode *tvp; 618 { 619 620 panic("softdep_prerename called"); 621 } 622 623 int 624 softdep_prelink(dvp, vp, cnp) 625 struct vnode *dvp; 626 struct vnode *vp; 627 struct componentname *cnp; 628 { 629 630 panic("softdep_prelink called"); 631 } 632 633 #else 634 635 FEATURE(softupdates, "FFS soft-updates support"); 636 637 static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 638 "soft updates stats"); 639 static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, 640 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 641 "total dependencies allocated"); 642 static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, 643 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 644 "high use dependencies allocated"); 645 static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, 646 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 647 "current dependencies allocated"); 648 static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, 649 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 650 "current dependencies written"); 651 652 unsigned long dep_current[D_LAST + 1]; 653 unsigned long dep_highuse[D_LAST + 1]; 654 unsigned long dep_total[D_LAST + 1]; 655 unsigned long dep_write[D_LAST + 1]; 656 657 #define SOFTDEP_TYPE(type, str, long) \ 658 static MALLOC_DEFINE(M_ ## type, #str, long); \ 659 SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, 
str, CTLFLAG_RD, \ 660 &dep_total[D_ ## type], 0, ""); \ 661 SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ 662 &dep_current[D_ ## type], 0, ""); \ 663 SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, \ 664 &dep_highuse[D_ ## type], 0, ""); \ 665 SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, \ 666 &dep_write[D_ ## type], 0, ""); 667 668 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); 669 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); 670 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, 671 "Block or frag allocated from cyl group map"); 672 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); 673 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); 674 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); 675 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); 676 SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); 677 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); 678 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); 679 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); 680 SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); 681 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); 682 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); 683 SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); 684 SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); 685 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); 686 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); 687 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); 688 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); 689 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); 690 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); 691 SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); 692 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); 693 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); 694 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); 695 SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete"); 696 697 static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel"); 698 699 static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes"); 700 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); 701 static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data"); 702 703 #define M_SOFTDEP_FLAGS (M_WAITOK) 704 705 /* 706 * translate from workitem type to memory type 707 * MUST match the defines above, such that memtype[D_XXX] == M_XXX 708 */ 709 static struct malloc_type *memtype[] = { 710 NULL, 711 M_PAGEDEP, 712 M_INODEDEP, 713 M_BMSAFEMAP, 714 M_NEWBLK, 715 M_ALLOCDIRECT, 716 M_INDIRDEP, 717 M_ALLOCINDIR, 718 M_FREEFRAG, 719 M_FREEBLKS, 720 M_FREEFILE, 721 M_DIRADD, 722 M_MKDIR, 723 M_DIRREM, 724 M_NEWDIRBLK, 725 M_FREEWORK, 726 M_FREEDEP, 727 M_JADDREF, 728 M_JREMREF, 729 M_JMVREF, 730 M_JNEWBLK, 731 M_JFREEBLK, 732 M_JFREEFRAG, 733 M_JSEG, 734 M_JSEGDEP, 735 M_SBDEP, 736 M_JTRUNC, 737 M_JFSYNC, 738 M_SENTINEL 739 }; 740 741 #define DtoM(type) (memtype[type]) 742 743 /* 744 * Names of malloc types. 745 */ 746 #define TYPENAME(type) \ 747 ((unsigned)(type) <= D_LAST && (unsigned)(type) >= D_FIRST ? \ 748 memtype[type]->ks_shortdesc : "???") 749 /* 750 * End system adaptation definitions. 
751 */ 752 753 #define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) 754 #define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) 755 756 /* 757 * Internal function prototypes. 758 */ 759 static void check_clear_deps(struct mount *); 760 static void softdep_error(char *, int); 761 static int softdep_prerename_vnode(struct ufsmount *, struct vnode *); 762 static int softdep_process_worklist(struct mount *, int); 763 static int softdep_waitidle(struct mount *, int); 764 static void drain_output(struct vnode *); 765 static struct buf *getdirtybuf(struct buf *, struct rwlock *, int); 766 static int check_inodedep_free(struct inodedep *); 767 static void clear_remove(struct mount *); 768 static void clear_inodedeps(struct mount *); 769 static void unlinked_inodedep(struct mount *, struct inodedep *); 770 static void clear_unlinked_inodedep(struct inodedep *); 771 static struct inodedep *first_unlinked_inodedep(struct ufsmount *); 772 static int flush_pagedep_deps(struct vnode *, struct mount *, 773 struct diraddhd *, struct buf *); 774 static int free_pagedep(struct pagedep *); 775 static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); 776 static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t); 777 static int flush_deplist(struct allocdirectlst *, int, int *); 778 static int sync_cgs(struct mount *, int); 779 static int handle_written_filepage(struct pagedep *, struct buf *, int); 780 static int handle_written_sbdep(struct sbdep *, struct buf *); 781 static void initiate_write_sbdep(struct sbdep *); 782 static void diradd_inode_written(struct diradd *, struct inodedep *); 783 static int handle_written_indirdep(struct indirdep *, struct buf *, 784 struct buf**, int); 785 static int handle_written_inodeblock(struct inodedep *, struct buf *, int); 786 static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *, 787 uint8_t *); 788 static int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int); 789 static void handle_written_jaddref(struct jaddref *); 790 static void handle_written_jremref(struct jremref *); 791 static void handle_written_jseg(struct jseg *, struct buf *); 792 static void handle_written_jnewblk(struct jnewblk *); 793 static void handle_written_jblkdep(struct jblkdep *); 794 static void handle_written_jfreefrag(struct jfreefrag *); 795 static void complete_jseg(struct jseg *); 796 static void complete_jsegs(struct jseg *); 797 static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *); 798 static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); 799 static void jremref_write(struct jremref *, struct jseg *, uint8_t *); 800 static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *); 801 static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *); 802 static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data); 803 static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *); 804 static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *); 805 static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *); 806 static inline void inoref_write(struct inoref *, struct jseg *, 807 struct jrefrec *); 808 static void handle_allocdirect_partdone(struct allocdirect *, 809 struct workhead *); 810 static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *, 811 struct workhead *); 812 static void indirdep_complete(struct indirdep *); 813 static int indirblk_lookup(struct mount *, ufs2_daddr_t); 814 static void 
indirblk_insert(struct freework *); 815 static void indirblk_remove(struct freework *); 816 static void handle_allocindir_partdone(struct allocindir *); 817 static void initiate_write_filepage(struct pagedep *, struct buf *); 818 static void initiate_write_indirdep(struct indirdep*, struct buf *); 819 static void handle_written_mkdir(struct mkdir *, int); 820 static int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *, 821 uint8_t *); 822 static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); 823 static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); 824 static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); 825 static void handle_workitem_freefile(struct freefile *); 826 static int handle_workitem_remove(struct dirrem *, int); 827 static struct dirrem *newdirrem(struct buf *, struct inode *, 828 struct inode *, int, struct dirrem **); 829 static struct indirdep *indirdep_lookup(struct mount *, struct inode *, 830 struct buf *); 831 static void cancel_indirdep(struct indirdep *, struct buf *, 832 struct freeblks *); 833 static void free_indirdep(struct indirdep *); 834 static void free_diradd(struct diradd *, struct workhead *); 835 static void merge_diradd(struct inodedep *, struct diradd *); 836 static void complete_diradd(struct diradd *); 837 static struct diradd *diradd_lookup(struct pagedep *, int); 838 static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, 839 struct jremref *); 840 static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, 841 struct jremref *); 842 static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, 843 struct jremref *, struct jremref *); 844 static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, 845 struct jremref *); 846 static void cancel_allocindir(struct allocindir *, struct buf *bp, 847 struct freeblks *, int); 848 static int setup_trunc_indir(struct freeblks *, struct inode *, 849 ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t); 850 static void complete_trunc_indir(struct freework *); 851 static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *, 852 int); 853 static void complete_mkdir(struct mkdir *); 854 static void free_newdirblk(struct newdirblk *); 855 static void free_jremref(struct jremref *); 856 static void free_jaddref(struct jaddref *); 857 static void free_jsegdep(struct jsegdep *); 858 static void free_jsegs(struct jblocks *); 859 static void rele_jseg(struct jseg *); 860 static void free_jseg(struct jseg *, struct jblocks *); 861 static void free_jnewblk(struct jnewblk *); 862 static void free_jblkdep(struct jblkdep *); 863 static void free_jfreefrag(struct jfreefrag *); 864 static void free_freedep(struct freedep *); 865 static void journal_jremref(struct dirrem *, struct jremref *, 866 struct inodedep *); 867 static void cancel_jnewblk(struct jnewblk *, struct workhead *); 868 static int cancel_jaddref(struct jaddref *, struct inodedep *, 869 struct workhead *); 870 static void cancel_jfreefrag(struct jfreefrag *); 871 static inline void setup_freedirect(struct freeblks *, struct inode *, 872 int, int); 873 static inline void setup_freeext(struct freeblks *, struct inode *, int, int); 874 static inline void setup_freeindir(struct freeblks *, struct inode *, int, 875 ufs_lbn_t, int); 876 static inline struct freeblks *newfreeblks(struct mount *, struct inode *); 877 static void freeblks_free(struct ufsmount *, struct freeblks *, int); 878 static void indir_trunc(struct 
freework *, ufs2_daddr_t, ufs_lbn_t); 879 static ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t); 880 static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int); 881 static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t, 882 int, int); 883 static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int); 884 static int cancel_pagedep(struct pagedep *, struct freeblks *, int); 885 static int deallocate_dependencies(struct buf *, struct freeblks *, int); 886 static void newblk_freefrag(struct newblk*); 887 static void free_newblk(struct newblk *); 888 static void cancel_allocdirect(struct allocdirectlst *, 889 struct allocdirect *, struct freeblks *); 890 static int check_inode_unwritten(struct inodedep *); 891 static int free_inodedep(struct inodedep *); 892 static void freework_freeblock(struct freework *, u_long); 893 static void freework_enqueue(struct freework *); 894 static int handle_workitem_freeblocks(struct freeblks *, int); 895 static int handle_complete_freeblocks(struct freeblks *, int); 896 static void handle_workitem_indirblk(struct freework *); 897 static void handle_written_freework(struct freework *); 898 static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); 899 static struct worklist *jnewblk_merge(struct worklist *, struct worklist *, 900 struct workhead *); 901 static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *, 902 struct inodedep *, struct allocindir *, ufs_lbn_t); 903 static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, 904 ufs2_daddr_t, ufs_lbn_t); 905 static void handle_workitem_freefrag(struct freefrag *); 906 static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, 907 ufs_lbn_t, u_long); 908 static void allocdirect_merge(struct allocdirectlst *, 909 struct allocdirect *, struct allocdirect *); 910 static struct freefrag *allocindir_merge(struct allocindir *, 911 struct allocindir *); 912 static int bmsafemap_find(struct bmsafemap_hashhead *, int, 913 struct bmsafemap **); 914 static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, 915 int cg, struct bmsafemap *); 916 static int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int, 917 struct newblk **); 918 static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); 919 static int inodedep_find(struct inodedep_hashhead *, ino_t, 920 struct inodedep **); 921 static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); 922 static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t, 923 int, struct pagedep **); 924 static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, 925 struct pagedep **); 926 static void pause_timer(void *); 927 static int request_cleanup(struct mount *, int); 928 static int softdep_request_cleanup_flush(struct mount *, struct ufsmount *); 929 static void schedule_cleanup(struct mount *); 930 static void softdep_ast_cleanup_proc(struct thread *); 931 static struct ufsmount *softdep_bp_to_mp(struct buf *bp); 932 static int process_worklist_item(struct mount *, int, int); 933 static void process_removes(struct vnode *); 934 static void process_truncates(struct vnode *); 935 static void jwork_move(struct workhead *, struct workhead *); 936 static void jwork_insert(struct workhead *, struct jsegdep *); 937 static void add_to_worklist(struct worklist *, int); 938 static void wake_worklist(struct worklist *); 939 static void wait_worklist(struct worklist *, char *); 940 static void 
remove_from_worklist(struct worklist *); 941 static void softdep_flush(void *); 942 static void softdep_flushjournal(struct mount *); 943 static int softdep_speedup(struct ufsmount *); 944 static void worklist_speedup(struct mount *); 945 static int journal_mount(struct mount *, struct fs *, struct ucred *); 946 static void journal_unmount(struct ufsmount *); 947 static int journal_space(struct ufsmount *, int); 948 static void journal_suspend(struct ufsmount *); 949 static int journal_unsuspend(struct ufsmount *ump); 950 static void add_to_journal(struct worklist *); 951 static void remove_from_journal(struct worklist *); 952 static bool softdep_excess_items(struct ufsmount *, int); 953 static void softdep_process_journal(struct mount *, struct worklist *, int); 954 static struct jremref *newjremref(struct dirrem *, struct inode *, 955 struct inode *ip, off_t, nlink_t); 956 static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, 957 uint16_t); 958 static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, 959 uint16_t); 960 static inline struct jsegdep *inoref_jseg(struct inoref *); 961 static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); 962 static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, 963 ufs2_daddr_t, int); 964 static void adjust_newfreework(struct freeblks *, int); 965 static struct jtrunc *newjtrunc(struct freeblks *, off_t, int); 966 static void move_newblock_dep(struct jaddref *, struct inodedep *); 967 static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t); 968 static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, 969 ufs2_daddr_t, long, ufs_lbn_t); 970 static struct freework *newfreework(struct ufsmount *, struct freeblks *, 971 struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int); 972 static int jwait(struct worklist *, int); 973 static struct inodedep *inodedep_lookup_ip(struct inode *); 974 static int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *); 975 static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); 976 static void handle_jwork(struct workhead *); 977 static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, 978 struct mkdir **); 979 static struct jblocks *jblocks_create(void); 980 static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); 981 static void jblocks_free(struct jblocks *, struct mount *, int); 982 static void jblocks_destroy(struct jblocks *); 983 static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); 984 985 /* 986 * Exported softdep operations. 987 */ 988 static void softdep_disk_io_initiation(struct buf *); 989 static void softdep_disk_write_complete(struct buf *); 990 static void softdep_deallocate_dependencies(struct buf *); 991 static int softdep_count_dependencies(struct buf *bp, int); 992 993 /* 994 * Global lock over all of soft updates. 995 */ 996 static struct mtx lk; 997 MTX_SYSINIT(softdep_lock, &lk, "global softdep", MTX_DEF); 998 999 #define ACQUIRE_GBLLOCK(lk) mtx_lock(lk) 1000 #define FREE_GBLLOCK(lk) mtx_unlock(lk) 1001 #define GBLLOCK_OWNED(lk) mtx_assert((lk), MA_OWNED) 1002 1003 /* 1004 * Per-filesystem soft-updates locking. 
1005 */ 1006 #define LOCK_PTR(ump) (&(ump)->um_softdep->sd_fslock) 1007 #define TRY_ACQUIRE_LOCK(ump) rw_try_wlock(&(ump)->um_softdep->sd_fslock) 1008 #define ACQUIRE_LOCK(ump) rw_wlock(&(ump)->um_softdep->sd_fslock) 1009 #define FREE_LOCK(ump) rw_wunlock(&(ump)->um_softdep->sd_fslock) 1010 #define LOCK_OWNED(ump) rw_assert(&(ump)->um_softdep->sd_fslock, \ 1011 RA_WLOCKED) 1012 1013 #define BUF_AREC(bp) lockallowrecurse(&(bp)->b_lock) 1014 #define BUF_NOREC(bp) lockdisablerecurse(&(bp)->b_lock) 1015 1016 /* 1017 * Worklist queue management. 1018 * These routines require that the lock be held. 1019 */ 1020 #ifndef /* NOT */ INVARIANTS 1021 #define WORKLIST_INSERT(head, item) do { \ 1022 (item)->wk_state |= ONWORKLIST; \ 1023 LIST_INSERT_HEAD(head, item, wk_list); \ 1024 } while (0) 1025 #define WORKLIST_REMOVE(item) do { \ 1026 (item)->wk_state &= ~ONWORKLIST; \ 1027 LIST_REMOVE(item, wk_list); \ 1028 } while (0) 1029 #define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT 1030 #define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE 1031 1032 #else /* INVARIANTS */ 1033 static void worklist_insert(struct workhead *, struct worklist *, int, 1034 const char *, int); 1035 static void worklist_remove(struct worklist *, int, const char *, int); 1036 1037 #define WORKLIST_INSERT(head, item) \ 1038 worklist_insert(head, item, 1, __func__, __LINE__) 1039 #define WORKLIST_INSERT_UNLOCKED(head, item)\ 1040 worklist_insert(head, item, 0, __func__, __LINE__) 1041 #define WORKLIST_REMOVE(item)\ 1042 worklist_remove(item, 1, __func__, __LINE__) 1043 #define WORKLIST_REMOVE_UNLOCKED(item)\ 1044 worklist_remove(item, 0, __func__, __LINE__) 1045 1046 static void 1047 worklist_insert(head, item, locked, func, line) 1048 struct workhead *head; 1049 struct worklist *item; 1050 int locked; 1051 const char *func; 1052 int line; 1053 { 1054 1055 if (locked) 1056 LOCK_OWNED(VFSTOUFS(item->wk_mp)); 1057 if (item->wk_state & ONWORKLIST) 1058 panic("worklist_insert: %p %s(0x%X) already on list, " 1059 "added in function %s at line %d", 1060 item, TYPENAME(item->wk_type), item->wk_state, 1061 item->wk_func, item->wk_line); 1062 item->wk_state |= ONWORKLIST; 1063 item->wk_func = func; 1064 item->wk_line = line; 1065 LIST_INSERT_HEAD(head, item, wk_list); 1066 } 1067 1068 static void 1069 worklist_remove(item, locked, func, line) 1070 struct worklist *item; 1071 int locked; 1072 const char *func; 1073 int line; 1074 { 1075 1076 if (locked) 1077 LOCK_OWNED(VFSTOUFS(item->wk_mp)); 1078 if ((item->wk_state & ONWORKLIST) == 0) 1079 panic("worklist_remove: %p %s(0x%X) not on list, " 1080 "removed in function %s at line %d", 1081 item, TYPENAME(item->wk_type), item->wk_state, 1082 item->wk_func, item->wk_line); 1083 item->wk_state &= ~ONWORKLIST; 1084 item->wk_func = func; 1085 item->wk_line = line; 1086 LIST_REMOVE(item, wk_list); 1087 } 1088 #endif /* INVARIANTS */ 1089 1090 /* 1091 * Merge two jsegdeps keeping only the oldest one as newer references 1092 * can't be discarded until after older references. 1093 */ 1094 static inline struct jsegdep * 1095 jsegdep_merge(struct jsegdep *one, struct jsegdep *two) 1096 { 1097 struct jsegdep *swp; 1098 1099 if (two == NULL) 1100 return (one); 1101 1102 if (one->jd_seg->js_seq > two->jd_seg->js_seq) { 1103 swp = one; 1104 one = two; 1105 two = swp; 1106 } 1107 WORKLIST_REMOVE(&two->jd_list); 1108 free_jsegdep(two); 1109 1110 return (one); 1111 } 1112 1113 /* 1114 * If two freedeps are compatible free one to reduce list size. 
1115 */ 1116 static inline struct freedep * 1117 freedep_merge(struct freedep *one, struct freedep *two) 1118 { 1119 if (two == NULL) 1120 return (one); 1121 1122 if (one->fd_freework == two->fd_freework) { 1123 WORKLIST_REMOVE(&two->fd_list); 1124 free_freedep(two); 1125 } 1126 return (one); 1127 } 1128 1129 /* 1130 * Move journal work from one list to another. Duplicate freedeps and 1131 * jsegdeps are coalesced to keep the lists as small as possible. 1132 */ 1133 static void 1134 jwork_move(dst, src) 1135 struct workhead *dst; 1136 struct workhead *src; 1137 { 1138 struct freedep *freedep; 1139 struct jsegdep *jsegdep; 1140 struct worklist *wkn; 1141 struct worklist *wk; 1142 1143 KASSERT(dst != src, 1144 ("jwork_move: dst == src")); 1145 freedep = NULL; 1146 jsegdep = NULL; 1147 LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { 1148 if (wk->wk_type == D_JSEGDEP) 1149 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); 1150 else if (wk->wk_type == D_FREEDEP) 1151 freedep = freedep_merge(WK_FREEDEP(wk), freedep); 1152 } 1153 1154 while ((wk = LIST_FIRST(src)) != NULL) { 1155 WORKLIST_REMOVE(wk); 1156 WORKLIST_INSERT(dst, wk); 1157 if (wk->wk_type == D_JSEGDEP) { 1158 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); 1159 continue; 1160 } 1161 if (wk->wk_type == D_FREEDEP) 1162 freedep = freedep_merge(WK_FREEDEP(wk), freedep); 1163 } 1164 } 1165 1166 static void 1167 jwork_insert(dst, jsegdep) 1168 struct workhead *dst; 1169 struct jsegdep *jsegdep; 1170 { 1171 struct jsegdep *jsegdepn; 1172 struct worklist *wk; 1173 1174 LIST_FOREACH(wk, dst, wk_list) 1175 if (wk->wk_type == D_JSEGDEP) 1176 break; 1177 if (wk == NULL) { 1178 WORKLIST_INSERT(dst, &jsegdep->jd_list); 1179 return; 1180 } 1181 jsegdepn = WK_JSEGDEP(wk); 1182 if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) { 1183 WORKLIST_REMOVE(wk); 1184 free_jsegdep(jsegdepn); 1185 WORKLIST_INSERT(dst, &jsegdep->jd_list); 1186 } else 1187 free_jsegdep(jsegdep); 1188 } 1189 1190 /* 1191 * Routines for tracking and managing workitems. 
1192 */ 1193 static void workitem_free(struct worklist *, int); 1194 static void workitem_alloc(struct worklist *, int, struct mount *); 1195 static void workitem_reassign(struct worklist *, int); 1196 1197 #define WORKITEM_FREE(item, type) \ 1198 workitem_free((struct worklist *)(item), (type)) 1199 #define WORKITEM_REASSIGN(item, type) \ 1200 workitem_reassign((struct worklist *)(item), (type)) 1201 1202 static void 1203 workitem_free(item, type) 1204 struct worklist *item; 1205 int type; 1206 { 1207 struct ufsmount *ump; 1208 1209 #ifdef INVARIANTS 1210 if (item->wk_state & ONWORKLIST) 1211 panic("workitem_free: %s(0x%X) still on list, " 1212 "added in function %s at line %d", 1213 TYPENAME(item->wk_type), item->wk_state, 1214 item->wk_func, item->wk_line); 1215 if (item->wk_type != type && type != D_NEWBLK) 1216 panic("workitem_free: type mismatch %s != %s", 1217 TYPENAME(item->wk_type), TYPENAME(type)); 1218 #endif 1219 if (item->wk_state & IOWAITING) 1220 wakeup(item); 1221 ump = VFSTOUFS(item->wk_mp); 1222 LOCK_OWNED(ump); 1223 KASSERT(ump->softdep_deps > 0, 1224 ("workitem_free: %s: softdep_deps going negative", 1225 ump->um_fs->fs_fsmnt)); 1226 if (--ump->softdep_deps == 0 && ump->softdep_req) 1227 wakeup(&ump->softdep_deps); 1228 KASSERT(dep_current[item->wk_type] > 0, 1229 ("workitem_free: %s: dep_current[%s] going negative", 1230 ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); 1231 KASSERT(ump->softdep_curdeps[item->wk_type] > 0, 1232 ("workitem_free: %s: softdep_curdeps[%s] going negative", 1233 ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); 1234 atomic_subtract_long(&dep_current[item->wk_type], 1); 1235 ump->softdep_curdeps[item->wk_type] -= 1; 1236 LIST_REMOVE(item, wk_all); 1237 free(item, DtoM(type)); 1238 } 1239 1240 static void 1241 workitem_alloc(item, type, mp) 1242 struct worklist *item; 1243 int type; 1244 struct mount *mp; 1245 { 1246 struct ufsmount *ump; 1247 1248 item->wk_type = type; 1249 item->wk_mp = mp; 1250 item->wk_state = 0; 1251 1252 ump = VFSTOUFS(mp); 1253 ACQUIRE_GBLLOCK(&lk); 1254 dep_current[type]++; 1255 if (dep_current[type] > dep_highuse[type]) 1256 dep_highuse[type] = dep_current[type]; 1257 dep_total[type]++; 1258 FREE_GBLLOCK(&lk); 1259 ACQUIRE_LOCK(ump); 1260 ump->softdep_curdeps[type] += 1; 1261 ump->softdep_deps++; 1262 ump->softdep_accdeps++; 1263 LIST_INSERT_HEAD(&ump->softdep_alldeps[type], item, wk_all); 1264 FREE_LOCK(ump); 1265 } 1266 1267 static void 1268 workitem_reassign(item, newtype) 1269 struct worklist *item; 1270 int newtype; 1271 { 1272 struct ufsmount *ump; 1273 1274 ump = VFSTOUFS(item->wk_mp); 1275 LOCK_OWNED(ump); 1276 KASSERT(ump->softdep_curdeps[item->wk_type] > 0, 1277 ("workitem_reassign: %s: softdep_curdeps[%s] going negative", 1278 VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); 1279 ump->softdep_curdeps[item->wk_type] -= 1; 1280 ump->softdep_curdeps[newtype] += 1; 1281 KASSERT(dep_current[item->wk_type] > 0, 1282 ("workitem_reassign: %s: dep_current[%s] going negative", 1283 VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); 1284 ACQUIRE_GBLLOCK(&lk); 1285 dep_current[newtype]++; 1286 dep_current[item->wk_type]--; 1287 if (dep_current[newtype] > dep_highuse[newtype]) 1288 dep_highuse[newtype] = dep_current[newtype]; 1289 dep_total[newtype]++; 1290 FREE_GBLLOCK(&lk); 1291 item->wk_type = newtype; 1292 LIST_REMOVE(item, wk_all); 1293 LIST_INSERT_HEAD(&ump->softdep_alldeps[newtype], item, wk_all); 1294 } 1295 1296 /* 1297 * Workitem queue management 1298 */ 1299 static int 
max_softdeps; /* maximum number of structs before slowdown */ 1300 static int tickdelay = 2; /* number of ticks to pause during slowdown */ 1301 static int proc_waiting; /* tracks whether we have a timeout posted */ 1302 static int *stat_countp; /* statistic to count in proc_waiting timeout */ 1303 static struct callout softdep_callout; 1304 static int req_clear_inodedeps; /* syncer process flush some inodedeps */ 1305 static int req_clear_remove; /* syncer process flush some freeblks */ 1306 static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */ 1307 1308 /* 1309 * runtime statistics 1310 */ 1311 static int stat_flush_threads; /* number of softdep flushing threads */ 1312 static int stat_worklist_push; /* number of worklist cleanups */ 1313 static int stat_delayed_inact; /* number of delayed inactivation cleanups */ 1314 static int stat_blk_limit_push; /* number of times block limit neared */ 1315 static int stat_ino_limit_push; /* number of times inode limit neared */ 1316 static int stat_blk_limit_hit; /* number of times block slowdown imposed */ 1317 static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ 1318 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ 1319 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ 1320 static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ 1321 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ 1322 static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ 1323 static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */ 1324 static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */ 1325 static int stat_journal_min; /* Times hit journal min threshold */ 1326 static int stat_journal_low; /* Times hit journal low threshold */ 1327 static int stat_journal_wait; /* Times blocked in jwait(). */ 1328 static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */ 1329 static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */ 1330 static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */ 1331 static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. 
*/ 1332 static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */ 1333 static int stat_cleanup_blkrequests; /* Number of block cleanup requests */ 1334 static int stat_cleanup_inorequests; /* Number of inode cleanup requests */ 1335 static int stat_cleanup_retries; /* Number of cleanups that needed to flush */ 1336 static int stat_cleanup_failures; /* Number of cleanup requests that failed */ 1337 static int stat_emptyjblocks; /* Number of potentially empty journal blocks */ 1338 1339 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, 1340 &max_softdeps, 0, ""); 1341 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, 1342 &tickdelay, 0, ""); 1343 SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD, 1344 &stat_flush_threads, 0, ""); 1345 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, 1346 CTLFLAG_RW | CTLFLAG_STATS, &stat_worklist_push, 0,""); 1347 SYSCTL_INT(_debug_softdep, OID_AUTO, delayed_inactivations, CTLFLAG_RD, 1348 &stat_delayed_inact, 0, ""); 1349 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, 1350 CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_push, 0,""); 1351 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, 1352 CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_push, 0,""); 1353 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, 1354 CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_hit, 0, ""); 1355 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, 1356 CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_hit, 0, ""); 1357 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, 1358 CTLFLAG_RW | CTLFLAG_STATS, &stat_sync_limit_hit, 0, ""); 1359 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, 1360 CTLFLAG_RW | CTLFLAG_STATS, &stat_indir_blk_ptrs, 0, ""); 1361 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, 1362 CTLFLAG_RW | CTLFLAG_STATS, &stat_inode_bitmap, 0, ""); 1363 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, 1364 CTLFLAG_RW | CTLFLAG_STATS, &stat_direct_blk_ptrs, 0, ""); 1365 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, 1366 CTLFLAG_RW | CTLFLAG_STATS, &stat_dir_entry, 0, ""); 1367 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, 1368 CTLFLAG_RW | CTLFLAG_STATS, &stat_jaddref, 0, ""); 1369 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, 1370 CTLFLAG_RW | CTLFLAG_STATS, &stat_jnewblk, 0, ""); 1371 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, 1372 CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_low, 0, ""); 1373 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, 1374 CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_min, 0, ""); 1375 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, 1376 CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_wait, 0, ""); 1377 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, 1378 CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_filepage, 0, ""); 1379 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, 1380 CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_freeblks, 0, ""); 1381 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, 1382 CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_inode, 0, ""); 1383 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, 1384 CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_newblk, 0, ""); 1385 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, 1386 CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_blkrequests, 0, ""); 1387 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, 1388 CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_inorequests, 0, ""); 1389 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, 1390 CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_high_delay, 0, ""); 1391 
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, 1392 CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_retries, 0, ""); 1393 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, 1394 CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_failures, 0, ""); 1395 1396 SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW, 1397 &softdep_flushcache, 0, ""); 1398 SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD, 1399 &stat_emptyjblocks, 0, ""); 1400 1401 SYSCTL_DECL(_vfs_ffs); 1402 1403 /* Whether to recompute the summary at mount time */ 1404 static int compute_summary_at_mount = 0; 1405 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, 1406 &compute_summary_at_mount, 0, "Recompute summary at mount"); 1407 static int print_threads = 0; 1408 SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW, 1409 &print_threads, 0, "Notify flusher thread start/stop"); 1410 1411 /* List of all filesystems mounted with soft updates */ 1412 static TAILQ_HEAD(, mount_softdeps) softdepmounts; 1413 1414 static void 1415 get_parent_vp_unlock_bp(struct mount *mp, struct buf *bp, 1416 struct diraddhd *diraddhdp, struct diraddhd *unfinishedp) 1417 { 1418 struct diradd *dap; 1419 1420 /* 1421 * Requeue unfinished dependencies before 1422 * unlocking buffer, which could make 1423 * diraddhdp invalid. 1424 */ 1425 ACQUIRE_LOCK(VFSTOUFS(mp)); 1426 while ((dap = LIST_FIRST(unfinishedp)) != NULL) { 1427 LIST_REMOVE(dap, da_pdlist); 1428 LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist); 1429 } 1430 FREE_LOCK(VFSTOUFS(mp)); 1431 1432 bp->b_vflags &= ~BV_SCANNED; 1433 BUF_NOREC(bp); 1434 BUF_UNLOCK(bp); 1435 } 1436 1437 /* 1438 * This function fetches inode inum on mount point mp. We already 1439 * hold a locked vnode vp, and might have a locked buffer bp belonging 1440 * to vp. 1441 1442 * We must not block on acquiring the new inode lock as we will get 1443 * into a lock-order reversal with the buffer lock and possibly get a 1444 * deadlock. Thus if we cannot instantiate the requested vnode 1445 * without sleeping on its lock, we must unlock the vnode and the 1446 * buffer before doing a blocking on the vnode lock. We return 1447 * ERELOOKUP if we have had to unlock either the vnode or the buffer so 1448 * that the caller can reassess its state. 1449 * 1450 * Top-level VFS code (for syscalls and other consumers, e.g. callers 1451 * of VOP_FSYNC() in syncer) check for ERELOOKUP and restart at safe 1452 * point. 1453 * 1454 * Since callers expect to operate on fully constructed vnode, we also 1455 * recheck v_data after relock, and return ENOENT if NULL. 1456 * 1457 * If unlocking bp, we must unroll dequeueing its unfinished 1458 * dependencies, and clear scan flag, before unlocking. If unlocking 1459 * vp while it is under deactivation, we re-queue deactivation. 1460 */ 1461 static int 1462 get_parent_vp(struct vnode *vp, struct mount *mp, ino_t inum, struct buf *bp, 1463 struct diraddhd *diraddhdp, struct diraddhd *unfinishedp, 1464 struct vnode **rvp) 1465 { 1466 struct vnode *pvp; 1467 int error; 1468 bool bplocked; 1469 1470 ASSERT_VOP_ELOCKED(vp, "child vnode must be locked"); 1471 for (bplocked = true, pvp = NULL;;) { 1472 error = ffs_vgetf(mp, inum, LK_EXCLUSIVE | LK_NOWAIT, &pvp, 1473 FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP); 1474 if (error == 0) { 1475 /* 1476 * Since we could have unlocked vp, the inode 1477 * number could no longer indicate a 1478 * constructed node. In this case, we must 1479 * restart the syscall. 
1480 */ 1481 if (VTOI(pvp)->i_mode == 0 || !bplocked) { 1482 if (bp != NULL && bplocked) 1483 get_parent_vp_unlock_bp(mp, bp, 1484 diraddhdp, unfinishedp); 1485 if (VTOI(pvp)->i_mode == 0) 1486 vgone(pvp); 1487 error = ERELOOKUP; 1488 goto out2; 1489 } 1490 goto out1; 1491 } 1492 if (bp != NULL && bplocked) { 1493 get_parent_vp_unlock_bp(mp, bp, diraddhdp, unfinishedp); 1494 bplocked = false; 1495 } 1496 1497 /* 1498 * Do not drop vnode lock while inactivating during 1499 * vunref. This would result in leaks of the VI flags 1500 * and reclaiming of non-truncated vnode. Instead, 1501 * re-schedule inactivation hoping that we would be 1502 * able to sync inode later. 1503 */ 1504 if ((vp->v_iflag & VI_DOINGINACT) != 0 && 1505 (vp->v_vflag & VV_UNREF) != 0) { 1506 VI_LOCK(vp); 1507 vp->v_iflag |= VI_OWEINACT; 1508 VI_UNLOCK(vp); 1509 return (ERELOOKUP); 1510 } 1511 1512 VOP_UNLOCK(vp); 1513 error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &pvp, 1514 FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP); 1515 if (error != 0) { 1516 MPASS(error != ERELOOKUP); 1517 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1518 break; 1519 } 1520 if (VTOI(pvp)->i_mode == 0) { 1521 vgone(pvp); 1522 vput(pvp); 1523 pvp = NULL; 1524 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1525 error = ERELOOKUP; 1526 break; 1527 } 1528 error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); 1529 if (error == 0) 1530 break; 1531 vput(pvp); 1532 pvp = NULL; 1533 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1534 if (vp->v_data == NULL) { 1535 error = ENOENT; 1536 break; 1537 } 1538 } 1539 if (bp != NULL) { 1540 MPASS(!bplocked); 1541 error = ERELOOKUP; 1542 } 1543 out2: 1544 if (error != 0 && pvp != NULL) { 1545 vput(pvp); 1546 pvp = NULL; 1547 } 1548 out1: 1549 *rvp = pvp; 1550 ASSERT_VOP_ELOCKED(vp, "child vnode must be locked on return"); 1551 return (error); 1552 } 1553 1554 /* 1555 * This function cleans the worklist for a filesystem. 1556 * Each filesystem running with soft dependencies gets its own 1557 * thread to run in this function. The thread is started up in 1558 * softdep_mount and shutdown in softdep_unmount. They show up 1559 * as part of the kernel "bufdaemon" process whose process 1560 * entry is available in bufdaemonproc. 1561 */ 1562 static int searchfailed; 1563 extern struct proc *bufdaemonproc; 1564 static void 1565 softdep_flush(addr) 1566 void *addr; 1567 { 1568 struct mount *mp; 1569 struct thread *td; 1570 struct ufsmount *ump; 1571 int cleanups; 1572 1573 td = curthread; 1574 td->td_pflags |= TDP_NORUNNINGBUF; 1575 mp = (struct mount *)addr; 1576 ump = VFSTOUFS(mp); 1577 atomic_add_int(&stat_flush_threads, 1); 1578 ACQUIRE_LOCK(ump); 1579 ump->softdep_flags &= ~FLUSH_STARTING; 1580 wakeup(&ump->softdep_flushtd); 1581 FREE_LOCK(ump); 1582 if (print_threads) { 1583 if (stat_flush_threads == 1) 1584 printf("Running %s at pid %d\n", bufdaemonproc->p_comm, 1585 bufdaemonproc->p_pid); 1586 printf("Start thread %s\n", td->td_name); 1587 } 1588 for (;;) { 1589 while (softdep_process_worklist(mp, 0) > 0 || 1590 (MOUNTEDSUJ(mp) && 1591 VFSTOUFS(mp)->softdep_jblocks->jb_suspended)) 1592 kthread_suspend_check(); 1593 ACQUIRE_LOCK(ump); 1594 if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) 1595 msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, 1596 "sdflush", hz / 2); 1597 ump->softdep_flags &= ~FLUSH_CLEANUP; 1598 /* 1599 * Check to see if we are done and need to exit. 
1600 */ 1601 if ((ump->softdep_flags & FLUSH_EXIT) == 0) { 1602 FREE_LOCK(ump); 1603 continue; 1604 } 1605 ump->softdep_flags &= ~FLUSH_EXIT; 1606 cleanups = ump->um_softdep->sd_cleanups; 1607 FREE_LOCK(ump); 1608 wakeup(&ump->softdep_flags); 1609 if (print_threads) { 1610 printf("Stop thread %s: searchfailed %d, " 1611 "did cleanups %d\n", 1612 td->td_name, searchfailed, cleanups); 1613 } 1614 atomic_subtract_int(&stat_flush_threads, 1); 1615 kthread_exit(); 1616 panic("kthread_exit failed\n"); 1617 } 1618 } 1619 1620 static void 1621 worklist_speedup(mp) 1622 struct mount *mp; 1623 { 1624 struct ufsmount *ump; 1625 1626 ump = VFSTOUFS(mp); 1627 LOCK_OWNED(ump); 1628 if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) 1629 ump->softdep_flags |= FLUSH_CLEANUP; 1630 wakeup(&ump->softdep_flushtd); 1631 } 1632 1633 static void 1634 softdep_send_speedup(struct ufsmount *ump, off_t shortage, u_int flags) 1635 { 1636 struct buf *bp; 1637 1638 if ((ump->um_flags & UM_CANSPEEDUP) == 0) 1639 return; 1640 1641 bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO); 1642 bp->b_iocmd = BIO_SPEEDUP; 1643 bp->b_ioflags = flags; 1644 bp->b_bcount = omin(shortage, LONG_MAX); 1645 g_vfs_strategy(ump->um_bo, bp); 1646 bufwait(bp); 1647 free(bp, M_TRIM); 1648 } 1649 1650 static int 1651 softdep_speedup(ump) 1652 struct ufsmount *ump; 1653 { 1654 struct ufsmount *altump; 1655 struct mount_softdeps *sdp; 1656 1657 LOCK_OWNED(ump); 1658 worklist_speedup(ump->um_mountp); 1659 bd_speedup(); 1660 /* 1661 * If we have global shortages, then we need other 1662 * filesystems to help with the cleanup. Here we wakeup a 1663 * flusher thread for a filesystem that is over its fair 1664 * share of resources. 1665 */ 1666 if (req_clear_inodedeps || req_clear_remove) { 1667 ACQUIRE_GBLLOCK(&lk); 1668 TAILQ_FOREACH(sdp, &softdepmounts, sd_next) { 1669 if ((altump = sdp->sd_ump) == ump) 1670 continue; 1671 if (((req_clear_inodedeps && 1672 altump->softdep_curdeps[D_INODEDEP] > 1673 max_softdeps / stat_flush_threads) || 1674 (req_clear_remove && 1675 altump->softdep_curdeps[D_DIRREM] > 1676 (max_softdeps / 2) / stat_flush_threads)) && 1677 TRY_ACQUIRE_LOCK(altump)) 1678 break; 1679 } 1680 if (sdp == NULL) { 1681 searchfailed++; 1682 FREE_GBLLOCK(&lk); 1683 } else { 1684 /* 1685 * Move to the end of the list so we pick a 1686 * different one on out next try. 1687 */ 1688 TAILQ_REMOVE(&softdepmounts, sdp, sd_next); 1689 TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next); 1690 FREE_GBLLOCK(&lk); 1691 if ((altump->softdep_flags & 1692 (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) 1693 altump->softdep_flags |= FLUSH_CLEANUP; 1694 altump->um_softdep->sd_cleanups++; 1695 wakeup(&altump->softdep_flushtd); 1696 FREE_LOCK(altump); 1697 } 1698 } 1699 return (speedup_syncer()); 1700 } 1701 1702 /* 1703 * Add an item to the end of the work queue. 1704 * This routine requires that the lock be held. 1705 * This is the only routine that adds items to the list. 1706 * The following routine is the only one that removes items 1707 * and does so in order from first to last. 1708 */ 1709 1710 #define WK_HEAD 0x0001 /* Add to HEAD. */ 1711 #define WK_NODELAY 0x0002 /* Process immediately. 
*/ 1712 1713 static void 1714 add_to_worklist(wk, flags) 1715 struct worklist *wk; 1716 int flags; 1717 { 1718 struct ufsmount *ump; 1719 1720 ump = VFSTOUFS(wk->wk_mp); 1721 LOCK_OWNED(ump); 1722 if (wk->wk_state & ONWORKLIST) 1723 panic("add_to_worklist: %s(0x%X) already on list", 1724 TYPENAME(wk->wk_type), wk->wk_state); 1725 wk->wk_state |= ONWORKLIST; 1726 if (ump->softdep_on_worklist == 0) { 1727 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); 1728 ump->softdep_worklist_tail = wk; 1729 } else if (flags & WK_HEAD) { 1730 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); 1731 } else { 1732 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); 1733 ump->softdep_worklist_tail = wk; 1734 } 1735 ump->softdep_on_worklist += 1; 1736 if (flags & WK_NODELAY) 1737 worklist_speedup(wk->wk_mp); 1738 } 1739 1740 /* 1741 * Remove the item to be processed. If we are removing the last 1742 * item on the list, we need to recalculate the tail pointer. 1743 */ 1744 static void 1745 remove_from_worklist(wk) 1746 struct worklist *wk; 1747 { 1748 struct ufsmount *ump; 1749 1750 ump = VFSTOUFS(wk->wk_mp); 1751 if (ump->softdep_worklist_tail == wk) 1752 ump->softdep_worklist_tail = 1753 (struct worklist *)wk->wk_list.le_prev; 1754 WORKLIST_REMOVE(wk); 1755 ump->softdep_on_worklist -= 1; 1756 } 1757 1758 static void 1759 wake_worklist(wk) 1760 struct worklist *wk; 1761 { 1762 if (wk->wk_state & IOWAITING) { 1763 wk->wk_state &= ~IOWAITING; 1764 wakeup(wk); 1765 } 1766 } 1767 1768 static void 1769 wait_worklist(wk, wmesg) 1770 struct worklist *wk; 1771 char *wmesg; 1772 { 1773 struct ufsmount *ump; 1774 1775 ump = VFSTOUFS(wk->wk_mp); 1776 wk->wk_state |= IOWAITING; 1777 msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0); 1778 } 1779 1780 /* 1781 * Process that runs once per second to handle items in the background queue. 1782 * 1783 * Note that we ensure that everything is done in the order in which they 1784 * appear in the queue. The code below depends on this property to ensure 1785 * that blocks of a file are freed before the inode itself is freed. This 1786 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated 1787 * until all the old ones have been purged from the dependency lists. 1788 */ 1789 static int 1790 softdep_process_worklist(mp, full) 1791 struct mount *mp; 1792 int full; 1793 { 1794 int cnt, matchcnt; 1795 struct ufsmount *ump; 1796 long starttime; 1797 1798 KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); 1799 ump = VFSTOUFS(mp); 1800 if (ump->um_softdep == NULL) 1801 return (0); 1802 matchcnt = 0; 1803 ACQUIRE_LOCK(ump); 1804 starttime = time_second; 1805 softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0); 1806 check_clear_deps(mp); 1807 while (ump->softdep_on_worklist > 0) { 1808 if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0) 1809 break; 1810 else 1811 matchcnt += cnt; 1812 check_clear_deps(mp); 1813 /* 1814 * We do not generally want to stop for buffer space, but if 1815 * we are really being a buffer hog, we will stop and wait. 1816 */ 1817 if (should_yield()) { 1818 FREE_LOCK(ump); 1819 kern_yield(PRI_USER); 1820 bwillwrite(); 1821 ACQUIRE_LOCK(ump); 1822 } 1823 /* 1824 * Never allow processing to run for more than one 1825 * second. This gives the syncer thread the opportunity 1826 * to pause if appropriate. 
1827 */ 1828 if (!full && starttime != time_second) 1829 break; 1830 } 1831 if (full == 0) 1832 journal_unsuspend(ump); 1833 FREE_LOCK(ump); 1834 return (matchcnt); 1835 } 1836 1837 /* 1838 * Process all removes associated with a vnode if we are running out of 1839 * journal space. Any other process that attempts to flush these will 1840 * be unable to do so because we have the vnodes locked. 1841 */ 1842 static void 1843 process_removes(vp) 1844 struct vnode *vp; 1845 { 1846 struct inodedep *inodedep; 1847 struct dirrem *dirrem; 1848 struct ufsmount *ump; 1849 struct mount *mp; 1850 ino_t inum; 1851 1852 mp = vp->v_mount; 1853 ump = VFSTOUFS(mp); 1854 LOCK_OWNED(ump); 1855 inum = VTOI(vp)->i_number; 1856 for (;;) { 1857 top: 1858 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) 1859 return; 1860 LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) { 1861 /* 1862 * If another thread is trying to lock this vnode 1863 * it will fail but we must wait for it to do so 1864 * before we can proceed. 1865 */ 1866 if (dirrem->dm_state & INPROGRESS) { 1867 wait_worklist(&dirrem->dm_list, "pwrwait"); 1868 goto top; 1869 } 1870 if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == 1871 (COMPLETE | ONWORKLIST)) 1872 break; 1873 } 1874 if (dirrem == NULL) 1875 return; 1876 remove_from_worklist(&dirrem->dm_list); 1877 FREE_LOCK(ump); 1878 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) 1879 panic("process_removes: suspended filesystem"); 1880 handle_workitem_remove(dirrem, 0); 1881 vn_finished_secondary_write(mp); 1882 ACQUIRE_LOCK(ump); 1883 } 1884 } 1885 1886 /* 1887 * Process all truncations associated with a vnode if we are running out 1888 * of journal space. This is called when the vnode lock is already held 1889 * and no other process can clear the truncation. All of the vnode's 1890 * eligible truncations are processed before returning. 1891 */ 1892 static void 1893 process_truncates(vp) 1894 struct vnode *vp; 1895 { 1896 struct inodedep *inodedep; 1897 struct freeblks *freeblks; 1898 struct ufsmount *ump; 1899 struct mount *mp; 1900 ino_t inum; 1901 int cgwait; 1902 1903 mp = vp->v_mount; 1904 ump = VFSTOUFS(mp); 1905 LOCK_OWNED(ump); 1906 inum = VTOI(vp)->i_number; 1907 for (;;) { 1908 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) 1909 return; 1910 cgwait = 0; 1911 TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) { 1912 /* Journal entries not yet written. */ 1913 if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) { 1914 jwait(&LIST_FIRST( 1915 &freeblks->fb_jblkdephd)->jb_list, 1916 MNT_WAIT); 1917 break; 1918 } 1919 /* Another thread is executing this item. */ 1920 if (freeblks->fb_state & INPROGRESS) { 1921 wait_worklist(&freeblks->fb_list, "ptrwait"); 1922 break; 1923 } 1924 /* Freeblks is waiting on an inode write.
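 * ffs_update() below writes the inode back so that the pending inode
 * write completes and the freeblks can be handled on a later iteration
 * of the surrounding loop.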
*/ 1925 if ((freeblks->fb_state & COMPLETE) == 0) { 1926 FREE_LOCK(ump); 1927 ffs_update(vp, 1); 1928 ACQUIRE_LOCK(ump); 1929 break; 1930 } 1931 if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) == 1932 (ALLCOMPLETE | ONWORKLIST)) { 1933 remove_from_worklist(&freeblks->fb_list); 1934 freeblks->fb_state |= INPROGRESS; 1935 FREE_LOCK(ump); 1936 if (vn_start_secondary_write(NULL, &mp, 1937 V_NOWAIT)) 1938 panic("process_truncates: " 1939 "suspended filesystem"); 1940 handle_workitem_freeblocks(freeblks, 0); 1941 vn_finished_secondary_write(mp); 1942 ACQUIRE_LOCK(ump); 1943 break; 1944 } 1945 if (freeblks->fb_cgwait) 1946 cgwait++; 1947 } 1948 if (cgwait) { 1949 FREE_LOCK(ump); 1950 sync_cgs(mp, MNT_WAIT); 1951 ffs_sync_snap(mp, MNT_WAIT); 1952 ACQUIRE_LOCK(ump); 1953 continue; 1954 } 1955 if (freeblks == NULL) 1956 break; 1957 } 1958 return; 1959 } 1960 1961 /* 1962 * Process one item on the worklist. 1963 */ 1964 static int 1965 process_worklist_item(mp, target, flags) 1966 struct mount *mp; 1967 int target; 1968 int flags; 1969 { 1970 struct worklist sentinel; 1971 struct worklist *wk; 1972 struct ufsmount *ump; 1973 int matchcnt; 1974 int error; 1975 1976 KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); 1977 /* 1978 * If we are being called because of a process doing a 1979 * copy-on-write, then it is not safe to write as we may 1980 * recurse into the copy-on-write routine. 1981 */ 1982 if (curthread->td_pflags & TDP_COWINPROGRESS) 1983 return (-1); 1984 PHOLD(curproc); /* Don't let the stack go away. */ 1985 ump = VFSTOUFS(mp); 1986 LOCK_OWNED(ump); 1987 matchcnt = 0; 1988 sentinel.wk_mp = NULL; 1989 sentinel.wk_type = D_SENTINEL; 1990 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list); 1991 for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL; 1992 wk = LIST_NEXT(&sentinel, wk_list)) { 1993 if (wk->wk_type == D_SENTINEL) { 1994 LIST_REMOVE(&sentinel, wk_list); 1995 LIST_INSERT_AFTER(wk, &sentinel, wk_list); 1996 continue; 1997 } 1998 if (wk->wk_state & INPROGRESS) 1999 panic("process_worklist_item: %p already in progress.", 2000 wk); 2001 wk->wk_state |= INPROGRESS; 2002 remove_from_worklist(wk); 2003 FREE_LOCK(ump); 2004 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) 2005 panic("process_worklist_item: suspended filesystem"); 2006 switch (wk->wk_type) { 2007 case D_DIRREM: 2008 /* removal of a directory entry */ 2009 error = handle_workitem_remove(WK_DIRREM(wk), flags); 2010 break; 2011 2012 case D_FREEBLKS: 2013 /* releasing blocks and/or fragments from a file */ 2014 error = handle_workitem_freeblocks(WK_FREEBLKS(wk), 2015 flags); 2016 break; 2017 2018 case D_FREEFRAG: 2019 /* releasing a fragment when replaced as a file grows */ 2020 handle_workitem_freefrag(WK_FREEFRAG(wk)); 2021 error = 0; 2022 break; 2023 2024 case D_FREEFILE: 2025 /* releasing an inode when its link count drops to 0 */ 2026 handle_workitem_freefile(WK_FREEFILE(wk)); 2027 error = 0; 2028 break; 2029 2030 default: 2031 panic("%s_process_worklist: Unknown type %s", 2032 "softdep", TYPENAME(wk->wk_type)); 2033 /* NOTREACHED */ 2034 } 2035 vn_finished_secondary_write(mp); 2036 ACQUIRE_LOCK(ump); 2037 if (error == 0) { 2038 if (++matchcnt == target) 2039 break; 2040 continue; 2041 } 2042 /* 2043 * We have to retry the worklist item later. Wake up any 2044 * waiters who may be able to complete it immediately and 2045 * add the item back to the head so we don't try to execute 2046 * it again. 
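 * Because the scan in process_worklist_item() always advances by taking
 * LIST_NEXT of the sentinel, an item re-queued at the head lands before
 * the sentinel and is not revisited during this pass.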
2047 */ 2048 wk->wk_state &= ~INPROGRESS; 2049 wake_worklist(wk); 2050 add_to_worklist(wk, WK_HEAD); 2051 } 2052 /* Sentinal could've become the tail from remove_from_worklist. */ 2053 if (ump->softdep_worklist_tail == &sentinel) 2054 ump->softdep_worklist_tail = 2055 (struct worklist *)sentinel.wk_list.le_prev; 2056 LIST_REMOVE(&sentinel, wk_list); 2057 PRELE(curproc); 2058 return (matchcnt); 2059 } 2060 2061 /* 2062 * Move dependencies from one buffer to another. 2063 */ 2064 int 2065 softdep_move_dependencies(oldbp, newbp) 2066 struct buf *oldbp; 2067 struct buf *newbp; 2068 { 2069 struct worklist *wk, *wktail; 2070 struct ufsmount *ump; 2071 int dirty; 2072 2073 if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL) 2074 return (0); 2075 KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, 2076 ("softdep_move_dependencies called on non-softdep filesystem")); 2077 dirty = 0; 2078 wktail = NULL; 2079 ump = VFSTOUFS(wk->wk_mp); 2080 ACQUIRE_LOCK(ump); 2081 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { 2082 LIST_REMOVE(wk, wk_list); 2083 if (wk->wk_type == D_BMSAFEMAP && 2084 bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp)) 2085 dirty = 1; 2086 if (wktail == NULL) 2087 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); 2088 else 2089 LIST_INSERT_AFTER(wktail, wk, wk_list); 2090 wktail = wk; 2091 } 2092 FREE_LOCK(ump); 2093 2094 return (dirty); 2095 } 2096 2097 /* 2098 * Purge the work list of all items associated with a particular mount point. 2099 */ 2100 int 2101 softdep_flushworklist(oldmnt, countp, td) 2102 struct mount *oldmnt; 2103 int *countp; 2104 struct thread *td; 2105 { 2106 struct vnode *devvp; 2107 struct ufsmount *ump; 2108 int count, error; 2109 2110 /* 2111 * Alternately flush the block device associated with the mount 2112 * point and process any dependencies that the flushing 2113 * creates. We continue until no more worklist dependencies 2114 * are found. 2115 */ 2116 *countp = 0; 2117 error = 0; 2118 ump = VFSTOUFS(oldmnt); 2119 devvp = ump->um_devvp; 2120 while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { 2121 *countp += count; 2122 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 2123 error = VOP_FSYNC(devvp, MNT_WAIT, td); 2124 VOP_UNLOCK(devvp); 2125 if (error != 0) 2126 break; 2127 } 2128 return (error); 2129 } 2130 2131 #define SU_WAITIDLE_RETRIES 20 2132 static int 2133 softdep_waitidle(struct mount *mp, int flags __unused) 2134 { 2135 struct ufsmount *ump; 2136 struct vnode *devvp; 2137 struct thread *td; 2138 int error, i; 2139 2140 ump = VFSTOUFS(mp); 2141 KASSERT(ump->um_softdep != NULL, 2142 ("softdep_waitidle called on non-softdep filesystem")); 2143 devvp = ump->um_devvp; 2144 td = curthread; 2145 error = 0; 2146 ACQUIRE_LOCK(ump); 2147 for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) { 2148 ump->softdep_req = 1; 2149 KASSERT((flags & FORCECLOSE) == 0 || 2150 ump->softdep_on_worklist == 0, 2151 ("softdep_waitidle: work added after flush")); 2152 msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP, 2153 "softdeps", 10 * hz); 2154 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 2155 error = VOP_FSYNC(devvp, MNT_WAIT, td); 2156 VOP_UNLOCK(devvp); 2157 ACQUIRE_LOCK(ump); 2158 if (error != 0) 2159 break; 2160 } 2161 ump->softdep_req = 0; 2162 if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) { 2163 error = EBUSY; 2164 printf("softdep_waitidle: Failed to flush worklist for %p\n", 2165 mp); 2166 } 2167 FREE_LOCK(ump); 2168 return (error); 2169 } 2170 2171 /* 2172 * Flush all vnodes and worklist items associated with a specified mount point. 
2173 */ 2174 int 2175 softdep_flushfiles(oldmnt, flags, td) 2176 struct mount *oldmnt; 2177 int flags; 2178 struct thread *td; 2179 { 2180 struct ufsmount *ump; 2181 #ifdef QUOTA 2182 int i; 2183 #endif 2184 int error, early, depcount, loopcnt, retry_flush_count, retry; 2185 int morework; 2186 2187 ump = VFSTOUFS(oldmnt); 2188 KASSERT(ump->um_softdep != NULL, 2189 ("softdep_flushfiles called on non-softdep filesystem")); 2190 loopcnt = 10; 2191 retry_flush_count = 3; 2192 retry_flush: 2193 error = 0; 2194 2195 /* 2196 * Alternately flush the vnodes associated with the mount 2197 * point and process any dependencies that the flushing 2198 * creates. In theory, this loop can happen at most twice, 2199 * but we give it a few extra just to be sure. 2200 */ 2201 for (; loopcnt > 0; loopcnt--) { 2202 /* 2203 * Do another flush in case any vnodes were brought in 2204 * as part of the cleanup operations. 2205 */ 2206 early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag & 2207 MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH; 2208 if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0) 2209 break; 2210 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 || 2211 depcount == 0) 2212 break; 2213 } 2214 /* 2215 * If we are unmounting then it is an error to fail. If we 2216 * are simply trying to downgrade to read-only, then filesystem 2217 * activity can keep us busy forever, so we just fail with EBUSY. 2218 */ 2219 if (loopcnt == 0) { 2220 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) 2221 panic("softdep_flushfiles: looping"); 2222 error = EBUSY; 2223 } 2224 if (!error) 2225 error = softdep_waitidle(oldmnt, flags); 2226 if (!error) { 2227 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) { 2228 retry = 0; 2229 MNT_ILOCK(oldmnt); 2230 morework = oldmnt->mnt_nvnodelistsize > 0; 2231 #ifdef QUOTA 2232 UFS_LOCK(ump); 2233 for (i = 0; i < MAXQUOTAS; i++) { 2234 if (ump->um_quotas[i] != NULLVP) 2235 morework = 1; 2236 } 2237 UFS_UNLOCK(ump); 2238 #endif 2239 if (morework) { 2240 if (--retry_flush_count > 0) { 2241 retry = 1; 2242 loopcnt = 3; 2243 } else 2244 error = EBUSY; 2245 } 2246 MNT_IUNLOCK(oldmnt); 2247 if (retry) 2248 goto retry_flush; 2249 } 2250 } 2251 return (error); 2252 } 2253 2254 /* 2255 * Structure hashing. 2256 * 2257 * There are four types of structures that can be looked up: 2258 * 1) pagedep structures identified by mount point, inode number, 2259 * and logical block. 2260 * 2) inodedep structures identified by mount point and inode number. 2261 * 3) newblk structures identified by mount point and 2262 * physical block number. 2263 * 4) bmsafemap structures identified by mount point and 2264 * cylinder group number. 2265 * 2266 * The "pagedep" and "inodedep" dependency structures are hashed 2267 * separately from the file blocks and inodes to which they correspond. 2268 * This separation helps when the in-memory copy of an inode or 2269 * file block must be replaced. It also obviates the need to access 2270 * an inode or file page when simply updating (or de-allocating) 2271 * dependency structures. Lookup of newblk structures is needed to 2272 * find newly allocated blocks when trying to associate them with 2273 * their allocdirect or allocindir structure. 2274 * 2275 * The lookup routines optionally create and hash a new instance when 2276 * an existing entry is not found. The bmsafemap lookup routine always 2277 * allocates a new structure if an existing one is not found. 
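 * The pagedep, inodedep, and newblk lookup routines below return 1 when an
 * existing entry was found and 0 when a new one was allocated (or none was
 * found and DEPALLOC was not passed).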
2278 */ 2279 #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ 2280 2281 /* 2282 * Structures and routines associated with pagedep caching. 2283 */ 2284 #define PAGEDEP_HASH(ump, inum, lbn) \ 2285 (&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size]) 2286 2287 static int 2288 pagedep_find(pagedephd, ino, lbn, pagedeppp) 2289 struct pagedep_hashhead *pagedephd; 2290 ino_t ino; 2291 ufs_lbn_t lbn; 2292 struct pagedep **pagedeppp; 2293 { 2294 struct pagedep *pagedep; 2295 2296 LIST_FOREACH(pagedep, pagedephd, pd_hash) { 2297 if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) { 2298 *pagedeppp = pagedep; 2299 return (1); 2300 } 2301 } 2302 *pagedeppp = NULL; 2303 return (0); 2304 } 2305 /* 2306 * Look up a pagedep. Return 1 if found, 0 otherwise. 2307 * If not found, allocate if DEPALLOC flag is passed. 2308 * Found or allocated entry is returned in pagedeppp. 2309 */ 2310 static int 2311 pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp) 2312 struct mount *mp; 2313 struct buf *bp; 2314 ino_t ino; 2315 ufs_lbn_t lbn; 2316 int flags; 2317 struct pagedep **pagedeppp; 2318 { 2319 struct pagedep *pagedep; 2320 struct pagedep_hashhead *pagedephd; 2321 struct worklist *wk; 2322 struct ufsmount *ump; 2323 int ret; 2324 int i; 2325 2326 ump = VFSTOUFS(mp); 2327 LOCK_OWNED(ump); 2328 if (bp) { 2329 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 2330 if (wk->wk_type == D_PAGEDEP) { 2331 *pagedeppp = WK_PAGEDEP(wk); 2332 return (1); 2333 } 2334 } 2335 } 2336 pagedephd = PAGEDEP_HASH(ump, ino, lbn); 2337 ret = pagedep_find(pagedephd, ino, lbn, pagedeppp); 2338 if (ret) { 2339 if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp) 2340 WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list); 2341 return (1); 2342 } 2343 if ((flags & DEPALLOC) == 0) 2344 return (0); 2345 FREE_LOCK(ump); 2346 pagedep = malloc(sizeof(struct pagedep), 2347 M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); 2348 workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); 2349 ACQUIRE_LOCK(ump); 2350 ret = pagedep_find(pagedephd, ino, lbn, pagedeppp); 2351 if (*pagedeppp) { 2352 /* 2353 * This should never happen since we only create pagedeps 2354 * with the vnode lock held. Could be an assert. 2355 */ 2356 WORKITEM_FREE(pagedep, D_PAGEDEP); 2357 return (ret); 2358 } 2359 pagedep->pd_ino = ino; 2360 pagedep->pd_lbn = lbn; 2361 LIST_INIT(&pagedep->pd_dirremhd); 2362 LIST_INIT(&pagedep->pd_pendinghd); 2363 for (i = 0; i < DAHASHSZ; i++) 2364 LIST_INIT(&pagedep->pd_diraddhd[i]); 2365 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); 2366 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 2367 *pagedeppp = pagedep; 2368 return (0); 2369 } 2370 2371 /* 2372 * Structures and routines associated with inodedep caching. 2373 */ 2374 #define INODEDEP_HASH(ump, inum) \ 2375 (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size]) 2376 2377 static int 2378 inodedep_find(inodedephd, inum, inodedeppp) 2379 struct inodedep_hashhead *inodedephd; 2380 ino_t inum; 2381 struct inodedep **inodedeppp; 2382 { 2383 struct inodedep *inodedep; 2384 2385 LIST_FOREACH(inodedep, inodedephd, id_hash) 2386 if (inum == inodedep->id_ino) 2387 break; 2388 if (inodedep) { 2389 *inodedeppp = inodedep; 2390 return (1); 2391 } 2392 *inodedeppp = NULL; 2393 2394 return (0); 2395 } 2396 /* 2397 * Look up an inodedep. Return 1 if found, 0 if not found. 2398 * If not found, allocate if DEPALLOC flag is passed. 2399 * Found or allocated entry is returned in inodedeppp. 
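 * As in pagedep_lookup(), the per-mount lock is dropped around the
 * allocation, so the hash chain is searched a second time before the new
 * entry is linked in, in case another thread raced us and inserted one.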
2400 */ 2401 static int 2402 inodedep_lookup(mp, inum, flags, inodedeppp) 2403 struct mount *mp; 2404 ino_t inum; 2405 int flags; 2406 struct inodedep **inodedeppp; 2407 { 2408 struct inodedep *inodedep; 2409 struct inodedep_hashhead *inodedephd; 2410 struct ufsmount *ump; 2411 struct fs *fs; 2412 2413 ump = VFSTOUFS(mp); 2414 LOCK_OWNED(ump); 2415 fs = ump->um_fs; 2416 inodedephd = INODEDEP_HASH(ump, inum); 2417 2418 if (inodedep_find(inodedephd, inum, inodedeppp)) 2419 return (1); 2420 if ((flags & DEPALLOC) == 0) 2421 return (0); 2422 /* 2423 * If the system is over its limit and our filesystem is 2424 * responsible for more than our share of that usage and 2425 * we are not in a rush, request some inodedep cleanup. 2426 */ 2427 if (softdep_excess_items(ump, D_INODEDEP)) 2428 schedule_cleanup(mp); 2429 else 2430 FREE_LOCK(ump); 2431 inodedep = malloc(sizeof(struct inodedep), 2432 M_INODEDEP, M_SOFTDEP_FLAGS); 2433 workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); 2434 ACQUIRE_LOCK(ump); 2435 if (inodedep_find(inodedephd, inum, inodedeppp)) { 2436 WORKITEM_FREE(inodedep, D_INODEDEP); 2437 return (1); 2438 } 2439 inodedep->id_fs = fs; 2440 inodedep->id_ino = inum; 2441 inodedep->id_state = ALLCOMPLETE; 2442 inodedep->id_nlinkdelta = 0; 2443 inodedep->id_nlinkwrote = -1; 2444 inodedep->id_savedino1 = NULL; 2445 inodedep->id_savedsize = -1; 2446 inodedep->id_savedextsize = -1; 2447 inodedep->id_savednlink = -1; 2448 inodedep->id_bmsafemap = NULL; 2449 inodedep->id_mkdiradd = NULL; 2450 LIST_INIT(&inodedep->id_dirremhd); 2451 LIST_INIT(&inodedep->id_pendinghd); 2452 LIST_INIT(&inodedep->id_inowait); 2453 LIST_INIT(&inodedep->id_bufwait); 2454 TAILQ_INIT(&inodedep->id_inoreflst); 2455 TAILQ_INIT(&inodedep->id_inoupdt); 2456 TAILQ_INIT(&inodedep->id_newinoupdt); 2457 TAILQ_INIT(&inodedep->id_extupdt); 2458 TAILQ_INIT(&inodedep->id_newextupdt); 2459 TAILQ_INIT(&inodedep->id_freeblklst); 2460 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); 2461 *inodedeppp = inodedep; 2462 return (0); 2463 } 2464 2465 /* 2466 * Structures and routines associated with newblk caching. 2467 */ 2468 #define NEWBLK_HASH(ump, inum) \ 2469 (&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size]) 2470 2471 static int 2472 newblk_find(newblkhd, newblkno, flags, newblkpp) 2473 struct newblk_hashhead *newblkhd; 2474 ufs2_daddr_t newblkno; 2475 int flags; 2476 struct newblk **newblkpp; 2477 { 2478 struct newblk *newblk; 2479 2480 LIST_FOREACH(newblk, newblkhd, nb_hash) { 2481 if (newblkno != newblk->nb_newblkno) 2482 continue; 2483 /* 2484 * If we're creating a new dependency don't match those that 2485 * have already been converted to allocdirects. This is for 2486 * a frag extend. 2487 */ 2488 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) 2489 continue; 2490 break; 2491 } 2492 if (newblk) { 2493 *newblkpp = newblk; 2494 return (1); 2495 } 2496 *newblkpp = NULL; 2497 return (0); 2498 } 2499 2500 /* 2501 * Look up a newblk. Return 1 if found, 0 if not found. 2502 * If not found, allocate if DEPALLOC flag is passed. 2503 * Found or allocated entry is returned in newblkpp. 
2504 */ 2505 static int 2506 newblk_lookup(mp, newblkno, flags, newblkpp) 2507 struct mount *mp; 2508 ufs2_daddr_t newblkno; 2509 int flags; 2510 struct newblk **newblkpp; 2511 { 2512 struct newblk *newblk; 2513 struct newblk_hashhead *newblkhd; 2514 struct ufsmount *ump; 2515 2516 ump = VFSTOUFS(mp); 2517 LOCK_OWNED(ump); 2518 newblkhd = NEWBLK_HASH(ump, newblkno); 2519 if (newblk_find(newblkhd, newblkno, flags, newblkpp)) 2520 return (1); 2521 if ((flags & DEPALLOC) == 0) 2522 return (0); 2523 if (softdep_excess_items(ump, D_NEWBLK) || 2524 softdep_excess_items(ump, D_ALLOCDIRECT) || 2525 softdep_excess_items(ump, D_ALLOCINDIR)) 2526 schedule_cleanup(mp); 2527 else 2528 FREE_LOCK(ump); 2529 newblk = malloc(sizeof(union allblk), M_NEWBLK, 2530 M_SOFTDEP_FLAGS | M_ZERO); 2531 workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); 2532 ACQUIRE_LOCK(ump); 2533 if (newblk_find(newblkhd, newblkno, flags, newblkpp)) { 2534 WORKITEM_FREE(newblk, D_NEWBLK); 2535 return (1); 2536 } 2537 newblk->nb_freefrag = NULL; 2538 LIST_INIT(&newblk->nb_indirdeps); 2539 LIST_INIT(&newblk->nb_newdirblk); 2540 LIST_INIT(&newblk->nb_jwork); 2541 newblk->nb_state = ATTACHED; 2542 newblk->nb_newblkno = newblkno; 2543 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); 2544 *newblkpp = newblk; 2545 return (0); 2546 } 2547 2548 /* 2549 * Structures and routines associated with freed indirect block caching. 2550 */ 2551 #define INDIR_HASH(ump, blkno) \ 2552 (&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size]) 2553 2554 /* 2555 * Lookup an indirect block in the indir hash table. The freework is 2556 * removed and potentially freed. The caller must do a blocking journal 2557 * write before writing to the blkno. 2558 */ 2559 static int 2560 indirblk_lookup(mp, blkno) 2561 struct mount *mp; 2562 ufs2_daddr_t blkno; 2563 { 2564 struct freework *freework; 2565 struct indir_hashhead *wkhd; 2566 struct ufsmount *ump; 2567 2568 ump = VFSTOUFS(mp); 2569 wkhd = INDIR_HASH(ump, blkno); 2570 TAILQ_FOREACH(freework, wkhd, fw_next) { 2571 if (freework->fw_blkno != blkno) 2572 continue; 2573 indirblk_remove(freework); 2574 return (1); 2575 } 2576 return (0); 2577 } 2578 2579 /* 2580 * Insert an indirect block represented by freework into the indirblk 2581 * hash table so that it may prevent the block from being re-used prior 2582 * to the journal being written. 2583 */ 2584 static void 2585 indirblk_insert(freework) 2586 struct freework *freework; 2587 { 2588 struct jblocks *jblocks; 2589 struct jseg *jseg; 2590 struct ufsmount *ump; 2591 2592 ump = VFSTOUFS(freework->fw_list.wk_mp); 2593 jblocks = ump->softdep_jblocks; 2594 jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst); 2595 if (jseg == NULL) 2596 return; 2597 2598 LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs); 2599 TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework, 2600 fw_next); 2601 freework->fw_state &= ~DEPCOMPLETE; 2602 } 2603 2604 static void 2605 indirblk_remove(freework) 2606 struct freework *freework; 2607 { 2608 struct ufsmount *ump; 2609 2610 ump = VFSTOUFS(freework->fw_list.wk_mp); 2611 LIST_REMOVE(freework, fw_segs); 2612 TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next); 2613 freework->fw_state |= DEPCOMPLETE; 2614 if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) 2615 WORKITEM_FREE(freework, D_FREEWORK); 2616 } 2617 2618 /* 2619 * Executed during filesystem system initialization before 2620 * mounting any filesystems. 
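 * This is where the bioops hooks and the softdep callout are installed;
 * softdep_uninitialize() below tears them down again at module unload.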
2621 */ 2622 void 2623 softdep_initialize() 2624 { 2625 2626 TAILQ_INIT(&softdepmounts); 2627 #ifdef __LP64__ 2628 max_softdeps = desiredvnodes * 4; 2629 #else 2630 max_softdeps = desiredvnodes * 2; 2631 #endif 2632 2633 /* initialise bioops hack */ 2634 bioops.io_start = softdep_disk_io_initiation; 2635 bioops.io_complete = softdep_disk_write_complete; 2636 bioops.io_deallocate = softdep_deallocate_dependencies; 2637 bioops.io_countdeps = softdep_count_dependencies; 2638 softdep_ast_cleanup = softdep_ast_cleanup_proc; 2639 2640 /* Initialize the callout with an mtx. */ 2641 callout_init_mtx(&softdep_callout, &lk, 0); 2642 } 2643 2644 /* 2645 * Executed after all filesystems have been unmounted during 2646 * filesystem module unload. 2647 */ 2648 void 2649 softdep_uninitialize() 2650 { 2651 2652 /* clear bioops hack */ 2653 bioops.io_start = NULL; 2654 bioops.io_complete = NULL; 2655 bioops.io_deallocate = NULL; 2656 bioops.io_countdeps = NULL; 2657 softdep_ast_cleanup = NULL; 2658 2659 callout_drain(&softdep_callout); 2660 } 2661 2662 /* 2663 * Called at mount time to notify the dependency code that a 2664 * filesystem wishes to use it. 2665 */ 2666 int 2667 softdep_mount(devvp, mp, fs, cred) 2668 struct vnode *devvp; 2669 struct mount *mp; 2670 struct fs *fs; 2671 struct ucred *cred; 2672 { 2673 struct csum_total cstotal; 2674 struct mount_softdeps *sdp; 2675 struct ufsmount *ump; 2676 struct cg *cgp; 2677 struct buf *bp; 2678 u_int cyl, i; 2679 int error; 2680 2681 ump = VFSTOUFS(mp); 2682 2683 sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA, 2684 M_WAITOK | M_ZERO); 2685 rw_init(&sdp->sd_fslock, "SUrw"); 2686 sdp->sd_ump = ump; 2687 LIST_INIT(&sdp->sd_workitem_pending); 2688 LIST_INIT(&sdp->sd_journal_pending); 2689 TAILQ_INIT(&sdp->sd_unlinked); 2690 LIST_INIT(&sdp->sd_dirtycg); 2691 sdp->sd_worklist_tail = NULL; 2692 sdp->sd_on_worklist = 0; 2693 sdp->sd_deps = 0; 2694 LIST_INIT(&sdp->sd_mkdirlisthd); 2695 sdp->sd_pdhash = hashinit(desiredvnodes / 5, M_PAGEDEP, 2696 &sdp->sd_pdhashsize); 2697 sdp->sd_pdnextclean = 0; 2698 sdp->sd_idhash = hashinit(desiredvnodes, M_INODEDEP, 2699 &sdp->sd_idhashsize); 2700 sdp->sd_idnextclean = 0; 2701 sdp->sd_newblkhash = hashinit(max_softdeps / 2, M_NEWBLK, 2702 &sdp->sd_newblkhashsize); 2703 sdp->sd_bmhash = hashinit(1024, M_BMSAFEMAP, &sdp->sd_bmhashsize); 2704 i = 1 << (ffs(desiredvnodes / 10) - 1); 2705 sdp->sd_indirhash = malloc(i * sizeof(struct indir_hashhead), 2706 M_FREEWORK, M_WAITOK); 2707 sdp->sd_indirhashsize = i - 1; 2708 for (i = 0; i <= sdp->sd_indirhashsize; i++) 2709 TAILQ_INIT(&sdp->sd_indirhash[i]); 2710 for (i = 0; i <= D_LAST; i++) 2711 LIST_INIT(&sdp->sd_alldeps[i]); 2712 ACQUIRE_GBLLOCK(&lk); 2713 TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next); 2714 FREE_GBLLOCK(&lk); 2715 2716 ump->um_softdep = sdp; 2717 MNT_ILOCK(mp); 2718 mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; 2719 if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { 2720 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 2721 MNTK_SOFTDEP | MNTK_NOASYNC; 2722 } 2723 MNT_IUNLOCK(mp); 2724 2725 if ((fs->fs_flags & FS_SUJ) && 2726 (error = journal_mount(mp, fs, cred)) != 0) { 2727 printf("Failed to start journal: %d\n", error); 2728 softdep_unmount(mp); 2729 return (error); 2730 } 2731 /* 2732 * Start our flushing thread in the bufdaemon process. 
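 * We then wait for the new thread to clear FLUSH_STARTING so that it is
 * fully running before the mount completes.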
2733 */ 2734 ACQUIRE_LOCK(ump); 2735 ump->softdep_flags |= FLUSH_STARTING; 2736 FREE_LOCK(ump); 2737 kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc, 2738 &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker", 2739 mp->mnt_stat.f_mntonname); 2740 ACQUIRE_LOCK(ump); 2741 while ((ump->softdep_flags & FLUSH_STARTING) != 0) { 2742 msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart", 2743 hz / 2); 2744 } 2745 FREE_LOCK(ump); 2746 /* 2747 * When doing soft updates, the counters in the 2748 * superblock may have gotten out of sync. Recomputation 2749 * can take a long time and can be deferred for background 2750 * fsck. However, the old behavior of scanning the cylinder 2751 * groups and recalculating them at mount time is available 2752 * by setting vfs.ffs.compute_summary_at_mount to one. 2753 */ 2754 if (compute_summary_at_mount == 0 || fs->fs_clean != 0) 2755 return (0); 2756 bzero(&cstotal, sizeof cstotal); 2757 for (cyl = 0; cyl < fs->fs_ncg; cyl++) { 2758 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), 2759 fs->fs_cgsize, cred, &bp)) != 0) { 2760 brelse(bp); 2761 softdep_unmount(mp); 2762 return (error); 2763 } 2764 cgp = (struct cg *)bp->b_data; 2765 cstotal.cs_nffree += cgp->cg_cs.cs_nffree; 2766 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; 2767 cstotal.cs_nifree += cgp->cg_cs.cs_nifree; 2768 cstotal.cs_ndir += cgp->cg_cs.cs_ndir; 2769 fs->fs_cs(fs, cyl) = cgp->cg_cs; 2770 brelse(bp); 2771 } 2772 #ifdef INVARIANTS 2773 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) 2774 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); 2775 #endif 2776 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); 2777 return (0); 2778 } 2779 2780 void 2781 softdep_unmount(mp) 2782 struct mount *mp; 2783 { 2784 struct ufsmount *ump; 2785 struct mount_softdeps *ums; 2786 2787 ump = VFSTOUFS(mp); 2788 KASSERT(ump->um_softdep != NULL, 2789 ("softdep_unmount called on non-softdep filesystem")); 2790 MNT_ILOCK(mp); 2791 mp->mnt_flag &= ~MNT_SOFTDEP; 2792 if ((mp->mnt_flag & MNT_SUJ) == 0) { 2793 MNT_IUNLOCK(mp); 2794 } else { 2795 mp->mnt_flag &= ~MNT_SUJ; 2796 MNT_IUNLOCK(mp); 2797 journal_unmount(ump); 2798 } 2799 /* 2800 * Shut down our flushing thread. Check for NULL is if 2801 * softdep_mount errors out before the thread has been created. 2802 */ 2803 if (ump->softdep_flushtd != NULL) { 2804 ACQUIRE_LOCK(ump); 2805 ump->softdep_flags |= FLUSH_EXIT; 2806 wakeup(&ump->softdep_flushtd); 2807 while ((ump->softdep_flags & FLUSH_EXIT) != 0) { 2808 msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM, 2809 "sdwait", 0); 2810 } 2811 KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0, 2812 ("Thread shutdown failed")); 2813 FREE_LOCK(ump); 2814 } 2815 2816 /* 2817 * We are no longer have softdep structure attached to ump. 2818 */ 2819 ums = ump->um_softdep; 2820 ACQUIRE_GBLLOCK(&lk); 2821 TAILQ_REMOVE(&softdepmounts, ums, sd_next); 2822 FREE_GBLLOCK(&lk); 2823 ump->um_softdep = NULL; 2824 2825 KASSERT(ums->sd_on_journal == 0, 2826 ("ump %p ums %p on_journal %d", ump, ums, ums->sd_on_journal)); 2827 KASSERT(ums->sd_on_worklist == 0, 2828 ("ump %p ums %p on_worklist %d", ump, ums, ums->sd_on_worklist)); 2829 KASSERT(ums->sd_deps == 0, 2830 ("ump %p ums %p deps %d", ump, ums, ums->sd_deps)); 2831 2832 /* 2833 * Free up our resources. 
2834 */ 2835 rw_destroy(&ums->sd_fslock); 2836 hashdestroy(ums->sd_pdhash, M_PAGEDEP, ums->sd_pdhashsize); 2837 hashdestroy(ums->sd_idhash, M_INODEDEP, ums->sd_idhashsize); 2838 hashdestroy(ums->sd_newblkhash, M_NEWBLK, ums->sd_newblkhashsize); 2839 hashdestroy(ums->sd_bmhash, M_BMSAFEMAP, ums->sd_bmhashsize); 2840 free(ums->sd_indirhash, M_FREEWORK); 2841 #ifdef INVARIANTS 2842 for (int i = 0; i <= D_LAST; i++) { 2843 KASSERT(ums->sd_curdeps[i] == 0, 2844 ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt, 2845 TYPENAME(i), ums->sd_curdeps[i])); 2846 KASSERT(LIST_EMPTY(&ums->sd_alldeps[i]), 2847 ("Unmount %s: Dep type %s not empty (%p)", 2848 ump->um_fs->fs_fsmnt, 2849 TYPENAME(i), LIST_FIRST(&ums->sd_alldeps[i]))); 2850 } 2851 #endif 2852 free(ums, M_MOUNTDATA); 2853 } 2854 2855 static struct jblocks * 2856 jblocks_create(void) 2857 { 2858 struct jblocks *jblocks; 2859 2860 jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); 2861 TAILQ_INIT(&jblocks->jb_segs); 2862 jblocks->jb_avail = 10; 2863 jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, 2864 M_JBLOCKS, M_WAITOK | M_ZERO); 2865 2866 return (jblocks); 2867 } 2868 2869 static ufs2_daddr_t 2870 jblocks_alloc(jblocks, bytes, actual) 2871 struct jblocks *jblocks; 2872 int bytes; 2873 int *actual; 2874 { 2875 ufs2_daddr_t daddr; 2876 struct jextent *jext; 2877 int freecnt; 2878 int blocks; 2879 2880 blocks = bytes / DEV_BSIZE; 2881 jext = &jblocks->jb_extent[jblocks->jb_head]; 2882 freecnt = jext->je_blocks - jblocks->jb_off; 2883 if (freecnt == 0) { 2884 jblocks->jb_off = 0; 2885 if (++jblocks->jb_head > jblocks->jb_used) 2886 jblocks->jb_head = 0; 2887 jext = &jblocks->jb_extent[jblocks->jb_head]; 2888 freecnt = jext->je_blocks; 2889 } 2890 if (freecnt > blocks) 2891 freecnt = blocks; 2892 *actual = freecnt * DEV_BSIZE; 2893 daddr = jext->je_daddr + jblocks->jb_off; 2894 jblocks->jb_off += freecnt; 2895 jblocks->jb_free -= freecnt; 2896 2897 return (daddr); 2898 } 2899 2900 static void 2901 jblocks_free(jblocks, mp, bytes) 2902 struct jblocks *jblocks; 2903 struct mount *mp; 2904 int bytes; 2905 { 2906 2907 LOCK_OWNED(VFSTOUFS(mp)); 2908 jblocks->jb_free += bytes / DEV_BSIZE; 2909 if (jblocks->jb_suspended) 2910 worklist_speedup(mp); 2911 wakeup(jblocks); 2912 } 2913 2914 static void 2915 jblocks_destroy(jblocks) 2916 struct jblocks *jblocks; 2917 { 2918 2919 if (jblocks->jb_extent) 2920 free(jblocks->jb_extent, M_JBLOCKS); 2921 free(jblocks, M_JBLOCKS); 2922 } 2923 2924 static void 2925 jblocks_add(jblocks, daddr, blocks) 2926 struct jblocks *jblocks; 2927 ufs2_daddr_t daddr; 2928 int blocks; 2929 { 2930 struct jextent *jext; 2931 2932 jblocks->jb_blocks += blocks; 2933 jblocks->jb_free += blocks; 2934 jext = &jblocks->jb_extent[jblocks->jb_used]; 2935 /* Adding the first block. */ 2936 if (jext->je_daddr == 0) { 2937 jext->je_daddr = daddr; 2938 jext->je_blocks = blocks; 2939 return; 2940 } 2941 /* Extending the last extent. */ 2942 if (jext->je_daddr + jext->je_blocks == daddr) { 2943 jext->je_blocks += blocks; 2944 return; 2945 } 2946 /* Adding a new extent. 
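 * The extent array is grown by doubling jb_avail and copying the existing
 * entries when it fills up.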
*/ 2947 if (++jblocks->jb_used == jblocks->jb_avail) { 2948 jblocks->jb_avail *= 2; 2949 jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, 2950 M_JBLOCKS, M_WAITOK | M_ZERO); 2951 memcpy(jext, jblocks->jb_extent, 2952 sizeof(struct jextent) * jblocks->jb_used); 2953 free(jblocks->jb_extent, M_JBLOCKS); 2954 jblocks->jb_extent = jext; 2955 } 2956 jext = &jblocks->jb_extent[jblocks->jb_used]; 2957 jext->je_daddr = daddr; 2958 jext->je_blocks = blocks; 2959 return; 2960 } 2961 2962 int 2963 softdep_journal_lookup(mp, vpp) 2964 struct mount *mp; 2965 struct vnode **vpp; 2966 { 2967 struct componentname cnp; 2968 struct vnode *dvp; 2969 ino_t sujournal; 2970 int error; 2971 2972 error = VFS_VGET(mp, UFS_ROOTINO, LK_EXCLUSIVE, &dvp); 2973 if (error) 2974 return (error); 2975 bzero(&cnp, sizeof(cnp)); 2976 cnp.cn_nameiop = LOOKUP; 2977 cnp.cn_flags = ISLASTCN; 2978 cnp.cn_cred = curthread->td_ucred; 2979 cnp.cn_pnbuf = SUJ_FILE; 2980 cnp.cn_nameptr = SUJ_FILE; 2981 cnp.cn_namelen = strlen(SUJ_FILE); 2982 error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal); 2983 vput(dvp); 2984 if (error != 0) 2985 return (error); 2986 error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp); 2987 return (error); 2988 } 2989 2990 /* 2991 * Open and verify the journal file. 2992 */ 2993 static int 2994 journal_mount(mp, fs, cred) 2995 struct mount *mp; 2996 struct fs *fs; 2997 struct ucred *cred; 2998 { 2999 struct jblocks *jblocks; 3000 struct ufsmount *ump; 3001 struct vnode *vp; 3002 struct inode *ip; 3003 ufs2_daddr_t blkno; 3004 int bcount; 3005 int error; 3006 int i; 3007 3008 ump = VFSTOUFS(mp); 3009 ump->softdep_journal_tail = NULL; 3010 ump->softdep_on_journal = 0; 3011 ump->softdep_accdeps = 0; 3012 ump->softdep_req = 0; 3013 ump->softdep_jblocks = NULL; 3014 error = softdep_journal_lookup(mp, &vp); 3015 if (error != 0) { 3016 printf("Failed to find journal. Use tunefs to create one\n"); 3017 return (error); 3018 } 3019 ip = VTOI(vp); 3020 if (ip->i_size < SUJ_MIN) { 3021 error = ENOSPC; 3022 goto out; 3023 } 3024 bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ 3025 jblocks = jblocks_create(); 3026 for (i = 0; i < bcount; i++) { 3027 error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); 3028 if (error) 3029 break; 3030 jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); 3031 } 3032 if (error) { 3033 jblocks_destroy(jblocks); 3034 goto out; 3035 } 3036 jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */ 3037 jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ 3038 ump->softdep_jblocks = jblocks; 3039 3040 MNT_ILOCK(mp); 3041 mp->mnt_flag |= MNT_SUJ; 3042 MNT_IUNLOCK(mp); 3043 3044 /* 3045 * Only validate the journal contents if the 3046 * filesystem is clean, otherwise we write the logs 3047 * but they'll never be used. If the filesystem was 3048 * still dirty when we mounted it the journal is 3049 * invalid and a new journal can only be valid if it 3050 * starts from a clean mount. 3051 */ 3052 if (fs->fs_clean) { 3053 DIP_SET(ip, i_modrev, fs->fs_mtime); 3054 ip->i_flags |= IN_MODIFIED; 3055 ffs_update(vp, 1); 3056 } 3057 out: 3058 vput(vp); 3059 return (error); 3060 } 3061 3062 static void 3063 journal_unmount(ump) 3064 struct ufsmount *ump; 3065 { 3066 3067 if (ump->softdep_jblocks) 3068 jblocks_destroy(ump->softdep_jblocks); 3069 ump->softdep_jblocks = NULL; 3070 } 3071 3072 /* 3073 * Called when a journal record is ready to be written. Space is allocated 3074 * and the journal entry is created when the journal is flushed to stable 3075 * store. 
3076 */ 3077 static void 3078 add_to_journal(wk) 3079 struct worklist *wk; 3080 { 3081 struct ufsmount *ump; 3082 3083 ump = VFSTOUFS(wk->wk_mp); 3084 LOCK_OWNED(ump); 3085 if (wk->wk_state & ONWORKLIST) 3086 panic("add_to_journal: %s(0x%X) already on list", 3087 TYPENAME(wk->wk_type), wk->wk_state); 3088 wk->wk_state |= ONWORKLIST | DEPCOMPLETE; 3089 if (LIST_EMPTY(&ump->softdep_journal_pending)) { 3090 ump->softdep_jblocks->jb_age = ticks; 3091 LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list); 3092 } else 3093 LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list); 3094 ump->softdep_journal_tail = wk; 3095 ump->softdep_on_journal += 1; 3096 } 3097 3098 /* 3099 * Remove an arbitrary item from the journal worklist, maintaining the tail 3100 * pointer. This happens when a new operation obviates the need to 3101 * journal an old operation. 3102 */ 3103 static void 3104 remove_from_journal(wk) 3105 struct worklist *wk; 3106 { 3107 struct ufsmount *ump; 3108 3109 ump = VFSTOUFS(wk->wk_mp); 3110 LOCK_OWNED(ump); 3111 #ifdef INVARIANTS 3112 { 3113 struct worklist *wkn; 3114 3115 LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list) 3116 if (wkn == wk) 3117 break; 3118 if (wkn == NULL) 3119 panic("remove_from_journal: %p is not in journal", wk); 3120 } 3121 #endif 3122 /* 3123 * We emulate a TAILQ to save space in most structures which do not 3124 * require TAILQ semantics. Here we must update the tail position 3125 * when removing the tail which is not the final entry. This works 3126 * only if the worklist linkages are at the beginning of the structure. 3127 */ 3128 if (ump->softdep_journal_tail == wk) 3129 ump->softdep_journal_tail = 3130 (struct worklist *)wk->wk_list.le_prev; 3131 WORKLIST_REMOVE(wk); 3132 ump->softdep_on_journal -= 1; 3133 } 3134 3135 /* 3136 * Check for journal space as well as dependency limits so the prelink 3137 * code can throttle both journaled and non-journaled filesystems. 3138 * Threshold is 0 for low and 1 for min. 3139 */ 3140 static int 3141 journal_space(ump, thresh) 3142 struct ufsmount *ump; 3143 int thresh; 3144 { 3145 struct jblocks *jblocks; 3146 int limit, avail; 3147 3148 jblocks = ump->softdep_jblocks; 3149 if (jblocks == NULL) 3150 return (1); 3151 /* 3152 * We use a tighter restriction here to prevent request_cleanup() 3153 * running in threads from running into locks we currently hold. 3154 * We have to be over the limit and our filesystem has to be 3155 * responsible for more than our share of that usage.
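 * Once past that check, space is considered sufficient only when the free
 * journal blocks, less the blocks consumed by records still queued in
 * memory, exceed the chosen threshold (jb_min or jb_low).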
3156 */ 3157 limit = (max_softdeps / 10) * 9; 3158 if (dep_current[D_INODEDEP] > limit && 3159 ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads) 3160 return (0); 3161 if (thresh) 3162 thresh = jblocks->jb_min; 3163 else 3164 thresh = jblocks->jb_low; 3165 avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; 3166 avail = jblocks->jb_free - avail; 3167 3168 return (avail > thresh); 3169 } 3170 3171 static void 3172 journal_suspend(ump) 3173 struct ufsmount *ump; 3174 { 3175 struct jblocks *jblocks; 3176 struct mount *mp; 3177 bool set; 3178 3179 mp = UFSTOVFS(ump); 3180 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) 3181 return; 3182 3183 jblocks = ump->softdep_jblocks; 3184 vfs_op_enter(mp); 3185 set = false; 3186 MNT_ILOCK(mp); 3187 if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 3188 stat_journal_min++; 3189 mp->mnt_kern_flag |= MNTK_SUSPEND; 3190 mp->mnt_susp_owner = ump->softdep_flushtd; 3191 set = true; 3192 } 3193 jblocks->jb_suspended = 1; 3194 MNT_IUNLOCK(mp); 3195 if (!set) 3196 vfs_op_exit(mp); 3197 } 3198 3199 static int 3200 journal_unsuspend(struct ufsmount *ump) 3201 { 3202 struct jblocks *jblocks; 3203 struct mount *mp; 3204 3205 mp = UFSTOVFS(ump); 3206 jblocks = ump->softdep_jblocks; 3207 3208 if (jblocks != NULL && jblocks->jb_suspended && 3209 journal_space(ump, jblocks->jb_min)) { 3210 jblocks->jb_suspended = 0; 3211 FREE_LOCK(ump); 3212 mp->mnt_susp_owner = curthread; 3213 vfs_write_resume(mp, 0); 3214 ACQUIRE_LOCK(ump); 3215 return (1); 3216 } 3217 return (0); 3218 } 3219 3220 static void 3221 journal_check_space(struct ufsmount *ump) 3222 { 3223 struct mount *mp; 3224 3225 LOCK_OWNED(ump); 3226 3227 if (journal_space(ump, 0) == 0) { 3228 softdep_speedup(ump); 3229 mp = UFSTOVFS(ump); 3230 FREE_LOCK(ump); 3231 VFS_SYNC(mp, MNT_NOWAIT); 3232 ffs_sbupdate(ump, MNT_WAIT, 0); 3233 ACQUIRE_LOCK(ump); 3234 if (journal_space(ump, 1) == 0) 3235 journal_suspend(ump); 3236 } 3237 } 3238 3239 /* 3240 * Called before any allocation function to be certain that there is 3241 * sufficient space in the journal prior to creating any new records. 3242 * Since in the case of block allocation we may have multiple locked 3243 * buffers at the time of the actual allocation we can not block 3244 * when the journal records are created. Doing so would create a deadlock 3245 * if any of these buffers needed to be flushed to reclaim space. Instead 3246 * we require a sufficiently large amount of available space such that 3247 * each thread in the system could have passed this allocation check and 3248 * still have sufficient free space. With 20% of a minimum journal size 3249 * of 1MB we have 6553 records available. 3250 */ 3251 int 3252 softdep_prealloc(vp, waitok) 3253 struct vnode *vp; 3254 int waitok; 3255 { 3256 struct ufsmount *ump; 3257 3258 KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, 3259 ("softdep_prealloc called on non-softdep filesystem")); 3260 /* 3261 * Nothing to do if we are not running journaled soft updates. 3262 * If we currently hold the snapshot lock, we must avoid 3263 * handling other resources that could cause deadlock. Do not 3264 * touch quotas vnode since it is typically recursed with 3265 * other vnode locks held. 
3266 */ 3267 if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) || 3268 (vp->v_vflag & VV_SYSTEM) != 0) 3269 return (0); 3270 ump = VFSTOUFS(vp->v_mount); 3271 ACQUIRE_LOCK(ump); 3272 if (journal_space(ump, 0)) { 3273 FREE_LOCK(ump); 3274 return (0); 3275 } 3276 stat_journal_low++; 3277 FREE_LOCK(ump); 3278 if (waitok == MNT_NOWAIT) 3279 return (ENOSPC); 3280 /* 3281 * Attempt to sync this vnode once to flush any journal 3282 * work attached to it. 3283 */ 3284 if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0) 3285 ffs_syncvnode(vp, waitok, 0); 3286 ACQUIRE_LOCK(ump); 3287 process_removes(vp); 3288 process_truncates(vp); 3289 journal_check_space(ump); 3290 FREE_LOCK(ump); 3291 3292 return (0); 3293 } 3294 3295 /* 3296 * Try hard to sync all data and metadata for the vnode, and workitems 3297 * flushing which might conflict with the vnode lock. This is a 3298 * helper for softdep_prerename(). 3299 */ 3300 static int 3301 softdep_prerename_vnode(ump, vp) 3302 struct ufsmount *ump; 3303 struct vnode *vp; 3304 { 3305 int error; 3306 3307 ASSERT_VOP_ELOCKED(vp, "prehandle"); 3308 if (vp->v_data == NULL) 3309 return (0); 3310 error = VOP_FSYNC(vp, MNT_WAIT, curthread); 3311 if (error != 0) 3312 return (error); 3313 ACQUIRE_LOCK(ump); 3314 process_removes(vp); 3315 process_truncates(vp); 3316 FREE_LOCK(ump); 3317 return (0); 3318 } 3319 3320 /* 3321 * Must be called from VOP_RENAME() after all vnodes are locked. 3322 * Ensures that there is enough journal space for rename. It is 3323 * sufficiently different from softdep_prelink() by having to handle 3324 * four vnodes. 3325 */ 3326 int 3327 softdep_prerename(fdvp, fvp, tdvp, tvp) 3328 struct vnode *fdvp; 3329 struct vnode *fvp; 3330 struct vnode *tdvp; 3331 struct vnode *tvp; 3332 { 3333 struct ufsmount *ump; 3334 int error; 3335 3336 ump = VFSTOUFS(fdvp->v_mount); 3337 3338 if (journal_space(ump, 0)) 3339 return (0); 3340 3341 VOP_UNLOCK(tdvp); 3342 VOP_UNLOCK(fvp); 3343 if (tvp != NULL && tvp != tdvp) 3344 VOP_UNLOCK(tvp); 3345 3346 error = softdep_prerename_vnode(ump, fdvp); 3347 VOP_UNLOCK(fdvp); 3348 if (error != 0) 3349 return (error); 3350 3351 VOP_LOCK(fvp, LK_EXCLUSIVE | LK_RETRY); 3352 error = softdep_prerename_vnode(ump, fvp); 3353 VOP_UNLOCK(fvp); 3354 if (error != 0) 3355 return (error); 3356 3357 if (tdvp != fdvp) { 3358 VOP_LOCK(tdvp, LK_EXCLUSIVE | LK_RETRY); 3359 error = softdep_prerename_vnode(ump, tdvp); 3360 VOP_UNLOCK(tdvp); 3361 if (error != 0) 3362 return (error); 3363 } 3364 3365 if (tvp != fvp && tvp != NULL) { 3366 VOP_LOCK(tvp, LK_EXCLUSIVE | LK_RETRY); 3367 error = softdep_prerename_vnode(ump, tvp); 3368 VOP_UNLOCK(tvp); 3369 if (error != 0) 3370 return (error); 3371 } 3372 3373 ACQUIRE_LOCK(ump); 3374 softdep_speedup(ump); 3375 process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT); 3376 journal_check_space(ump); 3377 FREE_LOCK(ump); 3378 return (ERELOOKUP); 3379 } 3380 3381 /* 3382 * Before adjusting a link count on a vnode verify that we have sufficient 3383 * journal space. If not, process operations that depend on the currently 3384 * locked pair of vnodes to try to flush space as the syncer, buf daemon, 3385 * and softdep flush threads can not acquire these locks to reclaim space. 3386 * 3387 * Returns 0 if all owned locks are still valid and were not dropped 3388 * in the process, in other case it returns either an error from sync, 3389 * or ERELOOKUP if any of the locks were re-acquired. 
In the latter 3390 * case, the state of the vnodes cannot be relied upon and our VFS 3391 * syscall must be restarted at top level from the lookup. 3392 */ 3393 int 3394 softdep_prelink(dvp, vp, cnp) 3395 struct vnode *dvp; 3396 struct vnode *vp; 3397 struct componentname *cnp; 3398 { 3399 struct ufsmount *ump; 3400 struct nameidata *ndp; 3401 3402 ASSERT_VOP_ELOCKED(dvp, "prelink dvp"); 3403 if (vp != NULL) 3404 ASSERT_VOP_ELOCKED(vp, "prelink vp"); 3405 ump = VFSTOUFS(dvp->v_mount); 3406 3407 /* 3408 * Nothing to do if we have sufficient journal space. We skip 3409 * flushing when vp is a snapshot to avoid deadlock where 3410 * another thread is trying to update the inodeblock for dvp 3411 * and is waiting on snaplk that vp holds. 3412 */ 3413 if (journal_space(ump, 0) || (vp != NULL && IS_SNAPSHOT(VTOI(vp)))) 3414 return (0); 3415 3416 /* 3417 * Check if the journal space consumption can in theory be 3418 * accounted on dvp and vp. If the vnodes' metadata was not 3419 * changed compared with the previous round-trip into 3420 * softdep_prelink(), as indicated by the seqc generation 3421 * recorded in the nameidata, then there is no point in 3422 * starting the sync. 3423 */ 3424 ndp = __containerof(cnp, struct nameidata, ni_cnd); 3425 if (!seqc_in_modify(ndp->ni_dvp_seqc) && 3426 vn_seqc_consistent(dvp, ndp->ni_dvp_seqc) && 3427 (vp == NULL || (!seqc_in_modify(ndp->ni_vp_seqc) && 3428 vn_seqc_consistent(vp, ndp->ni_vp_seqc)))) 3429 return (0); 3430 3431 stat_journal_low++; 3432 if (vp != NULL) { 3433 VOP_UNLOCK(dvp); 3434 ffs_syncvnode(vp, MNT_NOWAIT, 0); 3435 vn_lock_pair(dvp, false, vp, true); 3436 if (dvp->v_data == NULL) 3437 goto out; 3438 } 3439 if (vp != NULL) 3440 VOP_UNLOCK(vp); 3441 ffs_syncvnode(dvp, MNT_WAIT, 0); 3442 /* Process vp before dvp as it may create .. removes.
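 * (flushing vp can queue new removes, e.g. for a ".." entry, which are
 * then handled when dvp is processed just below).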
*/ 3443 if (vp != NULL) { 3444 VOP_UNLOCK(dvp); 3445 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 3446 if (vp->v_data == NULL) { 3447 vn_lock_pair(dvp, false, vp, true); 3448 goto out; 3449 } 3450 ACQUIRE_LOCK(ump); 3451 process_removes(vp); 3452 process_truncates(vp); 3453 FREE_LOCK(ump); 3454 VOP_UNLOCK(vp); 3455 vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); 3456 if (dvp->v_data == NULL) { 3457 vn_lock_pair(dvp, true, vp, false); 3458 goto out; 3459 } 3460 } 3461 3462 ACQUIRE_LOCK(ump); 3463 process_removes(dvp); 3464 process_truncates(dvp); 3465 VOP_UNLOCK(dvp); 3466 softdep_speedup(ump); 3467 3468 process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT); 3469 journal_check_space(ump); 3470 FREE_LOCK(ump); 3471 3472 vn_lock_pair(dvp, false, vp, false); 3473 out: 3474 ndp->ni_dvp_seqc = vn_seqc_read_any(dvp); 3475 if (vp != NULL) 3476 ndp->ni_vp_seqc = vn_seqc_read_any(vp); 3477 return (ERELOOKUP); 3478 } 3479 3480 static void 3481 jseg_write(ump, jseg, data) 3482 struct ufsmount *ump; 3483 struct jseg *jseg; 3484 uint8_t *data; 3485 { 3486 struct jsegrec *rec; 3487 3488 rec = (struct jsegrec *)data; 3489 rec->jsr_seq = jseg->js_seq; 3490 rec->jsr_oldest = jseg->js_oldseq; 3491 rec->jsr_cnt = jseg->js_cnt; 3492 rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize; 3493 rec->jsr_crc = 0; 3494 rec->jsr_time = ump->um_fs->fs_mtime; 3495 } 3496 3497 static inline void 3498 inoref_write(inoref, jseg, rec) 3499 struct inoref *inoref; 3500 struct jseg *jseg; 3501 struct jrefrec *rec; 3502 { 3503 3504 inoref->if_jsegdep->jd_seg = jseg; 3505 rec->jr_ino = inoref->if_ino; 3506 rec->jr_parent = inoref->if_parent; 3507 rec->jr_nlink = inoref->if_nlink; 3508 rec->jr_mode = inoref->if_mode; 3509 rec->jr_diroff = inoref->if_diroff; 3510 } 3511 3512 static void 3513 jaddref_write(jaddref, jseg, data) 3514 struct jaddref *jaddref; 3515 struct jseg *jseg; 3516 uint8_t *data; 3517 { 3518 struct jrefrec *rec; 3519 3520 rec = (struct jrefrec *)data; 3521 rec->jr_op = JOP_ADDREF; 3522 inoref_write(&jaddref->ja_ref, jseg, rec); 3523 } 3524 3525 static void 3526 jremref_write(jremref, jseg, data) 3527 struct jremref *jremref; 3528 struct jseg *jseg; 3529 uint8_t *data; 3530 { 3531 struct jrefrec *rec; 3532 3533 rec = (struct jrefrec *)data; 3534 rec->jr_op = JOP_REMREF; 3535 inoref_write(&jremref->jr_ref, jseg, rec); 3536 } 3537 3538 static void 3539 jmvref_write(jmvref, jseg, data) 3540 struct jmvref *jmvref; 3541 struct jseg *jseg; 3542 uint8_t *data; 3543 { 3544 struct jmvrec *rec; 3545 3546 rec = (struct jmvrec *)data; 3547 rec->jm_op = JOP_MVREF; 3548 rec->jm_ino = jmvref->jm_ino; 3549 rec->jm_parent = jmvref->jm_parent; 3550 rec->jm_oldoff = jmvref->jm_oldoff; 3551 rec->jm_newoff = jmvref->jm_newoff; 3552 } 3553 3554 static void 3555 jnewblk_write(jnewblk, jseg, data) 3556 struct jnewblk *jnewblk; 3557 struct jseg *jseg; 3558 uint8_t *data; 3559 { 3560 struct jblkrec *rec; 3561 3562 jnewblk->jn_jsegdep->jd_seg = jseg; 3563 rec = (struct jblkrec *)data; 3564 rec->jb_op = JOP_NEWBLK; 3565 rec->jb_ino = jnewblk->jn_ino; 3566 rec->jb_blkno = jnewblk->jn_blkno; 3567 rec->jb_lbn = jnewblk->jn_lbn; 3568 rec->jb_frags = jnewblk->jn_frags; 3569 rec->jb_oldfrags = jnewblk->jn_oldfrags; 3570 } 3571 3572 static void 3573 jfreeblk_write(jfreeblk, jseg, data) 3574 struct jfreeblk *jfreeblk; 3575 struct jseg *jseg; 3576 uint8_t *data; 3577 { 3578 struct jblkrec *rec; 3579 3580 jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg; 3581 rec = (struct jblkrec *)data; 3582 rec->jb_op = JOP_FREEBLK; 3583 rec->jb_ino = jfreeblk->jf_ino; 3584 
rec->jb_blkno = jfreeblk->jf_blkno; 3585 rec->jb_lbn = jfreeblk->jf_lbn; 3586 rec->jb_frags = jfreeblk->jf_frags; 3587 rec->jb_oldfrags = 0; 3588 } 3589 3590 static void 3591 jfreefrag_write(jfreefrag, jseg, data) 3592 struct jfreefrag *jfreefrag; 3593 struct jseg *jseg; 3594 uint8_t *data; 3595 { 3596 struct jblkrec *rec; 3597 3598 jfreefrag->fr_jsegdep->jd_seg = jseg; 3599 rec = (struct jblkrec *)data; 3600 rec->jb_op = JOP_FREEBLK; 3601 rec->jb_ino = jfreefrag->fr_ino; 3602 rec->jb_blkno = jfreefrag->fr_blkno; 3603 rec->jb_lbn = jfreefrag->fr_lbn; 3604 rec->jb_frags = jfreefrag->fr_frags; 3605 rec->jb_oldfrags = 0; 3606 } 3607 3608 static void 3609 jtrunc_write(jtrunc, jseg, data) 3610 struct jtrunc *jtrunc; 3611 struct jseg *jseg; 3612 uint8_t *data; 3613 { 3614 struct jtrncrec *rec; 3615 3616 jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg; 3617 rec = (struct jtrncrec *)data; 3618 rec->jt_op = JOP_TRUNC; 3619 rec->jt_ino = jtrunc->jt_ino; 3620 rec->jt_size = jtrunc->jt_size; 3621 rec->jt_extsize = jtrunc->jt_extsize; 3622 } 3623 3624 static void 3625 jfsync_write(jfsync, jseg, data) 3626 struct jfsync *jfsync; 3627 struct jseg *jseg; 3628 uint8_t *data; 3629 { 3630 struct jtrncrec *rec; 3631 3632 rec = (struct jtrncrec *)data; 3633 rec->jt_op = JOP_SYNC; 3634 rec->jt_ino = jfsync->jfs_ino; 3635 rec->jt_size = jfsync->jfs_size; 3636 rec->jt_extsize = jfsync->jfs_extsize; 3637 } 3638 3639 static void 3640 softdep_flushjournal(mp) 3641 struct mount *mp; 3642 { 3643 struct jblocks *jblocks; 3644 struct ufsmount *ump; 3645 3646 if (MOUNTEDSUJ(mp) == 0) 3647 return; 3648 ump = VFSTOUFS(mp); 3649 jblocks = ump->softdep_jblocks; 3650 ACQUIRE_LOCK(ump); 3651 while (ump->softdep_on_journal) { 3652 jblocks->jb_needseg = 1; 3653 softdep_process_journal(mp, NULL, MNT_WAIT); 3654 } 3655 FREE_LOCK(ump); 3656 } 3657 3658 static void softdep_synchronize_completed(struct bio *); 3659 static void softdep_synchronize(struct bio *, struct ufsmount *, void *); 3660 3661 static void 3662 softdep_synchronize_completed(bp) 3663 struct bio *bp; 3664 { 3665 struct jseg *oldest; 3666 struct jseg *jseg; 3667 struct ufsmount *ump; 3668 3669 /* 3670 * caller1 marks the last segment written before we issued the 3671 * synchronize cache. 3672 */ 3673 jseg = bp->bio_caller1; 3674 if (jseg == NULL) { 3675 g_destroy_bio(bp); 3676 return; 3677 } 3678 ump = VFSTOUFS(jseg->js_list.wk_mp); 3679 ACQUIRE_LOCK(ump); 3680 oldest = NULL; 3681 /* 3682 * Mark all the journal entries waiting on the synchronize cache 3683 * as completed so they may continue on. 3684 */ 3685 while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) { 3686 jseg->js_state |= COMPLETE; 3687 oldest = jseg; 3688 jseg = TAILQ_PREV(jseg, jseglst, js_next); 3689 } 3690 /* 3691 * Restart deferred journal entry processing from the oldest 3692 * completed jseg. 3693 */ 3694 if (oldest) 3695 complete_jsegs(oldest); 3696 3697 FREE_LOCK(ump); 3698 g_destroy_bio(bp); 3699 } 3700 3701 /* 3702 * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering 3703 * barriers. The journal must be written prior to any blocks that depend 3704 * on it and the journal can not be released until the blocks have be 3705 * written. This code handles both barriers simultaneously. 
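 * The request below is issued as BIO_FLUSH with BIO_ORDERED set and is
 * sent directly to the GEOM consumer for the device.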
3706 */ 3707 static void 3708 softdep_synchronize(bp, ump, caller1) 3709 struct bio *bp; 3710 struct ufsmount *ump; 3711 void *caller1; 3712 { 3713 3714 bp->bio_cmd = BIO_FLUSH; 3715 bp->bio_flags |= BIO_ORDERED; 3716 bp->bio_data = NULL; 3717 bp->bio_offset = ump->um_cp->provider->mediasize; 3718 bp->bio_length = 0; 3719 bp->bio_done = softdep_synchronize_completed; 3720 bp->bio_caller1 = caller1; 3721 g_io_request(bp, ump->um_cp); 3722 } 3723 3724 /* 3725 * Flush some journal records to disk. 3726 */ 3727 static void 3728 softdep_process_journal(mp, needwk, flags) 3729 struct mount *mp; 3730 struct worklist *needwk; 3731 int flags; 3732 { 3733 struct jblocks *jblocks; 3734 struct ufsmount *ump; 3735 struct worklist *wk; 3736 struct jseg *jseg; 3737 struct buf *bp; 3738 struct bio *bio; 3739 uint8_t *data; 3740 struct fs *fs; 3741 int shouldflush; 3742 int segwritten; 3743 int jrecmin; /* Minimum records per block. */ 3744 int jrecmax; /* Maximum records per block. */ 3745 int size; 3746 int cnt; 3747 int off; 3748 int devbsize; 3749 3750 ump = VFSTOUFS(mp); 3751 if (ump->um_softdep == NULL || ump->um_softdep->sd_jblocks == NULL) 3752 return; 3753 shouldflush = softdep_flushcache; 3754 bio = NULL; 3755 jseg = NULL; 3756 LOCK_OWNED(ump); 3757 fs = ump->um_fs; 3758 jblocks = ump->softdep_jblocks; 3759 devbsize = ump->um_devvp->v_bufobj.bo_bsize; 3760 /* 3761 * We write anywhere between a disk block and fs block. The upper 3762 * bound is picked to prevent buffer cache fragmentation and limit 3763 * processing time per I/O. 3764 */ 3765 jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */ 3766 jrecmax = (fs->fs_bsize / devbsize) * jrecmin; 3767 segwritten = 0; 3768 for (;;) { 3769 cnt = ump->softdep_on_journal; 3770 /* 3771 * Criteria for writing a segment: 3772 * 1) We have a full block. 3773 * 2) We're called from jwait() and haven't found the 3774 * journal item yet. 3775 * 3) Always write if needseg is set. 3776 * 4) If we are called from process_worklist and have 3777 * not yet written anything we write a partial block 3778 * to enforce a 1 second maximum latency on journal 3779 * entries. 3780 */ 3781 if (cnt < (jrecmax - 1) && needwk == NULL && 3782 jblocks->jb_needseg == 0 && (segwritten || cnt == 0)) 3783 break; 3784 cnt++; 3785 /* 3786 * Verify some free journal space. softdep_prealloc() should 3787 * guarantee that we don't run out so this is indicative of 3788 * a problem with the flow control. Try to recover 3789 * gracefully in any event. 3790 */ 3791 while (jblocks->jb_free == 0) { 3792 if (flags != MNT_WAIT) 3793 break; 3794 printf("softdep: Out of journal space!\n"); 3795 softdep_speedup(ump); 3796 msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz); 3797 } 3798 FREE_LOCK(ump); 3799 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS); 3800 workitem_alloc(&jseg->js_list, D_JSEG, mp); 3801 LIST_INIT(&jseg->js_entries); 3802 LIST_INIT(&jseg->js_indirs); 3803 jseg->js_state = ATTACHED; 3804 if (shouldflush == 0) 3805 jseg->js_state |= COMPLETE; 3806 else if (bio == NULL) 3807 bio = g_alloc_bio(); 3808 jseg->js_jblocks = jblocks; 3809 bp = geteblk(fs->fs_bsize, 0); 3810 ACQUIRE_LOCK(ump); 3811 /* 3812 * If there was a race while we were allocating the block 3813 * and jseg the entry we care about was likely written. 3814 * We bail out in both the WAIT and NOWAIT case and assume 3815 * the caller will loop if the entry it cares about is 3816 * not written. 
		 */
		cnt = ump->softdep_on_journal;
		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
			bp->b_flags |= B_INVAL | B_NOCACHE;
			WORKITEM_FREE(jseg, D_JSEG);
			FREE_LOCK(ump);
			brelse(bp);
			ACQUIRE_LOCK(ump);
			break;
		}
		/*
		 * Calculate the disk block size required for the available
		 * records rounded to the min size.
		 */
		if (cnt == 0)
			size = devbsize;
		else if (cnt < jrecmax)
			size = howmany(cnt, jrecmin) * devbsize;
		else
			size = fs->fs_bsize;
		/*
		 * Allocate a disk block for this journal data and account
		 * for truncation of the requested size if enough contiguous
		 * space was not available.
		 */
		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
		bp->b_lblkno = bp->b_blkno;
		bp->b_offset = bp->b_blkno * DEV_BSIZE;
		bp->b_bcount = size;
		bp->b_flags &= ~B_INVAL;
		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
		/*
		 * Initialize our jseg with cnt records.  Assign the next
		 * sequence number to it and link it in-order.
		 */
		cnt = MIN(cnt, (size / devbsize) * jrecmin);
		jseg->js_buf = bp;
		jseg->js_cnt = cnt;
		jseg->js_refs = cnt + 1;	/* Self ref. */
		jseg->js_size = size;
		jseg->js_seq = jblocks->jb_nextseq++;
		if (jblocks->jb_oldestseg == NULL)
			jblocks->jb_oldestseg = jseg;
		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
		if (jblocks->jb_writeseg == NULL)
			jblocks->jb_writeseg = jseg;
		/*
		 * Start filling in records from the pending list.
		 */
		data = bp->b_data;
		off = 0;

		/*
		 * Always put a header on the first block.
		 * XXX As with below, there might not be a chance to get
		 * into the loop.  Ensure that something valid is written.
		 */
		jseg_write(ump, jseg, data);
		off += JREC_SIZE;
		data = bp->b_data + off;

		/*
		 * XXX Something is wrong here.  There's no work to do,
		 * but we need to perform an I/O and allow it to complete
		 * anyway.
		 */
		if (LIST_EMPTY(&ump->softdep_journal_pending))
			stat_emptyjblocks++;

		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
		    != NULL) {
			if (cnt == 0)
				break;
			/* Place a segment header on every device block.
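			 * As a worked illustration (these sizes are only an
			 * example): with a 512-byte device block, the 32-byte
			 * journal record size and a 32K fs block, the limits
			 * computed before the loop come out to
			 *	jrecmin = (512 / 32) - 1     = 15 records
			 *	jrecmax = (32768 / 512) * 15 = 960 records
			 * so the buffer filled here carries one segment header
			 * plus up to 15 records in every 512-byte device block,
			 * and at most 960 records in a full fs-sized segment.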
*/ 3892 if ((off % devbsize) == 0) { 3893 jseg_write(ump, jseg, data); 3894 off += JREC_SIZE; 3895 data = bp->b_data + off; 3896 } 3897 if (wk == needwk) 3898 needwk = NULL; 3899 remove_from_journal(wk); 3900 wk->wk_state |= INPROGRESS; 3901 WORKLIST_INSERT(&jseg->js_entries, wk); 3902 switch (wk->wk_type) { 3903 case D_JADDREF: 3904 jaddref_write(WK_JADDREF(wk), jseg, data); 3905 break; 3906 case D_JREMREF: 3907 jremref_write(WK_JREMREF(wk), jseg, data); 3908 break; 3909 case D_JMVREF: 3910 jmvref_write(WK_JMVREF(wk), jseg, data); 3911 break; 3912 case D_JNEWBLK: 3913 jnewblk_write(WK_JNEWBLK(wk), jseg, data); 3914 break; 3915 case D_JFREEBLK: 3916 jfreeblk_write(WK_JFREEBLK(wk), jseg, data); 3917 break; 3918 case D_JFREEFRAG: 3919 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data); 3920 break; 3921 case D_JTRUNC: 3922 jtrunc_write(WK_JTRUNC(wk), jseg, data); 3923 break; 3924 case D_JFSYNC: 3925 jfsync_write(WK_JFSYNC(wk), jseg, data); 3926 break; 3927 default: 3928 panic("process_journal: Unknown type %s", 3929 TYPENAME(wk->wk_type)); 3930 /* NOTREACHED */ 3931 } 3932 off += JREC_SIZE; 3933 data = bp->b_data + off; 3934 cnt--; 3935 } 3936 3937 /* Clear any remaining space so we don't leak kernel data */ 3938 if (size > off) 3939 bzero(data, size - off); 3940 3941 /* 3942 * Write this one buffer and continue. 3943 */ 3944 segwritten = 1; 3945 jblocks->jb_needseg = 0; 3946 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); 3947 FREE_LOCK(ump); 3948 bp->b_xflags |= BX_CVTENXIO; 3949 pbgetvp(ump->um_devvp, bp); 3950 /* 3951 * We only do the blocking wait once we find the journal 3952 * entry we're looking for. 3953 */ 3954 if (needwk == NULL && flags == MNT_WAIT) 3955 bwrite(bp); 3956 else 3957 bawrite(bp); 3958 ACQUIRE_LOCK(ump); 3959 } 3960 /* 3961 * If we wrote a segment issue a synchronize cache so the journal 3962 * is reflected on disk before the data is written. Since reclaiming 3963 * journal space also requires writing a journal record this 3964 * process also enforces a barrier before reclamation. 3965 */ 3966 if (segwritten && shouldflush) { 3967 softdep_synchronize(bio, ump, 3968 TAILQ_LAST(&jblocks->jb_segs, jseglst)); 3969 } else if (bio) 3970 g_destroy_bio(bio); 3971 /* 3972 * If we've suspended the filesystem because we ran out of journal 3973 * space either try to sync it here to make some progress or 3974 * unsuspend it if we already have. 3975 */ 3976 if (flags == 0 && jblocks->jb_suspended) { 3977 if (journal_unsuspend(ump)) 3978 return; 3979 FREE_LOCK(ump); 3980 VFS_SYNC(mp, MNT_NOWAIT); 3981 ffs_sbupdate(ump, MNT_WAIT, 0); 3982 ACQUIRE_LOCK(ump); 3983 } 3984 } 3985 3986 /* 3987 * Complete a jseg, allowing all dependencies awaiting journal writes 3988 * to proceed. Each journal dependency also attaches a jsegdep to dependent 3989 * structures so that the journal segment can be freed to reclaim space. 3990 */ 3991 static void 3992 complete_jseg(jseg) 3993 struct jseg *jseg; 3994 { 3995 struct worklist *wk; 3996 struct jmvref *jmvref; 3997 #ifdef INVARIANTS 3998 int i = 0; 3999 #endif 4000 4001 while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { 4002 WORKLIST_REMOVE(wk); 4003 wk->wk_state &= ~INPROGRESS; 4004 wk->wk_state |= COMPLETE; 4005 KASSERT(i++ < jseg->js_cnt, 4006 ("handle_written_jseg: overflow %d >= %d", 4007 i - 1, jseg->js_cnt)); 4008 switch (wk->wk_type) { 4009 case D_JADDREF: 4010 handle_written_jaddref(WK_JADDREF(wk)); 4011 break; 4012 case D_JREMREF: 4013 handle_written_jremref(WK_JREMREF(wk)); 4014 break; 4015 case D_JMVREF: 4016 rele_jseg(jseg); /* No jsegdep. 
*/ 4017 jmvref = WK_JMVREF(wk); 4018 LIST_REMOVE(jmvref, jm_deps); 4019 if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0) 4020 free_pagedep(jmvref->jm_pagedep); 4021 WORKITEM_FREE(jmvref, D_JMVREF); 4022 break; 4023 case D_JNEWBLK: 4024 handle_written_jnewblk(WK_JNEWBLK(wk)); 4025 break; 4026 case D_JFREEBLK: 4027 handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep); 4028 break; 4029 case D_JTRUNC: 4030 handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep); 4031 break; 4032 case D_JFSYNC: 4033 rele_jseg(jseg); /* No jsegdep. */ 4034 WORKITEM_FREE(wk, D_JFSYNC); 4035 break; 4036 case D_JFREEFRAG: 4037 handle_written_jfreefrag(WK_JFREEFRAG(wk)); 4038 break; 4039 default: 4040 panic("handle_written_jseg: Unknown type %s", 4041 TYPENAME(wk->wk_type)); 4042 /* NOTREACHED */ 4043 } 4044 } 4045 /* Release the self reference so the structure may be freed. */ 4046 rele_jseg(jseg); 4047 } 4048 4049 /* 4050 * Determine which jsegs are ready for completion processing. Waits for 4051 * synchronize cache to complete as well as forcing in-order completion 4052 * of journal entries. 4053 */ 4054 static void 4055 complete_jsegs(jseg) 4056 struct jseg *jseg; 4057 { 4058 struct jblocks *jblocks; 4059 struct jseg *jsegn; 4060 4061 jblocks = jseg->js_jblocks; 4062 /* 4063 * Don't allow out of order completions. If this isn't the first 4064 * block wait for it to write before we're done. 4065 */ 4066 if (jseg != jblocks->jb_writeseg) 4067 return; 4068 /* Iterate through available jsegs processing their entries. */ 4069 while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) { 4070 jblocks->jb_oldestwrseq = jseg->js_oldseq; 4071 jsegn = TAILQ_NEXT(jseg, js_next); 4072 complete_jseg(jseg); 4073 jseg = jsegn; 4074 } 4075 jblocks->jb_writeseg = jseg; 4076 /* 4077 * Attempt to free jsegs now that oldestwrseq may have advanced. 4078 */ 4079 free_jsegs(jblocks); 4080 } 4081 4082 /* 4083 * Mark a jseg as DEPCOMPLETE and throw away the buffer. Attempt to handle 4084 * the final completions. 4085 */ 4086 static void 4087 handle_written_jseg(jseg, bp) 4088 struct jseg *jseg; 4089 struct buf *bp; 4090 { 4091 4092 if (jseg->js_refs == 0) 4093 panic("handle_written_jseg: No self-reference on %p", jseg); 4094 jseg->js_state |= DEPCOMPLETE; 4095 /* 4096 * We'll never need this buffer again, set flags so it will be 4097 * discarded. 4098 */ 4099 bp->b_flags |= B_INVAL | B_NOCACHE; 4100 pbrelvp(bp); 4101 complete_jsegs(jseg); 4102 } 4103 4104 static inline struct jsegdep * 4105 inoref_jseg(inoref) 4106 struct inoref *inoref; 4107 { 4108 struct jsegdep *jsegdep; 4109 4110 jsegdep = inoref->if_jsegdep; 4111 inoref->if_jsegdep = NULL; 4112 4113 return (jsegdep); 4114 } 4115 4116 /* 4117 * Called once a jremref has made it to stable store. The jremref is marked 4118 * complete and we attempt to free it. Any pagedeps writes sleeping waiting 4119 * for the jremref to complete will be awoken by free_jremref. 4120 */ 4121 static void 4122 handle_written_jremref(jremref) 4123 struct jremref *jremref; 4124 { 4125 struct inodedep *inodedep; 4126 struct jsegdep *jsegdep; 4127 struct dirrem *dirrem; 4128 4129 /* Grab the jsegdep. */ 4130 jsegdep = inoref_jseg(&jremref->jr_ref); 4131 /* 4132 * Remove us from the inoref list. 4133 */ 4134 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 4135 0, &inodedep) == 0) 4136 panic("handle_written_jremref: Lost inodedep"); 4137 TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 4138 /* 4139 * Complete the dirrem. 
4140 */ 4141 dirrem = jremref->jr_dirrem; 4142 jremref->jr_dirrem = NULL; 4143 LIST_REMOVE(jremref, jr_deps); 4144 jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; 4145 jwork_insert(&dirrem->dm_jwork, jsegdep); 4146 if (LIST_EMPTY(&dirrem->dm_jremrefhd) && 4147 (dirrem->dm_state & COMPLETE) != 0) 4148 add_to_worklist(&dirrem->dm_list, 0); 4149 free_jremref(jremref); 4150 } 4151 4152 /* 4153 * Called once a jaddref has made it to stable store. The dependency is 4154 * marked complete and any dependent structures are added to the inode 4155 * bufwait list to be completed as soon as it is written. If a bitmap write 4156 * depends on this entry we move the inode into the inodedephd of the 4157 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. 4158 */ 4159 static void 4160 handle_written_jaddref(jaddref) 4161 struct jaddref *jaddref; 4162 { 4163 struct jsegdep *jsegdep; 4164 struct inodedep *inodedep; 4165 struct diradd *diradd; 4166 struct mkdir *mkdir; 4167 4168 /* Grab the jsegdep. */ 4169 jsegdep = inoref_jseg(&jaddref->ja_ref); 4170 mkdir = NULL; 4171 diradd = NULL; 4172 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 4173 0, &inodedep) == 0) 4174 panic("handle_written_jaddref: Lost inodedep."); 4175 if (jaddref->ja_diradd == NULL) 4176 panic("handle_written_jaddref: No dependency"); 4177 if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { 4178 diradd = jaddref->ja_diradd; 4179 WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); 4180 } else if (jaddref->ja_state & MKDIR_PARENT) { 4181 mkdir = jaddref->ja_mkdir; 4182 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); 4183 } else if (jaddref->ja_state & MKDIR_BODY) 4184 mkdir = jaddref->ja_mkdir; 4185 else 4186 panic("handle_written_jaddref: Unknown dependency %p", 4187 jaddref->ja_diradd); 4188 jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ 4189 /* 4190 * Remove us from the inode list. 4191 */ 4192 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); 4193 /* 4194 * The mkdir may be waiting on the jaddref to clear before freeing. 4195 */ 4196 if (mkdir) { 4197 KASSERT(mkdir->md_list.wk_type == D_MKDIR, 4198 ("handle_written_jaddref: Incorrect type for mkdir %s", 4199 TYPENAME(mkdir->md_list.wk_type))); 4200 mkdir->md_jaddref = NULL; 4201 diradd = mkdir->md_diradd; 4202 mkdir->md_state |= DEPCOMPLETE; 4203 complete_mkdir(mkdir); 4204 } 4205 jwork_insert(&diradd->da_jwork, jsegdep); 4206 if (jaddref->ja_state & NEWBLOCK) { 4207 inodedep->id_state |= ONDEPLIST; 4208 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, 4209 inodedep, id_deps); 4210 } 4211 free_jaddref(jaddref); 4212 } 4213 4214 /* 4215 * Called once a jnewblk journal is written. The allocdirect or allocindir 4216 * is placed in the bmsafemap to await notification of a written bitmap. If 4217 * the operation was canceled we add the segdep to the appropriate 4218 * dependency to free the journal space once the canceling operation 4219 * completes. 4220 */ 4221 static void 4222 handle_written_jnewblk(jnewblk) 4223 struct jnewblk *jnewblk; 4224 { 4225 struct bmsafemap *bmsafemap; 4226 struct freefrag *freefrag; 4227 struct freework *freework; 4228 struct jsegdep *jsegdep; 4229 struct newblk *newblk; 4230 4231 /* Grab the jsegdep. 
*/ 4232 jsegdep = jnewblk->jn_jsegdep; 4233 jnewblk->jn_jsegdep = NULL; 4234 if (jnewblk->jn_dep == NULL) 4235 panic("handle_written_jnewblk: No dependency for the segdep."); 4236 switch (jnewblk->jn_dep->wk_type) { 4237 case D_NEWBLK: 4238 case D_ALLOCDIRECT: 4239 case D_ALLOCINDIR: 4240 /* 4241 * Add the written block to the bmsafemap so it can 4242 * be notified when the bitmap is on disk. 4243 */ 4244 newblk = WK_NEWBLK(jnewblk->jn_dep); 4245 newblk->nb_jnewblk = NULL; 4246 if ((newblk->nb_state & GOINGAWAY) == 0) { 4247 bmsafemap = newblk->nb_bmsafemap; 4248 newblk->nb_state |= ONDEPLIST; 4249 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, 4250 nb_deps); 4251 } 4252 jwork_insert(&newblk->nb_jwork, jsegdep); 4253 break; 4254 case D_FREEFRAG: 4255 /* 4256 * A newblock being removed by a freefrag when replaced by 4257 * frag extension. 4258 */ 4259 freefrag = WK_FREEFRAG(jnewblk->jn_dep); 4260 freefrag->ff_jdep = NULL; 4261 jwork_insert(&freefrag->ff_jwork, jsegdep); 4262 break; 4263 case D_FREEWORK: 4264 /* 4265 * A direct block was removed by truncate. 4266 */ 4267 freework = WK_FREEWORK(jnewblk->jn_dep); 4268 freework->fw_jnewblk = NULL; 4269 jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep); 4270 break; 4271 default: 4272 panic("handle_written_jnewblk: Unknown type %d.", 4273 jnewblk->jn_dep->wk_type); 4274 } 4275 jnewblk->jn_dep = NULL; 4276 free_jnewblk(jnewblk); 4277 } 4278 4279 /* 4280 * Cancel a jfreefrag that won't be needed, probably due to colliding with 4281 * an in-flight allocation that has not yet been committed. Divorce us 4282 * from the freefrag and mark it DEPCOMPLETE so that it may be added 4283 * to the worklist. 4284 */ 4285 static void 4286 cancel_jfreefrag(jfreefrag) 4287 struct jfreefrag *jfreefrag; 4288 { 4289 struct freefrag *freefrag; 4290 4291 if (jfreefrag->fr_jsegdep) { 4292 free_jsegdep(jfreefrag->fr_jsegdep); 4293 jfreefrag->fr_jsegdep = NULL; 4294 } 4295 freefrag = jfreefrag->fr_freefrag; 4296 jfreefrag->fr_freefrag = NULL; 4297 free_jfreefrag(jfreefrag); 4298 freefrag->ff_state |= DEPCOMPLETE; 4299 CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno); 4300 } 4301 4302 /* 4303 * Free a jfreefrag when the parent freefrag is rendered obsolete. 4304 */ 4305 static void 4306 free_jfreefrag(jfreefrag) 4307 struct jfreefrag *jfreefrag; 4308 { 4309 4310 if (jfreefrag->fr_state & INPROGRESS) 4311 WORKLIST_REMOVE(&jfreefrag->fr_list); 4312 else if (jfreefrag->fr_state & ONWORKLIST) 4313 remove_from_journal(&jfreefrag->fr_list); 4314 if (jfreefrag->fr_freefrag != NULL) 4315 panic("free_jfreefrag: Still attached to a freefrag."); 4316 WORKITEM_FREE(jfreefrag, D_JFREEFRAG); 4317 } 4318 4319 /* 4320 * Called when the journal write for a jfreefrag completes. The parent 4321 * freefrag is added to the worklist if this completes its dependencies. 4322 */ 4323 static void 4324 handle_written_jfreefrag(jfreefrag) 4325 struct jfreefrag *jfreefrag; 4326 { 4327 struct jsegdep *jsegdep; 4328 struct freefrag *freefrag; 4329 4330 /* Grab the jsegdep. 
*/ 4331 jsegdep = jfreefrag->fr_jsegdep; 4332 jfreefrag->fr_jsegdep = NULL; 4333 freefrag = jfreefrag->fr_freefrag; 4334 if (freefrag == NULL) 4335 panic("handle_written_jfreefrag: No freefrag."); 4336 freefrag->ff_state |= DEPCOMPLETE; 4337 freefrag->ff_jdep = NULL; 4338 jwork_insert(&freefrag->ff_jwork, jsegdep); 4339 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 4340 add_to_worklist(&freefrag->ff_list, 0); 4341 jfreefrag->fr_freefrag = NULL; 4342 free_jfreefrag(jfreefrag); 4343 } 4344 4345 /* 4346 * Called when the journal write for a jfreeblk completes. The jfreeblk 4347 * is removed from the freeblks list of pending journal writes and the 4348 * jsegdep is moved to the freeblks jwork to be completed when all blocks 4349 * have been reclaimed. 4350 */ 4351 static void 4352 handle_written_jblkdep(jblkdep) 4353 struct jblkdep *jblkdep; 4354 { 4355 struct freeblks *freeblks; 4356 struct jsegdep *jsegdep; 4357 4358 /* Grab the jsegdep. */ 4359 jsegdep = jblkdep->jb_jsegdep; 4360 jblkdep->jb_jsegdep = NULL; 4361 freeblks = jblkdep->jb_freeblks; 4362 LIST_REMOVE(jblkdep, jb_deps); 4363 jwork_insert(&freeblks->fb_jwork, jsegdep); 4364 /* 4365 * If the freeblks is all journaled, we can add it to the worklist. 4366 */ 4367 if (LIST_EMPTY(&freeblks->fb_jblkdephd) && 4368 (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 4369 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 4370 4371 free_jblkdep(jblkdep); 4372 } 4373 4374 static struct jsegdep * 4375 newjsegdep(struct worklist *wk) 4376 { 4377 struct jsegdep *jsegdep; 4378 4379 jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS); 4380 workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp); 4381 jsegdep->jd_seg = NULL; 4382 4383 return (jsegdep); 4384 } 4385 4386 static struct jmvref * 4387 newjmvref(dp, ino, oldoff, newoff) 4388 struct inode *dp; 4389 ino_t ino; 4390 off_t oldoff; 4391 off_t newoff; 4392 { 4393 struct jmvref *jmvref; 4394 4395 jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS); 4396 workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp)); 4397 jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE; 4398 jmvref->jm_parent = dp->i_number; 4399 jmvref->jm_ino = ino; 4400 jmvref->jm_oldoff = oldoff; 4401 jmvref->jm_newoff = newoff; 4402 4403 return (jmvref); 4404 } 4405 4406 /* 4407 * Allocate a new jremref that tracks the removal of ip from dp with the 4408 * directory entry offset of diroff. Mark the entry as ATTACHED and 4409 * DEPCOMPLETE as we have all the information required for the journal write 4410 * and the directory has already been removed from the buffer. The caller 4411 * is responsible for linking the jremref into the pagedep and adding it 4412 * to the journal to write. The MKDIR_PARENT flag is set if we're doing 4413 * a DOTDOT addition so handle_workitem_remove() can properly assign 4414 * the jsegdep when we're done. 
4415 */ 4416 static struct jremref * 4417 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip, 4418 off_t diroff, nlink_t nlink) 4419 { 4420 struct jremref *jremref; 4421 4422 jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS); 4423 workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp)); 4424 jremref->jr_state = ATTACHED; 4425 newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff, 4426 nlink, ip->i_mode); 4427 jremref->jr_dirrem = dirrem; 4428 4429 return (jremref); 4430 } 4431 4432 static inline void 4433 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff, 4434 nlink_t nlink, uint16_t mode) 4435 { 4436 4437 inoref->if_jsegdep = newjsegdep(&inoref->if_list); 4438 inoref->if_diroff = diroff; 4439 inoref->if_ino = ino; 4440 inoref->if_parent = parent; 4441 inoref->if_nlink = nlink; 4442 inoref->if_mode = mode; 4443 } 4444 4445 /* 4446 * Allocate a new jaddref to track the addition of ino to dp at diroff. The 4447 * directory offset may not be known until later. The caller is responsible 4448 * adding the entry to the journal when this information is available. nlink 4449 * should be the link count prior to the addition and mode is only required 4450 * to have the correct FMT. 4451 */ 4452 static struct jaddref * 4453 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink, 4454 uint16_t mode) 4455 { 4456 struct jaddref *jaddref; 4457 4458 jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS); 4459 workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp)); 4460 jaddref->ja_state = ATTACHED; 4461 jaddref->ja_mkdir = NULL; 4462 newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode); 4463 4464 return (jaddref); 4465 } 4466 4467 /* 4468 * Create a new free dependency for a freework. The caller is responsible 4469 * for adjusting the reference count when it has the lock held. The freedep 4470 * will track an outstanding bitmap write that will ultimately clear the 4471 * freework to continue. 4472 */ 4473 static struct freedep * 4474 newfreedep(struct freework *freework) 4475 { 4476 struct freedep *freedep; 4477 4478 freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); 4479 workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); 4480 freedep->fd_freework = freework; 4481 4482 return (freedep); 4483 } 4484 4485 /* 4486 * Free a freedep structure once the buffer it is linked to is written. If 4487 * this is the last reference to the freework schedule it for completion. 4488 */ 4489 static void 4490 free_freedep(freedep) 4491 struct freedep *freedep; 4492 { 4493 struct freework *freework; 4494 4495 freework = freedep->fd_freework; 4496 freework->fw_freeblks->fb_cgwait--; 4497 if (--freework->fw_ref == 0) 4498 freework_enqueue(freework); 4499 WORKITEM_FREE(freedep, D_FREEDEP); 4500 } 4501 4502 /* 4503 * Allocate a new freework structure that may be a level in an indirect 4504 * when parent is not NULL or a top level block when it is. The top level 4505 * freework structures are allocated without the per-filesystem lock held 4506 * and before the freeblks is visible outside of softdep_setup_freeblocks(). 
4507 */ 4508 static struct freework * 4509 newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal) 4510 struct ufsmount *ump; 4511 struct freeblks *freeblks; 4512 struct freework *parent; 4513 ufs_lbn_t lbn; 4514 ufs2_daddr_t nb; 4515 int frags; 4516 int off; 4517 int journal; 4518 { 4519 struct freework *freework; 4520 4521 freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); 4522 workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); 4523 freework->fw_state = ATTACHED; 4524 freework->fw_jnewblk = NULL; 4525 freework->fw_freeblks = freeblks; 4526 freework->fw_parent = parent; 4527 freework->fw_lbn = lbn; 4528 freework->fw_blkno = nb; 4529 freework->fw_frags = frags; 4530 freework->fw_indir = NULL; 4531 freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || 4532 lbn >= -UFS_NXADDR) ? 0 : NINDIR(ump->um_fs) + 1; 4533 freework->fw_start = freework->fw_off = off; 4534 if (journal) 4535 newjfreeblk(freeblks, lbn, nb, frags); 4536 if (parent == NULL) { 4537 ACQUIRE_LOCK(ump); 4538 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); 4539 freeblks->fb_ref++; 4540 FREE_LOCK(ump); 4541 } 4542 4543 return (freework); 4544 } 4545 4546 /* 4547 * Eliminate a jfreeblk for a block that does not need journaling. 4548 */ 4549 static void 4550 cancel_jfreeblk(freeblks, blkno) 4551 struct freeblks *freeblks; 4552 ufs2_daddr_t blkno; 4553 { 4554 struct jfreeblk *jfreeblk; 4555 struct jblkdep *jblkdep; 4556 4557 LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) { 4558 if (jblkdep->jb_list.wk_type != D_JFREEBLK) 4559 continue; 4560 jfreeblk = WK_JFREEBLK(&jblkdep->jb_list); 4561 if (jfreeblk->jf_blkno == blkno) 4562 break; 4563 } 4564 if (jblkdep == NULL) 4565 return; 4566 CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno); 4567 free_jsegdep(jblkdep->jb_jsegdep); 4568 LIST_REMOVE(jblkdep, jb_deps); 4569 WORKITEM_FREE(jfreeblk, D_JFREEBLK); 4570 } 4571 4572 /* 4573 * Allocate a new jfreeblk to journal top level block pointer when truncating 4574 * a file. The caller must add this to the worklist when the per-filesystem 4575 * lock is held. 4576 */ 4577 static struct jfreeblk * 4578 newjfreeblk(freeblks, lbn, blkno, frags) 4579 struct freeblks *freeblks; 4580 ufs_lbn_t lbn; 4581 ufs2_daddr_t blkno; 4582 int frags; 4583 { 4584 struct jfreeblk *jfreeblk; 4585 4586 jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); 4587 workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK, 4588 freeblks->fb_list.wk_mp); 4589 jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list); 4590 jfreeblk->jf_dep.jb_freeblks = freeblks; 4591 jfreeblk->jf_ino = freeblks->fb_inum; 4592 jfreeblk->jf_lbn = lbn; 4593 jfreeblk->jf_blkno = blkno; 4594 jfreeblk->jf_frags = frags; 4595 LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps); 4596 4597 return (jfreeblk); 4598 } 4599 4600 /* 4601 * The journal is only prepared to handle full-size block numbers, so we 4602 * have to adjust the record to reflect the change to a full-size block. 4603 * For example, suppose we have a block made up of fragments 8-15 and 4604 * want to free its last two fragments. We are given a request that says: 4605 * FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0 4606 * where frags are the number of fragments to free and oldfrags are the 4607 * number of fragments to keep. 
To block align it, we have to change it to 4608 * have a valid full-size blkno, so it becomes: 4609 * FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6 4610 */ 4611 static void 4612 adjust_newfreework(freeblks, frag_offset) 4613 struct freeblks *freeblks; 4614 int frag_offset; 4615 { 4616 struct jfreeblk *jfreeblk; 4617 4618 KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL && 4619 LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK), 4620 ("adjust_newfreework: Missing freeblks dependency")); 4621 4622 jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd)); 4623 jfreeblk->jf_blkno -= frag_offset; 4624 jfreeblk->jf_frags += frag_offset; 4625 } 4626 4627 /* 4628 * Allocate a new jtrunc to track a partial truncation. 4629 */ 4630 static struct jtrunc * 4631 newjtrunc(freeblks, size, extsize) 4632 struct freeblks *freeblks; 4633 off_t size; 4634 int extsize; 4635 { 4636 struct jtrunc *jtrunc; 4637 4638 jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS); 4639 workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC, 4640 freeblks->fb_list.wk_mp); 4641 jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list); 4642 jtrunc->jt_dep.jb_freeblks = freeblks; 4643 jtrunc->jt_ino = freeblks->fb_inum; 4644 jtrunc->jt_size = size; 4645 jtrunc->jt_extsize = extsize; 4646 LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps); 4647 4648 return (jtrunc); 4649 } 4650 4651 /* 4652 * If we're canceling a new bitmap we have to search for another ref 4653 * to move into the bmsafemap dep. This might be better expressed 4654 * with another structure. 4655 */ 4656 static void 4657 move_newblock_dep(jaddref, inodedep) 4658 struct jaddref *jaddref; 4659 struct inodedep *inodedep; 4660 { 4661 struct inoref *inoref; 4662 struct jaddref *jaddrefn; 4663 4664 jaddrefn = NULL; 4665 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; 4666 inoref = TAILQ_NEXT(inoref, if_deps)) { 4667 if ((jaddref->ja_state & NEWBLOCK) && 4668 inoref->if_list.wk_type == D_JADDREF) { 4669 jaddrefn = (struct jaddref *)inoref; 4670 break; 4671 } 4672 } 4673 if (jaddrefn == NULL) 4674 return; 4675 jaddrefn->ja_state &= ~(ATTACHED | UNDONE); 4676 jaddrefn->ja_state |= jaddref->ja_state & 4677 (ATTACHED | UNDONE | NEWBLOCK); 4678 jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); 4679 jaddref->ja_state |= ATTACHED; 4680 LIST_REMOVE(jaddref, ja_bmdeps); 4681 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, 4682 ja_bmdeps); 4683 } 4684 4685 /* 4686 * Cancel a jaddref either before it has been written or while it is being 4687 * written. This happens when a link is removed before the add reaches 4688 * the disk. The jaddref dependency is kept linked into the bmsafemap 4689 * and inode to prevent the link count or bitmap from reaching the disk 4690 * until handle_workitem_remove() re-adjusts the counts and bitmaps as 4691 * required. 4692 * 4693 * Returns 1 if the canceled addref requires journaling of the remove and 4694 * 0 otherwise. 
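 *
 * A rough sketch of how a caller consumes the return value (the caller
 * shown is hypothetical):
 *
 *	needsj = cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork);
 *	if (needsj)
 *		... the addref was already written, or was in flight, so the
 *		    matching remove must be journaled as well ...
 *	else
 *		... the addref never reached the journal, so the removed link
 *		    needs no journal record of its own ...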
4695 */ 4696 static int 4697 cancel_jaddref(jaddref, inodedep, wkhd) 4698 struct jaddref *jaddref; 4699 struct inodedep *inodedep; 4700 struct workhead *wkhd; 4701 { 4702 struct inoref *inoref; 4703 struct jsegdep *jsegdep; 4704 int needsj; 4705 4706 KASSERT((jaddref->ja_state & COMPLETE) == 0, 4707 ("cancel_jaddref: Canceling complete jaddref")); 4708 if (jaddref->ja_state & (INPROGRESS | COMPLETE)) 4709 needsj = 1; 4710 else 4711 needsj = 0; 4712 if (inodedep == NULL) 4713 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 4714 0, &inodedep) == 0) 4715 panic("cancel_jaddref: Lost inodedep"); 4716 /* 4717 * We must adjust the nlink of any reference operation that follows 4718 * us so that it is consistent with the in-memory reference. This 4719 * ensures that inode nlink rollbacks always have the correct link. 4720 */ 4721 if (needsj == 0) { 4722 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; 4723 inoref = TAILQ_NEXT(inoref, if_deps)) { 4724 if (inoref->if_state & GOINGAWAY) 4725 break; 4726 inoref->if_nlink--; 4727 } 4728 } 4729 jsegdep = inoref_jseg(&jaddref->ja_ref); 4730 if (jaddref->ja_state & NEWBLOCK) 4731 move_newblock_dep(jaddref, inodedep); 4732 wake_worklist(&jaddref->ja_list); 4733 jaddref->ja_mkdir = NULL; 4734 if (jaddref->ja_state & INPROGRESS) { 4735 jaddref->ja_state &= ~INPROGRESS; 4736 WORKLIST_REMOVE(&jaddref->ja_list); 4737 jwork_insert(wkhd, jsegdep); 4738 } else { 4739 free_jsegdep(jsegdep); 4740 if (jaddref->ja_state & DEPCOMPLETE) 4741 remove_from_journal(&jaddref->ja_list); 4742 } 4743 jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE); 4744 /* 4745 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove 4746 * can arrange for them to be freed with the bitmap. Otherwise we 4747 * no longer need this addref attached to the inoreflst and it 4748 * will incorrectly adjust nlink if we leave it. 4749 */ 4750 if ((jaddref->ja_state & NEWBLOCK) == 0) { 4751 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, 4752 if_deps); 4753 jaddref->ja_state |= COMPLETE; 4754 free_jaddref(jaddref); 4755 return (needsj); 4756 } 4757 /* 4758 * Leave the head of the list for jsegdeps for fast merging. 4759 */ 4760 if (LIST_FIRST(wkhd) != NULL) { 4761 jaddref->ja_state |= ONWORKLIST; 4762 LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list); 4763 } else 4764 WORKLIST_INSERT(wkhd, &jaddref->ja_list); 4765 4766 return (needsj); 4767 } 4768 4769 /* 4770 * Attempt to free a jaddref structure when some work completes. This 4771 * should only succeed once the entry is written and all dependencies have 4772 * been notified. 4773 */ 4774 static void 4775 free_jaddref(jaddref) 4776 struct jaddref *jaddref; 4777 { 4778 4779 if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE) 4780 return; 4781 if (jaddref->ja_ref.if_jsegdep) 4782 panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n", 4783 jaddref, jaddref->ja_state); 4784 if (jaddref->ja_state & NEWBLOCK) 4785 LIST_REMOVE(jaddref, ja_bmdeps); 4786 if (jaddref->ja_state & (INPROGRESS | ONWORKLIST)) 4787 panic("free_jaddref: Bad state %p(0x%X)", 4788 jaddref, jaddref->ja_state); 4789 if (jaddref->ja_mkdir != NULL) 4790 panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state); 4791 WORKITEM_FREE(jaddref, D_JADDREF); 4792 } 4793 4794 /* 4795 * Free a jremref structure once it has been written or discarded. 
4796 */ 4797 static void 4798 free_jremref(jremref) 4799 struct jremref *jremref; 4800 { 4801 4802 if (jremref->jr_ref.if_jsegdep) 4803 free_jsegdep(jremref->jr_ref.if_jsegdep); 4804 if (jremref->jr_state & INPROGRESS) 4805 panic("free_jremref: IO still pending"); 4806 WORKITEM_FREE(jremref, D_JREMREF); 4807 } 4808 4809 /* 4810 * Free a jnewblk structure. 4811 */ 4812 static void 4813 free_jnewblk(jnewblk) 4814 struct jnewblk *jnewblk; 4815 { 4816 4817 if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE) 4818 return; 4819 LIST_REMOVE(jnewblk, jn_deps); 4820 if (jnewblk->jn_dep != NULL) 4821 panic("free_jnewblk: Dependency still attached."); 4822 WORKITEM_FREE(jnewblk, D_JNEWBLK); 4823 } 4824 4825 /* 4826 * Cancel a jnewblk which has been been made redundant by frag extension. 4827 */ 4828 static void 4829 cancel_jnewblk(jnewblk, wkhd) 4830 struct jnewblk *jnewblk; 4831 struct workhead *wkhd; 4832 { 4833 struct jsegdep *jsegdep; 4834 4835 CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno); 4836 jsegdep = jnewblk->jn_jsegdep; 4837 if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL) 4838 panic("cancel_jnewblk: Invalid state"); 4839 jnewblk->jn_jsegdep = NULL; 4840 jnewblk->jn_dep = NULL; 4841 jnewblk->jn_state |= GOINGAWAY; 4842 if (jnewblk->jn_state & INPROGRESS) { 4843 jnewblk->jn_state &= ~INPROGRESS; 4844 WORKLIST_REMOVE(&jnewblk->jn_list); 4845 jwork_insert(wkhd, jsegdep); 4846 } else { 4847 free_jsegdep(jsegdep); 4848 remove_from_journal(&jnewblk->jn_list); 4849 } 4850 wake_worklist(&jnewblk->jn_list); 4851 WORKLIST_INSERT(wkhd, &jnewblk->jn_list); 4852 } 4853 4854 static void 4855 free_jblkdep(jblkdep) 4856 struct jblkdep *jblkdep; 4857 { 4858 4859 if (jblkdep->jb_list.wk_type == D_JFREEBLK) 4860 WORKITEM_FREE(jblkdep, D_JFREEBLK); 4861 else if (jblkdep->jb_list.wk_type == D_JTRUNC) 4862 WORKITEM_FREE(jblkdep, D_JTRUNC); 4863 else 4864 panic("free_jblkdep: Unexpected type %s", 4865 TYPENAME(jblkdep->jb_list.wk_type)); 4866 } 4867 4868 /* 4869 * Free a single jseg once it is no longer referenced in memory or on 4870 * disk. Reclaim journal blocks and dependencies waiting for the segment 4871 * to disappear. 4872 */ 4873 static void 4874 free_jseg(jseg, jblocks) 4875 struct jseg *jseg; 4876 struct jblocks *jblocks; 4877 { 4878 struct freework *freework; 4879 4880 /* 4881 * Free freework structures that were lingering to indicate freed 4882 * indirect blocks that forced journal write ordering on reallocate. 4883 */ 4884 while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL) 4885 indirblk_remove(freework); 4886 if (jblocks->jb_oldestseg == jseg) 4887 jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next); 4888 TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); 4889 jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size); 4890 KASSERT(LIST_EMPTY(&jseg->js_entries), 4891 ("free_jseg: Freed jseg has valid entries.")); 4892 WORKITEM_FREE(jseg, D_JSEG); 4893 } 4894 4895 /* 4896 * Free all jsegs that meet the criteria for being reclaimed and update 4897 * oldestseg. 4898 */ 4899 static void 4900 free_jsegs(jblocks) 4901 struct jblocks *jblocks; 4902 { 4903 struct jseg *jseg; 4904 4905 /* 4906 * Free only those jsegs which have none allocated before them to 4907 * preserve the journal space ordering. 4908 */ 4909 while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { 4910 /* 4911 * Only reclaim space when nothing depends on this journal 4912 * set and another set has written that it is no longer 4913 * valid. 
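		 * Concretely (matching the checks below), the jseg at the
		 * head of the list is reclaimed only when
		 *	- js_refs has dropped to zero,
		 *	- it is ALLCOMPLETE, and
		 *	- a later segment has been written whose recorded
		 *	  oldest sequence (jb_oldestwrseq) has advanced past
		 *	  this one; a segment that carried no records may go
		 *	  as soon as jb_oldestwrseq reaches its own sequence.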
4914 */ 4915 if (jseg->js_refs != 0) { 4916 jblocks->jb_oldestseg = jseg; 4917 return; 4918 } 4919 if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE) 4920 break; 4921 if (jseg->js_seq > jblocks->jb_oldestwrseq) 4922 break; 4923 /* 4924 * We can free jsegs that didn't write entries when 4925 * oldestwrseq == js_seq. 4926 */ 4927 if (jseg->js_seq == jblocks->jb_oldestwrseq && 4928 jseg->js_cnt != 0) 4929 break; 4930 free_jseg(jseg, jblocks); 4931 } 4932 /* 4933 * If we exited the loop above we still must discover the 4934 * oldest valid segment. 4935 */ 4936 if (jseg) 4937 for (jseg = jblocks->jb_oldestseg; jseg != NULL; 4938 jseg = TAILQ_NEXT(jseg, js_next)) 4939 if (jseg->js_refs != 0) 4940 break; 4941 jblocks->jb_oldestseg = jseg; 4942 /* 4943 * The journal has no valid records but some jsegs may still be 4944 * waiting on oldestwrseq to advance. We force a small record 4945 * out to permit these lingering records to be reclaimed. 4946 */ 4947 if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs)) 4948 jblocks->jb_needseg = 1; 4949 } 4950 4951 /* 4952 * Release one reference to a jseg and free it if the count reaches 0. This 4953 * should eventually reclaim journal space as well. 4954 */ 4955 static void 4956 rele_jseg(jseg) 4957 struct jseg *jseg; 4958 { 4959 4960 KASSERT(jseg->js_refs > 0, 4961 ("free_jseg: Invalid refcnt %d", jseg->js_refs)); 4962 if (--jseg->js_refs != 0) 4963 return; 4964 free_jsegs(jseg->js_jblocks); 4965 } 4966 4967 /* 4968 * Release a jsegdep and decrement the jseg count. 4969 */ 4970 static void 4971 free_jsegdep(jsegdep) 4972 struct jsegdep *jsegdep; 4973 { 4974 4975 if (jsegdep->jd_seg) 4976 rele_jseg(jsegdep->jd_seg); 4977 WORKITEM_FREE(jsegdep, D_JSEGDEP); 4978 } 4979 4980 /* 4981 * Wait for a journal item to make it to disk. Initiate journal processing 4982 * if required. 4983 */ 4984 static int 4985 jwait(wk, waitfor) 4986 struct worklist *wk; 4987 int waitfor; 4988 { 4989 4990 LOCK_OWNED(VFSTOUFS(wk->wk_mp)); 4991 /* 4992 * Blocking journal waits cause slow synchronous behavior. Record 4993 * stats on the frequency of these blocking operations. 4994 */ 4995 if (waitfor == MNT_WAIT) { 4996 stat_journal_wait++; 4997 switch (wk->wk_type) { 4998 case D_JREMREF: 4999 case D_JMVREF: 5000 stat_jwait_filepage++; 5001 break; 5002 case D_JTRUNC: 5003 case D_JFREEBLK: 5004 stat_jwait_freeblks++; 5005 break; 5006 case D_JNEWBLK: 5007 stat_jwait_newblk++; 5008 break; 5009 case D_JADDREF: 5010 stat_jwait_inode++; 5011 break; 5012 default: 5013 break; 5014 } 5015 } 5016 /* 5017 * If IO has not started we process the journal. We can't mark the 5018 * worklist item as IOWAITING because we drop the lock while 5019 * processing the journal and the worklist entry may be freed after 5020 * this point. The caller may call back in and re-issue the request. 5021 */ 5022 if ((wk->wk_state & INPROGRESS) == 0) { 5023 softdep_process_journal(wk->wk_mp, wk, waitfor); 5024 if (waitfor != MNT_WAIT) 5025 return (EBUSY); 5026 return (0); 5027 } 5028 if (waitfor != MNT_WAIT) 5029 return (EBUSY); 5030 wait_worklist(wk, "jwait"); 5031 return (0); 5032 } 5033 5034 /* 5035 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as 5036 * appropriate. This is a convenience function to reduce duplicate code 5037 * for the setup and revert functions below. 
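 *
 * For example (a typical situation rather than any specific code path):
 * after one name of a file with two links has been removed in memory,
 * i_effnlink is 1 while i_nlink is still 2, so id_nlinkdelta is set to 1
 * here and the extra link is retained in the on-disk inode until the
 * directory update that removed the name is stable.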
5038 */ 5039 static struct inodedep * 5040 inodedep_lookup_ip(ip) 5041 struct inode *ip; 5042 { 5043 struct inodedep *inodedep; 5044 5045 KASSERT(ip->i_nlink >= ip->i_effnlink, 5046 ("inodedep_lookup_ip: bad delta")); 5047 (void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC, 5048 &inodedep); 5049 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 5050 KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); 5051 5052 return (inodedep); 5053 } 5054 5055 /* 5056 * Called prior to creating a new inode and linking it to a directory. The 5057 * jaddref structure must already be allocated by softdep_setup_inomapdep 5058 * and it is discovered here so we can initialize the mode and update 5059 * nlinkdelta. 5060 */ 5061 void 5062 softdep_setup_create(dp, ip) 5063 struct inode *dp; 5064 struct inode *ip; 5065 { 5066 struct inodedep *inodedep; 5067 struct jaddref *jaddref; 5068 struct vnode *dvp; 5069 5070 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, 5071 ("softdep_setup_create called on non-softdep filesystem")); 5072 KASSERT(ip->i_nlink == 1, 5073 ("softdep_setup_create: Invalid link count.")); 5074 dvp = ITOV(dp); 5075 ACQUIRE_LOCK(ITOUMP(dp)); 5076 inodedep = inodedep_lookup_ip(ip); 5077 if (DOINGSUJ(dvp)) { 5078 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 5079 inoreflst); 5080 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 5081 ("softdep_setup_create: No addref structure present.")); 5082 } 5083 FREE_LOCK(ITOUMP(dp)); 5084 } 5085 5086 /* 5087 * Create a jaddref structure to track the addition of a DOTDOT link when 5088 * we are reparenting an inode as part of a rename. This jaddref will be 5089 * found by softdep_setup_directory_change. Adjusts nlinkdelta for 5090 * non-journaling softdep. 5091 */ 5092 void 5093 softdep_setup_dotdot_link(dp, ip) 5094 struct inode *dp; 5095 struct inode *ip; 5096 { 5097 struct inodedep *inodedep; 5098 struct jaddref *jaddref; 5099 struct vnode *dvp; 5100 5101 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, 5102 ("softdep_setup_dotdot_link called on non-softdep filesystem")); 5103 dvp = ITOV(dp); 5104 jaddref = NULL; 5105 /* 5106 * We don't set MKDIR_PARENT as this is not tied to a mkdir and 5107 * is used as a normal link would be. 5108 */ 5109 if (DOINGSUJ(dvp)) 5110 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 5111 dp->i_effnlink - 1, dp->i_mode); 5112 ACQUIRE_LOCK(ITOUMP(dp)); 5113 inodedep = inodedep_lookup_ip(dp); 5114 if (jaddref) 5115 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 5116 if_deps); 5117 FREE_LOCK(ITOUMP(dp)); 5118 } 5119 5120 /* 5121 * Create a jaddref structure to track a new link to an inode. The directory 5122 * offset is not known until softdep_setup_directory_add or 5123 * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling 5124 * softdep. 
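 *
 * A rough sketch of the expected calling sequence, simplified from the
 * ufs_link() path (error handling omitted):
 *
 *	ip->i_effnlink++;
 *	ip->i_nlink++;
 *	softdep_setup_link(dp, ip);		<- jaddref queued, offset unknown
 *	... the new directory entry is written ...
 *	softdep_setup_directory_add(...);	<- supplies the directory offset
 *
 * with softdep_revert_link() undoing the bookkeeping if the entry cannot
 * be created.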
5125 */ 5126 void 5127 softdep_setup_link(dp, ip) 5128 struct inode *dp; 5129 struct inode *ip; 5130 { 5131 struct inodedep *inodedep; 5132 struct jaddref *jaddref; 5133 struct vnode *dvp; 5134 5135 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, 5136 ("softdep_setup_link called on non-softdep filesystem")); 5137 dvp = ITOV(dp); 5138 jaddref = NULL; 5139 if (DOINGSUJ(dvp)) 5140 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, 5141 ip->i_mode); 5142 ACQUIRE_LOCK(ITOUMP(dp)); 5143 inodedep = inodedep_lookup_ip(ip); 5144 if (jaddref) 5145 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 5146 if_deps); 5147 FREE_LOCK(ITOUMP(dp)); 5148 } 5149 5150 /* 5151 * Called to create the jaddref structures to track . and .. references as 5152 * well as lookup and further initialize the incomplete jaddref created 5153 * by softdep_setup_inomapdep when the inode was allocated. Adjusts 5154 * nlinkdelta for non-journaling softdep. 5155 */ 5156 void 5157 softdep_setup_mkdir(dp, ip) 5158 struct inode *dp; 5159 struct inode *ip; 5160 { 5161 struct inodedep *inodedep; 5162 struct jaddref *dotdotaddref; 5163 struct jaddref *dotaddref; 5164 struct jaddref *jaddref; 5165 struct vnode *dvp; 5166 5167 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, 5168 ("softdep_setup_mkdir called on non-softdep filesystem")); 5169 dvp = ITOV(dp); 5170 dotaddref = dotdotaddref = NULL; 5171 if (DOINGSUJ(dvp)) { 5172 dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, 5173 ip->i_mode); 5174 dotaddref->ja_state |= MKDIR_BODY; 5175 dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 5176 dp->i_effnlink - 1, dp->i_mode); 5177 dotdotaddref->ja_state |= MKDIR_PARENT; 5178 } 5179 ACQUIRE_LOCK(ITOUMP(dp)); 5180 inodedep = inodedep_lookup_ip(ip); 5181 if (DOINGSUJ(dvp)) { 5182 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 5183 inoreflst); 5184 KASSERT(jaddref != NULL, 5185 ("softdep_setup_mkdir: No addref structure present.")); 5186 KASSERT(jaddref->ja_parent == dp->i_number, 5187 ("softdep_setup_mkdir: bad parent %ju", 5188 (uintmax_t)jaddref->ja_parent)); 5189 TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, 5190 if_deps); 5191 } 5192 inodedep = inodedep_lookup_ip(dp); 5193 if (DOINGSUJ(dvp)) 5194 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, 5195 &dotdotaddref->ja_ref, if_deps); 5196 FREE_LOCK(ITOUMP(dp)); 5197 } 5198 5199 /* 5200 * Called to track nlinkdelta of the inode and parent directories prior to 5201 * unlinking a directory. 5202 */ 5203 void 5204 softdep_setup_rmdir(dp, ip) 5205 struct inode *dp; 5206 struct inode *ip; 5207 { 5208 struct vnode *dvp; 5209 5210 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, 5211 ("softdep_setup_rmdir called on non-softdep filesystem")); 5212 dvp = ITOV(dp); 5213 ACQUIRE_LOCK(ITOUMP(dp)); 5214 (void) inodedep_lookup_ip(ip); 5215 (void) inodedep_lookup_ip(dp); 5216 FREE_LOCK(ITOUMP(dp)); 5217 } 5218 5219 /* 5220 * Called to track nlinkdelta of the inode and parent directories prior to 5221 * unlink. 5222 */ 5223 void 5224 softdep_setup_unlink(dp, ip) 5225 struct inode *dp; 5226 struct inode *ip; 5227 { 5228 struct vnode *dvp; 5229 5230 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, 5231 ("softdep_setup_unlink called on non-softdep filesystem")); 5232 dvp = ITOV(dp); 5233 ACQUIRE_LOCK(ITOUMP(dp)); 5234 (void) inodedep_lookup_ip(ip); 5235 (void) inodedep_lookup_ip(dp); 5236 FREE_LOCK(ITOUMP(dp)); 5237 } 5238 5239 /* 5240 * Called to release the journal structures created by a failed non-directory 5241 * creation. Adjusts nlinkdelta for non-journaling softdep. 
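 *
 * Each softdep_setup_*() hook in this group is paired with a
 * softdep_revert_*() that is called only on the failure path, roughly:
 *
 *	softdep_setup_create(dp, ip);
 *	error = ... attempt to complete the create ...;
 *	if (error)
 *		softdep_revert_create(dp, ip);
 *
 * The revert cancels the pending jaddref (via cancel_jaddref()) and
 * re-derives nlinkdelta from the caller's corrected link counts.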
5242 */ 5243 void 5244 softdep_revert_create(dp, ip) 5245 struct inode *dp; 5246 struct inode *ip; 5247 { 5248 struct inodedep *inodedep; 5249 struct jaddref *jaddref; 5250 struct vnode *dvp; 5251 5252 KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0, 5253 ("softdep_revert_create called on non-softdep filesystem")); 5254 dvp = ITOV(dp); 5255 ACQUIRE_LOCK(ITOUMP(dp)); 5256 inodedep = inodedep_lookup_ip(ip); 5257 if (DOINGSUJ(dvp)) { 5258 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 5259 inoreflst); 5260 KASSERT(jaddref->ja_parent == dp->i_number, 5261 ("softdep_revert_create: addref parent mismatch")); 5262 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 5263 } 5264 FREE_LOCK(ITOUMP(dp)); 5265 } 5266 5267 /* 5268 * Called to release the journal structures created by a failed link 5269 * addition. Adjusts nlinkdelta for non-journaling softdep. 5270 */ 5271 void 5272 softdep_revert_link(dp, ip) 5273 struct inode *dp; 5274 struct inode *ip; 5275 { 5276 struct inodedep *inodedep; 5277 struct jaddref *jaddref; 5278 struct vnode *dvp; 5279 5280 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, 5281 ("softdep_revert_link called on non-softdep filesystem")); 5282 dvp = ITOV(dp); 5283 ACQUIRE_LOCK(ITOUMP(dp)); 5284 inodedep = inodedep_lookup_ip(ip); 5285 if (DOINGSUJ(dvp)) { 5286 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 5287 inoreflst); 5288 KASSERT(jaddref->ja_parent == dp->i_number, 5289 ("softdep_revert_link: addref parent mismatch")); 5290 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 5291 } 5292 FREE_LOCK(ITOUMP(dp)); 5293 } 5294 5295 /* 5296 * Called to release the journal structures created by a failed mkdir 5297 * attempt. Adjusts nlinkdelta for non-journaling softdep. 5298 */ 5299 void 5300 softdep_revert_mkdir(dp, ip) 5301 struct inode *dp; 5302 struct inode *ip; 5303 { 5304 struct inodedep *inodedep; 5305 struct jaddref *jaddref; 5306 struct jaddref *dotaddref; 5307 struct vnode *dvp; 5308 5309 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, 5310 ("softdep_revert_mkdir called on non-softdep filesystem")); 5311 dvp = ITOV(dp); 5312 5313 ACQUIRE_LOCK(ITOUMP(dp)); 5314 inodedep = inodedep_lookup_ip(dp); 5315 if (DOINGSUJ(dvp)) { 5316 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 5317 inoreflst); 5318 KASSERT(jaddref->ja_parent == ip->i_number, 5319 ("softdep_revert_mkdir: dotdot addref parent mismatch")); 5320 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 5321 } 5322 inodedep = inodedep_lookup_ip(ip); 5323 if (DOINGSUJ(dvp)) { 5324 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 5325 inoreflst); 5326 KASSERT(jaddref->ja_parent == dp->i_number, 5327 ("softdep_revert_mkdir: addref parent mismatch")); 5328 dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, 5329 inoreflst, if_deps); 5330 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 5331 KASSERT(dotaddref->ja_parent == ip->i_number, 5332 ("softdep_revert_mkdir: dot addref parent mismatch")); 5333 cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait); 5334 } 5335 FREE_LOCK(ITOUMP(dp)); 5336 } 5337 5338 /* 5339 * Called to correct nlinkdelta after a failed rmdir. 
5340 */ 5341 void 5342 softdep_revert_rmdir(dp, ip) 5343 struct inode *dp; 5344 struct inode *ip; 5345 { 5346 5347 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, 5348 ("softdep_revert_rmdir called on non-softdep filesystem")); 5349 ACQUIRE_LOCK(ITOUMP(dp)); 5350 (void) inodedep_lookup_ip(ip); 5351 (void) inodedep_lookup_ip(dp); 5352 FREE_LOCK(ITOUMP(dp)); 5353 } 5354 5355 /* 5356 * Protecting the freemaps (or bitmaps). 5357 * 5358 * To eliminate the need to execute fsck before mounting a filesystem 5359 * after a power failure, one must (conservatively) guarantee that the 5360 * on-disk copy of the bitmaps never indicate that a live inode or block is 5361 * free. So, when a block or inode is allocated, the bitmap should be 5362 * updated (on disk) before any new pointers. When a block or inode is 5363 * freed, the bitmap should not be updated until all pointers have been 5364 * reset. The latter dependency is handled by the delayed de-allocation 5365 * approach described below for block and inode de-allocation. The former 5366 * dependency is handled by calling the following procedure when a block or 5367 * inode is allocated. When an inode is allocated an "inodedep" is created 5368 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. 5369 * Each "inodedep" is also inserted into the hash indexing structure so 5370 * that any additional link additions can be made dependent on the inode 5371 * allocation. 5372 * 5373 * The ufs filesystem maintains a number of free block counts (e.g., per 5374 * cylinder group, per cylinder and per <cylinder, rotational position> pair) 5375 * in addition to the bitmaps. These counts are used to improve efficiency 5376 * during allocation and therefore must be consistent with the bitmaps. 5377 * There is no convenient way to guarantee post-crash consistency of these 5378 * counts with simple update ordering, for two main reasons: (1) The counts 5379 * and bitmaps for a single cylinder group block are not in the same disk 5380 * sector. If a disk write is interrupted (e.g., by power failure), one may 5381 * be written and the other not. (2) Some of the counts are located in the 5382 * superblock rather than the cylinder group block. So, we focus our soft 5383 * updates implementation on protecting the bitmaps. When mounting a 5384 * filesystem, we recompute the auxiliary counts from the bitmaps. 5385 */ 5386 5387 /* 5388 * Called just after updating the cylinder group block to allocate an inode. 5389 */ 5390 void 5391 softdep_setup_inomapdep(bp, ip, newinum, mode) 5392 struct buf *bp; /* buffer for cylgroup block with inode map */ 5393 struct inode *ip; /* inode related to allocation */ 5394 ino_t newinum; /* new inode number being allocated */ 5395 int mode; 5396 { 5397 struct inodedep *inodedep; 5398 struct bmsafemap *bmsafemap; 5399 struct jaddref *jaddref; 5400 struct mount *mp; 5401 struct fs *fs; 5402 5403 mp = ITOVFS(ip); 5404 KASSERT(MOUNTEDSOFTDEP(mp) != 0, 5405 ("softdep_setup_inomapdep called on non-softdep filesystem")); 5406 fs = VFSTOUFS(mp)->um_fs; 5407 jaddref = NULL; 5408 5409 /* 5410 * Allocate the journal reference add structure so that the bitmap 5411 * can be dependent on it. 5412 */ 5413 if (MOUNTEDSUJ(mp)) { 5414 jaddref = newjaddref(ip, newinum, 0, 0, mode); 5415 jaddref->ja_state |= NEWBLOCK; 5416 } 5417 5418 /* 5419 * Create a dependency for the newly allocated inode. 5420 * Panic if it already exists as something is seriously wrong. 
5421 * Otherwise add it to the dependency list for the buffer holding 5422 * the cylinder group map from which it was allocated. 5423 * 5424 * We have to preallocate a bmsafemap entry in case it is needed 5425 * in bmsafemap_lookup since once we allocate the inodedep, we 5426 * have to finish initializing it before we can FREE_LOCK(). 5427 * By preallocating, we avoid FREE_LOCK() while doing a malloc 5428 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before 5429 * creating the inodedep as it can be freed during the time 5430 * that we FREE_LOCK() while allocating the inodedep. We must 5431 * call workitem_alloc() before entering the locked section as 5432 * it also acquires the lock and we must avoid trying doing so 5433 * recursively. 5434 */ 5435 bmsafemap = malloc(sizeof(struct bmsafemap), 5436 M_BMSAFEMAP, M_SOFTDEP_FLAGS); 5437 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); 5438 ACQUIRE_LOCK(ITOUMP(ip)); 5439 if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep))) 5440 panic("softdep_setup_inomapdep: dependency %p for new" 5441 "inode already exists", inodedep); 5442 bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap); 5443 if (jaddref) { 5444 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps); 5445 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 5446 if_deps); 5447 } else { 5448 inodedep->id_state |= ONDEPLIST; 5449 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); 5450 } 5451 inodedep->id_bmsafemap = bmsafemap; 5452 inodedep->id_state &= ~DEPCOMPLETE; 5453 FREE_LOCK(ITOUMP(ip)); 5454 } 5455 5456 /* 5457 * Called just after updating the cylinder group block to 5458 * allocate block or fragment. 5459 */ 5460 void 5461 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 5462 struct buf *bp; /* buffer for cylgroup block with block map */ 5463 struct mount *mp; /* filesystem doing allocation */ 5464 ufs2_daddr_t newblkno; /* number of newly allocated block */ 5465 int frags; /* Number of fragments. */ 5466 int oldfrags; /* Previous number of fragments for extend. */ 5467 { 5468 struct newblk *newblk; 5469 struct bmsafemap *bmsafemap; 5470 struct jnewblk *jnewblk; 5471 struct ufsmount *ump; 5472 struct fs *fs; 5473 5474 KASSERT(MOUNTEDSOFTDEP(mp) != 0, 5475 ("softdep_setup_blkmapdep called on non-softdep filesystem")); 5476 ump = VFSTOUFS(mp); 5477 fs = ump->um_fs; 5478 jnewblk = NULL; 5479 /* 5480 * Create a dependency for the newly allocated block. 5481 * Add it to the dependency list for the buffer holding 5482 * the cylinder group map from which it was allocated. 
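	 * For example (the block number is purely illustrative): if block
	 * 1234 is handed out here, the cylinder group buffer marking 1234
	 * as in use must reach the disk before any inode or indirect block
	 * that points at 1234 does; the newblk/bmsafemap (and, under SUJ,
	 * jnewblk) linkage set up below is what enforces that ordering.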
5483 */ 5484 if (MOUNTEDSUJ(mp)) { 5485 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); 5486 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); 5487 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); 5488 jnewblk->jn_state = ATTACHED; 5489 jnewblk->jn_blkno = newblkno; 5490 jnewblk->jn_frags = frags; 5491 jnewblk->jn_oldfrags = oldfrags; 5492 #ifdef INVARIANTS 5493 { 5494 struct cg *cgp; 5495 uint8_t *blksfree; 5496 long bno; 5497 int i; 5498 5499 cgp = (struct cg *)bp->b_data; 5500 blksfree = cg_blksfree(cgp); 5501 bno = dtogd(fs, jnewblk->jn_blkno); 5502 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 5503 i++) { 5504 if (isset(blksfree, bno + i)) 5505 panic("softdep_setup_blkmapdep: " 5506 "free fragment %d from %d-%d " 5507 "state 0x%X dep %p", i, 5508 jnewblk->jn_oldfrags, 5509 jnewblk->jn_frags, 5510 jnewblk->jn_state, 5511 jnewblk->jn_dep); 5512 } 5513 } 5514 #endif 5515 } 5516 5517 CTR3(KTR_SUJ, 5518 "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d", 5519 newblkno, frags, oldfrags); 5520 ACQUIRE_LOCK(ump); 5521 if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) 5522 panic("softdep_setup_blkmapdep: found block"); 5523 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, 5524 dtog(fs, newblkno), NULL); 5525 if (jnewblk) { 5526 jnewblk->jn_dep = (struct worklist *)newblk; 5527 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); 5528 } else { 5529 newblk->nb_state |= ONDEPLIST; 5530 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 5531 } 5532 newblk->nb_bmsafemap = bmsafemap; 5533 newblk->nb_jnewblk = jnewblk; 5534 FREE_LOCK(ump); 5535 } 5536 5537 #define BMSAFEMAP_HASH(ump, cg) \ 5538 (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size]) 5539 5540 static int 5541 bmsafemap_find(bmsafemaphd, cg, bmsafemapp) 5542 struct bmsafemap_hashhead *bmsafemaphd; 5543 int cg; 5544 struct bmsafemap **bmsafemapp; 5545 { 5546 struct bmsafemap *bmsafemap; 5547 5548 LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) 5549 if (bmsafemap->sm_cg == cg) 5550 break; 5551 if (bmsafemap) { 5552 *bmsafemapp = bmsafemap; 5553 return (1); 5554 } 5555 *bmsafemapp = NULL; 5556 5557 return (0); 5558 } 5559 5560 /* 5561 * Find the bmsafemap associated with a cylinder group buffer. 5562 * If none exists, create one. The buffer must be locked when 5563 * this routine is called and this routine must be called with 5564 * the softdep lock held. To avoid giving up the lock while 5565 * allocating a new bmsafemap, a preallocated bmsafemap may be 5566 * provided. If it is provided but not needed, it is freed. 
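 *
 * A rough sketch of the pre-allocation pattern (compare
 * softdep_setup_inomapdep() above; the local name is illustrative):
 *
 *	sm = malloc(sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS);
 *	workitem_alloc(&sm->sm_list, D_BMSAFEMAP, mp);
 *	ACQUIRE_LOCK(ump);
 *	... work that must not release the lock ...
 *	bmsafemap = bmsafemap_lookup(mp, bp, cg, sm);
 *	...
 *	FREE_LOCK(ump);
 *
 * If an existing bmsafemap is found, the preallocated one is freed by
 * this routine rather than by the caller.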
5567 */ 5568 static struct bmsafemap * 5569 bmsafemap_lookup(mp, bp, cg, newbmsafemap) 5570 struct mount *mp; 5571 struct buf *bp; 5572 int cg; 5573 struct bmsafemap *newbmsafemap; 5574 { 5575 struct bmsafemap_hashhead *bmsafemaphd; 5576 struct bmsafemap *bmsafemap, *collision; 5577 struct worklist *wk; 5578 struct ufsmount *ump; 5579 5580 ump = VFSTOUFS(mp); 5581 LOCK_OWNED(ump); 5582 KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer")); 5583 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 5584 if (wk->wk_type == D_BMSAFEMAP) { 5585 if (newbmsafemap) 5586 WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP); 5587 return (WK_BMSAFEMAP(wk)); 5588 } 5589 } 5590 bmsafemaphd = BMSAFEMAP_HASH(ump, cg); 5591 if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) { 5592 if (newbmsafemap) 5593 WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP); 5594 return (bmsafemap); 5595 } 5596 if (newbmsafemap) { 5597 bmsafemap = newbmsafemap; 5598 } else { 5599 FREE_LOCK(ump); 5600 bmsafemap = malloc(sizeof(struct bmsafemap), 5601 M_BMSAFEMAP, M_SOFTDEP_FLAGS); 5602 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); 5603 ACQUIRE_LOCK(ump); 5604 } 5605 bmsafemap->sm_buf = bp; 5606 LIST_INIT(&bmsafemap->sm_inodedephd); 5607 LIST_INIT(&bmsafemap->sm_inodedepwr); 5608 LIST_INIT(&bmsafemap->sm_newblkhd); 5609 LIST_INIT(&bmsafemap->sm_newblkwr); 5610 LIST_INIT(&bmsafemap->sm_jaddrefhd); 5611 LIST_INIT(&bmsafemap->sm_jnewblkhd); 5612 LIST_INIT(&bmsafemap->sm_freehd); 5613 LIST_INIT(&bmsafemap->sm_freewr); 5614 if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) { 5615 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 5616 return (collision); 5617 } 5618 bmsafemap->sm_cg = cg; 5619 LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); 5620 LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); 5621 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 5622 return (bmsafemap); 5623 } 5624 5625 /* 5626 * Direct block allocation dependencies. 5627 * 5628 * When a new block is allocated, the corresponding disk locations must be 5629 * initialized (with zeros or new data) before the on-disk inode points to 5630 * them. Also, the freemap from which the block was allocated must be 5631 * updated (on disk) before the inode's pointer. These two dependencies are 5632 * independent of each other and are needed for all file blocks and indirect 5633 * blocks that are pointed to directly by the inode. Just before the 5634 * "in-core" version of the inode is updated with a newly allocated block 5635 * number, a procedure (below) is called to setup allocation dependency 5636 * structures. These structures are removed when the corresponding 5637 * dependencies are satisfied or when the block allocation becomes obsolete 5638 * (i.e., the file is deleted, the block is de-allocated, or the block is a 5639 * fragment that gets upgraded). All of these cases are handled in 5640 * procedures described later. 5641 * 5642 * When a file extension causes a fragment to be upgraded, either to a larger 5643 * fragment or to a full block, the on-disk location may change (if the 5644 * previous fragment could not simply be extended). In this case, the old 5645 * fragment must be de-allocated, but not until after the inode's pointer has 5646 * been updated. In most cases, this is handled by later procedures, which 5647 * will construct a "freefrag" structure to be added to the workitem queue 5648 * when the inode update is complete (or obsolete). 
The main exception to
5649 * this is when an allocation occurs while a pending allocation dependency
5650 * (for the same block pointer) remains. This case is handled in the main
5651 * allocation dependency setup procedure by immediately freeing the
5652 * unreferenced fragments.
5653 */
5654 void
5655 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5656 struct inode *ip; /* inode to which block is being added */
5657 ufs_lbn_t off; /* block pointer within inode */
5658 ufs2_daddr_t newblkno; /* disk block number being added */
5659 ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */
5660 long newsize; /* size of new block */
5661 long oldsize; /* size of old block */
5662 struct buf *bp; /* bp for allocated block */
5663 {
5664 struct allocdirect *adp, *oldadp;
5665 struct allocdirectlst *adphead;
5666 struct freefrag *freefrag;
5667 struct inodedep *inodedep;
5668 struct pagedep *pagedep;
5669 struct jnewblk *jnewblk;
5670 struct newblk *newblk;
5671 struct mount *mp;
5672 ufs_lbn_t lbn;
5673 
5674 lbn = bp->b_lblkno;
5675 mp = ITOVFS(ip);
5676 KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5677 ("softdep_setup_allocdirect called on non-softdep filesystem"));
5678 if (oldblkno && oldblkno != newblkno)
5679 /*
5680 * The usual case is that a smaller fragment that
5681 * was just allocated has been replaced with a bigger
5682 * fragment or a full-size block. If it is marked as
5683 * B_DELWRI, the current contents have not been written
5684 * to disk. It is possible that the block was written
5685 * earlier, but very uncommon. If the block has never
5686 * been written, there is no need to send a BIO_DELETE
5687 * for it when it is freed. The gain from avoiding the
5688 * TRIMs for the common case of unwritten blocks far
5689 * exceeds the cost of the write amplification for the
5690 * uncommon case of failing to send a TRIM for a block
5691 * that had been written.
5692 */
5693 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
5694 (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
5695 else
5696 freefrag = NULL;
5697 
5698 CTR6(KTR_SUJ,
5699 "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5700 "off %jd newsize %ld oldsize %d",
5701 ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5702 ACQUIRE_LOCK(ITOUMP(ip));
5703 if (off >= UFS_NDADDR) {
5704 if (lbn > 0)
5705 panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5706 lbn, off);
5707 /* allocating an indirect block */
5708 if (oldblkno != 0)
5709 panic("softdep_setup_allocdirect: non-zero indir");
5710 } else {
5711 if (off != lbn)
5712 panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5713 lbn, off);
5714 /*
5715 * Allocating a direct block.
5716 *
5717 * If we are allocating a directory block, then we must
5718 * allocate an associated pagedep to track additions and
5719 * deletions.
5720 */
5721 if ((ip->i_mode & IFMT) == IFDIR)
5722 pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5723 &pagedep);
5724 }
5725 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5726 panic("softdep_setup_allocdirect: lost block");
5727 KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5728 ("softdep_setup_allocdirect: newblk already initialized"));
5729 /*
5730 * Convert the newblk to an allocdirect.
5731 */ 5732 WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT); 5733 adp = (struct allocdirect *)newblk; 5734 newblk->nb_freefrag = freefrag; 5735 adp->ad_offset = off; 5736 adp->ad_oldblkno = oldblkno; 5737 adp->ad_newsize = newsize; 5738 adp->ad_oldsize = oldsize; 5739 5740 /* 5741 * Finish initializing the journal. 5742 */ 5743 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 5744 jnewblk->jn_ino = ip->i_number; 5745 jnewblk->jn_lbn = lbn; 5746 add_to_journal(&jnewblk->jn_list); 5747 } 5748 if (freefrag && freefrag->ff_jdep != NULL && 5749 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 5750 add_to_journal(freefrag->ff_jdep); 5751 inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 5752 adp->ad_inodedep = inodedep; 5753 5754 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); 5755 /* 5756 * The list of allocdirects must be kept in sorted and ascending 5757 * order so that the rollback routines can quickly determine the 5758 * first uncommitted block (the size of the file stored on disk 5759 * ends at the end of the lowest committed fragment, or if there 5760 * are no fragments, at the end of the highest committed block). 5761 * Since files generally grow, the typical case is that the new 5762 * block is to be added at the end of the list. We speed this 5763 * special case by checking against the last allocdirect in the 5764 * list before laboriously traversing the list looking for the 5765 * insertion point. 5766 */ 5767 adphead = &inodedep->id_newinoupdt; 5768 oldadp = TAILQ_LAST(adphead, allocdirectlst); 5769 if (oldadp == NULL || oldadp->ad_offset <= off) { 5770 /* insert at end of list */ 5771 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 5772 if (oldadp != NULL && oldadp->ad_offset == off) 5773 allocdirect_merge(adphead, adp, oldadp); 5774 FREE_LOCK(ITOUMP(ip)); 5775 return; 5776 } 5777 TAILQ_FOREACH(oldadp, adphead, ad_next) { 5778 if (oldadp->ad_offset >= off) 5779 break; 5780 } 5781 if (oldadp == NULL) 5782 panic("softdep_setup_allocdirect: lost entry"); 5783 /* insert in middle of list */ 5784 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 5785 if (oldadp->ad_offset == off) 5786 allocdirect_merge(adphead, adp, oldadp); 5787 5788 FREE_LOCK(ITOUMP(ip)); 5789 } 5790 5791 /* 5792 * Merge a newer and older journal record to be stored either in a 5793 * newblock or freefrag. This handles aggregating journal records for 5794 * fragment allocation into a second record as well as replacing a 5795 * journal free with an aborted journal allocation. A segment for the 5796 * oldest record will be placed on wkhd if it has been written. If not 5797 * the segment for the newer record will suffice. 5798 */ 5799 static struct worklist * 5800 jnewblk_merge(new, old, wkhd) 5801 struct worklist *new; 5802 struct worklist *old; 5803 struct workhead *wkhd; 5804 { 5805 struct jnewblk *njnewblk; 5806 struct jnewblk *jnewblk; 5807 5808 /* Handle NULLs to simplify callers. */ 5809 if (new == NULL) 5810 return (old); 5811 if (old == NULL) 5812 return (new); 5813 /* Replace a jfreefrag with a jnewblk. */ 5814 if (new->wk_type == D_JFREEFRAG) { 5815 if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno) 5816 panic("jnewblk_merge: blkno mismatch: %p, %p", 5817 old, new); 5818 cancel_jfreefrag(WK_JFREEFRAG(new)); 5819 return (old); 5820 } 5821 if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK) 5822 panic("jnewblk_merge: Bad type: old %d new %d\n", 5823 old->wk_type, new->wk_type); 5824 /* 5825 * Handle merging of two jnewblk records that describe 5826 * different sets of fragments in the same block. 
5827 */ 5828 jnewblk = WK_JNEWBLK(old); 5829 njnewblk = WK_JNEWBLK(new); 5830 if (jnewblk->jn_blkno != njnewblk->jn_blkno) 5831 panic("jnewblk_merge: Merging disparate blocks."); 5832 /* 5833 * The record may be rolled back in the cg. 5834 */ 5835 if (jnewblk->jn_state & UNDONE) { 5836 jnewblk->jn_state &= ~UNDONE; 5837 njnewblk->jn_state |= UNDONE; 5838 njnewblk->jn_state &= ~ATTACHED; 5839 } 5840 /* 5841 * We modify the newer addref and free the older so that if neither 5842 * has been written the most up-to-date copy will be on disk. If 5843 * both have been written but rolled back we only temporarily need 5844 * one of them to fix the bits when the cg write completes. 5845 */ 5846 jnewblk->jn_state |= ATTACHED | COMPLETE; 5847 njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; 5848 cancel_jnewblk(jnewblk, wkhd); 5849 WORKLIST_REMOVE(&jnewblk->jn_list); 5850 free_jnewblk(jnewblk); 5851 return (new); 5852 } 5853 5854 /* 5855 * Replace an old allocdirect dependency with a newer one. 5856 */ 5857 static void 5858 allocdirect_merge(adphead, newadp, oldadp) 5859 struct allocdirectlst *adphead; /* head of list holding allocdirects */ 5860 struct allocdirect *newadp; /* allocdirect being added */ 5861 struct allocdirect *oldadp; /* existing allocdirect being checked */ 5862 { 5863 struct worklist *wk; 5864 struct freefrag *freefrag; 5865 5866 freefrag = NULL; 5867 LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp)); 5868 if (newadp->ad_oldblkno != oldadp->ad_newblkno || 5869 newadp->ad_oldsize != oldadp->ad_newsize || 5870 newadp->ad_offset >= UFS_NDADDR) 5871 panic("%s %jd != new %jd || old size %ld != new %ld", 5872 "allocdirect_merge: old blkno", 5873 (intmax_t)newadp->ad_oldblkno, 5874 (intmax_t)oldadp->ad_newblkno, 5875 newadp->ad_oldsize, oldadp->ad_newsize); 5876 newadp->ad_oldblkno = oldadp->ad_oldblkno; 5877 newadp->ad_oldsize = oldadp->ad_oldsize; 5878 /* 5879 * If the old dependency had a fragment to free or had never 5880 * previously had a block allocated, then the new dependency 5881 * can immediately post its freefrag and adopt the old freefrag. 5882 * This action is done by swapping the freefrag dependencies. 5883 * The new dependency gains the old one's freefrag, and the 5884 * old one gets the new one and then immediately puts it on 5885 * the worklist when it is freed by free_newblk. It is 5886 * not possible to do this swap when the old dependency had a 5887 * non-zero size but no previous fragment to free. This condition 5888 * arises when the new block is an extension of the old block. 5889 * Here, the first part of the fragment allocated to the new 5890 * dependency is part of the block currently claimed on disk by 5891 * the old dependency, so cannot legitimately be freed until the 5892 * conditions for the new dependency are fulfilled. 5893 */ 5894 freefrag = newadp->ad_freefrag; 5895 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { 5896 newadp->ad_freefrag = oldadp->ad_freefrag; 5897 oldadp->ad_freefrag = freefrag; 5898 } 5899 /* 5900 * If we are tracking a new directory-block allocation, 5901 * move it from the old allocdirect to the new allocdirect. 5902 */ 5903 if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { 5904 WORKLIST_REMOVE(wk); 5905 if (!LIST_EMPTY(&oldadp->ad_newdirblk)) 5906 panic("allocdirect_merge: extra newdirblk"); 5907 WORKLIST_INSERT(&newadp->ad_newdirblk, wk); 5908 } 5909 TAILQ_REMOVE(adphead, oldadp, ad_next); 5910 /* 5911 * We need to move any journal dependencies over to the freefrag 5912 * that releases this block if it exists. 
Otherwise we are 5913 * extending an existing block and we'll wait until that is 5914 * complete to release the journal space and extend the 5915 * new journal to cover this old space as well. 5916 */ 5917 if (freefrag == NULL) { 5918 if (oldadp->ad_newblkno != newadp->ad_newblkno) 5919 panic("allocdirect_merge: %jd != %jd", 5920 oldadp->ad_newblkno, newadp->ad_newblkno); 5921 newadp->ad_block.nb_jnewblk = (struct jnewblk *) 5922 jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list, 5923 &oldadp->ad_block.nb_jnewblk->jn_list, 5924 &newadp->ad_block.nb_jwork); 5925 oldadp->ad_block.nb_jnewblk = NULL; 5926 cancel_newblk(&oldadp->ad_block, NULL, 5927 &newadp->ad_block.nb_jwork); 5928 } else { 5929 wk = (struct worklist *) cancel_newblk(&oldadp->ad_block, 5930 &freefrag->ff_list, &freefrag->ff_jwork); 5931 freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk, 5932 &freefrag->ff_jwork); 5933 } 5934 free_newblk(&oldadp->ad_block); 5935 } 5936 5937 /* 5938 * Allocate a jfreefrag structure to journal a single block free. 5939 */ 5940 static struct jfreefrag * 5941 newjfreefrag(freefrag, ip, blkno, size, lbn) 5942 struct freefrag *freefrag; 5943 struct inode *ip; 5944 ufs2_daddr_t blkno; 5945 long size; 5946 ufs_lbn_t lbn; 5947 { 5948 struct jfreefrag *jfreefrag; 5949 struct fs *fs; 5950 5951 fs = ITOFS(ip); 5952 jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, 5953 M_SOFTDEP_FLAGS); 5954 workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip)); 5955 jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); 5956 jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; 5957 jfreefrag->fr_ino = ip->i_number; 5958 jfreefrag->fr_lbn = lbn; 5959 jfreefrag->fr_blkno = blkno; 5960 jfreefrag->fr_frags = numfrags(fs, size); 5961 jfreefrag->fr_freefrag = freefrag; 5962 5963 return (jfreefrag); 5964 } 5965 5966 /* 5967 * Allocate a new freefrag structure. 5968 */ 5969 static struct freefrag * 5970 newfreefrag(ip, blkno, size, lbn, key) 5971 struct inode *ip; 5972 ufs2_daddr_t blkno; 5973 long size; 5974 ufs_lbn_t lbn; 5975 u_long key; 5976 { 5977 struct freefrag *freefrag; 5978 struct ufsmount *ump; 5979 struct fs *fs; 5980 5981 CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd", 5982 ip->i_number, blkno, size, lbn); 5983 ump = ITOUMP(ip); 5984 fs = ump->um_fs; 5985 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 5986 panic("newfreefrag: frag size"); 5987 freefrag = malloc(sizeof(struct freefrag), 5988 M_FREEFRAG, M_SOFTDEP_FLAGS); 5989 workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump)); 5990 freefrag->ff_state = ATTACHED; 5991 LIST_INIT(&freefrag->ff_jwork); 5992 freefrag->ff_inum = ip->i_number; 5993 freefrag->ff_vtype = ITOV(ip)->v_type; 5994 freefrag->ff_blkno = blkno; 5995 freefrag->ff_fragsize = size; 5996 freefrag->ff_key = key; 5997 5998 if (MOUNTEDSUJ(UFSTOVFS(ump))) { 5999 freefrag->ff_jdep = (struct worklist *) 6000 newjfreefrag(freefrag, ip, blkno, size, lbn); 6001 } else { 6002 freefrag->ff_state |= DEPCOMPLETE; 6003 freefrag->ff_jdep = NULL; 6004 } 6005 6006 return (freefrag); 6007 } 6008 6009 /* 6010 * This workitem de-allocates fragments that were replaced during 6011 * file block allocation. 
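 *
 * For orientation, the life cycle of a freefrag (a summary of the
 * surrounding routines, not a separate mechanism):
 *
 *	newfreefrag()		   created, with a jfreefrag when journaling
 *	allocdirect_merge()	   may be swapped between dependencies
 *	free_newblk()		   posted to the per-mount worklist
 *	handle_workitem_freefrag() ffs_blkfree() and WORKITEM_FREE()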
6012 */ 6013 static void 6014 handle_workitem_freefrag(freefrag) 6015 struct freefrag *freefrag; 6016 { 6017 struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); 6018 struct workhead wkhd; 6019 6020 CTR3(KTR_SUJ, 6021 "handle_workitem_freefrag: ino %d blkno %jd size %ld", 6022 freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize); 6023 /* 6024 * It would be illegal to add new completion items to the 6025 * freefrag after it was schedule to be done so it must be 6026 * safe to modify the list head here. 6027 */ 6028 LIST_INIT(&wkhd); 6029 ACQUIRE_LOCK(ump); 6030 LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); 6031 /* 6032 * If the journal has not been written we must cancel it here. 6033 */ 6034 if (freefrag->ff_jdep) { 6035 if (freefrag->ff_jdep->wk_type != D_JNEWBLK) 6036 panic("handle_workitem_freefrag: Unexpected type %d\n", 6037 freefrag->ff_jdep->wk_type); 6038 cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd); 6039 } 6040 FREE_LOCK(ump); 6041 ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, 6042 freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, 6043 &wkhd, freefrag->ff_key); 6044 ACQUIRE_LOCK(ump); 6045 WORKITEM_FREE(freefrag, D_FREEFRAG); 6046 FREE_LOCK(ump); 6047 } 6048 6049 /* 6050 * Set up a dependency structure for an external attributes data block. 6051 * This routine follows much of the structure of softdep_setup_allocdirect. 6052 * See the description of softdep_setup_allocdirect above for details. 6053 */ 6054 void 6055 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) 6056 struct inode *ip; 6057 ufs_lbn_t off; 6058 ufs2_daddr_t newblkno; 6059 ufs2_daddr_t oldblkno; 6060 long newsize; 6061 long oldsize; 6062 struct buf *bp; 6063 { 6064 struct allocdirect *adp, *oldadp; 6065 struct allocdirectlst *adphead; 6066 struct freefrag *freefrag; 6067 struct inodedep *inodedep; 6068 struct jnewblk *jnewblk; 6069 struct newblk *newblk; 6070 struct mount *mp; 6071 struct ufsmount *ump; 6072 ufs_lbn_t lbn; 6073 6074 mp = ITOVFS(ip); 6075 ump = VFSTOUFS(mp); 6076 KASSERT(MOUNTEDSOFTDEP(mp) != 0, 6077 ("softdep_setup_allocext called on non-softdep filesystem")); 6078 KASSERT(off < UFS_NXADDR, 6079 ("softdep_setup_allocext: lbn %lld > UFS_NXADDR", (long long)off)); 6080 6081 lbn = bp->b_lblkno; 6082 if (oldblkno && oldblkno != newblkno) 6083 /* 6084 * The usual case is that a smaller fragment that 6085 * was just allocated has been replaced with a bigger 6086 * fragment or a full-size block. If it is marked as 6087 * B_DELWRI, the current contents have not been written 6088 * to disk. It is possible that the block was written 6089 * earlier, but very uncommon. If the block has never 6090 * been written, there is no need to send a BIO_DELETE 6091 * for it when it is freed. The gain from avoiding the 6092 * TRIMs for the common case of unwritten blocks far 6093 * exceeds the cost of the write amplification for the 6094 * uncommon case of failing to send a TRIM for a block 6095 * that had been written. 6096 */ 6097 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn, 6098 (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); 6099 else 6100 freefrag = NULL; 6101 6102 ACQUIRE_LOCK(ump); 6103 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) 6104 panic("softdep_setup_allocext: lost block"); 6105 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 6106 ("softdep_setup_allocext: newblk already initialized")); 6107 /* 6108 * Convert the newblk to an allocdirect. 
6109 */ 6110 WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT); 6111 adp = (struct allocdirect *)newblk; 6112 newblk->nb_freefrag = freefrag; 6113 adp->ad_offset = off; 6114 adp->ad_oldblkno = oldblkno; 6115 adp->ad_newsize = newsize; 6116 adp->ad_oldsize = oldsize; 6117 adp->ad_state |= EXTDATA; 6118 6119 /* 6120 * Finish initializing the journal. 6121 */ 6122 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 6123 jnewblk->jn_ino = ip->i_number; 6124 jnewblk->jn_lbn = lbn; 6125 add_to_journal(&jnewblk->jn_list); 6126 } 6127 if (freefrag && freefrag->ff_jdep != NULL && 6128 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 6129 add_to_journal(freefrag->ff_jdep); 6130 inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 6131 adp->ad_inodedep = inodedep; 6132 6133 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); 6134 /* 6135 * The list of allocdirects must be kept in sorted and ascending 6136 * order so that the rollback routines can quickly determine the 6137 * first uncommitted block (the size of the file stored on disk 6138 * ends at the end of the lowest committed fragment, or if there 6139 * are no fragments, at the end of the highest committed block). 6140 * Since files generally grow, the typical case is that the new 6141 * block is to be added at the end of the list. We speed this 6142 * special case by checking against the last allocdirect in the 6143 * list before laboriously traversing the list looking for the 6144 * insertion point. 6145 */ 6146 adphead = &inodedep->id_newextupdt; 6147 oldadp = TAILQ_LAST(adphead, allocdirectlst); 6148 if (oldadp == NULL || oldadp->ad_offset <= off) { 6149 /* insert at end of list */ 6150 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 6151 if (oldadp != NULL && oldadp->ad_offset == off) 6152 allocdirect_merge(adphead, adp, oldadp); 6153 FREE_LOCK(ump); 6154 return; 6155 } 6156 TAILQ_FOREACH(oldadp, adphead, ad_next) { 6157 if (oldadp->ad_offset >= off) 6158 break; 6159 } 6160 if (oldadp == NULL) 6161 panic("softdep_setup_allocext: lost entry"); 6162 /* insert in middle of list */ 6163 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 6164 if (oldadp->ad_offset == off) 6165 allocdirect_merge(adphead, adp, oldadp); 6166 FREE_LOCK(ump); 6167 } 6168 6169 /* 6170 * Indirect block allocation dependencies. 6171 * 6172 * The same dependencies that exist for a direct block also exist when 6173 * a new block is allocated and pointed to by an entry in a block of 6174 * indirect pointers. The undo/redo states described above are also 6175 * used here. Because an indirect block contains many pointers that 6176 * may have dependencies, a second copy of the entire in-memory indirect 6177 * block is kept. The buffer cache copy is always completely up-to-date. 6178 * The second copy, which is used only as a source for disk writes, 6179 * contains only the safe pointers (i.e., those that have no remaining 6180 * update dependencies). The second copy is freed when all pointers 6181 * are safe. The cache is not allowed to replace indirect blocks with 6182 * pending update dependencies. If a buffer containing an indirect 6183 * block with dependencies is written, these routines will mark it 6184 * dirty again. It can only be successfully written once all the 6185 * dependencies are removed. The ffs_fsync routine in conjunction with 6186 * softdep_sync_metadata work together to get all the dependencies 6187 * removed so that a file can be successfully written to disk. Three 6188 * procedures are used when setting up indirect block pointer 6189 * dependencies. 
The division is necessary because of the organization 6190 * of the "balloc" routine and because of the distinction between file 6191 * pages and file metadata blocks. 6192 */ 6193 6194 /* 6195 * Allocate a new allocindir structure. 6196 */ 6197 static struct allocindir * 6198 newallocindir(ip, ptrno, newblkno, oldblkno, lbn) 6199 struct inode *ip; /* inode for file being extended */ 6200 int ptrno; /* offset of pointer in indirect block */ 6201 ufs2_daddr_t newblkno; /* disk block number being added */ 6202 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 6203 ufs_lbn_t lbn; 6204 { 6205 struct newblk *newblk; 6206 struct allocindir *aip; 6207 struct freefrag *freefrag; 6208 struct jnewblk *jnewblk; 6209 6210 if (oldblkno) 6211 freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn, 6212 SINGLETON_KEY); 6213 else 6214 freefrag = NULL; 6215 ACQUIRE_LOCK(ITOUMP(ip)); 6216 if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0) 6217 panic("new_allocindir: lost block"); 6218 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 6219 ("newallocindir: newblk already initialized")); 6220 WORKITEM_REASSIGN(newblk, D_ALLOCINDIR); 6221 newblk->nb_freefrag = freefrag; 6222 aip = (struct allocindir *)newblk; 6223 aip->ai_offset = ptrno; 6224 aip->ai_oldblkno = oldblkno; 6225 aip->ai_lbn = lbn; 6226 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 6227 jnewblk->jn_ino = ip->i_number; 6228 jnewblk->jn_lbn = lbn; 6229 add_to_journal(&jnewblk->jn_list); 6230 } 6231 if (freefrag && freefrag->ff_jdep != NULL && 6232 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 6233 add_to_journal(freefrag->ff_jdep); 6234 return (aip); 6235 } 6236 6237 /* 6238 * Called just before setting an indirect block pointer 6239 * to a newly allocated file page. 6240 */ 6241 void 6242 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 6243 struct inode *ip; /* inode for file being extended */ 6244 ufs_lbn_t lbn; /* allocated block number within file */ 6245 struct buf *bp; /* buffer with indirect blk referencing page */ 6246 int ptrno; /* offset of pointer in indirect block */ 6247 ufs2_daddr_t newblkno; /* disk block number being added */ 6248 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 6249 struct buf *nbp; /* buffer holding allocated page */ 6250 { 6251 struct inodedep *inodedep; 6252 struct freefrag *freefrag; 6253 struct allocindir *aip; 6254 struct pagedep *pagedep; 6255 struct mount *mp; 6256 struct ufsmount *ump; 6257 6258 mp = ITOVFS(ip); 6259 ump = VFSTOUFS(mp); 6260 KASSERT(MOUNTEDSOFTDEP(mp) != 0, 6261 ("softdep_setup_allocindir_page called on non-softdep filesystem")); 6262 KASSERT(lbn == nbp->b_lblkno, 6263 ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", 6264 lbn, bp->b_lblkno)); 6265 CTR4(KTR_SUJ, 6266 "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd " 6267 "lbn %jd", ip->i_number, newblkno, oldblkno, lbn); 6268 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); 6269 aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); 6270 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 6271 /* 6272 * If we are allocating a directory page, then we must 6273 * allocate an associated pagedep to track additions and 6274 * deletions. 
6275 */ 6276 if ((ip->i_mode & IFMT) == IFDIR) 6277 pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep); 6278 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 6279 freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); 6280 FREE_LOCK(ump); 6281 if (freefrag) 6282 handle_workitem_freefrag(freefrag); 6283 } 6284 6285 /* 6286 * Called just before setting an indirect block pointer to a 6287 * newly allocated indirect block. 6288 */ 6289 void 6290 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 6291 struct buf *nbp; /* newly allocated indirect block */ 6292 struct inode *ip; /* inode for file being extended */ 6293 struct buf *bp; /* indirect block referencing allocated block */ 6294 int ptrno; /* offset of pointer in indirect block */ 6295 ufs2_daddr_t newblkno; /* disk block number being added */ 6296 { 6297 struct inodedep *inodedep; 6298 struct allocindir *aip; 6299 struct ufsmount *ump; 6300 ufs_lbn_t lbn; 6301 6302 ump = ITOUMP(ip); 6303 KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, 6304 ("softdep_setup_allocindir_meta called on non-softdep filesystem")); 6305 CTR3(KTR_SUJ, 6306 "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d", 6307 ip->i_number, newblkno, ptrno); 6308 lbn = nbp->b_lblkno; 6309 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); 6310 aip = newallocindir(ip, ptrno, newblkno, 0, lbn); 6311 inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep); 6312 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 6313 if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)) 6314 panic("softdep_setup_allocindir_meta: Block already existed"); 6315 FREE_LOCK(ump); 6316 } 6317 6318 static void 6319 indirdep_complete(indirdep) 6320 struct indirdep *indirdep; 6321 { 6322 struct allocindir *aip; 6323 6324 LIST_REMOVE(indirdep, ir_next); 6325 indirdep->ir_state |= DEPCOMPLETE; 6326 6327 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { 6328 LIST_REMOVE(aip, ai_next); 6329 free_newblk(&aip->ai_block); 6330 } 6331 /* 6332 * If this indirdep is not attached to a buf it was simply waiting 6333 * on completion to clear completehd. free_indirdep() asserts 6334 * that nothing is dangling. 6335 */ 6336 if ((indirdep->ir_state & ONWORKLIST) == 0) 6337 free_indirdep(indirdep); 6338 } 6339 6340 static struct indirdep * 6341 indirdep_lookup(mp, ip, bp) 6342 struct mount *mp; 6343 struct inode *ip; 6344 struct buf *bp; 6345 { 6346 struct indirdep *indirdep, *newindirdep; 6347 struct newblk *newblk; 6348 struct ufsmount *ump; 6349 struct worklist *wk; 6350 struct fs *fs; 6351 ufs2_daddr_t blkno; 6352 6353 ump = VFSTOUFS(mp); 6354 LOCK_OWNED(ump); 6355 indirdep = NULL; 6356 newindirdep = NULL; 6357 fs = ump->um_fs; 6358 for (;;) { 6359 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 6360 if (wk->wk_type != D_INDIRDEP) 6361 continue; 6362 indirdep = WK_INDIRDEP(wk); 6363 break; 6364 } 6365 /* Found on the buffer worklist, no new structure to free. */ 6366 if (indirdep != NULL && newindirdep == NULL) 6367 return (indirdep); 6368 if (indirdep != NULL && newindirdep != NULL) 6369 panic("indirdep_lookup: simultaneous create"); 6370 /* None found on the buffer and a new structure is ready. */ 6371 if (indirdep == NULL && newindirdep != NULL) 6372 break; 6373 /* None found and no new structure available. 
*/ 6374 FREE_LOCK(ump); 6375 newindirdep = malloc(sizeof(struct indirdep), 6376 M_INDIRDEP, M_SOFTDEP_FLAGS); 6377 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); 6378 newindirdep->ir_state = ATTACHED; 6379 if (I_IS_UFS1(ip)) 6380 newindirdep->ir_state |= UFS1FMT; 6381 TAILQ_INIT(&newindirdep->ir_trunc); 6382 newindirdep->ir_saveddata = NULL; 6383 LIST_INIT(&newindirdep->ir_deplisthd); 6384 LIST_INIT(&newindirdep->ir_donehd); 6385 LIST_INIT(&newindirdep->ir_writehd); 6386 LIST_INIT(&newindirdep->ir_completehd); 6387 if (bp->b_blkno == bp->b_lblkno) { 6388 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, 6389 NULL, NULL); 6390 bp->b_blkno = blkno; 6391 } 6392 newindirdep->ir_freeblks = NULL; 6393 newindirdep->ir_savebp = 6394 getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); 6395 newindirdep->ir_bp = bp; 6396 BUF_KERNPROC(newindirdep->ir_savebp); 6397 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); 6398 ACQUIRE_LOCK(ump); 6399 } 6400 indirdep = newindirdep; 6401 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); 6402 /* 6403 * If the block is not yet allocated we don't set DEPCOMPLETE so 6404 * that we don't free dependencies until the pointers are valid. 6405 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather 6406 * than using the hash. 6407 */ 6408 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)) 6409 LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next); 6410 else 6411 indirdep->ir_state |= DEPCOMPLETE; 6412 return (indirdep); 6413 } 6414 6415 /* 6416 * Called to finish the allocation of the "aip" allocated 6417 * by one of the two routines above. 6418 */ 6419 static struct freefrag * 6420 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) 6421 struct buf *bp; /* in-memory copy of the indirect block */ 6422 struct inode *ip; /* inode for file being extended */ 6423 struct inodedep *inodedep; /* Inodedep for ip */ 6424 struct allocindir *aip; /* allocindir allocated by the above routines */ 6425 ufs_lbn_t lbn; /* Logical block number for this block. */ 6426 { 6427 struct fs *fs; 6428 struct indirdep *indirdep; 6429 struct allocindir *oldaip; 6430 struct freefrag *freefrag; 6431 struct mount *mp; 6432 struct ufsmount *ump; 6433 6434 mp = ITOVFS(ip); 6435 ump = VFSTOUFS(mp); 6436 LOCK_OWNED(ump); 6437 fs = ump->um_fs; 6438 if (bp->b_lblkno >= 0) 6439 panic("setup_allocindir_phase2: not indir blk"); 6440 KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs), 6441 ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset)); 6442 indirdep = indirdep_lookup(mp, ip, bp); 6443 KASSERT(indirdep->ir_savebp != NULL, 6444 ("setup_allocindir_phase2 NULL ir_savebp")); 6445 aip->ai_indirdep = indirdep; 6446 /* 6447 * Check for an unwritten dependency for this indirect offset. If 6448 * there is, merge the old dependency into the new one. This happens 6449 * as a result of reallocblk only. 6450 */ 6451 freefrag = NULL; 6452 if (aip->ai_oldblkno != 0) { 6453 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) { 6454 if (oldaip->ai_offset == aip->ai_offset) { 6455 freefrag = allocindir_merge(aip, oldaip); 6456 goto done; 6457 } 6458 } 6459 LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) { 6460 if (oldaip->ai_offset == aip->ai_offset) { 6461 freefrag = allocindir_merge(aip, oldaip); 6462 goto done; 6463 } 6464 } 6465 } 6466 done: 6467 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); 6468 return (freefrag); 6469 } 6470 6471 /* 6472 * Merge two allocindirs which refer to the same block. 
Move newblock 6473 * dependencies and setup the freefrags appropriately. 6474 */ 6475 static struct freefrag * 6476 allocindir_merge(aip, oldaip) 6477 struct allocindir *aip; 6478 struct allocindir *oldaip; 6479 { 6480 struct freefrag *freefrag; 6481 struct worklist *wk; 6482 6483 if (oldaip->ai_newblkno != aip->ai_oldblkno) 6484 panic("allocindir_merge: blkno"); 6485 aip->ai_oldblkno = oldaip->ai_oldblkno; 6486 freefrag = aip->ai_freefrag; 6487 aip->ai_freefrag = oldaip->ai_freefrag; 6488 oldaip->ai_freefrag = NULL; 6489 KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); 6490 /* 6491 * If we are tracking a new directory-block allocation, 6492 * move it from the old allocindir to the new allocindir. 6493 */ 6494 if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { 6495 WORKLIST_REMOVE(wk); 6496 if (!LIST_EMPTY(&oldaip->ai_newdirblk)) 6497 panic("allocindir_merge: extra newdirblk"); 6498 WORKLIST_INSERT(&aip->ai_newdirblk, wk); 6499 } 6500 /* 6501 * We can skip journaling for this freefrag and just complete 6502 * any pending journal work for the allocindir that is being 6503 * removed after the freefrag completes. 6504 */ 6505 if (freefrag->ff_jdep) 6506 cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep)); 6507 LIST_REMOVE(oldaip, ai_next); 6508 freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block, 6509 &freefrag->ff_list, &freefrag->ff_jwork); 6510 free_newblk(&oldaip->ai_block); 6511 6512 return (freefrag); 6513 } 6514 6515 static inline void 6516 setup_freedirect(freeblks, ip, i, needj) 6517 struct freeblks *freeblks; 6518 struct inode *ip; 6519 int i; 6520 int needj; 6521 { 6522 struct ufsmount *ump; 6523 ufs2_daddr_t blkno; 6524 int frags; 6525 6526 blkno = DIP(ip, i_db[i]); 6527 if (blkno == 0) 6528 return; 6529 DIP_SET(ip, i_db[i], 0); 6530 ump = ITOUMP(ip); 6531 frags = sblksize(ump->um_fs, ip->i_size, i); 6532 frags = numfrags(ump->um_fs, frags); 6533 newfreework(ump, freeblks, NULL, i, blkno, frags, 0, needj); 6534 } 6535 6536 static inline void 6537 setup_freeext(freeblks, ip, i, needj) 6538 struct freeblks *freeblks; 6539 struct inode *ip; 6540 int i; 6541 int needj; 6542 { 6543 struct ufsmount *ump; 6544 ufs2_daddr_t blkno; 6545 int frags; 6546 6547 blkno = ip->i_din2->di_extb[i]; 6548 if (blkno == 0) 6549 return; 6550 ip->i_din2->di_extb[i] = 0; 6551 ump = ITOUMP(ip); 6552 frags = sblksize(ump->um_fs, ip->i_din2->di_extsize, i); 6553 frags = numfrags(ump->um_fs, frags); 6554 newfreework(ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj); 6555 } 6556 6557 static inline void 6558 setup_freeindir(freeblks, ip, i, lbn, needj) 6559 struct freeblks *freeblks; 6560 struct inode *ip; 6561 int i; 6562 ufs_lbn_t lbn; 6563 int needj; 6564 { 6565 struct ufsmount *ump; 6566 ufs2_daddr_t blkno; 6567 6568 blkno = DIP(ip, i_ib[i]); 6569 if (blkno == 0) 6570 return; 6571 DIP_SET(ip, i_ib[i], 0); 6572 ump = ITOUMP(ip); 6573 newfreework(ump, freeblks, NULL, lbn, blkno, ump->um_fs->fs_frag, 6574 0, needj); 6575 } 6576 6577 static inline struct freeblks * 6578 newfreeblks(mp, ip) 6579 struct mount *mp; 6580 struct inode *ip; 6581 { 6582 struct freeblks *freeblks; 6583 6584 freeblks = malloc(sizeof(struct freeblks), 6585 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); 6586 workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); 6587 LIST_INIT(&freeblks->fb_jblkdephd); 6588 LIST_INIT(&freeblks->fb_jwork); 6589 freeblks->fb_ref = 0; 6590 freeblks->fb_cgwait = 0; 6591 freeblks->fb_state = ATTACHED; 6592 freeblks->fb_uid = ip->i_uid; 6593 freeblks->fb_inum = ip->i_number; 6594 
freeblks->fb_vtype = ITOV(ip)->v_type;
6595 freeblks->fb_modrev = DIP(ip, i_modrev);
6596 freeblks->fb_devvp = ITODEVVP(ip);
6597 freeblks->fb_chkcnt = 0;
6598 freeblks->fb_len = 0;
6599 
6600 return (freeblks);
6601 }
6602 
6603 static void
6604 trunc_indirdep(indirdep, freeblks, bp, off)
6605 struct indirdep *indirdep;
6606 struct freeblks *freeblks;
6607 struct buf *bp;
6608 int off;
6609 {
6610 struct allocindir *aip, *aipn;
6611 
6612 /*
6613 * The first set of allocindirs won't be in savedbp.
6614 */
6615 LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
6616 if (aip->ai_offset > off)
6617 cancel_allocindir(aip, bp, freeblks, 1);
6618 LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
6619 if (aip->ai_offset > off)
6620 cancel_allocindir(aip, bp, freeblks, 1);
6621 /*
6622 * These will exist in savedbp.
6623 */
6624 LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
6625 if (aip->ai_offset > off)
6626 cancel_allocindir(aip, NULL, freeblks, 0);
6627 LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
6628 if (aip->ai_offset > off)
6629 cancel_allocindir(aip, NULL, freeblks, 0);
6630 }
6631 
6632 /*
6633 * Follow the chain of indirects down to lastlbn creating a freework
6634 * structure for each. This will be used to start indir_trunc() at
6635 * the right offset and create the journal records for the partial
6636 * truncation. A second step will handle the truncated dependencies.
6637 */
6638 static int
6639 setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
6640 struct freeblks *freeblks;
6641 struct inode *ip;
6642 ufs_lbn_t lbn;
6643 ufs_lbn_t lastlbn;
6644 ufs2_daddr_t blkno;
6645 {
6646 struct indirdep *indirdep;
6647 struct indirdep *indirn;
6648 struct freework *freework;
6649 struct newblk *newblk;
6650 struct mount *mp;
6651 struct ufsmount *ump;
6652 struct buf *bp;
6653 uint8_t *start;
6654 uint8_t *end;
6655 ufs_lbn_t lbnadd;
6656 int level;
6657 int error;
6658 int off;
6659 
6660 freework = NULL;
6661 if (blkno == 0)
6662 return (0);
6663 mp = freeblks->fb_list.wk_mp;
6664 ump = VFSTOUFS(mp);
6665 /*
6666 * Here, calls to VOP_BMAP() will fail. However, we already have
6667 * the on-disk address, so we just pass it to bread() instead of
6668 * having bread() attempt to calculate it using VOP_BMAP().
6669 */
6670 error = ffs_breadz(ump, ITOV(ip), lbn, blkptrtodb(ump, blkno),
6671 (int)mp->mnt_stat.f_iosize, NULL, NULL, 0, NOCRED, 0, NULL, &bp);
6672 if (error)
6673 return (error);
6674 level = lbn_level(lbn);
6675 lbnadd = lbn_offset(ump->um_fs, level);
6676 /*
6677 * Compute the offset of the last block we want to keep. Store
6678 * in the freework the first block we want to completely free.
6679 */
6680 off = (lastlbn - -(lbn + level)) / lbnadd;
6681 if (off + 1 == NINDIR(ump->um_fs))
6682 goto nowork;
6683 freework = newfreework(ump, freeblks, NULL, lbn, blkno, 0, off + 1, 0);
6684 /*
6685 * Link the freework into the indirdep. This will prevent any new
6686 * allocations from proceeding until we are finished with the
6687 * truncate and the block is written.
6688 */
6689 ACQUIRE_LOCK(ump);
6690 indirdep = indirdep_lookup(mp, ip, bp);
6691 if (indirdep->ir_freeblks)
6692 panic("setup_trunc_indir: indirdep already truncated.");
6693 TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
6694 freework->fw_indir = indirdep;
6695 /*
6696 * Cancel any allocindirs that will not make it to disk.
6697 * We have to do this for all copies of the indirdep that
6698 * live on this newblk.
6699 */ 6700 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 6701 if (newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0, 6702 &newblk) == 0) 6703 panic("setup_trunc_indir: lost block"); 6704 LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next) 6705 trunc_indirdep(indirn, freeblks, bp, off); 6706 } else 6707 trunc_indirdep(indirdep, freeblks, bp, off); 6708 FREE_LOCK(ump); 6709 /* 6710 * Creation is protected by the buf lock. The saveddata is only 6711 * needed if a full truncation follows a partial truncation but it 6712 * is difficult to allocate in that case so we fetch it anyway. 6713 */ 6714 if (indirdep->ir_saveddata == NULL) 6715 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, 6716 M_SOFTDEP_FLAGS); 6717 nowork: 6718 /* Fetch the blkno of the child and the zero start offset. */ 6719 if (I_IS_UFS1(ip)) { 6720 blkno = ((ufs1_daddr_t *)bp->b_data)[off]; 6721 start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1]; 6722 } else { 6723 blkno = ((ufs2_daddr_t *)bp->b_data)[off]; 6724 start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1]; 6725 } 6726 if (freework) { 6727 /* Zero the truncated pointers. */ 6728 end = bp->b_data + bp->b_bcount; 6729 bzero(start, end - start); 6730 bdwrite(bp); 6731 } else 6732 bqrelse(bp); 6733 if (level == 0) 6734 return (0); 6735 lbn++; /* adjust level */ 6736 lbn -= (off * lbnadd); 6737 return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno); 6738 } 6739 6740 /* 6741 * Complete the partial truncation of an indirect block setup by 6742 * setup_trunc_indir(). This zeros the truncated pointers in the saved 6743 * copy and writes them to disk before the freeblks is allowed to complete. 6744 */ 6745 static void 6746 complete_trunc_indir(freework) 6747 struct freework *freework; 6748 { 6749 struct freework *fwn; 6750 struct indirdep *indirdep; 6751 struct ufsmount *ump; 6752 struct buf *bp; 6753 uintptr_t start; 6754 int count; 6755 6756 ump = VFSTOUFS(freework->fw_list.wk_mp); 6757 LOCK_OWNED(ump); 6758 indirdep = freework->fw_indir; 6759 for (;;) { 6760 bp = indirdep->ir_bp; 6761 /* See if the block was discarded. */ 6762 if (bp == NULL) 6763 break; 6764 /* Inline part of getdirtybuf(). We dont want bremfree. */ 6765 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) 6766 break; 6767 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 6768 LOCK_PTR(ump)) == 0) 6769 BUF_UNLOCK(bp); 6770 ACQUIRE_LOCK(ump); 6771 } 6772 freework->fw_state |= DEPCOMPLETE; 6773 TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next); 6774 /* 6775 * Zero the pointers in the saved copy. 6776 */ 6777 if (indirdep->ir_state & UFS1FMT) 6778 start = sizeof(ufs1_daddr_t); 6779 else 6780 start = sizeof(ufs2_daddr_t); 6781 start *= freework->fw_start; 6782 count = indirdep->ir_savebp->b_bcount - start; 6783 start += (uintptr_t)indirdep->ir_savebp->b_data; 6784 bzero((char *)start, count); 6785 /* 6786 * We need to start the next truncation in the list if it has not 6787 * been started yet. 6788 */ 6789 fwn = TAILQ_FIRST(&indirdep->ir_trunc); 6790 if (fwn != NULL) { 6791 if (fwn->fw_freeblks == indirdep->ir_freeblks) 6792 TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next); 6793 if ((fwn->fw_state & ONWORKLIST) == 0) 6794 freework_enqueue(fwn); 6795 } 6796 /* 6797 * If bp is NULL the block was fully truncated, restore 6798 * the saved block list otherwise free it if it is no 6799 * longer needed. 
6800 */ 6801 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 6802 if (bp == NULL) 6803 bcopy(indirdep->ir_saveddata, 6804 indirdep->ir_savebp->b_data, 6805 indirdep->ir_savebp->b_bcount); 6806 free(indirdep->ir_saveddata, M_INDIRDEP); 6807 indirdep->ir_saveddata = NULL; 6808 } 6809 /* 6810 * When bp is NULL there is a full truncation pending. We 6811 * must wait for this full truncation to be journaled before 6812 * we can release this freework because the disk pointers will 6813 * never be written as zero. 6814 */ 6815 if (bp == NULL) { 6816 if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd)) 6817 handle_written_freework(freework); 6818 else 6819 WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd, 6820 &freework->fw_list); 6821 if (fwn == NULL) { 6822 freework->fw_indir = (void *)0x0000deadbeef0000; 6823 bp = indirdep->ir_savebp; 6824 indirdep->ir_savebp = NULL; 6825 free_indirdep(indirdep); 6826 FREE_LOCK(ump); 6827 brelse(bp); 6828 ACQUIRE_LOCK(ump); 6829 } 6830 } else { 6831 /* Complete when the real copy is written. */ 6832 WORKLIST_INSERT(&bp->b_dep, &freework->fw_list); 6833 BUF_UNLOCK(bp); 6834 } 6835 } 6836 6837 /* 6838 * Calculate the number of blocks we are going to release where datablocks 6839 * is the current total and length is the new file size. 6840 */ 6841 static ufs2_daddr_t 6842 blkcount(fs, datablocks, length) 6843 struct fs *fs; 6844 ufs2_daddr_t datablocks; 6845 off_t length; 6846 { 6847 off_t totblks, numblks; 6848 6849 totblks = 0; 6850 numblks = howmany(length, fs->fs_bsize); 6851 if (numblks <= UFS_NDADDR) { 6852 totblks = howmany(length, fs->fs_fsize); 6853 goto out; 6854 } 6855 totblks = blkstofrags(fs, numblks); 6856 numblks -= UFS_NDADDR; 6857 /* 6858 * Count all single, then double, then triple indirects required. 6859 * Subtracting one indirects worth of blocks for each pass 6860 * acknowledges one of each pointed to by the inode. 6861 */ 6862 for (;;) { 6863 totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs))); 6864 numblks -= NINDIR(fs); 6865 if (numblks <= 0) 6866 break; 6867 numblks = howmany(numblks, NINDIR(fs)); 6868 } 6869 out: 6870 totblks = fsbtodb(fs, totblks); 6871 /* 6872 * Handle sparse files. We can't reclaim more blocks than the inode 6873 * references. We will correct it later in handle_complete_freeblks() 6874 * when we know the real count. 6875 */ 6876 if (totblks > datablocks) 6877 return (0); 6878 return (datablocks - totblks); 6879 } 6880 6881 /* 6882 * Handle freeblocks for journaled softupdate filesystems. 6883 * 6884 * Contrary to normal softupdates, we must preserve the block pointers in 6885 * indirects until their subordinates are free. This is to avoid journaling 6886 * every block that is freed which may consume more space than the journal 6887 * itself. The recovery program will see the free block journals at the 6888 * base of the truncated area and traverse them to reclaim space. The 6889 * pointers in the inode may be cleared immediately after the journal 6890 * records are written because each direct and indirect pointer in the 6891 * inode is recorded in a journal. This permits full truncation to proceed 6892 * asynchronously. The write order is journal -> inode -> cgs -> indirects. 6893 * 6894 * The algorithm is as follows: 6895 * 1) Traverse the in-memory state and create journal entries to release 6896 * the relevant blocks and full indirect trees. 6897 * 2) Traverse the indirect block chain adding partial truncation freework 6898 * records to indirects in the path to lastlbn. 
The freework will 6899 * prevent new allocation dependencies from being satisfied in this 6900 * indirect until the truncation completes. 6901 * 3) Read and lock the inode block, performing an update with the new size 6902 * and pointers. This prevents truncated data from becoming valid on 6903 * disk through step 4. 6904 * 4) Reap unsatisfied dependencies that are beyond the truncated area, 6905 * eliminate journal work for those records that do not require it. 6906 * 5) Schedule the journal records to be written followed by the inode block. 6907 * 6) Allocate any necessary frags for the end of file. 6908 * 7) Zero any partially truncated blocks. 6909 * 6910 * From this truncation proceeds asynchronously using the freework and 6911 * indir_trunc machinery. The file will not be extended again into a 6912 * partially truncated indirect block until all work is completed but 6913 * the normal dependency mechanism ensures that it is rolled back/forward 6914 * as appropriate. Further truncation may occur without delay and is 6915 * serialized in indir_trunc(). 6916 */ 6917 void 6918 softdep_journal_freeblocks(ip, cred, length, flags) 6919 struct inode *ip; /* The inode whose length is to be reduced */ 6920 struct ucred *cred; 6921 off_t length; /* The new length for the file */ 6922 int flags; /* IO_EXT and/or IO_NORMAL */ 6923 { 6924 struct freeblks *freeblks, *fbn; 6925 struct worklist *wk, *wkn; 6926 struct inodedep *inodedep; 6927 struct jblkdep *jblkdep; 6928 struct allocdirect *adp, *adpn; 6929 struct ufsmount *ump; 6930 struct fs *fs; 6931 struct buf *bp; 6932 struct vnode *vp; 6933 struct mount *mp; 6934 daddr_t dbn; 6935 ufs2_daddr_t extblocks, datablocks; 6936 ufs_lbn_t tmpval, lbn, lastlbn; 6937 int frags, lastoff, iboff, allocblock, needj, error, i; 6938 6939 ump = ITOUMP(ip); 6940 mp = UFSTOVFS(ump); 6941 fs = ump->um_fs; 6942 KASSERT(MOUNTEDSOFTDEP(mp) != 0, 6943 ("softdep_journal_freeblocks called on non-softdep filesystem")); 6944 vp = ITOV(ip); 6945 needj = 1; 6946 iboff = -1; 6947 allocblock = 0; 6948 extblocks = 0; 6949 datablocks = 0; 6950 frags = 0; 6951 freeblks = newfreeblks(mp, ip); 6952 ACQUIRE_LOCK(ump); 6953 /* 6954 * If we're truncating a removed file that will never be written 6955 * we don't need to journal the block frees. The canceled journals 6956 * for the allocations will suffice. 6957 */ 6958 inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 6959 if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED && 6960 length == 0) 6961 needj = 0; 6962 CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d", 6963 ip->i_number, length, needj); 6964 FREE_LOCK(ump); 6965 /* 6966 * Calculate the lbn that we are truncating to. This results in -1 6967 * if we're truncating the 0 bytes. So it is the last lbn we want 6968 * to keep, not the first lbn we want to truncate. 6969 */ 6970 lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1; 6971 lastoff = blkoff(fs, length); 6972 /* 6973 * Compute frags we are keeping in lastlbn. 0 means all. 6974 */ 6975 if (lastlbn >= 0 && lastlbn < UFS_NDADDR) { 6976 frags = fragroundup(fs, lastoff); 6977 /* adp offset of last valid allocdirect. */ 6978 iboff = lastlbn; 6979 } else if (lastlbn > 0) 6980 iboff = UFS_NDADDR; 6981 if (fs->fs_magic == FS_UFS2_MAGIC) 6982 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); 6983 /* 6984 * Handle normal data blocks and indirects. This section saves 6985 * values used after the inode update to complete frag and indirect 6986 * truncation. 
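 *
 * Worked example with assumed sizes (32K blocks, 4K fragments): a
 * truncation to length 100000 gives lastlbn = 3, lastoff = 1696 and
 * frags = fragroundup(fs, 1696) = 4096, so direct blocks 0-2 are kept
 * whole, block 3 is cut back to a single fragment, and everything from
 * block 4 on, including the indirect trees, is released. A truncation
 * to length 0 gives lastlbn = -1, so no blocks are kept.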
6987 */ 6988 if ((flags & IO_NORMAL) != 0) { 6989 /* 6990 * Handle truncation of whole direct and indirect blocks. 6991 */ 6992 for (i = iboff + 1; i < UFS_NDADDR; i++) 6993 setup_freedirect(freeblks, ip, i, needj); 6994 for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR; 6995 i < UFS_NIADDR; 6996 i++, lbn += tmpval, tmpval *= NINDIR(fs)) { 6997 /* Release a whole indirect tree. */ 6998 if (lbn > lastlbn) { 6999 setup_freeindir(freeblks, ip, i, -lbn -i, 7000 needj); 7001 continue; 7002 } 7003 iboff = i + UFS_NDADDR; 7004 /* 7005 * Traverse partially truncated indirect tree. 7006 */ 7007 if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn) 7008 setup_trunc_indir(freeblks, ip, -lbn - i, 7009 lastlbn, DIP(ip, i_ib[i])); 7010 } 7011 /* 7012 * Handle partial truncation to a frag boundary. 7013 */ 7014 if (frags) { 7015 ufs2_daddr_t blkno; 7016 long oldfrags; 7017 7018 oldfrags = blksize(fs, ip, lastlbn); 7019 blkno = DIP(ip, i_db[lastlbn]); 7020 if (blkno && oldfrags != frags) { 7021 oldfrags -= frags; 7022 oldfrags = numfrags(fs, oldfrags); 7023 blkno += numfrags(fs, frags); 7024 newfreework(ump, freeblks, NULL, lastlbn, 7025 blkno, oldfrags, 0, needj); 7026 if (needj) 7027 adjust_newfreework(freeblks, 7028 numfrags(fs, frags)); 7029 } else if (blkno == 0) 7030 allocblock = 1; 7031 } 7032 /* 7033 * Add a journal record for partial truncate if we are 7034 * handling indirect blocks. Non-indirects need no extra 7035 * journaling. 7036 */ 7037 if (length != 0 && lastlbn >= UFS_NDADDR) { 7038 UFS_INODE_SET_FLAG(ip, IN_TRUNCATED); 7039 newjtrunc(freeblks, length, 0); 7040 } 7041 ip->i_size = length; 7042 DIP_SET(ip, i_size, ip->i_size); 7043 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); 7044 datablocks = DIP(ip, i_blocks) - extblocks; 7045 if (length != 0) 7046 datablocks = blkcount(fs, datablocks, length); 7047 freeblks->fb_len = length; 7048 } 7049 if ((flags & IO_EXT) != 0) { 7050 for (i = 0; i < UFS_NXADDR; i++) 7051 setup_freeext(freeblks, ip, i, needj); 7052 ip->i_din2->di_extsize = 0; 7053 datablocks += extblocks; 7054 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); 7055 } 7056 #ifdef QUOTA 7057 /* Reference the quotas in case the block count is wrong in the end. */ 7058 quotaref(vp, freeblks->fb_quota); 7059 (void) chkdq(ip, -datablocks, NOCRED, FORCE); 7060 #endif 7061 freeblks->fb_chkcnt = -datablocks; 7062 UFS_LOCK(ump); 7063 fs->fs_pendingblocks += datablocks; 7064 UFS_UNLOCK(ump); 7065 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); 7066 /* 7067 * Handle truncation of incomplete alloc direct dependencies. We 7068 * hold the inode block locked to prevent incomplete dependencies 7069 * from reaching the disk while we are eliminating those that 7070 * have been truncated. This is a partially inlined ffs_update(). 
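 * The buffer is released with bdwrite() further below, once the stale
 * allocdirect and bufwait dependencies have been cancelled.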
7071 */ 7072 ufs_itimes(vp); 7073 ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); 7074 dbn = fsbtodb(fs, ino_to_fsba(fs, ip->i_number)); 7075 error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize, 7076 NULL, NULL, 0, cred, 0, NULL, &bp); 7077 if (error) { 7078 softdep_error("softdep_journal_freeblocks", error); 7079 return; 7080 } 7081 if (bp->b_bufsize == fs->fs_bsize) 7082 bp->b_flags |= B_CLUSTEROK; 7083 softdep_update_inodeblock(ip, bp, 0); 7084 if (ump->um_fstype == UFS1) { 7085 *((struct ufs1_dinode *)bp->b_data + 7086 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; 7087 } else { 7088 ffs_update_dinode_ckhash(fs, ip->i_din2); 7089 *((struct ufs2_dinode *)bp->b_data + 7090 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; 7091 } 7092 ACQUIRE_LOCK(ump); 7093 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 7094 if ((inodedep->id_state & IOSTARTED) != 0) 7095 panic("softdep_setup_freeblocks: inode busy"); 7096 /* 7097 * Add the freeblks structure to the list of operations that 7098 * must await the zero'ed inode being written to disk. If we 7099 * still have a bitmap dependency (needj), then the inode 7100 * has never been written to disk, so we can process the 7101 * freeblks below once we have deleted the dependencies. 7102 */ 7103 if (needj) 7104 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); 7105 else 7106 freeblks->fb_state |= COMPLETE; 7107 if ((flags & IO_NORMAL) != 0) { 7108 TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) { 7109 if (adp->ad_offset > iboff) 7110 cancel_allocdirect(&inodedep->id_inoupdt, adp, 7111 freeblks); 7112 /* 7113 * Truncate the allocdirect. We could eliminate 7114 * or modify journal records as well. 7115 */ 7116 else if (adp->ad_offset == iboff && frags) 7117 adp->ad_newsize = frags; 7118 } 7119 } 7120 if ((flags & IO_EXT) != 0) 7121 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) 7122 cancel_allocdirect(&inodedep->id_extupdt, adp, 7123 freeblks); 7124 /* 7125 * Scan the bufwait list for newblock dependencies that will never 7126 * make it to disk. 7127 */ 7128 LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) { 7129 if (wk->wk_type != D_ALLOCDIRECT) 7130 continue; 7131 adp = WK_ALLOCDIRECT(wk); 7132 if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) || 7133 ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) { 7134 cancel_jfreeblk(freeblks, adp->ad_newblkno); 7135 cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork); 7136 WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); 7137 } 7138 } 7139 /* 7140 * Add journal work. 7141 */ 7142 LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) 7143 add_to_journal(&jblkdep->jb_list); 7144 FREE_LOCK(ump); 7145 bdwrite(bp); 7146 /* 7147 * Truncate dependency structures beyond length. 7148 */ 7149 trunc_dependencies(ip, freeblks, lastlbn, frags, flags); 7150 /* 7151 * This is only set when we need to allocate a fragment because 7152 * none existed at the end of a frag-sized file. It handles only 7153 * allocating a new, zero filled block. 
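 * (allocblock was set in the IO_NORMAL section above when the last
 * retained lbn still needs frags but has no block allocated at it,
 * i.e. the new end of file lands in a hole; UFS_BALLOC() below backs
 * that hole with a zero-filled fragment.)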
7154 */ 7155 if (allocblock) { 7156 ip->i_size = length - lastoff; 7157 DIP_SET(ip, i_size, ip->i_size); 7158 error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp); 7159 if (error != 0) { 7160 softdep_error("softdep_journal_freeblks", error); 7161 return; 7162 } 7163 ip->i_size = length; 7164 DIP_SET(ip, i_size, length); 7165 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); 7166 allocbuf(bp, frags); 7167 ffs_update(vp, 0); 7168 bawrite(bp); 7169 } else if (lastoff != 0 && vp->v_type != VDIR) { 7170 int size; 7171 7172 /* 7173 * Zero the end of a truncated frag or block. 7174 */ 7175 size = sblksize(fs, length, lastlbn); 7176 error = bread(vp, lastlbn, size, cred, &bp); 7177 if (error == 0) { 7178 bzero((char *)bp->b_data + lastoff, size - lastoff); 7179 bawrite(bp); 7180 } else if (!ffs_fsfail_cleanup(ump, error)) { 7181 softdep_error("softdep_journal_freeblks", error); 7182 return; 7183 } 7184 } 7185 ACQUIRE_LOCK(ump); 7186 inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 7187 TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next); 7188 freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST; 7189 /* 7190 * We zero earlier truncations so they don't erroneously 7191 * update i_blocks. 7192 */ 7193 if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0) 7194 TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next) 7195 fbn->fb_len = 0; 7196 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE && 7197 LIST_EMPTY(&freeblks->fb_jblkdephd)) 7198 freeblks->fb_state |= INPROGRESS; 7199 else 7200 freeblks = NULL; 7201 FREE_LOCK(ump); 7202 if (freeblks) 7203 handle_workitem_freeblocks(freeblks, 0); 7204 trunc_pages(ip, length, extblocks, flags); 7205 7206 } 7207 7208 /* 7209 * Flush a JOP_SYNC to the journal. 7210 */ 7211 void 7212 softdep_journal_fsync(ip) 7213 struct inode *ip; 7214 { 7215 struct jfsync *jfsync; 7216 struct ufsmount *ump; 7217 7218 ump = ITOUMP(ip); 7219 KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, 7220 ("softdep_journal_fsync called on non-softdep filesystem")); 7221 if ((ip->i_flag & IN_TRUNCATED) == 0) 7222 return; 7223 ip->i_flag &= ~IN_TRUNCATED; 7224 jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO); 7225 workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump)); 7226 jfsync->jfs_size = ip->i_size; 7227 jfsync->jfs_ino = ip->i_number; 7228 ACQUIRE_LOCK(ump); 7229 add_to_journal(&jfsync->jfs_list); 7230 jwait(&jfsync->jfs_list, MNT_WAIT); 7231 FREE_LOCK(ump); 7232 } 7233 7234 /* 7235 * Block de-allocation dependencies. 7236 * 7237 * When blocks are de-allocated, the on-disk pointers must be nullified before 7238 * the blocks are made available for use by other files. (The true 7239 * requirement is that old pointers must be nullified before new on-disk 7240 * pointers are set. We chose this slightly more stringent requirement to 7241 * reduce complexity.) Our implementation handles this dependency by updating 7242 * the inode (or indirect block) appropriately but delaying the actual block 7243 * de-allocation (i.e., freemap and free space count manipulation) until 7244 * after the updated versions reach stable storage. After the disk is 7245 * updated, the blocks can be safely de-allocated whenever it is convenient. 7246 * This implementation handles only the common case of reducing a file's 7247 * length to zero. Other cases are handled by the conventional synchronous 7248 * write approach. 
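 *
 * Schematically, for the truncate-to-zero case the enforced ordering is
 *
 *	clear the in-memory pointers and size
 *	    -> write the zero'ed inode to stable storage
 *	        -> only then mark the blocks free in the cylinder group maps
 *
 * so an ill-timed crash can at worst leak blocks that fsck can later
 * reclaim; it can never leave the inode pointing at freed storage.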
7249 * 7250 * The ffs implementation with which we worked double-checks 7251 * the state of the block pointers and file size as it reduces 7252 * a file's length. Some of this code is replicated here in our 7253 * soft updates implementation. The freeblks->fb_chkcnt field is 7254 * used to transfer a part of this information to the procedure 7255 * that eventually de-allocates the blocks. 7256 * 7257 * This routine should be called from the routine that shortens 7258 * a file's length, before the inode's size or block pointers 7259 * are modified. It will save the block pointer information for 7260 * later release and zero the inode so that the calling routine 7261 * can release it. 7262 */ 7263 void 7264 softdep_setup_freeblocks(ip, length, flags) 7265 struct inode *ip; /* The inode whose length is to be reduced */ 7266 off_t length; /* The new length for the file */ 7267 int flags; /* IO_EXT and/or IO_NORMAL */ 7268 { 7269 struct ufs1_dinode *dp1; 7270 struct ufs2_dinode *dp2; 7271 struct freeblks *freeblks; 7272 struct inodedep *inodedep; 7273 struct allocdirect *adp; 7274 struct ufsmount *ump; 7275 struct buf *bp; 7276 struct fs *fs; 7277 ufs2_daddr_t extblocks, datablocks; 7278 struct mount *mp; 7279 int i, delay, error; 7280 ufs_lbn_t tmpval; 7281 ufs_lbn_t lbn; 7282 7283 ump = ITOUMP(ip); 7284 mp = UFSTOVFS(ump); 7285 KASSERT(MOUNTEDSOFTDEP(mp) != 0, 7286 ("softdep_setup_freeblocks called on non-softdep filesystem")); 7287 CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld", 7288 ip->i_number, length); 7289 KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length")); 7290 fs = ump->um_fs; 7291 if ((error = bread(ump->um_devvp, 7292 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 7293 (int)fs->fs_bsize, NOCRED, &bp)) != 0) { 7294 if (!ffs_fsfail_cleanup(ump, error)) 7295 softdep_error("softdep_setup_freeblocks", error); 7296 return; 7297 } 7298 freeblks = newfreeblks(mp, ip); 7299 extblocks = 0; 7300 datablocks = 0; 7301 if (fs->fs_magic == FS_UFS2_MAGIC) 7302 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); 7303 if ((flags & IO_NORMAL) != 0) { 7304 for (i = 0; i < UFS_NDADDR; i++) 7305 setup_freedirect(freeblks, ip, i, 0); 7306 for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR; 7307 i < UFS_NIADDR; 7308 i++, lbn += tmpval, tmpval *= NINDIR(fs)) 7309 setup_freeindir(freeblks, ip, i, -lbn -i, 0); 7310 ip->i_size = 0; 7311 DIP_SET(ip, i_size, 0); 7312 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); 7313 datablocks = DIP(ip, i_blocks) - extblocks; 7314 } 7315 if ((flags & IO_EXT) != 0) { 7316 for (i = 0; i < UFS_NXADDR; i++) 7317 setup_freeext(freeblks, ip, i, 0); 7318 ip->i_din2->di_extsize = 0; 7319 datablocks += extblocks; 7320 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); 7321 } 7322 #ifdef QUOTA 7323 /* Reference the quotas in case the block count is wrong in the end. */ 7324 quotaref(ITOV(ip), freeblks->fb_quota); 7325 (void) chkdq(ip, -datablocks, NOCRED, FORCE); 7326 #endif 7327 freeblks->fb_chkcnt = -datablocks; 7328 UFS_LOCK(ump); 7329 fs->fs_pendingblocks += datablocks; 7330 UFS_UNLOCK(ump); 7331 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); 7332 /* 7333 * Push the zero'ed inode to its disk buffer so that we are free 7334 * to delete its dependencies below. Once the dependencies are gone 7335 * the buffer can be safely released. 
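 * Note that di_freelink is first copied back from the buffer so the
 * on-disk unlinked-inode list linkage is not clobbered by the copy.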
7336 */ 7337 if (ump->um_fstype == UFS1) { 7338 dp1 = ((struct ufs1_dinode *)bp->b_data + 7339 ino_to_fsbo(fs, ip->i_number)); 7340 ip->i_din1->di_freelink = dp1->di_freelink; 7341 *dp1 = *ip->i_din1; 7342 } else { 7343 dp2 = ((struct ufs2_dinode *)bp->b_data + 7344 ino_to_fsbo(fs, ip->i_number)); 7345 ip->i_din2->di_freelink = dp2->di_freelink; 7346 ffs_update_dinode_ckhash(fs, ip->i_din2); 7347 *dp2 = *ip->i_din2; 7348 } 7349 /* 7350 * Find and eliminate any inode dependencies. 7351 */ 7352 ACQUIRE_LOCK(ump); 7353 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 7354 if ((inodedep->id_state & IOSTARTED) != 0) 7355 panic("softdep_setup_freeblocks: inode busy"); 7356 /* 7357 * Add the freeblks structure to the list of operations that 7358 * must await the zero'ed inode being written to disk. If we 7359 * still have a bitmap dependency (delay == 0), then the inode 7360 * has never been written to disk, so we can process the 7361 * freeblks below once we have deleted the dependencies. 7362 */ 7363 delay = (inodedep->id_state & DEPCOMPLETE); 7364 if (delay) 7365 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); 7366 else 7367 freeblks->fb_state |= COMPLETE; 7368 /* 7369 * Because the file length has been truncated to zero, any 7370 * pending block allocation dependency structures associated 7371 * with this inode are obsolete and can simply be de-allocated. 7372 * We must first merge the two dependency lists to get rid of 7373 * any duplicate freefrag structures, then purge the merged list. 7374 * If we still have a bitmap dependency, then the inode has never 7375 * been written to disk, so we can free any fragments without delay. 7376 */ 7377 if (flags & IO_NORMAL) { 7378 merge_inode_lists(&inodedep->id_newinoupdt, 7379 &inodedep->id_inoupdt); 7380 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) 7381 cancel_allocdirect(&inodedep->id_inoupdt, adp, 7382 freeblks); 7383 } 7384 if (flags & IO_EXT) { 7385 merge_inode_lists(&inodedep->id_newextupdt, 7386 &inodedep->id_extupdt); 7387 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) 7388 cancel_allocdirect(&inodedep->id_extupdt, adp, 7389 freeblks); 7390 } 7391 FREE_LOCK(ump); 7392 bdwrite(bp); 7393 trunc_dependencies(ip, freeblks, -1, 0, flags); 7394 ACQUIRE_LOCK(ump); 7395 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 7396 (void) free_inodedep(inodedep); 7397 freeblks->fb_state |= DEPCOMPLETE; 7398 /* 7399 * If the inode with zeroed block pointers is now on disk 7400 * we can start freeing blocks. 7401 */ 7402 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 7403 freeblks->fb_state |= INPROGRESS; 7404 else 7405 freeblks = NULL; 7406 FREE_LOCK(ump); 7407 if (freeblks) 7408 handle_workitem_freeblocks(freeblks, 0); 7409 trunc_pages(ip, length, extblocks, flags); 7410 } 7411 7412 /* 7413 * Eliminate pages from the page cache that back parts of this inode and 7414 * adjust the vnode pager's idea of our size. This prevents stale data 7415 * from hanging around in the page cache. 
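 * Data pages are trimmed via vnode_pager_setsize(); the pages that
 * back indirect blocks live at negative logical offsets and are
 * removed explicitly with vn_pages_remove() below.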
7416 */ 7417 static void 7418 trunc_pages(ip, length, extblocks, flags) 7419 struct inode *ip; 7420 off_t length; 7421 ufs2_daddr_t extblocks; 7422 int flags; 7423 { 7424 struct vnode *vp; 7425 struct fs *fs; 7426 ufs_lbn_t lbn; 7427 off_t end, extend; 7428 7429 vp = ITOV(ip); 7430 fs = ITOFS(ip); 7431 extend = OFF_TO_IDX(lblktosize(fs, -extblocks)); 7432 if ((flags & IO_EXT) != 0) 7433 vn_pages_remove(vp, extend, 0); 7434 if ((flags & IO_NORMAL) == 0) 7435 return; 7436 BO_LOCK(&vp->v_bufobj); 7437 drain_output(vp); 7438 BO_UNLOCK(&vp->v_bufobj); 7439 /* 7440 * The vnode pager eliminates file pages we eliminate indirects 7441 * below. 7442 */ 7443 vnode_pager_setsize(vp, length); 7444 /* 7445 * Calculate the end based on the last indirect we want to keep. If 7446 * the block extends into indirects we can just use the negative of 7447 * its lbn. Doubles and triples exist at lower numbers so we must 7448 * be careful not to remove those, if they exist. double and triple 7449 * indirect lbns do not overlap with others so it is not important 7450 * to verify how many levels are required. 7451 */ 7452 lbn = lblkno(fs, length); 7453 if (lbn >= UFS_NDADDR) { 7454 /* Calculate the virtual lbn of the triple indirect. */ 7455 lbn = -lbn - (UFS_NIADDR - 1); 7456 end = OFF_TO_IDX(lblktosize(fs, lbn)); 7457 } else 7458 end = extend; 7459 vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end); 7460 } 7461 7462 /* 7463 * See if the buf bp is in the range eliminated by truncation. 7464 */ 7465 static int 7466 trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags) 7467 struct buf *bp; 7468 int *blkoffp; 7469 ufs_lbn_t lastlbn; 7470 int lastoff; 7471 int flags; 7472 { 7473 ufs_lbn_t lbn; 7474 7475 *blkoffp = 0; 7476 /* Only match ext/normal blocks as appropriate. */ 7477 if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || 7478 ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0)) 7479 return (0); 7480 /* ALTDATA is always a full truncation. */ 7481 if ((bp->b_xflags & BX_ALTDATA) != 0) 7482 return (1); 7483 /* -1 is full truncation. */ 7484 if (lastlbn == -1) 7485 return (1); 7486 /* 7487 * If this is a partial truncate we only want those 7488 * blocks and indirect blocks that cover the range 7489 * we're after. 7490 */ 7491 lbn = bp->b_lblkno; 7492 if (lbn < 0) 7493 lbn = -(lbn + lbn_level(lbn)); 7494 if (lbn < lastlbn) 7495 return (0); 7496 /* Here we only truncate lblkno if it's partial. */ 7497 if (lbn == lastlbn) { 7498 if (lastoff == 0) 7499 return (0); 7500 *blkoffp = lastoff; 7501 } 7502 return (1); 7503 } 7504 7505 /* 7506 * Eliminate any dependencies that exist in memory beyond lblkno:off 7507 */ 7508 static void 7509 trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags) 7510 struct inode *ip; 7511 struct freeblks *freeblks; 7512 ufs_lbn_t lastlbn; 7513 int lastoff; 7514 int flags; 7515 { 7516 struct bufobj *bo; 7517 struct vnode *vp; 7518 struct buf *bp; 7519 int blkoff; 7520 7521 /* 7522 * We must wait for any I/O in progress to finish so that 7523 * all potential buffers on the dirty list will be visible. 7524 * Once they are all there, walk the list and get rid of 7525 * any dependencies. 
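 * Bufs are flagged with BV_SCANNED (or pulled off the list) as they
 * are handled, and the walk restarts from the head whenever the
 * bufobj lock has been dropped, so nothing is missed.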
7526 */ 7527 vp = ITOV(ip); 7528 bo = &vp->v_bufobj; 7529 BO_LOCK(bo); 7530 drain_output(vp); 7531 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) 7532 bp->b_vflags &= ~BV_SCANNED; 7533 restart: 7534 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { 7535 if (bp->b_vflags & BV_SCANNED) 7536 continue; 7537 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { 7538 bp->b_vflags |= BV_SCANNED; 7539 continue; 7540 } 7541 KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer")); 7542 if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL) 7543 goto restart; 7544 BO_UNLOCK(bo); 7545 if (deallocate_dependencies(bp, freeblks, blkoff)) 7546 bqrelse(bp); 7547 else 7548 brelse(bp); 7549 BO_LOCK(bo); 7550 goto restart; 7551 } 7552 /* 7553 * Now do the work of vtruncbuf while also matching indirect blocks. 7554 */ 7555 TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) 7556 bp->b_vflags &= ~BV_SCANNED; 7557 cleanrestart: 7558 TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) { 7559 if (bp->b_vflags & BV_SCANNED) 7560 continue; 7561 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { 7562 bp->b_vflags |= BV_SCANNED; 7563 continue; 7564 } 7565 if (BUF_LOCK(bp, 7566 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 7567 BO_LOCKPTR(bo)) == ENOLCK) { 7568 BO_LOCK(bo); 7569 goto cleanrestart; 7570 } 7571 BO_LOCK(bo); 7572 bp->b_vflags |= BV_SCANNED; 7573 BO_UNLOCK(bo); 7574 bremfree(bp); 7575 if (blkoff != 0) { 7576 allocbuf(bp, blkoff); 7577 bqrelse(bp); 7578 } else { 7579 bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF; 7580 brelse(bp); 7581 } 7582 BO_LOCK(bo); 7583 goto cleanrestart; 7584 } 7585 drain_output(vp); 7586 BO_UNLOCK(bo); 7587 } 7588 7589 static int 7590 cancel_pagedep(pagedep, freeblks, blkoff) 7591 struct pagedep *pagedep; 7592 struct freeblks *freeblks; 7593 int blkoff; 7594 { 7595 struct jremref *jremref; 7596 struct jmvref *jmvref; 7597 struct dirrem *dirrem, *tmp; 7598 int i; 7599 7600 /* 7601 * Copy any directory remove dependencies to the list 7602 * to be processed after the freeblks proceeds. If 7603 * directory entry never made it to disk they 7604 * can be dumped directly onto the work list. 7605 */ 7606 LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) { 7607 /* Skip this directory removal if it is intended to remain. */ 7608 if (dirrem->dm_offset < blkoff) 7609 continue; 7610 /* 7611 * If there are any dirrems we wait for the journal write 7612 * to complete and then restart the buf scan as the lock 7613 * has been dropped. 7614 */ 7615 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) { 7616 jwait(&jremref->jr_list, MNT_WAIT); 7617 return (ERESTART); 7618 } 7619 LIST_REMOVE(dirrem, dm_next); 7620 dirrem->dm_dirinum = pagedep->pd_ino; 7621 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list); 7622 } 7623 while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) { 7624 jwait(&jmvref->jm_list, MNT_WAIT); 7625 return (ERESTART); 7626 } 7627 /* 7628 * When we're partially truncating a pagedep we just want to flush 7629 * journal entries and return. There can not be any adds in the 7630 * truncated portion of the directory and newblk must remain if 7631 * part of the block remains. 
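 * The loops below only assert that invariant; a diradd found beyond
 * the truncation point indicates a bug and panics.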
7632 */ 7633 if (blkoff != 0) { 7634 struct diradd *dap; 7635 7636 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 7637 if (dap->da_offset > blkoff) 7638 panic("cancel_pagedep: diradd %p off %d > %d", 7639 dap, dap->da_offset, blkoff); 7640 for (i = 0; i < DAHASHSZ; i++) 7641 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) 7642 if (dap->da_offset > blkoff) 7643 panic("cancel_pagedep: diradd %p off %d > %d", 7644 dap, dap->da_offset, blkoff); 7645 return (0); 7646 } 7647 /* 7648 * There should be no directory add dependencies present 7649 * as the directory could not be truncated until all 7650 * children were removed. 7651 */ 7652 KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, 7653 ("deallocate_dependencies: pendinghd != NULL")); 7654 for (i = 0; i < DAHASHSZ; i++) 7655 KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, 7656 ("deallocate_dependencies: diraddhd != NULL")); 7657 if ((pagedep->pd_state & NEWBLOCK) != 0) 7658 free_newdirblk(pagedep->pd_newdirblk); 7659 if (free_pagedep(pagedep) == 0) 7660 panic("Failed to free pagedep %p", pagedep); 7661 return (0); 7662 } 7663 7664 /* 7665 * Reclaim any dependency structures from a buffer that is about to 7666 * be reallocated to a new vnode. The buffer must be locked, thus, 7667 * no I/O completion operations can occur while we are manipulating 7668 * its associated dependencies. The mutex is held so that other I/O's 7669 * associated with related dependencies do not occur. 7670 */ 7671 static int 7672 deallocate_dependencies(bp, freeblks, off) 7673 struct buf *bp; 7674 struct freeblks *freeblks; 7675 int off; 7676 { 7677 struct indirdep *indirdep; 7678 struct pagedep *pagedep; 7679 struct worklist *wk, *wkn; 7680 struct ufsmount *ump; 7681 7682 ump = softdep_bp_to_mp(bp); 7683 if (ump == NULL) 7684 goto done; 7685 ACQUIRE_LOCK(ump); 7686 LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) { 7687 switch (wk->wk_type) { 7688 case D_INDIRDEP: 7689 indirdep = WK_INDIRDEP(wk); 7690 if (bp->b_lblkno >= 0 || 7691 bp->b_blkno != indirdep->ir_savebp->b_lblkno) 7692 panic("deallocate_dependencies: not indir"); 7693 cancel_indirdep(indirdep, bp, freeblks); 7694 continue; 7695 7696 case D_PAGEDEP: 7697 pagedep = WK_PAGEDEP(wk); 7698 if (cancel_pagedep(pagedep, freeblks, off)) { 7699 FREE_LOCK(ump); 7700 return (ERESTART); 7701 } 7702 continue; 7703 7704 case D_ALLOCINDIR: 7705 /* 7706 * Simply remove the allocindir, we'll find it via 7707 * the indirdep where we can clear pointers if 7708 * needed. 7709 */ 7710 WORKLIST_REMOVE(wk); 7711 continue; 7712 7713 case D_FREEWORK: 7714 /* 7715 * A truncation is waiting for the zero'd pointers 7716 * to be written. It can be freed when the freeblks 7717 * is journaled. 7718 */ 7719 WORKLIST_REMOVE(wk); 7720 wk->wk_state |= ONDEPLIST; 7721 WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); 7722 break; 7723 7724 case D_ALLOCDIRECT: 7725 if (off != 0) 7726 continue; 7727 /* FALLTHROUGH */ 7728 default: 7729 panic("deallocate_dependencies: Unexpected type %s", 7730 TYPENAME(wk->wk_type)); 7731 /* NOTREACHED */ 7732 } 7733 } 7734 FREE_LOCK(ump); 7735 done: 7736 /* 7737 * Don't throw away this buf, we were partially truncating and 7738 * some deps may always remain. 7739 */ 7740 if (off) { 7741 allocbuf(bp, off); 7742 bp->b_vflags |= BV_SCANNED; 7743 return (EBUSY); 7744 } 7745 bp->b_flags |= B_INVAL | B_NOCACHE; 7746 7747 return (0); 7748 } 7749 7750 /* 7751 * An allocdirect is being canceled due to a truncate. 
We must make sure 7752 * the journal entry is released in concert with the blkfree that releases 7753 * the storage. Completed journal entries must not be released until the 7754 * space is no longer pointed to by the inode or in the bitmap. 7755 */ 7756 static void 7757 cancel_allocdirect(adphead, adp, freeblks) 7758 struct allocdirectlst *adphead; 7759 struct allocdirect *adp; 7760 struct freeblks *freeblks; 7761 { 7762 struct freework *freework; 7763 struct newblk *newblk; 7764 struct worklist *wk; 7765 7766 TAILQ_REMOVE(adphead, adp, ad_next); 7767 newblk = (struct newblk *)adp; 7768 freework = NULL; 7769 /* 7770 * Find the correct freework structure. 7771 */ 7772 LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { 7773 if (wk->wk_type != D_FREEWORK) 7774 continue; 7775 freework = WK_FREEWORK(wk); 7776 if (freework->fw_blkno == newblk->nb_newblkno) 7777 break; 7778 } 7779 if (freework == NULL) 7780 panic("cancel_allocdirect: Freework not found"); 7781 /* 7782 * If a newblk exists at all we still have the journal entry that 7783 * initiated the allocation so we do not need to journal the free. 7784 */ 7785 cancel_jfreeblk(freeblks, freework->fw_blkno); 7786 /* 7787 * If the journal hasn't been written the jnewblk must be passed 7788 * to the call to ffs_blkfree that reclaims the space. We accomplish 7789 * this by linking the journal dependency into the freework to be 7790 * freed when freework_freeblock() is called. If the journal has 7791 * been written we can simply reclaim the journal space when the 7792 * freeblks work is complete. 7793 */ 7794 freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list, 7795 &freeblks->fb_jwork); 7796 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); 7797 } 7798 7799 /* 7800 * Cancel a new block allocation. May be an indirect or direct block. We 7801 * remove it from various lists and return any journal record that needs to 7802 * be resolved by the caller. 7803 * 7804 * A special consideration is made for indirects which were never pointed 7805 * at on disk and will never be found once this block is released. 7806 */ 7807 static struct jnewblk * 7808 cancel_newblk(newblk, wk, wkhd) 7809 struct newblk *newblk; 7810 struct worklist *wk; 7811 struct workhead *wkhd; 7812 { 7813 struct jnewblk *jnewblk; 7814 7815 CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno); 7816 7817 newblk->nb_state |= GOINGAWAY; 7818 /* 7819 * Previously we traversed the completedhd on each indirdep 7820 * attached to this newblk to cancel them and gather journal 7821 * work. Since we need only the oldest journal segment and 7822 * the lowest point on the tree will always have the oldest 7823 * journal segment we are free to release the segments 7824 * of any subordinates and may leave the indirdep list to 7825 * indirdep_complete() when this newblk is freed. 7826 */ 7827 if (newblk->nb_state & ONDEPLIST) { 7828 newblk->nb_state &= ~ONDEPLIST; 7829 LIST_REMOVE(newblk, nb_deps); 7830 } 7831 if (newblk->nb_state & ONWORKLIST) 7832 WORKLIST_REMOVE(&newblk->nb_list); 7833 /* 7834 * If the journal entry hasn't been written we save a pointer to 7835 * the dependency that frees it until it is written or the 7836 * superseding operation completes. 
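 * When wk is NULL the jnewblk is left attached to the newblk and the
 * caller takes care of handing it off later (see cancel_allocindir()).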
7837 */ 7838 jnewblk = newblk->nb_jnewblk; 7839 if (jnewblk != NULL && wk != NULL) { 7840 newblk->nb_jnewblk = NULL; 7841 jnewblk->jn_dep = wk; 7842 } 7843 if (!LIST_EMPTY(&newblk->nb_jwork)) 7844 jwork_move(wkhd, &newblk->nb_jwork); 7845 /* 7846 * When truncating we must free the newdirblk early to remove 7847 * the pagedep from the hash before returning. 7848 */ 7849 if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) 7850 free_newdirblk(WK_NEWDIRBLK(wk)); 7851 if (!LIST_EMPTY(&newblk->nb_newdirblk)) 7852 panic("cancel_newblk: extra newdirblk"); 7853 7854 return (jnewblk); 7855 } 7856 7857 /* 7858 * Schedule the freefrag associated with a newblk to be released once 7859 * the pointers are written and the previous block is no longer needed. 7860 */ 7861 static void 7862 newblk_freefrag(newblk) 7863 struct newblk *newblk; 7864 { 7865 struct freefrag *freefrag; 7866 7867 if (newblk->nb_freefrag == NULL) 7868 return; 7869 freefrag = newblk->nb_freefrag; 7870 newblk->nb_freefrag = NULL; 7871 freefrag->ff_state |= COMPLETE; 7872 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 7873 add_to_worklist(&freefrag->ff_list, 0); 7874 } 7875 7876 /* 7877 * Free a newblk. Generate a new freefrag work request if appropriate. 7878 * This must be called after the inode pointer and any direct block pointers 7879 * are valid or fully removed via truncate or frag extension. 7880 */ 7881 static void 7882 free_newblk(newblk) 7883 struct newblk *newblk; 7884 { 7885 struct indirdep *indirdep; 7886 struct worklist *wk; 7887 7888 KASSERT(newblk->nb_jnewblk == NULL, 7889 ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk)); 7890 KASSERT(newblk->nb_list.wk_type != D_NEWBLK, 7891 ("free_newblk: unclaimed newblk")); 7892 LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp)); 7893 newblk_freefrag(newblk); 7894 if (newblk->nb_state & ONDEPLIST) 7895 LIST_REMOVE(newblk, nb_deps); 7896 if (newblk->nb_state & ONWORKLIST) 7897 WORKLIST_REMOVE(&newblk->nb_list); 7898 LIST_REMOVE(newblk, nb_hash); 7899 if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) 7900 free_newdirblk(WK_NEWDIRBLK(wk)); 7901 if (!LIST_EMPTY(&newblk->nb_newdirblk)) 7902 panic("free_newblk: extra newdirblk"); 7903 while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) 7904 indirdep_complete(indirdep); 7905 handle_jwork(&newblk->nb_jwork); 7906 WORKITEM_FREE(newblk, D_NEWBLK); 7907 } 7908 7909 /* 7910 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. 7911 */ 7912 static void 7913 free_newdirblk(newdirblk) 7914 struct newdirblk *newdirblk; 7915 { 7916 struct pagedep *pagedep; 7917 struct diradd *dap; 7918 struct worklist *wk; 7919 7920 LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp)); 7921 WORKLIST_REMOVE(&newdirblk->db_list); 7922 /* 7923 * If the pagedep is still linked onto the directory buffer 7924 * dependency chain, then some of the entries on the 7925 * pd_pendinghd list may not be committed to disk yet. In 7926 * this case, we will simply clear the NEWBLOCK flag and 7927 * let the pd_pendinghd list be processed when the pagedep 7928 * is next written. If the pagedep is no longer on the buffer 7929 * dependency chain, then all the entries on the pd_pending 7930 * list are committed to disk and we can free them here. 7931 */ 7932 pagedep = newdirblk->db_pagedep; 7933 pagedep->pd_state &= ~NEWBLOCK; 7934 if ((pagedep->pd_state & ONWORKLIST) == 0) { 7935 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 7936 free_diradd(dap, NULL); 7937 /* 7938 * If no dependencies remain, the pagedep will be freed. 
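 * free_pagedep() rechecks all of the pagedep's lists and is a no-op
 * if anything is still outstanding.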
7939 */ 7940 free_pagedep(pagedep); 7941 } 7942 /* Should only ever be one item in the list. */ 7943 while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { 7944 WORKLIST_REMOVE(wk); 7945 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 7946 } 7947 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 7948 } 7949 7950 /* 7951 * Prepare an inode to be freed. The actual free operation is not 7952 * done until the zero'ed inode has been written to disk. 7953 */ 7954 void 7955 softdep_freefile(pvp, ino, mode) 7956 struct vnode *pvp; 7957 ino_t ino; 7958 int mode; 7959 { 7960 struct inode *ip = VTOI(pvp); 7961 struct inodedep *inodedep; 7962 struct freefile *freefile; 7963 struct freeblks *freeblks; 7964 struct ufsmount *ump; 7965 7966 ump = ITOUMP(ip); 7967 KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, 7968 ("softdep_freefile called on non-softdep filesystem")); 7969 /* 7970 * This sets up the inode de-allocation dependency. 7971 */ 7972 freefile = malloc(sizeof(struct freefile), 7973 M_FREEFILE, M_SOFTDEP_FLAGS); 7974 workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount); 7975 freefile->fx_mode = mode; 7976 freefile->fx_oldinum = ino; 7977 freefile->fx_devvp = ump->um_devvp; 7978 LIST_INIT(&freefile->fx_jwork); 7979 UFS_LOCK(ump); 7980 ump->um_fs->fs_pendinginodes += 1; 7981 UFS_UNLOCK(ump); 7982 7983 /* 7984 * If the inodedep does not exist, then the zero'ed inode has 7985 * been written to disk. If the allocated inode has never been 7986 * written to disk, then the on-disk inode is zero'ed. In either 7987 * case we can free the file immediately. If the journal was 7988 * canceled before being written the inode will never make it to 7989 * disk and we must send the canceled journal entrys to 7990 * ffs_freefile() to be cleared in conjunction with the bitmap. 7991 * Any blocks waiting on the inode to write can be safely freed 7992 * here as it will never been written. 7993 */ 7994 ACQUIRE_LOCK(ump); 7995 inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); 7996 if (inodedep) { 7997 /* 7998 * Clear out freeblks that no longer need to reference 7999 * this inode. 8000 */ 8001 while ((freeblks = 8002 TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) { 8003 TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, 8004 fb_next); 8005 freeblks->fb_state &= ~ONDEPLIST; 8006 } 8007 /* 8008 * Remove this inode from the unlinked list. 8009 */ 8010 if (inodedep->id_state & UNLINKED) { 8011 /* 8012 * Save the journal work to be freed with the bitmap 8013 * before we clear UNLINKED. Otherwise it can be lost 8014 * if the inode block is written. 8015 */ 8016 handle_bufwait(inodedep, &freefile->fx_jwork); 8017 clear_unlinked_inodedep(inodedep); 8018 /* 8019 * Re-acquire inodedep as we've dropped the 8020 * per-filesystem lock in clear_unlinked_inodedep(). 8021 */ 8022 inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); 8023 } 8024 } 8025 if (inodedep == NULL || check_inode_unwritten(inodedep)) { 8026 FREE_LOCK(ump); 8027 handle_workitem_freefile(freefile); 8028 return; 8029 } 8030 if ((inodedep->id_state & DEPCOMPLETE) == 0) 8031 inodedep->id_state |= GOINGAWAY; 8032 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); 8033 FREE_LOCK(ump); 8034 if (ip->i_number == ino) 8035 UFS_INODE_SET_FLAG(ip, IN_MODIFIED); 8036 } 8037 8038 /* 8039 * Check to see if an inode has never been written to disk. If 8040 * so free the inodedep and return success, otherwise return failure. 8041 * 8042 * If we still have a bitmap dependency, then the inode has never 8043 * been written to disk. 
Drop the dependency as it is no longer 8044 * necessary since the inode is being deallocated. We set the 8045 * ALLCOMPLETE flags since the bitmap now properly shows that the 8046 * inode is not allocated. Even if the inode is actively being 8047 * written, it has been rolled back to its zero'ed state, so we 8048 * are ensured that a zero inode is what is on the disk. For short 8049 * lived files, this change will usually result in removing all the 8050 * dependencies from the inode so that it can be freed immediately. 8051 */ 8052 static int 8053 check_inode_unwritten(inodedep) 8054 struct inodedep *inodedep; 8055 { 8056 8057 LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); 8058 8059 if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || 8060 !LIST_EMPTY(&inodedep->id_dirremhd) || 8061 !LIST_EMPTY(&inodedep->id_pendinghd) || 8062 !LIST_EMPTY(&inodedep->id_bufwait) || 8063 !LIST_EMPTY(&inodedep->id_inowait) || 8064 !TAILQ_EMPTY(&inodedep->id_inoreflst) || 8065 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 8066 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 8067 !TAILQ_EMPTY(&inodedep->id_extupdt) || 8068 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 8069 !TAILQ_EMPTY(&inodedep->id_freeblklst) || 8070 inodedep->id_mkdiradd != NULL || 8071 inodedep->id_nlinkdelta != 0) 8072 return (0); 8073 /* 8074 * Another process might be in initiate_write_inodeblock_ufs[12] 8075 * trying to allocate memory without holding "Softdep Lock". 8076 */ 8077 if ((inodedep->id_state & IOSTARTED) != 0 && 8078 inodedep->id_savedino1 == NULL) 8079 return (0); 8080 8081 if (inodedep->id_state & ONDEPLIST) 8082 LIST_REMOVE(inodedep, id_deps); 8083 inodedep->id_state &= ~ONDEPLIST; 8084 inodedep->id_state |= ALLCOMPLETE; 8085 inodedep->id_bmsafemap = NULL; 8086 if (inodedep->id_state & ONWORKLIST) 8087 WORKLIST_REMOVE(&inodedep->id_list); 8088 if (inodedep->id_savedino1 != NULL) { 8089 free(inodedep->id_savedino1, M_SAVEDINO); 8090 inodedep->id_savedino1 = NULL; 8091 } 8092 if (free_inodedep(inodedep) == 0) 8093 panic("check_inode_unwritten: busy inode"); 8094 return (1); 8095 } 8096 8097 static int 8098 check_inodedep_free(inodedep) 8099 struct inodedep *inodedep; 8100 { 8101 8102 LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); 8103 if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || 8104 !LIST_EMPTY(&inodedep->id_dirremhd) || 8105 !LIST_EMPTY(&inodedep->id_pendinghd) || 8106 !LIST_EMPTY(&inodedep->id_bufwait) || 8107 !LIST_EMPTY(&inodedep->id_inowait) || 8108 !TAILQ_EMPTY(&inodedep->id_inoreflst) || 8109 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 8110 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 8111 !TAILQ_EMPTY(&inodedep->id_extupdt) || 8112 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 8113 !TAILQ_EMPTY(&inodedep->id_freeblklst) || 8114 inodedep->id_mkdiradd != NULL || 8115 inodedep->id_nlinkdelta != 0 || 8116 inodedep->id_savedino1 != NULL) 8117 return (0); 8118 return (1); 8119 } 8120 8121 /* 8122 * Try to free an inodedep structure. Return 1 if it could be freed. 8123 */ 8124 static int 8125 free_inodedep(inodedep) 8126 struct inodedep *inodedep; 8127 { 8128 8129 LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); 8130 if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 || 8131 !check_inodedep_free(inodedep)) 8132 return (0); 8133 if (inodedep->id_state & ONDEPLIST) 8134 LIST_REMOVE(inodedep, id_deps); 8135 LIST_REMOVE(inodedep, id_hash); 8136 WORKITEM_FREE(inodedep, D_INODEDEP); 8137 return (1); 8138 } 8139 8140 /* 8141 * Free the block referenced by a freework structure. 
The parent freeblks 8142 * structure is released and completed when the final cg bitmap reaches 8143 * the disk. This routine may be freeing a jnewblk which never made it to 8144 * disk in which case we do not have to wait as the operation is undone 8145 * in memory immediately. 8146 */ 8147 static void 8148 freework_freeblock(freework, key) 8149 struct freework *freework; 8150 u_long key; 8151 { 8152 struct freeblks *freeblks; 8153 struct jnewblk *jnewblk; 8154 struct ufsmount *ump; 8155 struct workhead wkhd; 8156 struct fs *fs; 8157 int bsize; 8158 int needj; 8159 8160 ump = VFSTOUFS(freework->fw_list.wk_mp); 8161 LOCK_OWNED(ump); 8162 /* 8163 * Handle partial truncate separately. 8164 */ 8165 if (freework->fw_indir) { 8166 complete_trunc_indir(freework); 8167 return; 8168 } 8169 freeblks = freework->fw_freeblks; 8170 fs = ump->um_fs; 8171 needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0; 8172 bsize = lfragtosize(fs, freework->fw_frags); 8173 LIST_INIT(&wkhd); 8174 /* 8175 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives 8176 * on the indirblk hashtable and prevents premature freeing. 8177 */ 8178 freework->fw_state |= DEPCOMPLETE; 8179 /* 8180 * SUJ needs to wait for the segment referencing freed indirect 8181 * blocks to expire so that we know the checker will not confuse 8182 * a re-allocated indirect block with its old contents. 8183 */ 8184 if (needj && freework->fw_lbn <= -UFS_NDADDR) 8185 indirblk_insert(freework); 8186 /* 8187 * If we are canceling an existing jnewblk pass it to the free 8188 * routine, otherwise pass the freeblk which will ultimately 8189 * release the freeblks. If we're not journaling, we can just 8190 * free the freeblks immediately. 8191 */ 8192 jnewblk = freework->fw_jnewblk; 8193 if (jnewblk != NULL) { 8194 cancel_jnewblk(jnewblk, &wkhd); 8195 needj = 0; 8196 } else if (needj) { 8197 freework->fw_state |= DELAYEDFREE; 8198 freeblks->fb_cgwait++; 8199 WORKLIST_INSERT(&wkhd, &freework->fw_list); 8200 } 8201 FREE_LOCK(ump); 8202 freeblks_free(ump, freeblks, btodb(bsize)); 8203 CTR4(KTR_SUJ, 8204 "freework_freeblock: ino %jd blkno %jd lbn %jd size %d", 8205 freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize); 8206 ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize, 8207 freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key); 8208 ACQUIRE_LOCK(ump); 8209 /* 8210 * The jnewblk will be discarded and the bits in the map never 8211 * made it to disk. We can immediately free the freeblk. 8212 */ 8213 if (needj == 0) 8214 handle_written_freework(freework); 8215 } 8216 8217 /* 8218 * We enqueue freework items that need processing back on the freeblks and 8219 * add the freeblks to the worklist. This makes it easier to find all work 8220 * required to flush a truncation in process_truncates(). 8221 */ 8222 static void 8223 freework_enqueue(freework) 8224 struct freework *freework; 8225 { 8226 struct freeblks *freeblks; 8227 8228 freeblks = freework->fw_freeblks; 8229 if ((freework->fw_state & INPROGRESS) == 0) 8230 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); 8231 if ((freeblks->fb_state & 8232 (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE && 8233 LIST_EMPTY(&freeblks->fb_jblkdephd)) 8234 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 8235 } 8236 8237 /* 8238 * Start, continue, or finish the process of freeing an indirect block tree. 8239 * The free operation may be paused at any point with fw_off containing the 8240 * offset to restart from. 
This enables us to implement some flow control 8241 * for large truncates which may fan out and generate a huge number of 8242 * dependencies. 8243 */ 8244 static void 8245 handle_workitem_indirblk(freework) 8246 struct freework *freework; 8247 { 8248 struct freeblks *freeblks; 8249 struct ufsmount *ump; 8250 struct fs *fs; 8251 8252 freeblks = freework->fw_freeblks; 8253 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 8254 fs = ump->um_fs; 8255 if (freework->fw_state & DEPCOMPLETE) { 8256 handle_written_freework(freework); 8257 return; 8258 } 8259 if (freework->fw_off == NINDIR(fs)) { 8260 freework_freeblock(freework, SINGLETON_KEY); 8261 return; 8262 } 8263 freework->fw_state |= INPROGRESS; 8264 FREE_LOCK(ump); 8265 indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), 8266 freework->fw_lbn); 8267 ACQUIRE_LOCK(ump); 8268 } 8269 8270 /* 8271 * Called when a freework structure attached to a cg buf is written. The 8272 * ref on either the parent or the freeblks structure is released and 8273 * the freeblks is added back to the worklist if there is more work to do. 8274 */ 8275 static void 8276 handle_written_freework(freework) 8277 struct freework *freework; 8278 { 8279 struct freeblks *freeblks; 8280 struct freework *parent; 8281 8282 freeblks = freework->fw_freeblks; 8283 parent = freework->fw_parent; 8284 if (freework->fw_state & DELAYEDFREE) 8285 freeblks->fb_cgwait--; 8286 freework->fw_state |= COMPLETE; 8287 if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) 8288 WORKITEM_FREE(freework, D_FREEWORK); 8289 if (parent) { 8290 if (--parent->fw_ref == 0) 8291 freework_enqueue(parent); 8292 return; 8293 } 8294 if (--freeblks->fb_ref != 0) 8295 return; 8296 if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) == 8297 ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) 8298 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 8299 } 8300 8301 /* 8302 * This workitem routine performs the block de-allocation. 8303 * The workitem is added to the pending list after the updated 8304 * inode block has been written to disk. As mentioned above, 8305 * checks regarding the number of blocks de-allocated (compared 8306 * to the number of blocks allocated for the file) are also 8307 * performed in this function. 
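 *
 * The fb_freeworkhd list may hold four kinds of items: dirrems that
 * simply move on to the worklist, allocdirects and allocindirs whose
 * newblk structures are retired here, and freeworks that carry the
 * actual ffs_blkfree() calls, either directly or through
 * handle_workitem_indirblk() for indirect trees.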
8308 */ 8309 static int 8310 handle_workitem_freeblocks(freeblks, flags) 8311 struct freeblks *freeblks; 8312 int flags; 8313 { 8314 struct freework *freework; 8315 struct newblk *newblk; 8316 struct allocindir *aip; 8317 struct ufsmount *ump; 8318 struct worklist *wk; 8319 u_long key; 8320 8321 KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd), 8322 ("handle_workitem_freeblocks: Journal entries not written.")); 8323 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 8324 key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum); 8325 ACQUIRE_LOCK(ump); 8326 while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { 8327 WORKLIST_REMOVE(wk); 8328 switch (wk->wk_type) { 8329 case D_DIRREM: 8330 wk->wk_state |= COMPLETE; 8331 add_to_worklist(wk, 0); 8332 continue; 8333 8334 case D_ALLOCDIRECT: 8335 free_newblk(WK_NEWBLK(wk)); 8336 continue; 8337 8338 case D_ALLOCINDIR: 8339 aip = WK_ALLOCINDIR(wk); 8340 freework = NULL; 8341 if (aip->ai_state & DELAYEDFREE) { 8342 FREE_LOCK(ump); 8343 freework = newfreework(ump, freeblks, NULL, 8344 aip->ai_lbn, aip->ai_newblkno, 8345 ump->um_fs->fs_frag, 0, 0); 8346 ACQUIRE_LOCK(ump); 8347 } 8348 newblk = WK_NEWBLK(wk); 8349 if (newblk->nb_jnewblk) { 8350 freework->fw_jnewblk = newblk->nb_jnewblk; 8351 newblk->nb_jnewblk->jn_dep = &freework->fw_list; 8352 newblk->nb_jnewblk = NULL; 8353 } 8354 free_newblk(newblk); 8355 continue; 8356 8357 case D_FREEWORK: 8358 freework = WK_FREEWORK(wk); 8359 if (freework->fw_lbn <= -UFS_NDADDR) 8360 handle_workitem_indirblk(freework); 8361 else 8362 freework_freeblock(freework, key); 8363 continue; 8364 default: 8365 panic("handle_workitem_freeblocks: Unknown type %s", 8366 TYPENAME(wk->wk_type)); 8367 } 8368 } 8369 if (freeblks->fb_ref != 0) { 8370 freeblks->fb_state &= ~INPROGRESS; 8371 wake_worklist(&freeblks->fb_list); 8372 freeblks = NULL; 8373 } 8374 FREE_LOCK(ump); 8375 ffs_blkrelease_finish(ump, key); 8376 if (freeblks) 8377 return handle_complete_freeblocks(freeblks, flags); 8378 return (0); 8379 } 8380 8381 /* 8382 * Handle completion of block free via truncate. This allows fs_pending 8383 * to track the actual free block count more closely than if we only updated 8384 * it at the end. We must be careful to handle cases where the block count 8385 * on free was incorrect. 8386 */ 8387 static void 8388 freeblks_free(ump, freeblks, blocks) 8389 struct ufsmount *ump; 8390 struct freeblks *freeblks; 8391 int blocks; 8392 { 8393 struct fs *fs; 8394 ufs2_daddr_t remain; 8395 8396 UFS_LOCK(ump); 8397 remain = -freeblks->fb_chkcnt; 8398 freeblks->fb_chkcnt += blocks; 8399 if (remain > 0) { 8400 if (remain < blocks) 8401 blocks = remain; 8402 fs = ump->um_fs; 8403 fs->fs_pendingblocks -= blocks; 8404 } 8405 UFS_UNLOCK(ump); 8406 } 8407 8408 /* 8409 * Once all of the freework workitems are complete we can retire the 8410 * freeblocks dependency and any journal work awaiting completion. This 8411 * can not be called until all other dependencies are stable on disk. 8412 */ 8413 static int 8414 handle_complete_freeblocks(freeblks, flags) 8415 struct freeblks *freeblks; 8416 int flags; 8417 { 8418 struct inodedep *inodedep; 8419 struct inode *ip; 8420 struct vnode *vp; 8421 struct fs *fs; 8422 struct ufsmount *ump; 8423 ufs2_daddr_t spare; 8424 8425 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 8426 fs = ump->um_fs; 8427 flags = LK_EXCLUSIVE | flags; 8428 spare = freeblks->fb_chkcnt; 8429 8430 /* 8431 * If we did not release the expected number of blocks we may have 8432 * to adjust the inode block count here. 
Only do so if it wasn't 8433 * a truncation to zero and the modrev still matches. 8434 */ 8435 if (spare && freeblks->fb_len != 0) { 8436 if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum, 8437 flags, &vp, FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP) != 0) 8438 return (EBUSY); 8439 ip = VTOI(vp); 8440 if (ip->i_mode == 0) { 8441 vgone(vp); 8442 } else if (DIP(ip, i_modrev) == freeblks->fb_modrev) { 8443 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare); 8444 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 8445 /* 8446 * We must wait so this happens before the 8447 * journal is reclaimed. 8448 */ 8449 ffs_update(vp, 1); 8450 } 8451 vput(vp); 8452 } 8453 if (spare < 0) { 8454 UFS_LOCK(ump); 8455 fs->fs_pendingblocks += spare; 8456 UFS_UNLOCK(ump); 8457 } 8458 #ifdef QUOTA 8459 /* Handle spare. */ 8460 if (spare) 8461 quotaadj(freeblks->fb_quota, ump, -spare); 8462 quotarele(freeblks->fb_quota); 8463 #endif 8464 ACQUIRE_LOCK(ump); 8465 if (freeblks->fb_state & ONDEPLIST) { 8466 inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum, 8467 0, &inodedep); 8468 TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next); 8469 freeblks->fb_state &= ~ONDEPLIST; 8470 if (TAILQ_EMPTY(&inodedep->id_freeblklst)) 8471 free_inodedep(inodedep); 8472 } 8473 /* 8474 * All of the freeblock deps must be complete prior to this call 8475 * so it's now safe to complete earlier outstanding journal entries. 8476 */ 8477 handle_jwork(&freeblks->fb_jwork); 8478 WORKITEM_FREE(freeblks, D_FREEBLKS); 8479 FREE_LOCK(ump); 8480 return (0); 8481 } 8482 8483 /* 8484 * Release blocks associated with the freeblks and stored in the indirect 8485 * block dbn. If level is greater than SINGLE, the block is an indirect block 8486 * and recursive calls to indirtrunc must be used to cleanse other indirect 8487 * blocks. 8488 * 8489 * This handles partial and complete truncation of blocks. Partial is noted 8490 * with goingaway == 0. In this case the freework is completed after the 8491 * zero'd indirects are written to disk. For full truncation the freework 8492 * is completed after the block is freed. 8493 */ 8494 static void 8495 indir_trunc(freework, dbn, lbn) 8496 struct freework *freework; 8497 ufs2_daddr_t dbn; 8498 ufs_lbn_t lbn; 8499 { 8500 struct freework *nfreework; 8501 struct workhead wkhd; 8502 struct freeblks *freeblks; 8503 struct buf *bp; 8504 struct fs *fs; 8505 struct indirdep *indirdep; 8506 struct mount *mp; 8507 struct ufsmount *ump; 8508 ufs1_daddr_t *bap1; 8509 ufs2_daddr_t nb, nnb, *bap2; 8510 ufs_lbn_t lbnadd, nlbn; 8511 u_long key; 8512 int nblocks, ufs1fmt, freedblocks; 8513 int goingaway, freedeps, needj, level, cnt, i, error; 8514 8515 freeblks = freework->fw_freeblks; 8516 mp = freeblks->fb_list.wk_mp; 8517 ump = VFSTOUFS(mp); 8518 fs = ump->um_fs; 8519 /* 8520 * Get buffer of block pointers to be freed. There are three cases: 8521 * 8522 * 1) Partial truncate caches the indirdep pointer in the freework 8523 * which provides us a back copy to the save bp which holds the 8524 * pointers we want to clear. When this completes the zero 8525 * pointers are written to the real copy. 8526 * 2) The indirect is being completely truncated, cancel_indirdep() 8527 * eliminated the real copy and placed the indirdep on the saved 8528 * copy. The indirdep and buf are discarded when this completes. 8529 * 3) The indirect was not in memory, we read a copy off of the disk 8530 * using the devvp and drop and invalidate the buffer when we're 8531 * done. 
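 *
 * In cases 1 and 2 the indirdep's ir_trunc queue enforces in-order
 * processing of truncations; in case 3 there is no indirdep to order
 * against and the buffer is invalidated once its pointers have been
 * scanned.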
8532 */ 8533 goingaway = 1; 8534 indirdep = NULL; 8535 if (freework->fw_indir != NULL) { 8536 goingaway = 0; 8537 indirdep = freework->fw_indir; 8538 bp = indirdep->ir_savebp; 8539 if (bp == NULL || bp->b_blkno != dbn) 8540 panic("indir_trunc: Bad saved buf %p blkno %jd", 8541 bp, (intmax_t)dbn); 8542 } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) { 8543 /* 8544 * The lock prevents the buf dep list from changing and 8545 * indirects on devvp should only ever have one dependency. 8546 */ 8547 indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep)); 8548 if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0) 8549 panic("indir_trunc: Bad indirdep %p from buf %p", 8550 indirdep, bp); 8551 } else { 8552 error = ffs_breadz(ump, freeblks->fb_devvp, dbn, dbn, 8553 (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp); 8554 if (error) 8555 return; 8556 } 8557 ACQUIRE_LOCK(ump); 8558 /* Protects against a race with complete_trunc_indir(). */ 8559 freework->fw_state &= ~INPROGRESS; 8560 /* 8561 * If we have an indirdep we need to enforce the truncation order 8562 * and discard it when it is complete. 8563 */ 8564 if (indirdep) { 8565 if (freework != TAILQ_FIRST(&indirdep->ir_trunc) && 8566 !TAILQ_EMPTY(&indirdep->ir_trunc)) { 8567 /* 8568 * Add the complete truncate to the list on the 8569 * indirdep to enforce in-order processing. 8570 */ 8571 if (freework->fw_indir == NULL) 8572 TAILQ_INSERT_TAIL(&indirdep->ir_trunc, 8573 freework, fw_next); 8574 FREE_LOCK(ump); 8575 return; 8576 } 8577 /* 8578 * If we're goingaway, free the indirdep. Otherwise it will 8579 * linger until the write completes. 8580 */ 8581 if (goingaway) { 8582 KASSERT(indirdep->ir_savebp == bp, 8583 ("indir_trunc: losing ir_savebp %p", 8584 indirdep->ir_savebp)); 8585 indirdep->ir_savebp = NULL; 8586 free_indirdep(indirdep); 8587 } 8588 } 8589 FREE_LOCK(ump); 8590 /* Initialize pointers depending on block size. */ 8591 if (ump->um_fstype == UFS1) { 8592 bap1 = (ufs1_daddr_t *)bp->b_data; 8593 nb = bap1[freework->fw_off]; 8594 ufs1fmt = 1; 8595 bap2 = NULL; 8596 } else { 8597 bap2 = (ufs2_daddr_t *)bp->b_data; 8598 nb = bap2[freework->fw_off]; 8599 ufs1fmt = 0; 8600 bap1 = NULL; 8601 } 8602 level = lbn_level(lbn); 8603 needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0; 8604 lbnadd = lbn_offset(fs, level); 8605 nblocks = btodb(fs->fs_bsize); 8606 nfreework = freework; 8607 freedeps = 0; 8608 cnt = 0; 8609 /* 8610 * Reclaim blocks. Traverses into nested indirect levels and 8611 * arranges for the current level to be freed when subordinates 8612 * are free when journaling. 8613 */ 8614 key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum); 8615 for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { 8616 if (UFS_CHECK_BLKNO(mp, freeblks->fb_inum, nb, 8617 fs->fs_bsize) != 0) 8618 nb = 0; 8619 if (i != NINDIR(fs) - 1) { 8620 if (ufs1fmt) 8621 nnb = bap1[i+1]; 8622 else 8623 nnb = bap2[i+1]; 8624 } else 8625 nnb = 0; 8626 if (nb == 0) 8627 continue; 8628 cnt++; 8629 if (level != 0) { 8630 nlbn = (lbn + 1) - (i * lbnadd); 8631 if (needj != 0) { 8632 nfreework = newfreework(ump, freeblks, freework, 8633 nlbn, nb, fs->fs_frag, 0, 0); 8634 freedeps++; 8635 } 8636 indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); 8637 } else { 8638 struct freedep *freedep; 8639 8640 /* 8641 * Attempt to aggregate freedep dependencies for 8642 * all blocks being released to the same CG. 
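 * When journaling, a new freedep is only allocated when the next
 * block falls in a different cylinder group (or there is no next
 * block), so a single dependency covers a whole run of same-cg blocks.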
8643 */ 8644 LIST_INIT(&wkhd); 8645 if (needj != 0 && 8646 (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) { 8647 freedep = newfreedep(freework); 8648 WORKLIST_INSERT_UNLOCKED(&wkhd, 8649 &freedep->fd_list); 8650 freedeps++; 8651 } 8652 CTR3(KTR_SUJ, 8653 "indir_trunc: ino %jd blkno %jd size %d", 8654 freeblks->fb_inum, nb, fs->fs_bsize); 8655 ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, 8656 fs->fs_bsize, freeblks->fb_inum, 8657 freeblks->fb_vtype, &wkhd, key); 8658 } 8659 } 8660 ffs_blkrelease_finish(ump, key); 8661 if (goingaway) { 8662 bp->b_flags |= B_INVAL | B_NOCACHE; 8663 brelse(bp); 8664 } 8665 freedblocks = 0; 8666 if (level == 0) 8667 freedblocks = (nblocks * cnt); 8668 if (needj == 0) 8669 freedblocks += nblocks; 8670 freeblks_free(ump, freeblks, freedblocks); 8671 /* 8672 * If we are journaling set up the ref counts and offset so this 8673 * indirect can be completed when its children are free. 8674 */ 8675 if (needj) { 8676 ACQUIRE_LOCK(ump); 8677 freework->fw_off = i; 8678 freework->fw_ref += freedeps; 8679 freework->fw_ref -= NINDIR(fs) + 1; 8680 if (level == 0) 8681 freeblks->fb_cgwait += freedeps; 8682 if (freework->fw_ref == 0) 8683 freework_freeblock(freework, SINGLETON_KEY); 8684 FREE_LOCK(ump); 8685 return; 8686 } 8687 /* 8688 * If we're not journaling we can free the indirect now. 8689 */ 8690 dbn = dbtofsb(fs, dbn); 8691 CTR3(KTR_SUJ, 8692 "indir_trunc 2: ino %jd blkno %jd size %d", 8693 freeblks->fb_inum, dbn, fs->fs_bsize); 8694 ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, 8695 freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON_KEY); 8696 /* Non SUJ softdep does single-threaded truncations. */ 8697 if (freework->fw_blkno == dbn) { 8698 freework->fw_state |= ALLCOMPLETE; 8699 ACQUIRE_LOCK(ump); 8700 handle_written_freework(freework); 8701 FREE_LOCK(ump); 8702 } 8703 return; 8704 } 8705 8706 /* 8707 * Cancel an allocindir when it is removed via truncation. When bp is not 8708 * NULL the indirect never appeared on disk and is scheduled to be freed 8709 * independently of the indir so we can more easily track journal work. 8710 */ 8711 static void 8712 cancel_allocindir(aip, bp, freeblks, trunc) 8713 struct allocindir *aip; 8714 struct buf *bp; 8715 struct freeblks *freeblks; 8716 int trunc; 8717 { 8718 struct indirdep *indirdep; 8719 struct freefrag *freefrag; 8720 struct newblk *newblk; 8721 8722 newblk = (struct newblk *)aip; 8723 LIST_REMOVE(aip, ai_next); 8724 /* 8725 * We must eliminate the pointer in bp if it must be freed on its 8726 * own due to partial truncate or pending journal work. 8727 */ 8728 if (bp && (trunc || newblk->nb_jnewblk)) { 8729 /* 8730 * Clear the pointer and mark the aip to be freed 8731 * directly if it never existed on disk. 8732 */ 8733 aip->ai_state |= DELAYEDFREE; 8734 indirdep = aip->ai_indirdep; 8735 if (indirdep->ir_state & UFS1FMT) 8736 ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0; 8737 else 8738 ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0; 8739 } 8740 /* 8741 * When truncating the previous pointer will be freed via 8742 * savedbp. Eliminate the freefrag which would dup free. 8743 */ 8744 if (trunc && (freefrag = newblk->nb_freefrag) != NULL) { 8745 newblk->nb_freefrag = NULL; 8746 if (freefrag->ff_jdep) 8747 cancel_jfreefrag( 8748 WK_JFREEFRAG(freefrag->ff_jdep)); 8749 jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork); 8750 WORKITEM_FREE(freefrag, D_FREEFRAG); 8751 } 8752 /* 8753 * If the journal hasn't been written the jnewblk must be passed 8754 * to the call to ffs_blkfree that reclaims the space. 
We accomplish 8755 * this by leaving the journal dependency on the newblk to be freed 8756 * when a freework is created in handle_workitem_freeblocks(). 8757 */ 8758 cancel_newblk(newblk, NULL, &freeblks->fb_jwork); 8759 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); 8760 } 8761 8762 /* 8763 * Create the mkdir dependencies for . and .. in a new directory. Link them 8764 * in to a newdirblk so any subsequent additions are tracked properly. The 8765 * caller is responsible for adding the mkdir1 dependency to the journal 8766 * and updating id_mkdiradd. This function returns with the per-filesystem 8767 * lock held. 8768 */ 8769 static struct mkdir * 8770 setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) 8771 struct diradd *dap; 8772 ino_t newinum; 8773 ino_t dinum; 8774 struct buf *newdirbp; 8775 struct mkdir **mkdirp; 8776 { 8777 struct newblk *newblk; 8778 struct pagedep *pagedep; 8779 struct inodedep *inodedep; 8780 struct newdirblk *newdirblk; 8781 struct mkdir *mkdir1, *mkdir2; 8782 struct worklist *wk; 8783 struct jaddref *jaddref; 8784 struct ufsmount *ump; 8785 struct mount *mp; 8786 8787 mp = dap->da_list.wk_mp; 8788 ump = VFSTOUFS(mp); 8789 newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, 8790 M_SOFTDEP_FLAGS); 8791 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 8792 LIST_INIT(&newdirblk->db_mkdir); 8793 mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); 8794 workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); 8795 mkdir1->md_state = ATTACHED | MKDIR_BODY; 8796 mkdir1->md_diradd = dap; 8797 mkdir1->md_jaddref = NULL; 8798 mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); 8799 workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); 8800 mkdir2->md_state = ATTACHED | MKDIR_PARENT; 8801 mkdir2->md_diradd = dap; 8802 mkdir2->md_jaddref = NULL; 8803 if (MOUNTEDSUJ(mp) == 0) { 8804 mkdir1->md_state |= DEPCOMPLETE; 8805 mkdir2->md_state |= DEPCOMPLETE; 8806 } 8807 /* 8808 * Dependency on "." and ".." being written to disk. 8809 */ 8810 mkdir1->md_buf = newdirbp; 8811 ACQUIRE_LOCK(VFSTOUFS(mp)); 8812 LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs); 8813 /* 8814 * We must link the pagedep, allocdirect, and newdirblk for 8815 * the initial file page so the pointer to the new directory 8816 * is not written until the directory contents are live and 8817 * any subsequent additions are not marked live until the 8818 * block is reachable via the inode. 8819 */ 8820 if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0) 8821 panic("setup_newdir: lost pagedep"); 8822 LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) 8823 if (wk->wk_type == D_ALLOCDIRECT) 8824 break; 8825 if (wk == NULL) 8826 panic("setup_newdir: lost allocdirect"); 8827 if (pagedep->pd_state & NEWBLOCK) 8828 panic("setup_newdir: NEWBLOCK already set"); 8829 newblk = WK_NEWBLK(wk); 8830 pagedep->pd_state |= NEWBLOCK; 8831 pagedep->pd_newdirblk = newdirblk; 8832 newdirblk->db_pagedep = pagedep; 8833 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); 8834 WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); 8835 /* 8836 * Look up the inodedep for the parent directory so that we 8837 * can link mkdir2 into the pending dotdot jaddref or 8838 * the inode write if there is none. If the inode is 8839 * ALLCOMPLETE and no jaddref is present all dependencies have 8840 * been satisfied and mkdir2 can be freed. 
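 *
 * Three outcomes are possible below: with SUJ the dotdot jaddref and
 * mkdir2 are cross-linked; without SUJ a fully written parent lets
 * mkdir2 be discarded outright; otherwise mkdir2 waits on the
 * parent's id_bufwait list.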
8841 */ 8842 inodedep_lookup(mp, dinum, 0, &inodedep); 8843 if (MOUNTEDSUJ(mp)) { 8844 if (inodedep == NULL) 8845 panic("setup_newdir: Lost parent."); 8846 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 8847 inoreflst); 8848 KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && 8849 (jaddref->ja_state & MKDIR_PARENT), 8850 ("setup_newdir: bad dotdot jaddref %p", jaddref)); 8851 LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs); 8852 mkdir2->md_jaddref = jaddref; 8853 jaddref->ja_mkdir = mkdir2; 8854 } else if (inodedep == NULL || 8855 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 8856 dap->da_state &= ~MKDIR_PARENT; 8857 WORKITEM_FREE(mkdir2, D_MKDIR); 8858 mkdir2 = NULL; 8859 } else { 8860 LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs); 8861 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list); 8862 } 8863 *mkdirp = mkdir2; 8864 8865 return (mkdir1); 8866 } 8867 8868 /* 8869 * Directory entry addition dependencies. 8870 * 8871 * When adding a new directory entry, the inode (with its incremented link 8872 * count) must be written to disk before the directory entry's pointer to it. 8873 * Also, if the inode is newly allocated, the corresponding freemap must be 8874 * updated (on disk) before the directory entry's pointer. These requirements 8875 * are met via undo/redo on the directory entry's pointer, which consists 8876 * simply of the inode number. 8877 * 8878 * As directory entries are added and deleted, the free space within a 8879 * directory block can become fragmented. The ufs filesystem will compact 8880 * a fragmented directory block to make space for a new entry. When this 8881 * occurs, the offsets of previously added entries change. Any "diradd" 8882 * dependency structures corresponding to these entries must be updated with 8883 * the new offsets. 8884 */ 8885 8886 /* 8887 * This routine is called after the in-memory inode's link 8888 * count has been incremented, but before the directory entry's 8889 * pointer to the inode has been set. 8890 */ 8891 int 8892 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 8893 struct buf *bp; /* buffer containing directory block */ 8894 struct inode *dp; /* inode for directory */ 8895 off_t diroffset; /* offset of new entry in directory */ 8896 ino_t newinum; /* inode referenced by new directory entry */ 8897 struct buf *newdirbp; /* non-NULL => contents of new mkdir */ 8898 int isnewblk; /* entry is in a newly allocated block */ 8899 { 8900 int offset; /* offset of new entry within directory block */ 8901 ufs_lbn_t lbn; /* block in directory containing new entry */ 8902 struct fs *fs; 8903 struct diradd *dap; 8904 struct newblk *newblk; 8905 struct pagedep *pagedep; 8906 struct inodedep *inodedep; 8907 struct newdirblk *newdirblk; 8908 struct mkdir *mkdir1, *mkdir2; 8909 struct jaddref *jaddref; 8910 struct ufsmount *ump; 8911 struct mount *mp; 8912 int isindir; 8913 8914 mp = ITOVFS(dp); 8915 ump = VFSTOUFS(mp); 8916 KASSERT(MOUNTEDSOFTDEP(mp) != 0, 8917 ("softdep_setup_directory_add called on non-softdep filesystem")); 8918 /* 8919 * Whiteouts have no dependencies. 
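 * A whiteout entry carries the reserved inode number UFS_WINO and
 * never points at a real, allocated inode, so there is no ordering
 * to preserve and the directory block can simply be written.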
8920 */ 8921 if (newinum == UFS_WINO) { 8922 if (newdirbp != NULL) 8923 bdwrite(newdirbp); 8924 return (0); 8925 } 8926 jaddref = NULL; 8927 mkdir1 = mkdir2 = NULL; 8928 fs = ump->um_fs; 8929 lbn = lblkno(fs, diroffset); 8930 offset = blkoff(fs, diroffset); 8931 dap = malloc(sizeof(struct diradd), M_DIRADD, 8932 M_SOFTDEP_FLAGS|M_ZERO); 8933 workitem_alloc(&dap->da_list, D_DIRADD, mp); 8934 dap->da_offset = offset; 8935 dap->da_newinum = newinum; 8936 dap->da_state = ATTACHED; 8937 LIST_INIT(&dap->da_jwork); 8938 isindir = bp->b_lblkno >= UFS_NDADDR; 8939 newdirblk = NULL; 8940 if (isnewblk && 8941 (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { 8942 newdirblk = malloc(sizeof(struct newdirblk), 8943 M_NEWDIRBLK, M_SOFTDEP_FLAGS); 8944 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 8945 LIST_INIT(&newdirblk->db_mkdir); 8946 } 8947 /* 8948 * If we're creating a new directory setup the dependencies and set 8949 * the dap state to wait for them. Otherwise it's COMPLETE and 8950 * we can move on. 8951 */ 8952 if (newdirbp == NULL) { 8953 dap->da_state |= DEPCOMPLETE; 8954 ACQUIRE_LOCK(ump); 8955 } else { 8956 dap->da_state |= MKDIR_BODY | MKDIR_PARENT; 8957 mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, 8958 &mkdir2); 8959 } 8960 /* 8961 * Link into parent directory pagedep to await its being written. 8962 */ 8963 pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep); 8964 #ifdef INVARIANTS 8965 if (diradd_lookup(pagedep, offset) != NULL) 8966 panic("softdep_setup_directory_add: %p already at off %d\n", 8967 diradd_lookup(pagedep, offset), offset); 8968 #endif 8969 dap->da_pagedep = pagedep; 8970 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, 8971 da_pdlist); 8972 inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); 8973 /* 8974 * If we're journaling, link the diradd into the jaddref so it 8975 * may be completed after the journal entry is written. Otherwise, 8976 * link the diradd into its inodedep. If the inode is not yet 8977 * written place it on the bufwait list, otherwise do the post-inode 8978 * write processing to put it on the id_pendinghd list. 8979 */ 8980 if (MOUNTEDSUJ(mp)) { 8981 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 8982 inoreflst); 8983 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 8984 ("softdep_setup_directory_add: bad jaddref %p", jaddref)); 8985 jaddref->ja_diroff = diroffset; 8986 jaddref->ja_diradd = dap; 8987 add_to_journal(&jaddref->ja_list); 8988 } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) 8989 diradd_inode_written(dap, inodedep); 8990 else 8991 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 8992 /* 8993 * Add the journal entries for . and .. links now that the primary 8994 * link is written. 8995 */ 8996 if (mkdir1 != NULL && MOUNTEDSUJ(mp)) { 8997 jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, 8998 inoreflst, if_deps); 8999 KASSERT(jaddref != NULL && 9000 jaddref->ja_ino == jaddref->ja_parent && 9001 (jaddref->ja_state & MKDIR_BODY), 9002 ("softdep_setup_directory_add: bad dot jaddref %p", 9003 jaddref)); 9004 mkdir1->md_jaddref = jaddref; 9005 jaddref->ja_mkdir = mkdir1; 9006 /* 9007 * It is important that the dotdot journal entry 9008 * is added prior to the dot entry since dot writes 9009 * both the dot and dotdot links. These both must 9010 * be added after the primary link for the journal 9011 * to remain consistent. 
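 * The resulting journal order is: the primary name's jaddref (added
 * above), then "..", then ".".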
9012 */ 9013 add_to_journal(&mkdir2->md_jaddref->ja_list); 9014 add_to_journal(&jaddref->ja_list); 9015 } 9016 /* 9017 * If we are adding a new directory remember this diradd so that if 9018 * we rename it we can keep the dot and dotdot dependencies. If 9019 * we are adding a new name for an inode that has a mkdiradd we 9020 * must be in rename and we have to move the dot and dotdot 9021 * dependencies to this new name. The old name is being orphaned 9022 * soon. 9023 */ 9024 if (mkdir1 != NULL) { 9025 if (inodedep->id_mkdiradd != NULL) 9026 panic("softdep_setup_directory_add: Existing mkdir"); 9027 inodedep->id_mkdiradd = dap; 9028 } else if (inodedep->id_mkdiradd) 9029 merge_diradd(inodedep, dap); 9030 if (newdirblk != NULL) { 9031 /* 9032 * There is nothing to do if we are already tracking 9033 * this block. 9034 */ 9035 if ((pagedep->pd_state & NEWBLOCK) != 0) { 9036 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 9037 FREE_LOCK(ump); 9038 return (0); 9039 } 9040 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk) 9041 == 0) 9042 panic("softdep_setup_directory_add: lost entry"); 9043 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); 9044 pagedep->pd_state |= NEWBLOCK; 9045 pagedep->pd_newdirblk = newdirblk; 9046 newdirblk->db_pagedep = pagedep; 9047 FREE_LOCK(ump); 9048 /* 9049 * If we extended into an indirect signal direnter to sync. 9050 */ 9051 if (isindir) 9052 return (1); 9053 return (0); 9054 } 9055 FREE_LOCK(ump); 9056 return (0); 9057 } 9058 9059 /* 9060 * This procedure is called to change the offset of a directory 9061 * entry when compacting a directory block which must be owned 9062 * exclusively by the caller. Note that the actual entry movement 9063 * must be done in this procedure to ensure that no I/O completions 9064 * occur while the move is in progress. 9065 */ 9066 void 9067 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 9068 struct buf *bp; /* Buffer holding directory block. */ 9069 struct inode *dp; /* inode for directory */ 9070 caddr_t base; /* address of dp->i_offset */ 9071 caddr_t oldloc; /* address of old directory location */ 9072 caddr_t newloc; /* address of new directory location */ 9073 int entrysize; /* size of directory entry */ 9074 { 9075 int offset, oldoffset, newoffset; 9076 struct pagedep *pagedep; 9077 struct jmvref *jmvref; 9078 struct diradd *dap; 9079 struct direct *de; 9080 struct mount *mp; 9081 struct ufsmount *ump; 9082 ufs_lbn_t lbn; 9083 int flags; 9084 9085 mp = ITOVFS(dp); 9086 ump = VFSTOUFS(mp); 9087 KASSERT(MOUNTEDSOFTDEP(mp) != 0, 9088 ("softdep_change_directoryentry_offset called on " 9089 "non-softdep filesystem")); 9090 de = (struct direct *)oldloc; 9091 jmvref = NULL; 9092 flags = 0; 9093 /* 9094 * Moves are always journaled as it would be too complex to 9095 * determine if any affected adds or removes are present in the 9096 * journal. 
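 * The jmvref allocated below records the entry's inode number together
 * with its old and new byte offsets within the directory, so recovery
 * can account for the entry at either location.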
9097 */ 9098 if (MOUNTEDSUJ(mp)) { 9099 flags = DEPALLOC; 9100 jmvref = newjmvref(dp, de->d_ino, 9101 I_OFFSET(dp) + (oldloc - base), 9102 I_OFFSET(dp) + (newloc - base)); 9103 } 9104 lbn = lblkno(ump->um_fs, I_OFFSET(dp)); 9105 offset = blkoff(ump->um_fs, I_OFFSET(dp)); 9106 oldoffset = offset + (oldloc - base); 9107 newoffset = offset + (newloc - base); 9108 ACQUIRE_LOCK(ump); 9109 if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0) 9110 goto done; 9111 dap = diradd_lookup(pagedep, oldoffset); 9112 if (dap) { 9113 dap->da_offset = newoffset; 9114 newoffset = DIRADDHASH(newoffset); 9115 oldoffset = DIRADDHASH(oldoffset); 9116 if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && 9117 newoffset != oldoffset) { 9118 LIST_REMOVE(dap, da_pdlist); 9119 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], 9120 dap, da_pdlist); 9121 } 9122 } 9123 done: 9124 if (jmvref) { 9125 jmvref->jm_pagedep = pagedep; 9126 LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); 9127 add_to_journal(&jmvref->jm_list); 9128 } 9129 bcopy(oldloc, newloc, entrysize); 9130 FREE_LOCK(ump); 9131 } 9132 9133 /* 9134 * Move the mkdir dependencies and journal work from one diradd to another 9135 * when renaming a directory. The new name must depend on the mkdir deps 9136 * completing as the old name did. Directories can only have one valid link 9137 * at a time so one must be canonical. 9138 */ 9139 static void 9140 merge_diradd(inodedep, newdap) 9141 struct inodedep *inodedep; 9142 struct diradd *newdap; 9143 { 9144 struct diradd *olddap; 9145 struct mkdir *mkdir, *nextmd; 9146 struct ufsmount *ump; 9147 short state; 9148 9149 olddap = inodedep->id_mkdiradd; 9150 inodedep->id_mkdiradd = newdap; 9151 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 9152 newdap->da_state &= ~DEPCOMPLETE; 9153 ump = VFSTOUFS(inodedep->id_list.wk_mp); 9154 for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; 9155 mkdir = nextmd) { 9156 nextmd = LIST_NEXT(mkdir, md_mkdirs); 9157 if (mkdir->md_diradd != olddap) 9158 continue; 9159 mkdir->md_diradd = newdap; 9160 state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY); 9161 newdap->da_state |= state; 9162 olddap->da_state &= ~state; 9163 if ((olddap->da_state & 9164 (MKDIR_PARENT | MKDIR_BODY)) == 0) 9165 break; 9166 } 9167 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 9168 panic("merge_diradd: unfound ref"); 9169 } 9170 /* 9171 * Any mkdir related journal items are not safe to be freed until 9172 * the new name is stable. 9173 */ 9174 jwork_move(&newdap->da_jwork, &olddap->da_jwork); 9175 olddap->da_state |= DEPCOMPLETE; 9176 complete_diradd(olddap); 9177 } 9178 9179 /* 9180 * Move the diradd to the pending list when all diradd dependencies are 9181 * complete. 9182 */ 9183 static void 9184 complete_diradd(dap) 9185 struct diradd *dap; 9186 { 9187 struct pagedep *pagedep; 9188 9189 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 9190 if (dap->da_state & DIRCHG) 9191 pagedep = dap->da_previous->dm_pagedep; 9192 else 9193 pagedep = dap->da_pagedep; 9194 LIST_REMOVE(dap, da_pdlist); 9195 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 9196 } 9197 } 9198 9199 /* 9200 * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal 9201 * add entries and conditonally journal the remove. 
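 * The remove is journaled only when its corresponding add could not be
 * canceled outright: if cancel_jaddref() indicates that no journaled
 * undo is needed, the matching jremref is freed instead of being
 * recorded.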
9202 */ 9203 static void 9204 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref) 9205 struct diradd *dap; 9206 struct dirrem *dirrem; 9207 struct jremref *jremref; 9208 struct jremref *dotremref; 9209 struct jremref *dotdotremref; 9210 { 9211 struct inodedep *inodedep; 9212 struct jaddref *jaddref; 9213 struct inoref *inoref; 9214 struct ufsmount *ump; 9215 struct mkdir *mkdir; 9216 9217 /* 9218 * If no remove references were allocated we're on a non-journaled 9219 * filesystem and can skip the cancel step. 9220 */ 9221 if (jremref == NULL) { 9222 free_diradd(dap, NULL); 9223 return; 9224 } 9225 /* 9226 * Cancel the primary name an free it if it does not require 9227 * journaling. 9228 */ 9229 if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum, 9230 0, &inodedep) != 0) { 9231 /* Abort the addref that reference this diradd. */ 9232 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 9233 if (inoref->if_list.wk_type != D_JADDREF) 9234 continue; 9235 jaddref = (struct jaddref *)inoref; 9236 if (jaddref->ja_diradd != dap) 9237 continue; 9238 if (cancel_jaddref(jaddref, inodedep, 9239 &dirrem->dm_jwork) == 0) { 9240 free_jremref(jremref); 9241 jremref = NULL; 9242 } 9243 break; 9244 } 9245 } 9246 /* 9247 * Cancel subordinate names and free them if they do not require 9248 * journaling. 9249 */ 9250 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 9251 ump = VFSTOUFS(dap->da_list.wk_mp); 9252 LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) { 9253 if (mkdir->md_diradd != dap) 9254 continue; 9255 if ((jaddref = mkdir->md_jaddref) == NULL) 9256 continue; 9257 mkdir->md_jaddref = NULL; 9258 if (mkdir->md_state & MKDIR_PARENT) { 9259 if (cancel_jaddref(jaddref, NULL, 9260 &dirrem->dm_jwork) == 0) { 9261 free_jremref(dotdotremref); 9262 dotdotremref = NULL; 9263 } 9264 } else { 9265 if (cancel_jaddref(jaddref, inodedep, 9266 &dirrem->dm_jwork) == 0) { 9267 free_jremref(dotremref); 9268 dotremref = NULL; 9269 } 9270 } 9271 } 9272 } 9273 9274 if (jremref) 9275 journal_jremref(dirrem, jremref, inodedep); 9276 if (dotremref) 9277 journal_jremref(dirrem, dotremref, inodedep); 9278 if (dotdotremref) 9279 journal_jremref(dirrem, dotdotremref, NULL); 9280 jwork_move(&dirrem->dm_jwork, &dap->da_jwork); 9281 free_diradd(dap, &dirrem->dm_jwork); 9282 } 9283 9284 /* 9285 * Free a diradd dependency structure. 
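 * Expects the per-mount softdep lock to be held by the caller. Any
 * mkdir work items still attached to the diradd are torn down along
 * with it.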
9286 */ 9287 static void 9288 free_diradd(dap, wkhd) 9289 struct diradd *dap; 9290 struct workhead *wkhd; 9291 { 9292 struct dirrem *dirrem; 9293 struct pagedep *pagedep; 9294 struct inodedep *inodedep; 9295 struct mkdir *mkdir, *nextmd; 9296 struct ufsmount *ump; 9297 9298 ump = VFSTOUFS(dap->da_list.wk_mp); 9299 LOCK_OWNED(ump); 9300 LIST_REMOVE(dap, da_pdlist); 9301 if (dap->da_state & ONWORKLIST) 9302 WORKLIST_REMOVE(&dap->da_list); 9303 if ((dap->da_state & DIRCHG) == 0) { 9304 pagedep = dap->da_pagedep; 9305 } else { 9306 dirrem = dap->da_previous; 9307 pagedep = dirrem->dm_pagedep; 9308 dirrem->dm_dirinum = pagedep->pd_ino; 9309 dirrem->dm_state |= COMPLETE; 9310 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 9311 add_to_worklist(&dirrem->dm_list, 0); 9312 } 9313 if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 9314 0, &inodedep) != 0) 9315 if (inodedep->id_mkdiradd == dap) 9316 inodedep->id_mkdiradd = NULL; 9317 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 9318 for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; 9319 mkdir = nextmd) { 9320 nextmd = LIST_NEXT(mkdir, md_mkdirs); 9321 if (mkdir->md_diradd != dap) 9322 continue; 9323 dap->da_state &= 9324 ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 9325 LIST_REMOVE(mkdir, md_mkdirs); 9326 if (mkdir->md_state & ONWORKLIST) 9327 WORKLIST_REMOVE(&mkdir->md_list); 9328 if (mkdir->md_jaddref != NULL) 9329 panic("free_diradd: Unexpected jaddref"); 9330 WORKITEM_FREE(mkdir, D_MKDIR); 9331 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) 9332 break; 9333 } 9334 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 9335 panic("free_diradd: unfound ref"); 9336 } 9337 if (inodedep) 9338 free_inodedep(inodedep); 9339 /* 9340 * Free any journal segments waiting for the directory write. 9341 */ 9342 handle_jwork(&dap->da_jwork); 9343 WORKITEM_FREE(dap, D_DIRADD); 9344 } 9345 9346 /* 9347 * Directory entry removal dependencies. 9348 * 9349 * When removing a directory entry, the entry's inode pointer must be 9350 * zero'ed on disk before the corresponding inode's link count is decremented 9351 * (possibly freeing the inode for re-use). This dependency is handled by 9352 * updating the directory entry but delaying the inode count reduction until 9353 * after the directory block has been written to disk. After this point, the 9354 * inode count can be decremented whenever it is convenient. 9355 */ 9356 9357 /* 9358 * This routine should be called immediately after removing 9359 * a directory entry. The inode's link count should not be 9360 * decremented by the calling procedure -- the soft updates 9361 * code will do this task when it is safe. 9362 */ 9363 void 9364 softdep_setup_remove(bp, dp, ip, isrmdir) 9365 struct buf *bp; /* buffer containing directory block */ 9366 struct inode *dp; /* inode for the directory being modified */ 9367 struct inode *ip; /* inode for directory entry being removed */ 9368 int isrmdir; /* indicates if doing RMDIR */ 9369 { 9370 struct dirrem *dirrem, *prevdirrem; 9371 struct inodedep *inodedep; 9372 struct ufsmount *ump; 9373 int direct; 9374 9375 ump = ITOUMP(ip); 9376 KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, 9377 ("softdep_setup_remove called on non-softdep filesystem")); 9378 /* 9379 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want 9380 * newdirrem() to setup the full directory remove which requires 9381 * isrmdir > 1. 
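 * newdirrem() is entered without the per-mount lock and returns with
 * it held, so everything below runs under that lock until the
 * FREE_LOCK() calls.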
9382 */ 9383 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 9384 /* 9385 * Add the dirrem to the inodedep's pending remove list for quick 9386 * discovery later. 9387 */ 9388 if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) 9389 panic("softdep_setup_remove: Lost inodedep."); 9390 KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); 9391 dirrem->dm_state |= ONDEPLIST; 9392 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 9393 9394 /* 9395 * If the COMPLETE flag is clear, then there were no active 9396 * entries and we want to roll back to a zeroed entry until 9397 * the new inode is committed to disk. If the COMPLETE flag is 9398 * set then we have deleted an entry that never made it to 9399 * disk. If the entry we deleted resulted from a name change, 9400 * then the old name still resides on disk. We cannot delete 9401 * its inode (returned to us in prevdirrem) until the zeroed 9402 * directory entry gets to disk. The new inode has never been 9403 * referenced on the disk, so can be deleted immediately. 9404 */ 9405 if ((dirrem->dm_state & COMPLETE) == 0) { 9406 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, 9407 dm_next); 9408 FREE_LOCK(ump); 9409 } else { 9410 if (prevdirrem != NULL) 9411 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, 9412 prevdirrem, dm_next); 9413 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; 9414 direct = LIST_EMPTY(&dirrem->dm_jremrefhd); 9415 FREE_LOCK(ump); 9416 if (direct) 9417 handle_workitem_remove(dirrem, 0); 9418 } 9419 } 9420 9421 /* 9422 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the 9423 * pd_pendinghd list of a pagedep. 9424 */ 9425 static struct diradd * 9426 diradd_lookup(pagedep, offset) 9427 struct pagedep *pagedep; 9428 int offset; 9429 { 9430 struct diradd *dap; 9431 9432 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) 9433 if (dap->da_offset == offset) 9434 return (dap); 9435 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 9436 if (dap->da_offset == offset) 9437 return (dap); 9438 return (NULL); 9439 } 9440 9441 /* 9442 * Search for a .. diradd dependency in a directory that is being removed. 9443 * If the directory was renamed to a new parent we have a diradd rather 9444 * than a mkdir for the .. entry. We need to cancel it now before 9445 * it is found in truncate(). 9446 */ 9447 static struct jremref * 9448 cancel_diradd_dotdot(ip, dirrem, jremref) 9449 struct inode *ip; 9450 struct dirrem *dirrem; 9451 struct jremref *jremref; 9452 { 9453 struct pagedep *pagedep; 9454 struct diradd *dap; 9455 struct worklist *wk; 9456 9457 if (pagedep_lookup(ITOVFS(ip), NULL, ip->i_number, 0, 0, &pagedep) == 0) 9458 return (jremref); 9459 dap = diradd_lookup(pagedep, DOTDOT_OFFSET); 9460 if (dap == NULL) 9461 return (jremref); 9462 cancel_diradd(dap, dirrem, jremref, NULL, NULL); 9463 /* 9464 * Mark any journal work as belonging to the parent so it is freed 9465 * with the .. reference. 9466 */ 9467 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 9468 wk->wk_state |= MKDIR_PARENT; 9469 return (NULL); 9470 } 9471 9472 /* 9473 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to 9474 * replace it with a dirrem/diradd pair as a result of re-parenting a 9475 * directory. This ensures that we don't simultaneously have a mkdir and 9476 * a diradd for the same .. entry. 
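 * Returns NULL when the supplied jremref was consumed by journaling
 * the removal against the canceled add; otherwise the jremref is
 * handed back to the caller for later use.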
9477 */ 9478 static struct jremref * 9479 cancel_mkdir_dotdot(ip, dirrem, jremref) 9480 struct inode *ip; 9481 struct dirrem *dirrem; 9482 struct jremref *jremref; 9483 { 9484 struct inodedep *inodedep; 9485 struct jaddref *jaddref; 9486 struct ufsmount *ump; 9487 struct mkdir *mkdir; 9488 struct diradd *dap; 9489 struct mount *mp; 9490 9491 mp = ITOVFS(ip); 9492 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 9493 return (jremref); 9494 dap = inodedep->id_mkdiradd; 9495 if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) 9496 return (jremref); 9497 ump = VFSTOUFS(inodedep->id_list.wk_mp); 9498 for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; 9499 mkdir = LIST_NEXT(mkdir, md_mkdirs)) 9500 if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) 9501 break; 9502 if (mkdir == NULL) 9503 panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); 9504 if ((jaddref = mkdir->md_jaddref) != NULL) { 9505 mkdir->md_jaddref = NULL; 9506 jaddref->ja_state &= ~MKDIR_PARENT; 9507 if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0) 9508 panic("cancel_mkdir_dotdot: Lost parent inodedep"); 9509 if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { 9510 journal_jremref(dirrem, jremref, inodedep); 9511 jremref = NULL; 9512 } 9513 } 9514 if (mkdir->md_state & ONWORKLIST) 9515 WORKLIST_REMOVE(&mkdir->md_list); 9516 mkdir->md_state |= ALLCOMPLETE; 9517 complete_mkdir(mkdir); 9518 return (jremref); 9519 } 9520 9521 static void 9522 journal_jremref(dirrem, jremref, inodedep) 9523 struct dirrem *dirrem; 9524 struct jremref *jremref; 9525 struct inodedep *inodedep; 9526 { 9527 9528 if (inodedep == NULL) 9529 if (inodedep_lookup(jremref->jr_list.wk_mp, 9530 jremref->jr_ref.if_ino, 0, &inodedep) == 0) 9531 panic("journal_jremref: Lost inodedep"); 9532 LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); 9533 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 9534 add_to_journal(&jremref->jr_list); 9535 } 9536 9537 static void 9538 dirrem_journal(dirrem, jremref, dotremref, dotdotremref) 9539 struct dirrem *dirrem; 9540 struct jremref *jremref; 9541 struct jremref *dotremref; 9542 struct jremref *dotdotremref; 9543 { 9544 struct inodedep *inodedep; 9545 9546 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, 9547 &inodedep) == 0) 9548 panic("dirrem_journal: Lost inodedep"); 9549 journal_jremref(dirrem, jremref, inodedep); 9550 if (dotremref) 9551 journal_jremref(dirrem, dotremref, inodedep); 9552 if (dotdotremref) 9553 journal_jremref(dirrem, dotdotremref, NULL); 9554 } 9555 9556 /* 9557 * Allocate a new dirrem if appropriate and return it along with 9558 * its associated pagedep. Called without a lock, returns with lock. 9559 */ 9560 static struct dirrem * 9561 newdirrem(bp, dp, ip, isrmdir, prevdirremp) 9562 struct buf *bp; /* buffer containing directory block */ 9563 struct inode *dp; /* inode for the directory being modified */ 9564 struct inode *ip; /* inode for directory entry being removed */ 9565 int isrmdir; /* indicates if doing RMDIR */ 9566 struct dirrem **prevdirremp; /* previously referenced inode, if any */ 9567 { 9568 int offset; 9569 ufs_lbn_t lbn; 9570 struct diradd *dap; 9571 struct dirrem *dirrem; 9572 struct pagedep *pagedep; 9573 struct jremref *jremref; 9574 struct jremref *dotremref; 9575 struct jremref *dotdotremref; 9576 struct vnode *dvp; 9577 struct ufsmount *ump; 9578 9579 /* 9580 * Whiteouts have no deletion dependencies. 
9581 */ 9582 if (ip == NULL) 9583 panic("newdirrem: whiteout"); 9584 dvp = ITOV(dp); 9585 ump = ITOUMP(dp); 9586 9587 /* 9588 * If the system is over its limit and our filesystem is 9589 * responsible for more than our share of that usage and 9590 * we are not a snapshot, request some inodedep cleanup. 9591 * Limiting the number of dirrem structures will also limit 9592 * the number of freefile and freeblks structures. 9593 */ 9594 ACQUIRE_LOCK(ump); 9595 if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM)) 9596 schedule_cleanup(UFSTOVFS(ump)); 9597 else 9598 FREE_LOCK(ump); 9599 dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS | 9600 M_ZERO); 9601 workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); 9602 LIST_INIT(&dirrem->dm_jremrefhd); 9603 LIST_INIT(&dirrem->dm_jwork); 9604 dirrem->dm_state = isrmdir ? RMDIR : 0; 9605 dirrem->dm_oldinum = ip->i_number; 9606 *prevdirremp = NULL; 9607 /* 9608 * Allocate remove reference structures to track journal write 9609 * dependencies. We will always have one for the link and 9610 * when doing directories we will always have one more for dot. 9611 * When renaming a directory we skip the dotdot link change so 9612 * this is not needed. 9613 */ 9614 jremref = dotremref = dotdotremref = NULL; 9615 if (DOINGSUJ(dvp)) { 9616 if (isrmdir) { 9617 jremref = newjremref(dirrem, dp, ip, I_OFFSET(dp), 9618 ip->i_effnlink + 2); 9619 dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, 9620 ip->i_effnlink + 1); 9621 dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, 9622 dp->i_effnlink + 1); 9623 dotdotremref->jr_state |= MKDIR_PARENT; 9624 } else 9625 jremref = newjremref(dirrem, dp, ip, I_OFFSET(dp), 9626 ip->i_effnlink + 1); 9627 } 9628 ACQUIRE_LOCK(ump); 9629 lbn = lblkno(ump->um_fs, I_OFFSET(dp)); 9630 offset = blkoff(ump->um_fs, I_OFFSET(dp)); 9631 pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC, 9632 &pagedep); 9633 dirrem->dm_pagedep = pagedep; 9634 dirrem->dm_offset = offset; 9635 /* 9636 * If we're renaming a .. link to a new directory, cancel any 9637 * existing MKDIR_PARENT mkdir. If it has already been canceled 9638 * the jremref is preserved for any potential diradd in this 9639 * location. This can not coincide with a rmdir. 9640 */ 9641 if (I_OFFSET(dp) == DOTDOT_OFFSET) { 9642 if (isrmdir) 9643 panic("newdirrem: .. directory change during remove?"); 9644 jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); 9645 } 9646 /* 9647 * If we're removing a directory search for the .. dependency now and 9648 * cancel it. Any pending journal work will be added to the dirrem 9649 * to be completed when the workitem remove completes. 9650 */ 9651 if (isrmdir) 9652 dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); 9653 /* 9654 * Check for a diradd dependency for the same directory entry. 9655 * If present, then both dependencies become obsolete and can 9656 * be de-allocated. 9657 */ 9658 dap = diradd_lookup(pagedep, offset); 9659 if (dap == NULL) { 9660 /* 9661 * Link the jremref structures into the dirrem so they are 9662 * written prior to the pagedep. 9663 */ 9664 if (jremref) 9665 dirrem_journal(dirrem, jremref, dotremref, 9666 dotdotremref); 9667 return (dirrem); 9668 } 9669 /* 9670 * Must be ATTACHED at this point. 
9671 */ 9672 if ((dap->da_state & ATTACHED) == 0) 9673 panic("newdirrem: not ATTACHED"); 9674 if (dap->da_newinum != ip->i_number) 9675 panic("newdirrem: inum %ju should be %ju", 9676 (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum); 9677 /* 9678 * If we are deleting a changed name that never made it to disk, 9679 * then return the dirrem describing the previous inode (which 9680 * represents the inode currently referenced from this entry on disk). 9681 */ 9682 if ((dap->da_state & DIRCHG) != 0) { 9683 *prevdirremp = dap->da_previous; 9684 dap->da_state &= ~DIRCHG; 9685 dap->da_pagedep = pagedep; 9686 } 9687 /* 9688 * We are deleting an entry that never made it to disk. 9689 * Mark it COMPLETE so we can delete its inode immediately. 9690 */ 9691 dirrem->dm_state |= COMPLETE; 9692 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); 9693 #ifdef INVARIANTS 9694 if (isrmdir == 0) { 9695 struct worklist *wk; 9696 9697 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 9698 if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) 9699 panic("bad wk %p (0x%X)\n", wk, wk->wk_state); 9700 } 9701 #endif 9702 9703 return (dirrem); 9704 } 9705 9706 /* 9707 * Directory entry change dependencies. 9708 * 9709 * Changing an existing directory entry requires that an add operation 9710 * be completed first followed by a deletion. The semantics for the addition 9711 * are identical to the description of adding a new entry above except 9712 * that the rollback is to the old inode number rather than zero. Once 9713 * the addition dependency is completed, the removal is done as described 9714 * in the removal routine above. 9715 */ 9716 9717 /* 9718 * This routine should be called immediately after changing 9719 * a directory entry. The inode's link count should not be 9720 * decremented by the calling procedure -- the soft updates 9721 * code will perform this task when it is safe. 9722 */ 9723 void 9724 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 9725 struct buf *bp; /* buffer containing directory block */ 9726 struct inode *dp; /* inode for the directory being modified */ 9727 struct inode *ip; /* inode for directory entry being removed */ 9728 ino_t newinum; /* new inode number for changed entry */ 9729 int isrmdir; /* indicates if doing RMDIR */ 9730 { 9731 int offset; 9732 struct diradd *dap = NULL; 9733 struct dirrem *dirrem, *prevdirrem; 9734 struct pagedep *pagedep; 9735 struct inodedep *inodedep; 9736 struct jaddref *jaddref; 9737 struct mount *mp; 9738 struct ufsmount *ump; 9739 9740 mp = ITOVFS(dp); 9741 ump = VFSTOUFS(mp); 9742 offset = blkoff(ump->um_fs, I_OFFSET(dp)); 9743 KASSERT(MOUNTEDSOFTDEP(mp) != 0, 9744 ("softdep_setup_directory_change called on non-softdep filesystem")); 9745 9746 /* 9747 * Whiteouts do not need diradd dependencies. 9748 */ 9749 if (newinum != UFS_WINO) { 9750 dap = malloc(sizeof(struct diradd), 9751 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); 9752 workitem_alloc(&dap->da_list, D_DIRADD, mp); 9753 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; 9754 dap->da_offset = offset; 9755 dap->da_newinum = newinum; 9756 LIST_INIT(&dap->da_jwork); 9757 } 9758 9759 /* 9760 * Allocate a new dirrem and ACQUIRE_LOCK. 
9761 */ 9762 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 9763 pagedep = dirrem->dm_pagedep; 9764 /* 9765 * The possible values for isrmdir: 9766 * 0 - non-directory file rename 9767 * 1 - directory rename within same directory 9768 * inum - directory rename to new directory of given inode number 9769 * When renaming to a new directory, we are both deleting and 9770 * creating a new directory entry, so the link count on the new 9771 * directory should not change. Thus we do not need the followup 9772 * dirrem which is usually done in handle_workitem_remove. We set 9773 * the DIRCHG flag to tell handle_workitem_remove to skip the 9774 * followup dirrem. 9775 */ 9776 if (isrmdir > 1) 9777 dirrem->dm_state |= DIRCHG; 9778 9779 /* 9780 * Whiteouts have no additional dependencies, 9781 * so just put the dirrem on the correct list. 9782 */ 9783 if (newinum == UFS_WINO) { 9784 if ((dirrem->dm_state & COMPLETE) == 0) { 9785 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, 9786 dm_next); 9787 } else { 9788 dirrem->dm_dirinum = pagedep->pd_ino; 9789 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 9790 add_to_worklist(&dirrem->dm_list, 0); 9791 } 9792 FREE_LOCK(ump); 9793 return; 9794 } 9795 /* 9796 * Add the dirrem to the inodedep's pending remove list for quick 9797 * discovery later. A valid nlinkdelta ensures that this lookup 9798 * will not fail. 9799 */ 9800 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 9801 panic("softdep_setup_directory_change: Lost inodedep."); 9802 dirrem->dm_state |= ONDEPLIST; 9803 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 9804 9805 /* 9806 * If the COMPLETE flag is clear, then there were no active 9807 * entries and we want to roll back to the previous inode until 9808 * the new inode is committed to disk. If the COMPLETE flag is 9809 * set, then we have deleted an entry that never made it to disk. 9810 * If the entry we deleted resulted from a name change, then the old 9811 * inode reference still resides on disk. Any rollback that we do 9812 * needs to be to that old inode (returned to us in prevdirrem). If 9813 * the entry we deleted resulted from a create, then there is 9814 * no entry on the disk, so we want to roll back to zero rather 9815 * than the uncommitted inode. In either of the COMPLETE cases we 9816 * want to immediately free the unwritten and unreferenced inode. 9817 */ 9818 if ((dirrem->dm_state & COMPLETE) == 0) { 9819 dap->da_previous = dirrem; 9820 } else { 9821 if (prevdirrem != NULL) { 9822 dap->da_previous = prevdirrem; 9823 } else { 9824 dap->da_state &= ~DIRCHG; 9825 dap->da_pagedep = pagedep; 9826 } 9827 dirrem->dm_dirinum = pagedep->pd_ino; 9828 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 9829 add_to_worklist(&dirrem->dm_list, 0); 9830 } 9831 /* 9832 * Lookup the jaddref for this journal entry. We must finish 9833 * initializing it and make the diradd write dependent on it. 9834 * If we're not journaling, put it on the id_bufwait list if the 9835 * inode is not yet written. If it is written, do the post-inode 9836 * write processing to put it on the id_pendinghd list. 
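 * The three cases below mirror softdep_setup_directory_add(): journaled
 * mounts hang the diradd off the jaddref, fully written inodes go
 * straight onto the pending lists, and anything else waits on
 * id_bufwait for the inode block write.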
9837 */ 9838 inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); 9839 if (MOUNTEDSUJ(mp)) { 9840 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 9841 inoreflst); 9842 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 9843 ("softdep_setup_directory_change: bad jaddref %p", 9844 jaddref)); 9845 jaddref->ja_diroff = I_OFFSET(dp); 9846 jaddref->ja_diradd = dap; 9847 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], 9848 dap, da_pdlist); 9849 add_to_journal(&jaddref->ja_list); 9850 } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 9851 dap->da_state |= COMPLETE; 9852 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 9853 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 9854 } else { 9855 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], 9856 dap, da_pdlist); 9857 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 9858 } 9859 /* 9860 * If we're making a new name for a directory that has not been 9861 * committed when need to move the dot and dotdot references to 9862 * this new name. 9863 */ 9864 if (inodedep->id_mkdiradd && I_OFFSET(dp) != DOTDOT_OFFSET) 9865 merge_diradd(inodedep, dap); 9866 FREE_LOCK(ump); 9867 } 9868 9869 /* 9870 * Called whenever the link count on an inode is changed. 9871 * It creates an inode dependency so that the new reference(s) 9872 * to the inode cannot be committed to disk until the updated 9873 * inode has been written. 9874 */ 9875 void 9876 softdep_change_linkcnt(ip) 9877 struct inode *ip; /* the inode with the increased link count */ 9878 { 9879 struct inodedep *inodedep; 9880 struct ufsmount *ump; 9881 9882 ump = ITOUMP(ip); 9883 KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, 9884 ("softdep_change_linkcnt called on non-softdep filesystem")); 9885 ACQUIRE_LOCK(ump); 9886 inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep); 9887 if (ip->i_nlink < ip->i_effnlink) 9888 panic("softdep_change_linkcnt: bad delta"); 9889 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 9890 FREE_LOCK(ump); 9891 } 9892 9893 /* 9894 * Attach a sbdep dependency to the superblock buf so that we can keep 9895 * track of the head of the linked list of referenced but unlinked inodes. 9896 */ 9897 void 9898 softdep_setup_sbupdate(ump, fs, bp) 9899 struct ufsmount *ump; 9900 struct fs *fs; 9901 struct buf *bp; 9902 { 9903 struct sbdep *sbdep; 9904 struct worklist *wk; 9905 9906 KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, 9907 ("softdep_setup_sbupdate called on non-softdep filesystem")); 9908 LIST_FOREACH(wk, &bp->b_dep, wk_list) 9909 if (wk->wk_type == D_SBDEP) 9910 break; 9911 if (wk != NULL) 9912 return; 9913 sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS); 9914 workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump)); 9915 sbdep->sb_fs = fs; 9916 sbdep->sb_ump = ump; 9917 ACQUIRE_LOCK(ump); 9918 WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list); 9919 FREE_LOCK(ump); 9920 } 9921 9922 /* 9923 * Return the first unlinked inodedep which is ready to be the head of the 9924 * list. The inodedep and all those after it must have valid next pointers. 
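 * Roughly, UNLINKNEXT means an inodedep's on-disk di_freelink already
 * names the next element and UNLINKPREV means the element ahead of it
 * (another inode or the superblock's fs_sujfree head) already names it,
 * so the head returned here is the earliest element from which the
 * on-disk chain is unbroken.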
9925 */ 9926 static struct inodedep * 9927 first_unlinked_inodedep(ump) 9928 struct ufsmount *ump; 9929 { 9930 struct inodedep *inodedep; 9931 struct inodedep *idp; 9932 9933 LOCK_OWNED(ump); 9934 for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); 9935 inodedep; inodedep = idp) { 9936 if ((inodedep->id_state & UNLINKNEXT) == 0) 9937 return (NULL); 9938 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 9939 if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) 9940 break; 9941 if ((inodedep->id_state & UNLINKPREV) == 0) 9942 break; 9943 } 9944 return (inodedep); 9945 } 9946 9947 /* 9948 * Set the sujfree unlinked head pointer prior to writing a superblock. 9949 */ 9950 static void 9951 initiate_write_sbdep(sbdep) 9952 struct sbdep *sbdep; 9953 { 9954 struct inodedep *inodedep; 9955 struct fs *bpfs; 9956 struct fs *fs; 9957 9958 bpfs = sbdep->sb_fs; 9959 fs = sbdep->sb_ump->um_fs; 9960 inodedep = first_unlinked_inodedep(sbdep->sb_ump); 9961 if (inodedep) { 9962 fs->fs_sujfree = inodedep->id_ino; 9963 inodedep->id_state |= UNLINKPREV; 9964 } else 9965 fs->fs_sujfree = 0; 9966 bpfs->fs_sujfree = fs->fs_sujfree; 9967 /* 9968 * Because we have made changes to the superblock, we need to 9969 * recompute its check-hash. 9970 */ 9971 bpfs->fs_ckhash = ffs_calc_sbhash(bpfs); 9972 } 9973 9974 /* 9975 * After a superblock is written determine whether it must be written again 9976 * due to a changing unlinked list head. 9977 */ 9978 static int 9979 handle_written_sbdep(sbdep, bp) 9980 struct sbdep *sbdep; 9981 struct buf *bp; 9982 { 9983 struct inodedep *inodedep; 9984 struct fs *fs; 9985 9986 LOCK_OWNED(sbdep->sb_ump); 9987 fs = sbdep->sb_fs; 9988 /* 9989 * If the superblock doesn't match the in-memory list start over. 9990 */ 9991 inodedep = first_unlinked_inodedep(sbdep->sb_ump); 9992 if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || 9993 (inodedep == NULL && fs->fs_sujfree != 0)) { 9994 bdirty(bp); 9995 return (1); 9996 } 9997 WORKITEM_FREE(sbdep, D_SBDEP); 9998 if (fs->fs_sujfree == 0) 9999 return (0); 10000 /* 10001 * Now that we have a record of this inode in stable store allow it 10002 * to be written to free up pending work. Inodes may see a lot of 10003 * write activity after they are unlinked which we must not hold up. 10004 */ 10005 for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { 10006 if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) 10007 panic("handle_written_sbdep: Bad inodedep %p (0x%X)", 10008 inodedep, inodedep->id_state); 10009 if (inodedep->id_state & UNLINKONLIST) 10010 break; 10011 inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST; 10012 } 10013 10014 return (0); 10015 } 10016 10017 /* 10018 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list. 10019 */ 10020 static void 10021 unlinked_inodedep(mp, inodedep) 10022 struct mount *mp; 10023 struct inodedep *inodedep; 10024 { 10025 struct ufsmount *ump; 10026 10027 ump = VFSTOUFS(mp); 10028 LOCK_OWNED(ump); 10029 if (MOUNTEDSUJ(mp) == 0) 10030 return; 10031 ump->um_fs->fs_fmod = 1; 10032 if (inodedep->id_state & UNLINKED) 10033 panic("unlinked_inodedep: %p already unlinked\n", inodedep); 10034 inodedep->id_state |= UNLINKED; 10035 TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); 10036 } 10037 10038 /* 10039 * Remove an inodedep from the unlinked inodedep list. This may require 10040 * disk writes if the inode has made it that far. 
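 * Unlinking an element from the middle of the on-disk chain means
 * rewriting its predecessor's di_freelink (or the superblock's
 * fs_sujfree head) so that this inode is skipped; arranging those
 * writes is what the loop below spends most of its effort on.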
10041 */ 10042 static void 10043 clear_unlinked_inodedep(inodedep) 10044 struct inodedep *inodedep; 10045 { 10046 struct ufs2_dinode *dip; 10047 struct ufsmount *ump; 10048 struct inodedep *idp; 10049 struct inodedep *idn; 10050 struct fs *fs, *bpfs; 10051 struct buf *bp; 10052 daddr_t dbn; 10053 ino_t ino; 10054 ino_t nino; 10055 ino_t pino; 10056 int error; 10057 10058 ump = VFSTOUFS(inodedep->id_list.wk_mp); 10059 fs = ump->um_fs; 10060 ino = inodedep->id_ino; 10061 error = 0; 10062 for (;;) { 10063 LOCK_OWNED(ump); 10064 KASSERT((inodedep->id_state & UNLINKED) != 0, 10065 ("clear_unlinked_inodedep: inodedep %p not unlinked", 10066 inodedep)); 10067 /* 10068 * If nothing has yet been written simply remove us from 10069 * the in memory list and return. This is the most common 10070 * case where handle_workitem_remove() loses the final 10071 * reference. 10072 */ 10073 if ((inodedep->id_state & UNLINKLINKS) == 0) 10074 break; 10075 /* 10076 * If we have a NEXT pointer and no PREV pointer we can simply 10077 * clear NEXT's PREV and remove ourselves from the list. Be 10078 * careful not to clear PREV if the superblock points at 10079 * next as well. 10080 */ 10081 idn = TAILQ_NEXT(inodedep, id_unlinked); 10082 if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { 10083 if (idn && fs->fs_sujfree != idn->id_ino) 10084 idn->id_state &= ~UNLINKPREV; 10085 break; 10086 } 10087 /* 10088 * Here we have an inodedep which is actually linked into 10089 * the list. We must remove it by forcing a write to the 10090 * link before us, whether it be the superblock or an inode. 10091 * Unfortunately the list may change while we're waiting 10092 * on the buf lock for either resource so we must loop until 10093 * we lock the right one. If both the superblock and an 10094 * inode point to this inode we must clear the inode first 10095 * followed by the superblock. 10096 */ 10097 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 10098 pino = 0; 10099 if (idp && (idp->id_state & UNLINKNEXT)) 10100 pino = idp->id_ino; 10101 FREE_LOCK(ump); 10102 if (pino == 0) { 10103 bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), 10104 (int)fs->fs_sbsize, 0, 0, 0); 10105 } else { 10106 dbn = fsbtodb(fs, ino_to_fsba(fs, pino)); 10107 error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, 10108 (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, 10109 &bp); 10110 } 10111 ACQUIRE_LOCK(ump); 10112 if (error) 10113 break; 10114 /* If the list has changed restart the loop. */ 10115 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 10116 nino = 0; 10117 if (idp && (idp->id_state & UNLINKNEXT)) 10118 nino = idp->id_ino; 10119 if (nino != pino || 10120 (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { 10121 FREE_LOCK(ump); 10122 brelse(bp); 10123 ACQUIRE_LOCK(ump); 10124 continue; 10125 } 10126 nino = 0; 10127 idn = TAILQ_NEXT(inodedep, id_unlinked); 10128 if (idn) 10129 nino = idn->id_ino; 10130 /* 10131 * Remove us from the in memory list. After this we cannot 10132 * access the inodedep. 10133 */ 10134 KASSERT((inodedep->id_state & UNLINKED) != 0, 10135 ("clear_unlinked_inodedep: inodedep %p not unlinked", 10136 inodedep)); 10137 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST); 10138 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); 10139 FREE_LOCK(ump); 10140 /* 10141 * The predecessor's next pointer is manually updated here 10142 * so that the NEXT flag is never cleared for an element 10143 * that is in the list. 
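 * When the predecessor is the superblock a fresh copy of the in-core
 * fs is written out; when it is another inode only that inode's
 * di_freelink (and, for UFS2, its check-hash) is patched in the buffer
 * before the synchronous bwrite().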
10144 */ 10145 if (pino == 0) { 10146 bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); 10147 bpfs = (struct fs *)bp->b_data; 10148 ffs_oldfscompat_write(bpfs, ump); 10149 softdep_setup_sbupdate(ump, bpfs, bp); 10150 /* 10151 * Because we may have made changes to the superblock, 10152 * we need to recompute its check-hash. 10153 */ 10154 bpfs->fs_ckhash = ffs_calc_sbhash(bpfs); 10155 } else if (fs->fs_magic == FS_UFS1_MAGIC) { 10156 ((struct ufs1_dinode *)bp->b_data + 10157 ino_to_fsbo(fs, pino))->di_freelink = nino; 10158 } else { 10159 dip = (struct ufs2_dinode *)bp->b_data + 10160 ino_to_fsbo(fs, pino); 10161 dip->di_freelink = nino; 10162 ffs_update_dinode_ckhash(fs, dip); 10163 } 10164 /* 10165 * If the bwrite fails we have no recourse to recover. The 10166 * filesystem is corrupted already. 10167 */ 10168 bwrite(bp); 10169 ACQUIRE_LOCK(ump); 10170 /* 10171 * If the superblock pointer still needs to be cleared force 10172 * a write here. 10173 */ 10174 if (fs->fs_sujfree == ino) { 10175 FREE_LOCK(ump); 10176 bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), 10177 (int)fs->fs_sbsize, 0, 0, 0); 10178 bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); 10179 bpfs = (struct fs *)bp->b_data; 10180 ffs_oldfscompat_write(bpfs, ump); 10181 softdep_setup_sbupdate(ump, bpfs, bp); 10182 /* 10183 * Because we may have made changes to the superblock, 10184 * we need to recompute its check-hash. 10185 */ 10186 bpfs->fs_ckhash = ffs_calc_sbhash(bpfs); 10187 bwrite(bp); 10188 ACQUIRE_LOCK(ump); 10189 } 10190 10191 if (fs->fs_sujfree != ino) 10192 return; 10193 panic("clear_unlinked_inodedep: Failed to clear free head"); 10194 } 10195 if (inodedep->id_ino == fs->fs_sujfree) 10196 panic("clear_unlinked_inodedep: Freeing head of free list"); 10197 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST); 10198 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); 10199 return; 10200 } 10201 10202 /* 10203 * This workitem decrements the inode's link count. 10204 * If the link count reaches zero, the file is removed. 10205 */ 10206 static int 10207 handle_workitem_remove(dirrem, flags) 10208 struct dirrem *dirrem; 10209 int flags; 10210 { 10211 struct inodedep *inodedep; 10212 struct workhead dotdotwk; 10213 struct worklist *wk; 10214 struct ufsmount *ump; 10215 struct mount *mp; 10216 struct vnode *vp; 10217 struct inode *ip; 10218 ino_t oldinum; 10219 10220 if (dirrem->dm_state & ONWORKLIST) 10221 panic("handle_workitem_remove: dirrem %p still on worklist", 10222 dirrem); 10223 oldinum = dirrem->dm_oldinum; 10224 mp = dirrem->dm_list.wk_mp; 10225 ump = VFSTOUFS(mp); 10226 flags |= LK_EXCLUSIVE; 10227 if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ | 10228 FFSV_FORCEINODEDEP) != 0) 10229 return (EBUSY); 10230 ip = VTOI(vp); 10231 MPASS(ip->i_mode != 0); 10232 ACQUIRE_LOCK(ump); 10233 if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) 10234 panic("handle_workitem_remove: lost inodedep"); 10235 if (dirrem->dm_state & ONDEPLIST) 10236 LIST_REMOVE(dirrem, dm_inonext); 10237 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), 10238 ("handle_workitem_remove: Journal entries not written.")); 10239 10240 /* 10241 * Move all dependencies waiting on the remove to complete 10242 * from the dirrem to the inode inowait list to be completed 10243 * after the inode has been updated and written to disk. 10244 * 10245 * Any marked MKDIR_PARENT are saved to be completed when the 10246 * dotdot ref is removed unless DIRCHG is specified. 
For 10247 * directory change operations there will be no further 10248 * directory writes and the jsegdeps need to be moved along 10249 * with the rest to be completed when the inode is free or 10250 * stable in the inode free list. 10251 */ 10252 LIST_INIT(&dotdotwk); 10253 while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { 10254 WORKLIST_REMOVE(wk); 10255 if ((dirrem->dm_state & DIRCHG) == 0 && 10256 wk->wk_state & MKDIR_PARENT) { 10257 wk->wk_state &= ~MKDIR_PARENT; 10258 WORKLIST_INSERT(&dotdotwk, wk); 10259 continue; 10260 } 10261 WORKLIST_INSERT(&inodedep->id_inowait, wk); 10262 } 10263 LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); 10264 /* 10265 * Normal file deletion. 10266 */ 10267 if ((dirrem->dm_state & RMDIR) == 0) { 10268 ip->i_nlink--; 10269 KASSERT(ip->i_nlink >= 0, ("handle_workitem_remove: file ino " 10270 "%ju negative i_nlink %d", (intmax_t)ip->i_number, 10271 ip->i_nlink)); 10272 DIP_SET(ip, i_nlink, ip->i_nlink); 10273 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 10274 if (ip->i_nlink < ip->i_effnlink) 10275 panic("handle_workitem_remove: bad file delta"); 10276 if (ip->i_nlink == 0) 10277 unlinked_inodedep(mp, inodedep); 10278 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 10279 KASSERT(LIST_EMPTY(&dirrem->dm_jwork), 10280 ("handle_workitem_remove: worklist not empty. %s", 10281 TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); 10282 WORKITEM_FREE(dirrem, D_DIRREM); 10283 FREE_LOCK(ump); 10284 goto out; 10285 } 10286 /* 10287 * Directory deletion. Decrement reference count for both the 10288 * just deleted parent directory entry and the reference for ".". 10289 * Arrange to have the reference count on the parent decremented 10290 * to account for the loss of "..". 10291 */ 10292 ip->i_nlink -= 2; 10293 KASSERT(ip->i_nlink >= 0, ("handle_workitem_remove: directory ino " 10294 "%ju negative i_nlink %d", (intmax_t)ip->i_number, ip->i_nlink)); 10295 DIP_SET(ip, i_nlink, ip->i_nlink); 10296 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 10297 if (ip->i_nlink < ip->i_effnlink) 10298 panic("handle_workitem_remove: bad dir delta"); 10299 if (ip->i_nlink == 0) 10300 unlinked_inodedep(mp, inodedep); 10301 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 10302 /* 10303 * Rename a directory to a new parent. Since, we are both deleting 10304 * and creating a new directory entry, the link count on the new 10305 * directory should not change. Thus we skip the followup dirrem. 10306 */ 10307 if (dirrem->dm_state & DIRCHG) { 10308 KASSERT(LIST_EMPTY(&dirrem->dm_jwork), 10309 ("handle_workitem_remove: DIRCHG and worklist not empty.")); 10310 WORKITEM_FREE(dirrem, D_DIRREM); 10311 FREE_LOCK(ump); 10312 goto out; 10313 } 10314 dirrem->dm_state = ONDEPLIST; 10315 dirrem->dm_oldinum = dirrem->dm_dirinum; 10316 /* 10317 * Place the dirrem on the parent's diremhd list. 10318 */ 10319 if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) 10320 panic("handle_workitem_remove: lost dir inodedep"); 10321 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 10322 /* 10323 * If the allocated inode has never been written to disk, then 10324 * the on-disk inode is zero'ed and we can remove the file 10325 * immediately. When journaling if the inode has been marked 10326 * unlinked and not DEPCOMPLETE we know it can never be written. 
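 * In that case the dirrem, redirected above at the parent directory to
 * account for the lost ".." reference, is handled right away by the
 * recursive call below instead of waiting on this inode's id_inowait
 * list.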
10327 */ 10328 inodedep_lookup(mp, oldinum, 0, &inodedep); 10329 if (inodedep == NULL || 10330 (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || 10331 check_inode_unwritten(inodedep)) { 10332 FREE_LOCK(ump); 10333 vput(vp); 10334 return handle_workitem_remove(dirrem, flags); 10335 } 10336 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); 10337 FREE_LOCK(ump); 10338 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 10339 out: 10340 ffs_update(vp, 0); 10341 vput(vp); 10342 return (0); 10343 } 10344 10345 /* 10346 * Inode de-allocation dependencies. 10347 * 10348 * When an inode's link count is reduced to zero, it can be de-allocated. We 10349 * found it convenient to postpone de-allocation until after the inode is 10350 * written to disk with its new link count (zero). At this point, all of the 10351 * on-disk inode's block pointers are nullified and, with careful dependency 10352 * list ordering, all dependencies related to the inode will be satisfied and 10353 * the corresponding dependency structures de-allocated. So, if/when the 10354 * inode is reused, there will be no mixing of old dependencies with new 10355 * ones. This artificial dependency is set up by the block de-allocation 10356 * procedure above (softdep_setup_freeblocks) and completed by the 10357 * following procedure. 10358 */ 10359 static void 10360 handle_workitem_freefile(freefile) 10361 struct freefile *freefile; 10362 { 10363 struct workhead wkhd; 10364 struct fs *fs; 10365 struct ufsmount *ump; 10366 int error; 10367 #ifdef INVARIANTS 10368 struct inodedep *idp; 10369 #endif 10370 10371 ump = VFSTOUFS(freefile->fx_list.wk_mp); 10372 fs = ump->um_fs; 10373 #ifdef INVARIANTS 10374 ACQUIRE_LOCK(ump); 10375 error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); 10376 FREE_LOCK(ump); 10377 if (error) 10378 panic("handle_workitem_freefile: inodedep %p survived", idp); 10379 #endif 10380 UFS_LOCK(ump); 10381 fs->fs_pendinginodes -= 1; 10382 UFS_UNLOCK(ump); 10383 LIST_INIT(&wkhd); 10384 LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); 10385 if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, 10386 freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) 10387 softdep_error("handle_workitem_freefile", error); 10388 ACQUIRE_LOCK(ump); 10389 WORKITEM_FREE(freefile, D_FREEFILE); 10390 FREE_LOCK(ump); 10391 } 10392 10393 /* 10394 * Helper function which unlinks marker element from work list and returns 10395 * the next element on the list. 10396 */ 10397 static __inline struct worklist * 10398 markernext(struct worklist *marker) 10399 { 10400 struct worklist *next; 10401 10402 next = LIST_NEXT(marker, wk_list); 10403 LIST_REMOVE(marker, wk_list); 10404 return next; 10405 } 10406 10407 /* 10408 * Disk writes. 10409 * 10410 * The dependency structures constructed above are most actively used when file 10411 * system blocks are written to disk. No constraints are placed on when a 10412 * block can be written, but unsatisfied update dependencies are made safe by 10413 * modifying (or replacing) the source memory for the duration of the disk 10414 * write. When the disk write completes, the memory block is again brought 10415 * up-to-date. 10416 * 10417 * In-core inode structure reclamation. 10418 * 10419 * Because there are a finite number of "in-core" inode structures, they are 10420 * reused regularly. 
By transferring all inode-related dependencies to the 10421 * in-memory inode block and indexing them separately (via "inodedep"s), we 10422 * can allow "in-core" inode structures to be reused at any time and avoid 10423 * any increase in contention. 10424 * 10425 * Called just before entering the device driver to initiate a new disk I/O. 10426 * The buffer must be locked, thus, no I/O completion operations can occur 10427 * while we are manipulating its associated dependencies. 10428 */ 10429 static void 10430 softdep_disk_io_initiation(bp) 10431 struct buf *bp; /* structure describing disk write to occur */ 10432 { 10433 struct worklist *wk; 10434 struct worklist marker; 10435 struct inodedep *inodedep; 10436 struct freeblks *freeblks; 10437 struct jblkdep *jblkdep; 10438 struct newblk *newblk; 10439 struct ufsmount *ump; 10440 10441 /* 10442 * We only care about write operations. There should never 10443 * be dependencies for reads. 10444 */ 10445 if (bp->b_iocmd != BIO_WRITE) 10446 panic("softdep_disk_io_initiation: not write"); 10447 10448 if (bp->b_vflags & BV_BKGRDINPROG) 10449 panic("softdep_disk_io_initiation: Writing buffer with " 10450 "background write in progress: %p", bp); 10451 10452 ump = softdep_bp_to_mp(bp); 10453 if (ump == NULL) 10454 return; 10455 10456 marker.wk_type = D_LAST + 1; /* Not a normal workitem */ 10457 PHOLD(curproc); /* Don't swap out kernel stack */ 10458 ACQUIRE_LOCK(ump); 10459 /* 10460 * Do any necessary pre-I/O processing. 10461 */ 10462 for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; 10463 wk = markernext(&marker)) { 10464 LIST_INSERT_AFTER(wk, &marker, wk_list); 10465 switch (wk->wk_type) { 10466 case D_PAGEDEP: 10467 initiate_write_filepage(WK_PAGEDEP(wk), bp); 10468 continue; 10469 10470 case D_INODEDEP: 10471 inodedep = WK_INODEDEP(wk); 10472 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) 10473 initiate_write_inodeblock_ufs1(inodedep, bp); 10474 else 10475 initiate_write_inodeblock_ufs2(inodedep, bp); 10476 continue; 10477 10478 case D_INDIRDEP: 10479 initiate_write_indirdep(WK_INDIRDEP(wk), bp); 10480 continue; 10481 10482 case D_BMSAFEMAP: 10483 initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); 10484 continue; 10485 10486 case D_JSEG: 10487 WK_JSEG(wk)->js_buf = NULL; 10488 continue; 10489 10490 case D_FREEBLKS: 10491 freeblks = WK_FREEBLKS(wk); 10492 jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd); 10493 /* 10494 * We have to wait for the freeblks to be journaled 10495 * before we can write an inodeblock with updated 10496 * pointers. Be careful to arrange the marker so 10497 * we revisit the freeblks if it's not removed by 10498 * the first jwait(). 10499 */ 10500 if (jblkdep != NULL) { 10501 LIST_REMOVE(&marker, wk_list); 10502 LIST_INSERT_BEFORE(wk, &marker, wk_list); 10503 jwait(&jblkdep->jb_list, MNT_WAIT); 10504 } 10505 continue; 10506 case D_ALLOCDIRECT: 10507 case D_ALLOCINDIR: 10508 /* 10509 * We have to wait for the jnewblk to be journaled 10510 * before we can write to a block if the contents 10511 * may be confused with an earlier file's indirect 10512 * at recovery time. Handle the marker as described 10513 * above. 
10514 */ 10515 newblk = WK_NEWBLK(wk); 10516 if (newblk->nb_jnewblk != NULL && 10517 indirblk_lookup(newblk->nb_list.wk_mp, 10518 newblk->nb_newblkno)) { 10519 LIST_REMOVE(&marker, wk_list); 10520 LIST_INSERT_BEFORE(wk, &marker, wk_list); 10521 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 10522 } 10523 continue; 10524 10525 case D_SBDEP: 10526 initiate_write_sbdep(WK_SBDEP(wk)); 10527 continue; 10528 10529 case D_MKDIR: 10530 case D_FREEWORK: 10531 case D_FREEDEP: 10532 case D_JSEGDEP: 10533 continue; 10534 10535 default: 10536 panic("handle_disk_io_initiation: Unexpected type %s", 10537 TYPENAME(wk->wk_type)); 10538 /* NOTREACHED */ 10539 } 10540 } 10541 FREE_LOCK(ump); 10542 PRELE(curproc); /* Allow swapout of kernel stack */ 10543 } 10544 10545 /* 10546 * Called from within the procedure above to deal with unsatisfied 10547 * allocation dependencies in a directory. The buffer must be locked, 10548 * thus, no I/O completion operations can occur while we are 10549 * manipulating its associated dependencies. 10550 */ 10551 static void 10552 initiate_write_filepage(pagedep, bp) 10553 struct pagedep *pagedep; 10554 struct buf *bp; 10555 { 10556 struct jremref *jremref; 10557 struct jmvref *jmvref; 10558 struct dirrem *dirrem; 10559 struct diradd *dap; 10560 struct direct *ep; 10561 int i; 10562 10563 if (pagedep->pd_state & IOSTARTED) { 10564 /* 10565 * This can only happen if there is a driver that does not 10566 * understand chaining. Here biodone will reissue the call 10567 * to strategy for the incomplete buffers. 10568 */ 10569 printf("initiate_write_filepage: already started\n"); 10570 return; 10571 } 10572 pagedep->pd_state |= IOSTARTED; 10573 /* 10574 * Wait for all journal remove dependencies to hit the disk. 10575 * We can not allow any potentially conflicting directory adds 10576 * to be visible before removes and rollback is too difficult. 10577 * The per-filesystem lock may be dropped and re-acquired, however 10578 * we hold the buf locked so the dependency can not go away. 10579 */ 10580 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) 10581 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) 10582 jwait(&jremref->jr_list, MNT_WAIT); 10583 while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) 10584 jwait(&jmvref->jm_list, MNT_WAIT); 10585 for (i = 0; i < DAHASHSZ; i++) { 10586 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { 10587 ep = (struct direct *) 10588 ((char *)bp->b_data + dap->da_offset); 10589 if (ep->d_ino != dap->da_newinum) 10590 panic("%s: dir inum %ju != new %ju", 10591 "initiate_write_filepage", 10592 (uintmax_t)ep->d_ino, 10593 (uintmax_t)dap->da_newinum); 10594 if (dap->da_state & DIRCHG) 10595 ep->d_ino = dap->da_previous->dm_oldinum; 10596 else 10597 ep->d_ino = 0; 10598 dap->da_state &= ~ATTACHED; 10599 dap->da_state |= UNDONE; 10600 } 10601 } 10602 } 10603 10604 /* 10605 * Version of initiate_write_inodeblock that handles UFS1 dinodes. 10606 * Note that any bug fixes made to this routine must be done in the 10607 * version found below. 10608 * 10609 * Called from within the procedure above to deal with unsatisfied 10610 * allocation dependencies in an inodeblock. The buffer must be 10611 * locked, thus, no I/O completion operations can occur while we 10612 * are manipulating its associated dependencies. 
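 * The rollback strategy is to let the on-disk copy claim no more than
 * has safely reached the disk: an inode whose bitmap is still
 * unwritten is saved aside and zeroed entirely, unfinished direct and
 * indirect block pointers are reverted to their old values, and the
 * recorded size is trimmed to match.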
10613 */ 10614 static void 10615 initiate_write_inodeblock_ufs1(inodedep, bp) 10616 struct inodedep *inodedep; 10617 struct buf *bp; /* The inode block */ 10618 { 10619 struct allocdirect *adp, *lastadp; 10620 struct ufs1_dinode *dp; 10621 struct ufs1_dinode *sip; 10622 struct inoref *inoref; 10623 struct ufsmount *ump; 10624 struct fs *fs; 10625 ufs_lbn_t i; 10626 #ifdef INVARIANTS 10627 ufs_lbn_t prevlbn = 0; 10628 #endif 10629 int deplist; 10630 10631 if (inodedep->id_state & IOSTARTED) 10632 panic("initiate_write_inodeblock_ufs1: already started"); 10633 inodedep->id_state |= IOSTARTED; 10634 fs = inodedep->id_fs; 10635 ump = VFSTOUFS(inodedep->id_list.wk_mp); 10636 LOCK_OWNED(ump); 10637 dp = (struct ufs1_dinode *)bp->b_data + 10638 ino_to_fsbo(fs, inodedep->id_ino); 10639 10640 /* 10641 * If we're on the unlinked list but have not yet written our 10642 * next pointer initialize it here. 10643 */ 10644 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 10645 struct inodedep *inon; 10646 10647 inon = TAILQ_NEXT(inodedep, id_unlinked); 10648 dp->di_freelink = inon ? inon->id_ino : 0; 10649 } 10650 /* 10651 * If the bitmap is not yet written, then the allocated 10652 * inode cannot be written to disk. 10653 */ 10654 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 10655 if (inodedep->id_savedino1 != NULL) 10656 panic("initiate_write_inodeblock_ufs1: I/O underway"); 10657 FREE_LOCK(ump); 10658 sip = malloc(sizeof(struct ufs1_dinode), 10659 M_SAVEDINO, M_SOFTDEP_FLAGS); 10660 ACQUIRE_LOCK(ump); 10661 inodedep->id_savedino1 = sip; 10662 *inodedep->id_savedino1 = *dp; 10663 bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); 10664 dp->di_gen = inodedep->id_savedino1->di_gen; 10665 dp->di_freelink = inodedep->id_savedino1->di_freelink; 10666 return; 10667 } 10668 /* 10669 * If no dependencies, then there is nothing to roll back. 10670 */ 10671 inodedep->id_savedsize = dp->di_size; 10672 inodedep->id_savedextsize = 0; 10673 inodedep->id_savednlink = dp->di_nlink; 10674 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 10675 TAILQ_EMPTY(&inodedep->id_inoreflst)) 10676 return; 10677 /* 10678 * Revert the link count to that of the first unwritten journal entry. 10679 */ 10680 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 10681 if (inoref) 10682 dp->di_nlink = inoref->if_nlink; 10683 /* 10684 * Set the dependencies to busy. 
10685 */ 10686 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 10687 adp = TAILQ_NEXT(adp, ad_next)) { 10688 #ifdef INVARIANTS 10689 if (deplist != 0 && prevlbn >= adp->ad_offset) 10690 panic("softdep_write_inodeblock: lbn order"); 10691 prevlbn = adp->ad_offset; 10692 if (adp->ad_offset < UFS_NDADDR && 10693 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 10694 panic("initiate_write_inodeblock_ufs1: " 10695 "direct pointer #%jd mismatch %d != %jd", 10696 (intmax_t)adp->ad_offset, 10697 dp->di_db[adp->ad_offset], 10698 (intmax_t)adp->ad_newblkno); 10699 if (adp->ad_offset >= UFS_NDADDR && 10700 dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno) 10701 panic("initiate_write_inodeblock_ufs1: " 10702 "indirect pointer #%jd mismatch %d != %jd", 10703 (intmax_t)adp->ad_offset - UFS_NDADDR, 10704 dp->di_ib[adp->ad_offset - UFS_NDADDR], 10705 (intmax_t)adp->ad_newblkno); 10706 deplist |= 1 << adp->ad_offset; 10707 if ((adp->ad_state & ATTACHED) == 0) 10708 panic("initiate_write_inodeblock_ufs1: " 10709 "Unknown state 0x%x", adp->ad_state); 10710 #endif /* INVARIANTS */ 10711 adp->ad_state &= ~ATTACHED; 10712 adp->ad_state |= UNDONE; 10713 } 10714 /* 10715 * The on-disk inode cannot claim to be any larger than the last 10716 * fragment that has been written. Otherwise, the on-disk inode 10717 * might have fragments that were not the last block in the file 10718 * which would corrupt the filesystem. 10719 */ 10720 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 10721 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 10722 if (adp->ad_offset >= UFS_NDADDR) 10723 break; 10724 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 10725 /* keep going until hitting a rollback to a frag */ 10726 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 10727 continue; 10728 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 10729 for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) { 10730 #ifdef INVARIANTS 10731 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 10732 panic("initiate_write_inodeblock_ufs1: " 10733 "lost dep1"); 10734 #endif /* INVARIANTS */ 10735 dp->di_db[i] = 0; 10736 } 10737 for (i = 0; i < UFS_NIADDR; i++) { 10738 #ifdef INVARIANTS 10739 if (dp->di_ib[i] != 0 && 10740 (deplist & ((1 << UFS_NDADDR) << i)) == 0) 10741 panic("initiate_write_inodeblock_ufs1: " 10742 "lost dep2"); 10743 #endif /* INVARIANTS */ 10744 dp->di_ib[i] = 0; 10745 } 10746 return; 10747 } 10748 /* 10749 * If we have zero'ed out the last allocated block of the file, 10750 * roll back the size to the last currently allocated block. 10751 * We know that this last allocated block is a full-sized as 10752 * we already checked for fragments in the loop above. 10753 */ 10754 if (lastadp != NULL && 10755 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 10756 for (i = lastadp->ad_offset; i >= 0; i--) 10757 if (dp->di_db[i] != 0) 10758 break; 10759 dp->di_size = (i + 1) * fs->fs_bsize; 10760 } 10761 /* 10762 * The only dependencies are for indirect blocks. 10763 * 10764 * The file size for indirect block additions is not guaranteed. 10765 * Such a guarantee would be non-trivial to achieve. The conventional 10766 * synchronous write implementation also does not make this guarantee. 10767 * Fsck should catch and fix discrepancies. Arguably, the file size 10768 * can be over-estimated without destroying integrity when the file 10769 * moves into the indirect blocks (i.e., is large). If we want to 10770 * postpone fsck, we are stuck with this argument. 
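 *
 * Any allocdirects remaining on the list at this point describe
 * indirect blocks, so their pointers are simply cleared here; the
 * real block numbers are rolled forward once the write completes.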
10771 */ 10772 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 10773 dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0; 10774 } 10775 10776 /* 10777 * Version of initiate_write_inodeblock that handles UFS2 dinodes. 10778 * Note that any bug fixes made to this routine must be done in the 10779 * version found above. 10780 * 10781 * Called from within the procedure above to deal with unsatisfied 10782 * allocation dependencies in an inodeblock. The buffer must be 10783 * locked, thus, no I/O completion operations can occur while we 10784 * are manipulating its associated dependencies. 10785 */ 10786 static void 10787 initiate_write_inodeblock_ufs2(inodedep, bp) 10788 struct inodedep *inodedep; 10789 struct buf *bp; /* The inode block */ 10790 { 10791 struct allocdirect *adp, *lastadp; 10792 struct ufs2_dinode *dp; 10793 struct ufs2_dinode *sip; 10794 struct inoref *inoref; 10795 struct ufsmount *ump; 10796 struct fs *fs; 10797 ufs_lbn_t i; 10798 #ifdef INVARIANTS 10799 ufs_lbn_t prevlbn = 0; 10800 #endif 10801 int deplist; 10802 10803 if (inodedep->id_state & IOSTARTED) 10804 panic("initiate_write_inodeblock_ufs2: already started"); 10805 inodedep->id_state |= IOSTARTED; 10806 fs = inodedep->id_fs; 10807 ump = VFSTOUFS(inodedep->id_list.wk_mp); 10808 LOCK_OWNED(ump); 10809 dp = (struct ufs2_dinode *)bp->b_data + 10810 ino_to_fsbo(fs, inodedep->id_ino); 10811 10812 /* 10813 * If we're on the unlinked list but have not yet written our 10814 * next pointer initialize it here. 10815 */ 10816 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 10817 struct inodedep *inon; 10818 10819 inon = TAILQ_NEXT(inodedep, id_unlinked); 10820 dp->di_freelink = inon ? inon->id_ino : 0; 10821 ffs_update_dinode_ckhash(fs, dp); 10822 } 10823 /* 10824 * If the bitmap is not yet written, then the allocated 10825 * inode cannot be written to disk. 10826 */ 10827 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 10828 if (inodedep->id_savedino2 != NULL) 10829 panic("initiate_write_inodeblock_ufs2: I/O underway"); 10830 FREE_LOCK(ump); 10831 sip = malloc(sizeof(struct ufs2_dinode), 10832 M_SAVEDINO, M_SOFTDEP_FLAGS); 10833 ACQUIRE_LOCK(ump); 10834 inodedep->id_savedino2 = sip; 10835 *inodedep->id_savedino2 = *dp; 10836 bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); 10837 dp->di_gen = inodedep->id_savedino2->di_gen; 10838 dp->di_freelink = inodedep->id_savedino2->di_freelink; 10839 return; 10840 } 10841 /* 10842 * If no dependencies, then there is nothing to roll back. 10843 */ 10844 inodedep->id_savedsize = dp->di_size; 10845 inodedep->id_savedextsize = dp->di_extsize; 10846 inodedep->id_savednlink = dp->di_nlink; 10847 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 10848 TAILQ_EMPTY(&inodedep->id_extupdt) && 10849 TAILQ_EMPTY(&inodedep->id_inoreflst)) 10850 return; 10851 /* 10852 * Revert the link count to that of the first unwritten journal entry. 10853 */ 10854 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 10855 if (inoref) 10856 dp->di_nlink = inoref->if_nlink; 10857 10858 /* 10859 * Set the ext data dependencies to busy. 
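 * The extended attribute area uses only direct blocks
 * (di_extb[0..UFS_NXADDR - 1]), so ad_offset indexes di_extb directly
 * and no indirect block handling is needed here.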
10860 */ 10861 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 10862 adp = TAILQ_NEXT(adp, ad_next)) { 10863 #ifdef INVARIANTS 10864 if (deplist != 0 && prevlbn >= adp->ad_offset) 10865 panic("initiate_write_inodeblock_ufs2: lbn order"); 10866 prevlbn = adp->ad_offset; 10867 if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) 10868 panic("initiate_write_inodeblock_ufs2: " 10869 "ext pointer #%jd mismatch %jd != %jd", 10870 (intmax_t)adp->ad_offset, 10871 (intmax_t)dp->di_extb[adp->ad_offset], 10872 (intmax_t)adp->ad_newblkno); 10873 deplist |= 1 << adp->ad_offset; 10874 if ((adp->ad_state & ATTACHED) == 0) 10875 panic("initiate_write_inodeblock_ufs2: Unknown " 10876 "state 0x%x", adp->ad_state); 10877 #endif /* INVARIANTS */ 10878 adp->ad_state &= ~ATTACHED; 10879 adp->ad_state |= UNDONE; 10880 } 10881 /* 10882 * The on-disk inode cannot claim to be any larger than the last 10883 * fragment that has been written. Otherwise, the on-disk inode 10884 * might have fragments that were not the last block in the ext 10885 * data which would corrupt the filesystem. 10886 */ 10887 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 10888 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 10889 dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; 10890 /* keep going until hitting a rollback to a frag */ 10891 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 10892 continue; 10893 dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 10894 for (i = adp->ad_offset + 1; i < UFS_NXADDR; i++) { 10895 #ifdef INVARIANTS 10896 if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) 10897 panic("initiate_write_inodeblock_ufs2: " 10898 "lost dep1"); 10899 #endif /* INVARIANTS */ 10900 dp->di_extb[i] = 0; 10901 } 10902 lastadp = NULL; 10903 break; 10904 } 10905 /* 10906 * If we have zero'ed out the last allocated block of the ext 10907 * data, roll back the size to the last currently allocated block. 10908 * We know that this last allocated block is full-sized, as 10909 * we already checked for fragments in the loop above. 10910 */ 10911 if (lastadp != NULL && 10912 dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 10913 for (i = lastadp->ad_offset; i >= 0; i--) 10914 if (dp->di_extb[i] != 0) 10915 break; 10916 dp->di_extsize = (i + 1) * fs->fs_bsize; 10917 } 10918 /* 10919 * Set the file data dependencies to busy.
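 * This mirrors the UFS1 loop above.  The additional ffs_fsfail_cleanup()
 * tests skip the pointer consistency panics when the filesystem has
 * already been marked as failed and is being cleaned up.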
10920 */ 10921 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 10922 adp = TAILQ_NEXT(adp, ad_next)) { 10923 #ifdef INVARIANTS 10924 if (deplist != 0 && prevlbn >= adp->ad_offset) 10925 panic("softdep_write_inodeblock: lbn order"); 10926 if ((adp->ad_state & ATTACHED) == 0) 10927 panic("inodedep %p and adp %p not attached", inodedep, adp); 10928 prevlbn = adp->ad_offset; 10929 if (!ffs_fsfail_cleanup(ump, 0) && 10930 adp->ad_offset < UFS_NDADDR && 10931 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 10932 panic("initiate_write_inodeblock_ufs2: " 10933 "direct pointer #%jd mismatch %jd != %jd", 10934 (intmax_t)adp->ad_offset, 10935 (intmax_t)dp->di_db[adp->ad_offset], 10936 (intmax_t)adp->ad_newblkno); 10937 if (!ffs_fsfail_cleanup(ump, 0) && 10938 adp->ad_offset >= UFS_NDADDR && 10939 dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno) 10940 panic("initiate_write_inodeblock_ufs2: " 10941 "indirect pointer #%jd mismatch %jd != %jd", 10942 (intmax_t)adp->ad_offset - UFS_NDADDR, 10943 (intmax_t)dp->di_ib[adp->ad_offset - UFS_NDADDR], 10944 (intmax_t)adp->ad_newblkno); 10945 deplist |= 1 << adp->ad_offset; 10946 if ((adp->ad_state & ATTACHED) == 0) 10947 panic("initiate_write_inodeblock_ufs2: Unknown " 10948 "state 0x%x", adp->ad_state); 10949 #endif /* INVARIANTS */ 10950 adp->ad_state &= ~ATTACHED; 10951 adp->ad_state |= UNDONE; 10952 } 10953 /* 10954 * The on-disk inode cannot claim to be any larger than the last 10955 * fragment that has been written. Otherwise, the on-disk inode 10956 * might have fragments that were not the last block in the file 10957 * which would corrupt the filesystem. 10958 */ 10959 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 10960 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 10961 if (adp->ad_offset >= UFS_NDADDR) 10962 break; 10963 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 10964 /* keep going until hitting a rollback to a frag */ 10965 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 10966 continue; 10967 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 10968 for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) { 10969 #ifdef INVARIANTS 10970 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 10971 panic("initiate_write_inodeblock_ufs2: " 10972 "lost dep2"); 10973 #endif /* INVARIANTS */ 10974 dp->di_db[i] = 0; 10975 } 10976 for (i = 0; i < UFS_NIADDR; i++) { 10977 #ifdef INVARIANTS 10978 if (dp->di_ib[i] != 0 && 10979 (deplist & ((1 << UFS_NDADDR) << i)) == 0) 10980 panic("initiate_write_inodeblock_ufs2: " 10981 "lost dep3"); 10982 #endif /* INVARIANTS */ 10983 dp->di_ib[i] = 0; 10984 } 10985 ffs_update_dinode_ckhash(fs, dp); 10986 return; 10987 } 10988 /* 10989 * If we have zero'ed out the last allocated block of the file, 10990 * roll back the size to the last currently allocated block. 10991 * We know that this last allocated block is a full-sized as 10992 * we already checked for fragments in the loop above. 10993 */ 10994 if (lastadp != NULL && 10995 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 10996 for (i = lastadp->ad_offset; i >= 0; i--) 10997 if (dp->di_db[i] != 0) 10998 break; 10999 dp->di_size = (i + 1) * fs->fs_bsize; 11000 } 11001 /* 11002 * The only dependencies are for indirect blocks. 11003 * 11004 * The file size for indirect block additions is not guaranteed. 11005 * Such a guarantee would be non-trivial to achieve. The conventional 11006 * synchronous write implementation also does not make this guarantee. 
11007 * Fsck should catch and fix discrepancies. Arguably, the file size 11008 * can be over-estimated without destroying integrity when the file 11009 * moves into the indirect blocks (i.e., is large). If we want to 11010 * postpone fsck, we are stuck with this argument. 11011 */ 11012 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 11013 dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0; 11014 ffs_update_dinode_ckhash(fs, dp); 11015 } 11016 11017 /* 11018 * Cancel an indirdep as a result of truncation. Release all of the 11019 * children allocindirs and place their journal work on the appropriate 11020 * list. 11021 */ 11022 static void 11023 cancel_indirdep(indirdep, bp, freeblks) 11024 struct indirdep *indirdep; 11025 struct buf *bp; 11026 struct freeblks *freeblks; 11027 { 11028 struct allocindir *aip; 11029 11030 /* 11031 * None of the indirect pointers will ever be visible, 11032 * so they can simply be tossed. GOINGAWAY ensures 11033 * that allocated pointers will be saved in the buffer 11034 * cache until they are freed. Note that they will 11035 * only be able to be found by their physical address 11036 * since the inode mapping the logical address will 11037 * be gone. The save buffer used for the safe copy 11038 * was allocated in setup_allocindir_phase2 using 11039 * the physical address so it could be used for this 11040 * purpose. Hence we swap the safe copy with the real 11041 * copy, allowing the safe copy to be freed and holding 11042 * on to the real copy for later use in indir_trunc. 11043 */ 11044 if (indirdep->ir_state & GOINGAWAY) 11045 panic("cancel_indirdep: already gone"); 11046 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 11047 indirdep->ir_state |= DEPCOMPLETE; 11048 LIST_REMOVE(indirdep, ir_next); 11049 } 11050 indirdep->ir_state |= GOINGAWAY; 11051 /* 11052 * Pass in bp for blocks still have journal writes 11053 * pending so we can cancel them on their own. 11054 */ 11055 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL) 11056 cancel_allocindir(aip, bp, freeblks, 0); 11057 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) 11058 cancel_allocindir(aip, NULL, freeblks, 0); 11059 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) 11060 cancel_allocindir(aip, NULL, freeblks, 0); 11061 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) 11062 cancel_allocindir(aip, NULL, freeblks, 0); 11063 /* 11064 * If there are pending partial truncations we need to keep the 11065 * old block copy around until they complete. This is because 11066 * the current b_data is not a perfect superset of the available 11067 * blocks. 11068 */ 11069 if (TAILQ_EMPTY(&indirdep->ir_trunc)) 11070 bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); 11071 else 11072 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 11073 WORKLIST_REMOVE(&indirdep->ir_list); 11074 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); 11075 indirdep->ir_bp = NULL; 11076 indirdep->ir_freeblks = freeblks; 11077 } 11078 11079 /* 11080 * Free an indirdep once it no longer has new pointers to track. 
11081 */ 11082 static void 11083 free_indirdep(indirdep) 11084 struct indirdep *indirdep; 11085 { 11086 11087 KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc), 11088 ("free_indirdep: Indir trunc list not empty.")); 11089 KASSERT(LIST_EMPTY(&indirdep->ir_completehd), 11090 ("free_indirdep: Complete head not empty.")); 11091 KASSERT(LIST_EMPTY(&indirdep->ir_writehd), 11092 ("free_indirdep: write head not empty.")); 11093 KASSERT(LIST_EMPTY(&indirdep->ir_donehd), 11094 ("free_indirdep: done head not empty.")); 11095 KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), 11096 ("free_indirdep: deplist head not empty.")); 11097 KASSERT((indirdep->ir_state & DEPCOMPLETE), 11098 ("free_indirdep: %p still on newblk list.", indirdep)); 11099 KASSERT(indirdep->ir_saveddata == NULL, 11100 ("free_indirdep: %p still has saved data.", indirdep)); 11101 KASSERT(indirdep->ir_savebp == NULL, 11102 ("free_indirdep: %p still has savebp buffer.", indirdep)); 11103 if (indirdep->ir_state & ONWORKLIST) 11104 WORKLIST_REMOVE(&indirdep->ir_list); 11105 WORKITEM_FREE(indirdep, D_INDIRDEP); 11106 } 11107 11108 /* 11109 * Called before a write to an indirdep. This routine is responsible for 11110 * rolling back pointers to a safe state which includes only those 11111 * allocindirs which have been completed. 11112 */ 11113 static void 11114 initiate_write_indirdep(indirdep, bp) 11115 struct indirdep *indirdep; 11116 struct buf *bp; 11117 { 11118 struct ufsmount *ump; 11119 11120 indirdep->ir_state |= IOSTARTED; 11121 if (indirdep->ir_state & GOINGAWAY) 11122 panic("disk_io_initiation: indirdep gone"); 11123 /* 11124 * If there are no remaining dependencies, this will be writing 11125 * the real pointers. 11126 */ 11127 if (LIST_EMPTY(&indirdep->ir_deplisthd) && 11128 TAILQ_EMPTY(&indirdep->ir_trunc)) 11129 return; 11130 /* 11131 * Replace up-to-date version with safe version. 11132 */ 11133 if (indirdep->ir_saveddata == NULL) { 11134 ump = VFSTOUFS(indirdep->ir_list.wk_mp); 11135 LOCK_OWNED(ump); 11136 FREE_LOCK(ump); 11137 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, 11138 M_SOFTDEP_FLAGS); 11139 ACQUIRE_LOCK(ump); 11140 } 11141 indirdep->ir_state &= ~ATTACHED; 11142 indirdep->ir_state |= UNDONE; 11143 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 11144 bcopy(indirdep->ir_savebp->b_data, bp->b_data, 11145 bp->b_bcount); 11146 } 11147 11148 /* 11149 * Called when an inode has been cleared in a cg bitmap. 
This finally 11150 * eliminates any canceled jaddrefs 11151 */ 11152 void 11153 softdep_setup_inofree(mp, bp, ino, wkhd) 11154 struct mount *mp; 11155 struct buf *bp; 11156 ino_t ino; 11157 struct workhead *wkhd; 11158 { 11159 struct worklist *wk, *wkn; 11160 struct inodedep *inodedep; 11161 struct ufsmount *ump; 11162 uint8_t *inosused; 11163 struct cg *cgp; 11164 struct fs *fs; 11165 11166 KASSERT(MOUNTEDSOFTDEP(mp) != 0, 11167 ("softdep_setup_inofree called on non-softdep filesystem")); 11168 ump = VFSTOUFS(mp); 11169 ACQUIRE_LOCK(ump); 11170 if (!ffs_fsfail_cleanup(ump, 0)) { 11171 fs = ump->um_fs; 11172 cgp = (struct cg *)bp->b_data; 11173 inosused = cg_inosused(cgp); 11174 if (isset(inosused, ino % fs->fs_ipg)) 11175 panic("softdep_setup_inofree: inode %ju not freed.", 11176 (uintmax_t)ino); 11177 } 11178 if (inodedep_lookup(mp, ino, 0, &inodedep)) 11179 panic("softdep_setup_inofree: ino %ju has existing inodedep %p", 11180 (uintmax_t)ino, inodedep); 11181 if (wkhd) { 11182 LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { 11183 if (wk->wk_type != D_JADDREF) 11184 continue; 11185 WORKLIST_REMOVE(wk); 11186 /* 11187 * We can free immediately even if the jaddref 11188 * isn't attached in a background write as now 11189 * the bitmaps are reconciled. 11190 */ 11191 wk->wk_state |= COMPLETE | ATTACHED; 11192 free_jaddref(WK_JADDREF(wk)); 11193 } 11194 jwork_move(&bp->b_dep, wkhd); 11195 } 11196 FREE_LOCK(ump); 11197 } 11198 11199 /* 11200 * Called via ffs_blkfree() after a set of frags has been cleared from a cg 11201 * map. Any dependencies waiting for the write to clear are added to the 11202 * buf's list and any jnewblks that are being canceled are discarded 11203 * immediately. 11204 */ 11205 void 11206 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 11207 struct mount *mp; 11208 struct buf *bp; 11209 ufs2_daddr_t blkno; 11210 int frags; 11211 struct workhead *wkhd; 11212 { 11213 struct bmsafemap *bmsafemap; 11214 struct jnewblk *jnewblk; 11215 struct ufsmount *ump; 11216 struct worklist *wk; 11217 struct fs *fs; 11218 #ifdef INVARIANTS 11219 uint8_t *blksfree; 11220 struct cg *cgp; 11221 ufs2_daddr_t jstart; 11222 ufs2_daddr_t jend; 11223 ufs2_daddr_t end; 11224 long bno; 11225 int i; 11226 #endif 11227 11228 CTR3(KTR_SUJ, 11229 "softdep_setup_blkfree: blkno %jd frags %d wk head %p", 11230 blkno, frags, wkhd); 11231 11232 ump = VFSTOUFS(mp); 11233 KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, 11234 ("softdep_setup_blkfree called on non-softdep filesystem")); 11235 ACQUIRE_LOCK(ump); 11236 /* Lookup the bmsafemap so we track when it is dirty. */ 11237 fs = ump->um_fs; 11238 bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL); 11239 /* 11240 * Detach any jnewblks which have been canceled. They must linger 11241 * until the bitmap is cleared again by ffs_blkfree() to prevent 11242 * an unjournaled allocation from hitting the disk. 11243 */ 11244 if (wkhd) { 11245 while ((wk = LIST_FIRST(wkhd)) != NULL) { 11246 CTR2(KTR_SUJ, 11247 "softdep_setup_blkfree: blkno %jd wk type %d", 11248 blkno, wk->wk_type); 11249 WORKLIST_REMOVE(wk); 11250 if (wk->wk_type != D_JNEWBLK) { 11251 WORKLIST_INSERT(&bmsafemap->sm_freehd, wk); 11252 continue; 11253 } 11254 jnewblk = WK_JNEWBLK(wk); 11255 KASSERT(jnewblk->jn_state & GOINGAWAY, 11256 ("softdep_setup_blkfree: jnewblk not canceled.")); 11257 #ifdef INVARIANTS 11258 /* 11259 * Assert that this block is free in the bitmap 11260 * before we discard the jnewblk. 
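 * The fragments covered by this jnewblk are indices jn_oldfrags
 * through jn_frags - 1 within the block; the loop below checks that
 * each of them is now marked free in the cg bitmap.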
11261 */ 11262 cgp = (struct cg *)bp->b_data; 11263 blksfree = cg_blksfree(cgp); 11264 bno = dtogd(fs, jnewblk->jn_blkno); 11265 for (i = jnewblk->jn_oldfrags; 11266 i < jnewblk->jn_frags; i++) { 11267 if (isset(blksfree, bno + i)) 11268 continue; 11269 panic("softdep_setup_blkfree: not free"); 11270 } 11271 #endif 11272 /* 11273 * Even if it's not attached we can free immediately 11274 * as the new bitmap is correct. 11275 */ 11276 wk->wk_state |= COMPLETE | ATTACHED; 11277 free_jnewblk(jnewblk); 11278 } 11279 } 11280 11281 #ifdef INVARIANTS 11282 /* 11283 * Assert that we are not freeing a block which has an outstanding 11284 * allocation dependency. 11285 */ 11286 fs = VFSTOUFS(mp)->um_fs; 11287 bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL); 11288 end = blkno + frags; 11289 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { 11290 /* 11291 * Don't match against blocks that will be freed when the 11292 * background write is done. 11293 */ 11294 if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == 11295 (COMPLETE | DEPCOMPLETE)) 11296 continue; 11297 jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; 11298 jend = jnewblk->jn_blkno + jnewblk->jn_frags; 11299 if ((blkno >= jstart && blkno < jend) || 11300 (end > jstart && end <= jend)) { 11301 printf("state 0x%X %jd - %d %d dep %p\n", 11302 jnewblk->jn_state, jnewblk->jn_blkno, 11303 jnewblk->jn_oldfrags, jnewblk->jn_frags, 11304 jnewblk->jn_dep); 11305 panic("softdep_setup_blkfree: " 11306 "%jd-%jd(%d) overlaps with %jd-%jd", 11307 blkno, end, frags, jstart, jend); 11308 } 11309 } 11310 #endif 11311 FREE_LOCK(ump); 11312 } 11313 11314 /* 11315 * Revert a block allocation when the journal record that describes it 11316 * is not yet written. 11317 */ 11318 static int 11319 jnewblk_rollback(jnewblk, fs, cgp, blksfree) 11320 struct jnewblk *jnewblk; 11321 struct fs *fs; 11322 struct cg *cgp; 11323 uint8_t *blksfree; 11324 { 11325 ufs1_daddr_t fragno; 11326 long cgbno, bbase; 11327 int frags, blk; 11328 int i; 11329 11330 frags = 0; 11331 cgbno = dtogd(fs, jnewblk->jn_blkno); 11332 /* 11333 * We have to test which frags need to be rolled back. We may 11334 * be operating on a stale copy when doing background writes. 11335 */ 11336 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) 11337 if (isclr(blksfree, cgbno + i)) 11338 frags++; 11339 if (frags == 0) 11340 return (0); 11341 /* 11342 * This is mostly ffs_blkfree() sans some validation and 11343 * superblock updates. 11344 */ 11345 if (frags == fs->fs_frag) { 11346 fragno = fragstoblks(fs, cgbno); 11347 ffs_setblock(fs, blksfree, fragno); 11348 ffs_clusteracct(fs, cgp, fragno, 1); 11349 cgp->cg_cs.cs_nbfree++; 11350 } else { 11351 cgbno += jnewblk->jn_oldfrags; 11352 bbase = cgbno - fragnum(fs, cgbno); 11353 /* Decrement the old frags. */ 11354 blk = blkmap(fs, blksfree, bbase); 11355 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 11356 /* Deallocate the fragment */ 11357 for (i = 0; i < frags; i++) 11358 setbit(blksfree, cgbno + i); 11359 cgp->cg_cs.cs_nffree += frags; 11360 /* Add back in counts associated with the new frags */ 11361 blk = blkmap(fs, blksfree, bbase); 11362 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 11363 /* If a complete block has been reassembled, account for it. 
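 * Freeing these fragments has rejoined the whole block, so the
 * fs_frag fragments are moved from the free fragment count to the
 * free block count and the cluster map is updated.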
*/ 11364 fragno = fragstoblks(fs, bbase); 11365 if (ffs_isblock(fs, blksfree, fragno)) { 11366 cgp->cg_cs.cs_nffree -= fs->fs_frag; 11367 ffs_clusteracct(fs, cgp, fragno, 1); 11368 cgp->cg_cs.cs_nbfree++; 11369 } 11370 } 11371 stat_jnewblk++; 11372 jnewblk->jn_state &= ~ATTACHED; 11373 jnewblk->jn_state |= UNDONE; 11374 11375 return (frags); 11376 } 11377 11378 static void 11379 initiate_write_bmsafemap(bmsafemap, bp) 11380 struct bmsafemap *bmsafemap; 11381 struct buf *bp; /* The cg block. */ 11382 { 11383 struct jaddref *jaddref; 11384 struct jnewblk *jnewblk; 11385 uint8_t *inosused; 11386 uint8_t *blksfree; 11387 struct cg *cgp; 11388 struct fs *fs; 11389 ino_t ino; 11390 11391 /* 11392 * If this is a background write, we did this at the time that 11393 * the copy was made, so do not need to do it again. 11394 */ 11395 if (bmsafemap->sm_state & IOSTARTED) 11396 return; 11397 bmsafemap->sm_state |= IOSTARTED; 11398 /* 11399 * Clear any inode allocations which are pending journal writes. 11400 */ 11401 if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { 11402 cgp = (struct cg *)bp->b_data; 11403 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 11404 inosused = cg_inosused(cgp); 11405 LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { 11406 ino = jaddref->ja_ino % fs->fs_ipg; 11407 if (isset(inosused, ino)) { 11408 if ((jaddref->ja_mode & IFMT) == IFDIR) 11409 cgp->cg_cs.cs_ndir--; 11410 cgp->cg_cs.cs_nifree++; 11411 clrbit(inosused, ino); 11412 jaddref->ja_state &= ~ATTACHED; 11413 jaddref->ja_state |= UNDONE; 11414 stat_jaddref++; 11415 } else 11416 panic("initiate_write_bmsafemap: inode %ju " 11417 "marked free", (uintmax_t)jaddref->ja_ino); 11418 } 11419 } 11420 /* 11421 * Clear any block allocations which are pending journal writes. 11422 */ 11423 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 11424 cgp = (struct cg *)bp->b_data; 11425 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 11426 blksfree = cg_blksfree(cgp); 11427 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { 11428 if (jnewblk_rollback(jnewblk, fs, cgp, blksfree)) 11429 continue; 11430 panic("initiate_write_bmsafemap: block %jd " 11431 "marked free", jnewblk->jn_blkno); 11432 } 11433 } 11434 /* 11435 * Move allocation lists to the written lists so they can be 11436 * cleared once the block write is complete. 11437 */ 11438 LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, 11439 inodedep, id_deps); 11440 LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, 11441 newblk, nb_deps); 11442 LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, 11443 wk_list); 11444 } 11445 11446 void 11447 softdep_handle_error(struct buf *bp) 11448 { 11449 struct ufsmount *ump; 11450 11451 ump = softdep_bp_to_mp(bp); 11452 if (ump == NULL) 11453 return; 11454 11455 if (ffs_fsfail_cleanup(ump, bp->b_error)) { 11456 /* 11457 * No future writes will succeed, so the on-disk image is safe. 11458 * Pretend that this write succeeded so that the softdep state 11459 * will be cleaned up naturally. 11460 */ 11461 bp->b_ioflags &= ~BIO_ERROR; 11462 bp->b_error = 0; 11463 } 11464 } 11465 11466 /* 11467 * This routine is called during the completion interrupt 11468 * service routine for a disk write (from the procedure called 11469 * by the device driver to inform the filesystem caches of 11470 * a request completion). It should be called early in this 11471 * procedure, before the block is made available to other 11472 * processes or other routines are called. 
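 *
 * Each dependency is removed from the buffer's b_dep list and
 * dispatched on its type; any dependency that must await a future
 * write of this buffer is collected locally and reattached to
 * b_dep before returning.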
11473 * 11474 */ 11475 static void 11476 softdep_disk_write_complete(bp) 11477 struct buf *bp; /* describes the completed disk write */ 11478 { 11479 struct worklist *wk; 11480 struct worklist *owk; 11481 struct ufsmount *ump; 11482 struct workhead reattach; 11483 struct freeblks *freeblks; 11484 struct buf *sbp; 11485 11486 ump = softdep_bp_to_mp(bp); 11487 KASSERT(LIST_EMPTY(&bp->b_dep) || ump != NULL, 11488 ("softdep_disk_write_complete: softdep_bp_to_mp returned NULL " 11489 "with outstanding dependencies for buffer %p", bp)); 11490 if (ump == NULL) 11491 return; 11492 if ((bp->b_ioflags & BIO_ERROR) != 0) 11493 softdep_handle_error(bp); 11494 /* 11495 * If an error occurred while doing the write, then the data 11496 * has not hit the disk and the dependencies cannot be processed. 11497 * But we do have to go through and roll forward any dependencies 11498 * that were rolled back before the disk write. 11499 */ 11500 sbp = NULL; 11501 ACQUIRE_LOCK(ump); 11502 if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) { 11503 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 11504 switch (wk->wk_type) { 11505 case D_PAGEDEP: 11506 handle_written_filepage(WK_PAGEDEP(wk), bp, 0); 11507 continue; 11508 11509 case D_INODEDEP: 11510 handle_written_inodeblock(WK_INODEDEP(wk), 11511 bp, 0); 11512 continue; 11513 11514 case D_BMSAFEMAP: 11515 handle_written_bmsafemap(WK_BMSAFEMAP(wk), 11516 bp, 0); 11517 continue; 11518 11519 case D_INDIRDEP: 11520 handle_written_indirdep(WK_INDIRDEP(wk), 11521 bp, &sbp, 0); 11522 continue; 11523 default: 11524 /* nothing to roll forward */ 11525 continue; 11526 } 11527 } 11528 FREE_LOCK(ump); 11529 if (sbp) 11530 brelse(sbp); 11531 return; 11532 } 11533 LIST_INIT(&reattach); 11534 11535 /* 11536 * Ump SU lock must not be released anywhere in this code segment. 
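 * The handlers below inspect and modify other dependencies attached
 * to this buffer and its inodedeps; releasing the lock mid-loop
 * would let those lists change underneath us.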
11537 */ 11538 owk = NULL; 11539 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 11540 WORKLIST_REMOVE(wk); 11541 atomic_add_long(&dep_write[wk->wk_type], 1); 11542 if (wk == owk) 11543 panic("duplicate worklist: %p\n", wk); 11544 owk = wk; 11545 switch (wk->wk_type) { 11546 case D_PAGEDEP: 11547 if (handle_written_filepage(WK_PAGEDEP(wk), bp, 11548 WRITESUCCEEDED)) 11549 WORKLIST_INSERT(&reattach, wk); 11550 continue; 11551 11552 case D_INODEDEP: 11553 if (handle_written_inodeblock(WK_INODEDEP(wk), bp, 11554 WRITESUCCEEDED)) 11555 WORKLIST_INSERT(&reattach, wk); 11556 continue; 11557 11558 case D_BMSAFEMAP: 11559 if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp, 11560 WRITESUCCEEDED)) 11561 WORKLIST_INSERT(&reattach, wk); 11562 continue; 11563 11564 case D_MKDIR: 11565 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 11566 continue; 11567 11568 case D_ALLOCDIRECT: 11569 wk->wk_state |= COMPLETE; 11570 handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); 11571 continue; 11572 11573 case D_ALLOCINDIR: 11574 wk->wk_state |= COMPLETE; 11575 handle_allocindir_partdone(WK_ALLOCINDIR(wk)); 11576 continue; 11577 11578 case D_INDIRDEP: 11579 if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp, 11580 WRITESUCCEEDED)) 11581 WORKLIST_INSERT(&reattach, wk); 11582 continue; 11583 11584 case D_FREEBLKS: 11585 wk->wk_state |= COMPLETE; 11586 freeblks = WK_FREEBLKS(wk); 11587 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE && 11588 LIST_EMPTY(&freeblks->fb_jblkdephd)) 11589 add_to_worklist(wk, WK_NODELAY); 11590 continue; 11591 11592 case D_FREEWORK: 11593 handle_written_freework(WK_FREEWORK(wk)); 11594 break; 11595 11596 case D_JSEGDEP: 11597 free_jsegdep(WK_JSEGDEP(wk)); 11598 continue; 11599 11600 case D_JSEG: 11601 handle_written_jseg(WK_JSEG(wk), bp); 11602 continue; 11603 11604 case D_SBDEP: 11605 if (handle_written_sbdep(WK_SBDEP(wk), bp)) 11606 WORKLIST_INSERT(&reattach, wk); 11607 continue; 11608 11609 case D_FREEDEP: 11610 free_freedep(WK_FREEDEP(wk)); 11611 continue; 11612 11613 default: 11614 panic("handle_disk_write_complete: Unknown type %s", 11615 TYPENAME(wk->wk_type)); 11616 /* NOTREACHED */ 11617 } 11618 } 11619 /* 11620 * Reattach any requests that must be redone. 11621 */ 11622 while ((wk = LIST_FIRST(&reattach)) != NULL) { 11623 WORKLIST_REMOVE(wk); 11624 WORKLIST_INSERT(&bp->b_dep, wk); 11625 } 11626 FREE_LOCK(ump); 11627 if (sbp) 11628 brelse(sbp); 11629 } 11630 11631 /* 11632 * Called from within softdep_disk_write_complete above. 11633 */ 11634 static void 11635 handle_allocdirect_partdone(adp, wkhd) 11636 struct allocdirect *adp; /* the completed allocdirect */ 11637 struct workhead *wkhd; /* Work to do when inode is writtne. */ 11638 { 11639 struct allocdirectlst *listhead; 11640 struct allocdirect *listadp; 11641 struct inodedep *inodedep; 11642 long bsize; 11643 11644 LOCK_OWNED(VFSTOUFS(adp->ad_block.nb_list.wk_mp)); 11645 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 11646 return; 11647 /* 11648 * The on-disk inode cannot claim to be any larger than the last 11649 * fragment that has been written. Otherwise, the on-disk inode 11650 * might have fragments that were not the last block in the file 11651 * which would corrupt the filesystem. Thus, we cannot free any 11652 * allocdirects after one whose ad_oldblkno claims a fragment as 11653 * these blocks must be rolled back to zero before writing the inode. 11654 * We check the currently active set of allocdirects in id_inoupdt 11655 * or id_extupdt as appropriate. 
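 *
 * For example, if the list holds completed allocdirects for logical
 * blocks 2, 5, and 7 and the entry for block 5 rolled back to a
 * fragment (ad_oldsize is neither 0 nor fs_bsize), then the
 * completion of block 7 is deferred: the scan below returns before
 * reaching it, and block 7 is freed only after block 5 has itself
 * been completed and removed from the list.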
11656 */ 11657 inodedep = adp->ad_inodedep; 11658 bsize = inodedep->id_fs->fs_bsize; 11659 if (adp->ad_state & EXTDATA) 11660 listhead = &inodedep->id_extupdt; 11661 else 11662 listhead = &inodedep->id_inoupdt; 11663 TAILQ_FOREACH(listadp, listhead, ad_next) { 11664 /* found our block */ 11665 if (listadp == adp) 11666 break; 11667 /* continue if ad_oldlbn is not a fragment */ 11668 if (listadp->ad_oldsize == 0 || 11669 listadp->ad_oldsize == bsize) 11670 continue; 11671 /* hit a fragment */ 11672 return; 11673 } 11674 /* 11675 * If we have reached the end of the current list without 11676 * finding the just finished dependency, then it must be 11677 * on the future dependency list. Future dependencies cannot 11678 * be freed until they are moved to the current list. 11679 */ 11680 if (listadp == NULL) { 11681 #ifdef INVARIANTS 11682 if (adp->ad_state & EXTDATA) 11683 listhead = &inodedep->id_newextupdt; 11684 else 11685 listhead = &inodedep->id_newinoupdt; 11686 TAILQ_FOREACH(listadp, listhead, ad_next) 11687 /* found our block */ 11688 if (listadp == adp) 11689 break; 11690 if (listadp == NULL) 11691 panic("handle_allocdirect_partdone: lost dep"); 11692 #endif /* INVARIANTS */ 11693 return; 11694 } 11695 /* 11696 * If we have found the just finished dependency, then queue 11697 * it along with anything that follows it that is complete. 11698 * Since the pointer has not yet been written in the inode 11699 * as the dependency prevents it, place the allocdirect on the 11700 * bufwait list where it will be freed once the pointer is 11701 * valid. 11702 */ 11703 if (wkhd == NULL) 11704 wkhd = &inodedep->id_bufwait; 11705 for (; adp; adp = listadp) { 11706 listadp = TAILQ_NEXT(adp, ad_next); 11707 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 11708 return; 11709 TAILQ_REMOVE(listhead, adp, ad_next); 11710 WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list); 11711 } 11712 } 11713 11714 /* 11715 * Called from within softdep_disk_write_complete above. This routine 11716 * completes successfully written allocindirs. 11717 */ 11718 static void 11719 handle_allocindir_partdone(aip) 11720 struct allocindir *aip; /* the completed allocindir */ 11721 { 11722 struct indirdep *indirdep; 11723 11724 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) 11725 return; 11726 indirdep = aip->ai_indirdep; 11727 LIST_REMOVE(aip, ai_next); 11728 /* 11729 * Don't set a pointer while the buffer is undergoing IO or while 11730 * we have active truncations. 11731 */ 11732 if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) { 11733 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); 11734 return; 11735 } 11736 if (indirdep->ir_state & UFS1FMT) 11737 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 11738 aip->ai_newblkno; 11739 else 11740 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 11741 aip->ai_newblkno; 11742 /* 11743 * Await the pointer write before freeing the allocindir. 11744 */ 11745 LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); 11746 } 11747 11748 /* 11749 * Release segments held on a jwork list. 
11750 */ 11751 static void 11752 handle_jwork(wkhd) 11753 struct workhead *wkhd; 11754 { 11755 struct worklist *wk; 11756 11757 while ((wk = LIST_FIRST(wkhd)) != NULL) { 11758 WORKLIST_REMOVE(wk); 11759 switch (wk->wk_type) { 11760 case D_JSEGDEP: 11761 free_jsegdep(WK_JSEGDEP(wk)); 11762 continue; 11763 case D_FREEDEP: 11764 free_freedep(WK_FREEDEP(wk)); 11765 continue; 11766 case D_FREEFRAG: 11767 rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep)); 11768 WORKITEM_FREE(wk, D_FREEFRAG); 11769 continue; 11770 case D_FREEWORK: 11771 handle_written_freework(WK_FREEWORK(wk)); 11772 continue; 11773 default: 11774 panic("handle_jwork: Unknown type %s\n", 11775 TYPENAME(wk->wk_type)); 11776 } 11777 } 11778 } 11779 11780 /* 11781 * Handle the bufwait list on an inode when it is safe to release items 11782 * held there. This normally happens after an inode block is written but 11783 * may be delayed and handled later if there are pending journal items that 11784 * are not yet safe to be released. 11785 */ 11786 static struct freefile * 11787 handle_bufwait(inodedep, refhd) 11788 struct inodedep *inodedep; 11789 struct workhead *refhd; 11790 { 11791 struct jaddref *jaddref; 11792 struct freefile *freefile; 11793 struct worklist *wk; 11794 11795 freefile = NULL; 11796 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { 11797 WORKLIST_REMOVE(wk); 11798 switch (wk->wk_type) { 11799 case D_FREEFILE: 11800 /* 11801 * We defer adding freefile to the worklist 11802 * until all other additions have been made to 11803 * ensure that it will be done after all the 11804 * old blocks have been freed. 11805 */ 11806 if (freefile != NULL) 11807 panic("handle_bufwait: freefile"); 11808 freefile = WK_FREEFILE(wk); 11809 continue; 11810 11811 case D_MKDIR: 11812 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); 11813 continue; 11814 11815 case D_DIRADD: 11816 diradd_inode_written(WK_DIRADD(wk), inodedep); 11817 continue; 11818 11819 case D_FREEFRAG: 11820 wk->wk_state |= COMPLETE; 11821 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) 11822 add_to_worklist(wk, 0); 11823 continue; 11824 11825 case D_DIRREM: 11826 wk->wk_state |= COMPLETE; 11827 add_to_worklist(wk, 0); 11828 continue; 11829 11830 case D_ALLOCDIRECT: 11831 case D_ALLOCINDIR: 11832 free_newblk(WK_NEWBLK(wk)); 11833 continue; 11834 11835 case D_JNEWBLK: 11836 wk->wk_state |= COMPLETE; 11837 free_jnewblk(WK_JNEWBLK(wk)); 11838 continue; 11839 11840 /* 11841 * Save freed journal segments and add references on 11842 * the supplied list which will delay their release 11843 * until the cg bitmap is cleared on disk. 11844 */ 11845 case D_JSEGDEP: 11846 if (refhd == NULL) 11847 free_jsegdep(WK_JSEGDEP(wk)); 11848 else 11849 WORKLIST_INSERT(refhd, wk); 11850 continue; 11851 11852 case D_JADDREF: 11853 jaddref = WK_JADDREF(wk); 11854 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, 11855 if_deps); 11856 /* 11857 * Transfer any jaddrefs to the list to be freed with 11858 * the bitmap if we're handling a removed file. 11859 */ 11860 if (refhd == NULL) { 11861 wk->wk_state |= COMPLETE; 11862 free_jaddref(jaddref); 11863 } else 11864 WORKLIST_INSERT(refhd, wk); 11865 continue; 11866 11867 default: 11868 panic("handle_bufwait: Unknown type %p(%s)", 11869 wk, TYPENAME(wk->wk_type)); 11870 /* NOTREACHED */ 11871 } 11872 } 11873 return (freefile); 11874 } 11875 /* 11876 * Called from within softdep_disk_write_complete above to restore 11877 * in-memory inode block contents to their most up-to-date state. 
Note 11878 * that this routine is always called from interrupt level with further 11879 * interrupts from this device blocked. 11880 * 11881 * If the write did not succeed, we will do all the roll-forward 11882 * operations, but we will not take the actions that will allow its 11883 * dependencies to be processed. 11884 */ 11885 static int 11886 handle_written_inodeblock(inodedep, bp, flags) 11887 struct inodedep *inodedep; 11888 struct buf *bp; /* buffer containing the inode block */ 11889 int flags; 11890 { 11891 struct freefile *freefile; 11892 struct allocdirect *adp, *nextadp; 11893 struct ufs1_dinode *dp1 = NULL; 11894 struct ufs2_dinode *dp2 = NULL; 11895 struct workhead wkhd; 11896 int hadchanges, fstype; 11897 ino_t freelink; 11898 11899 LIST_INIT(&wkhd); 11900 hadchanges = 0; 11901 freefile = NULL; 11902 if ((inodedep->id_state & IOSTARTED) == 0) 11903 panic("handle_written_inodeblock: not started"); 11904 inodedep->id_state &= ~IOSTARTED; 11905 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { 11906 fstype = UFS1; 11907 dp1 = (struct ufs1_dinode *)bp->b_data + 11908 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 11909 freelink = dp1->di_freelink; 11910 } else { 11911 fstype = UFS2; 11912 dp2 = (struct ufs2_dinode *)bp->b_data + 11913 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 11914 freelink = dp2->di_freelink; 11915 } 11916 /* 11917 * Leave this inodeblock dirty until it's in the list. 11918 */ 11919 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED && 11920 (flags & WRITESUCCEEDED)) { 11921 struct inodedep *inon; 11922 11923 inon = TAILQ_NEXT(inodedep, id_unlinked); 11924 if ((inon == NULL && freelink == 0) || 11925 (inon && inon->id_ino == freelink)) { 11926 if (inon) 11927 inon->id_state |= UNLINKPREV; 11928 inodedep->id_state |= UNLINKNEXT; 11929 } 11930 hadchanges = 1; 11931 } 11932 /* 11933 * If we had to rollback the inode allocation because of 11934 * bitmaps being incomplete, then simply restore it. 11935 * Keep the block dirty so that it will not be reclaimed until 11936 * all associated dependencies have been cleared and the 11937 * corresponding updates written to disk. 11938 */ 11939 if (inodedep->id_savedino1 != NULL) { 11940 hadchanges = 1; 11941 if (fstype == UFS1) 11942 *dp1 = *inodedep->id_savedino1; 11943 else 11944 *dp2 = *inodedep->id_savedino2; 11945 free(inodedep->id_savedino1, M_SAVEDINO); 11946 inodedep->id_savedino1 = NULL; 11947 if ((bp->b_flags & B_DELWRI) == 0) 11948 stat_inode_bitmap++; 11949 bdirty(bp); 11950 /* 11951 * If the inode is clear here and GOINGAWAY it will never 11952 * be written. Process the bufwait and clear any pending 11953 * work which may include the freefile. 11954 */ 11955 if (inodedep->id_state & GOINGAWAY) 11956 goto bufwait; 11957 return (1); 11958 } 11959 if (flags & WRITESUCCEEDED) 11960 inodedep->id_state |= COMPLETE; 11961 /* 11962 * Roll forward anything that had to be rolled back before 11963 * the inode could be updated. 
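 * Each rolled-back pointer is replaced with the new block number
 * recorded in its allocdirect and the dependency is marked ATTACHED
 * again; hadchanges is set so the buffer is redirtied and the
 * restored values reach the disk in a later write.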
11964 */ 11965 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { 11966 nextadp = TAILQ_NEXT(adp, ad_next); 11967 if (adp->ad_state & ATTACHED) 11968 panic("handle_written_inodeblock: new entry"); 11969 if (fstype == UFS1) { 11970 if (adp->ad_offset < UFS_NDADDR) { 11971 if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) 11972 panic("%s %s #%jd mismatch %d != %jd", 11973 "handle_written_inodeblock:", 11974 "direct pointer", 11975 (intmax_t)adp->ad_offset, 11976 dp1->di_db[adp->ad_offset], 11977 (intmax_t)adp->ad_oldblkno); 11978 dp1->di_db[adp->ad_offset] = adp->ad_newblkno; 11979 } else { 11980 if (dp1->di_ib[adp->ad_offset - UFS_NDADDR] != 11981 0) 11982 panic("%s: %s #%jd allocated as %d", 11983 "handle_written_inodeblock", 11984 "indirect pointer", 11985 (intmax_t)adp->ad_offset - 11986 UFS_NDADDR, 11987 dp1->di_ib[adp->ad_offset - 11988 UFS_NDADDR]); 11989 dp1->di_ib[adp->ad_offset - UFS_NDADDR] = 11990 adp->ad_newblkno; 11991 } 11992 } else { 11993 if (adp->ad_offset < UFS_NDADDR) { 11994 if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) 11995 panic("%s: %s #%jd %s %jd != %jd", 11996 "handle_written_inodeblock", 11997 "direct pointer", 11998 (intmax_t)adp->ad_offset, "mismatch", 11999 (intmax_t)dp2->di_db[adp->ad_offset], 12000 (intmax_t)adp->ad_oldblkno); 12001 dp2->di_db[adp->ad_offset] = adp->ad_newblkno; 12002 } else { 12003 if (dp2->di_ib[adp->ad_offset - UFS_NDADDR] != 12004 0) 12005 panic("%s: %s #%jd allocated as %jd", 12006 "handle_written_inodeblock", 12007 "indirect pointer", 12008 (intmax_t)adp->ad_offset - 12009 UFS_NDADDR, 12010 (intmax_t) 12011 dp2->di_ib[adp->ad_offset - 12012 UFS_NDADDR]); 12013 dp2->di_ib[adp->ad_offset - UFS_NDADDR] = 12014 adp->ad_newblkno; 12015 } 12016 } 12017 adp->ad_state &= ~UNDONE; 12018 adp->ad_state |= ATTACHED; 12019 hadchanges = 1; 12020 } 12021 for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { 12022 nextadp = TAILQ_NEXT(adp, ad_next); 12023 if (adp->ad_state & ATTACHED) 12024 panic("handle_written_inodeblock: new entry"); 12025 if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) 12026 panic("%s: direct pointers #%jd %s %jd != %jd", 12027 "handle_written_inodeblock", 12028 (intmax_t)adp->ad_offset, "mismatch", 12029 (intmax_t)dp2->di_extb[adp->ad_offset], 12030 (intmax_t)adp->ad_oldblkno); 12031 dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; 12032 adp->ad_state &= ~UNDONE; 12033 adp->ad_state |= ATTACHED; 12034 hadchanges = 1; 12035 } 12036 if (hadchanges && (bp->b_flags & B_DELWRI) == 0) 12037 stat_direct_blk_ptrs++; 12038 /* 12039 * Reset the file size to its most up-to-date value. 
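 * id_savedsize, id_savedextsize, and id_savednlink were captured by
 * initiate_write_inodeblock_*() before the rollbacks; they are
 * restored into the dinode here and then set to -1 to mark them
 * invalid until the next write is initiated.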
12040 */ 12041 if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) 12042 panic("handle_written_inodeblock: bad size"); 12043 if (inodedep->id_savednlink > UFS_LINK_MAX) 12044 panic("handle_written_inodeblock: Invalid link count " 12045 "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink, 12046 inodedep); 12047 if (fstype == UFS1) { 12048 if (dp1->di_nlink != inodedep->id_savednlink) { 12049 dp1->di_nlink = inodedep->id_savednlink; 12050 hadchanges = 1; 12051 } 12052 if (dp1->di_size != inodedep->id_savedsize) { 12053 dp1->di_size = inodedep->id_savedsize; 12054 hadchanges = 1; 12055 } 12056 } else { 12057 if (dp2->di_nlink != inodedep->id_savednlink) { 12058 dp2->di_nlink = inodedep->id_savednlink; 12059 hadchanges = 1; 12060 } 12061 if (dp2->di_size != inodedep->id_savedsize) { 12062 dp2->di_size = inodedep->id_savedsize; 12063 hadchanges = 1; 12064 } 12065 if (dp2->di_extsize != inodedep->id_savedextsize) { 12066 dp2->di_extsize = inodedep->id_savedextsize; 12067 hadchanges = 1; 12068 } 12069 } 12070 inodedep->id_savedsize = -1; 12071 inodedep->id_savedextsize = -1; 12072 inodedep->id_savednlink = -1; 12073 /* 12074 * If there were any rollbacks in the inode block, then it must be 12075 * marked dirty so that it will eventually get written back in 12076 * its correct form. 12077 */ 12078 if (hadchanges) { 12079 if (fstype == UFS2) 12080 ffs_update_dinode_ckhash(inodedep->id_fs, dp2); 12081 bdirty(bp); 12082 } 12083 bufwait: 12084 /* 12085 * If the write did not succeed, we have done all the roll-forward 12086 * operations, but we cannot take the actions that will allow its 12087 * dependencies to be processed. 12088 */ 12089 if ((flags & WRITESUCCEEDED) == 0) 12090 return (hadchanges); 12091 /* 12092 * Process any allocdirects that completed during the update. 12093 */ 12094 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) 12095 handle_allocdirect_partdone(adp, &wkhd); 12096 if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) 12097 handle_allocdirect_partdone(adp, &wkhd); 12098 /* 12099 * Process deallocations that were held pending until the 12100 * inode had been written to disk. Freeing of the inode 12101 * is delayed until after all blocks have been freed to 12102 * avoid creation of new <vfsid, inum, lbn> triples 12103 * before the old ones have been deleted. Completely 12104 * unlinked inodes are not processed until the unlinked 12105 * inode list is written or the last reference is removed. 12106 */ 12107 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) { 12108 freefile = handle_bufwait(inodedep, NULL); 12109 if (freefile && !LIST_EMPTY(&wkhd)) { 12110 WORKLIST_INSERT(&wkhd, &freefile->fx_list); 12111 freefile = NULL; 12112 } 12113 } 12114 /* 12115 * Move rolled forward dependency completions to the bufwait list 12116 * now that those that were already written have been processed. 12117 */ 12118 if (!LIST_EMPTY(&wkhd) && hadchanges == 0) 12119 panic("handle_written_inodeblock: bufwait but no changes"); 12120 jwork_move(&inodedep->id_bufwait, &wkhd); 12121 12122 if (freefile != NULL) { 12123 /* 12124 * If the inode is goingaway it was never written. Fake up 12125 * the state here so free_inodedep() can succeed.
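 * (free_inodedep() will not release an inodedep whose COMPLETE and
 * DEPCOMPLETE bits are not both set, so they are forced on here.)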
12126 */ 12127 if (inodedep->id_state & GOINGAWAY) 12128 inodedep->id_state |= COMPLETE | DEPCOMPLETE; 12129 if (free_inodedep(inodedep) == 0) 12130 panic("handle_written_inodeblock: live inodedep %p", 12131 inodedep); 12132 add_to_worklist(&freefile->fx_list, 0); 12133 return (0); 12134 } 12135 12136 /* 12137 * If no outstanding dependencies, free it. 12138 */ 12139 if (free_inodedep(inodedep) || 12140 (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 && 12141 TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && 12142 TAILQ_FIRST(&inodedep->id_extupdt) == 0 && 12143 LIST_FIRST(&inodedep->id_bufwait) == 0)) 12144 return (0); 12145 return (hadchanges); 12146 } 12147 12148 /* 12149 * Perform needed roll-forwards and kick off any dependencies that 12150 * can now be processed. 12151 * 12152 * If the write did not succeed, we will do all the roll-forward 12153 * operations, but we will not take the actions that will allow its 12154 * dependencies to be processed. 12155 */ 12156 static int 12157 handle_written_indirdep(indirdep, bp, bpp, flags) 12158 struct indirdep *indirdep; 12159 struct buf *bp; 12160 struct buf **bpp; 12161 int flags; 12162 { 12163 struct allocindir *aip; 12164 struct buf *sbp; 12165 int chgs; 12166 12167 if (indirdep->ir_state & GOINGAWAY) 12168 panic("handle_written_indirdep: indirdep gone"); 12169 if ((indirdep->ir_state & IOSTARTED) == 0) 12170 panic("handle_written_indirdep: IO not started"); 12171 chgs = 0; 12172 /* 12173 * If there were rollbacks revert them here. 12174 */ 12175 if (indirdep->ir_saveddata) { 12176 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); 12177 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 12178 free(indirdep->ir_saveddata, M_INDIRDEP); 12179 indirdep->ir_saveddata = NULL; 12180 } 12181 chgs = 1; 12182 } 12183 indirdep->ir_state &= ~(UNDONE | IOSTARTED); 12184 indirdep->ir_state |= ATTACHED; 12185 /* 12186 * If the write did not succeed, we have done all the roll-forward 12187 * operations, but we cannot take the actions that will allow its 12188 * dependencies to be processed. 12189 */ 12190 if ((flags & WRITESUCCEEDED) == 0) { 12191 stat_indir_blk_ptrs++; 12192 bdirty(bp); 12193 return (1); 12194 } 12195 /* 12196 * Move allocindirs with written pointers to the completehd if 12197 * the indirdep's pointer is not yet written. Otherwise 12198 * free them here. 12199 */ 12200 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) { 12201 LIST_REMOVE(aip, ai_next); 12202 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 12203 LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, 12204 ai_next); 12205 newblk_freefrag(&aip->ai_block); 12206 continue; 12207 } 12208 free_newblk(&aip->ai_block); 12209 } 12210 /* 12211 * Move allocindirs that have finished dependency processing from 12212 * the done list to the write list after updating the pointers. 12213 */ 12214 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 12215 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) { 12216 handle_allocindir_partdone(aip); 12217 if (aip == LIST_FIRST(&indirdep->ir_donehd)) 12218 panic("disk_write_complete: not gone"); 12219 chgs = 1; 12220 } 12221 } 12222 /* 12223 * Preserve the indirdep if there were any changes or if it is not 12224 * yet valid on disk. 12225 */ 12226 if (chgs) { 12227 stat_indir_blk_ptrs++; 12228 bdirty(bp); 12229 return (1); 12230 } 12231 /* 12232 * If there were no changes we can discard the savedbp and detach 12233 * ourselves from the buf. We are only carrying completed pointers 12234 * in this case. 
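 * The saved buffer is marked B_INVAL | B_NOCACHE so that it is
 * discarded rather than written, and it is handed back through *bpp
 * so the caller can brelse() it after dropping the per-filesystem
 * lock.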
12235 */ 12236 sbp = indirdep->ir_savebp; 12237 sbp->b_flags |= B_INVAL | B_NOCACHE; 12238 indirdep->ir_savebp = NULL; 12239 indirdep->ir_bp = NULL; 12240 if (*bpp != NULL) 12241 panic("handle_written_indirdep: bp already exists."); 12242 *bpp = sbp; 12243 /* 12244 * The indirdep may not be freed until its parent points at it. 12245 */ 12246 if (indirdep->ir_state & DEPCOMPLETE) 12247 free_indirdep(indirdep); 12248 12249 return (0); 12250 } 12251 12252 /* 12253 * Process a diradd entry after its dependent inode has been written. 12254 */ 12255 static void 12256 diradd_inode_written(dap, inodedep) 12257 struct diradd *dap; 12258 struct inodedep *inodedep; 12259 { 12260 12261 LOCK_OWNED(VFSTOUFS(dap->da_list.wk_mp)); 12262 dap->da_state |= COMPLETE; 12263 complete_diradd(dap); 12264 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 12265 } 12266 12267 /* 12268 * Returns true if the bmsafemap will have rollbacks when written. Must only 12269 * be called with the per-filesystem lock and the buf lock on the cg held. 12270 */ 12271 static int 12272 bmsafemap_backgroundwrite(bmsafemap, bp) 12273 struct bmsafemap *bmsafemap; 12274 struct buf *bp; 12275 { 12276 int dirty; 12277 12278 LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp)); 12279 dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | 12280 !LIST_EMPTY(&bmsafemap->sm_jnewblkhd); 12281 /* 12282 * If we're initiating a background write we need to process the 12283 * rollbacks as they exist now, not as they exist when IO starts. 12284 * No other consumers will look at the contents of the shadowed 12285 * buf so this is safe to do here. 12286 */ 12287 if (bp->b_xflags & BX_BKGRDMARKER) 12288 initiate_write_bmsafemap(bmsafemap, bp); 12289 12290 return (dirty); 12291 } 12292 12293 /* 12294 * Re-apply an allocation when a cg write is complete. 12295 */ 12296 static int 12297 jnewblk_rollforward(jnewblk, fs, cgp, blksfree) 12298 struct jnewblk *jnewblk; 12299 struct fs *fs; 12300 struct cg *cgp; 12301 uint8_t *blksfree; 12302 { 12303 ufs1_daddr_t fragno; 12304 ufs2_daddr_t blkno; 12305 long cgbno, bbase; 12306 int frags, blk; 12307 int i; 12308 12309 frags = 0; 12310 cgbno = dtogd(fs, jnewblk->jn_blkno); 12311 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { 12312 if (isclr(blksfree, cgbno + i)) 12313 panic("jnewblk_rollforward: re-allocated fragment"); 12314 frags++; 12315 } 12316 if (frags == fs->fs_frag) { 12317 blkno = fragstoblks(fs, cgbno); 12318 ffs_clrblock(fs, blksfree, (long)blkno); 12319 ffs_clusteracct(fs, cgp, blkno, -1); 12320 cgp->cg_cs.cs_nbfree--; 12321 } else { 12322 bbase = cgbno - fragnum(fs, cgbno); 12323 cgbno += jnewblk->jn_oldfrags; 12324 /* If a complete block had been reassembled, account for it. */ 12325 fragno = fragstoblks(fs, bbase); 12326 if (ffs_isblock(fs, blksfree, fragno)) { 12327 cgp->cg_cs.cs_nffree += fs->fs_frag; 12328 ffs_clusteracct(fs, cgp, fragno, -1); 12329 cgp->cg_cs.cs_nbfree--; 12330 } 12331 /* Decrement the old frags. */ 12332 blk = blkmap(fs, blksfree, bbase); 12333 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 12334 /* Allocate the fragment */ 12335 for (i = 0; i < frags; i++) 12336 clrbit(blksfree, cgbno + i); 12337 cgp->cg_cs.cs_nffree -= frags; 12338 /* Add back in counts associated with the new frags */ 12339 blk = blkmap(fs, blksfree, bbase); 12340 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 12341 } 12342 return (frags); 12343 } 12344 12345 /* 12346 * Complete a write to a bmsafemap structure. Roll forward any bitmap 12347 * changes if it's not a background write. 
Set all written dependencies 12348 * to DEPCOMPLETE and free the structure if possible. 12349 * 12350 * If the write did not succeed, we will do all the roll-forward 12351 * operations, but we will not take the actions that will allow its 12352 * dependencies to be processed. 12353 */ 12354 static int 12355 handle_written_bmsafemap(bmsafemap, bp, flags) 12356 struct bmsafemap *bmsafemap; 12357 struct buf *bp; 12358 int flags; 12359 { 12360 struct newblk *newblk; 12361 struct inodedep *inodedep; 12362 struct jaddref *jaddref, *jatmp; 12363 struct jnewblk *jnewblk, *jntmp; 12364 struct ufsmount *ump; 12365 uint8_t *inosused; 12366 uint8_t *blksfree; 12367 struct cg *cgp; 12368 struct fs *fs; 12369 ino_t ino; 12370 int foreground; 12371 int chgs; 12372 12373 if ((bmsafemap->sm_state & IOSTARTED) == 0) 12374 panic("handle_written_bmsafemap: Not started\n"); 12375 ump = VFSTOUFS(bmsafemap->sm_list.wk_mp); 12376 chgs = 0; 12377 bmsafemap->sm_state &= ~IOSTARTED; 12378 foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0; 12379 /* 12380 * If write was successful, release journal work that was waiting 12381 * on the write. Otherwise move the work back. 12382 */ 12383 if (flags & WRITESUCCEEDED) 12384 handle_jwork(&bmsafemap->sm_freewr); 12385 else 12386 LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, 12387 worklist, wk_list); 12388 12389 /* 12390 * Restore unwritten inode allocation pending jaddref writes. 12391 */ 12392 if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { 12393 cgp = (struct cg *)bp->b_data; 12394 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 12395 inosused = cg_inosused(cgp); 12396 LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, 12397 ja_bmdeps, jatmp) { 12398 if ((jaddref->ja_state & UNDONE) == 0) 12399 continue; 12400 ino = jaddref->ja_ino % fs->fs_ipg; 12401 if (isset(inosused, ino)) 12402 panic("handle_written_bmsafemap: " 12403 "re-allocated inode"); 12404 /* Do the roll-forward only if it's a real copy. */ 12405 if (foreground) { 12406 if ((jaddref->ja_mode & IFMT) == IFDIR) 12407 cgp->cg_cs.cs_ndir++; 12408 cgp->cg_cs.cs_nifree--; 12409 setbit(inosused, ino); 12410 chgs = 1; 12411 } 12412 jaddref->ja_state &= ~UNDONE; 12413 jaddref->ja_state |= ATTACHED; 12414 free_jaddref(jaddref); 12415 } 12416 } 12417 /* 12418 * Restore any block allocations which are pending journal writes. 12419 */ 12420 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 12421 cgp = (struct cg *)bp->b_data; 12422 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 12423 blksfree = cg_blksfree(cgp); 12424 LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, 12425 jntmp) { 12426 if ((jnewblk->jn_state & UNDONE) == 0) 12427 continue; 12428 /* Do the roll-forward only if it's a real copy. */ 12429 if (foreground && 12430 jnewblk_rollforward(jnewblk, fs, cgp, blksfree)) 12431 chgs = 1; 12432 jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); 12433 jnewblk->jn_state |= ATTACHED; 12434 free_jnewblk(jnewblk); 12435 } 12436 } 12437 /* 12438 * If the write did not succeed, we have done all the roll-forward 12439 * operations, but we cannot take the actions that will allow its 12440 * dependencies to be processed. 
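 * In that case the new-block and pending-free work lists are moved
 * back to their unwritten heads and a foreground buffer is redirtied
 * so that the cg write will be retried.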
12441 */ 12442 if ((flags & WRITESUCCEEDED) == 0) { 12443 LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, 12444 newblk, nb_deps); 12445 LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, 12446 worklist, wk_list); 12447 if (foreground) 12448 bdirty(bp); 12449 return (1); 12450 } 12451 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { 12452 newblk->nb_state |= DEPCOMPLETE; 12453 newblk->nb_state &= ~ONDEPLIST; 12454 newblk->nb_bmsafemap = NULL; 12455 LIST_REMOVE(newblk, nb_deps); 12456 if (newblk->nb_list.wk_type == D_ALLOCDIRECT) 12457 handle_allocdirect_partdone( 12458 WK_ALLOCDIRECT(&newblk->nb_list), NULL); 12459 else if (newblk->nb_list.wk_type == D_ALLOCINDIR) 12460 handle_allocindir_partdone( 12461 WK_ALLOCINDIR(&newblk->nb_list)); 12462 else if (newblk->nb_list.wk_type != D_NEWBLK) 12463 panic("handle_written_bmsafemap: Unexpected type: %s", 12464 TYPENAME(newblk->nb_list.wk_type)); 12465 } 12466 while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { 12467 inodedep->id_state |= DEPCOMPLETE; 12468 inodedep->id_state &= ~ONDEPLIST; 12469 LIST_REMOVE(inodedep, id_deps); 12470 inodedep->id_bmsafemap = NULL; 12471 } 12472 LIST_REMOVE(bmsafemap, sm_next); 12473 if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && 12474 LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && 12475 LIST_EMPTY(&bmsafemap->sm_newblkhd) && 12476 LIST_EMPTY(&bmsafemap->sm_inodedephd) && 12477 LIST_EMPTY(&bmsafemap->sm_freehd)) { 12478 LIST_REMOVE(bmsafemap, sm_hash); 12479 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 12480 return (0); 12481 } 12482 LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); 12483 if (foreground) 12484 bdirty(bp); 12485 return (1); 12486 } 12487 12488 /* 12489 * Try to free a mkdir dependency. 12490 */ 12491 static void 12492 complete_mkdir(mkdir) 12493 struct mkdir *mkdir; 12494 { 12495 struct diradd *dap; 12496 12497 if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) 12498 return; 12499 LIST_REMOVE(mkdir, md_mkdirs); 12500 dap = mkdir->md_diradd; 12501 dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 12502 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { 12503 dap->da_state |= DEPCOMPLETE; 12504 complete_diradd(dap); 12505 } 12506 WORKITEM_FREE(mkdir, D_MKDIR); 12507 } 12508 12509 /* 12510 * Handle the completion of a mkdir dependency. 12511 */ 12512 static void 12513 handle_written_mkdir(mkdir, type) 12514 struct mkdir *mkdir; 12515 int type; 12516 { 12517 12518 if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) 12519 panic("handle_written_mkdir: bad type"); 12520 mkdir->md_state |= COMPLETE; 12521 complete_mkdir(mkdir); 12522 } 12523 12524 static int 12525 free_pagedep(pagedep) 12526 struct pagedep *pagedep; 12527 { 12528 int i; 12529 12530 if (pagedep->pd_state & NEWBLOCK) 12531 return (0); 12532 if (!LIST_EMPTY(&pagedep->pd_dirremhd)) 12533 return (0); 12534 for (i = 0; i < DAHASHSZ; i++) 12535 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) 12536 return (0); 12537 if (!LIST_EMPTY(&pagedep->pd_pendinghd)) 12538 return (0); 12539 if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) 12540 return (0); 12541 if (pagedep->pd_state & ONWORKLIST) 12542 WORKLIST_REMOVE(&pagedep->pd_list); 12543 LIST_REMOVE(pagedep, pd_hash); 12544 WORKITEM_FREE(pagedep, D_PAGEDEP); 12545 12546 return (1); 12547 } 12548 12549 /* 12550 * Called from within softdep_disk_write_complete above. 12551 * A write operation was just completed. Removed inodes can 12552 * now be freed and associated block pointers may be committed. 
12553 * Note that this routine is always called from interrupt level 12554 * with further interrupts from this device blocked. 12555 * 12556 * If the write did not succeed, we will do all the roll-forward 12557 * operations, but we will not take the actions that will allow its 12558 * dependencies to be processed. 12559 */ 12560 static int 12561 handle_written_filepage(pagedep, bp, flags) 12562 struct pagedep *pagedep; 12563 struct buf *bp; /* buffer containing the written page */ 12564 int flags; 12565 { 12566 struct dirrem *dirrem; 12567 struct diradd *dap, *nextdap; 12568 struct direct *ep; 12569 int i, chgs; 12570 12571 if ((pagedep->pd_state & IOSTARTED) == 0) 12572 panic("handle_written_filepage: not started"); 12573 pagedep->pd_state &= ~IOSTARTED; 12574 if ((flags & WRITESUCCEEDED) == 0) 12575 goto rollforward; 12576 /* 12577 * Process any directory removals that have been committed. 12578 */ 12579 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { 12580 LIST_REMOVE(dirrem, dm_next); 12581 dirrem->dm_state |= COMPLETE; 12582 dirrem->dm_dirinum = pagedep->pd_ino; 12583 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), 12584 ("handle_written_filepage: Journal entries not written.")); 12585 add_to_worklist(&dirrem->dm_list, 0); 12586 } 12587 /* 12588 * Free any directory additions that have been committed. 12589 * If it is a newly allocated block, we have to wait until 12590 * the on-disk directory inode claims the new block. 12591 */ 12592 if ((pagedep->pd_state & NEWBLOCK) == 0) 12593 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 12594 free_diradd(dap, NULL); 12595 rollforward: 12596 /* 12597 * Uncommitted directory entries must be restored. 12598 */ 12599 for (chgs = 0, i = 0; i < DAHASHSZ; i++) { 12600 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; 12601 dap = nextdap) { 12602 nextdap = LIST_NEXT(dap, da_pdlist); 12603 if (dap->da_state & ATTACHED) 12604 panic("handle_written_filepage: attached"); 12605 ep = (struct direct *) 12606 ((char *)bp->b_data + dap->da_offset); 12607 ep->d_ino = dap->da_newinum; 12608 dap->da_state &= ~UNDONE; 12609 dap->da_state |= ATTACHED; 12610 chgs = 1; 12611 /* 12612 * If the inode referenced by the directory has 12613 * been written out, then the dependency can be 12614 * moved to the pending list. 12615 */ 12616 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 12617 LIST_REMOVE(dap, da_pdlist); 12618 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, 12619 da_pdlist); 12620 } 12621 } 12622 } 12623 /* 12624 * If there were any rollbacks in the directory, then it must be 12625 * marked dirty so that its will eventually get written back in 12626 * its correct form. 12627 */ 12628 if (chgs || (flags & WRITESUCCEEDED) == 0) { 12629 if ((bp->b_flags & B_DELWRI) == 0) 12630 stat_dir_entry++; 12631 bdirty(bp); 12632 return (1); 12633 } 12634 /* 12635 * If we are not waiting for a new directory block to be 12636 * claimed by its inode, then the pagedep will be freed. 12637 * Otherwise it will remain to track any new entries on 12638 * the page in case they are fsync'ed. 12639 */ 12640 free_pagedep(pagedep); 12641 return (0); 12642 } 12643 12644 /* 12645 * Writing back in-core inode structures. 12646 * 12647 * The filesystem only accesses an inode's contents when it occupies an 12648 * "in-core" inode structure. These "in-core" structures are separate from 12649 * the page frames used to cache inode blocks. Only the latter are 12650 * transferred to/from the disk. 
So, when the updated contents of the 12651 * "in-core" inode structure are copied to the corresponding in-memory inode 12652 * block, the dependencies are also transferred. The following procedure is 12653 * called when copying a dirty "in-core" inode to a cached inode block. 12654 */ 12655 12656 /* 12657 * Called when an inode is loaded from disk. If the effective link count 12658 * differed from the actual link count when it was last flushed, then we 12659 * need to ensure that the correct effective link count is put back. 12660 */ 12661 void 12662 softdep_load_inodeblock(ip) 12663 struct inode *ip; /* the "in_core" copy of the inode */ 12664 { 12665 struct inodedep *inodedep; 12666 struct ufsmount *ump; 12667 12668 ump = ITOUMP(ip); 12669 KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, 12670 ("softdep_load_inodeblock called on non-softdep filesystem")); 12671 /* 12672 * Check for alternate nlink count. 12673 */ 12674 ip->i_effnlink = ip->i_nlink; 12675 ACQUIRE_LOCK(ump); 12676 if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) { 12677 FREE_LOCK(ump); 12678 return; 12679 } 12680 if (ip->i_nlink != inodedep->id_nlinkwrote && 12681 inodedep->id_nlinkwrote != -1) { 12682 KASSERT(ip->i_nlink == 0 && 12683 (ump->um_flags & UM_FSFAIL_CLEANUP) != 0, 12684 ("read bad i_nlink value")); 12685 ip->i_effnlink = ip->i_nlink = inodedep->id_nlinkwrote; 12686 } 12687 ip->i_effnlink -= inodedep->id_nlinkdelta; 12688 KASSERT(ip->i_effnlink >= 0, 12689 ("softdep_load_inodeblock: negative i_effnlink")); 12690 FREE_LOCK(ump); 12691 } 12692 12693 /* 12694 * This routine is called just before the "in-core" inode 12695 * information is to be copied to the in-memory inode block. 12696 * Recall that an inode block contains several inodes. If 12697 * the force flag is set, then the dependencies will be 12698 * cleared so that the update can always be made. Note that 12699 * the buffer is locked when this routine is called, so we 12700 * will never be in the middle of writing the inode block 12701 * to disk. 12702 */ 12703 void 12704 softdep_update_inodeblock(ip, bp, waitfor) 12705 struct inode *ip; /* the "in_core" copy of the inode */ 12706 struct buf *bp; /* the buffer containing the inode block */ 12707 int waitfor; /* nonzero => update must be allowed */ 12708 { 12709 struct inodedep *inodedep; 12710 struct inoref *inoref; 12711 struct ufsmount *ump; 12712 struct worklist *wk; 12713 struct mount *mp; 12714 struct buf *ibp; 12715 struct fs *fs; 12716 int error; 12717 12718 ump = ITOUMP(ip); 12719 mp = UFSTOVFS(ump); 12720 KASSERT(MOUNTEDSOFTDEP(mp) != 0, 12721 ("softdep_update_inodeblock called on non-softdep filesystem")); 12722 fs = ump->um_fs; 12723 /* 12724 * Preserve the freelink that is on disk. clear_unlinked_inodedep() 12725 * does not have access to the in-core ip so must write directly into 12726 * the inode block buffer when setting freelink. 12727 */ 12728 if (fs->fs_magic == FS_UFS1_MAGIC) 12729 DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data + 12730 ino_to_fsbo(fs, ip->i_number))->di_freelink); 12731 else 12732 DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data + 12733 ino_to_fsbo(fs, ip->i_number))->di_freelink); 12734 /* 12735 * If the effective link count is not equal to the actual link 12736 * count, then we must track the difference in an inodedep while 12737 * the inode is (potentially) tossed out of the cache. Otherwise, 12738 * if there is no existing inodedep, then there are no dependencies 12739 * to track. 
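 * The invariant maintained here is that id_nlinkdelta always equals
 * i_nlink - i_effnlink; when no inodedep exists the two link counts
 * must already agree, and the checks below panic otherwise.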
12740 */ 12741 ACQUIRE_LOCK(ump); 12742 again: 12743 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 12744 FREE_LOCK(ump); 12745 if (ip->i_effnlink != ip->i_nlink) 12746 panic("softdep_update_inodeblock: bad link count"); 12747 return; 12748 } 12749 KASSERT(ip->i_nlink >= inodedep->id_nlinkdelta, 12750 ("softdep_update_inodeblock inconsistent ip %p i_nlink %d " 12751 "inodedep %p id_nlinkdelta %jd", 12752 ip, ip->i_nlink, inodedep, (intmax_t)inodedep->id_nlinkdelta)); 12753 inodedep->id_nlinkwrote = ip->i_nlink; 12754 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) 12755 panic("softdep_update_inodeblock: bad delta"); 12756 /* 12757 * If we're flushing all dependencies we must also move any waiting 12758 * for journal writes onto the bufwait list prior to I/O. 12759 */ 12760 if (waitfor) { 12761 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 12762 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 12763 == DEPCOMPLETE) { 12764 jwait(&inoref->if_list, MNT_WAIT); 12765 goto again; 12766 } 12767 } 12768 } 12769 /* 12770 * Changes have been initiated. Anything depending on these 12771 * changes cannot occur until this inode has been written. 12772 */ 12773 inodedep->id_state &= ~COMPLETE; 12774 if ((inodedep->id_state & ONWORKLIST) == 0) 12775 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); 12776 /* 12777 * Any new dependencies associated with the incore inode must 12778 * now be moved to the list associated with the buffer holding 12779 * the in-memory copy of the inode. Once merged process any 12780 * allocdirects that are completed by the merger. 12781 */ 12782 merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); 12783 if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) 12784 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), 12785 NULL); 12786 merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); 12787 if (!TAILQ_EMPTY(&inodedep->id_extupdt)) 12788 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), 12789 NULL); 12790 /* 12791 * Now that the inode has been pushed into the buffer, the 12792 * operations dependent on the inode being written to disk 12793 * can be moved to the id_bufwait so that they will be 12794 * processed when the buffer I/O completes. 12795 */ 12796 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { 12797 WORKLIST_REMOVE(wk); 12798 WORKLIST_INSERT(&inodedep->id_bufwait, wk); 12799 } 12800 /* 12801 * Newly allocated inodes cannot be written until the bitmap 12802 * that allocates them has been written (indicated by 12803 * DEPCOMPLETE being set in id_state). If we are doing a 12804 * forced sync (e.g., an fsync on a file), we force the bitmap 12805 * to be written so that the update can be done. 12806 */ 12807 if (waitfor == 0) { 12808 FREE_LOCK(ump); 12809 return; 12810 } 12811 retry: 12812 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) { 12813 FREE_LOCK(ump); 12814 return; 12815 } 12816 ibp = inodedep->id_bmsafemap->sm_buf; 12817 ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT); 12818 if (ibp == NULL) { 12819 /* 12820 * If ibp came back as NULL, the dependency could have been 12821 * freed while we slept. Look it up again, and check to see 12822 * that it has completed.
12823 */ 12824 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 12825 goto retry; 12826 FREE_LOCK(ump); 12827 return; 12828 } 12829 FREE_LOCK(ump); 12830 if ((error = bwrite(ibp)) != 0) 12831 softdep_error("softdep_update_inodeblock: bwrite", error); 12832 } 12833 12834 /* 12835 * Merge the a new inode dependency list (such as id_newinoupdt) into an 12836 * old inode dependency list (such as id_inoupdt). 12837 */ 12838 static void 12839 merge_inode_lists(newlisthead, oldlisthead) 12840 struct allocdirectlst *newlisthead; 12841 struct allocdirectlst *oldlisthead; 12842 { 12843 struct allocdirect *listadp, *newadp; 12844 12845 newadp = TAILQ_FIRST(newlisthead); 12846 if (newadp != NULL) 12847 LOCK_OWNED(VFSTOUFS(newadp->ad_block.nb_list.wk_mp)); 12848 for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { 12849 if (listadp->ad_offset < newadp->ad_offset) { 12850 listadp = TAILQ_NEXT(listadp, ad_next); 12851 continue; 12852 } 12853 TAILQ_REMOVE(newlisthead, newadp, ad_next); 12854 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); 12855 if (listadp->ad_offset == newadp->ad_offset) { 12856 allocdirect_merge(oldlisthead, newadp, 12857 listadp); 12858 listadp = newadp; 12859 } 12860 newadp = TAILQ_FIRST(newlisthead); 12861 } 12862 while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { 12863 TAILQ_REMOVE(newlisthead, newadp, ad_next); 12864 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); 12865 } 12866 } 12867 12868 /* 12869 * If we are doing an fsync, then we must ensure that any directory 12870 * entries for the inode have been written after the inode gets to disk. 12871 */ 12872 int 12873 softdep_fsync(vp) 12874 struct vnode *vp; /* the "in_core" copy of the inode */ 12875 { 12876 struct inodedep *inodedep; 12877 struct pagedep *pagedep; 12878 struct inoref *inoref; 12879 struct ufsmount *ump; 12880 struct worklist *wk; 12881 struct diradd *dap; 12882 struct mount *mp; 12883 struct vnode *pvp; 12884 struct inode *ip; 12885 struct buf *bp; 12886 struct fs *fs; 12887 struct thread *td = curthread; 12888 int error, flushparent, pagedep_new_block; 12889 ino_t parentino; 12890 ufs_lbn_t lbn; 12891 12892 ip = VTOI(vp); 12893 mp = vp->v_mount; 12894 ump = VFSTOUFS(mp); 12895 fs = ump->um_fs; 12896 if (MOUNTEDSOFTDEP(mp) == 0) 12897 return (0); 12898 ACQUIRE_LOCK(ump); 12899 restart: 12900 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 12901 FREE_LOCK(ump); 12902 return (0); 12903 } 12904 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 12905 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 12906 == DEPCOMPLETE) { 12907 jwait(&inoref->if_list, MNT_WAIT); 12908 goto restart; 12909 } 12910 } 12911 if (!LIST_EMPTY(&inodedep->id_inowait) || 12912 !TAILQ_EMPTY(&inodedep->id_extupdt) || 12913 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 12914 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 12915 !TAILQ_EMPTY(&inodedep->id_newinoupdt)) 12916 panic("softdep_fsync: pending ops %p", inodedep); 12917 for (error = 0, flushparent = 0; ; ) { 12918 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) 12919 break; 12920 if (wk->wk_type != D_DIRADD) 12921 panic("softdep_fsync: Unexpected type %s", 12922 TYPENAME(wk->wk_type)); 12923 dap = WK_DIRADD(wk); 12924 /* 12925 * Flush our parent if this directory entry has a MKDIR_PARENT 12926 * dependency or is contained in a newly allocated block. 
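 * The pagedep is taken from the diradd, or from its previous dirrem
 * when DIRCHG is set; flushparent is then set below for either the
 * MKDIR_PARENT or the NEWBLOCK case.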
12927 */ 12928 if (dap->da_state & DIRCHG) 12929 pagedep = dap->da_previous->dm_pagedep; 12930 else 12931 pagedep = dap->da_pagedep; 12932 parentino = pagedep->pd_ino; 12933 lbn = pagedep->pd_lbn; 12934 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) 12935 panic("softdep_fsync: dirty"); 12936 if ((dap->da_state & MKDIR_PARENT) || 12937 (pagedep->pd_state & NEWBLOCK)) 12938 flushparent = 1; 12939 else 12940 flushparent = 0; 12941 /* 12942 * If we are being fsync'ed as part of vgone'ing this vnode, 12943 * then we will not be able to release and recover the 12944 * vnode below, so we just have to give up on writing its 12945 * directory entry out. It will eventually be written, just 12946 * not now, but then the user was not asking to have it 12947 * written, so we are not breaking any promises. 12948 */ 12949 if (VN_IS_DOOMED(vp)) 12950 break; 12951 /* 12952 * We prevent deadlock by always fetching inodes from the 12953 * root, moving down the directory tree. Thus, when fetching 12954 * our parent directory, we first try to get the lock. If 12955 * that fails, we must unlock ourselves before requesting 12956 * the lock on our parent. See the comment in ufs_lookup 12957 * for details on possible races. 12958 */ 12959 FREE_LOCK(ump); 12960 error = get_parent_vp(vp, mp, parentino, NULL, NULL, NULL, 12961 &pvp); 12962 if (error == ERELOOKUP) 12963 error = 0; 12964 if (error != 0) 12965 return (error); 12966 /* 12967 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps 12968 * that are contained in direct blocks will be resolved by 12969 * doing a ffs_update. Pagedeps contained in indirect blocks 12970 * may require a complete sync'ing of the directory. So, we 12971 * try the cheap and fast ffs_update first, and if that fails, 12972 * then we do the slower ffs_syncvnode of the directory. 12973 */ 12974 if (flushparent) { 12975 int locked; 12976 12977 if ((error = ffs_update(pvp, 1)) != 0) { 12978 vput(pvp); 12979 return (error); 12980 } 12981 ACQUIRE_LOCK(ump); 12982 locked = 1; 12983 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { 12984 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { 12985 if (wk->wk_type != D_DIRADD) 12986 panic("softdep_fsync: Unexpected type %s", 12987 TYPENAME(wk->wk_type)); 12988 dap = WK_DIRADD(wk); 12989 if (dap->da_state & DIRCHG) 12990 pagedep = dap->da_previous->dm_pagedep; 12991 else 12992 pagedep = dap->da_pagedep; 12993 pagedep_new_block = pagedep->pd_state & NEWBLOCK; 12994 FREE_LOCK(ump); 12995 locked = 0; 12996 if (pagedep_new_block && (error = 12997 ffs_syncvnode(pvp, MNT_WAIT, 0))) { 12998 vput(pvp); 12999 return (error); 13000 } 13001 } 13002 } 13003 if (locked) 13004 FREE_LOCK(ump); 13005 } 13006 /* 13007 * Flush directory page containing the inode's name. 13008 */ 13009 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, 13010 &bp); 13011 if (error == 0) 13012 error = bwrite(bp); 13013 else 13014 brelse(bp); 13015 vput(pvp); 13016 if (!ffs_fsfail_cleanup(ump, error)) 13017 return (error); 13018 ACQUIRE_LOCK(ump); 13019 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 13020 break; 13021 } 13022 FREE_LOCK(ump); 13023 return (0); 13024 } 13025 13026 /* 13027 * Flush all the dirty bitmaps associated with the block device 13028 * before flushing the rest of the dirty blocks so as to reduce 13029 * the number of dependencies that will have to be rolled back. 13030 * 13031 * XXX Unused? 
13032 */ 13033 void 13034 softdep_fsync_mountdev(vp) 13035 struct vnode *vp; 13036 { 13037 struct buf *bp, *nbp; 13038 struct worklist *wk; 13039 struct bufobj *bo; 13040 13041 if (!vn_isdisk(vp)) 13042 panic("softdep_fsync_mountdev: vnode not a disk"); 13043 bo = &vp->v_bufobj; 13044 restart: 13045 BO_LOCK(bo); 13046 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 13047 /* 13048 * If it is already scheduled, skip to the next buffer. 13049 */ 13050 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) 13051 continue; 13052 13053 if ((bp->b_flags & B_DELWRI) == 0) 13054 panic("softdep_fsync_mountdev: not dirty"); 13055 /* 13056 * We are only interested in bitmaps with outstanding 13057 * dependencies. 13058 */ 13059 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || 13060 wk->wk_type != D_BMSAFEMAP || 13061 (bp->b_vflags & BV_BKGRDINPROG)) { 13062 BUF_UNLOCK(bp); 13063 continue; 13064 } 13065 BO_UNLOCK(bo); 13066 bremfree(bp); 13067 (void) bawrite(bp); 13068 goto restart; 13069 } 13070 drain_output(vp); 13071 BO_UNLOCK(bo); 13072 } 13073 13074 /* 13075 * Sync all cylinder groups that were dirty at the time this function is 13076 * called. Newly dirtied cgs will be inserted before the sentinel. This 13077 * is used to flush freedep activity that may be holding up writes to a 13078 * indirect block. 13079 */ 13080 static int 13081 sync_cgs(mp, waitfor) 13082 struct mount *mp; 13083 int waitfor; 13084 { 13085 struct bmsafemap *bmsafemap; 13086 struct bmsafemap *sentinel; 13087 struct ufsmount *ump; 13088 struct buf *bp; 13089 int error; 13090 13091 sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK); 13092 sentinel->sm_cg = -1; 13093 ump = VFSTOUFS(mp); 13094 error = 0; 13095 ACQUIRE_LOCK(ump); 13096 LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next); 13097 for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL; 13098 bmsafemap = LIST_NEXT(sentinel, sm_next)) { 13099 /* Skip sentinels and cgs with no work to release. */ 13100 if (bmsafemap->sm_cg == -1 || 13101 (LIST_EMPTY(&bmsafemap->sm_freehd) && 13102 LIST_EMPTY(&bmsafemap->sm_freewr))) { 13103 LIST_REMOVE(sentinel, sm_next); 13104 LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next); 13105 continue; 13106 } 13107 /* 13108 * If we don't get the lock and we're waiting try again, if 13109 * not move on to the next buf and try to sync it. 13110 */ 13111 bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor); 13112 if (bp == NULL && waitfor == MNT_WAIT) 13113 continue; 13114 LIST_REMOVE(sentinel, sm_next); 13115 LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next); 13116 if (bp == NULL) 13117 continue; 13118 FREE_LOCK(ump); 13119 if (waitfor == MNT_NOWAIT) 13120 bawrite(bp); 13121 else 13122 error = bwrite(bp); 13123 ACQUIRE_LOCK(ump); 13124 if (error) 13125 break; 13126 } 13127 LIST_REMOVE(sentinel, sm_next); 13128 FREE_LOCK(ump); 13129 free(sentinel, M_BMSAFEMAP); 13130 return (error); 13131 } 13132 13133 /* 13134 * This routine is called when we are trying to synchronously flush a 13135 * file. This routine must eliminate any filesystem metadata dependencies 13136 * so that the syncing routine can succeed. 13137 */ 13138 int 13139 softdep_sync_metadata(struct vnode *vp) 13140 { 13141 struct inode *ip; 13142 int error; 13143 13144 ip = VTOI(vp); 13145 KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, 13146 ("softdep_sync_metadata called on non-softdep filesystem")); 13147 /* 13148 * Ensure that any direct block dependencies have been cleared, 13149 * truncations are started, and inode references are journaled. 
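 * This is handled by flush_inodedep_deps() and process_truncates()
 * below; when vp is the device vnode (VCHR) the journal is flushed
 * first so that no journal rollbacks remain on devvp.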
13150 */ 13151 ACQUIRE_LOCK(VFSTOUFS(vp->v_mount)); 13152 /* 13153 * Write all journal records to prevent rollbacks on devvp. 13154 */ 13155 if (vp->v_type == VCHR) 13156 softdep_flushjournal(vp->v_mount); 13157 error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number); 13158 /* 13159 * Ensure that all truncates are written so we won't find deps on 13160 * indirect blocks. 13161 */ 13162 process_truncates(vp); 13163 FREE_LOCK(VFSTOUFS(vp->v_mount)); 13164 13165 return (error); 13166 } 13167 13168 /* 13169 * This routine is called when we are attempting to sync a buf with 13170 * dependencies. If waitfor is MNT_NOWAIT it attempts to schedule any 13171 * other IO it can but returns EBUSY if the buffer is not yet able to 13172 * be written. Dependencies which will not cause rollbacks will always 13173 * return 0. 13174 */ 13175 int 13176 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) 13177 { 13178 struct indirdep *indirdep; 13179 struct pagedep *pagedep; 13180 struct allocindir *aip; 13181 struct newblk *newblk; 13182 struct ufsmount *ump; 13183 struct buf *nbp; 13184 struct worklist *wk; 13185 int i, error; 13186 13187 KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, 13188 ("softdep_sync_buf called on non-softdep filesystem")); 13189 /* 13190 * For VCHR we just don't want to force flush any dependencies that 13191 * will cause rollbacks. 13192 */ 13193 if (vp->v_type == VCHR) { 13194 if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0)) 13195 return (EBUSY); 13196 return (0); 13197 } 13198 ump = VFSTOUFS(vp->v_mount); 13199 ACQUIRE_LOCK(ump); 13200 /* 13201 * As we hold the buffer locked, none of its dependencies 13202 * will disappear. 13203 */ 13204 error = 0; 13205 top: 13206 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 13207 switch (wk->wk_type) { 13208 case D_ALLOCDIRECT: 13209 case D_ALLOCINDIR: 13210 newblk = WK_NEWBLK(wk); 13211 if (newblk->nb_jnewblk != NULL) { 13212 if (waitfor == MNT_NOWAIT) { 13213 error = EBUSY; 13214 goto out_unlock; 13215 } 13216 jwait(&newblk->nb_jnewblk->jn_list, waitfor); 13217 goto top; 13218 } 13219 if (newblk->nb_state & DEPCOMPLETE || 13220 waitfor == MNT_NOWAIT) 13221 continue; 13222 nbp = newblk->nb_bmsafemap->sm_buf; 13223 nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor); 13224 if (nbp == NULL) 13225 goto top; 13226 FREE_LOCK(ump); 13227 if ((error = bwrite(nbp)) != 0) 13228 goto out; 13229 ACQUIRE_LOCK(ump); 13230 continue; 13231 13232 case D_INDIRDEP: 13233 indirdep = WK_INDIRDEP(wk); 13234 if (waitfor == MNT_NOWAIT) { 13235 if (!TAILQ_EMPTY(&indirdep->ir_trunc) || 13236 !LIST_EMPTY(&indirdep->ir_deplisthd)) { 13237 error = EBUSY; 13238 goto out_unlock; 13239 } 13240 } 13241 if (!TAILQ_EMPTY(&indirdep->ir_trunc)) 13242 panic("softdep_sync_buf: truncation pending."); 13243 restart: 13244 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { 13245 newblk = (struct newblk *)aip; 13246 if (newblk->nb_jnewblk != NULL) { 13247 jwait(&newblk->nb_jnewblk->jn_list, 13248 waitfor); 13249 goto restart; 13250 } 13251 if (newblk->nb_state & DEPCOMPLETE) 13252 continue; 13253 nbp = newblk->nb_bmsafemap->sm_buf; 13254 nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor); 13255 if (nbp == NULL) 13256 goto restart; 13257 FREE_LOCK(ump); 13258 if ((error = bwrite(nbp)) != 0) 13259 goto out; 13260 ACQUIRE_LOCK(ump); 13261 goto restart; 13262 } 13263 continue; 13264 13265 case D_PAGEDEP: 13266 /* 13267 * Only flush directory entries in synchronous passes. 
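 * An MNT_NOWAIT caller therefore gets EBUSY below, since resolving a
 * pagedep may require locking parent and child vnodes and waiting on
 * journal writes.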
13268 */ 13269 if (waitfor != MNT_WAIT) { 13270 error = EBUSY; 13271 goto out_unlock; 13272 } 13273 /* 13274 * While syncing snapshots, we must allow recursive 13275 * lookups. 13276 */ 13277 BUF_AREC(bp); 13278 /* 13279 * We are trying to sync a directory that may 13280 * have dependencies on both its own metadata 13281 * and/or dependencies on the inodes of any 13282 * recently allocated files. We walk its diradd 13283 * lists pushing out the associated inode. 13284 */ 13285 pagedep = WK_PAGEDEP(wk); 13286 for (i = 0; i < DAHASHSZ; i++) { 13287 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) 13288 continue; 13289 error = flush_pagedep_deps(vp, wk->wk_mp, 13290 &pagedep->pd_diraddhd[i], bp); 13291 if (error != 0) { 13292 if (error != ERELOOKUP) 13293 BUF_NOREC(bp); 13294 goto out_unlock; 13295 } 13296 } 13297 BUF_NOREC(bp); 13298 continue; 13299 13300 case D_FREEWORK: 13301 case D_FREEDEP: 13302 case D_JSEGDEP: 13303 case D_JNEWBLK: 13304 continue; 13305 13306 default: 13307 panic("softdep_sync_buf: Unknown type %s", 13308 TYPENAME(wk->wk_type)); 13309 /* NOTREACHED */ 13310 } 13311 } 13312 out_unlock: 13313 FREE_LOCK(ump); 13314 out: 13315 return (error); 13316 } 13317 13318 /* 13319 * Flush the dependencies associated with an inodedep. 13320 */ 13321 static int 13322 flush_inodedep_deps(vp, mp, ino) 13323 struct vnode *vp; 13324 struct mount *mp; 13325 ino_t ino; 13326 { 13327 struct inodedep *inodedep; 13328 struct inoref *inoref; 13329 struct ufsmount *ump; 13330 int error, waitfor; 13331 13332 /* 13333 * This work is done in two passes. The first pass grabs most 13334 * of the buffers and begins asynchronously writing them. The 13335 * only way to wait for these asynchronous writes is to sleep 13336 * on the filesystem vnode which may stay busy for a long time 13337 * if the filesystem is active. So, instead, we make a second 13338 * pass over the dependencies blocking on each write. In the 13339 * usual case we will be blocking against a write that we 13340 * initiated, so when it is done the dependency will have been 13341 * resolved. Thus the second pass is expected to end quickly. 13342 * We give a brief window at the top of the loop to allow 13343 * any pending I/O to complete. 13344 */ 13345 ump = VFSTOUFS(mp); 13346 LOCK_OWNED(ump); 13347 for (error = 0, waitfor = MNT_NOWAIT; ; ) { 13348 if (error) 13349 return (error); 13350 FREE_LOCK(ump); 13351 ACQUIRE_LOCK(ump); 13352 restart: 13353 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) 13354 return (0); 13355 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 13356 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 13357 == DEPCOMPLETE) { 13358 jwait(&inoref->if_list, MNT_WAIT); 13359 goto restart; 13360 } 13361 } 13362 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || 13363 flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || 13364 flush_deplist(&inodedep->id_extupdt, waitfor, &error) || 13365 flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) 13366 continue; 13367 /* 13368 * If pass2, we are done, otherwise do pass 2. 13369 */ 13370 if (waitfor == MNT_WAIT) 13371 break; 13372 waitfor = MNT_WAIT; 13373 } 13374 /* 13375 * Try freeing inodedep in case all dependencies have been removed. 13376 */ 13377 if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) 13378 (void) free_inodedep(inodedep); 13379 return (0); 13380 } 13381 13382 /* 13383 * Flush an inode dependency list. 
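 * Returns nonzero if it had to wait on a journal write or initiate
 * buffer I/O, in which case the caller should rescan the list;
 * returns zero when there was nothing left that it could flush.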
13384 */ 13385 static int 13386 flush_deplist(listhead, waitfor, errorp) 13387 struct allocdirectlst *listhead; 13388 int waitfor; 13389 int *errorp; 13390 { 13391 struct allocdirect *adp; 13392 struct newblk *newblk; 13393 struct ufsmount *ump; 13394 struct buf *bp; 13395 13396 if ((adp = TAILQ_FIRST(listhead)) == NULL) 13397 return (0); 13398 ump = VFSTOUFS(adp->ad_list.wk_mp); 13399 LOCK_OWNED(ump); 13400 TAILQ_FOREACH(adp, listhead, ad_next) { 13401 newblk = (struct newblk *)adp; 13402 if (newblk->nb_jnewblk != NULL) { 13403 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 13404 return (1); 13405 } 13406 if (newblk->nb_state & DEPCOMPLETE) 13407 continue; 13408 bp = newblk->nb_bmsafemap->sm_buf; 13409 bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor); 13410 if (bp == NULL) { 13411 if (waitfor == MNT_NOWAIT) 13412 continue; 13413 return (1); 13414 } 13415 FREE_LOCK(ump); 13416 if (waitfor == MNT_NOWAIT) 13417 bawrite(bp); 13418 else 13419 *errorp = bwrite(bp); 13420 ACQUIRE_LOCK(ump); 13421 return (1); 13422 } 13423 return (0); 13424 } 13425 13426 /* 13427 * Flush dependencies associated with an allocdirect block. 13428 */ 13429 static int 13430 flush_newblk_dep(vp, mp, lbn) 13431 struct vnode *vp; 13432 struct mount *mp; 13433 ufs_lbn_t lbn; 13434 { 13435 struct newblk *newblk; 13436 struct ufsmount *ump; 13437 struct bufobj *bo; 13438 struct inode *ip; 13439 struct buf *bp; 13440 ufs2_daddr_t blkno; 13441 int error; 13442 13443 error = 0; 13444 bo = &vp->v_bufobj; 13445 ip = VTOI(vp); 13446 blkno = DIP(ip, i_db[lbn]); 13447 if (blkno == 0) 13448 panic("flush_newblk_dep: Missing block"); 13449 ump = VFSTOUFS(mp); 13450 ACQUIRE_LOCK(ump); 13451 /* 13452 * Loop until all dependencies related to this block are satisfied. 13453 * We must be careful to restart after each sleep in case a write 13454 * completes some part of this process for us. 13455 */ 13456 for (;;) { 13457 if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { 13458 FREE_LOCK(ump); 13459 break; 13460 } 13461 if (newblk->nb_list.wk_type != D_ALLOCDIRECT) 13462 panic("flush_newblk_dep: Bad newblk %p", newblk); 13463 /* 13464 * Flush the journal. 13465 */ 13466 if (newblk->nb_jnewblk != NULL) { 13467 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 13468 continue; 13469 } 13470 /* 13471 * Write the bitmap dependency. 13472 */ 13473 if ((newblk->nb_state & DEPCOMPLETE) == 0) { 13474 bp = newblk->nb_bmsafemap->sm_buf; 13475 bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT); 13476 if (bp == NULL) 13477 continue; 13478 FREE_LOCK(ump); 13479 error = bwrite(bp); 13480 if (error) 13481 break; 13482 ACQUIRE_LOCK(ump); 13483 continue; 13484 } 13485 /* 13486 * Write the buffer. 13487 */ 13488 FREE_LOCK(ump); 13489 BO_LOCK(bo); 13490 bp = gbincore(bo, lbn); 13491 if (bp != NULL) { 13492 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 13493 LK_INTERLOCK, BO_LOCKPTR(bo)); 13494 if (error == ENOLCK) { 13495 ACQUIRE_LOCK(ump); 13496 error = 0; 13497 continue; /* Slept, retry */ 13498 } 13499 if (error != 0) 13500 break; /* Failed */ 13501 if (bp->b_flags & B_DELWRI) { 13502 bremfree(bp); 13503 error = bwrite(bp); 13504 if (error) 13505 break; 13506 } else 13507 BUF_UNLOCK(bp); 13508 } else 13509 BO_UNLOCK(bo); 13510 /* 13511 * We have to wait for the direct pointers to 13512 * point at the newdirblk before the dependency 13513 * will go away. 
13514 */ 13515 error = ffs_update(vp, 1); 13516 if (error) 13517 break; 13518 ACQUIRE_LOCK(ump); 13519 } 13520 return (error); 13521 } 13522 13523 /* 13524 * Eliminate a pagedep dependency by flushing out all its diradd dependencies. 13525 */ 13526 static int 13527 flush_pagedep_deps(pvp, mp, diraddhdp, locked_bp) 13528 struct vnode *pvp; 13529 struct mount *mp; 13530 struct diraddhd *diraddhdp; 13531 struct buf *locked_bp; 13532 { 13533 struct inodedep *inodedep; 13534 struct inoref *inoref; 13535 struct ufsmount *ump; 13536 struct diradd *dap; 13537 struct vnode *vp; 13538 int error = 0; 13539 struct buf *bp; 13540 ino_t inum; 13541 struct diraddhd unfinished; 13542 13543 LIST_INIT(&unfinished); 13544 ump = VFSTOUFS(mp); 13545 LOCK_OWNED(ump); 13546 restart: 13547 while ((dap = LIST_FIRST(diraddhdp)) != NULL) { 13548 /* 13549 * Flush ourselves if this directory entry 13550 * has a MKDIR_PARENT dependency. 13551 */ 13552 if (dap->da_state & MKDIR_PARENT) { 13553 FREE_LOCK(ump); 13554 if ((error = ffs_update(pvp, 1)) != 0) 13555 break; 13556 ACQUIRE_LOCK(ump); 13557 /* 13558 * If that cleared dependencies, go on to next. 13559 */ 13560 if (dap != LIST_FIRST(diraddhdp)) 13561 continue; 13562 /* 13563 * All MKDIR_PARENT dependencies and all the 13564 * NEWBLOCK pagedeps that are contained in direct 13565 * blocks were resolved by doing above ffs_update. 13566 * Pagedeps contained in indirect blocks may 13567 * require a complete sync'ing of the directory. 13568 * We are in the midst of doing a complete sync, 13569 * so if they are not resolved in this pass we 13570 * defer them for now as they will be sync'ed by 13571 * our caller shortly. 13572 */ 13573 LIST_REMOVE(dap, da_pdlist); 13574 LIST_INSERT_HEAD(&unfinished, dap, da_pdlist); 13575 continue; 13576 } 13577 /* 13578 * A newly allocated directory must have its "." and 13579 * ".." entries written out before its name can be 13580 * committed in its parent. 13581 */ 13582 inum = dap->da_newinum; 13583 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 13584 panic("flush_pagedep_deps: lost inode1"); 13585 /* 13586 * Wait for any pending journal adds to complete so we don't 13587 * cause rollbacks while syncing. 13588 */ 13589 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 13590 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 13591 == DEPCOMPLETE) { 13592 jwait(&inoref->if_list, MNT_WAIT); 13593 goto restart; 13594 } 13595 } 13596 if (dap->da_state & MKDIR_BODY) { 13597 FREE_LOCK(ump); 13598 error = get_parent_vp(pvp, mp, inum, locked_bp, 13599 diraddhdp, &unfinished, &vp); 13600 if (error != 0) 13601 break; 13602 error = flush_newblk_dep(vp, mp, 0); 13603 /* 13604 * If we still have the dependency we might need to 13605 * update the vnode to sync the new link count to 13606 * disk. 13607 */ 13608 if (error == 0 && dap == LIST_FIRST(diraddhdp)) 13609 error = ffs_update(vp, 1); 13610 vput(vp); 13611 if (error != 0) 13612 break; 13613 ACQUIRE_LOCK(ump); 13614 /* 13615 * If that cleared dependencies, go on to next. 13616 */ 13617 if (dap != LIST_FIRST(diraddhdp)) 13618 continue; 13619 if (dap->da_state & MKDIR_BODY) { 13620 inodedep_lookup(UFSTOVFS(ump), inum, 0, 13621 &inodedep); 13622 panic("flush_pagedep_deps: MKDIR_BODY " 13623 "inodedep %p dap %p vp %p", 13624 inodedep, dap, vp); 13625 } 13626 } 13627 /* 13628 * Flush the inode on which the directory entry depends. 
13629 * Having accounted for MKDIR_PARENT and MKDIR_BODY above, 13630 * the only remaining dependency is that the updated inode 13631 * count must get pushed to disk. The inode has already 13632 * been pushed into its inode buffer (via VOP_UPDATE) at 13633 * the time of the reference count change. So we need only 13634 * locate that buffer, ensure that there will be no rollback 13635 * caused by a bitmap dependency, then write the inode buffer. 13636 */ 13637 retry: 13638 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 13639 panic("flush_pagedep_deps: lost inode"); 13640 /* 13641 * If the inode still has bitmap dependencies, 13642 * push them to disk. 13643 */ 13644 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { 13645 bp = inodedep->id_bmsafemap->sm_buf; 13646 bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT); 13647 if (bp == NULL) 13648 goto retry; 13649 FREE_LOCK(ump); 13650 if ((error = bwrite(bp)) != 0) 13651 break; 13652 ACQUIRE_LOCK(ump); 13653 if (dap != LIST_FIRST(diraddhdp)) 13654 continue; 13655 } 13656 /* 13657 * If the inode is still sitting in a buffer waiting 13658 * to be written or waiting for the link count to be 13659 * adjusted update it here to flush it to disk. 13660 */ 13661 if (dap == LIST_FIRST(diraddhdp)) { 13662 FREE_LOCK(ump); 13663 error = get_parent_vp(pvp, mp, inum, locked_bp, 13664 diraddhdp, &unfinished, &vp); 13665 if (error != 0) 13666 break; 13667 error = ffs_update(vp, 1); 13668 vput(vp); 13669 if (error) 13670 break; 13671 ACQUIRE_LOCK(ump); 13672 } 13673 /* 13674 * If we have failed to get rid of all the dependencies 13675 * then something is seriously wrong. 13676 */ 13677 if (dap == LIST_FIRST(diraddhdp)) { 13678 inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); 13679 panic("flush_pagedep_deps: failed to flush " 13680 "inodedep %p ino %ju dap %p", 13681 inodedep, (uintmax_t)inum, dap); 13682 } 13683 } 13684 if (error) 13685 ACQUIRE_LOCK(ump); 13686 while ((dap = LIST_FIRST(&unfinished)) != NULL) { 13687 LIST_REMOVE(dap, da_pdlist); 13688 LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist); 13689 } 13690 return (error); 13691 } 13692 13693 /* 13694 * A large burst of file addition or deletion activity can drive the 13695 * memory load excessively high. First attempt to slow things down 13696 * using the techniques below. If that fails, this routine requests 13697 * the offending operations to fall back to running synchronously 13698 * until the memory load returns to a reasonable level. 13699 */ 13700 int 13701 softdep_slowdown(vp) 13702 struct vnode *vp; 13703 { 13704 struct ufsmount *ump; 13705 int jlow; 13706 int max_softdeps_hard; 13707 13708 KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, 13709 ("softdep_slowdown called on non-softdep filesystem")); 13710 ump = VFSTOUFS(vp->v_mount); 13711 ACQUIRE_LOCK(ump); 13712 jlow = 0; 13713 /* 13714 * Check for journal space if needed. 13715 */ 13716 if (DOINGSUJ(vp)) { 13717 if (journal_space(ump, 0) == 0) 13718 jlow = 1; 13719 } 13720 /* 13721 * If the system is under its limits and our filesystem is 13722 * not responsible for more than our share of the usage and 13723 * we are not low on journal space, then no need to slow down. 
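 * For illustration only: with a hypothetical max_softdeps of 8000,
 * the hard limit computed below is 8000 * 11 / 10 = 8800, and with a
 * single flush thread the per-filesystem inodedep share is the full
 * 8800 while the indirdep share is just 8800 / 1000 = 8.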
13724 */ 13725 max_softdeps_hard = max_softdeps * 11 / 10; 13726 if (dep_current[D_DIRREM] < max_softdeps_hard / 2 && 13727 dep_current[D_INODEDEP] < max_softdeps_hard && 13728 dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 && 13729 dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 && 13730 ump->softdep_curdeps[D_DIRREM] < 13731 (max_softdeps_hard / 2) / stat_flush_threads && 13732 ump->softdep_curdeps[D_INODEDEP] < 13733 max_softdeps_hard / stat_flush_threads && 13734 ump->softdep_curdeps[D_INDIRDEP] < 13735 (max_softdeps_hard / 1000) / stat_flush_threads && 13736 ump->softdep_curdeps[D_FREEBLKS] < 13737 max_softdeps_hard / stat_flush_threads) { 13738 FREE_LOCK(ump); 13739 return (0); 13740 } 13741 /* 13742 * If the journal is low or our filesystem is over its limit 13743 * then speedup the cleanup. 13744 */ 13745 if (ump->softdep_curdeps[D_INDIRDEP] < 13746 (max_softdeps_hard / 1000) / stat_flush_threads || jlow) 13747 softdep_speedup(ump); 13748 stat_sync_limit_hit += 1; 13749 FREE_LOCK(ump); 13750 /* 13751 * We only slow down the rate at which new dependencies are 13752 * generated if we are not using journaling. With journaling, 13753 * the cleanup should always be sufficient to keep things 13754 * under control. 13755 */ 13756 if (DOINGSUJ(vp)) 13757 return (0); 13758 return (1); 13759 } 13760 13761 static int 13762 softdep_request_cleanup_filter(struct vnode *vp, void *arg __unused) 13763 { 13764 return ((vp->v_iflag & VI_OWEINACT) != 0 && vp->v_usecount == 0 && 13765 ((vp->v_vflag & VV_NOSYNC) != 0 || VTOI(vp)->i_effnlink == 0)); 13766 } 13767 13768 static void 13769 softdep_request_cleanup_inactivate(struct mount *mp) 13770 { 13771 struct vnode *vp, *mvp; 13772 int error; 13773 13774 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, softdep_request_cleanup_filter, 13775 NULL) { 13776 vholdl(vp); 13777 vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); 13778 VI_LOCK(vp); 13779 if (vp->v_data != NULL && vp->v_usecount == 0) { 13780 while ((vp->v_iflag & VI_OWEINACT) != 0) { 13781 error = vinactive(vp); 13782 if (error != 0 && error != ERELOOKUP) 13783 break; 13784 } 13785 atomic_add_int(&stat_delayed_inact, 1); 13786 } 13787 VOP_UNLOCK(vp); 13788 vdropl(vp); 13789 } 13790 } 13791 13792 /* 13793 * Called by the allocation routines when they are about to fail 13794 * in the hope that we can free up the requested resource (inodes 13795 * or disk space). 13796 * 13797 * First check to see if the work list has anything on it. If it has, 13798 * clean up entries until we successfully free the requested resource. 13799 * Because this process holds inodes locked, we cannot handle any remove 13800 * requests that might block on a locked inode as that could lead to 13801 * deadlock. If the worklist yields none of the requested resource, 13802 * start syncing out vnodes to free up the needed space. 13803 */ 13804 int 13805 softdep_request_cleanup(fs, vp, cred, resource) 13806 struct fs *fs; 13807 struct vnode *vp; 13808 struct ucred *cred; 13809 int resource; 13810 { 13811 struct ufsmount *ump; 13812 struct mount *mp; 13813 long starttime; 13814 ufs2_daddr_t needed; 13815 int error, failed_vnode; 13816 13817 /* 13818 * If we are being called because of a process doing a 13819 * copy-on-write, then it is not safe to process any 13820 * worklist items as we will recurse into the copyonwrite 13821 * routine. This will result in an incoherent snapshot. 13822 * If the vnode that we hold is a snapshot, we must avoid 13823 * handling other resources that could cause deadlock. 
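 * In both cases we simply return 0 below, reporting that nothing
 * could be reclaimed, and leave the pending allocation to fail
 * rather than risk a deadlock or an inconsistent snapshot.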
13824 */ 13825 if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp))) 13826 return (0); 13827 13828 if (resource == FLUSH_BLOCKS_WAIT) 13829 stat_cleanup_blkrequests += 1; 13830 else 13831 stat_cleanup_inorequests += 1; 13832 13833 mp = vp->v_mount; 13834 ump = VFSTOUFS(mp); 13835 mtx_assert(UFS_MTX(ump), MA_OWNED); 13836 UFS_UNLOCK(ump); 13837 error = ffs_update(vp, 1); 13838 if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) { 13839 UFS_LOCK(ump); 13840 return (0); 13841 } 13842 /* 13843 * If we are in need of resources, start by cleaning up 13844 * any block removals associated with our inode. 13845 */ 13846 ACQUIRE_LOCK(ump); 13847 process_removes(vp); 13848 process_truncates(vp); 13849 FREE_LOCK(ump); 13850 /* 13851 * Now clean up at least as many resources as we will need. 13852 * 13853 * When requested to clean up inodes, the number that are needed 13854 * is set by the number of simultaneous writers (mnt_writeopcount) 13855 * plus a bit of slop (2) in case some more writers show up while 13856 * we are cleaning. 13857 * 13858 * When requested to free up space, the amount of space that 13859 * we need is enough blocks to allocate a full-sized segment 13860 * (fs_contigsumsize). The number of such segments that will 13861 * be needed is set by the number of simultaneous writers 13862 * (mnt_writeopcount) plus a bit of slop (2) in case some more 13863 * writers show up while we are cleaning. 13864 * 13865 * Additionally, if we are unpriviledged and allocating space, 13866 * we need to ensure that we clean up enough blocks to get the 13867 * needed number of blocks over the threshold of the minimum 13868 * number of blocks required to be kept free by the filesystem 13869 * (fs_minfree). 13870 */ 13871 if (resource == FLUSH_INODES_WAIT) { 13872 needed = vfs_mount_fetch_counter(vp->v_mount, 13873 MNT_COUNT_WRITEOPCOUNT) + 2; 13874 } else if (resource == FLUSH_BLOCKS_WAIT) { 13875 needed = (vfs_mount_fetch_counter(vp->v_mount, 13876 MNT_COUNT_WRITEOPCOUNT) + 2) * fs->fs_contigsumsize; 13877 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE)) 13878 needed += fragstoblks(fs, 13879 roundup((fs->fs_dsize * fs->fs_minfree / 100) - 13880 fs->fs_cstotal.cs_nffree, fs->fs_frag)); 13881 } else { 13882 printf("softdep_request_cleanup: Unknown resource type %d\n", 13883 resource); 13884 UFS_LOCK(ump); 13885 return (0); 13886 } 13887 starttime = time_second; 13888 retry: 13889 if (resource == FLUSH_BLOCKS_WAIT && 13890 fs->fs_cstotal.cs_nbfree <= needed) 13891 softdep_send_speedup(ump, needed * fs->fs_bsize, 13892 BIO_SPEEDUP_TRIM); 13893 if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 && 13894 fs->fs_cstotal.cs_nbfree <= needed) || 13895 (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && 13896 fs->fs_cstotal.cs_nifree <= needed)) { 13897 ACQUIRE_LOCK(ump); 13898 if (ump->softdep_on_worklist > 0 && 13899 process_worklist_item(UFSTOVFS(ump), 13900 ump->softdep_on_worklist, LK_NOWAIT) != 0) 13901 stat_worklist_push += 1; 13902 FREE_LOCK(ump); 13903 } 13904 13905 /* 13906 * Check that there are vnodes pending inactivation. As they 13907 * have been unlinked, inactivating them will free up their 13908 * inodes. 
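 * Only one thread runs the inactivation scan at a time
 * (FLUSH_DI_ACTIVE); any other thread arriving here just sleeps
 * until that scan completes.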
13909 */ 13910 ACQUIRE_LOCK(ump); 13911 if (resource == FLUSH_INODES_WAIT && 13912 fs->fs_cstotal.cs_nifree <= needed && 13913 fs->fs_pendinginodes <= needed) { 13914 if ((ump->um_softdep->sd_flags & FLUSH_DI_ACTIVE) == 0) { 13915 ump->um_softdep->sd_flags |= FLUSH_DI_ACTIVE; 13916 FREE_LOCK(ump); 13917 softdep_request_cleanup_inactivate(mp); 13918 ACQUIRE_LOCK(ump); 13919 ump->um_softdep->sd_flags &= ~FLUSH_DI_ACTIVE; 13920 wakeup(&ump->um_softdep->sd_flags); 13921 } else { 13922 while ((ump->um_softdep->sd_flags & 13923 FLUSH_DI_ACTIVE) != 0) { 13924 msleep(&ump->um_softdep->sd_flags, 13925 LOCK_PTR(ump), PVM, "ffsvina", hz); 13926 } 13927 } 13928 } 13929 FREE_LOCK(ump); 13930 13931 /* 13932 * If we still need resources and there are no more worklist 13933 * entries to process to obtain them, we have to start flushing 13934 * the dirty vnodes to force the release of additional requests 13935 * to the worklist that we can then process to reap additional 13936 * resources. We walk the vnodes associated with the mount point 13937 * until we get the needed worklist requests that we can reap. 13938 * 13939 * If there are several threads all needing to clean the same 13940 * mount point, only one is allowed to walk the mount list. 13941 * When several threads all try to walk the same mount list, 13942 * they end up competing with each other and often end up in 13943 * livelock. This approach ensures that forward progress is 13944 * made at the cost of occasional ENOSPC errors being returned 13945 * that might otherwise have been avoided. 13946 */ 13947 error = 1; 13948 if ((resource == FLUSH_BLOCKS_WAIT && 13949 fs->fs_cstotal.cs_nbfree <= needed) || 13950 (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && 13951 fs->fs_cstotal.cs_nifree <= needed)) { 13952 ACQUIRE_LOCK(ump); 13953 if ((ump->um_softdep->sd_flags & FLUSH_RC_ACTIVE) == 0) { 13954 ump->um_softdep->sd_flags |= FLUSH_RC_ACTIVE; 13955 FREE_LOCK(ump); 13956 failed_vnode = softdep_request_cleanup_flush(mp, ump); 13957 ACQUIRE_LOCK(ump); 13958 ump->um_softdep->sd_flags &= ~FLUSH_RC_ACTIVE; 13959 wakeup(&ump->um_softdep->sd_flags); 13960 FREE_LOCK(ump); 13961 if (ump->softdep_on_worklist > 0) { 13962 stat_cleanup_retries += 1; 13963 if (!failed_vnode) 13964 goto retry; 13965 } 13966 } else { 13967 while ((ump->um_softdep->sd_flags & 13968 FLUSH_RC_ACTIVE) != 0) { 13969 msleep(&ump->um_softdep->sd_flags, 13970 LOCK_PTR(ump), PVM, "ffsrca", hz); 13971 } 13972 FREE_LOCK(ump); 13973 error = 0; 13974 } 13975 stat_cleanup_failures += 1; 13976 } 13977 if (time_second - starttime > stat_cleanup_high_delay) 13978 stat_cleanup_high_delay = time_second - starttime; 13979 UFS_LOCK(ump); 13980 return (error); 13981 } 13982 13983 /* 13984 * Scan the vnodes for the specified mount point flushing out any 13985 * vnodes that can be locked without waiting. Finally, try to flush 13986 * the device associated with the mount point if it can be locked 13987 * without waiting. 13988 * 13989 * We return 0 if we were able to lock every vnode in our scan. 13990 * If we had to skip one or more vnodes, we return 1.
13991 */ 13992 static int 13993 softdep_request_cleanup_flush(mp, ump) 13994 struct mount *mp; 13995 struct ufsmount *ump; 13996 { 13997 struct thread *td; 13998 struct vnode *lvp, *mvp; 13999 int failed_vnode; 14000 14001 failed_vnode = 0; 14002 td = curthread; 14003 MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) { 14004 if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) { 14005 VI_UNLOCK(lvp); 14006 continue; 14007 } 14008 if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT) != 0) { 14009 failed_vnode = 1; 14010 continue; 14011 } 14012 if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */ 14013 vput(lvp); 14014 continue; 14015 } 14016 (void) ffs_syncvnode(lvp, MNT_NOWAIT, 0); 14017 vput(lvp); 14018 } 14019 lvp = ump->um_devvp; 14020 if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { 14021 VOP_FSYNC(lvp, MNT_NOWAIT, td); 14022 VOP_UNLOCK(lvp); 14023 } 14024 return (failed_vnode); 14025 } 14026 14027 static bool 14028 softdep_excess_items(struct ufsmount *ump, int item) 14029 { 14030 14031 KASSERT(item >= 0 && item < D_LAST, ("item %d", item)); 14032 return (dep_current[item] > max_softdeps && 14033 ump->softdep_curdeps[item] > max_softdeps / 14034 stat_flush_threads); 14035 } 14036 14037 static void 14038 schedule_cleanup(struct mount *mp) 14039 { 14040 struct ufsmount *ump; 14041 struct thread *td; 14042 14043 ump = VFSTOUFS(mp); 14044 LOCK_OWNED(ump); 14045 FREE_LOCK(ump); 14046 td = curthread; 14047 if ((td->td_pflags & TDP_KTHREAD) != 0 && 14048 (td->td_proc->p_flag2 & P2_AST_SU) == 0) { 14049 /* 14050 * No ast is delivered to kernel threads, so nobody 14051 * would deref the mp. Some kernel threads 14052 * explicitely check for AST, e.g. NFS daemon does 14053 * this in the serving loop. 14054 */ 14055 return; 14056 } 14057 if (td->td_su != NULL) 14058 vfs_rel(td->td_su); 14059 vfs_ref(mp); 14060 td->td_su = mp; 14061 thread_lock(td); 14062 td->td_flags |= TDF_ASTPENDING; 14063 thread_unlock(td); 14064 } 14065 14066 static void 14067 softdep_ast_cleanup_proc(struct thread *td) 14068 { 14069 struct mount *mp; 14070 struct ufsmount *ump; 14071 int error; 14072 bool req; 14073 14074 while ((mp = td->td_su) != NULL) { 14075 td->td_su = NULL; 14076 error = vfs_busy(mp, MBF_NOWAIT); 14077 vfs_rel(mp); 14078 if (error != 0) 14079 return; 14080 if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) { 14081 ump = VFSTOUFS(mp); 14082 for (;;) { 14083 req = false; 14084 ACQUIRE_LOCK(ump); 14085 if (softdep_excess_items(ump, D_INODEDEP)) { 14086 req = true; 14087 request_cleanup(mp, FLUSH_INODES); 14088 } 14089 if (softdep_excess_items(ump, D_DIRREM)) { 14090 req = true; 14091 request_cleanup(mp, FLUSH_BLOCKS); 14092 } 14093 FREE_LOCK(ump); 14094 if (softdep_excess_items(ump, D_NEWBLK) || 14095 softdep_excess_items(ump, D_ALLOCDIRECT) || 14096 softdep_excess_items(ump, D_ALLOCINDIR)) { 14097 error = vn_start_write(NULL, &mp, 14098 V_WAIT); 14099 if (error == 0) { 14100 req = true; 14101 VFS_SYNC(mp, MNT_WAIT); 14102 vn_finished_write(mp); 14103 } 14104 } 14105 if ((td->td_pflags & TDP_KTHREAD) != 0 || !req) 14106 break; 14107 } 14108 } 14109 vfs_unbusy(mp); 14110 } 14111 if ((mp = td->td_su) != NULL) { 14112 td->td_su = NULL; 14113 vfs_rel(mp); 14114 } 14115 } 14116 14117 /* 14118 * If memory utilization has gotten too high, deliberately slow things 14119 * down and speed up the I/O processing. 
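 * This is done either by co-opting the caller to process a couple of
 * worklist items itself, or by speeding up the syncer and putting the
 * caller to sleep until pause_timer or the syncer wakes it (the wait
 * is bounded by tickdelay, per the comment below).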
14120 */ 14121 static int 14122 request_cleanup(mp, resource) 14123 struct mount *mp; 14124 int resource; 14125 { 14126 struct thread *td = curthread; 14127 struct ufsmount *ump; 14128 14129 ump = VFSTOUFS(mp); 14130 LOCK_OWNED(ump); 14131 /* 14132 * We never hold up the filesystem syncer or buf daemon. 14133 */ 14134 if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF)) 14135 return (0); 14136 /* 14137 * First check to see if the work list has gotten backlogged. 14138 * If it has, co-opt this process to help clean up two entries. 14139 * Because this process may hold inodes locked, we cannot 14140 * handle any remove requests that might block on a locked 14141 * inode as that could lead to deadlock. We set TDP_SOFTDEP 14142 * to avoid recursively processing the worklist. 14143 */ 14144 if (ump->softdep_on_worklist > max_softdeps / 10) { 14145 td->td_pflags |= TDP_SOFTDEP; 14146 process_worklist_item(mp, 2, LK_NOWAIT); 14147 td->td_pflags &= ~TDP_SOFTDEP; 14148 stat_worklist_push += 2; 14149 return(1); 14150 } 14151 /* 14152 * Next, we attempt to speed up the syncer process. If that 14153 * is successful, then we allow the process to continue. 14154 */ 14155 if (softdep_speedup(ump) && 14156 resource != FLUSH_BLOCKS_WAIT && 14157 resource != FLUSH_INODES_WAIT) 14158 return(0); 14159 /* 14160 * If we are resource constrained on inode dependencies, try 14161 * flushing some dirty inodes. Otherwise, we are constrained 14162 * by file deletions, so try accelerating flushes of directories 14163 * with removal dependencies. We would like to do the cleanup 14164 * here, but we probably hold an inode locked at this point and 14165 * that might deadlock against one that we try to clean. So, 14166 * the best that we can do is request the syncer daemon to do 14167 * the cleanup for us. 14168 */ 14169 switch (resource) { 14170 case FLUSH_INODES: 14171 case FLUSH_INODES_WAIT: 14172 ACQUIRE_GBLLOCK(&lk); 14173 stat_ino_limit_push += 1; 14174 req_clear_inodedeps += 1; 14175 FREE_GBLLOCK(&lk); 14176 stat_countp = &stat_ino_limit_hit; 14177 break; 14178 14179 case FLUSH_BLOCKS: 14180 case FLUSH_BLOCKS_WAIT: 14181 ACQUIRE_GBLLOCK(&lk); 14182 stat_blk_limit_push += 1; 14183 req_clear_remove += 1; 14184 FREE_GBLLOCK(&lk); 14185 stat_countp = &stat_blk_limit_hit; 14186 break; 14187 14188 default: 14189 panic("request_cleanup: unknown type"); 14190 } 14191 /* 14192 * Hopefully the syncer daemon will catch up and awaken us. 14193 * We wait at most tickdelay before proceeding in any case. 14194 */ 14195 ACQUIRE_GBLLOCK(&lk); 14196 FREE_LOCK(ump); 14197 proc_waiting += 1; 14198 if (callout_pending(&softdep_callout) == FALSE) 14199 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, 14200 pause_timer, 0); 14201 14202 if ((td->td_pflags & TDP_KTHREAD) == 0) 14203 msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); 14204 proc_waiting -= 1; 14205 FREE_GBLLOCK(&lk); 14206 ACQUIRE_LOCK(ump); 14207 return (1); 14208 } 14209 14210 /* 14211 * Awaken processes pausing in request_cleanup and clear proc_waiting 14212 * to indicate that there is no longer a timer running. Pause_timer 14213 * will be called with the global softdep mutex (&lk) locked. 14214 */ 14215 static void 14216 pause_timer(arg) 14217 void *arg; 14218 { 14219 14220 GBLLOCK_OWNED(&lk); 14221 /* 14222 * The callout_ API has acquired mtx and will hold it around this 14223 * function call. 
	 */
	*stat_countp += proc_waiting;
	wakeup(&proc_waiting);
}

/*
 * If requested, try removing inode or removal dependencies.
 */
static void
check_clear_deps(mp)
	struct mount *mp;
{
	struct ufsmount *ump;
	bool suj_susp;

	/*
	 * Tell the lower layers that any TRIM or WRITE transactions that have
	 * been delayed for performance reasons should proceed to help
	 * alleviate the shortage faster.  The race between checking req_* and
	 * the softdep mutex (lk) is fine since this is an advisory operation
	 * that at most causes deferred work to be done sooner.
	 */
	ump = VFSTOUFS(mp);
	suj_susp = ump->um_softdep->sd_jblocks != NULL &&
	    ump->softdep_jblocks->jb_suspended;
	if (req_clear_remove || req_clear_inodedeps || suj_susp) {
		FREE_LOCK(ump);
		softdep_send_speedup(ump, 0, BIO_SPEEDUP_TRIM |
		    BIO_SPEEDUP_WRITE);
		ACQUIRE_LOCK(ump);
	}

	/*
	 * If we are suspended, it may be because of our using
	 * too many inodedeps, so help clear them out.
	 */
	if (suj_susp)
		clear_inodedeps(mp);

	/*
	 * General requests for cleanup of backed up dependencies.
	 */
	ACQUIRE_GBLLOCK(&lk);
	if (req_clear_inodedeps) {
		req_clear_inodedeps -= 1;
		FREE_GBLLOCK(&lk);
		clear_inodedeps(mp);
		ACQUIRE_GBLLOCK(&lk);
		wakeup(&proc_waiting);
	}
	if (req_clear_remove) {
		req_clear_remove -= 1;
		FREE_GBLLOCK(&lk);
		clear_remove(mp);
		ACQUIRE_GBLLOCK(&lk);
		wakeup(&proc_waiting);
	}
	FREE_GBLLOCK(&lk);
}

/*
 * Flush out a directory with at least one removal dependency in an effort to
 * reduce the number of dirrem, freefile, and freeblks dependency structures.
 */
static void
clear_remove(mp)
	struct mount *mp;
{
	struct pagedep_hashhead *pagedephd;
	struct pagedep *pagedep;
	struct ufsmount *ump;
	struct vnode *vp;
	struct bufobj *bo;
	int error, cnt;
	ino_t ino;

	ump = VFSTOUFS(mp);
	LOCK_OWNED(ump);

	for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
		pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
		if (ump->pagedep_nextclean > ump->pagedep_hash_size)
			ump->pagedep_nextclean = 0;
		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
			if (LIST_EMPTY(&pagedep->pd_dirremhd))
				continue;
			ino = pagedep->pd_ino;
			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
				continue;
			FREE_LOCK(ump);

			/*
			 * Let unmount clear deps
			 */
			error = vfs_busy(mp, MBF_NOWAIT);
			if (error != 0)
				goto finish_write;
			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
			    FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP);
			vfs_unbusy(mp);
			if (error != 0) {
				softdep_error("clear_remove: vget", error);
				goto finish_write;
			}
			MPASS(VTOI(vp)->i_mode != 0);
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
				softdep_error("clear_remove: fsync", error);
			bo = &vp->v_bufobj;
			BO_LOCK(bo);
			drain_output(vp);
			BO_UNLOCK(bo);
			vput(vp);
		finish_write:
			vn_finished_write(mp);
			ACQUIRE_LOCK(ump);
			return;
		}
	}
}

/*
 * Clear out a block of dirty inodes in an effort to reduce
 * the number of inodedep dependency structures.
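 * A starting inodedep is picked from the hash table and every inode
 * in the same inode block that still has dependencies is flushed,
 * the last one synchronously so that the inode block itself is written.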
 */
static void
clear_inodedeps(mp)
	struct mount *mp;
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	struct ufsmount *ump;
	struct vnode *vp;
	struct fs *fs;
	int error, cnt;
	ino_t firstino, lastino, ino;

	ump = VFSTOUFS(mp);
	fs = ump->um_fs;
	LOCK_OWNED(ump);
	/*
	 * Pick a random inode dependency to be cleared.
	 * We will then gather up all the inodes in its block
	 * that have dependencies and flush them out.
	 */
	for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
		inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
		if (ump->inodedep_nextclean > ump->inodedep_hash_size)
			ump->inodedep_nextclean = 0;
		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
			break;
	}
	if (inodedep == NULL)
		return;
	/*
	 * Find the last inode in the block with dependencies.
	 */
	firstino = rounddown2(inodedep->id_ino, INOPB(fs));
	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
			break;
	/*
	 * Asynchronously push all but the last inode with dependencies.
	 * Synchronously push the last inode with dependencies to ensure
	 * that the inode block gets written to free up the inodedeps.
	 */
	for (ino = firstino; ino <= lastino; ino++) {
		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
			continue;
		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
			continue;
		FREE_LOCK(ump);
		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
		if (error != 0) {
			vn_finished_write(mp);
			ACQUIRE_LOCK(ump);
			return;
		}
		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
		    FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP)) != 0) {
			softdep_error("clear_inodedeps: vget", error);
			vfs_unbusy(mp);
			vn_finished_write(mp);
			ACQUIRE_LOCK(ump);
			return;
		}
		vfs_unbusy(mp);
		if (VTOI(vp)->i_mode == 0) {
			vgone(vp);
		} else if (ino == lastino) {
			do {
				error = ffs_syncvnode(vp, MNT_WAIT, 0);
			} while (error == ERELOOKUP);
			if (error != 0)
				softdep_error("clear_inodedeps: fsync1", error);
		} else {
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
				softdep_error("clear_inodedeps: fsync2", error);
			BO_LOCK(&vp->v_bufobj);
			drain_output(vp);
			BO_UNLOCK(&vp->v_bufobj);
		}
		vput(vp);
		vn_finished_write(mp);
		ACQUIRE_LOCK(ump);
	}
}

void
softdep_buf_append(bp, wkhd)
	struct buf *bp;
	struct workhead *wkhd;
{
	struct worklist *wk;
	struct ufsmount *ump;

	if ((wk = LIST_FIRST(wkhd)) == NULL)
		return;
	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
	    ("softdep_buf_append called on non-softdep filesystem"));
	ump = VFSTOUFS(wk->wk_mp);
	ACQUIRE_LOCK(ump);
	while ((wk = LIST_FIRST(wkhd)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(&bp->b_dep, wk);
	}
	FREE_LOCK(ump);
}

void
softdep_inode_append(ip, cred, wkhd)
	struct inode *ip;
	struct ucred *cred;
	struct workhead *wkhd;
{
	struct buf *bp;
	struct fs *fs;
	struct ufsmount *ump;
	int error;

	ump = ITOUMP(ip);
	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
	    ("softdep_inode_append called on non-softdep filesystem"));
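	/*
	 * Queue the work items on the buffer holding the inode block.
	 * If the block cannot be read, the items are instead handed to
	 * softdep_freework() below.
	 */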
	fs = ump->um_fs;
	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs,
	    ip->i_number)), (int)fs->fs_bsize, cred, &bp);
	if (error) {
		bqrelse(bp);
		softdep_freework(wkhd);
		return;
	}
	softdep_buf_append(bp, wkhd);
	bqrelse(bp);
}

void
softdep_freework(wkhd)
	struct workhead *wkhd;
{
	struct worklist *wk;
	struct ufsmount *ump;

	if ((wk = LIST_FIRST(wkhd)) == NULL)
		return;
	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
	    ("softdep_freework called on non-softdep filesystem"));
	ump = VFSTOUFS(wk->wk_mp);
	ACQUIRE_LOCK(ump);
	handle_jwork(wkhd);
	FREE_LOCK(ump);
}

static struct ufsmount *
softdep_bp_to_mp(bp)
	struct buf *bp;
{
	struct mount *mp;
	struct vnode *vp;

	if (LIST_EMPTY(&bp->b_dep))
		return (NULL);
	vp = bp->b_vp;
	KASSERT(vp != NULL,
	    ("%s, buffer with dependencies lacks vnode", __func__));

	/*
	 * The ump mount point is stable after we get a correct
	 * pointer, since bp is locked and this prevents unmount from
	 * proceeding.  But to get to it, we cannot dereference bp->b_dep
	 * head wk_mp, because we do not yet own SU ump lock and
	 * workitem might be freed while dereferenced.
	 */
retry:
	switch (vp->v_type) {
	case VCHR:
		VI_LOCK(vp);
		mp = vp->v_type == VCHR ? vp->v_rdev->si_mountpt : NULL;
		VI_UNLOCK(vp);
		if (mp == NULL)
			goto retry;
		break;
	case VREG:
	case VDIR:
	case VLNK:
	case VFIFO:
	case VSOCK:
		mp = vp->v_mount;
		break;
	case VBLK:
		vn_printf(vp, "softdep_bp_to_mp: unexpected block device\n");
		/* FALLTHROUGH */
	case VNON:
	case VBAD:
	case VMARKER:
		mp = NULL;
		break;
	default:
		vn_printf(vp, "unknown vnode type");
		mp = NULL;
		break;
	}
	return (mp != NULL ? VFSTOUFS(mp) : NULL);
}

/*
 * Function to determine if the buffer has outstanding dependencies
 * that will cause a roll-back if the buffer is written.  If wantcount
 * is set, return number of dependencies, otherwise just yes or no.
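 *
 * The scan runs with the per-mount softdep lock held; a buffer whose
 * vnode is not on a UFS mount reports zero dependencies.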
 */
static int
softdep_count_dependencies(bp, wantcount)
	struct buf *bp;
	int wantcount;
{
	struct worklist *wk;
	struct ufsmount *ump;
	struct bmsafemap *bmsafemap;
	struct freework *freework;
	struct inodedep *inodedep;
	struct indirdep *indirdep;
	struct freeblks *freeblks;
	struct allocindir *aip;
	struct pagedep *pagedep;
	struct dirrem *dirrem;
	struct newblk *newblk;
	struct mkdir *mkdir;
	struct diradd *dap;
	int i, retval;

	ump = softdep_bp_to_mp(bp);
	if (ump == NULL)
		return (0);
	retval = 0;
	ACQUIRE_LOCK(ump);
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
		switch (wk->wk_type) {
		case D_INODEDEP:
			inodedep = WK_INODEDEP(wk);
			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
				/* bitmap allocation dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
				/* direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
				/* ext block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
				/* Add reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);

			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
				/* indirect truncation dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}

			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
				/* indirect block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_PAGEDEP:
			pagedep = WK_PAGEDEP(wk);
			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
					/* Journal remove ref dependency. */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			for (i = 0; i < DAHASHSZ; i++) {
				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i],
				    da_pdlist) {
					/* directory entry dependency */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			continue;

		case D_BMSAFEMAP:
			bmsafemap = WK_BMSAFEMAP(wk);
			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
				/* Add reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
				/* Allocate block dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEBLKS:
			freeblks = WK_FREEBLKS(wk);
			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
				/* Freeblk journal dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			newblk = WK_NEWBLK(wk);
			if (newblk->nb_jnewblk) {
				/* Journal allocate dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_MKDIR:
			mkdir = WK_MKDIR(wk);
			if (mkdir->md_jaddref) {
				/* Journal reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEWORK:
		case D_FREEDEP:
		case D_JSEGDEP:
		case D_JSEG:
		case D_SBDEP:
			/* never a dependency on these blocks */
			continue;

		default:
			panic("softdep_count_dependencies: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
out:
	FREE_LOCK(ump);
	return (retval);
}

/*
 * Acquire exclusive access to a buffer.
 * Must be called with the 'lock' rwlock held.
 * Return acquired buffer or NULL on failure.
 */
static struct buf *
getdirtybuf(bp, lock, waitfor)
	struct buf *bp;
	struct rwlock *lock;
	int waitfor;
{
	int error;

	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
		if (waitfor != MNT_WAIT)
			return (NULL);
		error = BUF_LOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
		/*
		 * Even if we successfully acquire bp here, we have dropped
		 * lock, which may violate our guarantee.
		 */
		if (error == 0)
			BUF_UNLOCK(bp);
		else if (error != ENOLCK)
			panic("getdirtybuf: inconsistent lock: %d", error);
		rw_wlock(lock);
		return (NULL);
	}
	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
		if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
			rw_wunlock(lock);
			BO_LOCK(bp->b_bufobj);
			BUF_UNLOCK(bp);
			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
				bp->b_vflags |= BV_BKGRDWAIT;
				msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
				    PRIBIO | PDROP, "getbuf", 0);
			} else
				BO_UNLOCK(bp->b_bufobj);
			rw_wlock(lock);
			return (NULL);
		}
		BUF_UNLOCK(bp);
		if (waitfor != MNT_WAIT)
			return (NULL);
#ifdef DEBUG_VFS_LOCKS
		if (bp->b_vp->v_type != VCHR)
			ASSERT_BO_WLOCKED(bp->b_bufobj);
#endif
		bp->b_vflags |= BV_BKGRDWAIT;
		rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
		return (NULL);
	}
	if ((bp->b_flags & B_DELWRI) == 0) {
		BUF_UNLOCK(bp);
		return (NULL);
	}
	bremfree(bp);
	return (bp);
}

/*
 * Check if it is safe to suspend the file system now.  On entry,
 * the vnode interlock for devvp should be held.  Return 0 with
 * the mount interlock held if the file system can be suspended now,
 * otherwise return EAGAIN with the mount interlock held.
 */
int
softdep_check_suspend(struct mount *mp,
		      struct vnode *devvp,
		      int softdep_depcnt,
		      int softdep_accdepcnt,
		      int secondary_writes,
		      int secondary_accwrites)
{
	struct buf *bp;
	struct bufobj *bo;
	struct ufsmount *ump;
	struct inodedep *inodedep;
	struct indirdep *indirdep;
	struct worklist *wk, *nextwk;
	int error, unlinked;

	bo = &devvp->v_bufobj;
	ASSERT_BO_WLOCKED(bo);

	/*
	 * If we are not running with soft updates, then we need only
	 * deal with secondary writes as we try to suspend.
	 */
	if (MOUNTEDSOFTDEP(mp) == 0) {
		MNT_ILOCK(mp);
		while (mp->mnt_secondary_writes != 0) {
			BO_UNLOCK(bo);
			msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
			    (PUSER - 1) | PDROP, "secwr", 0);
			BO_LOCK(bo);
			MNT_ILOCK(mp);
		}

		/*
		 * Reasons for needing more work before suspend:
		 * - Dirty buffers on devvp.
		 * - Secondary writes occurred after start of vnode sync loop
		 */
		error = 0;
		if (bo->bo_numoutput > 0 ||
		    bo->bo_dirty.bv_cnt > 0 ||
		    secondary_writes != 0 ||
		    mp->mnt_secondary_writes != 0 ||
		    secondary_accwrites != mp->mnt_secondary_accwrites)
			error = EAGAIN;
		BO_UNLOCK(bo);
		return (error);
	}

	/*
	 * If we are running with soft updates, then we need to coordinate
	 * with them as we try to suspend.
	 */
	ump = VFSTOUFS(mp);
	for (;;) {
		if (!TRY_ACQUIRE_LOCK(ump)) {
			BO_UNLOCK(bo);
			ACQUIRE_LOCK(ump);
			FREE_LOCK(ump);
			BO_LOCK(bo);
			continue;
		}
		MNT_ILOCK(mp);
		if (mp->mnt_secondary_writes != 0) {
			FREE_LOCK(ump);
			BO_UNLOCK(bo);
			msleep(&mp->mnt_secondary_writes,
			    MNT_MTX(mp),
			    (PUSER - 1) | PDROP, "secwr", 0);
			BO_LOCK(bo);
			continue;
		}
		break;
	}

	unlinked = 0;
	if (MOUNTEDSUJ(mp)) {
		for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked);
		    inodedep != NULL;
		    inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
			if ((inodedep->id_state & (UNLINKED | UNLINKLINKS |
			    UNLINKONLIST)) != (UNLINKED | UNLINKLINKS |
			    UNLINKONLIST) ||
			    !check_inodedep_free(inodedep))
				continue;
			unlinked++;
		}
	}

	/*
	 * XXX Check for orphaned indirdep dependency structures.
	 *
	 * During forcible unmount after a disk failure there is a
	 * bug that causes one or more indirdep dependency structures
	 * to fail to be deallocated.  We check for them here and clean
	 * them up so that the unmount can succeed.
	 */
	if ((ump->um_flags & UM_FSFAIL_CLEANUP) != 0 && ump->softdep_deps > 0 &&
	    ump->softdep_deps == ump->softdep_curdeps[D_INDIRDEP]) {
		LIST_FOREACH_SAFE(wk, &ump->softdep_alldeps[D_INDIRDEP],
		    wk_all, nextwk) {
			indirdep = WK_INDIRDEP(wk);
			if ((indirdep->ir_state & (GOINGAWAY | DEPCOMPLETE)) !=
			    (GOINGAWAY | DEPCOMPLETE) ||
			    !TAILQ_EMPTY(&indirdep->ir_trunc) ||
			    !LIST_EMPTY(&indirdep->ir_completehd) ||
			    !LIST_EMPTY(&indirdep->ir_writehd) ||
			    !LIST_EMPTY(&indirdep->ir_donehd) ||
			    !LIST_EMPTY(&indirdep->ir_deplisthd) ||
			    indirdep->ir_saveddata != NULL ||
			    indirdep->ir_savebp == NULL) {
				printf("%s: skipping orphaned indirdep %p\n",
				    __FUNCTION__, indirdep);
				continue;
			}
			printf("%s: freeing orphaned indirdep %p\n",
			    __FUNCTION__, indirdep);
			bp = indirdep->ir_savebp;
			indirdep->ir_savebp = NULL;
			free_indirdep(indirdep);
			FREE_LOCK(ump);
			brelse(bp);
			while (!TRY_ACQUIRE_LOCK(ump)) {
				BO_UNLOCK(bo);
				ACQUIRE_LOCK(ump);
				FREE_LOCK(ump);
				BO_LOCK(bo);
			}
		}
	}

	/*
	 * Reasons for needing more work before suspend:
	 * - Dirty buffers on devvp.
	 * - Dependency structures still exist
	 * - Softdep activity occurred after start of vnode sync loop
	 * - Secondary writes occurred after start of vnode sync loop
	 */
	error = 0;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    softdep_depcnt != unlinked ||
	    ump->softdep_deps != unlinked ||
	    softdep_accdepcnt != ump->softdep_accdeps ||
	    secondary_writes != 0 ||
	    mp->mnt_secondary_writes != 0 ||
	    secondary_accwrites != mp->mnt_secondary_accwrites)
		error = EAGAIN;
	FREE_LOCK(ump);
	BO_UNLOCK(bo);
	return (error);
}

/*
 * Get the number of dependency structures for the file system, both
 * the current number and the total number allocated.  These will
 * later be used to detect that softdep processing has occurred.
 */
void
softdep_get_depcounts(struct mount *mp,
		      int *softdep_depsp,
		      int *softdep_accdepsp)
{
	struct ufsmount *ump;

	if (MOUNTEDSOFTDEP(mp) == 0) {
		*softdep_depsp = 0;
		*softdep_accdepsp = 0;
		return;
	}
	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(ump);
	*softdep_depsp = ump->softdep_deps;
	*softdep_accdepsp = ump->softdep_accdeps;
	FREE_LOCK(ump);
}

/*
 * Wait for pending output on a vnode to complete.
 */
static void
drain_output(vp)
	struct vnode *vp;
{

	ASSERT_VOP_LOCKED(vp, "drain_output");
	(void)bufobj_wwait(&vp->v_bufobj, 0, 0);
}

/*
 * Called whenever a buffer that is being invalidated or reallocated
 * contains dependencies.  This should only happen if an I/O error has
 * occurred.  The routine is called with the buffer locked.
 */
static void
softdep_deallocate_dependencies(bp)
	struct buf *bp;
{

	if ((bp->b_ioflags & BIO_ERROR) == 0)
		panic("softdep_deallocate_dependencies: dangling deps");
	if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
		softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname,
		    bp->b_error);
	else
		printf("softdep_deallocate_dependencies: "
		    "got error %d while accessing filesystem\n", bp->b_error);
	if (bp->b_error != ENXIO)
		panic("softdep_deallocate_dependencies: unrecovered I/O error");
}

/*
 * Function to handle asynchronous write errors in the filesystem.
 */
static void
softdep_error(func, error)
	char *func;
	int error;
{

	/* XXX should do something better! */
	printf("%s: got error %d while accessing filesystem\n", func, error);
}

#ifdef DDB

/* exported to ffs_vfsops.c */
extern void db_print_ffs(struct ufsmount *ump);
void
db_print_ffs(struct ufsmount *ump)
{
	db_printf("mp %p (%s) devvp %p\n", ump->um_mountp,
	    ump->um_mountp->mnt_stat.f_mntonname, ump->um_devvp);
	db_printf(" fs %p ", ump->um_fs);

	if (ump->um_softdep != NULL) {
		db_printf("su_wl %d su_deps %d su_req %d\n",
		    ump->softdep_on_worklist, ump->softdep_deps,
		    ump->softdep_req);
	} else {
		db_printf("su disabled\n");
	}
}

static void
worklist_print(struct worklist *wk, int verbose)
{

	if (!verbose) {
		db_printf("%s: %p state 0x%b\n", TYPENAME(wk->wk_type), wk,
		    (u_int)wk->wk_state, PRINT_SOFTDEP_FLAGS);
		return;
	}
	db_printf("worklist: %p type %s state 0x%b next %p\n ", wk,
	    TYPENAME(wk->wk_type), (u_int)wk->wk_state, PRINT_SOFTDEP_FLAGS,
	    LIST_NEXT(wk, wk_list));
	db_print_ffs(VFSTOUFS(wk->wk_mp));
}

static void
inodedep_print(struct inodedep *inodedep, int verbose)
{

	worklist_print(&inodedep->id_list, 0);
	db_printf(" fs %p ino %jd inoblk %jd delta %jd nlink %jd\n",
	    inodedep->id_fs,
	    (intmax_t)inodedep->id_ino,
	    (intmax_t)fsbtodb(inodedep->id_fs,
	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
	    (intmax_t)inodedep->id_nlinkdelta,
	    (intmax_t)inodedep->id_savednlink);

	if (verbose == 0)
		return;

	db_printf(" bmsafemap %p, mkdiradd %p, inoreflst %p\n",
	    inodedep->id_bmsafemap,
	    inodedep->id_mkdiradd,
	    TAILQ_FIRST(&inodedep->id_inoreflst));
	db_printf(" dirremhd %p, pendinghd %p, bufwait %p\n",
	    LIST_FIRST(&inodedep->id_dirremhd),
	    LIST_FIRST(&inodedep->id_pendinghd),
	    LIST_FIRST(&inodedep->id_bufwait));
	db_printf(" inowait %p, inoupdt %p, newinoupdt %p\n",
	    LIST_FIRST(&inodedep->id_inowait),
	    TAILQ_FIRST(&inodedep->id_inoupdt),
	    TAILQ_FIRST(&inodedep->id_newinoupdt));
	db_printf(" extupdt %p, newextupdt %p, freeblklst %p\n",
	    TAILQ_FIRST(&inodedep->id_extupdt),
	    TAILQ_FIRST(&inodedep->id_newextupdt),
	    TAILQ_FIRST(&inodedep->id_freeblklst));
	db_printf(" saveino %p, savedsize %jd, savedextsize %jd\n",
	    inodedep->id_savedino1,
	    (intmax_t)inodedep->id_savedsize,
	    (intmax_t)inodedep->id_savedextsize);
}

static void
newblk_print(struct newblk *nbp)
{

	worklist_print(&nbp->nb_list, 0);
	db_printf(" newblkno %jd\n", (intmax_t)nbp->nb_newblkno);
	db_printf(" jnewblk %p, bmsafemap %p, freefrag %p\n",
	    &nbp->nb_jnewblk,
	    &nbp->nb_bmsafemap,
	    &nbp->nb_freefrag);
	db_printf(" indirdeps %p, newdirblk %p, jwork %p\n",
	    LIST_FIRST(&nbp->nb_indirdeps),
	    LIST_FIRST(&nbp->nb_newdirblk),
	    LIST_FIRST(&nbp->nb_jwork));
}

static void
allocdirect_print(struct allocdirect *adp)
{

	newblk_print(&adp->ad_block);
	db_printf(" oldblkno %jd, oldsize %ld, newsize %ld\n",
	    adp->ad_oldblkno, adp->ad_oldsize, adp->ad_newsize);
	db_printf(" offset %d, inodedep %p\n",
	    adp->ad_offset, adp->ad_inodedep);
}

static void
allocindir_print(struct allocindir *aip)
{

	newblk_print(&aip->ai_block);
	db_printf(" oldblkno %jd, lbn %jd\n",
	    (intmax_t)aip->ai_oldblkno, (intmax_t)aip->ai_lbn);
	db_printf(" offset %d, indirdep %p\n",
	    aip->ai_offset, aip->ai_indirdep);
}

static void
mkdir_print(struct mkdir *mkdir)
{

	worklist_print(&mkdir->md_list, 0);
	db_printf(" diradd %p, jaddref %p, buf %p\n",
	    mkdir->md_diradd, mkdir->md_jaddref, mkdir->md_buf);
}

DB_SHOW_COMMAND(sd_inodedep, db_show_sd_inodedep)
{

	if (have_addr == 0) {
		db_printf("inodedep address required\n");
		return;
	}
	inodedep_print((struct inodedep *)addr, 1);
}

DB_SHOW_COMMAND(sd_allinodedeps, db_show_sd_allinodedeps)
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	struct ufsmount *ump;
	int cnt;

	if (have_addr == 0) {
		db_printf("ufsmount address required\n");
		return;
	}
	ump = (struct ufsmount *)addr;
	for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) {
		inodedephd = &ump->inodedep_hashtbl[cnt];
		LIST_FOREACH(inodedep, inodedephd, id_hash) {
			inodedep_print(inodedep, 0);
		}
	}
}

DB_SHOW_COMMAND(sd_worklist, db_show_sd_worklist)
{

	if (have_addr == 0) {
		db_printf("worklist address required\n");
		return;
	}
	worklist_print((struct worklist *)addr, 1);
}

DB_SHOW_COMMAND(sd_workhead, db_show_sd_workhead)
{
	struct worklist *wk;
	struct workhead *wkhd;

	if (have_addr == 0) {
		db_printf("worklist address required "
		    "(for example value in bp->b_dep)\n");
		return;
	}
	/*
	 * We often do not have the address of the worklist head but
	 * instead a pointer to its first entry (e.g., we have the
	 * contents of bp->b_dep rather than &bp->b_dep).  But the back
	 * pointer of bp->b_dep will point at the head of the list, so
	 * we cheat and use that instead.  If we are in the middle of
	 * a list we will still get the same result, so nothing
	 * unexpected will result.
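	 *
	 * For example, given the value stored in a buffer's b_dep field,
	 * "show sd_workhead <value>" at the ddb prompt will typically
	 * walk and print the whole dependency list for that buffer.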
	 */
	wk = (struct worklist *)addr;
	if (wk == NULL)
		return;
	wkhd = (struct workhead *)wk->wk_list.le_prev;
	LIST_FOREACH(wk, wkhd, wk_list) {
		switch (wk->wk_type) {
		case D_INODEDEP:
			inodedep_print(WK_INODEDEP(wk), 0);
			continue;
		case D_ALLOCDIRECT:
			allocdirect_print(WK_ALLOCDIRECT(wk));
			continue;
		case D_ALLOCINDIR:
			allocindir_print(WK_ALLOCINDIR(wk));
			continue;
		case D_MKDIR:
			mkdir_print(WK_MKDIR(wk));
			continue;
		default:
			worklist_print(wk, 0);
			continue;
		}
	}
}

DB_SHOW_COMMAND(sd_mkdir, db_show_sd_mkdir)
{
	if (have_addr == 0) {
		db_printf("mkdir address required\n");
		return;
	}
	mkdir_print((struct mkdir *)addr);
}

DB_SHOW_COMMAND(sd_mkdir_list, db_show_sd_mkdir_list)
{
	struct mkdirlist *mkdirlisthd;
	struct mkdir *mkdir;

	if (have_addr == 0) {
		db_printf("mkdir listhead address required\n");
		return;
	}
	mkdirlisthd = (struct mkdirlist *)addr;
	LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) {
		mkdir_print(mkdir);
		if (mkdir->md_diradd != NULL) {
			db_printf(" ");
			worklist_print(&mkdir->md_diradd->da_list, 0);
		}
		if (mkdir->md_jaddref != NULL) {
			db_printf(" ");
			worklist_print(&mkdir->md_jaddref->ja_list, 0);
		}
	}
}

DB_SHOW_COMMAND(sd_allocdirect, db_show_sd_allocdirect)
{
	if (have_addr == 0) {
		db_printf("allocdirect address required\n");
		return;
	}
	allocdirect_print((struct allocdirect *)addr);
}

DB_SHOW_COMMAND(sd_allocindir, db_show_sd_allocindir)
{
	if (have_addr == 0) {
		db_printf("allocindir address required\n");
		return;
	}
	allocindir_print((struct allocindir *)addr);
}

#endif /* DDB */

#endif /* SOFTUPDATES */