/*-
 * Copyright 1998, 2000 Marshall Kirk McKusick.
 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
 * All rights reserved.
 *
 * The soft updates code is derived from the appendix of a University
 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
 * "Soft Updates: A Solution to the Metadata Update Problem in File
 * Systems", CSE-TR-254-95, August 1995).
 *
 * Further information about soft updates can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ffs.h"
#include "opt_ddb.h"

/*
 * For now we want the safety net that the DEBUG flag provides.
 */
#ifndef DEBUG
#define DEBUG
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kdb.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/softdep.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_extern.h>

#include <vm/vm.h>

#include <ddb/ddb.h>

#ifndef SOFTUPDATES

int
softdep_flushfiles(oldmnt, flags, td)
	struct mount *oldmnt;
	int flags;
	struct thread *td;
{

	panic("softdep_flushfiles called");
}

int
softdep_mount(devvp, mp, fs, cred)
	struct vnode *devvp;
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{

	return (0);
}

void
softdep_initialize()
{

	return;
}

void
softdep_uninitialize()
{

	return;
}

void
softdep_unmount(mp)
	struct mount *mp;
{

}

void
softdep_setup_sbupdate(ump, fs, bp)
	struct ufsmount *ump;
	struct fs *fs;
	struct buf *bp;
{
}

void
softdep_setup_inomapdep(bp, ip, newinum)
	struct buf *bp;
	struct inode *ip;
	ino_t newinum;
{

	panic("softdep_setup_inomapdep called");
}

void
softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
	struct buf *bp;
	struct mount *mp;
	ufs2_daddr_t newblkno;
	int frags;
	int oldfrags;
{

	panic("softdep_setup_blkmapdep called");
}

void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;
	ufs_lbn_t lbn;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	long newsize;
	long oldsize;
	struct buf *bp;
{

	panic("softdep_setup_allocdirect called");
}

void
softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;
	ufs_lbn_t lbn;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	long newsize;
	long oldsize;
	struct buf *bp;
{

	panic("softdep_setup_allocext called");
}

void
softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
	struct inode *ip;
	ufs_lbn_t lbn;
	struct buf *bp;
	int ptrno;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	struct buf *nbp;
{

	panic("softdep_setup_allocindir_page called");
}

void
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	struct buf *nbp;
	struct inode *ip;
	struct buf *bp;
	int ptrno;
	ufs2_daddr_t newblkno;
{

	panic("softdep_setup_allocindir_meta called");
}

void
softdep_setup_freeblocks(ip, length, flags)
	struct inode *ip;
	off_t length;
	int flags;
{

	panic("softdep_setup_freeblocks called");
}

void
softdep_freefile(pvp, ino, mode)
	struct vnode *pvp;
	ino_t ino;
	int mode;
{

	panic("softdep_freefile called");
}

int
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
	struct buf *bp;
	struct inode *dp;
	off_t diroffset;
	ino_t newinum;
	struct buf *newdirbp;
	int isnewblk;
{

	panic("softdep_setup_directory_add called");
}

void
softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
	struct buf *bp;
	struct inode *dp;
	caddr_t base;
	caddr_t oldloc;
	caddr_t newloc;
	int entrysize;
{

	panic("softdep_change_directoryentry_offset called");
}

void
softdep_setup_remove(bp, dp, ip, isrmdir)
	struct buf *bp;
	struct inode *dp;
	struct inode *ip;
	int isrmdir;
{

	panic("softdep_setup_remove called");
}

void
softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
	struct buf *bp;
	struct inode *dp;
	struct inode *ip;
	ino_t newinum;
	int isrmdir;
{

	panic("softdep_setup_directory_change called");
}

void *
softdep_setup_trunc(vp, length, flags)
	struct vnode *vp;
	off_t length;
	int flags;
{

	panic("%s called", __FUNCTION__);

	return (NULL);
}

int
softdep_complete_trunc(vp, cookie)
	struct vnode *vp;
	void *cookie;
{

	panic("%s called", __FUNCTION__);

	return (0);
}

void
softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
	struct mount *mp;
	struct buf *bp;
	ufs2_daddr_t blkno;
	int frags;
	struct workhead *wkhd;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_setup_inofree(mp, bp, ino, wkhd)
	struct mount *mp;
	struct buf *bp;
	ino_t ino;
	struct workhead *wkhd;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_setup_unlink(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_setup_link(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_revert_link(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_setup_rmdir(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_revert_rmdir(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_setup_create(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_revert_create(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_setup_mkdir(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_revert_mkdir(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

void
softdep_setup_dotdot_link(dp, ip)
	struct inode *dp;
	struct inode *ip;
{

	panic("%s called", __FUNCTION__);
}

int
softdep_prealloc(vp, waitok)
	struct vnode *vp;
	int waitok;
{

	panic("%s called", __FUNCTION__);

	return (0);
}

int
softdep_journal_lookup(mp, vpp)
	struct mount *mp;
	struct vnode **vpp;
{

	return (ENOENT);
}

void
softdep_change_linkcnt(ip)
	struct inode *ip;
{

	panic("softdep_change_linkcnt called");
}

void
softdep_load_inodeblock(ip)
	struct inode *ip;
{

	panic("softdep_load_inodeblock called");
}

void
softdep_update_inodeblock(ip, bp, waitfor)
	struct inode *ip;
	struct buf *bp;
	int waitfor;
{

	panic("softdep_update_inodeblock called");
}

int
softdep_fsync(vp)
	struct vnode *vp;	/* the "in_core" copy of the inode */
{

	return (0);
}

void
softdep_fsync_mountdev(vp)
	struct vnode *vp;
{

	return;
}

int
softdep_flushworklist(oldmnt, countp, td)
	struct mount *oldmnt;
	int *countp;
	struct thread *td;
{

	*countp = 0;
	return (0);
}

int
softdep_sync_metadata(struct vnode *vp)
{

	return (0);
}

int
softdep_slowdown(vp)
	struct vnode *vp;
{

	panic("softdep_slowdown called");
}

void
softdep_releasefile(ip)
	struct inode *ip;	/* inode with the zero effective link count */
{

	panic("softdep_releasefile called");
}

int
softdep_request_cleanup(fs, vp, cred, resource)
	struct fs *fs;
	struct vnode *vp;
	struct ucred *cred;
	int resource;
{

	return (0);
}

int
softdep_check_suspend(struct mount *mp,
		      struct vnode *devvp,
		      int softdep_deps,
		      int softdep_accdeps,
		      int secondary_writes,
		      int secondary_accwrites)
{
	struct bufobj *bo;
	int error;

	(void) softdep_deps,
	(void) softdep_accdeps;

	bo = &devvp->v_bufobj;
	ASSERT_BO_LOCKED(bo);

	MNT_ILOCK(mp);
	while (mp->mnt_secondary_writes != 0) {
		BO_UNLOCK(bo);
		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
		    (PUSER - 1) | PDROP, "secwr", 0);
		BO_LOCK(bo);
		MNT_ILOCK(mp);
	}

	/*
	 * Reasons for needing more work before suspend:
	 * - Dirty buffers on devvp.
	 * - Secondary writes occurred after start of vnode sync loop
	 */
	error = 0;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    secondary_writes != 0 ||
	    mp->mnt_secondary_writes != 0 ||
	    secondary_accwrites != mp->mnt_secondary_accwrites)
		error = EAGAIN;
	BO_UNLOCK(bo);
	return (error);
}

void
softdep_get_depcounts(struct mount *mp,
		      int *softdepactivep,
		      int *softdepactiveaccp)
{
	(void) mp;
	*softdepactivep = 0;
	*softdepactiveaccp = 0;
}

#else

FEATURE(softupdates, "FFS soft-updates support");

/*
 * These definitions need to be adapted to the system to which
 * this file is being ported.
 */

#define M_SOFTDEP_FLAGS		(M_WAITOK)

#define	D_PAGEDEP	0
#define	D_INODEDEP	1
#define	D_BMSAFEMAP	2
#define	D_NEWBLK	3
#define	D_ALLOCDIRECT	4
#define	D_INDIRDEP	5
#define	D_ALLOCINDIR	6
#define	D_FREEFRAG	7
#define	D_FREEBLKS	8
#define	D_FREEFILE	9
#define	D_DIRADD	10
#define	D_MKDIR		11
#define	D_DIRREM	12
#define	D_NEWDIRBLK	13
#define	D_FREEWORK	14
#define	D_FREEDEP	15
#define	D_JADDREF	16
#define	D_JREMREF	17
#define	D_JMVREF	18
#define	D_JNEWBLK	19
#define	D_JFREEBLK	20
#define	D_JFREEFRAG	21
#define	D_JSEG		22
#define	D_JSEGDEP	23
#define	D_SBDEP		24
#define	D_JTRUNC	25
#define	D_LAST		D_JTRUNC

unsigned long dep_current[D_LAST + 1];
unsigned long dep_total[D_LAST + 1];


SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats");
SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
    "total dependencies allocated");
SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
    "current dependencies allocated");

#define	SOFTDEP_TYPE(type, str, long)					\
    static MALLOC_DEFINE(M_ ## type, #str, long);			\
    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
	&dep_total[D_ ## type], 0, "");					\
    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD,	\
	&dep_current[D_ ## type], 0, "");

SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
    "Block or frag allocated from cyl group map");
SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");

static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");

/*
 * translate from workitem type to memory type
 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 */
static struct malloc_type *memtype[] = {
	M_PAGEDEP,
	M_INODEDEP,
	M_BMSAFEMAP,
	M_NEWBLK,
	M_ALLOCDIRECT,
	M_INDIRDEP,
	M_ALLOCINDIR,
	M_FREEFRAG,
	M_FREEBLKS,
	M_FREEFILE,
	M_DIRADD,
	M_MKDIR,
	M_DIRREM,
	M_NEWDIRBLK,
	M_FREEWORK,
	M_FREEDEP,
	M_JADDREF,
	M_JREMREF,
	M_JMVREF,
	M_JNEWBLK,
	M_JFREEBLK,
	M_JFREEFRAG,
	M_JSEG,
	M_JSEGDEP,
	M_SBDEP,
	M_JTRUNC
};

static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;

#define DtoM(type) (memtype[type])

/*
 * Names of malloc types.
 */
#define TYPENAME(type)	\
	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
/*
 * End system adaptation definitions.
 */

#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)

/*
 * Forward declarations.
 */
struct inodedep_hashhead;
struct newblk_hashhead;
struct pagedep_hashhead;
struct bmsafemap_hashhead;

/*
 * Internal function prototypes.
 */
static void softdep_error(char *, int);
static void drain_output(struct vnode *);
static struct buf *getdirtybuf(struct buf *, struct mtx *, int);
static void clear_remove(struct thread *);
static void clear_inodedeps(struct thread *);
static void unlinked_inodedep(struct mount *, struct inodedep *);
static void clear_unlinked_inodedep(struct inodedep *);
static struct inodedep *first_unlinked_inodedep(struct ufsmount *);
static int flush_pagedep_deps(struct vnode *, struct mount *,
	    struct diraddhd *);
static void free_pagedep(struct pagedep *);
static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
static int flush_inodedep_deps(struct mount *, ino_t);
static int flush_deplist(struct allocdirectlst *, int, int *);
static int handle_written_filepage(struct pagedep *, struct buf *);
static int handle_written_sbdep(struct sbdep *, struct buf *);
static void initiate_write_sbdep(struct sbdep *);
static void diradd_inode_written(struct diradd *, struct inodedep *);
static int handle_written_indirdep(struct indirdep *, struct buf *,
	    struct buf**);
static int handle_written_inodeblock(struct inodedep *, struct buf *);
static int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
static void handle_written_jaddref(struct jaddref *);
static void handle_written_jremref(struct jremref *);
static void handle_written_jseg(struct jseg *, struct buf *);
static void handle_written_jnewblk(struct jnewblk *);
static void handle_written_jfreeblk(struct jfreeblk *);
static void handle_written_jfreefrag(struct jfreefrag *);
static void complete_jseg(struct jseg *);
static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
static void jremref_write(struct jremref *, struct jseg *, uint8_t *);
static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
static inline void inoref_write(struct inoref *, struct jseg *,
	    struct jrefrec *);
static void handle_allocdirect_partdone(struct allocdirect *,
	    struct workhead *);
static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
	    struct workhead *);
static void indirdep_complete(struct indirdep *);
static int indirblk_inseg(struct mount *, ufs2_daddr_t);
static void handle_allocindir_partdone(struct allocindir *);
static void initiate_write_filepage(struct pagedep *, struct buf *);
static void initiate_write_indirdep(struct indirdep*, struct buf *);
static void handle_written_mkdir(struct mkdir *, int);
static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
static void handle_workitem_freefile(struct freefile *);
static void handle_workitem_remove(struct dirrem *, struct vnode *);
static struct dirrem *newdirrem(struct buf *, struct inode *,
	    struct inode *, int, struct dirrem **);
static void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *,
	    struct freeblks *);
static void free_indirdep(struct indirdep *);
static void free_diradd(struct diradd *, struct workhead *);
static void merge_diradd(struct inodedep *, struct diradd *);
static void complete_diradd(struct diradd *);
static struct diradd *diradd_lookup(struct pagedep *, int);
static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
	    struct jremref *);
static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
	    struct jremref *);
static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
	    struct jremref *, struct jremref *);
static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
	    struct jremref *);
static void cancel_allocindir(struct allocindir *, struct inodedep *,
	    struct freeblks *);
static void complete_mkdir(struct mkdir *);
static void free_newdirblk(struct newdirblk *);
static void free_jremref(struct jremref *);
static void free_jaddref(struct jaddref *);
static void free_jsegdep(struct jsegdep *);
static void free_jsegs(struct jblocks *);
static void rele_jseg(struct jseg *);
static void free_jseg(struct jseg *, struct jblocks *);
static void free_jnewblk(struct jnewblk *);
static void free_jfreeblk(struct jfreeblk *);
static void free_jfreefrag(struct jfreefrag *);
static void free_freedep(struct freedep *);
static void journal_jremref(struct dirrem *, struct jremref *,
	    struct inodedep *);
static void cancel_jnewblk(struct jnewblk *, struct workhead *);
static int cancel_jaddref(struct jaddref *, struct inodedep *,
	    struct workhead *);
static void cancel_jfreefrag(struct jfreefrag *);
static inline void setup_freedirect(struct freeblks *, struct inode *,
	    int, int);
static inline void setup_freeext(struct freeblks *, struct inode *, int, int);
static inline void setup_freeindir(struct freeblks *, struct inode *, int i,
	    ufs_lbn_t, int);
static inline struct freeblks *newfreeblks(struct mount *, struct inode *);
static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
static void softdep_trunc_deps(struct vnode *, struct freeblks *, ufs_lbn_t,
	    int, int);
static int cancel_pagedep(struct pagedep *, struct inodedep *,
	    struct freeblks *);
static int deallocate_dependencies(struct buf *, struct inodedep *,
	    struct freeblks *, int off);
static void free_newblk(struct newblk *);
static void cancel_allocdirect(struct allocdirectlst *,
	    struct allocdirect *, struct freeblks *, int);
static int check_inode_unwritten(struct inodedep *);
static int free_inodedep(struct inodedep *);
static void freework_freeblock(struct freework *);
static void handle_workitem_freeblocks(struct freeblks *, int);
static void handle_complete_freeblocks(struct freeblks *);
static void handle_workitem_indirblk(struct freework *);
static void handle_written_freework(struct freework *);
static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
static struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
	    struct workhead *);
static void setup_allocindir_phase2(struct buf *, struct inode *,
	    struct inodedep *, struct allocindir *, ufs_lbn_t);
static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
	    ufs2_daddr_t, ufs_lbn_t);
static void handle_workitem_freefrag(struct freefrag *);
static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
	    ufs_lbn_t);
static void allocdirect_merge(struct allocdirectlst *,
	    struct allocdirect *, struct allocdirect *);
static struct freefrag *allocindir_merge(struct allocindir *,
	    struct allocindir *);
static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int,
	    struct bmsafemap **);
static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
	    int cg);
static int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t,
	    int, struct newblk **);
static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
	    struct inodedep **);
static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
static int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int,
	    struct pagedep **);
static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
	    struct mount *mp, int, struct pagedep **);
static void pause_timer(void *);
static int request_cleanup(struct mount *, int);
static int process_worklist_item(struct mount *, int);
static void process_removes(struct vnode *);
static void jwork_move(struct workhead *, struct workhead *);
static void add_to_worklist(struct worklist *, int);
static void remove_from_worklist(struct worklist *);
static void softdep_flush(void);
static int softdep_speedup(void);
static void worklist_speedup(void);
static int journal_mount(struct mount *, struct fs *, struct ucred *);
static void journal_unmount(struct mount *);
static int journal_space(struct ufsmount *, int);
static void journal_suspend(struct ufsmount *);
static int journal_unsuspend(struct ufsmount *ump);
static void softdep_prelink(struct vnode *, struct vnode *);
static void add_to_journal(struct worklist *);
static void remove_from_journal(struct worklist *);
static void softdep_process_journal(struct mount *, struct worklist *, int);
static struct jremref *newjremref(struct dirrem *, struct inode *,
	    struct inode *ip, off_t, nlink_t);
static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
	    uint16_t);
static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
	    uint16_t);
static inline struct jsegdep *inoref_jseg(struct inoref *);
static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
	    ufs2_daddr_t, int);
static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
	    ufs2_daddr_t, long, ufs_lbn_t);
static struct freework *newfreework(struct ufsmount *, struct freeblks *,
	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int);
static void jwait(struct worklist *wk);
static struct inodedep *inodedep_lookup_ip(struct inode *);
static int bmsafemap_rollbacks(struct bmsafemap *);
static struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
static void handle_jwork(struct workhead *);
static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
	    struct mkdir **);
static struct jblocks *jblocks_create(void);
static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
static void jblocks_free(struct jblocks *, struct mount *, int);
static void jblocks_destroy(struct jblocks *);
static void jblocks_add(struct jblocks *, ufs2_daddr_t, int);

/*
 * Exported softdep operations.
 */
static void softdep_disk_io_initiation(struct buf *);
static void softdep_disk_write_complete(struct buf *);
static void softdep_deallocate_dependencies(struct buf *);
static int softdep_count_dependencies(struct buf *bp, int);

static struct mtx lk;
MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);

#define TRY_ACQUIRE_LOCK(lk)		mtx_trylock(lk)
#define ACQUIRE_LOCK(lk)		mtx_lock(lk)
#define FREE_LOCK(lk)			mtx_unlock(lk)

#define	BUF_AREC(bp)			lockallowrecurse(&(bp)->b_lock)
#define	BUF_NOREC(bp)			lockdisablerecurse(&(bp)->b_lock)

/*
 * Worklist queue management.
 * These routines require that the lock be held.
 */
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do {	\
	(item)->wk_state |= ONWORKLIST;		\
	LIST_INSERT_HEAD(head, item, wk_list);	\
} while (0)
#define WORKLIST_REMOVE(item) do {		\
	(item)->wk_state &= ~ONWORKLIST;	\
	LIST_REMOVE(item, wk_list);		\
} while (0)
#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE

#else /* DEBUG */
static void worklist_insert(struct workhead *, struct worklist *, int);
static void worklist_remove(struct worklist *, int);

#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)

static void
worklist_insert(head, item, locked)
	struct workhead *head;
	struct worklist *item;
	int locked;
{

	if (locked)
		mtx_assert(&lk, MA_OWNED);
	if (item->wk_state & ONWORKLIST)
		panic("worklist_insert: %p %s(0x%X) already on list",
		    item, TYPENAME(item->wk_type), item->wk_state);
	item->wk_state |= ONWORKLIST;
	LIST_INSERT_HEAD(head, item, wk_list);
}

static void
worklist_remove(item, locked)
	struct worklist *item;
	int locked;
{

	if (locked)
		mtx_assert(&lk, MA_OWNED);
	if ((item->wk_state & ONWORKLIST) == 0)
		panic("worklist_remove: %p %s(0x%X) not on list",
		    item, TYPENAME(item->wk_type), item->wk_state);
	item->wk_state &= ~ONWORKLIST;
	LIST_REMOVE(item, wk_list);
}
#endif /* DEBUG */

/*
 * Merge two jsegdeps keeping only the oldest one as newer references
 * can't be discarded until after older references.
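 * The jsegdep whose journal segment has the lower sequence number (the
 * older one) is retained; the newer one is removed from its worklist and
 * freed.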
 */
static inline struct jsegdep *
jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
{
	struct jsegdep *swp;

	if (two == NULL)
		return (one);

	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
		swp = one;
		one = two;
		two = swp;
	}
	WORKLIST_REMOVE(&two->jd_list);
	free_jsegdep(two);

	return (one);
}

/*
 * If two freedeps are compatible free one to reduce list size.
 */
static inline struct freedep *
freedep_merge(struct freedep *one, struct freedep *two)
{
	if (two == NULL)
		return (one);

	if (one->fd_freework == two->fd_freework) {
		WORKLIST_REMOVE(&two->fd_list);
		free_freedep(two);
	}
	return (one);
}

/*
 * Move journal work from one list to another.  Duplicate freedeps and
 * jsegdeps are coalesced to keep the lists as small as possible.
 */
static void
jwork_move(dst, src)
	struct workhead *dst;
	struct workhead *src;
{
	struct freedep *freedep;
	struct jsegdep *jsegdep;
	struct worklist *wkn;
	struct worklist *wk;

	KASSERT(dst != src,
	    ("jwork_move: dst == src"));
	freedep = NULL;
	jsegdep = NULL;
	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
		if (wk->wk_type == D_JSEGDEP)
			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
		if (wk->wk_type == D_FREEDEP)
			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
	}

	mtx_assert(&lk, MA_OWNED);
	while ((wk = LIST_FIRST(src)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(dst, wk);
		if (wk->wk_type == D_JSEGDEP) {
			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
			continue;
		}
		if (wk->wk_type == D_FREEDEP)
			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
	}
}

/*
 * Routines for tracking and managing workitems.
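 * workitem_alloc() charges each item to its mount point and to the per-type
 * dep_current[]/dep_total[] counters; workitem_free() releases the item and
 * drops dep_current[] and the per-mount dependency count.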
 */
static void workitem_free(struct worklist *, int);
static void workitem_alloc(struct worklist *, int, struct mount *);

#define	WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))

static void
workitem_free(item, type)
	struct worklist *item;
	int type;
{
	struct ufsmount *ump;
	mtx_assert(&lk, MA_OWNED);

#ifdef DEBUG
	if (item->wk_state & ONWORKLIST)
		panic("workitem_free: %s(0x%X) still on list",
		    TYPENAME(item->wk_type), item->wk_state);
	if (item->wk_type != type)
		panic("workitem_free: type mismatch %s != %s",
		    TYPENAME(item->wk_type), TYPENAME(type));
#endif
	ump = VFSTOUFS(item->wk_mp);
	if (--ump->softdep_deps == 0 && ump->softdep_req)
		wakeup(&ump->softdep_deps);
	dep_current[type]--;
	free(item, DtoM(type));
}

static void
workitem_alloc(item, type, mp)
	struct worklist *item;
	int type;
	struct mount *mp;
{
	item->wk_type = type;
	item->wk_mp = mp;
	item->wk_state = 0;
	ACQUIRE_LOCK(&lk);
	dep_current[type]++;
	dep_total[type]++;
	VFSTOUFS(mp)->softdep_deps++;
	VFSTOUFS(mp)->softdep_accdeps++;
	FREE_LOCK(&lk);
}

/*
 * Workitem queue management
 */
static int max_softdeps;	/* maximum number of structs before slowdown */
static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
static int tickdelay = 2;	/* number of ticks to pause during slowdown */
static int proc_waiting;	/* tracks whether we have a timeout posted */
static int *stat_countp;	/* statistic to count in proc_waiting timeout */
static struct callout softdep_callout;
static int req_pending;
static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
static int req_clear_remove;	/* syncer process flush some freeblks */

/*
 * runtime statistics
 */
static int stat_worklist_push;	/* number of worklist cleanups */
static int stat_blk_limit_push;	/* number of times block limit neared */
static int stat_ino_limit_push;	/* number of times inode limit neared */
static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
static int stat_journal_min;	/* Times hit journal min threshold */
static int stat_journal_low;	/* Times hit journal low threshold */
static int stat_journal_wait;	/* Times blocked in jwait(). */
static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
static int stat_cleanup_failures; /* Number of cleanup requests that failed */

SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
    &max_softdeps, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
    &tickdelay, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
    &maxindirdeps, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
    &stat_worklist_push, 0,"");
SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
    &stat_blk_limit_push, 0,"");
SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
    &stat_ino_limit_push, 0,"");
SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
    &stat_blk_limit_hit, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
    &stat_ino_limit_hit, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
    &stat_sync_limit_hit, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
    &stat_indir_blk_ptrs, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
    &stat_inode_bitmap, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
    &stat_direct_blk_ptrs, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
    &stat_dir_entry, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
    &stat_jaddref, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
    &stat_jnewblk, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
    &stat_journal_low, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
    &stat_journal_min, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
    &stat_journal_wait, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
    &stat_jwait_filepage, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
    &stat_jwait_freeblks, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
    &stat_jwait_inode, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
    &stat_jwait_newblk, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
    &stat_cleanup_blkrequests, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
    &stat_cleanup_inorequests, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
    &stat_cleanup_high_delay, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
    &stat_cleanup_retries, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
    &stat_cleanup_failures, 0, "");

SYSCTL_DECL(_vfs_ffs);

LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
static u_long	bmsafemap_hash;	/* size of hash table - 1 */

static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
    &compute_summary_at_mount, 0, "Recompute summary at mount");

static struct proc *softdepproc;
static struct kproc_desc softdep_kp = {
	"softdepflush",
	softdep_flush,
	&softdepproc
};
SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
    &softdep_kp);

static void
softdep_flush(void)
{
	struct mount *nmp;
	struct mount *mp;
	struct ufsmount *ump;
	struct thread *td;
	int remaining;
	int progress;
	int vfslocked;

	td = curthread;
	td->td_pflags |= TDP_NORUNNINGBUF;

	for (;;) {
		kproc_suspend_check(softdepproc);
		vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
		ACQUIRE_LOCK(&lk);
		/*
		 * If requested, try removing inode or removal dependencies.
		 */
		if (req_clear_inodedeps) {
			clear_inodedeps(td);
			req_clear_inodedeps -= 1;
			wakeup_one(&proc_waiting);
		}
		if (req_clear_remove) {
			clear_remove(td);
			req_clear_remove -= 1;
			wakeup_one(&proc_waiting);
		}
		FREE_LOCK(&lk);
		VFS_UNLOCK_GIANT(vfslocked);
		remaining = progress = 0;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
				continue;
			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
				continue;
			vfslocked = VFS_LOCK_GIANT(mp);
			progress += softdep_process_worklist(mp, 0);
			ump = VFSTOUFS(mp);
			remaining += ump->softdep_on_worklist -
			    ump->softdep_on_worklist_inprogress;
			VFS_UNLOCK_GIANT(vfslocked);
			mtx_lock(&mountlist_mtx);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp);
		}
		mtx_unlock(&mountlist_mtx);
		if (remaining && progress)
			continue;
		ACQUIRE_LOCK(&lk);
		if (!req_pending)
			msleep(&req_pending, &lk, PVM, "sdflush", hz);
		req_pending = 0;
		FREE_LOCK(&lk);
	}
}

static void
worklist_speedup(void)
{
	mtx_assert(&lk, MA_OWNED);
	if (req_pending == 0) {
		req_pending = 1;
		wakeup(&req_pending);
	}
}

static int
softdep_speedup(void)
{

	worklist_speedup();
	bd_speedup();
	return speedup_syncer();
}

/*
 * Add an item to the end of the work queue.
 * This routine requires that the lock be held.
 * This is the only routine that adds items to the list.
 * The following routine is the only one that removes items
 * and does so in order from first to last.
 */
static void
add_to_worklist(wk, nodelay)
	struct worklist *wk;
	int nodelay;
{
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	ump = VFSTOUFS(wk->wk_mp);
	if (wk->wk_state & ONWORKLIST)
		panic("add_to_worklist: %s(0x%X) already on list",
		    TYPENAME(wk->wk_type), wk->wk_state);
	wk->wk_state |= ONWORKLIST;
	if (LIST_EMPTY(&ump->softdep_workitem_pending))
		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
	else
		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
	ump->softdep_worklist_tail = wk;
	ump->softdep_on_worklist += 1;
	if (nodelay)
		worklist_speedup();
}

/*
 * Remove the item to be processed. If we are removing the last
 * item on the list, we need to recalculate the tail pointer.
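 * Finding the new tail requires a linear scan of the pending list, since
 * the list head does not record its last element.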
 */
static void
remove_from_worklist(wk)
	struct worklist *wk;
{
	struct ufsmount *ump;
	struct worklist *wkend;

	ump = VFSTOUFS(wk->wk_mp);
	WORKLIST_REMOVE(wk);
	if (wk == ump->softdep_worklist_tail) {
		LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
			if (LIST_NEXT(wkend, wk_list) == NULL)
				break;
		ump->softdep_worklist_tail = wkend;
	}
	ump->softdep_on_worklist -= 1;
}

/*
 * Process that runs once per second to handle items in the background queue.
 *
 * Note that we ensure that everything is done in the order in which they
 * appear in the queue. The code below depends on this property to ensure
 * that blocks of a file are freed before the inode itself is freed. This
 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 * until all the old ones have been purged from the dependency lists.
 */
int
softdep_process_worklist(mp, full)
	struct mount *mp;
	int full;
{
	struct thread *td = curthread;
	int cnt, matchcnt;
	struct ufsmount *ump;
	long starttime;

	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
	/*
	 * Record the process identifier of our caller so that we can give
	 * this process preferential treatment in request_cleanup below.
	 */
	matchcnt = 0;
	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(&lk);
	starttime = time_second;
	softdep_process_journal(mp, NULL, full?MNT_WAIT:0);
	while (ump->softdep_on_worklist > 0) {
		if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1)
			break;
		else
			matchcnt += cnt;
		/*
		 * If requested, try removing inode or removal dependencies.
		 */
		if (req_clear_inodedeps) {
			clear_inodedeps(td);
			req_clear_inodedeps -= 1;
			wakeup_one(&proc_waiting);
		}
		if (req_clear_remove) {
			clear_remove(td);
			req_clear_remove -= 1;
			wakeup_one(&proc_waiting);
		}
		/*
		 * We do not generally want to stop for buffer space, but if
		 * we are really being a buffer hog, we will stop and wait.
		 */
		if (should_yield()) {
			FREE_LOCK(&lk);
			kern_yield(PRI_UNCHANGED);
			bwillwrite();
			ACQUIRE_LOCK(&lk);
		}
		/*
		 * Never allow processing to run for more than one
		 * second. Otherwise the other mountpoints may get
		 * excessively backlogged.
		 */
		if (!full && starttime != time_second)
			break;
	}
	if (full == 0)
		journal_unsuspend(ump);
	FREE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Process all removes associated with a vnode if we are running out of
 * journal space.  Any other process which attempts to flush these will
 * be unable as we have the vnodes locked.
 */
static void
process_removes(vp)
	struct vnode *vp;
{
	struct inodedep *inodedep;
	struct dirrem *dirrem;
	struct mount *mp;
	ino_t inum;

	mtx_assert(&lk, MA_OWNED);

	mp = vp->v_mount;
	inum = VTOI(vp)->i_number;
	for (;;) {
		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
			return;
		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext)
			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
			    (COMPLETE | ONWORKLIST))
				break;
		if (dirrem == NULL)
			return;
		/*
		 * If another thread is trying to lock this vnode it will
		 * fail but we must wait for it to do so before we can
		 * proceed.
		 */
		if (dirrem->dm_state & INPROGRESS) {
			dirrem->dm_state |= IOWAITING;
			msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0);
			continue;
		}
		remove_from_worklist(&dirrem->dm_list);
		FREE_LOCK(&lk);
		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
			panic("process_removes: suspended filesystem");
		handle_workitem_remove(dirrem, vp);
		vn_finished_secondary_write(mp);
		ACQUIRE_LOCK(&lk);
	}
}

/*
 * Process one item on the worklist.
 */
static int
process_worklist_item(mp, flags)
	struct mount *mp;
	int flags;
{
	struct worklist *wk;
	struct ufsmount *ump;
	struct vnode *vp;
	int matchcnt = 0;

	mtx_assert(&lk, MA_OWNED);
	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
	/*
	 * If we are being called because of a process doing a
	 * copy-on-write, then it is not safe to write as we may
	 * recurse into the copy-on-write routine.
	 */
	if (curthread->td_pflags & TDP_COWINPROGRESS)
		return (-1);
	/*
	 * Normally we just process each item on the worklist in order.
	 * However, if we are in a situation where we cannot lock any
	 * inodes, we have to skip over any dirrem requests whose
	 * vnodes are resident and locked.
	 */
	vp = NULL;
	ump = VFSTOUFS(mp);
	LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
		if (wk->wk_state & INPROGRESS)
			continue;
		if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
			break;
		wk->wk_state |= INPROGRESS;
		ump->softdep_on_worklist_inprogress++;
		FREE_LOCK(&lk);
		ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum,
		    LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ);
		ACQUIRE_LOCK(&lk);
		if (wk->wk_state & IOWAITING) {
			wk->wk_state &= ~IOWAITING;
			wakeup(wk);
		}
		wk->wk_state &= ~INPROGRESS;
		ump->softdep_on_worklist_inprogress--;
		if (vp != NULL)
			break;
	}
	if (wk == 0)
		return (-1);
	remove_from_worklist(wk);
	FREE_LOCK(&lk);
	if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
		panic("process_worklist_item: suspended filesystem");
	matchcnt++;
	switch (wk->wk_type) {

	case D_DIRREM:
		/* removal of a directory entry */
		handle_workitem_remove(WK_DIRREM(wk), vp);
		if (vp)
			vput(vp);
		break;

	case D_FREEBLKS:
		/* releasing blocks and/or fragments from a file */
		handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
		break;

	case D_FREEFRAG:
		/* releasing a fragment when replaced as a file grows */
		handle_workitem_freefrag(WK_FREEFRAG(wk));
		break;

	case D_FREEFILE:
		/* releasing an inode when its link count drops to 0 */
		handle_workitem_freefile(WK_FREEFILE(wk));
		break;

	case D_FREEWORK:
		/* Final block in an indirect was freed. */
		handle_workitem_indirblk(WK_FREEWORK(wk));
		break;

	default:
		panic("%s_process_worklist: Unknown type %s",
		    "softdep", TYPENAME(wk->wk_type));
		/* NOTREACHED */
	}
	vn_finished_secondary_write(mp);
	ACQUIRE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Move dependencies from one buffer to another.
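 * Returns non-zero if any moved dependency is a bmsafemap with pending
 * rollbacks.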
 */
int
softdep_move_dependencies(oldbp, newbp)
	struct buf *oldbp;
	struct buf *newbp;
{
	struct worklist *wk, *wktail;
	int dirty;

	dirty = 0;
	wktail = NULL;
	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
		LIST_REMOVE(wk, wk_list);
		if (wk->wk_type == D_BMSAFEMAP &&
		    bmsafemap_rollbacks(WK_BMSAFEMAP(wk)))
			dirty = 1;
		if (wktail == 0)
			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
		else
			LIST_INSERT_AFTER(wktail, wk, wk_list);
		wktail = wk;
	}
	FREE_LOCK(&lk);

	return (dirty);
}

/*
 * Purge the work list of all items associated with a particular mount point.
 */
int
softdep_flushworklist(oldmnt, countp, td)
	struct mount *oldmnt;
	int *countp;
	struct thread *td;
{
	struct vnode *devvp;
	int count, error = 0;
	struct ufsmount *ump;

	/*
	 * Alternately flush the block device associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. We continue until no more worklist dependencies
	 * are found.
	 */
	*countp = 0;
	ump = VFSTOUFS(oldmnt);
	devvp = ump->um_devvp;
	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
		*countp += count;
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_FSYNC(devvp, MNT_WAIT, td);
		VOP_UNLOCK(devvp, 0);
		if (error)
			break;
	}
	return (error);
}

int
softdep_waitidle(struct mount *mp)
{
	struct ufsmount *ump;
	int error;
	int i;

	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(&lk);
	for (i = 0; i < 10 && ump->softdep_deps; i++) {
		ump->softdep_req = 1;
		if (ump->softdep_on_worklist)
			panic("softdep_waitidle: work added after flush.");
		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
	}
	ump->softdep_req = 0;
	FREE_LOCK(&lk);
	error = 0;
	if (i == 10) {
		error = EBUSY;
		printf("softdep_waitidle: Failed to flush worklist for %p\n",
		    mp);
	}

	return (error);
}

/*
 * Flush all vnodes and worklist items associated with a specified mount point.
 */
int
softdep_flushfiles(oldmnt, flags, td)
	struct mount *oldmnt;
	int flags;
	struct thread *td;
{
	int error, depcount, loopcnt, retry_flush_count, retry;

	loopcnt = 10;
	retry_flush_count = 3;
retry_flush:
	error = 0;

	/*
	 * Alternately flush the vnodes associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. In theory, this loop can happen at most twice,
	 * but we give it a few extra just to be sure.
	 */
	for (; loopcnt > 0; loopcnt--) {
		/*
		 * Do another flush in case any vnodes were brought in
		 * as part of the cleanup operations.
		 */
		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
			break;
		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
		    depcount == 0)
			break;
	}
	/*
	 * If we are unmounting then it is an error to fail. If we
	 * are simply trying to downgrade to read-only, then filesystem
	 * activity can keep us busy forever, so we just fail with EBUSY.
	 */
	if (loopcnt == 0) {
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
			panic("softdep_flushfiles: looping");
		error = EBUSY;
	}
	if (!error)
		error = softdep_waitidle(oldmnt);
	if (!error) {
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
			retry = 0;
			MNT_ILOCK(oldmnt);
			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
			if (oldmnt->mnt_nvnodelistsize > 0) {
				if (--retry_flush_count > 0) {
					retry = 1;
					loopcnt = 3;
				} else
					error = EBUSY;
			}
			MNT_IUNLOCK(oldmnt);
			if (retry)
				goto retry_flush;
		}
	}
	return (error);
}

/*
 * Structure hashing.
 *
 * There are three types of structures that can be looked up:
 *	1) pagedep structures identified by mount point, inode number,
 *	   and logical block.
 *	2) inodedep structures identified by mount point and inode number.
 *	3) newblk structures identified by mount point and
 *	   physical block number.
 *
 * The "pagedep" and "inodedep" dependency structures are hashed
 * separately from the file blocks and inodes to which they correspond.
 * This separation helps when the in-memory copy of an inode or
 * file block must be replaced. It also obviates the need to access
 * an inode or file page when simply updating (or de-allocating)
 * dependency structures. Lookup of newblk structures is needed to
 * find newly allocated blocks when trying to associate them with
 * their allocdirect or allocindir structure.
 *
 * The lookup routines optionally create and hash a new instance when
 * an existing entry is not found.
 */
#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
#define NODELAY		0x0002	/* cannot do background work */

/*
 * Structures and routines associated with pagedep caching.
 */
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long	pagedep_hash;		/* size of hash table - 1 */
#define	PAGEDEP_HASH(mp, inum, lbn) \
	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	    pagedep_hash])

static int
pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
	struct pagedep_hashhead *pagedephd;
	ino_t ino;
	ufs_lbn_t lbn;
	struct mount *mp;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;

	LIST_FOREACH(pagedep, pagedephd, pd_hash)
		if (ino == pagedep->pd_ino &&
		    lbn == pagedep->pd_lbn &&
		    mp == pagedep->pd_list.wk_mp)
			break;
	if (pagedep) {
		*pagedeppp = pagedep;
		if ((flags & DEPALLOC) != 0 &&
		    (pagedep->pd_state & ONWORKLIST) == 0)
			return (0);
		return (1);
	}
	*pagedeppp = NULL;
	return (0);
}
/*
 * Look up a pagedep. Return 1 if found, 0 if not found or found
 * when asked to allocate but not associated with any buffer.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in pagedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
pagedep_lookup(mp, ino, lbn, flags, pagedeppp)
	struct mount *mp;
	ino_t ino;
	ufs_lbn_t lbn;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;
	struct pagedep_hashhead *pagedephd;
	int ret;
	int i;

	mtx_assert(&lk, MA_OWNED);
	pagedephd = PAGEDEP_HASH(mp, ino, lbn);

	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
	if (*pagedeppp || (flags & DEPALLOC) == 0)
		return (ret);
	FREE_LOCK(&lk);
	pagedep = malloc(sizeof(struct pagedep),
	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
	ACQUIRE_LOCK(&lk);
	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
	if (*pagedeppp) {
		WORKITEM_FREE(pagedep, D_PAGEDEP);
		return (ret);
	}
	pagedep->pd_ino = ino;
	pagedep->pd_lbn = lbn;
	LIST_INIT(&pagedep->pd_dirremhd);
	LIST_INIT(&pagedep->pd_pendinghd);
	for (i = 0; i < DAHASHSZ; i++)
		LIST_INIT(&pagedep->pd_diraddhd[i]);
	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	*pagedeppp = pagedep;
	return (0);
}

/*
 * Structures and routines associated with inodedep caching.
 */
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
static u_long	inodedep_hash;	/* size of hash table - 1 */
#define	INODEDEP_HASH(fs, inum) \
	(&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])

static int
inodedep_find(inodedephd, fs, inum, inodedeppp)
	struct inodedep_hashhead *inodedephd;
	struct fs *fs;
	ino_t inum;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;

	LIST_FOREACH(inodedep, inodedephd, id_hash)
		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
			break;
	if (inodedep) {
		*inodedeppp = inodedep;
		return (1);
	}
	*inodedeppp = NULL;

	return (0);
}
/*
 * Look up an inodedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in inodedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
inodedep_lookup(mp, inum, flags, inodedeppp)
	struct mount *mp;
	ino_t inum;
	int flags;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;
	struct inodedep_hashhead *inodedephd;
	struct fs *fs;

	mtx_assert(&lk, MA_OWNED);
	fs = VFSTOUFS(mp)->um_fs;
	inodedephd = INODEDEP_HASH(fs, inum);

	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
		return (1);
	if ((flags & DEPALLOC) == 0)
		return (0);
	/*
	 * If we are over our limit, try to improve the situation.
1894 */ 1895 if (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0) 1896 request_cleanup(mp, FLUSH_INODES); 1897 FREE_LOCK(&lk); 1898 inodedep = malloc(sizeof(struct inodedep), 1899 M_INODEDEP, M_SOFTDEP_FLAGS); 1900 workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); 1901 ACQUIRE_LOCK(&lk); 1902 if (inodedep_find(inodedephd, fs, inum, inodedeppp)) { 1903 WORKITEM_FREE(inodedep, D_INODEDEP); 1904 return (1); 1905 } 1906 inodedep->id_fs = fs; 1907 inodedep->id_ino = inum; 1908 inodedep->id_state = ALLCOMPLETE; 1909 inodedep->id_nlinkdelta = 0; 1910 inodedep->id_savedino1 = NULL; 1911 inodedep->id_savedsize = -1; 1912 inodedep->id_savedextsize = -1; 1913 inodedep->id_savednlink = -1; 1914 inodedep->id_bmsafemap = NULL; 1915 inodedep->id_mkdiradd = NULL; 1916 LIST_INIT(&inodedep->id_dirremhd); 1917 LIST_INIT(&inodedep->id_pendinghd); 1918 LIST_INIT(&inodedep->id_inowait); 1919 LIST_INIT(&inodedep->id_bufwait); 1920 TAILQ_INIT(&inodedep->id_inoreflst); 1921 TAILQ_INIT(&inodedep->id_inoupdt); 1922 TAILQ_INIT(&inodedep->id_newinoupdt); 1923 TAILQ_INIT(&inodedep->id_extupdt); 1924 TAILQ_INIT(&inodedep->id_newextupdt); 1925 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); 1926 *inodedeppp = inodedep; 1927 return (0); 1928 } 1929 1930 /* 1931 * Structures and routines associated with newblk caching. 1932 */ 1933 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; 1934 u_long newblk_hash; /* size of hash table - 1 */ 1935 #define NEWBLK_HASH(fs, inum) \ 1936 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) 1937 1938 static int 1939 newblk_find(newblkhd, mp, newblkno, flags, newblkpp) 1940 struct newblk_hashhead *newblkhd; 1941 struct mount *mp; 1942 ufs2_daddr_t newblkno; 1943 int flags; 1944 struct newblk **newblkpp; 1945 { 1946 struct newblk *newblk; 1947 1948 LIST_FOREACH(newblk, newblkhd, nb_hash) { 1949 if (newblkno != newblk->nb_newblkno) 1950 continue; 1951 if (mp != newblk->nb_list.wk_mp) 1952 continue; 1953 /* 1954 * If we're creating a new dependency don't match those that 1955 * have already been converted to allocdirects. This is for 1956 * a frag extend. 1957 */ 1958 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) 1959 continue; 1960 break; 1961 } 1962 if (newblk) { 1963 *newblkpp = newblk; 1964 return (1); 1965 } 1966 *newblkpp = NULL; 1967 return (0); 1968 } 1969 1970 /* 1971 * Look up a newblk. Return 1 if found, 0 if not found. 1972 * If not found, allocate if DEPALLOC flag is passed. 1973 * Found or allocated entry is returned in newblkpp. 
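 *
 * The entry is allocated as a union large enough to hold any of the
 * newblk variants (union allblk) so that it can later be converted in
 * place into an allocdirect or allocindir once the use of the block
 * is known.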
1974 */ 1975 static int 1976 newblk_lookup(mp, newblkno, flags, newblkpp) 1977 struct mount *mp; 1978 ufs2_daddr_t newblkno; 1979 int flags; 1980 struct newblk **newblkpp; 1981 { 1982 struct newblk *newblk; 1983 struct newblk_hashhead *newblkhd; 1984 1985 newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno); 1986 if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) 1987 return (1); 1988 if ((flags & DEPALLOC) == 0) 1989 return (0); 1990 FREE_LOCK(&lk); 1991 newblk = malloc(sizeof(union allblk), M_NEWBLK, 1992 M_SOFTDEP_FLAGS | M_ZERO); 1993 workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); 1994 ACQUIRE_LOCK(&lk); 1995 if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) { 1996 WORKITEM_FREE(newblk, D_NEWBLK); 1997 return (1); 1998 } 1999 newblk->nb_freefrag = NULL; 2000 LIST_INIT(&newblk->nb_indirdeps); 2001 LIST_INIT(&newblk->nb_newdirblk); 2002 LIST_INIT(&newblk->nb_jwork); 2003 newblk->nb_state = ATTACHED; 2004 newblk->nb_newblkno = newblkno; 2005 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); 2006 *newblkpp = newblk; 2007 return (0); 2008 } 2009 2010 /* 2011 * Structures and routines associated with indir caching. 2012 */ 2013 struct workhead *indir_hashtbl; 2014 u_long indir_hash; /* size of hash table - 1 */ 2015 #define INDIR_HASH(mp, blkno) \ 2016 (&indir_hashtbl[((((register_t)(mp)) >> 13) + (blkno)) & indir_hash]) 2017 2018 static int 2019 indirblk_inseg(mp, blkno) 2020 struct mount *mp; 2021 ufs2_daddr_t blkno; 2022 { 2023 struct freework *freework; 2024 struct workhead *wkhd; 2025 struct worklist *wk; 2026 2027 wkhd = INDIR_HASH(mp, blkno); 2028 LIST_FOREACH(wk, wkhd, wk_list) { 2029 freework = WK_FREEWORK(wk); 2030 if (freework->fw_blkno == blkno && 2031 freework->fw_list.wk_mp == mp) { 2032 LIST_REMOVE(freework, fw_next); 2033 WORKLIST_REMOVE(&freework->fw_list); 2034 WORKITEM_FREE(freework, D_FREEWORK); 2035 return (1); 2036 } 2037 } 2038 return (0); 2039 } 2040 2041 /* 2042 * Executed during filesystem system initialization before 2043 * mounting any filesystems. 2044 */ 2045 void 2046 softdep_initialize() 2047 { 2048 2049 LIST_INIT(&mkdirlisthd); 2050 max_softdeps = desiredvnodes * 4; 2051 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); 2052 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); 2053 newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash); 2054 bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash); 2055 indir_hashtbl = hashinit(desiredvnodes / 10, M_FREEWORK, &indir_hash); 2056 2057 /* initialise bioops hack */ 2058 bioops.io_start = softdep_disk_io_initiation; 2059 bioops.io_complete = softdep_disk_write_complete; 2060 bioops.io_deallocate = softdep_deallocate_dependencies; 2061 bioops.io_countdeps = softdep_count_dependencies; 2062 2063 /* Initialize the callout with an mtx. */ 2064 callout_init_mtx(&softdep_callout, &lk, 0); 2065 } 2066 2067 /* 2068 * Executed after all filesystems have been unmounted during 2069 * filesystem module unload. 2070 */ 2071 void 2072 softdep_uninitialize() 2073 { 2074 2075 callout_drain(&softdep_callout); 2076 hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); 2077 hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); 2078 hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); 2079 hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash); 2080 } 2081 2082 /* 2083 * Called at mount time to notify the dependency code that a 2084 * filesystem wishes to use it. 
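 * The routine records the soft update mount flags, initializes the
 * per-ufsmount work lists, mounts the journal when FS_SUJ is set in
 * the superblock and, when vfs.ffs.compute_summary_at_mount is set
 * and the filesystem is not clean, recomputes the cylinder group
 * summary information.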
2085 */ 2086 int 2087 softdep_mount(devvp, mp, fs, cred) 2088 struct vnode *devvp; 2089 struct mount *mp; 2090 struct fs *fs; 2091 struct ucred *cred; 2092 { 2093 struct csum_total cstotal; 2094 struct ufsmount *ump; 2095 struct cg *cgp; 2096 struct buf *bp; 2097 int error, cyl; 2098 2099 MNT_ILOCK(mp); 2100 mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; 2101 if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { 2102 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 2103 MNTK_SOFTDEP; 2104 mp->mnt_noasync++; 2105 } 2106 MNT_IUNLOCK(mp); 2107 ump = VFSTOUFS(mp); 2108 LIST_INIT(&ump->softdep_workitem_pending); 2109 LIST_INIT(&ump->softdep_journal_pending); 2110 TAILQ_INIT(&ump->softdep_unlinked); 2111 ump->softdep_worklist_tail = NULL; 2112 ump->softdep_on_worklist = 0; 2113 ump->softdep_deps = 0; 2114 if ((fs->fs_flags & FS_SUJ) && 2115 (error = journal_mount(mp, fs, cred)) != 0) { 2116 printf("Failed to start journal: %d\n", error); 2117 return (error); 2118 } 2119 /* 2120 * When doing soft updates, the counters in the 2121 * superblock may have gotten out of sync. Recomputation 2122 * can take a long time and can be deferred for background 2123 * fsck. However, the old behavior of scanning the cylinder 2124 * groups and recalculating them at mount time is available 2125 * by setting vfs.ffs.compute_summary_at_mount to one. 2126 */ 2127 if (compute_summary_at_mount == 0 || fs->fs_clean != 0) 2128 return (0); 2129 bzero(&cstotal, sizeof cstotal); 2130 for (cyl = 0; cyl < fs->fs_ncg; cyl++) { 2131 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), 2132 fs->fs_cgsize, cred, &bp)) != 0) { 2133 brelse(bp); 2134 return (error); 2135 } 2136 cgp = (struct cg *)bp->b_data; 2137 cstotal.cs_nffree += cgp->cg_cs.cs_nffree; 2138 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; 2139 cstotal.cs_nifree += cgp->cg_cs.cs_nifree; 2140 cstotal.cs_ndir += cgp->cg_cs.cs_ndir; 2141 fs->fs_cs(fs, cyl) = cgp->cg_cs; 2142 brelse(bp); 2143 } 2144 #ifdef DEBUG 2145 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) 2146 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); 2147 #endif 2148 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); 2149 return (0); 2150 } 2151 2152 void 2153 softdep_unmount(mp) 2154 struct mount *mp; 2155 { 2156 2157 if (mp->mnt_kern_flag & MNTK_SUJ) 2158 journal_unmount(mp); 2159 } 2160 2161 struct jblocks { 2162 struct jseglst jb_segs; /* TAILQ of current segments. */ 2163 struct jseg *jb_writeseg; /* Next write to complete. */ 2164 struct jseg *jb_oldestseg; /* Oldest segment with valid entries. */ 2165 struct jextent *jb_extent; /* Extent array. */ 2166 uint64_t jb_nextseq; /* Next sequence number. */ 2167 uint64_t jb_oldestwrseq; /* Oldest written sequence number. */ 2168 uint8_t jb_needseg; /* Need a forced segment. */ 2169 uint8_t jb_suspended; /* Did journal suspend writes? */ 2170 int jb_avail; /* Available extents. */ 2171 int jb_used; /* Last used extent. */ 2172 int jb_head; /* Allocator head. */ 2173 int jb_off; /* Allocator extent offset. */ 2174 int jb_blocks; /* Total disk blocks covered. */ 2175 int jb_free; /* Total disk blocks free. */ 2176 int jb_min; /* Minimum free space. */ 2177 int jb_low; /* Low on space. */ 2178 int jb_age; /* Insertion time of oldest rec. */ 2179 }; 2180 2181 struct jextent { 2182 ufs2_daddr_t je_daddr; /* Disk block address. */ 2183 int je_blocks; /* Disk block count. 
*/ 2184 }; 2185 2186 static struct jblocks * 2187 jblocks_create(void) 2188 { 2189 struct jblocks *jblocks; 2190 2191 jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); 2192 TAILQ_INIT(&jblocks->jb_segs); 2193 jblocks->jb_avail = 10; 2194 jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, 2195 M_JBLOCKS, M_WAITOK | M_ZERO); 2196 2197 return (jblocks); 2198 } 2199 2200 static ufs2_daddr_t 2201 jblocks_alloc(jblocks, bytes, actual) 2202 struct jblocks *jblocks; 2203 int bytes; 2204 int *actual; 2205 { 2206 ufs2_daddr_t daddr; 2207 struct jextent *jext; 2208 int freecnt; 2209 int blocks; 2210 2211 blocks = bytes / DEV_BSIZE; 2212 jext = &jblocks->jb_extent[jblocks->jb_head]; 2213 freecnt = jext->je_blocks - jblocks->jb_off; 2214 if (freecnt == 0) { 2215 jblocks->jb_off = 0; 2216 if (++jblocks->jb_head > jblocks->jb_used) 2217 jblocks->jb_head = 0; 2218 jext = &jblocks->jb_extent[jblocks->jb_head]; 2219 freecnt = jext->je_blocks; 2220 } 2221 if (freecnt > blocks) 2222 freecnt = blocks; 2223 *actual = freecnt * DEV_BSIZE; 2224 daddr = jext->je_daddr + jblocks->jb_off; 2225 jblocks->jb_off += freecnt; 2226 jblocks->jb_free -= freecnt; 2227 2228 return (daddr); 2229 } 2230 2231 static void 2232 jblocks_free(jblocks, mp, bytes) 2233 struct jblocks *jblocks; 2234 struct mount *mp; 2235 int bytes; 2236 { 2237 2238 jblocks->jb_free += bytes / DEV_BSIZE; 2239 if (jblocks->jb_suspended) 2240 worklist_speedup(); 2241 wakeup(jblocks); 2242 } 2243 2244 static void 2245 jblocks_destroy(jblocks) 2246 struct jblocks *jblocks; 2247 { 2248 2249 if (jblocks->jb_extent) 2250 free(jblocks->jb_extent, M_JBLOCKS); 2251 free(jblocks, M_JBLOCKS); 2252 } 2253 2254 static void 2255 jblocks_add(jblocks, daddr, blocks) 2256 struct jblocks *jblocks; 2257 ufs2_daddr_t daddr; 2258 int blocks; 2259 { 2260 struct jextent *jext; 2261 2262 jblocks->jb_blocks += blocks; 2263 jblocks->jb_free += blocks; 2264 jext = &jblocks->jb_extent[jblocks->jb_used]; 2265 /* Adding the first block. */ 2266 if (jext->je_daddr == 0) { 2267 jext->je_daddr = daddr; 2268 jext->je_blocks = blocks; 2269 return; 2270 } 2271 /* Extending the last extent. */ 2272 if (jext->je_daddr + jext->je_blocks == daddr) { 2273 jext->je_blocks += blocks; 2274 return; 2275 } 2276 /* Adding a new extent. 
*/ 2277 if (++jblocks->jb_used == jblocks->jb_avail) { 2278 jblocks->jb_avail *= 2; 2279 jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, 2280 M_JBLOCKS, M_WAITOK | M_ZERO); 2281 memcpy(jext, jblocks->jb_extent, 2282 sizeof(struct jextent) * jblocks->jb_used); 2283 free(jblocks->jb_extent, M_JBLOCKS); 2284 jblocks->jb_extent = jext; 2285 } 2286 jext = &jblocks->jb_extent[jblocks->jb_used]; 2287 jext->je_daddr = daddr; 2288 jext->je_blocks = blocks; 2289 return; 2290 } 2291 2292 int 2293 softdep_journal_lookup(mp, vpp) 2294 struct mount *mp; 2295 struct vnode **vpp; 2296 { 2297 struct componentname cnp; 2298 struct vnode *dvp; 2299 ino_t sujournal; 2300 int error; 2301 2302 error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp); 2303 if (error) 2304 return (error); 2305 bzero(&cnp, sizeof(cnp)); 2306 cnp.cn_nameiop = LOOKUP; 2307 cnp.cn_flags = ISLASTCN; 2308 cnp.cn_thread = curthread; 2309 cnp.cn_cred = curthread->td_ucred; 2310 cnp.cn_pnbuf = SUJ_FILE; 2311 cnp.cn_nameptr = SUJ_FILE; 2312 cnp.cn_namelen = strlen(SUJ_FILE); 2313 error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal); 2314 vput(dvp); 2315 if (error != 0) 2316 return (error); 2317 error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp); 2318 return (error); 2319 } 2320 2321 /* 2322 * Open and verify the journal file. 2323 */ 2324 static int 2325 journal_mount(mp, fs, cred) 2326 struct mount *mp; 2327 struct fs *fs; 2328 struct ucred *cred; 2329 { 2330 struct jblocks *jblocks; 2331 struct vnode *vp; 2332 struct inode *ip; 2333 ufs2_daddr_t blkno; 2334 int bcount; 2335 int error; 2336 int i; 2337 2338 error = softdep_journal_lookup(mp, &vp); 2339 if (error != 0) { 2340 printf("Failed to find journal. Use tunefs to create one\n"); 2341 return (error); 2342 } 2343 ip = VTOI(vp); 2344 if (ip->i_size < SUJ_MIN) { 2345 error = ENOSPC; 2346 goto out; 2347 } 2348 bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ 2349 jblocks = jblocks_create(); 2350 for (i = 0; i < bcount; i++) { 2351 error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); 2352 if (error) 2353 break; 2354 jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); 2355 } 2356 if (error) { 2357 jblocks_destroy(jblocks); 2358 goto out; 2359 } 2360 jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */ 2361 jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ 2362 VFSTOUFS(mp)->softdep_jblocks = jblocks; 2363 out: 2364 if (error == 0) { 2365 MNT_ILOCK(mp); 2366 mp->mnt_kern_flag |= MNTK_SUJ; 2367 MNT_IUNLOCK(mp); 2368 /* 2369 * Only validate the journal contents if the 2370 * filesystem is clean, otherwise we write the logs 2371 * but they'll never be used. If the filesystem was 2372 * still dirty when we mounted it the journal is 2373 * invalid and a new journal can only be valid if it 2374 * starts from a clean mount. 2375 */ 2376 if (fs->fs_clean) { 2377 DIP_SET(ip, i_modrev, fs->fs_mtime); 2378 ip->i_flags |= IN_MODIFIED; 2379 ffs_update(vp, 1); 2380 } 2381 } 2382 vput(vp); 2383 return (error); 2384 } 2385 2386 static void 2387 journal_unmount(mp) 2388 struct mount *mp; 2389 { 2390 struct ufsmount *ump; 2391 2392 ump = VFSTOUFS(mp); 2393 if (ump->softdep_jblocks) 2394 jblocks_destroy(ump->softdep_jblocks); 2395 ump->softdep_jblocks = NULL; 2396 } 2397 2398 /* 2399 * Called when a journal record is ready to be written. Space is allocated 2400 * and the journal entry is created when the journal is flushed to stable 2401 * store. 
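 *
 * A typical producer follows the pattern sketched below (illustrative;
 * see softdep_setup_trunc() for a concrete instance).  The record is
 * allocated and filled in without the lock held and is then queued
 * with the soft dependency lock held:
 *
 *	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
 *	workitem_alloc(&jtrunc->jt_list, D_JTRUNC, mp);
 *	... fill in the record fields ...
 *	ACQUIRE_LOCK(&lk);
 *	add_to_journal(&jtrunc->jt_list);
 *	FREE_LOCK(&lk);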
2402 */
2403 static void
2404 add_to_journal(wk)
2405 struct worklist *wk;
2406 {
2407 struct ufsmount *ump;
2408 
2409 mtx_assert(&lk, MA_OWNED);
2410 ump = VFSTOUFS(wk->wk_mp);
2411 if (wk->wk_state & ONWORKLIST)
2412 panic("add_to_journal: %s(0x%X) already on list",
2413 TYPENAME(wk->wk_type), wk->wk_state);
2414 wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2415 if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2416 ump->softdep_jblocks->jb_age = ticks;
2417 LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2418 } else
2419 LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2420 ump->softdep_journal_tail = wk;
2421 ump->softdep_on_journal += 1;
2422 }
2423 
2424 /*
2425 * Remove an arbitrary item from the journal worklist while maintaining
2426 * the tail pointer. This happens when a new operation obviates the need
2427 * to journal an old operation.
2428 */
2429 static void
2430 remove_from_journal(wk)
2431 struct worklist *wk;
2432 {
2433 struct ufsmount *ump;
2434 
2435 mtx_assert(&lk, MA_OWNED);
2436 ump = VFSTOUFS(wk->wk_mp);
2437 #ifdef SUJ_DEBUG
2438 {
2439 struct worklist *wkn;
2440 
2441 LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2442 if (wkn == wk)
2443 break;
2444 if (wkn == NULL)
2445 panic("remove_from_journal: %p is not in journal", wk);
2446 }
2447 #endif
2448 /*
2449 * We emulate a TAILQ to save space in most structures which do not
2450 * require TAILQ semantics. Here we must update the tail position
2451 * when removing the tail which is not the final entry. This works
2452 * only if the worklist linkage is at the beginning of the structure.
2453 */
2454 if (ump->softdep_journal_tail == wk)
2455 ump->softdep_journal_tail =
2456 (struct worklist *)wk->wk_list.le_prev;
2457 
2458 WORKLIST_REMOVE(wk);
2459 ump->softdep_on_journal -= 1;
2460 }
2461 
2462 /*
2463 * Check for journal space as well as dependency limits so the prelink
2464 * code can throttle both journaled and non-journaled filesystems.
2465 * Threshold is 0 for low and 1 for min.
2466 */
2467 static int
2468 journal_space(ump, thresh)
2469 struct ufsmount *ump;
2470 int thresh;
2471 {
2472 struct jblocks *jblocks;
2473 int avail;
2474 
2475 jblocks = ump->softdep_jblocks;
2476 if (jblocks == NULL)
2477 return (1);
2478 /*
2479 * We use a tighter restriction here to prevent request_cleanup()
2480 * running in threads from running into locks we currently hold.
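 * The dependency cap below is 90% of max_softdeps.  The space check
 * that follows converts the pending journal records into DEV_BSIZE
 * blocks (JREC_SIZE bytes per record) and requires that the journal
 * still have more than the requested threshold of free blocks once
 * those records are written.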
2481 */ 2482 if (dep_current[D_INODEDEP] > (max_softdeps / 10) * 9) 2483 return (0); 2484 if (thresh) 2485 thresh = jblocks->jb_min; 2486 else 2487 thresh = jblocks->jb_low; 2488 avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; 2489 avail = jblocks->jb_free - avail; 2490 2491 return (avail > thresh); 2492 } 2493 2494 static void 2495 journal_suspend(ump) 2496 struct ufsmount *ump; 2497 { 2498 struct jblocks *jblocks; 2499 struct mount *mp; 2500 2501 mp = UFSTOVFS(ump); 2502 jblocks = ump->softdep_jblocks; 2503 MNT_ILOCK(mp); 2504 if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 2505 stat_journal_min++; 2506 mp->mnt_kern_flag |= MNTK_SUSPEND; 2507 mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc); 2508 } 2509 jblocks->jb_suspended = 1; 2510 MNT_IUNLOCK(mp); 2511 } 2512 2513 static int 2514 journal_unsuspend(struct ufsmount *ump) 2515 { 2516 struct jblocks *jblocks; 2517 struct mount *mp; 2518 2519 mp = UFSTOVFS(ump); 2520 jblocks = ump->softdep_jblocks; 2521 2522 if (jblocks != NULL && jblocks->jb_suspended && 2523 journal_space(ump, jblocks->jb_min)) { 2524 jblocks->jb_suspended = 0; 2525 FREE_LOCK(&lk); 2526 mp->mnt_susp_owner = curthread; 2527 vfs_write_resume(mp); 2528 ACQUIRE_LOCK(&lk); 2529 return (1); 2530 } 2531 return (0); 2532 } 2533 2534 /* 2535 * Called before any allocation function to be certain that there is 2536 * sufficient space in the journal prior to creating any new records. 2537 * Since in the case of block allocation we may have multiple locked 2538 * buffers at the time of the actual allocation we can not block 2539 * when the journal records are created. Doing so would create a deadlock 2540 * if any of these buffers needed to be flushed to reclaim space. Instead 2541 * we require a sufficiently large amount of available space such that 2542 * each thread in the system could have passed this allocation check and 2543 * still have sufficient free space. With 20% of a minimum journal size 2544 * of 1MB we have 6553 records available. 2545 */ 2546 int 2547 softdep_prealloc(vp, waitok) 2548 struct vnode *vp; 2549 int waitok; 2550 { 2551 struct ufsmount *ump; 2552 2553 if (DOINGSUJ(vp) == 0) 2554 return (0); 2555 ump = VFSTOUFS(vp->v_mount); 2556 ACQUIRE_LOCK(&lk); 2557 if (journal_space(ump, 0)) { 2558 FREE_LOCK(&lk); 2559 return (0); 2560 } 2561 stat_journal_low++; 2562 FREE_LOCK(&lk); 2563 if (waitok == MNT_NOWAIT) 2564 return (ENOSPC); 2565 /* 2566 * Attempt to sync this vnode once to flush any journal 2567 * work attached to it. 2568 */ 2569 if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0) 2570 ffs_syncvnode(vp, waitok); 2571 ACQUIRE_LOCK(&lk); 2572 process_removes(vp); 2573 if (journal_space(ump, 0) == 0) { 2574 softdep_speedup(); 2575 if (journal_space(ump, 1) == 0) 2576 journal_suspend(ump); 2577 } 2578 FREE_LOCK(&lk); 2579 2580 return (0); 2581 } 2582 2583 /* 2584 * Before adjusting a link count on a vnode verify that we have sufficient 2585 * journal space. If not, process operations that depend on the currently 2586 * locked pair of vnodes to try to flush space as the syncer, buf daemon, 2587 * and softdep flush threads can not acquire these locks to reclaim space. 
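 * This routine is called with the soft dependency lock held from the
 * softdep_setup_*() link and directory operations below, once the
 * relevant inodedep has been set up.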
2588 */ 2589 static void 2590 softdep_prelink(dvp, vp) 2591 struct vnode *dvp; 2592 struct vnode *vp; 2593 { 2594 struct ufsmount *ump; 2595 2596 ump = VFSTOUFS(dvp->v_mount); 2597 mtx_assert(&lk, MA_OWNED); 2598 if (journal_space(ump, 0)) 2599 return; 2600 stat_journal_low++; 2601 FREE_LOCK(&lk); 2602 if (vp) 2603 ffs_syncvnode(vp, MNT_NOWAIT); 2604 ffs_syncvnode(dvp, MNT_WAIT); 2605 ACQUIRE_LOCK(&lk); 2606 /* Process vp before dvp as it may create .. removes. */ 2607 if (vp) 2608 process_removes(vp); 2609 process_removes(dvp); 2610 softdep_speedup(); 2611 process_worklist_item(UFSTOVFS(ump), LK_NOWAIT); 2612 process_worklist_item(UFSTOVFS(ump), LK_NOWAIT); 2613 if (journal_space(ump, 0) == 0) { 2614 softdep_speedup(); 2615 if (journal_space(ump, 1) == 0) 2616 journal_suspend(ump); 2617 } 2618 } 2619 2620 static void 2621 jseg_write(ump, jseg, data) 2622 struct ufsmount *ump; 2623 struct jseg *jseg; 2624 uint8_t *data; 2625 { 2626 struct jsegrec *rec; 2627 2628 rec = (struct jsegrec *)data; 2629 rec->jsr_seq = jseg->js_seq; 2630 rec->jsr_oldest = jseg->js_oldseq; 2631 rec->jsr_cnt = jseg->js_cnt; 2632 rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize; 2633 rec->jsr_crc = 0; 2634 rec->jsr_time = ump->um_fs->fs_mtime; 2635 } 2636 2637 static inline void 2638 inoref_write(inoref, jseg, rec) 2639 struct inoref *inoref; 2640 struct jseg *jseg; 2641 struct jrefrec *rec; 2642 { 2643 2644 inoref->if_jsegdep->jd_seg = jseg; 2645 rec->jr_ino = inoref->if_ino; 2646 rec->jr_parent = inoref->if_parent; 2647 rec->jr_nlink = inoref->if_nlink; 2648 rec->jr_mode = inoref->if_mode; 2649 rec->jr_diroff = inoref->if_diroff; 2650 } 2651 2652 static void 2653 jaddref_write(jaddref, jseg, data) 2654 struct jaddref *jaddref; 2655 struct jseg *jseg; 2656 uint8_t *data; 2657 { 2658 struct jrefrec *rec; 2659 2660 rec = (struct jrefrec *)data; 2661 rec->jr_op = JOP_ADDREF; 2662 inoref_write(&jaddref->ja_ref, jseg, rec); 2663 } 2664 2665 static void 2666 jremref_write(jremref, jseg, data) 2667 struct jremref *jremref; 2668 struct jseg *jseg; 2669 uint8_t *data; 2670 { 2671 struct jrefrec *rec; 2672 2673 rec = (struct jrefrec *)data; 2674 rec->jr_op = JOP_REMREF; 2675 inoref_write(&jremref->jr_ref, jseg, rec); 2676 } 2677 2678 static void 2679 jmvref_write(jmvref, jseg, data) 2680 struct jmvref *jmvref; 2681 struct jseg *jseg; 2682 uint8_t *data; 2683 { 2684 struct jmvrec *rec; 2685 2686 rec = (struct jmvrec *)data; 2687 rec->jm_op = JOP_MVREF; 2688 rec->jm_ino = jmvref->jm_ino; 2689 rec->jm_parent = jmvref->jm_parent; 2690 rec->jm_oldoff = jmvref->jm_oldoff; 2691 rec->jm_newoff = jmvref->jm_newoff; 2692 } 2693 2694 static void 2695 jnewblk_write(jnewblk, jseg, data) 2696 struct jnewblk *jnewblk; 2697 struct jseg *jseg; 2698 uint8_t *data; 2699 { 2700 struct jblkrec *rec; 2701 2702 jnewblk->jn_jsegdep->jd_seg = jseg; 2703 rec = (struct jblkrec *)data; 2704 rec->jb_op = JOP_NEWBLK; 2705 rec->jb_ino = jnewblk->jn_ino; 2706 rec->jb_blkno = jnewblk->jn_blkno; 2707 rec->jb_lbn = jnewblk->jn_lbn; 2708 rec->jb_frags = jnewblk->jn_frags; 2709 rec->jb_oldfrags = jnewblk->jn_oldfrags; 2710 } 2711 2712 static void 2713 jfreeblk_write(jfreeblk, jseg, data) 2714 struct jfreeblk *jfreeblk; 2715 struct jseg *jseg; 2716 uint8_t *data; 2717 { 2718 struct jblkrec *rec; 2719 2720 jfreeblk->jf_jsegdep->jd_seg = jseg; 2721 rec = (struct jblkrec *)data; 2722 rec->jb_op = JOP_FREEBLK; 2723 rec->jb_ino = jfreeblk->jf_ino; 2724 rec->jb_blkno = jfreeblk->jf_blkno; 2725 rec->jb_lbn = jfreeblk->jf_lbn; 2726 rec->jb_frags = 
jfreeblk->jf_frags; 2727 rec->jb_oldfrags = 0; 2728 } 2729 2730 static void 2731 jfreefrag_write(jfreefrag, jseg, data) 2732 struct jfreefrag *jfreefrag; 2733 struct jseg *jseg; 2734 uint8_t *data; 2735 { 2736 struct jblkrec *rec; 2737 2738 jfreefrag->fr_jsegdep->jd_seg = jseg; 2739 rec = (struct jblkrec *)data; 2740 rec->jb_op = JOP_FREEBLK; 2741 rec->jb_ino = jfreefrag->fr_ino; 2742 rec->jb_blkno = jfreefrag->fr_blkno; 2743 rec->jb_lbn = jfreefrag->fr_lbn; 2744 rec->jb_frags = jfreefrag->fr_frags; 2745 rec->jb_oldfrags = 0; 2746 } 2747 2748 static void 2749 jtrunc_write(jtrunc, jseg, data) 2750 struct jtrunc *jtrunc; 2751 struct jseg *jseg; 2752 uint8_t *data; 2753 { 2754 struct jtrncrec *rec; 2755 2756 rec = (struct jtrncrec *)data; 2757 rec->jt_op = JOP_TRUNC; 2758 rec->jt_ino = jtrunc->jt_ino; 2759 rec->jt_size = jtrunc->jt_size; 2760 rec->jt_extsize = jtrunc->jt_extsize; 2761 } 2762 2763 /* 2764 * Flush some journal records to disk. 2765 */ 2766 static void 2767 softdep_process_journal(mp, needwk, flags) 2768 struct mount *mp; 2769 struct worklist *needwk; 2770 int flags; 2771 { 2772 struct jblocks *jblocks; 2773 struct ufsmount *ump; 2774 struct worklist *wk; 2775 struct jseg *jseg; 2776 struct buf *bp; 2777 uint8_t *data; 2778 struct fs *fs; 2779 int segwritten; 2780 int jrecmin; /* Minimum records per block. */ 2781 int jrecmax; /* Maximum records per block. */ 2782 int size; 2783 int cnt; 2784 int off; 2785 int devbsize; 2786 2787 if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) 2788 return; 2789 ump = VFSTOUFS(mp); 2790 fs = ump->um_fs; 2791 jblocks = ump->softdep_jblocks; 2792 devbsize = ump->um_devvp->v_bufobj.bo_bsize; 2793 /* 2794 * We write anywhere between a disk block and fs block. The upper 2795 * bound is picked to prevent buffer cache fragmentation and limit 2796 * processing time per I/O. 2797 */ 2798 jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */ 2799 jrecmax = (fs->fs_bsize / devbsize) * jrecmin; 2800 segwritten = 0; 2801 for (;;) { 2802 cnt = ump->softdep_on_journal; 2803 /* 2804 * Criteria for writing a segment: 2805 * 1) We have a full block. 2806 * 2) We're called from jwait() and haven't found the 2807 * journal item yet. 2808 * 3) Always write if needseg is set. 2809 * 4) If we are called from process_worklist and have 2810 * not yet written anything we write a partial block 2811 * to enforce a 1 second maximum latency on journal 2812 * entries. 2813 */ 2814 if (cnt < (jrecmax - 1) && needwk == NULL && 2815 jblocks->jb_needseg == 0 && (segwritten || cnt == 0)) 2816 break; 2817 cnt++; 2818 /* 2819 * Verify some free journal space. softdep_prealloc() should 2820 * guarantee that we don't run out so this is indicative of 2821 * a problem with the flow control. Try to recover 2822 * gracefully in any event. 2823 */ 2824 while (jblocks->jb_free == 0) { 2825 if (flags != MNT_WAIT) 2826 break; 2827 printf("softdep: Out of journal space!\n"); 2828 softdep_speedup(); 2829 msleep(jblocks, &lk, PRIBIO, "jblocks", hz); 2830 } 2831 FREE_LOCK(&lk); 2832 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS); 2833 workitem_alloc(&jseg->js_list, D_JSEG, mp); 2834 LIST_INIT(&jseg->js_entries); 2835 LIST_INIT(&jseg->js_indirs); 2836 jseg->js_state = ATTACHED; 2837 jseg->js_jblocks = jblocks; 2838 bp = geteblk(fs->fs_bsize, 0); 2839 ACQUIRE_LOCK(&lk); 2840 /* 2841 * If there was a race while we were allocating the block 2842 * and jseg the entry we care about was likely written. 
2843 * We bail out in both the WAIT and NOWAIT case and assume 2844 * the caller will loop if the entry it cares about is 2845 * not written. 2846 */ 2847 cnt = ump->softdep_on_journal; 2848 if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) { 2849 bp->b_flags |= B_INVAL | B_NOCACHE; 2850 WORKITEM_FREE(jseg, D_JSEG); 2851 FREE_LOCK(&lk); 2852 brelse(bp); 2853 ACQUIRE_LOCK(&lk); 2854 break; 2855 } 2856 /* 2857 * Calculate the disk block size required for the available 2858 * records rounded to the min size. 2859 */ 2860 if (cnt == 0) 2861 size = devbsize; 2862 else if (cnt < jrecmax) 2863 size = howmany(cnt, jrecmin) * devbsize; 2864 else 2865 size = fs->fs_bsize; 2866 /* 2867 * Allocate a disk block for this journal data and account 2868 * for truncation of the requested size if enough contiguous 2869 * space was not available. 2870 */ 2871 bp->b_blkno = jblocks_alloc(jblocks, size, &size); 2872 bp->b_lblkno = bp->b_blkno; 2873 bp->b_offset = bp->b_blkno * DEV_BSIZE; 2874 bp->b_bcount = size; 2875 bp->b_bufobj = &ump->um_devvp->v_bufobj; 2876 bp->b_flags &= ~B_INVAL; 2877 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY; 2878 /* 2879 * Initialize our jseg with cnt records. Assign the next 2880 * sequence number to it and link it in-order. 2881 */ 2882 cnt = MIN(cnt, (size / devbsize) * jrecmin); 2883 jseg->js_buf = bp; 2884 jseg->js_cnt = cnt; 2885 jseg->js_refs = cnt + 1; /* Self ref. */ 2886 jseg->js_size = size; 2887 jseg->js_seq = jblocks->jb_nextseq++; 2888 if (jblocks->jb_oldestseg == NULL) 2889 jblocks->jb_oldestseg = jseg; 2890 jseg->js_oldseq = jblocks->jb_oldestseg->js_seq; 2891 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); 2892 if (jblocks->jb_writeseg == NULL) 2893 jblocks->jb_writeseg = jseg; 2894 /* 2895 * Start filling in records from the pending list. 2896 */ 2897 data = bp->b_data; 2898 off = 0; 2899 while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) 2900 != NULL) { 2901 if (cnt == 0) 2902 break; 2903 /* Place a segment header on every device block. */ 2904 if ((off % devbsize) == 0) { 2905 jseg_write(ump, jseg, data); 2906 off += JREC_SIZE; 2907 data = bp->b_data + off; 2908 } 2909 if (wk == needwk) 2910 needwk = NULL; 2911 remove_from_journal(wk); 2912 wk->wk_state |= IOSTARTED; 2913 WORKLIST_INSERT(&jseg->js_entries, wk); 2914 switch (wk->wk_type) { 2915 case D_JADDREF: 2916 jaddref_write(WK_JADDREF(wk), jseg, data); 2917 break; 2918 case D_JREMREF: 2919 jremref_write(WK_JREMREF(wk), jseg, data); 2920 break; 2921 case D_JMVREF: 2922 jmvref_write(WK_JMVREF(wk), jseg, data); 2923 break; 2924 case D_JNEWBLK: 2925 jnewblk_write(WK_JNEWBLK(wk), jseg, data); 2926 break; 2927 case D_JFREEBLK: 2928 jfreeblk_write(WK_JFREEBLK(wk), jseg, data); 2929 break; 2930 case D_JFREEFRAG: 2931 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data); 2932 break; 2933 case D_JTRUNC: 2934 jtrunc_write(WK_JTRUNC(wk), jseg, data); 2935 break; 2936 default: 2937 panic("process_journal: Unknown type %s", 2938 TYPENAME(wk->wk_type)); 2939 /* NOTREACHED */ 2940 } 2941 off += JREC_SIZE; 2942 data = bp->b_data + off; 2943 cnt--; 2944 } 2945 /* 2946 * Write this one buffer and continue. 2947 */ 2948 segwritten = 1; 2949 jblocks->jb_needseg = 0; 2950 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); 2951 FREE_LOCK(&lk); 2952 BO_LOCK(bp->b_bufobj); 2953 bgetvp(ump->um_devvp, bp); 2954 BO_UNLOCK(bp->b_bufobj); 2955 /* 2956 * We only do the blocking wait once we find the journal 2957 * entry we're looking for. 
2958 */ 2959 if (needwk == NULL && flags & MNT_WAIT) 2960 bwrite(bp); 2961 else 2962 bawrite(bp); 2963 ACQUIRE_LOCK(&lk); 2964 } 2965 /* 2966 * If we've suspended the filesystem because we ran out of journal 2967 * space either try to sync it here to make some progress or 2968 * unsuspend it if we already have. 2969 */ 2970 if (flags == 0 && jblocks->jb_suspended) { 2971 if (journal_unsuspend(ump)) 2972 return; 2973 FREE_LOCK(&lk); 2974 VFS_SYNC(mp, MNT_NOWAIT); 2975 ffs_sbupdate(ump, MNT_WAIT, 0); 2976 ACQUIRE_LOCK(&lk); 2977 } 2978 } 2979 2980 /* 2981 * Complete a jseg, allowing all dependencies awaiting journal writes 2982 * to proceed. Each journal dependency also attaches a jsegdep to dependent 2983 * structures so that the journal segment can be freed to reclaim space. 2984 */ 2985 static void 2986 complete_jseg(jseg) 2987 struct jseg *jseg; 2988 { 2989 struct worklist *wk; 2990 struct jmvref *jmvref; 2991 int waiting; 2992 #ifdef INVARIANTS 2993 int i = 0; 2994 #endif 2995 2996 while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { 2997 WORKLIST_REMOVE(wk); 2998 waiting = wk->wk_state & IOWAITING; 2999 wk->wk_state &= ~(IOSTARTED | IOWAITING); 3000 wk->wk_state |= COMPLETE; 3001 KASSERT(i++ < jseg->js_cnt, 3002 ("handle_written_jseg: overflow %d >= %d", 3003 i - 1, jseg->js_cnt)); 3004 switch (wk->wk_type) { 3005 case D_JADDREF: 3006 handle_written_jaddref(WK_JADDREF(wk)); 3007 break; 3008 case D_JREMREF: 3009 handle_written_jremref(WK_JREMREF(wk)); 3010 break; 3011 case D_JMVREF: 3012 /* No jsegdep here. */ 3013 rele_jseg(jseg); 3014 jmvref = WK_JMVREF(wk); 3015 LIST_REMOVE(jmvref, jm_deps); 3016 free_pagedep(jmvref->jm_pagedep); 3017 WORKITEM_FREE(jmvref, D_JMVREF); 3018 break; 3019 case D_JNEWBLK: 3020 handle_written_jnewblk(WK_JNEWBLK(wk)); 3021 break; 3022 case D_JFREEBLK: 3023 handle_written_jfreeblk(WK_JFREEBLK(wk)); 3024 break; 3025 case D_JFREEFRAG: 3026 handle_written_jfreefrag(WK_JFREEFRAG(wk)); 3027 break; 3028 case D_JTRUNC: 3029 WK_JTRUNC(wk)->jt_jsegdep->jd_seg = jseg; 3030 WORKITEM_FREE(wk, D_JTRUNC); 3031 break; 3032 default: 3033 panic("handle_written_jseg: Unknown type %s", 3034 TYPENAME(wk->wk_type)); 3035 /* NOTREACHED */ 3036 } 3037 if (waiting) 3038 wakeup(wk); 3039 } 3040 /* Release the self reference so the structure may be freed. */ 3041 rele_jseg(jseg); 3042 } 3043 3044 /* 3045 * Mark a jseg as DEPCOMPLETE and throw away the buffer. Handle jseg 3046 * completions in order only. 3047 */ 3048 static void 3049 handle_written_jseg(jseg, bp) 3050 struct jseg *jseg; 3051 struct buf *bp; 3052 { 3053 struct jblocks *jblocks; 3054 struct jseg *jsegn; 3055 3056 if (jseg->js_refs == 0) 3057 panic("handle_written_jseg: No self-reference on %p", jseg); 3058 jseg->js_state |= DEPCOMPLETE; 3059 /* 3060 * We'll never need this buffer again, set flags so it will be 3061 * discarded. 3062 */ 3063 bp->b_flags |= B_INVAL | B_NOCACHE; 3064 jblocks = jseg->js_jblocks; 3065 /* 3066 * Don't allow out of order completions. If this isn't the first 3067 * block wait for it to write before we're done. 3068 */ 3069 if (jseg != jblocks->jb_writeseg) 3070 return; 3071 /* Iterate through available jsegs processing their entries. */ 3072 do { 3073 jblocks->jb_oldestwrseq = jseg->js_oldseq; 3074 jsegn = TAILQ_NEXT(jseg, js_next); 3075 complete_jseg(jseg); 3076 jseg = jsegn; 3077 } while (jseg && jseg->js_state & DEPCOMPLETE); 3078 jblocks->jb_writeseg = jseg; 3079 /* 3080 * Attempt to free jsegs now that oldestwrseq may have advanced. 
3081 */ 3082 free_jsegs(jblocks); 3083 } 3084 3085 static inline struct jsegdep * 3086 inoref_jseg(inoref) 3087 struct inoref *inoref; 3088 { 3089 struct jsegdep *jsegdep; 3090 3091 jsegdep = inoref->if_jsegdep; 3092 inoref->if_jsegdep = NULL; 3093 3094 return (jsegdep); 3095 } 3096 3097 /* 3098 * Called once a jremref has made it to stable store. The jremref is marked 3099 * complete and we attempt to free it. Any pagedeps writes sleeping waiting 3100 * for the jremref to complete will be awoken by free_jremref. 3101 */ 3102 static void 3103 handle_written_jremref(jremref) 3104 struct jremref *jremref; 3105 { 3106 struct inodedep *inodedep; 3107 struct jsegdep *jsegdep; 3108 struct dirrem *dirrem; 3109 3110 /* Grab the jsegdep. */ 3111 jsegdep = inoref_jseg(&jremref->jr_ref); 3112 /* 3113 * Remove us from the inoref list. 3114 */ 3115 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 3116 0, &inodedep) == 0) 3117 panic("handle_written_jremref: Lost inodedep"); 3118 TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 3119 /* 3120 * Complete the dirrem. 3121 */ 3122 dirrem = jremref->jr_dirrem; 3123 jremref->jr_dirrem = NULL; 3124 LIST_REMOVE(jremref, jr_deps); 3125 jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; 3126 WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list); 3127 if (LIST_EMPTY(&dirrem->dm_jremrefhd) && 3128 (dirrem->dm_state & COMPLETE) != 0) 3129 add_to_worklist(&dirrem->dm_list, 0); 3130 free_jremref(jremref); 3131 } 3132 3133 /* 3134 * Called once a jaddref has made it to stable store. The dependency is 3135 * marked complete and any dependent structures are added to the inode 3136 * bufwait list to be completed as soon as it is written. If a bitmap write 3137 * depends on this entry we move the inode into the inodedephd of the 3138 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. 3139 */ 3140 static void 3141 handle_written_jaddref(jaddref) 3142 struct jaddref *jaddref; 3143 { 3144 struct jsegdep *jsegdep; 3145 struct inodedep *inodedep; 3146 struct diradd *diradd; 3147 struct mkdir *mkdir; 3148 3149 /* Grab the jsegdep. */ 3150 jsegdep = inoref_jseg(&jaddref->ja_ref); 3151 mkdir = NULL; 3152 diradd = NULL; 3153 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 3154 0, &inodedep) == 0) 3155 panic("handle_written_jaddref: Lost inodedep."); 3156 if (jaddref->ja_diradd == NULL) 3157 panic("handle_written_jaddref: No dependency"); 3158 if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { 3159 diradd = jaddref->ja_diradd; 3160 WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); 3161 } else if (jaddref->ja_state & MKDIR_PARENT) { 3162 mkdir = jaddref->ja_mkdir; 3163 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); 3164 } else if (jaddref->ja_state & MKDIR_BODY) 3165 mkdir = jaddref->ja_mkdir; 3166 else 3167 panic("handle_written_jaddref: Unknown dependency %p", 3168 jaddref->ja_diradd); 3169 jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ 3170 /* 3171 * Remove us from the inode list. 3172 */ 3173 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); 3174 /* 3175 * The mkdir may be waiting on the jaddref to clear before freeing. 
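 * If so, clearing md_jaddref and marking the mkdir DEPCOMPLETE below
 * allows complete_mkdir() to dispose of it.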
3176 */ 3177 if (mkdir) { 3178 KASSERT(mkdir->md_list.wk_type == D_MKDIR, 3179 ("handle_written_jaddref: Incorrect type for mkdir %s", 3180 TYPENAME(mkdir->md_list.wk_type))); 3181 mkdir->md_jaddref = NULL; 3182 diradd = mkdir->md_diradd; 3183 mkdir->md_state |= DEPCOMPLETE; 3184 complete_mkdir(mkdir); 3185 } 3186 WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list); 3187 if (jaddref->ja_state & NEWBLOCK) { 3188 inodedep->id_state |= ONDEPLIST; 3189 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, 3190 inodedep, id_deps); 3191 } 3192 free_jaddref(jaddref); 3193 } 3194 3195 /* 3196 * Called once a jnewblk journal is written. The allocdirect or allocindir 3197 * is placed in the bmsafemap to await notification of a written bitmap. If 3198 * the operation was canceled we add the segdep to the appropriate 3199 * dependency to free the journal space once the canceling operation 3200 * completes. 3201 */ 3202 static void 3203 handle_written_jnewblk(jnewblk) 3204 struct jnewblk *jnewblk; 3205 { 3206 struct bmsafemap *bmsafemap; 3207 struct freefrag *freefrag; 3208 struct jsegdep *jsegdep; 3209 struct newblk *newblk; 3210 struct freework *freework; 3211 struct indirdep *indirdep; 3212 3213 /* Grab the jsegdep. */ 3214 jsegdep = jnewblk->jn_jsegdep; 3215 jnewblk->jn_jsegdep = NULL; 3216 if (jnewblk->jn_dep == NULL) 3217 panic("handle_written_jnewblk: No dependency for the segdep."); 3218 switch (jnewblk->jn_dep->wk_type) { 3219 case D_NEWBLK: 3220 case D_ALLOCDIRECT: 3221 case D_ALLOCINDIR: 3222 /* 3223 * Add the written block to the bmsafemap so it can 3224 * be notified when the bitmap is on disk. 3225 */ 3226 newblk = WK_NEWBLK(jnewblk->jn_dep); 3227 newblk->nb_jnewblk = NULL; 3228 bmsafemap = newblk->nb_bmsafemap; 3229 newblk->nb_state |= ONDEPLIST; 3230 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 3231 WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list); 3232 break; 3233 case D_FREEFRAG: 3234 /* 3235 * A newblock being removed by a freefrag when replaced by 3236 * frag extension. 3237 */ 3238 freefrag = WK_FREEFRAG(jnewblk->jn_dep); 3239 freefrag->ff_jdep = NULL; 3240 WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list); 3241 break; 3242 case D_FREEWORK: 3243 /* 3244 * A direct block was removed by truncate. 3245 */ 3246 freework = WK_FREEWORK(jnewblk->jn_dep); 3247 freework->fw_jnewblk = NULL; 3248 WORKLIST_INSERT(&freework->fw_jwork, &jsegdep->jd_list); 3249 break; 3250 case D_INDIRDEP: 3251 /* 3252 * An indirect block was removed by truncate. 3253 */ 3254 indirdep = WK_INDIRDEP(jnewblk->jn_dep); 3255 LIST_REMOVE(jnewblk, jn_indirdeps); 3256 WORKLIST_INSERT(&indirdep->ir_jwork, &jsegdep->jd_list); 3257 break; 3258 default: 3259 panic("handle_written_jnewblk: Unknown type %d.", 3260 jnewblk->jn_dep->wk_type); 3261 } 3262 jnewblk->jn_dep = NULL; 3263 free_jnewblk(jnewblk); 3264 } 3265 3266 /* 3267 * Cancel a jfreefrag that won't be needed, probably due to colliding with 3268 * an in-flight allocation that has not yet been committed. Divorce us 3269 * from the freefrag and mark it DEPCOMPLETE so that it may be added 3270 * to the worklist. 
3271 */ 3272 static void 3273 cancel_jfreefrag(jfreefrag) 3274 struct jfreefrag *jfreefrag; 3275 { 3276 struct freefrag *freefrag; 3277 3278 if (jfreefrag->fr_jsegdep) { 3279 free_jsegdep(jfreefrag->fr_jsegdep); 3280 jfreefrag->fr_jsegdep = NULL; 3281 } 3282 freefrag = jfreefrag->fr_freefrag; 3283 jfreefrag->fr_freefrag = NULL; 3284 free_jfreefrag(jfreefrag); 3285 freefrag->ff_state |= DEPCOMPLETE; 3286 } 3287 3288 /* 3289 * Free a jfreefrag when the parent freefrag is rendered obsolete. 3290 */ 3291 static void 3292 free_jfreefrag(jfreefrag) 3293 struct jfreefrag *jfreefrag; 3294 { 3295 3296 if (jfreefrag->fr_state & IOSTARTED) 3297 WORKLIST_REMOVE(&jfreefrag->fr_list); 3298 else if (jfreefrag->fr_state & ONWORKLIST) 3299 remove_from_journal(&jfreefrag->fr_list); 3300 if (jfreefrag->fr_freefrag != NULL) 3301 panic("free_jfreefrag: Still attached to a freefrag."); 3302 WORKITEM_FREE(jfreefrag, D_JFREEFRAG); 3303 } 3304 3305 /* 3306 * Called when the journal write for a jfreefrag completes. The parent 3307 * freefrag is added to the worklist if this completes its dependencies. 3308 */ 3309 static void 3310 handle_written_jfreefrag(jfreefrag) 3311 struct jfreefrag *jfreefrag; 3312 { 3313 struct jsegdep *jsegdep; 3314 struct freefrag *freefrag; 3315 3316 /* Grab the jsegdep. */ 3317 jsegdep = jfreefrag->fr_jsegdep; 3318 jfreefrag->fr_jsegdep = NULL; 3319 freefrag = jfreefrag->fr_freefrag; 3320 if (freefrag == NULL) 3321 panic("handle_written_jfreefrag: No freefrag."); 3322 freefrag->ff_state |= DEPCOMPLETE; 3323 freefrag->ff_jdep = NULL; 3324 WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list); 3325 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 3326 add_to_worklist(&freefrag->ff_list, 0); 3327 jfreefrag->fr_freefrag = NULL; 3328 free_jfreefrag(jfreefrag); 3329 } 3330 3331 /* 3332 * Called when the journal write for a jfreeblk completes. The jfreeblk 3333 * is removed from the freeblks list of pending journal writes and the 3334 * jsegdep is moved to the freeblks jwork to be completed when all blocks 3335 * have been reclaimed. 3336 */ 3337 static void 3338 handle_written_jfreeblk(jfreeblk) 3339 struct jfreeblk *jfreeblk; 3340 { 3341 struct freeblks *freeblks; 3342 struct jsegdep *jsegdep; 3343 3344 /* Grab the jsegdep. */ 3345 jsegdep = jfreeblk->jf_jsegdep; 3346 jfreeblk->jf_jsegdep = NULL; 3347 freeblks = jfreeblk->jf_freeblks; 3348 LIST_REMOVE(jfreeblk, jf_deps); 3349 WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list); 3350 /* 3351 * If the freeblks is all journaled, we can add it to the worklist. 3352 */ 3353 if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) && 3354 (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) { 3355 /* Remove from the b_dep that is waiting on this write. 
*/ 3356 if (freeblks->fb_state & ONWORKLIST) 3357 WORKLIST_REMOVE(&freeblks->fb_list); 3358 add_to_worklist(&freeblks->fb_list, 1); 3359 } 3360 3361 free_jfreeblk(jfreeblk); 3362 } 3363 3364 static struct jsegdep * 3365 newjsegdep(struct worklist *wk) 3366 { 3367 struct jsegdep *jsegdep; 3368 3369 jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS); 3370 workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp); 3371 jsegdep->jd_seg = NULL; 3372 3373 return (jsegdep); 3374 } 3375 3376 static struct jmvref * 3377 newjmvref(dp, ino, oldoff, newoff) 3378 struct inode *dp; 3379 ino_t ino; 3380 off_t oldoff; 3381 off_t newoff; 3382 { 3383 struct jmvref *jmvref; 3384 3385 jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS); 3386 workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump)); 3387 jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE; 3388 jmvref->jm_parent = dp->i_number; 3389 jmvref->jm_ino = ino; 3390 jmvref->jm_oldoff = oldoff; 3391 jmvref->jm_newoff = newoff; 3392 3393 return (jmvref); 3394 } 3395 3396 /* 3397 * Allocate a new jremref that tracks the removal of ip from dp with the 3398 * directory entry offset of diroff. Mark the entry as ATTACHED and 3399 * DEPCOMPLETE as we have all the information required for the journal write 3400 * and the directory has already been removed from the buffer. The caller 3401 * is responsible for linking the jremref into the pagedep and adding it 3402 * to the journal to write. The MKDIR_PARENT flag is set if we're doing 3403 * a DOTDOT addition so handle_workitem_remove() can properly assign 3404 * the jsegdep when we're done. 3405 */ 3406 static struct jremref * 3407 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip, 3408 off_t diroff, nlink_t nlink) 3409 { 3410 struct jremref *jremref; 3411 3412 jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS); 3413 workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump)); 3414 jremref->jr_state = ATTACHED; 3415 newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff, 3416 nlink, ip->i_mode); 3417 jremref->jr_dirrem = dirrem; 3418 3419 return (jremref); 3420 } 3421 3422 static inline void 3423 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff, 3424 nlink_t nlink, uint16_t mode) 3425 { 3426 3427 inoref->if_jsegdep = newjsegdep(&inoref->if_list); 3428 inoref->if_diroff = diroff; 3429 inoref->if_ino = ino; 3430 inoref->if_parent = parent; 3431 inoref->if_nlink = nlink; 3432 inoref->if_mode = mode; 3433 } 3434 3435 /* 3436 * Allocate a new jaddref to track the addition of ino to dp at diroff. The 3437 * directory offset may not be known until later. The caller is responsible 3438 * adding the entry to the journal when this information is available. nlink 3439 * should be the link count prior to the addition and mode is only required 3440 * to have the correct FMT. 3441 */ 3442 static struct jaddref * 3443 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink, 3444 uint16_t mode) 3445 { 3446 struct jaddref *jaddref; 3447 3448 jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS); 3449 workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump)); 3450 jaddref->ja_state = ATTACHED; 3451 jaddref->ja_mkdir = NULL; 3452 newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode); 3453 3454 return (jaddref); 3455 } 3456 3457 /* 3458 * Create a new free dependency for a freework. The caller is responsible 3459 * for adjusting the reference count when it has the lock held. 
The freedep 3460 * will track an outstanding bitmap write that will ultimately clear the 3461 * freework to continue. 3462 */ 3463 static struct freedep * 3464 newfreedep(struct freework *freework) 3465 { 3466 struct freedep *freedep; 3467 3468 freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); 3469 workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); 3470 freedep->fd_freework = freework; 3471 3472 return (freedep); 3473 } 3474 3475 /* 3476 * Free a freedep structure once the buffer it is linked to is written. If 3477 * this is the last reference to the freework schedule it for completion. 3478 */ 3479 static void 3480 free_freedep(freedep) 3481 struct freedep *freedep; 3482 { 3483 3484 if (--freedep->fd_freework->fw_ref == 0) 3485 add_to_worklist(&freedep->fd_freework->fw_list, 1); 3486 WORKITEM_FREE(freedep, D_FREEDEP); 3487 } 3488 3489 /* 3490 * Allocate a new freework structure that may be a level in an indirect 3491 * when parent is not NULL or a top level block when it is. The top level 3492 * freework structures are allocated without lk held and before the freeblks 3493 * is visible outside of softdep_setup_freeblocks(). 3494 */ 3495 static struct freework * 3496 newfreework(ump, freeblks, parent, lbn, nb, frags, journal) 3497 struct ufsmount *ump; 3498 struct freeblks *freeblks; 3499 struct freework *parent; 3500 ufs_lbn_t lbn; 3501 ufs2_daddr_t nb; 3502 int frags; 3503 int journal; 3504 { 3505 struct freework *freework; 3506 3507 freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); 3508 workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); 3509 freework->fw_jnewblk = NULL; 3510 freework->fw_freeblks = freeblks; 3511 freework->fw_parent = parent; 3512 freework->fw_lbn = lbn; 3513 freework->fw_blkno = nb; 3514 freework->fw_frags = frags; 3515 freework->fw_ref = ((UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ) == 0 || 3516 lbn >= -NXADDR) ? 0 : NINDIR(ump->um_fs) + 1; 3517 freework->fw_off = 0; 3518 LIST_INIT(&freework->fw_jwork); 3519 3520 if (parent == NULL) { 3521 WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd, 3522 &freework->fw_list); 3523 freeblks->fb_ref++; 3524 } 3525 if (journal) 3526 newjfreeblk(freeblks, lbn, nb, frags); 3527 3528 return (freework); 3529 } 3530 3531 /* 3532 * Allocate a new jfreeblk to journal top level block pointer when truncating 3533 * a file. The caller must add this to the worklist when lk is held. 3534 */ 3535 static struct jfreeblk * 3536 newjfreeblk(freeblks, lbn, blkno, frags) 3537 struct freeblks *freeblks; 3538 ufs_lbn_t lbn; 3539 ufs2_daddr_t blkno; 3540 int frags; 3541 { 3542 struct jfreeblk *jfreeblk; 3543 3544 jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); 3545 workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp); 3546 jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list); 3547 jfreeblk->jf_state = ATTACHED | DEPCOMPLETE; 3548 jfreeblk->jf_ino = freeblks->fb_previousinum; 3549 jfreeblk->jf_lbn = lbn; 3550 jfreeblk->jf_blkno = blkno; 3551 jfreeblk->jf_frags = frags; 3552 jfreeblk->jf_freeblks = freeblks; 3553 LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps); 3554 3555 return (jfreeblk); 3556 } 3557 3558 static void move_newblock_dep(struct jaddref *, struct inodedep *); 3559 /* 3560 * If we're canceling a new bitmap we have to search for another ref 3561 * to move into the bmsafemap dep. This might be better expressed 3562 * with another structure. 
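 * The routine below walks the inode's reference list forward from the
 * jaddref being canceled, locates the next jaddref, transfers the
 * ATTACHED, UNDONE and NEWBLOCK state to it and moves it onto the
 * bmsafemap's jaddref list in place of the canceled entry.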
3563 */ 3564 static void 3565 move_newblock_dep(jaddref, inodedep) 3566 struct jaddref *jaddref; 3567 struct inodedep *inodedep; 3568 { 3569 struct inoref *inoref; 3570 struct jaddref *jaddrefn; 3571 3572 jaddrefn = NULL; 3573 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; 3574 inoref = TAILQ_NEXT(inoref, if_deps)) { 3575 if ((jaddref->ja_state & NEWBLOCK) && 3576 inoref->if_list.wk_type == D_JADDREF) { 3577 jaddrefn = (struct jaddref *)inoref; 3578 break; 3579 } 3580 } 3581 if (jaddrefn == NULL) 3582 return; 3583 jaddrefn->ja_state &= ~(ATTACHED | UNDONE); 3584 jaddrefn->ja_state |= jaddref->ja_state & 3585 (ATTACHED | UNDONE | NEWBLOCK); 3586 jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); 3587 jaddref->ja_state |= ATTACHED; 3588 LIST_REMOVE(jaddref, ja_bmdeps); 3589 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, 3590 ja_bmdeps); 3591 } 3592 3593 /* 3594 * Cancel a jaddref either before it has been written or while it is being 3595 * written. This happens when a link is removed before the add reaches 3596 * the disk. The jaddref dependency is kept linked into the bmsafemap 3597 * and inode to prevent the link count or bitmap from reaching the disk 3598 * until handle_workitem_remove() re-adjusts the counts and bitmaps as 3599 * required. 3600 * 3601 * Returns 1 if the canceled addref requires journaling of the remove and 3602 * 0 otherwise. 3603 */ 3604 static int 3605 cancel_jaddref(jaddref, inodedep, wkhd) 3606 struct jaddref *jaddref; 3607 struct inodedep *inodedep; 3608 struct workhead *wkhd; 3609 { 3610 struct inoref *inoref; 3611 struct jsegdep *jsegdep; 3612 int needsj; 3613 3614 KASSERT((jaddref->ja_state & COMPLETE) == 0, 3615 ("cancel_jaddref: Canceling complete jaddref")); 3616 if (jaddref->ja_state & (IOSTARTED | COMPLETE)) 3617 needsj = 1; 3618 else 3619 needsj = 0; 3620 if (inodedep == NULL) 3621 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 3622 0, &inodedep) == 0) 3623 panic("cancel_jaddref: Lost inodedep"); 3624 /* 3625 * We must adjust the nlink of any reference operation that follows 3626 * us so that it is consistent with the in-memory reference. This 3627 * ensures that inode nlink rollbacks always have the correct link. 3628 */ 3629 if (needsj == 0) { 3630 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; 3631 inoref = TAILQ_NEXT(inoref, if_deps)) { 3632 if (inoref->if_state & GOINGAWAY) 3633 break; 3634 inoref->if_nlink--; 3635 } 3636 } 3637 jsegdep = inoref_jseg(&jaddref->ja_ref); 3638 if (jaddref->ja_state & NEWBLOCK) 3639 move_newblock_dep(jaddref, inodedep); 3640 if (jaddref->ja_state & IOWAITING) { 3641 jaddref->ja_state &= ~IOWAITING; 3642 wakeup(&jaddref->ja_list); 3643 } 3644 jaddref->ja_mkdir = NULL; 3645 if (jaddref->ja_state & IOSTARTED) { 3646 jaddref->ja_state &= ~IOSTARTED; 3647 WORKLIST_REMOVE(&jaddref->ja_list); 3648 WORKLIST_INSERT(wkhd, &jsegdep->jd_list); 3649 } else { 3650 free_jsegdep(jsegdep); 3651 if (jaddref->ja_state & DEPCOMPLETE) 3652 remove_from_journal(&jaddref->ja_list); 3653 } 3654 jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE); 3655 /* 3656 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove 3657 * can arrange for them to be freed with the bitmap. Otherwise we 3658 * no longer need this addref attached to the inoreflst and it 3659 * will incorrectly adjust nlink if we leave it. 
3660 */ 3661 if ((jaddref->ja_state & NEWBLOCK) == 0) { 3662 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, 3663 if_deps); 3664 jaddref->ja_state |= COMPLETE; 3665 free_jaddref(jaddref); 3666 return (needsj); 3667 } 3668 /* 3669 * Leave the head of the list for jsegdeps for fast merging. 3670 */ 3671 if (LIST_FIRST(wkhd) != NULL) { 3672 jaddref->ja_state |= ONWORKLIST; 3673 LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list); 3674 } else 3675 WORKLIST_INSERT(wkhd, &jaddref->ja_list); 3676 3677 return (needsj); 3678 } 3679 3680 /* 3681 * Attempt to free a jaddref structure when some work completes. This 3682 * should only succeed once the entry is written and all dependencies have 3683 * been notified. 3684 */ 3685 static void 3686 free_jaddref(jaddref) 3687 struct jaddref *jaddref; 3688 { 3689 3690 if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE) 3691 return; 3692 if (jaddref->ja_ref.if_jsegdep) 3693 panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n", 3694 jaddref, jaddref->ja_state); 3695 if (jaddref->ja_state & NEWBLOCK) 3696 LIST_REMOVE(jaddref, ja_bmdeps); 3697 if (jaddref->ja_state & (IOSTARTED | ONWORKLIST)) 3698 panic("free_jaddref: Bad state %p(0x%X)", 3699 jaddref, jaddref->ja_state); 3700 if (jaddref->ja_mkdir != NULL) 3701 panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state); 3702 WORKITEM_FREE(jaddref, D_JADDREF); 3703 } 3704 3705 /* 3706 * Free a jremref structure once it has been written or discarded. 3707 */ 3708 static void 3709 free_jremref(jremref) 3710 struct jremref *jremref; 3711 { 3712 3713 if (jremref->jr_ref.if_jsegdep) 3714 free_jsegdep(jremref->jr_ref.if_jsegdep); 3715 if (jremref->jr_state & IOSTARTED) 3716 panic("free_jremref: IO still pending"); 3717 WORKITEM_FREE(jremref, D_JREMREF); 3718 } 3719 3720 /* 3721 * Free a jnewblk structure. 3722 */ 3723 static void 3724 free_jnewblk(jnewblk) 3725 struct jnewblk *jnewblk; 3726 { 3727 3728 if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE) 3729 return; 3730 LIST_REMOVE(jnewblk, jn_deps); 3731 if (jnewblk->jn_dep != NULL) 3732 panic("free_jnewblk: Dependency still attached."); 3733 WORKITEM_FREE(jnewblk, D_JNEWBLK); 3734 } 3735 3736 /* 3737 * Cancel a jnewblk which has been superseded by a freeblk. The jnewblk 3738 * is kept linked into the bmsafemap until the free completes, thus 3739 * preventing the modified state from ever reaching disk. The free 3740 * routine must pass this structure via ffs_blkfree() to 3741 * softdep_setup_freeblks() so there is no race in releasing the space. 
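 * Two cases are handled below: if the journal write is already in
 * flight (IOSTARTED) the jsegdep is handed to the caller's work head
 * so its journal segment reference is released only when that work
 * completes; otherwise the record is pulled back off the pending
 * journal worklist and its jsegdep is freed immediately.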
3742 */ 3743 static void 3744 cancel_jnewblk(jnewblk, wkhd) 3745 struct jnewblk *jnewblk; 3746 struct workhead *wkhd; 3747 { 3748 struct jsegdep *jsegdep; 3749 3750 jsegdep = jnewblk->jn_jsegdep; 3751 if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL) 3752 panic("cancel_jnewblk: Invalid state"); 3753 jnewblk->jn_jsegdep = NULL; 3754 jnewblk->jn_dep = NULL; 3755 jnewblk->jn_state |= GOINGAWAY; 3756 if (jnewblk->jn_state & IOSTARTED) { 3757 jnewblk->jn_state &= ~IOSTARTED; 3758 WORKLIST_REMOVE(&jnewblk->jn_list); 3759 WORKLIST_INSERT(wkhd, &jsegdep->jd_list); 3760 } else { 3761 free_jsegdep(jsegdep); 3762 remove_from_journal(&jnewblk->jn_list); 3763 } 3764 if (jnewblk->jn_state & IOWAITING) { 3765 jnewblk->jn_state &= ~IOWAITING; 3766 wakeup(&jnewblk->jn_list); 3767 } 3768 WORKLIST_INSERT(wkhd, &jnewblk->jn_list); 3769 } 3770 3771 static void 3772 free_jfreeblk(jfreeblk) 3773 struct jfreeblk *jfreeblk; 3774 { 3775 3776 WORKITEM_FREE(jfreeblk, D_JFREEBLK); 3777 } 3778 3779 /* 3780 * Free a single jseg once it is no longer referenced in memory or on 3781 * disk. Reclaim journal blocks and dependencies waiting for the segment 3782 * to disappear. 3783 */ 3784 static void 3785 free_jseg(jseg, jblocks) 3786 struct jseg *jseg; 3787 struct jblocks *jblocks; 3788 { 3789 struct freework *freework; 3790 3791 /* 3792 * Free freework structures that were lingering to indicate freed 3793 * indirect blocks that forced journal write ordering on reallocate. 3794 */ 3795 while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL) { 3796 LIST_REMOVE(freework, fw_next); 3797 WORKLIST_REMOVE(&freework->fw_list); 3798 WORKITEM_FREE(freework, D_FREEWORK); 3799 } 3800 if (jblocks->jb_oldestseg == jseg) 3801 jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next); 3802 TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); 3803 jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size); 3804 KASSERT(LIST_EMPTY(&jseg->js_entries), 3805 ("free_jseg: Freed jseg has valid entries.")); 3806 WORKITEM_FREE(jseg, D_JSEG); 3807 } 3808 3809 /* 3810 * Free all jsegs that meet the criteria for being reclaimed and update 3811 * oldestseg. 3812 */ 3813 static void 3814 free_jsegs(jblocks) 3815 struct jblocks *jblocks; 3816 { 3817 struct jseg *jseg; 3818 3819 /* 3820 * Free only those jsegs which have none allocated before them to 3821 * preserve the journal space ordering. 3822 */ 3823 while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { 3824 /* 3825 * Only reclaim space when nothing depends on this journal 3826 * set and another set has written that it is no longer 3827 * valid. 3828 */ 3829 if (jseg->js_refs != 0) { 3830 jblocks->jb_oldestseg = jseg; 3831 return; 3832 } 3833 if (!LIST_EMPTY(&jseg->js_indirs) && 3834 jseg->js_seq >= jblocks->jb_oldestwrseq) 3835 break; 3836 free_jseg(jseg, jblocks); 3837 } 3838 /* 3839 * If we exited the loop above we still must discover the 3840 * oldest valid segment. 3841 */ 3842 if (jseg) 3843 for (jseg = jblocks->jb_oldestseg; jseg != NULL; 3844 jseg = TAILQ_NEXT(jseg, js_next)) 3845 if (jseg->js_refs != 0) 3846 break; 3847 jblocks->jb_oldestseg = jseg; 3848 /* 3849 * The journal has no valid records but some jsegs may still be 3850 * waiting on oldestwrseq to advance. We force a small record 3851 * out to permit these lingering records to be reclaimed. 3852 */ 3853 if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs)) 3854 jblocks->jb_needseg = 1; 3855 } 3856 3857 /* 3858 * Release one reference to a jseg and free it if the count reaches 0. 
This 3859 * should eventually reclaim journal space as well. 3860 */ 3861 static void 3862 rele_jseg(jseg) 3863 struct jseg *jseg; 3864 { 3865 3866 KASSERT(jseg->js_refs > 0, 3867 ("free_jseg: Invalid refcnt %d", jseg->js_refs)); 3868 if (--jseg->js_refs != 0) 3869 return; 3870 free_jsegs(jseg->js_jblocks); 3871 } 3872 3873 /* 3874 * Release a jsegdep and decrement the jseg count. 3875 */ 3876 static void 3877 free_jsegdep(jsegdep) 3878 struct jsegdep *jsegdep; 3879 { 3880 3881 if (jsegdep->jd_seg) 3882 rele_jseg(jsegdep->jd_seg); 3883 WORKITEM_FREE(jsegdep, D_JSEGDEP); 3884 } 3885 3886 /* 3887 * Wait for a journal item to make it to disk. Initiate journal processing 3888 * if required. 3889 */ 3890 static void 3891 jwait(wk) 3892 struct worklist *wk; 3893 { 3894 3895 stat_journal_wait++; 3896 /* 3897 * If IO has not started we process the journal. We can't mark the 3898 * worklist item as IOWAITING because we drop the lock while 3899 * processing the journal and the worklist entry may be freed after 3900 * this point. The caller may call back in and re-issue the request. 3901 */ 3902 if ((wk->wk_state & IOSTARTED) == 0) { 3903 softdep_process_journal(wk->wk_mp, wk, MNT_WAIT); 3904 return; 3905 } 3906 wk->wk_state |= IOWAITING; 3907 msleep(wk, &lk, PRIBIO, "jwait", 0); 3908 } 3909 3910 /* 3911 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as 3912 * appropriate. This is a convenience function to reduce duplicate code 3913 * for the setup and revert functions below. 3914 */ 3915 static struct inodedep * 3916 inodedep_lookup_ip(ip) 3917 struct inode *ip; 3918 { 3919 struct inodedep *inodedep; 3920 3921 KASSERT(ip->i_nlink >= ip->i_effnlink, 3922 ("inodedep_lookup_ip: bad delta")); 3923 (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 3924 DEPALLOC, &inodedep); 3925 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 3926 3927 return (inodedep); 3928 } 3929 3930 /* 3931 * Create a journal entry that describes a truncate that we're about to 3932 * perform. The inode allocations and frees between here and the completion 3933 * of the operation are done asynchronously and without journaling. At 3934 * the end of the operation the vnode is sync'd and the journal space 3935 * is released. Recovery will discover the partially completed truncate 3936 * and complete it. 3937 */ 3938 void * 3939 softdep_setup_trunc(vp, length, flags) 3940 struct vnode *vp; 3941 off_t length; 3942 int flags; 3943 { 3944 struct jsegdep *jsegdep; 3945 struct jtrunc *jtrunc; 3946 struct ufsmount *ump; 3947 struct inode *ip; 3948 3949 softdep_prealloc(vp, MNT_WAIT); 3950 ip = VTOI(vp); 3951 ump = VFSTOUFS(vp->v_mount); 3952 jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS); 3953 workitem_alloc(&jtrunc->jt_list, D_JTRUNC, vp->v_mount); 3954 jsegdep = jtrunc->jt_jsegdep = newjsegdep(&jtrunc->jt_list); 3955 jtrunc->jt_ino = ip->i_number; 3956 jtrunc->jt_extsize = 0; 3957 jtrunc->jt_size = length; 3958 if ((flags & IO_EXT) == 0 && ump->um_fstype == UFS2) 3959 jtrunc->jt_extsize = ip->i_din2->di_extsize; 3960 if ((flags & IO_NORMAL) == 0) 3961 jtrunc->jt_size = DIP(ip, i_size); 3962 ACQUIRE_LOCK(&lk); 3963 add_to_journal(&jtrunc->jt_list); 3964 while (jsegdep->jd_seg == NULL) { 3965 stat_jwait_freeblks++; 3966 jwait(&jtrunc->jt_list); 3967 } 3968 FREE_LOCK(&lk); 3969 3970 return (jsegdep); 3971 } 3972 3973 /* 3974 * After synchronous truncation is complete we free sync the vnode and 3975 * release the jsegdep so the journal space can be freed. 
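 * A minimal sketch of how the two halves are expected to be paired by
 * a caller (hypothetical caller, error handling omitted):
 *
 *	void *cookie;
 *
 *	cookie = softdep_setup_trunc(vp, length, flags);
 *	... shorten the file; allocations and frees done here are
 *	    not individually journaled ...
 *	error = softdep_complete_trunc(vp, cookie);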
3976 */ 3977 int 3978 softdep_complete_trunc(vp, cookie) 3979 struct vnode *vp; 3980 void *cookie; 3981 { 3982 int error; 3983 3984 error = ffs_syncvnode(vp, MNT_WAIT); 3985 ACQUIRE_LOCK(&lk); 3986 free_jsegdep((struct jsegdep *)cookie); 3987 FREE_LOCK(&lk); 3988 3989 return (error); 3990 } 3991 3992 /* 3993 * Called prior to creating a new inode and linking it to a directory. The 3994 * jaddref structure must already be allocated by softdep_setup_inomapdep 3995 * and it is discovered here so we can initialize the mode and update 3996 * nlinkdelta. 3997 */ 3998 void 3999 softdep_setup_create(dp, ip) 4000 struct inode *dp; 4001 struct inode *ip; 4002 { 4003 struct inodedep *inodedep; 4004 struct jaddref *jaddref; 4005 struct vnode *dvp; 4006 4007 KASSERT(ip->i_nlink == 1, 4008 ("softdep_setup_create: Invalid link count.")); 4009 dvp = ITOV(dp); 4010 ACQUIRE_LOCK(&lk); 4011 inodedep = inodedep_lookup_ip(ip); 4012 if (DOINGSUJ(dvp)) { 4013 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4014 inoreflst); 4015 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 4016 ("softdep_setup_create: No addref structure present.")); 4017 jaddref->ja_mode = ip->i_mode; 4018 } 4019 softdep_prelink(dvp, NULL); 4020 FREE_LOCK(&lk); 4021 } 4022 4023 /* 4024 * Create a jaddref structure to track the addition of a DOTDOT link when 4025 * we are reparenting an inode as part of a rename. This jaddref will be 4026 * found by softdep_setup_directory_change. Adjusts nlinkdelta for 4027 * non-journaling softdep. 4028 */ 4029 void 4030 softdep_setup_dotdot_link(dp, ip) 4031 struct inode *dp; 4032 struct inode *ip; 4033 { 4034 struct inodedep *inodedep; 4035 struct jaddref *jaddref; 4036 struct vnode *dvp; 4037 struct vnode *vp; 4038 4039 dvp = ITOV(dp); 4040 vp = ITOV(ip); 4041 jaddref = NULL; 4042 /* 4043 * We don't set MKDIR_PARENT as this is not tied to a mkdir and 4044 * is used as a normal link would be. 4045 */ 4046 if (DOINGSUJ(dvp)) 4047 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 4048 dp->i_effnlink - 1, dp->i_mode); 4049 ACQUIRE_LOCK(&lk); 4050 inodedep = inodedep_lookup_ip(dp); 4051 if (jaddref) 4052 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4053 if_deps); 4054 softdep_prelink(dvp, ITOV(ip)); 4055 FREE_LOCK(&lk); 4056 } 4057 4058 /* 4059 * Create a jaddref structure to track a new link to an inode. The directory 4060 * offset is not known until softdep_setup_directory_add or 4061 * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling 4062 * softdep. 4063 */ 4064 void 4065 softdep_setup_link(dp, ip) 4066 struct inode *dp; 4067 struct inode *ip; 4068 { 4069 struct inodedep *inodedep; 4070 struct jaddref *jaddref; 4071 struct vnode *dvp; 4072 4073 dvp = ITOV(dp); 4074 jaddref = NULL; 4075 if (DOINGSUJ(dvp)) 4076 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, 4077 ip->i_mode); 4078 ACQUIRE_LOCK(&lk); 4079 inodedep = inodedep_lookup_ip(ip); 4080 if (jaddref) 4081 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4082 if_deps); 4083 softdep_prelink(dvp, ITOV(ip)); 4084 FREE_LOCK(&lk); 4085 } 4086 4087 /* 4088 * Called to create the jaddref structures to track . and .. references as 4089 * well as lookup and further initialize the incomplete jaddref created 4090 * by softdep_setup_inomapdep when the inode was allocated. Adjusts 4091 * nlinkdelta for non-journaling softdep. 
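 * A rough sketch of the inode reference lists this leaves behind when
 * journaling, based on the insertions performed below:
 *
 *	new directory's id_inoreflst: ... "." jaddref (MKDIR_BODY)
 *	    inserted just ahead of the original create jaddref that
 *	    softdep_setup_inomapdep() queued at the tail
 *	parent's id_inoreflst:        ... ".." jaddref (MKDIR_PARENT)
 *	    appended at the tail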
4092 */ 4093 void 4094 softdep_setup_mkdir(dp, ip) 4095 struct inode *dp; 4096 struct inode *ip; 4097 { 4098 struct inodedep *inodedep; 4099 struct jaddref *dotdotaddref; 4100 struct jaddref *dotaddref; 4101 struct jaddref *jaddref; 4102 struct vnode *dvp; 4103 4104 dvp = ITOV(dp); 4105 dotaddref = dotdotaddref = NULL; 4106 if (DOINGSUJ(dvp)) { 4107 dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, 4108 ip->i_mode); 4109 dotaddref->ja_state |= MKDIR_BODY; 4110 dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 4111 dp->i_effnlink - 1, dp->i_mode); 4112 dotdotaddref->ja_state |= MKDIR_PARENT; 4113 } 4114 ACQUIRE_LOCK(&lk); 4115 inodedep = inodedep_lookup_ip(ip); 4116 if (DOINGSUJ(dvp)) { 4117 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4118 inoreflst); 4119 KASSERT(jaddref != NULL, 4120 ("softdep_setup_mkdir: No addref structure present.")); 4121 KASSERT(jaddref->ja_parent == dp->i_number, 4122 ("softdep_setup_mkdir: bad parent %d", 4123 jaddref->ja_parent)); 4124 jaddref->ja_mode = ip->i_mode; 4125 TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, 4126 if_deps); 4127 } 4128 inodedep = inodedep_lookup_ip(dp); 4129 if (DOINGSUJ(dvp)) 4130 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, 4131 &dotdotaddref->ja_ref, if_deps); 4132 softdep_prelink(ITOV(dp), NULL); 4133 FREE_LOCK(&lk); 4134 } 4135 4136 /* 4137 * Called to track nlinkdelta of the inode and parent directories prior to 4138 * unlinking a directory. 4139 */ 4140 void 4141 softdep_setup_rmdir(dp, ip) 4142 struct inode *dp; 4143 struct inode *ip; 4144 { 4145 struct vnode *dvp; 4146 4147 dvp = ITOV(dp); 4148 ACQUIRE_LOCK(&lk); 4149 (void) inodedep_lookup_ip(ip); 4150 (void) inodedep_lookup_ip(dp); 4151 softdep_prelink(dvp, ITOV(ip)); 4152 FREE_LOCK(&lk); 4153 } 4154 4155 /* 4156 * Called to track nlinkdelta of the inode and parent directories prior to 4157 * unlink. 4158 */ 4159 void 4160 softdep_setup_unlink(dp, ip) 4161 struct inode *dp; 4162 struct inode *ip; 4163 { 4164 struct vnode *dvp; 4165 4166 dvp = ITOV(dp); 4167 ACQUIRE_LOCK(&lk); 4168 (void) inodedep_lookup_ip(ip); 4169 (void) inodedep_lookup_ip(dp); 4170 softdep_prelink(dvp, ITOV(ip)); 4171 FREE_LOCK(&lk); 4172 } 4173 4174 /* 4175 * Called to release the journal structures created by a failed non-directory 4176 * creation. Adjusts nlinkdelta for non-journaling softdep. 4177 */ 4178 void 4179 softdep_revert_create(dp, ip) 4180 struct inode *dp; 4181 struct inode *ip; 4182 { 4183 struct inodedep *inodedep; 4184 struct jaddref *jaddref; 4185 struct vnode *dvp; 4186 4187 dvp = ITOV(dp); 4188 ACQUIRE_LOCK(&lk); 4189 inodedep = inodedep_lookup_ip(ip); 4190 if (DOINGSUJ(dvp)) { 4191 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4192 inoreflst); 4193 KASSERT(jaddref->ja_parent == dp->i_number, 4194 ("softdep_revert_create: addref parent mismatch")); 4195 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4196 } 4197 FREE_LOCK(&lk); 4198 } 4199 4200 /* 4201 * Called to release the journal structures created by a failed dotdot link 4202 * creation. Adjusts nlinkdelta for non-journaling softdep. 
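 * Sketch of the expected setup/revert pairing (hypothetical caller in
 * the rename path, error handling abbreviated):
 *
 *	softdep_setup_dotdot_link(dp, ip);
 *	error = ... rewrite the ".." entry ...;
 *	if (error)
 *		softdep_revert_dotdot_link(dp, ip);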
4203 */ 4204 void 4205 softdep_revert_dotdot_link(dp, ip) 4206 struct inode *dp; 4207 struct inode *ip; 4208 { 4209 struct inodedep *inodedep; 4210 struct jaddref *jaddref; 4211 struct vnode *dvp; 4212 4213 dvp = ITOV(dp); 4214 ACQUIRE_LOCK(&lk); 4215 inodedep = inodedep_lookup_ip(dp); 4216 if (DOINGSUJ(dvp)) { 4217 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4218 inoreflst); 4219 KASSERT(jaddref->ja_parent == ip->i_number, 4220 ("softdep_revert_dotdot_link: addref parent mismatch")); 4221 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4222 } 4223 FREE_LOCK(&lk); 4224 } 4225 4226 /* 4227 * Called to release the journal structures created by a failed link 4228 * addition. Adjusts nlinkdelta for non-journaling softdep. 4229 */ 4230 void 4231 softdep_revert_link(dp, ip) 4232 struct inode *dp; 4233 struct inode *ip; 4234 { 4235 struct inodedep *inodedep; 4236 struct jaddref *jaddref; 4237 struct vnode *dvp; 4238 4239 dvp = ITOV(dp); 4240 ACQUIRE_LOCK(&lk); 4241 inodedep = inodedep_lookup_ip(ip); 4242 if (DOINGSUJ(dvp)) { 4243 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4244 inoreflst); 4245 KASSERT(jaddref->ja_parent == dp->i_number, 4246 ("softdep_revert_link: addref parent mismatch")); 4247 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4248 } 4249 FREE_LOCK(&lk); 4250 } 4251 4252 /* 4253 * Called to release the journal structures created by a failed mkdir 4254 * attempt. Adjusts nlinkdelta for non-journaling softdep. 4255 */ 4256 void 4257 softdep_revert_mkdir(dp, ip) 4258 struct inode *dp; 4259 struct inode *ip; 4260 { 4261 struct inodedep *inodedep; 4262 struct jaddref *jaddref; 4263 struct jaddref *dotaddref; 4264 struct vnode *dvp; 4265 4266 dvp = ITOV(dp); 4267 4268 ACQUIRE_LOCK(&lk); 4269 inodedep = inodedep_lookup_ip(dp); 4270 if (DOINGSUJ(dvp)) { 4271 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4272 inoreflst); 4273 KASSERT(jaddref->ja_parent == ip->i_number, 4274 ("softdep_revert_mkdir: dotdot addref parent mismatch")); 4275 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4276 } 4277 inodedep = inodedep_lookup_ip(ip); 4278 if (DOINGSUJ(dvp)) { 4279 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4280 inoreflst); 4281 KASSERT(jaddref->ja_parent == dp->i_number, 4282 ("softdep_revert_mkdir: addref parent mismatch")); 4283 dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, 4284 inoreflst, if_deps); 4285 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4286 KASSERT(dotaddref->ja_parent == ip->i_number, 4287 ("softdep_revert_mkdir: dot addref parent mismatch")); 4288 cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait); 4289 } 4290 FREE_LOCK(&lk); 4291 } 4292 4293 /* 4294 * Called to correct nlinkdelta after a failed rmdir. 4295 */ 4296 void 4297 softdep_revert_rmdir(dp, ip) 4298 struct inode *dp; 4299 struct inode *ip; 4300 { 4301 4302 ACQUIRE_LOCK(&lk); 4303 (void) inodedep_lookup_ip(ip); 4304 (void) inodedep_lookup_ip(dp); 4305 FREE_LOCK(&lk); 4306 } 4307 4308 /* 4309 * Protecting the freemaps (or bitmaps). 4310 * 4311 * To eliminate the need to execute fsck before mounting a filesystem 4312 * after a power failure, one must (conservatively) guarantee that the 4313 * on-disk copy of the bitmaps never indicate that a live inode or block is 4314 * free. So, when a block or inode is allocated, the bitmap should be 4315 * updated (on disk) before any new pointers. 
When a block or inode is 4316 * freed, the bitmap should not be updated until all pointers have been 4317 * reset. The latter dependency is handled by the delayed de-allocation 4318 * approach described below for block and inode de-allocation. The former 4319 * dependency is handled by calling the following procedure when a block or 4320 * inode is allocated. When an inode is allocated an "inodedep" is created 4321 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. 4322 * Each "inodedep" is also inserted into the hash indexing structure so 4323 * that any additional link additions can be made dependent on the inode 4324 * allocation. 4325 * 4326 * The ufs filesystem maintains a number of free block counts (e.g., per 4327 * cylinder group, per cylinder and per <cylinder, rotational position> pair) 4328 * in addition to the bitmaps. These counts are used to improve efficiency 4329 * during allocation and therefore must be consistent with the bitmaps. 4330 * There is no convenient way to guarantee post-crash consistency of these 4331 * counts with simple update ordering, for two main reasons: (1) The counts 4332 * and bitmaps for a single cylinder group block are not in the same disk 4333 * sector. If a disk write is interrupted (e.g., by power failure), one may 4334 * be written and the other not. (2) Some of the counts are located in the 4335 * superblock rather than the cylinder group block. So, we focus our soft 4336 * updates implementation on protecting the bitmaps. When mounting a 4337 * filesystem, we recompute the auxiliary counts from the bitmaps. 4338 */ 4339 4340 /* 4341 * Called just after updating the cylinder group block to allocate an inode. 4342 */ 4343 void 4344 softdep_setup_inomapdep(bp, ip, newinum) 4345 struct buf *bp; /* buffer for cylgroup block with inode map */ 4346 struct inode *ip; /* inode related to allocation */ 4347 ino_t newinum; /* new inode number being allocated */ 4348 { 4349 struct inodedep *inodedep; 4350 struct bmsafemap *bmsafemap; 4351 struct jaddref *jaddref; 4352 struct mount *mp; 4353 struct fs *fs; 4354 4355 mp = UFSTOVFS(ip->i_ump); 4356 fs = ip->i_ump->um_fs; 4357 jaddref = NULL; 4358 4359 /* 4360 * Allocate the journal reference add structure so that the bitmap 4361 * can be dependent on it. 4362 */ 4363 if (mp->mnt_kern_flag & MNTK_SUJ) { 4364 jaddref = newjaddref(ip, newinum, 0, 0, 0); 4365 jaddref->ja_state |= NEWBLOCK; 4366 } 4367 4368 /* 4369 * Create a dependency for the newly allocated inode. 4370 * Panic if it already exists as something is seriously wrong. 4371 * Otherwise add it to the dependency list for the buffer holding 4372 * the cylinder group map from which it was allocated. 4373 */ 4374 ACQUIRE_LOCK(&lk); 4375 if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep))) 4376 panic("softdep_setup_inomapdep: dependency %p for new" 4377 "inode already exists", inodedep); 4378 bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum)); 4379 if (jaddref) { 4380 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps); 4381 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4382 if_deps); 4383 } else { 4384 inodedep->id_state |= ONDEPLIST; 4385 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); 4386 } 4387 inodedep->id_bmsafemap = bmsafemap; 4388 inodedep->id_state &= ~DEPCOMPLETE; 4389 FREE_LOCK(&lk); 4390 } 4391 4392 /* 4393 * Called just after updating the cylinder group block to 4394 * allocate block or fragment. 
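 * In terms of the ordering rules described above, the allocator is
 * expected to proceed roughly as follows (illustrative sketch):
 *
 *	1. clear the relevant bits in the cylinder group bitmap held
 *	   in bp;
 *	2. call softdep_setup_blkmapdep(bp, mp, newblkno, frags,
 *	   oldfrags) to record that the new block's bitmap update has
 *	   not yet reached disk;
 *	3. only then store the new block number in the inode or
 *	   indirect block, where softdep_setup_allocdirect() or
 *	   softdep_setup_allocindir_*() pick up the dependency
 *	   created here.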
4395 */ 4396 void 4397 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 4398 struct buf *bp; /* buffer for cylgroup block with block map */ 4399 struct mount *mp; /* filesystem doing allocation */ 4400 ufs2_daddr_t newblkno; /* number of newly allocated block */ 4401 int frags; /* Number of fragments. */ 4402 int oldfrags; /* Previous number of fragments for extend. */ 4403 { 4404 struct newblk *newblk; 4405 struct bmsafemap *bmsafemap; 4406 struct jnewblk *jnewblk; 4407 struct fs *fs; 4408 4409 fs = VFSTOUFS(mp)->um_fs; 4410 jnewblk = NULL; 4411 /* 4412 * Create a dependency for the newly allocated block. 4413 * Add it to the dependency list for the buffer holding 4414 * the cylinder group map from which it was allocated. 4415 */ 4416 if (mp->mnt_kern_flag & MNTK_SUJ) { 4417 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); 4418 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); 4419 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); 4420 jnewblk->jn_state = ATTACHED; 4421 jnewblk->jn_blkno = newblkno; 4422 jnewblk->jn_frags = frags; 4423 jnewblk->jn_oldfrags = oldfrags; 4424 #ifdef SUJ_DEBUG 4425 { 4426 struct cg *cgp; 4427 uint8_t *blksfree; 4428 long bno; 4429 int i; 4430 4431 cgp = (struct cg *)bp->b_data; 4432 blksfree = cg_blksfree(cgp); 4433 bno = dtogd(fs, jnewblk->jn_blkno); 4434 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 4435 i++) { 4436 if (isset(blksfree, bno + i)) 4437 panic("softdep_setup_blkmapdep: " 4438 "free fragment %d from %d-%d " 4439 "state 0x%X dep %p", i, 4440 jnewblk->jn_oldfrags, 4441 jnewblk->jn_frags, 4442 jnewblk->jn_state, 4443 jnewblk->jn_dep); 4444 } 4445 } 4446 #endif 4447 } 4448 ACQUIRE_LOCK(&lk); 4449 if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) 4450 panic("softdep_setup_blkmapdep: found block"); 4451 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, 4452 dtog(fs, newblkno)); 4453 if (jnewblk) { 4454 jnewblk->jn_dep = (struct worklist *)newblk; 4455 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); 4456 } else { 4457 newblk->nb_state |= ONDEPLIST; 4458 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 4459 } 4460 newblk->nb_bmsafemap = bmsafemap; 4461 newblk->nb_jnewblk = jnewblk; 4462 FREE_LOCK(&lk); 4463 } 4464 4465 #define BMSAFEMAP_HASH(fs, cg) \ 4466 (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash]) 4467 4468 static int 4469 bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp) 4470 struct bmsafemap_hashhead *bmsafemaphd; 4471 struct mount *mp; 4472 int cg; 4473 struct bmsafemap **bmsafemapp; 4474 { 4475 struct bmsafemap *bmsafemap; 4476 4477 LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) 4478 if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg) 4479 break; 4480 if (bmsafemap) { 4481 *bmsafemapp = bmsafemap; 4482 return (1); 4483 } 4484 *bmsafemapp = NULL; 4485 4486 return (0); 4487 } 4488 4489 /* 4490 * Find the bmsafemap associated with a cylinder group buffer. 4491 * If none exists, create one. The buffer must be locked when 4492 * this routine is called and this routine must be called with 4493 * splbio interrupts blocked. 
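 * Note that the allocation below may sleep, so lk is dropped around
 * the malloc() and the hash lookup is repeated afterwards; if another
 * thread created the bmsafemap in the meantime the freshly allocated
 * one is discarded in favor of the existing (collision) entry.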
4494 */ 4495 static struct bmsafemap * 4496 bmsafemap_lookup(mp, bp, cg) 4497 struct mount *mp; 4498 struct buf *bp; 4499 int cg; 4500 { 4501 struct bmsafemap_hashhead *bmsafemaphd; 4502 struct bmsafemap *bmsafemap, *collision; 4503 struct worklist *wk; 4504 struct fs *fs; 4505 4506 mtx_assert(&lk, MA_OWNED); 4507 if (bp) 4508 LIST_FOREACH(wk, &bp->b_dep, wk_list) 4509 if (wk->wk_type == D_BMSAFEMAP) 4510 return (WK_BMSAFEMAP(wk)); 4511 fs = VFSTOUFS(mp)->um_fs; 4512 bmsafemaphd = BMSAFEMAP_HASH(fs, cg); 4513 if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) 4514 return (bmsafemap); 4515 FREE_LOCK(&lk); 4516 bmsafemap = malloc(sizeof(struct bmsafemap), 4517 M_BMSAFEMAP, M_SOFTDEP_FLAGS); 4518 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); 4519 bmsafemap->sm_buf = bp; 4520 LIST_INIT(&bmsafemap->sm_inodedephd); 4521 LIST_INIT(&bmsafemap->sm_inodedepwr); 4522 LIST_INIT(&bmsafemap->sm_newblkhd); 4523 LIST_INIT(&bmsafemap->sm_newblkwr); 4524 LIST_INIT(&bmsafemap->sm_jaddrefhd); 4525 LIST_INIT(&bmsafemap->sm_jnewblkhd); 4526 ACQUIRE_LOCK(&lk); 4527 if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) { 4528 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 4529 return (collision); 4530 } 4531 bmsafemap->sm_cg = cg; 4532 LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); 4533 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 4534 return (bmsafemap); 4535 } 4536 4537 /* 4538 * Direct block allocation dependencies. 4539 * 4540 * When a new block is allocated, the corresponding disk locations must be 4541 * initialized (with zeros or new data) before the on-disk inode points to 4542 * them. Also, the freemap from which the block was allocated must be 4543 * updated (on disk) before the inode's pointer. These two dependencies are 4544 * independent of each other and are needed for all file blocks and indirect 4545 * blocks that are pointed to directly by the inode. Just before the 4546 * "in-core" version of the inode is updated with a newly allocated block 4547 * number, a procedure (below) is called to setup allocation dependency 4548 * structures. These structures are removed when the corresponding 4549 * dependencies are satisfied or when the block allocation becomes obsolete 4550 * (i.e., the file is deleted, the block is de-allocated, or the block is a 4551 * fragment that gets upgraded). All of these cases are handled in 4552 * procedures described later. 4553 * 4554 * When a file extension causes a fragment to be upgraded, either to a larger 4555 * fragment or to a full block, the on-disk location may change (if the 4556 * previous fragment could not simply be extended). In this case, the old 4557 * fragment must be de-allocated, but not until after the inode's pointer has 4558 * been updated. In most cases, this is handled by later procedures, which 4559 * will construct a "freefrag" structure to be added to the workitem queue 4560 * when the inode update is complete (or obsolete). The main exception to 4561 * this is when an allocation occurs while a pending allocation dependency 4562 * (for the same block pointer) remains. This case is handled in the main 4563 * allocation dependency setup procedure by immediately freeing the 4564 * unreferenced fragments. 
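 * As a concrete, illustrative example of the fragment-upgrade case:
 * suppose a file's last block is a two-fragment piece at location F
 * and an append forces it to grow into a full block at location B.
 * Block B is written with the data, the in-core pointer is switched
 * from F to B, and an allocdirect carrying a freefrag for F is set
 * up; only after the inode naming B has been committed to disk does
 * the freefrag workitem return F to the freemap.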
4565 */ 4566 void 4567 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) 4568 struct inode *ip; /* inode to which block is being added */ 4569 ufs_lbn_t off; /* block pointer within inode */ 4570 ufs2_daddr_t newblkno; /* disk block number being added */ 4571 ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ 4572 long newsize; /* size of new block */ 4573 long oldsize; /* size of new block */ 4574 struct buf *bp; /* bp for allocated block */ 4575 { 4576 struct allocdirect *adp, *oldadp; 4577 struct allocdirectlst *adphead; 4578 struct freefrag *freefrag; 4579 struct inodedep *inodedep; 4580 struct pagedep *pagedep; 4581 struct jnewblk *jnewblk; 4582 struct newblk *newblk; 4583 struct mount *mp; 4584 ufs_lbn_t lbn; 4585 4586 lbn = bp->b_lblkno; 4587 mp = UFSTOVFS(ip->i_ump); 4588 if (oldblkno && oldblkno != newblkno) 4589 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); 4590 else 4591 freefrag = NULL; 4592 4593 ACQUIRE_LOCK(&lk); 4594 if (off >= NDADDR) { 4595 if (lbn > 0) 4596 panic("softdep_setup_allocdirect: bad lbn %jd, off %jd", 4597 lbn, off); 4598 /* allocating an indirect block */ 4599 if (oldblkno != 0) 4600 panic("softdep_setup_allocdirect: non-zero indir"); 4601 } else { 4602 if (off != lbn) 4603 panic("softdep_setup_allocdirect: lbn %jd != off %jd", 4604 lbn, off); 4605 /* 4606 * Allocating a direct block. 4607 * 4608 * If we are allocating a directory block, then we must 4609 * allocate an associated pagedep to track additions and 4610 * deletions. 4611 */ 4612 if ((ip->i_mode & IFMT) == IFDIR && 4613 pagedep_lookup(mp, ip->i_number, off, DEPALLOC, 4614 &pagedep) == 0) 4615 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 4616 } 4617 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) 4618 panic("softdep_setup_allocdirect: lost block"); 4619 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 4620 ("softdep_setup_allocdirect: newblk already initialized")); 4621 /* 4622 * Convert the newblk to an allocdirect. 4623 */ 4624 newblk->nb_list.wk_type = D_ALLOCDIRECT; 4625 adp = (struct allocdirect *)newblk; 4626 newblk->nb_freefrag = freefrag; 4627 adp->ad_offset = off; 4628 adp->ad_oldblkno = oldblkno; 4629 adp->ad_newsize = newsize; 4630 adp->ad_oldsize = oldsize; 4631 4632 /* 4633 * Finish initializing the journal. 4634 */ 4635 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 4636 jnewblk->jn_ino = ip->i_number; 4637 jnewblk->jn_lbn = lbn; 4638 add_to_journal(&jnewblk->jn_list); 4639 } 4640 if (freefrag && freefrag->ff_jdep != NULL && 4641 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 4642 add_to_journal(freefrag->ff_jdep); 4643 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); 4644 adp->ad_inodedep = inodedep; 4645 4646 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); 4647 /* 4648 * The list of allocdirects must be kept in sorted and ascending 4649 * order so that the rollback routines can quickly determine the 4650 * first uncommitted block (the size of the file stored on disk 4651 * ends at the end of the lowest committed fragment, or if there 4652 * are no fragments, at the end of the highest committed block). 4653 * Since files generally grow, the typical case is that the new 4654 * block is to be added at the end of the list. We speed this 4655 * special case by checking against the last allocdirect in the 4656 * list before laboriously traversing the list looking for the 4657 * insertion point. 
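 * For example, a file growing sequentially adds offsets 5, 6, 7, ...;
 * each new allocdirect compares against TAILQ_LAST() and is appended
 * without walking the list, while an out-of-order allocation (such as
 * filling a hole) falls back to the linear search below.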
4658 */ 4659 adphead = &inodedep->id_newinoupdt; 4660 oldadp = TAILQ_LAST(adphead, allocdirectlst); 4661 if (oldadp == NULL || oldadp->ad_offset <= off) { 4662 /* insert at end of list */ 4663 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 4664 if (oldadp != NULL && oldadp->ad_offset == off) 4665 allocdirect_merge(adphead, adp, oldadp); 4666 FREE_LOCK(&lk); 4667 return; 4668 } 4669 TAILQ_FOREACH(oldadp, adphead, ad_next) { 4670 if (oldadp->ad_offset >= off) 4671 break; 4672 } 4673 if (oldadp == NULL) 4674 panic("softdep_setup_allocdirect: lost entry"); 4675 /* insert in middle of list */ 4676 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 4677 if (oldadp->ad_offset == off) 4678 allocdirect_merge(adphead, adp, oldadp); 4679 4680 FREE_LOCK(&lk); 4681 } 4682 4683 /* 4684 * Merge a newer and older journal record to be stored either in a 4685 * newblock or freefrag. This handles aggregating journal records for 4686 * fragment allocation into a second record as well as replacing a 4687 * journal free with an aborted journal allocation. A segment for the 4688 * oldest record will be placed on wkhd if it has been written. If not 4689 * the segment for the newer record will suffice. 4690 */ 4691 static struct worklist * 4692 jnewblk_merge(new, old, wkhd) 4693 struct worklist *new; 4694 struct worklist *old; 4695 struct workhead *wkhd; 4696 { 4697 struct jnewblk *njnewblk; 4698 struct jnewblk *jnewblk; 4699 4700 /* Handle NULLs to simplify callers. */ 4701 if (new == NULL) 4702 return (old); 4703 if (old == NULL) 4704 return (new); 4705 /* Replace a jfreefrag with a jnewblk. */ 4706 if (new->wk_type == D_JFREEFRAG) { 4707 cancel_jfreefrag(WK_JFREEFRAG(new)); 4708 return (old); 4709 } 4710 /* 4711 * Handle merging of two jnewblk records that describe 4712 * different sets of fragments in the same block. 4713 */ 4714 jnewblk = WK_JNEWBLK(old); 4715 njnewblk = WK_JNEWBLK(new); 4716 if (jnewblk->jn_blkno != njnewblk->jn_blkno) 4717 panic("jnewblk_merge: Merging disparate blocks."); 4718 /* 4719 * The record may be rolled back in the cg update bits 4720 * appropriately. NEWBLOCK here alerts the cg rollback code 4721 * that the frag bits have changed. 4722 */ 4723 if (jnewblk->jn_state & UNDONE) { 4724 njnewblk->jn_state |= UNDONE | NEWBLOCK; 4725 njnewblk->jn_state &= ~ATTACHED; 4726 jnewblk->jn_state &= ~UNDONE; 4727 } 4728 /* 4729 * We modify the newer addref and free the older so that if neither 4730 * has been written the most up-to-date copy will be on disk. If 4731 * both have been written but rolled back we only temporarily need 4732 * one of them to fix the bits when the cg write completes. 4733 */ 4734 jnewblk->jn_state |= ATTACHED | COMPLETE; 4735 njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; 4736 cancel_jnewblk(jnewblk, wkhd); 4737 WORKLIST_REMOVE(&jnewblk->jn_list); 4738 free_jnewblk(jnewblk); 4739 return (new); 4740 } 4741 4742 /* 4743 * Replace an old allocdirect dependency with a newer one. 4744 * This routine must be called with splbio interrupts blocked. 
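 * This is reached when the same direct block pointer is reallocated
 * again before the earlier allocation has committed, for instance a
 * tail fragment that is extended a second time.  The newer
 * allocdirect inherits ad_oldblkno/ad_oldsize from the old one and,
 * where possible, the freefrags are swapped so the obsolete space
 * can be released as soon as the new dependency completes.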
4745 */ 4746 static void 4747 allocdirect_merge(adphead, newadp, oldadp) 4748 struct allocdirectlst *adphead; /* head of list holding allocdirects */ 4749 struct allocdirect *newadp; /* allocdirect being added */ 4750 struct allocdirect *oldadp; /* existing allocdirect being checked */ 4751 { 4752 struct worklist *wk; 4753 struct freefrag *freefrag; 4754 struct newdirblk *newdirblk; 4755 4756 freefrag = NULL; 4757 mtx_assert(&lk, MA_OWNED); 4758 if (newadp->ad_oldblkno != oldadp->ad_newblkno || 4759 newadp->ad_oldsize != oldadp->ad_newsize || 4760 newadp->ad_offset >= NDADDR) 4761 panic("%s %jd != new %jd || old size %ld != new %ld", 4762 "allocdirect_merge: old blkno", 4763 (intmax_t)newadp->ad_oldblkno, 4764 (intmax_t)oldadp->ad_newblkno, 4765 newadp->ad_oldsize, oldadp->ad_newsize); 4766 newadp->ad_oldblkno = oldadp->ad_oldblkno; 4767 newadp->ad_oldsize = oldadp->ad_oldsize; 4768 /* 4769 * If the old dependency had a fragment to free or had never 4770 * previously had a block allocated, then the new dependency 4771 * can immediately post its freefrag and adopt the old freefrag. 4772 * This action is done by swapping the freefrag dependencies. 4773 * The new dependency gains the old one's freefrag, and the 4774 * old one gets the new one and then immediately puts it on 4775 * the worklist when it is freed by free_newblk. It is 4776 * not possible to do this swap when the old dependency had a 4777 * non-zero size but no previous fragment to free. This condition 4778 * arises when the new block is an extension of the old block. 4779 * Here, the first part of the fragment allocated to the new 4780 * dependency is part of the block currently claimed on disk by 4781 * the old dependency, so cannot legitimately be freed until the 4782 * conditions for the new dependency are fulfilled. 4783 */ 4784 freefrag = newadp->ad_freefrag; 4785 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { 4786 newadp->ad_freefrag = oldadp->ad_freefrag; 4787 oldadp->ad_freefrag = freefrag; 4788 } 4789 /* 4790 * If we are tracking a new directory-block allocation, 4791 * move it from the old allocdirect to the new allocdirect. 4792 */ 4793 if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { 4794 newdirblk = WK_NEWDIRBLK(wk); 4795 WORKLIST_REMOVE(&newdirblk->db_list); 4796 if (!LIST_EMPTY(&oldadp->ad_newdirblk)) 4797 panic("allocdirect_merge: extra newdirblk"); 4798 WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list); 4799 } 4800 TAILQ_REMOVE(adphead, oldadp, ad_next); 4801 /* 4802 * We need to move any journal dependencies over to the freefrag 4803 * that releases this block if it exists. Otherwise we are 4804 * extending an existing block and we'll wait until that is 4805 * complete to release the journal space and extend the 4806 * new journal to cover this old space as well. 
4807 */ 4808 if (freefrag == NULL) { 4809 if (oldadp->ad_newblkno != newadp->ad_newblkno) 4810 panic("allocdirect_merge: %jd != %jd", 4811 oldadp->ad_newblkno, newadp->ad_newblkno); 4812 newadp->ad_block.nb_jnewblk = (struct jnewblk *) 4813 jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list, 4814 &oldadp->ad_block.nb_jnewblk->jn_list, 4815 &newadp->ad_block.nb_jwork); 4816 oldadp->ad_block.nb_jnewblk = NULL; 4817 if (cancel_newblk(&oldadp->ad_block, NULL, 4818 &newadp->ad_block.nb_jwork)) 4819 panic("allocdirect_merge: Unexpected dependency."); 4820 } else { 4821 wk = (struct worklist *) cancel_newblk(&oldadp->ad_block, 4822 &freefrag->ff_list, &freefrag->ff_jwork); 4823 freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk, 4824 &freefrag->ff_jwork); 4825 } 4826 free_newblk(&oldadp->ad_block); 4827 } 4828 4829 /* 4830 * Allocate a jfreefrag structure to journal a single block free. 4831 */ 4832 static struct jfreefrag * 4833 newjfreefrag(freefrag, ip, blkno, size, lbn) 4834 struct freefrag *freefrag; 4835 struct inode *ip; 4836 ufs2_daddr_t blkno; 4837 long size; 4838 ufs_lbn_t lbn; 4839 { 4840 struct jfreefrag *jfreefrag; 4841 struct fs *fs; 4842 4843 fs = ip->i_fs; 4844 jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, 4845 M_SOFTDEP_FLAGS); 4846 workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump)); 4847 jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); 4848 jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; 4849 jfreefrag->fr_ino = ip->i_number; 4850 jfreefrag->fr_lbn = lbn; 4851 jfreefrag->fr_blkno = blkno; 4852 jfreefrag->fr_frags = numfrags(fs, size); 4853 jfreefrag->fr_freefrag = freefrag; 4854 4855 return (jfreefrag); 4856 } 4857 4858 /* 4859 * Allocate a new freefrag structure. 4860 */ 4861 static struct freefrag * 4862 newfreefrag(ip, blkno, size, lbn) 4863 struct inode *ip; 4864 ufs2_daddr_t blkno; 4865 long size; 4866 ufs_lbn_t lbn; 4867 { 4868 struct freefrag *freefrag; 4869 struct fs *fs; 4870 4871 fs = ip->i_fs; 4872 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 4873 panic("newfreefrag: frag size"); 4874 freefrag = malloc(sizeof(struct freefrag), 4875 M_FREEFRAG, M_SOFTDEP_FLAGS); 4876 workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump)); 4877 freefrag->ff_state = ATTACHED; 4878 LIST_INIT(&freefrag->ff_jwork); 4879 freefrag->ff_inum = ip->i_number; 4880 freefrag->ff_blkno = blkno; 4881 freefrag->ff_fragsize = size; 4882 4883 if (fs->fs_flags & FS_SUJ) { 4884 freefrag->ff_jdep = (struct worklist *) 4885 newjfreefrag(freefrag, ip, blkno, size, lbn); 4886 } else { 4887 freefrag->ff_state |= DEPCOMPLETE; 4888 freefrag->ff_jdep = NULL; 4889 } 4890 4891 return (freefrag); 4892 } 4893 4894 /* 4895 * This workitem de-allocates fragments that were replaced during 4896 * file block allocation. 4897 */ 4898 static void 4899 handle_workitem_freefrag(freefrag) 4900 struct freefrag *freefrag; 4901 { 4902 struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); 4903 struct workhead wkhd; 4904 4905 /* 4906 * It would be illegal to add new completion items to the 4907 * freefrag after it was schedule to be done so it must be 4908 * safe to modify the list head here. 4909 */ 4910 LIST_INIT(&wkhd); 4911 ACQUIRE_LOCK(&lk); 4912 LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); 4913 /* 4914 * If the journal has not been written we must cancel it here. 
4915 */ 4916 if (freefrag->ff_jdep) { 4917 if (freefrag->ff_jdep->wk_type != D_JNEWBLK) 4918 panic("handle_workitem_freefrag: Unexpected type %d\n", 4919 freefrag->ff_jdep->wk_type); 4920 cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd); 4921 } 4922 FREE_LOCK(&lk); 4923 ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, 4924 freefrag->ff_fragsize, freefrag->ff_inum, &wkhd); 4925 ACQUIRE_LOCK(&lk); 4926 WORKITEM_FREE(freefrag, D_FREEFRAG); 4927 FREE_LOCK(&lk); 4928 } 4929 4930 /* 4931 * Set up a dependency structure for an external attributes data block. 4932 * This routine follows much of the structure of softdep_setup_allocdirect. 4933 * See the description of softdep_setup_allocdirect above for details. 4934 */ 4935 void 4936 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) 4937 struct inode *ip; 4938 ufs_lbn_t off; 4939 ufs2_daddr_t newblkno; 4940 ufs2_daddr_t oldblkno; 4941 long newsize; 4942 long oldsize; 4943 struct buf *bp; 4944 { 4945 struct allocdirect *adp, *oldadp; 4946 struct allocdirectlst *adphead; 4947 struct freefrag *freefrag; 4948 struct inodedep *inodedep; 4949 struct jnewblk *jnewblk; 4950 struct newblk *newblk; 4951 struct mount *mp; 4952 ufs_lbn_t lbn; 4953 4954 if (off >= NXADDR) 4955 panic("softdep_setup_allocext: lbn %lld > NXADDR", 4956 (long long)off); 4957 4958 lbn = bp->b_lblkno; 4959 mp = UFSTOVFS(ip->i_ump); 4960 if (oldblkno && oldblkno != newblkno) 4961 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); 4962 else 4963 freefrag = NULL; 4964 4965 ACQUIRE_LOCK(&lk); 4966 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) 4967 panic("softdep_setup_allocext: lost block"); 4968 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 4969 ("softdep_setup_allocext: newblk already initialized")); 4970 /* 4971 * Convert the newblk to an allocdirect. 4972 */ 4973 newblk->nb_list.wk_type = D_ALLOCDIRECT; 4974 adp = (struct allocdirect *)newblk; 4975 newblk->nb_freefrag = freefrag; 4976 adp->ad_offset = off; 4977 adp->ad_oldblkno = oldblkno; 4978 adp->ad_newsize = newsize; 4979 adp->ad_oldsize = oldsize; 4980 adp->ad_state |= EXTDATA; 4981 4982 /* 4983 * Finish initializing the journal. 4984 */ 4985 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 4986 jnewblk->jn_ino = ip->i_number; 4987 jnewblk->jn_lbn = lbn; 4988 add_to_journal(&jnewblk->jn_list); 4989 } 4990 if (freefrag && freefrag->ff_jdep != NULL && 4991 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 4992 add_to_journal(freefrag->ff_jdep); 4993 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); 4994 adp->ad_inodedep = inodedep; 4995 4996 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); 4997 /* 4998 * The list of allocdirects must be kept in sorted and ascending 4999 * order so that the rollback routines can quickly determine the 5000 * first uncommitted block (the size of the file stored on disk 5001 * ends at the end of the lowest committed fragment, or if there 5002 * are no fragments, at the end of the highest committed block). 5003 * Since files generally grow, the typical case is that the new 5004 * block is to be added at the end of the list. We speed this 5005 * special case by checking against the last allocdirect in the 5006 * list before laboriously traversing the list looking for the 5007 * insertion point. 
5008 */ 5009 adphead = &inodedep->id_newextupdt; 5010 oldadp = TAILQ_LAST(adphead, allocdirectlst); 5011 if (oldadp == NULL || oldadp->ad_offset <= off) { 5012 /* insert at end of list */ 5013 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 5014 if (oldadp != NULL && oldadp->ad_offset == off) 5015 allocdirect_merge(adphead, adp, oldadp); 5016 FREE_LOCK(&lk); 5017 return; 5018 } 5019 TAILQ_FOREACH(oldadp, adphead, ad_next) { 5020 if (oldadp->ad_offset >= off) 5021 break; 5022 } 5023 if (oldadp == NULL) 5024 panic("softdep_setup_allocext: lost entry"); 5025 /* insert in middle of list */ 5026 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 5027 if (oldadp->ad_offset == off) 5028 allocdirect_merge(adphead, adp, oldadp); 5029 FREE_LOCK(&lk); 5030 } 5031 5032 /* 5033 * Indirect block allocation dependencies. 5034 * 5035 * The same dependencies that exist for a direct block also exist when 5036 * a new block is allocated and pointed to by an entry in a block of 5037 * indirect pointers. The undo/redo states described above are also 5038 * used here. Because an indirect block contains many pointers that 5039 * may have dependencies, a second copy of the entire in-memory indirect 5040 * block is kept. The buffer cache copy is always completely up-to-date. 5041 * The second copy, which is used only as a source for disk writes, 5042 * contains only the safe pointers (i.e., those that have no remaining 5043 * update dependencies). The second copy is freed when all pointers 5044 * are safe. The cache is not allowed to replace indirect blocks with 5045 * pending update dependencies. If a buffer containing an indirect 5046 * block with dependencies is written, these routines will mark it 5047 * dirty again. It can only be successfully written once all the 5048 * dependencies are removed. The ffs_fsync routine in conjunction with 5049 * softdep_sync_metadata work together to get all the dependencies 5050 * removed so that a file can be successfully written to disk. Three 5051 * procedures are used when setting up indirect block pointer 5052 * dependencies. The division is necessary because of the organization 5053 * of the "balloc" routine and because of the distinction between file 5054 * pages and file metadata blocks. 5055 */ 5056 5057 /* 5058 * Allocate a new allocindir structure. 
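 * As with softdep_setup_allocdirect(), a non-zero oldblkno means a
 * previously allocated block at this slot is being replaced (as
 * during block reallocation); in that case a freefrag covering a
 * full filesystem block is created here and released once the new
 * pointer is safe on disk.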
5059 */ 5060 static struct allocindir * 5061 newallocindir(ip, ptrno, newblkno, oldblkno, lbn) 5062 struct inode *ip; /* inode for file being extended */ 5063 int ptrno; /* offset of pointer in indirect block */ 5064 ufs2_daddr_t newblkno; /* disk block number being added */ 5065 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 5066 ufs_lbn_t lbn; 5067 { 5068 struct newblk *newblk; 5069 struct allocindir *aip; 5070 struct freefrag *freefrag; 5071 struct jnewblk *jnewblk; 5072 5073 if (oldblkno) 5074 freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn); 5075 else 5076 freefrag = NULL; 5077 ACQUIRE_LOCK(&lk); 5078 if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0) 5079 panic("new_allocindir: lost block"); 5080 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 5081 ("newallocindir: newblk already initialized")); 5082 newblk->nb_list.wk_type = D_ALLOCINDIR; 5083 newblk->nb_freefrag = freefrag; 5084 aip = (struct allocindir *)newblk; 5085 aip->ai_offset = ptrno; 5086 aip->ai_oldblkno = oldblkno; 5087 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 5088 jnewblk->jn_ino = ip->i_number; 5089 jnewblk->jn_lbn = lbn; 5090 add_to_journal(&jnewblk->jn_list); 5091 } 5092 if (freefrag && freefrag->ff_jdep != NULL && 5093 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 5094 add_to_journal(freefrag->ff_jdep); 5095 return (aip); 5096 } 5097 5098 /* 5099 * Called just before setting an indirect block pointer 5100 * to a newly allocated file page. 5101 */ 5102 void 5103 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 5104 struct inode *ip; /* inode for file being extended */ 5105 ufs_lbn_t lbn; /* allocated block number within file */ 5106 struct buf *bp; /* buffer with indirect blk referencing page */ 5107 int ptrno; /* offset of pointer in indirect block */ 5108 ufs2_daddr_t newblkno; /* disk block number being added */ 5109 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 5110 struct buf *nbp; /* buffer holding allocated page */ 5111 { 5112 struct inodedep *inodedep; 5113 struct allocindir *aip; 5114 struct pagedep *pagedep; 5115 struct mount *mp; 5116 5117 if (lbn != nbp->b_lblkno) 5118 panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", 5119 lbn, bp->b_lblkno); 5120 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); 5121 mp = UFSTOVFS(ip->i_ump); 5122 aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); 5123 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 5124 /* 5125 * If we are allocating a directory page, then we must 5126 * allocate an associated pagedep to track additions and 5127 * deletions. 5128 */ 5129 if ((ip->i_mode & IFMT) == IFDIR && 5130 pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0) 5131 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); 5132 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 5133 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); 5134 FREE_LOCK(&lk); 5135 } 5136 5137 /* 5138 * Called just before setting an indirect block pointer to a 5139 * newly allocated indirect block. 
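 * This is the metadata counterpart of softdep_setup_allocindir_page()
 * above: both build an allocindir with newallocindir() and finish it
 * with setup_allocindir_phase2().  The difference is that the block
 * referenced here is another indirect block rather than a file page,
 * so no pagedep is needed and oldblkno is always zero.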
5140 */ 5141 void 5142 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 5143 struct buf *nbp; /* newly allocated indirect block */ 5144 struct inode *ip; /* inode for file being extended */ 5145 struct buf *bp; /* indirect block referencing allocated block */ 5146 int ptrno; /* offset of pointer in indirect block */ 5147 ufs2_daddr_t newblkno; /* disk block number being added */ 5148 { 5149 struct inodedep *inodedep; 5150 struct allocindir *aip; 5151 ufs_lbn_t lbn; 5152 5153 lbn = nbp->b_lblkno; 5154 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); 5155 aip = newallocindir(ip, ptrno, newblkno, 0, lbn); 5156 inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); 5157 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 5158 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); 5159 FREE_LOCK(&lk); 5160 } 5161 5162 static void 5163 indirdep_complete(indirdep) 5164 struct indirdep *indirdep; 5165 { 5166 struct allocindir *aip; 5167 5168 LIST_REMOVE(indirdep, ir_next); 5169 indirdep->ir_state &= ~ONDEPLIST; 5170 5171 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { 5172 LIST_REMOVE(aip, ai_next); 5173 free_newblk(&aip->ai_block); 5174 } 5175 /* 5176 * If this indirdep is not attached to a buf it was simply waiting 5177 * on completion to clear completehd. free_indirdep() asserts 5178 * that nothing is dangling. 5179 */ 5180 if ((indirdep->ir_state & ONWORKLIST) == 0) 5181 free_indirdep(indirdep); 5182 } 5183 5184 /* 5185 * Called to finish the allocation of the "aip" allocated 5186 * by one of the two routines above. 5187 */ 5188 static void 5189 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) 5190 struct buf *bp; /* in-memory copy of the indirect block */ 5191 struct inode *ip; /* inode for file being extended */ 5192 struct inodedep *inodedep; /* Inodedep for ip */ 5193 struct allocindir *aip; /* allocindir allocated by the above routines */ 5194 ufs_lbn_t lbn; /* Logical block number for this block. */ 5195 { 5196 struct worklist *wk; 5197 struct fs *fs; 5198 struct newblk *newblk; 5199 struct indirdep *indirdep, *newindirdep; 5200 struct allocindir *oldaip; 5201 struct freefrag *freefrag; 5202 struct mount *mp; 5203 ufs2_daddr_t blkno; 5204 5205 mp = UFSTOVFS(ip->i_ump); 5206 fs = ip->i_fs; 5207 mtx_assert(&lk, MA_OWNED); 5208 if (bp->b_lblkno >= 0) 5209 panic("setup_allocindir_phase2: not indir blk"); 5210 for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) { 5211 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 5212 if (wk->wk_type != D_INDIRDEP) 5213 continue; 5214 indirdep = WK_INDIRDEP(wk); 5215 break; 5216 } 5217 if (indirdep == NULL && newindirdep) { 5218 indirdep = newindirdep; 5219 newindirdep = NULL; 5220 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); 5221 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, 5222 &newblk)) { 5223 indirdep->ir_state |= ONDEPLIST; 5224 LIST_INSERT_HEAD(&newblk->nb_indirdeps, 5225 indirdep, ir_next); 5226 } else 5227 indirdep->ir_state |= DEPCOMPLETE; 5228 } 5229 if (indirdep) { 5230 aip->ai_indirdep = indirdep; 5231 /* 5232 * Check to see if there is an existing dependency 5233 * for this block. If there is, merge the old 5234 * dependency into the new one. This happens 5235 * as a result of reallocblk only. 
5236 */ 5237 if (aip->ai_oldblkno == 0) 5238 oldaip = NULL; 5239 else 5240 5241 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, 5242 ai_next) 5243 if (oldaip->ai_offset == aip->ai_offset) 5244 break; 5245 if (oldaip != NULL) 5246 freefrag = allocindir_merge(aip, oldaip); 5247 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); 5248 KASSERT(aip->ai_offset >= 0 && 5249 aip->ai_offset < NINDIR(ip->i_ump->um_fs), 5250 ("setup_allocindir_phase2: Bad offset %d", 5251 aip->ai_offset)); 5252 KASSERT(indirdep->ir_savebp != NULL, 5253 ("setup_allocindir_phase2 NULL ir_savebp")); 5254 if (ip->i_ump->um_fstype == UFS1) 5255 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data) 5256 [aip->ai_offset] = aip->ai_oldblkno; 5257 else 5258 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data) 5259 [aip->ai_offset] = aip->ai_oldblkno; 5260 FREE_LOCK(&lk); 5261 if (freefrag != NULL) 5262 handle_workitem_freefrag(freefrag); 5263 } else 5264 FREE_LOCK(&lk); 5265 if (newindirdep) { 5266 newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE; 5267 brelse(newindirdep->ir_savebp); 5268 ACQUIRE_LOCK(&lk); 5269 WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP); 5270 if (indirdep) 5271 break; 5272 FREE_LOCK(&lk); 5273 } 5274 if (indirdep) { 5275 ACQUIRE_LOCK(&lk); 5276 break; 5277 } 5278 newindirdep = malloc(sizeof(struct indirdep), 5279 M_INDIRDEP, M_SOFTDEP_FLAGS); 5280 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); 5281 newindirdep->ir_state = ATTACHED; 5282 if (ip->i_ump->um_fstype == UFS1) 5283 newindirdep->ir_state |= UFS1FMT; 5284 newindirdep->ir_saveddata = NULL; 5285 LIST_INIT(&newindirdep->ir_deplisthd); 5286 LIST_INIT(&newindirdep->ir_donehd); 5287 LIST_INIT(&newindirdep->ir_writehd); 5288 LIST_INIT(&newindirdep->ir_completehd); 5289 LIST_INIT(&newindirdep->ir_jwork); 5290 LIST_INIT(&newindirdep->ir_jnewblkhd); 5291 if (bp->b_blkno == bp->b_lblkno) { 5292 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, 5293 NULL, NULL); 5294 bp->b_blkno = blkno; 5295 } 5296 newindirdep->ir_savebp = 5297 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); 5298 BUF_KERNPROC(newindirdep->ir_savebp); 5299 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); 5300 ACQUIRE_LOCK(&lk); 5301 } 5302 } 5303 5304 /* 5305 * Merge two allocindirs which refer to the same block. Move newblock 5306 * dependencies and setup the freefrags appropriately. 5307 */ 5308 static struct freefrag * 5309 allocindir_merge(aip, oldaip) 5310 struct allocindir *aip; 5311 struct allocindir *oldaip; 5312 { 5313 struct newdirblk *newdirblk; 5314 struct freefrag *freefrag; 5315 struct worklist *wk; 5316 5317 if (oldaip->ai_newblkno != aip->ai_oldblkno) 5318 panic("allocindir_merge: blkno"); 5319 aip->ai_oldblkno = oldaip->ai_oldblkno; 5320 freefrag = aip->ai_freefrag; 5321 aip->ai_freefrag = oldaip->ai_freefrag; 5322 oldaip->ai_freefrag = NULL; 5323 KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); 5324 /* 5325 * If we are tracking a new directory-block allocation, 5326 * move it from the old allocindir to the new allocindir. 5327 */ 5328 if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { 5329 newdirblk = WK_NEWDIRBLK(wk); 5330 WORKLIST_REMOVE(&newdirblk->db_list); 5331 if (!LIST_EMPTY(&oldaip->ai_newdirblk)) 5332 panic("allocindir_merge: extra newdirblk"); 5333 WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list); 5334 } 5335 /* 5336 * We can skip journaling for this freefrag and just complete 5337 * any pending journal work for the allocindir that is being 5338 * removed after the freefrag completes. 
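 * Concretely, the jfreefrag planned for this freefrag is canceled
 * below and the freefrag instead inherits the old allocindir's
 * pending journal work via cancel_newblk(), to be completed along
 * with the freefrag itself.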
5339 */ 5340 if (freefrag->ff_jdep) 5341 cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep)); 5342 LIST_REMOVE(oldaip, ai_next); 5343 freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block, 5344 &freefrag->ff_list, &freefrag->ff_jwork); 5345 free_newblk(&oldaip->ai_block); 5346 5347 return (freefrag); 5348 } 5349 5350 static inline void 5351 setup_freedirect(freeblks, ip, i, needj) 5352 struct freeblks *freeblks; 5353 struct inode *ip; 5354 int i; 5355 int needj; 5356 { 5357 ufs2_daddr_t blkno; 5358 int frags; 5359 5360 blkno = DIP(ip, i_db[i]); 5361 if (blkno == 0) 5362 return; 5363 DIP_SET(ip, i_db[i], 0); 5364 frags = sblksize(ip->i_fs, ip->i_size, i); 5365 frags = numfrags(ip->i_fs, frags); 5366 newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, needj); 5367 } 5368 5369 static inline void 5370 setup_freeext(freeblks, ip, i, needj) 5371 struct freeblks *freeblks; 5372 struct inode *ip; 5373 int i; 5374 int needj; 5375 { 5376 ufs2_daddr_t blkno; 5377 int frags; 5378 5379 blkno = ip->i_din2->di_extb[i]; 5380 if (blkno == 0) 5381 return; 5382 ip->i_din2->di_extb[i] = 0; 5383 frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i); 5384 frags = numfrags(ip->i_fs, frags); 5385 newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, needj); 5386 } 5387 5388 static inline void 5389 setup_freeindir(freeblks, ip, i, lbn, needj) 5390 struct freeblks *freeblks; 5391 struct inode *ip; 5392 ufs_lbn_t lbn; 5393 int i; 5394 int needj; 5395 { 5396 ufs2_daddr_t blkno; 5397 5398 blkno = DIP(ip, i_ib[i]); 5399 if (blkno == 0) 5400 return; 5401 DIP_SET(ip, i_ib[i], 0); 5402 newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag, 5403 needj); 5404 } 5405 5406 static inline struct freeblks * 5407 newfreeblks(mp, ip) 5408 struct mount *mp; 5409 struct inode *ip; 5410 { 5411 struct freeblks *freeblks; 5412 5413 freeblks = malloc(sizeof(struct freeblks), 5414 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); 5415 workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); 5416 LIST_INIT(&freeblks->fb_jfreeblkhd); 5417 LIST_INIT(&freeblks->fb_jwork); 5418 freeblks->fb_state = ATTACHED; 5419 freeblks->fb_uid = ip->i_uid; 5420 freeblks->fb_previousinum = ip->i_number; 5421 freeblks->fb_devvp = ip->i_devvp; 5422 freeblks->fb_chkcnt = 0; 5423 5424 return (freeblks); 5425 } 5426 5427 /* 5428 * Block de-allocation dependencies. 5429 * 5430 * When blocks are de-allocated, the on-disk pointers must be nullified before 5431 * the blocks are made available for use by other files. (The true 5432 * requirement is that old pointers must be nullified before new on-disk 5433 * pointers are set. We chose this slightly more stringent requirement to 5434 * reduce complexity.) Our implementation handles this dependency by updating 5435 * the inode (or indirect block) appropriately but delaying the actual block 5436 * de-allocation (i.e., freemap and free space count manipulation) until 5437 * after the updated versions reach stable storage. After the disk is 5438 * updated, the blocks can be safely de-allocated whenever it is convenient. 5439 * This implementation handles only the common case of reducing a file's 5440 * length to zero. Other cases are handled by the conventional synchronous 5441 * write approach. 5442 * 5443 * The ffs implementation with which we worked double-checks 5444 * the state of the block pointers and file size as it reduces 5445 * a file's length. Some of this code is replicated here in our 5446 * soft updates implementation. 
The freeblks->fb_chkcnt field is 5447 * used to transfer a part of this information to the procedure 5448 * that eventually de-allocates the blocks. 5449 * 5450 * This routine should be called from the routine that shortens 5451 * a file's length, before the inode's size or block pointers 5452 * are modified. It will save the block pointer information for 5453 * later release and zero the inode so that the calling routine 5454 * can release it. 5455 */ 5456 void 5457 softdep_setup_freeblocks(ip, length, flags) 5458 struct inode *ip; /* The inode whose length is to be reduced */ 5459 off_t length; /* The new length for the file */ 5460 int flags; /* IO_EXT and/or IO_NORMAL */ 5461 { 5462 struct ufs1_dinode *dp1; 5463 struct ufs2_dinode *dp2; 5464 struct freeblks *freeblks; 5465 struct inodedep *inodedep; 5466 struct allocdirect *adp; 5467 struct jfreeblk *jfreeblk; 5468 struct buf *bp; 5469 struct fs *fs; 5470 ufs2_daddr_t extblocks, datablocks; 5471 struct mount *mp; 5472 int i, delay, error; 5473 ufs_lbn_t tmpval; 5474 ufs_lbn_t lbn; 5475 int needj; 5476 5477 fs = ip->i_fs; 5478 mp = UFSTOVFS(ip->i_ump); 5479 if (length != 0) 5480 panic("softdep_setup_freeblocks: non-zero length"); 5481 freeblks = newfreeblks(mp, ip); 5482 ACQUIRE_LOCK(&lk); 5483 /* 5484 * If we're truncating a removed file that will never be written 5485 * we don't need to journal the block frees. The canceled journals 5486 * for the allocations will suffice. 5487 */ 5488 inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 5489 if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED || 5490 (fs->fs_flags & FS_SUJ) == 0) 5491 needj = 0; 5492 else 5493 needj = 1; 5494 FREE_LOCK(&lk); 5495 extblocks = 0; 5496 if (fs->fs_magic == FS_UFS2_MAGIC) 5497 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); 5498 datablocks = DIP(ip, i_blocks) - extblocks; 5499 if ((flags & IO_NORMAL) != 0) { 5500 for (i = 0; i < NDADDR; i++) 5501 setup_freedirect(freeblks, ip, i, needj); 5502 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; 5503 i++, lbn += tmpval, tmpval *= NINDIR(fs)) 5504 setup_freeindir(freeblks, ip, i, -lbn -i, needj); 5505 ip->i_size = 0; 5506 DIP_SET(ip, i_size, 0); 5507 freeblks->fb_chkcnt = datablocks; 5508 UFS_LOCK(ip->i_ump); 5509 fs->fs_pendingblocks += datablocks; 5510 UFS_UNLOCK(ip->i_ump); 5511 } 5512 if ((flags & IO_EXT) != 0) { 5513 for (i = 0; i < NXADDR; i++) 5514 setup_freeext(freeblks, ip, i, needj); 5515 ip->i_din2->di_extsize = 0; 5516 freeblks->fb_chkcnt += extblocks; 5517 } 5518 if (LIST_EMPTY(&freeblks->fb_jfreeblkhd)) 5519 needj = 0; 5520 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt); 5521 /* 5522 * Push the zero'ed inode to to its disk buffer so that we are free 5523 * to delete its dependencies below. Once the dependencies are gone 5524 * the buffer can be safely released. 5525 */ 5526 if ((error = bread(ip->i_devvp, 5527 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 5528 (int)fs->fs_bsize, NOCRED, &bp)) != 0) { 5529 brelse(bp); 5530 softdep_error("softdep_setup_freeblocks", error); 5531 } 5532 if (ip->i_ump->um_fstype == UFS1) { 5533 dp1 = ((struct ufs1_dinode *)bp->b_data + 5534 ino_to_fsbo(fs, ip->i_number)); 5535 ip->i_din1->di_freelink = dp1->di_freelink; 5536 *dp1 = *ip->i_din1; 5537 } else { 5538 dp2 = ((struct ufs2_dinode *)bp->b_data + 5539 ino_to_fsbo(fs, ip->i_number)); 5540 ip->i_din2->di_freelink = dp2->di_freelink; 5541 *dp2 = *ip->i_din2; 5542 } 5543 /* 5544 * Find and eliminate any inode dependencies. 
5545 */ 5546 ACQUIRE_LOCK(&lk); 5547 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 5548 if ((inodedep->id_state & IOSTARTED) != 0) 5549 panic("softdep_setup_freeblocks: inode busy"); 5550 /* 5551 * Add the freeblks structure to the list of operations that 5552 * must await the zero'ed inode being written to disk. If we 5553 * still have a bitmap dependency (delay == 0), then the inode 5554 * has never been written to disk, so we can process the 5555 * freeblks below once we have deleted the dependencies. 5556 */ 5557 delay = (inodedep->id_state & DEPCOMPLETE); 5558 if (delay) 5559 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); 5560 else if (needj) 5561 freeblks->fb_state |= COMPLETE; 5562 /* 5563 * Because the file length has been truncated to zero, any 5564 * pending block allocation dependency structures associated 5565 * with this inode are obsolete and can simply be de-allocated. 5566 * We must first merge the two dependency lists to get rid of 5567 * any duplicate freefrag structures, then purge the merged list. 5568 * If we still have a bitmap dependency, then the inode has never 5569 * been written to disk, so we can free any fragments without delay. 5570 */ 5571 if (flags & IO_NORMAL) { 5572 merge_inode_lists(&inodedep->id_newinoupdt, 5573 &inodedep->id_inoupdt); 5574 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) 5575 cancel_allocdirect(&inodedep->id_inoupdt, adp, 5576 freeblks, delay); 5577 } 5578 if (flags & IO_EXT) { 5579 merge_inode_lists(&inodedep->id_newextupdt, 5580 &inodedep->id_extupdt); 5581 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) 5582 cancel_allocdirect(&inodedep->id_extupdt, adp, 5583 freeblks, delay); 5584 } 5585 LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps) 5586 add_to_journal(&jfreeblk->jf_list); 5587 5588 FREE_LOCK(&lk); 5589 bdwrite(bp); 5590 softdep_trunc_deps(ITOV(ip), freeblks, 0, 0, flags); 5591 ACQUIRE_LOCK(&lk); 5592 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 5593 (void) free_inodedep(inodedep); 5594 5595 if (delay || needj) 5596 freeblks->fb_state |= DEPCOMPLETE; 5597 if (delay) { 5598 /* 5599 * If the inode with zeroed block pointers is now on disk 5600 * we can start freeing blocks. Add freeblks to the worklist 5601 * instead of calling handle_workitem_freeblocks directly as 5602 * it is more likely that additional IO is needed to complete 5603 * the request here than in the !delay case. 5604 */ 5605 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 5606 add_to_worklist(&freeblks->fb_list, 1); 5607 } 5608 if (needj && LIST_EMPTY(&freeblks->fb_jfreeblkhd)) 5609 needj = 0; 5610 5611 FREE_LOCK(&lk); 5612 /* 5613 * If the inode has never been written to disk (delay == 0) and 5614 * we're not waiting on any journal writes, then we can process the 5615 * freeblks now that we have deleted the dependencies. 5616 */ 5617 if (!delay && !needj) 5618 handle_workitem_freeblocks(freeblks, 0); 5619 } 5620 5621 /* 5622 * Eliminate any dependencies that exist in memory beyond lblkno:off 5623 */ 5624 static void 5625 softdep_trunc_deps(vp, freeblks, lblkno, off, flags) 5626 struct vnode *vp; 5627 struct freeblks *freeblks; 5628 ufs_lbn_t lblkno; 5629 int off; 5630 int flags; 5631 { 5632 struct inodedep *inodedep; 5633 struct bufobj *bo; 5634 struct buf *bp; 5635 struct mount *mp; 5636 ino_t ino; 5637 5638 /* 5639 * We must wait for any I/O in progress to finish so that 5640 * all potential buffers on the dirty list will be visible. 
5641 * Once they are all there, walk the list and get rid of 5642 * any dependencies. 5643 */ 5644 ino = VTOI(vp)->i_number; 5645 mp = vp->v_mount; 5646 bo = &vp->v_bufobj; 5647 BO_LOCK(bo); 5648 drain_output(vp); 5649 restart: 5650 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { 5651 if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || 5652 ((flags & IO_NORMAL) == 0 && 5653 (bp->b_xflags & BX_ALTDATA) == 0)) 5654 continue; 5655 if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL) 5656 goto restart; 5657 BO_UNLOCK(bo); 5658 ACQUIRE_LOCK(&lk); 5659 (void) inodedep_lookup(mp, ino, 0, &inodedep); 5660 if (deallocate_dependencies(bp, inodedep, freeblks, 0)) 5661 bp->b_flags |= B_INVAL | B_NOCACHE; 5662 FREE_LOCK(&lk); 5663 brelse(bp); 5664 BO_LOCK(bo); 5665 goto restart; 5666 } 5667 BO_UNLOCK(bo); 5668 } 5669 5670 static int 5671 cancel_pagedep(pagedep, inodedep, freeblks) 5672 struct pagedep *pagedep; 5673 struct inodedep *inodedep; 5674 struct freeblks *freeblks; 5675 { 5676 struct newdirblk *newdirblk; 5677 struct jremref *jremref; 5678 struct jmvref *jmvref; 5679 struct dirrem *dirrem; 5680 int i; 5681 5682 /* 5683 * There should be no directory add dependencies present 5684 * as the directory could not be truncated until all 5685 * children were removed. 5686 */ 5687 KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, 5688 ("deallocate_dependencies: pendinghd != NULL")); 5689 for (i = 0; i < DAHASHSZ; i++) 5690 KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, 5691 ("deallocate_dependencies: diraddhd != NULL")); 5692 /* 5693 * Copy any directory remove dependencies to the list 5694 * to be processed after the zero'ed inode is written. 5695 * If the inode has already been written, then they 5696 * can be dumped directly onto the work list. 5697 */ 5698 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { 5699 /* 5700 * If there are any dirrems we wait for 5701 * the journal write to complete and 5702 * then restart the buf scan as the lock 5703 * has been dropped. 5704 */ 5705 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) 5706 != NULL) { 5707 stat_jwait_filepage++; 5708 jwait(&jremref->jr_list); 5709 return (ERESTART); 5710 } 5711 LIST_REMOVE(dirrem, dm_next); 5712 dirrem->dm_dirinum = pagedep->pd_ino; 5713 if (inodedep == NULL || 5714 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 5715 dirrem->dm_state |= COMPLETE; 5716 add_to_worklist(&dirrem->dm_list, 0); 5717 } else 5718 WORKLIST_INSERT(&inodedep->id_bufwait, 5719 &dirrem->dm_list); 5720 } 5721 if ((pagedep->pd_state & NEWBLOCK) != 0) { 5722 newdirblk = pagedep->pd_newdirblk; 5723 WORKLIST_REMOVE(&newdirblk->db_list); 5724 free_newdirblk(newdirblk); 5725 } 5726 while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) { 5727 stat_jwait_filepage++; 5728 jwait(&jmvref->jm_list); 5729 return (ERESTART); 5730 } 5731 WORKLIST_REMOVE(&pagedep->pd_list); 5732 LIST_REMOVE(pagedep, pd_hash); 5733 WORKITEM_FREE(pagedep, D_PAGEDEP); 5734 return (0); 5735 } 5736 5737 /* 5738 * Reclaim any dependency structures from a buffer that is about to 5739 * be reallocated to a new vnode. The buffer must be locked, thus, 5740 * no I/O completion operations can occur while we are manipulating 5741 * its associated dependencies. The mutex is held so that other I/O's 5742 * associated with related dependencies do not occur. Returns 1 if 5743 * all dependencies were cleared, 0 otherwise. 
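 * A zero return here normally means that cancel_pagedep() had to wait on a
 * journal write (it returns ERESTART after jwait()); in that case the caller
 * leaves the buffer valid and simply rescans the dirty list.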
5744 */ 5745 static int 5746 deallocate_dependencies(bp, inodedep, freeblks, off) 5747 struct buf *bp; 5748 struct inodedep *inodedep; 5749 struct freeblks *freeblks; 5750 int off; 5751 { 5752 struct worklist *wk; 5753 struct indirdep *indirdep; 5754 struct allocindir *aip; 5755 struct pagedep *pagedep; 5756 5757 mtx_assert(&lk, MA_OWNED); 5758 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 5759 switch (wk->wk_type) { 5760 5761 case D_INDIRDEP: 5762 indirdep = WK_INDIRDEP(wk); 5763 if (bp->b_lblkno >= 0 || 5764 bp->b_blkno != indirdep->ir_savebp->b_lblkno) 5765 panic("deallocate_dependencies: not indir"); 5766 cancel_indirdep(indirdep, bp, inodedep, freeblks); 5767 continue; 5768 5769 case D_PAGEDEP: 5770 pagedep = WK_PAGEDEP(wk); 5771 if (cancel_pagedep(pagedep, inodedep, freeblks)) 5772 return (0); 5773 continue; 5774 5775 case D_ALLOCINDIR: 5776 aip = WK_ALLOCINDIR(wk); 5777 cancel_allocindir(aip, inodedep, freeblks); 5778 continue; 5779 5780 case D_ALLOCDIRECT: 5781 case D_INODEDEP: 5782 panic("deallocate_dependencies: Unexpected type %s", 5783 TYPENAME(wk->wk_type)); 5784 /* NOTREACHED */ 5785 5786 default: 5787 panic("deallocate_dependencies: Unknown type %s", 5788 TYPENAME(wk->wk_type)); 5789 /* NOTREACHED */ 5790 } 5791 } 5792 5793 return (1); 5794 } 5795 5796 /* 5797 * An allocdirect is being canceled due to a truncate. We must make sure 5798 * the journal entry is released in concert with the blkfree that releases 5799 * the storage. Completed journal entries must not be released until the 5800 * space is no longer pointed to by the inode or in the bitmap. 5801 */ 5802 static void 5803 cancel_allocdirect(adphead, adp, freeblks, delay) 5804 struct allocdirectlst *adphead; 5805 struct allocdirect *adp; 5806 struct freeblks *freeblks; 5807 int delay; 5808 { 5809 struct freework *freework; 5810 struct newblk *newblk; 5811 struct worklist *wk; 5812 ufs_lbn_t lbn; 5813 5814 TAILQ_REMOVE(adphead, adp, ad_next); 5815 newblk = (struct newblk *)adp; 5816 /* 5817 * If the journal hasn't been written the jnewblk must be passed 5818 * to the call to ffs_blkfree that reclaims the space. We accomplish 5819 * this by linking the journal dependency into the freework to be 5820 * freed when freework_freeblock() is called. If the journal has 5821 * been written we can simply reclaim the journal space when the 5822 * freeblks work is complete. 5823 */ 5824 if (newblk->nb_jnewblk == NULL) { 5825 if (cancel_newblk(newblk, NULL, &freeblks->fb_jwork) != NULL) 5826 panic("cancel_allocdirect: Unexpected dependency"); 5827 goto found; 5828 } 5829 lbn = newblk->nb_jnewblk->jn_lbn; 5830 /* 5831 * Find the correct freework structure so it releases the canceled 5832 * journal when the bitmap is cleared. This preserves rollback 5833 * until the allocation is reverted. 5834 */ 5835 LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { 5836 freework = WK_FREEWORK(wk); 5837 if (freework->fw_lbn != lbn) 5838 continue; 5839 freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list, 5840 &freework->fw_jwork); 5841 goto found; 5842 } 5843 panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn); 5844 found: 5845 if (delay) 5846 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, 5847 &newblk->nb_list); 5848 else 5849 free_newblk(newblk); 5850 return; 5851 } 5852 5853 5854 /* 5855 * Cancel a new block allocation. May be an indirect or direct block. We 5856 * remove it from various lists and return any journal record that needs to 5857 * be resolved by the caller. 
5858 * 5859 * A special consideration is made for indirects which were never pointed 5860 * at on disk and will never be found once this block is released. 5861 */ 5862 static struct jnewblk * 5863 cancel_newblk(newblk, wk, wkhd) 5864 struct newblk *newblk; 5865 struct worklist *wk; 5866 struct workhead *wkhd; 5867 { 5868 struct indirdep *indirdep; 5869 struct allocindir *aip; 5870 struct jnewblk *jnewblk; 5871 5872 while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) { 5873 indirdep->ir_state &= ~ONDEPLIST; 5874 LIST_REMOVE(indirdep, ir_next); 5875 /* 5876 * If an indirdep is not on the buf worklist we need to 5877 * free it here as deallocate_dependencies() will never 5878 * find it. These pointers were never visible on disk and 5879 * can be discarded immediately. 5880 */ 5881 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { 5882 LIST_REMOVE(aip, ai_next); 5883 if (cancel_newblk(&aip->ai_block, NULL, wkhd) != NULL) 5884 panic("cancel_newblk: aip has journal entry"); 5885 free_newblk(&aip->ai_block); 5886 } 5887 /* 5888 * If this indirdep is not attached to a buf it was simply 5889 * waiting on completion to clear completehd. free_indirdep() 5890 * asserts that nothing is dangling. 5891 */ 5892 if ((indirdep->ir_state & ONWORKLIST) == 0) 5893 free_indirdep(indirdep); 5894 } 5895 if (newblk->nb_state & ONDEPLIST) { 5896 newblk->nb_state &= ~ONDEPLIST; 5897 LIST_REMOVE(newblk, nb_deps); 5898 } 5899 if (newblk->nb_state & ONWORKLIST) 5900 WORKLIST_REMOVE(&newblk->nb_list); 5901 /* 5902 * If the journal entry hasn't been written we save a pointer to 5903 * the dependency that frees it until it is written or the 5904 * superseding operation completes. 5905 */ 5906 jnewblk = newblk->nb_jnewblk; 5907 if (jnewblk != NULL) { 5908 newblk->nb_jnewblk = NULL; 5909 jnewblk->jn_dep = wk; 5910 } 5911 if (!LIST_EMPTY(&newblk->nb_jwork)) 5912 jwork_move(wkhd, &newblk->nb_jwork); 5913 5914 return (jnewblk); 5915 } 5916 5917 /* 5918 * Free a newblk. Generate a new freefrag work request if appropriate. 5919 * This must be called after the inode pointer and any direct block pointers 5920 * are valid or fully removed via truncate or frag extension. 5921 */ 5922 static void 5923 free_newblk(newblk) 5924 struct newblk *newblk; 5925 { 5926 struct indirdep *indirdep; 5927 struct newdirblk *newdirblk; 5928 struct freefrag *freefrag; 5929 struct worklist *wk; 5930 5931 mtx_assert(&lk, MA_OWNED); 5932 if (newblk->nb_state & ONDEPLIST) 5933 LIST_REMOVE(newblk, nb_deps); 5934 if (newblk->nb_state & ONWORKLIST) 5935 WORKLIST_REMOVE(&newblk->nb_list); 5936 LIST_REMOVE(newblk, nb_hash); 5937 if ((freefrag = newblk->nb_freefrag) != NULL) { 5938 freefrag->ff_state |= COMPLETE; 5939 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 5940 add_to_worklist(&freefrag->ff_list, 0); 5941 } 5942 if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) { 5943 newdirblk = WK_NEWDIRBLK(wk); 5944 WORKLIST_REMOVE(&newdirblk->db_list); 5945 if (!LIST_EMPTY(&newblk->nb_newdirblk)) 5946 panic("free_newblk: extra newdirblk"); 5947 free_newdirblk(newdirblk); 5948 } 5949 while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) { 5950 indirdep->ir_state |= DEPCOMPLETE; 5951 indirdep_complete(indirdep); 5952 } 5953 KASSERT(newblk->nb_jnewblk == NULL, 5954 ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk)); 5955 handle_jwork(&newblk->nb_jwork); 5956 newblk->nb_list.wk_type = D_NEWBLK; 5957 WORKITEM_FREE(newblk, D_NEWBLK); 5958 } 5959 5960 /* 5961 * Free a newdirblk. 
Clear the NEWBLOCK flag on its associated pagedep. 5962 * This routine must be called with splbio interrupts blocked. 5963 */ 5964 static void 5965 free_newdirblk(newdirblk) 5966 struct newdirblk *newdirblk; 5967 { 5968 struct pagedep *pagedep; 5969 struct diradd *dap; 5970 struct worklist *wk; 5971 int i; 5972 5973 mtx_assert(&lk, MA_OWNED); 5974 /* 5975 * If the pagedep is still linked onto the directory buffer 5976 * dependency chain, then some of the entries on the 5977 * pd_pendinghd list may not be committed to disk yet. In 5978 * this case, we will simply clear the NEWBLOCK flag and 5979 * let the pd_pendinghd list be processed when the pagedep 5980 * is next written. If the pagedep is no longer on the buffer 5981 * dependency chain, then all the entries on the pd_pending 5982 * list are committed to disk and we can free them here. 5983 */ 5984 pagedep = newdirblk->db_pagedep; 5985 pagedep->pd_state &= ~NEWBLOCK; 5986 if ((pagedep->pd_state & ONWORKLIST) == 0) 5987 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 5988 free_diradd(dap, NULL); 5989 /* 5990 * If no dependencies remain, the pagedep will be freed. 5991 */ 5992 for (i = 0; i < DAHASHSZ; i++) 5993 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) 5994 break; 5995 if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 && 5996 LIST_EMPTY(&pagedep->pd_jmvrefhd)) { 5997 KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL, 5998 ("free_newdirblk: Freeing non-free pagedep %p", pagedep)); 5999 LIST_REMOVE(pagedep, pd_hash); 6000 WORKITEM_FREE(pagedep, D_PAGEDEP); 6001 } 6002 /* Should only ever be one item in the list. */ 6003 while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { 6004 WORKLIST_REMOVE(wk); 6005 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 6006 } 6007 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 6008 } 6009 6010 /* 6011 * Prepare an inode to be freed. The actual free operation is not 6012 * done until the zero'ed inode has been written to disk. 6013 */ 6014 void 6015 softdep_freefile(pvp, ino, mode) 6016 struct vnode *pvp; 6017 ino_t ino; 6018 int mode; 6019 { 6020 struct inode *ip = VTOI(pvp); 6021 struct inodedep *inodedep; 6022 struct freefile *freefile; 6023 6024 /* 6025 * This sets up the inode de-allocation dependency. 6026 */ 6027 freefile = malloc(sizeof(struct freefile), 6028 M_FREEFILE, M_SOFTDEP_FLAGS); 6029 workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount); 6030 freefile->fx_mode = mode; 6031 freefile->fx_oldinum = ino; 6032 freefile->fx_devvp = ip->i_devvp; 6033 LIST_INIT(&freefile->fx_jwork); 6034 UFS_LOCK(ip->i_ump); 6035 ip->i_fs->fs_pendinginodes += 1; 6036 UFS_UNLOCK(ip->i_ump); 6037 6038 /* 6039 * If the inodedep does not exist, then the zero'ed inode has 6040 * been written to disk. If the allocated inode has never been 6041 * written to disk, then the on-disk inode is zero'ed. In either 6042 * case we can free the file immediately. If the journal was 6043 * canceled before being written the inode will never make it to 6044 * disk and we must send the canceled journal entries to 6045 * ffs_freefile() to be cleared in conjunction with the bitmap. 6046 * Any blocks waiting on the inode to write can be safely freed 6047 * here as it will never be written. 6048 */ 6049 ACQUIRE_LOCK(&lk); 6050 inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); 6051 /* 6052 * Remove this inode from the unlinked list and set 6053 * GOINGAWAY as appropriate to indicate that this inode 6054 * will never be written.
6055 */ 6056 if (inodedep && inodedep->id_state & UNLINKED) { 6057 /* 6058 * Save the journal work to be freed with the bitmap 6059 * before we clear UNLINKED. Otherwise it can be lost 6060 * if the inode block is written. 6061 */ 6062 handle_bufwait(inodedep, &freefile->fx_jwork); 6063 clear_unlinked_inodedep(inodedep); 6064 /* Re-acquire inodedep as we've dropped lk. */ 6065 inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); 6066 } 6067 if (inodedep == NULL || check_inode_unwritten(inodedep)) { 6068 FREE_LOCK(&lk); 6069 handle_workitem_freefile(freefile); 6070 return; 6071 } 6072 if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0) 6073 inodedep->id_state |= GOINGAWAY; 6074 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); 6075 FREE_LOCK(&lk); 6076 if (ip->i_number == ino) 6077 ip->i_flag |= IN_MODIFIED; 6078 } 6079 6080 /* 6081 * Check to see if an inode has never been written to disk. If 6082 * so free the inodedep and return success, otherwise return failure. 6083 * This routine must be called with splbio interrupts blocked. 6084 * 6085 * If we still have a bitmap dependency, then the inode has never 6086 * been written to disk. Drop the dependency as it is no longer 6087 * necessary since the inode is being deallocated. We set the 6088 * ALLCOMPLETE flags since the bitmap now properly shows that the 6089 * inode is not allocated. Even if the inode is actively being 6090 * written, it has been rolled back to its zero'ed state, so we 6091 * are ensured that a zero inode is what is on the disk. For short 6092 * lived files, this change will usually result in removing all the 6093 * dependencies from the inode so that it can be freed immediately. 6094 */ 6095 static int 6096 check_inode_unwritten(inodedep) 6097 struct inodedep *inodedep; 6098 { 6099 6100 mtx_assert(&lk, MA_OWNED); 6101 6102 if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || 6103 !LIST_EMPTY(&inodedep->id_pendinghd) || 6104 !LIST_EMPTY(&inodedep->id_bufwait) || 6105 !LIST_EMPTY(&inodedep->id_inowait) || 6106 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 6107 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 6108 !TAILQ_EMPTY(&inodedep->id_extupdt) || 6109 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 6110 inodedep->id_mkdiradd != NULL || 6111 inodedep->id_nlinkdelta != 0) 6112 return (0); 6113 /* 6114 * Another process might be in initiate_write_inodeblock_ufs[12] 6115 * trying to allocate memory without holding "Softdep Lock". 6116 */ 6117 if ((inodedep->id_state & IOSTARTED) != 0 && 6118 inodedep->id_savedino1 == NULL) 6119 return (0); 6120 6121 if (inodedep->id_state & ONDEPLIST) 6122 LIST_REMOVE(inodedep, id_deps); 6123 inodedep->id_state &= ~ONDEPLIST; 6124 inodedep->id_state |= ALLCOMPLETE; 6125 inodedep->id_bmsafemap = NULL; 6126 if (inodedep->id_state & ONWORKLIST) 6127 WORKLIST_REMOVE(&inodedep->id_list); 6128 if (inodedep->id_savedino1 != NULL) { 6129 free(inodedep->id_savedino1, M_SAVEDINO); 6130 inodedep->id_savedino1 = NULL; 6131 } 6132 if (free_inodedep(inodedep) == 0) 6133 panic("check_inode_unwritten: busy inode"); 6134 return (1); 6135 } 6136 6137 /* 6138 * Try to free an inodedep structure. Return 1 if it could be freed. 
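 * The inodedep can only be released once it is ALLCOMPLETE, is not on a
 * worklist or the unlinked list, and every dependency list hanging off of
 * it is empty; otherwise 0 is returned and the structure is left intact.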
6139 */ 6140 static int 6141 free_inodedep(inodedep) 6142 struct inodedep *inodedep; 6143 { 6144 6145 mtx_assert(&lk, MA_OWNED); 6146 if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 || 6147 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || 6148 !LIST_EMPTY(&inodedep->id_dirremhd) || 6149 !LIST_EMPTY(&inodedep->id_pendinghd) || 6150 !LIST_EMPTY(&inodedep->id_bufwait) || 6151 !LIST_EMPTY(&inodedep->id_inowait) || 6152 !TAILQ_EMPTY(&inodedep->id_inoreflst) || 6153 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 6154 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 6155 !TAILQ_EMPTY(&inodedep->id_extupdt) || 6156 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 6157 inodedep->id_mkdiradd != NULL || 6158 inodedep->id_nlinkdelta != 0 || 6159 inodedep->id_savedino1 != NULL) 6160 return (0); 6161 if (inodedep->id_state & ONDEPLIST) 6162 LIST_REMOVE(inodedep, id_deps); 6163 LIST_REMOVE(inodedep, id_hash); 6164 WORKITEM_FREE(inodedep, D_INODEDEP); 6165 return (1); 6166 } 6167 6168 /* 6169 * Free the block referenced by a freework structure. The parent freeblks 6170 * structure is released and completed when the final cg bitmap reaches 6171 * the disk. This routine may be freeing a jnewblk which never made it to 6172 * disk in which case we do not have to wait as the operation is undone 6173 * in memory immediately. 6174 */ 6175 static void 6176 freework_freeblock(freework) 6177 struct freework *freework; 6178 { 6179 struct freeblks *freeblks; 6180 struct jnewblk *jnewblk; 6181 struct ufsmount *ump; 6182 struct workhead wkhd; 6183 struct fs *fs; 6184 int pending; 6185 int bsize; 6186 int needj; 6187 6188 freeblks = freework->fw_freeblks; 6189 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 6190 fs = ump->um_fs; 6191 needj = freeblks->fb_list.wk_mp->mnt_kern_flag & MNTK_SUJ; 6192 bsize = lfragtosize(fs, freework->fw_frags); 6193 pending = btodb(bsize); 6194 LIST_INIT(&wkhd); 6195 /* 6196 * If we are canceling an existing jnewblk pass it to the free 6197 * routine, otherwise pass the freeblk which will ultimately 6198 * release the freeblks. If we're not journaling, we can just 6199 * free the freeblks immediately. 6200 */ 6201 ACQUIRE_LOCK(&lk); 6202 LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list); 6203 jnewblk = freework->fw_jnewblk; 6204 if (jnewblk != NULL) { 6205 /* Could've already been canceled in indir_trunc(). */ 6206 if ((jnewblk->jn_state & GOINGAWAY) == 0) 6207 cancel_jnewblk(jnewblk, &wkhd); 6208 needj = 0; 6209 } else if (needj) 6210 WORKLIST_INSERT(&wkhd, &freework->fw_list); 6211 freeblks->fb_chkcnt -= pending; 6212 FREE_LOCK(&lk); 6213 /* 6214 * extattr blocks don't show up in pending blocks. XXX why? 6215 */ 6216 if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) { 6217 UFS_LOCK(ump); 6218 fs->fs_pendingblocks -= pending; 6219 UFS_UNLOCK(ump); 6220 } 6221 ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, 6222 bsize, freeblks->fb_previousinum, &wkhd); 6223 if (needj) 6224 return; 6225 /* 6226 * The jnewblk will be discarded and the bits in the map never 6227 * made it to disk. We can immediately free the freeblk. 6228 */ 6229 ACQUIRE_LOCK(&lk); 6230 handle_written_freework(freework); 6231 FREE_LOCK(&lk); 6232 } 6233 6234 /* 6235 * Start, continue, or finish the process of freeing an indirect block tree. 6236 * The free operation may be paused at any point with fw_off containing the 6237 * offset to restart from. This enables us to implement some flow control 6238 * for large truncates which may fan out and generate a huge number of 6239 * dependencies. 
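 * A freework whose fw_off has already advanced to NINDIR(fs) has dispatched
 * all of its child pointers, so only the indirect block itself remains to be
 * released via freework_freeblock(); otherwise indir_trunc() resumes the
 * scan of the indirect block starting at fw_off.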
6240 */ 6241 static void 6242 handle_workitem_indirblk(freework) 6243 struct freework *freework; 6244 { 6245 struct freeblks *freeblks; 6246 struct ufsmount *ump; 6247 struct fs *fs; 6248 6249 6250 freeblks = freework->fw_freeblks; 6251 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 6252 fs = ump->um_fs; 6253 if (freework->fw_off == NINDIR(fs)) 6254 freework_freeblock(freework); 6255 else 6256 indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), 6257 freework->fw_lbn); 6258 } 6259 6260 /* 6261 * Called when a freework structure attached to a cg buf is written. The 6262 * ref on either the parent or the freeblks structure is released and 6263 * either may be added to the worklist if it is the final ref. 6264 */ 6265 static void 6266 handle_written_freework(freework) 6267 struct freework *freework; 6268 { 6269 struct freeblks *freeblks; 6270 struct freework *parent; 6271 struct jsegdep *jsegdep; 6272 struct worklist *wk; 6273 int needj; 6274 6275 needj = 0; 6276 freeblks = freework->fw_freeblks; 6277 parent = freework->fw_parent; 6278 /* 6279 * SUJ needs to wait for the segment referencing freed indirect 6280 * blocks to expire so that we know the checker will not confuse 6281 * a re-allocated indirect block with its old contents. 6282 */ 6283 if (freework->fw_lbn <= -NDADDR && 6284 freework->fw_list.wk_mp->mnt_kern_flag & MNTK_SUJ) { 6285 LIST_FOREACH(wk, &freeblks->fb_jwork, wk_list) 6286 if (wk->wk_type == D_JSEGDEP) 6287 break; 6288 if (wk) { 6289 jsegdep = WK_JSEGDEP(wk); 6290 LIST_INSERT_HEAD(&jsegdep->jd_seg->js_indirs, 6291 freework, fw_next); 6292 WORKLIST_INSERT(INDIR_HASH(freework->fw_list.wk_mp, 6293 freework->fw_blkno), &freework->fw_list); 6294 needj = 1; 6295 } 6296 } 6297 if (parent) { 6298 if (--parent->fw_ref != 0) 6299 parent = NULL; 6300 freeblks = NULL; 6301 } else if (--freeblks->fb_ref != 0) 6302 freeblks = NULL; 6303 if (needj == 0) 6304 WORKITEM_FREE(freework, D_FREEWORK); 6305 /* 6306 * Don't delay these block frees or it takes an intolerable amount 6307 * of time to process truncates and free their journal entries. 6308 */ 6309 if (freeblks) 6310 add_to_worklist(&freeblks->fb_list, 1); 6311 if (parent) 6312 add_to_worklist(&parent->fw_list, 1); 6313 } 6314 6315 /* 6316 * This workitem routine performs the block de-allocation. 6317 * The workitem is added to the pending list after the updated 6318 * inode block has been written to disk. As mentioned above, 6319 * checks regarding the number of blocks de-allocated (compared 6320 * to the number of blocks allocated for the file) are also 6321 * performed in this function. 
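 * The extra fb_ref taken below keeps the freeblks alive while the individual
 * freework items are dispatched; whoever drops the final reference calls
 * handle_complete_freeblocks() to retire the dependency.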
6322 */ 6323 static void 6324 handle_workitem_freeblocks(freeblks, flags) 6325 struct freeblks *freeblks; 6326 int flags; 6327 { 6328 struct freework *freework; 6329 struct worklist *wk; 6330 6331 KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd), 6332 ("handle_workitem_freeblocks: Journal entries not written.")); 6333 if (LIST_EMPTY(&freeblks->fb_freeworkhd)) { 6334 handle_complete_freeblocks(freeblks); 6335 return; 6336 } 6337 freeblks->fb_ref++; 6338 while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { 6339 KASSERT(wk->wk_type == D_FREEWORK, 6340 ("handle_workitem_freeblocks: Unknown type %s", 6341 TYPENAME(wk->wk_type))); 6342 WORKLIST_REMOVE_UNLOCKED(wk); 6343 freework = WK_FREEWORK(wk); 6344 if (freework->fw_lbn <= -NDADDR) 6345 handle_workitem_indirblk(freework); 6346 else 6347 freework_freeblock(freework); 6348 } 6349 ACQUIRE_LOCK(&lk); 6350 if (--freeblks->fb_ref != 0) 6351 freeblks = NULL; 6352 FREE_LOCK(&lk); 6353 if (freeblks) 6354 handle_complete_freeblocks(freeblks); 6355 } 6356 6357 /* 6358 * Once all of the freework workitems are complete we can retire the 6359 * freeblocks dependency and any journal work awaiting completion. This 6360 * can not be called until all other dependencies are stable on disk. 6361 */ 6362 static void 6363 handle_complete_freeblocks(freeblks) 6364 struct freeblks *freeblks; 6365 { 6366 struct inode *ip; 6367 struct vnode *vp; 6368 struct fs *fs; 6369 struct ufsmount *ump; 6370 int flags; 6371 6372 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 6373 fs = ump->um_fs; 6374 flags = LK_NOWAIT; 6375 6376 /* 6377 * If we still have not finished background cleanup, then check 6378 * to see if the block count needs to be adjusted. 6379 */ 6380 if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 && 6381 ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum, 6382 (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) { 6383 ip = VTOI(vp); 6384 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt); 6385 ip->i_flag |= IN_CHANGE; 6386 vput(vp); 6387 } 6388 6389 if (!(freeblks->fb_chkcnt == 0 || 6390 ((fs->fs_flags & FS_UNCLEAN) != 0 && (flags & LK_NOWAIT) == 0))) 6391 printf( 6392 "handle_workitem_freeblocks: inode %ju block count %jd\n", 6393 (uintmax_t)freeblks->fb_previousinum, 6394 (intmax_t)freeblks->fb_chkcnt); 6395 6396 ACQUIRE_LOCK(&lk); 6397 /* 6398 * All of the freeblock deps must be complete prior to this call 6399 * so it's now safe to complete earlier outstanding journal entries. 6400 */ 6401 handle_jwork(&freeblks->fb_jwork); 6402 WORKITEM_FREE(freeblks, D_FREEBLKS); 6403 FREE_LOCK(&lk); 6404 } 6405 6406 /* 6407 * Release blocks associated with the inode ip and stored in the indirect 6408 * block dbn. If level is greater than SINGLE, the block is an indirect block 6409 * and recursive calls to indirtrunc must be used to cleanse other indirect 6410 * blocks. 
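 * As a rough sketch of the recursion (mirroring the code below):
 *
 *	level = lbn_level(lbn);
 *	lbnadd = lbn_offset(fs, level);
 *	for (i = fw_off; i < NINDIR(fs); i++)
 *		nlbn = (lbn + 1) - (i * lbnadd);	/* child's lbn */
 *
 * Children that are themselves indirects are handed back to indir_trunc()
 * with this nlbn; data-level children are freed directly with ffs_blkfree().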
6411 */ 6412 static void 6413 indir_trunc(freework, dbn, lbn) 6414 struct freework *freework; 6415 ufs2_daddr_t dbn; 6416 ufs_lbn_t lbn; 6417 { 6418 struct freework *nfreework; 6419 struct workhead wkhd; 6420 struct jnewblk *jnewblkn; 6421 struct jnewblk *jnewblk; 6422 struct freeblks *freeblks; 6423 struct buf *bp; 6424 struct fs *fs; 6425 struct worklist *wkn; 6426 struct worklist *wk; 6427 struct indirdep *indirdep; 6428 struct ufsmount *ump; 6429 ufs1_daddr_t *bap1 = 0; 6430 ufs2_daddr_t nb, nnb, *bap2 = 0; 6431 ufs_lbn_t lbnadd; 6432 int i, nblocks, ufs1fmt; 6433 int fs_pendingblocks; 6434 int freedeps; 6435 int needj; 6436 int level; 6437 int cnt; 6438 6439 LIST_INIT(&wkhd); 6440 level = lbn_level(lbn); 6441 if (level == -1) 6442 panic("indir_trunc: Invalid lbn %jd\n", lbn); 6443 freeblks = freework->fw_freeblks; 6444 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 6445 fs = ump->um_fs; 6446 fs_pendingblocks = 0; 6447 freedeps = 0; 6448 needj = UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ; 6449 lbnadd = lbn_offset(fs, level); 6450 /* 6451 * Get buffer of block pointers to be freed. This routine is not 6452 * called until the zero'ed inode has been written, so it is safe 6453 * to free blocks as they are encountered. Because the inode has 6454 * been zero'ed, calls to bmap on these blocks will fail. So, we 6455 * have to use the on-disk address and the block device for the 6456 * filesystem to look them up. If the file was deleted before its 6457 * indirect blocks were all written to disk, the routine that set 6458 * us up (deallocate_dependencies) will have arranged to leave 6459 * a complete copy of the indirect block in memory for our use. 6460 * Otherwise we have to read the blocks in from the disk. 6461 */ 6462 #ifdef notyet 6463 bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0, 6464 GB_NOCREAT); 6465 #else 6466 bp = incore(&freeblks->fb_devvp->v_bufobj, dbn); 6467 #endif 6468 ACQUIRE_LOCK(&lk); 6469 if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) { 6470 if (wk->wk_type != D_INDIRDEP || 6471 (wk->wk_state & GOINGAWAY) == 0) 6472 panic("indir_trunc: lost indirdep %p", wk); 6473 indirdep = WK_INDIRDEP(wk); 6474 LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list); 6475 LIST_FOREACH_SAFE(jnewblk, &indirdep->ir_jnewblkhd, 6476 jn_indirdeps, jnewblkn) { 6477 /* 6478 * XXX This cancel may cause some lengthy delay 6479 * before the record is reclaimed below. 6480 */ 6481 LIST_REMOVE(jnewblk, jn_indirdeps); 6482 cancel_jnewblk(jnewblk, &wkhd); 6483 } 6484 6485 free_indirdep(indirdep); 6486 if (!LIST_EMPTY(&bp->b_dep)) 6487 panic("indir_trunc: dangling dep %p", 6488 LIST_FIRST(&bp->b_dep)); 6489 ump->um_numindirdeps -= 1; 6490 FREE_LOCK(&lk); 6491 } else { 6492 #ifdef notyet 6493 if (bp) 6494 brelse(bp); 6495 #endif 6496 FREE_LOCK(&lk); 6497 if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 6498 NOCRED, &bp) != 0) { 6499 brelse(bp); 6500 return; 6501 } 6502 } 6503 /* 6504 * Recursively free indirect blocks. 6505 */ 6506 if (ump->um_fstype == UFS1) { 6507 ufs1fmt = 1; 6508 bap1 = (ufs1_daddr_t *)bp->b_data; 6509 } else { 6510 ufs1fmt = 0; 6511 bap2 = (ufs2_daddr_t *)bp->b_data; 6512 } 6513 6514 /* 6515 * Reclaim indirect blocks which never made it to disk. 6516 */ 6517 cnt = 0; 6518 LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) { 6519 if (wk->wk_type != D_JNEWBLK) 6520 continue; 6521 /* XXX Is the lock necessary here for more than an assert? 
*/ 6522 ACQUIRE_LOCK(&lk); 6523 WORKLIST_REMOVE(wk); 6524 FREE_LOCK(&lk); 6525 jnewblk = WK_JNEWBLK(wk); 6526 if (jnewblk->jn_lbn > 0) 6527 i = (jnewblk->jn_lbn - -lbn) / lbnadd; 6528 else 6529 i = (-(jnewblk->jn_lbn + level - 1) - -(lbn + level)) / 6530 lbnadd; 6531 KASSERT(i >= 0 && i < NINDIR(fs), 6532 ("indir_trunc: Index out of range %d parent %jd lbn %jd level %d", 6533 i, lbn, jnewblk->jn_lbn, level)); 6534 /* Clear the pointer so it isn't found below. */ 6535 if (ufs1fmt) { 6536 nb = bap1[i]; 6537 bap1[i] = 0; 6538 } else { 6539 nb = bap2[i]; 6540 bap2[i] = 0; 6541 } 6542 KASSERT(nb == jnewblk->jn_blkno, 6543 ("indir_trunc: Block mismatch %jd != %jd", 6544 nb, jnewblk->jn_blkno)); 6545 if (level != 0) { 6546 ufs_lbn_t nlbn; 6547 6548 nlbn = (lbn + 1) - (i * lbnadd); 6549 nfreework = newfreework(ump, freeblks, freework, 6550 nlbn, nb, fs->fs_frag, 0); 6551 nfreework->fw_jnewblk = jnewblk; 6552 freedeps++; 6553 indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); 6554 } else { 6555 struct workhead freewk; 6556 6557 LIST_INIT(&freewk); 6558 ACQUIRE_LOCK(&lk); 6559 WORKLIST_INSERT(&freewk, wk); 6560 FREE_LOCK(&lk); 6561 ffs_blkfree(ump, fs, freeblks->fb_devvp, 6562 jnewblk->jn_blkno, fs->fs_bsize, 6563 freeblks->fb_previousinum, &freewk); 6564 } 6565 cnt++; 6566 } 6567 ACQUIRE_LOCK(&lk); 6568 /* Any remaining journal work can be completed with freeblks. */ 6569 jwork_move(&freeblks->fb_jwork, &wkhd); 6570 FREE_LOCK(&lk); 6571 nblocks = btodb(fs->fs_bsize); 6572 if (ufs1fmt) 6573 nb = bap1[0]; 6574 else 6575 nb = bap2[0]; 6576 nfreework = freework; 6577 /* 6578 * Reclaim on disk blocks. 6579 */ 6580 for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { 6581 if (i != NINDIR(fs) - 1) { 6582 if (ufs1fmt) 6583 nnb = bap1[i+1]; 6584 else 6585 nnb = bap2[i+1]; 6586 } else 6587 nnb = 0; 6588 if (nb == 0) 6589 continue; 6590 cnt++; 6591 if (level != 0) { 6592 ufs_lbn_t nlbn; 6593 6594 nlbn = (lbn + 1) - (i * lbnadd); 6595 if (needj != 0) { 6596 nfreework = newfreework(ump, freeblks, freework, 6597 nlbn, nb, fs->fs_frag, 0); 6598 freedeps++; 6599 } 6600 indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); 6601 } else { 6602 struct freedep *freedep; 6603 6604 /* 6605 * Attempt to aggregate freedep dependencies for 6606 * all blocks being released to the same CG. 6607 */ 6608 LIST_INIT(&wkhd); 6609 if (needj != 0 && 6610 (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) { 6611 freedep = newfreedep(freework); 6612 WORKLIST_INSERT_UNLOCKED(&wkhd, 6613 &freedep->fd_list); 6614 freedeps++; 6615 } 6616 ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, 6617 fs->fs_bsize, freeblks->fb_previousinum, &wkhd); 6618 } 6619 } 6620 if (level == 0) 6621 fs_pendingblocks = (nblocks * cnt); 6622 /* 6623 * If we're not journaling we can free the indirect now. Otherwise 6624 * setup the ref counts and offset so this indirect can be completed 6625 * when its children are free. 
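 * (The arithmetic below credits fw_ref with one reference for each
 * freedep/freework created during this pass and debits NINDIR(fs) + 1, which
 * is assumed to balance the references reserved when the freework was
 * created; the indirect is only retired once fw_ref drops to zero.)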
6626 */ 6627 if (needj == 0) { 6628 fs_pendingblocks += nblocks; 6629 dbn = dbtofsb(fs, dbn); 6630 ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, 6631 freeblks->fb_previousinum, NULL); 6632 ACQUIRE_LOCK(&lk); 6633 freeblks->fb_chkcnt -= fs_pendingblocks; 6634 if (freework->fw_blkno == dbn) 6635 handle_written_freework(freework); 6636 FREE_LOCK(&lk); 6637 freework = NULL; 6638 } else { 6639 ACQUIRE_LOCK(&lk); 6640 freework->fw_off = i; 6641 freework->fw_ref += freedeps; 6642 freework->fw_ref -= NINDIR(fs) + 1; 6643 if (freework->fw_ref != 0) 6644 freework = NULL; 6645 freeblks->fb_chkcnt -= fs_pendingblocks; 6646 FREE_LOCK(&lk); 6647 } 6648 if (fs_pendingblocks) { 6649 UFS_LOCK(ump); 6650 fs->fs_pendingblocks -= fs_pendingblocks; 6651 UFS_UNLOCK(ump); 6652 } 6653 bp->b_flags |= B_INVAL | B_NOCACHE; 6654 brelse(bp); 6655 if (freework) 6656 handle_workitem_indirblk(freework); 6657 return; 6658 } 6659 6660 /* 6661 * Cancel an allocindir when it is removed via truncation. 6662 */ 6663 static void 6664 cancel_allocindir(aip, inodedep, freeblks) 6665 struct allocindir *aip; 6666 struct inodedep *inodedep; 6667 struct freeblks *freeblks; 6668 { 6669 struct jnewblk *jnewblk; 6670 struct newblk *newblk; 6671 6672 /* 6673 * If the journal hasn't been written the jnewblk must be passed 6674 * to the call to ffs_blkfree that reclaims the space. We accomplish 6675 * this by linking the journal dependency into the indirdep to be 6676 * freed when indir_trunc() is called. If the journal has already 6677 * been written we can simply reclaim the journal space when the 6678 * freeblks work is complete. 6679 */ 6680 LIST_REMOVE(aip, ai_next); 6681 newblk = (struct newblk *)aip; 6682 if (newblk->nb_jnewblk == NULL) { 6683 if (cancel_newblk(newblk, NULL, &freeblks->fb_jwork)) 6684 panic("cancel_allocindir: Unexpected dependency."); 6685 } else { 6686 jnewblk = cancel_newblk(newblk, &aip->ai_indirdep->ir_list, 6687 &aip->ai_indirdep->ir_jwork); 6688 if (jnewblk) 6689 LIST_INSERT_HEAD(&aip->ai_indirdep->ir_jnewblkhd, 6690 jnewblk, jn_indirdeps); 6691 } 6692 if (inodedep && inodedep->id_state & DEPCOMPLETE) 6693 WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list); 6694 else 6695 free_newblk(newblk); 6696 } 6697 6698 /* 6699 * Create the mkdir dependencies for . and .. in a new directory. Link them 6700 * in to a newdirblk so any subsequent additions are tracked properly. The 6701 * caller is responsible for adding the mkdir1 dependency to the journal 6702 * and updating id_mkdiradd. This function returns with lk held. 
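 * As a rough summary: mkdir1 (MKDIR_BODY) tracks the new directory block
 * containing "." and ".." reaching the disk, while mkdir2 (MKDIR_PARENT)
 * tracks the parent directory's inode update; the diradd for the new name
 * is not considered complete until both have cleared.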
6703 */ 6704 static struct mkdir * 6705 setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) 6706 struct diradd *dap; 6707 ino_t newinum; 6708 ino_t dinum; 6709 struct buf *newdirbp; 6710 struct mkdir **mkdirp; 6711 { 6712 struct newblk *newblk; 6713 struct pagedep *pagedep; 6714 struct inodedep *inodedep; 6715 struct newdirblk *newdirblk = 0; 6716 struct mkdir *mkdir1, *mkdir2; 6717 struct worklist *wk; 6718 struct jaddref *jaddref; 6719 struct mount *mp; 6720 6721 mp = dap->da_list.wk_mp; 6722 newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, 6723 M_SOFTDEP_FLAGS); 6724 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 6725 LIST_INIT(&newdirblk->db_mkdir); 6726 mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); 6727 workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); 6728 mkdir1->md_state = ATTACHED | MKDIR_BODY; 6729 mkdir1->md_diradd = dap; 6730 mkdir1->md_jaddref = NULL; 6731 mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); 6732 workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); 6733 mkdir2->md_state = ATTACHED | MKDIR_PARENT; 6734 mkdir2->md_diradd = dap; 6735 mkdir2->md_jaddref = NULL; 6736 if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) { 6737 mkdir1->md_state |= DEPCOMPLETE; 6738 mkdir2->md_state |= DEPCOMPLETE; 6739 } 6740 /* 6741 * Dependency on "." and ".." being written to disk. 6742 */ 6743 mkdir1->md_buf = newdirbp; 6744 ACQUIRE_LOCK(&lk); 6745 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); 6746 /* 6747 * We must link the pagedep, allocdirect, and newdirblk for 6748 * the initial file page so the pointer to the new directory 6749 * is not written until the directory contents are live and 6750 * any subsequent additions are not marked live until the 6751 * block is reachable via the inode. 6752 */ 6753 if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0) 6754 panic("setup_newdir: lost pagedep"); 6755 LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) 6756 if (wk->wk_type == D_ALLOCDIRECT) 6757 break; 6758 if (wk == NULL) 6759 panic("setup_newdir: lost allocdirect"); 6760 newblk = WK_NEWBLK(wk); 6761 pagedep->pd_state |= NEWBLOCK; 6762 pagedep->pd_newdirblk = newdirblk; 6763 newdirblk->db_pagedep = pagedep; 6764 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); 6765 WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); 6766 /* 6767 * Look up the inodedep for the parent directory so that we 6768 * can link mkdir2 into the pending dotdot jaddref or 6769 * the inode write if there is none. If the inode is 6770 * ALLCOMPLETE and no jaddref is present all dependencies have 6771 * been satisfied and mkdir2 can be freed. 
6772 */ 6773 inodedep_lookup(mp, dinum, 0, &inodedep); 6774 if (mp->mnt_kern_flag & MNTK_SUJ) { 6775 if (inodedep == NULL) 6776 panic("setup_newdir: Lost parent."); 6777 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 6778 inoreflst); 6779 KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && 6780 (jaddref->ja_state & MKDIR_PARENT), 6781 ("setup_newdir: bad dotdot jaddref %p", jaddref)); 6782 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 6783 mkdir2->md_jaddref = jaddref; 6784 jaddref->ja_mkdir = mkdir2; 6785 } else if (inodedep == NULL || 6786 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 6787 dap->da_state &= ~MKDIR_PARENT; 6788 WORKITEM_FREE(mkdir2, D_MKDIR); 6789 } else { 6790 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 6791 WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); 6792 } 6793 *mkdirp = mkdir2; 6794 6795 return (mkdir1); 6796 } 6797 6798 /* 6799 * Directory entry addition dependencies. 6800 * 6801 * When adding a new directory entry, the inode (with its incremented link 6802 * count) must be written to disk before the directory entry's pointer to it. 6803 * Also, if the inode is newly allocated, the corresponding freemap must be 6804 * updated (on disk) before the directory entry's pointer. These requirements 6805 * are met via undo/redo on the directory entry's pointer, which consists 6806 * simply of the inode number. 6807 * 6808 * As directory entries are added and deleted, the free space within a 6809 * directory block can become fragmented. The ufs filesystem will compact 6810 * a fragmented directory block to make space for a new entry. When this 6811 * occurs, the offsets of previously added entries change. Any "diradd" 6812 * dependency structures corresponding to these entries must be updated with 6813 * the new offsets. 6814 */ 6815 6816 /* 6817 * This routine is called after the in-memory inode's link 6818 * count has been incremented, but before the directory entry's 6819 * pointer to the inode has been set. 6820 */ 6821 int 6822 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 6823 struct buf *bp; /* buffer containing directory block */ 6824 struct inode *dp; /* inode for directory */ 6825 off_t diroffset; /* offset of new entry in directory */ 6826 ino_t newinum; /* inode referenced by new directory entry */ 6827 struct buf *newdirbp; /* non-NULL => contents of new mkdir */ 6828 int isnewblk; /* entry is in a newly allocated block */ 6829 { 6830 int offset; /* offset of new entry within directory block */ 6831 ufs_lbn_t lbn; /* block in directory containing new entry */ 6832 struct fs *fs; 6833 struct diradd *dap; 6834 struct newblk *newblk; 6835 struct pagedep *pagedep; 6836 struct inodedep *inodedep; 6837 struct newdirblk *newdirblk = 0; 6838 struct mkdir *mkdir1, *mkdir2; 6839 struct jaddref *jaddref; 6840 struct mount *mp; 6841 int isindir; 6842 6843 /* 6844 * Whiteouts have no dependencies. 
6845 */ 6846 if (newinum == WINO) { 6847 if (newdirbp != NULL) 6848 bdwrite(newdirbp); 6849 return (0); 6850 } 6851 jaddref = NULL; 6852 mkdir1 = mkdir2 = NULL; 6853 mp = UFSTOVFS(dp->i_ump); 6854 fs = dp->i_fs; 6855 lbn = lblkno(fs, diroffset); 6856 offset = blkoff(fs, diroffset); 6857 dap = malloc(sizeof(struct diradd), M_DIRADD, 6858 M_SOFTDEP_FLAGS|M_ZERO); 6859 workitem_alloc(&dap->da_list, D_DIRADD, mp); 6860 dap->da_offset = offset; 6861 dap->da_newinum = newinum; 6862 dap->da_state = ATTACHED; 6863 LIST_INIT(&dap->da_jwork); 6864 isindir = bp->b_lblkno >= NDADDR; 6865 if (isnewblk && 6866 (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { 6867 newdirblk = malloc(sizeof(struct newdirblk), 6868 M_NEWDIRBLK, M_SOFTDEP_FLAGS); 6869 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 6870 LIST_INIT(&newdirblk->db_mkdir); 6871 } 6872 /* 6873 * If we're creating a new directory setup the dependencies and set 6874 * the dap state to wait for them. Otherwise it's COMPLETE and 6875 * we can move on. 6876 */ 6877 if (newdirbp == NULL) { 6878 dap->da_state |= DEPCOMPLETE; 6879 ACQUIRE_LOCK(&lk); 6880 } else { 6881 dap->da_state |= MKDIR_BODY | MKDIR_PARENT; 6882 mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, 6883 &mkdir2); 6884 } 6885 /* 6886 * Link into parent directory pagedep to await its being written. 6887 */ 6888 if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0) 6889 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 6890 #ifdef DEBUG 6891 if (diradd_lookup(pagedep, offset) != NULL) 6892 panic("softdep_setup_directory_add: %p already at off %d\n", 6893 diradd_lookup(pagedep, offset), offset); 6894 #endif 6895 dap->da_pagedep = pagedep; 6896 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, 6897 da_pdlist); 6898 inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); 6899 /* 6900 * If we're journaling, link the diradd into the jaddref so it 6901 * may be completed after the journal entry is written. Otherwise, 6902 * link the diradd into its inodedep. If the inode is not yet 6903 * written place it on the bufwait list, otherwise do the post-inode 6904 * write processing to put it on the id_pendinghd list. 6905 */ 6906 if (mp->mnt_kern_flag & MNTK_SUJ) { 6907 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 6908 inoreflst); 6909 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 6910 ("softdep_setup_directory_add: bad jaddref %p", jaddref)); 6911 jaddref->ja_diroff = diroffset; 6912 jaddref->ja_diradd = dap; 6913 add_to_journal(&jaddref->ja_list); 6914 } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) 6915 diradd_inode_written(dap, inodedep); 6916 else 6917 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 6918 /* 6919 * Add the journal entries for . and .. links now that the primary 6920 * link is written. 6921 */ 6922 if (mkdir1 != NULL && mp->mnt_kern_flag & MNTK_SUJ) { 6923 jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, 6924 inoreflst, if_deps); 6925 KASSERT(jaddref != NULL && 6926 jaddref->ja_ino == jaddref->ja_parent && 6927 (jaddref->ja_state & MKDIR_BODY), 6928 ("softdep_setup_directory_add: bad dot jaddref %p", 6929 jaddref)); 6930 mkdir1->md_jaddref = jaddref; 6931 jaddref->ja_mkdir = mkdir1; 6932 /* 6933 * It is important that the dotdot journal entry 6934 * is added prior to the dot entry since dot writes 6935 * both the dot and dotdot links. These both must 6936 * be added after the primary link for the journal 6937 * to remain consistent. 
6938 */ 6939 add_to_journal(&mkdir2->md_jaddref->ja_list); 6940 add_to_journal(&jaddref->ja_list); 6941 } 6942 /* 6943 * If we are adding a new directory remember this diradd so that if 6944 * we rename it we can keep the dot and dotdot dependencies. If 6945 * we are adding a new name for an inode that has a mkdiradd we 6946 * must be in rename and we have to move the dot and dotdot 6947 * dependencies to this new name. The old name is being orphaned 6948 * soon. 6949 */ 6950 if (mkdir1 != NULL) { 6951 if (inodedep->id_mkdiradd != NULL) 6952 panic("softdep_setup_directory_add: Existing mkdir"); 6953 inodedep->id_mkdiradd = dap; 6954 } else if (inodedep->id_mkdiradd) 6955 merge_diradd(inodedep, dap); 6956 if (newdirblk) { 6957 /* 6958 * There is nothing to do if we are already tracking 6959 * this block. 6960 */ 6961 if ((pagedep->pd_state & NEWBLOCK) != 0) { 6962 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 6963 FREE_LOCK(&lk); 6964 return (0); 6965 } 6966 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk) 6967 == 0) 6968 panic("softdep_setup_directory_add: lost entry"); 6969 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); 6970 pagedep->pd_state |= NEWBLOCK; 6971 pagedep->pd_newdirblk = newdirblk; 6972 newdirblk->db_pagedep = pagedep; 6973 FREE_LOCK(&lk); 6974 /* 6975 * If we extended into an indirect signal direnter to sync. 6976 */ 6977 if (isindir) 6978 return (1); 6979 return (0); 6980 } 6981 FREE_LOCK(&lk); 6982 return (0); 6983 } 6984 6985 /* 6986 * This procedure is called to change the offset of a directory 6987 * entry when compacting a directory block which must be owned 6988 * exclusively by the caller. Note that the actual entry movement 6989 * must be done in this procedure to ensure that no I/O completions 6990 * occur while the move is in progress. 6991 */ 6992 void 6993 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 6994 struct buf *bp; /* Buffer holding directory block. */ 6995 struct inode *dp; /* inode for directory */ 6996 caddr_t base; /* address of dp->i_offset */ 6997 caddr_t oldloc; /* address of old directory location */ 6998 caddr_t newloc; /* address of new directory location */ 6999 int entrysize; /* size of directory entry */ 7000 { 7001 int offset, oldoffset, newoffset; 7002 struct pagedep *pagedep; 7003 struct jmvref *jmvref; 7004 struct diradd *dap; 7005 struct direct *de; 7006 struct mount *mp; 7007 ufs_lbn_t lbn; 7008 int flags; 7009 7010 mp = UFSTOVFS(dp->i_ump); 7011 de = (struct direct *)oldloc; 7012 jmvref = NULL; 7013 flags = 0; 7014 /* 7015 * Moves are always journaled as it would be too complex to 7016 * determine if any affected adds or removes are present in the 7017 * journal. 
7018 */ 7019 if (mp->mnt_kern_flag & MNTK_SUJ) { 7020 flags = DEPALLOC; 7021 jmvref = newjmvref(dp, de->d_ino, 7022 dp->i_offset + (oldloc - base), 7023 dp->i_offset + (newloc - base)); 7024 } 7025 lbn = lblkno(dp->i_fs, dp->i_offset); 7026 offset = blkoff(dp->i_fs, dp->i_offset); 7027 oldoffset = offset + (oldloc - base); 7028 newoffset = offset + (newloc - base); 7029 ACQUIRE_LOCK(&lk); 7030 if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) { 7031 if (pagedep) 7032 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 7033 goto done; 7034 } 7035 dap = diradd_lookup(pagedep, oldoffset); 7036 if (dap) { 7037 dap->da_offset = newoffset; 7038 newoffset = DIRADDHASH(newoffset); 7039 oldoffset = DIRADDHASH(oldoffset); 7040 if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && 7041 newoffset != oldoffset) { 7042 LIST_REMOVE(dap, da_pdlist); 7043 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], 7044 dap, da_pdlist); 7045 } 7046 } 7047 done: 7048 if (jmvref) { 7049 jmvref->jm_pagedep = pagedep; 7050 LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); 7051 add_to_journal(&jmvref->jm_list); 7052 } 7053 bcopy(oldloc, newloc, entrysize); 7054 FREE_LOCK(&lk); 7055 } 7056 7057 /* 7058 * Move the mkdir dependencies and journal work from one diradd to another 7059 * when renaming a directory. The new name must depend on the mkdir deps 7060 * completing as the old name did. Directories can only have one valid link 7061 * at a time so one must be canonical. 7062 */ 7063 static void 7064 merge_diradd(inodedep, newdap) 7065 struct inodedep *inodedep; 7066 struct diradd *newdap; 7067 { 7068 struct diradd *olddap; 7069 struct mkdir *mkdir, *nextmd; 7070 short state; 7071 7072 olddap = inodedep->id_mkdiradd; 7073 inodedep->id_mkdiradd = newdap; 7074 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 7075 newdap->da_state &= ~DEPCOMPLETE; 7076 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 7077 nextmd = LIST_NEXT(mkdir, md_mkdirs); 7078 if (mkdir->md_diradd != olddap) 7079 continue; 7080 mkdir->md_diradd = newdap; 7081 state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY); 7082 newdap->da_state |= state; 7083 olddap->da_state &= ~state; 7084 if ((olddap->da_state & 7085 (MKDIR_PARENT | MKDIR_BODY)) == 0) 7086 break; 7087 } 7088 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 7089 panic("merge_diradd: unfound ref"); 7090 } 7091 /* 7092 * Any mkdir related journal items are not safe to be freed until 7093 * the new name is stable. 7094 */ 7095 jwork_move(&newdap->da_jwork, &olddap->da_jwork); 7096 olddap->da_state |= DEPCOMPLETE; 7097 complete_diradd(olddap); 7098 } 7099 7100 /* 7101 * Move the diradd to the pending list when all diradd dependencies are 7102 * complete. 7103 */ 7104 static void 7105 complete_diradd(dap) 7106 struct diradd *dap; 7107 { 7108 struct pagedep *pagedep; 7109 7110 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 7111 if (dap->da_state & DIRCHG) 7112 pagedep = dap->da_previous->dm_pagedep; 7113 else 7114 pagedep = dap->da_pagedep; 7115 LIST_REMOVE(dap, da_pdlist); 7116 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 7117 } 7118 } 7119 7120 /* 7121 * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal 7122 * add entries and conditonally journal the remove. 
7123 */ 7124 static void 7125 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref) 7126 struct diradd *dap; 7127 struct dirrem *dirrem; 7128 struct jremref *jremref; 7129 struct jremref *dotremref; 7130 struct jremref *dotdotremref; 7131 { 7132 struct inodedep *inodedep; 7133 struct jaddref *jaddref; 7134 struct inoref *inoref; 7135 struct mkdir *mkdir; 7136 7137 /* 7138 * If no remove references were allocated we're on a non-journaled 7139 * filesystem and can skip the cancel step. 7140 */ 7141 if (jremref == NULL) { 7142 free_diradd(dap, NULL); 7143 return; 7144 } 7145 /* 7146 * Cancel the primary name an free it if it does not require 7147 * journaling. 7148 */ 7149 if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum, 7150 0, &inodedep) != 0) { 7151 /* Abort the addref that reference this diradd. */ 7152 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 7153 if (inoref->if_list.wk_type != D_JADDREF) 7154 continue; 7155 jaddref = (struct jaddref *)inoref; 7156 if (jaddref->ja_diradd != dap) 7157 continue; 7158 if (cancel_jaddref(jaddref, inodedep, 7159 &dirrem->dm_jwork) == 0) { 7160 free_jremref(jremref); 7161 jremref = NULL; 7162 } 7163 break; 7164 } 7165 } 7166 /* 7167 * Cancel subordinate names and free them if they do not require 7168 * journaling. 7169 */ 7170 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 7171 LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { 7172 if (mkdir->md_diradd != dap) 7173 continue; 7174 if ((jaddref = mkdir->md_jaddref) == NULL) 7175 continue; 7176 mkdir->md_jaddref = NULL; 7177 if (mkdir->md_state & MKDIR_PARENT) { 7178 if (cancel_jaddref(jaddref, NULL, 7179 &dirrem->dm_jwork) == 0) { 7180 free_jremref(dotdotremref); 7181 dotdotremref = NULL; 7182 } 7183 } else { 7184 if (cancel_jaddref(jaddref, inodedep, 7185 &dirrem->dm_jwork) == 0) { 7186 free_jremref(dotremref); 7187 dotremref = NULL; 7188 } 7189 } 7190 } 7191 } 7192 7193 if (jremref) 7194 journal_jremref(dirrem, jremref, inodedep); 7195 if (dotremref) 7196 journal_jremref(dirrem, dotremref, inodedep); 7197 if (dotdotremref) 7198 journal_jremref(dirrem, dotdotremref, NULL); 7199 jwork_move(&dirrem->dm_jwork, &dap->da_jwork); 7200 free_diradd(dap, &dirrem->dm_jwork); 7201 } 7202 7203 /* 7204 * Free a diradd dependency structure. This routine must be called 7205 * with splbio interrupts blocked. 
7206 */ 7207 static void 7208 free_diradd(dap, wkhd) 7209 struct diradd *dap; 7210 struct workhead *wkhd; 7211 { 7212 struct dirrem *dirrem; 7213 struct pagedep *pagedep; 7214 struct inodedep *inodedep; 7215 struct mkdir *mkdir, *nextmd; 7216 7217 mtx_assert(&lk, MA_OWNED); 7218 LIST_REMOVE(dap, da_pdlist); 7219 if (dap->da_state & ONWORKLIST) 7220 WORKLIST_REMOVE(&dap->da_list); 7221 if ((dap->da_state & DIRCHG) == 0) { 7222 pagedep = dap->da_pagedep; 7223 } else { 7224 dirrem = dap->da_previous; 7225 pagedep = dirrem->dm_pagedep; 7226 dirrem->dm_dirinum = pagedep->pd_ino; 7227 dirrem->dm_state |= COMPLETE; 7228 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 7229 add_to_worklist(&dirrem->dm_list, 0); 7230 } 7231 if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 7232 0, &inodedep) != 0) 7233 if (inodedep->id_mkdiradd == dap) 7234 inodedep->id_mkdiradd = NULL; 7235 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 7236 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 7237 nextmd = LIST_NEXT(mkdir, md_mkdirs); 7238 if (mkdir->md_diradd != dap) 7239 continue; 7240 dap->da_state &= 7241 ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 7242 LIST_REMOVE(mkdir, md_mkdirs); 7243 if (mkdir->md_state & ONWORKLIST) 7244 WORKLIST_REMOVE(&mkdir->md_list); 7245 if (mkdir->md_jaddref != NULL) 7246 panic("free_diradd: Unexpected jaddref"); 7247 WORKITEM_FREE(mkdir, D_MKDIR); 7248 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) 7249 break; 7250 } 7251 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 7252 panic("free_diradd: unfound ref"); 7253 } 7254 if (inodedep) 7255 free_inodedep(inodedep); 7256 /* 7257 * Free any journal segments waiting for the directory write. 7258 */ 7259 handle_jwork(&dap->da_jwork); 7260 WORKITEM_FREE(dap, D_DIRADD); 7261 } 7262 7263 /* 7264 * Directory entry removal dependencies. 7265 * 7266 * When removing a directory entry, the entry's inode pointer must be 7267 * zero'ed on disk before the corresponding inode's link count is decremented 7268 * (possibly freeing the inode for re-use). This dependency is handled by 7269 * updating the directory entry but delaying the inode count reduction until 7270 * after the directory block has been written to disk. After this point, the 7271 * inode count can be decremented whenever it is convenient. 7272 */ 7273 7274 /* 7275 * This routine should be called immediately after removing 7276 * a directory entry. The inode's link count should not be 7277 * decremented by the calling procedure -- the soft updates 7278 * code will do this task when it is safe. 7279 */ 7280 void 7281 softdep_setup_remove(bp, dp, ip, isrmdir) 7282 struct buf *bp; /* buffer containing directory block */ 7283 struct inode *dp; /* inode for the directory being modified */ 7284 struct inode *ip; /* inode for directory entry being removed */ 7285 int isrmdir; /* indicates if doing RMDIR */ 7286 { 7287 struct dirrem *dirrem, *prevdirrem; 7288 struct inodedep *inodedep; 7289 int direct; 7290 7291 /* 7292 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want 7293 * newdirrem() to setup the full directory remove which requires 7294 * isrmdir > 1. 7295 */ 7296 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 7297 /* 7298 * Add the dirrem to the inodedep's pending remove list for quick 7299 * discovery later. 
7300 */ 7301 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 7302 &inodedep) == 0) 7303 panic("softdep_setup_remove: Lost inodedep."); 7304 dirrem->dm_state |= ONDEPLIST; 7305 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 7306 7307 /* 7308 * If the COMPLETE flag is clear, then there were no active 7309 * entries and we want to roll back to a zeroed entry until 7310 * the new inode is committed to disk. If the COMPLETE flag is 7311 * set then we have deleted an entry that never made it to 7312 * disk. If the entry we deleted resulted from a name change, 7313 * then the old name still resides on disk. We cannot delete 7314 * its inode (returned to us in prevdirrem) until the zeroed 7315 * directory entry gets to disk. The new inode has never been 7316 * referenced on the disk, so can be deleted immediately. 7317 */ 7318 if ((dirrem->dm_state & COMPLETE) == 0) { 7319 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, 7320 dm_next); 7321 FREE_LOCK(&lk); 7322 } else { 7323 if (prevdirrem != NULL) 7324 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, 7325 prevdirrem, dm_next); 7326 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; 7327 direct = LIST_EMPTY(&dirrem->dm_jremrefhd); 7328 FREE_LOCK(&lk); 7329 if (direct) 7330 handle_workitem_remove(dirrem, NULL); 7331 } 7332 } 7333 7334 /* 7335 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the 7336 * pd_pendinghd list of a pagedep. 7337 */ 7338 static struct diradd * 7339 diradd_lookup(pagedep, offset) 7340 struct pagedep *pagedep; 7341 int offset; 7342 { 7343 struct diradd *dap; 7344 7345 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) 7346 if (dap->da_offset == offset) 7347 return (dap); 7348 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 7349 if (dap->da_offset == offset) 7350 return (dap); 7351 return (NULL); 7352 } 7353 7354 /* 7355 * Search for a .. diradd dependency in a directory that is being removed. 7356 * If the directory was renamed to a new parent we have a diradd rather 7357 * than a mkdir for the .. entry. We need to cancel it now before 7358 * it is found in truncate(). 7359 */ 7360 static struct jremref * 7361 cancel_diradd_dotdot(ip, dirrem, jremref) 7362 struct inode *ip; 7363 struct dirrem *dirrem; 7364 struct jremref *jremref; 7365 { 7366 struct pagedep *pagedep; 7367 struct diradd *dap; 7368 struct worklist *wk; 7369 7370 if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0, 7371 &pagedep) == 0) 7372 return (jremref); 7373 dap = diradd_lookup(pagedep, DOTDOT_OFFSET); 7374 if (dap == NULL) 7375 return (jremref); 7376 cancel_diradd(dap, dirrem, jremref, NULL, NULL); 7377 /* 7378 * Mark any journal work as belonging to the parent so it is freed 7379 * with the .. reference. 7380 */ 7381 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 7382 wk->wk_state |= MKDIR_PARENT; 7383 return (NULL); 7384 } 7385 7386 /* 7387 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to 7388 * replace it with a dirrem/diradd pair as a result of re-parenting a 7389 * directory. This ensures that we don't simultaneously have a mkdir and 7390 * a diradd for the same .. entry. 
7391 */ 7392 static struct jremref * 7393 cancel_mkdir_dotdot(ip, dirrem, jremref) 7394 struct inode *ip; 7395 struct dirrem *dirrem; 7396 struct jremref *jremref; 7397 { 7398 struct inodedep *inodedep; 7399 struct jaddref *jaddref; 7400 struct mkdir *mkdir; 7401 struct diradd *dap; 7402 7403 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 7404 &inodedep) == 0) 7405 panic("cancel_mkdir_dotdot: Lost inodedep"); 7406 dap = inodedep->id_mkdiradd; 7407 if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) 7408 return (jremref); 7409 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; 7410 mkdir = LIST_NEXT(mkdir, md_mkdirs)) 7411 if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) 7412 break; 7413 if (mkdir == NULL) 7414 panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); 7415 if ((jaddref = mkdir->md_jaddref) != NULL) { 7416 mkdir->md_jaddref = NULL; 7417 jaddref->ja_state &= ~MKDIR_PARENT; 7418 if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0, 7419 &inodedep) == 0) 7420 panic("cancel_mkdir_dotdot: Lost parent inodedep"); 7421 if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { 7422 journal_jremref(dirrem, jremref, inodedep); 7423 jremref = NULL; 7424 } 7425 } 7426 if (mkdir->md_state & ONWORKLIST) 7427 WORKLIST_REMOVE(&mkdir->md_list); 7428 mkdir->md_state |= ALLCOMPLETE; 7429 complete_mkdir(mkdir); 7430 return (jremref); 7431 } 7432 7433 static void 7434 journal_jremref(dirrem, jremref, inodedep) 7435 struct dirrem *dirrem; 7436 struct jremref *jremref; 7437 struct inodedep *inodedep; 7438 { 7439 7440 if (inodedep == NULL) 7441 if (inodedep_lookup(jremref->jr_list.wk_mp, 7442 jremref->jr_ref.if_ino, 0, &inodedep) == 0) 7443 panic("journal_jremref: Lost inodedep"); 7444 LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); 7445 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 7446 add_to_journal(&jremref->jr_list); 7447 } 7448 7449 static void 7450 dirrem_journal(dirrem, jremref, dotremref, dotdotremref) 7451 struct dirrem *dirrem; 7452 struct jremref *jremref; 7453 struct jremref *dotremref; 7454 struct jremref *dotdotremref; 7455 { 7456 struct inodedep *inodedep; 7457 7458 7459 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, 7460 &inodedep) == 0) 7461 panic("dirrem_journal: Lost inodedep"); 7462 journal_jremref(dirrem, jremref, inodedep); 7463 if (dotremref) 7464 journal_jremref(dirrem, dotremref, inodedep); 7465 if (dotdotremref) 7466 journal_jremref(dirrem, dotdotremref, NULL); 7467 } 7468 7469 /* 7470 * Allocate a new dirrem if appropriate and return it along with 7471 * its associated pagedep. Called without a lock, returns with lock. 7472 */ 7473 static struct dirrem * 7474 newdirrem(bp, dp, ip, isrmdir, prevdirremp) 7475 struct buf *bp; /* buffer containing directory block */ 7476 struct inode *dp; /* inode for the directory being modified */ 7477 struct inode *ip; /* inode for directory entry being removed */ 7478 int isrmdir; /* indicates if doing RMDIR */ 7479 struct dirrem **prevdirremp; /* previously referenced inode, if any */ 7480 { 7481 int offset; 7482 ufs_lbn_t lbn; 7483 struct diradd *dap; 7484 struct dirrem *dirrem; 7485 struct pagedep *pagedep; 7486 struct jremref *jremref; 7487 struct jremref *dotremref; 7488 struct jremref *dotdotremref; 7489 struct vnode *dvp; 7490 7491 /* 7492 * Whiteouts have no deletion dependencies. 7493 */ 7494 if (ip == NULL) 7495 panic("newdirrem: whiteout"); 7496 dvp = ITOV(dp); 7497 /* 7498 * If we are over our limit, try to improve the situation. 
7499 * Limiting the number of dirrem structures will also limit 7500 * the number of freefile and freeblks structures. 7501 */ 7502 ACQUIRE_LOCK(&lk); 7503 if (!(ip->i_flags & SF_SNAPSHOT) && 7504 dep_current[D_DIRREM] > max_softdeps / 2) 7505 (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS); 7506 FREE_LOCK(&lk); 7507 dirrem = malloc(sizeof(struct dirrem), 7508 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); 7509 workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); 7510 LIST_INIT(&dirrem->dm_jremrefhd); 7511 LIST_INIT(&dirrem->dm_jwork); 7512 dirrem->dm_state = isrmdir ? RMDIR : 0; 7513 dirrem->dm_oldinum = ip->i_number; 7514 *prevdirremp = NULL; 7515 /* 7516 * Allocate remove reference structures to track journal write 7517 * dependencies. We will always have one for the link and 7518 * when doing directories we will always have one more for dot. 7519 * When renaming a directory we skip the dotdot link change so 7520 * this is not needed. 7521 */ 7522 jremref = dotremref = dotdotremref = NULL; 7523 if (DOINGSUJ(dvp)) { 7524 if (isrmdir) { 7525 jremref = newjremref(dirrem, dp, ip, dp->i_offset, 7526 ip->i_effnlink + 2); 7527 dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, 7528 ip->i_effnlink + 1); 7529 dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, 7530 dp->i_effnlink + 1); 7531 dotdotremref->jr_state |= MKDIR_PARENT; 7532 } else 7533 jremref = newjremref(dirrem, dp, ip, dp->i_offset, 7534 ip->i_effnlink + 1); 7535 } 7536 ACQUIRE_LOCK(&lk); 7537 lbn = lblkno(dp->i_fs, dp->i_offset); 7538 offset = blkoff(dp->i_fs, dp->i_offset); 7539 if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC, 7540 &pagedep) == 0) 7541 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 7542 dirrem->dm_pagedep = pagedep; 7543 /* 7544 * If we're renaming a .. link to a new directory, cancel any 7545 * existing MKDIR_PARENT mkdir. If it has already been canceled 7546 * the jremref is preserved for any potential diradd in this 7547 * location. This can not coincide with a rmdir. 7548 */ 7549 if (dp->i_offset == DOTDOT_OFFSET) { 7550 if (isrmdir) 7551 panic("newdirrem: .. directory change during remove?"); 7552 jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); 7553 } 7554 /* 7555 * If we're removing a directory search for the .. dependency now and 7556 * cancel it. Any pending journal work will be added to the dirrem 7557 * to be completed when the workitem remove completes. 7558 */ 7559 if (isrmdir) 7560 dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); 7561 /* 7562 * Check for a diradd dependency for the same directory entry. 7563 * If present, then both dependencies become obsolete and can 7564 * be de-allocated. 7565 */ 7566 dap = diradd_lookup(pagedep, offset); 7567 if (dap == NULL) { 7568 /* 7569 * Link the jremref structures into the dirrem so they are 7570 * written prior to the pagedep. 7571 */ 7572 if (jremref) 7573 dirrem_journal(dirrem, jremref, dotremref, 7574 dotdotremref); 7575 return (dirrem); 7576 } 7577 /* 7578 * Must be ATTACHED at this point. 7579 */ 7580 if ((dap->da_state & ATTACHED) == 0) 7581 panic("newdirrem: not ATTACHED"); 7582 if (dap->da_newinum != ip->i_number) 7583 panic("newdirrem: inum %d should be %d", 7584 ip->i_number, dap->da_newinum); 7585 /* 7586 * If we are deleting a changed name that never made it to disk, 7587 * then return the dirrem describing the previous inode (which 7588 * represents the inode currently referenced from this entry on disk). 
7589 */ 7590 if ((dap->da_state & DIRCHG) != 0) { 7591 *prevdirremp = dap->da_previous; 7592 dap->da_state &= ~DIRCHG; 7593 dap->da_pagedep = pagedep; 7594 } 7595 /* 7596 * We are deleting an entry that never made it to disk. 7597 * Mark it COMPLETE so we can delete its inode immediately. 7598 */ 7599 dirrem->dm_state |= COMPLETE; 7600 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); 7601 #ifdef SUJ_DEBUG 7602 if (isrmdir == 0) { 7603 struct worklist *wk; 7604 7605 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 7606 if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) 7607 panic("bad wk %p (0x%X)\n", wk, wk->wk_state); 7608 } 7609 #endif 7610 7611 return (dirrem); 7612 } 7613 7614 /* 7615 * Directory entry change dependencies. 7616 * 7617 * Changing an existing directory entry requires that an add operation 7618 * be completed first followed by a deletion. The semantics for the addition 7619 * are identical to the description of adding a new entry above except 7620 * that the rollback is to the old inode number rather than zero. Once 7621 * the addition dependency is completed, the removal is done as described 7622 * in the removal routine above. 7623 */ 7624 7625 /* 7626 * This routine should be called immediately after changing 7627 * a directory entry. The inode's link count should not be 7628 * decremented by the calling procedure -- the soft updates 7629 * code will perform this task when it is safe. 7630 */ 7631 void 7632 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 7633 struct buf *bp; /* buffer containing directory block */ 7634 struct inode *dp; /* inode for the directory being modified */ 7635 struct inode *ip; /* inode for directory entry being removed */ 7636 ino_t newinum; /* new inode number for changed entry */ 7637 int isrmdir; /* indicates if doing RMDIR */ 7638 { 7639 int offset; 7640 struct diradd *dap = NULL; 7641 struct dirrem *dirrem, *prevdirrem; 7642 struct pagedep *pagedep; 7643 struct inodedep *inodedep; 7644 struct jaddref *jaddref; 7645 struct mount *mp; 7646 7647 offset = blkoff(dp->i_fs, dp->i_offset); 7648 mp = UFSTOVFS(dp->i_ump); 7649 7650 /* 7651 * Whiteouts do not need diradd dependencies. 7652 */ 7653 if (newinum != WINO) { 7654 dap = malloc(sizeof(struct diradd), 7655 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); 7656 workitem_alloc(&dap->da_list, D_DIRADD, mp); 7657 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; 7658 dap->da_offset = offset; 7659 dap->da_newinum = newinum; 7660 LIST_INIT(&dap->da_jwork); 7661 } 7662 7663 /* 7664 * Allocate a new dirrem and ACQUIRE_LOCK. 7665 */ 7666 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 7667 pagedep = dirrem->dm_pagedep; 7668 /* 7669 * The possible values for isrmdir: 7670 * 0 - non-directory file rename 7671 * 1 - directory rename within same directory 7672 * inum - directory rename to new directory of given inode number 7673 * When renaming to a new directory, we are both deleting and 7674 * creating a new directory entry, so the link count on the new 7675 * directory should not change. Thus we do not need the followup 7676 * dirrem which is usually done in handle_workitem_remove. We set 7677 * the DIRCHG flag to tell handle_workitem_remove to skip the 7678 * followup dirrem. 7679 */ 7680 if (isrmdir > 1) 7681 dirrem->dm_state |= DIRCHG; 7682 7683 /* 7684 * Whiteouts have no additional dependencies, 7685 * so just put the dirrem on the correct list. 
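 * A whiteout entry (WINO) carries no real inode reference, so no diradd
 * was allocated above; the dirrem alone is queued, either on the pagedep's
 * pd_dirremhd list to wait for the directory block or directly on the
 * worklist when it is already COMPLETE.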
7686 */ 7687 if (newinum == WINO) { 7688 if ((dirrem->dm_state & COMPLETE) == 0) { 7689 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, 7690 dm_next); 7691 } else { 7692 dirrem->dm_dirinum = pagedep->pd_ino; 7693 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 7694 add_to_worklist(&dirrem->dm_list, 0); 7695 } 7696 FREE_LOCK(&lk); 7697 return; 7698 } 7699 /* 7700 * Add the dirrem to the inodedep's pending remove list for quick 7701 * discovery later. A valid nlinkdelta ensures that this lookup 7702 * will not fail. 7703 */ 7704 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 7705 panic("softdep_setup_directory_change: Lost inodedep."); 7706 dirrem->dm_state |= ONDEPLIST; 7707 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 7708 7709 /* 7710 * If the COMPLETE flag is clear, then there were no active 7711 * entries and we want to roll back to the previous inode until 7712 * the new inode is committed to disk. If the COMPLETE flag is 7713 * set, then we have deleted an entry that never made it to disk. 7714 * If the entry we deleted resulted from a name change, then the old 7715 * inode reference still resides on disk. Any rollback that we do 7716 * needs to be to that old inode (returned to us in prevdirrem). If 7717 * the entry we deleted resulted from a create, then there is 7718 * no entry on the disk, so we want to roll back to zero rather 7719 * than the uncommitted inode. In either of the COMPLETE cases we 7720 * want to immediately free the unwritten and unreferenced inode. 7721 */ 7722 if ((dirrem->dm_state & COMPLETE) == 0) { 7723 dap->da_previous = dirrem; 7724 } else { 7725 if (prevdirrem != NULL) { 7726 dap->da_previous = prevdirrem; 7727 } else { 7728 dap->da_state &= ~DIRCHG; 7729 dap->da_pagedep = pagedep; 7730 } 7731 dirrem->dm_dirinum = pagedep->pd_ino; 7732 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 7733 add_to_worklist(&dirrem->dm_list, 0); 7734 } 7735 /* 7736 * Lookup the jaddref for this journal entry. We must finish 7737 * initializing it and make the diradd write dependent on it. 7738 * If we're not journaling, put it on the id_bufwait list if the inode 7739 * is not yet written. If it is written, do the post-inode write 7740 * processing to put it on the id_pendinghd list. 7741 */ 7742 inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); 7743 if (mp->mnt_kern_flag & MNTK_SUJ) { 7744 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 7745 inoreflst); 7746 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 7747 ("softdep_setup_directory_change: bad jaddref %p", 7748 jaddref)); 7749 jaddref->ja_diroff = dp->i_offset; 7750 jaddref->ja_diradd = dap; 7751 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], 7752 dap, da_pdlist); 7753 add_to_journal(&jaddref->ja_list); 7754 } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 7755 dap->da_state |= COMPLETE; 7756 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 7757 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 7758 } else { 7759 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], 7760 dap, da_pdlist); 7761 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 7762 } 7763 /* 7764 * If we're making a new name for a directory that has not been 7765 * committed we need to move the dot and dotdot references to 7766 * this new name. 7767 */ 7768 if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) 7769 merge_diradd(inodedep, dap); 7770 FREE_LOCK(&lk); 7771 } 7772 7773 /* 7774 * Called whenever the link count on an inode is changed.
7775 * It creates an inode dependency so that the new reference(s) 7776 * to the inode cannot be committed to disk until the updated 7777 * inode has been written. 7778 */ 7779 void 7780 softdep_change_linkcnt(ip) 7781 struct inode *ip; /* the inode with the increased link count */ 7782 { 7783 struct inodedep *inodedep; 7784 7785 ACQUIRE_LOCK(&lk); 7786 inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); 7787 if (ip->i_nlink < ip->i_effnlink) 7788 panic("softdep_change_linkcnt: bad delta"); 7789 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 7790 FREE_LOCK(&lk); 7791 } 7792 7793 /* 7794 * Attach a sbdep dependency to the superblock buf so that we can keep 7795 * track of the head of the linked list of referenced but unlinked inodes. 7796 */ 7797 void 7798 softdep_setup_sbupdate(ump, fs, bp) 7799 struct ufsmount *ump; 7800 struct fs *fs; 7801 struct buf *bp; 7802 { 7803 struct sbdep *sbdep; 7804 struct worklist *wk; 7805 7806 if ((fs->fs_flags & FS_SUJ) == 0) 7807 return; 7808 LIST_FOREACH(wk, &bp->b_dep, wk_list) 7809 if (wk->wk_type == D_SBDEP) 7810 break; 7811 if (wk != NULL) 7812 return; 7813 sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS); 7814 workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump)); 7815 sbdep->sb_fs = fs; 7816 sbdep->sb_ump = ump; 7817 ACQUIRE_LOCK(&lk); 7818 WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list); 7819 FREE_LOCK(&lk); 7820 } 7821 7822 /* 7823 * Return the first unlinked inodedep which is ready to be the head of the 7824 * list. The inodedep and all those after it must have valid next pointers. 7825 */ 7826 static struct inodedep * 7827 first_unlinked_inodedep(ump) 7828 struct ufsmount *ump; 7829 { 7830 struct inodedep *inodedep; 7831 struct inodedep *idp; 7832 7833 for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); 7834 inodedep; inodedep = idp) { 7835 if ((inodedep->id_state & UNLINKNEXT) == 0) 7836 return (NULL); 7837 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 7838 if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) 7839 break; 7840 if ((inodedep->id_state & UNLINKPREV) == 0) 7841 panic("first_unlinked_inodedep: prev != next"); 7842 } 7843 if (inodedep == NULL) 7844 return (NULL); 7845 7846 return (inodedep); 7847 } 7848 7849 /* 7850 * Set the sujfree unlinked head pointer prior to writing a superblock. 7851 */ 7852 static void 7853 initiate_write_sbdep(sbdep) 7854 struct sbdep *sbdep; 7855 { 7856 struct inodedep *inodedep; 7857 struct fs *bpfs; 7858 struct fs *fs; 7859 7860 bpfs = sbdep->sb_fs; 7861 fs = sbdep->sb_ump->um_fs; 7862 inodedep = first_unlinked_inodedep(sbdep->sb_ump); 7863 if (inodedep) { 7864 fs->fs_sujfree = inodedep->id_ino; 7865 inodedep->id_state |= UNLINKPREV; 7866 } else 7867 fs->fs_sujfree = 0; 7868 bpfs->fs_sujfree = fs->fs_sujfree; 7869 } 7870 7871 /* 7872 * After a superblock is written determine whether it must be written again 7873 * due to a changing unlinked list head. 
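 * A return value of 1 (after re-dirtying the buffer) means the recorded
 * fs_sujfree no longer matches the current list head and the superblock
 * must be rewritten; 0 means the head is stable and the sbdep is freed.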
7874 */ 7875 static int 7876 handle_written_sbdep(sbdep, bp) 7877 struct sbdep *sbdep; 7878 struct buf *bp; 7879 { 7880 struct inodedep *inodedep; 7881 struct mount *mp; 7882 struct fs *fs; 7883 7884 fs = sbdep->sb_fs; 7885 mp = UFSTOVFS(sbdep->sb_ump); 7886 inodedep = first_unlinked_inodedep(sbdep->sb_ump); 7887 if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || 7888 (inodedep == NULL && fs->fs_sujfree != 0)) { 7889 bdirty(bp); 7890 return (1); 7891 } 7892 WORKITEM_FREE(sbdep, D_SBDEP); 7893 if (fs->fs_sujfree == 0) 7894 return (0); 7895 if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0) 7896 panic("handle_written_sbdep: lost inodedep"); 7897 /* 7898 * Now that we have a record of this inode in stable store allow it 7899 * to be written to free up pending work. Inodes may see a lot of 7900 * write activity after they are unlinked which we must not hold up. 7901 */ 7902 for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { 7903 if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) 7904 panic("handle_written_sbdep: Bad inodedep %p (0x%X)", 7905 inodedep, inodedep->id_state); 7906 if (inodedep->id_state & UNLINKONLIST) 7907 break; 7908 inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST; 7909 } 7910 7911 return (0); 7912 } 7913 7914 /* 7915 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list. 7916 */ 7917 static void 7918 unlinked_inodedep(mp, inodedep) 7919 struct mount *mp; 7920 struct inodedep *inodedep; 7921 { 7922 struct ufsmount *ump; 7923 7924 if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) 7925 return; 7926 ump = VFSTOUFS(mp); 7927 ump->um_fs->fs_fmod = 1; 7928 inodedep->id_state |= UNLINKED; 7929 TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); 7930 } 7931 7932 /* 7933 * Remove an inodedep from the unlinked inodedep list. This may require 7934 * disk writes if the inode has made it that far. 7935 */ 7936 static void 7937 clear_unlinked_inodedep(inodedep) 7938 struct inodedep *inodedep; 7939 { 7940 struct ufsmount *ump; 7941 struct inodedep *idp; 7942 struct inodedep *idn; 7943 struct fs *fs; 7944 struct buf *bp; 7945 ino_t ino; 7946 ino_t nino; 7947 ino_t pino; 7948 int error; 7949 7950 ump = VFSTOUFS(inodedep->id_list.wk_mp); 7951 fs = ump->um_fs; 7952 ino = inodedep->id_ino; 7953 error = 0; 7954 for (;;) { 7955 /* 7956 * If nothing has yet been written simply remove us from 7957 * the in memory list and return. This is the most common 7958 * case where handle_workitem_remove() loses the final 7959 * reference. 7960 */ 7961 if ((inodedep->id_state & UNLINKLINKS) == 0) 7962 break; 7963 /* 7964 * If we have a NEXT pointer and no PREV pointer we can simply 7965 * clear NEXT's PREV and remove ourselves from the list. Be 7966 * careful not to clear PREV if the superblock points at 7967 * next as well. 7968 */ 7969 idn = TAILQ_NEXT(inodedep, id_unlinked); 7970 if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { 7971 if (idn && fs->fs_sujfree != idn->id_ino) 7972 idn->id_state &= ~UNLINKPREV; 7973 break; 7974 } 7975 /* 7976 * Here we have an inodedep which is actually linked into 7977 * the list. We must remove it by forcing a write to the 7978 * link before us, whether it be the superblock or an inode. 7979 * Unfortunately the list may change while we're waiting 7980 * on the buf lock for either resource so we must loop until 7981 * we lock the right one. If both the superblock and an 7982 * inode point to this inode we must clear the inode first 7983 * followed by the superblock. 
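 * The on-disk unlinked list is a singly linked chain rooted at
 * fs_sujfree and threaded through each dinode's di_freelink field,
 * roughly: fs_sujfree -> ino_a.di_freelink -> ino_b.di_freelink -> 0.
 * Unhooking an element therefore means rewriting either the superblock
 * or the predecessor's inode block.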
7984 */ 7985 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 7986 pino = 0; 7987 if (idp && (idp->id_state & UNLINKNEXT)) 7988 pino = idp->id_ino; 7989 FREE_LOCK(&lk); 7990 if (pino == 0) 7991 bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), 7992 (int)fs->fs_sbsize, 0, 0, 0); 7993 else 7994 error = bread(ump->um_devvp, 7995 fsbtodb(fs, ino_to_fsba(fs, pino)), 7996 (int)fs->fs_bsize, NOCRED, &bp); 7997 ACQUIRE_LOCK(&lk); 7998 if (error) 7999 break; 8000 /* If the list has changed restart the loop. */ 8001 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 8002 nino = 0; 8003 if (idp && (idp->id_state & UNLINKNEXT)) 8004 nino = idp->id_ino; 8005 if (nino != pino || 8006 (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { 8007 FREE_LOCK(&lk); 8008 brelse(bp); 8009 ACQUIRE_LOCK(&lk); 8010 continue; 8011 } 8012 /* 8013 * Remove us from the in memory list. After this we cannot 8014 * access the inodedep. 8015 */ 8016 idn = TAILQ_NEXT(inodedep, id_unlinked); 8017 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); 8018 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); 8019 /* 8020 * Determine the next inode number. 8021 */ 8022 nino = 0; 8023 if (idn) { 8024 /* 8025 * If next isn't on the list we can just clear prev's 8026 * state and schedule it to be fixed later. No need 8027 * to synchronously write if we're not in the real 8028 * list. 8029 */ 8030 if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) { 8031 idp->id_state &= ~UNLINKNEXT; 8032 if ((idp->id_state & ONWORKLIST) == 0) 8033 WORKLIST_INSERT(&bp->b_dep, 8034 &idp->id_list); 8035 FREE_LOCK(&lk); 8036 bawrite(bp); 8037 ACQUIRE_LOCK(&lk); 8038 return; 8039 } 8040 nino = idn->id_ino; 8041 } 8042 FREE_LOCK(&lk); 8043 /* 8044 * The predecessor's next pointer is manually updated here 8045 * so that the NEXT flag is never cleared for an element 8046 * that is in the list. 8047 */ 8048 if (pino == 0) { 8049 bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); 8050 ffs_oldfscompat_write((struct fs *)bp->b_data, ump); 8051 softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, 8052 bp); 8053 } else if (fs->fs_magic == FS_UFS1_MAGIC) 8054 ((struct ufs1_dinode *)bp->b_data + 8055 ino_to_fsbo(fs, pino))->di_freelink = nino; 8056 else 8057 ((struct ufs2_dinode *)bp->b_data + 8058 ino_to_fsbo(fs, pino))->di_freelink = nino; 8059 /* 8060 * If the bwrite fails we have no recourse to recover. The 8061 * filesystem is corrupted already. 8062 */ 8063 bwrite(bp); 8064 ACQUIRE_LOCK(&lk); 8065 /* 8066 * If the superblock pointer still needs to be cleared force 8067 * a write here. 8068 */ 8069 if (fs->fs_sujfree == ino) { 8070 FREE_LOCK(&lk); 8071 bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), 8072 (int)fs->fs_sbsize, 0, 0, 0); 8073 bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); 8074 ffs_oldfscompat_write((struct fs *)bp->b_data, ump); 8075 softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, 8076 bp); 8077 bwrite(bp); 8078 ACQUIRE_LOCK(&lk); 8079 } 8080 if (fs->fs_sujfree != ino) 8081 return; 8082 panic("clear_unlinked_inodedep: Failed to clear free head"); 8083 } 8084 if (inodedep->id_ino == fs->fs_sujfree) 8085 panic("clear_unlinked_inodedep: Freeing head of free list"); 8086 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); 8087 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); 8088 return; 8089 } 8090 8091 /* 8092 * This workitem decrements the inode's link count. 8093 * If the link count reaches zero, the file is removed. 
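 * For a plain file the link count drops by one; for a directory it drops
 * by two (the removed name and ".") and a follow-up dirrem is queued so
 * that the parent eventually loses its ".." reference as well.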
8094 */ 8095 static void 8096 handle_workitem_remove(dirrem, xp) 8097 struct dirrem *dirrem; 8098 struct vnode *xp; 8099 { 8100 struct inodedep *inodedep; 8101 struct workhead dotdotwk; 8102 struct worklist *wk; 8103 struct ufsmount *ump; 8104 struct mount *mp; 8105 struct vnode *vp; 8106 struct inode *ip; 8107 ino_t oldinum; 8108 int error; 8109 8110 if (dirrem->dm_state & ONWORKLIST) 8111 panic("handle_workitem_remove: dirrem %p still on worklist", 8112 dirrem); 8113 oldinum = dirrem->dm_oldinum; 8114 mp = dirrem->dm_list.wk_mp; 8115 ump = VFSTOUFS(mp); 8116 if ((vp = xp) == NULL && 8117 (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp, 8118 FFSV_FORCEINSMQ)) != 0) { 8119 softdep_error("handle_workitem_remove: vget", error); 8120 return; 8121 } 8122 ip = VTOI(vp); 8123 ACQUIRE_LOCK(&lk); 8124 if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) 8125 panic("handle_workitem_remove: lost inodedep"); 8126 if (dirrem->dm_state & ONDEPLIST) 8127 LIST_REMOVE(dirrem, dm_inonext); 8128 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), 8129 ("handle_workitem_remove: Journal entries not written.")); 8130 8131 /* 8132 * Move all dependencies waiting on the remove to complete 8133 * from the dirrem to the inode inowait list to be completed 8134 * after the inode has been updated and written to disk. Any 8135 * marked MKDIR_PARENT are saved to be completed when the .. ref 8136 * is removed. 8137 */ 8138 LIST_INIT(&dotdotwk); 8139 while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { 8140 WORKLIST_REMOVE(wk); 8141 if (wk->wk_state & MKDIR_PARENT) { 8142 wk->wk_state &= ~MKDIR_PARENT; 8143 WORKLIST_INSERT(&dotdotwk, wk); 8144 continue; 8145 } 8146 WORKLIST_INSERT(&inodedep->id_inowait, wk); 8147 } 8148 LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); 8149 /* 8150 * Normal file deletion. 8151 */ 8152 if ((dirrem->dm_state & RMDIR) == 0) { 8153 ip->i_nlink--; 8154 DIP_SET(ip, i_nlink, ip->i_nlink); 8155 ip->i_flag |= IN_CHANGE; 8156 if (ip->i_nlink < ip->i_effnlink) 8157 panic("handle_workitem_remove: bad file delta"); 8158 if (ip->i_nlink == 0) 8159 unlinked_inodedep(mp, inodedep); 8160 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 8161 KASSERT(LIST_EMPTY(&dirrem->dm_jwork), 8162 ("handle_workitem_remove: worklist not empty. %s", 8163 TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); 8164 WORKITEM_FREE(dirrem, D_DIRREM); 8165 FREE_LOCK(&lk); 8166 goto out; 8167 } 8168 /* 8169 * Directory deletion. Decrement reference count for both the 8170 * just deleted parent directory entry and the reference for ".". 8171 * Arrange to have the reference count on the parent decremented 8172 * to account for the loss of "..". 8173 */ 8174 ip->i_nlink -= 2; 8175 DIP_SET(ip, i_nlink, ip->i_nlink); 8176 ip->i_flag |= IN_CHANGE; 8177 if (ip->i_nlink < ip->i_effnlink) 8178 panic("handle_workitem_remove: bad dir delta"); 8179 if (ip->i_nlink == 0) 8180 unlinked_inodedep(mp, inodedep); 8181 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 8182 /* 8183 * Rename a directory to a new parent. Since, we are both deleting 8184 * and creating a new directory entry, the link count on the new 8185 * directory should not change. Thus we skip the followup dirrem. 
8186 */ 8187 if (dirrem->dm_state & DIRCHG) { 8188 KASSERT(LIST_EMPTY(&dirrem->dm_jwork), 8189 ("handle_workitem_remove: DIRCHG and worklist not empty.")); 8190 WORKITEM_FREE(dirrem, D_DIRREM); 8191 FREE_LOCK(&lk); 8192 goto out; 8193 } 8194 dirrem->dm_state = ONDEPLIST; 8195 dirrem->dm_oldinum = dirrem->dm_dirinum; 8196 /* 8197 * Place the dirrem on the parent's diremhd list. 8198 */ 8199 if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) 8200 panic("handle_workitem_remove: lost dir inodedep"); 8201 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 8202 /* 8203 * If the allocated inode has never been written to disk, then 8204 * the on-disk inode is zero'ed and we can remove the file 8205 * immediately. When journaling if the inode has been marked 8206 * unlinked and not DEPCOMPLETE we know it can never be written. 8207 */ 8208 inodedep_lookup(mp, oldinum, 0, &inodedep); 8209 if (inodedep == NULL || 8210 (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || 8211 check_inode_unwritten(inodedep)) { 8212 if (xp != NULL) 8213 add_to_worklist(&dirrem->dm_list, 0); 8214 FREE_LOCK(&lk); 8215 if (xp == NULL) { 8216 vput(vp); 8217 handle_workitem_remove(dirrem, NULL); 8218 } 8219 return; 8220 } 8221 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); 8222 FREE_LOCK(&lk); 8223 ip->i_flag |= IN_CHANGE; 8224 out: 8225 ffs_update(vp, 0); 8226 if (xp == NULL) 8227 vput(vp); 8228 } 8229 8230 /* 8231 * Inode de-allocation dependencies. 8232 * 8233 * When an inode's link count is reduced to zero, it can be de-allocated. We 8234 * found it convenient to postpone de-allocation until after the inode is 8235 * written to disk with its new link count (zero). At this point, all of the 8236 * on-disk inode's block pointers are nullified and, with careful dependency 8237 * list ordering, all dependencies related to the inode will be satisfied and 8238 * the corresponding dependency structures de-allocated. So, if/when the 8239 * inode is reused, there will be no mixing of old dependencies with new 8240 * ones. This artificial dependency is set up by the block de-allocation 8241 * procedure above (softdep_setup_freeblocks) and completed by the 8242 * following procedure. 8243 */ 8244 static void 8245 handle_workitem_freefile(freefile) 8246 struct freefile *freefile; 8247 { 8248 struct workhead wkhd; 8249 struct fs *fs; 8250 struct inodedep *idp; 8251 struct ufsmount *ump; 8252 int error; 8253 8254 ump = VFSTOUFS(freefile->fx_list.wk_mp); 8255 fs = ump->um_fs; 8256 #ifdef DEBUG 8257 ACQUIRE_LOCK(&lk); 8258 error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); 8259 FREE_LOCK(&lk); 8260 if (error) 8261 panic("handle_workitem_freefile: inodedep %p survived", idp); 8262 #endif 8263 UFS_LOCK(ump); 8264 fs->fs_pendinginodes -= 1; 8265 UFS_UNLOCK(ump); 8266 LIST_INIT(&wkhd); 8267 LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); 8268 if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, 8269 freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) 8270 softdep_error("handle_workitem_freefile", error); 8271 ACQUIRE_LOCK(&lk); 8272 WORKITEM_FREE(freefile, D_FREEFILE); 8273 FREE_LOCK(&lk); 8274 } 8275 8276 8277 /* 8278 * Helper function which unlinks marker element from work list and returns 8279 * the next element on the list. 
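 * This lets softdep_disk_io_initiation() keep its place in a buffer's
 * b_dep list across the calls below that may drop and reacquire lk.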
8280 */ 8281 static __inline struct worklist * 8282 markernext(struct worklist *marker) 8283 { 8284 struct worklist *next; 8285 8286 next = LIST_NEXT(marker, wk_list); 8287 LIST_REMOVE(marker, wk_list); 8288 return next; 8289 } 8290 8291 /* 8292 * Disk writes. 8293 * 8294 * The dependency structures constructed above are most actively used when file 8295 * system blocks are written to disk. No constraints are placed on when a 8296 * block can be written, but unsatisfied update dependencies are made safe by 8297 * modifying (or replacing) the source memory for the duration of the disk 8298 * write. When the disk write completes, the memory block is again brought 8299 * up-to-date. 8300 * 8301 * In-core inode structure reclamation. 8302 * 8303 * Because there are a finite number of "in-core" inode structures, they are 8304 * reused regularly. By transferring all inode-related dependencies to the 8305 * in-memory inode block and indexing them separately (via "inodedep"s), we 8306 * can allow "in-core" inode structures to be reused at any time and avoid 8307 * any increase in contention. 8308 * 8309 * Called just before entering the device driver to initiate a new disk I/O. 8310 * The buffer must be locked, thus, no I/O completion operations can occur 8311 * while we are manipulating its associated dependencies. 8312 */ 8313 static void 8314 softdep_disk_io_initiation(bp) 8315 struct buf *bp; /* structure describing disk write to occur */ 8316 { 8317 struct worklist *wk; 8318 struct worklist marker; 8319 struct inodedep *inodedep; 8320 struct freeblks *freeblks; 8321 struct jfreeblk *jfreeblk; 8322 struct newblk *newblk; 8323 8324 /* 8325 * We only care about write operations. There should never 8326 * be dependencies for reads. 8327 */ 8328 if (bp->b_iocmd != BIO_WRITE) 8329 panic("softdep_disk_io_initiation: not write"); 8330 8331 if (bp->b_vflags & BV_BKGRDINPROG) 8332 panic("softdep_disk_io_initiation: Writing buffer with " 8333 "background write in progress: %p", bp); 8334 8335 marker.wk_type = D_LAST + 1; /* Not a normal workitem */ 8336 PHOLD(curproc); /* Don't swap out kernel stack */ 8337 8338 ACQUIRE_LOCK(&lk); 8339 /* 8340 * Do any necessary pre-I/O processing. 8341 */ 8342 for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; 8343 wk = markernext(&marker)) { 8344 LIST_INSERT_AFTER(wk, &marker, wk_list); 8345 switch (wk->wk_type) { 8346 8347 case D_PAGEDEP: 8348 initiate_write_filepage(WK_PAGEDEP(wk), bp); 8349 continue; 8350 8351 case D_INODEDEP: 8352 inodedep = WK_INODEDEP(wk); 8353 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) 8354 initiate_write_inodeblock_ufs1(inodedep, bp); 8355 else 8356 initiate_write_inodeblock_ufs2(inodedep, bp); 8357 continue; 8358 8359 case D_INDIRDEP: 8360 initiate_write_indirdep(WK_INDIRDEP(wk), bp); 8361 continue; 8362 8363 case D_BMSAFEMAP: 8364 initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); 8365 continue; 8366 8367 case D_JSEG: 8368 WK_JSEG(wk)->js_buf = NULL; 8369 continue; 8370 8371 case D_FREEBLKS: 8372 freeblks = WK_FREEBLKS(wk); 8373 jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd); 8374 /* 8375 * We have to wait for the jfreeblks to be journaled 8376 * before we can write an inodeblock with updated 8377 * pointers. Be careful to arrange the marker so 8378 * we revisit the jfreeblk if it's not removed by 8379 * the first jwait(). 
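 * jwait() sleeps until the journal record reaches the disk, so the marker
 * is shifted to just before the current item; when the scan resumes it
 * re-examines this entry instead of skipping past it.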
8380 */ 8381 if (jfreeblk != NULL) { 8382 LIST_REMOVE(&marker, wk_list); 8383 LIST_INSERT_BEFORE(wk, &marker, wk_list); 8384 stat_jwait_freeblks++; 8385 jwait(&jfreeblk->jf_list); 8386 } 8387 continue; 8388 case D_ALLOCDIRECT: 8389 case D_ALLOCINDIR: 8390 /* 8391 * We have to wait for the jnewblk to be journaled 8392 * before we can write to a block if the contents 8393 * may be confused with an earlier file's indirect 8394 * at recovery time. Handle the marker as described 8395 * above. 8396 */ 8397 newblk = WK_NEWBLK(wk); 8398 if (newblk->nb_jnewblk != NULL && 8399 indirblk_inseg(newblk->nb_list.wk_mp, 8400 newblk->nb_newblkno)) { 8401 LIST_REMOVE(&marker, wk_list); 8402 LIST_INSERT_BEFORE(wk, &marker, wk_list); 8403 stat_jwait_newblk++; 8404 jwait(&newblk->nb_jnewblk->jn_list); 8405 } 8406 continue; 8407 8408 case D_SBDEP: 8409 initiate_write_sbdep(WK_SBDEP(wk)); 8410 continue; 8411 8412 case D_MKDIR: 8413 case D_FREEWORK: 8414 case D_FREEDEP: 8415 case D_JSEGDEP: 8416 continue; 8417 8418 default: 8419 panic("handle_disk_io_initiation: Unexpected type %s", 8420 TYPENAME(wk->wk_type)); 8421 /* NOTREACHED */ 8422 } 8423 } 8424 FREE_LOCK(&lk); 8425 PRELE(curproc); /* Allow swapout of kernel stack */ 8426 } 8427 8428 /* 8429 * Called from within the procedure above to deal with unsatisfied 8430 * allocation dependencies in a directory. The buffer must be locked, 8431 * thus, no I/O completion operations can occur while we are 8432 * manipulating its associated dependencies. 8433 */ 8434 static void 8435 initiate_write_filepage(pagedep, bp) 8436 struct pagedep *pagedep; 8437 struct buf *bp; 8438 { 8439 struct jremref *jremref; 8440 struct jmvref *jmvref; 8441 struct dirrem *dirrem; 8442 struct diradd *dap; 8443 struct direct *ep; 8444 int i; 8445 8446 if (pagedep->pd_state & IOSTARTED) { 8447 /* 8448 * This can only happen if there is a driver that does not 8449 * understand chaining. Here biodone will reissue the call 8450 * to strategy for the incomplete buffers. 8451 */ 8452 printf("initiate_write_filepage: already started\n"); 8453 return; 8454 } 8455 pagedep->pd_state |= IOSTARTED; 8456 /* 8457 * Wait for all journal remove dependencies to hit the disk. 8458 * We can not allow any potentially conflicting directory adds 8459 * to be visible before removes and rollback is too difficult. 8460 * lk may be dropped and re-acquired, however we hold the buf 8461 * locked so the dependency can not go away. 8462 */ 8463 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) 8464 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) { 8465 stat_jwait_filepage++; 8466 jwait(&jremref->jr_list); 8467 } 8468 while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) { 8469 stat_jwait_filepage++; 8470 jwait(&jmvref->jm_list); 8471 } 8472 for (i = 0; i < DAHASHSZ; i++) { 8473 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { 8474 ep = (struct direct *) 8475 ((char *)bp->b_data + dap->da_offset); 8476 if (ep->d_ino != dap->da_newinum) 8477 panic("%s: dir inum %d != new %d", 8478 "initiate_write_filepage", 8479 ep->d_ino, dap->da_newinum); 8480 if (dap->da_state & DIRCHG) 8481 ep->d_ino = dap->da_previous->dm_oldinum; 8482 else 8483 ep->d_ino = 0; 8484 dap->da_state &= ~ATTACHED; 8485 dap->da_state |= UNDONE; 8486 } 8487 } 8488 } 8489 8490 /* 8491 * Version of initiate_write_inodeblock that handles UFS1 dinodes. 8492 * Note that any bug fixes made to this routine must be done in the 8493 * version found below. 
8494 * 8495 * Called from within the procedure above to deal with unsatisfied 8496 * allocation dependencies in an inodeblock. The buffer must be 8497 * locked, thus, no I/O completion operations can occur while we 8498 * are manipulating its associated dependencies. 8499 */ 8500 static void 8501 initiate_write_inodeblock_ufs1(inodedep, bp) 8502 struct inodedep *inodedep; 8503 struct buf *bp; /* The inode block */ 8504 { 8505 struct allocdirect *adp, *lastadp; 8506 struct ufs1_dinode *dp; 8507 struct ufs1_dinode *sip; 8508 struct inoref *inoref; 8509 struct fs *fs; 8510 ufs_lbn_t i; 8511 #ifdef INVARIANTS 8512 ufs_lbn_t prevlbn = 0; 8513 #endif 8514 int deplist; 8515 8516 if (inodedep->id_state & IOSTARTED) 8517 panic("initiate_write_inodeblock_ufs1: already started"); 8518 inodedep->id_state |= IOSTARTED; 8519 fs = inodedep->id_fs; 8520 dp = (struct ufs1_dinode *)bp->b_data + 8521 ino_to_fsbo(fs, inodedep->id_ino); 8522 8523 /* 8524 * If we're on the unlinked list but have not yet written our 8525 * next pointer initialize it here. 8526 */ 8527 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 8528 struct inodedep *inon; 8529 8530 inon = TAILQ_NEXT(inodedep, id_unlinked); 8531 dp->di_freelink = inon ? inon->id_ino : 0; 8532 } 8533 /* 8534 * If the bitmap is not yet written, then the allocated 8535 * inode cannot be written to disk. 8536 */ 8537 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 8538 if (inodedep->id_savedino1 != NULL) 8539 panic("initiate_write_inodeblock_ufs1: I/O underway"); 8540 FREE_LOCK(&lk); 8541 sip = malloc(sizeof(struct ufs1_dinode), 8542 M_SAVEDINO, M_SOFTDEP_FLAGS); 8543 ACQUIRE_LOCK(&lk); 8544 inodedep->id_savedino1 = sip; 8545 *inodedep->id_savedino1 = *dp; 8546 bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); 8547 dp->di_gen = inodedep->id_savedino1->di_gen; 8548 dp->di_freelink = inodedep->id_savedino1->di_freelink; 8549 return; 8550 } 8551 /* 8552 * If no dependencies, then there is nothing to roll back. 8553 */ 8554 inodedep->id_savedsize = dp->di_size; 8555 inodedep->id_savedextsize = 0; 8556 inodedep->id_savednlink = dp->di_nlink; 8557 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 8558 TAILQ_EMPTY(&inodedep->id_inoreflst)) 8559 return; 8560 /* 8561 * Revert the link count to that of the first unwritten journal entry. 8562 */ 8563 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 8564 if (inoref) 8565 dp->di_nlink = inoref->if_nlink; 8566 /* 8567 * Set the dependencies to busy. 
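 * Each allocdirect moves from ATTACHED to UNDONE for the duration of the
 * write, and the loops below roll the corresponding disk pointers back to
 * their old values so the on-disk inode does not reference blocks whose
 * contents are not yet committed.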
8568 */ 8569 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 8570 adp = TAILQ_NEXT(adp, ad_next)) { 8571 #ifdef INVARIANTS 8572 if (deplist != 0 && prevlbn >= adp->ad_offset) 8573 panic("softdep_write_inodeblock: lbn order"); 8574 prevlbn = adp->ad_offset; 8575 if (adp->ad_offset < NDADDR && 8576 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 8577 panic("%s: direct pointer #%jd mismatch %d != %jd", 8578 "softdep_write_inodeblock", 8579 (intmax_t)adp->ad_offset, 8580 dp->di_db[adp->ad_offset], 8581 (intmax_t)adp->ad_newblkno); 8582 if (adp->ad_offset >= NDADDR && 8583 dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) 8584 panic("%s: indirect pointer #%jd mismatch %d != %jd", 8585 "softdep_write_inodeblock", 8586 (intmax_t)adp->ad_offset - NDADDR, 8587 dp->di_ib[adp->ad_offset - NDADDR], 8588 (intmax_t)adp->ad_newblkno); 8589 deplist |= 1 << adp->ad_offset; 8590 if ((adp->ad_state & ATTACHED) == 0) 8591 panic("softdep_write_inodeblock: Unknown state 0x%x", 8592 adp->ad_state); 8593 #endif /* INVARIANTS */ 8594 adp->ad_state &= ~ATTACHED; 8595 adp->ad_state |= UNDONE; 8596 } 8597 /* 8598 * The on-disk inode cannot claim to be any larger than the last 8599 * fragment that has been written. Otherwise, the on-disk inode 8600 * might have fragments that were not the last block in the file 8601 * which would corrupt the filesystem. 8602 */ 8603 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 8604 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 8605 if (adp->ad_offset >= NDADDR) 8606 break; 8607 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 8608 /* keep going until hitting a rollback to a frag */ 8609 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 8610 continue; 8611 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 8612 for (i = adp->ad_offset + 1; i < NDADDR; i++) { 8613 #ifdef INVARIANTS 8614 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 8615 panic("softdep_write_inodeblock: lost dep1"); 8616 #endif /* INVARIANTS */ 8617 dp->di_db[i] = 0; 8618 } 8619 for (i = 0; i < NIADDR; i++) { 8620 #ifdef INVARIANTS 8621 if (dp->di_ib[i] != 0 && 8622 (deplist & ((1 << NDADDR) << i)) == 0) 8623 panic("softdep_write_inodeblock: lost dep2"); 8624 #endif /* INVARIANTS */ 8625 dp->di_ib[i] = 0; 8626 } 8627 return; 8628 } 8629 /* 8630 * If we have zero'ed out the last allocated block of the file, 8631 * roll back the size to the last currently allocated block. 8632 * We know that this last allocated block is a full-sized as 8633 * we already checked for fragments in the loop above. 8634 */ 8635 if (lastadp != NULL && 8636 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 8637 for (i = lastadp->ad_offset; i >= 0; i--) 8638 if (dp->di_db[i] != 0) 8639 break; 8640 dp->di_size = (i + 1) * fs->fs_bsize; 8641 } 8642 /* 8643 * The only dependencies are for indirect blocks. 8644 * 8645 * The file size for indirect block additions is not guaranteed. 8646 * Such a guarantee would be non-trivial to achieve. The conventional 8647 * synchronous write implementation also does not make this guarantee. 8648 * Fsck should catch and fix discrepancies. Arguably, the file size 8649 * can be over-estimated without destroying integrity when the file 8650 * moves into the indirect blocks (i.e., is large). If we want to 8651 * postpone fsck, we are stuck with this argument. 
8652 */ 8653 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 8654 dp->di_ib[adp->ad_offset - NDADDR] = 0; 8655 } 8656 8657 /* 8658 * Version of initiate_write_inodeblock that handles UFS2 dinodes. 8659 * Note that any bug fixes made to this routine must be done in the 8660 * version found above. 8661 * 8662 * Called from within the procedure above to deal with unsatisfied 8663 * allocation dependencies in an inodeblock. The buffer must be 8664 * locked, thus, no I/O completion operations can occur while we 8665 * are manipulating its associated dependencies. 8666 */ 8667 static void 8668 initiate_write_inodeblock_ufs2(inodedep, bp) 8669 struct inodedep *inodedep; 8670 struct buf *bp; /* The inode block */ 8671 { 8672 struct allocdirect *adp, *lastadp; 8673 struct ufs2_dinode *dp; 8674 struct ufs2_dinode *sip; 8675 struct inoref *inoref; 8676 struct fs *fs; 8677 ufs_lbn_t i; 8678 #ifdef INVARIANTS 8679 ufs_lbn_t prevlbn = 0; 8680 #endif 8681 int deplist; 8682 8683 if (inodedep->id_state & IOSTARTED) 8684 panic("initiate_write_inodeblock_ufs2: already started"); 8685 inodedep->id_state |= IOSTARTED; 8686 fs = inodedep->id_fs; 8687 dp = (struct ufs2_dinode *)bp->b_data + 8688 ino_to_fsbo(fs, inodedep->id_ino); 8689 8690 /* 8691 * If we're on the unlinked list but have not yet written our 8692 * next pointer initialize it here. 8693 */ 8694 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 8695 struct inodedep *inon; 8696 8697 inon = TAILQ_NEXT(inodedep, id_unlinked); 8698 dp->di_freelink = inon ? inon->id_ino : 0; 8699 } 8700 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == 8701 (UNLINKED | UNLINKNEXT)) { 8702 struct inodedep *inon; 8703 ino_t freelink; 8704 8705 inon = TAILQ_NEXT(inodedep, id_unlinked); 8706 freelink = inon ? inon->id_ino : 0; 8707 if (freelink != dp->di_freelink) 8708 panic("ino %p(0x%X) %d, %d != %d", 8709 inodedep, inodedep->id_state, inodedep->id_ino, 8710 freelink, dp->di_freelink); 8711 } 8712 /* 8713 * If the bitmap is not yet written, then the allocated 8714 * inode cannot be written to disk. 8715 */ 8716 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 8717 if (inodedep->id_savedino2 != NULL) 8718 panic("initiate_write_inodeblock_ufs2: I/O underway"); 8719 FREE_LOCK(&lk); 8720 sip = malloc(sizeof(struct ufs2_dinode), 8721 M_SAVEDINO, M_SOFTDEP_FLAGS); 8722 ACQUIRE_LOCK(&lk); 8723 inodedep->id_savedino2 = sip; 8724 *inodedep->id_savedino2 = *dp; 8725 bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); 8726 dp->di_gen = inodedep->id_savedino2->di_gen; 8727 dp->di_freelink = inodedep->id_savedino2->di_freelink; 8728 return; 8729 } 8730 /* 8731 * If no dependencies, then there is nothing to roll back. 8732 */ 8733 inodedep->id_savedsize = dp->di_size; 8734 inodedep->id_savedextsize = dp->di_extsize; 8735 inodedep->id_savednlink = dp->di_nlink; 8736 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 8737 TAILQ_EMPTY(&inodedep->id_extupdt) && 8738 TAILQ_EMPTY(&inodedep->id_inoreflst)) 8739 return; 8740 /* 8741 * Revert the link count to that of the first unwritten journal entry. 8742 */ 8743 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 8744 if (inoref) 8745 dp->di_nlink = inoref->if_nlink; 8746 8747 /* 8748 * Set the ext data dependencies to busy. 
8749 */ 8750 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 8751 adp = TAILQ_NEXT(adp, ad_next)) { 8752 #ifdef INVARIANTS 8753 if (deplist != 0 && prevlbn >= adp->ad_offset) 8754 panic("softdep_write_inodeblock: lbn order"); 8755 prevlbn = adp->ad_offset; 8756 if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) 8757 panic("%s: direct pointer #%jd mismatch %jd != %jd", 8758 "softdep_write_inodeblock", 8759 (intmax_t)adp->ad_offset, 8760 (intmax_t)dp->di_extb[adp->ad_offset], 8761 (intmax_t)adp->ad_newblkno); 8762 deplist |= 1 << adp->ad_offset; 8763 if ((adp->ad_state & ATTACHED) == 0) 8764 panic("softdep_write_inodeblock: Unknown state 0x%x", 8765 adp->ad_state); 8766 #endif /* INVARIANTS */ 8767 adp->ad_state &= ~ATTACHED; 8768 adp->ad_state |= UNDONE; 8769 } 8770 /* 8771 * The on-disk inode cannot claim to be any larger than the last 8772 * fragment that has been written. Otherwise, the on-disk inode 8773 * might have fragments that were not the last block in the ext 8774 * data which would corrupt the filesystem. 8775 */ 8776 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 8777 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 8778 dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; 8779 /* keep going until hitting a rollback to a frag */ 8780 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 8781 continue; 8782 dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 8783 for (i = adp->ad_offset + 1; i < NXADDR; i++) { 8784 #ifdef INVARIANTS 8785 if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) 8786 panic("softdep_write_inodeblock: lost dep1"); 8787 #endif /* INVARIANTS */ 8788 dp->di_extb[i] = 0; 8789 } 8790 lastadp = NULL; 8791 break; 8792 } 8793 /* 8794 * If we have zero'ed out the last allocated block of the ext 8795 * data, roll back the size to the last currently allocated block. 8796 * We know that this last allocated block is a full-sized as 8797 * we already checked for fragments in the loop above. 8798 */ 8799 if (lastadp != NULL && 8800 dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 8801 for (i = lastadp->ad_offset; i >= 0; i--) 8802 if (dp->di_extb[i] != 0) 8803 break; 8804 dp->di_extsize = (i + 1) * fs->fs_bsize; 8805 } 8806 /* 8807 * Set the file data dependencies to busy. 8808 */ 8809 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 8810 adp = TAILQ_NEXT(adp, ad_next)) { 8811 #ifdef INVARIANTS 8812 if (deplist != 0 && prevlbn >= adp->ad_offset) 8813 panic("softdep_write_inodeblock: lbn order"); 8814 prevlbn = adp->ad_offset; 8815 if (adp->ad_offset < NDADDR && 8816 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 8817 panic("%s: direct pointer #%jd mismatch %jd != %jd", 8818 "softdep_write_inodeblock", 8819 (intmax_t)adp->ad_offset, 8820 (intmax_t)dp->di_db[adp->ad_offset], 8821 (intmax_t)adp->ad_newblkno); 8822 if (adp->ad_offset >= NDADDR && 8823 dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) 8824 panic("%s indirect pointer #%jd mismatch %jd != %jd", 8825 "softdep_write_inodeblock:", 8826 (intmax_t)adp->ad_offset - NDADDR, 8827 (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], 8828 (intmax_t)adp->ad_newblkno); 8829 deplist |= 1 << adp->ad_offset; 8830 if ((adp->ad_state & ATTACHED) == 0) 8831 panic("softdep_write_inodeblock: Unknown state 0x%x", 8832 adp->ad_state); 8833 #endif /* INVARIANTS */ 8834 adp->ad_state &= ~ATTACHED; 8835 adp->ad_state |= UNDONE; 8836 } 8837 /* 8838 * The on-disk inode cannot claim to be any larger than the last 8839 * fragment that has been written. 
Otherwise, the on-disk inode 8840 * might have fragments that were not the last block in the file 8841 * which would corrupt the filesystem. 8842 */ 8843 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 8844 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 8845 if (adp->ad_offset >= NDADDR) 8846 break; 8847 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 8848 /* keep going until hitting a rollback to a frag */ 8849 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 8850 continue; 8851 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 8852 for (i = adp->ad_offset + 1; i < NDADDR; i++) { 8853 #ifdef INVARIANTS 8854 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 8855 panic("softdep_write_inodeblock: lost dep2"); 8856 #endif /* INVARIANTS */ 8857 dp->di_db[i] = 0; 8858 } 8859 for (i = 0; i < NIADDR; i++) { 8860 #ifdef INVARIANTS 8861 if (dp->di_ib[i] != 0 && 8862 (deplist & ((1 << NDADDR) << i)) == 0) 8863 panic("softdep_write_inodeblock: lost dep3"); 8864 #endif /* INVARIANTS */ 8865 dp->di_ib[i] = 0; 8866 } 8867 return; 8868 } 8869 /* 8870 * If we have zero'ed out the last allocated block of the file, 8871 * roll back the size to the last currently allocated block. 8872 * We know that this last allocated block is a full-sized as 8873 * we already checked for fragments in the loop above. 8874 */ 8875 if (lastadp != NULL && 8876 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 8877 for (i = lastadp->ad_offset; i >= 0; i--) 8878 if (dp->di_db[i] != 0) 8879 break; 8880 dp->di_size = (i + 1) * fs->fs_bsize; 8881 } 8882 /* 8883 * The only dependencies are for indirect blocks. 8884 * 8885 * The file size for indirect block additions is not guaranteed. 8886 * Such a guarantee would be non-trivial to achieve. The conventional 8887 * synchronous write implementation also does not make this guarantee. 8888 * Fsck should catch and fix discrepancies. Arguably, the file size 8889 * can be over-estimated without destroying integrity when the file 8890 * moves into the indirect blocks (i.e., is large). If we want to 8891 * postpone fsck, we are stuck with this argument. 8892 */ 8893 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 8894 dp->di_ib[adp->ad_offset - NDADDR] = 0; 8895 } 8896 8897 /* 8898 * Cancel an indirdep as a result of truncation. Release all of the 8899 * children allocindirs and place their journal work on the appropriate 8900 * list. 8901 */ 8902 static void 8903 cancel_indirdep(indirdep, bp, inodedep, freeblks) 8904 struct indirdep *indirdep; 8905 struct buf *bp; 8906 struct inodedep *inodedep; 8907 struct freeblks *freeblks; 8908 { 8909 struct allocindir *aip; 8910 8911 /* 8912 * None of the indirect pointers will ever be visible, 8913 * so they can simply be tossed. GOINGAWAY ensures 8914 * that allocated pointers will be saved in the buffer 8915 * cache until they are freed. Note that they will 8916 * only be able to be found by their physical address 8917 * since the inode mapping the logical address will 8918 * be gone. The save buffer used for the safe copy 8919 * was allocated in setup_allocindir_phase2 using 8920 * the physical address so it could be used for this 8921 * purpose. Hence we swap the safe copy with the real 8922 * copy, allowing the safe copy to be freed and holding 8923 * on to the real copy for later use in indir_trunc. 
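 * Once the copy is made the indirdep itself is moved onto the save
 * buffer's dependency list so that the preserved pointers remain
 * reachable after this buffer is released.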
8924 */ 8925 if (indirdep->ir_state & GOINGAWAY) 8926 panic("cancel_indirdep: already gone"); 8927 if (indirdep->ir_state & ONDEPLIST) { 8928 indirdep->ir_state &= ~ONDEPLIST; 8929 LIST_REMOVE(indirdep, ir_next); 8930 } 8931 indirdep->ir_state |= GOINGAWAY; 8932 VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1; 8933 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) 8934 cancel_allocindir(aip, inodedep, freeblks); 8935 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) 8936 cancel_allocindir(aip, inodedep, freeblks); 8937 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) 8938 cancel_allocindir(aip, inodedep, freeblks); 8939 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0) 8940 cancel_allocindir(aip, inodedep, freeblks); 8941 bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); 8942 WORKLIST_REMOVE(&indirdep->ir_list); 8943 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); 8944 indirdep->ir_savebp = NULL; 8945 } 8946 8947 /* 8948 * Free an indirdep once it no longer has new pointers to track. 8949 */ 8950 static void 8951 free_indirdep(indirdep) 8952 struct indirdep *indirdep; 8953 { 8954 8955 KASSERT(LIST_EMPTY(&indirdep->ir_jwork), 8956 ("free_indirdep: Journal work not empty.")); 8957 KASSERT(LIST_EMPTY(&indirdep->ir_jnewblkhd), 8958 ("free_indirdep: Journal new block list not empty.")); 8959 KASSERT(LIST_EMPTY(&indirdep->ir_completehd), 8960 ("free_indirdep: Complete head not empty.")); 8961 KASSERT(LIST_EMPTY(&indirdep->ir_writehd), 8962 ("free_indirdep: write head not empty.")); 8963 KASSERT(LIST_EMPTY(&indirdep->ir_donehd), 8964 ("free_indirdep: done head not empty.")); 8965 KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), 8966 ("free_indirdep: deplist head not empty.")); 8967 KASSERT(indirdep->ir_savebp == NULL, 8968 ("free_indirdep: %p ir_savebp != NULL", indirdep)); 8969 KASSERT((indirdep->ir_state & ONDEPLIST) == 0, 8970 ("free_indirdep: %p still on deplist.", indirdep)); 8971 if (indirdep->ir_state & ONWORKLIST) 8972 WORKLIST_REMOVE(&indirdep->ir_list); 8973 WORKITEM_FREE(indirdep, D_INDIRDEP); 8974 } 8975 8976 /* 8977 * Called before a write to an indirdep. This routine is responsible for 8978 * rolling back pointers to a safe state which includes only those 8979 * allocindirs which have been completed. 8980 */ 8981 static void 8982 initiate_write_indirdep(indirdep, bp) 8983 struct indirdep *indirdep; 8984 struct buf *bp; 8985 { 8986 8987 if (indirdep->ir_state & GOINGAWAY) 8988 panic("disk_io_initiation: indirdep gone"); 8989 8990 /* 8991 * If there are no remaining dependencies, this will be writing 8992 * the real pointers. 8993 */ 8994 if (LIST_EMPTY(&indirdep->ir_deplisthd)) 8995 return; 8996 /* 8997 * Replace up-to-date version with safe version. 8998 */ 8999 FREE_LOCK(&lk); 9000 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, 9001 M_SOFTDEP_FLAGS); 9002 ACQUIRE_LOCK(&lk); 9003 indirdep->ir_state &= ~ATTACHED; 9004 indirdep->ir_state |= UNDONE; 9005 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 9006 bcopy(indirdep->ir_savebp->b_data, bp->b_data, 9007 bp->b_bcount); 9008 } 9009 9010 /* 9011 * Called when an inode has been cleared in a cg bitmap. 
This finally 9012 * eliminates any canceled jaddrefs 9013 */ 9014 void 9015 softdep_setup_inofree(mp, bp, ino, wkhd) 9016 struct mount *mp; 9017 struct buf *bp; 9018 ino_t ino; 9019 struct workhead *wkhd; 9020 { 9021 struct worklist *wk, *wkn; 9022 struct inodedep *inodedep; 9023 uint8_t *inosused; 9024 struct cg *cgp; 9025 struct fs *fs; 9026 9027 ACQUIRE_LOCK(&lk); 9028 fs = VFSTOUFS(mp)->um_fs; 9029 cgp = (struct cg *)bp->b_data; 9030 inosused = cg_inosused(cgp); 9031 if (isset(inosused, ino % fs->fs_ipg)) 9032 panic("softdep_setup_inofree: inode %d not freed.", ino); 9033 if (inodedep_lookup(mp, ino, 0, &inodedep)) 9034 panic("softdep_setup_inofree: ino %d has existing inodedep %p", 9035 ino, inodedep); 9036 if (wkhd) { 9037 LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { 9038 if (wk->wk_type != D_JADDREF) 9039 continue; 9040 WORKLIST_REMOVE(wk); 9041 /* 9042 * We can free immediately even if the jaddref 9043 * isn't attached in a background write as now 9044 * the bitmaps are reconciled. 9045 */ 9046 wk->wk_state |= COMPLETE | ATTACHED; 9047 free_jaddref(WK_JADDREF(wk)); 9048 } 9049 jwork_move(&bp->b_dep, wkhd); 9050 } 9051 FREE_LOCK(&lk); 9052 } 9053 9054 9055 /* 9056 * Called via ffs_blkfree() after a set of frags has been cleared from a cg 9057 * map. Any dependencies waiting for the write to clear are added to the 9058 * buf's list and any jnewblks that are being canceled are discarded 9059 * immediately. 9060 */ 9061 void 9062 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 9063 struct mount *mp; 9064 struct buf *bp; 9065 ufs2_daddr_t blkno; 9066 int frags; 9067 struct workhead *wkhd; 9068 { 9069 struct jnewblk *jnewblk; 9070 struct worklist *wk, *wkn; 9071 #ifdef SUJ_DEBUG 9072 struct bmsafemap *bmsafemap; 9073 struct fs *fs; 9074 uint8_t *blksfree; 9075 struct cg *cgp; 9076 ufs2_daddr_t jstart; 9077 ufs2_daddr_t jend; 9078 ufs2_daddr_t end; 9079 long bno; 9080 int i; 9081 #endif 9082 9083 ACQUIRE_LOCK(&lk); 9084 /* 9085 * Detach any jnewblks which have been canceled. They must linger 9086 * until the bitmap is cleared again by ffs_blkfree() to prevent 9087 * an unjournaled allocation from hitting the disk. 9088 */ 9089 if (wkhd) { 9090 LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { 9091 if (wk->wk_type != D_JNEWBLK) 9092 continue; 9093 jnewblk = WK_JNEWBLK(wk); 9094 KASSERT(jnewblk->jn_state & GOINGAWAY, 9095 ("softdep_setup_blkfree: jnewblk not canceled.")); 9096 WORKLIST_REMOVE(wk); 9097 #ifdef SUJ_DEBUG 9098 /* 9099 * Assert that this block is free in the bitmap 9100 * before we discard the jnewblk. 9101 */ 9102 fs = VFSTOUFS(mp)->um_fs; 9103 cgp = (struct cg *)bp->b_data; 9104 blksfree = cg_blksfree(cgp); 9105 bno = dtogd(fs, jnewblk->jn_blkno); 9106 for (i = jnewblk->jn_oldfrags; 9107 i < jnewblk->jn_frags; i++) { 9108 if (isset(blksfree, bno + i)) 9109 continue; 9110 panic("softdep_setup_blkfree: not free"); 9111 } 9112 #endif 9113 /* 9114 * Even if it's not attached we can free immediately 9115 * as the new bitmap is correct. 9116 */ 9117 wk->wk_state |= COMPLETE | ATTACHED; 9118 free_jnewblk(jnewblk); 9119 } 9120 /* 9121 * The buf must be locked by the caller otherwise these could 9122 * be added while it's being written and the write would 9123 * complete them before they made it to disk. 9124 */ 9125 jwork_move(&bp->b_dep, wkhd); 9126 } 9127 9128 #ifdef SUJ_DEBUG 9129 /* 9130 * Assert that we are not freeing a block which has an outstanding 9131 * allocation dependency. 
9132 */ 9133 fs = VFSTOUFS(mp)->um_fs; 9134 bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno)); 9135 end = blkno + frags; 9136 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { 9137 /* 9138 * Don't match against blocks that will be freed when the 9139 * background write is done. 9140 */ 9141 if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == 9142 (COMPLETE | DEPCOMPLETE)) 9143 continue; 9144 jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; 9145 jend = jnewblk->jn_blkno + jnewblk->jn_frags; 9146 if ((blkno >= jstart && blkno < jend) || 9147 (end > jstart && end <= jend)) { 9148 printf("state 0x%X %jd - %d %d dep %p\n", 9149 jnewblk->jn_state, jnewblk->jn_blkno, 9150 jnewblk->jn_oldfrags, jnewblk->jn_frags, 9151 jnewblk->jn_dep); 9152 panic("softdep_setup_blkfree: " 9153 "%jd-%jd(%d) overlaps with %jd-%jd", 9154 blkno, end, frags, jstart, jend); 9155 } 9156 } 9157 #endif 9158 FREE_LOCK(&lk); 9159 } 9160 9161 static void 9162 initiate_write_bmsafemap(bmsafemap, bp) 9163 struct bmsafemap *bmsafemap; 9164 struct buf *bp; /* The cg block. */ 9165 { 9166 struct jaddref *jaddref; 9167 struct jnewblk *jnewblk; 9168 uint8_t *inosused; 9169 uint8_t *blksfree; 9170 struct cg *cgp; 9171 struct fs *fs; 9172 int cleared; 9173 ino_t ino; 9174 long bno; 9175 int i; 9176 9177 if (bmsafemap->sm_state & IOSTARTED) 9178 panic("initiate_write_bmsafemap: Already started\n"); 9179 bmsafemap->sm_state |= IOSTARTED; 9180 /* 9181 * Clear any inode allocations which are pending journal writes. 9182 */ 9183 if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { 9184 cgp = (struct cg *)bp->b_data; 9185 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 9186 inosused = cg_inosused(cgp); 9187 LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { 9188 ino = jaddref->ja_ino % fs->fs_ipg; 9189 /* 9190 * If this is a background copy the inode may not 9191 * be marked used yet. 9192 */ 9193 if (isset(inosused, ino)) { 9194 if ((jaddref->ja_mode & IFMT) == IFDIR) 9195 cgp->cg_cs.cs_ndir--; 9196 cgp->cg_cs.cs_nifree++; 9197 clrbit(inosused, ino); 9198 jaddref->ja_state &= ~ATTACHED; 9199 jaddref->ja_state |= UNDONE; 9200 stat_jaddref++; 9201 } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) 9202 panic("initiate_write_bmsafemap: inode %d " 9203 "marked free", jaddref->ja_ino); 9204 } 9205 } 9206 /* 9207 * Clear any block allocations which are pending journal writes. 9208 */ 9209 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 9210 cgp = (struct cg *)bp->b_data; 9211 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 9212 blksfree = cg_blksfree(cgp); 9213 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { 9214 bno = dtogd(fs, jnewblk->jn_blkno); 9215 cleared = 0; 9216 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 9217 i++) { 9218 if (isclr(blksfree, bno + i)) { 9219 cleared = 1; 9220 setbit(blksfree, bno + i); 9221 } 9222 } 9223 /* 9224 * We may not clear the block if it's a background 9225 * copy. In that case there is no reason to detach 9226 * it. 9227 */ 9228 if (cleared) { 9229 stat_jnewblk++; 9230 jnewblk->jn_state &= ~ATTACHED; 9231 jnewblk->jn_state |= UNDONE; 9232 } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) 9233 panic("initiate_write_bmsafemap: block %jd " 9234 "marked free", jnewblk->jn_blkno); 9235 } 9236 } 9237 /* 9238 * Move allocation lists to the written lists so they can be 9239 * cleared once the block write is complete. 
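 * (Editor's note: sm_inodedephd is swapped with sm_inodedepwr and
 * sm_newblkhd with sm_newblkwr; handle_written_bmsafemap below drains
 * the written lists once the cg buffer write completes.)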
9240 */ 9241 LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, 9242 inodedep, id_deps); 9243 LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, 9244 newblk, nb_deps); 9245 } 9246 9247 /* 9248 * This routine is called during the completion interrupt 9249 * service routine for a disk write (from the procedure called 9250 * by the device driver to inform the filesystem caches of 9251 * a request completion). It should be called early in this 9252 * procedure, before the block is made available to other 9253 * processes or other routines are called. 9254 * 9255 */ 9256 static void 9257 softdep_disk_write_complete(bp) 9258 struct buf *bp; /* describes the completed disk write */ 9259 { 9260 struct worklist *wk; 9261 struct worklist *owk; 9262 struct workhead reattach; 9263 struct buf *sbp; 9264 9265 /* 9266 * If an error occurred while doing the write, then the data 9267 * has not hit the disk and the dependencies cannot be unrolled. 9268 */ 9269 if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) 9270 return; 9271 LIST_INIT(&reattach); 9272 /* 9273 * This lock must not be released anywhere in this code segment. 9274 */ 9275 sbp = NULL; 9276 owk = NULL; 9277 ACQUIRE_LOCK(&lk); 9278 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 9279 WORKLIST_REMOVE(wk); 9280 if (wk == owk) 9281 panic("duplicate worklist: %p\n", wk); 9282 owk = wk; 9283 switch (wk->wk_type) { 9284 9285 case D_PAGEDEP: 9286 if (handle_written_filepage(WK_PAGEDEP(wk), bp)) 9287 WORKLIST_INSERT(&reattach, wk); 9288 continue; 9289 9290 case D_INODEDEP: 9291 if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) 9292 WORKLIST_INSERT(&reattach, wk); 9293 continue; 9294 9295 case D_BMSAFEMAP: 9296 if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp)) 9297 WORKLIST_INSERT(&reattach, wk); 9298 continue; 9299 9300 case D_MKDIR: 9301 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 9302 continue; 9303 9304 case D_ALLOCDIRECT: 9305 wk->wk_state |= COMPLETE; 9306 handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); 9307 continue; 9308 9309 case D_ALLOCINDIR: 9310 wk->wk_state |= COMPLETE; 9311 handle_allocindir_partdone(WK_ALLOCINDIR(wk)); 9312 continue; 9313 9314 case D_INDIRDEP: 9315 if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp)) 9316 WORKLIST_INSERT(&reattach, wk); 9317 continue; 9318 9319 case D_FREEBLKS: 9320 wk->wk_state |= COMPLETE; 9321 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) 9322 add_to_worklist(wk, 1); 9323 continue; 9324 9325 case D_FREEWORK: 9326 handle_written_freework(WK_FREEWORK(wk)); 9327 break; 9328 9329 case D_FREEDEP: 9330 free_freedep(WK_FREEDEP(wk)); 9331 continue; 9332 9333 case D_JSEGDEP: 9334 free_jsegdep(WK_JSEGDEP(wk)); 9335 continue; 9336 9337 case D_JSEG: 9338 handle_written_jseg(WK_JSEG(wk), bp); 9339 continue; 9340 9341 case D_SBDEP: 9342 if (handle_written_sbdep(WK_SBDEP(wk), bp)) 9343 WORKLIST_INSERT(&reattach, wk); 9344 continue; 9345 9346 default: 9347 panic("handle_disk_write_complete: Unknown type %s", 9348 TYPENAME(wk->wk_type)); 9349 /* NOTREACHED */ 9350 } 9351 } 9352 /* 9353 * Reattach any requests that must be redone. 9354 */ 9355 while ((wk = LIST_FIRST(&reattach)) != NULL) { 9356 WORKLIST_REMOVE(wk); 9357 WORKLIST_INSERT(&bp->b_dep, wk); 9358 } 9359 FREE_LOCK(&lk); 9360 if (sbp) 9361 brelse(sbp); 9362 } 9363 9364 /* 9365 * Called from within softdep_disk_write_complete above. Note that 9366 * this routine is always called from interrupt level with further 9367 * splbio interrupts blocked. 
9368 */
9369 static void
9370 handle_allocdirect_partdone(adp, wkhd)
9371 struct allocdirect *adp; /* the completed allocdirect */
9372 struct workhead *wkhd; /* Work to do when inode is written. */
9373 {
9374 struct allocdirectlst *listhead;
9375 struct allocdirect *listadp;
9376 struct inodedep *inodedep;
9377 long bsize;
9378
9379 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
9380 return;
9381 /*
9382 * The on-disk inode cannot claim to be any larger than the last
9383 * fragment that has been written. Otherwise, the on-disk inode
9384 * might have fragments that were not the last block in the file
9385 * which would corrupt the filesystem. Thus, we cannot free any
9386 * allocdirects after one whose ad_oldblkno claims a fragment as
9387 * these blocks must be rolled back to zero before writing the inode.
9388 * We check the currently active set of allocdirects in id_inoupdt
9389 * or id_extupdt as appropriate.
9390 */
9391 inodedep = adp->ad_inodedep;
9392 bsize = inodedep->id_fs->fs_bsize;
9393 if (adp->ad_state & EXTDATA)
9394 listhead = &inodedep->id_extupdt;
9395 else
9396 listhead = &inodedep->id_inoupdt;
9397 TAILQ_FOREACH(listadp, listhead, ad_next) {
9398 /* found our block */
9399 if (listadp == adp)
9400 break;
9401 /* continue if ad_oldlbn is not a fragment */
9402 if (listadp->ad_oldsize == 0 ||
9403 listadp->ad_oldsize == bsize)
9404 continue;
9405 /* hit a fragment */
9406 return;
9407 }
9408 /*
9409 * If we have reached the end of the current list without
9410 * finding the just finished dependency, then it must be
9411 * on the future dependency list. Future dependencies cannot
9412 * be freed until they are moved to the current list.
9413 */
9414 if (listadp == NULL) {
9415 #ifdef DEBUG
9416 if (adp->ad_state & EXTDATA)
9417 listhead = &inodedep->id_newextupdt;
9418 else
9419 listhead = &inodedep->id_newinoupdt;
9420 TAILQ_FOREACH(listadp, listhead, ad_next)
9421 /* found our block */
9422 if (listadp == adp)
9423 break;
9424 if (listadp == NULL)
9425 panic("handle_allocdirect_partdone: lost dep");
9426 #endif /* DEBUG */
9427 return;
9428 }
9429 /*
9430 * If we have found the just finished dependency, then queue
9431 * it along with anything that follows it that is complete.
9432 * Since the pointer has not yet been written in the inode
9433 * as the dependency prevents it, place the allocdirect on the
9434 * bufwait list where it will be freed once the pointer is
9435 * valid.
9436 */
9437 if (wkhd == NULL)
9438 wkhd = &inodedep->id_bufwait;
9439 for (; adp; adp = listadp) {
9440 listadp = TAILQ_NEXT(adp, ad_next);
9441 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
9442 return;
9443 TAILQ_REMOVE(listhead, adp, ad_next);
9444 WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
9445 }
9446 }
9447
9448 /*
9449 * Called from within softdep_disk_write_complete above. This routine
9450 * completes successfully written allocindirs.
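 * If the indirect block is currently rolled back (UNDONE), the
 * allocindir is parked on ir_donehd; otherwise its new block number is
 * written into the saved copy of the indirect block and it waits on
 * ir_writehd until that pointer write reaches the disk.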
9451 */ 9452 static void 9453 handle_allocindir_partdone(aip) 9454 struct allocindir *aip; /* the completed allocindir */ 9455 { 9456 struct indirdep *indirdep; 9457 9458 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) 9459 return; 9460 indirdep = aip->ai_indirdep; 9461 LIST_REMOVE(aip, ai_next); 9462 if (indirdep->ir_state & UNDONE) { 9463 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); 9464 return; 9465 } 9466 if (indirdep->ir_state & UFS1FMT) 9467 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 9468 aip->ai_newblkno; 9469 else 9470 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 9471 aip->ai_newblkno; 9472 /* 9473 * Await the pointer write before freeing the allocindir. 9474 */ 9475 LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); 9476 } 9477 9478 /* 9479 * Release segments held on a jwork list. 9480 */ 9481 static void 9482 handle_jwork(wkhd) 9483 struct workhead *wkhd; 9484 { 9485 struct worklist *wk; 9486 9487 while ((wk = LIST_FIRST(wkhd)) != NULL) { 9488 WORKLIST_REMOVE(wk); 9489 switch (wk->wk_type) { 9490 case D_JSEGDEP: 9491 free_jsegdep(WK_JSEGDEP(wk)); 9492 continue; 9493 default: 9494 panic("handle_jwork: Unknown type %s\n", 9495 TYPENAME(wk->wk_type)); 9496 } 9497 } 9498 } 9499 9500 /* 9501 * Handle the bufwait list on an inode when it is safe to release items 9502 * held there. This normally happens after an inode block is written but 9503 * may be delayed and handled later if there are pending journal items that 9504 * are not yet safe to be released. 9505 */ 9506 static struct freefile * 9507 handle_bufwait(inodedep, refhd) 9508 struct inodedep *inodedep; 9509 struct workhead *refhd; 9510 { 9511 struct jaddref *jaddref; 9512 struct freefile *freefile; 9513 struct worklist *wk; 9514 9515 freefile = NULL; 9516 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { 9517 WORKLIST_REMOVE(wk); 9518 switch (wk->wk_type) { 9519 case D_FREEFILE: 9520 /* 9521 * We defer adding freefile to the worklist 9522 * until all other additions have been made to 9523 * ensure that it will be done after all the 9524 * old blocks have been freed. 9525 */ 9526 if (freefile != NULL) 9527 panic("handle_bufwait: freefile"); 9528 freefile = WK_FREEFILE(wk); 9529 continue; 9530 9531 case D_MKDIR: 9532 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); 9533 continue; 9534 9535 case D_DIRADD: 9536 diradd_inode_written(WK_DIRADD(wk), inodedep); 9537 continue; 9538 9539 case D_FREEFRAG: 9540 wk->wk_state |= COMPLETE; 9541 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) 9542 add_to_worklist(wk, 0); 9543 continue; 9544 9545 case D_DIRREM: 9546 wk->wk_state |= COMPLETE; 9547 add_to_worklist(wk, 0); 9548 continue; 9549 9550 case D_ALLOCDIRECT: 9551 case D_ALLOCINDIR: 9552 free_newblk(WK_NEWBLK(wk)); 9553 continue; 9554 9555 case D_JNEWBLK: 9556 wk->wk_state |= COMPLETE; 9557 free_jnewblk(WK_JNEWBLK(wk)); 9558 continue; 9559 9560 /* 9561 * Save freed journal segments and add references on 9562 * the supplied list which will delay their release 9563 * until the cg bitmap is cleared on disk. 9564 */ 9565 case D_JSEGDEP: 9566 if (refhd == NULL) 9567 free_jsegdep(WK_JSEGDEP(wk)); 9568 else 9569 WORKLIST_INSERT(refhd, wk); 9570 continue; 9571 9572 case D_JADDREF: 9573 jaddref = WK_JADDREF(wk); 9574 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, 9575 if_deps); 9576 /* 9577 * Transfer any jaddrefs to the list to be freed with 9578 * the bitmap if we're handling a removed file. 
9579 */ 9580 if (refhd == NULL) { 9581 wk->wk_state |= COMPLETE; 9582 free_jaddref(jaddref); 9583 } else 9584 WORKLIST_INSERT(refhd, wk); 9585 continue; 9586 9587 default: 9588 panic("handle_bufwait: Unknown type %p(%s)", 9589 wk, TYPENAME(wk->wk_type)); 9590 /* NOTREACHED */ 9591 } 9592 } 9593 return (freefile); 9594 } 9595 /* 9596 * Called from within softdep_disk_write_complete above to restore 9597 * in-memory inode block contents to their most up-to-date state. Note 9598 * that this routine is always called from interrupt level with further 9599 * splbio interrupts blocked. 9600 */ 9601 static int 9602 handle_written_inodeblock(inodedep, bp) 9603 struct inodedep *inodedep; 9604 struct buf *bp; /* buffer containing the inode block */ 9605 { 9606 struct freefile *freefile; 9607 struct allocdirect *adp, *nextadp; 9608 struct ufs1_dinode *dp1 = NULL; 9609 struct ufs2_dinode *dp2 = NULL; 9610 struct workhead wkhd; 9611 int hadchanges, fstype; 9612 ino_t freelink; 9613 9614 LIST_INIT(&wkhd); 9615 hadchanges = 0; 9616 freefile = NULL; 9617 if ((inodedep->id_state & IOSTARTED) == 0) 9618 panic("handle_written_inodeblock: not started"); 9619 inodedep->id_state &= ~IOSTARTED; 9620 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { 9621 fstype = UFS1; 9622 dp1 = (struct ufs1_dinode *)bp->b_data + 9623 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 9624 freelink = dp1->di_freelink; 9625 } else { 9626 fstype = UFS2; 9627 dp2 = (struct ufs2_dinode *)bp->b_data + 9628 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 9629 freelink = dp2->di_freelink; 9630 } 9631 /* 9632 * If we wrote a valid freelink pointer during the last write 9633 * record it here. 9634 */ 9635 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 9636 struct inodedep *inon; 9637 9638 inon = TAILQ_NEXT(inodedep, id_unlinked); 9639 if ((inon == NULL && freelink == 0) || 9640 (inon && inon->id_ino == freelink)) { 9641 if (inon) 9642 inon->id_state |= UNLINKPREV; 9643 inodedep->id_state |= UNLINKNEXT; 9644 } else 9645 hadchanges = 1; 9646 } 9647 /* Leave this inodeblock dirty until it's in the list. */ 9648 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) 9649 hadchanges = 1; 9650 /* 9651 * If we had to rollback the inode allocation because of 9652 * bitmaps being incomplete, then simply restore it. 9653 * Keep the block dirty so that it will not be reclaimed until 9654 * all associated dependencies have been cleared and the 9655 * corresponding updates written to disk. 9656 */ 9657 if (inodedep->id_savedino1 != NULL) { 9658 hadchanges = 1; 9659 if (fstype == UFS1) 9660 *dp1 = *inodedep->id_savedino1; 9661 else 9662 *dp2 = *inodedep->id_savedino2; 9663 free(inodedep->id_savedino1, M_SAVEDINO); 9664 inodedep->id_savedino1 = NULL; 9665 if ((bp->b_flags & B_DELWRI) == 0) 9666 stat_inode_bitmap++; 9667 bdirty(bp); 9668 /* 9669 * If the inode is clear here and GOINGAWAY it will never 9670 * be written. Process the bufwait and clear any pending 9671 * work which may include the freefile. 9672 */ 9673 if (inodedep->id_state & GOINGAWAY) 9674 goto bufwait; 9675 return (1); 9676 } 9677 inodedep->id_state |= COMPLETE; 9678 /* 9679 * Roll forward anything that had to be rolled back before 9680 * the inode could be updated. 
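 * (Editor's note: each allocdirect's ad_newblkno is put back into
 * di_db[]/di_ib[], its UNDONE flag is replaced with ATTACHED, and
 * hadchanges is set so the buffer gets redirtied below.)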
9681 */ 9682 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { 9683 nextadp = TAILQ_NEXT(adp, ad_next); 9684 if (adp->ad_state & ATTACHED) 9685 panic("handle_written_inodeblock: new entry"); 9686 if (fstype == UFS1) { 9687 if (adp->ad_offset < NDADDR) { 9688 if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) 9689 panic("%s %s #%jd mismatch %d != %jd", 9690 "handle_written_inodeblock:", 9691 "direct pointer", 9692 (intmax_t)adp->ad_offset, 9693 dp1->di_db[adp->ad_offset], 9694 (intmax_t)adp->ad_oldblkno); 9695 dp1->di_db[adp->ad_offset] = adp->ad_newblkno; 9696 } else { 9697 if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) 9698 panic("%s: %s #%jd allocated as %d", 9699 "handle_written_inodeblock", 9700 "indirect pointer", 9701 (intmax_t)adp->ad_offset - NDADDR, 9702 dp1->di_ib[adp->ad_offset - NDADDR]); 9703 dp1->di_ib[adp->ad_offset - NDADDR] = 9704 adp->ad_newblkno; 9705 } 9706 } else { 9707 if (adp->ad_offset < NDADDR) { 9708 if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) 9709 panic("%s: %s #%jd %s %jd != %jd", 9710 "handle_written_inodeblock", 9711 "direct pointer", 9712 (intmax_t)adp->ad_offset, "mismatch", 9713 (intmax_t)dp2->di_db[adp->ad_offset], 9714 (intmax_t)adp->ad_oldblkno); 9715 dp2->di_db[adp->ad_offset] = adp->ad_newblkno; 9716 } else { 9717 if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) 9718 panic("%s: %s #%jd allocated as %jd", 9719 "handle_written_inodeblock", 9720 "indirect pointer", 9721 (intmax_t)adp->ad_offset - NDADDR, 9722 (intmax_t) 9723 dp2->di_ib[adp->ad_offset - NDADDR]); 9724 dp2->di_ib[adp->ad_offset - NDADDR] = 9725 adp->ad_newblkno; 9726 } 9727 } 9728 adp->ad_state &= ~UNDONE; 9729 adp->ad_state |= ATTACHED; 9730 hadchanges = 1; 9731 } 9732 for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { 9733 nextadp = TAILQ_NEXT(adp, ad_next); 9734 if (adp->ad_state & ATTACHED) 9735 panic("handle_written_inodeblock: new entry"); 9736 if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) 9737 panic("%s: direct pointers #%jd %s %jd != %jd", 9738 "handle_written_inodeblock", 9739 (intmax_t)adp->ad_offset, "mismatch", 9740 (intmax_t)dp2->di_extb[adp->ad_offset], 9741 (intmax_t)adp->ad_oldblkno); 9742 dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; 9743 adp->ad_state &= ~UNDONE; 9744 adp->ad_state |= ATTACHED; 9745 hadchanges = 1; 9746 } 9747 if (hadchanges && (bp->b_flags & B_DELWRI) == 0) 9748 stat_direct_blk_ptrs++; 9749 /* 9750 * Reset the file size to its most up-to-date value. 
9751 */
9752 if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
9753 panic("handle_written_inodeblock: bad size");
9754 if (inodedep->id_savednlink > LINK_MAX)
9755 panic("handle_written_inodeblock: Invalid link count "
9756 "%d for inodedep %p", inodedep->id_savednlink, inodedep);
9757 if (fstype == UFS1) {
9758 if (dp1->di_nlink != inodedep->id_savednlink) {
9759 dp1->di_nlink = inodedep->id_savednlink;
9760 hadchanges = 1;
9761 }
9762 if (dp1->di_size != inodedep->id_savedsize) {
9763 dp1->di_size = inodedep->id_savedsize;
9764 hadchanges = 1;
9765 }
9766 } else {
9767 if (dp2->di_nlink != inodedep->id_savednlink) {
9768 dp2->di_nlink = inodedep->id_savednlink;
9769 hadchanges = 1;
9770 }
9771 if (dp2->di_size != inodedep->id_savedsize) {
9772 dp2->di_size = inodedep->id_savedsize;
9773 hadchanges = 1;
9774 }
9775 if (dp2->di_extsize != inodedep->id_savedextsize) {
9776 dp2->di_extsize = inodedep->id_savedextsize;
9777 hadchanges = 1;
9778 }
9779 }
9780 inodedep->id_savedsize = -1;
9781 inodedep->id_savedextsize = -1;
9782 inodedep->id_savednlink = -1;
9783 /*
9784 * If there were any rollbacks in the inode block, then it must be
9785 * marked dirty so that it will eventually get written back in
9786 * its correct form.
9787 */
9788 if (hadchanges)
9789 bdirty(bp);
9790 bufwait:
9791 /*
9792 * Process any allocdirects that completed during the update.
9793 */
9794 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
9795 handle_allocdirect_partdone(adp, &wkhd);
9796 if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
9797 handle_allocdirect_partdone(adp, &wkhd);
9798 /*
9799 * Process deallocations that were held pending until the
9800 * inode had been written to disk. Freeing of the inode
9801 * is delayed until after all blocks have been freed to
9802 * avoid creation of new <vfsid, inum, lbn> triples
9803 * before the old ones have been deleted. Completely
9804 * unlinked inodes are not processed until the unlinked
9805 * inode list is written or the last reference is removed.
9806 */
9807 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
9808 freefile = handle_bufwait(inodedep, NULL);
9809 if (freefile && !LIST_EMPTY(&wkhd)) {
9810 WORKLIST_INSERT(&wkhd, &freefile->fx_list);
9811 freefile = NULL;
9812 }
9813 }
9814 /*
9815 * Move rolled forward dependency completions to the bufwait list
9816 * now that those that were already written have been processed.
9817 */
9818 if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
9819 panic("handle_written_inodeblock: bufwait but no changes");
9820 jwork_move(&inodedep->id_bufwait, &wkhd);
9821
9822 if (freefile != NULL) {
9823 /*
9824 * If the inode is goingaway it was never written. Fake up
9825 * the state here so free_inodedep() can succeed.
9826 */
9827 if (inodedep->id_state & GOINGAWAY)
9828 inodedep->id_state |= COMPLETE | DEPCOMPLETE;
9829 if (free_inodedep(inodedep) == 0)
9830 panic("handle_written_inodeblock: live inodedep %p",
9831 inodedep);
9832 add_to_worklist(&freefile->fx_list, 0);
9833 return (0);
9834 }
9835
9836 /*
9837 * If no outstanding dependencies, free it.
9838 */ 9839 if (free_inodedep(inodedep) || 9840 (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 && 9841 TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && 9842 TAILQ_FIRST(&inodedep->id_extupdt) == 0 && 9843 LIST_FIRST(&inodedep->id_bufwait) == 0)) 9844 return (0); 9845 return (hadchanges); 9846 } 9847 9848 static int 9849 handle_written_indirdep(indirdep, bp, bpp) 9850 struct indirdep *indirdep; 9851 struct buf *bp; 9852 struct buf **bpp; 9853 { 9854 struct allocindir *aip; 9855 int chgs; 9856 9857 if (indirdep->ir_state & GOINGAWAY) 9858 panic("disk_write_complete: indirdep gone"); 9859 chgs = 0; 9860 /* 9861 * If there were rollbacks revert them here. 9862 */ 9863 if (indirdep->ir_saveddata) { 9864 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); 9865 free(indirdep->ir_saveddata, M_INDIRDEP); 9866 indirdep->ir_saveddata = 0; 9867 chgs = 1; 9868 } 9869 indirdep->ir_state &= ~UNDONE; 9870 indirdep->ir_state |= ATTACHED; 9871 /* 9872 * Move allocindirs with written pointers to the completehd if 9873 * the indirdep's pointer is not yet written. Otherwise 9874 * free them here. 9875 */ 9876 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) { 9877 LIST_REMOVE(aip, ai_next); 9878 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 9879 LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, 9880 ai_next); 9881 continue; 9882 } 9883 free_newblk(&aip->ai_block); 9884 } 9885 /* 9886 * Move allocindirs that have finished dependency processing from 9887 * the done list to the write list after updating the pointers. 9888 */ 9889 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { 9890 handle_allocindir_partdone(aip); 9891 if (aip == LIST_FIRST(&indirdep->ir_donehd)) 9892 panic("disk_write_complete: not gone"); 9893 chgs = 1; 9894 } 9895 /* 9896 * If this indirdep has been detached from its newblk during 9897 * I/O we need to keep this dep attached to the buffer so 9898 * deallocate_dependencies can find it and properly resolve 9899 * any outstanding dependencies. 9900 */ 9901 if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0) 9902 chgs = 1; 9903 if ((bp->b_flags & B_DELWRI) == 0) 9904 stat_indir_blk_ptrs++; 9905 /* 9906 * If there were no changes we can discard the savedbp and detach 9907 * ourselves from the buf. We are only carrying completed pointers 9908 * in this case. 9909 */ 9910 if (chgs == 0) { 9911 struct buf *sbp; 9912 9913 sbp = indirdep->ir_savebp; 9914 sbp->b_flags |= B_INVAL | B_NOCACHE; 9915 indirdep->ir_savebp = NULL; 9916 if (*bpp != NULL) 9917 panic("handle_written_indirdep: bp already exists."); 9918 *bpp = sbp; 9919 } else 9920 bdirty(bp); 9921 /* 9922 * If there are no fresh dependencies and none waiting on writes 9923 * we can free the indirdep. 9924 */ 9925 if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) { 9926 if (indirdep->ir_state & ONDEPLIST) 9927 LIST_REMOVE(indirdep, ir_next); 9928 free_indirdep(indirdep); 9929 return (0); 9930 } 9931 9932 return (chgs); 9933 } 9934 9935 /* 9936 * Process a diradd entry after its dependent inode has been written. 9937 * This routine must be called with splbio interrupts blocked. 9938 */ 9939 static void 9940 diradd_inode_written(dap, inodedep) 9941 struct diradd *dap; 9942 struct inodedep *inodedep; 9943 { 9944 9945 dap->da_state |= COMPLETE; 9946 complete_diradd(dap); 9947 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 9948 } 9949 9950 /* 9951 * Returns true if the bmsafemap will have rollbacks when written. Must 9952 * only be called with lk and the buf lock on the cg held. 
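 * In other words, the cg buffer will be rolled back whenever inode or
 * block allocations in it are still waiting on their jaddref or jnewblk
 * journal records to be written.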
9953 */ 9954 static int 9955 bmsafemap_rollbacks(bmsafemap) 9956 struct bmsafemap *bmsafemap; 9957 { 9958 9959 return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | 9960 !LIST_EMPTY(&bmsafemap->sm_jnewblkhd)); 9961 } 9962 9963 /* 9964 * Complete a write to a bmsafemap structure. Roll forward any bitmap 9965 * changes if it's not a background write. Set all written dependencies 9966 * to DEPCOMPLETE and free the structure if possible. 9967 */ 9968 static int 9969 handle_written_bmsafemap(bmsafemap, bp) 9970 struct bmsafemap *bmsafemap; 9971 struct buf *bp; 9972 { 9973 struct newblk *newblk; 9974 struct inodedep *inodedep; 9975 struct jaddref *jaddref, *jatmp; 9976 struct jnewblk *jnewblk, *jntmp; 9977 uint8_t *inosused; 9978 uint8_t *blksfree; 9979 struct cg *cgp; 9980 struct fs *fs; 9981 ino_t ino; 9982 long bno; 9983 int chgs; 9984 int i; 9985 9986 if ((bmsafemap->sm_state & IOSTARTED) == 0) 9987 panic("initiate_write_bmsafemap: Not started\n"); 9988 chgs = 0; 9989 bmsafemap->sm_state &= ~IOSTARTED; 9990 /* 9991 * Restore unwritten inode allocation pending jaddref writes. 9992 */ 9993 if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { 9994 cgp = (struct cg *)bp->b_data; 9995 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 9996 inosused = cg_inosused(cgp); 9997 LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, 9998 ja_bmdeps, jatmp) { 9999 if ((jaddref->ja_state & UNDONE) == 0) 10000 continue; 10001 ino = jaddref->ja_ino % fs->fs_ipg; 10002 if (isset(inosused, ino)) 10003 panic("handle_written_bmsafemap: " 10004 "re-allocated inode"); 10005 if ((bp->b_xflags & BX_BKGRDMARKER) == 0) { 10006 if ((jaddref->ja_mode & IFMT) == IFDIR) 10007 cgp->cg_cs.cs_ndir++; 10008 cgp->cg_cs.cs_nifree--; 10009 setbit(inosused, ino); 10010 chgs = 1; 10011 } 10012 jaddref->ja_state &= ~UNDONE; 10013 jaddref->ja_state |= ATTACHED; 10014 free_jaddref(jaddref); 10015 } 10016 } 10017 /* 10018 * Restore any block allocations which are pending journal writes. 
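 * (Editor's note: this is the mirror image of the rollback done in
 * initiate_write_bmsafemap; the blksfree bits that were set there are
 * cleared again here, except for background copies, and the jnewblks
 * move from UNDONE back to ATTACHED before being freed.)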
10019 */ 10020 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 10021 cgp = (struct cg *)bp->b_data; 10022 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 10023 blksfree = cg_blksfree(cgp); 10024 LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, 10025 jntmp) { 10026 if ((jnewblk->jn_state & UNDONE) == 0) 10027 continue; 10028 bno = dtogd(fs, jnewblk->jn_blkno); 10029 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 10030 i++) { 10031 if (bp->b_xflags & BX_BKGRDMARKER) 10032 break; 10033 if ((jnewblk->jn_state & NEWBLOCK) == 0 && 10034 isclr(blksfree, bno + i)) 10035 panic("handle_written_bmsafemap: " 10036 "re-allocated fragment"); 10037 clrbit(blksfree, bno + i); 10038 chgs = 1; 10039 } 10040 jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); 10041 jnewblk->jn_state |= ATTACHED; 10042 free_jnewblk(jnewblk); 10043 } 10044 } 10045 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { 10046 newblk->nb_state |= DEPCOMPLETE; 10047 newblk->nb_state &= ~ONDEPLIST; 10048 newblk->nb_bmsafemap = NULL; 10049 LIST_REMOVE(newblk, nb_deps); 10050 if (newblk->nb_list.wk_type == D_ALLOCDIRECT) 10051 handle_allocdirect_partdone( 10052 WK_ALLOCDIRECT(&newblk->nb_list), NULL); 10053 else if (newblk->nb_list.wk_type == D_ALLOCINDIR) 10054 handle_allocindir_partdone( 10055 WK_ALLOCINDIR(&newblk->nb_list)); 10056 else if (newblk->nb_list.wk_type != D_NEWBLK) 10057 panic("handle_written_bmsafemap: Unexpected type: %s", 10058 TYPENAME(newblk->nb_list.wk_type)); 10059 } 10060 while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { 10061 inodedep->id_state |= DEPCOMPLETE; 10062 inodedep->id_state &= ~ONDEPLIST; 10063 LIST_REMOVE(inodedep, id_deps); 10064 inodedep->id_bmsafemap = NULL; 10065 } 10066 if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && 10067 LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && 10068 LIST_EMPTY(&bmsafemap->sm_newblkhd) && 10069 LIST_EMPTY(&bmsafemap->sm_inodedephd)) { 10070 if (chgs) 10071 bdirty(bp); 10072 LIST_REMOVE(bmsafemap, sm_hash); 10073 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 10074 return (0); 10075 } 10076 bdirty(bp); 10077 return (1); 10078 } 10079 10080 /* 10081 * Try to free a mkdir dependency. 10082 */ 10083 static void 10084 complete_mkdir(mkdir) 10085 struct mkdir *mkdir; 10086 { 10087 struct diradd *dap; 10088 10089 if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) 10090 return; 10091 LIST_REMOVE(mkdir, md_mkdirs); 10092 dap = mkdir->md_diradd; 10093 dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 10094 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { 10095 dap->da_state |= DEPCOMPLETE; 10096 complete_diradd(dap); 10097 } 10098 WORKITEM_FREE(mkdir, D_MKDIR); 10099 } 10100 10101 /* 10102 * Handle the completion of a mkdir dependency. 
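 * The type argument is either MKDIR_PARENT or MKDIR_BODY; the matching
 * bit is cleared from the diradd in complete_mkdir above and, once both
 * are gone, the diradd is marked DEPCOMPLETE.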
10103 */ 10104 static void 10105 handle_written_mkdir(mkdir, type) 10106 struct mkdir *mkdir; 10107 int type; 10108 { 10109 10110 if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) 10111 panic("handle_written_mkdir: bad type"); 10112 mkdir->md_state |= COMPLETE; 10113 complete_mkdir(mkdir); 10114 } 10115 10116 static void 10117 free_pagedep(pagedep) 10118 struct pagedep *pagedep; 10119 { 10120 int i; 10121 10122 if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST)) 10123 return; 10124 for (i = 0; i < DAHASHSZ; i++) 10125 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) 10126 return; 10127 if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) 10128 return; 10129 if (!LIST_EMPTY(&pagedep->pd_dirremhd)) 10130 return; 10131 if (!LIST_EMPTY(&pagedep->pd_pendinghd)) 10132 return; 10133 LIST_REMOVE(pagedep, pd_hash); 10134 WORKITEM_FREE(pagedep, D_PAGEDEP); 10135 } 10136 10137 /* 10138 * Called from within softdep_disk_write_complete above. 10139 * A write operation was just completed. Removed inodes can 10140 * now be freed and associated block pointers may be committed. 10141 * Note that this routine is always called from interrupt level 10142 * with further splbio interrupts blocked. 10143 */ 10144 static int 10145 handle_written_filepage(pagedep, bp) 10146 struct pagedep *pagedep; 10147 struct buf *bp; /* buffer containing the written page */ 10148 { 10149 struct dirrem *dirrem; 10150 struct diradd *dap, *nextdap; 10151 struct direct *ep; 10152 int i, chgs; 10153 10154 if ((pagedep->pd_state & IOSTARTED) == 0) 10155 panic("handle_written_filepage: not started"); 10156 pagedep->pd_state &= ~IOSTARTED; 10157 /* 10158 * Process any directory removals that have been committed. 10159 */ 10160 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { 10161 LIST_REMOVE(dirrem, dm_next); 10162 dirrem->dm_state |= COMPLETE; 10163 dirrem->dm_dirinum = pagedep->pd_ino; 10164 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), 10165 ("handle_written_filepage: Journal entries not written.")); 10166 add_to_worklist(&dirrem->dm_list, 0); 10167 } 10168 /* 10169 * Free any directory additions that have been committed. 10170 * If it is a newly allocated block, we have to wait until 10171 * the on-disk directory inode claims the new block. 10172 */ 10173 if ((pagedep->pd_state & NEWBLOCK) == 0) 10174 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 10175 free_diradd(dap, NULL); 10176 /* 10177 * Uncommitted directory entries must be restored. 10178 */ 10179 for (chgs = 0, i = 0; i < DAHASHSZ; i++) { 10180 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; 10181 dap = nextdap) { 10182 nextdap = LIST_NEXT(dap, da_pdlist); 10183 if (dap->da_state & ATTACHED) 10184 panic("handle_written_filepage: attached"); 10185 ep = (struct direct *) 10186 ((char *)bp->b_data + dap->da_offset); 10187 ep->d_ino = dap->da_newinum; 10188 dap->da_state &= ~UNDONE; 10189 dap->da_state |= ATTACHED; 10190 chgs = 1; 10191 /* 10192 * If the inode referenced by the directory has 10193 * been written out, then the dependency can be 10194 * moved to the pending list. 10195 */ 10196 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 10197 LIST_REMOVE(dap, da_pdlist); 10198 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, 10199 da_pdlist); 10200 } 10201 } 10202 } 10203 /* 10204 * If there were any rollbacks in the directory, then it must be 10205 * marked dirty so that its will eventually get written back in 10206 * its correct form. 
10207 */ 10208 if (chgs) { 10209 if ((bp->b_flags & B_DELWRI) == 0) 10210 stat_dir_entry++; 10211 bdirty(bp); 10212 return (1); 10213 } 10214 /* 10215 * If we are not waiting for a new directory block to be 10216 * claimed by its inode, then the pagedep will be freed. 10217 * Otherwise it will remain to track any new entries on 10218 * the page in case they are fsync'ed. 10219 */ 10220 if ((pagedep->pd_state & NEWBLOCK) == 0 && 10221 LIST_EMPTY(&pagedep->pd_jmvrefhd)) { 10222 LIST_REMOVE(pagedep, pd_hash); 10223 WORKITEM_FREE(pagedep, D_PAGEDEP); 10224 } 10225 return (0); 10226 } 10227 10228 /* 10229 * Writing back in-core inode structures. 10230 * 10231 * The filesystem only accesses an inode's contents when it occupies an 10232 * "in-core" inode structure. These "in-core" structures are separate from 10233 * the page frames used to cache inode blocks. Only the latter are 10234 * transferred to/from the disk. So, when the updated contents of the 10235 * "in-core" inode structure are copied to the corresponding in-memory inode 10236 * block, the dependencies are also transferred. The following procedure is 10237 * called when copying a dirty "in-core" inode to a cached inode block. 10238 */ 10239 10240 /* 10241 * Called when an inode is loaded from disk. If the effective link count 10242 * differed from the actual link count when it was last flushed, then we 10243 * need to ensure that the correct effective link count is put back. 10244 */ 10245 void 10246 softdep_load_inodeblock(ip) 10247 struct inode *ip; /* the "in_core" copy of the inode */ 10248 { 10249 struct inodedep *inodedep; 10250 10251 /* 10252 * Check for alternate nlink count. 10253 */ 10254 ip->i_effnlink = ip->i_nlink; 10255 ACQUIRE_LOCK(&lk); 10256 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 10257 &inodedep) == 0) { 10258 FREE_LOCK(&lk); 10259 return; 10260 } 10261 ip->i_effnlink -= inodedep->id_nlinkdelta; 10262 FREE_LOCK(&lk); 10263 } 10264 10265 /* 10266 * This routine is called just before the "in-core" inode 10267 * information is to be copied to the in-memory inode block. 10268 * Recall that an inode block contains several inodes. If 10269 * the force flag is set, then the dependencies will be 10270 * cleared so that the update can always be made. Note that 10271 * the buffer is locked when this routine is called, so we 10272 * will never be in the middle of writing the inode block 10273 * to disk. 10274 */ 10275 void 10276 softdep_update_inodeblock(ip, bp, waitfor) 10277 struct inode *ip; /* the "in_core" copy of the inode */ 10278 struct buf *bp; /* the buffer containing the inode block */ 10279 int waitfor; /* nonzero => update must be allowed */ 10280 { 10281 struct inodedep *inodedep; 10282 struct inoref *inoref; 10283 struct worklist *wk; 10284 struct mount *mp; 10285 struct buf *ibp; 10286 struct fs *fs; 10287 int error; 10288 10289 mp = UFSTOVFS(ip->i_ump); 10290 fs = ip->i_fs; 10291 /* 10292 * Preserve the freelink that is on disk. clear_unlinked_inodedep() 10293 * does not have access to the in-core ip so must write directly into 10294 * the inode block buffer when setting freelink. 
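 * (Editor's note: the DIP_SET below copies di_freelink out of the
 * on-disk buffer into the in-core inode so that copying the in-core
 * fields back out does not clobber a freelink written by
 * clear_unlinked_inodedep().)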
10295 */ 10296 if (fs->fs_magic == FS_UFS1_MAGIC) 10297 DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data + 10298 ino_to_fsbo(fs, ip->i_number))->di_freelink); 10299 else 10300 DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data + 10301 ino_to_fsbo(fs, ip->i_number))->di_freelink); 10302 /* 10303 * If the effective link count is not equal to the actual link 10304 * count, then we must track the difference in an inodedep while 10305 * the inode is (potentially) tossed out of the cache. Otherwise, 10306 * if there is no existing inodedep, then there are no dependencies 10307 * to track. 10308 */ 10309 ACQUIRE_LOCK(&lk); 10310 again: 10311 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 10312 FREE_LOCK(&lk); 10313 if (ip->i_effnlink != ip->i_nlink) 10314 panic("softdep_update_inodeblock: bad link count"); 10315 return; 10316 } 10317 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) 10318 panic("softdep_update_inodeblock: bad delta"); 10319 /* 10320 * If we're flushing all dependencies we must also move any waiting 10321 * for journal writes onto the bufwait list prior to I/O. 10322 */ 10323 if (waitfor) { 10324 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 10325 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 10326 == DEPCOMPLETE) { 10327 stat_jwait_inode++; 10328 jwait(&inoref->if_list); 10329 goto again; 10330 } 10331 } 10332 } 10333 /* 10334 * Changes have been initiated. Anything depending on these 10335 * changes cannot occur until this inode has been written. 10336 */ 10337 inodedep->id_state &= ~COMPLETE; 10338 if ((inodedep->id_state & ONWORKLIST) == 0) 10339 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); 10340 /* 10341 * Any new dependencies associated with the incore inode must 10342 * now be moved to the list associated with the buffer holding 10343 * the in-memory copy of the inode. Once merged process any 10344 * allocdirects that are completed by the merger. 10345 */ 10346 merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); 10347 if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) 10348 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), 10349 NULL); 10350 merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); 10351 if (!TAILQ_EMPTY(&inodedep->id_extupdt)) 10352 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), 10353 NULL); 10354 /* 10355 * Now that the inode has been pushed into the buffer, the 10356 * operations dependent on the inode being written to disk 10357 * can be moved to the id_bufwait so that they will be 10358 * processed when the buffer I/O completes. 10359 */ 10360 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { 10361 WORKLIST_REMOVE(wk); 10362 WORKLIST_INSERT(&inodedep->id_bufwait, wk); 10363 } 10364 /* 10365 * Newly allocated inodes cannot be written until the bitmap 10366 * that allocates them have been written (indicated by 10367 * DEPCOMPLETE being set in id_state). If we are doing a 10368 * forced sync (e.g., an fsync on a file), we force the bitmap 10369 * to be written so that the update can be done. 10370 */ 10371 if (waitfor == 0) { 10372 FREE_LOCK(&lk); 10373 return; 10374 } 10375 retry: 10376 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) { 10377 FREE_LOCK(&lk); 10378 return; 10379 } 10380 ibp = inodedep->id_bmsafemap->sm_buf; 10381 ibp = getdirtybuf(ibp, &lk, MNT_WAIT); 10382 if (ibp == NULL) { 10383 /* 10384 * If ibp came back as NULL, the dependency could have been 10385 * freed while we slept. 
Look it up again, and check to see
10386 * that it has completed.
10387 */
10388 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
10389 goto retry;
10390 FREE_LOCK(&lk);
10391 return;
10392 }
10393 FREE_LOCK(&lk);
10394 if ((error = bwrite(ibp)) != 0)
10395 softdep_error("softdep_update_inodeblock: bwrite", error);
10396 }
10397
10398 /*
10399 * Merge a new inode dependency list (such as id_newinoupdt) into an
10400 * old inode dependency list (such as id_inoupdt). This routine must be
10401 * called with splbio interrupts blocked.
10402 */
10403 static void
10404 merge_inode_lists(newlisthead, oldlisthead)
10405 struct allocdirectlst *newlisthead;
10406 struct allocdirectlst *oldlisthead;
10407 {
10408 struct allocdirect *listadp, *newadp;
10409
10410 newadp = TAILQ_FIRST(newlisthead);
10411 for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
10412 if (listadp->ad_offset < newadp->ad_offset) {
10413 listadp = TAILQ_NEXT(listadp, ad_next);
10414 continue;
10415 }
10416 TAILQ_REMOVE(newlisthead, newadp, ad_next);
10417 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
10418 if (listadp->ad_offset == newadp->ad_offset) {
10419 allocdirect_merge(oldlisthead, newadp,
10420 listadp);
10421 listadp = newadp;
10422 }
10423 newadp = TAILQ_FIRST(newlisthead);
10424 }
10425 while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
10426 TAILQ_REMOVE(newlisthead, newadp, ad_next);
10427 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
10428 }
10429 }
10430
10431 /*
10432 * If we are doing an fsync, then we must ensure that any directory
10433 * entries for the inode have been written after the inode gets to disk.
10434 */
10435 int
10436 softdep_fsync(vp)
10437 struct vnode *vp; /* the "in_core" copy of the inode */
10438 {
10439 struct inodedep *inodedep;
10440 struct pagedep *pagedep;
10441 struct inoref *inoref;
10442 struct worklist *wk;
10443 struct diradd *dap;
10444 struct mount *mp;
10445 struct vnode *pvp;
10446 struct inode *ip;
10447 struct buf *bp;
10448 struct fs *fs;
10449 struct thread *td = curthread;
10450 int error, flushparent, pagedep_new_block;
10451 ino_t parentino;
10452 ufs_lbn_t lbn;
10453
10454 ip = VTOI(vp);
10455 fs = ip->i_fs;
10456 mp = vp->v_mount;
10457 ACQUIRE_LOCK(&lk);
10458 restart:
10459 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
10460 FREE_LOCK(&lk);
10461 return (0);
10462 }
10463 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10464 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10465 == DEPCOMPLETE) {
10466 stat_jwait_inode++;
10467 jwait(&inoref->if_list);
10468 goto restart;
10469 }
10470 }
10471 if (!LIST_EMPTY(&inodedep->id_inowait) ||
10472 !TAILQ_EMPTY(&inodedep->id_extupdt) ||
10473 !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
10474 !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
10475 !TAILQ_EMPTY(&inodedep->id_newinoupdt))
10476 panic("softdep_fsync: pending ops %p", inodedep);
10477 for (error = 0, flushparent = 0; ; ) {
10478 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
10479 break;
10480 if (wk->wk_type != D_DIRADD)
10481 panic("softdep_fsync: Unexpected type %s",
10482 TYPENAME(wk->wk_type));
10483 dap = WK_DIRADD(wk);
10484 /*
10485 * Flush our parent if this directory entry has a MKDIR_PARENT
10486 * dependency or is contained in a newly allocated block.
10487 */ 10488 if (dap->da_state & DIRCHG) 10489 pagedep = dap->da_previous->dm_pagedep; 10490 else 10491 pagedep = dap->da_pagedep; 10492 parentino = pagedep->pd_ino; 10493 lbn = pagedep->pd_lbn; 10494 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) 10495 panic("softdep_fsync: dirty"); 10496 if ((dap->da_state & MKDIR_PARENT) || 10497 (pagedep->pd_state & NEWBLOCK)) 10498 flushparent = 1; 10499 else 10500 flushparent = 0; 10501 /* 10502 * If we are being fsync'ed as part of vgone'ing this vnode, 10503 * then we will not be able to release and recover the 10504 * vnode below, so we just have to give up on writing its 10505 * directory entry out. It will eventually be written, just 10506 * not now, but then the user was not asking to have it 10507 * written, so we are not breaking any promises. 10508 */ 10509 if (vp->v_iflag & VI_DOOMED) 10510 break; 10511 /* 10512 * We prevent deadlock by always fetching inodes from the 10513 * root, moving down the directory tree. Thus, when fetching 10514 * our parent directory, we first try to get the lock. If 10515 * that fails, we must unlock ourselves before requesting 10516 * the lock on our parent. See the comment in ufs_lookup 10517 * for details on possible races. 10518 */ 10519 FREE_LOCK(&lk); 10520 if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, 10521 FFSV_FORCEINSMQ)) { 10522 error = vfs_busy(mp, MBF_NOWAIT); 10523 if (error != 0) { 10524 vfs_ref(mp); 10525 VOP_UNLOCK(vp, 0); 10526 error = vfs_busy(mp, 0); 10527 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 10528 vfs_rel(mp); 10529 if (error != 0) 10530 return (ENOENT); 10531 if (vp->v_iflag & VI_DOOMED) { 10532 vfs_unbusy(mp); 10533 return (ENOENT); 10534 } 10535 } 10536 VOP_UNLOCK(vp, 0); 10537 error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, 10538 &pvp, FFSV_FORCEINSMQ); 10539 vfs_unbusy(mp); 10540 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 10541 if (vp->v_iflag & VI_DOOMED) { 10542 if (error == 0) 10543 vput(pvp); 10544 error = ENOENT; 10545 } 10546 if (error != 0) 10547 return (error); 10548 } 10549 /* 10550 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps 10551 * that are contained in direct blocks will be resolved by 10552 * doing a ffs_update. Pagedeps contained in indirect blocks 10553 * may require a complete sync'ing of the directory. So, we 10554 * try the cheap and fast ffs_update first, and if that fails, 10555 * then we do the slower ffs_syncvnode of the directory. 10556 */ 10557 if (flushparent) { 10558 int locked; 10559 10560 if ((error = ffs_update(pvp, 1)) != 0) { 10561 vput(pvp); 10562 return (error); 10563 } 10564 ACQUIRE_LOCK(&lk); 10565 locked = 1; 10566 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { 10567 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { 10568 if (wk->wk_type != D_DIRADD) 10569 panic("softdep_fsync: Unexpected type %s", 10570 TYPENAME(wk->wk_type)); 10571 dap = WK_DIRADD(wk); 10572 if (dap->da_state & DIRCHG) 10573 pagedep = dap->da_previous->dm_pagedep; 10574 else 10575 pagedep = dap->da_pagedep; 10576 pagedep_new_block = pagedep->pd_state & NEWBLOCK; 10577 FREE_LOCK(&lk); 10578 locked = 0; 10579 if (pagedep_new_block && 10580 (error = ffs_syncvnode(pvp, MNT_WAIT))) { 10581 vput(pvp); 10582 return (error); 10583 } 10584 } 10585 } 10586 if (locked) 10587 FREE_LOCK(&lk); 10588 } 10589 /* 10590 * Flush directory page containing the inode's name. 
10591 */ 10592 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, 10593 &bp); 10594 if (error == 0) 10595 error = bwrite(bp); 10596 else 10597 brelse(bp); 10598 vput(pvp); 10599 if (error != 0) 10600 return (error); 10601 ACQUIRE_LOCK(&lk); 10602 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 10603 break; 10604 } 10605 FREE_LOCK(&lk); 10606 return (0); 10607 } 10608 10609 /* 10610 * Flush all the dirty bitmaps associated with the block device 10611 * before flushing the rest of the dirty blocks so as to reduce 10612 * the number of dependencies that will have to be rolled back. 10613 */ 10614 void 10615 softdep_fsync_mountdev(vp) 10616 struct vnode *vp; 10617 { 10618 struct buf *bp, *nbp; 10619 struct worklist *wk; 10620 struct bufobj *bo; 10621 10622 if (!vn_isdisk(vp, NULL)) 10623 panic("softdep_fsync_mountdev: vnode not a disk"); 10624 bo = &vp->v_bufobj; 10625 restart: 10626 BO_LOCK(bo); 10627 ACQUIRE_LOCK(&lk); 10628 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 10629 /* 10630 * If it is already scheduled, skip to the next buffer. 10631 */ 10632 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) 10633 continue; 10634 10635 if ((bp->b_flags & B_DELWRI) == 0) 10636 panic("softdep_fsync_mountdev: not dirty"); 10637 /* 10638 * We are only interested in bitmaps with outstanding 10639 * dependencies. 10640 */ 10641 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || 10642 wk->wk_type != D_BMSAFEMAP || 10643 (bp->b_vflags & BV_BKGRDINPROG)) { 10644 BUF_UNLOCK(bp); 10645 continue; 10646 } 10647 FREE_LOCK(&lk); 10648 BO_UNLOCK(bo); 10649 bremfree(bp); 10650 (void) bawrite(bp); 10651 goto restart; 10652 } 10653 FREE_LOCK(&lk); 10654 drain_output(vp); 10655 BO_UNLOCK(bo); 10656 } 10657 10658 /* 10659 * This routine is called when we are trying to synchronously flush a 10660 * file. This routine must eliminate any filesystem metadata dependencies 10661 * so that the syncing routine can succeed by pushing the dirty blocks 10662 * associated with the file. If any I/O errors occur, they are returned. 10663 */ 10664 int 10665 softdep_sync_metadata(struct vnode *vp) 10666 { 10667 struct pagedep *pagedep; 10668 struct allocindir *aip; 10669 struct newblk *newblk; 10670 struct buf *bp, *nbp; 10671 struct worklist *wk; 10672 struct bufobj *bo; 10673 int i, error, waitfor; 10674 10675 if (!DOINGSOFTDEP(vp)) 10676 return (0); 10677 /* 10678 * Ensure that any direct block dependencies have been cleared. 10679 */ 10680 ACQUIRE_LOCK(&lk); 10681 if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) { 10682 FREE_LOCK(&lk); 10683 return (error); 10684 } 10685 FREE_LOCK(&lk); 10686 /* 10687 * For most files, the only metadata dependencies are the 10688 * cylinder group maps that allocate their inode or blocks. 10689 * The block allocation dependencies can be found by traversing 10690 * the dependency lists for any buffers that remain on their 10691 * dirty buffer list. The inode allocation dependency will 10692 * be resolved when the inode is updated with MNT_WAIT. 10693 * This work is done in two passes. The first pass grabs most 10694 * of the buffers and begins asynchronously writing them. The 10695 * only way to wait for these asynchronous writes is to sleep 10696 * on the filesystem vnode which may stay busy for a long time 10697 * if the filesystem is active. So, instead, we make a second 10698 * pass over the dependencies blocking on each write. 
In the 10699 * usual case we will be blocking against a write that we 10700 * initiated, so when it is done the dependency will have been 10701 * resolved. Thus the second pass is expected to end quickly. 10702 */ 10703 waitfor = MNT_NOWAIT; 10704 bo = &vp->v_bufobj; 10705 10706 top: 10707 /* 10708 * We must wait for any I/O in progress to finish so that 10709 * all potential buffers on the dirty list will be visible. 10710 */ 10711 BO_LOCK(bo); 10712 drain_output(vp); 10713 while ((bp = TAILQ_FIRST(&bo->bo_dirty.bv_hd)) != NULL) { 10714 bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT); 10715 if (bp) 10716 break; 10717 } 10718 BO_UNLOCK(bo); 10719 if (bp == NULL) 10720 return (0); 10721 loop: 10722 /* While syncing snapshots, we must allow recursive lookups */ 10723 BUF_AREC(bp); 10724 ACQUIRE_LOCK(&lk); 10725 /* 10726 * As we hold the buffer locked, none of its dependencies 10727 * will disappear. 10728 */ 10729 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 10730 switch (wk->wk_type) { 10731 10732 case D_ALLOCDIRECT: 10733 case D_ALLOCINDIR: 10734 newblk = WK_NEWBLK(wk); 10735 if (newblk->nb_jnewblk != NULL) { 10736 stat_jwait_newblk++; 10737 jwait(&newblk->nb_jnewblk->jn_list); 10738 goto restart; 10739 } 10740 if (newblk->nb_state & DEPCOMPLETE) 10741 continue; 10742 nbp = newblk->nb_bmsafemap->sm_buf; 10743 nbp = getdirtybuf(nbp, &lk, waitfor); 10744 if (nbp == NULL) 10745 continue; 10746 FREE_LOCK(&lk); 10747 if (waitfor == MNT_NOWAIT) { 10748 bawrite(nbp); 10749 } else if ((error = bwrite(nbp)) != 0) { 10750 break; 10751 } 10752 ACQUIRE_LOCK(&lk); 10753 continue; 10754 10755 case D_INDIRDEP: 10756 restart: 10757 10758 LIST_FOREACH(aip, 10759 &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { 10760 newblk = (struct newblk *)aip; 10761 if (newblk->nb_jnewblk != NULL) { 10762 stat_jwait_newblk++; 10763 jwait(&newblk->nb_jnewblk->jn_list); 10764 goto restart; 10765 } 10766 if (newblk->nb_state & DEPCOMPLETE) 10767 continue; 10768 nbp = newblk->nb_bmsafemap->sm_buf; 10769 nbp = getdirtybuf(nbp, &lk, MNT_WAIT); 10770 if (nbp == NULL) 10771 goto restart; 10772 FREE_LOCK(&lk); 10773 if ((error = bwrite(nbp)) != 0) { 10774 goto loop_end; 10775 } 10776 ACQUIRE_LOCK(&lk); 10777 goto restart; 10778 } 10779 continue; 10780 10781 case D_PAGEDEP: 10782 /* 10783 * We are trying to sync a directory that may 10784 * have dependencies on both its own metadata 10785 * and/or dependencies on the inodes of any 10786 * recently allocated files. We walk its diradd 10787 * lists pushing out the associated inode. 
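 * (Editor's note: each of the DAHASHSZ pd_diraddhd buckets is handed to
 * flush_pagedep_deps, which syncs the inodes named by the new entries.)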
10788 */ 10789 pagedep = WK_PAGEDEP(wk); 10790 for (i = 0; i < DAHASHSZ; i++) { 10791 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) 10792 continue; 10793 if ((error = 10794 flush_pagedep_deps(vp, wk->wk_mp, 10795 &pagedep->pd_diraddhd[i]))) { 10796 FREE_LOCK(&lk); 10797 goto loop_end; 10798 } 10799 } 10800 continue; 10801 10802 default: 10803 panic("softdep_sync_metadata: Unknown type %s", 10804 TYPENAME(wk->wk_type)); 10805 /* NOTREACHED */ 10806 } 10807 loop_end: 10808 /* We reach here only in error and unlocked */ 10809 if (error == 0) 10810 panic("softdep_sync_metadata: zero error"); 10811 BUF_NOREC(bp); 10812 bawrite(bp); 10813 return (error); 10814 } 10815 FREE_LOCK(&lk); 10816 BO_LOCK(bo); 10817 while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) { 10818 nbp = getdirtybuf(nbp, BO_MTX(bo), MNT_WAIT); 10819 if (nbp) 10820 break; 10821 } 10822 BO_UNLOCK(bo); 10823 BUF_NOREC(bp); 10824 bawrite(bp); 10825 if (nbp != NULL) { 10826 bp = nbp; 10827 goto loop; 10828 } 10829 /* 10830 * The brief unlock is to allow any pent up dependency 10831 * processing to be done. Then proceed with the second pass. 10832 */ 10833 if (waitfor == MNT_NOWAIT) { 10834 waitfor = MNT_WAIT; 10835 goto top; 10836 } 10837 10838 /* 10839 * If we have managed to get rid of all the dirty buffers, 10840 * then we are done. For certain directories and block 10841 * devices, we may need to do further work. 10842 * 10843 * We must wait for any I/O in progress to finish so that 10844 * all potential buffers on the dirty list will be visible. 10845 */ 10846 BO_LOCK(bo); 10847 drain_output(vp); 10848 BO_UNLOCK(bo); 10849 return ffs_update(vp, 1); 10850 /* return (0); */ 10851 } 10852 10853 /* 10854 * Flush the dependencies associated with an inodedep. 10855 * Called with splbio blocked. 10856 */ 10857 static int 10858 flush_inodedep_deps(mp, ino) 10859 struct mount *mp; 10860 ino_t ino; 10861 { 10862 struct inodedep *inodedep; 10863 struct inoref *inoref; 10864 int error, waitfor; 10865 10866 /* 10867 * This work is done in two passes. The first pass grabs most 10868 * of the buffers and begins asynchronously writing them. The 10869 * only way to wait for these asynchronous writes is to sleep 10870 * on the filesystem vnode which may stay busy for a long time 10871 * if the filesystem is active. So, instead, we make a second 10872 * pass over the dependencies blocking on each write. In the 10873 * usual case we will be blocking against a write that we 10874 * initiated, so when it is done the dependency will have been 10875 * resolved. Thus the second pass is expected to end quickly. 10876 * We give a brief window at the top of the loop to allow 10877 * any pending I/O to complete. 10878 */ 10879 for (error = 0, waitfor = MNT_NOWAIT; ; ) { 10880 if (error) 10881 return (error); 10882 FREE_LOCK(&lk); 10883 ACQUIRE_LOCK(&lk); 10884 restart: 10885 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) 10886 return (0); 10887 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 10888 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 10889 == DEPCOMPLETE) { 10890 stat_jwait_inode++; 10891 jwait(&inoref->if_list); 10892 goto restart; 10893 } 10894 } 10895 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || 10896 flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || 10897 flush_deplist(&inodedep->id_extupdt, waitfor, &error) || 10898 flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) 10899 continue; 10900 /* 10901 * If pass2, we are done, otherwise do pass 2. 
10902 */ 10903 if (waitfor == MNT_WAIT) 10904 break; 10905 waitfor = MNT_WAIT; 10906 } 10907 /* 10908 * Try freeing inodedep in case all dependencies have been removed. 10909 */ 10910 if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) 10911 (void) free_inodedep(inodedep); 10912 return (0); 10913 } 10914 10915 /* 10916 * Flush an inode dependency list. 10917 * Called with splbio blocked. 10918 */ 10919 static int 10920 flush_deplist(listhead, waitfor, errorp) 10921 struct allocdirectlst *listhead; 10922 int waitfor; 10923 int *errorp; 10924 { 10925 struct allocdirect *adp; 10926 struct newblk *newblk; 10927 struct buf *bp; 10928 10929 mtx_assert(&lk, MA_OWNED); 10930 TAILQ_FOREACH(adp, listhead, ad_next) { 10931 newblk = (struct newblk *)adp; 10932 if (newblk->nb_jnewblk != NULL) { 10933 stat_jwait_newblk++; 10934 jwait(&newblk->nb_jnewblk->jn_list); 10935 return (1); 10936 } 10937 if (newblk->nb_state & DEPCOMPLETE) 10938 continue; 10939 bp = newblk->nb_bmsafemap->sm_buf; 10940 bp = getdirtybuf(bp, &lk, waitfor); 10941 if (bp == NULL) { 10942 if (waitfor == MNT_NOWAIT) 10943 continue; 10944 return (1); 10945 } 10946 FREE_LOCK(&lk); 10947 if (waitfor == MNT_NOWAIT) { 10948 bawrite(bp); 10949 } else if ((*errorp = bwrite(bp)) != 0) { 10950 ACQUIRE_LOCK(&lk); 10951 return (1); 10952 } 10953 ACQUIRE_LOCK(&lk); 10954 return (1); 10955 } 10956 return (0); 10957 } 10958 10959 /* 10960 * Flush dependencies associated with an allocdirect block. 10961 */ 10962 static int 10963 flush_newblk_dep(vp, mp, lbn) 10964 struct vnode *vp; 10965 struct mount *mp; 10966 ufs_lbn_t lbn; 10967 { 10968 struct newblk *newblk; 10969 struct bufobj *bo; 10970 struct inode *ip; 10971 struct buf *bp; 10972 ufs2_daddr_t blkno; 10973 int error; 10974 10975 error = 0; 10976 bo = &vp->v_bufobj; 10977 ip = VTOI(vp); 10978 blkno = DIP(ip, i_db[lbn]); 10979 if (blkno == 0) 10980 panic("flush_newblk_dep: Missing block"); 10981 ACQUIRE_LOCK(&lk); 10982 /* 10983 * Loop until all dependencies related to this block are satisfied. 10984 * We must be careful to restart after each sleep in case a write 10985 * completes some part of this process for us. 10986 */ 10987 for (;;) { 10988 if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { 10989 FREE_LOCK(&lk); 10990 break; 10991 } 10992 if (newblk->nb_list.wk_type != D_ALLOCDIRECT) 10993 panic("flush_newblk_deps: Bad newblk %p", newblk); 10994 /* 10995 * Flush the journal. 10996 */ 10997 if (newblk->nb_jnewblk != NULL) { 10998 stat_jwait_newblk++; 10999 jwait(&newblk->nb_jnewblk->jn_list); 11000 continue; 11001 } 11002 /* 11003 * Write the bitmap dependency. 11004 */ 11005 if ((newblk->nb_state & DEPCOMPLETE) == 0) { 11006 bp = newblk->nb_bmsafemap->sm_buf; 11007 bp = getdirtybuf(bp, &lk, MNT_WAIT); 11008 if (bp == NULL) 11009 continue; 11010 FREE_LOCK(&lk); 11011 error = bwrite(bp); 11012 if (error) 11013 break; 11014 ACQUIRE_LOCK(&lk); 11015 continue; 11016 } 11017 /* 11018 * Write the buffer. 
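 * The logical block itself may still be dirty in the vnode's buffer
 * cache; look it up with gbincore() and, if it is a delayed write,
 * push it out with bwrite() before updating the inode below.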
11019 */ 11020 FREE_LOCK(&lk); 11021 BO_LOCK(bo); 11022 bp = gbincore(bo, lbn); 11023 if (bp != NULL) { 11024 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 11025 LK_INTERLOCK, BO_MTX(bo)); 11026 if (error == ENOLCK) { 11027 ACQUIRE_LOCK(&lk); 11028 continue; /* Slept, retry */ 11029 } 11030 if (error != 0) 11031 break; /* Failed */ 11032 if (bp->b_flags & B_DELWRI) { 11033 bremfree(bp); 11034 error = bwrite(bp); 11035 if (error) 11036 break; 11037 } else 11038 BUF_UNLOCK(bp); 11039 } else 11040 BO_UNLOCK(bo); 11041 /* 11042 * We have to wait for the direct pointers to 11043 * point at the newdirblk before the dependency 11044 * will go away. 11045 */ 11046 error = ffs_update(vp, MNT_WAIT); 11047 if (error) 11048 break; 11049 ACQUIRE_LOCK(&lk); 11050 } 11051 return (error); 11052 } 11053 11054 /* 11055 * Eliminate a pagedep dependency by flushing out all its diradd dependencies. 11056 * Called with splbio blocked. 11057 */ 11058 static int 11059 flush_pagedep_deps(pvp, mp, diraddhdp) 11060 struct vnode *pvp; 11061 struct mount *mp; 11062 struct diraddhd *diraddhdp; 11063 { 11064 struct inodedep *inodedep; 11065 struct inoref *inoref; 11066 struct ufsmount *ump; 11067 struct diradd *dap; 11068 struct vnode *vp; 11069 int error = 0; 11070 struct buf *bp; 11071 ino_t inum; 11072 11073 ump = VFSTOUFS(mp); 11074 restart: 11075 while ((dap = LIST_FIRST(diraddhdp)) != NULL) { 11076 /* 11077 * Flush ourselves if this directory entry 11078 * has a MKDIR_PARENT dependency. 11079 */ 11080 if (dap->da_state & MKDIR_PARENT) { 11081 FREE_LOCK(&lk); 11082 if ((error = ffs_update(pvp, MNT_WAIT)) != 0) 11083 break; 11084 ACQUIRE_LOCK(&lk); 11085 /* 11086 * If that cleared dependencies, go on to next. 11087 */ 11088 if (dap != LIST_FIRST(diraddhdp)) 11089 continue; 11090 if (dap->da_state & MKDIR_PARENT) 11091 panic("flush_pagedep_deps: MKDIR_PARENT"); 11092 } 11093 /* 11094 * A newly allocated directory must have its "." and 11095 * ".." entries written out before its name can be 11096 * committed in its parent. 11097 */ 11098 inum = dap->da_newinum; 11099 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 11100 panic("flush_pagedep_deps: lost inode1"); 11101 /* 11102 * Wait for any pending journal adds to complete so we don't 11103 * cause rollbacks while syncing. 11104 */ 11105 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 11106 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 11107 == DEPCOMPLETE) { 11108 stat_jwait_inode++; 11109 jwait(&inoref->if_list); 11110 goto restart; 11111 } 11112 } 11113 if (dap->da_state & MKDIR_BODY) { 11114 FREE_LOCK(&lk); 11115 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 11116 FFSV_FORCEINSMQ))) 11117 break; 11118 error = flush_newblk_dep(vp, mp, 0); 11119 /* 11120 * If we still have the dependency we might need to 11121 * update the vnode to sync the new link count to 11122 * disk. 11123 */ 11124 if (error == 0 && dap == LIST_FIRST(diraddhdp)) 11125 error = ffs_update(vp, MNT_WAIT); 11126 vput(vp); 11127 if (error != 0) 11128 break; 11129 ACQUIRE_LOCK(&lk); 11130 /* 11131 * If that cleared dependencies, go on to next. 11132 */ 11133 if (dap != LIST_FIRST(diraddhdp)) 11134 continue; 11135 if (dap->da_state & MKDIR_BODY) { 11136 inodedep_lookup(UFSTOVFS(ump), inum, 0, 11137 &inodedep); 11138 panic("flush_pagedep_deps: MKDIR_BODY " 11139 "inodedep %p dap %p vp %p", 11140 inodedep, dap, vp); 11141 } 11142 } 11143 /* 11144 * Flush the inode on which the directory entry depends. 
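 * (That inode is the one named by the new directory entry, recorded
 * above as inum = dap->da_newinum.)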
11145 * Having accounted for MKDIR_PARENT and MKDIR_BODY above, 11146 * the only remaining dependency is that the updated inode 11147 * count must get pushed to disk. The inode has already 11148 * been pushed into its inode buffer (via VOP_UPDATE) at 11149 * the time of the reference count change. So we need only 11150 * locate that buffer, ensure that there will be no rollback 11151 * caused by a bitmap dependency, then write the inode buffer. 11152 */ 11153 retry: 11154 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 11155 panic("flush_pagedep_deps: lost inode"); 11156 /* 11157 * If the inode still has bitmap dependencies, 11158 * push them to disk. 11159 */ 11160 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { 11161 bp = inodedep->id_bmsafemap->sm_buf; 11162 bp = getdirtybuf(bp, &lk, MNT_WAIT); 11163 if (bp == NULL) 11164 goto retry; 11165 FREE_LOCK(&lk); 11166 if ((error = bwrite(bp)) != 0) 11167 break; 11168 ACQUIRE_LOCK(&lk); 11169 if (dap != LIST_FIRST(diraddhdp)) 11170 continue; 11171 } 11172 /* 11173 * If the inode is still sitting in a buffer waiting 11174 * to be written or waiting for the link count to be 11175 * adjusted update it here to flush it to disk. 11176 */ 11177 if (dap == LIST_FIRST(diraddhdp)) { 11178 FREE_LOCK(&lk); 11179 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 11180 FFSV_FORCEINSMQ))) 11181 break; 11182 error = ffs_update(vp, MNT_WAIT); 11183 vput(vp); 11184 if (error) 11185 break; 11186 ACQUIRE_LOCK(&lk); 11187 } 11188 /* 11189 * If we have failed to get rid of all the dependencies 11190 * then something is seriously wrong. 11191 */ 11192 if (dap == LIST_FIRST(diraddhdp)) { 11193 inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); 11194 panic("flush_pagedep_deps: failed to flush " 11195 "inodedep %p ino %d dap %p", inodedep, inum, dap); 11196 } 11197 } 11198 if (error) 11199 ACQUIRE_LOCK(&lk); 11200 return (error); 11201 } 11202 11203 /* 11204 * A large burst of file addition or deletion activity can drive the 11205 * memory load excessively high. First attempt to slow things down 11206 * using the techniques below. If that fails, this routine requests 11207 * the offending operations to fall back to running synchronously 11208 * until the memory load returns to a reasonable level. 11209 */ 11210 int 11211 softdep_slowdown(vp) 11212 struct vnode *vp; 11213 { 11214 struct ufsmount *ump; 11215 int jlow; 11216 int max_softdeps_hard; 11217 11218 ACQUIRE_LOCK(&lk); 11219 jlow = 0; 11220 /* 11221 * Check for journal space if needed. 11222 */ 11223 if (DOINGSUJ(vp)) { 11224 ump = VFSTOUFS(vp->v_mount); 11225 if (journal_space(ump, 0) == 0) 11226 jlow = 1; 11227 } 11228 max_softdeps_hard = max_softdeps * 11 / 10; 11229 if (dep_current[D_DIRREM] < max_softdeps_hard / 2 && 11230 dep_current[D_INODEDEP] < max_softdeps_hard && 11231 VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps && 11232 dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0) { 11233 FREE_LOCK(&lk); 11234 return (0); 11235 } 11236 if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow) 11237 softdep_speedup(); 11238 stat_sync_limit_hit += 1; 11239 FREE_LOCK(&lk); 11240 return (1); 11241 } 11242 11243 /* 11244 * Called by the allocation routines when they are about to fail 11245 * in the hope that we can free up the requested resource (inodes 11246 * or disk space). 11247 * 11248 * First check to see if the work list has anything on it. If it has, 11249 * clean up entries until we successfully free the requested resource. 
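 * (The cleanup itself is driven by process_removes() and
 * process_worklist_item() in the retry loop below.)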
11250 * Because this process holds inodes locked, we cannot handle any remove
11251 * requests that might block on a locked inode as that could lead to
11252 * deadlock. If the worklist yields none of the requested resource,
11253 * start syncing out vnodes to free up the needed space.
11254 */
11255 int
11256 softdep_request_cleanup(fs, vp, cred, resource)
11257 struct fs *fs;
11258 struct vnode *vp;
11259 struct ucred *cred;
11260 int resource;
11261 {
11262 struct ufsmount *ump;
11263 struct mount *mp;
11264 struct vnode *lvp, *mvp;
11265 long starttime;
11266 ufs2_daddr_t needed;
11267 int error;
11268
11269 mp = vp->v_mount;
11270 ump = VTOI(vp)->i_ump;
11271 mtx_assert(UFS_MTX(ump), MA_OWNED);
11272 if (resource == FLUSH_BLOCKS_WAIT)
11273 stat_cleanup_blkrequests += 1;
11274 else
11275 stat_cleanup_inorequests += 1;
11276 /*
11277 * If we are being called because of a process doing a
11278 * copy-on-write, then it is not safe to update the vnode
11279 * as we may recurse into the copy-on-write routine.
11280 */
11281 if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
11282 UFS_UNLOCK(ump);
11283 error = ffs_update(vp, 1);
11284 UFS_LOCK(ump);
11285 if (error != 0)
11286 return (0);
11287 }
11288 /*
11289 * If we are in need of resources, consider pausing for
11290 * tickdelay to give ourselves some breathing room.
11291 */
11292 UFS_UNLOCK(ump);
11293 ACQUIRE_LOCK(&lk);
11294 request_cleanup(UFSTOVFS(ump), resource);
11295 FREE_LOCK(&lk);
11296 UFS_LOCK(ump);
11297 /*
11298 * Now clean up at least as many resources as we will need.
11299 *
11300 * When requested to clean up inodes, the number that are needed
11301 * is set by the number of simultaneous writers (mnt_writeopcount)
11302 * plus a bit of slop (2) in case some more writers show up while
11303 * we are cleaning.
11304 *
11305 * When requested to free up space, the amount of space that
11306 * we need is enough blocks to allocate a full-sized segment
11307 * (fs_contigsumsize). The number of such segments that will
11308 * be needed is set by the number of simultaneous writers
11309 * (mnt_writeopcount) plus a bit of slop (2) in case some more
11310 * writers show up while we are cleaning.
11311 *
11312 * Additionally, if we are unprivileged and allocating space,
11313 * we need to ensure that we clean up enough blocks to get the
11314 * needed number of blocks over the threshold of the minimum
11315 * number of blocks required to be kept free by the filesystem
11316 * (fs_minfree).
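 * As a purely illustrative example (the numbers are hypothetical):
 * with 10 simultaneous writers and an fs_contigsumsize of 16, a
 * FLUSH_BLOCKS_WAIT request computes needed = (10 + 2) * 16 = 192
 * blocks, and an unprivileged requester must additionally clear
 * enough fragments to bring the free count back over fs_minfree.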
11317 */
11318 if (resource == FLUSH_INODES_WAIT) {
11319 needed = vp->v_mount->mnt_writeopcount + 2;
11320 } else if (resource == FLUSH_BLOCKS_WAIT) {
11321 needed = (vp->v_mount->mnt_writeopcount + 2) *
11322 fs->fs_contigsumsize;
11323 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
11324 needed += fragstoblks(fs,
11325 roundup((fs->fs_dsize * fs->fs_minfree / 100) -
11326 fs->fs_cstotal.cs_nffree, fs->fs_frag));
11327 } else {
11328 printf("softdep_request_cleanup: Unknown resource type %d\n",
11329 resource);
11330 return (0);
11331 }
11332 starttime = time_second;
11333 retry:
11334 while ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
11335 fs->fs_cstotal.cs_nbfree <= needed) ||
11336 (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
11337 fs->fs_cstotal.cs_nifree <= needed)) {
11338 UFS_UNLOCK(ump);
11339 ACQUIRE_LOCK(&lk);
11340 process_removes(vp);
11341 if (ump->softdep_on_worklist > 0 &&
11342 process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
11343 stat_worklist_push += 1;
11344 FREE_LOCK(&lk);
11345 UFS_LOCK(ump);
11346 continue;
11347 }
11348 FREE_LOCK(&lk);
11349 UFS_LOCK(ump);
11350 }
11351 /*
11352 * If we still need resources and there are no more worklist
11353 * entries to process to obtain them, we have to start flushing
11354 * the dirty vnodes to force the release of additional requests
11355 * to the worklist that we can then process to reap additional
11356 * resources. We walk the vnodes associated with the mount point
11357 * until we get the needed worklist requests that we can reap.
11358 */
11359 if ((resource == FLUSH_BLOCKS_WAIT &&
11360 fs->fs_cstotal.cs_nbfree <= needed) ||
11361 (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
11362 fs->fs_cstotal.cs_nifree <= needed)) {
11363 UFS_UNLOCK(ump);
11364 MNT_ILOCK(mp);
11365 MNT_VNODE_FOREACH(lvp, mp, mvp) {
11366 UFS_LOCK(ump);
11367 if (ump->softdep_on_worklist > 0) {
11368 UFS_UNLOCK(ump);
11369 MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
11370 MNT_IUNLOCK(mp);
11371 UFS_LOCK(ump);
11372 stat_cleanup_retries += 1;
11373 goto retry;
11374 }
11375 UFS_UNLOCK(ump);
11376 VI_LOCK(lvp);
11377 if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0 ||
11378 VOP_ISLOCKED(lvp) != 0) {
11379 VI_UNLOCK(lvp);
11380 continue;
11381 }
11382 MNT_IUNLOCK(mp);
11383 if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) {
11384 MNT_ILOCK(mp);
11385 continue;
11386 }
11387 if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */
11388 vput(lvp);
11389 MNT_ILOCK(mp);
11390 continue;
11391 }
11392 (void) ffs_syncvnode(lvp, MNT_WAIT);
11393 vput(lvp);
11394 MNT_ILOCK(mp);
11395 }
11396 MNT_IUNLOCK(mp);
11397 stat_cleanup_failures += 1;
11398 UFS_LOCK(ump);
11399 }
11400 if (time_second - starttime > stat_cleanup_high_delay)
11401 stat_cleanup_high_delay = time_second - starttime;
11402 return (1);
11403 }
11404
11405 /*
11406 * If memory utilization has gotten too high, deliberately slow things
11407 * down and speed up the I/O processing.
11408 */
11409 extern struct thread *syncertd;
11410 static int
11411 request_cleanup(mp, resource)
11412 struct mount *mp;
11413 int resource;
11414 {
11415 struct thread *td = curthread;
11416 struct ufsmount *ump;
11417
11418 mtx_assert(&lk, MA_OWNED);
11419 /*
11420 * We never hold up the filesystem syncer or buf daemon.
11421 */
11422 if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
11423 return (0);
11424 ump = VFSTOUFS(mp);
11425 /*
11426 * First check to see if the work list has gotten backlogged.
11427 * If it has, co-opt this process to help clean up two entries. 11428 * Because this process may hold inodes locked, we cannot 11429 * handle any remove requests that might block on a locked 11430 * inode as that could lead to deadlock. We set TDP_SOFTDEP 11431 * to avoid recursively processing the worklist. 11432 */ 11433 if (ump->softdep_on_worklist > max_softdeps / 10) { 11434 td->td_pflags |= TDP_SOFTDEP; 11435 process_worklist_item(mp, LK_NOWAIT); 11436 process_worklist_item(mp, LK_NOWAIT); 11437 td->td_pflags &= ~TDP_SOFTDEP; 11438 stat_worklist_push += 2; 11439 return(1); 11440 } 11441 /* 11442 * Next, we attempt to speed up the syncer process. If that 11443 * is successful, then we allow the process to continue. 11444 */ 11445 if (softdep_speedup() && 11446 resource != FLUSH_BLOCKS_WAIT && 11447 resource != FLUSH_INODES_WAIT) 11448 return(0); 11449 /* 11450 * If we are resource constrained on inode dependencies, try 11451 * flushing some dirty inodes. Otherwise, we are constrained 11452 * by file deletions, so try accelerating flushes of directories 11453 * with removal dependencies. We would like to do the cleanup 11454 * here, but we probably hold an inode locked at this point and 11455 * that might deadlock against one that we try to clean. So, 11456 * the best that we can do is request the syncer daemon to do 11457 * the cleanup for us. 11458 */ 11459 switch (resource) { 11460 11461 case FLUSH_INODES: 11462 case FLUSH_INODES_WAIT: 11463 stat_ino_limit_push += 1; 11464 req_clear_inodedeps += 1; 11465 stat_countp = &stat_ino_limit_hit; 11466 break; 11467 11468 case FLUSH_BLOCKS: 11469 case FLUSH_BLOCKS_WAIT: 11470 stat_blk_limit_push += 1; 11471 req_clear_remove += 1; 11472 stat_countp = &stat_blk_limit_hit; 11473 break; 11474 11475 default: 11476 panic("request_cleanup: unknown type"); 11477 } 11478 /* 11479 * Hopefully the syncer daemon will catch up and awaken us. 11480 * We wait at most tickdelay before proceeding in any case. 11481 */ 11482 proc_waiting += 1; 11483 if (callout_pending(&softdep_callout) == FALSE) 11484 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, 11485 pause_timer, 0); 11486 11487 msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); 11488 proc_waiting -= 1; 11489 return (1); 11490 } 11491 11492 /* 11493 * Awaken processes pausing in request_cleanup and clear proc_waiting 11494 * to indicate that there is no longer a timer running. 11495 */ 11496 static void 11497 pause_timer(arg) 11498 void *arg; 11499 { 11500 11501 /* 11502 * The callout_ API has acquired mtx and will hold it around this 11503 * function call. 11504 */ 11505 *stat_countp += 1; 11506 wakeup_one(&proc_waiting); 11507 if (proc_waiting > 0) 11508 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, 11509 pause_timer, 0); 11510 } 11511 11512 /* 11513 * Flush out a directory with at least one removal dependency in an effort to 11514 * reduce the number of dirrem, freefile, and freeblks dependency structures. 
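 * Only one directory is flushed per call; the static "next" index
 * below rotates through the pagedep hash chains so that successive
 * calls spread the work across all of the buckets.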
11515 */ 11516 static void 11517 clear_remove(td) 11518 struct thread *td; 11519 { 11520 struct pagedep_hashhead *pagedephd; 11521 struct pagedep *pagedep; 11522 static int next = 0; 11523 struct mount *mp; 11524 struct vnode *vp; 11525 struct bufobj *bo; 11526 int error, cnt; 11527 ino_t ino; 11528 11529 mtx_assert(&lk, MA_OWNED); 11530 11531 for (cnt = 0; cnt < pagedep_hash; cnt++) { 11532 pagedephd = &pagedep_hashtbl[next++]; 11533 if (next >= pagedep_hash) 11534 next = 0; 11535 LIST_FOREACH(pagedep, pagedephd, pd_hash) { 11536 if (LIST_EMPTY(&pagedep->pd_dirremhd)) 11537 continue; 11538 mp = pagedep->pd_list.wk_mp; 11539 ino = pagedep->pd_ino; 11540 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) 11541 continue; 11542 FREE_LOCK(&lk); 11543 11544 /* 11545 * Let unmount clear deps 11546 */ 11547 error = vfs_busy(mp, MBF_NOWAIT); 11548 if (error != 0) 11549 goto finish_write; 11550 error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, 11551 FFSV_FORCEINSMQ); 11552 vfs_unbusy(mp); 11553 if (error != 0) { 11554 softdep_error("clear_remove: vget", error); 11555 goto finish_write; 11556 } 11557 if ((error = ffs_syncvnode(vp, MNT_NOWAIT))) 11558 softdep_error("clear_remove: fsync", error); 11559 bo = &vp->v_bufobj; 11560 BO_LOCK(bo); 11561 drain_output(vp); 11562 BO_UNLOCK(bo); 11563 vput(vp); 11564 finish_write: 11565 vn_finished_write(mp); 11566 ACQUIRE_LOCK(&lk); 11567 return; 11568 } 11569 } 11570 } 11571 11572 /* 11573 * Clear out a block of dirty inodes in an effort to reduce 11574 * the number of inodedep dependency structures. 11575 */ 11576 static void 11577 clear_inodedeps(td) 11578 struct thread *td; 11579 { 11580 struct inodedep_hashhead *inodedephd; 11581 struct inodedep *inodedep; 11582 static int next = 0; 11583 struct mount *mp; 11584 struct vnode *vp; 11585 struct fs *fs; 11586 int error, cnt; 11587 ino_t firstino, lastino, ino; 11588 11589 mtx_assert(&lk, MA_OWNED); 11590 /* 11591 * Pick a random inode dependency to be cleared. 11592 * We will then gather up all the inodes in its block 11593 * that have dependencies and flush them out. 11594 */ 11595 for (cnt = 0; cnt < inodedep_hash; cnt++) { 11596 inodedephd = &inodedep_hashtbl[next++]; 11597 if (next >= inodedep_hash) 11598 next = 0; 11599 if ((inodedep = LIST_FIRST(inodedephd)) != NULL) 11600 break; 11601 } 11602 if (inodedep == NULL) 11603 return; 11604 fs = inodedep->id_fs; 11605 mp = inodedep->id_list.wk_mp; 11606 /* 11607 * Find the last inode in the block with dependencies. 11608 */ 11609 firstino = inodedep->id_ino & ~(INOPB(fs) - 1); 11610 for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) 11611 if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0) 11612 break; 11613 /* 11614 * Asynchronously push all but the last inode with dependencies. 11615 * Synchronously push the last inode with dependencies to ensure 11616 * that the inode block gets written to free up the inodedeps. 
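 * (Pushing the last one with MNT_WAIT forces the shared inode block
 * itself to disk, which is what finally releases the inodedeps for
 * every inode in that block.)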
11617 */ 11618 for (ino = firstino; ino <= lastino; ino++) { 11619 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) 11620 continue; 11621 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) 11622 continue; 11623 FREE_LOCK(&lk); 11624 error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */ 11625 if (error != 0) { 11626 vn_finished_write(mp); 11627 ACQUIRE_LOCK(&lk); 11628 return; 11629 } 11630 if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, 11631 FFSV_FORCEINSMQ)) != 0) { 11632 softdep_error("clear_inodedeps: vget", error); 11633 vfs_unbusy(mp); 11634 vn_finished_write(mp); 11635 ACQUIRE_LOCK(&lk); 11636 return; 11637 } 11638 vfs_unbusy(mp); 11639 if (ino == lastino) { 11640 if ((error = ffs_syncvnode(vp, MNT_WAIT))) 11641 softdep_error("clear_inodedeps: fsync1", error); 11642 } else { 11643 if ((error = ffs_syncvnode(vp, MNT_NOWAIT))) 11644 softdep_error("clear_inodedeps: fsync2", error); 11645 BO_LOCK(&vp->v_bufobj); 11646 drain_output(vp); 11647 BO_UNLOCK(&vp->v_bufobj); 11648 } 11649 vput(vp); 11650 vn_finished_write(mp); 11651 ACQUIRE_LOCK(&lk); 11652 } 11653 } 11654 11655 /* 11656 * Function to determine if the buffer has outstanding dependencies 11657 * that will cause a roll-back if the buffer is written. If wantcount 11658 * is set, return number of dependencies, otherwise just yes or no. 11659 */ 11660 static int 11661 softdep_count_dependencies(bp, wantcount) 11662 struct buf *bp; 11663 int wantcount; 11664 { 11665 struct worklist *wk; 11666 struct bmsafemap *bmsafemap; 11667 struct inodedep *inodedep; 11668 struct indirdep *indirdep; 11669 struct freeblks *freeblks; 11670 struct allocindir *aip; 11671 struct pagedep *pagedep; 11672 struct dirrem *dirrem; 11673 struct newblk *newblk; 11674 struct mkdir *mkdir; 11675 struct diradd *dap; 11676 int i, retval; 11677 11678 retval = 0; 11679 ACQUIRE_LOCK(&lk); 11680 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 11681 switch (wk->wk_type) { 11682 11683 case D_INODEDEP: 11684 inodedep = WK_INODEDEP(wk); 11685 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 11686 /* bitmap allocation dependency */ 11687 retval += 1; 11688 if (!wantcount) 11689 goto out; 11690 } 11691 if (TAILQ_FIRST(&inodedep->id_inoupdt)) { 11692 /* direct block pointer dependency */ 11693 retval += 1; 11694 if (!wantcount) 11695 goto out; 11696 } 11697 if (TAILQ_FIRST(&inodedep->id_extupdt)) { 11698 /* direct block pointer dependency */ 11699 retval += 1; 11700 if (!wantcount) 11701 goto out; 11702 } 11703 if (TAILQ_FIRST(&inodedep->id_inoreflst)) { 11704 /* Add reference dependency. */ 11705 retval += 1; 11706 if (!wantcount) 11707 goto out; 11708 } 11709 continue; 11710 11711 case D_INDIRDEP: 11712 indirdep = WK_INDIRDEP(wk); 11713 11714 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { 11715 /* indirect block pointer dependency */ 11716 retval += 1; 11717 if (!wantcount) 11718 goto out; 11719 } 11720 continue; 11721 11722 case D_PAGEDEP: 11723 pagedep = WK_PAGEDEP(wk); 11724 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { 11725 if (LIST_FIRST(&dirrem->dm_jremrefhd)) { 11726 /* Journal remove ref dependency. 
*/
11727 retval += 1;
11728 if (!wantcount)
11729 goto out;
11730 }
11731 }
11732 for (i = 0; i < DAHASHSZ; i++) {
11733
11734 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
11735 /* directory entry dependency */
11736 retval += 1;
11737 if (!wantcount)
11738 goto out;
11739 }
11740 }
11741 continue;
11742
11743 case D_BMSAFEMAP:
11744 bmsafemap = WK_BMSAFEMAP(wk);
11745 if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
11746 /* Add reference dependency. */
11747 retval += 1;
11748 if (!wantcount)
11749 goto out;
11750 }
11751 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
11752 /* Allocate block dependency. */
11753 retval += 1;
11754 if (!wantcount)
11755 goto out;
11756 }
11757 continue;
11758
11759 case D_FREEBLKS:
11760 freeblks = WK_FREEBLKS(wk);
11761 if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) {
11762 /* Freeblk journal dependency. */
11763 retval += 1;
11764 if (!wantcount)
11765 goto out;
11766 }
11767 continue;
11768
11769 case D_ALLOCDIRECT:
11770 case D_ALLOCINDIR:
11771 newblk = WK_NEWBLK(wk);
11772 if (newblk->nb_jnewblk) {
11773 /* Journal allocate dependency. */
11774 retval += 1;
11775 if (!wantcount)
11776 goto out;
11777 }
11778 continue;
11779
11780 case D_MKDIR:
11781 mkdir = WK_MKDIR(wk);
11782 if (mkdir->md_jaddref) {
11783 /* Journal reference dependency. */
11784 retval += 1;
11785 if (!wantcount)
11786 goto out;
11787 }
11788 continue;
11789
11790 case D_FREEWORK:
11791 case D_FREEDEP:
11792 case D_JSEGDEP:
11793 case D_JSEG:
11794 case D_SBDEP:
11795 /* never a dependency on these blocks */
11796 continue;
11797
11798 default:
11799 panic("softdep_count_dependencies: Unexpected type %s",
11800 TYPENAME(wk->wk_type));
11801 /* NOTREACHED */
11802 }
11803 }
11804 out:
11805 FREE_LOCK(&lk);
11806 return retval;
11807 }
11808
11809 /*
11810 * Acquire exclusive access to a buffer.
11811 * Must be called with a locked mtx parameter.
11812 * Return acquired buffer or NULL on failure.
11813 */
11814 static struct buf *
11815 getdirtybuf(bp, mtx, waitfor)
11816 struct buf *bp;
11817 struct mtx *mtx;
11818 int waitfor;
11819 {
11820 int error;
11821
11822 mtx_assert(mtx, MA_OWNED);
11823 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
11824 if (waitfor != MNT_WAIT)
11825 return (NULL);
11826 error = BUF_LOCK(bp,
11827 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
11828 /*
11829 * Even if we successfully acquire bp here, we have dropped
11830 * mtx, which may violate our guarantee.
11831 */
11832 if (error == 0)
11833 BUF_UNLOCK(bp);
11834 else if (error != ENOLCK)
11835 panic("getdirtybuf: inconsistent lock: %d", error);
11836 mtx_lock(mtx);
11837 return (NULL);
11838 }
11839 if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
11840 if (mtx == &lk && waitfor == MNT_WAIT) {
11841 mtx_unlock(mtx);
11842 BO_LOCK(bp->b_bufobj);
11843 BUF_UNLOCK(bp);
11844 if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
11845 bp->b_vflags |= BV_BKGRDWAIT;
11846 msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
11847 PRIBIO | PDROP, "getbuf", 0);
11848 } else
11849 BO_UNLOCK(bp->b_bufobj);
11850 mtx_lock(mtx);
11851 return (NULL);
11852 }
11853 BUF_UNLOCK(bp);
11854 if (waitfor != MNT_WAIT)
11855 return (NULL);
11856 /*
11857 * The mtx argument must be bp->b_vp's mutex in
11858 * this case.
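 * A background write is still in progress on the buffer, so rather
 * than returning it we record our interest with BV_BKGRDWAIT and
 * sleep on b_xflags until that write completes; the caller sees the
 * NULL return and must retry if it still needs the buffer.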
11859 */ 11860 #ifdef DEBUG_VFS_LOCKS 11861 if (bp->b_vp->v_type != VCHR) 11862 ASSERT_BO_LOCKED(bp->b_bufobj); 11863 #endif 11864 bp->b_vflags |= BV_BKGRDWAIT; 11865 msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0); 11866 return (NULL); 11867 } 11868 if ((bp->b_flags & B_DELWRI) == 0) { 11869 BUF_UNLOCK(bp); 11870 return (NULL); 11871 } 11872 bremfree(bp); 11873 return (bp); 11874 } 11875 11876 11877 /* 11878 * Check if it is safe to suspend the file system now. On entry, 11879 * the vnode interlock for devvp should be held. Return 0 with 11880 * the mount interlock held if the file system can be suspended now, 11881 * otherwise return EAGAIN with the mount interlock held. 11882 */ 11883 int 11884 softdep_check_suspend(struct mount *mp, 11885 struct vnode *devvp, 11886 int softdep_deps, 11887 int softdep_accdeps, 11888 int secondary_writes, 11889 int secondary_accwrites) 11890 { 11891 struct bufobj *bo; 11892 struct ufsmount *ump; 11893 int error; 11894 11895 ump = VFSTOUFS(mp); 11896 bo = &devvp->v_bufobj; 11897 ASSERT_BO_LOCKED(bo); 11898 11899 for (;;) { 11900 if (!TRY_ACQUIRE_LOCK(&lk)) { 11901 BO_UNLOCK(bo); 11902 ACQUIRE_LOCK(&lk); 11903 FREE_LOCK(&lk); 11904 BO_LOCK(bo); 11905 continue; 11906 } 11907 MNT_ILOCK(mp); 11908 if (mp->mnt_secondary_writes != 0) { 11909 FREE_LOCK(&lk); 11910 BO_UNLOCK(bo); 11911 msleep(&mp->mnt_secondary_writes, 11912 MNT_MTX(mp), 11913 (PUSER - 1) | PDROP, "secwr", 0); 11914 BO_LOCK(bo); 11915 continue; 11916 } 11917 break; 11918 } 11919 11920 /* 11921 * Reasons for needing more work before suspend: 11922 * - Dirty buffers on devvp. 11923 * - Softdep activity occurred after start of vnode sync loop 11924 * - Secondary writes occurred after start of vnode sync loop 11925 */ 11926 error = 0; 11927 if (bo->bo_numoutput > 0 || 11928 bo->bo_dirty.bv_cnt > 0 || 11929 softdep_deps != 0 || 11930 ump->softdep_deps != 0 || 11931 softdep_accdeps != ump->softdep_accdeps || 11932 secondary_writes != 0 || 11933 mp->mnt_secondary_writes != 0 || 11934 secondary_accwrites != mp->mnt_secondary_accwrites) 11935 error = EAGAIN; 11936 FREE_LOCK(&lk); 11937 BO_UNLOCK(bo); 11938 return (error); 11939 } 11940 11941 11942 /* 11943 * Get the number of dependency structures for the file system, both 11944 * the current number and the total number allocated. These will 11945 * later be used to detect that softdep processing has occurred. 11946 */ 11947 void 11948 softdep_get_depcounts(struct mount *mp, 11949 int *softdep_depsp, 11950 int *softdep_accdepsp) 11951 { 11952 struct ufsmount *ump; 11953 11954 ump = VFSTOUFS(mp); 11955 ACQUIRE_LOCK(&lk); 11956 *softdep_depsp = ump->softdep_deps; 11957 *softdep_accdepsp = ump->softdep_accdeps; 11958 FREE_LOCK(&lk); 11959 } 11960 11961 /* 11962 * Wait for pending output on a vnode to complete. 11963 * Must be called with vnode lock and interlock locked. 11964 * 11965 * XXX: Should just be a call to bufobj_wwait(). 11966 */ 11967 static void 11968 drain_output(vp) 11969 struct vnode *vp; 11970 { 11971 struct bufobj *bo; 11972 11973 bo = &vp->v_bufobj; 11974 ASSERT_VOP_LOCKED(vp, "drain_output"); 11975 ASSERT_BO_LOCKED(bo); 11976 11977 while (bo->bo_numoutput) { 11978 bo->bo_flag |= BO_WWAIT; 11979 msleep((caddr_t)&bo->bo_numoutput, 11980 BO_MTX(bo), PRIBIO + 1, "drainvp", 0); 11981 } 11982 } 11983 11984 /* 11985 * Called whenever a buffer that is being invalidated or reallocated 11986 * contains dependencies. This should only happen if an I/O error has 11987 * occurred. The routine is called with the buffer locked. 
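 * Because the dependencies cannot be recovered once their buffer is
 * thrown away, the routine reports the error and then panics rather
 * than silently discarding unresolved metadata updates.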
11988 */ 11989 static void 11990 softdep_deallocate_dependencies(bp) 11991 struct buf *bp; 11992 { 11993 11994 if ((bp->b_ioflags & BIO_ERROR) == 0) 11995 panic("softdep_deallocate_dependencies: dangling deps"); 11996 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); 11997 panic("softdep_deallocate_dependencies: unrecovered I/O error"); 11998 } 11999 12000 /* 12001 * Function to handle asynchronous write errors in the filesystem. 12002 */ 12003 static void 12004 softdep_error(func, error) 12005 char *func; 12006 int error; 12007 { 12008 12009 /* XXX should do something better! */ 12010 printf("%s: got error %d while accessing filesystem\n", func, error); 12011 } 12012 12013 #ifdef DDB 12014 12015 static void 12016 inodedep_print(struct inodedep *inodedep, int verbose) 12017 { 12018 db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d" 12019 " saveino %p\n", 12020 inodedep, inodedep->id_fs, inodedep->id_state, 12021 (intmax_t)inodedep->id_ino, 12022 (intmax_t)fsbtodb(inodedep->id_fs, 12023 ino_to_fsba(inodedep->id_fs, inodedep->id_ino)), 12024 inodedep->id_nlinkdelta, inodedep->id_savednlink, 12025 inodedep->id_savedino1); 12026 12027 if (verbose == 0) 12028 return; 12029 12030 db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, " 12031 "mkdiradd %p\n", 12032 LIST_FIRST(&inodedep->id_pendinghd), 12033 LIST_FIRST(&inodedep->id_bufwait), 12034 LIST_FIRST(&inodedep->id_inowait), 12035 TAILQ_FIRST(&inodedep->id_inoreflst), 12036 inodedep->id_mkdiradd); 12037 db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n", 12038 TAILQ_FIRST(&inodedep->id_inoupdt), 12039 TAILQ_FIRST(&inodedep->id_newinoupdt), 12040 TAILQ_FIRST(&inodedep->id_extupdt), 12041 TAILQ_FIRST(&inodedep->id_newextupdt)); 12042 } 12043 12044 DB_SHOW_COMMAND(inodedep, db_show_inodedep) 12045 { 12046 12047 if (have_addr == 0) { 12048 db_printf("Address required\n"); 12049 return; 12050 } 12051 inodedep_print((struct inodedep*)addr, 1); 12052 } 12053 12054 DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) 12055 { 12056 struct inodedep_hashhead *inodedephd; 12057 struct inodedep *inodedep; 12058 struct fs *fs; 12059 int cnt; 12060 12061 fs = have_addr ? 
(struct fs *)addr : NULL; 12062 for (cnt = 0; cnt < inodedep_hash; cnt++) { 12063 inodedephd = &inodedep_hashtbl[cnt]; 12064 LIST_FOREACH(inodedep, inodedephd, id_hash) { 12065 if (fs != NULL && fs != inodedep->id_fs) 12066 continue; 12067 inodedep_print(inodedep, 0); 12068 } 12069 } 12070 } 12071 12072 DB_SHOW_COMMAND(worklist, db_show_worklist) 12073 { 12074 struct worklist *wk; 12075 12076 if (have_addr == 0) { 12077 db_printf("Address required\n"); 12078 return; 12079 } 12080 wk = (struct worklist *)addr; 12081 printf("worklist: %p type %s state 0x%X\n", 12082 wk, TYPENAME(wk->wk_type), wk->wk_state); 12083 } 12084 12085 DB_SHOW_COMMAND(workhead, db_show_workhead) 12086 { 12087 struct workhead *wkhd; 12088 struct worklist *wk; 12089 int i; 12090 12091 if (have_addr == 0) { 12092 db_printf("Address required\n"); 12093 return; 12094 } 12095 wkhd = (struct workhead *)addr; 12096 wk = LIST_FIRST(wkhd); 12097 for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list)) 12098 db_printf("worklist: %p type %s state 0x%X", 12099 wk, TYPENAME(wk->wk_type), wk->wk_state); 12100 if (i == 100) 12101 db_printf("workhead overflow"); 12102 printf("\n"); 12103 } 12104 12105 12106 DB_SHOW_COMMAND(mkdirs, db_show_mkdirs) 12107 { 12108 struct jaddref *jaddref; 12109 struct diradd *diradd; 12110 struct mkdir *mkdir; 12111 12112 LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { 12113 diradd = mkdir->md_diradd; 12114 db_printf("mkdir: %p state 0x%X dap %p state 0x%X", 12115 mkdir, mkdir->md_state, diradd, diradd->da_state); 12116 if ((jaddref = mkdir->md_jaddref) != NULL) 12117 db_printf(" jaddref %p jaddref state 0x%X", 12118 jaddref, jaddref->ja_state); 12119 db_printf("\n"); 12120 } 12121 } 12122 12123 #endif /* DDB */ 12124 12125 #endif /* SOFTUPDATES */ 12126