/*-
 * Copyright 1998, 2000 Marshall Kirk McKusick.
 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
 * All rights reserved.
 *
 * The soft updates code is derived from the appendix of a University
 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
 * "Soft Updates: A Solution to the Metadata Update Problem in File
 * Systems", CSE-TR-254-95, August 1995).
 *
 * Further information about soft updates can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ffs.h"
#include "opt_ddb.h"

/*
 * For now we want the safety net that the DEBUG flag provides.
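 * With DEBUG defined, the worklist macros later in this file expand to
 * checking functions rather than bare list operations, e.g.
 *
 *	#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
 *
 * where worklist_insert() asserts that the softdep lock is held and panics
 * if the item is already on a list.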
50 */ 51 #ifndef DEBUG 52 #define DEBUG 53 #endif 54 55 #include <sys/param.h> 56 #include <sys/kernel.h> 57 #include <sys/systm.h> 58 #include <sys/bio.h> 59 #include <sys/buf.h> 60 #include <sys/kdb.h> 61 #include <sys/kthread.h> 62 #include <sys/lock.h> 63 #include <sys/malloc.h> 64 #include <sys/mount.h> 65 #include <sys/mutex.h> 66 #include <sys/namei.h> 67 #include <sys/proc.h> 68 #include <sys/stat.h> 69 #include <sys/sysctl.h> 70 #include <sys/syslog.h> 71 #include <sys/vnode.h> 72 #include <sys/conf.h> 73 #include <ufs/ufs/dir.h> 74 #include <ufs/ufs/extattr.h> 75 #include <ufs/ufs/quota.h> 76 #include <ufs/ufs/inode.h> 77 #include <ufs/ufs/ufsmount.h> 78 #include <ufs/ffs/fs.h> 79 #include <ufs/ffs/softdep.h> 80 #include <ufs/ffs/ffs_extern.h> 81 #include <ufs/ufs/ufs_extern.h> 82 83 #include <vm/vm.h> 84 85 #include <ddb/ddb.h> 86 87 #ifndef SOFTUPDATES 88 89 int 90 softdep_flushfiles(oldmnt, flags, td) 91 struct mount *oldmnt; 92 int flags; 93 struct thread *td; 94 { 95 96 panic("softdep_flushfiles called"); 97 } 98 99 int 100 softdep_mount(devvp, mp, fs, cred) 101 struct vnode *devvp; 102 struct mount *mp; 103 struct fs *fs; 104 struct ucred *cred; 105 { 106 107 return (0); 108 } 109 110 void 111 softdep_initialize() 112 { 113 114 return; 115 } 116 117 void 118 softdep_uninitialize() 119 { 120 121 return; 122 } 123 124 void 125 softdep_unmount(mp) 126 struct mount *mp; 127 { 128 129 } 130 131 void 132 softdep_setup_sbupdate(ump, fs, bp) 133 struct ufsmount *ump; 134 struct fs *fs; 135 struct buf *bp; 136 { 137 } 138 139 void 140 softdep_setup_inomapdep(bp, ip, newinum) 141 struct buf *bp; 142 struct inode *ip; 143 ino_t newinum; 144 { 145 146 panic("softdep_setup_inomapdep called"); 147 } 148 149 void 150 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 151 struct buf *bp; 152 struct mount *mp; 153 ufs2_daddr_t newblkno; 154 int frags; 155 int oldfrags; 156 { 157 158 panic("softdep_setup_blkmapdep called"); 159 } 160 161 void 162 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 163 struct inode *ip; 164 ufs_lbn_t lbn; 165 ufs2_daddr_t newblkno; 166 ufs2_daddr_t oldblkno; 167 long newsize; 168 long oldsize; 169 struct buf *bp; 170 { 171 172 panic("softdep_setup_allocdirect called"); 173 } 174 175 void 176 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 177 struct inode *ip; 178 ufs_lbn_t lbn; 179 ufs2_daddr_t newblkno; 180 ufs2_daddr_t oldblkno; 181 long newsize; 182 long oldsize; 183 struct buf *bp; 184 { 185 186 panic("softdep_setup_allocext called"); 187 } 188 189 void 190 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 191 struct inode *ip; 192 ufs_lbn_t lbn; 193 struct buf *bp; 194 int ptrno; 195 ufs2_daddr_t newblkno; 196 ufs2_daddr_t oldblkno; 197 struct buf *nbp; 198 { 199 200 panic("softdep_setup_allocindir_page called"); 201 } 202 203 void 204 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 205 struct buf *nbp; 206 struct inode *ip; 207 struct buf *bp; 208 int ptrno; 209 ufs2_daddr_t newblkno; 210 { 211 212 panic("softdep_setup_allocindir_meta called"); 213 } 214 215 void 216 softdep_setup_freeblocks(ip, length, flags) 217 struct inode *ip; 218 off_t length; 219 int flags; 220 { 221 222 panic("softdep_setup_freeblocks called"); 223 } 224 225 void 226 softdep_freefile(pvp, ino, mode) 227 struct vnode *pvp; 228 ino_t ino; 229 int mode; 230 { 231 232 panic("softdep_freefile called"); 233 } 234 235 int 236 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, 
isnewblk) 237 struct buf *bp; 238 struct inode *dp; 239 off_t diroffset; 240 ino_t newinum; 241 struct buf *newdirbp; 242 int isnewblk; 243 { 244 245 panic("softdep_setup_directory_add called"); 246 } 247 248 void 249 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 250 struct buf *bp; 251 struct inode *dp; 252 caddr_t base; 253 caddr_t oldloc; 254 caddr_t newloc; 255 int entrysize; 256 { 257 258 panic("softdep_change_directoryentry_offset called"); 259 } 260 261 void 262 softdep_setup_remove(bp, dp, ip, isrmdir) 263 struct buf *bp; 264 struct inode *dp; 265 struct inode *ip; 266 int isrmdir; 267 { 268 269 panic("softdep_setup_remove called"); 270 } 271 272 void 273 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 274 struct buf *bp; 275 struct inode *dp; 276 struct inode *ip; 277 ino_t newinum; 278 int isrmdir; 279 { 280 281 panic("softdep_setup_directory_change called"); 282 } 283 284 void * 285 softdep_setup_trunc(vp, length, flags) 286 struct vnode *vp; 287 off_t length; 288 int flags; 289 { 290 291 panic("%s called", __FUNCTION__); 292 293 return (NULL); 294 } 295 296 int 297 softdep_complete_trunc(vp, cookie) 298 struct vnode *vp; 299 void *cookie; 300 { 301 302 panic("%s called", __FUNCTION__); 303 304 return (0); 305 } 306 307 void 308 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 309 struct mount *mp; 310 struct buf *bp; 311 ufs2_daddr_t blkno; 312 int frags; 313 struct workhead *wkhd; 314 { 315 316 panic("%s called", __FUNCTION__); 317 } 318 319 void 320 softdep_setup_inofree(mp, bp, ino, wkhd) 321 struct mount *mp; 322 struct buf *bp; 323 ino_t ino; 324 struct workhead *wkhd; 325 { 326 327 panic("%s called", __FUNCTION__); 328 } 329 330 void 331 softdep_setup_unlink(dp, ip) 332 struct inode *dp; 333 struct inode *ip; 334 { 335 336 panic("%s called", __FUNCTION__); 337 } 338 339 void 340 softdep_setup_link(dp, ip) 341 struct inode *dp; 342 struct inode *ip; 343 { 344 345 panic("%s called", __FUNCTION__); 346 } 347 348 void 349 softdep_revert_link(dp, ip) 350 struct inode *dp; 351 struct inode *ip; 352 { 353 354 panic("%s called", __FUNCTION__); 355 } 356 357 void 358 softdep_setup_rmdir(dp, ip) 359 struct inode *dp; 360 struct inode *ip; 361 { 362 363 panic("%s called", __FUNCTION__); 364 } 365 366 void 367 softdep_revert_rmdir(dp, ip) 368 struct inode *dp; 369 struct inode *ip; 370 { 371 372 panic("%s called", __FUNCTION__); 373 } 374 375 void 376 softdep_setup_create(dp, ip) 377 struct inode *dp; 378 struct inode *ip; 379 { 380 381 panic("%s called", __FUNCTION__); 382 } 383 384 void 385 softdep_revert_create(dp, ip) 386 struct inode *dp; 387 struct inode *ip; 388 { 389 390 panic("%s called", __FUNCTION__); 391 } 392 393 void 394 softdep_setup_mkdir(dp, ip) 395 struct inode *dp; 396 struct inode *ip; 397 { 398 399 panic("%s called", __FUNCTION__); 400 } 401 402 void 403 softdep_revert_mkdir(dp, ip) 404 struct inode *dp; 405 struct inode *ip; 406 { 407 408 panic("%s called", __FUNCTION__); 409 } 410 411 void 412 softdep_setup_dotdot_link(dp, ip) 413 struct inode *dp; 414 struct inode *ip; 415 { 416 417 panic("%s called", __FUNCTION__); 418 } 419 420 int 421 softdep_prealloc(vp, waitok) 422 struct vnode *vp; 423 int waitok; 424 { 425 426 panic("%s called", __FUNCTION__); 427 428 return (0); 429 } 430 431 int 432 softdep_journal_lookup(mp, vpp) 433 struct mount *mp; 434 struct vnode **vpp; 435 { 436 437 return (ENOENT); 438 } 439 440 void 441 softdep_change_linkcnt(ip) 442 struct inode *ip; 443 { 444 445 panic("softdep_change_linkcnt 
called"); 446 } 447 448 void 449 softdep_load_inodeblock(ip) 450 struct inode *ip; 451 { 452 453 panic("softdep_load_inodeblock called"); 454 } 455 456 void 457 softdep_update_inodeblock(ip, bp, waitfor) 458 struct inode *ip; 459 struct buf *bp; 460 int waitfor; 461 { 462 463 panic("softdep_update_inodeblock called"); 464 } 465 466 int 467 softdep_fsync(vp) 468 struct vnode *vp; /* the "in_core" copy of the inode */ 469 { 470 471 return (0); 472 } 473 474 void 475 softdep_fsync_mountdev(vp) 476 struct vnode *vp; 477 { 478 479 return; 480 } 481 482 int 483 softdep_flushworklist(oldmnt, countp, td) 484 struct mount *oldmnt; 485 int *countp; 486 struct thread *td; 487 { 488 489 *countp = 0; 490 return (0); 491 } 492 493 int 494 softdep_sync_metadata(struct vnode *vp) 495 { 496 497 return (0); 498 } 499 500 int 501 softdep_slowdown(vp) 502 struct vnode *vp; 503 { 504 505 panic("softdep_slowdown called"); 506 } 507 508 void 509 softdep_releasefile(ip) 510 struct inode *ip; /* inode with the zero effective link count */ 511 { 512 513 panic("softdep_releasefile called"); 514 } 515 516 int 517 softdep_request_cleanup(fs, vp) 518 struct fs *fs; 519 struct vnode *vp; 520 { 521 522 return (0); 523 } 524 525 int 526 softdep_check_suspend(struct mount *mp, 527 struct vnode *devvp, 528 int softdep_deps, 529 int softdep_accdeps, 530 int secondary_writes, 531 int secondary_accwrites) 532 { 533 struct bufobj *bo; 534 int error; 535 536 (void) softdep_deps, 537 (void) softdep_accdeps; 538 539 bo = &devvp->v_bufobj; 540 ASSERT_BO_LOCKED(bo); 541 542 MNT_ILOCK(mp); 543 while (mp->mnt_secondary_writes != 0) { 544 BO_UNLOCK(bo); 545 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), 546 (PUSER - 1) | PDROP, "secwr", 0); 547 BO_LOCK(bo); 548 MNT_ILOCK(mp); 549 } 550 551 /* 552 * Reasons for needing more work before suspend: 553 * - Dirty buffers on devvp. 554 * - Secondary writes occurred after start of vnode sync loop 555 */ 556 error = 0; 557 if (bo->bo_numoutput > 0 || 558 bo->bo_dirty.bv_cnt > 0 || 559 secondary_writes != 0 || 560 mp->mnt_secondary_writes != 0 || 561 secondary_accwrites != mp->mnt_secondary_accwrites) 562 error = EAGAIN; 563 BO_UNLOCK(bo); 564 return (error); 565 } 566 567 void 568 softdep_get_depcounts(struct mount *mp, 569 int *softdepactivep, 570 int *softdepactiveaccp) 571 { 572 (void) mp; 573 *softdepactivep = 0; 574 *softdepactiveaccp = 0; 575 } 576 577 #else 578 /* 579 * These definitions need to be adapted to the system to which 580 * this file is being ported. 
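 *
 * In particular, the D_* workitem type numbers defined below index both the
 * dep_current[]/dep_total[] statistics arrays and the memtype[] table that
 * maps a workitem type to its malloc(9) type, so the three must stay in the
 * same order (see the comment above memtype[]).  A hypothetical compile-time
 * check, not present in this file, could express that invariant as
 *
 *	CTASSERT(sizeof(memtype) / sizeof(memtype[0]) == D_LAST + 1);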
581 */ 582 583 #define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE) 584 585 #define D_PAGEDEP 0 586 #define D_INODEDEP 1 587 #define D_BMSAFEMAP 2 588 #define D_NEWBLK 3 589 #define D_ALLOCDIRECT 4 590 #define D_INDIRDEP 5 591 #define D_ALLOCINDIR 6 592 #define D_FREEFRAG 7 593 #define D_FREEBLKS 8 594 #define D_FREEFILE 9 595 #define D_DIRADD 10 596 #define D_MKDIR 11 597 #define D_DIRREM 12 598 #define D_NEWDIRBLK 13 599 #define D_FREEWORK 14 600 #define D_FREEDEP 15 601 #define D_JADDREF 16 602 #define D_JREMREF 17 603 #define D_JMVREF 18 604 #define D_JNEWBLK 19 605 #define D_JFREEBLK 20 606 #define D_JFREEFRAG 21 607 #define D_JSEG 22 608 #define D_JSEGDEP 23 609 #define D_SBDEP 24 610 #define D_JTRUNC 25 611 #define D_LAST D_JTRUNC 612 613 unsigned long dep_current[D_LAST + 1]; 614 unsigned long dep_total[D_LAST + 1]; 615 616 617 SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats"); 618 SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, 619 "total dependencies allocated"); 620 SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, 621 "current dependencies allocated"); 622 623 #define SOFTDEP_TYPE(type, str, long) \ 624 static MALLOC_DEFINE(M_ ## type, #str, long); \ 625 SYSCTL_LONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ 626 &dep_total[D_ ## type], 0, ""); \ 627 SYSCTL_LONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ 628 &dep_current[D_ ## type], 0, ""); 629 630 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); 631 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); 632 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, 633 "Block or frag allocated from cyl group map"); 634 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); 635 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); 636 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); 637 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); 638 SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); 639 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); 640 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); 641 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); 642 SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); 643 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); 644 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); 645 SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); 646 SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); 647 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); 648 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); 649 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); 650 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); 651 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); 652 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); 653 SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); 654 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); 655 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); 656 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); 657 658 static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes"); 659 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); 660 661 /* 662 * translate from workitem type to memory type 663 * MUST match the defines above, such that memtype[D_XXX] == M_XXX 664 */ 665 static struct malloc_type *memtype[] = { 666 M_PAGEDEP, 667 M_INODEDEP, 668 M_BMSAFEMAP, 669 M_NEWBLK, 670 M_ALLOCDIRECT, 671 
M_INDIRDEP, 672 M_ALLOCINDIR, 673 M_FREEFRAG, 674 M_FREEBLKS, 675 M_FREEFILE, 676 M_DIRADD, 677 M_MKDIR, 678 M_DIRREM, 679 M_NEWDIRBLK, 680 M_FREEWORK, 681 M_FREEDEP, 682 M_JADDREF, 683 M_JREMREF, 684 M_JMVREF, 685 M_JNEWBLK, 686 M_JFREEBLK, 687 M_JFREEFRAG, 688 M_JSEG, 689 M_JSEGDEP, 690 M_SBDEP, 691 M_JTRUNC 692 }; 693 694 #define DtoM(type) (memtype[type]) 695 696 /* 697 * Names of malloc types. 698 */ 699 #define TYPENAME(type) \ 700 ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???") 701 /* 702 * End system adaptation definitions. 703 */ 704 705 #define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) 706 #define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) 707 708 /* 709 * Forward declarations. 710 */ 711 struct inodedep_hashhead; 712 struct newblk_hashhead; 713 struct pagedep_hashhead; 714 struct bmsafemap_hashhead; 715 716 /* 717 * Internal function prototypes. 718 */ 719 static void softdep_error(char *, int); 720 static void drain_output(struct vnode *); 721 static struct buf *getdirtybuf(struct buf *, struct mtx *, int); 722 static void clear_remove(struct thread *); 723 static void clear_inodedeps(struct thread *); 724 static void unlinked_inodedep(struct mount *, struct inodedep *); 725 static void clear_unlinked_inodedep(struct inodedep *); 726 static struct inodedep *first_unlinked_inodedep(struct ufsmount *); 727 static int flush_pagedep_deps(struct vnode *, struct mount *, 728 struct diraddhd *); 729 static void free_pagedep(struct pagedep *); 730 static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); 731 static int flush_inodedep_deps(struct mount *, ino_t); 732 static int flush_deplist(struct allocdirectlst *, int, int *); 733 static int handle_written_filepage(struct pagedep *, struct buf *); 734 static int handle_written_sbdep(struct sbdep *, struct buf *); 735 static void initiate_write_sbdep(struct sbdep *); 736 static void diradd_inode_written(struct diradd *, struct inodedep *); 737 static int handle_written_indirdep(struct indirdep *, struct buf *, 738 struct buf**); 739 static int handle_written_inodeblock(struct inodedep *, struct buf *); 740 static int handle_written_bmsafemap(struct bmsafemap *, struct buf *); 741 static void handle_written_jaddref(struct jaddref *); 742 static void handle_written_jremref(struct jremref *); 743 static void handle_written_jseg(struct jseg *, struct buf *); 744 static void handle_written_jnewblk(struct jnewblk *); 745 static void handle_written_jfreeblk(struct jfreeblk *); 746 static void handle_written_jfreefrag(struct jfreefrag *); 747 static void complete_jseg(struct jseg *); 748 static void jseg_write(struct fs *, struct jblocks *, struct jseg *, 749 uint8_t *); 750 static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); 751 static void jremref_write(struct jremref *, struct jseg *, uint8_t *); 752 static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *); 753 static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *); 754 static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *); 755 static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *); 756 static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *); 757 static inline void inoref_write(struct inoref *, struct jseg *, 758 struct jrefrec *); 759 static void handle_allocdirect_partdone(struct allocdirect *, 760 struct workhead *); 761 static void cancel_newblk(struct newblk *, struct workhead *); 762 static void indirdep_complete(struct indirdep *); 763 
static void handle_allocindir_partdone(struct allocindir *); 764 static void initiate_write_filepage(struct pagedep *, struct buf *); 765 static void initiate_write_indirdep(struct indirdep*, struct buf *); 766 static void handle_written_mkdir(struct mkdir *, int); 767 static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); 768 static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); 769 static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); 770 static void handle_workitem_freefile(struct freefile *); 771 static void handle_workitem_remove(struct dirrem *, struct vnode *); 772 static struct dirrem *newdirrem(struct buf *, struct inode *, 773 struct inode *, int, struct dirrem **); 774 static void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *, 775 struct freeblks *); 776 static void free_indirdep(struct indirdep *); 777 static void free_diradd(struct diradd *, struct workhead *); 778 static void merge_diradd(struct inodedep *, struct diradd *); 779 static void complete_diradd(struct diradd *); 780 static struct diradd *diradd_lookup(struct pagedep *, int); 781 static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, 782 struct jremref *); 783 static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, 784 struct jremref *); 785 static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, 786 struct jremref *, struct jremref *); 787 static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, 788 struct jremref *); 789 static void cancel_allocindir(struct allocindir *, struct inodedep *, 790 struct freeblks *); 791 static void complete_mkdir(struct mkdir *); 792 static void free_newdirblk(struct newdirblk *); 793 static void free_jremref(struct jremref *); 794 static void free_jaddref(struct jaddref *); 795 static void free_jsegdep(struct jsegdep *); 796 static void free_jseg(struct jseg *); 797 static void free_jnewblk(struct jnewblk *); 798 static void free_jfreeblk(struct jfreeblk *); 799 static void free_jfreefrag(struct jfreefrag *); 800 static void free_freedep(struct freedep *); 801 static void journal_jremref(struct dirrem *, struct jremref *, 802 struct inodedep *); 803 static void cancel_jnewblk(struct jnewblk *, struct workhead *); 804 static int cancel_jaddref(struct jaddref *, struct inodedep *, 805 struct workhead *); 806 static void cancel_jfreefrag(struct jfreefrag *); 807 static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); 808 static int deallocate_dependencies(struct buf *, struct inodedep *, 809 struct freeblks *); 810 static void free_newblk(struct newblk *); 811 static void cancel_allocdirect(struct allocdirectlst *, 812 struct allocdirect *, struct freeblks *, int); 813 static int check_inode_unwritten(struct inodedep *); 814 static int free_inodedep(struct inodedep *); 815 static void freework_freeblock(struct freework *); 816 static void handle_workitem_freeblocks(struct freeblks *, int); 817 static void handle_complete_freeblocks(struct freeblks *); 818 static void handle_workitem_indirblk(struct freework *); 819 static void handle_written_freework(struct freework *); 820 static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); 821 static void setup_allocindir_phase2(struct buf *, struct inode *, 822 struct inodedep *, struct allocindir *, ufs_lbn_t); 823 static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, 824 ufs2_daddr_t, ufs_lbn_t); 825 static void 
handle_workitem_freefrag(struct freefrag *); 826 static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, 827 ufs_lbn_t); 828 static void allocdirect_merge(struct allocdirectlst *, 829 struct allocdirect *, struct allocdirect *); 830 static struct freefrag *allocindir_merge(struct allocindir *, 831 struct allocindir *); 832 static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int, 833 struct bmsafemap **); 834 static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, 835 int cg); 836 static int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t, 837 int, struct newblk **); 838 static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); 839 static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, 840 struct inodedep **); 841 static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); 842 static int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int, 843 struct pagedep **); 844 static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, 845 struct mount *mp, int, struct pagedep **); 846 static void pause_timer(void *); 847 static int request_cleanup(struct mount *, int); 848 static int process_worklist_item(struct mount *, int); 849 static void process_removes(struct vnode *); 850 static void jwork_move(struct workhead *, struct workhead *); 851 static void add_to_worklist(struct worklist *, int); 852 static void remove_from_worklist(struct worklist *); 853 static void softdep_flush(void); 854 static int softdep_speedup(void); 855 static void worklist_speedup(void); 856 static int journal_mount(struct mount *, struct fs *, struct ucred *); 857 static void journal_unmount(struct mount *); 858 static int journal_space(struct ufsmount *, int); 859 static void journal_suspend(struct ufsmount *); 860 static int journal_unsuspend(struct ufsmount *ump); 861 static void softdep_prelink(struct vnode *, struct vnode *); 862 static void add_to_journal(struct worklist *); 863 static void remove_from_journal(struct worklist *); 864 static void softdep_process_journal(struct mount *, int); 865 static struct jremref *newjremref(struct dirrem *, struct inode *, 866 struct inode *ip, off_t, nlink_t); 867 static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, 868 uint16_t); 869 static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, 870 uint16_t); 871 static inline struct jsegdep *inoref_jseg(struct inoref *); 872 static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); 873 static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, 874 ufs2_daddr_t, int); 875 static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, 876 ufs2_daddr_t, long, ufs_lbn_t); 877 static struct freework *newfreework(struct freeblks *, struct freework *, 878 ufs_lbn_t, ufs2_daddr_t, int, int); 879 static void jwait(struct worklist *wk); 880 static struct inodedep *inodedep_lookup_ip(struct inode *); 881 static int bmsafemap_rollbacks(struct bmsafemap *); 882 static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); 883 static void handle_jwork(struct workhead *); 884 static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, 885 struct mkdir **); 886 static struct jblocks *jblocks_create(void); 887 static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); 888 static void jblocks_free(struct jblocks *, struct mount *, int); 889 static void jblocks_destroy(struct jblocks *); 890 static void 
	jblocks_add(struct jblocks *, ufs2_daddr_t, int);

/*
 * Exported softdep operations.
 */
static void softdep_disk_io_initiation(struct buf *);
static void softdep_disk_write_complete(struct buf *);
static void softdep_deallocate_dependencies(struct buf *);
static int softdep_count_dependencies(struct buf *bp, int);

static struct mtx lk;
MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);

#define TRY_ACQUIRE_LOCK(lk)		mtx_trylock(lk)
#define ACQUIRE_LOCK(lk)		mtx_lock(lk)
#define FREE_LOCK(lk)			mtx_unlock(lk)

#define	BUF_AREC(bp)			lockallowrecurse(&(bp)->b_lock)
#define	BUF_NOREC(bp)			lockdisablerecurse(&(bp)->b_lock)

/*
 * Worklist queue management.
 * These routines require that the lock be held.
 */
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do {	\
	(item)->wk_state |= ONWORKLIST;		\
	LIST_INSERT_HEAD(head, item, wk_list);	\
} while (0)
#define WORKLIST_REMOVE(item) do {		\
	(item)->wk_state &= ~ONWORKLIST;	\
	LIST_REMOVE(item, wk_list);		\
} while (0)
#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE

#else /* DEBUG */
static void worklist_insert(struct workhead *, struct worklist *, int);
static void worklist_remove(struct worklist *, int);

#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)

static void
worklist_insert(head, item, locked)
	struct workhead *head;
	struct worklist *item;
	int locked;
{

	if (locked)
		mtx_assert(&lk, MA_OWNED);
	if (item->wk_state & ONWORKLIST)
		panic("worklist_insert: %p %s(0x%X) already on list",
		    item, TYPENAME(item->wk_type), item->wk_state);
	item->wk_state |= ONWORKLIST;
	LIST_INSERT_HEAD(head, item, wk_list);
}

static void
worklist_remove(item, locked)
	struct worklist *item;
	int locked;
{

	if (locked)
		mtx_assert(&lk, MA_OWNED);
	if ((item->wk_state & ONWORKLIST) == 0)
		panic("worklist_remove: %p %s(0x%X) not on list",
		    item, TYPENAME(item->wk_type), item->wk_state);
	item->wk_state &= ~ONWORKLIST;
	LIST_REMOVE(item, wk_list);
}
#endif /* DEBUG */

/*
 * Merge two jsegdeps keeping only the oldest one as newer references
 * can't be discarded until after older references.
 */
static inline struct jsegdep *
jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
{
	struct jsegdep *swp;

	if (two == NULL)
		return (one);

	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
		swp = one;
		one = two;
		two = swp;
	}
	WORKLIST_REMOVE(&two->jd_list);
	free_jsegdep(two);

	return (one);
}

/*
 * If two freedeps are compatible free one to reduce list size.
 */
static inline struct freedep *
freedep_merge(struct freedep *one, struct freedep *two)
{
	if (two == NULL)
		return (one);

	if (one->fd_freework == two->fd_freework) {
		WORKLIST_REMOVE(&two->fd_list);
		free_freedep(two);
	}
	return (one);
}

/*
 * Move journal work from one list to another. Duplicate freedeps and
 * jsegdeps are coalesced to keep the lists as small as possible.
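 *
 * An illustrative walk-through (not taken from the surrounding code): if
 * dst already holds a jsegdep referencing journal segment sequence 3 and
 * src contributes one referencing sequence 5, jsegdep_merge() frees the
 * sequence-5 entry and keeps sequence 3, since journal segments are retired
 * in sequence order and holding the oldest reference also holds every newer
 * one.  Likewise two freedeps pointing at the same freework are redundant,
 * so freedep_merge() frees the second.  Roughly:
 *
 *	jsegdep = NULL;
 *	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn)
 *		if (wk->wk_type == D_JSEGDEP)
 *			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
 *
 * leaves exactly one jsegdep (the oldest) on dst.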
 */
static void
jwork_move(dst, src)
	struct workhead *dst;
	struct workhead *src;
{
	struct freedep *freedep;
	struct jsegdep *jsegdep;
	struct worklist *wkn;
	struct worklist *wk;

	KASSERT(dst != src,
	    ("jwork_move: dst == src"));
	freedep = NULL;
	jsegdep = NULL;
	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
		if (wk->wk_type == D_JSEGDEP)
			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
		if (wk->wk_type == D_FREEDEP)
			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
	}

	mtx_assert(&lk, MA_OWNED);
	while ((wk = LIST_FIRST(src)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(dst, wk);
		if (wk->wk_type == D_JSEGDEP) {
			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
			continue;
		}
		if (wk->wk_type == D_FREEDEP)
			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
	}
}

/*
 * Routines for tracking and managing workitems.
 */
static void workitem_free(struct worklist *, int);
static void workitem_alloc(struct worklist *, int, struct mount *);

#define	WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))

static void
workitem_free(item, type)
	struct worklist *item;
	int type;
{
	struct ufsmount *ump;
	mtx_assert(&lk, MA_OWNED);

#ifdef DEBUG
	if (item->wk_state & ONWORKLIST)
		panic("workitem_free: %s(0x%X) still on list",
		    TYPENAME(item->wk_type), item->wk_state);
	if (item->wk_type != type)
		panic("workitem_free: type mismatch %s != %s",
		    TYPENAME(item->wk_type), TYPENAME(type));
#endif
	ump = VFSTOUFS(item->wk_mp);
	if (--ump->softdep_deps == 0 && ump->softdep_req)
		wakeup(&ump->softdep_deps);
	dep_current[type]--;
	free(item, DtoM(type));
}

static void
workitem_alloc(item, type, mp)
	struct worklist *item;
	int type;
	struct mount *mp;
{
	item->wk_type = type;
	item->wk_mp = mp;
	item->wk_state = 0;
	ACQUIRE_LOCK(&lk);
	dep_current[type]++;
	dep_total[type]++;
	VFSTOUFS(mp)->softdep_deps++;
	VFSTOUFS(mp)->softdep_accdeps++;
	FREE_LOCK(&lk);
}

/*
 * Workitem queue management
 */
static int max_softdeps;	/* maximum number of structs before slowdown */
static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
static int tickdelay = 2;	/* number of ticks to pause during slowdown */
static int proc_waiting;	/* tracks whether we have a timeout posted */
static int *stat_countp;	/* statistic to count in proc_waiting timeout */
static struct callout softdep_callout;
static int req_pending;
static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
#define FLUSH_INODES		1
static int req_clear_remove;	/* syncer process flush some freeblks */
#define FLUSH_REMOVE		2
#define FLUSH_REMOVE_WAIT	3
static long num_freeblkdep;	/* number of freeblks workitems allocated */

/*
 * runtime statistics
 */
static int stat_worklist_push;	/* number of worklist cleanups */
static int stat_blk_limit_push;	/* number of times block limit neared */
static int stat_ino_limit_push;	/* number of times inode limit neared */
static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
static int
stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ 1118 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ 1119 static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ 1120 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ 1121 static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ 1122 static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */ 1123 static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */ 1124 static int stat_journal_min; /* Times hit journal min threshold */ 1125 static int stat_journal_low; /* Times hit journal low threshold */ 1126 static int stat_journal_wait; /* Times blocked in jwait(). */ 1127 static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */ 1128 static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */ 1129 static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */ 1130 static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */ 1131 1132 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, 1133 &max_softdeps, 0, ""); 1134 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, 1135 &tickdelay, 0, ""); 1136 SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW, 1137 &maxindirdeps, 0, ""); 1138 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW, 1139 &stat_worklist_push, 0,""); 1140 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW, 1141 &stat_blk_limit_push, 0,""); 1142 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW, 1143 &stat_ino_limit_push, 0,""); 1144 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW, 1145 &stat_blk_limit_hit, 0, ""); 1146 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW, 1147 &stat_ino_limit_hit, 0, ""); 1148 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW, 1149 &stat_sync_limit_hit, 0, ""); 1150 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, 1151 &stat_indir_blk_ptrs, 0, ""); 1152 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW, 1153 &stat_inode_bitmap, 0, ""); 1154 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, 1155 &stat_direct_blk_ptrs, 0, ""); 1156 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW, 1157 &stat_dir_entry, 0, ""); 1158 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW, 1159 &stat_jaddref, 0, ""); 1160 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW, 1161 &stat_jnewblk, 0, ""); 1162 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW, 1163 &stat_journal_low, 0, ""); 1164 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW, 1165 &stat_journal_min, 0, ""); 1166 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW, 1167 &stat_journal_wait, 0, ""); 1168 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW, 1169 &stat_jwait_filepage, 0, ""); 1170 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW, 1171 &stat_jwait_freeblks, 0, ""); 1172 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW, 1173 &stat_jwait_inode, 0, ""); 1174 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW, 1175 &stat_jwait_newblk, 0, ""); 1176 1177 SYSCTL_DECL(_vfs_ffs); 1178 1179 LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl; 1180 static u_long bmsafemap_hash; /* size of hash table - 1 */ 1181 1182 static int compute_summary_at_mount = 0; /* Whether to 
recompute the summary at mount time */ 1183 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, 1184 &compute_summary_at_mount, 0, "Recompute summary at mount"); 1185 1186 static struct proc *softdepproc; 1187 static struct kproc_desc softdep_kp = { 1188 "softdepflush", 1189 softdep_flush, 1190 &softdepproc 1191 }; 1192 SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, 1193 &softdep_kp); 1194 1195 static void 1196 softdep_flush(void) 1197 { 1198 struct mount *nmp; 1199 struct mount *mp; 1200 struct ufsmount *ump; 1201 struct thread *td; 1202 int remaining; 1203 int progress; 1204 int vfslocked; 1205 1206 td = curthread; 1207 td->td_pflags |= TDP_NORUNNINGBUF; 1208 1209 for (;;) { 1210 kproc_suspend_check(softdepproc); 1211 vfslocked = VFS_LOCK_GIANT((struct mount *)NULL); 1212 ACQUIRE_LOCK(&lk); 1213 /* 1214 * If requested, try removing inode or removal dependencies. 1215 */ 1216 if (req_clear_inodedeps) { 1217 clear_inodedeps(td); 1218 req_clear_inodedeps -= 1; 1219 wakeup_one(&proc_waiting); 1220 } 1221 if (req_clear_remove) { 1222 clear_remove(td); 1223 req_clear_remove -= 1; 1224 wakeup_one(&proc_waiting); 1225 } 1226 FREE_LOCK(&lk); 1227 VFS_UNLOCK_GIANT(vfslocked); 1228 remaining = progress = 0; 1229 mtx_lock(&mountlist_mtx); 1230 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 1231 nmp = TAILQ_NEXT(mp, mnt_list); 1232 if ((mp->mnt_flag & MNT_SOFTDEP) == 0) 1233 continue; 1234 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 1235 continue; 1236 vfslocked = VFS_LOCK_GIANT(mp); 1237 progress += softdep_process_worklist(mp, 0); 1238 ump = VFSTOUFS(mp); 1239 remaining += ump->softdep_on_worklist - 1240 ump->softdep_on_worklist_inprogress; 1241 VFS_UNLOCK_GIANT(vfslocked); 1242 mtx_lock(&mountlist_mtx); 1243 nmp = TAILQ_NEXT(mp, mnt_list); 1244 vfs_unbusy(mp); 1245 } 1246 mtx_unlock(&mountlist_mtx); 1247 if (remaining && progress) 1248 continue; 1249 ACQUIRE_LOCK(&lk); 1250 if (!req_pending) 1251 msleep(&req_pending, &lk, PVM, "sdflush", hz); 1252 req_pending = 0; 1253 FREE_LOCK(&lk); 1254 } 1255 } 1256 1257 static void 1258 worklist_speedup(void) 1259 { 1260 mtx_assert(&lk, MA_OWNED); 1261 if (req_pending == 0) { 1262 req_pending = 1; 1263 wakeup(&req_pending); 1264 } 1265 } 1266 1267 static int 1268 softdep_speedup(void) 1269 { 1270 1271 worklist_speedup(); 1272 bd_speedup(); 1273 return speedup_syncer(); 1274 } 1275 1276 /* 1277 * Add an item to the end of the work queue. 1278 * This routine requires that the lock be held. 1279 * This is the only routine that adds items to the list. 1280 * The following routine is the only one that removes items 1281 * and does so in order from first to last. 1282 */ 1283 static void 1284 add_to_worklist(wk, nodelay) 1285 struct worklist *wk; 1286 int nodelay; 1287 { 1288 struct ufsmount *ump; 1289 1290 mtx_assert(&lk, MA_OWNED); 1291 ump = VFSTOUFS(wk->wk_mp); 1292 if (wk->wk_state & ONWORKLIST) 1293 panic("add_to_worklist: %s(0x%X) already on list", 1294 TYPENAME(wk->wk_type), wk->wk_state); 1295 wk->wk_state |= ONWORKLIST; 1296 if (LIST_EMPTY(&ump->softdep_workitem_pending)) 1297 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); 1298 else 1299 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); 1300 ump->softdep_worklist_tail = wk; 1301 ump->softdep_on_worklist += 1; 1302 if (nodelay) 1303 worklist_speedup(); 1304 } 1305 1306 /* 1307 * Remove the item to be processed. If we are removing the last 1308 * item on the list, we need to recalculate the tail pointer. 
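 *
 * The pending queue is a singly-linked LIST with only a cached tail pointer
 * (ump->softdep_worklist_tail), so when the cached tail itself is removed
 * the only way to find the new last element is the linear walk performed
 * below.  For a queue a -> b -> c with c both the tail and the item being
 * removed, the walk stops at b, which becomes the new tail.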
1309 */ 1310 static void 1311 remove_from_worklist(wk) 1312 struct worklist *wk; 1313 { 1314 struct ufsmount *ump; 1315 struct worklist *wkend; 1316 1317 ump = VFSTOUFS(wk->wk_mp); 1318 WORKLIST_REMOVE(wk); 1319 if (wk == ump->softdep_worklist_tail) { 1320 LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list) 1321 if (LIST_NEXT(wkend, wk_list) == NULL) 1322 break; 1323 ump->softdep_worklist_tail = wkend; 1324 } 1325 ump->softdep_on_worklist -= 1; 1326 } 1327 1328 /* 1329 * Process that runs once per second to handle items in the background queue. 1330 * 1331 * Note that we ensure that everything is done in the order in which they 1332 * appear in the queue. The code below depends on this property to ensure 1333 * that blocks of a file are freed before the inode itself is freed. This 1334 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated 1335 * until all the old ones have been purged from the dependency lists. 1336 */ 1337 int 1338 softdep_process_worklist(mp, full) 1339 struct mount *mp; 1340 int full; 1341 { 1342 struct thread *td = curthread; 1343 int cnt, matchcnt, loopcount; 1344 struct ufsmount *ump; 1345 long starttime; 1346 1347 KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); 1348 /* 1349 * Record the process identifier of our caller so that we can give 1350 * this process preferential treatment in request_cleanup below. 1351 */ 1352 matchcnt = 0; 1353 ump = VFSTOUFS(mp); 1354 ACQUIRE_LOCK(&lk); 1355 loopcount = 1; 1356 starttime = time_second; 1357 softdep_process_journal(mp, full?MNT_WAIT:0); 1358 while (ump->softdep_on_worklist > 0) { 1359 if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1) 1360 break; 1361 else 1362 matchcnt += cnt; 1363 /* 1364 * If requested, try removing inode or removal dependencies. 1365 */ 1366 if (req_clear_inodedeps) { 1367 clear_inodedeps(td); 1368 req_clear_inodedeps -= 1; 1369 wakeup_one(&proc_waiting); 1370 } 1371 if (req_clear_remove) { 1372 clear_remove(td); 1373 req_clear_remove -= 1; 1374 wakeup_one(&proc_waiting); 1375 } 1376 /* 1377 * We do not generally want to stop for buffer space, but if 1378 * we are really being a buffer hog, we will stop and wait. 1379 */ 1380 if (loopcount++ % 128 == 0) { 1381 FREE_LOCK(&lk); 1382 uio_yield(); 1383 bwillwrite(); 1384 ACQUIRE_LOCK(&lk); 1385 } 1386 /* 1387 * Never allow processing to run for more than one 1388 * second. Otherwise the other mountpoints may get 1389 * excessively backlogged. 1390 */ 1391 if (!full && starttime != time_second) 1392 break; 1393 } 1394 if (full == 0) 1395 journal_unsuspend(ump); 1396 FREE_LOCK(&lk); 1397 return (matchcnt); 1398 } 1399 1400 /* 1401 * Process all removes associated with a vnode if we are running out of 1402 * journal space. Any other process which attempts to flush these will 1403 * be unable as we have the vnodes locked. 1404 */ 1405 static void 1406 process_removes(vp) 1407 struct vnode *vp; 1408 { 1409 struct inodedep *inodedep; 1410 struct dirrem *dirrem; 1411 struct mount *mp; 1412 ino_t inum; 1413 1414 mtx_assert(&lk, MA_OWNED); 1415 1416 mp = vp->v_mount; 1417 inum = VTOI(vp)->i_number; 1418 for (;;) { 1419 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) 1420 return; 1421 LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) 1422 if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == 1423 (COMPLETE | ONWORKLIST)) 1424 break; 1425 if (dirrem == NULL) 1426 return; 1427 /* 1428 * If another thread is trying to lock this vnode it will 1429 * fail but we must wait for it to do so before we can 1430 * proceed. 
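 *
 * Sketch of the handshake used here (names as in this file): the waiter
 * sets IOWAITING on the dirrem and sleeps on its worklist entry,
 *
 *	dirrem->dm_state |= IOWAITING;
 *	msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0);
 *
 * while the thread that holds the item in process_worklist_item() clears
 * INPROGRESS when it is finished and issues wakeup() on the same address
 * if IOWAITING was set.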
1431 */ 1432 if (dirrem->dm_state & INPROGRESS) { 1433 dirrem->dm_state |= IOWAITING; 1434 msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0); 1435 continue; 1436 } 1437 remove_from_worklist(&dirrem->dm_list); 1438 FREE_LOCK(&lk); 1439 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) 1440 panic("process_removes: suspended filesystem"); 1441 handle_workitem_remove(dirrem, vp); 1442 vn_finished_secondary_write(mp); 1443 ACQUIRE_LOCK(&lk); 1444 } 1445 } 1446 1447 /* 1448 * Process one item on the worklist. 1449 */ 1450 static int 1451 process_worklist_item(mp, flags) 1452 struct mount *mp; 1453 int flags; 1454 { 1455 struct worklist *wk; 1456 struct ufsmount *ump; 1457 struct vnode *vp; 1458 int matchcnt = 0; 1459 1460 mtx_assert(&lk, MA_OWNED); 1461 KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); 1462 /* 1463 * If we are being called because of a process doing a 1464 * copy-on-write, then it is not safe to write as we may 1465 * recurse into the copy-on-write routine. 1466 */ 1467 if (curthread->td_pflags & TDP_COWINPROGRESS) 1468 return (-1); 1469 /* 1470 * Normally we just process each item on the worklist in order. 1471 * However, if we are in a situation where we cannot lock any 1472 * inodes, we have to skip over any dirrem requests whose 1473 * vnodes are resident and locked. 1474 */ 1475 vp = NULL; 1476 ump = VFSTOUFS(mp); 1477 LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) { 1478 if (wk->wk_state & INPROGRESS) 1479 continue; 1480 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM) 1481 break; 1482 wk->wk_state |= INPROGRESS; 1483 ump->softdep_on_worklist_inprogress++; 1484 FREE_LOCK(&lk); 1485 ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum, 1486 LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ); 1487 ACQUIRE_LOCK(&lk); 1488 if (wk->wk_state & IOWAITING) { 1489 wk->wk_state &= ~IOWAITING; 1490 wakeup(wk); 1491 } 1492 wk->wk_state &= ~INPROGRESS; 1493 ump->softdep_on_worklist_inprogress--; 1494 if (vp != NULL) 1495 break; 1496 } 1497 if (wk == 0) 1498 return (-1); 1499 remove_from_worklist(wk); 1500 FREE_LOCK(&lk); 1501 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) 1502 panic("process_worklist_item: suspended filesystem"); 1503 matchcnt++; 1504 switch (wk->wk_type) { 1505 1506 case D_DIRREM: 1507 /* removal of a directory entry */ 1508 handle_workitem_remove(WK_DIRREM(wk), vp); 1509 if (vp) 1510 vput(vp); 1511 break; 1512 1513 case D_FREEBLKS: 1514 /* releasing blocks and/or fragments from a file */ 1515 handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT); 1516 break; 1517 1518 case D_FREEFRAG: 1519 /* releasing a fragment when replaced as a file grows */ 1520 handle_workitem_freefrag(WK_FREEFRAG(wk)); 1521 break; 1522 1523 case D_FREEFILE: 1524 /* releasing an inode when its link count drops to 0 */ 1525 handle_workitem_freefile(WK_FREEFILE(wk)); 1526 break; 1527 1528 case D_FREEWORK: 1529 /* Final block in an indirect was freed. */ 1530 handle_workitem_indirblk(WK_FREEWORK(wk)); 1531 break; 1532 1533 default: 1534 panic("%s_process_worklist: Unknown type %s", 1535 "softdep", TYPENAME(wk->wk_type)); 1536 /* NOTREACHED */ 1537 } 1538 vn_finished_secondary_write(mp); 1539 ACQUIRE_LOCK(&lk); 1540 return (matchcnt); 1541 } 1542 1543 /* 1544 * Move dependencies from one buffer to another. 
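 *
 * Illustrative note (the specific callers are not shown in this file): when
 * the contents of oldbp are handed off to newbp, the worklist items chained
 * on oldbp->b_dep must follow the data so that softdep_disk_write_complete()
 * processes them against the buffer that actually reaches the disk.  The
 * return value reports whether any moved bmsafemap still has rollbacks
 * pending (see bmsafemap_rollbacks()), information a caller can use to keep
 * newbp dirty.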
1545 */ 1546 int 1547 softdep_move_dependencies(oldbp, newbp) 1548 struct buf *oldbp; 1549 struct buf *newbp; 1550 { 1551 struct worklist *wk, *wktail; 1552 int dirty; 1553 1554 dirty = 0; 1555 wktail = NULL; 1556 ACQUIRE_LOCK(&lk); 1557 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { 1558 LIST_REMOVE(wk, wk_list); 1559 if (wk->wk_type == D_BMSAFEMAP && 1560 bmsafemap_rollbacks(WK_BMSAFEMAP(wk))) 1561 dirty = 1; 1562 if (wktail == 0) 1563 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); 1564 else 1565 LIST_INSERT_AFTER(wktail, wk, wk_list); 1566 wktail = wk; 1567 } 1568 FREE_LOCK(&lk); 1569 1570 return (dirty); 1571 } 1572 1573 /* 1574 * Purge the work list of all items associated with a particular mount point. 1575 */ 1576 int 1577 softdep_flushworklist(oldmnt, countp, td) 1578 struct mount *oldmnt; 1579 int *countp; 1580 struct thread *td; 1581 { 1582 struct vnode *devvp; 1583 int count, error = 0; 1584 struct ufsmount *ump; 1585 1586 /* 1587 * Alternately flush the block device associated with the mount 1588 * point and process any dependencies that the flushing 1589 * creates. We continue until no more worklist dependencies 1590 * are found. 1591 */ 1592 *countp = 0; 1593 ump = VFSTOUFS(oldmnt); 1594 devvp = ump->um_devvp; 1595 while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { 1596 *countp += count; 1597 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1598 error = VOP_FSYNC(devvp, MNT_WAIT, td); 1599 VOP_UNLOCK(devvp, 0); 1600 if (error) 1601 break; 1602 } 1603 return (error); 1604 } 1605 1606 int 1607 softdep_waitidle(struct mount *mp) 1608 { 1609 struct ufsmount *ump; 1610 int error; 1611 int i; 1612 1613 ump = VFSTOUFS(mp); 1614 ACQUIRE_LOCK(&lk); 1615 for (i = 0; i < 10 && ump->softdep_deps; i++) { 1616 ump->softdep_req = 1; 1617 if (ump->softdep_on_worklist) 1618 panic("softdep_waitidle: work added after flush."); 1619 msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1); 1620 } 1621 ump->softdep_req = 0; 1622 FREE_LOCK(&lk); 1623 error = 0; 1624 if (i == 10) { 1625 error = EBUSY; 1626 printf("softdep_waitidle: Failed to flush worklist for %p\n", 1627 mp); 1628 } 1629 1630 return (error); 1631 } 1632 1633 /* 1634 * Flush all vnodes and worklist items associated with a specified mount point. 1635 */ 1636 int 1637 softdep_flushfiles(oldmnt, flags, td) 1638 struct mount *oldmnt; 1639 int flags; 1640 struct thread *td; 1641 { 1642 int error, depcount, loopcnt, retry_flush_count, retry; 1643 1644 loopcnt = 10; 1645 retry_flush_count = 3; 1646 retry_flush: 1647 error = 0; 1648 1649 /* 1650 * Alternately flush the vnodes associated with the mount 1651 * point and process any dependencies that the flushing 1652 * creates. In theory, this loop can happen at most twice, 1653 * but we give it a few extra just to be sure. 1654 */ 1655 for (; loopcnt > 0; loopcnt--) { 1656 /* 1657 * Do another flush in case any vnodes were brought in 1658 * as part of the cleanup operations. 1659 */ 1660 if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0) 1661 break; 1662 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 || 1663 depcount == 0) 1664 break; 1665 } 1666 /* 1667 * If we are unmounting then it is an error to fail. If we 1668 * are simply trying to downgrade to read-only, then filesystem 1669 * activity can keep us busy forever, so we just fail with EBUSY. 
 */
	if (loopcnt == 0) {
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
			panic("softdep_flushfiles: looping");
		error = EBUSY;
	}
	if (!error)
		error = softdep_waitidle(oldmnt);
	if (!error) {
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
			retry = 0;
			MNT_ILOCK(oldmnt);
			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
			if (oldmnt->mnt_nvnodelistsize > 0) {
				if (--retry_flush_count > 0) {
					retry = 1;
					loopcnt = 3;
				} else
					error = EBUSY;
			}
			MNT_IUNLOCK(oldmnt);
			if (retry)
				goto retry_flush;
		}
	}
	return (error);
}

/*
 * Structure hashing.
 *
 * There are three types of structures that can be looked up:
 *	1) pagedep structures identified by mount point, inode number,
 *	   and logical block.
 *	2) inodedep structures identified by mount point and inode number.
 *	3) newblk structures identified by mount point and
 *	   physical block number.
 *
 * The "pagedep" and "inodedep" dependency structures are hashed
 * separately from the file blocks and inodes to which they correspond.
 * This separation helps when the in-memory copy of an inode or
 * file block must be replaced. It also obviates the need to access
 * an inode or file page when simply updating (or de-allocating)
 * dependency structures. Lookup of newblk structures is needed to
 * find newly allocated blocks when trying to associate them with
 * their allocdirect or allocindir structure.
 *
 * The lookup routines optionally create and hash a new instance when
 * an existing entry is not found.
 */
#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
#define NODELAY		0x0002	/* cannot do background work */

/*
 * Structures and routines associated with pagedep caching.
 */
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long	pagedep_hash;		/* size of hash table - 1 */
#define	PAGEDEP_HASH(mp, inum, lbn) \
	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	    pagedep_hash])

static int
pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
	struct pagedep_hashhead *pagedephd;
	ino_t ino;
	ufs_lbn_t lbn;
	struct mount *mp;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;

	LIST_FOREACH(pagedep, pagedephd, pd_hash)
		if (ino == pagedep->pd_ino &&
		    lbn == pagedep->pd_lbn &&
		    mp == pagedep->pd_list.wk_mp)
			break;
	if (pagedep) {
		*pagedeppp = pagedep;
		if ((flags & DEPALLOC) != 0 &&
		    (pagedep->pd_state & ONWORKLIST) == 0)
			return (0);
		return (1);
	}
	*pagedeppp = NULL;
	return (0);
}
/*
 * Look up a pagedep. Return 1 if found, 0 if not found or found
 * when asked to allocate but not associated with any buffer.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in pagedeppp.
 * This routine must be called with splbio interrupts blocked.
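 *
 * A condensed sketch of the allocation pattern shared by pagedep_lookup(),
 * inodedep_lookup() and newblk_lookup() (illustration only; the real code
 * follows below): search under the softdep lock; if the entry is missing
 * and DEPALLOC was requested, drop the lock so the M_SOFTDEP_FLAGS malloc
 * may sleep, then retake the lock and search a second time before inserting,
 * because another thread may have created the entry in the meantime:
 *
 *	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
 *	if (*pagedeppp || (flags & DEPALLOC) == 0)
 *		return (ret);
 *	FREE_LOCK(&lk);
 *	pagedep = malloc(sizeof(*pagedep), M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
 *	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
 *	ACQUIRE_LOCK(&lk);
 *	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
 *	if (*pagedeppp) {
 *		WORKITEM_FREE(pagedep, D_PAGEDEP);	(lost the race)
 *		return (ret);
 *	}
 *	... initialize fields and LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);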
1765 */ 1766 static int 1767 pagedep_lookup(mp, ino, lbn, flags, pagedeppp) 1768 struct mount *mp; 1769 ino_t ino; 1770 ufs_lbn_t lbn; 1771 int flags; 1772 struct pagedep **pagedeppp; 1773 { 1774 struct pagedep *pagedep; 1775 struct pagedep_hashhead *pagedephd; 1776 int ret; 1777 int i; 1778 1779 mtx_assert(&lk, MA_OWNED); 1780 pagedephd = PAGEDEP_HASH(mp, ino, lbn); 1781 1782 ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); 1783 if (*pagedeppp || (flags & DEPALLOC) == 0) 1784 return (ret); 1785 FREE_LOCK(&lk); 1786 pagedep = malloc(sizeof(struct pagedep), 1787 M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); 1788 workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); 1789 ACQUIRE_LOCK(&lk); 1790 ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); 1791 if (*pagedeppp) { 1792 WORKITEM_FREE(pagedep, D_PAGEDEP); 1793 return (ret); 1794 } 1795 pagedep->pd_ino = ino; 1796 pagedep->pd_lbn = lbn; 1797 LIST_INIT(&pagedep->pd_dirremhd); 1798 LIST_INIT(&pagedep->pd_pendinghd); 1799 for (i = 0; i < DAHASHSZ; i++) 1800 LIST_INIT(&pagedep->pd_diraddhd[i]); 1801 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); 1802 *pagedeppp = pagedep; 1803 return (0); 1804 } 1805 1806 /* 1807 * Structures and routines associated with inodedep caching. 1808 */ 1809 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; 1810 static u_long inodedep_hash; /* size of hash table - 1 */ 1811 static long num_inodedep; /* number of inodedep allocated */ 1812 #define INODEDEP_HASH(fs, inum) \ 1813 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) 1814 1815 static int 1816 inodedep_find(inodedephd, fs, inum, inodedeppp) 1817 struct inodedep_hashhead *inodedephd; 1818 struct fs *fs; 1819 ino_t inum; 1820 struct inodedep **inodedeppp; 1821 { 1822 struct inodedep *inodedep; 1823 1824 LIST_FOREACH(inodedep, inodedephd, id_hash) 1825 if (inum == inodedep->id_ino && fs == inodedep->id_fs) 1826 break; 1827 if (inodedep) { 1828 *inodedeppp = inodedep; 1829 return (1); 1830 } 1831 *inodedeppp = NULL; 1832 1833 return (0); 1834 } 1835 /* 1836 * Look up an inodedep. Return 1 if found, 0 if not found. 1837 * If not found, allocate if DEPALLOC flag is passed. 1838 * Found or allocated entry is returned in inodedeppp. 1839 * This routine must be called with splbio interrupts blocked. 1840 */ 1841 static int 1842 inodedep_lookup(mp, inum, flags, inodedeppp) 1843 struct mount *mp; 1844 ino_t inum; 1845 int flags; 1846 struct inodedep **inodedeppp; 1847 { 1848 struct inodedep *inodedep; 1849 struct inodedep_hashhead *inodedephd; 1850 struct fs *fs; 1851 1852 mtx_assert(&lk, MA_OWNED); 1853 fs = VFSTOUFS(mp)->um_fs; 1854 inodedephd = INODEDEP_HASH(fs, inum); 1855 1856 if (inodedep_find(inodedephd, fs, inum, inodedeppp)) 1857 return (1); 1858 if ((flags & DEPALLOC) == 0) 1859 return (0); 1860 /* 1861 * If we are over our limit, try to improve the situation. 
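 *
 * "Over our limit" means num_inodedep has grown past max_softdeps (exported
 * as the debug.softdep.max_softdeps sysctl and sized from desiredvnodes in
 * softdep_initialize()).  Unless the caller passed NODELAY, request_cleanup()
 * is asked to flush some inodedeps (FLUSH_INODES), as the statement
 * immediately below does, before yet another one is allocated.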
1862 */ 1863 if (num_inodedep > max_softdeps && (flags & NODELAY) == 0) 1864 request_cleanup(mp, FLUSH_INODES); 1865 FREE_LOCK(&lk); 1866 inodedep = malloc(sizeof(struct inodedep), 1867 M_INODEDEP, M_SOFTDEP_FLAGS); 1868 workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); 1869 ACQUIRE_LOCK(&lk); 1870 if (inodedep_find(inodedephd, fs, inum, inodedeppp)) { 1871 WORKITEM_FREE(inodedep, D_INODEDEP); 1872 return (1); 1873 } 1874 num_inodedep += 1; 1875 inodedep->id_fs = fs; 1876 inodedep->id_ino = inum; 1877 inodedep->id_state = ALLCOMPLETE; 1878 inodedep->id_nlinkdelta = 0; 1879 inodedep->id_savedino1 = NULL; 1880 inodedep->id_savedsize = -1; 1881 inodedep->id_savedextsize = -1; 1882 inodedep->id_savednlink = -1; 1883 inodedep->id_bmsafemap = NULL; 1884 inodedep->id_mkdiradd = NULL; 1885 LIST_INIT(&inodedep->id_dirremhd); 1886 LIST_INIT(&inodedep->id_pendinghd); 1887 LIST_INIT(&inodedep->id_inowait); 1888 LIST_INIT(&inodedep->id_bufwait); 1889 TAILQ_INIT(&inodedep->id_inoreflst); 1890 TAILQ_INIT(&inodedep->id_inoupdt); 1891 TAILQ_INIT(&inodedep->id_newinoupdt); 1892 TAILQ_INIT(&inodedep->id_extupdt); 1893 TAILQ_INIT(&inodedep->id_newextupdt); 1894 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); 1895 *inodedeppp = inodedep; 1896 return (0); 1897 } 1898 1899 /* 1900 * Structures and routines associated with newblk caching. 1901 */ 1902 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; 1903 u_long newblk_hash; /* size of hash table - 1 */ 1904 #define NEWBLK_HASH(fs, inum) \ 1905 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) 1906 1907 static int 1908 newblk_find(newblkhd, mp, newblkno, flags, newblkpp) 1909 struct newblk_hashhead *newblkhd; 1910 struct mount *mp; 1911 ufs2_daddr_t newblkno; 1912 int flags; 1913 struct newblk **newblkpp; 1914 { 1915 struct newblk *newblk; 1916 1917 LIST_FOREACH(newblk, newblkhd, nb_hash) { 1918 if (newblkno != newblk->nb_newblkno) 1919 continue; 1920 if (mp != newblk->nb_list.wk_mp) 1921 continue; 1922 /* 1923 * If we're creating a new dependency don't match those that 1924 * have already been converted to allocdirects. This is for 1925 * a frag extend. 1926 */ 1927 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) 1928 continue; 1929 break; 1930 } 1931 if (newblk) { 1932 *newblkpp = newblk; 1933 return (1); 1934 } 1935 *newblkpp = NULL; 1936 return (0); 1937 } 1938 1939 /* 1940 * Look up a newblk. Return 1 if found, 0 if not found. 1941 * If not found, allocate if DEPALLOC flag is passed. 1942 * Found or allocated entry is returned in newblkpp. 
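 *
 * Note that the allocation below uses sizeof(union allblk) rather than
 * sizeof(struct newblk): the entry starts life as a plain newblk and is
 * later converted in place to the larger allocdirect or allocindir form
 * once the block's use is known, without being reallocated or rehashed.
 * This is also why newblk_find() skips entries whose wk_type is no longer
 * D_NEWBLK when a new dependency is being created.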
1943 */ 1944 static int 1945 newblk_lookup(mp, newblkno, flags, newblkpp) 1946 struct mount *mp; 1947 ufs2_daddr_t newblkno; 1948 int flags; 1949 struct newblk **newblkpp; 1950 { 1951 struct newblk *newblk; 1952 struct newblk_hashhead *newblkhd; 1953 1954 newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno); 1955 if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) 1956 return (1); 1957 if ((flags & DEPALLOC) == 0) 1958 return (0); 1959 FREE_LOCK(&lk); 1960 newblk = malloc(sizeof(union allblk), M_NEWBLK, 1961 M_SOFTDEP_FLAGS | M_ZERO); 1962 workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); 1963 ACQUIRE_LOCK(&lk); 1964 if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) { 1965 WORKITEM_FREE(newblk, D_NEWBLK); 1966 return (1); 1967 } 1968 newblk->nb_freefrag = NULL; 1969 LIST_INIT(&newblk->nb_indirdeps); 1970 LIST_INIT(&newblk->nb_newdirblk); 1971 LIST_INIT(&newblk->nb_jwork); 1972 newblk->nb_state = ATTACHED; 1973 newblk->nb_newblkno = newblkno; 1974 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); 1975 *newblkpp = newblk; 1976 return (0); 1977 } 1978 1979 /* 1980 * Executed during filesystem system initialization before 1981 * mounting any filesystems. 1982 */ 1983 void 1984 softdep_initialize() 1985 { 1986 1987 LIST_INIT(&mkdirlisthd); 1988 max_softdeps = desiredvnodes * 4; 1989 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); 1990 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); 1991 newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash); 1992 bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash); 1993 1994 /* initialise bioops hack */ 1995 bioops.io_start = softdep_disk_io_initiation; 1996 bioops.io_complete = softdep_disk_write_complete; 1997 bioops.io_deallocate = softdep_deallocate_dependencies; 1998 bioops.io_countdeps = softdep_count_dependencies; 1999 2000 /* Initialize the callout with an mtx. */ 2001 callout_init_mtx(&softdep_callout, &lk, 0); 2002 } 2003 2004 /* 2005 * Executed after all filesystems have been unmounted during 2006 * filesystem module unload. 2007 */ 2008 void 2009 softdep_uninitialize() 2010 { 2011 2012 callout_drain(&softdep_callout); 2013 hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); 2014 hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); 2015 hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); 2016 hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash); 2017 } 2018 2019 /* 2020 * Called at mount time to notify the dependency code that a 2021 * filesystem wishes to use it. 
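 * MNT_SOFTDEP is set on the mount and, for FS_SUJ filesystems, the journal
 * is opened via journal_mount().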
2022 */ 2023 int 2024 softdep_mount(devvp, mp, fs, cred) 2025 struct vnode *devvp; 2026 struct mount *mp; 2027 struct fs *fs; 2028 struct ucred *cred; 2029 { 2030 struct csum_total cstotal; 2031 struct ufsmount *ump; 2032 struct cg *cgp; 2033 struct buf *bp; 2034 int error, cyl; 2035 2036 MNT_ILOCK(mp); 2037 mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; 2038 if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { 2039 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 2040 MNTK_SOFTDEP; 2041 mp->mnt_noasync++; 2042 } 2043 MNT_IUNLOCK(mp); 2044 ump = VFSTOUFS(mp); 2045 LIST_INIT(&ump->softdep_workitem_pending); 2046 LIST_INIT(&ump->softdep_journal_pending); 2047 TAILQ_INIT(&ump->softdep_unlinked); 2048 ump->softdep_worklist_tail = NULL; 2049 ump->softdep_on_worklist = 0; 2050 ump->softdep_deps = 0; 2051 if ((fs->fs_flags & FS_SUJ) && 2052 (error = journal_mount(mp, fs, cred)) != 0) { 2053 printf("Failed to start journal: %d\n", error); 2054 return (error); 2055 } 2056 /* 2057 * When doing soft updates, the counters in the 2058 * superblock may have gotten out of sync. Recomputation 2059 * can take a long time and can be deferred for background 2060 * fsck. However, the old behavior of scanning the cylinder 2061 * groups and recalculating them at mount time is available 2062 * by setting vfs.ffs.compute_summary_at_mount to one. 2063 */ 2064 if (compute_summary_at_mount == 0 || fs->fs_clean != 0) 2065 return (0); 2066 bzero(&cstotal, sizeof cstotal); 2067 for (cyl = 0; cyl < fs->fs_ncg; cyl++) { 2068 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), 2069 fs->fs_cgsize, cred, &bp)) != 0) { 2070 brelse(bp); 2071 return (error); 2072 } 2073 cgp = (struct cg *)bp->b_data; 2074 cstotal.cs_nffree += cgp->cg_cs.cs_nffree; 2075 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; 2076 cstotal.cs_nifree += cgp->cg_cs.cs_nifree; 2077 cstotal.cs_ndir += cgp->cg_cs.cs_ndir; 2078 fs->fs_cs(fs, cyl) = cgp->cg_cs; 2079 brelse(bp); 2080 } 2081 #ifdef DEBUG 2082 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) 2083 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); 2084 #endif 2085 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); 2086 return (0); 2087 } 2088 2089 void 2090 softdep_unmount(mp) 2091 struct mount *mp; 2092 { 2093 2094 if (mp->mnt_kern_flag & MNTK_SUJ) 2095 journal_unmount(mp); 2096 } 2097 2098 struct jblocks { 2099 struct jseglst jb_segs; /* TAILQ of current segments. */ 2100 struct jseg *jb_writeseg; /* Next write to complete. */ 2101 struct jextent *jb_extent; /* Extent array. */ 2102 uint64_t jb_nextseq; /* Next sequence number. */ 2103 uint64_t jb_oldestseq; /* Oldest active sequence number. */ 2104 int jb_avail; /* Available extents. */ 2105 int jb_used; /* Last used extent. */ 2106 int jb_head; /* Allocator head. */ 2107 int jb_off; /* Allocator extent offset. */ 2108 int jb_blocks; /* Total disk blocks covered. */ 2109 int jb_free; /* Total disk blocks free. */ 2110 int jb_min; /* Minimum free space. */ 2111 int jb_low; /* Low on space. */ 2112 int jb_age; /* Insertion time of oldest rec. */ 2113 int jb_suspended; /* Did journal suspend writes? */ 2114 }; 2115 2116 struct jextent { 2117 ufs2_daddr_t je_daddr; /* Disk block address. */ 2118 int je_blocks; /* Disk block count. 
*/ 2119 }; 2120 2121 static struct jblocks * 2122 jblocks_create(void) 2123 { 2124 struct jblocks *jblocks; 2125 2126 jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); 2127 TAILQ_INIT(&jblocks->jb_segs); 2128 jblocks->jb_avail = 10; 2129 jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, 2130 M_JBLOCKS, M_WAITOK | M_ZERO); 2131 2132 return (jblocks); 2133 } 2134 2135 static ufs2_daddr_t 2136 jblocks_alloc(jblocks, bytes, actual) 2137 struct jblocks *jblocks; 2138 int bytes; 2139 int *actual; 2140 { 2141 ufs2_daddr_t daddr; 2142 struct jextent *jext; 2143 int freecnt; 2144 int blocks; 2145 2146 blocks = bytes / DEV_BSIZE; 2147 jext = &jblocks->jb_extent[jblocks->jb_head]; 2148 freecnt = jext->je_blocks - jblocks->jb_off; 2149 if (freecnt == 0) { 2150 jblocks->jb_off = 0; 2151 if (++jblocks->jb_head > jblocks->jb_used) 2152 jblocks->jb_head = 0; 2153 jext = &jblocks->jb_extent[jblocks->jb_head]; 2154 freecnt = jext->je_blocks; 2155 } 2156 if (freecnt > blocks) 2157 freecnt = blocks; 2158 *actual = freecnt * DEV_BSIZE; 2159 daddr = jext->je_daddr + jblocks->jb_off; 2160 jblocks->jb_off += freecnt; 2161 jblocks->jb_free -= freecnt; 2162 2163 return (daddr); 2164 } 2165 2166 static void 2167 jblocks_free(jblocks, mp, bytes) 2168 struct jblocks *jblocks; 2169 struct mount *mp; 2170 int bytes; 2171 { 2172 2173 jblocks->jb_free += bytes / DEV_BSIZE; 2174 if (jblocks->jb_suspended) 2175 worklist_speedup(); 2176 wakeup(jblocks); 2177 } 2178 2179 static void 2180 jblocks_destroy(jblocks) 2181 struct jblocks *jblocks; 2182 { 2183 2184 if (jblocks->jb_extent) 2185 free(jblocks->jb_extent, M_JBLOCKS); 2186 free(jblocks, M_JBLOCKS); 2187 } 2188 2189 static void 2190 jblocks_add(jblocks, daddr, blocks) 2191 struct jblocks *jblocks; 2192 ufs2_daddr_t daddr; 2193 int blocks; 2194 { 2195 struct jextent *jext; 2196 2197 jblocks->jb_blocks += blocks; 2198 jblocks->jb_free += blocks; 2199 jext = &jblocks->jb_extent[jblocks->jb_used]; 2200 /* Adding the first block. */ 2201 if (jext->je_daddr == 0) { 2202 jext->je_daddr = daddr; 2203 jext->je_blocks = blocks; 2204 return; 2205 } 2206 /* Extending the last extent. */ 2207 if (jext->je_daddr + jext->je_blocks == daddr) { 2208 jext->je_blocks += blocks; 2209 return; 2210 } 2211 /* Adding a new extent. 
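 * The extent array is doubled and the old entries copied over when it
 * fills up.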
*/ 2212 if (++jblocks->jb_used == jblocks->jb_avail) { 2213 jblocks->jb_avail *= 2; 2214 jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, 2215 M_JBLOCKS, M_WAITOK | M_ZERO); 2216 memcpy(jext, jblocks->jb_extent, 2217 sizeof(struct jextent) * jblocks->jb_used); 2218 free(jblocks->jb_extent, M_JBLOCKS); 2219 jblocks->jb_extent = jext; 2220 } 2221 jext = &jblocks->jb_extent[jblocks->jb_used]; 2222 jext->je_daddr = daddr; 2223 jext->je_blocks = blocks; 2224 return; 2225 } 2226 2227 int 2228 softdep_journal_lookup(mp, vpp) 2229 struct mount *mp; 2230 struct vnode **vpp; 2231 { 2232 struct componentname cnp; 2233 struct vnode *dvp; 2234 ino_t sujournal; 2235 int error; 2236 2237 error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp); 2238 if (error) 2239 return (error); 2240 bzero(&cnp, sizeof(cnp)); 2241 cnp.cn_nameiop = LOOKUP; 2242 cnp.cn_flags = ISLASTCN; 2243 cnp.cn_thread = curthread; 2244 cnp.cn_cred = curthread->td_ucred; 2245 cnp.cn_pnbuf = SUJ_FILE; 2246 cnp.cn_nameptr = SUJ_FILE; 2247 cnp.cn_namelen = strlen(SUJ_FILE); 2248 error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal); 2249 vput(dvp); 2250 if (error != 0) 2251 return (error); 2252 error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp); 2253 return (error); 2254 } 2255 2256 /* 2257 * Open and verify the journal file. 2258 */ 2259 static int 2260 journal_mount(mp, fs, cred) 2261 struct mount *mp; 2262 struct fs *fs; 2263 struct ucred *cred; 2264 { 2265 struct jblocks *jblocks; 2266 struct vnode *vp; 2267 struct inode *ip; 2268 ufs2_daddr_t blkno; 2269 int bcount; 2270 int error; 2271 int i; 2272 2273 mp->mnt_kern_flag |= MNTK_SUJ; 2274 error = softdep_journal_lookup(mp, &vp); 2275 if (error != 0) { 2276 printf("Failed to find journal. Use tunefs to create one\n"); 2277 return (error); 2278 } 2279 ip = VTOI(vp); 2280 if (ip->i_size < SUJ_MIN) { 2281 error = ENOSPC; 2282 goto out; 2283 } 2284 bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ 2285 jblocks = jblocks_create(); 2286 for (i = 0; i < bcount; i++) { 2287 error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); 2288 if (error) 2289 break; 2290 jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); 2291 } 2292 if (error) { 2293 jblocks_destroy(jblocks); 2294 goto out; 2295 } 2296 jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */ 2297 jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ 2298 /* 2299 * Only validate the journal contents if the filesystem is clean, 2300 * otherwise we write the logs but they'll never be used. If the 2301 * filesystem was still dirty when we mounted it the journal is 2302 * invalid and a new journal can only be valid if it starts from a 2303 * clean mount. 2304 */ 2305 if (fs->fs_clean) { 2306 DIP_SET(ip, i_modrev, fs->fs_mtime); 2307 ip->i_flags |= IN_MODIFIED; 2308 ffs_update(vp, 1); 2309 } 2310 VFSTOUFS(mp)->softdep_jblocks = jblocks; 2311 out: 2312 vput(vp); 2313 return (error); 2314 } 2315 2316 static void 2317 journal_unmount(mp) 2318 struct mount *mp; 2319 { 2320 struct ufsmount *ump; 2321 2322 ump = VFSTOUFS(mp); 2323 if (ump->softdep_jblocks) 2324 jblocks_destroy(ump->softdep_jblocks); 2325 ump->softdep_jblocks = NULL; 2326 } 2327 2328 /* 2329 * Called when a journal record is ready to be written. Space is allocated 2330 * and the journal entry is created when the journal is flushed to stable 2331 * store. 
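 * Records are kept in FIFO order on softdep_journal_pending;
 * softdep_journal_tail remembers the last entry so a simple LIST can stand
 * in for a TAILQ.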
*/ 2333 static void 2334 add_to_journal(wk) 2335 struct worklist *wk; 2336 { 2337 struct ufsmount *ump; 2338 2339 mtx_assert(&lk, MA_OWNED); 2340 ump = VFSTOUFS(wk->wk_mp); 2341 if (wk->wk_state & ONWORKLIST) 2342 panic("add_to_journal: %s(0x%X) already on list", 2343 TYPENAME(wk->wk_type), wk->wk_state); 2344 wk->wk_state |= ONWORKLIST | DEPCOMPLETE; 2345 if (LIST_EMPTY(&ump->softdep_journal_pending)) { 2346 ump->softdep_jblocks->jb_age = ticks; 2347 LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list); 2348 } else 2349 LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list); 2350 ump->softdep_journal_tail = wk; 2351 ump->softdep_on_journal += 1; 2352 } 2353 2354 /* 2355 * Remove an arbitrary item from the journal worklist while maintaining the 2356 * tail pointer. This happens when a new operation obviates the need to 2357 * journal an old operation. 2358 */ 2359 static void 2360 remove_from_journal(wk) 2361 struct worklist *wk; 2362 { 2363 struct ufsmount *ump; 2364 2365 mtx_assert(&lk, MA_OWNED); 2366 ump = VFSTOUFS(wk->wk_mp); 2367 #ifdef SUJ_DEBUG 2368 { 2369 struct worklist *wkn; 2370 2371 LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list) 2372 if (wkn == wk) 2373 break; 2374 if (wkn == NULL) 2375 panic("remove_from_journal: %p is not in journal", wk); 2376 } 2377 #endif 2378 /* 2379 * We emulate a TAILQ to save space in most structures which do not 2380 * require TAILQ semantics. Here we must update the tail position 2381 * when removing the tail which is not the final entry. 2382 */ 2383 if (ump->softdep_journal_tail == wk) 2384 ump->softdep_journal_tail = 2385 (struct worklist *)wk->wk_list.le_prev; 2386 2387 WORKLIST_REMOVE(wk); 2388 ump->softdep_on_journal -= 1; 2389 } 2390 2391 /* 2392 * Check for journal space as well as dependency limits so the prelink 2393 * code can throttle both journaled and non-journaled filesystems. 2394 * Threshold is 0 for low and 1 for min. 2395 */ 2396 static int 2397 journal_space(ump, thresh) 2398 struct ufsmount *ump; 2399 int thresh; 2400 { 2401 struct jblocks *jblocks; 2402 int avail; 2403 2404 jblocks = ump->softdep_jblocks; 2405 if (jblocks == NULL) 2406 return (1); 2407 /* 2408 * We use a tighter restriction here to prevent request_cleanup(), running 2409 * in other threads, from running into locks we currently hold.
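 * The space already claimed by queued records (softdep_on_journal *
 * JREC_SIZE, expressed in DEV_BSIZE blocks) is subtracted from the free
 * count before comparing against the requested threshold.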
2410 */ 2411 if (num_inodedep > (max_softdeps / 10) * 9) 2412 return (0); 2413 if (thresh) 2414 thresh = jblocks->jb_min; 2415 else 2416 thresh = jblocks->jb_low; 2417 avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; 2418 avail = jblocks->jb_free - avail; 2419 2420 return (avail > thresh); 2421 } 2422 2423 static void 2424 journal_suspend(ump) 2425 struct ufsmount *ump; 2426 { 2427 struct jblocks *jblocks; 2428 struct mount *mp; 2429 2430 mp = UFSTOVFS(ump); 2431 jblocks = ump->softdep_jblocks; 2432 MNT_ILOCK(mp); 2433 if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 2434 stat_journal_min++; 2435 mp->mnt_kern_flag |= MNTK_SUSPEND; 2436 mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc); 2437 } 2438 jblocks->jb_suspended = 1; 2439 MNT_IUNLOCK(mp); 2440 } 2441 2442 static int 2443 journal_unsuspend(struct ufsmount *ump) 2444 { 2445 struct jblocks *jblocks; 2446 struct mount *mp; 2447 2448 mp = UFSTOVFS(ump); 2449 jblocks = ump->softdep_jblocks; 2450 2451 if (jblocks != NULL && jblocks->jb_suspended && 2452 journal_space(ump, jblocks->jb_min)) { 2453 jblocks->jb_suspended = 0; 2454 FREE_LOCK(&lk); 2455 mp->mnt_susp_owner = curthread; 2456 vfs_write_resume(mp); 2457 ACQUIRE_LOCK(&lk); 2458 return (1); 2459 } 2460 return (0); 2461 } 2462 2463 /* 2464 * Called before any allocation function to be certain that there is 2465 * sufficient space in the journal prior to creating any new records. 2466 * Since in the case of block allocation we may have multiple locked 2467 * buffers at the time of the actual allocation we can not block 2468 * when the journal records are created. Doing so would create a deadlock 2469 * if any of these buffers needed to be flushed to reclaim space. Instead 2470 * we require a sufficiently large amount of available space such that 2471 * each thread in the system could have passed this allocation check and 2472 * still have sufficient free space. With 20% of a minimum journal size 2473 * of 1MB we have 6553 records available. 2474 */ 2475 int 2476 softdep_prealloc(vp, waitok) 2477 struct vnode *vp; 2478 int waitok; 2479 { 2480 struct ufsmount *ump; 2481 2482 if (DOINGSUJ(vp) == 0) 2483 return (0); 2484 ump = VFSTOUFS(vp->v_mount); 2485 ACQUIRE_LOCK(&lk); 2486 if (journal_space(ump, 0)) { 2487 FREE_LOCK(&lk); 2488 return (0); 2489 } 2490 stat_journal_low++; 2491 FREE_LOCK(&lk); 2492 if (waitok == MNT_NOWAIT) 2493 return (ENOSPC); 2494 /* 2495 * Attempt to sync this vnode once to flush any journal 2496 * work attached to it. 2497 */ 2498 if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0) 2499 ffs_syncvnode(vp, waitok); 2500 ACQUIRE_LOCK(&lk); 2501 process_removes(vp); 2502 if (journal_space(ump, 0) == 0) { 2503 softdep_speedup(); 2504 if (journal_space(ump, 1) == 0) 2505 journal_suspend(ump); 2506 } 2507 FREE_LOCK(&lk); 2508 2509 return (0); 2510 } 2511 2512 /* 2513 * Before adjusting a link count on a vnode verify that we have sufficient 2514 * journal space. If not, process operations that depend on the currently 2515 * locked pair of vnodes to try to flush space as the syncer, buf daemon, 2516 * and softdep flush threads can not acquire these locks to reclaim space. 
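 * The vnodes are synced and their pending removes processed below before
 * the journal space check is retried; if space is still too low the
 * filesystem is suspended.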
2517 */ 2518 static void 2519 softdep_prelink(dvp, vp) 2520 struct vnode *dvp; 2521 struct vnode *vp; 2522 { 2523 struct ufsmount *ump; 2524 2525 ump = VFSTOUFS(dvp->v_mount); 2526 mtx_assert(&lk, MA_OWNED); 2527 if (journal_space(ump, 0)) 2528 return; 2529 stat_journal_low++; 2530 FREE_LOCK(&lk); 2531 if (vp) 2532 ffs_syncvnode(vp, MNT_NOWAIT); 2533 ffs_syncvnode(dvp, MNT_WAIT); 2534 ACQUIRE_LOCK(&lk); 2535 /* Process vp before dvp as it may create .. removes. */ 2536 if (vp) 2537 process_removes(vp); 2538 process_removes(dvp); 2539 softdep_speedup(); 2540 process_worklist_item(UFSTOVFS(ump), LK_NOWAIT); 2541 process_worklist_item(UFSTOVFS(ump), LK_NOWAIT); 2542 if (journal_space(ump, 0) == 0) { 2543 softdep_speedup(); 2544 if (journal_space(ump, 1) == 0) 2545 journal_suspend(ump); 2546 } 2547 } 2548 2549 static void 2550 jseg_write(fs, jblocks, jseg, data) 2551 struct fs *fs; 2552 struct jblocks *jblocks; 2553 struct jseg *jseg; 2554 uint8_t *data; 2555 { 2556 struct jsegrec *rec; 2557 2558 rec = (struct jsegrec *)data; 2559 rec->jsr_seq = jseg->js_seq; 2560 rec->jsr_oldest = jblocks->jb_oldestseq; 2561 rec->jsr_cnt = jseg->js_cnt; 2562 rec->jsr_blocks = jseg->js_size / DEV_BSIZE; 2563 rec->jsr_crc = 0; 2564 rec->jsr_time = fs->fs_mtime; 2565 } 2566 2567 static inline void 2568 inoref_write(inoref, jseg, rec) 2569 struct inoref *inoref; 2570 struct jseg *jseg; 2571 struct jrefrec *rec; 2572 { 2573 2574 inoref->if_jsegdep->jd_seg = jseg; 2575 rec->jr_ino = inoref->if_ino; 2576 rec->jr_parent = inoref->if_parent; 2577 rec->jr_nlink = inoref->if_nlink; 2578 rec->jr_mode = inoref->if_mode; 2579 rec->jr_diroff = inoref->if_diroff; 2580 } 2581 2582 static void 2583 jaddref_write(jaddref, jseg, data) 2584 struct jaddref *jaddref; 2585 struct jseg *jseg; 2586 uint8_t *data; 2587 { 2588 struct jrefrec *rec; 2589 2590 rec = (struct jrefrec *)data; 2591 rec->jr_op = JOP_ADDREF; 2592 inoref_write(&jaddref->ja_ref, jseg, rec); 2593 } 2594 2595 static void 2596 jremref_write(jremref, jseg, data) 2597 struct jremref *jremref; 2598 struct jseg *jseg; 2599 uint8_t *data; 2600 { 2601 struct jrefrec *rec; 2602 2603 rec = (struct jrefrec *)data; 2604 rec->jr_op = JOP_REMREF; 2605 inoref_write(&jremref->jr_ref, jseg, rec); 2606 } 2607 2608 static void 2609 jmvref_write(jmvref, jseg, data) 2610 struct jmvref *jmvref; 2611 struct jseg *jseg; 2612 uint8_t *data; 2613 { 2614 struct jmvrec *rec; 2615 2616 rec = (struct jmvrec *)data; 2617 rec->jm_op = JOP_MVREF; 2618 rec->jm_ino = jmvref->jm_ino; 2619 rec->jm_parent = jmvref->jm_parent; 2620 rec->jm_oldoff = jmvref->jm_oldoff; 2621 rec->jm_newoff = jmvref->jm_newoff; 2622 } 2623 2624 static void 2625 jnewblk_write(jnewblk, jseg, data) 2626 struct jnewblk *jnewblk; 2627 struct jseg *jseg; 2628 uint8_t *data; 2629 { 2630 struct jblkrec *rec; 2631 2632 jnewblk->jn_jsegdep->jd_seg = jseg; 2633 rec = (struct jblkrec *)data; 2634 rec->jb_op = JOP_NEWBLK; 2635 rec->jb_ino = jnewblk->jn_ino; 2636 rec->jb_blkno = jnewblk->jn_blkno; 2637 rec->jb_lbn = jnewblk->jn_lbn; 2638 rec->jb_frags = jnewblk->jn_frags; 2639 rec->jb_oldfrags = jnewblk->jn_oldfrags; 2640 } 2641 2642 static void 2643 jfreeblk_write(jfreeblk, jseg, data) 2644 struct jfreeblk *jfreeblk; 2645 struct jseg *jseg; 2646 uint8_t *data; 2647 { 2648 struct jblkrec *rec; 2649 2650 jfreeblk->jf_jsegdep->jd_seg = jseg; 2651 rec = (struct jblkrec *)data; 2652 rec->jb_op = JOP_FREEBLK; 2653 rec->jb_ino = jfreeblk->jf_ino; 2654 rec->jb_blkno = jfreeblk->jf_blkno; 2655 rec->jb_lbn = jfreeblk->jf_lbn; 2656 rec->jb_frags = 
jfreeblk->jf_frags; 2657 rec->jb_oldfrags = 0; 2658 } 2659 2660 static void 2661 jfreefrag_write(jfreefrag, jseg, data) 2662 struct jfreefrag *jfreefrag; 2663 struct jseg *jseg; 2664 uint8_t *data; 2665 { 2666 struct jblkrec *rec; 2667 2668 jfreefrag->fr_jsegdep->jd_seg = jseg; 2669 rec = (struct jblkrec *)data; 2670 rec->jb_op = JOP_FREEBLK; 2671 rec->jb_ino = jfreefrag->fr_ino; 2672 rec->jb_blkno = jfreefrag->fr_blkno; 2673 rec->jb_lbn = jfreefrag->fr_lbn; 2674 rec->jb_frags = jfreefrag->fr_frags; 2675 rec->jb_oldfrags = 0; 2676 } 2677 2678 static void 2679 jtrunc_write(jtrunc, jseg, data) 2680 struct jtrunc *jtrunc; 2681 struct jseg *jseg; 2682 uint8_t *data; 2683 { 2684 struct jtrncrec *rec; 2685 2686 rec = (struct jtrncrec *)data; 2687 rec->jt_op = JOP_TRUNC; 2688 rec->jt_ino = jtrunc->jt_ino; 2689 rec->jt_size = jtrunc->jt_size; 2690 rec->jt_extsize = jtrunc->jt_extsize; 2691 } 2692 2693 /* 2694 * Flush some journal records to disk. 2695 */ 2696 static void 2697 softdep_process_journal(mp, flags) 2698 struct mount *mp; 2699 int flags; 2700 { 2701 struct jblocks *jblocks; 2702 struct ufsmount *ump; 2703 struct worklist *wk; 2704 struct jseg *jseg; 2705 struct buf *bp; 2706 uint8_t *data; 2707 struct fs *fs; 2708 int segwritten; 2709 int jrecmin; /* Minimum records per block. */ 2710 int jrecmax; /* Maximum records per block. */ 2711 int size; 2712 int cnt; 2713 int off; 2714 2715 if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) 2716 return; 2717 ump = VFSTOUFS(mp); 2718 fs = ump->um_fs; 2719 jblocks = ump->softdep_jblocks; 2720 /* 2721 * We write anywhere between a disk block and fs block. The upper 2722 * bound is picked to prevent buffer cache fragmentation and limit 2723 * processing time per I/O. 2724 */ 2725 jrecmin = (DEV_BSIZE / JREC_SIZE) - 1; /* -1 for seg header */ 2726 jrecmax = (fs->fs_bsize / DEV_BSIZE) * jrecmin; 2727 segwritten = 0; 2728 while ((cnt = ump->softdep_on_journal) != 0) { 2729 /* 2730 * Create a new segment to hold as many as 'cnt' journal 2731 * entries and add them to the segment. Notice cnt is 2732 * off by one to account for the space required by the 2733 * jsegrec. If we don't have a full block to log skip it 2734 * unless we haven't written anything. 2735 */ 2736 cnt++; 2737 if (cnt < jrecmax && segwritten) 2738 break; 2739 /* 2740 * Verify some free journal space. softdep_prealloc() should 2741 * guarantee that we don't run out so this is indicative of 2742 * a problem with the flow control. Try to recover 2743 * gracefully in any event. 2744 */ 2745 while (jblocks->jb_free == 0) { 2746 if (flags != MNT_WAIT) 2747 break; 2748 printf("softdep: Out of journal space!\n"); 2749 softdep_speedup(); 2750 msleep(jblocks, &lk, PRIBIO, "jblocks", hz); 2751 } 2752 FREE_LOCK(&lk); 2753 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS); 2754 workitem_alloc(&jseg->js_list, D_JSEG, mp); 2755 LIST_INIT(&jseg->js_entries); 2756 jseg->js_state = ATTACHED; 2757 jseg->js_jblocks = jblocks; 2758 bp = geteblk(fs->fs_bsize, 0); 2759 ACQUIRE_LOCK(&lk); 2760 /* 2761 * If there was a race while we were allocating the block 2762 * and jseg the entry we care about was likely written. 2763 * We bail out in both the WAIT and NOWAIT case and assume 2764 * the caller will loop if the entry it cares about is 2765 * not written. 
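 * In that case the speculatively allocated buffer and jseg are released
 * below before breaking out of the loop.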
2766 */ 2767 if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) { 2768 bp->b_flags |= B_INVAL | B_NOCACHE; 2769 WORKITEM_FREE(jseg, D_JSEG); 2770 FREE_LOCK(&lk); 2771 brelse(bp); 2772 ACQUIRE_LOCK(&lk); 2773 break; 2774 } 2775 /* 2776 * Calculate the disk block size required for the available 2777 * records rounded to the min size. 2778 */ 2779 cnt = ump->softdep_on_journal; 2780 if (cnt < jrecmax) 2781 size = howmany(cnt, jrecmin) * DEV_BSIZE; 2782 else 2783 size = fs->fs_bsize; 2784 /* 2785 * Allocate a disk block for this journal data and account 2786 * for truncation of the requested size if enough contiguous 2787 * space was not available. 2788 */ 2789 bp->b_blkno = jblocks_alloc(jblocks, size, &size); 2790 bp->b_lblkno = bp->b_blkno; 2791 bp->b_offset = bp->b_blkno * DEV_BSIZE; 2792 bp->b_bcount = size; 2793 bp->b_bufobj = &ump->um_devvp->v_bufobj; 2794 bp->b_flags &= ~B_INVAL; 2795 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY; 2796 /* 2797 * Initialize our jseg with cnt records. Assign the next 2798 * sequence number to it and link it in-order. 2799 */ 2800 cnt = MIN(ump->softdep_on_journal, 2801 (size / DEV_BSIZE) * jrecmin); 2802 jseg->js_buf = bp; 2803 jseg->js_cnt = cnt; 2804 jseg->js_refs = cnt + 1; /* Self ref. */ 2805 jseg->js_size = size; 2806 jseg->js_seq = jblocks->jb_nextseq++; 2807 if (TAILQ_EMPTY(&jblocks->jb_segs)) 2808 jblocks->jb_oldestseq = jseg->js_seq; 2809 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); 2810 if (jblocks->jb_writeseg == NULL) 2811 jblocks->jb_writeseg = jseg; 2812 /* 2813 * Start filling in records from the pending list. 2814 */ 2815 data = bp->b_data; 2816 off = 0; 2817 while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) 2818 != NULL) { 2819 /* Place a segment header on every device block. */ 2820 if ((off % DEV_BSIZE) == 0) { 2821 jseg_write(fs, jblocks, jseg, data); 2822 off += JREC_SIZE; 2823 data = bp->b_data + off; 2824 } 2825 remove_from_journal(wk); 2826 wk->wk_state |= IOSTARTED; 2827 WORKLIST_INSERT(&jseg->js_entries, wk); 2828 switch (wk->wk_type) { 2829 case D_JADDREF: 2830 jaddref_write(WK_JADDREF(wk), jseg, data); 2831 break; 2832 case D_JREMREF: 2833 jremref_write(WK_JREMREF(wk), jseg, data); 2834 break; 2835 case D_JMVREF: 2836 jmvref_write(WK_JMVREF(wk), jseg, data); 2837 break; 2838 case D_JNEWBLK: 2839 jnewblk_write(WK_JNEWBLK(wk), jseg, data); 2840 break; 2841 case D_JFREEBLK: 2842 jfreeblk_write(WK_JFREEBLK(wk), jseg, data); 2843 break; 2844 case D_JFREEFRAG: 2845 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data); 2846 break; 2847 case D_JTRUNC: 2848 jtrunc_write(WK_JTRUNC(wk), jseg, data); 2849 break; 2850 default: 2851 panic("process_journal: Unknown type %s", 2852 TYPENAME(wk->wk_type)); 2853 /* NOTREACHED */ 2854 } 2855 if (--cnt == 0) 2856 break; 2857 off += JREC_SIZE; 2858 data = bp->b_data + off; 2859 } 2860 /* 2861 * Write this one buffer and continue. 2862 */ 2863 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); 2864 FREE_LOCK(&lk); 2865 BO_LOCK(bp->b_bufobj); 2866 bgetvp(ump->um_devvp, bp); 2867 BO_UNLOCK(bp->b_bufobj); 2868 if (flags == MNT_NOWAIT) 2869 bawrite(bp); 2870 else 2871 bwrite(bp); 2872 ACQUIRE_LOCK(&lk); 2873 } 2874 /* 2875 * If we've suspended the filesystem because we ran out of journal 2876 * space either try to sync it here to make some progress or 2877 * unsuspend it if we already have. 
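 * The VFS_SYNC() and superblock update below push outstanding work to disk
 * in an attempt to retire enough dependencies to free journal space.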
2878 */ 2879 if (flags == 0 && jblocks->jb_suspended) { 2880 if (journal_unsuspend(ump)) 2881 return; 2882 FREE_LOCK(&lk); 2883 VFS_SYNC(mp, MNT_NOWAIT); 2884 ffs_sbupdate(ump, MNT_WAIT, 0); 2885 ACQUIRE_LOCK(&lk); 2886 } 2887 } 2888 2889 /* 2890 * Complete a jseg, allowing all dependencies awaiting journal writes 2891 * to proceed. Each journal dependency also attaches a jsegdep to dependent 2892 * structures so that the journal segment can be freed to reclaim space. 2893 */ 2894 static void 2895 complete_jseg(jseg) 2896 struct jseg *jseg; 2897 { 2898 struct worklist *wk; 2899 struct jmvref *jmvref; 2900 int waiting; 2901 int i; 2902 2903 i = 0; 2904 while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { 2905 WORKLIST_REMOVE(wk); 2906 waiting = wk->wk_state & IOWAITING; 2907 wk->wk_state &= ~(IOSTARTED | IOWAITING); 2908 wk->wk_state |= COMPLETE; 2909 KASSERT(i < jseg->js_cnt, 2910 ("handle_written_jseg: overflow %d >= %d", 2911 i, jseg->js_cnt)); 2912 switch (wk->wk_type) { 2913 case D_JADDREF: 2914 handle_written_jaddref(WK_JADDREF(wk)); 2915 break; 2916 case D_JREMREF: 2917 handle_written_jremref(WK_JREMREF(wk)); 2918 break; 2919 case D_JMVREF: 2920 /* No jsegdep here. */ 2921 free_jseg(jseg); 2922 jmvref = WK_JMVREF(wk); 2923 LIST_REMOVE(jmvref, jm_deps); 2924 free_pagedep(jmvref->jm_pagedep); 2925 WORKITEM_FREE(jmvref, D_JMVREF); 2926 break; 2927 case D_JNEWBLK: 2928 handle_written_jnewblk(WK_JNEWBLK(wk)); 2929 break; 2930 case D_JFREEBLK: 2931 handle_written_jfreeblk(WK_JFREEBLK(wk)); 2932 break; 2933 case D_JFREEFRAG: 2934 handle_written_jfreefrag(WK_JFREEFRAG(wk)); 2935 break; 2936 case D_JTRUNC: 2937 WK_JTRUNC(wk)->jt_jsegdep->jd_seg = jseg; 2938 WORKITEM_FREE(wk, D_JTRUNC); 2939 break; 2940 default: 2941 panic("handle_written_jseg: Unknown type %s", 2942 TYPENAME(wk->wk_type)); 2943 /* NOTREACHED */ 2944 } 2945 if (waiting) 2946 wakeup(wk); 2947 } 2948 /* Release the self reference so the structure may be freed. */ 2949 free_jseg(jseg); 2950 } 2951 2952 /* 2953 * Mark a jseg as DEPCOMPLETE and throw away the buffer. Handle jseg 2954 * completions in order only. 2955 */ 2956 static void 2957 handle_written_jseg(jseg, bp) 2958 struct jseg *jseg; 2959 struct buf *bp; 2960 { 2961 struct jblocks *jblocks; 2962 struct jseg *jsegn; 2963 2964 if (jseg->js_refs == 0) 2965 panic("handle_written_jseg: No self-reference on %p", jseg); 2966 jseg->js_state |= DEPCOMPLETE; 2967 /* 2968 * We'll never need this buffer again, set flags so it will be 2969 * discarded. 2970 */ 2971 bp->b_flags |= B_INVAL | B_NOCACHE; 2972 jblocks = jseg->js_jblocks; 2973 /* 2974 * Don't allow out of order completions. If this isn't the first 2975 * block wait for it to write before we're done. 2976 */ 2977 if (jseg != jblocks->jb_writeseg) 2978 return; 2979 /* Iterate through available jsegs processing their entries. */ 2980 do { 2981 jsegn = TAILQ_NEXT(jseg, js_next); 2982 complete_jseg(jseg); 2983 jseg = jsegn; 2984 } while (jseg && jseg->js_state & DEPCOMPLETE); 2985 jblocks->jb_writeseg = jseg; 2986 } 2987 2988 static inline struct jsegdep * 2989 inoref_jseg(inoref) 2990 struct inoref *inoref; 2991 { 2992 struct jsegdep *jsegdep; 2993 2994 jsegdep = inoref->if_jsegdep; 2995 inoref->if_jsegdep = NULL; 2996 2997 return (jsegdep); 2998 } 2999 3000 /* 3001 * Called once a jremref has made it to stable store. The jremref is marked 3002 * complete and we attempt to free it. Any pagedeps writes sleeping waiting 3003 * for the jremref to complete will be awoken by free_jremref. 
3004 */ 3005 static void 3006 handle_written_jremref(jremref) 3007 struct jremref *jremref; 3008 { 3009 struct inodedep *inodedep; 3010 struct jsegdep *jsegdep; 3011 struct dirrem *dirrem; 3012 3013 /* Grab the jsegdep. */ 3014 jsegdep = inoref_jseg(&jremref->jr_ref); 3015 /* 3016 * Remove us from the inoref list. 3017 */ 3018 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 3019 0, &inodedep) == 0) 3020 panic("handle_written_jremref: Lost inodedep"); 3021 TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 3022 /* 3023 * Complete the dirrem. 3024 */ 3025 dirrem = jremref->jr_dirrem; 3026 jremref->jr_dirrem = NULL; 3027 LIST_REMOVE(jremref, jr_deps); 3028 jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; 3029 WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list); 3030 if (LIST_EMPTY(&dirrem->dm_jremrefhd) && 3031 (dirrem->dm_state & COMPLETE) != 0) 3032 add_to_worklist(&dirrem->dm_list, 0); 3033 free_jremref(jremref); 3034 } 3035 3036 /* 3037 * Called once a jaddref has made it to stable store. The dependency is 3038 * marked complete and any dependent structures are added to the inode 3039 * bufwait list to be completed as soon as it is written. If a bitmap write 3040 * depends on this entry we move the inode into the inodedephd of the 3041 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. 3042 */ 3043 static void 3044 handle_written_jaddref(jaddref) 3045 struct jaddref *jaddref; 3046 { 3047 struct jsegdep *jsegdep; 3048 struct inodedep *inodedep; 3049 struct diradd *diradd; 3050 struct mkdir *mkdir; 3051 3052 /* Grab the jsegdep. */ 3053 jsegdep = inoref_jseg(&jaddref->ja_ref); 3054 mkdir = NULL; 3055 diradd = NULL; 3056 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 3057 0, &inodedep) == 0) 3058 panic("handle_written_jaddref: Lost inodedep."); 3059 if (jaddref->ja_diradd == NULL) 3060 panic("handle_written_jaddref: No dependency"); 3061 if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { 3062 diradd = jaddref->ja_diradd; 3063 WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); 3064 } else if (jaddref->ja_state & MKDIR_PARENT) { 3065 mkdir = jaddref->ja_mkdir; 3066 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); 3067 } else if (jaddref->ja_state & MKDIR_BODY) 3068 mkdir = jaddref->ja_mkdir; 3069 else 3070 panic("handle_written_jaddref: Unknown dependency %p", 3071 jaddref->ja_diradd); 3072 jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ 3073 /* 3074 * Remove us from the inode list. 3075 */ 3076 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); 3077 /* 3078 * The mkdir may be waiting on the jaddref to clear before freeing. 3079 */ 3080 if (mkdir) { 3081 KASSERT(mkdir->md_list.wk_type == D_MKDIR, 3082 ("handle_written_jaddref: Incorrect type for mkdir %s", 3083 TYPENAME(mkdir->md_list.wk_type))); 3084 mkdir->md_jaddref = NULL; 3085 diradd = mkdir->md_diradd; 3086 mkdir->md_state |= DEPCOMPLETE; 3087 complete_mkdir(mkdir); 3088 } 3089 WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list); 3090 if (jaddref->ja_state & NEWBLOCK) { 3091 inodedep->id_state |= ONDEPLIST; 3092 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, 3093 inodedep, id_deps); 3094 } 3095 free_jaddref(jaddref); 3096 } 3097 3098 /* 3099 * Called once a jnewblk journal is written. The allocdirect or allocindir 3100 * is placed in the bmsafemap to await notification of a written bitmap. 
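 * The jsegdep moves to the newblk's jwork list so the journal segment is
 * held until the block's remaining dependencies complete.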
3101 */ 3102 static void 3103 handle_written_jnewblk(jnewblk) 3104 struct jnewblk *jnewblk; 3105 { 3106 struct bmsafemap *bmsafemap; 3107 struct jsegdep *jsegdep; 3108 struct newblk *newblk; 3109 3110 /* Grab the jsegdep. */ 3111 jsegdep = jnewblk->jn_jsegdep; 3112 jnewblk->jn_jsegdep = NULL; 3113 /* 3114 * Add the written block to the bmsafemap so it can be notified when 3115 * the bitmap is on disk. 3116 */ 3117 newblk = jnewblk->jn_newblk; 3118 jnewblk->jn_newblk = NULL; 3119 if (newblk == NULL) 3120 panic("handle_written_jnewblk: No dependency for the segdep."); 3121 3122 newblk->nb_jnewblk = NULL; 3123 bmsafemap = newblk->nb_bmsafemap; 3124 WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list); 3125 newblk->nb_state |= ONDEPLIST; 3126 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 3127 free_jnewblk(jnewblk); 3128 } 3129 3130 /* 3131 * Cancel a jfreefrag that won't be needed, probably due to colliding with 3132 * an in-flight allocation that has not yet been committed. Divorce us 3133 * from the freefrag and mark it DEPCOMPLETE so that it may be added 3134 * to the worklist. 3135 */ 3136 static void 3137 cancel_jfreefrag(jfreefrag) 3138 struct jfreefrag *jfreefrag; 3139 { 3140 struct freefrag *freefrag; 3141 3142 if (jfreefrag->fr_jsegdep) { 3143 free_jsegdep(jfreefrag->fr_jsegdep); 3144 jfreefrag->fr_jsegdep = NULL; 3145 } 3146 freefrag = jfreefrag->fr_freefrag; 3147 jfreefrag->fr_freefrag = NULL; 3148 freefrag->ff_jfreefrag = NULL; 3149 free_jfreefrag(jfreefrag); 3150 freefrag->ff_state |= DEPCOMPLETE; 3151 } 3152 3153 /* 3154 * Free a jfreefrag when the parent freefrag is rendered obsolete. 3155 */ 3156 static void 3157 free_jfreefrag(jfreefrag) 3158 struct jfreefrag *jfreefrag; 3159 { 3160 3161 if (jfreefrag->fr_state & IOSTARTED) 3162 WORKLIST_REMOVE(&jfreefrag->fr_list); 3163 else if (jfreefrag->fr_state & ONWORKLIST) 3164 remove_from_journal(&jfreefrag->fr_list); 3165 if (jfreefrag->fr_freefrag != NULL) 3166 panic("free_jfreefrag: Still attached to a freefrag."); 3167 WORKITEM_FREE(jfreefrag, D_JFREEFRAG); 3168 } 3169 3170 /* 3171 * Called when the journal write for a jfreefrag completes. The parent 3172 * freefrag is added to the worklist if this completes its dependencies. 3173 */ 3174 static void 3175 handle_written_jfreefrag(jfreefrag) 3176 struct jfreefrag *jfreefrag; 3177 { 3178 struct jsegdep *jsegdep; 3179 struct freefrag *freefrag; 3180 3181 /* Grab the jsegdep. */ 3182 jsegdep = jfreefrag->fr_jsegdep; 3183 jfreefrag->fr_jsegdep = NULL; 3184 freefrag = jfreefrag->fr_freefrag; 3185 if (freefrag == NULL) 3186 panic("handle_written_jfreefrag: No freefrag."); 3187 freefrag->ff_state |= DEPCOMPLETE; 3188 freefrag->ff_jfreefrag = NULL; 3189 WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list); 3190 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 3191 add_to_worklist(&freefrag->ff_list, 0); 3192 jfreefrag->fr_freefrag = NULL; 3193 free_jfreefrag(jfreefrag); 3194 } 3195 3196 /* 3197 * Called when the journal write for a jfreeblk completes. The jfreeblk 3198 * is removed from the freeblks list of pending journal writes and the 3199 * jsegdep is moved to the freeblks jwork to be completed when all blocks 3200 * have been reclaimed. 3201 */ 3202 static void 3203 handle_written_jfreeblk(jfreeblk) 3204 struct jfreeblk *jfreeblk; 3205 { 3206 struct freeblks *freeblks; 3207 struct jsegdep *jsegdep; 3208 3209 /* Grab the jsegdep. 
*/ 3210 jsegdep = jfreeblk->jf_jsegdep; 3211 jfreeblk->jf_jsegdep = NULL; 3212 freeblks = jfreeblk->jf_freeblks; 3213 LIST_REMOVE(jfreeblk, jf_deps); 3214 WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list); 3215 /* 3216 * If the freeblks is all journaled, we can add it to the worklist. 3217 */ 3218 if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) && 3219 (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) { 3220 /* Remove from the b_dep that is waiting on this write. */ 3221 if (freeblks->fb_state & ONWORKLIST) 3222 WORKLIST_REMOVE(&freeblks->fb_list); 3223 add_to_worklist(&freeblks->fb_list, 1); 3224 } 3225 3226 free_jfreeblk(jfreeblk); 3227 } 3228 3229 static struct jsegdep * 3230 newjsegdep(struct worklist *wk) 3231 { 3232 struct jsegdep *jsegdep; 3233 3234 jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS); 3235 workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp); 3236 jsegdep->jd_seg = NULL; 3237 3238 return (jsegdep); 3239 } 3240 3241 static struct jmvref * 3242 newjmvref(dp, ino, oldoff, newoff) 3243 struct inode *dp; 3244 ino_t ino; 3245 off_t oldoff; 3246 off_t newoff; 3247 { 3248 struct jmvref *jmvref; 3249 3250 jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS); 3251 workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump)); 3252 jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE; 3253 jmvref->jm_parent = dp->i_number; 3254 jmvref->jm_ino = ino; 3255 jmvref->jm_oldoff = oldoff; 3256 jmvref->jm_newoff = newoff; 3257 3258 return (jmvref); 3259 } 3260 3261 /* 3262 * Allocate a new jremref that tracks the removal of ip from dp with the 3263 * directory entry offset of diroff. Mark the entry as ATTACHED and 3264 * DEPCOMPLETE as we have all the information required for the journal write 3265 * and the directory has already been removed from the buffer. The caller 3266 * is responsible for linking the jremref into the pagedep and adding it 3267 * to the journal to write. The MKDIR_PARENT flag is set if we're doing 3268 * a DOTDOT addition so handle_workitem_remove() can properly assign 3269 * the jsegdep when we're done. 3270 */ 3271 static struct jremref * 3272 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip, 3273 off_t diroff, nlink_t nlink) 3274 { 3275 struct jremref *jremref; 3276 3277 jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS); 3278 workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump)); 3279 jremref->jr_state = ATTACHED; 3280 newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff, 3281 nlink, ip->i_mode); 3282 jremref->jr_dirrem = dirrem; 3283 3284 return (jremref); 3285 } 3286 3287 static inline void 3288 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff, 3289 nlink_t nlink, uint16_t mode) 3290 { 3291 3292 inoref->if_jsegdep = newjsegdep(&inoref->if_list); 3293 inoref->if_diroff = diroff; 3294 inoref->if_ino = ino; 3295 inoref->if_parent = parent; 3296 inoref->if_nlink = nlink; 3297 inoref->if_mode = mode; 3298 } 3299 3300 /* 3301 * Allocate a new jaddref to track the addition of ino to dp at diroff. The 3302 * directory offset may not be known until later. The caller is responsible 3303 * adding the entry to the journal when this information is available. nlink 3304 * should be the link count prior to the addition and mode is only required 3305 * to have the correct FMT. 
3306 */ 3307 static struct jaddref * 3308 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink, 3309 uint16_t mode) 3310 { 3311 struct jaddref *jaddref; 3312 3313 jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS); 3314 workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump)); 3315 jaddref->ja_state = ATTACHED; 3316 jaddref->ja_mkdir = NULL; 3317 newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode); 3318 3319 return (jaddref); 3320 } 3321 3322 /* 3323 * Create a new free dependency for a freework. The caller is responsible 3324 * for adjusting the reference count when it has the lock held. The freedep 3325 * will track an outstanding bitmap write that will ultimately clear the 3326 * freework to continue. 3327 */ 3328 static struct freedep * 3329 newfreedep(struct freework *freework) 3330 { 3331 struct freedep *freedep; 3332 3333 freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); 3334 workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); 3335 freedep->fd_freework = freework; 3336 3337 return (freedep); 3338 } 3339 3340 /* 3341 * Free a freedep structure once the buffer it is linked to is written. If 3342 * this is the last reference to the freework schedule it for completion. 3343 */ 3344 static void 3345 free_freedep(freedep) 3346 struct freedep *freedep; 3347 { 3348 3349 if (--freedep->fd_freework->fw_ref == 0) 3350 add_to_worklist(&freedep->fd_freework->fw_list, 1); 3351 WORKITEM_FREE(freedep, D_FREEDEP); 3352 } 3353 3354 /* 3355 * Allocate a new freework structure that may be a level in an indirect 3356 * when parent is not NULL or a top level block when it is. The top level 3357 * freework structures are allocated without lk held and before the freeblks 3358 * is visible outside of softdep_setup_freeblocks(). 3359 */ 3360 static struct freework * 3361 newfreework(freeblks, parent, lbn, nb, frags, journal) 3362 struct freeblks *freeblks; 3363 struct freework *parent; 3364 ufs_lbn_t lbn; 3365 ufs2_daddr_t nb; 3366 int frags; 3367 int journal; 3368 { 3369 struct freework *freework; 3370 3371 freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); 3372 workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); 3373 freework->fw_freeblks = freeblks; 3374 freework->fw_parent = parent; 3375 freework->fw_lbn = lbn; 3376 freework->fw_blkno = nb; 3377 freework->fw_frags = frags; 3378 freework->fw_ref = 0; 3379 freework->fw_off = 0; 3380 LIST_INIT(&freework->fw_jwork); 3381 3382 if (parent == NULL) { 3383 WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd, 3384 &freework->fw_list); 3385 freeblks->fb_ref++; 3386 } 3387 if (journal) 3388 newjfreeblk(freeblks, lbn, nb, frags); 3389 3390 return (freework); 3391 } 3392 3393 /* 3394 * Allocate a new jfreeblk to journal top level block pointer when truncating 3395 * a file. The caller must add this to the worklist when lk is held. 
3396 */ 3397 static struct jfreeblk * 3398 newjfreeblk(freeblks, lbn, blkno, frags) 3399 struct freeblks *freeblks; 3400 ufs_lbn_t lbn; 3401 ufs2_daddr_t blkno; 3402 int frags; 3403 { 3404 struct jfreeblk *jfreeblk; 3405 3406 jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); 3407 workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp); 3408 jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list); 3409 jfreeblk->jf_state = ATTACHED | DEPCOMPLETE; 3410 jfreeblk->jf_ino = freeblks->fb_previousinum; 3411 jfreeblk->jf_lbn = lbn; 3412 jfreeblk->jf_blkno = blkno; 3413 jfreeblk->jf_frags = frags; 3414 jfreeblk->jf_freeblks = freeblks; 3415 LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps); 3416 3417 return (jfreeblk); 3418 } 3419 3420 static void move_newblock_dep(struct jaddref *, struct inodedep *); 3421 /* 3422 * If we're canceling a new bitmap we have to search for another ref 3423 * to move into the bmsafemap dep. This might be better expressed 3424 * with another structure. 3425 */ 3426 static void 3427 move_newblock_dep(jaddref, inodedep) 3428 struct jaddref *jaddref; 3429 struct inodedep *inodedep; 3430 { 3431 struct inoref *inoref; 3432 struct jaddref *jaddrefn; 3433 3434 jaddrefn = NULL; 3435 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; 3436 inoref = TAILQ_NEXT(inoref, if_deps)) { 3437 if ((jaddref->ja_state & NEWBLOCK) && 3438 inoref->if_list.wk_type == D_JADDREF) { 3439 jaddrefn = (struct jaddref *)inoref; 3440 break; 3441 } 3442 } 3443 if (jaddrefn == NULL) 3444 return; 3445 jaddrefn->ja_state &= ~(ATTACHED | UNDONE); 3446 jaddrefn->ja_state |= jaddref->ja_state & 3447 (ATTACHED | UNDONE | NEWBLOCK); 3448 jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); 3449 jaddref->ja_state |= ATTACHED; 3450 LIST_REMOVE(jaddref, ja_bmdeps); 3451 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, 3452 ja_bmdeps); 3453 } 3454 3455 /* 3456 * Cancel a jaddref either before it has been written or while it is being 3457 * written. This happens when a link is removed before the add reaches 3458 * the disk. The jaddref dependency is kept linked into the bmsafemap 3459 * and inode to prevent the link count or bitmap from reaching the disk 3460 * until handle_workitem_remove() re-adjusts the counts and bitmaps as 3461 * required. 3462 * 3463 * Returns 1 if the canceled addref requires journaling of the remove and 3464 * 0 otherwise. 3465 */ 3466 static int 3467 cancel_jaddref(jaddref, inodedep, wkhd) 3468 struct jaddref *jaddref; 3469 struct inodedep *inodedep; 3470 struct workhead *wkhd; 3471 { 3472 struct inoref *inoref; 3473 struct jsegdep *jsegdep; 3474 int needsj; 3475 3476 KASSERT((jaddref->ja_state & COMPLETE) == 0, 3477 ("cancel_jaddref: Canceling complete jaddref")); 3478 if (jaddref->ja_state & (IOSTARTED | COMPLETE)) 3479 needsj = 1; 3480 else 3481 needsj = 0; 3482 if (inodedep == NULL) 3483 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 3484 0, &inodedep) == 0) 3485 panic("cancel_jaddref: Lost inodedep"); 3486 /* 3487 * We must adjust the nlink of any reference operation that follows 3488 * us so that it is consistent with the in-memory reference. This 3489 * ensures that inode nlink rollbacks always have the correct link. 
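 * When no journaling of the removal is required (needsj == 0), each later
 * inoref on the list has its if_nlink decremented below.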
3490 */ 3491 if (needsj == 0) 3492 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; 3493 inoref = TAILQ_NEXT(inoref, if_deps)) 3494 inoref->if_nlink--; 3495 jsegdep = inoref_jseg(&jaddref->ja_ref); 3496 if (jaddref->ja_state & NEWBLOCK) 3497 move_newblock_dep(jaddref, inodedep); 3498 if (jaddref->ja_state & IOWAITING) { 3499 jaddref->ja_state &= ~IOWAITING; 3500 wakeup(&jaddref->ja_list); 3501 } 3502 jaddref->ja_mkdir = NULL; 3503 if (jaddref->ja_state & IOSTARTED) { 3504 jaddref->ja_state &= ~IOSTARTED; 3505 WORKLIST_REMOVE(&jaddref->ja_list); 3506 WORKLIST_INSERT(wkhd, &jsegdep->jd_list); 3507 } else { 3508 free_jsegdep(jsegdep); 3509 if (jaddref->ja_state & DEPCOMPLETE) 3510 remove_from_journal(&jaddref->ja_list); 3511 } 3512 /* 3513 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove 3514 * can arrange for them to be freed with the bitmap. Otherwise we 3515 * no longer need this addref attached to the inoreflst and it 3516 * will incorrectly adjust nlink if we leave it. 3517 */ 3518 if ((jaddref->ja_state & NEWBLOCK) == 0) { 3519 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, 3520 if_deps); 3521 jaddref->ja_state |= COMPLETE; 3522 free_jaddref(jaddref); 3523 return (needsj); 3524 } 3525 jaddref->ja_state |= GOINGAWAY; 3526 /* 3527 * Leave the head of the list for jsegdeps for fast merging. 3528 */ 3529 if (LIST_FIRST(wkhd) != NULL) { 3530 jaddref->ja_state |= ONWORKLIST; 3531 LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list); 3532 } else 3533 WORKLIST_INSERT(wkhd, &jaddref->ja_list); 3534 3535 return (needsj); 3536 } 3537 3538 /* 3539 * Attempt to free a jaddref structure when some work completes. This 3540 * should only succeed once the entry is written and all dependencies have 3541 * been notified. 3542 */ 3543 static void 3544 free_jaddref(jaddref) 3545 struct jaddref *jaddref; 3546 { 3547 3548 if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE) 3549 return; 3550 if (jaddref->ja_ref.if_jsegdep) 3551 panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n", 3552 jaddref, jaddref->ja_state); 3553 if (jaddref->ja_state & NEWBLOCK) 3554 LIST_REMOVE(jaddref, ja_bmdeps); 3555 if (jaddref->ja_state & (IOSTARTED | ONWORKLIST)) 3556 panic("free_jaddref: Bad state %p(0x%X)", 3557 jaddref, jaddref->ja_state); 3558 if (jaddref->ja_mkdir != NULL) 3559 panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state); 3560 WORKITEM_FREE(jaddref, D_JADDREF); 3561 } 3562 3563 /* 3564 * Free a jremref structure once it has been written or discarded. 3565 */ 3566 static void 3567 free_jremref(jremref) 3568 struct jremref *jremref; 3569 { 3570 3571 if (jremref->jr_ref.if_jsegdep) 3572 free_jsegdep(jremref->jr_ref.if_jsegdep); 3573 if (jremref->jr_state & IOSTARTED) 3574 panic("free_jremref: IO still pending"); 3575 WORKITEM_FREE(jremref, D_JREMREF); 3576 } 3577 3578 /* 3579 * Free a jnewblk structure. 3580 */ 3581 static void 3582 free_jnewblk(jnewblk) 3583 struct jnewblk *jnewblk; 3584 { 3585 3586 if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE) 3587 return; 3588 LIST_REMOVE(jnewblk, jn_deps); 3589 if (jnewblk->jn_newblk != NULL) 3590 panic("free_jnewblk: Dependency still attached."); 3591 WORKITEM_FREE(jnewblk, D_JNEWBLK); 3592 } 3593 3594 /* 3595 * Cancel a jnewblk which has been superseded by a freeblk. The jnewblk 3596 * is kept linked into the bmsafemap until the free completes, thus 3597 * preventing the modified state from ever reaching disk. 
The free 3598 * routine must pass this structure via ffs_blkfree() to 3599 * softdep_setup_freeblks() so there is no race in releasing the space. 3600 */ 3601 static void 3602 cancel_jnewblk(jnewblk, wkhd) 3603 struct jnewblk *jnewblk; 3604 struct workhead *wkhd; 3605 { 3606 struct jsegdep *jsegdep; 3607 3608 jsegdep = jnewblk->jn_jsegdep; 3609 jnewblk->jn_jsegdep = NULL; 3610 free_jsegdep(jsegdep); 3611 jnewblk->jn_newblk = NULL; 3612 jnewblk->jn_state |= GOINGAWAY; 3613 if (jnewblk->jn_state & IOSTARTED) { 3614 jnewblk->jn_state &= ~IOSTARTED; 3615 WORKLIST_REMOVE(&jnewblk->jn_list); 3616 } else 3617 remove_from_journal(&jnewblk->jn_list); 3618 /* 3619 * Leave the head of the list for jsegdeps for fast merging. 3620 */ 3621 if (LIST_FIRST(wkhd) != NULL) { 3622 jnewblk->jn_state |= ONWORKLIST; 3623 LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jnewblk->jn_list, wk_list); 3624 } else 3625 WORKLIST_INSERT(wkhd, &jnewblk->jn_list); 3626 if (jnewblk->jn_state & IOWAITING) { 3627 jnewblk->jn_state &= ~IOWAITING; 3628 wakeup(&jnewblk->jn_list); 3629 } 3630 } 3631 3632 static void 3633 free_jfreeblk(jfreeblk) 3634 struct jfreeblk *jfreeblk; 3635 { 3636 3637 WORKITEM_FREE(jfreeblk, D_JFREEBLK); 3638 } 3639 3640 /* 3641 * Release one reference to a jseg and free it if the count reaches 0. This 3642 * should eventually reclaim journal space as well. 3643 */ 3644 static void 3645 free_jseg(jseg) 3646 struct jseg *jseg; 3647 { 3648 struct jblocks *jblocks; 3649 3650 KASSERT(jseg->js_refs > 0, 3651 ("free_jseg: Invalid refcnt %d", jseg->js_refs)); 3652 if (--jseg->js_refs != 0) 3653 return; 3654 /* 3655 * Free only those jsegs which have none allocated before them to 3656 * preserve the journal space ordering. 3657 */ 3658 jblocks = jseg->js_jblocks; 3659 while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { 3660 jblocks->jb_oldestseq = jseg->js_seq; 3661 if (jseg->js_refs != 0) 3662 break; 3663 TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); 3664 jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size); 3665 KASSERT(LIST_EMPTY(&jseg->js_entries), 3666 ("free_jseg: Freed jseg has valid entries.")); 3667 WORKITEM_FREE(jseg, D_JSEG); 3668 } 3669 } 3670 3671 /* 3672 * Release a jsegdep and decrement the jseg count. 3673 */ 3674 static void 3675 free_jsegdep(jsegdep) 3676 struct jsegdep *jsegdep; 3677 { 3678 3679 if (jsegdep->jd_seg) 3680 free_jseg(jsegdep->jd_seg); 3681 WORKITEM_FREE(jsegdep, D_JSEGDEP); 3682 } 3683 3684 /* 3685 * Wait for a journal item to make it to disk. Initiate journal processing 3686 * if required. 3687 */ 3688 static void 3689 jwait(wk) 3690 struct worklist *wk; 3691 { 3692 3693 stat_journal_wait++; 3694 /* 3695 * If IO has not started we process the journal. We can't mark the 3696 * worklist item as IOWAITING because we drop the lock while 3697 * processing the journal and the worklist entry may be freed after 3698 * this point. The caller may call back in and re-issue the request. 3699 */ 3700 if ((wk->wk_state & IOSTARTED) == 0) { 3701 softdep_process_journal(wk->wk_mp, MNT_WAIT); 3702 return; 3703 } 3704 wk->wk_state |= IOWAITING; 3705 msleep(wk, &lk, PRIBIO, "jwait", 0); 3706 } 3707 3708 /* 3709 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as 3710 * appropriate. This is a convenience function to reduce duplicate code 3711 * for the setup and revert functions below. 
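 * id_nlinkdelta is refreshed to i_nlink - i_effnlink, which the KASSERT in
 * the function guarantees is non-negative.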
3712 */ 3713 static struct inodedep * 3714 inodedep_lookup_ip(ip) 3715 struct inode *ip; 3716 { 3717 struct inodedep *inodedep; 3718 3719 KASSERT(ip->i_nlink >= ip->i_effnlink, 3720 ("inodedep_lookup_ip: bad delta")); 3721 (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 3722 DEPALLOC, &inodedep); 3723 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 3724 3725 return (inodedep); 3726 } 3727 3728 /* 3729 * Create a journal entry that describes a truncate that we're about to 3730 * perform. The inode allocations and frees between here and the completion 3731 * of the operation are done asynchronously and without journaling. At 3732 * the end of the operation the vnode is sync'd and the journal space 3733 * is released. Recovery will discover the partially completed truncate 3734 * and complete it. 3735 */ 3736 void * 3737 softdep_setup_trunc(vp, length, flags) 3738 struct vnode *vp; 3739 off_t length; 3740 int flags; 3741 { 3742 struct jsegdep *jsegdep; 3743 struct jtrunc *jtrunc; 3744 struct ufsmount *ump; 3745 struct inode *ip; 3746 3747 softdep_prealloc(vp, MNT_WAIT); 3748 ip = VTOI(vp); 3749 ump = VFSTOUFS(vp->v_mount); 3750 jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS); 3751 workitem_alloc(&jtrunc->jt_list, D_JTRUNC, vp->v_mount); 3752 jsegdep = jtrunc->jt_jsegdep = newjsegdep(&jtrunc->jt_list); 3753 jtrunc->jt_ino = ip->i_number; 3754 jtrunc->jt_extsize = 0; 3755 jtrunc->jt_size = length; 3756 if ((flags & IO_EXT) == 0 && ump->um_fstype == UFS2) 3757 jtrunc->jt_extsize = ip->i_din2->di_extsize; 3758 if ((flags & IO_NORMAL) == 0) 3759 jtrunc->jt_size = DIP(ip, i_size); 3760 ACQUIRE_LOCK(&lk); 3761 add_to_journal(&jtrunc->jt_list); 3762 while (jsegdep->jd_seg == NULL) { 3763 stat_jwait_freeblks++; 3764 jwait(&jtrunc->jt_list); 3765 } 3766 FREE_LOCK(&lk); 3767 3768 return (jsegdep); 3769 } 3770 3771 /* 3772 * After synchronous truncation is complete we free sync the vnode and 3773 * release the jsegdep so the journal space can be freed. 3774 */ 3775 int 3776 softdep_complete_trunc(vp, cookie) 3777 struct vnode *vp; 3778 void *cookie; 3779 { 3780 int error; 3781 3782 error = ffs_syncvnode(vp, MNT_WAIT); 3783 ACQUIRE_LOCK(&lk); 3784 free_jsegdep((struct jsegdep *)cookie); 3785 FREE_LOCK(&lk); 3786 3787 return (error); 3788 } 3789 3790 /* 3791 * Called prior to creating a new inode and linking it to a directory. The 3792 * jaddref structure must already be allocated by softdep_setup_inomapdep 3793 * and it is discovered here so we can initialize the mode and update 3794 * nlinkdelta. 3795 */ 3796 void 3797 softdep_setup_create(dp, ip) 3798 struct inode *dp; 3799 struct inode *ip; 3800 { 3801 struct inodedep *inodedep; 3802 struct jaddref *jaddref; 3803 struct vnode *dvp; 3804 3805 KASSERT(ip->i_nlink == 1, 3806 ("softdep_setup_create: Invalid link count.")); 3807 dvp = ITOV(dp); 3808 ACQUIRE_LOCK(&lk); 3809 inodedep = inodedep_lookup_ip(ip); 3810 if (DOINGSUJ(dvp)) { 3811 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 3812 inoreflst); 3813 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 3814 ("softdep_setup_create: No addref structure present.")); 3815 jaddref->ja_mode = ip->i_mode; 3816 } 3817 softdep_prelink(dvp, NULL); 3818 FREE_LOCK(&lk); 3819 } 3820 3821 /* 3822 * Create a jaddref structure to track the addition of a DOTDOT link when 3823 * we are reparenting an inode as part of a rename. This jaddref will be 3824 * found by softdep_setup_directory_change. Adjusts nlinkdelta for 3825 * non-journaling softdep. 
3826 */ 3827 void 3828 softdep_setup_dotdot_link(dp, ip) 3829 struct inode *dp; 3830 struct inode *ip; 3831 { 3832 struct inodedep *inodedep; 3833 struct jaddref *jaddref; 3834 struct vnode *dvp; 3835 struct vnode *vp; 3836 3837 dvp = ITOV(dp); 3838 vp = ITOV(ip); 3839 jaddref = NULL; 3840 /* 3841 * We don't set MKDIR_PARENT as this is not tied to a mkdir and 3842 * is used as a normal link would be. 3843 */ 3844 if (DOINGSUJ(dvp)) 3845 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 3846 dp->i_effnlink - 1, dp->i_mode); 3847 ACQUIRE_LOCK(&lk); 3848 inodedep = inodedep_lookup_ip(dp); 3849 if (jaddref) 3850 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 3851 if_deps); 3852 softdep_prelink(dvp, ITOV(ip)); 3853 FREE_LOCK(&lk); 3854 } 3855 3856 /* 3857 * Create a jaddref structure to track a new link to an inode. The directory 3858 * offset is not known until softdep_setup_directory_add or 3859 * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling 3860 * softdep. 3861 */ 3862 void 3863 softdep_setup_link(dp, ip) 3864 struct inode *dp; 3865 struct inode *ip; 3866 { 3867 struct inodedep *inodedep; 3868 struct jaddref *jaddref; 3869 struct vnode *dvp; 3870 3871 dvp = ITOV(dp); 3872 jaddref = NULL; 3873 if (DOINGSUJ(dvp)) 3874 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, 3875 ip->i_mode); 3876 ACQUIRE_LOCK(&lk); 3877 inodedep = inodedep_lookup_ip(ip); 3878 if (jaddref) 3879 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 3880 if_deps); 3881 softdep_prelink(dvp, ITOV(ip)); 3882 FREE_LOCK(&lk); 3883 } 3884 3885 /* 3886 * Called to create the jaddref structures to track . and .. references as 3887 * well as lookup and further initialize the incomplete jaddref created 3888 * by softdep_setup_inomapdep when the inode was allocated. Adjusts 3889 * nlinkdelta for non-journaling softdep. 3890 */ 3891 void 3892 softdep_setup_mkdir(dp, ip) 3893 struct inode *dp; 3894 struct inode *ip; 3895 { 3896 struct inodedep *inodedep; 3897 struct jaddref *dotdotaddref; 3898 struct jaddref *dotaddref; 3899 struct jaddref *jaddref; 3900 struct vnode *dvp; 3901 3902 dvp = ITOV(dp); 3903 dotaddref = dotdotaddref = NULL; 3904 if (DOINGSUJ(dvp)) { 3905 dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, 3906 ip->i_mode); 3907 dotaddref->ja_state |= MKDIR_BODY; 3908 dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 3909 dp->i_effnlink - 1, dp->i_mode); 3910 dotdotaddref->ja_state |= MKDIR_PARENT; 3911 } 3912 ACQUIRE_LOCK(&lk); 3913 inodedep = inodedep_lookup_ip(ip); 3914 if (DOINGSUJ(dvp)) { 3915 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 3916 inoreflst); 3917 KASSERT(jaddref != NULL, 3918 ("softdep_setup_mkdir: No addref structure present.")); 3919 KASSERT(jaddref->ja_parent == dp->i_number, 3920 ("softdep_setup_mkdir: bad parent %d", 3921 jaddref->ja_parent)); 3922 jaddref->ja_mode = ip->i_mode; 3923 TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, 3924 if_deps); 3925 } 3926 inodedep = inodedep_lookup_ip(dp); 3927 if (DOINGSUJ(dvp)) 3928 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, 3929 &dotdotaddref->ja_ref, if_deps); 3930 softdep_prelink(ITOV(dp), NULL); 3931 FREE_LOCK(&lk); 3932 } 3933 3934 /* 3935 * Called to track nlinkdelta of the inode and parent directories prior to 3936 * unlinking a directory. 
3937 */ 3938 void 3939 softdep_setup_rmdir(dp, ip) 3940 struct inode *dp; 3941 struct inode *ip; 3942 { 3943 struct vnode *dvp; 3944 3945 dvp = ITOV(dp); 3946 ACQUIRE_LOCK(&lk); 3947 (void) inodedep_lookup_ip(ip); 3948 (void) inodedep_lookup_ip(dp); 3949 softdep_prelink(dvp, ITOV(ip)); 3950 FREE_LOCK(&lk); 3951 } 3952 3953 /* 3954 * Called to track nlinkdelta of the inode and parent directories prior to 3955 * unlink. 3956 */ 3957 void 3958 softdep_setup_unlink(dp, ip) 3959 struct inode *dp; 3960 struct inode *ip; 3961 { 3962 struct vnode *dvp; 3963 3964 dvp = ITOV(dp); 3965 ACQUIRE_LOCK(&lk); 3966 (void) inodedep_lookup_ip(ip); 3967 (void) inodedep_lookup_ip(dp); 3968 softdep_prelink(dvp, ITOV(ip)); 3969 FREE_LOCK(&lk); 3970 } 3971 3972 /* 3973 * Called to release the journal structures created by a failed non-directory 3974 * creation. Adjusts nlinkdelta for non-journaling softdep. 3975 */ 3976 void 3977 softdep_revert_create(dp, ip) 3978 struct inode *dp; 3979 struct inode *ip; 3980 { 3981 struct inodedep *inodedep; 3982 struct jaddref *jaddref; 3983 struct vnode *dvp; 3984 3985 dvp = ITOV(dp); 3986 ACQUIRE_LOCK(&lk); 3987 inodedep = inodedep_lookup_ip(ip); 3988 if (DOINGSUJ(dvp)) { 3989 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 3990 inoreflst); 3991 KASSERT(jaddref->ja_parent == dp->i_number, 3992 ("softdep_revert_create: addref parent mismatch")); 3993 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 3994 } 3995 FREE_LOCK(&lk); 3996 } 3997 3998 /* 3999 * Called to release the journal structures created by a failed dotdot link 4000 * creation. Adjusts nlinkdelta for non-journaling softdep. 4001 */ 4002 void 4003 softdep_revert_dotdot_link(dp, ip) 4004 struct inode *dp; 4005 struct inode *ip; 4006 { 4007 struct inodedep *inodedep; 4008 struct jaddref *jaddref; 4009 struct vnode *dvp; 4010 4011 dvp = ITOV(dp); 4012 ACQUIRE_LOCK(&lk); 4013 inodedep = inodedep_lookup_ip(dp); 4014 if (DOINGSUJ(dvp)) { 4015 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4016 inoreflst); 4017 KASSERT(jaddref->ja_parent == ip->i_number, 4018 ("softdep_revert_dotdot_link: addref parent mismatch")); 4019 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4020 } 4021 FREE_LOCK(&lk); 4022 } 4023 4024 /* 4025 * Called to release the journal structures created by a failed link 4026 * addition. Adjusts nlinkdelta for non-journaling softdep. 4027 */ 4028 void 4029 softdep_revert_link(dp, ip) 4030 struct inode *dp; 4031 struct inode *ip; 4032 { 4033 struct inodedep *inodedep; 4034 struct jaddref *jaddref; 4035 struct vnode *dvp; 4036 4037 dvp = ITOV(dp); 4038 ACQUIRE_LOCK(&lk); 4039 inodedep = inodedep_lookup_ip(ip); 4040 if (DOINGSUJ(dvp)) { 4041 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4042 inoreflst); 4043 KASSERT(jaddref->ja_parent == dp->i_number, 4044 ("softdep_revert_link: addref parent mismatch")); 4045 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4046 } 4047 FREE_LOCK(&lk); 4048 } 4049 4050 /* 4051 * Called to release the journal structures created by a failed mkdir 4052 * attempt. Adjusts nlinkdelta for non-journaling softdep. 
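 *
 * Roughly speaking, this cancels the jaddref queued on the parent
 * directory's inodedep for the new ".." entry as well as the two
 * jaddrefs still queued on the new directory's inodedep (see the
 * cancel_jaddref calls below).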
4053 */ 4054 void 4055 softdep_revert_mkdir(dp, ip) 4056 struct inode *dp; 4057 struct inode *ip; 4058 { 4059 struct inodedep *inodedep; 4060 struct jaddref *jaddref; 4061 struct vnode *dvp; 4062 4063 dvp = ITOV(dp); 4064 4065 ACQUIRE_LOCK(&lk); 4066 inodedep = inodedep_lookup_ip(dp); 4067 if (DOINGSUJ(dvp)) { 4068 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4069 inoreflst); 4070 KASSERT(jaddref->ja_parent == ip->i_number, 4071 ("softdep_revert_mkdir: dotdot addref parent mismatch")); 4072 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4073 } 4074 inodedep = inodedep_lookup_ip(ip); 4075 if (DOINGSUJ(dvp)) { 4076 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4077 inoreflst); 4078 KASSERT(jaddref->ja_parent == dp->i_number, 4079 ("softdep_revert_mkdir: addref parent mismatch")); 4080 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4081 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4082 inoreflst); 4083 KASSERT(jaddref->ja_parent == ip->i_number, 4084 ("softdep_revert_mkdir: dot addref parent mismatch")); 4085 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4086 } 4087 FREE_LOCK(&lk); 4088 } 4089 4090 /* 4091 * Called to correct nlinkdelta after a failed rmdir. 4092 */ 4093 void 4094 softdep_revert_rmdir(dp, ip) 4095 struct inode *dp; 4096 struct inode *ip; 4097 { 4098 4099 ACQUIRE_LOCK(&lk); 4100 (void) inodedep_lookup_ip(ip); 4101 (void) inodedep_lookup_ip(dp); 4102 FREE_LOCK(&lk); 4103 } 4104 4105 /* 4106 * Protecting the freemaps (or bitmaps). 4107 * 4108 * To eliminate the need to execute fsck before mounting a filesystem 4109 * after a power failure, one must (conservatively) guarantee that the 4110 * on-disk copy of the bitmaps never indicate that a live inode or block is 4111 * free. So, when a block or inode is allocated, the bitmap should be 4112 * updated (on disk) before any new pointers. When a block or inode is 4113 * freed, the bitmap should not be updated until all pointers have been 4114 * reset. The latter dependency is handled by the delayed de-allocation 4115 * approach described below for block and inode de-allocation. The former 4116 * dependency is handled by calling the following procedure when a block or 4117 * inode is allocated. When an inode is allocated an "inodedep" is created 4118 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. 4119 * Each "inodedep" is also inserted into the hash indexing structure so 4120 * that any additional link additions can be made dependent on the inode 4121 * allocation. 4122 * 4123 * The ufs filesystem maintains a number of free block counts (e.g., per 4124 * cylinder group, per cylinder and per <cylinder, rotational position> pair) 4125 * in addition to the bitmaps. These counts are used to improve efficiency 4126 * during allocation and therefore must be consistent with the bitmaps. 4127 * There is no convenient way to guarantee post-crash consistency of these 4128 * counts with simple update ordering, for two main reasons: (1) The counts 4129 * and bitmaps for a single cylinder group block are not in the same disk 4130 * sector. If a disk write is interrupted (e.g., by power failure), one may 4131 * be written and the other not. (2) Some of the counts are located in the 4132 * superblock rather than the cylinder group block. So, we focus our soft 4133 * updates implementation on protecting the bitmaps. When mounting a 4134 * filesystem, we recompute the auxiliary counts from the bitmaps. 
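 *
 * In other words, the required write ordering is, as a sketch:
 *
 *	allocation:   write the updated bitmap (cylinder group buffer)
 *	              before any inode or indirect block that points at
 *	              the newly allocated inode or block.
 *	deallocation: clear and write every pointer to the inode or
 *	              block before writing the bitmap that marks it
 *	              free again.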
4135 */
4136
4137 /*
4138 * Called just after updating the cylinder group block to allocate an inode.
4139 */
4140 void
4141 softdep_setup_inomapdep(bp, ip, newinum)
4142 struct buf *bp; /* buffer for cylgroup block with inode map */
4143 struct inode *ip; /* inode related to allocation */
4144 ino_t newinum; /* new inode number being allocated */
4145 {
4146 struct inodedep *inodedep;
4147 struct bmsafemap *bmsafemap;
4148 struct jaddref *jaddref;
4149 struct mount *mp;
4150 struct fs *fs;
4151
4152 mp = UFSTOVFS(ip->i_ump);
4153 fs = ip->i_ump->um_fs;
4154 jaddref = NULL;
4155
4156 /*
4157 * Allocate the journal reference add structure so that the bitmap
4158 * can be dependent on it.
4159 */
4160 if (mp->mnt_kern_flag & MNTK_SUJ) {
4161 jaddref = newjaddref(ip, newinum, 0, 0, 0);
4162 jaddref->ja_state |= NEWBLOCK;
4163 }
4164
4165 /*
4166 * Create a dependency for the newly allocated inode.
4167 * Panic if it already exists as something is seriously wrong.
4168 * Otherwise add it to the dependency list for the buffer holding
4169 * the cylinder group map from which it was allocated.
4170 */
4171 ACQUIRE_LOCK(&lk);
4172 if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep)))
4173 panic("softdep_setup_inomapdep: dependency %p for new "
4174 "inode already exists", inodedep);
4175 bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
4176 if (jaddref) {
4177 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
4178 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4179 if_deps);
4180 } else {
4181 inodedep->id_state |= ONDEPLIST;
4182 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
4183 }
4184 inodedep->id_bmsafemap = bmsafemap;
4185 inodedep->id_state &= ~DEPCOMPLETE;
4186 FREE_LOCK(&lk);
4187 }
4188
4189 /*
4190 * Called just after updating the cylinder group block to
4191 * allocate block or fragment.
4192 */
4193 void
4194 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
4195 struct buf *bp; /* buffer for cylgroup block with block map */
4196 struct mount *mp; /* filesystem doing allocation */
4197 ufs2_daddr_t newblkno; /* number of newly allocated block */
4198 int frags; /* Number of fragments. */
4199 int oldfrags; /* Previous number of fragments for extend. */
4200 {
4201 struct newblk *newblk;
4202 struct bmsafemap *bmsafemap;
4203 struct jnewblk *jnewblk;
4204 struct fs *fs;
4205
4206 fs = VFSTOUFS(mp)->um_fs;
4207 jnewblk = NULL;
4208 /*
4209 * Create a dependency for the newly allocated block.
4210 * Add it to the dependency list for the buffer holding
4211 * the cylinder group map from which it was allocated.
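 * When journaling is active (MNTK_SUJ) a jnewblk recording the block
 * number and fragment range is created and linked to the bmsafemap;
 * otherwise the newblk itself is placed on the bmsafemap's dependency
 * list.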
4212 */ 4213 if (mp->mnt_kern_flag & MNTK_SUJ) { 4214 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); 4215 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); 4216 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); 4217 jnewblk->jn_state = ATTACHED; 4218 jnewblk->jn_blkno = newblkno; 4219 jnewblk->jn_frags = frags; 4220 jnewblk->jn_oldfrags = oldfrags; 4221 #ifdef SUJ_DEBUG 4222 { 4223 struct cg *cgp; 4224 uint8_t *blksfree; 4225 long bno; 4226 int i; 4227 4228 cgp = (struct cg *)bp->b_data; 4229 blksfree = cg_blksfree(cgp); 4230 bno = dtogd(fs, jnewblk->jn_blkno); 4231 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 4232 i++) { 4233 if (isset(blksfree, bno + i)) 4234 panic("softdep_setup_blkmapdep: " 4235 "free fragment %d from %d-%d " 4236 "state 0x%X dep %p", i, 4237 jnewblk->jn_oldfrags, 4238 jnewblk->jn_frags, 4239 jnewblk->jn_state, 4240 jnewblk->jn_newblk); 4241 } 4242 } 4243 #endif 4244 } 4245 ACQUIRE_LOCK(&lk); 4246 if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) 4247 panic("softdep_setup_blkmapdep: found block"); 4248 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, 4249 dtog(fs, newblkno)); 4250 if (jnewblk) { 4251 jnewblk->jn_newblk = newblk; 4252 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); 4253 } else { 4254 newblk->nb_state |= ONDEPLIST; 4255 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 4256 } 4257 newblk->nb_bmsafemap = bmsafemap; 4258 newblk->nb_jnewblk = jnewblk; 4259 FREE_LOCK(&lk); 4260 } 4261 4262 #define BMSAFEMAP_HASH(fs, cg) \ 4263 (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash]) 4264 4265 static int 4266 bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp) 4267 struct bmsafemap_hashhead *bmsafemaphd; 4268 struct mount *mp; 4269 int cg; 4270 struct bmsafemap **bmsafemapp; 4271 { 4272 struct bmsafemap *bmsafemap; 4273 4274 LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) 4275 if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg) 4276 break; 4277 if (bmsafemap) { 4278 *bmsafemapp = bmsafemap; 4279 return (1); 4280 } 4281 *bmsafemapp = NULL; 4282 4283 return (0); 4284 } 4285 4286 /* 4287 * Find the bmsafemap associated with a cylinder group buffer. 4288 * If none exists, create one. The buffer must be locked when 4289 * this routine is called and this routine must be called with 4290 * splbio interrupts blocked. 
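 * Because the lock must be dropped around the allocation below, the
 * hash chain is searched again after it is reacquired; if another
 * thread raced us and created the bmsafemap first, the freshly
 * allocated one is discarded and the existing one returned.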
4291 */ 4292 static struct bmsafemap * 4293 bmsafemap_lookup(mp, bp, cg) 4294 struct mount *mp; 4295 struct buf *bp; 4296 int cg; 4297 { 4298 struct bmsafemap_hashhead *bmsafemaphd; 4299 struct bmsafemap *bmsafemap, *collision; 4300 struct worklist *wk; 4301 struct fs *fs; 4302 4303 mtx_assert(&lk, MA_OWNED); 4304 if (bp) 4305 LIST_FOREACH(wk, &bp->b_dep, wk_list) 4306 if (wk->wk_type == D_BMSAFEMAP) 4307 return (WK_BMSAFEMAP(wk)); 4308 fs = VFSTOUFS(mp)->um_fs; 4309 bmsafemaphd = BMSAFEMAP_HASH(fs, cg); 4310 if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) 4311 return (bmsafemap); 4312 FREE_LOCK(&lk); 4313 bmsafemap = malloc(sizeof(struct bmsafemap), 4314 M_BMSAFEMAP, M_SOFTDEP_FLAGS); 4315 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); 4316 bmsafemap->sm_buf = bp; 4317 LIST_INIT(&bmsafemap->sm_inodedephd); 4318 LIST_INIT(&bmsafemap->sm_inodedepwr); 4319 LIST_INIT(&bmsafemap->sm_newblkhd); 4320 LIST_INIT(&bmsafemap->sm_newblkwr); 4321 LIST_INIT(&bmsafemap->sm_jaddrefhd); 4322 LIST_INIT(&bmsafemap->sm_jnewblkhd); 4323 ACQUIRE_LOCK(&lk); 4324 if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) { 4325 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 4326 return (collision); 4327 } 4328 bmsafemap->sm_cg = cg; 4329 LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); 4330 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 4331 return (bmsafemap); 4332 } 4333 4334 /* 4335 * Direct block allocation dependencies. 4336 * 4337 * When a new block is allocated, the corresponding disk locations must be 4338 * initialized (with zeros or new data) before the on-disk inode points to 4339 * them. Also, the freemap from which the block was allocated must be 4340 * updated (on disk) before the inode's pointer. These two dependencies are 4341 * independent of each other and are needed for all file blocks and indirect 4342 * blocks that are pointed to directly by the inode. Just before the 4343 * "in-core" version of the inode is updated with a newly allocated block 4344 * number, a procedure (below) is called to setup allocation dependency 4345 * structures. These structures are removed when the corresponding 4346 * dependencies are satisfied or when the block allocation becomes obsolete 4347 * (i.e., the file is deleted, the block is de-allocated, or the block is a 4348 * fragment that gets upgraded). All of these cases are handled in 4349 * procedures described later. 4350 * 4351 * When a file extension causes a fragment to be upgraded, either to a larger 4352 * fragment or to a full block, the on-disk location may change (if the 4353 * previous fragment could not simply be extended). In this case, the old 4354 * fragment must be de-allocated, but not until after the inode's pointer has 4355 * been updated. In most cases, this is handled by later procedures, which 4356 * will construct a "freefrag" structure to be added to the workitem queue 4357 * when the inode update is complete (or obsolete). The main exception to 4358 * this is when an allocation occurs while a pending allocation dependency 4359 * (for the same block pointer) remains. This case is handled in the main 4360 * allocation dependency setup procedure by immediately freeing the 4361 * unreferenced fragments. 
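 *
 * As a concrete example (sketch): when a trailing fragment is upgraded
 * to a larger fragment or a full block at a different disk address, an
 * allocdirect is set up for the new location and a freefrag is queued
 * so that the old fragment is released only after the inode pointing
 * at the new location has been written to disk.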
4362 */
4363 void
4364 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4365 struct inode *ip; /* inode to which block is being added */
4366 ufs_lbn_t off; /* block pointer within inode */
4367 ufs2_daddr_t newblkno; /* disk block number being added */
4368 ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */
4369 long newsize; /* size of new block */
4370 long oldsize; /* size of old block */
4371 struct buf *bp; /* bp for allocated block */
4372 {
4373 struct allocdirect *adp, *oldadp;
4374 struct allocdirectlst *adphead;
4375 struct freefrag *freefrag;
4376 struct inodedep *inodedep;
4377 struct pagedep *pagedep;
4378 struct jnewblk *jnewblk;
4379 struct newblk *newblk;
4380 struct mount *mp;
4381 ufs_lbn_t lbn;
4382
4383 lbn = bp->b_lblkno;
4384 mp = UFSTOVFS(ip->i_ump);
4385 if (oldblkno && oldblkno != newblkno)
4386 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4387 else
4388 freefrag = NULL;
4389
4390 ACQUIRE_LOCK(&lk);
4391 if (off >= NDADDR) {
4392 if (lbn > 0)
4393 panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
4394 lbn, off);
4395 /* allocating an indirect block */
4396 if (oldblkno != 0)
4397 panic("softdep_setup_allocdirect: non-zero indir");
4398 } else {
4399 if (off != lbn)
4400 panic("softdep_setup_allocdirect: lbn %jd != off %jd",
4401 lbn, off);
4402 /*
4403 * Allocating a direct block.
4404 *
4405 * If we are allocating a directory block, then we must
4406 * allocate an associated pagedep to track additions and
4407 * deletions.
4408 */
4409 if ((ip->i_mode & IFMT) == IFDIR &&
4410 pagedep_lookup(mp, ip->i_number, off, DEPALLOC,
4411 &pagedep) == 0)
4412 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
4413 }
4414 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4415 panic("softdep_setup_allocdirect: lost block");
4416 KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4417 ("softdep_setup_allocdirect: newblk already initialized"));
4418 /*
4419 * Convert the newblk to an allocdirect.
4420 */
4421 newblk->nb_list.wk_type = D_ALLOCDIRECT;
4422 adp = (struct allocdirect *)newblk;
4423 newblk->nb_freefrag = freefrag;
4424 adp->ad_offset = off;
4425 adp->ad_oldblkno = oldblkno;
4426 adp->ad_newsize = newsize;
4427 adp->ad_oldsize = oldsize;
4428
4429 /*
4430 * Finish initializing the journal.
4431 */
4432 if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4433 jnewblk->jn_ino = ip->i_number;
4434 jnewblk->jn_lbn = lbn;
4435 add_to_journal(&jnewblk->jn_list);
4436 }
4437 if (freefrag && freefrag->ff_jfreefrag != NULL)
4438 add_to_journal(&freefrag->ff_jfreefrag->fr_list);
4439 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
4440 adp->ad_inodedep = inodedep;
4441
4442 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
4443 /*
4444 * The list of allocdirects must be kept in sorted and ascending
4445 * order so that the rollback routines can quickly determine the
4446 * first uncommitted block (the size of the file stored on disk
4447 * ends at the end of the lowest committed fragment, or if there
4448 * are no fragments, at the end of the highest committed block).
4449 * Since files generally grow, the typical case is that the new
4450 * block is to be added at the end of the list. We speed this
4451 * special case by checking against the last allocdirect in the
4452 * list before laboriously traversing the list looking for the
4453 * insertion point.
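 * For example (assuming no trailing fragment), if the allocdirect for
 * logical block 3 is the first uncommitted entry in the list, the
 * rollback code reports the on-disk file size as ending with block 2
 * until that dependency is satisfied.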
4454 */ 4455 adphead = &inodedep->id_newinoupdt; 4456 oldadp = TAILQ_LAST(adphead, allocdirectlst); 4457 if (oldadp == NULL || oldadp->ad_offset <= off) { 4458 /* insert at end of list */ 4459 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 4460 if (oldadp != NULL && oldadp->ad_offset == off) 4461 allocdirect_merge(adphead, adp, oldadp); 4462 FREE_LOCK(&lk); 4463 return; 4464 } 4465 TAILQ_FOREACH(oldadp, adphead, ad_next) { 4466 if (oldadp->ad_offset >= off) 4467 break; 4468 } 4469 if (oldadp == NULL) 4470 panic("softdep_setup_allocdirect: lost entry"); 4471 /* insert in middle of list */ 4472 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 4473 if (oldadp->ad_offset == off) 4474 allocdirect_merge(adphead, adp, oldadp); 4475 4476 FREE_LOCK(&lk); 4477 } 4478 4479 /* 4480 * Replace an old allocdirect dependency with a newer one. 4481 * This routine must be called with splbio interrupts blocked. 4482 */ 4483 static void 4484 allocdirect_merge(adphead, newadp, oldadp) 4485 struct allocdirectlst *adphead; /* head of list holding allocdirects */ 4486 struct allocdirect *newadp; /* allocdirect being added */ 4487 struct allocdirect *oldadp; /* existing allocdirect being checked */ 4488 { 4489 struct worklist *wk; 4490 struct freefrag *freefrag; 4491 struct newdirblk *newdirblk; 4492 4493 freefrag = NULL; 4494 mtx_assert(&lk, MA_OWNED); 4495 if (newadp->ad_oldblkno != oldadp->ad_newblkno || 4496 newadp->ad_oldsize != oldadp->ad_newsize || 4497 newadp->ad_offset >= NDADDR) 4498 panic("%s %jd != new %jd || old size %ld != new %ld", 4499 "allocdirect_merge: old blkno", 4500 (intmax_t)newadp->ad_oldblkno, 4501 (intmax_t)oldadp->ad_newblkno, 4502 newadp->ad_oldsize, oldadp->ad_newsize); 4503 newadp->ad_oldblkno = oldadp->ad_oldblkno; 4504 newadp->ad_oldsize = oldadp->ad_oldsize; 4505 /* 4506 * If the old dependency had a fragment to free or had never 4507 * previously had a block allocated, then the new dependency 4508 * can immediately post its freefrag and adopt the old freefrag. 4509 * This action is done by swapping the freefrag dependencies. 4510 * The new dependency gains the old one's freefrag, and the 4511 * old one gets the new one and then immediately puts it on 4512 * the worklist when it is freed by free_newblk. It is 4513 * not possible to do this swap when the old dependency had a 4514 * non-zero size but no previous fragment to free. This condition 4515 * arises when the new block is an extension of the old block. 4516 * Here, the first part of the fragment allocated to the new 4517 * dependency is part of the block currently claimed on disk by 4518 * the old dependency, so cannot legitimately be freed until the 4519 * conditions for the new dependency are fulfilled. 4520 */ 4521 freefrag = newadp->ad_freefrag; 4522 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { 4523 newadp->ad_freefrag = oldadp->ad_freefrag; 4524 oldadp->ad_freefrag = freefrag; 4525 } 4526 /* 4527 * If we are tracking a new directory-block allocation, 4528 * move it from the old allocdirect to the new allocdirect. 4529 */ 4530 if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { 4531 newdirblk = WK_NEWDIRBLK(wk); 4532 WORKLIST_REMOVE(&newdirblk->db_list); 4533 if (!LIST_EMPTY(&oldadp->ad_newdirblk)) 4534 panic("allocdirect_merge: extra newdirblk"); 4535 WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list); 4536 } 4537 TAILQ_REMOVE(adphead, oldadp, ad_next); 4538 /* 4539 * We need to move any journal dependencies over to the freefrag 4540 * that releases this block if it exists. 
Otherwise we are 4541 * extending an existing block and we'll wait until that is 4542 * complete to release the journal space and extend the 4543 * new journal to cover this old space as well. 4544 */ 4545 if (freefrag == NULL) { 4546 struct jnewblk *jnewblk; 4547 struct jnewblk *njnewblk; 4548 4549 if (oldadp->ad_newblkno != newadp->ad_newblkno) 4550 panic("allocdirect_merge: %jd != %jd", 4551 oldadp->ad_newblkno, newadp->ad_newblkno); 4552 jnewblk = oldadp->ad_block.nb_jnewblk; 4553 cancel_newblk(&oldadp->ad_block, &newadp->ad_block.nb_jwork); 4554 /* 4555 * We have an unwritten jnewblk, we need to merge the 4556 * frag bits with our own. The newer adp's journal can not 4557 * be written prior to the old one so no need to check for 4558 * it here. 4559 */ 4560 if (jnewblk) { 4561 njnewblk = newadp->ad_block.nb_jnewblk; 4562 if (njnewblk == NULL) 4563 panic("allocdirect_merge: No jnewblk"); 4564 if (jnewblk->jn_state & UNDONE) { 4565 njnewblk->jn_state |= UNDONE | NEWBLOCK; 4566 njnewblk->jn_state &= ~ATTACHED; 4567 jnewblk->jn_state &= ~UNDONE; 4568 } 4569 njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; 4570 WORKLIST_REMOVE(&jnewblk->jn_list); 4571 jnewblk->jn_state |= ATTACHED | COMPLETE; 4572 free_jnewblk(jnewblk); 4573 } 4574 } else { 4575 /* 4576 * We can skip journaling for this freefrag and just complete 4577 * any pending journal work for the allocdirect that is being 4578 * removed after the freefrag completes. 4579 */ 4580 if (freefrag->ff_jfreefrag) 4581 cancel_jfreefrag(freefrag->ff_jfreefrag); 4582 cancel_newblk(&oldadp->ad_block, &freefrag->ff_jwork); 4583 } 4584 free_newblk(&oldadp->ad_block); 4585 } 4586 4587 /* 4588 * Allocate a jfreefrag structure to journal a single block free. 4589 */ 4590 static struct jfreefrag * 4591 newjfreefrag(freefrag, ip, blkno, size, lbn) 4592 struct freefrag *freefrag; 4593 struct inode *ip; 4594 ufs2_daddr_t blkno; 4595 long size; 4596 ufs_lbn_t lbn; 4597 { 4598 struct jfreefrag *jfreefrag; 4599 struct fs *fs; 4600 4601 fs = ip->i_fs; 4602 jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, 4603 M_SOFTDEP_FLAGS); 4604 workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump)); 4605 jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); 4606 jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; 4607 jfreefrag->fr_ino = ip->i_number; 4608 jfreefrag->fr_lbn = lbn; 4609 jfreefrag->fr_blkno = blkno; 4610 jfreefrag->fr_frags = numfrags(fs, size); 4611 jfreefrag->fr_freefrag = freefrag; 4612 4613 return (jfreefrag); 4614 } 4615 4616 /* 4617 * Allocate a new freefrag structure. 
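 * A freefrag is created when an existing block pointer is being
 * replaced (for example when a fragment is reallocated elsewhere); the
 * old fragment may only be released once the new pointer is safely on
 * disk, so the free is deferred through the work queue.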
4618 */ 4619 static struct freefrag * 4620 newfreefrag(ip, blkno, size, lbn) 4621 struct inode *ip; 4622 ufs2_daddr_t blkno; 4623 long size; 4624 ufs_lbn_t lbn; 4625 { 4626 struct freefrag *freefrag; 4627 struct fs *fs; 4628 4629 fs = ip->i_fs; 4630 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 4631 panic("newfreefrag: frag size"); 4632 freefrag = malloc(sizeof(struct freefrag), 4633 M_FREEFRAG, M_SOFTDEP_FLAGS); 4634 workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump)); 4635 freefrag->ff_state = ATTACHED; 4636 LIST_INIT(&freefrag->ff_jwork); 4637 freefrag->ff_inum = ip->i_number; 4638 freefrag->ff_blkno = blkno; 4639 freefrag->ff_fragsize = size; 4640 4641 if (fs->fs_flags & FS_SUJ) { 4642 freefrag->ff_jfreefrag = 4643 newjfreefrag(freefrag, ip, blkno, size, lbn); 4644 } else { 4645 freefrag->ff_state |= DEPCOMPLETE; 4646 freefrag->ff_jfreefrag = NULL; 4647 } 4648 4649 return (freefrag); 4650 } 4651 4652 /* 4653 * This workitem de-allocates fragments that were replaced during 4654 * file block allocation. 4655 */ 4656 static void 4657 handle_workitem_freefrag(freefrag) 4658 struct freefrag *freefrag; 4659 { 4660 struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); 4661 struct workhead wkhd; 4662 4663 /* 4664 * It would be illegal to add new completion items to the 4665 * freefrag after it was schedule to be done so it must be 4666 * safe to modify the list head here. 4667 */ 4668 LIST_INIT(&wkhd); 4669 LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); 4670 ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, 4671 freefrag->ff_fragsize, freefrag->ff_inum, &wkhd); 4672 ACQUIRE_LOCK(&lk); 4673 WORKITEM_FREE(freefrag, D_FREEFRAG); 4674 FREE_LOCK(&lk); 4675 } 4676 4677 /* 4678 * Set up a dependency structure for an external attributes data block. 4679 * This routine follows much of the structure of softdep_setup_allocdirect. 4680 * See the description of softdep_setup_allocdirect above for details. 4681 */ 4682 void 4683 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) 4684 struct inode *ip; 4685 ufs_lbn_t off; 4686 ufs2_daddr_t newblkno; 4687 ufs2_daddr_t oldblkno; 4688 long newsize; 4689 long oldsize; 4690 struct buf *bp; 4691 { 4692 struct allocdirect *adp, *oldadp; 4693 struct allocdirectlst *adphead; 4694 struct freefrag *freefrag; 4695 struct inodedep *inodedep; 4696 struct jnewblk *jnewblk; 4697 struct newblk *newblk; 4698 struct mount *mp; 4699 ufs_lbn_t lbn; 4700 4701 if (off >= NXADDR) 4702 panic("softdep_setup_allocext: lbn %lld > NXADDR", 4703 (long long)off); 4704 4705 lbn = bp->b_lblkno; 4706 mp = UFSTOVFS(ip->i_ump); 4707 if (oldblkno && oldblkno != newblkno) 4708 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); 4709 else 4710 freefrag = NULL; 4711 4712 ACQUIRE_LOCK(&lk); 4713 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) 4714 panic("softdep_setup_allocext: lost block"); 4715 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 4716 ("softdep_setup_allocext: newblk already initialized")); 4717 /* 4718 * Convert the newblk to an allocdirect. 4719 */ 4720 newblk->nb_list.wk_type = D_ALLOCDIRECT; 4721 adp = (struct allocdirect *)newblk; 4722 newblk->nb_freefrag = freefrag; 4723 adp->ad_offset = off; 4724 adp->ad_oldblkno = oldblkno; 4725 adp->ad_newsize = newsize; 4726 adp->ad_oldsize = oldsize; 4727 adp->ad_state |= EXTDATA; 4728 4729 /* 4730 * Finish initializing the journal. 
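 * The jnewblk allocated in softdep_setup_blkmapdep only now learns the
 * owning inode and logical block number, so it (and any jfreefrag for
 * a replaced fragment) is added to the journal at this point.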
4731 */ 4732 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 4733 jnewblk->jn_ino = ip->i_number; 4734 jnewblk->jn_lbn = lbn; 4735 add_to_journal(&jnewblk->jn_list); 4736 } 4737 if (freefrag && freefrag->ff_jfreefrag != NULL) 4738 add_to_journal(&freefrag->ff_jfreefrag->fr_list); 4739 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); 4740 adp->ad_inodedep = inodedep; 4741 4742 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); 4743 /* 4744 * The list of allocdirects must be kept in sorted and ascending 4745 * order so that the rollback routines can quickly determine the 4746 * first uncommitted block (the size of the file stored on disk 4747 * ends at the end of the lowest committed fragment, or if there 4748 * are no fragments, at the end of the highest committed block). 4749 * Since files generally grow, the typical case is that the new 4750 * block is to be added at the end of the list. We speed this 4751 * special case by checking against the last allocdirect in the 4752 * list before laboriously traversing the list looking for the 4753 * insertion point. 4754 */ 4755 adphead = &inodedep->id_newextupdt; 4756 oldadp = TAILQ_LAST(adphead, allocdirectlst); 4757 if (oldadp == NULL || oldadp->ad_offset <= off) { 4758 /* insert at end of list */ 4759 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 4760 if (oldadp != NULL && oldadp->ad_offset == off) 4761 allocdirect_merge(adphead, adp, oldadp); 4762 FREE_LOCK(&lk); 4763 return; 4764 } 4765 TAILQ_FOREACH(oldadp, adphead, ad_next) { 4766 if (oldadp->ad_offset >= off) 4767 break; 4768 } 4769 if (oldadp == NULL) 4770 panic("softdep_setup_allocext: lost entry"); 4771 /* insert in middle of list */ 4772 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 4773 if (oldadp->ad_offset == off) 4774 allocdirect_merge(adphead, adp, oldadp); 4775 FREE_LOCK(&lk); 4776 } 4777 4778 /* 4779 * Indirect block allocation dependencies. 4780 * 4781 * The same dependencies that exist for a direct block also exist when 4782 * a new block is allocated and pointed to by an entry in a block of 4783 * indirect pointers. The undo/redo states described above are also 4784 * used here. Because an indirect block contains many pointers that 4785 * may have dependencies, a second copy of the entire in-memory indirect 4786 * block is kept. The buffer cache copy is always completely up-to-date. 4787 * The second copy, which is used only as a source for disk writes, 4788 * contains only the safe pointers (i.e., those that have no remaining 4789 * update dependencies). The second copy is freed when all pointers 4790 * are safe. The cache is not allowed to replace indirect blocks with 4791 * pending update dependencies. If a buffer containing an indirect 4792 * block with dependencies is written, these routines will mark it 4793 * dirty again. It can only be successfully written once all the 4794 * dependencies are removed. The ffs_fsync routine in conjunction with 4795 * softdep_sync_metadata work together to get all the dependencies 4796 * removed so that a file can be successfully written to disk. Three 4797 * procedures are used when setting up indirect block pointer 4798 * dependencies. The division is necessary because of the organization 4799 * of the "balloc" routine and because of the distinction between file 4800 * pages and file metadata blocks. 4801 */ 4802 4803 /* 4804 * Allocate a new allocindir structure. 
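 * The allocindir records the pointer's offset within the indirect
 * block and the previous block number, if any, so that the rollback
 * and freefrag handling described above can also be applied to
 * indirect pointers.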
4805 */
4806 static struct allocindir *
4807 newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
4808 struct inode *ip; /* inode for file being extended */
4809 int ptrno; /* offset of pointer in indirect block */
4810 ufs2_daddr_t newblkno; /* disk block number being added */
4811 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
4812 ufs_lbn_t lbn;
4813 {
4814 struct newblk *newblk;
4815 struct allocindir *aip;
4816 struct freefrag *freefrag;
4817 struct jnewblk *jnewblk;
4818
4819 if (oldblkno)
4820 freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
4821 else
4822 freefrag = NULL;
4823 ACQUIRE_LOCK(&lk);
4824 if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
4825 panic("newallocindir: lost block");
4826 KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4827 ("newallocindir: newblk already initialized"));
4828 newblk->nb_list.wk_type = D_ALLOCINDIR;
4829 newblk->nb_freefrag = freefrag;
4830 aip = (struct allocindir *)newblk;
4831 aip->ai_offset = ptrno;
4832 aip->ai_oldblkno = oldblkno;
4833 if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4834 jnewblk->jn_ino = ip->i_number;
4835 jnewblk->jn_lbn = lbn;
4836 add_to_journal(&jnewblk->jn_list);
4837 }
4838 if (freefrag && freefrag->ff_jfreefrag != NULL)
4839 add_to_journal(&freefrag->ff_jfreefrag->fr_list);
4840 return (aip);
4841 }
4842
4843 /*
4844 * Called just before setting an indirect block pointer
4845 * to a newly allocated file page.
4846 */
4847 void
4848 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
4849 struct inode *ip; /* inode for file being extended */
4850 ufs_lbn_t lbn; /* allocated block number within file */
4851 struct buf *bp; /* buffer with indirect blk referencing page */
4852 int ptrno; /* offset of pointer in indirect block */
4853 ufs2_daddr_t newblkno; /* disk block number being added */
4854 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
4855 struct buf *nbp; /* buffer holding allocated page */
4856 {
4857 struct inodedep *inodedep;
4858 struct allocindir *aip;
4859 struct pagedep *pagedep;
4860 struct mount *mp;
4861
4862 if (lbn != nbp->b_lblkno)
4863 panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
4864 lbn, nbp->b_lblkno);
4865 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
4866 mp = UFSTOVFS(ip->i_ump);
4867 aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
4868 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
4869 /*
4870 * If we are allocating a directory page, then we must
4871 * allocate an associated pagedep to track additions and
4872 * deletions.
4873 */
4874 if ((ip->i_mode & IFMT) == IFDIR &&
4875 pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0)
4876 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
4877 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
4878 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
4879 FREE_LOCK(&lk);
4880 }
4881
4882 /*
4883 * Called just before setting an indirect block pointer to a
4884 * newly allocated indirect block.
4885 */ 4886 void 4887 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 4888 struct buf *nbp; /* newly allocated indirect block */ 4889 struct inode *ip; /* inode for file being extended */ 4890 struct buf *bp; /* indirect block referencing allocated block */ 4891 int ptrno; /* offset of pointer in indirect block */ 4892 ufs2_daddr_t newblkno; /* disk block number being added */ 4893 { 4894 struct inodedep *inodedep; 4895 struct allocindir *aip; 4896 ufs_lbn_t lbn; 4897 4898 lbn = nbp->b_lblkno; 4899 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); 4900 aip = newallocindir(ip, ptrno, newblkno, 0, lbn); 4901 inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); 4902 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 4903 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); 4904 FREE_LOCK(&lk); 4905 } 4906 4907 static void 4908 indirdep_complete(indirdep) 4909 struct indirdep *indirdep; 4910 { 4911 struct allocindir *aip; 4912 4913 LIST_REMOVE(indirdep, ir_next); 4914 indirdep->ir_state &= ~ONDEPLIST; 4915 4916 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { 4917 LIST_REMOVE(aip, ai_next); 4918 free_newblk(&aip->ai_block); 4919 } 4920 /* 4921 * If this indirdep is not attached to a buf it was simply waiting 4922 * on completion to clear completehd. free_indirdep() asserts 4923 * that nothing is dangling. 4924 */ 4925 if ((indirdep->ir_state & ONWORKLIST) == 0) 4926 free_indirdep(indirdep); 4927 } 4928 4929 /* 4930 * Called to finish the allocation of the "aip" allocated 4931 * by one of the two routines above. 4932 */ 4933 static void 4934 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) 4935 struct buf *bp; /* in-memory copy of the indirect block */ 4936 struct inode *ip; /* inode for file being extended */ 4937 struct inodedep *inodedep; /* Inodedep for ip */ 4938 struct allocindir *aip; /* allocindir allocated by the above routines */ 4939 ufs_lbn_t lbn; /* Logical block number for this block. */ 4940 { 4941 struct worklist *wk; 4942 struct fs *fs; 4943 struct newblk *newblk; 4944 struct indirdep *indirdep, *newindirdep; 4945 struct allocindir *oldaip; 4946 struct freefrag *freefrag; 4947 struct mount *mp; 4948 ufs2_daddr_t blkno; 4949 4950 mp = UFSTOVFS(ip->i_ump); 4951 fs = ip->i_fs; 4952 mtx_assert(&lk, MA_OWNED); 4953 if (bp->b_lblkno >= 0) 4954 panic("setup_allocindir_phase2: not indir blk"); 4955 for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) { 4956 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 4957 if (wk->wk_type != D_INDIRDEP) 4958 continue; 4959 indirdep = WK_INDIRDEP(wk); 4960 break; 4961 } 4962 if (indirdep == NULL && newindirdep) { 4963 indirdep = newindirdep; 4964 newindirdep = NULL; 4965 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); 4966 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, 4967 &newblk)) { 4968 indirdep->ir_state |= ONDEPLIST; 4969 LIST_INSERT_HEAD(&newblk->nb_indirdeps, 4970 indirdep, ir_next); 4971 } else 4972 indirdep->ir_state |= DEPCOMPLETE; 4973 } 4974 if (indirdep) { 4975 aip->ai_indirdep = indirdep; 4976 /* 4977 * Check to see if there is an existing dependency 4978 * for this block. If there is, merge the old 4979 * dependency into the new one. This happens 4980 * as a result of reallocblk only. 
4981 */ 4982 if (aip->ai_oldblkno == 0) 4983 oldaip = NULL; 4984 else 4985 4986 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, 4987 ai_next) 4988 if (oldaip->ai_offset == aip->ai_offset) 4989 break; 4990 if (oldaip != NULL) 4991 freefrag = allocindir_merge(aip, oldaip); 4992 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); 4993 KASSERT(aip->ai_offset >= 0 && 4994 aip->ai_offset < NINDIR(ip->i_ump->um_fs), 4995 ("setup_allocindir_phase2: Bad offset %d", 4996 aip->ai_offset)); 4997 KASSERT(indirdep->ir_savebp != NULL, 4998 ("setup_allocindir_phase2 NULL ir_savebp")); 4999 if (ip->i_ump->um_fstype == UFS1) 5000 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data) 5001 [aip->ai_offset] = aip->ai_oldblkno; 5002 else 5003 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data) 5004 [aip->ai_offset] = aip->ai_oldblkno; 5005 FREE_LOCK(&lk); 5006 if (freefrag != NULL) 5007 handle_workitem_freefrag(freefrag); 5008 } else 5009 FREE_LOCK(&lk); 5010 if (newindirdep) { 5011 newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE; 5012 brelse(newindirdep->ir_savebp); 5013 ACQUIRE_LOCK(&lk); 5014 WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP); 5015 if (indirdep) 5016 break; 5017 FREE_LOCK(&lk); 5018 } 5019 if (indirdep) { 5020 ACQUIRE_LOCK(&lk); 5021 break; 5022 } 5023 newindirdep = malloc(sizeof(struct indirdep), 5024 M_INDIRDEP, M_SOFTDEP_FLAGS); 5025 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); 5026 newindirdep->ir_state = ATTACHED; 5027 if (ip->i_ump->um_fstype == UFS1) 5028 newindirdep->ir_state |= UFS1FMT; 5029 newindirdep->ir_saveddata = NULL; 5030 LIST_INIT(&newindirdep->ir_deplisthd); 5031 LIST_INIT(&newindirdep->ir_donehd); 5032 LIST_INIT(&newindirdep->ir_writehd); 5033 LIST_INIT(&newindirdep->ir_completehd); 5034 LIST_INIT(&newindirdep->ir_jwork); 5035 if (bp->b_blkno == bp->b_lblkno) { 5036 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, 5037 NULL, NULL); 5038 bp->b_blkno = blkno; 5039 } 5040 newindirdep->ir_savebp = 5041 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); 5042 BUF_KERNPROC(newindirdep->ir_savebp); 5043 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); 5044 ACQUIRE_LOCK(&lk); 5045 } 5046 } 5047 5048 /* 5049 * Merge two allocindirs which refer to the same block. Move newblock 5050 * dependencies and setup the freefrags appropriately. 5051 */ 5052 static struct freefrag * 5053 allocindir_merge(aip, oldaip) 5054 struct allocindir *aip; 5055 struct allocindir *oldaip; 5056 { 5057 struct newdirblk *newdirblk; 5058 struct freefrag *freefrag; 5059 struct worklist *wk; 5060 5061 if (oldaip->ai_newblkno != aip->ai_oldblkno) 5062 panic("allocindir_merge: blkno"); 5063 aip->ai_oldblkno = oldaip->ai_oldblkno; 5064 freefrag = aip->ai_freefrag; 5065 aip->ai_freefrag = oldaip->ai_freefrag; 5066 oldaip->ai_freefrag = NULL; 5067 KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); 5068 /* 5069 * If we are tracking a new directory-block allocation, 5070 * move it from the old allocindir to the new allocindir. 5071 */ 5072 if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { 5073 newdirblk = WK_NEWDIRBLK(wk); 5074 WORKLIST_REMOVE(&newdirblk->db_list); 5075 if (!LIST_EMPTY(&oldaip->ai_newdirblk)) 5076 panic("allocindir_merge: extra newdirblk"); 5077 WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list); 5078 } 5079 /* 5080 * We can skip journaling for this freefrag and just complete 5081 * any pending journal work for the allocindir that is being 5082 * removed after the freefrag completes. 
5083 */ 5084 if (freefrag->ff_jfreefrag) 5085 cancel_jfreefrag(freefrag->ff_jfreefrag); 5086 LIST_REMOVE(oldaip, ai_next); 5087 cancel_newblk(&oldaip->ai_block, &freefrag->ff_jwork); 5088 free_newblk(&oldaip->ai_block); 5089 5090 return (freefrag); 5091 } 5092 5093 /* 5094 * Block de-allocation dependencies. 5095 * 5096 * When blocks are de-allocated, the on-disk pointers must be nullified before 5097 * the blocks are made available for use by other files. (The true 5098 * requirement is that old pointers must be nullified before new on-disk 5099 * pointers are set. We chose this slightly more stringent requirement to 5100 * reduce complexity.) Our implementation handles this dependency by updating 5101 * the inode (or indirect block) appropriately but delaying the actual block 5102 * de-allocation (i.e., freemap and free space count manipulation) until 5103 * after the updated versions reach stable storage. After the disk is 5104 * updated, the blocks can be safely de-allocated whenever it is convenient. 5105 * This implementation handles only the common case of reducing a file's 5106 * length to zero. Other cases are handled by the conventional synchronous 5107 * write approach. 5108 * 5109 * The ffs implementation with which we worked double-checks 5110 * the state of the block pointers and file size as it reduces 5111 * a file's length. Some of this code is replicated here in our 5112 * soft updates implementation. The freeblks->fb_chkcnt field is 5113 * used to transfer a part of this information to the procedure 5114 * that eventually de-allocates the blocks. 5115 * 5116 * This routine should be called from the routine that shortens 5117 * a file's length, before the inode's size or block pointers 5118 * are modified. It will save the block pointer information for 5119 * later release and zero the inode so that the calling routine 5120 * can release it. 5121 */ 5122 void 5123 softdep_setup_freeblocks(ip, length, flags) 5124 struct inode *ip; /* The inode whose length is to be reduced */ 5125 off_t length; /* The new length for the file */ 5126 int flags; /* IO_EXT and/or IO_NORMAL */ 5127 { 5128 struct ufs1_dinode *dp1; 5129 struct ufs2_dinode *dp2; 5130 struct freeblks *freeblks; 5131 struct inodedep *inodedep; 5132 struct allocdirect *adp; 5133 struct jfreeblk *jfreeblk; 5134 struct bufobj *bo; 5135 struct vnode *vp; 5136 struct buf *bp; 5137 struct fs *fs; 5138 ufs2_daddr_t extblocks, datablocks; 5139 struct mount *mp; 5140 int i, delay, error; 5141 ufs2_daddr_t blkno; 5142 ufs_lbn_t tmpval; 5143 ufs_lbn_t lbn; 5144 long oldextsize; 5145 long oldsize; 5146 int frags; 5147 int needj; 5148 5149 fs = ip->i_fs; 5150 mp = UFSTOVFS(ip->i_ump); 5151 if (length != 0) 5152 panic("softdep_setup_freeblocks: non-zero length"); 5153 freeblks = malloc(sizeof(struct freeblks), 5154 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); 5155 workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); 5156 LIST_INIT(&freeblks->fb_jfreeblkhd); 5157 LIST_INIT(&freeblks->fb_jwork); 5158 freeblks->fb_state = ATTACHED; 5159 freeblks->fb_uid = ip->i_uid; 5160 freeblks->fb_previousinum = ip->i_number; 5161 freeblks->fb_devvp = ip->i_devvp; 5162 freeblks->fb_chkcnt = 0; 5163 ACQUIRE_LOCK(&lk); 5164 /* 5165 * If we're truncating a removed file that will never be written 5166 * we don't need to journal the block frees. The canceled journals 5167 * for the allocations will suffice. 
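 * Below, needj is cleared when the inode is unlinked and its bitmap
 * dependency is still outstanding (the allocation never reached disk),
 * or when the filesystem is not running with journaling (FS_SUJ);
 * otherwise each freed block is given a journal record.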
5168 */ 5169 inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 5170 if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED || 5171 (fs->fs_flags & FS_SUJ) == 0) 5172 needj = 0; 5173 else 5174 needj = 1; 5175 num_freeblkdep++; 5176 FREE_LOCK(&lk); 5177 extblocks = 0; 5178 if (fs->fs_magic == FS_UFS2_MAGIC) 5179 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); 5180 datablocks = DIP(ip, i_blocks) - extblocks; 5181 if ((flags & IO_NORMAL) != 0) { 5182 oldsize = ip->i_size; 5183 ip->i_size = 0; 5184 DIP_SET(ip, i_size, 0); 5185 freeblks->fb_chkcnt = datablocks; 5186 for (i = 0; i < NDADDR; i++) { 5187 blkno = DIP(ip, i_db[i]); 5188 DIP_SET(ip, i_db[i], 0); 5189 if (blkno == 0) 5190 continue; 5191 frags = sblksize(fs, oldsize, i); 5192 frags = numfrags(fs, frags); 5193 newfreework(freeblks, NULL, i, blkno, frags, needj); 5194 } 5195 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; 5196 i++, tmpval *= NINDIR(fs)) { 5197 blkno = DIP(ip, i_ib[i]); 5198 DIP_SET(ip, i_ib[i], 0); 5199 if (blkno) 5200 newfreework(freeblks, NULL, -lbn - i, blkno, 5201 fs->fs_frag, needj); 5202 lbn += tmpval; 5203 } 5204 UFS_LOCK(ip->i_ump); 5205 fs->fs_pendingblocks += datablocks; 5206 UFS_UNLOCK(ip->i_ump); 5207 } 5208 if ((flags & IO_EXT) != 0) { 5209 oldextsize = ip->i_din2->di_extsize; 5210 ip->i_din2->di_extsize = 0; 5211 freeblks->fb_chkcnt += extblocks; 5212 for (i = 0; i < NXADDR; i++) { 5213 blkno = ip->i_din2->di_extb[i]; 5214 ip->i_din2->di_extb[i] = 0; 5215 if (blkno == 0) 5216 continue; 5217 frags = sblksize(fs, oldextsize, i); 5218 frags = numfrags(fs, frags); 5219 newfreework(freeblks, NULL, -1 - i, blkno, frags, 5220 needj); 5221 } 5222 } 5223 if (LIST_EMPTY(&freeblks->fb_jfreeblkhd)) 5224 needj = 0; 5225 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt); 5226 /* 5227 * Push the zero'ed inode to to its disk buffer so that we are free 5228 * to delete its dependencies below. Once the dependencies are gone 5229 * the buffer can be safely released. 5230 */ 5231 if ((error = bread(ip->i_devvp, 5232 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 5233 (int)fs->fs_bsize, NOCRED, &bp)) != 0) { 5234 brelse(bp); 5235 softdep_error("softdep_setup_freeblocks", error); 5236 } 5237 if (ip->i_ump->um_fstype == UFS1) { 5238 dp1 = ((struct ufs1_dinode *)bp->b_data + 5239 ino_to_fsbo(fs, ip->i_number)); 5240 ip->i_din1->di_freelink = dp1->di_freelink; 5241 *dp1 = *ip->i_din1; 5242 } else { 5243 dp2 = ((struct ufs2_dinode *)bp->b_data + 5244 ino_to_fsbo(fs, ip->i_number)); 5245 ip->i_din2->di_freelink = dp2->di_freelink; 5246 *dp2 = *ip->i_din2; 5247 } 5248 /* 5249 * Find and eliminate any inode dependencies. 5250 */ 5251 ACQUIRE_LOCK(&lk); 5252 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 5253 if ((inodedep->id_state & IOSTARTED) != 0) 5254 panic("softdep_setup_freeblocks: inode busy"); 5255 /* 5256 * Add the freeblks structure to the list of operations that 5257 * must await the zero'ed inode being written to disk. If we 5258 * still have a bitmap dependency (delay == 0), then the inode 5259 * has never been written to disk, so we can process the 5260 * freeblks below once we have deleted the dependencies. 
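 * Here "delay" is simply the DEPCOMPLETE bit of the inodedep: when it
 * is set the inode has previously been written to disk, so the
 * freeblks must wait on the zero'ed inode buffer written below.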
5261 */ 5262 delay = (inodedep->id_state & DEPCOMPLETE); 5263 if (delay) 5264 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); 5265 else if (needj) 5266 freeblks->fb_state |= DEPCOMPLETE | COMPLETE; 5267 /* 5268 * Because the file length has been truncated to zero, any 5269 * pending block allocation dependency structures associated 5270 * with this inode are obsolete and can simply be de-allocated. 5271 * We must first merge the two dependency lists to get rid of 5272 * any duplicate freefrag structures, then purge the merged list. 5273 * If we still have a bitmap dependency, then the inode has never 5274 * been written to disk, so we can free any fragments without delay. 5275 */ 5276 if (flags & IO_NORMAL) { 5277 merge_inode_lists(&inodedep->id_newinoupdt, 5278 &inodedep->id_inoupdt); 5279 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) 5280 cancel_allocdirect(&inodedep->id_inoupdt, adp, 5281 freeblks, delay); 5282 } 5283 if (flags & IO_EXT) { 5284 merge_inode_lists(&inodedep->id_newextupdt, 5285 &inodedep->id_extupdt); 5286 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) 5287 cancel_allocdirect(&inodedep->id_extupdt, adp, 5288 freeblks, delay); 5289 } 5290 LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps) 5291 add_to_journal(&jfreeblk->jf_list); 5292 5293 FREE_LOCK(&lk); 5294 bdwrite(bp); 5295 /* 5296 * We must wait for any I/O in progress to finish so that 5297 * all potential buffers on the dirty list will be visible. 5298 * Once they are all there, walk the list and get rid of 5299 * any dependencies. 5300 */ 5301 vp = ITOV(ip); 5302 bo = &vp->v_bufobj; 5303 BO_LOCK(bo); 5304 drain_output(vp); 5305 restart: 5306 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { 5307 if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || 5308 ((flags & IO_NORMAL) == 0 && 5309 (bp->b_xflags & BX_ALTDATA) == 0)) 5310 continue; 5311 if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL) 5312 goto restart; 5313 BO_UNLOCK(bo); 5314 ACQUIRE_LOCK(&lk); 5315 (void) inodedep_lookup(mp, ip->i_number, 0, &inodedep); 5316 if (deallocate_dependencies(bp, inodedep, freeblks)) 5317 bp->b_flags |= B_INVAL | B_NOCACHE; 5318 FREE_LOCK(&lk); 5319 brelse(bp); 5320 BO_LOCK(bo); 5321 goto restart; 5322 } 5323 BO_UNLOCK(bo); 5324 ACQUIRE_LOCK(&lk); 5325 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 5326 (void) free_inodedep(inodedep); 5327 5328 if (delay) { 5329 freeblks->fb_state |= DEPCOMPLETE; 5330 /* 5331 * If the inode with zeroed block pointers is now on disk 5332 * we can start freeing blocks. Add freeblks to the worklist 5333 * instead of calling handle_workitem_freeblocks directly as 5334 * it is more likely that additional IO is needed to complete 5335 * the request here than in the !delay case. 5336 */ 5337 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 5338 add_to_worklist(&freeblks->fb_list, 1); 5339 } 5340 5341 FREE_LOCK(&lk); 5342 /* 5343 * If the inode has never been written to disk (delay == 0) and 5344 * we're not waiting on any journal writes, then we can process the 5345 * freeblks now that we have deleted the dependencies. 5346 */ 5347 if (!delay && !needj) 5348 handle_workitem_freeblocks(freeblks, 0); 5349 } 5350 5351 /* 5352 * Reclaim any dependency structures from a buffer that is about to 5353 * be reallocated to a new vnode. The buffer must be locked, thus, 5354 * no I/O completion operations can occur while we are manipulating 5355 * its associated dependencies. 
The mutex is held so that other I/O's 5356 * associated with related dependencies do not occur. Returns 1 if 5357 * all dependencies were cleared, 0 otherwise. 5358 */ 5359 static int 5360 deallocate_dependencies(bp, inodedep, freeblks) 5361 struct buf *bp; 5362 struct inodedep *inodedep; 5363 struct freeblks *freeblks; 5364 { 5365 struct worklist *wk; 5366 struct indirdep *indirdep; 5367 struct newdirblk *newdirblk; 5368 struct allocindir *aip; 5369 struct pagedep *pagedep; 5370 struct jremref *jremref; 5371 struct jmvref *jmvref; 5372 struct dirrem *dirrem; 5373 int i; 5374 5375 mtx_assert(&lk, MA_OWNED); 5376 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 5377 switch (wk->wk_type) { 5378 5379 case D_INDIRDEP: 5380 indirdep = WK_INDIRDEP(wk); 5381 if (bp->b_lblkno >= 0 || 5382 bp->b_blkno != indirdep->ir_savebp->b_lblkno) 5383 panic("deallocate_dependencies: not indir"); 5384 cancel_indirdep(indirdep, bp, inodedep, freeblks); 5385 continue; 5386 5387 case D_PAGEDEP: 5388 pagedep = WK_PAGEDEP(wk); 5389 /* 5390 * There should be no directory add dependencies present 5391 * as the directory could not be truncated until all 5392 * children were removed. 5393 */ 5394 KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, 5395 ("deallocate_dependencies: pendinghd != NULL")); 5396 for (i = 0; i < DAHASHSZ; i++) 5397 KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, 5398 ("deallocate_dependencies: diraddhd != NULL")); 5399 /* 5400 * Copy any directory remove dependencies to the list 5401 * to be processed after the zero'ed inode is written. 5402 * If the inode has already been written, then they 5403 * can be dumped directly onto the work list. 5404 */ 5405 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { 5406 /* 5407 * If there are any dirrems we wait for 5408 * the journal write to complete and 5409 * then restart the buf scan as the lock 5410 * has been dropped. 5411 */ 5412 while ((jremref = 5413 LIST_FIRST(&dirrem->dm_jremrefhd)) 5414 != NULL) { 5415 stat_jwait_filepage++; 5416 jwait(&jremref->jr_list); 5417 return (0); 5418 } 5419 LIST_REMOVE(dirrem, dm_next); 5420 dirrem->dm_dirinum = pagedep->pd_ino; 5421 if (inodedep == NULL || 5422 (inodedep->id_state & ALLCOMPLETE) == 5423 ALLCOMPLETE) { 5424 dirrem->dm_state |= COMPLETE; 5425 add_to_worklist(&dirrem->dm_list, 0); 5426 } else 5427 WORKLIST_INSERT(&inodedep->id_bufwait, 5428 &dirrem->dm_list); 5429 } 5430 if ((pagedep->pd_state & NEWBLOCK) != 0) { 5431 newdirblk = pagedep->pd_newdirblk; 5432 WORKLIST_REMOVE(&newdirblk->db_list); 5433 free_newdirblk(newdirblk); 5434 } 5435 while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) 5436 != NULL) { 5437 stat_jwait_filepage++; 5438 jwait(&jmvref->jm_list); 5439 return (0); 5440 } 5441 WORKLIST_REMOVE(&pagedep->pd_list); 5442 LIST_REMOVE(pagedep, pd_hash); 5443 WORKITEM_FREE(pagedep, D_PAGEDEP); 5444 continue; 5445 5446 case D_ALLOCINDIR: 5447 aip = WK_ALLOCINDIR(wk); 5448 cancel_allocindir(aip, inodedep, freeblks); 5449 continue; 5450 5451 case D_ALLOCDIRECT: 5452 case D_INODEDEP: 5453 panic("deallocate_dependencies: Unexpected type %s", 5454 TYPENAME(wk->wk_type)); 5455 /* NOTREACHED */ 5456 5457 default: 5458 panic("deallocate_dependencies: Unknown type %s", 5459 TYPENAME(wk->wk_type)); 5460 /* NOTREACHED */ 5461 } 5462 } 5463 5464 return (1); 5465 } 5466 5467 /* 5468 * An allocdirect is being canceled due to a truncate. We must make sure 5469 * the journal entry is released in concert with the blkfree that releases 5470 * the storage. 
Completed journal entries must not be released until the 5471 * space is no longer pointed to by the inode or in the bitmap. 5472 */ 5473 static void 5474 cancel_allocdirect(adphead, adp, freeblks, delay) 5475 struct allocdirectlst *adphead; 5476 struct allocdirect *adp; 5477 struct freeblks *freeblks; 5478 int delay; 5479 { 5480 struct freework *freework; 5481 struct newblk *newblk; 5482 struct worklist *wk; 5483 ufs_lbn_t lbn; 5484 5485 TAILQ_REMOVE(adphead, adp, ad_next); 5486 newblk = (struct newblk *)adp; 5487 /* 5488 * If the journal hasn't been written the jnewblk must be passed 5489 * to the call to ffs_freeblk that reclaims the space. We accomplish 5490 * this by linking the journal dependency into the freework to be 5491 * freed when freework_freeblock() is called. If the journal has 5492 * been written we can simply reclaim the journal space when the 5493 * freeblks work is complete. 5494 */ 5495 if (newblk->nb_jnewblk == NULL) { 5496 cancel_newblk(newblk, &freeblks->fb_jwork); 5497 goto found; 5498 } 5499 lbn = newblk->nb_jnewblk->jn_lbn; 5500 /* 5501 * Find the correct freework structure so it releases the canceled 5502 * journal when the bitmap is cleared. This preserves rollback 5503 * until the allocation is reverted. 5504 */ 5505 LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { 5506 freework = WK_FREEWORK(wk); 5507 if (freework->fw_lbn != lbn) 5508 continue; 5509 cancel_newblk(newblk, &freework->fw_jwork); 5510 goto found; 5511 } 5512 panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn); 5513 found: 5514 if (delay) 5515 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, 5516 &newblk->nb_list); 5517 else 5518 free_newblk(newblk); 5519 return; 5520 } 5521 5522 5523 static void 5524 cancel_newblk(newblk, wkhd) 5525 struct newblk *newblk; 5526 struct workhead *wkhd; 5527 { 5528 struct indirdep *indirdep; 5529 struct allocindir *aip; 5530 5531 while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) { 5532 indirdep->ir_state &= ~ONDEPLIST; 5533 LIST_REMOVE(indirdep, ir_next); 5534 /* 5535 * If an indirdep is not on the buf worklist we need to 5536 * free it here as deallocate_dependencies() will never 5537 * find it. These pointers were never visible on disk and 5538 * can be discarded immediately. 5539 */ 5540 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { 5541 LIST_REMOVE(aip, ai_next); 5542 cancel_newblk(&aip->ai_block, wkhd); 5543 free_newblk(&aip->ai_block); 5544 } 5545 /* 5546 * If this indirdep is not attached to a buf it was simply 5547 * waiting on completion to clear completehd. free_indirdep() 5548 * asserts that nothing is dangling. 5549 */ 5550 if ((indirdep->ir_state & ONWORKLIST) == 0) 5551 free_indirdep(indirdep); 5552 } 5553 if (newblk->nb_state & ONDEPLIST) { 5554 newblk->nb_state &= ~ONDEPLIST; 5555 LIST_REMOVE(newblk, nb_deps); 5556 } 5557 if (newblk->nb_state & ONWORKLIST) 5558 WORKLIST_REMOVE(&newblk->nb_list); 5559 /* 5560 * If the journal entry hasn't been written we hold onto the dep 5561 * until it is safe to free along with the other journal work. 5562 */ 5563 if (newblk->nb_jnewblk != NULL) { 5564 cancel_jnewblk(newblk->nb_jnewblk, wkhd); 5565 newblk->nb_jnewblk = NULL; 5566 } 5567 if (!LIST_EMPTY(&newblk->nb_jwork)) 5568 jwork_move(wkhd, &newblk->nb_jwork); 5569 } 5570 5571 /* 5572 * Free a newblk. Generate a new freefrag work request if appropriate. 5573 * This must be called after the inode pointer and any direct block pointers 5574 * are valid or fully removed via truncate or frag extension. 
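 *
 * Below, any attached freefrag is marked COMPLETE and queued once its other
 * dependencies are satisfied, a pending newdirblk is handed to
 * free_newdirblk(), remaining indirdeps are marked DEPCOMPLETE, and any
 * leftover journal work on nb_jwork is retired via handle_jwork().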
5575 */ 5576 static void 5577 free_newblk(newblk) 5578 struct newblk *newblk; 5579 { 5580 struct indirdep *indirdep; 5581 struct newdirblk *newdirblk; 5582 struct freefrag *freefrag; 5583 struct worklist *wk; 5584 5585 mtx_assert(&lk, MA_OWNED); 5586 if (newblk->nb_state & ONDEPLIST) 5587 LIST_REMOVE(newblk, nb_deps); 5588 if (newblk->nb_state & ONWORKLIST) 5589 WORKLIST_REMOVE(&newblk->nb_list); 5590 LIST_REMOVE(newblk, nb_hash); 5591 if ((freefrag = newblk->nb_freefrag) != NULL) { 5592 freefrag->ff_state |= COMPLETE; 5593 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 5594 add_to_worklist(&freefrag->ff_list, 0); 5595 } 5596 if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) { 5597 newdirblk = WK_NEWDIRBLK(wk); 5598 WORKLIST_REMOVE(&newdirblk->db_list); 5599 if (!LIST_EMPTY(&newblk->nb_newdirblk)) 5600 panic("free_newblk: extra newdirblk"); 5601 free_newdirblk(newdirblk); 5602 } 5603 while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) { 5604 indirdep->ir_state |= DEPCOMPLETE; 5605 indirdep_complete(indirdep); 5606 } 5607 KASSERT(newblk->nb_jnewblk == NULL, 5608 ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk)); 5609 handle_jwork(&newblk->nb_jwork); 5610 newblk->nb_list.wk_type = D_NEWBLK; 5611 WORKITEM_FREE(newblk, D_NEWBLK); 5612 } 5613 5614 /* 5615 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. 5616 * This routine must be called with splbio interrupts blocked. 5617 */ 5618 static void 5619 free_newdirblk(newdirblk) 5620 struct newdirblk *newdirblk; 5621 { 5622 struct pagedep *pagedep; 5623 struct diradd *dap; 5624 struct worklist *wk; 5625 int i; 5626 5627 mtx_assert(&lk, MA_OWNED); 5628 /* 5629 * If the pagedep is still linked onto the directory buffer 5630 * dependency chain, then some of the entries on the 5631 * pd_pendinghd list may not be committed to disk yet. In 5632 * this case, we will simply clear the NEWBLOCK flag and 5633 * let the pd_pendinghd list be processed when the pagedep 5634 * is next written. If the pagedep is no longer on the buffer 5635 * dependency chain, then all the entries on the pd_pending 5636 * list are committed to disk and we can free them here. 5637 */ 5638 pagedep = newdirblk->db_pagedep; 5639 pagedep->pd_state &= ~NEWBLOCK; 5640 if ((pagedep->pd_state & ONWORKLIST) == 0) 5641 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 5642 free_diradd(dap, NULL); 5643 /* 5644 * If no dependencies remain, the pagedep will be freed. 5645 */ 5646 for (i = 0; i < DAHASHSZ; i++) 5647 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) 5648 break; 5649 if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 && 5650 LIST_EMPTY(&pagedep->pd_jmvrefhd)) { 5651 KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL, 5652 ("free_newdirblk: Freeing non-free pagedep %p", pagedep)); 5653 LIST_REMOVE(pagedep, pd_hash); 5654 WORKITEM_FREE(pagedep, D_PAGEDEP); 5655 } 5656 /* Should only ever be one item in the list. */ 5657 while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { 5658 WORKLIST_REMOVE(wk); 5659 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 5660 } 5661 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 5662 } 5663 5664 /* 5665 * Prepare an inode to be freed. The actual free operation is not 5666 * done until the zero'ed inode has been written to disk. 
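 *
 * If there is no inodedep, or check_inode_unwritten() below shows that the
 * inode never made it to disk, the freefile work item is processed
 * immediately via handle_workitem_freefile(); otherwise it is queued on the
 * inodedep's id_inowait list to run once the zero'ed inode block has been
 * written.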
5667 */
5668 void
5669 softdep_freefile(pvp, ino, mode)
5670 struct vnode *pvp;
5671 ino_t ino;
5672 int mode;
5673 {
5674 struct inode *ip = VTOI(pvp);
5675 struct inodedep *inodedep;
5676 struct freefile *freefile;
5677
5678 /*
5679 * This sets up the inode de-allocation dependency.
5680 */
5681 freefile = malloc(sizeof(struct freefile),
5682 M_FREEFILE, M_SOFTDEP_FLAGS);
5683 workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
5684 freefile->fx_mode = mode;
5685 freefile->fx_oldinum = ino;
5686 freefile->fx_devvp = ip->i_devvp;
5687 LIST_INIT(&freefile->fx_jwork);
5688 UFS_LOCK(ip->i_ump);
5689 ip->i_fs->fs_pendinginodes += 1;
5690 UFS_UNLOCK(ip->i_ump);
5691
5692 /*
5693 * If the inodedep does not exist, then the zero'ed inode has
5694 * been written to disk. If the allocated inode has never been
5695 * written to disk, then the on-disk inode is zero'ed. In either
5696 * case we can free the file immediately. If the journal was
5697 * canceled before being written the inode will never make it to
5698 * disk and we must send the canceled journal entries to
5699 * ffs_freefile() to be cleared in conjunction with the bitmap.
5700 * Any blocks waiting on the inode to be written can be safely freed
5701 * here as it will never be written.
5702 */
5703 ACQUIRE_LOCK(&lk);
5704 inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
5705 /*
5706 * Remove this inode from the unlinked list and set
5707 * GOINGAWAY as appropriate to indicate that this inode
5708 * will never be written.
5709 */
5710 if (inodedep && inodedep->id_state & UNLINKED) {
5711 /*
5712 * Save the journal work to be freed with the bitmap
5713 * before we clear UNLINKED. Otherwise it can be lost
5714 * if the inode block is written.
5715 */
5716 handle_bufwait(inodedep, &freefile->fx_jwork);
5717 clear_unlinked_inodedep(inodedep);
5718 /* Re-acquire inodedep as we've dropped lk. */
5719 inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
5720 if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0)
5721 inodedep->id_state |= GOINGAWAY;
5722 }
5723 if (inodedep == NULL || check_inode_unwritten(inodedep)) {
5724 FREE_LOCK(&lk);
5725 handle_workitem_freefile(freefile);
5726 return;
5727 }
5728 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
5729 FREE_LOCK(&lk);
5730 if (ip->i_number == ino)
5731 ip->i_flag |= IN_MODIFIED;
5732 }
5733
5734 /*
5735 * Check to see if an inode has never been written to disk. If
5736 * so, free the inodedep and return success, otherwise return failure.
5737 * This routine must be called with splbio interrupts blocked.
5738 *
5739 * If we still have a bitmap dependency, then the inode has never
5740 * been written to disk. Drop the dependency as it is no longer
5741 * necessary since the inode is being deallocated. We set the
5742 * ALLCOMPLETE flags since the bitmap now properly shows that the
5743 * inode is not allocated. Even if the inode is actively being
5744 * written, it has been rolled back to its zero'ed state, so we
5745 * are assured that a zero inode is what is on the disk. For short
5746 * lived files, this change will usually result in removing all the
5747 * dependencies from the inode so that it can be freed immediately.
5748 */ 5749 static int 5750 check_inode_unwritten(inodedep) 5751 struct inodedep *inodedep; 5752 { 5753 5754 mtx_assert(&lk, MA_OWNED); 5755 5756 if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || 5757 !LIST_EMPTY(&inodedep->id_pendinghd) || 5758 !LIST_EMPTY(&inodedep->id_bufwait) || 5759 !LIST_EMPTY(&inodedep->id_inowait) || 5760 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 5761 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 5762 !TAILQ_EMPTY(&inodedep->id_extupdt) || 5763 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 5764 inodedep->id_mkdiradd != NULL || 5765 inodedep->id_nlinkdelta != 0) 5766 return (0); 5767 /* 5768 * Another process might be in initiate_write_inodeblock_ufs[12] 5769 * trying to allocate memory without holding "Softdep Lock". 5770 */ 5771 if ((inodedep->id_state & IOSTARTED) != 0 && 5772 inodedep->id_savedino1 == NULL) 5773 return (0); 5774 5775 if (inodedep->id_state & ONDEPLIST) 5776 LIST_REMOVE(inodedep, id_deps); 5777 inodedep->id_state &= ~ONDEPLIST; 5778 inodedep->id_state |= ALLCOMPLETE; 5779 inodedep->id_bmsafemap = NULL; 5780 if (inodedep->id_state & ONWORKLIST) 5781 WORKLIST_REMOVE(&inodedep->id_list); 5782 if (inodedep->id_savedino1 != NULL) { 5783 free(inodedep->id_savedino1, M_SAVEDINO); 5784 inodedep->id_savedino1 = NULL; 5785 } 5786 if (free_inodedep(inodedep) == 0) 5787 panic("check_inode_unwritten: busy inode"); 5788 return (1); 5789 } 5790 5791 /* 5792 * Try to free an inodedep structure. Return 1 if it could be freed. 5793 */ 5794 static int 5795 free_inodedep(inodedep) 5796 struct inodedep *inodedep; 5797 { 5798 5799 mtx_assert(&lk, MA_OWNED); 5800 if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 || 5801 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || 5802 !LIST_EMPTY(&inodedep->id_dirremhd) || 5803 !LIST_EMPTY(&inodedep->id_pendinghd) || 5804 !LIST_EMPTY(&inodedep->id_bufwait) || 5805 !LIST_EMPTY(&inodedep->id_inowait) || 5806 !TAILQ_EMPTY(&inodedep->id_inoreflst) || 5807 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 5808 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 5809 !TAILQ_EMPTY(&inodedep->id_extupdt) || 5810 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 5811 inodedep->id_mkdiradd != NULL || 5812 inodedep->id_nlinkdelta != 0 || 5813 inodedep->id_savedino1 != NULL) 5814 return (0); 5815 if (inodedep->id_state & ONDEPLIST) 5816 LIST_REMOVE(inodedep, id_deps); 5817 LIST_REMOVE(inodedep, id_hash); 5818 WORKITEM_FREE(inodedep, D_INODEDEP); 5819 num_inodedep -= 1; 5820 return (1); 5821 } 5822 5823 /* 5824 * Free the block referenced by a freework structure. The parent freeblks 5825 * structure is released and completed when the final cg bitmap reaches 5826 * the disk. This routine may be freeing a jnewblk which never made it to 5827 * disk in which case we do not have to wait as the operation is undone 5828 * in memory immediately. 5829 */ 5830 static void 5831 freework_freeblock(freework) 5832 struct freework *freework; 5833 { 5834 struct freeblks *freeblks; 5835 struct ufsmount *ump; 5836 struct workhead wkhd; 5837 struct fs *fs; 5838 int complete; 5839 int pending; 5840 int bsize; 5841 int needj; 5842 5843 freeblks = freework->fw_freeblks; 5844 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 5845 fs = ump->um_fs; 5846 needj = freeblks->fb_list.wk_mp->mnt_kern_flag & MNTK_SUJ; 5847 complete = 0; 5848 LIST_INIT(&wkhd); 5849 /* 5850 * If we are canceling an existing jnewblk pass it to the free 5851 * routine, otherwise pass the freeblk which will ultimately 5852 * release the freeblks. If we're not journaling, we can just 5853 * free the freeblks immediately. 
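 * Concretely: a non-empty fw_jwork list is swapped onto the local work head
 * and the freework is treated as already complete; otherwise, when
 * journaling, the freework itself is hung on the work head so that
 * handle_written_freework() runs once the cg buffer is written; with no
 * journal the freework is retired immediately after ffs_blkfree() below.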
5854 */ 5855 if (!LIST_EMPTY(&freework->fw_jwork)) { 5856 LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list); 5857 complete = 1; 5858 } else if (needj) 5859 WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list); 5860 bsize = lfragtosize(fs, freework->fw_frags); 5861 pending = btodb(bsize); 5862 ACQUIRE_LOCK(&lk); 5863 freeblks->fb_chkcnt -= pending; 5864 FREE_LOCK(&lk); 5865 /* 5866 * extattr blocks don't show up in pending blocks. XXX why? 5867 */ 5868 if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) { 5869 UFS_LOCK(ump); 5870 fs->fs_pendingblocks -= pending; 5871 UFS_UNLOCK(ump); 5872 } 5873 ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, 5874 bsize, freeblks->fb_previousinum, &wkhd); 5875 if (complete == 0 && needj) 5876 return; 5877 /* 5878 * The jnewblk will be discarded and the bits in the map never 5879 * made it to disk. We can immediately free the freeblk. 5880 */ 5881 ACQUIRE_LOCK(&lk); 5882 handle_written_freework(freework); 5883 FREE_LOCK(&lk); 5884 } 5885 5886 /* 5887 * Start, continue, or finish the process of freeing an indirect block tree. 5888 * The free operation may be paused at any point with fw_off containing the 5889 * offset to restart from. This enables us to implement some flow control 5890 * for large truncates which may fan out and generate a huge number of 5891 * dependencies. 5892 */ 5893 static void 5894 handle_workitem_indirblk(freework) 5895 struct freework *freework; 5896 { 5897 struct freeblks *freeblks; 5898 struct ufsmount *ump; 5899 struct fs *fs; 5900 5901 5902 freeblks = freework->fw_freeblks; 5903 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 5904 fs = ump->um_fs; 5905 if (freework->fw_off == NINDIR(fs)) 5906 freework_freeblock(freework); 5907 else 5908 indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), 5909 freework->fw_lbn); 5910 } 5911 5912 /* 5913 * Called when a freework structure attached to a cg buf is written. The 5914 * ref on either the parent or the freeblks structure is released and 5915 * either may be added to the worklist if it is the final ref. 5916 */ 5917 static void 5918 handle_written_freework(freework) 5919 struct freework *freework; 5920 { 5921 struct freeblks *freeblks; 5922 struct freework *parent; 5923 5924 freeblks = freework->fw_freeblks; 5925 parent = freework->fw_parent; 5926 if (parent) { 5927 if (--parent->fw_ref != 0) 5928 parent = NULL; 5929 freeblks = NULL; 5930 } else if (--freeblks->fb_ref != 0) 5931 freeblks = NULL; 5932 WORKITEM_FREE(freework, D_FREEWORK); 5933 /* 5934 * Don't delay these block frees or it takes an intolerable amount 5935 * of time to process truncates and free their journal entries. 5936 */ 5937 if (freeblks) 5938 add_to_worklist(&freeblks->fb_list, 1); 5939 if (parent) 5940 add_to_worklist(&parent->fw_list, 1); 5941 } 5942 5943 /* 5944 * This workitem routine performs the block de-allocation. 5945 * The workitem is added to the pending list after the updated 5946 * inode block has been written to disk. As mentioned above, 5947 * checks regarding the number of blocks de-allocated (compared 5948 * to the number of blocks allocated for the file) are also 5949 * performed in this function. 
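 *
 * Each queued freework is dispatched below: entries describing indirect
 * block trees go through handle_workitem_indirblk() while the remaining
 * entries are released directly by freework_freeblock(). The freeblks
 * itself is retired by handle_complete_freeblocks() once its reference
 * count drains to zero.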
5950 */ 5951 static void 5952 handle_workitem_freeblocks(freeblks, flags) 5953 struct freeblks *freeblks; 5954 int flags; 5955 { 5956 struct freework *freework; 5957 struct worklist *wk; 5958 5959 KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd), 5960 ("handle_workitem_freeblocks: Journal entries not written.")); 5961 if (LIST_EMPTY(&freeblks->fb_freeworkhd)) { 5962 handle_complete_freeblocks(freeblks); 5963 return; 5964 } 5965 freeblks->fb_ref++; 5966 while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { 5967 KASSERT(wk->wk_type == D_FREEWORK, 5968 ("handle_workitem_freeblocks: Unknown type %s", 5969 TYPENAME(wk->wk_type))); 5970 WORKLIST_REMOVE_UNLOCKED(wk); 5971 freework = WK_FREEWORK(wk); 5972 if (freework->fw_lbn <= -NDADDR) 5973 handle_workitem_indirblk(freework); 5974 else 5975 freework_freeblock(freework); 5976 } 5977 ACQUIRE_LOCK(&lk); 5978 if (--freeblks->fb_ref != 0) 5979 freeblks = NULL; 5980 FREE_LOCK(&lk); 5981 if (freeblks) 5982 handle_complete_freeblocks(freeblks); 5983 } 5984 5985 /* 5986 * Once all of the freework workitems are complete we can retire the 5987 * freeblocks dependency and any journal work awaiting completion. This 5988 * can not be called until all other dependencies are stable on disk. 5989 */ 5990 static void 5991 handle_complete_freeblocks(freeblks) 5992 struct freeblks *freeblks; 5993 { 5994 struct inode *ip; 5995 struct vnode *vp; 5996 struct fs *fs; 5997 struct ufsmount *ump; 5998 int flags; 5999 6000 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 6001 fs = ump->um_fs; 6002 flags = LK_NOWAIT; 6003 6004 /* 6005 * If we still have not finished background cleanup, then check 6006 * to see if the block count needs to be adjusted. 6007 */ 6008 if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 && 6009 ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum, 6010 (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) { 6011 ip = VTOI(vp); 6012 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt); 6013 ip->i_flag |= IN_CHANGE; 6014 vput(vp); 6015 } 6016 6017 #ifdef INVARIANTS 6018 if (freeblks->fb_chkcnt != 0 && 6019 ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0)) 6020 printf("handle_workitem_freeblocks: block count\n"); 6021 #endif /* INVARIANTS */ 6022 6023 ACQUIRE_LOCK(&lk); 6024 /* 6025 * All of the freeblock deps must be complete prior to this call 6026 * so it's now safe to complete earlier outstanding journal entries. 6027 */ 6028 handle_jwork(&freeblks->fb_jwork); 6029 WORKITEM_FREE(freeblks, D_FREEBLKS); 6030 num_freeblkdep--; 6031 FREE_LOCK(&lk); 6032 } 6033 6034 /* 6035 * Release blocks associated with the inode ip and stored in the indirect 6036 * block dbn. If level is greater than SINGLE, the block is an indirect block 6037 * and recursive calls to indirtrunc must be used to cleanse other indirect 6038 * blocks. 
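 *
 * Here dbn is the on-disk (device) address of the indirect block and lbn its
 * negative logical block number. lbn_level() recovers the level of
 * indirection from lbn, and lbnadd is the number of file blocks covered by
 * each pointer in this block: 1 for a single indirect, NINDIR(fs) for a
 * double indirect, NINDIR(fs) squared for a triple indirect.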
6039 */ 6040 static void 6041 indir_trunc(freework, dbn, lbn) 6042 struct freework *freework; 6043 ufs2_daddr_t dbn; 6044 ufs_lbn_t lbn; 6045 { 6046 struct freework *nfreework; 6047 struct workhead wkhd; 6048 struct jnewblk *jnewblk; 6049 struct freeblks *freeblks; 6050 struct buf *bp; 6051 struct fs *fs; 6052 struct worklist *wkn; 6053 struct worklist *wk; 6054 struct indirdep *indirdep; 6055 struct ufsmount *ump; 6056 ufs1_daddr_t *bap1 = 0; 6057 ufs2_daddr_t nb, nnb, *bap2 = 0; 6058 ufs_lbn_t lbnadd; 6059 int i, nblocks, ufs1fmt; 6060 int fs_pendingblocks; 6061 int freedeps; 6062 int needj; 6063 int level; 6064 int cnt; 6065 6066 LIST_INIT(&wkhd); 6067 level = lbn_level(lbn); 6068 if (level == -1) 6069 panic("indir_trunc: Invalid lbn %jd\n", lbn); 6070 freeblks = freework->fw_freeblks; 6071 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 6072 fs = ump->um_fs; 6073 fs_pendingblocks = 0; 6074 freedeps = 0; 6075 needj = UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ; 6076 lbnadd = 1; 6077 for (i = level; i > 0; i--) 6078 lbnadd *= NINDIR(fs); 6079 /* 6080 * Get buffer of block pointers to be freed. This routine is not 6081 * called until the zero'ed inode has been written, so it is safe 6082 * to free blocks as they are encountered. Because the inode has 6083 * been zero'ed, calls to bmap on these blocks will fail. So, we 6084 * have to use the on-disk address and the block device for the 6085 * filesystem to look them up. If the file was deleted before its 6086 * indirect blocks were all written to disk, the routine that set 6087 * us up (deallocate_dependencies) will have arranged to leave 6088 * a complete copy of the indirect block in memory for our use. 6089 * Otherwise we have to read the blocks in from the disk. 6090 */ 6091 #ifdef notyet 6092 bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0, 6093 GB_NOCREAT); 6094 #else 6095 bp = incore(&freeblks->fb_devvp->v_bufobj, dbn); 6096 #endif 6097 ACQUIRE_LOCK(&lk); 6098 if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) { 6099 if (wk->wk_type != D_INDIRDEP || 6100 (wk->wk_state & GOINGAWAY) == 0) 6101 panic("indir_trunc: lost indirdep %p", wk); 6102 indirdep = WK_INDIRDEP(wk); 6103 LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list); 6104 free_indirdep(indirdep); 6105 if (!LIST_EMPTY(&bp->b_dep)) 6106 panic("indir_trunc: dangling dep %p", 6107 LIST_FIRST(&bp->b_dep)); 6108 ump->um_numindirdeps -= 1; 6109 FREE_LOCK(&lk); 6110 } else { 6111 #ifdef notyet 6112 if (bp) 6113 brelse(bp); 6114 #endif 6115 FREE_LOCK(&lk); 6116 if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 6117 NOCRED, &bp) != 0) { 6118 brelse(bp); 6119 return; 6120 } 6121 } 6122 /* 6123 * Recursively free indirect blocks. 6124 */ 6125 if (ump->um_fstype == UFS1) { 6126 ufs1fmt = 1; 6127 bap1 = (ufs1_daddr_t *)bp->b_data; 6128 } else { 6129 ufs1fmt = 0; 6130 bap2 = (ufs2_daddr_t *)bp->b_data; 6131 } 6132 /* 6133 * Reclaim indirect blocks which never made it to disk. 6134 */ 6135 cnt = 0; 6136 LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) { 6137 struct workhead freewk; 6138 if (wk->wk_type != D_JNEWBLK) 6139 continue; 6140 WORKLIST_REMOVE_UNLOCKED(wk); 6141 LIST_INIT(&freewk); 6142 WORKLIST_INSERT_UNLOCKED(&freewk, wk); 6143 jnewblk = WK_JNEWBLK(wk); 6144 if (jnewblk->jn_lbn > 0) 6145 i = (jnewblk->jn_lbn - -lbn) / lbnadd; 6146 else 6147 i = (jnewblk->jn_lbn - (lbn + 1)) / lbnadd; 6148 KASSERT(i >= 0 && i < NINDIR(fs), 6149 ("indir_trunc: Index out of range %d parent %jd lbn %jd", 6150 i, lbn, jnewblk->jn_lbn)); 6151 /* Clear the pointer so it isn't found below. 
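 * The on-disk reclaim loop later in this routine skips zeroed slots, so a
 * block freed here is not freed a second time.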
*/ 6152 if (ufs1fmt) { 6153 nb = bap1[i]; 6154 bap1[i] = 0; 6155 } else { 6156 nb = bap2[i]; 6157 bap2[i] = 0; 6158 } 6159 KASSERT(nb == jnewblk->jn_blkno, 6160 ("indir_trunc: Block mismatch %jd != %jd", 6161 nb, jnewblk->jn_blkno)); 6162 ffs_blkfree(ump, fs, freeblks->fb_devvp, jnewblk->jn_blkno, 6163 fs->fs_bsize, freeblks->fb_previousinum, &freewk); 6164 cnt++; 6165 } 6166 ACQUIRE_LOCK(&lk); 6167 if (needj) 6168 freework->fw_ref += NINDIR(fs) + 1; 6169 /* Any remaining journal work can be completed with freeblks. */ 6170 jwork_move(&freeblks->fb_jwork, &wkhd); 6171 FREE_LOCK(&lk); 6172 nblocks = btodb(fs->fs_bsize); 6173 if (ufs1fmt) 6174 nb = bap1[0]; 6175 else 6176 nb = bap2[0]; 6177 nfreework = freework; 6178 /* 6179 * Reclaim on disk blocks. 6180 */ 6181 for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { 6182 if (i != NINDIR(fs) - 1) { 6183 if (ufs1fmt) 6184 nnb = bap1[i+1]; 6185 else 6186 nnb = bap2[i+1]; 6187 } else 6188 nnb = 0; 6189 if (nb == 0) 6190 continue; 6191 cnt++; 6192 if (level != 0) { 6193 ufs_lbn_t nlbn; 6194 6195 nlbn = (lbn + 1) - (i * lbnadd); 6196 if (needj != 0) { 6197 nfreework = newfreework(freeblks, freework, 6198 nlbn, nb, fs->fs_frag, 0); 6199 freedeps++; 6200 } 6201 indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); 6202 } else { 6203 struct freedep *freedep; 6204 6205 /* 6206 * Attempt to aggregate freedep dependencies for 6207 * all blocks being released to the same CG. 6208 */ 6209 LIST_INIT(&wkhd); 6210 if (needj != 0 && 6211 (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) { 6212 freedep = newfreedep(freework); 6213 WORKLIST_INSERT_UNLOCKED(&wkhd, 6214 &freedep->fd_list); 6215 freedeps++; 6216 } 6217 ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, 6218 fs->fs_bsize, freeblks->fb_previousinum, &wkhd); 6219 } 6220 } 6221 if (level == 0) 6222 fs_pendingblocks = (nblocks * cnt); 6223 /* 6224 * If we're not journaling we can free the indirect now. Otherwise 6225 * setup the ref counts and offset so this indirect can be completed 6226 * when its children are free. 6227 */ 6228 if (needj == 0) { 6229 fs_pendingblocks += nblocks; 6230 dbn = dbtofsb(fs, dbn); 6231 ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, 6232 freeblks->fb_previousinum, NULL); 6233 ACQUIRE_LOCK(&lk); 6234 freeblks->fb_chkcnt -= fs_pendingblocks; 6235 if (freework->fw_blkno == dbn) 6236 handle_written_freework(freework); 6237 FREE_LOCK(&lk); 6238 freework = NULL; 6239 } else { 6240 ACQUIRE_LOCK(&lk); 6241 freework->fw_off = i; 6242 freework->fw_ref += freedeps; 6243 freework->fw_ref -= NINDIR(fs) + 1; 6244 if (freework->fw_ref != 0) 6245 freework = NULL; 6246 freeblks->fb_chkcnt -= fs_pendingblocks; 6247 FREE_LOCK(&lk); 6248 } 6249 if (fs_pendingblocks) { 6250 UFS_LOCK(ump); 6251 fs->fs_pendingblocks -= fs_pendingblocks; 6252 UFS_UNLOCK(ump); 6253 } 6254 bp->b_flags |= B_INVAL | B_NOCACHE; 6255 brelse(bp); 6256 if (freework) 6257 handle_workitem_indirblk(freework); 6258 return; 6259 } 6260 6261 /* 6262 * Cancel an allocindir when it is removed via truncation. 6263 */ 6264 static void 6265 cancel_allocindir(aip, inodedep, freeblks) 6266 struct allocindir *aip; 6267 struct inodedep *inodedep; 6268 struct freeblks *freeblks; 6269 { 6270 struct newblk *newblk; 6271 6272 /* 6273 * If the journal hasn't been written the jnewblk must be passed 6274 * to the call to ffs_freeblk that reclaims the space. We accomplish 6275 * this by linking the journal dependency into the indirdep to be 6276 * freed when indir_trunc() is called. 
If the journal has already 6277 * been written we can simply reclaim the journal space when the 6278 * freeblks work is complete. 6279 */ 6280 LIST_REMOVE(aip, ai_next); 6281 newblk = (struct newblk *)aip; 6282 if (newblk->nb_jnewblk == NULL) 6283 cancel_newblk(newblk, &freeblks->fb_jwork); 6284 else 6285 cancel_newblk(newblk, &aip->ai_indirdep->ir_jwork); 6286 if (inodedep && inodedep->id_state & DEPCOMPLETE) 6287 WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list); 6288 else 6289 free_newblk(newblk); 6290 } 6291 6292 /* 6293 * Create the mkdir dependencies for . and .. in a new directory. Link them 6294 * in to a newdirblk so any subsequent additions are tracked properly. The 6295 * caller is responsible for adding the mkdir1 dependency to the journal 6296 * and updating id_mkdiradd. This function returns with lk held. 6297 */ 6298 static struct mkdir * 6299 setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) 6300 struct diradd *dap; 6301 ino_t newinum; 6302 ino_t dinum; 6303 struct buf *newdirbp; 6304 struct mkdir **mkdirp; 6305 { 6306 struct newblk *newblk; 6307 struct pagedep *pagedep; 6308 struct inodedep *inodedep; 6309 struct newdirblk *newdirblk = 0; 6310 struct mkdir *mkdir1, *mkdir2; 6311 struct worklist *wk; 6312 struct jaddref *jaddref; 6313 struct mount *mp; 6314 6315 mp = dap->da_list.wk_mp; 6316 newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, 6317 M_SOFTDEP_FLAGS); 6318 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 6319 LIST_INIT(&newdirblk->db_mkdir); 6320 mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); 6321 workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); 6322 mkdir1->md_state = ATTACHED | MKDIR_BODY; 6323 mkdir1->md_diradd = dap; 6324 mkdir1->md_jaddref = NULL; 6325 mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); 6326 workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); 6327 mkdir2->md_state = ATTACHED | MKDIR_PARENT; 6328 mkdir2->md_diradd = dap; 6329 mkdir2->md_jaddref = NULL; 6330 if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) { 6331 mkdir1->md_state |= DEPCOMPLETE; 6332 mkdir2->md_state |= DEPCOMPLETE; 6333 } 6334 /* 6335 * Dependency on "." and ".." being written to disk. 6336 */ 6337 mkdir1->md_buf = newdirbp; 6338 ACQUIRE_LOCK(&lk); 6339 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); 6340 /* 6341 * We must link the pagedep, allocdirect, and newdirblk for 6342 * the initial file page so the pointer to the new directory 6343 * is not written until the directory contents are live and 6344 * any subsequent additions are not marked live until the 6345 * block is reachable via the inode. 6346 */ 6347 if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0) 6348 panic("setup_newdir: lost pagedep"); 6349 LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) 6350 if (wk->wk_type == D_ALLOCDIRECT) 6351 break; 6352 if (wk == NULL) 6353 panic("setup_newdir: lost allocdirect"); 6354 newblk = WK_NEWBLK(wk); 6355 pagedep->pd_state |= NEWBLOCK; 6356 pagedep->pd_newdirblk = newdirblk; 6357 newdirblk->db_pagedep = pagedep; 6358 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); 6359 WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); 6360 /* 6361 * Look up the inodedep for the parent directory so that we 6362 * can link mkdir2 into the pending dotdot jaddref or 6363 * the inode write if there is none. If the inode is 6364 * ALLCOMPLETE and no jaddref is present all dependencies have 6365 * been satisfied and mkdir2 can be freed. 
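 * The three cases are handled below: with journaling, mkdir2 is tied to the
 * pending dotdot jaddref; if the parent is already ALLCOMPLETE (or has no
 * inodedep) the MKDIR_PARENT wait is dropped and mkdir2 is freed; otherwise
 * mkdir2 waits on the parent's id_bufwait list.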
6366 */ 6367 inodedep_lookup(mp, dinum, 0, &inodedep); 6368 if (mp->mnt_kern_flag & MNTK_SUJ) { 6369 if (inodedep == NULL) 6370 panic("setup_newdir: Lost parent."); 6371 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 6372 inoreflst); 6373 KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && 6374 (jaddref->ja_state & MKDIR_PARENT), 6375 ("setup_newdir: bad dotdot jaddref %p", jaddref)); 6376 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 6377 mkdir2->md_jaddref = jaddref; 6378 jaddref->ja_mkdir = mkdir2; 6379 } else if (inodedep == NULL || 6380 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 6381 dap->da_state &= ~MKDIR_PARENT; 6382 WORKITEM_FREE(mkdir2, D_MKDIR); 6383 } else { 6384 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 6385 WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); 6386 } 6387 *mkdirp = mkdir2; 6388 6389 return (mkdir1); 6390 } 6391 6392 /* 6393 * Directory entry addition dependencies. 6394 * 6395 * When adding a new directory entry, the inode (with its incremented link 6396 * count) must be written to disk before the directory entry's pointer to it. 6397 * Also, if the inode is newly allocated, the corresponding freemap must be 6398 * updated (on disk) before the directory entry's pointer. These requirements 6399 * are met via undo/redo on the directory entry's pointer, which consists 6400 * simply of the inode number. 6401 * 6402 * As directory entries are added and deleted, the free space within a 6403 * directory block can become fragmented. The ufs filesystem will compact 6404 * a fragmented directory block to make space for a new entry. When this 6405 * occurs, the offsets of previously added entries change. Any "diradd" 6406 * dependency structures corresponding to these entries must be updated with 6407 * the new offsets. 6408 */ 6409 6410 /* 6411 * This routine is called after the in-memory inode's link 6412 * count has been incremented, but before the directory entry's 6413 * pointer to the inode has been set. 6414 */ 6415 int 6416 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 6417 struct buf *bp; /* buffer containing directory block */ 6418 struct inode *dp; /* inode for directory */ 6419 off_t diroffset; /* offset of new entry in directory */ 6420 ino_t newinum; /* inode referenced by new directory entry */ 6421 struct buf *newdirbp; /* non-NULL => contents of new mkdir */ 6422 int isnewblk; /* entry is in a newly allocated block */ 6423 { 6424 int offset; /* offset of new entry within directory block */ 6425 ufs_lbn_t lbn; /* block in directory containing new entry */ 6426 struct fs *fs; 6427 struct diradd *dap; 6428 struct newblk *newblk; 6429 struct pagedep *pagedep; 6430 struct inodedep *inodedep; 6431 struct newdirblk *newdirblk = 0; 6432 struct mkdir *mkdir1, *mkdir2; 6433 struct jaddref *jaddref; 6434 struct mount *mp; 6435 int isindir; 6436 6437 /* 6438 * Whiteouts have no dependencies. 
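 * A whiteout entry records WINO rather than a newly allocated inode, so
 * there is no inode or bitmap write to order against; we only need to
 * release the mkdir buffer, if any, and return.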
6439 */ 6440 if (newinum == WINO) { 6441 if (newdirbp != NULL) 6442 bdwrite(newdirbp); 6443 return (0); 6444 } 6445 jaddref = NULL; 6446 mkdir1 = mkdir2 = NULL; 6447 mp = UFSTOVFS(dp->i_ump); 6448 fs = dp->i_fs; 6449 lbn = lblkno(fs, diroffset); 6450 offset = blkoff(fs, diroffset); 6451 dap = malloc(sizeof(struct diradd), M_DIRADD, 6452 M_SOFTDEP_FLAGS|M_ZERO); 6453 workitem_alloc(&dap->da_list, D_DIRADD, mp); 6454 dap->da_offset = offset; 6455 dap->da_newinum = newinum; 6456 dap->da_state = ATTACHED; 6457 LIST_INIT(&dap->da_jwork); 6458 isindir = bp->b_lblkno >= NDADDR; 6459 if (isnewblk && 6460 (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { 6461 newdirblk = malloc(sizeof(struct newdirblk), 6462 M_NEWDIRBLK, M_SOFTDEP_FLAGS); 6463 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 6464 LIST_INIT(&newdirblk->db_mkdir); 6465 } 6466 /* 6467 * If we're creating a new directory setup the dependencies and set 6468 * the dap state to wait for them. Otherwise it's COMPLETE and 6469 * we can move on. 6470 */ 6471 if (newdirbp == NULL) { 6472 dap->da_state |= DEPCOMPLETE; 6473 ACQUIRE_LOCK(&lk); 6474 } else { 6475 dap->da_state |= MKDIR_BODY | MKDIR_PARENT; 6476 mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, 6477 &mkdir2); 6478 } 6479 /* 6480 * Link into parent directory pagedep to await its being written. 6481 */ 6482 if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0) 6483 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 6484 #ifdef DEBUG 6485 if (diradd_lookup(pagedep, offset) != NULL) 6486 panic("softdep_setup_directory_add: %p already at off %d\n", 6487 diradd_lookup(pagedep, offset), offset); 6488 #endif 6489 dap->da_pagedep = pagedep; 6490 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, 6491 da_pdlist); 6492 inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); 6493 /* 6494 * If we're journaling, link the diradd into the jaddref so it 6495 * may be completed after the journal entry is written. Otherwise, 6496 * link the diradd into its inodedep. If the inode is not yet 6497 * written place it on the bufwait list, otherwise do the post-inode 6498 * write processing to put it on the id_pendinghd list. 6499 */ 6500 if (mp->mnt_kern_flag & MNTK_SUJ) { 6501 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 6502 inoreflst); 6503 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 6504 ("softdep_setup_directory_add: bad jaddref %p", jaddref)); 6505 jaddref->ja_diroff = diroffset; 6506 jaddref->ja_diradd = dap; 6507 add_to_journal(&jaddref->ja_list); 6508 } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) 6509 diradd_inode_written(dap, inodedep); 6510 else 6511 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 6512 /* 6513 * Add the journal entries for . and .. links now that the primary 6514 * link is written. 6515 */ 6516 if (mkdir1 != NULL && mp->mnt_kern_flag & MNTK_SUJ) { 6517 jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, 6518 inoreflst, if_deps); 6519 KASSERT(jaddref != NULL && 6520 jaddref->ja_ino == jaddref->ja_parent && 6521 (jaddref->ja_state & MKDIR_BODY), 6522 ("softdep_setup_directory_add: bad dot jaddref %p", 6523 jaddref)); 6524 mkdir1->md_jaddref = jaddref; 6525 jaddref->ja_mkdir = mkdir1; 6526 /* 6527 * It is important that the dotdot journal entry 6528 * is added prior to the dot entry since dot writes 6529 * both the dot and dotdot links. These both must 6530 * be added after the primary link for the journal 6531 * to remain consistent. 
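 * In the code below mkdir2 carries the dotdot (MKDIR_PARENT) jaddref and is
 * journaled first; the jaddref found via TAILQ_PREV is the dot (MKDIR_BODY)
 * entry and is journaled second.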
6532 */
6533 add_to_journal(&mkdir2->md_jaddref->ja_list);
6534 add_to_journal(&jaddref->ja_list);
6535 }
6536 /*
6537 * If we are adding a new directory, remember this diradd so that if
6538 * we rename it we can keep the dot and dotdot dependencies. If
6539 * we are adding a new name for an inode that has a mkdiradd, we
6540 * must be in rename and we have to move the dot and dotdot
6541 * dependencies to this new name. The old name is being orphaned
6542 * soon.
6543 */
6544 if (mkdir1 != NULL) {
6545 if (inodedep->id_mkdiradd != NULL)
6546 panic("softdep_setup_directory_add: Existing mkdir");
6547 inodedep->id_mkdiradd = dap;
6548 } else if (inodedep->id_mkdiradd)
6549 merge_diradd(inodedep, dap);
6550 if (newdirblk) {
6551 /*
6552 * There is nothing to do if we are already tracking
6553 * this block.
6554 */
6555 if ((pagedep->pd_state & NEWBLOCK) != 0) {
6556 WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
6557 FREE_LOCK(&lk);
6558 return (0);
6559 }
6560 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
6561 == 0)
6562 panic("softdep_setup_directory_add: lost entry");
6563 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
6564 pagedep->pd_state |= NEWBLOCK;
6565 pagedep->pd_newdirblk = newdirblk;
6566 newdirblk->db_pagedep = pagedep;
6567 FREE_LOCK(&lk);
6568 /*
6569 * If we extended into an indirect block, signal direnter to sync.
6570 */
6571 if (isindir)
6572 return (1);
6573 return (0);
6574 }
6575 FREE_LOCK(&lk);
6576 return (0);
6577 }
6578
6579 /*
6580 * This procedure is called to change the offset of a directory
6581 * entry when compacting a directory block, which must be owned
6582 * exclusively by the caller. Note that the actual entry movement
6583 * must be done in this procedure to ensure that no I/O completions
6584 * occur while the move is in progress.
6585 */
6586 void
6587 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
6588 struct buf *bp; /* Buffer holding directory block. */
6589 struct inode *dp; /* inode for directory */
6590 caddr_t base; /* address of dp->i_offset */
6591 caddr_t oldloc; /* address of old directory location */
6592 caddr_t newloc; /* address of new directory location */
6593 int entrysize; /* size of directory entry */
6594 {
6595 int offset, oldoffset, newoffset;
6596 struct pagedep *pagedep;
6597 struct jmvref *jmvref;
6598 struct diradd *dap;
6599 struct direct *de;
6600 struct mount *mp;
6601 ufs_lbn_t lbn;
6602 int flags;
6603
6604 mp = UFSTOVFS(dp->i_ump);
6605 de = (struct direct *)oldloc;
6606 jmvref = NULL;
6607 flags = 0;
6608 /*
6609 * Moves are always journaled as it would be too complex to
6610 * determine if any affected adds or removes are present in the
6611 * journal.
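 * When journaling, the jmvref allocated below records the entry's inode
 * number along with its old and new offsets; it is attached to the pagedep
 * and added to the journal, and any diradd found at the old offset has its
 * recorded offset updated.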
6612 */ 6613 if (mp->mnt_kern_flag & MNTK_SUJ) { 6614 flags = DEPALLOC; 6615 jmvref = newjmvref(dp, de->d_ino, 6616 dp->i_offset + (oldloc - base), 6617 dp->i_offset + (newloc - base)); 6618 } 6619 lbn = lblkno(dp->i_fs, dp->i_offset); 6620 offset = blkoff(dp->i_fs, dp->i_offset); 6621 oldoffset = offset + (oldloc - base); 6622 newoffset = offset + (newloc - base); 6623 ACQUIRE_LOCK(&lk); 6624 if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) { 6625 if (pagedep) 6626 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 6627 goto done; 6628 } 6629 dap = diradd_lookup(pagedep, oldoffset); 6630 if (dap) { 6631 dap->da_offset = newoffset; 6632 newoffset = DIRADDHASH(newoffset); 6633 oldoffset = DIRADDHASH(oldoffset); 6634 if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && 6635 newoffset != oldoffset) { 6636 LIST_REMOVE(dap, da_pdlist); 6637 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], 6638 dap, da_pdlist); 6639 } 6640 } 6641 done: 6642 if (jmvref) { 6643 jmvref->jm_pagedep = pagedep; 6644 LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); 6645 add_to_journal(&jmvref->jm_list); 6646 } 6647 bcopy(oldloc, newloc, entrysize); 6648 FREE_LOCK(&lk); 6649 } 6650 6651 /* 6652 * Move the mkdir dependencies and journal work from one diradd to another 6653 * when renaming a directory. The new name must depend on the mkdir deps 6654 * completing as the old name did. Directories can only have one valid link 6655 * at a time so one must be canonical. 6656 */ 6657 static void 6658 merge_diradd(inodedep, newdap) 6659 struct inodedep *inodedep; 6660 struct diradd *newdap; 6661 { 6662 struct diradd *olddap; 6663 struct mkdir *mkdir, *nextmd; 6664 short state; 6665 6666 olddap = inodedep->id_mkdiradd; 6667 inodedep->id_mkdiradd = newdap; 6668 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 6669 newdap->da_state &= ~DEPCOMPLETE; 6670 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 6671 nextmd = LIST_NEXT(mkdir, md_mkdirs); 6672 if (mkdir->md_diradd != olddap) 6673 continue; 6674 mkdir->md_diradd = newdap; 6675 state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY); 6676 newdap->da_state |= state; 6677 olddap->da_state &= ~state; 6678 if ((olddap->da_state & 6679 (MKDIR_PARENT | MKDIR_BODY)) == 0) 6680 break; 6681 } 6682 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 6683 panic("merge_diradd: unfound ref"); 6684 } 6685 /* 6686 * Any mkdir related journal items are not safe to be freed until 6687 * the new name is stable. 6688 */ 6689 jwork_move(&newdap->da_jwork, &olddap->da_jwork); 6690 olddap->da_state |= DEPCOMPLETE; 6691 complete_diradd(olddap); 6692 } 6693 6694 /* 6695 * Move the diradd to the pending list when all diradd dependencies are 6696 * complete. 6697 */ 6698 static void 6699 complete_diradd(dap) 6700 struct diradd *dap; 6701 { 6702 struct pagedep *pagedep; 6703 6704 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 6705 if (dap->da_state & DIRCHG) 6706 pagedep = dap->da_previous->dm_pagedep; 6707 else 6708 pagedep = dap->da_pagedep; 6709 LIST_REMOVE(dap, da_pdlist); 6710 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 6711 } 6712 } 6713 6714 /* 6715 * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal 6716 * add entries and conditonally journal the remove. 
6717 */
6718 static void
6719 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
6720 struct diradd *dap;
6721 struct dirrem *dirrem;
6722 struct jremref *jremref;
6723 struct jremref *dotremref;
6724 struct jremref *dotdotremref;
6725 {
6726 struct inodedep *inodedep;
6727 struct jaddref *jaddref;
6728 struct inoref *inoref;
6729 struct mkdir *mkdir;
6730
6731 /*
6732 * If no remove references were allocated, we're on a non-journaled
6733 * filesystem and can skip the cancel step.
6734 */
6735 if (jremref == NULL) {
6736 free_diradd(dap, NULL);
6737 return;
6738 }
6739 /*
6740 * Cancel the primary name and free it if it does not require
6741 * journaling.
6742 */
6743 if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
6744 0, &inodedep) != 0) {
6745 /* Abort the addref that references this diradd. */
6746 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
6747 if (inoref->if_list.wk_type != D_JADDREF)
6748 continue;
6749 jaddref = (struct jaddref *)inoref;
6750 if (jaddref->ja_diradd != dap)
6751 continue;
6752 if (cancel_jaddref(jaddref, inodedep,
6753 &dirrem->dm_jwork) == 0) {
6754 free_jremref(jremref);
6755 jremref = NULL;
6756 }
6757 break;
6758 }
6759 }
6760 /*
6761 * Cancel subordinate names and free them if they do not require
6762 * journaling.
6763 */
6764 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
6765 LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
6766 if (mkdir->md_diradd != dap)
6767 continue;
6768 if ((jaddref = mkdir->md_jaddref) == NULL)
6769 continue;
6770 mkdir->md_jaddref = NULL;
6771 if (mkdir->md_state & MKDIR_PARENT) {
6772 if (cancel_jaddref(jaddref, NULL,
6773 &dirrem->dm_jwork) == 0) {
6774 free_jremref(dotdotremref);
6775 dotdotremref = NULL;
6776 }
6777 } else {
6778 if (cancel_jaddref(jaddref, inodedep,
6779 &dirrem->dm_jwork) == 0) {
6780 free_jremref(dotremref);
6781 dotremref = NULL;
6782 }
6783 }
6784 }
6785 }
6786
6787 if (jremref)
6788 journal_jremref(dirrem, jremref, inodedep);
6789 if (dotremref)
6790 journal_jremref(dirrem, dotremref, inodedep);
6791 if (dotdotremref)
6792 journal_jremref(dirrem, dotdotremref, NULL);
6793 jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
6794 free_diradd(dap, &dirrem->dm_jwork);
6795 }
6796
6797 /*
6798 * Free a diradd dependency structure. This routine must be called
6799 * with splbio interrupts blocked.
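 * In this implementation that means the softdep mutex (lk) must be held,
 * as asserted at the top of the routine.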
6800 */ 6801 static void 6802 free_diradd(dap, wkhd) 6803 struct diradd *dap; 6804 struct workhead *wkhd; 6805 { 6806 struct dirrem *dirrem; 6807 struct pagedep *pagedep; 6808 struct inodedep *inodedep; 6809 struct mkdir *mkdir, *nextmd; 6810 6811 mtx_assert(&lk, MA_OWNED); 6812 LIST_REMOVE(dap, da_pdlist); 6813 if (dap->da_state & ONWORKLIST) 6814 WORKLIST_REMOVE(&dap->da_list); 6815 if ((dap->da_state & DIRCHG) == 0) { 6816 pagedep = dap->da_pagedep; 6817 } else { 6818 dirrem = dap->da_previous; 6819 pagedep = dirrem->dm_pagedep; 6820 dirrem->dm_dirinum = pagedep->pd_ino; 6821 dirrem->dm_state |= COMPLETE; 6822 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 6823 add_to_worklist(&dirrem->dm_list, 0); 6824 } 6825 if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 6826 0, &inodedep) != 0) 6827 if (inodedep->id_mkdiradd == dap) 6828 inodedep->id_mkdiradd = NULL; 6829 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 6830 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 6831 nextmd = LIST_NEXT(mkdir, md_mkdirs); 6832 if (mkdir->md_diradd != dap) 6833 continue; 6834 dap->da_state &= 6835 ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 6836 LIST_REMOVE(mkdir, md_mkdirs); 6837 if (mkdir->md_state & ONWORKLIST) 6838 WORKLIST_REMOVE(&mkdir->md_list); 6839 if (mkdir->md_jaddref != NULL) 6840 panic("free_diradd: Unexpected jaddref"); 6841 WORKITEM_FREE(mkdir, D_MKDIR); 6842 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) 6843 break; 6844 } 6845 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 6846 panic("free_diradd: unfound ref"); 6847 } 6848 if (inodedep) 6849 free_inodedep(inodedep); 6850 /* 6851 * Free any journal segments waiting for the directory write. 6852 */ 6853 handle_jwork(&dap->da_jwork); 6854 WORKITEM_FREE(dap, D_DIRADD); 6855 } 6856 6857 /* 6858 * Directory entry removal dependencies. 6859 * 6860 * When removing a directory entry, the entry's inode pointer must be 6861 * zero'ed on disk before the corresponding inode's link count is decremented 6862 * (possibly freeing the inode for re-use). This dependency is handled by 6863 * updating the directory entry but delaying the inode count reduction until 6864 * after the directory block has been written to disk. After this point, the 6865 * inode count can be decremented whenever it is convenient. 6866 */ 6867 6868 /* 6869 * This routine should be called immediately after removing 6870 * a directory entry. The inode's link count should not be 6871 * decremented by the calling procedure -- the soft updates 6872 * code will do this task when it is safe. 6873 */ 6874 void 6875 softdep_setup_remove(bp, dp, ip, isrmdir) 6876 struct buf *bp; /* buffer containing directory block */ 6877 struct inode *dp; /* inode for the directory being modified */ 6878 struct inode *ip; /* inode for directory entry being removed */ 6879 int isrmdir; /* indicates if doing RMDIR */ 6880 { 6881 struct dirrem *dirrem, *prevdirrem; 6882 struct inodedep *inodedep; 6883 int direct; 6884 6885 /* 6886 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want 6887 * newdirrem() to setup the full directory remove which requires 6888 * isrmdir > 1. 6889 */ 6890 dirrem = newdirrem(bp, dp, ip, isrmdir?2:0, &prevdirrem); 6891 /* 6892 * Add the dirrem to the inodedep's pending remove list for quick 6893 * discovery later. 
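 * Note that newdirrem() above returned with lk held, which is why both
 * branches below finish with FREE_LOCK().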
6894 */ 6895 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 6896 &inodedep) == 0) 6897 panic("softdep_setup_remove: Lost inodedep."); 6898 dirrem->dm_state |= ONDEPLIST; 6899 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 6900 6901 /* 6902 * If the COMPLETE flag is clear, then there were no active 6903 * entries and we want to roll back to a zeroed entry until 6904 * the new inode is committed to disk. If the COMPLETE flag is 6905 * set then we have deleted an entry that never made it to 6906 * disk. If the entry we deleted resulted from a name change, 6907 * then the old name still resides on disk. We cannot delete 6908 * its inode (returned to us in prevdirrem) until the zeroed 6909 * directory entry gets to disk. The new inode has never been 6910 * referenced on the disk, so can be deleted immediately. 6911 */ 6912 if ((dirrem->dm_state & COMPLETE) == 0) { 6913 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, 6914 dm_next); 6915 FREE_LOCK(&lk); 6916 } else { 6917 if (prevdirrem != NULL) 6918 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, 6919 prevdirrem, dm_next); 6920 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; 6921 direct = LIST_EMPTY(&dirrem->dm_jremrefhd); 6922 FREE_LOCK(&lk); 6923 if (direct) 6924 handle_workitem_remove(dirrem, NULL); 6925 } 6926 } 6927 6928 /* 6929 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the 6930 * pd_pendinghd list of a pagedep. 6931 */ 6932 static struct diradd * 6933 diradd_lookup(pagedep, offset) 6934 struct pagedep *pagedep; 6935 int offset; 6936 { 6937 struct diradd *dap; 6938 6939 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) 6940 if (dap->da_offset == offset) 6941 return (dap); 6942 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 6943 if (dap->da_offset == offset) 6944 return (dap); 6945 return (NULL); 6946 } 6947 6948 /* 6949 * Search for a .. diradd dependency in a directory that is being removed. 6950 * If the directory was renamed to a new parent we have a diradd rather 6951 * than a mkdir for the .. entry. We need to cancel it now before 6952 * it is found in truncate(). 6953 */ 6954 static struct jremref * 6955 cancel_diradd_dotdot(ip, dirrem, jremref) 6956 struct inode *ip; 6957 struct dirrem *dirrem; 6958 struct jremref *jremref; 6959 { 6960 struct pagedep *pagedep; 6961 struct diradd *dap; 6962 struct worklist *wk; 6963 6964 if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0, 6965 &pagedep) == 0) 6966 return (jremref); 6967 dap = diradd_lookup(pagedep, DOTDOT_OFFSET); 6968 if (dap == NULL) 6969 return (jremref); 6970 cancel_diradd(dap, dirrem, jremref, NULL, NULL); 6971 /* 6972 * Mark any journal work as belonging to the parent so it is freed 6973 * with the .. reference. 6974 */ 6975 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 6976 wk->wk_state |= MKDIR_PARENT; 6977 return (NULL); 6978 } 6979 6980 /* 6981 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to 6982 * replace it with a dirrem/diradd pair as a result of re-parenting a 6983 * directory. This ensures that we don't simultaneously have a mkdir and 6984 * a diradd for the same .. entry. 
6985 */ 6986 static struct jremref * 6987 cancel_mkdir_dotdot(ip, dirrem, jremref) 6988 struct inode *ip; 6989 struct dirrem *dirrem; 6990 struct jremref *jremref; 6991 { 6992 struct inodedep *inodedep; 6993 struct jaddref *jaddref; 6994 struct mkdir *mkdir; 6995 struct diradd *dap; 6996 6997 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 6998 &inodedep) == 0) 6999 panic("cancel_mkdir_dotdot: Lost inodedep"); 7000 dap = inodedep->id_mkdiradd; 7001 if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) 7002 return (jremref); 7003 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; 7004 mkdir = LIST_NEXT(mkdir, md_mkdirs)) 7005 if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) 7006 break; 7007 if (mkdir == NULL) 7008 panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); 7009 if ((jaddref = mkdir->md_jaddref) != NULL) { 7010 mkdir->md_jaddref = NULL; 7011 jaddref->ja_state &= ~MKDIR_PARENT; 7012 if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0, 7013 &inodedep) == 0) 7014 panic("cancel_mkdir_dotdot: Lost parent inodedep"); 7015 if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { 7016 journal_jremref(dirrem, jremref, inodedep); 7017 jremref = NULL; 7018 } 7019 } 7020 if (mkdir->md_state & ONWORKLIST) 7021 WORKLIST_REMOVE(&mkdir->md_list); 7022 mkdir->md_state |= ALLCOMPLETE; 7023 complete_mkdir(mkdir); 7024 return (jremref); 7025 } 7026 7027 static void 7028 journal_jremref(dirrem, jremref, inodedep) 7029 struct dirrem *dirrem; 7030 struct jremref *jremref; 7031 struct inodedep *inodedep; 7032 { 7033 7034 if (inodedep == NULL) 7035 if (inodedep_lookup(jremref->jr_list.wk_mp, 7036 jremref->jr_ref.if_ino, 0, &inodedep) == 0) 7037 panic("journal_jremref: Lost inodedep"); 7038 LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); 7039 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 7040 add_to_journal(&jremref->jr_list); 7041 } 7042 7043 static void 7044 dirrem_journal(dirrem, jremref, dotremref, dotdotremref) 7045 struct dirrem *dirrem; 7046 struct jremref *jremref; 7047 struct jremref *dotremref; 7048 struct jremref *dotdotremref; 7049 { 7050 struct inodedep *inodedep; 7051 7052 7053 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, 7054 &inodedep) == 0) 7055 panic("dirrem_journal: Lost inodedep"); 7056 journal_jremref(dirrem, jremref, inodedep); 7057 if (dotremref) 7058 journal_jremref(dirrem, dotremref, inodedep); 7059 if (dotdotremref) 7060 journal_jremref(dirrem, dotdotremref, NULL); 7061 } 7062 7063 /* 7064 * Allocate a new dirrem if appropriate and return it along with 7065 * its associated pagedep. Called without a lock, returns with lock. 7066 */ 7067 static long num_dirrem; /* number of dirrem allocated */ 7068 static struct dirrem * 7069 newdirrem(bp, dp, ip, isrmdir, prevdirremp) 7070 struct buf *bp; /* buffer containing directory block */ 7071 struct inode *dp; /* inode for the directory being modified */ 7072 struct inode *ip; /* inode for directory entry being removed */ 7073 int isrmdir; /* indicates if doing RMDIR */ 7074 struct dirrem **prevdirremp; /* previously referenced inode, if any */ 7075 { 7076 int offset; 7077 ufs_lbn_t lbn; 7078 struct diradd *dap; 7079 struct dirrem *dirrem; 7080 struct pagedep *pagedep; 7081 struct jremref *jremref; 7082 struct jremref *dotremref; 7083 struct jremref *dotdotremref; 7084 struct vnode *dvp; 7085 7086 /* 7087 * Whiteouts have no deletion dependencies. 
7088 */ 7089 if (ip == NULL) 7090 panic("newdirrem: whiteout"); 7091 dvp = ITOV(dp); 7092 /* 7093 * If we are over our limit, try to improve the situation. 7094 * Limiting the number of dirrem structures will also limit 7095 * the number of freefile and freeblks structures. 7096 */ 7097 ACQUIRE_LOCK(&lk); 7098 if (!(ip->i_flags & SF_SNAPSHOT) && num_dirrem > max_softdeps / 2) 7099 (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE); 7100 num_dirrem += 1; 7101 FREE_LOCK(&lk); 7102 dirrem = malloc(sizeof(struct dirrem), 7103 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); 7104 workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); 7105 LIST_INIT(&dirrem->dm_jremrefhd); 7106 LIST_INIT(&dirrem->dm_jwork); 7107 dirrem->dm_state = isrmdir ? RMDIR : 0; 7108 dirrem->dm_oldinum = ip->i_number; 7109 *prevdirremp = NULL; 7110 /* 7111 * Allocate remove reference structures to track journal write 7112 * dependencies. We will always have one for the link and 7113 * when doing directories we will always have one more for dot. 7114 * When renaming a directory we skip the dotdot link change so 7115 * this is not needed. 7116 */ 7117 jremref = dotremref = dotdotremref = NULL; 7118 if (DOINGSUJ(dvp)) { 7119 if (isrmdir) { 7120 jremref = newjremref(dirrem, dp, ip, dp->i_offset, 7121 ip->i_effnlink + 2); 7122 dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, 7123 ip->i_effnlink + 1); 7124 } else 7125 jremref = newjremref(dirrem, dp, ip, dp->i_offset, 7126 ip->i_effnlink + 1); 7127 if (isrmdir > 1) { 7128 dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, 7129 dp->i_effnlink + 1); 7130 dotdotremref->jr_state |= MKDIR_PARENT; 7131 } 7132 } 7133 ACQUIRE_LOCK(&lk); 7134 lbn = lblkno(dp->i_fs, dp->i_offset); 7135 offset = blkoff(dp->i_fs, dp->i_offset); 7136 if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC, 7137 &pagedep) == 0) 7138 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 7139 dirrem->dm_pagedep = pagedep; 7140 /* 7141 * If we're renaming a .. link to a new directory, cancel any 7142 * existing MKDIR_PARENT mkdir. If it has already been canceled 7143 * the jremref is preserved for any potential diradd in this 7144 * location. This can not coincide with a rmdir. 7145 */ 7146 if (dp->i_offset == DOTDOT_OFFSET) { 7147 if (isrmdir) 7148 panic("newdirrem: .. directory change during remove?"); 7149 jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); 7150 } 7151 /* 7152 * If we're removing a directory search for the .. dependency now and 7153 * cancel it. Any pending journal work will be added to the dirrem 7154 * to be completed when the workitem remove completes. 7155 */ 7156 if (isrmdir > 1) 7157 dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); 7158 /* 7159 * Check for a diradd dependency for the same directory entry. 7160 * If present, then both dependencies become obsolete and can 7161 * be de-allocated. 7162 */ 7163 dap = diradd_lookup(pagedep, offset); 7164 if (dap == NULL) { 7165 /* 7166 * Link the jremref structures into the dirrem so they are 7167 * written prior to the pagedep. 7168 */ 7169 if (jremref) 7170 dirrem_journal(dirrem, jremref, dotremref, 7171 dotdotremref); 7172 return (dirrem); 7173 } 7174 /* 7175 * Must be ATTACHED at this point. 
7176 */ 7177 if ((dap->da_state & ATTACHED) == 0) 7178 panic("newdirrem: not ATTACHED"); 7179 if (dap->da_newinum != ip->i_number) 7180 panic("newdirrem: inum %d should be %d", 7181 ip->i_number, dap->da_newinum); 7182 /* 7183 * If we are deleting a changed name that never made it to disk, 7184 * then return the dirrem describing the previous inode (which 7185 * represents the inode currently referenced from this entry on disk). 7186 */ 7187 if ((dap->da_state & DIRCHG) != 0) { 7188 *prevdirremp = dap->da_previous; 7189 dap->da_state &= ~DIRCHG; 7190 dap->da_pagedep = pagedep; 7191 } 7192 /* 7193 * We are deleting an entry that never made it to disk. 7194 * Mark it COMPLETE so we can delete its inode immediately. 7195 */ 7196 dirrem->dm_state |= COMPLETE; 7197 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); 7198 #ifdef SUJ_DEBUG 7199 if (isrmdir == 0) { 7200 struct worklist *wk; 7201 7202 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 7203 if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) 7204 panic("bad wk %p (0x%X)\n", wk, wk->wk_state); 7205 } 7206 #endif 7207 7208 return (dirrem); 7209 } 7210 7211 /* 7212 * Directory entry change dependencies. 7213 * 7214 * Changing an existing directory entry requires that an add operation 7215 * be completed first followed by a deletion. The semantics for the addition 7216 * are identical to the description of adding a new entry above except 7217 * that the rollback is to the old inode number rather than zero. Once 7218 * the addition dependency is completed, the removal is done as described 7219 * in the removal routine above. 7220 */ 7221 7222 /* 7223 * This routine should be called immediately after changing 7224 * a directory entry. The inode's link count should not be 7225 * decremented by the calling procedure -- the soft updates 7226 * code will perform this task when it is safe. 7227 */ 7228 void 7229 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 7230 struct buf *bp; /* buffer containing directory block */ 7231 struct inode *dp; /* inode for the directory being modified */ 7232 struct inode *ip; /* inode for directory entry being removed */ 7233 ino_t newinum; /* new inode number for changed entry */ 7234 int isrmdir; /* indicates if doing RMDIR */ 7235 { 7236 int offset; 7237 struct diradd *dap = NULL; 7238 struct dirrem *dirrem, *prevdirrem; 7239 struct pagedep *pagedep; 7240 struct inodedep *inodedep; 7241 struct jaddref *jaddref; 7242 struct mount *mp; 7243 7244 offset = blkoff(dp->i_fs, dp->i_offset); 7245 mp = UFSTOVFS(dp->i_ump); 7246 7247 /* 7248 * Whiteouts do not need diradd dependencies. 7249 */ 7250 if (newinum != WINO) { 7251 dap = malloc(sizeof(struct diradd), 7252 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); 7253 workitem_alloc(&dap->da_list, D_DIRADD, mp); 7254 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; 7255 dap->da_offset = offset; 7256 dap->da_newinum = newinum; 7257 LIST_INIT(&dap->da_jwork); 7258 } 7259 7260 /* 7261 * Allocate a new dirrem and ACQUIRE_LOCK. 7262 */ 7263 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 7264 pagedep = dirrem->dm_pagedep; 7265 /* 7266 * The possible values for isrmdir: 7267 * 0 - non-directory file rename 7268 * 1 - directory rename within same directory 7269 * inum - directory rename to new directory of given inode number 7270 * When renaming to a new directory, we are both deleting and 7271 * creating a new directory entry, so the link count on the new 7272 * directory should not change. 
Thus we do not need the followup
	 * dirrem which is usually done in handle_workitem_remove. We set
	 * the DIRCHG flag to tell handle_workitem_remove to skip the
	 * followup dirrem.
	 */
	if (isrmdir > 1)
		dirrem->dm_state |= DIRCHG;

	/*
	 * Whiteouts have no additional dependencies,
	 * so just put the dirrem on the correct list.
	 */
	if (newinum == WINO) {
		if ((dirrem->dm_state & COMPLETE) == 0) {
			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
			    dm_next);
		} else {
			dirrem->dm_dirinum = pagedep->pd_ino;
			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
				add_to_worklist(&dirrem->dm_list, 0);
		}
		FREE_LOCK(&lk);
		return;
	}
	/*
	 * Add the dirrem to the inodedep's pending remove list for quick
	 * discovery later. A valid nlinkdelta ensures that this lookup
	 * will not fail.
	 */
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
		panic("softdep_setup_directory_change: Lost inodedep.");
	dirrem->dm_state |= ONDEPLIST;
	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);

	/*
	 * If the COMPLETE flag is clear, then there were no active
	 * entries and we want to roll back to the previous inode until
	 * the new inode is committed to disk. If the COMPLETE flag is
	 * set, then we have deleted an entry that never made it to disk.
	 * If the entry we deleted resulted from a name change, then the old
	 * inode reference still resides on disk. Any rollback that we do
	 * needs to be to that old inode (returned to us in prevdirrem). If
	 * the entry we deleted resulted from a create, then there is
	 * no entry on the disk, so we want to roll back to zero rather
	 * than the uncommitted inode. In either of the COMPLETE cases we
	 * want to immediately free the unwritten and unreferenced inode.
	 */
	if ((dirrem->dm_state & COMPLETE) == 0) {
		dap->da_previous = dirrem;
	} else {
		if (prevdirrem != NULL) {
			dap->da_previous = prevdirrem;
		} else {
			dap->da_state &= ~DIRCHG;
			dap->da_pagedep = pagedep;
		}
		dirrem->dm_dirinum = pagedep->pd_ino;
		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
			add_to_worklist(&dirrem->dm_list, 0);
	}
	/*
	 * Lookup the jaddref for this journal entry. We must finish
	 * initializing it and make the diradd write dependent on it.
	 * If we're not journaling, put it on the id_bufwait list if the
	 * inode is not yet written. If it is written, do the post-inode
	 * write processing to put it on the id_pendinghd list.
	 */
	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
	if (mp->mnt_kern_flag & MNTK_SUJ) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
		    ("softdep_setup_directory_change: bad jaddref %p",
		    jaddref));
		jaddref->ja_diroff = dp->i_offset;
		jaddref->ja_diradd = dap;
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
		    dap, da_pdlist);
		add_to_journal(&jaddref->ja_list);
	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
		dap->da_state |= COMPLETE;
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	} else {
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
		    dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	}
	/*
	 * If we're making a new name for a directory that has not been
	 * committed, we need to move the dot and dotdot references to
	 * this new name.
	 */
	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
		merge_diradd(inodedep, dap);
	FREE_LOCK(&lk);
}

/*
 * Called whenever the link count on an inode is changed.
 * It creates an inode dependency so that the new reference(s)
 * to the inode cannot be committed to disk until the updated
 * inode has been written.
 */
void
softdep_change_linkcnt(ip)
	struct inode *ip;	/* the inode with the increased link count */
{
	struct inodedep *inodedep;

	ACQUIRE_LOCK(&lk);
	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
	if (ip->i_nlink < ip->i_effnlink)
		panic("softdep_change_linkcnt: bad delta");
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	FREE_LOCK(&lk);
}

/*
 * Attach a sbdep dependency to the superblock buf so that we can keep
 * track of the head of the linked list of referenced but unlinked inodes.
 */
void
softdep_setup_sbupdate(ump, fs, bp)
	struct ufsmount *ump;
	struct fs *fs;
	struct buf *bp;
{
	struct sbdep *sbdep;
	struct worklist *wk;

	if ((fs->fs_flags & FS_SUJ) == 0)
		return;
	LIST_FOREACH(wk, &bp->b_dep, wk_list)
		if (wk->wk_type == D_SBDEP)
			break;
	if (wk != NULL)
		return;
	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
	sbdep->sb_fs = fs;
	sbdep->sb_ump = ump;
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
	FREE_LOCK(&lk);
}

/*
 * Return the first unlinked inodedep which is ready to be the head of the
 * list. The inodedep and all those after it must have valid next pointers.
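 *
 * Rough illustration (descriptive only, not kernel code): an inodedep on
 * the unlinked list gains UNLINKNEXT once its on-disk forward link
 * (di_freelink) has been written, and UNLINKPREV once the on-disk link
 * pointing at it (a predecessor's di_freelink or the superblock's
 * fs_sujfree) has been written.  The loop below walks backward from the
 * tail and stops at the first element whose predecessor does not yet have
 * UNLINKNEXT set, i.e. the earliest element that can safely be published
 * as the new on-disk list head.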
7422 */ 7423 static struct inodedep * 7424 first_unlinked_inodedep(ump) 7425 struct ufsmount *ump; 7426 { 7427 struct inodedep *inodedep; 7428 struct inodedep *idp; 7429 7430 for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); 7431 inodedep; inodedep = idp) { 7432 if ((inodedep->id_state & UNLINKNEXT) == 0) 7433 return (NULL); 7434 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 7435 if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) 7436 break; 7437 if ((inodedep->id_state & UNLINKPREV) == 0) 7438 panic("first_unlinked_inodedep: prev != next"); 7439 } 7440 if (inodedep == NULL) 7441 return (NULL); 7442 7443 return (inodedep); 7444 } 7445 7446 /* 7447 * Set the sujfree unlinked head pointer prior to writing a superblock. 7448 */ 7449 static void 7450 initiate_write_sbdep(sbdep) 7451 struct sbdep *sbdep; 7452 { 7453 struct inodedep *inodedep; 7454 struct fs *bpfs; 7455 struct fs *fs; 7456 7457 bpfs = sbdep->sb_fs; 7458 fs = sbdep->sb_ump->um_fs; 7459 inodedep = first_unlinked_inodedep(sbdep->sb_ump); 7460 if (inodedep) { 7461 fs->fs_sujfree = inodedep->id_ino; 7462 inodedep->id_state |= UNLINKPREV; 7463 } else 7464 fs->fs_sujfree = 0; 7465 bpfs->fs_sujfree = fs->fs_sujfree; 7466 } 7467 7468 /* 7469 * After a superblock is written determine whether it must be written again 7470 * due to a changing unlinked list head. 7471 */ 7472 static int 7473 handle_written_sbdep(sbdep, bp) 7474 struct sbdep *sbdep; 7475 struct buf *bp; 7476 { 7477 struct inodedep *inodedep; 7478 struct mount *mp; 7479 struct fs *fs; 7480 7481 fs = sbdep->sb_fs; 7482 mp = UFSTOVFS(sbdep->sb_ump); 7483 inodedep = first_unlinked_inodedep(sbdep->sb_ump); 7484 if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || 7485 (inodedep == NULL && fs->fs_sujfree != 0)) { 7486 bdirty(bp); 7487 return (1); 7488 } 7489 WORKITEM_FREE(sbdep, D_SBDEP); 7490 if (fs->fs_sujfree == 0) 7491 return (0); 7492 if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0) 7493 panic("handle_written_sbdep: lost inodedep"); 7494 /* 7495 * Now that we have a record of this indode in stable store allow it 7496 * to be written to free up pending work. Inodes may see a lot of 7497 * write activity after they are unlinked which we must not hold up. 7498 */ 7499 for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { 7500 if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) 7501 panic("handle_written_sbdep: Bad inodedep %p (0x%X)", 7502 inodedep, inodedep->id_state); 7503 if (inodedep->id_state & UNLINKONLIST) 7504 break; 7505 inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST; 7506 } 7507 7508 return (0); 7509 } 7510 7511 /* 7512 * Mark an inodedep has unlinked and insert it into the in-memory unlinked 7513 * list. 7514 */ 7515 static void 7516 unlinked_inodedep(mp, inodedep) 7517 struct mount *mp; 7518 struct inodedep *inodedep; 7519 { 7520 struct ufsmount *ump; 7521 7522 if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) 7523 return; 7524 ump = VFSTOUFS(mp); 7525 ump->um_fs->fs_fmod = 1; 7526 inodedep->id_state |= UNLINKED; 7527 TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); 7528 } 7529 7530 /* 7531 * Remove an inodedep from the unlinked inodedep list. This may require 7532 * disk writes if the inode has made it that far. 
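 *
 * For illustration (a simplified sketch, not kernel code): on disk the
 * unlinked-but-referenced inodes form a singly linked list headed by the
 * superblock's fs_sujfree field and chained through each dinode's
 * di_freelink field:
 *
 *	fs_sujfree -> ino A -> ino B -> ino C -> 0
 *
 * Removing B once it has reached the disk is like deleting from a singly
 * linked list: the predecessor must be rewritten so that A's di_freelink
 * points at C (or the superblock rewritten when B is the head), e.g.
 *
 *	prev->di_freelink = cur->di_freelink;	(then write prev's block)
 *
 * which is what the loop below does with bread()/bwrite() on either the
 * predecessor's inode block or the superblock.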
7533 */ 7534 static void 7535 clear_unlinked_inodedep(inodedep) 7536 struct inodedep *inodedep; 7537 { 7538 struct ufsmount *ump; 7539 struct inodedep *idp; 7540 struct inodedep *idn; 7541 struct fs *fs; 7542 struct buf *bp; 7543 ino_t ino; 7544 ino_t nino; 7545 ino_t pino; 7546 int error; 7547 7548 ump = VFSTOUFS(inodedep->id_list.wk_mp); 7549 fs = ump->um_fs; 7550 ino = inodedep->id_ino; 7551 error = 0; 7552 for (;;) { 7553 /* 7554 * If nothing has yet been written simply remove us from 7555 * the in memory list and return. This is the most common 7556 * case where handle_workitem_remove() loses the final 7557 * reference. 7558 */ 7559 if ((inodedep->id_state & UNLINKLINKS) == 0) 7560 break; 7561 /* 7562 * If we have a NEXT pointer and no PREV pointer we can simply 7563 * clear NEXT's PREV and remove ourselves from the list. Be 7564 * careful not to clear PREV if the superblock points at 7565 * next as well. 7566 */ 7567 idn = TAILQ_NEXT(inodedep, id_unlinked); 7568 if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { 7569 if (idn && fs->fs_sujfree != idn->id_ino) 7570 idn->id_state &= ~UNLINKPREV; 7571 break; 7572 } 7573 /* 7574 * Here we have an inodedep which is actually linked into 7575 * the list. We must remove it by forcing a write to the 7576 * link before us, whether it be the superblock or an inode. 7577 * Unfortunately the list may change while we're waiting 7578 * on the buf lock for either resource so we must loop until 7579 * we lock. the right one. If both the superblock and an 7580 * inode point to this inode we must clear the inode first 7581 * followed by the superblock. 7582 */ 7583 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 7584 pino = 0; 7585 if (idp && (idp->id_state & UNLINKNEXT)) 7586 pino = idp->id_ino; 7587 FREE_LOCK(&lk); 7588 if (pino == 0) 7589 bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), 7590 (int)fs->fs_sbsize, 0, 0, 0); 7591 else 7592 error = bread(ump->um_devvp, 7593 fsbtodb(fs, ino_to_fsba(fs, pino)), 7594 (int)fs->fs_bsize, NOCRED, &bp); 7595 ACQUIRE_LOCK(&lk); 7596 if (error) 7597 break; 7598 /* If the list has changed restart the loop. */ 7599 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 7600 nino = 0; 7601 if (idp && (idp->id_state & UNLINKNEXT)) 7602 nino = idp->id_ino; 7603 if (nino != pino || 7604 (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { 7605 FREE_LOCK(&lk); 7606 brelse(bp); 7607 ACQUIRE_LOCK(&lk); 7608 continue; 7609 } 7610 /* 7611 * Remove us from the in memory list. After this we cannot 7612 * access the inodedep. 7613 */ 7614 idn = TAILQ_NEXT(inodedep, id_unlinked); 7615 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); 7616 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); 7617 /* 7618 * Determine the next inode number. 7619 */ 7620 nino = 0; 7621 if (idn) { 7622 /* 7623 * If next isn't on the list we can just clear prev's 7624 * state and schedule it to be fixed later. No need 7625 * to synchronously write if we're not in the real 7626 * list. 7627 */ 7628 if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) { 7629 idp->id_state &= ~UNLINKNEXT; 7630 if ((idp->id_state & ONWORKLIST) == 0) 7631 WORKLIST_INSERT(&bp->b_dep, 7632 &idp->id_list); 7633 FREE_LOCK(&lk); 7634 bawrite(bp); 7635 ACQUIRE_LOCK(&lk); 7636 return; 7637 } 7638 nino = idn->id_ino; 7639 } 7640 FREE_LOCK(&lk); 7641 /* 7642 * The predecessor's next pointer is manually updated here 7643 * so that the NEXT flag is never cleared for an element 7644 * that is in the list. 
7645 */ 7646 if (pino == 0) { 7647 bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); 7648 ffs_oldfscompat_write((struct fs *)bp->b_data, ump); 7649 softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, 7650 bp); 7651 } else if (fs->fs_magic == FS_UFS1_MAGIC) 7652 ((struct ufs1_dinode *)bp->b_data + 7653 ino_to_fsbo(fs, pino))->di_freelink = nino; 7654 else 7655 ((struct ufs2_dinode *)bp->b_data + 7656 ino_to_fsbo(fs, pino))->di_freelink = nino; 7657 /* 7658 * If the bwrite fails we have no recourse to recover. The 7659 * filesystem is corrupted already. 7660 */ 7661 bwrite(bp); 7662 ACQUIRE_LOCK(&lk); 7663 /* 7664 * If the superblock pointer still needs to be cleared force 7665 * a write here. 7666 */ 7667 if (fs->fs_sujfree == ino) { 7668 FREE_LOCK(&lk); 7669 bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), 7670 (int)fs->fs_sbsize, 0, 0, 0); 7671 bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); 7672 ffs_oldfscompat_write((struct fs *)bp->b_data, ump); 7673 softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, 7674 bp); 7675 bwrite(bp); 7676 ACQUIRE_LOCK(&lk); 7677 } 7678 if (fs->fs_sujfree != ino) 7679 return; 7680 panic("clear_unlinked_inodedep: Failed to clear free head"); 7681 } 7682 if (inodedep->id_ino == fs->fs_sujfree) 7683 panic("clear_unlinked_inodedep: Freeing head of free list"); 7684 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); 7685 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); 7686 return; 7687 } 7688 7689 /* 7690 * This workitem decrements the inode's link count. 7691 * If the link count reaches zero, the file is removed. 7692 */ 7693 static void 7694 handle_workitem_remove(dirrem, xp) 7695 struct dirrem *dirrem; 7696 struct vnode *xp; 7697 { 7698 struct inodedep *inodedep; 7699 struct workhead dotdotwk; 7700 struct worklist *wk; 7701 struct ufsmount *ump; 7702 struct mount *mp; 7703 struct vnode *vp; 7704 struct inode *ip; 7705 ino_t oldinum; 7706 int error; 7707 7708 if (dirrem->dm_state & ONWORKLIST) 7709 panic("handle_workitem_remove: dirrem %p still on worklist", 7710 dirrem); 7711 oldinum = dirrem->dm_oldinum; 7712 mp = dirrem->dm_list.wk_mp; 7713 ump = VFSTOUFS(mp); 7714 if ((vp = xp) == NULL && 7715 (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp, 7716 FFSV_FORCEINSMQ)) != 0) { 7717 softdep_error("handle_workitem_remove: vget", error); 7718 return; 7719 } 7720 ip = VTOI(vp); 7721 ACQUIRE_LOCK(&lk); 7722 if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) 7723 panic("handle_workitem_remove: lost inodedep"); 7724 if (dirrem->dm_state & ONDEPLIST) 7725 LIST_REMOVE(dirrem, dm_inonext); 7726 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), 7727 ("handle_workitem_remove: Journal entries not written.")); 7728 7729 /* 7730 * Move all dependencies waiting on the remove to complete 7731 * from the dirrem to the inode inowait list to be completed 7732 * after the inode has been updated and written to disk. Any 7733 * marked MKDIR_PARENT are saved to be completed when the .. ref 7734 * is removed. 7735 */ 7736 LIST_INIT(&dotdotwk); 7737 while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { 7738 WORKLIST_REMOVE(wk); 7739 if (wk->wk_state & MKDIR_PARENT) { 7740 wk->wk_state &= ~MKDIR_PARENT; 7741 WORKLIST_INSERT(&dotdotwk, wk); 7742 continue; 7743 } 7744 WORKLIST_INSERT(&inodedep->id_inowait, wk); 7745 } 7746 LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); 7747 /* 7748 * Normal file deletion. 
7749 */ 7750 if ((dirrem->dm_state & RMDIR) == 0) { 7751 ip->i_nlink--; 7752 DIP_SET(ip, i_nlink, ip->i_nlink); 7753 ip->i_flag |= IN_CHANGE; 7754 if (ip->i_nlink < ip->i_effnlink) 7755 panic("handle_workitem_remove: bad file delta"); 7756 if (ip->i_nlink == 0) 7757 unlinked_inodedep(mp, inodedep); 7758 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 7759 num_dirrem -= 1; 7760 KASSERT(LIST_EMPTY(&dirrem->dm_jwork), 7761 ("handle_workitem_remove: worklist not empty. %s", 7762 TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); 7763 WORKITEM_FREE(dirrem, D_DIRREM); 7764 FREE_LOCK(&lk); 7765 goto out; 7766 } 7767 /* 7768 * Directory deletion. Decrement reference count for both the 7769 * just deleted parent directory entry and the reference for ".". 7770 * Arrange to have the reference count on the parent decremented 7771 * to account for the loss of "..". 7772 */ 7773 ip->i_nlink -= 2; 7774 DIP_SET(ip, i_nlink, ip->i_nlink); 7775 ip->i_flag |= IN_CHANGE; 7776 if (ip->i_nlink < ip->i_effnlink) 7777 panic("handle_workitem_remove: bad dir delta"); 7778 if (ip->i_nlink == 0) 7779 unlinked_inodedep(mp, inodedep); 7780 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 7781 /* 7782 * Rename a directory to a new parent. Since, we are both deleting 7783 * and creating a new directory entry, the link count on the new 7784 * directory should not change. Thus we skip the followup dirrem. 7785 */ 7786 if (dirrem->dm_state & DIRCHG) { 7787 KASSERT(LIST_EMPTY(&dirrem->dm_jwork), 7788 ("handle_workitem_remove: DIRCHG and worklist not empty.")); 7789 num_dirrem -= 1; 7790 WORKITEM_FREE(dirrem, D_DIRREM); 7791 FREE_LOCK(&lk); 7792 goto out; 7793 } 7794 dirrem->dm_state = ONDEPLIST; 7795 dirrem->dm_oldinum = dirrem->dm_dirinum; 7796 /* 7797 * Place the dirrem on the parent's diremhd list. 7798 */ 7799 if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) 7800 panic("handle_workitem_remove: lost dir inodedep"); 7801 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 7802 /* 7803 * If the allocated inode has never been written to disk, then 7804 * the on-disk inode is zero'ed and we can remove the file 7805 * immediately. When journaling if the inode has been marked 7806 * unlinked and not DEPCOMPLETE we know it can never be written. 7807 */ 7808 inodedep_lookup(mp, oldinum, 0, &inodedep); 7809 if (inodedep == NULL || 7810 (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || 7811 check_inode_unwritten(inodedep)) { 7812 if (xp != NULL) 7813 add_to_worklist(&dirrem->dm_list, 0); 7814 FREE_LOCK(&lk); 7815 if (xp == NULL) { 7816 vput(vp); 7817 handle_workitem_remove(dirrem, NULL); 7818 } 7819 return; 7820 } 7821 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); 7822 FREE_LOCK(&lk); 7823 ip->i_flag |= IN_CHANGE; 7824 out: 7825 ffs_update(vp, 0); 7826 if (xp == NULL) 7827 vput(vp); 7828 } 7829 7830 /* 7831 * Inode de-allocation dependencies. 7832 * 7833 * When an inode's link count is reduced to zero, it can be de-allocated. We 7834 * found it convenient to postpone de-allocation until after the inode is 7835 * written to disk with its new link count (zero). At this point, all of the 7836 * on-disk inode's block pointers are nullified and, with careful dependency 7837 * list ordering, all dependencies related to the inode will be satisfied and 7838 * the corresponding dependency structures de-allocated. So, if/when the 7839 * inode is reused, there will be no mixing of old dependencies with new 7840 * ones. 
This artificial dependency is set up by the block de-allocation 7841 * procedure above (softdep_setup_freeblocks) and completed by the 7842 * following procedure. 7843 */ 7844 static void 7845 handle_workitem_freefile(freefile) 7846 struct freefile *freefile; 7847 { 7848 struct workhead wkhd; 7849 struct fs *fs; 7850 struct inodedep *idp; 7851 struct ufsmount *ump; 7852 int error; 7853 7854 ump = VFSTOUFS(freefile->fx_list.wk_mp); 7855 fs = ump->um_fs; 7856 #ifdef DEBUG 7857 ACQUIRE_LOCK(&lk); 7858 error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); 7859 FREE_LOCK(&lk); 7860 if (error) 7861 panic("handle_workitem_freefile: inodedep %p survived", idp); 7862 #endif 7863 UFS_LOCK(ump); 7864 fs->fs_pendinginodes -= 1; 7865 UFS_UNLOCK(ump); 7866 LIST_INIT(&wkhd); 7867 LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); 7868 if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, 7869 freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) 7870 softdep_error("handle_workitem_freefile", error); 7871 ACQUIRE_LOCK(&lk); 7872 WORKITEM_FREE(freefile, D_FREEFILE); 7873 FREE_LOCK(&lk); 7874 } 7875 7876 7877 /* 7878 * Helper function which unlinks marker element from work list and returns 7879 * the next element on the list. 7880 */ 7881 static __inline struct worklist * 7882 markernext(struct worklist *marker) 7883 { 7884 struct worklist *next; 7885 7886 next = LIST_NEXT(marker, wk_list); 7887 LIST_REMOVE(marker, wk_list); 7888 return next; 7889 } 7890 7891 /* 7892 * Disk writes. 7893 * 7894 * The dependency structures constructed above are most actively used when file 7895 * system blocks are written to disk. No constraints are placed on when a 7896 * block can be written, but unsatisfied update dependencies are made safe by 7897 * modifying (or replacing) the source memory for the duration of the disk 7898 * write. When the disk write completes, the memory block is again brought 7899 * up-to-date. 7900 * 7901 * In-core inode structure reclamation. 7902 * 7903 * Because there are a finite number of "in-core" inode structures, they are 7904 * reused regularly. By transferring all inode-related dependencies to the 7905 * in-memory inode block and indexing them separately (via "inodedep"s), we 7906 * can allow "in-core" inode structures to be reused at any time and avoid 7907 * any increase in contention. 7908 * 7909 * Called just before entering the device driver to initiate a new disk I/O. 7910 * The buffer must be locked, thus, no I/O completion operations can occur 7911 * while we are manipulating its associated dependencies. 7912 */ 7913 static void 7914 softdep_disk_io_initiation(bp) 7915 struct buf *bp; /* structure describing disk write to occur */ 7916 { 7917 struct worklist *wk; 7918 struct worklist marker; 7919 struct inodedep *inodedep; 7920 struct freeblks *freeblks; 7921 struct jfreeblk *jfreeblk; 7922 struct newblk *newblk; 7923 7924 /* 7925 * We only care about write operations. There should never 7926 * be dependencies for reads. 7927 */ 7928 if (bp->b_iocmd != BIO_WRITE) 7929 panic("softdep_disk_io_initiation: not write"); 7930 7931 if (bp->b_vflags & BV_BKGRDINPROG) 7932 panic("softdep_disk_io_initiation: Writing buffer with " 7933 "background write in progress: %p", bp); 7934 7935 marker.wk_type = D_LAST + 1; /* Not a normal workitem */ 7936 PHOLD(curproc); /* Don't swap out kernel stack */ 7937 7938 ACQUIRE_LOCK(&lk); 7939 /* 7940 * Do any necessary pre-I/O processing. 
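 *
 * The loop below walks bp->b_dep through a marker work item so that the
 * traversal survives points where lk may be dropped (e.g. inside jwait()).
 * A simplified sketch of the pattern (illustrative only):
 *
 *	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
 *	     wk = markernext(&marker)) {
 *		LIST_INSERT_AFTER(wk, &marker, wk_list);
 *		(handle wk; may sleep with the marker still in place)
 *	}
 *
 * markernext() unlinks the marker and returns the element that now follows
 * it, so the traversal resumes correctly at the marker's position.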
7941 */ 7942 for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; 7943 wk = markernext(&marker)) { 7944 LIST_INSERT_AFTER(wk, &marker, wk_list); 7945 switch (wk->wk_type) { 7946 7947 case D_PAGEDEP: 7948 initiate_write_filepage(WK_PAGEDEP(wk), bp); 7949 continue; 7950 7951 case D_INODEDEP: 7952 inodedep = WK_INODEDEP(wk); 7953 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) 7954 initiate_write_inodeblock_ufs1(inodedep, bp); 7955 else 7956 initiate_write_inodeblock_ufs2(inodedep, bp); 7957 continue; 7958 7959 case D_INDIRDEP: 7960 initiate_write_indirdep(WK_INDIRDEP(wk), bp); 7961 continue; 7962 7963 case D_BMSAFEMAP: 7964 initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); 7965 continue; 7966 7967 case D_JSEG: 7968 WK_JSEG(wk)->js_buf = NULL; 7969 continue; 7970 7971 case D_FREEBLKS: 7972 freeblks = WK_FREEBLKS(wk); 7973 jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd); 7974 /* 7975 * We have to wait for the jfreeblks to be journaled 7976 * before we can write an inodeblock with updated 7977 * pointers. Be careful to arrange the marker so 7978 * we revisit the jfreeblk if it's not removed by 7979 * the first jwait(). 7980 */ 7981 if (jfreeblk != NULL) { 7982 LIST_REMOVE(&marker, wk_list); 7983 LIST_INSERT_BEFORE(wk, &marker, wk_list); 7984 jwait(&jfreeblk->jf_list); 7985 } 7986 continue; 7987 case D_ALLOCDIRECT: 7988 case D_ALLOCINDIR: 7989 /* 7990 * We have to wait for the jnewblk to be journaled 7991 * before we can write to a block otherwise the 7992 * contents may be confused with an earlier file 7993 * at recovery time. Handle the marker as described 7994 * above. 7995 */ 7996 newblk = WK_NEWBLK(wk); 7997 if (newblk->nb_jnewblk != NULL) { 7998 LIST_REMOVE(&marker, wk_list); 7999 LIST_INSERT_BEFORE(wk, &marker, wk_list); 8000 jwait(&newblk->nb_jnewblk->jn_list); 8001 } 8002 continue; 8003 8004 case D_SBDEP: 8005 initiate_write_sbdep(WK_SBDEP(wk)); 8006 continue; 8007 8008 case D_MKDIR: 8009 case D_FREEWORK: 8010 case D_FREEDEP: 8011 case D_JSEGDEP: 8012 continue; 8013 8014 default: 8015 panic("handle_disk_io_initiation: Unexpected type %s", 8016 TYPENAME(wk->wk_type)); 8017 /* NOTREACHED */ 8018 } 8019 } 8020 FREE_LOCK(&lk); 8021 PRELE(curproc); /* Allow swapout of kernel stack */ 8022 } 8023 8024 /* 8025 * Called from within the procedure above to deal with unsatisfied 8026 * allocation dependencies in a directory. The buffer must be locked, 8027 * thus, no I/O completion operations can occur while we are 8028 * manipulating its associated dependencies. 8029 */ 8030 static void 8031 initiate_write_filepage(pagedep, bp) 8032 struct pagedep *pagedep; 8033 struct buf *bp; 8034 { 8035 struct jremref *jremref; 8036 struct jmvref *jmvref; 8037 struct dirrem *dirrem; 8038 struct diradd *dap; 8039 struct direct *ep; 8040 int i; 8041 8042 if (pagedep->pd_state & IOSTARTED) { 8043 /* 8044 * This can only happen if there is a driver that does not 8045 * understand chaining. Here biodone will reissue the call 8046 * to strategy for the incomplete buffers. 8047 */ 8048 printf("initiate_write_filepage: already started\n"); 8049 return; 8050 } 8051 pagedep->pd_state |= IOSTARTED; 8052 /* 8053 * Wait for all journal remove dependencies to hit the disk. 8054 * We can not allow any potentially conflicting directory adds 8055 * to be visible before removes and rollback is too difficult. 8056 * lk may be dropped and re-acquired, however we hold the buf 8057 * locked so the dependency can not go away. 
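 *
 * The rollback performed below is per directory entry: while the write is
 * in flight, each entry with an unsatisfied diradd carries a safe value,
 * e.g. (sketch of the assignment made in the loop that follows):
 *
 *	ep->d_ino = (dap->da_state & DIRCHG) ?
 *	    dap->da_previous->dm_oldinum : 0;
 *
 * The entry is revisited in handle_written_filepage() once the write has
 * completed.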
8058 */ 8059 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) 8060 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) { 8061 stat_jwait_filepage++; 8062 jwait(&jremref->jr_list); 8063 } 8064 while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) { 8065 stat_jwait_filepage++; 8066 jwait(&jmvref->jm_list); 8067 } 8068 for (i = 0; i < DAHASHSZ; i++) { 8069 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { 8070 ep = (struct direct *) 8071 ((char *)bp->b_data + dap->da_offset); 8072 if (ep->d_ino != dap->da_newinum) 8073 panic("%s: dir inum %d != new %d", 8074 "initiate_write_filepage", 8075 ep->d_ino, dap->da_newinum); 8076 if (dap->da_state & DIRCHG) 8077 ep->d_ino = dap->da_previous->dm_oldinum; 8078 else 8079 ep->d_ino = 0; 8080 dap->da_state &= ~ATTACHED; 8081 dap->da_state |= UNDONE; 8082 } 8083 } 8084 } 8085 8086 /* 8087 * Version of initiate_write_inodeblock that handles UFS1 dinodes. 8088 * Note that any bug fixes made to this routine must be done in the 8089 * version found below. 8090 * 8091 * Called from within the procedure above to deal with unsatisfied 8092 * allocation dependencies in an inodeblock. The buffer must be 8093 * locked, thus, no I/O completion operations can occur while we 8094 * are manipulating its associated dependencies. 8095 */ 8096 static void 8097 initiate_write_inodeblock_ufs1(inodedep, bp) 8098 struct inodedep *inodedep; 8099 struct buf *bp; /* The inode block */ 8100 { 8101 struct allocdirect *adp, *lastadp; 8102 struct ufs1_dinode *dp; 8103 struct ufs1_dinode *sip; 8104 struct inoref *inoref; 8105 struct fs *fs; 8106 ufs_lbn_t i; 8107 #ifdef INVARIANTS 8108 ufs_lbn_t prevlbn = 0; 8109 #endif 8110 int deplist; 8111 8112 if (inodedep->id_state & IOSTARTED) 8113 panic("initiate_write_inodeblock_ufs1: already started"); 8114 inodedep->id_state |= IOSTARTED; 8115 fs = inodedep->id_fs; 8116 dp = (struct ufs1_dinode *)bp->b_data + 8117 ino_to_fsbo(fs, inodedep->id_ino); 8118 8119 /* 8120 * If we're on the unlinked list but have not yet written our 8121 * next pointer initialize it here. 8122 */ 8123 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 8124 struct inodedep *inon; 8125 8126 inon = TAILQ_NEXT(inodedep, id_unlinked); 8127 dp->di_freelink = inon ? inon->id_ino : 0; 8128 } 8129 /* 8130 * If the bitmap is not yet written, then the allocated 8131 * inode cannot be written to disk. 8132 */ 8133 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 8134 if (inodedep->id_savedino1 != NULL) 8135 panic("initiate_write_inodeblock_ufs1: I/O underway"); 8136 FREE_LOCK(&lk); 8137 sip = malloc(sizeof(struct ufs1_dinode), 8138 M_SAVEDINO, M_SOFTDEP_FLAGS); 8139 ACQUIRE_LOCK(&lk); 8140 inodedep->id_savedino1 = sip; 8141 *inodedep->id_savedino1 = *dp; 8142 bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); 8143 dp->di_gen = inodedep->id_savedino1->di_gen; 8144 dp->di_freelink = inodedep->id_savedino1->di_freelink; 8145 return; 8146 } 8147 /* 8148 * If no dependencies, then there is nothing to roll back. 8149 */ 8150 inodedep->id_savedsize = dp->di_size; 8151 inodedep->id_savedextsize = 0; 8152 inodedep->id_savednlink = dp->di_nlink; 8153 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 8154 TAILQ_EMPTY(&inodedep->id_inoreflst)) 8155 return; 8156 /* 8157 * Revert the link count to that of the first unwritten journal entry. 8158 */ 8159 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 8160 if (inoref) 8161 dp->di_nlink = inoref->if_nlink; 8162 /* 8163 * Set the dependencies to busy. 
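 *
 * Under INVARIANTS the loop below also records which offsets carry
 * dependencies in a bitmask, deplist |= 1 << adp->ad_offset; for example,
 * dependencies at direct offsets 0 and 3 give deplist = 0x9.  The later
 * zeroing passes use it to assert that every pointer they clear was
 * covered by a dependency (the "lost dep" panics).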
8164 */ 8165 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 8166 adp = TAILQ_NEXT(adp, ad_next)) { 8167 #ifdef INVARIANTS 8168 if (deplist != 0 && prevlbn >= adp->ad_offset) 8169 panic("softdep_write_inodeblock: lbn order"); 8170 prevlbn = adp->ad_offset; 8171 if (adp->ad_offset < NDADDR && 8172 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 8173 panic("%s: direct pointer #%jd mismatch %d != %jd", 8174 "softdep_write_inodeblock", 8175 (intmax_t)adp->ad_offset, 8176 dp->di_db[adp->ad_offset], 8177 (intmax_t)adp->ad_newblkno); 8178 if (adp->ad_offset >= NDADDR && 8179 dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) 8180 panic("%s: indirect pointer #%jd mismatch %d != %jd", 8181 "softdep_write_inodeblock", 8182 (intmax_t)adp->ad_offset - NDADDR, 8183 dp->di_ib[adp->ad_offset - NDADDR], 8184 (intmax_t)adp->ad_newblkno); 8185 deplist |= 1 << adp->ad_offset; 8186 if ((adp->ad_state & ATTACHED) == 0) 8187 panic("softdep_write_inodeblock: Unknown state 0x%x", 8188 adp->ad_state); 8189 #endif /* INVARIANTS */ 8190 adp->ad_state &= ~ATTACHED; 8191 adp->ad_state |= UNDONE; 8192 } 8193 /* 8194 * The on-disk inode cannot claim to be any larger than the last 8195 * fragment that has been written. Otherwise, the on-disk inode 8196 * might have fragments that were not the last block in the file 8197 * which would corrupt the filesystem. 8198 */ 8199 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 8200 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 8201 if (adp->ad_offset >= NDADDR) 8202 break; 8203 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 8204 /* keep going until hitting a rollback to a frag */ 8205 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 8206 continue; 8207 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 8208 for (i = adp->ad_offset + 1; i < NDADDR; i++) { 8209 #ifdef INVARIANTS 8210 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 8211 panic("softdep_write_inodeblock: lost dep1"); 8212 #endif /* INVARIANTS */ 8213 dp->di_db[i] = 0; 8214 } 8215 for (i = 0; i < NIADDR; i++) { 8216 #ifdef INVARIANTS 8217 if (dp->di_ib[i] != 0 && 8218 (deplist & ((1 << NDADDR) << i)) == 0) 8219 panic("softdep_write_inodeblock: lost dep2"); 8220 #endif /* INVARIANTS */ 8221 dp->di_ib[i] = 0; 8222 } 8223 return; 8224 } 8225 /* 8226 * If we have zero'ed out the last allocated block of the file, 8227 * roll back the size to the last currently allocated block. 8228 * We know that this last allocated block is a full-sized as 8229 * we already checked for fragments in the loop above. 8230 */ 8231 if (lastadp != NULL && 8232 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 8233 for (i = lastadp->ad_offset; i >= 0; i--) 8234 if (dp->di_db[i] != 0) 8235 break; 8236 dp->di_size = (i + 1) * fs->fs_bsize; 8237 } 8238 /* 8239 * The only dependencies are for indirect blocks. 8240 * 8241 * The file size for indirect block additions is not guaranteed. 8242 * Such a guarantee would be non-trivial to achieve. The conventional 8243 * synchronous write implementation also does not make this guarantee. 8244 * Fsck should catch and fix discrepancies. Arguably, the file size 8245 * can be over-estimated without destroying integrity when the file 8246 * moves into the indirect blocks (i.e., is large). If we want to 8247 * postpone fsck, we are stuck with this argument. 
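 *
 * To make the direct-block size rollback above concrete (illustrative
 * numbers only): with fs_bsize = 16384, a rollback at direct block
 * offset 3 whose old allocation was a 4096 byte fragment yields
 *
 *	di_size = 3 * 16384 + 4096 = 53248
 *
 * and all direct pointers past offset 3, as well as all indirect
 * pointers, are zeroed for the duration of the write.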
8248 */ 8249 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 8250 dp->di_ib[adp->ad_offset - NDADDR] = 0; 8251 } 8252 8253 /* 8254 * Version of initiate_write_inodeblock that handles UFS2 dinodes. 8255 * Note that any bug fixes made to this routine must be done in the 8256 * version found above. 8257 * 8258 * Called from within the procedure above to deal with unsatisfied 8259 * allocation dependencies in an inodeblock. The buffer must be 8260 * locked, thus, no I/O completion operations can occur while we 8261 * are manipulating its associated dependencies. 8262 */ 8263 static void 8264 initiate_write_inodeblock_ufs2(inodedep, bp) 8265 struct inodedep *inodedep; 8266 struct buf *bp; /* The inode block */ 8267 { 8268 struct allocdirect *adp, *lastadp; 8269 struct ufs2_dinode *dp; 8270 struct ufs2_dinode *sip; 8271 struct inoref *inoref; 8272 struct fs *fs; 8273 ufs_lbn_t i; 8274 #ifdef INVARIANTS 8275 ufs_lbn_t prevlbn = 0; 8276 #endif 8277 int deplist; 8278 8279 if (inodedep->id_state & IOSTARTED) 8280 panic("initiate_write_inodeblock_ufs2: already started"); 8281 inodedep->id_state |= IOSTARTED; 8282 fs = inodedep->id_fs; 8283 dp = (struct ufs2_dinode *)bp->b_data + 8284 ino_to_fsbo(fs, inodedep->id_ino); 8285 8286 /* 8287 * If we're on the unlinked list but have not yet written our 8288 * next pointer initialize it here. 8289 */ 8290 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 8291 struct inodedep *inon; 8292 8293 inon = TAILQ_NEXT(inodedep, id_unlinked); 8294 dp->di_freelink = inon ? inon->id_ino : 0; 8295 } 8296 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == 8297 (UNLINKED | UNLINKNEXT)) { 8298 struct inodedep *inon; 8299 ino_t freelink; 8300 8301 inon = TAILQ_NEXT(inodedep, id_unlinked); 8302 freelink = inon ? inon->id_ino : 0; 8303 if (freelink != dp->di_freelink) 8304 panic("ino %p(0x%X) %d, %d != %d", 8305 inodedep, inodedep->id_state, inodedep->id_ino, 8306 freelink, dp->di_freelink); 8307 } 8308 /* 8309 * If the bitmap is not yet written, then the allocated 8310 * inode cannot be written to disk. 8311 */ 8312 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 8313 if (inodedep->id_savedino2 != NULL) 8314 panic("initiate_write_inodeblock_ufs2: I/O underway"); 8315 FREE_LOCK(&lk); 8316 sip = malloc(sizeof(struct ufs2_dinode), 8317 M_SAVEDINO, M_SOFTDEP_FLAGS); 8318 ACQUIRE_LOCK(&lk); 8319 inodedep->id_savedino2 = sip; 8320 *inodedep->id_savedino2 = *dp; 8321 bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); 8322 dp->di_gen = inodedep->id_savedino2->di_gen; 8323 dp->di_freelink = inodedep->id_savedino2->di_freelink; 8324 return; 8325 } 8326 /* 8327 * If no dependencies, then there is nothing to roll back. 8328 */ 8329 inodedep->id_savedsize = dp->di_size; 8330 inodedep->id_savedextsize = dp->di_extsize; 8331 inodedep->id_savednlink = dp->di_nlink; 8332 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 8333 TAILQ_EMPTY(&inodedep->id_extupdt) && 8334 TAILQ_EMPTY(&inodedep->id_inoreflst)) 8335 return; 8336 /* 8337 * Revert the link count to that of the first unwritten journal entry. 8338 */ 8339 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 8340 if (inoref) 8341 dp->di_nlink = inoref->if_nlink; 8342 8343 /* 8344 * Set the ext data dependencies to busy. 
8345 */ 8346 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 8347 adp = TAILQ_NEXT(adp, ad_next)) { 8348 #ifdef INVARIANTS 8349 if (deplist != 0 && prevlbn >= adp->ad_offset) 8350 panic("softdep_write_inodeblock: lbn order"); 8351 prevlbn = adp->ad_offset; 8352 if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) 8353 panic("%s: direct pointer #%jd mismatch %jd != %jd", 8354 "softdep_write_inodeblock", 8355 (intmax_t)adp->ad_offset, 8356 (intmax_t)dp->di_extb[adp->ad_offset], 8357 (intmax_t)adp->ad_newblkno); 8358 deplist |= 1 << adp->ad_offset; 8359 if ((adp->ad_state & ATTACHED) == 0) 8360 panic("softdep_write_inodeblock: Unknown state 0x%x", 8361 adp->ad_state); 8362 #endif /* INVARIANTS */ 8363 adp->ad_state &= ~ATTACHED; 8364 adp->ad_state |= UNDONE; 8365 } 8366 /* 8367 * The on-disk inode cannot claim to be any larger than the last 8368 * fragment that has been written. Otherwise, the on-disk inode 8369 * might have fragments that were not the last block in the ext 8370 * data which would corrupt the filesystem. 8371 */ 8372 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 8373 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 8374 dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; 8375 /* keep going until hitting a rollback to a frag */ 8376 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 8377 continue; 8378 dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 8379 for (i = adp->ad_offset + 1; i < NXADDR; i++) { 8380 #ifdef INVARIANTS 8381 if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) 8382 panic("softdep_write_inodeblock: lost dep1"); 8383 #endif /* INVARIANTS */ 8384 dp->di_extb[i] = 0; 8385 } 8386 lastadp = NULL; 8387 break; 8388 } 8389 /* 8390 * If we have zero'ed out the last allocated block of the ext 8391 * data, roll back the size to the last currently allocated block. 8392 * We know that this last allocated block is a full-sized as 8393 * we already checked for fragments in the loop above. 8394 */ 8395 if (lastadp != NULL && 8396 dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 8397 for (i = lastadp->ad_offset; i >= 0; i--) 8398 if (dp->di_extb[i] != 0) 8399 break; 8400 dp->di_extsize = (i + 1) * fs->fs_bsize; 8401 } 8402 /* 8403 * Set the file data dependencies to busy. 8404 */ 8405 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 8406 adp = TAILQ_NEXT(adp, ad_next)) { 8407 #ifdef INVARIANTS 8408 if (deplist != 0 && prevlbn >= adp->ad_offset) 8409 panic("softdep_write_inodeblock: lbn order"); 8410 prevlbn = adp->ad_offset; 8411 if (adp->ad_offset < NDADDR && 8412 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 8413 panic("%s: direct pointer #%jd mismatch %jd != %jd", 8414 "softdep_write_inodeblock", 8415 (intmax_t)adp->ad_offset, 8416 (intmax_t)dp->di_db[adp->ad_offset], 8417 (intmax_t)adp->ad_newblkno); 8418 if (adp->ad_offset >= NDADDR && 8419 dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) 8420 panic("%s indirect pointer #%jd mismatch %jd != %jd", 8421 "softdep_write_inodeblock:", 8422 (intmax_t)adp->ad_offset - NDADDR, 8423 (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], 8424 (intmax_t)adp->ad_newblkno); 8425 deplist |= 1 << adp->ad_offset; 8426 if ((adp->ad_state & ATTACHED) == 0) 8427 panic("softdep_write_inodeblock: Unknown state 0x%x", 8428 adp->ad_state); 8429 #endif /* INVARIANTS */ 8430 adp->ad_state &= ~ATTACHED; 8431 adp->ad_state |= UNDONE; 8432 } 8433 /* 8434 * The on-disk inode cannot claim to be any larger than the last 8435 * fragment that has been written. 
Otherwise, the on-disk inode 8436 * might have fragments that were not the last block in the file 8437 * which would corrupt the filesystem. 8438 */ 8439 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 8440 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 8441 if (adp->ad_offset >= NDADDR) 8442 break; 8443 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 8444 /* keep going until hitting a rollback to a frag */ 8445 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 8446 continue; 8447 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 8448 for (i = adp->ad_offset + 1; i < NDADDR; i++) { 8449 #ifdef INVARIANTS 8450 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 8451 panic("softdep_write_inodeblock: lost dep2"); 8452 #endif /* INVARIANTS */ 8453 dp->di_db[i] = 0; 8454 } 8455 for (i = 0; i < NIADDR; i++) { 8456 #ifdef INVARIANTS 8457 if (dp->di_ib[i] != 0 && 8458 (deplist & ((1 << NDADDR) << i)) == 0) 8459 panic("softdep_write_inodeblock: lost dep3"); 8460 #endif /* INVARIANTS */ 8461 dp->di_ib[i] = 0; 8462 } 8463 return; 8464 } 8465 /* 8466 * If we have zero'ed out the last allocated block of the file, 8467 * roll back the size to the last currently allocated block. 8468 * We know that this last allocated block is a full-sized as 8469 * we already checked for fragments in the loop above. 8470 */ 8471 if (lastadp != NULL && 8472 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 8473 for (i = lastadp->ad_offset; i >= 0; i--) 8474 if (dp->di_db[i] != 0) 8475 break; 8476 dp->di_size = (i + 1) * fs->fs_bsize; 8477 } 8478 /* 8479 * The only dependencies are for indirect blocks. 8480 * 8481 * The file size for indirect block additions is not guaranteed. 8482 * Such a guarantee would be non-trivial to achieve. The conventional 8483 * synchronous write implementation also does not make this guarantee. 8484 * Fsck should catch and fix discrepancies. Arguably, the file size 8485 * can be over-estimated without destroying integrity when the file 8486 * moves into the indirect blocks (i.e., is large). If we want to 8487 * postpone fsck, we are stuck with this argument. 8488 */ 8489 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 8490 dp->di_ib[adp->ad_offset - NDADDR] = 0; 8491 } 8492 8493 /* 8494 * Cancel an indirdep as a result of truncation. Release all of the 8495 * children allocindirs and place their journal work on the appropriate 8496 * list. 8497 */ 8498 static void 8499 cancel_indirdep(indirdep, bp, inodedep, freeblks) 8500 struct indirdep *indirdep; 8501 struct buf *bp; 8502 struct inodedep *inodedep; 8503 struct freeblks *freeblks; 8504 { 8505 struct allocindir *aip; 8506 8507 /* 8508 * None of the indirect pointers will ever be visible, 8509 * so they can simply be tossed. GOINGAWAY ensures 8510 * that allocated pointers will be saved in the buffer 8511 * cache until they are freed. Note that they will 8512 * only be able to be found by their physical address 8513 * since the inode mapping the logical address will 8514 * be gone. The save buffer used for the safe copy 8515 * was allocated in setup_allocindir_phase2 using 8516 * the physical address so it could be used for this 8517 * purpose. Hence we swap the safe copy with the real 8518 * copy, allowing the safe copy to be freed and holding 8519 * on to the real copy for later use in indir_trunc. 
8520 */ 8521 if (indirdep->ir_state & GOINGAWAY) 8522 panic("cancel_indirdep: already gone"); 8523 if (indirdep->ir_state & ONDEPLIST) { 8524 indirdep->ir_state &= ~ONDEPLIST; 8525 LIST_REMOVE(indirdep, ir_next); 8526 } 8527 indirdep->ir_state |= GOINGAWAY; 8528 VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1; 8529 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) 8530 cancel_allocindir(aip, inodedep, freeblks); 8531 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) 8532 cancel_allocindir(aip, inodedep, freeblks); 8533 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) 8534 cancel_allocindir(aip, inodedep, freeblks); 8535 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0) 8536 cancel_allocindir(aip, inodedep, freeblks); 8537 bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); 8538 WORKLIST_REMOVE(&indirdep->ir_list); 8539 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); 8540 indirdep->ir_savebp = NULL; 8541 } 8542 8543 /* 8544 * Free an indirdep once it no longer has new pointers to track. 8545 */ 8546 static void 8547 free_indirdep(indirdep) 8548 struct indirdep *indirdep; 8549 { 8550 8551 KASSERT(LIST_EMPTY(&indirdep->ir_jwork), 8552 ("free_indirdep: Journal work not empty.")); 8553 KASSERT(LIST_EMPTY(&indirdep->ir_completehd), 8554 ("free_indirdep: Complete head not empty.")); 8555 KASSERT(LIST_EMPTY(&indirdep->ir_writehd), 8556 ("free_indirdep: write head not empty.")); 8557 KASSERT(LIST_EMPTY(&indirdep->ir_donehd), 8558 ("free_indirdep: done head not empty.")); 8559 KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), 8560 ("free_indirdep: deplist head not empty.")); 8561 KASSERT(indirdep->ir_savebp == NULL, 8562 ("free_indirdep: %p ir_savebp != NULL", indirdep)); 8563 KASSERT((indirdep->ir_state & ONDEPLIST) == 0, 8564 ("free_indirdep: %p still on deplist.", indirdep)); 8565 if (indirdep->ir_state & ONWORKLIST) 8566 WORKLIST_REMOVE(&indirdep->ir_list); 8567 WORKITEM_FREE(indirdep, D_INDIRDEP); 8568 } 8569 8570 /* 8571 * Called before a write to an indirdep. This routine is responsible for 8572 * rolling back pointers to a safe state which includes only those 8573 * allocindirs which have been completed. 8574 */ 8575 static void 8576 initiate_write_indirdep(indirdep, bp) 8577 struct indirdep *indirdep; 8578 struct buf *bp; 8579 { 8580 8581 if (indirdep->ir_state & GOINGAWAY) 8582 panic("disk_io_initiation: indirdep gone"); 8583 8584 /* 8585 * If there are no remaining dependencies, this will be writing 8586 * the real pointers. 8587 */ 8588 if (LIST_EMPTY(&indirdep->ir_deplisthd)) 8589 return; 8590 /* 8591 * Replace up-to-date version with safe version. 8592 */ 8593 FREE_LOCK(&lk); 8594 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, 8595 M_SOFTDEP_FLAGS); 8596 ACQUIRE_LOCK(&lk); 8597 indirdep->ir_state &= ~ATTACHED; 8598 indirdep->ir_state |= UNDONE; 8599 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 8600 bcopy(indirdep->ir_savebp->b_data, bp->b_data, 8601 bp->b_bcount); 8602 } 8603 8604 /* 8605 * Called when an inode has been cleared in a cg bitmap. 
This finally 8606 * eliminates any canceled jaddrefs 8607 */ 8608 void 8609 softdep_setup_inofree(mp, bp, ino, wkhd) 8610 struct mount *mp; 8611 struct buf *bp; 8612 ino_t ino; 8613 struct workhead *wkhd; 8614 { 8615 struct worklist *wk, *wkn; 8616 struct inodedep *inodedep; 8617 uint8_t *inosused; 8618 struct cg *cgp; 8619 struct fs *fs; 8620 8621 ACQUIRE_LOCK(&lk); 8622 fs = VFSTOUFS(mp)->um_fs; 8623 cgp = (struct cg *)bp->b_data; 8624 inosused = cg_inosused(cgp); 8625 if (isset(inosused, ino % fs->fs_ipg)) 8626 panic("softdep_setup_inofree: inode %d not freed.", ino); 8627 if (inodedep_lookup(mp, ino, 0, &inodedep)) 8628 panic("softdep_setup_inofree: ino %d has existing inodedep %p", 8629 ino, inodedep); 8630 if (wkhd) { 8631 LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { 8632 if (wk->wk_type != D_JADDREF) 8633 continue; 8634 WORKLIST_REMOVE(wk); 8635 /* 8636 * We can free immediately even if the jaddref 8637 * isn't attached in a background write as now 8638 * the bitmaps are reconciled. 8639 */ 8640 wk->wk_state |= COMPLETE | ATTACHED; 8641 free_jaddref(WK_JADDREF(wk)); 8642 } 8643 jwork_move(&bp->b_dep, wkhd); 8644 } 8645 FREE_LOCK(&lk); 8646 } 8647 8648 8649 /* 8650 * Called via ffs_blkfree() after a set of frags has been cleared from a cg 8651 * map. Any dependencies waiting for the write to clear are added to the 8652 * buf's list and any jnewblks that are being canceled are discarded 8653 * immediately. 8654 */ 8655 void 8656 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 8657 struct mount *mp; 8658 struct buf *bp; 8659 ufs2_daddr_t blkno; 8660 int frags; 8661 struct workhead *wkhd; 8662 { 8663 struct jnewblk *jnewblk; 8664 struct worklist *wk, *wkn; 8665 #ifdef SUJ_DEBUG 8666 struct bmsafemap *bmsafemap; 8667 struct fs *fs; 8668 uint8_t *blksfree; 8669 struct cg *cgp; 8670 ufs2_daddr_t jstart; 8671 ufs2_daddr_t jend; 8672 ufs2_daddr_t end; 8673 long bno; 8674 int i; 8675 #endif 8676 8677 ACQUIRE_LOCK(&lk); 8678 /* 8679 * Detach any jnewblks which have been canceled. They must linger 8680 * until the bitmap is cleared again by ffs_blkfree() to prevent 8681 * an unjournaled allocation from hitting the disk. 8682 */ 8683 if (wkhd) { 8684 LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { 8685 if (wk->wk_type != D_JNEWBLK) 8686 continue; 8687 jnewblk = WK_JNEWBLK(wk); 8688 KASSERT(jnewblk->jn_state & GOINGAWAY, 8689 ("softdep_setup_blkfree: jnewblk not canceled.")); 8690 WORKLIST_REMOVE(wk); 8691 #ifdef SUJ_DEBUG 8692 /* 8693 * Assert that this block is free in the bitmap 8694 * before we discard the jnewblk. 8695 */ 8696 fs = VFSTOUFS(mp)->um_fs; 8697 cgp = (struct cg *)bp->b_data; 8698 blksfree = cg_blksfree(cgp); 8699 bno = dtogd(fs, jnewblk->jn_blkno); 8700 for (i = jnewblk->jn_oldfrags; 8701 i < jnewblk->jn_frags; i++) { 8702 if (isset(blksfree, bno + i)) 8703 continue; 8704 panic("softdep_setup_blkfree: not free"); 8705 } 8706 #endif 8707 /* 8708 * Even if it's not attached we can free immediately 8709 * as the new bitmap is correct. 8710 */ 8711 wk->wk_state |= COMPLETE | ATTACHED; 8712 free_jnewblk(jnewblk); 8713 } 8714 /* 8715 * The buf must be locked by the caller otherwise these could 8716 * be added while it's being written and the write would 8717 * complete them before they made it to disk. 8718 */ 8719 jwork_move(&bp->b_dep, wkhd); 8720 } 8721 8722 #ifdef SUJ_DEBUG 8723 /* 8724 * Assert that we are not freeing a block which has an outstanding 8725 * allocation dependency. 
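 *
 * For illustration with made-up numbers: freeing [blkno, end) =
 * [100, 104) while a pending jnewblk covers [jstart, jend) = [102, 110)
 * fails the first range test (100 < jstart) but satisfies the second
 * (104 > 102 && 104 <= 110), so the overlap is caught and the panic
 * below fires.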
8726 */ 8727 fs = VFSTOUFS(mp)->um_fs; 8728 bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno)); 8729 end = blkno + frags; 8730 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { 8731 /* 8732 * Don't match against blocks that will be freed when the 8733 * background write is done. 8734 */ 8735 if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == 8736 (COMPLETE | DEPCOMPLETE)) 8737 continue; 8738 jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; 8739 jend = jnewblk->jn_blkno + jnewblk->jn_frags; 8740 if ((blkno >= jstart && blkno < jend) || 8741 (end > jstart && end <= jend)) { 8742 printf("state 0x%X %jd - %d %d dep %p\n", 8743 jnewblk->jn_state, jnewblk->jn_blkno, 8744 jnewblk->jn_oldfrags, jnewblk->jn_frags, 8745 jnewblk->jn_newblk); 8746 panic("softdep_setup_blkfree: " 8747 "%jd-%jd(%d) overlaps with %jd-%jd", 8748 blkno, end, frags, jstart, jend); 8749 } 8750 } 8751 #endif 8752 FREE_LOCK(&lk); 8753 } 8754 8755 static void 8756 initiate_write_bmsafemap(bmsafemap, bp) 8757 struct bmsafemap *bmsafemap; 8758 struct buf *bp; /* The cg block. */ 8759 { 8760 struct jaddref *jaddref; 8761 struct jnewblk *jnewblk; 8762 uint8_t *inosused; 8763 uint8_t *blksfree; 8764 struct cg *cgp; 8765 struct fs *fs; 8766 int cleared; 8767 ino_t ino; 8768 long bno; 8769 int i; 8770 8771 if (bmsafemap->sm_state & IOSTARTED) 8772 panic("initiate_write_bmsafemap: Already started\n"); 8773 bmsafemap->sm_state |= IOSTARTED; 8774 /* 8775 * Clear any inode allocations which are pending journal writes. 8776 */ 8777 if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { 8778 cgp = (struct cg *)bp->b_data; 8779 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 8780 inosused = cg_inosused(cgp); 8781 LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { 8782 ino = jaddref->ja_ino % fs->fs_ipg; 8783 /* 8784 * If this is a background copy the inode may not 8785 * be marked used yet. 8786 */ 8787 if (isset(inosused, ino)) { 8788 if ((jaddref->ja_mode & IFMT) == IFDIR) 8789 cgp->cg_cs.cs_ndir--; 8790 cgp->cg_cs.cs_nifree++; 8791 clrbit(inosused, ino); 8792 jaddref->ja_state &= ~ATTACHED; 8793 jaddref->ja_state |= UNDONE; 8794 stat_jaddref++; 8795 } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) 8796 panic("initiate_write_bmsafemap: inode %d " 8797 "marked free", jaddref->ja_ino); 8798 } 8799 } 8800 /* 8801 * Clear any block allocations which are pending journal writes. 8802 */ 8803 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 8804 cgp = (struct cg *)bp->b_data; 8805 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 8806 blksfree = cg_blksfree(cgp); 8807 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { 8808 bno = dtogd(fs, jnewblk->jn_blkno); 8809 cleared = 0; 8810 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 8811 i++) { 8812 if (isclr(blksfree, bno + i)) { 8813 cleared = 1; 8814 setbit(blksfree, bno + i); 8815 } 8816 } 8817 /* 8818 * We may not clear the block if it's a background 8819 * copy. In that case there is no reason to detach 8820 * it. 8821 */ 8822 if (cleared) { 8823 stat_jnewblk++; 8824 jnewblk->jn_state &= ~ATTACHED; 8825 jnewblk->jn_state |= UNDONE; 8826 } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) 8827 panic("initiate_write_bmsafemap: block %jd " 8828 "marked free", jnewblk->jn_blkno); 8829 } 8830 } 8831 /* 8832 * Move allocation lists to the written lists so they can be 8833 * cleared once the block write is complete. 
8834 */ 8835 LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, 8836 inodedep, id_deps); 8837 LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, 8838 newblk, nb_deps); 8839 } 8840 8841 /* 8842 * This routine is called during the completion interrupt 8843 * service routine for a disk write (from the procedure called 8844 * by the device driver to inform the filesystem caches of 8845 * a request completion). It should be called early in this 8846 * procedure, before the block is made available to other 8847 * processes or other routines are called. 8848 * 8849 */ 8850 static void 8851 softdep_disk_write_complete(bp) 8852 struct buf *bp; /* describes the completed disk write */ 8853 { 8854 struct worklist *wk; 8855 struct worklist *owk; 8856 struct workhead reattach; 8857 struct buf *sbp; 8858 8859 /* 8860 * If an error occurred while doing the write, then the data 8861 * has not hit the disk and the dependencies cannot be unrolled. 8862 */ 8863 if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) 8864 return; 8865 LIST_INIT(&reattach); 8866 /* 8867 * This lock must not be released anywhere in this code segment. 8868 */ 8869 sbp = NULL; 8870 owk = NULL; 8871 ACQUIRE_LOCK(&lk); 8872 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 8873 WORKLIST_REMOVE(wk); 8874 if (wk == owk) 8875 panic("duplicate worklist: %p\n", wk); 8876 owk = wk; 8877 switch (wk->wk_type) { 8878 8879 case D_PAGEDEP: 8880 if (handle_written_filepage(WK_PAGEDEP(wk), bp)) 8881 WORKLIST_INSERT(&reattach, wk); 8882 continue; 8883 8884 case D_INODEDEP: 8885 if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) 8886 WORKLIST_INSERT(&reattach, wk); 8887 continue; 8888 8889 case D_BMSAFEMAP: 8890 if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp)) 8891 WORKLIST_INSERT(&reattach, wk); 8892 continue; 8893 8894 case D_MKDIR: 8895 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 8896 continue; 8897 8898 case D_ALLOCDIRECT: 8899 wk->wk_state |= COMPLETE; 8900 handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); 8901 continue; 8902 8903 case D_ALLOCINDIR: 8904 wk->wk_state |= COMPLETE; 8905 handle_allocindir_partdone(WK_ALLOCINDIR(wk)); 8906 continue; 8907 8908 case D_INDIRDEP: 8909 if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp)) 8910 WORKLIST_INSERT(&reattach, wk); 8911 continue; 8912 8913 case D_FREEBLKS: 8914 wk->wk_state |= COMPLETE; 8915 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) 8916 add_to_worklist(wk, 1); 8917 continue; 8918 8919 case D_FREEWORK: 8920 handle_written_freework(WK_FREEWORK(wk)); 8921 break; 8922 8923 case D_FREEDEP: 8924 free_freedep(WK_FREEDEP(wk)); 8925 continue; 8926 8927 case D_JSEGDEP: 8928 free_jsegdep(WK_JSEGDEP(wk)); 8929 continue; 8930 8931 case D_JSEG: 8932 handle_written_jseg(WK_JSEG(wk), bp); 8933 continue; 8934 8935 case D_SBDEP: 8936 if (handle_written_sbdep(WK_SBDEP(wk), bp)) 8937 WORKLIST_INSERT(&reattach, wk); 8938 continue; 8939 8940 default: 8941 panic("handle_disk_write_complete: Unknown type %s", 8942 TYPENAME(wk->wk_type)); 8943 /* NOTREACHED */ 8944 } 8945 } 8946 /* 8947 * Reattach any requests that must be redone. 8948 */ 8949 while ((wk = LIST_FIRST(&reattach)) != NULL) { 8950 WORKLIST_REMOVE(wk); 8951 WORKLIST_INSERT(&bp->b_dep, wk); 8952 } 8953 FREE_LOCK(&lk); 8954 if (sbp) 8955 brelse(sbp); 8956 } 8957 8958 /* 8959 * Called from within softdep_disk_write_complete above. Note that 8960 * this routine is always called from interrupt level with further 8961 * splbio interrupts blocked. 
8962 */
8963 static void
8964 handle_allocdirect_partdone(adp, wkhd)
8965 struct allocdirect *adp; /* the completed allocdirect */
8966 struct workhead *wkhd; /* Work to do when inode is written. */
8967 {
8968 struct allocdirectlst *listhead;
8969 struct allocdirect *listadp;
8970 struct inodedep *inodedep;
8971 long bsize;
8972
8973 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
8974 return;
8975 /*
8976 * The on-disk inode cannot claim to be any larger than the last
8977 * fragment that has been written. Otherwise, the on-disk inode
8978 * might have fragments that were not the last block in the file
8979 * which would corrupt the filesystem. Thus, we cannot free any
8980 * allocdirects after one whose ad_oldblkno claims a fragment as
8981 * these blocks must be rolled back to zero before writing the inode.
8982 * We check the currently active set of allocdirects in id_inoupdt
8983 * or id_extupdt as appropriate.
8984 */
8985 inodedep = adp->ad_inodedep;
8986 bsize = inodedep->id_fs->fs_bsize;
8987 if (adp->ad_state & EXTDATA)
8988 listhead = &inodedep->id_extupdt;
8989 else
8990 listhead = &inodedep->id_inoupdt;
8991 TAILQ_FOREACH(listadp, listhead, ad_next) {
8992 /* found our block */
8993 if (listadp == adp)
8994 break;
8995 /* continue if ad_oldsize is not a fragment */
8996 if (listadp->ad_oldsize == 0 ||
8997 listadp->ad_oldsize == bsize)
8998 continue;
8999 /* hit a fragment */
9000 return;
9001 }
9002 /*
9003 * If we have reached the end of the current list without
9004 * finding the just finished dependency, then it must be
9005 * on the future dependency list. Future dependencies cannot
9006 * be freed until they are moved to the current list.
9007 */
9008 if (listadp == NULL) {
9009 #ifdef DEBUG
9010 if (adp->ad_state & EXTDATA)
9011 listhead = &inodedep->id_newextupdt;
9012 else
9013 listhead = &inodedep->id_newinoupdt;
9014 TAILQ_FOREACH(listadp, listhead, ad_next)
9015 /* found our block */
9016 if (listadp == adp)
9017 break;
9018 if (listadp == NULL)
9019 panic("handle_allocdirect_partdone: lost dep");
9020 #endif /* DEBUG */
9021 return;
9022 }
9023 /*
9024 * If we have found the just finished dependency, then queue
9025 * it along with anything that follows it that is complete.
9026 * Since the pointer has not yet been written in the inode
9027 * as the dependency prevents it, place the allocdirect on the
9028 * bufwait list where it will be freed once the pointer is
9029 * valid.
9030 */
9031 if (wkhd == NULL)
9032 wkhd = &inodedep->id_bufwait;
9033 for (; adp; adp = listadp) {
9034 listadp = TAILQ_NEXT(adp, ad_next);
9035 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
9036 return;
9037 TAILQ_REMOVE(listhead, adp, ad_next);
9038 WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
9039 }
9040 }
9041
9042 /*
9043 * Called from within softdep_disk_write_complete above. This routine
9044 * completes successfully written allocindirs.
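 * If the indirect block is still rolled back (UNDONE) the allocindir is
 * parked on ir_donehd; otherwise its new block number is copied into the
 * saved copy of the indirect block and the allocindir waits on ir_writehd
 * until that pointer write completes.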
9045 */
9046 static void
9047 handle_allocindir_partdone(aip)
9048 struct allocindir *aip; /* the completed allocindir */
9049 {
9050 struct indirdep *indirdep;
9051
9052 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
9053 return;
9054 indirdep = aip->ai_indirdep;
9055 LIST_REMOVE(aip, ai_next);
9056 if (indirdep->ir_state & UNDONE) {
9057 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
9058 return;
9059 }
9060 if (indirdep->ir_state & UFS1FMT)
9061 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
9062 aip->ai_newblkno;
9063 else
9064 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
9065 aip->ai_newblkno;
9066 /*
9067 * Await the pointer write before freeing the allocindir.
9068 */
9069 LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
9070 }
9071
9072 /*
9073 * Release segments held on a jwork list.
9074 */
9075 static void
9076 handle_jwork(wkhd)
9077 struct workhead *wkhd;
9078 {
9079 struct worklist *wk;
9080
9081 while ((wk = LIST_FIRST(wkhd)) != NULL) {
9082 WORKLIST_REMOVE(wk);
9083 switch (wk->wk_type) {
9084 case D_JSEGDEP:
9085 free_jsegdep(WK_JSEGDEP(wk));
9086 continue;
9087 default:
9088 panic("handle_jwork: Unknown type %s\n",
9089 TYPENAME(wk->wk_type));
9090 }
9091 }
9092 }
9093
9094 /*
9095 * Handle the bufwait list on an inode when it is safe to release items
9096 * held there. This normally happens after an inode block is written but
9097 * may be delayed and handled later if there are pending journal items that
9098 * are not yet safe to be released.
9099 */
9100 static struct freefile *
9101 handle_bufwait(inodedep, refhd)
9102 struct inodedep *inodedep;
9103 struct workhead *refhd;
9104 {
9105 struct jaddref *jaddref;
9106 struct freefile *freefile;
9107 struct worklist *wk;
9108
9109 freefile = NULL;
9110 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
9111 WORKLIST_REMOVE(wk);
9112 switch (wk->wk_type) {
9113 case D_FREEFILE:
9114 /*
9115 * We defer adding freefile to the worklist
9116 * until all other additions have been made to
9117 * ensure that it will be done after all the
9118 * old blocks have been freed.
9119 */
9120 if (freefile != NULL)
9121 panic("handle_bufwait: freefile");
9122 freefile = WK_FREEFILE(wk);
9123 continue;
9124
9125 case D_MKDIR:
9126 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
9127 continue;
9128
9129 case D_DIRADD:
9130 diradd_inode_written(WK_DIRADD(wk), inodedep);
9131 continue;
9132
9133 case D_FREEFRAG:
9134 wk->wk_state |= COMPLETE;
9135 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
9136 add_to_worklist(wk, 0);
9137 continue;
9138
9139 case D_DIRREM:
9140 wk->wk_state |= COMPLETE;
9141 add_to_worklist(wk, 0);
9142 continue;
9143
9144 case D_ALLOCDIRECT:
9145 case D_ALLOCINDIR:
9146 free_newblk(WK_NEWBLK(wk));
9147 continue;
9148
9149 case D_JNEWBLK:
9150 wk->wk_state |= COMPLETE;
9151 free_jnewblk(WK_JNEWBLK(wk));
9152 continue;
9153
9154 /*
9155 * Save freed journal segments and add references on
9156 * the supplied list which will delay their release
9157 * until the cg bitmap is cleared on disk.
9158 */
9159 case D_JSEGDEP:
9160 if (refhd == NULL)
9161 free_jsegdep(WK_JSEGDEP(wk));
9162 else
9163 WORKLIST_INSERT(refhd, wk);
9164 continue;
9165
9166 case D_JADDREF:
9167 jaddref = WK_JADDREF(wk);
9168 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
9169 if_deps);
9170 /*
9171 * Transfer any jaddrefs to the list to be freed with
9172 * the bitmap if we're handling a removed file.
9173 */ 9174 if (refhd == NULL) { 9175 wk->wk_state |= COMPLETE; 9176 free_jaddref(jaddref); 9177 } else 9178 WORKLIST_INSERT(refhd, wk); 9179 continue; 9180 9181 default: 9182 panic("handle_bufwait: Unknown type %p(%s)", 9183 wk, TYPENAME(wk->wk_type)); 9184 /* NOTREACHED */ 9185 } 9186 } 9187 return (freefile); 9188 } 9189 /* 9190 * Called from within softdep_disk_write_complete above to restore 9191 * in-memory inode block contents to their most up-to-date state. Note 9192 * that this routine is always called from interrupt level with further 9193 * splbio interrupts blocked. 9194 */ 9195 static int 9196 handle_written_inodeblock(inodedep, bp) 9197 struct inodedep *inodedep; 9198 struct buf *bp; /* buffer containing the inode block */ 9199 { 9200 struct freefile *freefile; 9201 struct allocdirect *adp, *nextadp; 9202 struct ufs1_dinode *dp1 = NULL; 9203 struct ufs2_dinode *dp2 = NULL; 9204 struct workhead wkhd; 9205 int hadchanges, fstype; 9206 ino_t freelink; 9207 9208 LIST_INIT(&wkhd); 9209 hadchanges = 0; 9210 freefile = NULL; 9211 if ((inodedep->id_state & IOSTARTED) == 0) 9212 panic("handle_written_inodeblock: not started"); 9213 inodedep->id_state &= ~IOSTARTED; 9214 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { 9215 fstype = UFS1; 9216 dp1 = (struct ufs1_dinode *)bp->b_data + 9217 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 9218 freelink = dp1->di_freelink; 9219 } else { 9220 fstype = UFS2; 9221 dp2 = (struct ufs2_dinode *)bp->b_data + 9222 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 9223 freelink = dp2->di_freelink; 9224 } 9225 /* 9226 * If we wrote a valid freelink pointer during the last write 9227 * record it here. 9228 */ 9229 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 9230 struct inodedep *inon; 9231 9232 inon = TAILQ_NEXT(inodedep, id_unlinked); 9233 if ((inon == NULL && freelink == 0) || 9234 (inon && inon->id_ino == freelink)) { 9235 if (inon) 9236 inon->id_state |= UNLINKPREV; 9237 inodedep->id_state |= UNLINKNEXT; 9238 } else 9239 hadchanges = 1; 9240 } 9241 /* Leave this inodeblock dirty until it's in the list. */ 9242 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) 9243 hadchanges = 1; 9244 /* 9245 * If we had to rollback the inode allocation because of 9246 * bitmaps being incomplete, then simply restore it. 9247 * Keep the block dirty so that it will not be reclaimed until 9248 * all associated dependencies have been cleared and the 9249 * corresponding updates written to disk. 9250 */ 9251 if (inodedep->id_savedino1 != NULL) { 9252 hadchanges = 1; 9253 if (fstype == UFS1) 9254 *dp1 = *inodedep->id_savedino1; 9255 else 9256 *dp2 = *inodedep->id_savedino2; 9257 free(inodedep->id_savedino1, M_SAVEDINO); 9258 inodedep->id_savedino1 = NULL; 9259 if ((bp->b_flags & B_DELWRI) == 0) 9260 stat_inode_bitmap++; 9261 bdirty(bp); 9262 /* 9263 * If the inode is clear here and GOINGAWAY it will never 9264 * be written. Process the bufwait and clear any pending 9265 * work which may include the freefile. 9266 */ 9267 if (inodedep->id_state & GOINGAWAY) 9268 goto bufwait; 9269 return (1); 9270 } 9271 inodedep->id_state |= COMPLETE; 9272 /* 9273 * Roll forward anything that had to be rolled back before 9274 * the inode could be updated. 
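 * Each allocdirect on id_inoupdt (and on id_extupdt for UFS2 extended
 * data) has its new block number patched into the dinode image in the
 * buffer, after verifying that the on-disk copy still holds the
 * rolled-back value (the old block number for direct pointers, zero for
 * indirect pointers); anything else means a rollback was lost.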
9275 */ 9276 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { 9277 nextadp = TAILQ_NEXT(adp, ad_next); 9278 if (adp->ad_state & ATTACHED) 9279 panic("handle_written_inodeblock: new entry"); 9280 if (fstype == UFS1) { 9281 if (adp->ad_offset < NDADDR) { 9282 if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) 9283 panic("%s %s #%jd mismatch %d != %jd", 9284 "handle_written_inodeblock:", 9285 "direct pointer", 9286 (intmax_t)adp->ad_offset, 9287 dp1->di_db[adp->ad_offset], 9288 (intmax_t)adp->ad_oldblkno); 9289 dp1->di_db[adp->ad_offset] = adp->ad_newblkno; 9290 } else { 9291 if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) 9292 panic("%s: %s #%jd allocated as %d", 9293 "handle_written_inodeblock", 9294 "indirect pointer", 9295 (intmax_t)adp->ad_offset - NDADDR, 9296 dp1->di_ib[adp->ad_offset - NDADDR]); 9297 dp1->di_ib[adp->ad_offset - NDADDR] = 9298 adp->ad_newblkno; 9299 } 9300 } else { 9301 if (adp->ad_offset < NDADDR) { 9302 if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) 9303 panic("%s: %s #%jd %s %jd != %jd", 9304 "handle_written_inodeblock", 9305 "direct pointer", 9306 (intmax_t)adp->ad_offset, "mismatch", 9307 (intmax_t)dp2->di_db[adp->ad_offset], 9308 (intmax_t)adp->ad_oldblkno); 9309 dp2->di_db[adp->ad_offset] = adp->ad_newblkno; 9310 } else { 9311 if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) 9312 panic("%s: %s #%jd allocated as %jd", 9313 "handle_written_inodeblock", 9314 "indirect pointer", 9315 (intmax_t)adp->ad_offset - NDADDR, 9316 (intmax_t) 9317 dp2->di_ib[adp->ad_offset - NDADDR]); 9318 dp2->di_ib[adp->ad_offset - NDADDR] = 9319 adp->ad_newblkno; 9320 } 9321 } 9322 adp->ad_state &= ~UNDONE; 9323 adp->ad_state |= ATTACHED; 9324 hadchanges = 1; 9325 } 9326 for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { 9327 nextadp = TAILQ_NEXT(adp, ad_next); 9328 if (adp->ad_state & ATTACHED) 9329 panic("handle_written_inodeblock: new entry"); 9330 if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) 9331 panic("%s: direct pointers #%jd %s %jd != %jd", 9332 "handle_written_inodeblock", 9333 (intmax_t)adp->ad_offset, "mismatch", 9334 (intmax_t)dp2->di_extb[adp->ad_offset], 9335 (intmax_t)adp->ad_oldblkno); 9336 dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; 9337 adp->ad_state &= ~UNDONE; 9338 adp->ad_state |= ATTACHED; 9339 hadchanges = 1; 9340 } 9341 if (hadchanges && (bp->b_flags & B_DELWRI) == 0) 9342 stat_direct_blk_ptrs++; 9343 /* 9344 * Reset the file size to its most up-to-date value. 
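 * The saved size, extended attribute size, and link count are copied back
 * into the dinode if they differ from what was written, and are then
 * reset to -1, the value that marks them as no longer valid.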
9345 */
9346 if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
9347 panic("handle_written_inodeblock: bad size");
9348 if (inodedep->id_savednlink > LINK_MAX)
9349 panic("handle_written_inodeblock: Invalid link count "
9350 "%d for inodedep %p", inodedep->id_savednlink, inodedep);
9351 if (fstype == UFS1) {
9352 if (dp1->di_nlink != inodedep->id_savednlink) {
9353 dp1->di_nlink = inodedep->id_savednlink;
9354 hadchanges = 1;
9355 }
9356 if (dp1->di_size != inodedep->id_savedsize) {
9357 dp1->di_size = inodedep->id_savedsize;
9358 hadchanges = 1;
9359 }
9360 } else {
9361 if (dp2->di_nlink != inodedep->id_savednlink) {
9362 dp2->di_nlink = inodedep->id_savednlink;
9363 hadchanges = 1;
9364 }
9365 if (dp2->di_size != inodedep->id_savedsize) {
9366 dp2->di_size = inodedep->id_savedsize;
9367 hadchanges = 1;
9368 }
9369 if (dp2->di_extsize != inodedep->id_savedextsize) {
9370 dp2->di_extsize = inodedep->id_savedextsize;
9371 hadchanges = 1;
9372 }
9373 }
9374 inodedep->id_savedsize = -1;
9375 inodedep->id_savedextsize = -1;
9376 inodedep->id_savednlink = -1;
9377 /*
9378 * If there were any rollbacks in the inode block, then it must be
9379 * marked dirty so that it will eventually get written back in
9380 * its correct form.
9381 */
9382 if (hadchanges)
9383 bdirty(bp);
9384 bufwait:
9385 /*
9386 * Process any allocdirects that completed during the update.
9387 */
9388 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
9389 handle_allocdirect_partdone(adp, &wkhd);
9390 if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
9391 handle_allocdirect_partdone(adp, &wkhd);
9392 /*
9393 * Process deallocations that were held pending until the
9394 * inode had been written to disk. Freeing of the inode
9395 * is delayed until after all blocks have been freed to
9396 * avoid creation of new <vfsid, inum, lbn> triples
9397 * before the old ones have been deleted. Completely
9398 * unlinked inodes are not processed until the unlinked
9399 * inode list is written or the last reference is removed.
9400 */
9401 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
9402 freefile = handle_bufwait(inodedep, NULL);
9403 if (freefile && !LIST_EMPTY(&wkhd)) {
9404 WORKLIST_INSERT(&wkhd, &freefile->fx_list);
9405 freefile = NULL;
9406 }
9407 }
9408 /*
9409 * Move rolled forward dependency completions to the bufwait list
9410 * now that those that were already written have been processed.
9411 */
9412 if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
9413 panic("handle_written_inodeblock: bufwait but no changes");
9414 jwork_move(&inodedep->id_bufwait, &wkhd);
9415
9416 if (freefile != NULL) {
9417 /*
9418 * If the inode is goingaway it was never written. Fake up
9419 * the state here so free_inodedep() can succeed.
9420 */
9421 if (inodedep->id_state & GOINGAWAY)
9422 inodedep->id_state |= COMPLETE | DEPCOMPLETE;
9423 if (free_inodedep(inodedep) == 0)
9424 panic("handle_written_inodeblock: live inodedep %p",
9425 inodedep);
9426 add_to_worklist(&freefile->fx_list, 0);
9427 return (0);
9428 }
9429
9430 /*
9431 * If no outstanding dependencies, free it.
9432 */
9433 if (free_inodedep(inodedep) ||
9434 (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
9435 TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
9436 TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
9437 LIST_FIRST(&inodedep->id_bufwait) == 0))
9438 return (0);
9439 return (hadchanges);
9440 }
9441
9442 static int
9443 handle_written_indirdep(indirdep, bp, bpp)
9444 struct indirdep *indirdep;
9445 struct buf *bp;
9446 struct buf **bpp;
9447 {
9448 struct allocindir *aip;
9449 int chgs;
9450
9451 if (indirdep->ir_state & GOINGAWAY)
9452 panic("disk_write_complete: indirdep gone");
9453 chgs = 0;
9454 /*
9455 * If there were rollbacks revert them here.
9456 */
9457 if (indirdep->ir_saveddata) {
9458 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
9459 free(indirdep->ir_saveddata, M_INDIRDEP);
9460 indirdep->ir_saveddata = 0;
9461 chgs = 1;
9462 }
9463 indirdep->ir_state &= ~UNDONE;
9464 indirdep->ir_state |= ATTACHED;
9465 /*
9466 * Move allocindirs with written pointers to the completehd if
9467 * the indirdep's pointer is not yet written. Otherwise
9468 * free them here.
9469 */
9470 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
9471 LIST_REMOVE(aip, ai_next);
9472 if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
9473 LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
9474 ai_next);
9475 continue;
9476 }
9477 free_newblk(&aip->ai_block);
9478 }
9479 /*
9480 * Move allocindirs that have finished dependency processing from
9481 * the done list to the write list after updating the pointers.
9482 */
9483 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
9484 handle_allocindir_partdone(aip);
9485 if (aip == LIST_FIRST(&indirdep->ir_donehd))
9486 panic("disk_write_complete: not gone");
9487 chgs = 1;
9488 }
9489 /*
9490 * If this indirdep has been detached from its newblk during
9491 * I/O we need to keep this dep attached to the buffer so
9492 * deallocate_dependencies can find it and properly resolve
9493 * any outstanding dependencies.
9494 */
9495 if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0)
9496 chgs = 1;
9497 if ((bp->b_flags & B_DELWRI) == 0)
9498 stat_indir_blk_ptrs++;
9499 /*
9500 * If there were no changes we can discard the savedbp and detach
9501 * ourselves from the buf. We are only carrying completed pointers
9502 * in this case.
9503 */
9504 if (chgs == 0) {
9505 struct buf *sbp;
9506
9507 sbp = indirdep->ir_savebp;
9508 sbp->b_flags |= B_INVAL | B_NOCACHE;
9509 indirdep->ir_savebp = NULL;
9510 if (*bpp != NULL)
9511 panic("handle_written_indirdep: bp already exists.");
9512 *bpp = sbp;
9513 } else
9514 bdirty(bp);
9515 /*
9516 * If there are no fresh dependencies and none waiting on writes
9517 * we can free the indirdep.
9518 */
9519 if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) {
9520 if (indirdep->ir_state & ONDEPLIST)
9521 LIST_REMOVE(indirdep, ir_next);
9522 free_indirdep(indirdep);
9523 return (0);
9524 }
9525
9526 return (chgs);
9527 }
9528
9529 /*
9530 * Process a diradd entry after its dependent inode has been written.
9531 * This routine must be called with splbio interrupts blocked.
9532 */
9533 static void
9534 diradd_inode_written(dap, inodedep)
9535 struct diradd *dap;
9536 struct inodedep *inodedep;
9537 {
9538
9539 dap->da_state |= COMPLETE;
9540 complete_diradd(dap);
9541 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9542 }
9543
9544 /*
9545 * Returns true if the bmsafemap will have rollbacks when written. Must
9546 * only be called with lk and the buf lock on the cg held.
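 * Any jaddref or jnewblk still waiting on its journal write forces a
 * rollback, because initiate_write_bmsafemap() must temporarily undo the
 * corresponding bits in the cylinder group copy being pushed to disk.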
9547 */ 9548 static int 9549 bmsafemap_rollbacks(bmsafemap) 9550 struct bmsafemap *bmsafemap; 9551 { 9552 9553 return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | 9554 !LIST_EMPTY(&bmsafemap->sm_jnewblkhd)); 9555 } 9556 9557 /* 9558 * Complete a write to a bmsafemap structure. Roll forward any bitmap 9559 * changes if it's not a background write. Set all written dependencies 9560 * to DEPCOMPLETE and free the structure if possible. 9561 */ 9562 static int 9563 handle_written_bmsafemap(bmsafemap, bp) 9564 struct bmsafemap *bmsafemap; 9565 struct buf *bp; 9566 { 9567 struct newblk *newblk; 9568 struct inodedep *inodedep; 9569 struct jaddref *jaddref, *jatmp; 9570 struct jnewblk *jnewblk, *jntmp; 9571 uint8_t *inosused; 9572 uint8_t *blksfree; 9573 struct cg *cgp; 9574 struct fs *fs; 9575 ino_t ino; 9576 long bno; 9577 int chgs; 9578 int i; 9579 9580 if ((bmsafemap->sm_state & IOSTARTED) == 0) 9581 panic("initiate_write_bmsafemap: Not started\n"); 9582 chgs = 0; 9583 bmsafemap->sm_state &= ~IOSTARTED; 9584 /* 9585 * Restore unwritten inode allocation pending jaddref writes. 9586 */ 9587 if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { 9588 cgp = (struct cg *)bp->b_data; 9589 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 9590 inosused = cg_inosused(cgp); 9591 LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, 9592 ja_bmdeps, jatmp) { 9593 if ((jaddref->ja_state & UNDONE) == 0) 9594 continue; 9595 ino = jaddref->ja_ino % fs->fs_ipg; 9596 if (isset(inosused, ino)) 9597 panic("handle_written_bmsafemap: " 9598 "re-allocated inode"); 9599 if ((bp->b_xflags & BX_BKGRDMARKER) == 0) { 9600 if ((jaddref->ja_mode & IFMT) == IFDIR) 9601 cgp->cg_cs.cs_ndir++; 9602 cgp->cg_cs.cs_nifree--; 9603 setbit(inosused, ino); 9604 chgs = 1; 9605 } 9606 jaddref->ja_state &= ~UNDONE; 9607 jaddref->ja_state |= ATTACHED; 9608 free_jaddref(jaddref); 9609 } 9610 } 9611 /* 9612 * Restore any block allocations which are pending journal writes. 
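 * Each fragment that was shown as free for the duration of the write is
 * marked allocated again in the in-core cylinder group (skipped for
 * background copies), and the jnewblk moves from UNDONE back to ATTACHED
 * before it is freed.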
9613 */ 9614 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 9615 cgp = (struct cg *)bp->b_data; 9616 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 9617 blksfree = cg_blksfree(cgp); 9618 LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, 9619 jntmp) { 9620 if ((jnewblk->jn_state & UNDONE) == 0) 9621 continue; 9622 bno = dtogd(fs, jnewblk->jn_blkno); 9623 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 9624 i++) { 9625 if (bp->b_xflags & BX_BKGRDMARKER) 9626 break; 9627 if ((jnewblk->jn_state & NEWBLOCK) == 0 && 9628 isclr(blksfree, bno + i)) 9629 panic("handle_written_bmsafemap: " 9630 "re-allocated fragment"); 9631 clrbit(blksfree, bno + i); 9632 chgs = 1; 9633 } 9634 jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); 9635 jnewblk->jn_state |= ATTACHED; 9636 free_jnewblk(jnewblk); 9637 } 9638 } 9639 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { 9640 newblk->nb_state |= DEPCOMPLETE; 9641 newblk->nb_state &= ~ONDEPLIST; 9642 newblk->nb_bmsafemap = NULL; 9643 LIST_REMOVE(newblk, nb_deps); 9644 if (newblk->nb_list.wk_type == D_ALLOCDIRECT) 9645 handle_allocdirect_partdone( 9646 WK_ALLOCDIRECT(&newblk->nb_list), NULL); 9647 else if (newblk->nb_list.wk_type == D_ALLOCINDIR) 9648 handle_allocindir_partdone( 9649 WK_ALLOCINDIR(&newblk->nb_list)); 9650 else if (newblk->nb_list.wk_type != D_NEWBLK) 9651 panic("handle_written_bmsafemap: Unexpected type: %s", 9652 TYPENAME(newblk->nb_list.wk_type)); 9653 } 9654 while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { 9655 inodedep->id_state |= DEPCOMPLETE; 9656 inodedep->id_state &= ~ONDEPLIST; 9657 LIST_REMOVE(inodedep, id_deps); 9658 inodedep->id_bmsafemap = NULL; 9659 } 9660 if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && 9661 LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && 9662 LIST_EMPTY(&bmsafemap->sm_newblkhd) && 9663 LIST_EMPTY(&bmsafemap->sm_inodedephd)) { 9664 if (chgs) 9665 bdirty(bp); 9666 LIST_REMOVE(bmsafemap, sm_hash); 9667 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 9668 return (0); 9669 } 9670 bdirty(bp); 9671 return (1); 9672 } 9673 9674 /* 9675 * Try to free a mkdir dependency. 9676 */ 9677 static void 9678 complete_mkdir(mkdir) 9679 struct mkdir *mkdir; 9680 { 9681 struct diradd *dap; 9682 9683 if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) 9684 return; 9685 LIST_REMOVE(mkdir, md_mkdirs); 9686 dap = mkdir->md_diradd; 9687 dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 9688 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { 9689 dap->da_state |= DEPCOMPLETE; 9690 complete_diradd(dap); 9691 } 9692 WORKITEM_FREE(mkdir, D_MKDIR); 9693 } 9694 9695 /* 9696 * Handle the completion of a mkdir dependency. 9697 */ 9698 static void 9699 handle_written_mkdir(mkdir, type) 9700 struct mkdir *mkdir; 9701 int type; 9702 { 9703 9704 if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) 9705 panic("handle_written_mkdir: bad type"); 9706 mkdir->md_state |= COMPLETE; 9707 complete_mkdir(mkdir); 9708 } 9709 9710 static void 9711 free_pagedep(pagedep) 9712 struct pagedep *pagedep; 9713 { 9714 int i; 9715 9716 if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST)) 9717 return; 9718 for (i = 0; i < DAHASHSZ; i++) 9719 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) 9720 return; 9721 if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) 9722 return; 9723 if (!LIST_EMPTY(&pagedep->pd_dirremhd)) 9724 return; 9725 if (!LIST_EMPTY(&pagedep->pd_pendinghd)) 9726 return; 9727 LIST_REMOVE(pagedep, pd_hash); 9728 WORKITEM_FREE(pagedep, D_PAGEDEP); 9729 } 9730 9731 /* 9732 * Called from within softdep_disk_write_complete above. 
9733 * A write operation was just completed. Removed inodes can
9734 * now be freed and associated block pointers may be committed.
9735 * Note that this routine is always called from interrupt level
9736 * with further splbio interrupts blocked.
9737 */
9738 static int
9739 handle_written_filepage(pagedep, bp)
9740 struct pagedep *pagedep;
9741 struct buf *bp; /* buffer containing the written page */
9742 {
9743 struct dirrem *dirrem;
9744 struct diradd *dap, *nextdap;
9745 struct direct *ep;
9746 int i, chgs;
9747
9748 if ((pagedep->pd_state & IOSTARTED) == 0)
9749 panic("handle_written_filepage: not started");
9750 pagedep->pd_state &= ~IOSTARTED;
9751 /*
9752 * Process any directory removals that have been committed.
9753 */
9754 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
9755 LIST_REMOVE(dirrem, dm_next);
9756 dirrem->dm_state |= COMPLETE;
9757 dirrem->dm_dirinum = pagedep->pd_ino;
9758 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9759 ("handle_written_filepage: Journal entries not written."));
9760 add_to_worklist(&dirrem->dm_list, 0);
9761 }
9762 /*
9763 * Free any directory additions that have been committed.
9764 * If it is a newly allocated block, we have to wait until
9765 * the on-disk directory inode claims the new block.
9766 */
9767 if ((pagedep->pd_state & NEWBLOCK) == 0)
9768 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
9769 free_diradd(dap, NULL);
9770 /*
9771 * Uncommitted directory entries must be restored.
9772 */
9773 for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
9774 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
9775 dap = nextdap) {
9776 nextdap = LIST_NEXT(dap, da_pdlist);
9777 if (dap->da_state & ATTACHED)
9778 panic("handle_written_filepage: attached");
9779 ep = (struct direct *)
9780 ((char *)bp->b_data + dap->da_offset);
9781 ep->d_ino = dap->da_newinum;
9782 dap->da_state &= ~UNDONE;
9783 dap->da_state |= ATTACHED;
9784 chgs = 1;
9785 /*
9786 * If the inode referenced by the directory has
9787 * been written out, then the dependency can be
9788 * moved to the pending list.
9789 */
9790 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
9791 LIST_REMOVE(dap, da_pdlist);
9792 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
9793 da_pdlist);
9794 }
9795 }
9796 }
9797 /*
9798 * If there were any rollbacks in the directory, then it must be
9799 * marked dirty so that it will eventually get written back in
9800 * its correct form.
9801 */
9802 if (chgs) {
9803 if ((bp->b_flags & B_DELWRI) == 0)
9804 stat_dir_entry++;
9805 bdirty(bp);
9806 return (1);
9807 }
9808 /*
9809 * If we are not waiting for a new directory block to be
9810 * claimed by its inode, then the pagedep will be freed.
9811 * Otherwise it will remain to track any new entries on
9812 * the page in case they are fsync'ed.
9813 */
9814 if ((pagedep->pd_state & NEWBLOCK) == 0 &&
9815 LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
9816 LIST_REMOVE(pagedep, pd_hash);
9817 WORKITEM_FREE(pagedep, D_PAGEDEP);
9818 }
9819 return (0);
9820 }
9821
9822 /*
9823 * Writing back in-core inode structures.
9824 *
9825 * The filesystem only accesses an inode's contents when it occupies an
9826 * "in-core" inode structure. These "in-core" structures are separate from
9827 * the page frames used to cache inode blocks. Only the latter are
9828 * transferred to/from the disk. So, when the updated contents of the
9829 * "in-core" inode structure are copied to the corresponding in-memory inode
9830 * block, the dependencies are also transferred.
The following procedure is 9831 * called when copying a dirty "in-core" inode to a cached inode block. 9832 */ 9833 9834 /* 9835 * Called when an inode is loaded from disk. If the effective link count 9836 * differed from the actual link count when it was last flushed, then we 9837 * need to ensure that the correct effective link count is put back. 9838 */ 9839 void 9840 softdep_load_inodeblock(ip) 9841 struct inode *ip; /* the "in_core" copy of the inode */ 9842 { 9843 struct inodedep *inodedep; 9844 9845 /* 9846 * Check for alternate nlink count. 9847 */ 9848 ip->i_effnlink = ip->i_nlink; 9849 ACQUIRE_LOCK(&lk); 9850 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 9851 &inodedep) == 0) { 9852 FREE_LOCK(&lk); 9853 return; 9854 } 9855 ip->i_effnlink -= inodedep->id_nlinkdelta; 9856 FREE_LOCK(&lk); 9857 } 9858 9859 /* 9860 * This routine is called just before the "in-core" inode 9861 * information is to be copied to the in-memory inode block. 9862 * Recall that an inode block contains several inodes. If 9863 * the force flag is set, then the dependencies will be 9864 * cleared so that the update can always be made. Note that 9865 * the buffer is locked when this routine is called, so we 9866 * will never be in the middle of writing the inode block 9867 * to disk. 9868 */ 9869 void 9870 softdep_update_inodeblock(ip, bp, waitfor) 9871 struct inode *ip; /* the "in_core" copy of the inode */ 9872 struct buf *bp; /* the buffer containing the inode block */ 9873 int waitfor; /* nonzero => update must be allowed */ 9874 { 9875 struct inodedep *inodedep; 9876 struct inoref *inoref; 9877 struct worklist *wk; 9878 struct mount *mp; 9879 struct buf *ibp; 9880 struct fs *fs; 9881 int error; 9882 9883 mp = UFSTOVFS(ip->i_ump); 9884 fs = ip->i_fs; 9885 /* 9886 * Preserve the freelink that is on disk. clear_unlinked_inodedep() 9887 * does not have access to the in-core ip so must write directly into 9888 * the inode block buffer when setting freelink. 9889 */ 9890 if (fs->fs_magic == FS_UFS1_MAGIC) 9891 DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data + 9892 ino_to_fsbo(fs, ip->i_number))->di_freelink); 9893 else 9894 DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data + 9895 ino_to_fsbo(fs, ip->i_number))->di_freelink); 9896 /* 9897 * If the effective link count is not equal to the actual link 9898 * count, then we must track the difference in an inodedep while 9899 * the inode is (potentially) tossed out of the cache. Otherwise, 9900 * if there is no existing inodedep, then there are no dependencies 9901 * to track. 9902 */ 9903 ACQUIRE_LOCK(&lk); 9904 again: 9905 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 9906 FREE_LOCK(&lk); 9907 if (ip->i_effnlink != ip->i_nlink) 9908 panic("softdep_update_inodeblock: bad link count"); 9909 return; 9910 } 9911 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) 9912 panic("softdep_update_inodeblock: bad delta"); 9913 /* 9914 * If we're flushing all dependencies we must also move any waiting 9915 * for journal writes onto the bufwait list prior to I/O. 9916 */ 9917 if (waitfor) { 9918 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 9919 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 9920 == DEPCOMPLETE) { 9921 stat_jwait_inode++; 9922 jwait(&inoref->if_list); 9923 goto again; 9924 } 9925 } 9926 } 9927 /* 9928 * Changes have been initiated. Anything depending on these 9929 * changes cannot occur until this inode has been written. 
9930 */
9931 inodedep->id_state &= ~COMPLETE;
9932 if ((inodedep->id_state & ONWORKLIST) == 0)
9933 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
9934 /*
9935 * Any new dependencies associated with the incore inode must
9936 * now be moved to the list associated with the buffer holding
9937 * the in-memory copy of the inode. Once merged process any
9938 * allocdirects that are completed by the merger.
9939 */
9940 merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
9941 if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
9942 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
9943 NULL);
9944 merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
9945 if (!TAILQ_EMPTY(&inodedep->id_extupdt))
9946 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
9947 NULL);
9948 /*
9949 * Now that the inode has been pushed into the buffer, the
9950 * operations dependent on the inode being written to disk
9951 * can be moved to the id_bufwait so that they will be
9952 * processed when the buffer I/O completes.
9953 */
9954 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
9955 WORKLIST_REMOVE(wk);
9956 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
9957 }
9958 /*
9959 * Newly allocated inodes cannot be written until the bitmap
9960 * that allocates them has been written (indicated by
9961 * DEPCOMPLETE being set in id_state). If we are doing a
9962 * forced sync (e.g., an fsync on a file), we force the bitmap
9963 * to be written so that the update can be done.
9964 */
9965 if (waitfor == 0) {
9966 FREE_LOCK(&lk);
9967 return;
9968 }
9969 retry:
9970 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
9971 FREE_LOCK(&lk);
9972 return;
9973 }
9974 ibp = inodedep->id_bmsafemap->sm_buf;
9975 ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
9976 if (ibp == NULL) {
9977 /*
9978 * If ibp came back as NULL, the dependency could have been
9979 * freed while we slept. Look it up again, and check to see
9980 * that it has completed.
9981 */
9982 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
9983 goto retry;
9984 FREE_LOCK(&lk);
9985 return;
9986 }
9987 FREE_LOCK(&lk);
9988 if ((error = bwrite(ibp)) != 0)
9989 softdep_error("softdep_update_inodeblock: bwrite", error);
9990 }
9991
9992 /*
9993 * Merge a new inode dependency list (such as id_newinoupdt) into an
9994 * old inode dependency list (such as id_inoupdt). This routine must be
9995 * called with splbio interrupts blocked.
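 * Entries are spliced into the old list in ascending ad_offset order;
 * when both lists carry a dependency for the same offset the pair is
 * collapsed by allocdirect_merge(), and whatever remains on the new list
 * is appended at the tail.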
9996 */ 9997 static void 9998 merge_inode_lists(newlisthead, oldlisthead) 9999 struct allocdirectlst *newlisthead; 10000 struct allocdirectlst *oldlisthead; 10001 { 10002 struct allocdirect *listadp, *newadp; 10003 10004 newadp = TAILQ_FIRST(newlisthead); 10005 for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { 10006 if (listadp->ad_offset < newadp->ad_offset) { 10007 listadp = TAILQ_NEXT(listadp, ad_next); 10008 continue; 10009 } 10010 TAILQ_REMOVE(newlisthead, newadp, ad_next); 10011 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); 10012 if (listadp->ad_offset == newadp->ad_offset) { 10013 allocdirect_merge(oldlisthead, newadp, 10014 listadp); 10015 listadp = newadp; 10016 } 10017 newadp = TAILQ_FIRST(newlisthead); 10018 } 10019 while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { 10020 TAILQ_REMOVE(newlisthead, newadp, ad_next); 10021 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); 10022 } 10023 } 10024 10025 /* 10026 * If we are doing an fsync, then we must ensure that any directory 10027 * entries for the inode have been written after the inode gets to disk. 10028 */ 10029 int 10030 softdep_fsync(vp) 10031 struct vnode *vp; /* the "in_core" copy of the inode */ 10032 { 10033 struct inodedep *inodedep; 10034 struct pagedep *pagedep; 10035 struct inoref *inoref; 10036 struct worklist *wk; 10037 struct diradd *dap; 10038 struct mount *mp; 10039 struct vnode *pvp; 10040 struct inode *ip; 10041 struct buf *bp; 10042 struct fs *fs; 10043 struct thread *td = curthread; 10044 int error, flushparent, pagedep_new_block; 10045 ino_t parentino; 10046 ufs_lbn_t lbn; 10047 10048 ip = VTOI(vp); 10049 fs = ip->i_fs; 10050 mp = vp->v_mount; 10051 ACQUIRE_LOCK(&lk); 10052 restart: 10053 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 10054 FREE_LOCK(&lk); 10055 return (0); 10056 } 10057 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 10058 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 10059 == DEPCOMPLETE) { 10060 stat_jwait_inode++; 10061 jwait(&inoref->if_list); 10062 goto restart; 10063 } 10064 } 10065 if (!LIST_EMPTY(&inodedep->id_inowait) || 10066 !TAILQ_EMPTY(&inodedep->id_extupdt) || 10067 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 10068 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 10069 !TAILQ_EMPTY(&inodedep->id_newinoupdt)) 10070 panic("softdep_fsync: pending ops %p", inodedep); 10071 for (error = 0, flushparent = 0; ; ) { 10072 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) 10073 break; 10074 if (wk->wk_type != D_DIRADD) 10075 panic("softdep_fsync: Unexpected type %s", 10076 TYPENAME(wk->wk_type)); 10077 dap = WK_DIRADD(wk); 10078 /* 10079 * Flush our parent if this directory entry has a MKDIR_PARENT 10080 * dependency or is contained in a newly allocated block. 10081 */ 10082 if (dap->da_state & DIRCHG) 10083 pagedep = dap->da_previous->dm_pagedep; 10084 else 10085 pagedep = dap->da_pagedep; 10086 parentino = pagedep->pd_ino; 10087 lbn = pagedep->pd_lbn; 10088 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) 10089 panic("softdep_fsync: dirty"); 10090 if ((dap->da_state & MKDIR_PARENT) || 10091 (pagedep->pd_state & NEWBLOCK)) 10092 flushparent = 1; 10093 else 10094 flushparent = 0; 10095 /* 10096 * If we are being fsync'ed as part of vgone'ing this vnode, 10097 * then we will not be able to release and recover the 10098 * vnode below, so we just have to give up on writing its 10099 * directory entry out. 
It will eventually be written, just 10100 * not now, but then the user was not asking to have it 10101 * written, so we are not breaking any promises. 10102 */ 10103 if (vp->v_iflag & VI_DOOMED) 10104 break; 10105 /* 10106 * We prevent deadlock by always fetching inodes from the 10107 * root, moving down the directory tree. Thus, when fetching 10108 * our parent directory, we first try to get the lock. If 10109 * that fails, we must unlock ourselves before requesting 10110 * the lock on our parent. See the comment in ufs_lookup 10111 * for details on possible races. 10112 */ 10113 FREE_LOCK(&lk); 10114 if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, 10115 FFSV_FORCEINSMQ)) { 10116 error = vfs_busy(mp, MBF_NOWAIT); 10117 if (error != 0) { 10118 vfs_ref(mp); 10119 VOP_UNLOCK(vp, 0); 10120 error = vfs_busy(mp, 0); 10121 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 10122 vfs_rel(mp); 10123 if (error != 0) 10124 return (ENOENT); 10125 if (vp->v_iflag & VI_DOOMED) { 10126 vfs_unbusy(mp); 10127 return (ENOENT); 10128 } 10129 } 10130 VOP_UNLOCK(vp, 0); 10131 error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, 10132 &pvp, FFSV_FORCEINSMQ); 10133 vfs_unbusy(mp); 10134 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 10135 if (vp->v_iflag & VI_DOOMED) { 10136 if (error == 0) 10137 vput(pvp); 10138 error = ENOENT; 10139 } 10140 if (error != 0) 10141 return (error); 10142 } 10143 /* 10144 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps 10145 * that are contained in direct blocks will be resolved by 10146 * doing a ffs_update. Pagedeps contained in indirect blocks 10147 * may require a complete sync'ing of the directory. So, we 10148 * try the cheap and fast ffs_update first, and if that fails, 10149 * then we do the slower ffs_syncvnode of the directory. 10150 */ 10151 if (flushparent) { 10152 int locked; 10153 10154 if ((error = ffs_update(pvp, 1)) != 0) { 10155 vput(pvp); 10156 return (error); 10157 } 10158 ACQUIRE_LOCK(&lk); 10159 locked = 1; 10160 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { 10161 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { 10162 if (wk->wk_type != D_DIRADD) 10163 panic("softdep_fsync: Unexpected type %s", 10164 TYPENAME(wk->wk_type)); 10165 dap = WK_DIRADD(wk); 10166 if (dap->da_state & DIRCHG) 10167 pagedep = dap->da_previous->dm_pagedep; 10168 else 10169 pagedep = dap->da_pagedep; 10170 pagedep_new_block = pagedep->pd_state & NEWBLOCK; 10171 FREE_LOCK(&lk); 10172 locked = 0; 10173 if (pagedep_new_block && 10174 (error = ffs_syncvnode(pvp, MNT_WAIT))) { 10175 vput(pvp); 10176 return (error); 10177 } 10178 } 10179 } 10180 if (locked) 10181 FREE_LOCK(&lk); 10182 } 10183 /* 10184 * Flush directory page containing the inode's name. 10185 */ 10186 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, 10187 &bp); 10188 if (error == 0) 10189 error = bwrite(bp); 10190 else 10191 brelse(bp); 10192 vput(pvp); 10193 if (error != 0) 10194 return (error); 10195 ACQUIRE_LOCK(&lk); 10196 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 10197 break; 10198 } 10199 FREE_LOCK(&lk); 10200 return (0); 10201 } 10202 10203 /* 10204 * Flush all the dirty bitmaps associated with the block device 10205 * before flushing the rest of the dirty blocks so as to reduce 10206 * the number of dependencies that will have to be rolled back. 
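 * Only buffers whose dependency list begins with a bmsafemap are pushed
 * here; each one found is started with bawrite() and the scan of the
 * device's dirty list restarts from the top, since the locks are dropped
 * while the write is initiated.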
10207 */ 10208 void 10209 softdep_fsync_mountdev(vp) 10210 struct vnode *vp; 10211 { 10212 struct buf *bp, *nbp; 10213 struct worklist *wk; 10214 struct bufobj *bo; 10215 10216 if (!vn_isdisk(vp, NULL)) 10217 panic("softdep_fsync_mountdev: vnode not a disk"); 10218 bo = &vp->v_bufobj; 10219 restart: 10220 BO_LOCK(bo); 10221 ACQUIRE_LOCK(&lk); 10222 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 10223 /* 10224 * If it is already scheduled, skip to the next buffer. 10225 */ 10226 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) 10227 continue; 10228 10229 if ((bp->b_flags & B_DELWRI) == 0) 10230 panic("softdep_fsync_mountdev: not dirty"); 10231 /* 10232 * We are only interested in bitmaps with outstanding 10233 * dependencies. 10234 */ 10235 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || 10236 wk->wk_type != D_BMSAFEMAP || 10237 (bp->b_vflags & BV_BKGRDINPROG)) { 10238 BUF_UNLOCK(bp); 10239 continue; 10240 } 10241 FREE_LOCK(&lk); 10242 BO_UNLOCK(bo); 10243 bremfree(bp); 10244 (void) bawrite(bp); 10245 goto restart; 10246 } 10247 FREE_LOCK(&lk); 10248 drain_output(vp); 10249 BO_UNLOCK(bo); 10250 } 10251 10252 /* 10253 * This routine is called when we are trying to synchronously flush a 10254 * file. This routine must eliminate any filesystem metadata dependencies 10255 * so that the syncing routine can succeed by pushing the dirty blocks 10256 * associated with the file. If any I/O errors occur, they are returned. 10257 */ 10258 int 10259 softdep_sync_metadata(struct vnode *vp) 10260 { 10261 struct pagedep *pagedep; 10262 struct allocindir *aip; 10263 struct newblk *newblk; 10264 struct buf *bp, *nbp; 10265 struct worklist *wk; 10266 struct bufobj *bo; 10267 int i, error, waitfor; 10268 10269 if (!DOINGSOFTDEP(vp)) 10270 return (0); 10271 /* 10272 * Ensure that any direct block dependencies have been cleared. 10273 */ 10274 ACQUIRE_LOCK(&lk); 10275 if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) { 10276 FREE_LOCK(&lk); 10277 return (error); 10278 } 10279 FREE_LOCK(&lk); 10280 /* 10281 * For most files, the only metadata dependencies are the 10282 * cylinder group maps that allocate their inode or blocks. 10283 * The block allocation dependencies can be found by traversing 10284 * the dependency lists for any buffers that remain on their 10285 * dirty buffer list. The inode allocation dependency will 10286 * be resolved when the inode is updated with MNT_WAIT. 10287 * This work is done in two passes. The first pass grabs most 10288 * of the buffers and begins asynchronously writing them. The 10289 * only way to wait for these asynchronous writes is to sleep 10290 * on the filesystem vnode which may stay busy for a long time 10291 * if the filesystem is active. So, instead, we make a second 10292 * pass over the dependencies blocking on each write. In the 10293 * usual case we will be blocking against a write that we 10294 * initiated, so when it is done the dependency will have been 10295 * resolved. Thus the second pass is expected to end quickly. 10296 */ 10297 waitfor = MNT_NOWAIT; 10298 bo = &vp->v_bufobj; 10299 10300 top: 10301 /* 10302 * We must wait for any I/O in progress to finish so that 10303 * all potential buffers on the dirty list will be visible. 
10304 */ 10305 BO_LOCK(bo); 10306 drain_output(vp); 10307 while ((bp = TAILQ_FIRST(&bo->bo_dirty.bv_hd)) != NULL) { 10308 bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT); 10309 if (bp) 10310 break; 10311 } 10312 BO_UNLOCK(bo); 10313 if (bp == NULL) 10314 return (0); 10315 loop: 10316 /* While syncing snapshots, we must allow recursive lookups */ 10317 BUF_AREC(bp); 10318 ACQUIRE_LOCK(&lk); 10319 /* 10320 * As we hold the buffer locked, none of its dependencies 10321 * will disappear. 10322 */ 10323 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 10324 switch (wk->wk_type) { 10325 10326 case D_ALLOCDIRECT: 10327 case D_ALLOCINDIR: 10328 newblk = WK_NEWBLK(wk); 10329 if (newblk->nb_jnewblk != NULL) { 10330 stat_jwait_newblk++; 10331 jwait(&newblk->nb_jnewblk->jn_list); 10332 goto restart; 10333 } 10334 if (newblk->nb_state & DEPCOMPLETE) 10335 continue; 10336 nbp = newblk->nb_bmsafemap->sm_buf; 10337 nbp = getdirtybuf(nbp, &lk, waitfor); 10338 if (nbp == NULL) 10339 continue; 10340 FREE_LOCK(&lk); 10341 if (waitfor == MNT_NOWAIT) { 10342 bawrite(nbp); 10343 } else if ((error = bwrite(nbp)) != 0) { 10344 break; 10345 } 10346 ACQUIRE_LOCK(&lk); 10347 continue; 10348 10349 case D_INDIRDEP: 10350 restart: 10351 10352 LIST_FOREACH(aip, 10353 &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { 10354 newblk = (struct newblk *)aip; 10355 if (newblk->nb_jnewblk != NULL) { 10356 stat_jwait_newblk++; 10357 jwait(&newblk->nb_jnewblk->jn_list); 10358 goto restart; 10359 } 10360 if (newblk->nb_state & DEPCOMPLETE) 10361 continue; 10362 nbp = newblk->nb_bmsafemap->sm_buf; 10363 nbp = getdirtybuf(nbp, &lk, MNT_WAIT); 10364 if (nbp == NULL) 10365 goto restart; 10366 FREE_LOCK(&lk); 10367 if ((error = bwrite(nbp)) != 0) { 10368 goto loop_end; 10369 } 10370 ACQUIRE_LOCK(&lk); 10371 goto restart; 10372 } 10373 continue; 10374 10375 case D_PAGEDEP: 10376 /* 10377 * We are trying to sync a directory that may 10378 * have dependencies on both its own metadata 10379 * and/or dependencies on the inodes of any 10380 * recently allocated files. We walk its diradd 10381 * lists pushing out the associated inode. 10382 */ 10383 pagedep = WK_PAGEDEP(wk); 10384 for (i = 0; i < DAHASHSZ; i++) { 10385 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) 10386 continue; 10387 if ((error = 10388 flush_pagedep_deps(vp, wk->wk_mp, 10389 &pagedep->pd_diraddhd[i]))) { 10390 FREE_LOCK(&lk); 10391 goto loop_end; 10392 } 10393 } 10394 continue; 10395 10396 default: 10397 panic("softdep_sync_metadata: Unknown type %s", 10398 TYPENAME(wk->wk_type)); 10399 /* NOTREACHED */ 10400 } 10401 loop_end: 10402 /* We reach here only in error and unlocked */ 10403 if (error == 0) 10404 panic("softdep_sync_metadata: zero error"); 10405 BUF_NOREC(bp); 10406 bawrite(bp); 10407 return (error); 10408 } 10409 FREE_LOCK(&lk); 10410 BO_LOCK(bo); 10411 while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) { 10412 nbp = getdirtybuf(nbp, BO_MTX(bo), MNT_WAIT); 10413 if (nbp) 10414 break; 10415 } 10416 BO_UNLOCK(bo); 10417 BUF_NOREC(bp); 10418 bawrite(bp); 10419 if (nbp != NULL) { 10420 bp = nbp; 10421 goto loop; 10422 } 10423 /* 10424 * The brief unlock is to allow any pent up dependency 10425 * processing to be done. Then proceed with the second pass. 10426 */ 10427 if (waitfor == MNT_NOWAIT) { 10428 waitfor = MNT_WAIT; 10429 goto top; 10430 } 10431 10432 /* 10433 * If we have managed to get rid of all the dirty buffers, 10434 * then we are done. For certain directories and block 10435 * devices, we may need to do further work. 
10436 * 10437 * We must wait for any I/O in progress to finish so that 10438 * all potential buffers on the dirty list will be visible. 10439 */ 10440 BO_LOCK(bo); 10441 drain_output(vp); 10442 BO_UNLOCK(bo); 10443 return ffs_update(vp, 1); 10444 /* return (0); */ 10445 } 10446 10447 /* 10448 * Flush the dependencies associated with an inodedep. 10449 * Called with splbio blocked. 10450 */ 10451 static int 10452 flush_inodedep_deps(mp, ino) 10453 struct mount *mp; 10454 ino_t ino; 10455 { 10456 struct inodedep *inodedep; 10457 struct inoref *inoref; 10458 int error, waitfor; 10459 10460 /* 10461 * This work is done in two passes. The first pass grabs most 10462 * of the buffers and begins asynchronously writing them. The 10463 * only way to wait for these asynchronous writes is to sleep 10464 * on the filesystem vnode which may stay busy for a long time 10465 * if the filesystem is active. So, instead, we make a second 10466 * pass over the dependencies blocking on each write. In the 10467 * usual case we will be blocking against a write that we 10468 * initiated, so when it is done the dependency will have been 10469 * resolved. Thus the second pass is expected to end quickly. 10470 * We give a brief window at the top of the loop to allow 10471 * any pending I/O to complete. 10472 */ 10473 for (error = 0, waitfor = MNT_NOWAIT; ; ) { 10474 if (error) 10475 return (error); 10476 FREE_LOCK(&lk); 10477 ACQUIRE_LOCK(&lk); 10478 restart: 10479 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) 10480 return (0); 10481 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 10482 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 10483 == DEPCOMPLETE) { 10484 stat_jwait_inode++; 10485 jwait(&inoref->if_list); 10486 goto restart; 10487 } 10488 } 10489 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || 10490 flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || 10491 flush_deplist(&inodedep->id_extupdt, waitfor, &error) || 10492 flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) 10493 continue; 10494 /* 10495 * If pass2, we are done, otherwise do pass 2. 10496 */ 10497 if (waitfor == MNT_WAIT) 10498 break; 10499 waitfor = MNT_WAIT; 10500 } 10501 /* 10502 * Try freeing inodedep in case all dependencies have been removed. 10503 */ 10504 if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) 10505 (void) free_inodedep(inodedep); 10506 return (0); 10507 } 10508 10509 /* 10510 * Flush an inode dependency list. 10511 * Called with splbio blocked. 
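 * The return value is nonzero whenever the routine slept on a journal
 * entry or issued a write (dropping lk along the way), which tells
 * flush_inodedep_deps() to rescan its lists from the beginning.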
10512 */ 10513 static int 10514 flush_deplist(listhead, waitfor, errorp) 10515 struct allocdirectlst *listhead; 10516 int waitfor; 10517 int *errorp; 10518 { 10519 struct allocdirect *adp; 10520 struct newblk *newblk; 10521 struct buf *bp; 10522 10523 mtx_assert(&lk, MA_OWNED); 10524 TAILQ_FOREACH(adp, listhead, ad_next) { 10525 newblk = (struct newblk *)adp; 10526 if (newblk->nb_jnewblk != NULL) { 10527 stat_jwait_newblk++; 10528 jwait(&newblk->nb_jnewblk->jn_list); 10529 return (1); 10530 } 10531 if (newblk->nb_state & DEPCOMPLETE) 10532 continue; 10533 bp = newblk->nb_bmsafemap->sm_buf; 10534 bp = getdirtybuf(bp, &lk, waitfor); 10535 if (bp == NULL) { 10536 if (waitfor == MNT_NOWAIT) 10537 continue; 10538 return (1); 10539 } 10540 FREE_LOCK(&lk); 10541 if (waitfor == MNT_NOWAIT) { 10542 bawrite(bp); 10543 } else if ((*errorp = bwrite(bp)) != 0) { 10544 ACQUIRE_LOCK(&lk); 10545 return (1); 10546 } 10547 ACQUIRE_LOCK(&lk); 10548 return (1); 10549 } 10550 return (0); 10551 } 10552 10553 /* 10554 * Flush dependencies associated with an allocdirect block. 10555 */ 10556 static int 10557 flush_newblk_dep(vp, mp, lbn) 10558 struct vnode *vp; 10559 struct mount *mp; 10560 ufs_lbn_t lbn; 10561 { 10562 struct newblk *newblk; 10563 struct bufobj *bo; 10564 struct inode *ip; 10565 struct buf *bp; 10566 ufs2_daddr_t blkno; 10567 int error; 10568 10569 error = 0; 10570 bo = &vp->v_bufobj; 10571 ip = VTOI(vp); 10572 blkno = DIP(ip, i_db[lbn]); 10573 if (blkno == 0) 10574 panic("flush_newblk_dep: Missing block"); 10575 ACQUIRE_LOCK(&lk); 10576 /* 10577 * Loop until all dependencies related to this block are satisfied. 10578 * We must be careful to restart after each sleep in case a write 10579 * completes some part of this process for us. 10580 */ 10581 for (;;) { 10582 if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { 10583 FREE_LOCK(&lk); 10584 break; 10585 } 10586 if (newblk->nb_list.wk_type != D_ALLOCDIRECT) 10587 panic("flush_newblk_deps: Bad newblk %p", newblk); 10588 /* 10589 * Flush the journal. 10590 */ 10591 if (newblk->nb_jnewblk != NULL) { 10592 stat_jwait_newblk++; 10593 jwait(&newblk->nb_jnewblk->jn_list); 10594 continue; 10595 } 10596 /* 10597 * Write the bitmap dependency. 10598 */ 10599 if ((newblk->nb_state & DEPCOMPLETE) == 0) { 10600 bp = newblk->nb_bmsafemap->sm_buf; 10601 bp = getdirtybuf(bp, &lk, MNT_WAIT); 10602 if (bp == NULL) 10603 continue; 10604 FREE_LOCK(&lk); 10605 error = bwrite(bp); 10606 if (error) 10607 break; 10608 ACQUIRE_LOCK(&lk); 10609 continue; 10610 } 10611 /* 10612 * Write the buffer. 10613 */ 10614 FREE_LOCK(&lk); 10615 BO_LOCK(bo); 10616 bp = gbincore(bo, lbn); 10617 if (bp != NULL) { 10618 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 10619 LK_INTERLOCK, BO_MTX(bo)); 10620 if (error == ENOLCK) { 10621 ACQUIRE_LOCK(&lk); 10622 continue; /* Slept, retry */ 10623 } 10624 if (error != 0) 10625 break; /* Failed */ 10626 if (bp->b_flags & B_DELWRI) { 10627 bremfree(bp); 10628 error = bwrite(bp); 10629 if (error) 10630 break; 10631 } else 10632 BUF_UNLOCK(bp); 10633 } else 10634 BO_UNLOCK(bo); 10635 /* 10636 * We have to wait for the direct pointers to 10637 * point at the newdirblk before the dependency 10638 * will go away. 10639 */ 10640 error = ffs_update(vp, MNT_WAIT); 10641 if (error) 10642 break; 10643 ACQUIRE_LOCK(&lk); 10644 } 10645 return (error); 10646 } 10647 10648 /* 10649 * Eliminate a pagedep dependency by flushing out all its diradd dependencies. 10650 * Called with splbio blocked. 
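 * Each diradd is retired by flushing whatever it still waits on: the
 * parent directory itself for MKDIR_PARENT, the new directory's first
 * block (holding "." and "..") for MKDIR_BODY, and finally the inode
 * buffer, plus any bitmap it depends on, so the updated link count
 * reaches the disk.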
10651 */ 10652 static int 10653 flush_pagedep_deps(pvp, mp, diraddhdp) 10654 struct vnode *pvp; 10655 struct mount *mp; 10656 struct diraddhd *diraddhdp; 10657 { 10658 struct inodedep *inodedep; 10659 struct inoref *inoref; 10660 struct ufsmount *ump; 10661 struct diradd *dap; 10662 struct vnode *vp; 10663 int error = 0; 10664 struct buf *bp; 10665 ino_t inum; 10666 10667 ump = VFSTOUFS(mp); 10668 restart: 10669 while ((dap = LIST_FIRST(diraddhdp)) != NULL) { 10670 /* 10671 * Flush ourselves if this directory entry 10672 * has a MKDIR_PARENT dependency. 10673 */ 10674 if (dap->da_state & MKDIR_PARENT) { 10675 FREE_LOCK(&lk); 10676 if ((error = ffs_update(pvp, MNT_WAIT)) != 0) 10677 break; 10678 ACQUIRE_LOCK(&lk); 10679 /* 10680 * If that cleared dependencies, go on to next. 10681 */ 10682 if (dap != LIST_FIRST(diraddhdp)) 10683 continue; 10684 if (dap->da_state & MKDIR_PARENT) 10685 panic("flush_pagedep_deps: MKDIR_PARENT"); 10686 } 10687 /* 10688 * A newly allocated directory must have its "." and 10689 * ".." entries written out before its name can be 10690 * committed in its parent. 10691 */ 10692 inum = dap->da_newinum; 10693 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 10694 panic("flush_pagedep_deps: lost inode1"); 10695 /* 10696 * Wait for any pending journal adds to complete so we don't 10697 * cause rollbacks while syncing. 10698 */ 10699 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 10700 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 10701 == DEPCOMPLETE) { 10702 stat_jwait_inode++; 10703 jwait(&inoref->if_list); 10704 goto restart; 10705 } 10706 } 10707 if (dap->da_state & MKDIR_BODY) { 10708 FREE_LOCK(&lk); 10709 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 10710 FFSV_FORCEINSMQ))) 10711 break; 10712 error = flush_newblk_dep(vp, mp, 0); 10713 /* 10714 * If we still have the dependency we might need to 10715 * update the vnode to sync the new link count to 10716 * disk. 10717 */ 10718 if (error == 0 && dap == LIST_FIRST(diraddhdp)) 10719 error = ffs_update(vp, MNT_WAIT); 10720 vput(vp); 10721 if (error != 0) 10722 break; 10723 ACQUIRE_LOCK(&lk); 10724 /* 10725 * If that cleared dependencies, go on to next. 10726 */ 10727 if (dap != LIST_FIRST(diraddhdp)) 10728 continue; 10729 if (dap->da_state & MKDIR_BODY) { 10730 inodedep_lookup(UFSTOVFS(ump), inum, 0, 10731 &inodedep); 10732 panic("flush_pagedep_deps: MKDIR_BODY " 10733 "inodedep %p dap %p vp %p", 10734 inodedep, dap, vp); 10735 } 10736 } 10737 /* 10738 * Flush the inode on which the directory entry depends. 10739 * Having accounted for MKDIR_PARENT and MKDIR_BODY above, 10740 * the only remaining dependency is that the updated inode 10741 * count must get pushed to disk. The inode has already 10742 * been pushed into its inode buffer (via VOP_UPDATE) at 10743 * the time of the reference count change. So we need only 10744 * locate that buffer, ensure that there will be no rollback 10745 * caused by a bitmap dependency, then write the inode buffer. 10746 */ 10747 retry: 10748 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 10749 panic("flush_pagedep_deps: lost inode"); 10750 /* 10751 * If the inode still has bitmap dependencies, 10752 * push them to disk. 
10753 */ 10754 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { 10755 bp = inodedep->id_bmsafemap->sm_buf; 10756 bp = getdirtybuf(bp, &lk, MNT_WAIT); 10757 if (bp == NULL) 10758 goto retry; 10759 FREE_LOCK(&lk); 10760 if ((error = bwrite(bp)) != 0) 10761 break; 10762 ACQUIRE_LOCK(&lk); 10763 if (dap != LIST_FIRST(diraddhdp)) 10764 continue; 10765 } 10766 /* 10767 * If the inode is still sitting in a buffer waiting 10768 * to be written or waiting for the link count to be 10769 * adjusted update it here to flush it to disk. 10770 */ 10771 if (dap == LIST_FIRST(diraddhdp)) { 10772 FREE_LOCK(&lk); 10773 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 10774 FFSV_FORCEINSMQ))) 10775 break; 10776 error = ffs_update(vp, MNT_WAIT); 10777 vput(vp); 10778 if (error) 10779 break; 10780 ACQUIRE_LOCK(&lk); 10781 } 10782 /* 10783 * If we have failed to get rid of all the dependencies 10784 * then something is seriously wrong. 10785 */ 10786 if (dap == LIST_FIRST(diraddhdp)) { 10787 inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); 10788 panic("flush_pagedep_deps: failed to flush " 10789 "inodedep %p ino %d dap %p", inodedep, inum, dap); 10790 } 10791 } 10792 if (error) 10793 ACQUIRE_LOCK(&lk); 10794 return (error); 10795 } 10796 10797 /* 10798 * A large burst of file addition or deletion activity can drive the 10799 * memory load excessively high. First attempt to slow things down 10800 * using the techniques below. If that fails, this routine requests 10801 * the offending operations to fall back to running synchronously 10802 * until the memory load returns to a reasonable level. 10803 */ 10804 int 10805 softdep_slowdown(vp) 10806 struct vnode *vp; 10807 { 10808 struct ufsmount *ump; 10809 int jlow; 10810 int max_softdeps_hard; 10811 10812 ACQUIRE_LOCK(&lk); 10813 jlow = 0; 10814 /* 10815 * Check for journal space if needed. 10816 */ 10817 if (DOINGSUJ(vp)) { 10818 ump = VFSTOUFS(vp->v_mount); 10819 if (journal_space(ump, 0) == 0) 10820 jlow = 1; 10821 } 10822 max_softdeps_hard = max_softdeps * 11 / 10; 10823 if (num_dirrem < max_softdeps_hard / 2 && 10824 num_inodedep < max_softdeps_hard && 10825 VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps && 10826 num_freeblkdep < max_softdeps_hard && jlow == 0) { 10827 FREE_LOCK(&lk); 10828 return (0); 10829 } 10830 if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow) 10831 softdep_speedup(); 10832 stat_sync_limit_hit += 1; 10833 FREE_LOCK(&lk); 10834 return (1); 10835 } 10836 10837 /* 10838 * Called by the allocation routines when they are about to fail 10839 * in the hope that we can free up some disk space. 10840 * 10841 * First check to see if the work list has anything on it. If it has, 10842 * clean up entries until we successfully free some space. Because this 10843 * process holds inodes locked, we cannot handle any remove requests 10844 * that might block on a locked inode as that could lead to deadlock. 10845 * If the worklist yields no free space, encourage the syncer daemon 10846 * to help us. In no event will we try for longer than tickdelay seconds. 

/*
 * Called by the allocation routines when they are about to fail
 * in the hope that we can free up some disk space.
 *
 * First check to see if the work list has anything on it. If it has,
 * clean up entries until we successfully free some space. Because this
 * process holds inodes locked, we cannot handle any remove requests
 * that might block on a locked inode as that could lead to deadlock.
 * If the worklist yields no free space, encourage the syncer daemon
 * to help us. In no event will we try for longer than tickdelay seconds.
 */
int
softdep_request_cleanup(fs, vp)
	struct fs *fs;
	struct vnode *vp;
{
	struct ufsmount *ump;
	long starttime;
	ufs2_daddr_t needed;
	int error;

	ump = VTOI(vp)->i_ump;
	mtx_assert(UFS_MTX(ump), MA_OWNED);
	needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
	starttime = time_second + tickdelay;
	/*
	 * If we are being called because of a process doing a
	 * copy-on-write, then it is not safe to update the vnode
	 * as we may recurse into the copy-on-write routine.
	 */
	if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
		UFS_UNLOCK(ump);
		error = ffs_update(vp, 1);
		UFS_LOCK(ump);
		if (error != 0)
			return (0);
	}
	while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
		if (time_second > starttime)
			return (0);
		UFS_UNLOCK(ump);
		ACQUIRE_LOCK(&lk);
		process_removes(vp);
		if (ump->softdep_on_worklist > 0 &&
		    process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
			stat_worklist_push += 1;
			FREE_LOCK(&lk);
			UFS_LOCK(ump);
			continue;
		}
		request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT);
		FREE_LOCK(&lk);
		UFS_LOCK(ump);
	}
	return (1);
}
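
/*
 * Illustrative sketch only (never compiled): the retry shape a block
 * allocator might use around softdep_request_cleanup(), which returns
 * non-zero when it believes space has been, or is about to be, released.
 * The function name and the elided allocation step are assumptions for
 * the example; softdep_request_cleanup() is called and returns with the
 * per-filesystem UFS mutex held, as asserted above.
 */
#if 0
static int
example_alloc_retry(struct fs *fs, struct vnode *vp)
{
	struct ufsmount *ump = VTOI(vp)->i_ump;
	int retried, ok;

	retried = 0;
	UFS_LOCK(ump);
	for (;;) {
		ok = 0;		/* ... attempt the real allocation here ... */
		if (ok || retried || softdep_request_cleanup(fs, vp) == 0)
			break;
		retried = 1;	/* cleanup ran; try the allocation once more */
	}
	UFS_UNLOCK(ump);
	return (ok);
}
#endif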

/*
 * If memory utilization has gotten too high, deliberately slow things
 * down and speed up the I/O processing.
 */
extern struct thread *syncertd;
static int
request_cleanup(mp, resource)
	struct mount *mp;
	int resource;
{
	struct thread *td = curthread;
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * We never hold up the filesystem syncer or buf daemon.
	 */
	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
		return (0);
	ump = VFSTOUFS(mp);
	/*
	 * First check to see if the work list has gotten backlogged.
	 * If it has, co-opt this process to help clean up two entries.
	 * Because this process may hold inodes locked, we cannot
	 * handle any remove requests that might block on a locked
	 * inode as that could lead to deadlock. We set TDP_SOFTDEP
	 * to avoid recursively processing the worklist.
	 */
	if (ump->softdep_on_worklist > max_softdeps / 10) {
		td->td_pflags |= TDP_SOFTDEP;
		process_worklist_item(mp, LK_NOWAIT);
		process_worklist_item(mp, LK_NOWAIT);
		td->td_pflags &= ~TDP_SOFTDEP;
		stat_worklist_push += 2;
		return(1);
	}
	/*
	 * Next, we attempt to speed up the syncer process. If that
	 * is successful, then we allow the process to continue.
	 */
	if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT)
		return(0);
	/*
	 * If we are resource constrained on inode dependencies, try
	 * flushing some dirty inodes. Otherwise, we are constrained
	 * by file deletions, so try accelerating flushes of directories
	 * with removal dependencies. We would like to do the cleanup
	 * here, but we probably hold an inode locked at this point and
	 * that might deadlock against one that we try to clean. So,
	 * the best that we can do is request the syncer daemon to do
	 * the cleanup for us.
	 */
	switch (resource) {

	case FLUSH_INODES:
		stat_ino_limit_push += 1;
		req_clear_inodedeps += 1;
		stat_countp = &stat_ino_limit_hit;
		break;

	case FLUSH_REMOVE:
	case FLUSH_REMOVE_WAIT:
		stat_blk_limit_push += 1;
		req_clear_remove += 1;
		stat_countp = &stat_blk_limit_hit;
		break;

	default:
		panic("request_cleanup: unknown type");
	}
	/*
	 * Hopefully the syncer daemon will catch up and awaken us.
	 * We wait at most tickdelay before proceeding in any case.
	 */
	proc_waiting += 1;
	if (callout_pending(&softdep_callout) == FALSE)
		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
		    pause_timer, 0);

	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
	proc_waiting -= 1;
	return (1);
}

/*
 * Awaken processes pausing in request_cleanup and clear proc_waiting
 * to indicate that there is no longer a timer running.
 */
static void
pause_timer(arg)
	void *arg;
{

	/*
	 * The callout_ API has acquired mtx and will hold it around this
	 * function call.
	 */
	*stat_countp += 1;
	wakeup_one(&proc_waiting);
	if (proc_waiting > 0)
		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
		    pause_timer, 0);
}

/*
 * Flush out a directory with at least one removal dependency in an effort to
 * reduce the number of dirrem, freefile, and freeblks dependency structures.
 */
static void
clear_remove(td)
	struct thread *td;
{
	struct pagedep_hashhead *pagedephd;
	struct pagedep *pagedep;
	static int next = 0;
	struct mount *mp;
	struct vnode *vp;
	struct bufobj *bo;
	int error, cnt;
	ino_t ino;

	mtx_assert(&lk, MA_OWNED);

	for (cnt = 0; cnt < pagedep_hash; cnt++) {
		pagedephd = &pagedep_hashtbl[next++];
		if (next >= pagedep_hash)
			next = 0;
		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
			if (LIST_EMPTY(&pagedep->pd_dirremhd))
				continue;
			mp = pagedep->pd_list.wk_mp;
			ino = pagedep->pd_ino;
			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
				continue;
			FREE_LOCK(&lk);

			/*
			 * Let unmount clear deps
			 */
			error = vfs_busy(mp, MBF_NOWAIT);
			if (error != 0)
				goto finish_write;
			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
			    FFSV_FORCEINSMQ);
			vfs_unbusy(mp);
			if (error != 0) {
				softdep_error("clear_remove: vget", error);
				goto finish_write;
			}
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
				softdep_error("clear_remove: fsync", error);
			bo = &vp->v_bufobj;
			BO_LOCK(bo);
			drain_output(vp);
			BO_UNLOCK(bo);
			vput(vp);
		finish_write:
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
	}
}
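
/*
 * Note (descriptive, not from the original comments): clear_remove()
 * above and clear_inodedeps() below are expected to run from the softdep
 * flushing thread in response to the req_clear_remove and
 * req_clear_inodedeps requests posted by request_cleanup(), rather than
 * from the regular filesystem call paths.
 */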

/*
 * Clear out a block of dirty inodes in an effort to reduce
 * the number of inodedep dependency structures.
 */
static void
clear_inodedeps(td)
	struct thread *td;
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	static int next = 0;
	struct mount *mp;
	struct vnode *vp;
	struct fs *fs;
	int error, cnt;
	ino_t firstino, lastino, ino;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * Pick a random inode dependency to be cleared.
	 * We will then gather up all the inodes in its block
	 * that have dependencies and flush them out.
	 */
	for (cnt = 0; cnt < inodedep_hash; cnt++) {
		inodedephd = &inodedep_hashtbl[next++];
		if (next >= inodedep_hash)
			next = 0;
		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
			break;
	}
	if (inodedep == NULL)
		return;
	fs = inodedep->id_fs;
	mp = inodedep->id_list.wk_mp;
	/*
	 * Find the last inode in the block with dependencies.
	 */
	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
			break;
	/*
	 * Asynchronously push all but the last inode with dependencies.
	 * Synchronously push the last inode with dependencies to ensure
	 * that the inode block gets written to free up the inodedeps.
	 */
	for (ino = firstino; ino <= lastino; ino++) {
		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
			continue;
		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
			continue;
		FREE_LOCK(&lk);
		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
		if (error != 0) {
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
		    FFSV_FORCEINSMQ)) != 0) {
			softdep_error("clear_inodedeps: vget", error);
			vfs_unbusy(mp);
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
		vfs_unbusy(mp);
		if (ino == lastino) {
			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
				softdep_error("clear_inodedeps: fsync1", error);
		} else {
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
				softdep_error("clear_inodedeps: fsync2", error);
			BO_LOCK(&vp->v_bufobj);
			drain_output(vp);
			BO_UNLOCK(&vp->v_bufobj);
		}
		vput(vp);
		vn_finished_write(mp);
		ACQUIRE_LOCK(&lk);
	}
}
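
/*
 * Worked example of the inode-block rounding in clear_inodedeps() above
 * (the numbers are illustrative assumptions): with INOPB(fs) == 64
 * inodes per block, an inodedep for inode 1234 gives
 * firstino = 1234 & ~63 = 1216, and lastino is scanned downward from
 * 1279, so the flush covers exactly the inodes sharing that inode block.
 */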

/*
 * Function to determine if the buffer has outstanding dependencies
 * that will cause a roll-back if the buffer is written. If wantcount
 * is set, return number of dependencies, otherwise just yes or no.
 */
static int
softdep_count_dependencies(bp, wantcount)
	struct buf *bp;
	int wantcount;
{
	struct worklist *wk;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct indirdep *indirdep;
	struct freeblks *freeblks;
	struct allocindir *aip;
	struct pagedep *pagedep;
	struct dirrem *dirrem;
	struct newblk *newblk;
	struct mkdir *mkdir;
	struct diradd *dap;
	int i, retval;

	retval = 0;
	ACQUIRE_LOCK(&lk);
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
		switch (wk->wk_type) {

		case D_INODEDEP:
			inodedep = WK_INODEDEP(wk);
			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
				/* bitmap allocation dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
				/* direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
				/* direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
				/* Add reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);

			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
				/* indirect block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_PAGEDEP:
			pagedep = WK_PAGEDEP(wk);
			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
					/* Journal remove ref dependency. */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			for (i = 0; i < DAHASHSZ; i++) {

				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
					/* directory entry dependency */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			continue;

		case D_BMSAFEMAP:
			bmsafemap = WK_BMSAFEMAP(wk);
			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
				/* Add reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
				/* Allocate block dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEBLKS:
			freeblks = WK_FREEBLKS(wk);
			if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) {
				/* Freeblk journal dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			newblk = WK_NEWBLK(wk);
			if (newblk->nb_jnewblk) {
				/* Journal allocate dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_MKDIR:
			mkdir = WK_MKDIR(wk);
			if (mkdir->md_jaddref) {
				/* Journal reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEWORK:
		case D_FREEDEP:
		case D_JSEGDEP:
		case D_JSEG:
		case D_SBDEP:
			/* never a dependency on these blocks */
			continue;

		default:
			panic("softdep_count_dependencies: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
out:
	FREE_LOCK(&lk);
	return retval;
}

/*
 * Acquire exclusive access to a buffer.
 * Must be called with a locked mtx parameter.
 * Return acquired buffer or NULL on failure.
 */
static struct buf *
getdirtybuf(bp, mtx, waitfor)
	struct buf *bp;
	struct mtx *mtx;
	int waitfor;
{
	int error;

	mtx_assert(mtx, MA_OWNED);
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
		if (waitfor != MNT_WAIT)
			return (NULL);
		error = BUF_LOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
		/*
		 * Even if we successfully acquire bp here, we have dropped
		 * mtx, which may violate our guarantee.
		 */
		if (error == 0)
			BUF_UNLOCK(bp);
		else if (error != ENOLCK)
			panic("getdirtybuf: inconsistent lock: %d", error);
		mtx_lock(mtx);
		return (NULL);
	}
	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
		if (mtx == &lk && waitfor == MNT_WAIT) {
			mtx_unlock(mtx);
			BO_LOCK(bp->b_bufobj);
			BUF_UNLOCK(bp);
			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
				bp->b_vflags |= BV_BKGRDWAIT;
				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
				    PRIBIO | PDROP, "getbuf", 0);
			} else
				BO_UNLOCK(bp->b_bufobj);
			mtx_lock(mtx);
			return (NULL);
		}
		BUF_UNLOCK(bp);
		if (waitfor != MNT_WAIT)
			return (NULL);
		/*
		 * The mtx argument must be bp->b_vp's mutex in
		 * this case.
		 */
#ifdef DEBUG_VFS_LOCKS
		if (bp->b_vp->v_type != VCHR)
			ASSERT_BO_LOCKED(bp->b_bufobj);
#endif
		bp->b_vflags |= BV_BKGRDWAIT;
		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
		return (NULL);
	}
	if ((bp->b_flags & B_DELWRI) == 0) {
		BUF_UNLOCK(bp);
		return (NULL);
	}
	bremfree(bp);
	return (bp);
}
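
/*
 * Illustrative sketch only (never compiled): the typical calling idiom
 * for getdirtybuf() under the softdep lock, mirroring its use in
 * flush_pagedep_deps() above.  The surrounding function name is an
 * assumption for the example.
 */
#if 0
static int
example_write_dep_buf(struct buf *bp)
{

	ACQUIRE_LOCK(&lk);
	bp = getdirtybuf(bp, &lk, MNT_WAIT);
	if (bp == NULL) {
		/* Lost a race; a real caller would re-lookup and retry. */
		FREE_LOCK(&lk);
		return (EAGAIN);
	}
	FREE_LOCK(&lk);
	return (bwrite(bp));
}
#endif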


/*
 * Check if it is safe to suspend the file system now. On entry,
 * the vnode interlock for devvp should be held. Return 0 with
 * the mount interlock held if the file system can be suspended now,
 * otherwise return EAGAIN with the mount interlock held.
 */
int
softdep_check_suspend(struct mount *mp,
		      struct vnode *devvp,
		      int softdep_deps,
		      int softdep_accdeps,
		      int secondary_writes,
		      int secondary_accwrites)
{
	struct bufobj *bo;
	struct ufsmount *ump;
	int error;

	ump = VFSTOUFS(mp);
	bo = &devvp->v_bufobj;
	ASSERT_BO_LOCKED(bo);

	for (;;) {
		if (!TRY_ACQUIRE_LOCK(&lk)) {
			BO_UNLOCK(bo);
			ACQUIRE_LOCK(&lk);
			FREE_LOCK(&lk);
			BO_LOCK(bo);
			continue;
		}
		MNT_ILOCK(mp);
		if (mp->mnt_secondary_writes != 0) {
			FREE_LOCK(&lk);
			BO_UNLOCK(bo);
			msleep(&mp->mnt_secondary_writes,
			       MNT_MTX(mp),
			       (PUSER - 1) | PDROP, "secwr", 0);
			BO_LOCK(bo);
			continue;
		}
		break;
	}

	/*
	 * Reasons for needing more work before suspend:
	 * - Dirty buffers on devvp.
	 * - Softdep activity occurred after start of vnode sync loop
	 * - Secondary writes occurred after start of vnode sync loop
	 */
	error = 0;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    softdep_deps != 0 ||
	    ump->softdep_deps != 0 ||
	    softdep_accdeps != ump->softdep_accdeps ||
	    secondary_writes != 0 ||
	    mp->mnt_secondary_writes != 0 ||
	    secondary_accwrites != mp->mnt_secondary_accwrites)
		error = EAGAIN;
	FREE_LOCK(&lk);
	BO_UNLOCK(bo);
	return (error);
}


/*
 * Get the number of dependency structures for the file system, both
 * the current number and the total number allocated. These will
 * later be used to detect that softdep processing has occurred.
 */
void
softdep_get_depcounts(struct mount *mp,
		      int *softdep_depsp,
		      int *softdep_accdepsp)
{
	struct ufsmount *ump;

	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(&lk);
	*softdep_depsp = ump->softdep_deps;
	*softdep_accdepsp = ump->softdep_accdeps;
	FREE_LOCK(&lk);
}
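
/*
 * Illustrative sketch only (never compiled): how the two routines above
 * are intended to pair up during filesystem suspension.  The dependency
 * and secondary-write counts are sampled before the vnode sync pass and
 * compared afterwards by softdep_check_suspend(); any change means more
 * work is needed.  The helper name and the elided sync pass are
 * assumptions for the example.
 */
#if 0
static int
example_suspend_poll(struct mount *mp, struct vnode *devvp)
{
	int sd_deps, sd_accdeps, sec_wr, sec_accwr, error;

	softdep_get_depcounts(mp, &sd_deps, &sd_accdeps);
	MNT_ILOCK(mp);
	sec_wr = mp->mnt_secondary_writes;
	sec_accwr = mp->mnt_secondary_accwrites;
	MNT_IUNLOCK(mp);

	/* ... flush all vnodes on mp here ... */

	BO_LOCK(&devvp->v_bufobj);
	error = softdep_check_suspend(mp, devvp, sd_deps, sd_accdeps,
	    sec_wr, sec_accwr);
	/* Returns with the mount interlock held; 0 means safe to suspend. */
	MNT_IUNLOCK(mp);
	return (error);
}
#endif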

/*
 * Wait for pending output on a vnode to complete.
 * Must be called with vnode lock and interlock locked.
 *
 * XXX: Should just be a call to bufobj_wwait().
 */
static void
drain_output(vp)
	struct vnode *vp;
{
	struct bufobj *bo;

	bo = &vp->v_bufobj;
	ASSERT_VOP_LOCKED(vp, "drain_output");
	ASSERT_BO_LOCKED(bo);

	while (bo->bo_numoutput) {
		bo->bo_flag |= BO_WWAIT;
		msleep((caddr_t)&bo->bo_numoutput,
		    BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
	}
}

/*
 * Called whenever a buffer that is being invalidated or reallocated
 * contains dependencies. This should only happen if an I/O error has
 * occurred. The routine is called with the buffer locked.
 */
static void
softdep_deallocate_dependencies(bp)
	struct buf *bp;
{

	if ((bp->b_ioflags & BIO_ERROR) == 0)
		panic("softdep_deallocate_dependencies: dangling deps");
	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
	panic("softdep_deallocate_dependencies: unrecovered I/O error");
}

/*
 * Function to handle asynchronous write errors in the filesystem.
 */
static void
softdep_error(func, error)
	char *func;
	int error;
{

	/* XXX should do something better! */
	printf("%s: got error %d while accessing filesystem\n", func, error);
}

#ifdef DDB

static void
inodedep_print(struct inodedep *inodedep, int verbose)
{
	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
	    " saveino %p\n",
	    inodedep, inodedep->id_fs, inodedep->id_state,
	    (intmax_t)inodedep->id_ino,
	    (intmax_t)fsbtodb(inodedep->id_fs,
	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
	    inodedep->id_savedino1);

	if (verbose == 0)
		return;

	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
	    "mkdiradd %p\n",
	    LIST_FIRST(&inodedep->id_pendinghd),
	    LIST_FIRST(&inodedep->id_bufwait),
	    LIST_FIRST(&inodedep->id_inowait),
	    TAILQ_FIRST(&inodedep->id_inoreflst),
	    inodedep->id_mkdiradd);
	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
	    TAILQ_FIRST(&inodedep->id_inoupdt),
	    TAILQ_FIRST(&inodedep->id_newinoupdt),
	    TAILQ_FIRST(&inodedep->id_extupdt),
	    TAILQ_FIRST(&inodedep->id_newextupdt));
}

DB_SHOW_COMMAND(inodedep, db_show_inodedep)
{

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	inodedep_print((struct inodedep*)addr, 1);
}

DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	struct fs *fs;
	int cnt;

	fs = have_addr ? (struct fs *)addr : NULL;
	for (cnt = 0; cnt < inodedep_hash; cnt++) {
		inodedephd = &inodedep_hashtbl[cnt];
		LIST_FOREACH(inodedep, inodedephd, id_hash) {
			if (fs != NULL && fs != inodedep->id_fs)
				continue;
			inodedep_print(inodedep, 0);
		}
	}
}

DB_SHOW_COMMAND(worklist, db_show_worklist)
{
	struct worklist *wk;

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	wk = (struct worklist *)addr;
	printf("worklist: %p type %s state 0x%X\n",
	    wk, TYPENAME(wk->wk_type), wk->wk_state);
}

DB_SHOW_COMMAND(workhead, db_show_workhead)
{
	struct workhead *wkhd;
	struct worklist *wk;
	int i;

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	wkhd = (struct workhead *)addr;
	wk = LIST_FIRST(wkhd);
	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
		db_printf("worklist: %p type %s state 0x%X",
		    wk, TYPENAME(wk->wk_type), wk->wk_state);
	if (i == 100)
		db_printf("workhead overflow");
	printf("\n");
}


DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
{
	struct jaddref *jaddref;
	struct diradd *diradd;
	struct mkdir *mkdir;

	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
		diradd = mkdir->md_diradd;
		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
		    mkdir, mkdir->md_state, diradd, diradd->da_state);
		if ((jaddref = mkdir->md_jaddref) != NULL)
			db_printf(" jaddref %p jaddref state 0x%X",
			    jaddref, jaddref->ja_state);
		db_printf("\n");
	}
}

#endif /* DDB */

#endif /* SOFTUPDATES */