/*-
 * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * The soft updates code is derived from the appendix of a University
 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
 * "Soft Updates: A Solution to the Metadata Update Problem in File
 * Systems", CSE-TR-254-95, August 1995).
 *
 * Further information about soft updates can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ffs.h"
#include "opt_ddb.h"

/*
 * For now we want the safety net that the DEBUG flag provides.
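 * With DEBUG defined, the WORKLIST_INSERT and WORKLIST_REMOVE macros
 * below expand to checking functions that panic on double insertion or
 * removal, and workitem_free() verifies the type of each item it frees.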
49 */ 50 #ifndef DEBUG 51 #define DEBUG 52 #endif 53 54 #include <sys/param.h> 55 #include <sys/kernel.h> 56 #include <sys/systm.h> 57 #include <sys/bio.h> 58 #include <sys/buf.h> 59 #include <sys/kdb.h> 60 #include <sys/kthread.h> 61 #include <sys/lock.h> 62 #include <sys/malloc.h> 63 #include <sys/mount.h> 64 #include <sys/mutex.h> 65 #include <sys/proc.h> 66 #include <sys/stat.h> 67 #include <sys/sysctl.h> 68 #include <sys/syslog.h> 69 #include <sys/vnode.h> 70 #include <sys/conf.h> 71 #include <ufs/ufs/dir.h> 72 #include <ufs/ufs/extattr.h> 73 #include <ufs/ufs/quota.h> 74 #include <ufs/ufs/inode.h> 75 #include <ufs/ufs/ufsmount.h> 76 #include <ufs/ffs/fs.h> 77 #include <ufs/ffs/softdep.h> 78 #include <ufs/ffs/ffs_extern.h> 79 #include <ufs/ufs/ufs_extern.h> 80 81 #include <vm/vm.h> 82 83 #include <ddb/ddb.h> 84 85 #ifndef SOFTUPDATES 86 87 int 88 softdep_flushfiles(oldmnt, flags, td) 89 struct mount *oldmnt; 90 int flags; 91 struct thread *td; 92 { 93 94 panic("softdep_flushfiles called"); 95 } 96 97 int 98 softdep_mount(devvp, mp, fs, cred) 99 struct vnode *devvp; 100 struct mount *mp; 101 struct fs *fs; 102 struct ucred *cred; 103 { 104 105 return (0); 106 } 107 108 void 109 softdep_initialize() 110 { 111 112 return; 113 } 114 115 void 116 softdep_uninitialize() 117 { 118 119 return; 120 } 121 122 void 123 softdep_setup_inomapdep(bp, ip, newinum) 124 struct buf *bp; 125 struct inode *ip; 126 ino_t newinum; 127 { 128 129 panic("softdep_setup_inomapdep called"); 130 } 131 132 void 133 softdep_setup_blkmapdep(bp, mp, newblkno) 134 struct buf *bp; 135 struct mount *mp; 136 ufs2_daddr_t newblkno; 137 { 138 139 panic("softdep_setup_blkmapdep called"); 140 } 141 142 void 143 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 144 struct inode *ip; 145 ufs_lbn_t lbn; 146 ufs2_daddr_t newblkno; 147 ufs2_daddr_t oldblkno; 148 long newsize; 149 long oldsize; 150 struct buf *bp; 151 { 152 153 panic("softdep_setup_allocdirect called"); 154 } 155 156 void 157 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 158 struct inode *ip; 159 ufs_lbn_t lbn; 160 ufs2_daddr_t newblkno; 161 ufs2_daddr_t oldblkno; 162 long newsize; 163 long oldsize; 164 struct buf *bp; 165 { 166 167 panic("softdep_setup_allocext called"); 168 } 169 170 void 171 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 172 struct inode *ip; 173 ufs_lbn_t lbn; 174 struct buf *bp; 175 int ptrno; 176 ufs2_daddr_t newblkno; 177 ufs2_daddr_t oldblkno; 178 struct buf *nbp; 179 { 180 181 panic("softdep_setup_allocindir_page called"); 182 } 183 184 void 185 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 186 struct buf *nbp; 187 struct inode *ip; 188 struct buf *bp; 189 int ptrno; 190 ufs2_daddr_t newblkno; 191 { 192 193 panic("softdep_setup_allocindir_meta called"); 194 } 195 196 void 197 softdep_setup_freeblocks(ip, length, flags) 198 struct inode *ip; 199 off_t length; 200 int flags; 201 { 202 203 panic("softdep_setup_freeblocks called"); 204 } 205 206 void 207 softdep_freefile(pvp, ino, mode) 208 struct vnode *pvp; 209 ino_t ino; 210 int mode; 211 { 212 213 panic("softdep_freefile called"); 214 } 215 216 int 217 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 218 struct buf *bp; 219 struct inode *dp; 220 off_t diroffset; 221 ino_t newinum; 222 struct buf *newdirbp; 223 int isnewblk; 224 { 225 226 panic("softdep_setup_directory_add called"); 227 } 228 229 void 230 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, 
entrysize) 231 struct inode *dp; 232 caddr_t base; 233 caddr_t oldloc; 234 caddr_t newloc; 235 int entrysize; 236 { 237 238 panic("softdep_change_directoryentry_offset called"); 239 } 240 241 void 242 softdep_setup_remove(bp, dp, ip, isrmdir) 243 struct buf *bp; 244 struct inode *dp; 245 struct inode *ip; 246 int isrmdir; 247 { 248 249 panic("softdep_setup_remove called"); 250 } 251 252 void 253 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 254 struct buf *bp; 255 struct inode *dp; 256 struct inode *ip; 257 ino_t newinum; 258 int isrmdir; 259 { 260 261 panic("softdep_setup_directory_change called"); 262 } 263 264 void 265 softdep_change_linkcnt(ip) 266 struct inode *ip; 267 { 268 269 panic("softdep_change_linkcnt called"); 270 } 271 272 void 273 softdep_load_inodeblock(ip) 274 struct inode *ip; 275 { 276 277 panic("softdep_load_inodeblock called"); 278 } 279 280 void 281 softdep_update_inodeblock(ip, bp, waitfor) 282 struct inode *ip; 283 struct buf *bp; 284 int waitfor; 285 { 286 287 panic("softdep_update_inodeblock called"); 288 } 289 290 int 291 softdep_fsync(vp) 292 struct vnode *vp; /* the "in_core" copy of the inode */ 293 { 294 295 return (0); 296 } 297 298 void 299 softdep_fsync_mountdev(vp) 300 struct vnode *vp; 301 { 302 303 return; 304 } 305 306 int 307 softdep_flushworklist(oldmnt, countp, td) 308 struct mount *oldmnt; 309 int *countp; 310 struct thread *td; 311 { 312 313 *countp = 0; 314 return (0); 315 } 316 317 int 318 softdep_sync_metadata(struct vnode *vp) 319 { 320 321 return (0); 322 } 323 324 int 325 softdep_slowdown(vp) 326 struct vnode *vp; 327 { 328 329 panic("softdep_slowdown called"); 330 } 331 332 void 333 softdep_releasefile(ip) 334 struct inode *ip; /* inode with the zero effective link count */ 335 { 336 337 panic("softdep_releasefile called"); 338 } 339 340 int 341 softdep_request_cleanup(fs, vp) 342 struct fs *fs; 343 struct vnode *vp; 344 { 345 346 return (0); 347 } 348 349 int 350 softdep_check_suspend(struct mount *mp, 351 struct vnode *devvp, 352 int softdep_deps, 353 int softdep_accdeps, 354 int secondary_writes, 355 int secondary_accwrites) 356 { 357 struct bufobj *bo; 358 int error; 359 360 (void) softdep_deps, 361 (void) softdep_accdeps; 362 363 bo = &devvp->v_bufobj; 364 ASSERT_BO_LOCKED(bo); 365 366 MNT_ILOCK(mp); 367 while (mp->mnt_secondary_writes != 0) { 368 BO_UNLOCK(bo); 369 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), 370 (PUSER - 1) | PDROP, "secwr", 0); 371 BO_LOCK(bo); 372 MNT_ILOCK(mp); 373 } 374 375 /* 376 * Reasons for needing more work before suspend: 377 * - Dirty buffers on devvp. 378 * - Secondary writes occurred after start of vnode sync loop 379 */ 380 error = 0; 381 if (bo->bo_numoutput > 0 || 382 bo->bo_dirty.bv_cnt > 0 || 383 secondary_writes != 0 || 384 mp->mnt_secondary_writes != 0 || 385 secondary_accwrites != mp->mnt_secondary_accwrites) 386 error = EAGAIN; 387 BO_UNLOCK(bo); 388 return (error); 389 } 390 391 void 392 softdep_get_depcounts(struct mount *mp, 393 int *softdepactivep, 394 int *softdepactiveaccp) 395 { 396 (void) mp; 397 *softdepactivep = 0; 398 *softdepactiveaccp = 0; 399 } 400 401 #else 402 /* 403 * These definitions need to be adapted to the system to which 404 * this file is being ported. 405 */ 406 /* 407 * malloc types defined for the softdep system. 
 */
static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");

#define M_SOFTDEP_FLAGS		(M_WAITOK | M_USE_RESERVE)

#define	D_PAGEDEP	0
#define	D_INODEDEP	1
#define	D_NEWBLK	2
#define	D_BMSAFEMAP	3
#define	D_ALLOCDIRECT	4
#define	D_INDIRDEP	5
#define	D_ALLOCINDIR	6
#define	D_FREEFRAG	7
#define	D_FREEBLKS	8
#define	D_FREEFILE	9
#define	D_DIRADD	10
#define	D_MKDIR		11
#define	D_DIRREM	12
#define	D_NEWDIRBLK	13
#define	D_LAST		D_NEWDIRBLK

/*
 * translate from workitem type to memory type
 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 */
static struct malloc_type *memtype[] = {
	M_PAGEDEP,
	M_INODEDEP,
	M_NEWBLK,
	M_BMSAFEMAP,
	M_ALLOCDIRECT,
	M_INDIRDEP,
	M_ALLOCINDIR,
	M_FREEFRAG,
	M_FREEBLKS,
	M_FREEFILE,
	M_DIRADD,
	M_MKDIR,
	M_DIRREM,
	M_NEWDIRBLK
};

#define DtoM(type) (memtype[type])

/*
 * Names of malloc types.
 */
#define TYPENAME(type)	\
	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
/*
 * End system adaptation definitions.
 */

/*
 * Forward declarations.
 */
struct inodedep_hashhead;
struct newblk_hashhead;
struct pagedep_hashhead;

/*
 * Internal function prototypes.
 */
static void softdep_error(char *, int);
static void drain_output(struct vnode *);
static struct buf *getdirtybuf(struct buf *, struct mtx *, int);
static void clear_remove(struct thread *);
static void clear_inodedeps(struct thread *);
static int flush_pagedep_deps(struct vnode *, struct mount *,
	    struct diraddhd *);
static int flush_inodedep_deps(struct mount *, ino_t);
static int flush_deplist(struct allocdirectlst *, int, int *);
static int handle_written_filepage(struct pagedep *, struct buf *);
static void diradd_inode_written(struct diradd *, struct inodedep *);
static int handle_written_inodeblock(struct inodedep *, struct buf *);
static void handle_allocdirect_partdone(struct allocdirect *);
static void handle_allocindir_partdone(struct allocindir *);
static void initiate_write_filepage(struct pagedep *, struct buf *);
static void handle_written_mkdir(struct mkdir *, int);
static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
static void handle_workitem_freefile(struct freefile *);
static void handle_workitem_remove(struct dirrem *, struct vnode *);
static struct dirrem *newdirrem(struct buf *, struct inode *,
	    struct inode *, int, struct dirrem **);
static void free_diradd(struct diradd *);
static void free_allocindir(struct allocindir *, struct inodedep *);
static void free_newdirblk(struct newdirblk *);
static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
	    ufs2_daddr_t *);
static void deallocate_dependencies(struct buf *, struct inodedep *);
static void free_allocdirect(struct allocdirectlst *,
	    struct allocdirect *, int);
static int check_inode_unwritten(struct inodedep *);
static int free_inodedep(struct inodedep *);
static void handle_workitem_freeblocks(struct freeblks *, int);
static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
static void setup_allocindir_phase2(struct buf *, struct inode *,
	    struct allocindir *);
static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
	    ufs2_daddr_t);
static void handle_workitem_freefrag(struct freefrag *);
static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
static void allocdirect_merge(struct allocdirectlst *,
	    struct allocdirect *, struct allocdirect *);
static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *);
static int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t,
	    struct newblk **);
static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
	    struct inodedep **);
static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
	    struct mount *mp, int, struct pagedep **);
static void pause_timer(void *);
static int request_cleanup(struct mount *, int);
static int process_worklist_item(struct mount *, int);
static void add_to_worklist(struct worklist *);
static void softdep_flush(void);
static int softdep_speedup(void);

/*
 * Exported softdep operations.
 */
static void softdep_disk_io_initiation(struct buf *);
static void softdep_disk_write_complete(struct buf *);
static void softdep_deallocate_dependencies(struct buf *);
static int softdep_count_dependencies(struct buf *bp, int);

static struct mtx lk;
MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);

#define TRY_ACQUIRE_LOCK(lk)		mtx_trylock(lk)
#define ACQUIRE_LOCK(lk)		mtx_lock(lk)
#define FREE_LOCK(lk)			mtx_unlock(lk)

#define	BUF_AREC(bp)	((bp)->b_lock.lock_object.lo_flags |= LO_RECURSABLE)
#define	BUF_NOREC(bp)	((bp)->b_lock.lock_object.lo_flags &= ~LO_RECURSABLE)

/*
 * Worklist queue management.
 * These routines require that the lock be held.
 */
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do {	\
	(item)->wk_state |= ONWORKLIST;		\
	LIST_INSERT_HEAD(head, item, wk_list);	\
} while (0)
#define WORKLIST_REMOVE(item) do {		\
	(item)->wk_state &= ~ONWORKLIST;	\
	LIST_REMOVE(item, wk_list);		\
} while (0)
#else /* DEBUG */
static void worklist_insert(struct workhead *, struct worklist *);
static void worklist_remove(struct worklist *);

#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
#define WORKLIST_REMOVE(item) worklist_remove(item)

static void
worklist_insert(head, item)
	struct workhead *head;
	struct worklist *item;
{

	mtx_assert(&lk, MA_OWNED);
	if (item->wk_state & ONWORKLIST)
		panic("worklist_insert: already on list");
	item->wk_state |= ONWORKLIST;
	LIST_INSERT_HEAD(head, item, wk_list);
}

static void
worklist_remove(item)
	struct worklist *item;
{

	mtx_assert(&lk, MA_OWNED);
	if ((item->wk_state & ONWORKLIST) == 0)
		panic("worklist_remove: not on list");
	item->wk_state &= ~ONWORKLIST;
	LIST_REMOVE(item, wk_list);
}
#endif /* DEBUG */

/*
 * Routines for tracking and managing workitems.
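 *
 * The pattern used throughout this file is roughly (an illustrative
 * sketch only; "foo", M_FOO and D_FOO stand in for a real dependency
 * type and its malloc type):
 *
 *	foo = malloc(sizeof(struct foo), M_FOO, M_SOFTDEP_FLAGS);
 *	workitem_alloc(&foo->f_list, D_FOO, mp);	(without lk held)
 *	...
 *	ACQUIRE_LOCK(&lk);
 *	WORKITEM_FREE(foo, D_FOO);			(with lk held)
 *	FREE_LOCK(&lk);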
 */
static void workitem_free(struct worklist *, int);
static void workitem_alloc(struct worklist *, int, struct mount *);

#define	WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))

static void
workitem_free(item, type)
	struct worklist *item;
	int type;
{
	struct ufsmount *ump;
	mtx_assert(&lk, MA_OWNED);

#ifdef DEBUG
	if (item->wk_state & ONWORKLIST)
		panic("workitem_free: still on list");
	if (item->wk_type != type)
		panic("workitem_free: type mismatch");
#endif
	ump = VFSTOUFS(item->wk_mp);
	if (--ump->softdep_deps == 0 && ump->softdep_req)
		wakeup(&ump->softdep_deps);
	free(item, DtoM(type));
}

static void
workitem_alloc(item, type, mp)
	struct worklist *item;
	int type;
	struct mount *mp;
{
	item->wk_type = type;
	item->wk_mp = mp;
	item->wk_state = 0;
	ACQUIRE_LOCK(&lk);
	VFSTOUFS(mp)->softdep_deps++;
	VFSTOUFS(mp)->softdep_accdeps++;
	FREE_LOCK(&lk);
}

/*
 * Workitem queue management
 */
static int max_softdeps;	/* maximum number of structs before slowdown */
static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
static int tickdelay = 2;	/* number of ticks to pause during slowdown */
static int proc_waiting;	/* tracks whether we have a timeout posted */
static int *stat_countp;	/* statistic to count in proc_waiting timeout */
static struct callout softdep_callout;
static int req_pending;
static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
#define FLUSH_INODES		1
static int req_clear_remove;	/* syncer process flush some freeblks */
#define FLUSH_REMOVE		2
#define FLUSH_REMOVE_WAIT	3
static long num_freeblkdep;	/* number of freeblks workitems allocated */

/*
 * runtime statistics
 */
static int stat_worklist_push;	/* number of worklist cleanups */
static int stat_blk_limit_push;	/* number of times block limit neared */
static int stat_ino_limit_push;	/* number of times inode limit neared */
static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */

SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
&stat_indir_blk_ptrs, 0, ""); 692 SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, ""); 693 SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, ""); 694 SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, ""); 695 /* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */ 696 697 SYSCTL_DECL(_vfs_ffs); 698 699 static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */ 700 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, 701 &compute_summary_at_mount, 0, "Recompute summary at mount"); 702 703 static struct proc *softdepproc; 704 static struct kproc_desc softdep_kp = { 705 "softdepflush", 706 softdep_flush, 707 &softdepproc 708 }; 709 SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, 710 &softdep_kp); 711 712 static void 713 softdep_flush(void) 714 { 715 struct mount *nmp; 716 struct mount *mp; 717 struct ufsmount *ump; 718 struct thread *td; 719 int remaining; 720 int vfslocked; 721 722 td = curthread; 723 td->td_pflags |= TDP_NORUNNINGBUF; 724 725 for (;;) { 726 kproc_suspend_check(softdepproc); 727 vfslocked = VFS_LOCK_GIANT((struct mount *)NULL); 728 ACQUIRE_LOCK(&lk); 729 /* 730 * If requested, try removing inode or removal dependencies. 731 */ 732 if (req_clear_inodedeps) { 733 clear_inodedeps(td); 734 req_clear_inodedeps -= 1; 735 wakeup_one(&proc_waiting); 736 } 737 if (req_clear_remove) { 738 clear_remove(td); 739 req_clear_remove -= 1; 740 wakeup_one(&proc_waiting); 741 } 742 FREE_LOCK(&lk); 743 VFS_UNLOCK_GIANT(vfslocked); 744 remaining = 0; 745 mtx_lock(&mountlist_mtx); 746 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 747 nmp = TAILQ_NEXT(mp, mnt_list); 748 if ((mp->mnt_flag & MNT_SOFTDEP) == 0) 749 continue; 750 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 751 continue; 752 vfslocked = VFS_LOCK_GIANT(mp); 753 softdep_process_worklist(mp, 0); 754 ump = VFSTOUFS(mp); 755 remaining += ump->softdep_on_worklist - 756 ump->softdep_on_worklist_inprogress; 757 VFS_UNLOCK_GIANT(vfslocked); 758 mtx_lock(&mountlist_mtx); 759 nmp = TAILQ_NEXT(mp, mnt_list); 760 vfs_unbusy(mp); 761 } 762 mtx_unlock(&mountlist_mtx); 763 if (remaining) 764 continue; 765 ACQUIRE_LOCK(&lk); 766 if (!req_pending) 767 msleep(&req_pending, &lk, PVM, "sdflush", hz); 768 req_pending = 0; 769 FREE_LOCK(&lk); 770 } 771 } 772 773 static int 774 softdep_speedup(void) 775 { 776 777 mtx_assert(&lk, MA_OWNED); 778 if (req_pending == 0) { 779 req_pending = 1; 780 wakeup(&req_pending); 781 } 782 783 return speedup_syncer(); 784 } 785 786 /* 787 * Add an item to the end of the work queue. 788 * This routine requires that the lock be held. 789 * This is the only routine that adds items to the list. 790 * The following routine is the only one that removes items 791 * and does so in order from first to last. 
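 * Appending at ump->softdep_worklist_tail keeps the insert O(1) while
 * preserving FIFO order; softdep_process_worklist() below depends on
 * that ordering so that, for example, a file's blocks are freed before
 * its inode is.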
792 */ 793 static void 794 add_to_worklist(wk) 795 struct worklist *wk; 796 { 797 struct ufsmount *ump; 798 799 mtx_assert(&lk, MA_OWNED); 800 ump = VFSTOUFS(wk->wk_mp); 801 if (wk->wk_state & ONWORKLIST) 802 panic("add_to_worklist: already on list"); 803 wk->wk_state |= ONWORKLIST; 804 if (LIST_EMPTY(&ump->softdep_workitem_pending)) 805 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); 806 else 807 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); 808 ump->softdep_worklist_tail = wk; 809 ump->softdep_on_worklist += 1; 810 } 811 812 /* 813 * Process that runs once per second to handle items in the background queue. 814 * 815 * Note that we ensure that everything is done in the order in which they 816 * appear in the queue. The code below depends on this property to ensure 817 * that blocks of a file are freed before the inode itself is freed. This 818 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated 819 * until all the old ones have been purged from the dependency lists. 820 */ 821 int 822 softdep_process_worklist(mp, full) 823 struct mount *mp; 824 int full; 825 { 826 struct thread *td = curthread; 827 int cnt, matchcnt, loopcount; 828 struct ufsmount *ump; 829 long starttime; 830 831 KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); 832 /* 833 * Record the process identifier of our caller so that we can give 834 * this process preferential treatment in request_cleanup below. 835 */ 836 matchcnt = 0; 837 ump = VFSTOUFS(mp); 838 ACQUIRE_LOCK(&lk); 839 loopcount = 1; 840 starttime = time_second; 841 while (ump->softdep_on_worklist > 0) { 842 if ((cnt = process_worklist_item(mp, 0)) == -1) 843 break; 844 else 845 matchcnt += cnt; 846 /* 847 * If requested, try removing inode or removal dependencies. 848 */ 849 if (req_clear_inodedeps) { 850 clear_inodedeps(td); 851 req_clear_inodedeps -= 1; 852 wakeup_one(&proc_waiting); 853 } 854 if (req_clear_remove) { 855 clear_remove(td); 856 req_clear_remove -= 1; 857 wakeup_one(&proc_waiting); 858 } 859 /* 860 * We do not generally want to stop for buffer space, but if 861 * we are really being a buffer hog, we will stop and wait. 862 */ 863 if (loopcount++ % 128 == 0) { 864 FREE_LOCK(&lk); 865 uio_yield(); 866 bwillwrite(); 867 ACQUIRE_LOCK(&lk); 868 } 869 /* 870 * Never allow processing to run for more than one 871 * second. Otherwise the other mountpoints may get 872 * excessively backlogged. 873 */ 874 if (!full && starttime != time_second) { 875 matchcnt = -1; 876 break; 877 } 878 } 879 FREE_LOCK(&lk); 880 return (matchcnt); 881 } 882 883 /* 884 * Process one item on the worklist. 885 */ 886 static int 887 process_worklist_item(mp, flags) 888 struct mount *mp; 889 int flags; 890 { 891 struct worklist *wk, *wkend; 892 struct ufsmount *ump; 893 struct vnode *vp; 894 int matchcnt = 0; 895 896 mtx_assert(&lk, MA_OWNED); 897 KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); 898 /* 899 * If we are being called because of a process doing a 900 * copy-on-write, then it is not safe to write as we may 901 * recurse into the copy-on-write routine. 902 */ 903 if (curthread->td_pflags & TDP_COWINPROGRESS) 904 return (-1); 905 /* 906 * Normally we just process each item on the worklist in order. 907 * However, if we are in a situation where we cannot lock any 908 * inodes, we have to skip over any dirrem requests whose 909 * vnodes are resident and locked. 
910 */ 911 ump = VFSTOUFS(mp); 912 vp = NULL; 913 LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) { 914 if (wk->wk_state & INPROGRESS) 915 continue; 916 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM) 917 break; 918 wk->wk_state |= INPROGRESS; 919 ump->softdep_on_worklist_inprogress++; 920 FREE_LOCK(&lk); 921 ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum, 922 LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ); 923 ACQUIRE_LOCK(&lk); 924 wk->wk_state &= ~INPROGRESS; 925 ump->softdep_on_worklist_inprogress--; 926 if (vp != NULL) 927 break; 928 } 929 if (wk == 0) 930 return (-1); 931 /* 932 * Remove the item to be processed. If we are removing the last 933 * item on the list, we need to recalculate the tail pointer. 934 * As this happens rarely and usually when the list is short, 935 * we just run down the list to find it rather than tracking it 936 * in the above loop. 937 */ 938 WORKLIST_REMOVE(wk); 939 if (wk == ump->softdep_worklist_tail) { 940 LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list) 941 if (LIST_NEXT(wkend, wk_list) == NULL) 942 break; 943 ump->softdep_worklist_tail = wkend; 944 } 945 ump->softdep_on_worklist -= 1; 946 FREE_LOCK(&lk); 947 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) 948 panic("process_worklist_item: suspended filesystem"); 949 matchcnt++; 950 switch (wk->wk_type) { 951 952 case D_DIRREM: 953 /* removal of a directory entry */ 954 handle_workitem_remove(WK_DIRREM(wk), vp); 955 break; 956 957 case D_FREEBLKS: 958 /* releasing blocks and/or fragments from a file */ 959 handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT); 960 break; 961 962 case D_FREEFRAG: 963 /* releasing a fragment when replaced as a file grows */ 964 handle_workitem_freefrag(WK_FREEFRAG(wk)); 965 break; 966 967 case D_FREEFILE: 968 /* releasing an inode when its link count drops to 0 */ 969 handle_workitem_freefile(WK_FREEFILE(wk)); 970 break; 971 972 default: 973 panic("%s_process_worklist: Unknown type %s", 974 "softdep", TYPENAME(wk->wk_type)); 975 /* NOTREACHED */ 976 } 977 vn_finished_secondary_write(mp); 978 ACQUIRE_LOCK(&lk); 979 return (matchcnt); 980 } 981 982 /* 983 * Move dependencies from one buffer to another. 984 */ 985 void 986 softdep_move_dependencies(oldbp, newbp) 987 struct buf *oldbp; 988 struct buf *newbp; 989 { 990 struct worklist *wk, *wktail; 991 992 if (!LIST_EMPTY(&newbp->b_dep)) 993 panic("softdep_move_dependencies: need merge code"); 994 wktail = 0; 995 ACQUIRE_LOCK(&lk); 996 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { 997 LIST_REMOVE(wk, wk_list); 998 if (wktail == 0) 999 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); 1000 else 1001 LIST_INSERT_AFTER(wktail, wk, wk_list); 1002 wktail = wk; 1003 } 1004 FREE_LOCK(&lk); 1005 } 1006 1007 /* 1008 * Purge the work list of all items associated with a particular mount point. 1009 */ 1010 int 1011 softdep_flushworklist(oldmnt, countp, td) 1012 struct mount *oldmnt; 1013 int *countp; 1014 struct thread *td; 1015 { 1016 struct vnode *devvp; 1017 int count, error = 0; 1018 struct ufsmount *ump; 1019 1020 /* 1021 * Alternately flush the block device associated with the mount 1022 * point and process any dependencies that the flushing 1023 * creates. We continue until no more worklist dependencies 1024 * are found. 
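	 * Each pass can generate more work (processing one dependency
	 * often allows others to be queued), so we loop until
	 * softdep_process_worklist() reports that it found nothing to do.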
1025 */ 1026 *countp = 0; 1027 ump = VFSTOUFS(oldmnt); 1028 devvp = ump->um_devvp; 1029 while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { 1030 *countp += count; 1031 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1032 error = VOP_FSYNC(devvp, MNT_WAIT, td); 1033 VOP_UNLOCK(devvp, 0); 1034 if (error) 1035 break; 1036 } 1037 return (error); 1038 } 1039 1040 int 1041 softdep_waitidle(struct mount *mp) 1042 { 1043 struct ufsmount *ump; 1044 int error; 1045 int i; 1046 1047 ump = VFSTOUFS(mp); 1048 ACQUIRE_LOCK(&lk); 1049 for (i = 0; i < 10 && ump->softdep_deps; i++) { 1050 ump->softdep_req = 1; 1051 if (ump->softdep_on_worklist) 1052 panic("softdep_waitidle: work added after flush."); 1053 msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1); 1054 } 1055 ump->softdep_req = 0; 1056 FREE_LOCK(&lk); 1057 error = 0; 1058 if (i == 10) { 1059 error = EBUSY; 1060 printf("softdep_waitidle: Failed to flush worklist for %p\n", 1061 mp); 1062 } 1063 1064 return (error); 1065 } 1066 1067 /* 1068 * Flush all vnodes and worklist items associated with a specified mount point. 1069 */ 1070 int 1071 softdep_flushfiles(oldmnt, flags, td) 1072 struct mount *oldmnt; 1073 int flags; 1074 struct thread *td; 1075 { 1076 int error, depcount, loopcnt, retry_flush_count, retry; 1077 1078 loopcnt = 10; 1079 retry_flush_count = 3; 1080 retry_flush: 1081 error = 0; 1082 1083 /* 1084 * Alternately flush the vnodes associated with the mount 1085 * point and process any dependencies that the flushing 1086 * creates. In theory, this loop can happen at most twice, 1087 * but we give it a few extra just to be sure. 1088 */ 1089 for (; loopcnt > 0; loopcnt--) { 1090 /* 1091 * Do another flush in case any vnodes were brought in 1092 * as part of the cleanup operations. 1093 */ 1094 if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0) 1095 break; 1096 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 || 1097 depcount == 0) 1098 break; 1099 } 1100 /* 1101 * If we are unmounting then it is an error to fail. If we 1102 * are simply trying to downgrade to read-only, then filesystem 1103 * activity can keep us busy forever, so we just fail with EBUSY. 1104 */ 1105 if (loopcnt == 0) { 1106 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) 1107 panic("softdep_flushfiles: looping"); 1108 error = EBUSY; 1109 } 1110 if (!error) 1111 error = softdep_waitidle(oldmnt); 1112 if (!error) { 1113 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) { 1114 retry = 0; 1115 MNT_ILOCK(oldmnt); 1116 KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0, 1117 ("softdep_flushfiles: !MNTK_NOINSMNTQ")); 1118 if (oldmnt->mnt_nvnodelistsize > 0) { 1119 if (--retry_flush_count > 0) { 1120 retry = 1; 1121 loopcnt = 3; 1122 } else 1123 error = EBUSY; 1124 } 1125 MNT_IUNLOCK(oldmnt); 1126 if (retry) 1127 goto retry_flush; 1128 } 1129 } 1130 return (error); 1131 } 1132 1133 /* 1134 * Structure hashing. 1135 * 1136 * There are three types of structures that can be looked up: 1137 * 1) pagedep structures identified by mount point, inode number, 1138 * and logical block. 1139 * 2) inodedep structures identified by mount point and inode number. 1140 * 3) newblk structures identified by mount point and 1141 * physical block number. 1142 * 1143 * The "pagedep" and "inodedep" dependency structures are hashed 1144 * separately from the file blocks and inodes to which they correspond. 1145 * This separation helps when the in-memory copy of an inode or 1146 * file block must be replaced. 
It also obviates the need to access 1147 * an inode or file page when simply updating (or de-allocating) 1148 * dependency structures. Lookup of newblk structures is needed to 1149 * find newly allocated blocks when trying to associate them with 1150 * their allocdirect or allocindir structure. 1151 * 1152 * The lookup routines optionally create and hash a new instance when 1153 * an existing entry is not found. 1154 */ 1155 #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ 1156 #define NODELAY 0x0002 /* cannot do background work */ 1157 1158 /* 1159 * Structures and routines associated with pagedep caching. 1160 */ 1161 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; 1162 u_long pagedep_hash; /* size of hash table - 1 */ 1163 #define PAGEDEP_HASH(mp, inum, lbn) \ 1164 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \ 1165 pagedep_hash]) 1166 1167 static int 1168 pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp) 1169 struct pagedep_hashhead *pagedephd; 1170 ino_t ino; 1171 ufs_lbn_t lbn; 1172 struct mount *mp; 1173 int flags; 1174 struct pagedep **pagedeppp; 1175 { 1176 struct pagedep *pagedep; 1177 1178 LIST_FOREACH(pagedep, pagedephd, pd_hash) 1179 if (ino == pagedep->pd_ino && 1180 lbn == pagedep->pd_lbn && 1181 mp == pagedep->pd_list.wk_mp) 1182 break; 1183 if (pagedep) { 1184 *pagedeppp = pagedep; 1185 if ((flags & DEPALLOC) != 0 && 1186 (pagedep->pd_state & ONWORKLIST) == 0) 1187 return (0); 1188 return (1); 1189 } 1190 *pagedeppp = NULL; 1191 return (0); 1192 } 1193 /* 1194 * Look up a pagedep. Return 1 if found, 0 if not found or found 1195 * when asked to allocate but not associated with any buffer. 1196 * If not found, allocate if DEPALLOC flag is passed. 1197 * Found or allocated entry is returned in pagedeppp. 1198 * This routine must be called with splbio interrupts blocked. 1199 */ 1200 static int 1201 pagedep_lookup(ip, lbn, flags, pagedeppp) 1202 struct inode *ip; 1203 ufs_lbn_t lbn; 1204 int flags; 1205 struct pagedep **pagedeppp; 1206 { 1207 struct pagedep *pagedep; 1208 struct pagedep_hashhead *pagedephd; 1209 struct mount *mp; 1210 int ret; 1211 int i; 1212 1213 mtx_assert(&lk, MA_OWNED); 1214 mp = ITOV(ip)->v_mount; 1215 pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); 1216 1217 ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp); 1218 if (*pagedeppp || (flags & DEPALLOC) == 0) 1219 return (ret); 1220 FREE_LOCK(&lk); 1221 pagedep = malloc(sizeof(struct pagedep), 1222 M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); 1223 workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); 1224 ACQUIRE_LOCK(&lk); 1225 ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp); 1226 if (*pagedeppp) { 1227 WORKITEM_FREE(pagedep, D_PAGEDEP); 1228 return (ret); 1229 } 1230 pagedep->pd_ino = ip->i_number; 1231 pagedep->pd_lbn = lbn; 1232 LIST_INIT(&pagedep->pd_dirremhd); 1233 LIST_INIT(&pagedep->pd_pendinghd); 1234 for (i = 0; i < DAHASHSZ; i++) 1235 LIST_INIT(&pagedep->pd_diraddhd[i]); 1236 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); 1237 *pagedeppp = pagedep; 1238 return (0); 1239 } 1240 1241 /* 1242 * Structures and routines associated with inodedep caching. 
1243 */ 1244 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; 1245 static u_long inodedep_hash; /* size of hash table - 1 */ 1246 static long num_inodedep; /* number of inodedep allocated */ 1247 #define INODEDEP_HASH(fs, inum) \ 1248 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) 1249 1250 static int 1251 inodedep_find(inodedephd, fs, inum, inodedeppp) 1252 struct inodedep_hashhead *inodedephd; 1253 struct fs *fs; 1254 ino_t inum; 1255 struct inodedep **inodedeppp; 1256 { 1257 struct inodedep *inodedep; 1258 1259 LIST_FOREACH(inodedep, inodedephd, id_hash) 1260 if (inum == inodedep->id_ino && fs == inodedep->id_fs) 1261 break; 1262 if (inodedep) { 1263 *inodedeppp = inodedep; 1264 return (1); 1265 } 1266 *inodedeppp = NULL; 1267 1268 return (0); 1269 } 1270 /* 1271 * Look up an inodedep. Return 1 if found, 0 if not found. 1272 * If not found, allocate if DEPALLOC flag is passed. 1273 * Found or allocated entry is returned in inodedeppp. 1274 * This routine must be called with splbio interrupts blocked. 1275 */ 1276 static int 1277 inodedep_lookup(mp, inum, flags, inodedeppp) 1278 struct mount *mp; 1279 ino_t inum; 1280 int flags; 1281 struct inodedep **inodedeppp; 1282 { 1283 struct inodedep *inodedep; 1284 struct inodedep_hashhead *inodedephd; 1285 struct fs *fs; 1286 1287 mtx_assert(&lk, MA_OWNED); 1288 fs = VFSTOUFS(mp)->um_fs; 1289 inodedephd = INODEDEP_HASH(fs, inum); 1290 1291 if (inodedep_find(inodedephd, fs, inum, inodedeppp)) 1292 return (1); 1293 if ((flags & DEPALLOC) == 0) 1294 return (0); 1295 /* 1296 * If we are over our limit, try to improve the situation. 1297 */ 1298 if (num_inodedep > max_softdeps && (flags & NODELAY) == 0) 1299 request_cleanup(mp, FLUSH_INODES); 1300 FREE_LOCK(&lk); 1301 inodedep = malloc(sizeof(struct inodedep), 1302 M_INODEDEP, M_SOFTDEP_FLAGS); 1303 workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); 1304 ACQUIRE_LOCK(&lk); 1305 if (inodedep_find(inodedephd, fs, inum, inodedeppp)) { 1306 WORKITEM_FREE(inodedep, D_INODEDEP); 1307 return (1); 1308 } 1309 num_inodedep += 1; 1310 inodedep->id_fs = fs; 1311 inodedep->id_ino = inum; 1312 inodedep->id_state = ALLCOMPLETE; 1313 inodedep->id_nlinkdelta = 0; 1314 inodedep->id_savedino1 = NULL; 1315 inodedep->id_savedsize = -1; 1316 inodedep->id_savedextsize = -1; 1317 inodedep->id_buf = NULL; 1318 LIST_INIT(&inodedep->id_pendinghd); 1319 LIST_INIT(&inodedep->id_inowait); 1320 LIST_INIT(&inodedep->id_bufwait); 1321 TAILQ_INIT(&inodedep->id_inoupdt); 1322 TAILQ_INIT(&inodedep->id_newinoupdt); 1323 TAILQ_INIT(&inodedep->id_extupdt); 1324 TAILQ_INIT(&inodedep->id_newextupdt); 1325 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); 1326 *inodedeppp = inodedep; 1327 return (0); 1328 } 1329 1330 /* 1331 * Structures and routines associated with newblk caching. 1332 */ 1333 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; 1334 u_long newblk_hash; /* size of hash table - 1 */ 1335 #define NEWBLK_HASH(fs, inum) \ 1336 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) 1337 1338 static int 1339 newblk_find(newblkhd, fs, newblkno, newblkpp) 1340 struct newblk_hashhead *newblkhd; 1341 struct fs *fs; 1342 ufs2_daddr_t newblkno; 1343 struct newblk **newblkpp; 1344 { 1345 struct newblk *newblk; 1346 1347 LIST_FOREACH(newblk, newblkhd, nb_hash) 1348 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs) 1349 break; 1350 if (newblk) { 1351 *newblkpp = newblk; 1352 return (1); 1353 } 1354 *newblkpp = NULL; 1355 return (0); 1356 } 1357 1358 /* 1359 * Look up a newblk. 
Return 1 if found, 0 if not found. 1360 * If not found, allocate if DEPALLOC flag is passed. 1361 * Found or allocated entry is returned in newblkpp. 1362 */ 1363 static int 1364 newblk_lookup(fs, newblkno, flags, newblkpp) 1365 struct fs *fs; 1366 ufs2_daddr_t newblkno; 1367 int flags; 1368 struct newblk **newblkpp; 1369 { 1370 struct newblk *newblk; 1371 struct newblk_hashhead *newblkhd; 1372 1373 newblkhd = NEWBLK_HASH(fs, newblkno); 1374 if (newblk_find(newblkhd, fs, newblkno, newblkpp)) 1375 return (1); 1376 if ((flags & DEPALLOC) == 0) 1377 return (0); 1378 FREE_LOCK(&lk); 1379 newblk = malloc(sizeof(struct newblk), 1380 M_NEWBLK, M_SOFTDEP_FLAGS); 1381 ACQUIRE_LOCK(&lk); 1382 if (newblk_find(newblkhd, fs, newblkno, newblkpp)) { 1383 free(newblk, M_NEWBLK); 1384 return (1); 1385 } 1386 newblk->nb_state = 0; 1387 newblk->nb_fs = fs; 1388 newblk->nb_newblkno = newblkno; 1389 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); 1390 *newblkpp = newblk; 1391 return (0); 1392 } 1393 1394 /* 1395 * Executed during filesystem system initialization before 1396 * mounting any filesystems. 1397 */ 1398 void 1399 softdep_initialize() 1400 { 1401 1402 LIST_INIT(&mkdirlisthd); 1403 max_softdeps = desiredvnodes * 4; 1404 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, 1405 &pagedep_hash); 1406 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); 1407 newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); 1408 1409 /* initialise bioops hack */ 1410 bioops.io_start = softdep_disk_io_initiation; 1411 bioops.io_complete = softdep_disk_write_complete; 1412 bioops.io_deallocate = softdep_deallocate_dependencies; 1413 bioops.io_countdeps = softdep_count_dependencies; 1414 1415 /* Initialize the callout with an mtx. */ 1416 callout_init_mtx(&softdep_callout, &lk, 0); 1417 } 1418 1419 /* 1420 * Executed after all filesystems have been unmounted during 1421 * filesystem module unload. 1422 */ 1423 void 1424 softdep_uninitialize() 1425 { 1426 1427 callout_drain(&softdep_callout); 1428 hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); 1429 hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); 1430 hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); 1431 } 1432 1433 /* 1434 * Called at mount time to notify the dependency code that a 1435 * filesystem wishes to use it. 1436 */ 1437 int 1438 softdep_mount(devvp, mp, fs, cred) 1439 struct vnode *devvp; 1440 struct mount *mp; 1441 struct fs *fs; 1442 struct ucred *cred; 1443 { 1444 struct csum_total cstotal; 1445 struct ufsmount *ump; 1446 struct cg *cgp; 1447 struct buf *bp; 1448 int error, cyl; 1449 1450 MNT_ILOCK(mp); 1451 mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; 1452 if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { 1453 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 1454 MNTK_SOFTDEP; 1455 mp->mnt_noasync++; 1456 } 1457 MNT_IUNLOCK(mp); 1458 ump = VFSTOUFS(mp); 1459 LIST_INIT(&ump->softdep_workitem_pending); 1460 ump->softdep_worklist_tail = NULL; 1461 ump->softdep_on_worklist = 0; 1462 ump->softdep_deps = 0; 1463 /* 1464 * When doing soft updates, the counters in the 1465 * superblock may have gotten out of sync. Recomputation 1466 * can take a long time and can be deferred for background 1467 * fsck. However, the old behavior of scanning the cylinder 1468 * groups and recalculating them at mount time is available 1469 * by setting vfs.ffs.compute_summary_at_mount to one. 
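	 * For example, "sysctl vfs.ffs.compute_summary_at_mount=1" (the
	 * knob is declared earlier in this file) enables the scan on
	 * subsequent mounts.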
	 */
	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
		return (0);
	bzero(&cstotal, sizeof cstotal);
	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
		    fs->fs_cgsize, cred, &bp)) != 0) {
			brelse(bp);
			return (error);
		}
		cgp = (struct cg *)bp->b_data;
		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
		fs->fs_cs(fs, cyl) = cgp->cg_cs;
		brelse(bp);
	}
#ifdef DEBUG
	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
#endif
	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
	return (0);
}

/*
 * Protecting the freemaps (or bitmaps).
 *
 * To eliminate the need to execute fsck before mounting a filesystem
 * after a power failure, one must (conservatively) guarantee that the
 * on-disk copy of the bitmaps never indicate that a live inode or block is
 * free. So, when a block or inode is allocated, the bitmap should be
 * updated (on disk) before any new pointers. When a block or inode is
 * freed, the bitmap should not be updated until all pointers have been
 * reset. The latter dependency is handled by the delayed de-allocation
 * approach described below for block and inode de-allocation. The former
 * dependency is handled by calling the following procedure when a block or
 * inode is allocated. When an inode is allocated an "inodedep" is created
 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 * Each "inodedep" is also inserted into the hash indexing structure so
 * that any additional link additions can be made dependent on the inode
 * allocation.
 *
 * The ufs filesystem maintains a number of free block counts (e.g., per
 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 * in addition to the bitmaps. These counts are used to improve efficiency
 * during allocation and therefore must be consistent with the bitmaps.
 * There is no convenient way to guarantee post-crash consistency of these
 * counts with simple update ordering, for two main reasons: (1) The counts
 * and bitmaps for a single cylinder group block are not in the same disk
 * sector. If a disk write is interrupted (e.g., by power failure), one may
 * be written and the other not. (2) Some of the counts are located in the
 * superblock rather than the cylinder group block. So, we focus our soft
 * updates implementation on protecting the bitmaps. When mounting a
 * filesystem, we recompute the auxiliary counts from the bitmaps.
 */

/*
 * Called just after updating the cylinder group block to allocate an inode.
 */
void
softdep_setup_inomapdep(bp, ip, newinum)
	struct buf *bp;		/* buffer for cylgroup block with inode map */
	struct inode *ip;	/* inode related to allocation */
	ino_t newinum;		/* new inode number being allocated */
{
	struct inodedep *inodedep;
	struct bmsafemap *bmsafemap;

	/*
	 * Create a dependency for the newly allocated inode.
	 * Panic if it already exists as something is seriously wrong.
	 * Otherwise add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	ACQUIRE_LOCK(&lk);
	if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
	    &inodedep)))
		panic("softdep_setup_inomapdep: dependency for new inode "
		    "already exists");
	inodedep->id_buf = bp;
	inodedep->id_state &= ~DEPCOMPLETE;
	bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
	FREE_LOCK(&lk);
}

/*
 * Called just after updating the cylinder group block to
 * allocate block or fragment.
 */
void
softdep_setup_blkmapdep(bp, mp, newblkno)
	struct buf *bp;		/* buffer for cylgroup block with block map */
	struct mount *mp;	/* filesystem doing allocation */
	ufs2_daddr_t newblkno;	/* number of newly allocated block */
{
	struct newblk *newblk;
	struct bmsafemap *bmsafemap;
	struct fs *fs;

	fs = VFSTOUFS(mp)->um_fs;
	/*
	 * Create a dependency for the newly allocated block.
	 * Add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	ACQUIRE_LOCK(&lk);
	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
		panic("softdep_setup_blkmapdep: found block");
	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	FREE_LOCK(&lk);
}

/*
 * Find the bmsafemap associated with a cylinder group buffer.
 * If none exists, create one. The buffer must be locked when
 * this routine is called and this routine must be called with
 * splbio interrupts blocked.
 */
static struct bmsafemap *
bmsafemap_lookup(mp, bp)
	struct mount *mp;
	struct buf *bp;
{
	struct bmsafemap *bmsafemap;
	struct worklist *wk;

	mtx_assert(&lk, MA_OWNED);
	LIST_FOREACH(wk, &bp->b_dep, wk_list)
		if (wk->wk_type == D_BMSAFEMAP)
			return (WK_BMSAFEMAP(wk));
	FREE_LOCK(&lk);
	bmsafemap = malloc(sizeof(struct bmsafemap),
	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
	bmsafemap->sm_buf = bp;
	LIST_INIT(&bmsafemap->sm_allocdirecthd);
	LIST_INIT(&bmsafemap->sm_allocindirhd);
	LIST_INIT(&bmsafemap->sm_inodedephd);
	LIST_INIT(&bmsafemap->sm_newblkhd);
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
	return (bmsafemap);
}

/*
 * Direct block allocation dependencies.
 *
 * When a new block is allocated, the corresponding disk locations must be
 * initialized (with zeros or new data) before the on-disk inode points to
 * them. Also, the freemap from which the block was allocated must be
 * updated (on disk) before the inode's pointer. These two dependencies are
 * independent of each other and are needed for all file blocks and indirect
 * blocks that are pointed to directly by the inode. Just before the
 * "in-core" version of the inode is updated with a newly allocated block
 * number, a procedure (below) is called to setup allocation dependency
 * structures. These structures are removed when the corresponding
 * dependencies are satisfied or when the block allocation becomes obsolete
 * (i.e., the file is deleted, the block is de-allocated, or the block is a
 * fragment that gets upgraded). All of these cases are handled in
 * procedures described later.
 *
 * When a file extension causes a fragment to be upgraded, either to a larger
 * fragment or to a full block, the on-disk location may change (if the
 * previous fragment could not simply be extended). In this case, the old
 * fragment must be de-allocated, but not until after the inode's pointer has
 * been updated. In most cases, this is handled by later procedures, which
 * will construct a "freefrag" structure to be added to the workitem queue
 * when the inode update is complete (or obsolete). The main exception to
 * this is when an allocation occurs while a pending allocation dependency
 * (for the same block pointer) remains. This case is handled in the main
 * allocation dependency setup procedure by immediately freeing the
 * unreferenced fragments.
 */
void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;	/* inode to which block is being added */
	ufs_lbn_t lbn;		/* block pointer within inode */
	ufs2_daddr_t newblkno;	/* disk block number being added */
	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
	long newsize;		/* size of new block */
	long oldsize;		/* size of old block */
	struct buf *bp;		/* bp for allocated block */
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct pagedep *pagedep;
	struct newblk *newblk;
	struct mount *mp;

	mp = UFSTOVFS(ip->i_ump);
	adp = malloc(sizeof(struct allocdirect),
	    M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
	adp->ad_lbn = lbn;
	adp->ad_newblkno = newblkno;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;
	adp->ad_state = ATTACHED;
	LIST_INIT(&adp->ad_newdirblk);
	if (newblkno == oldblkno)
		adp->ad_freefrag = NULL;
	else
		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);

	ACQUIRE_LOCK(&lk);
	if (lbn >= NDADDR) {
		/* allocating an indirect block */
		if (oldblkno != 0)
			panic("softdep_setup_allocdirect: non-zero indir");
	} else {
		/*
		 * Allocating a direct block.
		 *
		 * If we are allocating a directory block, then we must
		 * allocate an associated pagedep to track additions and
		 * deletions.
		 */
		if ((ip->i_mode & IFMT) == IFDIR &&
		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	}
	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocdirect: lost block");
	if (newblk->nb_state == DEPCOMPLETE) {
		adp->ad_state |= DEPCOMPLETE;
		adp->ad_buf = NULL;
	} else {
		bmsafemap = newblk->nb_bmsafemap;
		adp->ad_buf = bmsafemap->sm_buf;
		LIST_REMOVE(newblk, nb_deps);
		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	}
	LIST_REMOVE(newblk, nb_hash);
	free(newblk, M_NEWBLK);

	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
	adp->ad_inodedep = inodedep;
	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
	/*
	 * The list of allocdirects must be kept in sorted and ascending
	 * order so that the rollback routines can quickly determine the
	 * first uncommitted block (the size of the file stored on disk
	 * ends at the end of the lowest committed fragment, or if there
	 * are no fragments, at the end of the highest committed block).
	 * Since files generally grow, the typical case is that the new
	 * block is to be added at the end of the list. We speed this
	 * special case by checking against the last allocdirect in the
	 * list before laboriously traversing the list looking for the
	 * insertion point.
	 */
	adphead = &inodedep->id_newinoupdt;
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
		/* insert at end of list */
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
		if (oldadp != NULL && oldadp->ad_lbn == lbn)
			allocdirect_merge(adphead, adp, oldadp);
		FREE_LOCK(&lk);
		return;
	}
	TAILQ_FOREACH(oldadp, adphead, ad_next) {
		if (oldadp->ad_lbn >= lbn)
			break;
	}
	if (oldadp == NULL)
		panic("softdep_setup_allocdirect: lost entry");
	/* insert in middle of list */
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	if (oldadp->ad_lbn == lbn)
		allocdirect_merge(adphead, adp, oldadp);
	FREE_LOCK(&lk);
}

/*
 * Replace an old allocdirect dependency with a newer one.
 * This routine must be called with splbio interrupts blocked.
 */
static void
allocdirect_merge(adphead, newadp, oldadp)
	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
	struct allocdirect *newadp;	/* allocdirect being added */
	struct allocdirect *oldadp;	/* existing allocdirect being checked */
{
	struct worklist *wk;
	struct freefrag *freefrag;
	struct newdirblk *newdirblk;

	mtx_assert(&lk, MA_OWNED);
	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
	    newadp->ad_oldsize != oldadp->ad_newsize ||
	    newadp->ad_lbn >= NDADDR)
		panic("%s %jd != new %jd || old size %ld != new %ld",
		    "allocdirect_merge: old blkno",
		    (intmax_t)newadp->ad_oldblkno,
		    (intmax_t)oldadp->ad_newblkno,
		    newadp->ad_oldsize, oldadp->ad_newsize);
	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	newadp->ad_oldsize = oldadp->ad_oldsize;
	/*
	 * If the old dependency had a fragment to free or had never
	 * previously had a block allocated, then the new dependency
	 * can immediately post its freefrag and adopt the old freefrag.
	 * This action is done by swapping the freefrag dependencies.
1780 * The new dependency gains the old one's freefrag, and the 1781 * old one gets the new one and then immediately puts it on 1782 * the worklist when it is freed by free_allocdirect. It is 1783 * not possible to do this swap when the old dependency had a 1784 * non-zero size but no previous fragment to free. This condition 1785 * arises when the new block is an extension of the old block. 1786 * Here, the first part of the fragment allocated to the new 1787 * dependency is part of the block currently claimed on disk by 1788 * the old dependency, so cannot legitimately be freed until the 1789 * conditions for the new dependency are fulfilled. 1790 */ 1791 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { 1792 freefrag = newadp->ad_freefrag; 1793 newadp->ad_freefrag = oldadp->ad_freefrag; 1794 oldadp->ad_freefrag = freefrag; 1795 } 1796 /* 1797 * If we are tracking a new directory-block allocation, 1798 * move it from the old allocdirect to the new allocdirect. 1799 */ 1800 if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { 1801 newdirblk = WK_NEWDIRBLK(wk); 1802 WORKLIST_REMOVE(&newdirblk->db_list); 1803 if (!LIST_EMPTY(&oldadp->ad_newdirblk)) 1804 panic("allocdirect_merge: extra newdirblk"); 1805 WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list); 1806 } 1807 free_allocdirect(adphead, oldadp, 0); 1808 } 1809 1810 /* 1811 * Allocate a new freefrag structure if needed. 1812 */ 1813 static struct freefrag * 1814 newfreefrag(ip, blkno, size) 1815 struct inode *ip; 1816 ufs2_daddr_t blkno; 1817 long size; 1818 { 1819 struct freefrag *freefrag; 1820 struct fs *fs; 1821 1822 if (blkno == 0) 1823 return (NULL); 1824 fs = ip->i_fs; 1825 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 1826 panic("newfreefrag: frag size"); 1827 freefrag = malloc(sizeof(struct freefrag), 1828 M_FREEFRAG, M_SOFTDEP_FLAGS); 1829 workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump)); 1830 freefrag->ff_inum = ip->i_number; 1831 freefrag->ff_blkno = blkno; 1832 freefrag->ff_fragsize = size; 1833 return (freefrag); 1834 } 1835 1836 /* 1837 * This workitem de-allocates fragments that were replaced during 1838 * file block allocation. 1839 */ 1840 static void 1841 handle_workitem_freefrag(freefrag) 1842 struct freefrag *freefrag; 1843 { 1844 struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); 1845 1846 ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, 1847 freefrag->ff_fragsize, freefrag->ff_inum); 1848 ACQUIRE_LOCK(&lk); 1849 WORKITEM_FREE(freefrag, D_FREEFRAG); 1850 FREE_LOCK(&lk); 1851 } 1852 1853 /* 1854 * Set up a dependency structure for an external attributes data block. 1855 * This routine follows much of the structure of softdep_setup_allocdirect. 1856 * See the description of softdep_setup_allocdirect above for details. 
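 * The differences visible below are that the dependency is marked
 * EXTDATA, it is queued on id_newextupdt rather than id_newinoupdt,
 * the logical block number is bounded by NXADDR rather than NDADDR,
 * and no pagedep is needed since extended attribute data is never a
 * directory.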
1857 */ 1858 void 1859 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 1860 struct inode *ip; 1861 ufs_lbn_t lbn; 1862 ufs2_daddr_t newblkno; 1863 ufs2_daddr_t oldblkno; 1864 long newsize; 1865 long oldsize; 1866 struct buf *bp; 1867 { 1868 struct allocdirect *adp, *oldadp; 1869 struct allocdirectlst *adphead; 1870 struct bmsafemap *bmsafemap; 1871 struct inodedep *inodedep; 1872 struct newblk *newblk; 1873 struct mount *mp; 1874 1875 mp = UFSTOVFS(ip->i_ump); 1876 adp = malloc(sizeof(struct allocdirect), 1877 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO); 1878 workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp); 1879 adp->ad_lbn = lbn; 1880 adp->ad_newblkno = newblkno; 1881 adp->ad_oldblkno = oldblkno; 1882 adp->ad_newsize = newsize; 1883 adp->ad_oldsize = oldsize; 1884 adp->ad_state = ATTACHED | EXTDATA; 1885 LIST_INIT(&adp->ad_newdirblk); 1886 if (newblkno == oldblkno) 1887 adp->ad_freefrag = NULL; 1888 else 1889 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); 1890 1891 ACQUIRE_LOCK(&lk); 1892 if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) 1893 panic("softdep_setup_allocext: lost block"); 1894 1895 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); 1896 adp->ad_inodedep = inodedep; 1897 1898 if (newblk->nb_state == DEPCOMPLETE) { 1899 adp->ad_state |= DEPCOMPLETE; 1900 adp->ad_buf = NULL; 1901 } else { 1902 bmsafemap = newblk->nb_bmsafemap; 1903 adp->ad_buf = bmsafemap->sm_buf; 1904 LIST_REMOVE(newblk, nb_deps); 1905 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); 1906 } 1907 LIST_REMOVE(newblk, nb_hash); 1908 free(newblk, M_NEWBLK); 1909 1910 WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); 1911 if (lbn >= NXADDR) 1912 panic("softdep_setup_allocext: lbn %lld > NXADDR", 1913 (long long)lbn); 1914 /* 1915 * The list of allocdirects must be kept in sorted and ascending 1916 * order so that the rollback routines can quickly determine the 1917 * first uncommitted block (the size of the file stored on disk 1918 * ends at the end of the lowest committed fragment, or if there 1919 * are no fragments, at the end of the highest committed block). 1920 * Since files generally grow, the typical case is that the new 1921 * block is to be added at the end of the list. We speed this 1922 * special case by checking against the last allocdirect in the 1923 * list before laboriously traversing the list looking for the 1924 * insertion point. 1925 */ 1926 adphead = &inodedep->id_newextupdt; 1927 oldadp = TAILQ_LAST(adphead, allocdirectlst); 1928 if (oldadp == NULL || oldadp->ad_lbn <= lbn) { 1929 /* insert at end of list */ 1930 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 1931 if (oldadp != NULL && oldadp->ad_lbn == lbn) 1932 allocdirect_merge(adphead, adp, oldadp); 1933 FREE_LOCK(&lk); 1934 return; 1935 } 1936 TAILQ_FOREACH(oldadp, adphead, ad_next) { 1937 if (oldadp->ad_lbn >= lbn) 1938 break; 1939 } 1940 if (oldadp == NULL) 1941 panic("softdep_setup_allocext: lost entry"); 1942 /* insert in middle of list */ 1943 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 1944 if (oldadp->ad_lbn == lbn) 1945 allocdirect_merge(adphead, adp, oldadp); 1946 FREE_LOCK(&lk); 1947 } 1948 1949 /* 1950 * Indirect block allocation dependencies. 1951 * 1952 * The same dependencies that exist for a direct block also exist when 1953 * a new block is allocated and pointed to by an entry in a block of 1954 * indirect pointers. The undo/redo states described above are also 1955 * used here. 
Because an indirect block contains many pointers that 1956 * may have dependencies, a second copy of the entire in-memory indirect 1957 * block is kept. The buffer cache copy is always completely up-to-date. 1958 * The second copy, which is used only as a source for disk writes, 1959 * contains only the safe pointers (i.e., those that have no remaining 1960 * update dependencies). The second copy is freed when all pointers 1961 * are safe. The cache is not allowed to replace indirect blocks with 1962 * pending update dependencies. If a buffer containing an indirect 1963 * block with dependencies is written, these routines will mark it 1964 * dirty again. It can only be successfully written once all the 1965 * dependencies are removed. The ffs_fsync routine in conjunction with 1966 * softdep_sync_metadata work together to get all the dependencies 1967 * removed so that a file can be successfully written to disk. Three 1968 * procedures are used when setting up indirect block pointer 1969 * dependencies. The division is necessary because of the organization 1970 * of the "balloc" routine and because of the distinction between file 1971 * pages and file metadata blocks. 1972 */ 1973 1974 /* 1975 * Allocate a new allocindir structure. 1976 */ 1977 static struct allocindir * 1978 newallocindir(ip, ptrno, newblkno, oldblkno) 1979 struct inode *ip; /* inode for file being extended */ 1980 int ptrno; /* offset of pointer in indirect block */ 1981 ufs2_daddr_t newblkno; /* disk block number being added */ 1982 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 1983 { 1984 struct allocindir *aip; 1985 1986 aip = malloc(sizeof(struct allocindir), 1987 M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO); 1988 workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump)); 1989 aip->ai_state = ATTACHED; 1990 aip->ai_offset = ptrno; 1991 aip->ai_newblkno = newblkno; 1992 aip->ai_oldblkno = oldblkno; 1993 aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize); 1994 return (aip); 1995 } 1996 1997 /* 1998 * Called just before setting an indirect block pointer 1999 * to a newly allocated file page. 2000 */ 2001 void 2002 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 2003 struct inode *ip; /* inode for file being extended */ 2004 ufs_lbn_t lbn; /* allocated block number within file */ 2005 struct buf *bp; /* buffer with indirect blk referencing page */ 2006 int ptrno; /* offset of pointer in indirect block */ 2007 ufs2_daddr_t newblkno; /* disk block number being added */ 2008 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 2009 struct buf *nbp; /* buffer holding allocated page */ 2010 { 2011 struct allocindir *aip; 2012 struct pagedep *pagedep; 2013 2014 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); 2015 aip = newallocindir(ip, ptrno, newblkno, oldblkno); 2016 ACQUIRE_LOCK(&lk); 2017 /* 2018 * If we are allocating a directory page, then we must 2019 * allocate an associated pagedep to track additions and 2020 * deletions. 2021 */ 2022 if ((ip->i_mode & IFMT) == IFDIR && 2023 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) 2024 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); 2025 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); 2026 setup_allocindir_phase2(bp, ip, aip); 2027 FREE_LOCK(&lk); 2028 } 2029 2030 /* 2031 * Called just before setting an indirect block pointer to a 2032 * newly allocated indirect block. 
2033 */ 2034 void 2035 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 2036 struct buf *nbp; /* newly allocated indirect block */ 2037 struct inode *ip; /* inode for file being extended */ 2038 struct buf *bp; /* indirect block referencing allocated block */ 2039 int ptrno; /* offset of pointer in indirect block */ 2040 ufs2_daddr_t newblkno; /* disk block number being added */ 2041 { 2042 struct allocindir *aip; 2043 2044 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); 2045 aip = newallocindir(ip, ptrno, newblkno, 0); 2046 ACQUIRE_LOCK(&lk); 2047 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); 2048 setup_allocindir_phase2(bp, ip, aip); 2049 FREE_LOCK(&lk); 2050 } 2051 2052 /* 2053 * Called to finish the allocation of the "aip" allocated 2054 * by one of the two routines above. 2055 */ 2056 static void 2057 setup_allocindir_phase2(bp, ip, aip) 2058 struct buf *bp; /* in-memory copy of the indirect block */ 2059 struct inode *ip; /* inode for file being extended */ 2060 struct allocindir *aip; /* allocindir allocated by the above routines */ 2061 { 2062 struct worklist *wk; 2063 struct indirdep *indirdep, *newindirdep; 2064 struct bmsafemap *bmsafemap; 2065 struct allocindir *oldaip; 2066 struct freefrag *freefrag; 2067 struct newblk *newblk; 2068 ufs2_daddr_t blkno; 2069 2070 mtx_assert(&lk, MA_OWNED); 2071 if (bp->b_lblkno >= 0) 2072 panic("setup_allocindir_phase2: not indir blk"); 2073 for (indirdep = NULL, newindirdep = NULL; ; ) { 2074 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 2075 if (wk->wk_type != D_INDIRDEP) 2076 continue; 2077 indirdep = WK_INDIRDEP(wk); 2078 break; 2079 } 2080 if (indirdep == NULL && newindirdep) { 2081 indirdep = newindirdep; 2082 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); 2083 newindirdep = NULL; 2084 } 2085 if (indirdep) { 2086 if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, 2087 &newblk) == 0) 2088 panic("setup_allocindir: lost block"); 2089 if (newblk->nb_state == DEPCOMPLETE) { 2090 aip->ai_state |= DEPCOMPLETE; 2091 aip->ai_buf = NULL; 2092 } else { 2093 bmsafemap = newblk->nb_bmsafemap; 2094 aip->ai_buf = bmsafemap->sm_buf; 2095 LIST_REMOVE(newblk, nb_deps); 2096 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, 2097 aip, ai_deps); 2098 } 2099 LIST_REMOVE(newblk, nb_hash); 2100 free(newblk, M_NEWBLK); 2101 aip->ai_indirdep = indirdep; 2102 /* 2103 * Check to see if there is an existing dependency 2104 * for this block. If there is, merge the old 2105 * dependency into the new one. 
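 *
 * For example, if pointer slot 5 of this indirect block still carries an
 * uncommitted dependency on block 100 and the slot is now being changed to
 * block 200, the old allocindir is found by matching ai_offset, the new
 * dependency inherits its ai_oldblkno (and its freefrag, if any), and the
 * old structure is then freed.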
2106 */ 2107 if (aip->ai_oldblkno == 0) 2108 oldaip = NULL; 2109 else 2110 2111 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) 2112 if (oldaip->ai_offset == aip->ai_offset) 2113 break; 2114 freefrag = NULL; 2115 if (oldaip != NULL) { 2116 if (oldaip->ai_newblkno != aip->ai_oldblkno) 2117 panic("setup_allocindir_phase2: blkno"); 2118 aip->ai_oldblkno = oldaip->ai_oldblkno; 2119 freefrag = aip->ai_freefrag; 2120 aip->ai_freefrag = oldaip->ai_freefrag; 2121 oldaip->ai_freefrag = NULL; 2122 free_allocindir(oldaip, NULL); 2123 } 2124 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); 2125 if (ip->i_ump->um_fstype == UFS1) 2126 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data) 2127 [aip->ai_offset] = aip->ai_oldblkno; 2128 else 2129 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data) 2130 [aip->ai_offset] = aip->ai_oldblkno; 2131 FREE_LOCK(&lk); 2132 if (freefrag != NULL) 2133 handle_workitem_freefrag(freefrag); 2134 } else 2135 FREE_LOCK(&lk); 2136 if (newindirdep) { 2137 newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE; 2138 brelse(newindirdep->ir_savebp); 2139 ACQUIRE_LOCK(&lk); 2140 WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP); 2141 if (indirdep) 2142 break; 2143 FREE_LOCK(&lk); 2144 } 2145 if (indirdep) { 2146 ACQUIRE_LOCK(&lk); 2147 break; 2148 } 2149 newindirdep = malloc(sizeof(struct indirdep), 2150 M_INDIRDEP, M_SOFTDEP_FLAGS); 2151 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, 2152 UFSTOVFS(ip->i_ump)); 2153 newindirdep->ir_state = ATTACHED; 2154 if (ip->i_ump->um_fstype == UFS1) 2155 newindirdep->ir_state |= UFS1FMT; 2156 LIST_INIT(&newindirdep->ir_deplisthd); 2157 LIST_INIT(&newindirdep->ir_donehd); 2158 if (bp->b_blkno == bp->b_lblkno) { 2159 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, 2160 NULL, NULL); 2161 bp->b_blkno = blkno; 2162 } 2163 newindirdep->ir_savebp = 2164 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); 2165 BUF_KERNPROC(newindirdep->ir_savebp); 2166 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); 2167 ACQUIRE_LOCK(&lk); 2168 } 2169 } 2170 2171 /* 2172 * Block de-allocation dependencies. 2173 * 2174 * When blocks are de-allocated, the on-disk pointers must be nullified before 2175 * the blocks are made available for use by other files. (The true 2176 * requirement is that old pointers must be nullified before new on-disk 2177 * pointers are set. We chose this slightly more stringent requirement to 2178 * reduce complexity.) Our implementation handles this dependency by updating 2179 * the inode (or indirect block) appropriately but delaying the actual block 2180 * de-allocation (i.e., freemap and free space count manipulation) until 2181 * after the updated versions reach stable storage. After the disk is 2182 * updated, the blocks can be safely de-allocated whenever it is convenient. 2183 * This implementation handles only the common case of reducing a file's 2184 * length to zero. Other cases are handled by the conventional synchronous 2185 * write approach. 2186 * 2187 * The ffs implementation with which we worked double-checks 2188 * the state of the block pointers and file size as it reduces 2189 * a file's length. Some of this code is replicated here in our 2190 * soft updates implementation. The freeblks->fb_chkcnt field is 2191 * used to transfer a part of this information to the procedure 2192 * that eventually de-allocates the blocks. 2193 * 2194 * This routine should be called from the routine that shortens 2195 * a file's length, before the inode's size or block pointers 2196 * are modified. 
It will save the block pointer information for 2197 * later release and zero the inode so that the calling routine 2198 * can release it. 2199 */ 2200 void 2201 softdep_setup_freeblocks(ip, length, flags) 2202 struct inode *ip; /* The inode whose length is to be reduced */ 2203 off_t length; /* The new length for the file */ 2204 int flags; /* IO_EXT and/or IO_NORMAL */ 2205 { 2206 struct freeblks *freeblks; 2207 struct inodedep *inodedep; 2208 struct allocdirect *adp; 2209 struct bufobj *bo; 2210 struct vnode *vp; 2211 struct buf *bp; 2212 struct fs *fs; 2213 ufs2_daddr_t extblocks, datablocks; 2214 struct mount *mp; 2215 int i, delay, error; 2216 2217 fs = ip->i_fs; 2218 mp = UFSTOVFS(ip->i_ump); 2219 if (length != 0) 2220 panic("softdep_setup_freeblocks: non-zero length"); 2221 freeblks = malloc(sizeof(struct freeblks), 2222 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); 2223 workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); 2224 freeblks->fb_state = ATTACHED; 2225 freeblks->fb_uid = ip->i_uid; 2226 freeblks->fb_previousinum = ip->i_number; 2227 freeblks->fb_devvp = ip->i_devvp; 2228 ACQUIRE_LOCK(&lk); 2229 num_freeblkdep++; 2230 FREE_LOCK(&lk); 2231 extblocks = 0; 2232 if (fs->fs_magic == FS_UFS2_MAGIC) 2233 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); 2234 datablocks = DIP(ip, i_blocks) - extblocks; 2235 if ((flags & IO_NORMAL) == 0) { 2236 freeblks->fb_oldsize = 0; 2237 freeblks->fb_chkcnt = 0; 2238 } else { 2239 freeblks->fb_oldsize = ip->i_size; 2240 ip->i_size = 0; 2241 DIP_SET(ip, i_size, 0); 2242 freeblks->fb_chkcnt = datablocks; 2243 for (i = 0; i < NDADDR; i++) { 2244 freeblks->fb_dblks[i] = DIP(ip, i_db[i]); 2245 DIP_SET(ip, i_db[i], 0); 2246 } 2247 for (i = 0; i < NIADDR; i++) { 2248 freeblks->fb_iblks[i] = DIP(ip, i_ib[i]); 2249 DIP_SET(ip, i_ib[i], 0); 2250 } 2251 /* 2252 * If the file was removed, then the space being freed was 2253 * accounted for then (see softdep_releasefile()). If the 2254 * file is merely being truncated, then we account for it now. 2255 */ 2256 if ((ip->i_flag & IN_SPACECOUNTED) == 0) { 2257 UFS_LOCK(ip->i_ump); 2258 fs->fs_pendingblocks += datablocks; 2259 UFS_UNLOCK(ip->i_ump); 2260 } 2261 } 2262 if ((flags & IO_EXT) == 0) { 2263 freeblks->fb_oldextsize = 0; 2264 } else { 2265 freeblks->fb_oldextsize = ip->i_din2->di_extsize; 2266 ip->i_din2->di_extsize = 0; 2267 freeblks->fb_chkcnt += extblocks; 2268 for (i = 0; i < NXADDR; i++) { 2269 freeblks->fb_eblks[i] = ip->i_din2->di_extb[i]; 2270 ip->i_din2->di_extb[i] = 0; 2271 } 2272 } 2273 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt); 2274 /* 2275 * Push the zero'ed inode to to its disk buffer so that we are free 2276 * to delete its dependencies below. Once the dependencies are gone 2277 * the buffer can be safely released. 2278 */ 2279 if ((error = bread(ip->i_devvp, 2280 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 2281 (int)fs->fs_bsize, NOCRED, &bp)) != 0) { 2282 brelse(bp); 2283 softdep_error("softdep_setup_freeblocks", error); 2284 } 2285 if (ip->i_ump->um_fstype == UFS1) 2286 *((struct ufs1_dinode *)bp->b_data + 2287 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; 2288 else 2289 *((struct ufs2_dinode *)bp->b_data + 2290 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; 2291 /* 2292 * Find and eliminate any inode dependencies. 
2293 */ 2294 ACQUIRE_LOCK(&lk); 2295 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 2296 if ((inodedep->id_state & IOSTARTED) != 0) 2297 panic("softdep_setup_freeblocks: inode busy"); 2298 /* 2299 * Add the freeblks structure to the list of operations that 2300 * must await the zero'ed inode being written to disk. If we 2301 * still have a bitmap dependency (delay == 0), then the inode 2302 * has never been written to disk, so we can process the 2303 * freeblks below once we have deleted the dependencies. 2304 */ 2305 delay = (inodedep->id_state & DEPCOMPLETE); 2306 if (delay) 2307 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); 2308 /* 2309 * Because the file length has been truncated to zero, any 2310 * pending block allocation dependency structures associated 2311 * with this inode are obsolete and can simply be de-allocated. 2312 * We must first merge the two dependency lists to get rid of 2313 * any duplicate freefrag structures, then purge the merged list. 2314 * If we still have a bitmap dependency, then the inode has never 2315 * been written to disk, so we can free any fragments without delay. 2316 */ 2317 if (flags & IO_NORMAL) { 2318 merge_inode_lists(&inodedep->id_newinoupdt, 2319 &inodedep->id_inoupdt); 2320 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) 2321 free_allocdirect(&inodedep->id_inoupdt, adp, delay); 2322 } 2323 if (flags & IO_EXT) { 2324 merge_inode_lists(&inodedep->id_newextupdt, 2325 &inodedep->id_extupdt); 2326 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) 2327 free_allocdirect(&inodedep->id_extupdt, adp, delay); 2328 } 2329 FREE_LOCK(&lk); 2330 bdwrite(bp); 2331 /* 2332 * We must wait for any I/O in progress to finish so that 2333 * all potential buffers on the dirty list will be visible. 2334 * Once they are all there, walk the list and get rid of 2335 * any dependencies. 2336 */ 2337 vp = ITOV(ip); 2338 bo = &vp->v_bufobj; 2339 BO_LOCK(bo); 2340 drain_output(vp); 2341 restart: 2342 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { 2343 if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || 2344 ((flags & IO_NORMAL) == 0 && 2345 (bp->b_xflags & BX_ALTDATA) == 0)) 2346 continue; 2347 if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL) 2348 goto restart; 2349 BO_UNLOCK(bo); 2350 ACQUIRE_LOCK(&lk); 2351 (void) inodedep_lookup(mp, ip->i_number, 0, &inodedep); 2352 deallocate_dependencies(bp, inodedep); 2353 FREE_LOCK(&lk); 2354 bp->b_flags |= B_INVAL | B_NOCACHE; 2355 brelse(bp); 2356 BO_LOCK(bo); 2357 goto restart; 2358 } 2359 BO_UNLOCK(bo); 2360 ACQUIRE_LOCK(&lk); 2361 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 2362 (void) free_inodedep(inodedep); 2363 2364 if(delay) { 2365 freeblks->fb_state |= DEPCOMPLETE; 2366 /* 2367 * If the inode with zeroed block pointers is now on disk 2368 * we can start freeing blocks. Add freeblks to the worklist 2369 * instead of calling handle_workitem_freeblocks directly as 2370 * it is more likely that additional IO is needed to complete 2371 * the request here than in the !delay case. 2372 */ 2373 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 2374 add_to_worklist(&freeblks->fb_list); 2375 } 2376 2377 FREE_LOCK(&lk); 2378 /* 2379 * If the inode has never been written to disk (delay == 0), 2380 * then we can process the freeblks now that we have deleted 2381 * the dependencies. 
2382 */ 2383 if (!delay) 2384 handle_workitem_freeblocks(freeblks, 0); 2385 } 2386 2387 /* 2388 * Reclaim any dependency structures from a buffer that is about to 2389 * be reallocated to a new vnode. The buffer must be locked, thus, 2390 * no I/O completion operations can occur while we are manipulating 2391 * its associated dependencies. The mutex is held so that other I/O's 2392 * associated with related dependencies do not occur. 2393 */ 2394 static void 2395 deallocate_dependencies(bp, inodedep) 2396 struct buf *bp; 2397 struct inodedep *inodedep; 2398 { 2399 struct worklist *wk; 2400 struct indirdep *indirdep; 2401 struct allocindir *aip; 2402 struct pagedep *pagedep; 2403 struct dirrem *dirrem; 2404 struct diradd *dap; 2405 int i; 2406 2407 mtx_assert(&lk, MA_OWNED); 2408 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 2409 switch (wk->wk_type) { 2410 2411 case D_INDIRDEP: 2412 indirdep = WK_INDIRDEP(wk); 2413 /* 2414 * None of the indirect pointers will ever be visible, 2415 * so they can simply be tossed. GOINGAWAY ensures 2416 * that allocated pointers will be saved in the buffer 2417 * cache until they are freed. Note that they will 2418 * only be able to be found by their physical address 2419 * since the inode mapping the logical address will 2420 * be gone. The save buffer used for the safe copy 2421 * was allocated in setup_allocindir_phase2 using 2422 * the physical address so it could be used for this 2423 * purpose. Hence we swap the safe copy with the real 2424 * copy, allowing the safe copy to be freed and holding 2425 * on to the real copy for later use in indir_trunc. 2426 */ 2427 if (indirdep->ir_state & GOINGAWAY) 2428 panic("deallocate_dependencies: already gone"); 2429 indirdep->ir_state |= GOINGAWAY; 2430 VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1; 2431 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) 2432 free_allocindir(aip, inodedep); 2433 if (bp->b_lblkno >= 0 || 2434 bp->b_blkno != indirdep->ir_savebp->b_lblkno) 2435 panic("deallocate_dependencies: not indir"); 2436 bcopy(bp->b_data, indirdep->ir_savebp->b_data, 2437 bp->b_bcount); 2438 WORKLIST_REMOVE(wk); 2439 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk); 2440 continue; 2441 2442 case D_PAGEDEP: 2443 pagedep = WK_PAGEDEP(wk); 2444 /* 2445 * None of the directory additions will ever be 2446 * visible, so they can simply be tossed. 2447 */ 2448 for (i = 0; i < DAHASHSZ; i++) 2449 while ((dap = 2450 LIST_FIRST(&pagedep->pd_diraddhd[i]))) 2451 free_diradd(dap); 2452 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) 2453 free_diradd(dap); 2454 /* 2455 * Copy any directory remove dependencies to the list 2456 * to be processed after the zero'ed inode is written. 2457 * If the inode has already been written, then they 2458 * can be dumped directly onto the work list. 
2459 */ 2460 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { 2461 LIST_REMOVE(dirrem, dm_next); 2462 dirrem->dm_dirinum = pagedep->pd_ino; 2463 if (inodedep == NULL || 2464 (inodedep->id_state & ALLCOMPLETE) == 2465 ALLCOMPLETE) 2466 add_to_worklist(&dirrem->dm_list); 2467 else 2468 WORKLIST_INSERT(&inodedep->id_bufwait, 2469 &dirrem->dm_list); 2470 } 2471 if ((pagedep->pd_state & NEWBLOCK) != 0) { 2472 LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list) 2473 if (wk->wk_type == D_NEWDIRBLK && 2474 WK_NEWDIRBLK(wk)->db_pagedep == 2475 pagedep) 2476 break; 2477 if (wk != NULL) { 2478 WORKLIST_REMOVE(wk); 2479 free_newdirblk(WK_NEWDIRBLK(wk)); 2480 } else 2481 panic("deallocate_dependencies: " 2482 "lost pagedep"); 2483 } 2484 WORKLIST_REMOVE(&pagedep->pd_list); 2485 LIST_REMOVE(pagedep, pd_hash); 2486 WORKITEM_FREE(pagedep, D_PAGEDEP); 2487 continue; 2488 2489 case D_ALLOCINDIR: 2490 free_allocindir(WK_ALLOCINDIR(wk), inodedep); 2491 continue; 2492 2493 case D_ALLOCDIRECT: 2494 case D_INODEDEP: 2495 panic("deallocate_dependencies: Unexpected type %s", 2496 TYPENAME(wk->wk_type)); 2497 /* NOTREACHED */ 2498 2499 default: 2500 panic("deallocate_dependencies: Unknown type %s", 2501 TYPENAME(wk->wk_type)); 2502 /* NOTREACHED */ 2503 } 2504 } 2505 } 2506 2507 /* 2508 * Free an allocdirect. Generate a new freefrag work request if appropriate. 2509 * This routine must be called with splbio interrupts blocked. 2510 */ 2511 static void 2512 free_allocdirect(adphead, adp, delay) 2513 struct allocdirectlst *adphead; 2514 struct allocdirect *adp; 2515 int delay; 2516 { 2517 struct newdirblk *newdirblk; 2518 struct worklist *wk; 2519 2520 mtx_assert(&lk, MA_OWNED); 2521 if ((adp->ad_state & DEPCOMPLETE) == 0) 2522 LIST_REMOVE(adp, ad_deps); 2523 TAILQ_REMOVE(adphead, adp, ad_next); 2524 if ((adp->ad_state & COMPLETE) == 0) 2525 WORKLIST_REMOVE(&adp->ad_list); 2526 if (adp->ad_freefrag != NULL) { 2527 if (delay) 2528 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, 2529 &adp->ad_freefrag->ff_list); 2530 else 2531 add_to_worklist(&adp->ad_freefrag->ff_list); 2532 } 2533 if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) { 2534 newdirblk = WK_NEWDIRBLK(wk); 2535 WORKLIST_REMOVE(&newdirblk->db_list); 2536 if (!LIST_EMPTY(&adp->ad_newdirblk)) 2537 panic("free_allocdirect: extra newdirblk"); 2538 if (delay) 2539 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, 2540 &newdirblk->db_list); 2541 else 2542 free_newdirblk(newdirblk); 2543 } 2544 WORKITEM_FREE(adp, D_ALLOCDIRECT); 2545 } 2546 2547 /* 2548 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. 2549 * This routine must be called with splbio interrupts blocked. 2550 */ 2551 static void 2552 free_newdirblk(newdirblk) 2553 struct newdirblk *newdirblk; 2554 { 2555 struct pagedep *pagedep; 2556 struct diradd *dap; 2557 int i; 2558 2559 mtx_assert(&lk, MA_OWNED); 2560 /* 2561 * If the pagedep is still linked onto the directory buffer 2562 * dependency chain, then some of the entries on the 2563 * pd_pendinghd list may not be committed to disk yet. In 2564 * this case, we will simply clear the NEWBLOCK flag and 2565 * let the pd_pendinghd list be processed when the pagedep 2566 * is next written. If the pagedep is no longer on the buffer 2567 * dependency chain, then all the entries on the pd_pending 2568 * list are committed to disk and we can free them here. 
2569 */ 2570 pagedep = newdirblk->db_pagedep; 2571 pagedep->pd_state &= ~NEWBLOCK; 2572 if ((pagedep->pd_state & ONWORKLIST) == 0) 2573 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 2574 free_diradd(dap); 2575 /* 2576 * If no dependencies remain, the pagedep will be freed. 2577 */ 2578 for (i = 0; i < DAHASHSZ; i++) 2579 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) 2580 break; 2581 if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) { 2582 LIST_REMOVE(pagedep, pd_hash); 2583 WORKITEM_FREE(pagedep, D_PAGEDEP); 2584 } 2585 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 2586 } 2587 2588 /* 2589 * Prepare an inode to be freed. The actual free operation is not 2590 * done until the zero'ed inode has been written to disk. 2591 */ 2592 void 2593 softdep_freefile(pvp, ino, mode) 2594 struct vnode *pvp; 2595 ino_t ino; 2596 int mode; 2597 { 2598 struct inode *ip = VTOI(pvp); 2599 struct inodedep *inodedep; 2600 struct freefile *freefile; 2601 2602 /* 2603 * This sets up the inode de-allocation dependency. 2604 */ 2605 freefile = malloc(sizeof(struct freefile), 2606 M_FREEFILE, M_SOFTDEP_FLAGS); 2607 workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount); 2608 freefile->fx_mode = mode; 2609 freefile->fx_oldinum = ino; 2610 freefile->fx_devvp = ip->i_devvp; 2611 if ((ip->i_flag & IN_SPACECOUNTED) == 0) { 2612 UFS_LOCK(ip->i_ump); 2613 ip->i_fs->fs_pendinginodes += 1; 2614 UFS_UNLOCK(ip->i_ump); 2615 } 2616 2617 /* 2618 * If the inodedep does not exist, then the zero'ed inode has 2619 * been written to disk. If the allocated inode has never been 2620 * written to disk, then the on-disk inode is zero'ed. In either 2621 * case we can free the file immediately. 2622 */ 2623 ACQUIRE_LOCK(&lk); 2624 if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 || 2625 check_inode_unwritten(inodedep)) { 2626 FREE_LOCK(&lk); 2627 handle_workitem_freefile(freefile); 2628 return; 2629 } 2630 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); 2631 FREE_LOCK(&lk); 2632 if (ip->i_number == ino) 2633 ip->i_flag |= IN_MODIFIED; 2634 } 2635 2636 /* 2637 * Check to see if an inode has never been written to disk. If 2638 * so free the inodedep and return success, otherwise return failure. 2639 * This routine must be called with splbio interrupts blocked. 2640 * 2641 * If we still have a bitmap dependency, then the inode has never 2642 * been written to disk. Drop the dependency as it is no longer 2643 * necessary since the inode is being deallocated. We set the 2644 * ALLCOMPLETE flags since the bitmap now properly shows that the 2645 * inode is not allocated. Even if the inode is actively being 2646 * written, it has been rolled back to its zero'ed state, so we 2647 * are ensured that a zero inode is what is on the disk. For short 2648 * lived files, this change will usually result in removing all the 2649 * dependencies from the inode so that it can be freed immediately. 
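 *
 * For example, a file that is created and then unlinked before its inode
 * block ever reaches the disk still has its bitmap dependency (DEPCOMPLETE
 * clear).  In that case the inodedep is torn down here and softdep_freefile
 * can pass its work item directly to handle_workitem_freefile instead of
 * waiting for the zero'ed inode to be written.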
2650 */ 2651 static int 2652 check_inode_unwritten(inodedep) 2653 struct inodedep *inodedep; 2654 { 2655 2656 mtx_assert(&lk, MA_OWNED); 2657 if ((inodedep->id_state & DEPCOMPLETE) != 0 || 2658 !LIST_EMPTY(&inodedep->id_pendinghd) || 2659 !LIST_EMPTY(&inodedep->id_bufwait) || 2660 !LIST_EMPTY(&inodedep->id_inowait) || 2661 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 2662 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 2663 !TAILQ_EMPTY(&inodedep->id_extupdt) || 2664 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 2665 inodedep->id_nlinkdelta != 0) 2666 return (0); 2667 2668 /* 2669 * Another process might be in initiate_write_inodeblock_ufs[12] 2670 * trying to allocate memory without holding "Softdep Lock". 2671 */ 2672 if ((inodedep->id_state & IOSTARTED) != 0 && 2673 inodedep->id_savedino1 == NULL) 2674 return (0); 2675 2676 inodedep->id_state |= ALLCOMPLETE; 2677 LIST_REMOVE(inodedep, id_deps); 2678 inodedep->id_buf = NULL; 2679 if (inodedep->id_state & ONWORKLIST) 2680 WORKLIST_REMOVE(&inodedep->id_list); 2681 if (inodedep->id_savedino1 != NULL) { 2682 free(inodedep->id_savedino1, M_SAVEDINO); 2683 inodedep->id_savedino1 = NULL; 2684 } 2685 if (free_inodedep(inodedep) == 0) 2686 panic("check_inode_unwritten: busy inode"); 2687 return (1); 2688 } 2689 2690 /* 2691 * Try to free an inodedep structure. Return 1 if it could be freed. 2692 */ 2693 static int 2694 free_inodedep(inodedep) 2695 struct inodedep *inodedep; 2696 { 2697 2698 mtx_assert(&lk, MA_OWNED); 2699 if ((inodedep->id_state & ONWORKLIST) != 0 || 2700 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || 2701 !LIST_EMPTY(&inodedep->id_pendinghd) || 2702 !LIST_EMPTY(&inodedep->id_bufwait) || 2703 !LIST_EMPTY(&inodedep->id_inowait) || 2704 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 2705 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 2706 !TAILQ_EMPTY(&inodedep->id_extupdt) || 2707 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 2708 inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL) 2709 return (0); 2710 LIST_REMOVE(inodedep, id_hash); 2711 WORKITEM_FREE(inodedep, D_INODEDEP); 2712 num_inodedep -= 1; 2713 return (1); 2714 } 2715 2716 /* 2717 * This workitem routine performs the block de-allocation. 2718 * The workitem is added to the pending list after the updated 2719 * inode block has been written to disk. As mentioned above, 2720 * checks regarding the number of blocks de-allocated (compared 2721 * to the number of blocks allocated for the file) are also 2722 * performed in this function. 2723 */ 2724 static void 2725 handle_workitem_freeblocks(freeblks, flags) 2726 struct freeblks *freeblks; 2727 int flags; 2728 { 2729 struct inode *ip; 2730 struct vnode *vp; 2731 struct fs *fs; 2732 struct ufsmount *ump; 2733 int i, nblocks, level, bsize; 2734 ufs2_daddr_t bn, blocksreleased = 0; 2735 int error, allerror = 0; 2736 ufs_lbn_t baselbns[NIADDR], tmpval; 2737 int fs_pendingblocks; 2738 2739 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 2740 fs = ump->um_fs; 2741 fs_pendingblocks = 0; 2742 tmpval = 1; 2743 baselbns[0] = NDADDR; 2744 for (i = 1; i < NIADDR; i++) { 2745 tmpval *= NINDIR(fs); 2746 baselbns[i] = baselbns[i - 1] + tmpval; 2747 } 2748 nblocks = btodb(fs->fs_bsize); 2749 blocksreleased = 0; 2750 /* 2751 * Release all extended attribute blocks or frags. 
2752 */ 2753 if (freeblks->fb_oldextsize > 0) { 2754 for (i = (NXADDR - 1); i >= 0; i--) { 2755 if ((bn = freeblks->fb_eblks[i]) == 0) 2756 continue; 2757 bsize = sblksize(fs, freeblks->fb_oldextsize, i); 2758 ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize, 2759 freeblks->fb_previousinum); 2760 blocksreleased += btodb(bsize); 2761 } 2762 } 2763 /* 2764 * Release all data blocks or frags. 2765 */ 2766 if (freeblks->fb_oldsize > 0) { 2767 /* 2768 * Indirect blocks first. 2769 */ 2770 for (level = (NIADDR - 1); level >= 0; level--) { 2771 if ((bn = freeblks->fb_iblks[level]) == 0) 2772 continue; 2773 if ((error = indir_trunc(freeblks, fsbtodb(fs, bn), 2774 level, baselbns[level], &blocksreleased)) != 0) 2775 allerror = error; 2776 ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, 2777 fs->fs_bsize, freeblks->fb_previousinum); 2778 fs_pendingblocks += nblocks; 2779 blocksreleased += nblocks; 2780 } 2781 /* 2782 * All direct blocks or frags. 2783 */ 2784 for (i = (NDADDR - 1); i >= 0; i--) { 2785 if ((bn = freeblks->fb_dblks[i]) == 0) 2786 continue; 2787 bsize = sblksize(fs, freeblks->fb_oldsize, i); 2788 ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize, 2789 freeblks->fb_previousinum); 2790 fs_pendingblocks += btodb(bsize); 2791 blocksreleased += btodb(bsize); 2792 } 2793 } 2794 UFS_LOCK(ump); 2795 fs->fs_pendingblocks -= fs_pendingblocks; 2796 UFS_UNLOCK(ump); 2797 /* 2798 * If we still have not finished background cleanup, then check 2799 * to see if the block count needs to be adjusted. 2800 */ 2801 if (freeblks->fb_chkcnt != blocksreleased && 2802 (fs->fs_flags & FS_UNCLEAN) != 0 && 2803 ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum, 2804 (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) 2805 == 0) { 2806 ip = VTOI(vp); 2807 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \ 2808 freeblks->fb_chkcnt - blocksreleased); 2809 ip->i_flag |= IN_CHANGE; 2810 vput(vp); 2811 } 2812 2813 #ifdef INVARIANTS 2814 if (freeblks->fb_chkcnt != blocksreleased && 2815 ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0)) 2816 printf("handle_workitem_freeblocks: block count\n"); 2817 if (allerror) 2818 softdep_error("handle_workitem_freeblks", allerror); 2819 #endif /* INVARIANTS */ 2820 2821 ACQUIRE_LOCK(&lk); 2822 WORKITEM_FREE(freeblks, D_FREEBLKS); 2823 num_freeblkdep--; 2824 FREE_LOCK(&lk); 2825 } 2826 2827 /* 2828 * Release blocks associated with the inode ip and stored in the indirect 2829 * block dbn. If level is greater than SINGLE, the block is an indirect block 2830 * and recursive calls to indirtrunc must be used to cleanse other indirect 2831 * blocks. 2832 */ 2833 static int 2834 indir_trunc(freeblks, dbn, level, lbn, countp) 2835 struct freeblks *freeblks; 2836 ufs2_daddr_t dbn; 2837 int level; 2838 ufs_lbn_t lbn; 2839 ufs2_daddr_t *countp; 2840 { 2841 struct buf *bp; 2842 struct fs *fs; 2843 struct worklist *wk; 2844 struct indirdep *indirdep; 2845 struct ufsmount *ump; 2846 ufs1_daddr_t *bap1 = 0; 2847 ufs2_daddr_t nb, *bap2 = 0; 2848 ufs_lbn_t lbnadd; 2849 int i, nblocks, ufs1fmt; 2850 int error, allerror = 0; 2851 int fs_pendingblocks; 2852 2853 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 2854 fs = ump->um_fs; 2855 fs_pendingblocks = 0; 2856 lbnadd = 1; 2857 for (i = level; i > 0; i--) 2858 lbnadd *= NINDIR(fs); 2859 /* 2860 * Get buffer of block pointers to be freed. This routine is not 2861 * called until the zero'ed inode has been written, so it is safe 2862 * to free blocks as they are encountered. 
Because the inode has 2863 * been zero'ed, calls to bmap on these blocks will fail. So, we 2864 * have to use the on-disk address and the block device for the 2865 * filesystem to look them up. If the file was deleted before its 2866 * indirect blocks were all written to disk, the routine that set 2867 * us up (deallocate_dependencies) will have arranged to leave 2868 * a complete copy of the indirect block in memory for our use. 2869 * Otherwise we have to read the blocks in from the disk. 2870 */ 2871 #ifdef notyet 2872 bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0, 2873 GB_NOCREAT); 2874 #else 2875 bp = incore(&freeblks->fb_devvp->v_bufobj, dbn); 2876 #endif 2877 ACQUIRE_LOCK(&lk); 2878 if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) { 2879 if (wk->wk_type != D_INDIRDEP || 2880 (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp || 2881 (indirdep->ir_state & GOINGAWAY) == 0) 2882 panic("indir_trunc: lost indirdep"); 2883 WORKLIST_REMOVE(wk); 2884 WORKITEM_FREE(indirdep, D_INDIRDEP); 2885 if (!LIST_EMPTY(&bp->b_dep)) 2886 panic("indir_trunc: dangling dep"); 2887 ump->um_numindirdeps -= 1; 2888 FREE_LOCK(&lk); 2889 } else { 2890 #ifdef notyet 2891 if (bp) 2892 brelse(bp); 2893 #endif 2894 FREE_LOCK(&lk); 2895 error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 2896 NOCRED, &bp); 2897 if (error) { 2898 brelse(bp); 2899 return (error); 2900 } 2901 } 2902 /* 2903 * Recursively free indirect blocks. 2904 */ 2905 if (ump->um_fstype == UFS1) { 2906 ufs1fmt = 1; 2907 bap1 = (ufs1_daddr_t *)bp->b_data; 2908 } else { 2909 ufs1fmt = 0; 2910 bap2 = (ufs2_daddr_t *)bp->b_data; 2911 } 2912 nblocks = btodb(fs->fs_bsize); 2913 for (i = NINDIR(fs) - 1; i >= 0; i--) { 2914 if (ufs1fmt) 2915 nb = bap1[i]; 2916 else 2917 nb = bap2[i]; 2918 if (nb == 0) 2919 continue; 2920 if (level != 0) { 2921 if ((error = indir_trunc(freeblks, fsbtodb(fs, nb), 2922 level - 1, lbn + (i * lbnadd), countp)) != 0) 2923 allerror = error; 2924 } 2925 ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize, 2926 freeblks->fb_previousinum); 2927 fs_pendingblocks += nblocks; 2928 *countp += nblocks; 2929 } 2930 UFS_LOCK(ump); 2931 fs->fs_pendingblocks -= fs_pendingblocks; 2932 UFS_UNLOCK(ump); 2933 bp->b_flags |= B_INVAL | B_NOCACHE; 2934 brelse(bp); 2935 return (allerror); 2936 } 2937 2938 /* 2939 * Free an allocindir. 2940 * This routine must be called with splbio interrupts blocked. 2941 */ 2942 static void 2943 free_allocindir(aip, inodedep) 2944 struct allocindir *aip; 2945 struct inodedep *inodedep; 2946 { 2947 struct freefrag *freefrag; 2948 2949 mtx_assert(&lk, MA_OWNED); 2950 if ((aip->ai_state & DEPCOMPLETE) == 0) 2951 LIST_REMOVE(aip, ai_deps); 2952 if (aip->ai_state & ONWORKLIST) 2953 WORKLIST_REMOVE(&aip->ai_list); 2954 LIST_REMOVE(aip, ai_next); 2955 if ((freefrag = aip->ai_freefrag) != NULL) { 2956 if (inodedep == NULL) 2957 add_to_worklist(&freefrag->ff_list); 2958 else 2959 WORKLIST_INSERT(&inodedep->id_bufwait, 2960 &freefrag->ff_list); 2961 } 2962 WORKITEM_FREE(aip, D_ALLOCINDIR); 2963 } 2964 2965 /* 2966 * Directory entry addition dependencies. 2967 * 2968 * When adding a new directory entry, the inode (with its incremented link 2969 * count) must be written to disk before the directory entry's pointer to it. 2970 * Also, if the inode is newly allocated, the corresponding freemap must be 2971 * updated (on disk) before the directory entry's pointer. These requirements 2972 * are met via undo/redo on the directory entry's pointer, which consists 2973 * simply of the inode number. 
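 *
 * For example, when an entry for a newly allocated inode 531 is added, the
 * in-memory directory block shows d_ino == 531 at once, but the copy pushed
 * to disk is rolled back to d_ino == 0 until inode 531 and its cylinder
 * group bitmap have been written; the entry is then rolled forward and the
 * directory block is written again.  A rough sketch of the rollback applied
 * to the on-disk copy (illustrative only, not the actual write path):
 *
 *	struct direct *ep;
 *
 *	ep = (struct direct *)((char *)bp->b_data + dap->da_offset);
 *	ep->d_ino = 0;			(undo, before the block is written)
 *	...
 *	ep->d_ino = dap->da_newinum;	(redo, once inode 531 is safe on disk)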
2974 * 2975 * As directory entries are added and deleted, the free space within a 2976 * directory block can become fragmented. The ufs filesystem will compact 2977 * a fragmented directory block to make space for a new entry. When this 2978 * occurs, the offsets of previously added entries change. Any "diradd" 2979 * dependency structures corresponding to these entries must be updated with 2980 * the new offsets. 2981 */ 2982 2983 /* 2984 * This routine is called after the in-memory inode's link 2985 * count has been incremented, but before the directory entry's 2986 * pointer to the inode has been set. 2987 */ 2988 int 2989 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 2990 struct buf *bp; /* buffer containing directory block */ 2991 struct inode *dp; /* inode for directory */ 2992 off_t diroffset; /* offset of new entry in directory */ 2993 ino_t newinum; /* inode referenced by new directory entry */ 2994 struct buf *newdirbp; /* non-NULL => contents of new mkdir */ 2995 int isnewblk; /* entry is in a newly allocated block */ 2996 { 2997 int offset; /* offset of new entry within directory block */ 2998 ufs_lbn_t lbn; /* block in directory containing new entry */ 2999 struct fs *fs; 3000 struct diradd *dap; 3001 struct allocdirect *adp; 3002 struct pagedep *pagedep; 3003 struct inodedep *inodedep; 3004 struct newdirblk *newdirblk = 0; 3005 struct mkdir *mkdir1, *mkdir2; 3006 struct mount *mp; 3007 3008 /* 3009 * Whiteouts have no dependencies. 3010 */ 3011 if (newinum == WINO) { 3012 if (newdirbp != NULL) 3013 bdwrite(newdirbp); 3014 return (0); 3015 } 3016 mp = UFSTOVFS(dp->i_ump); 3017 fs = dp->i_fs; 3018 lbn = lblkno(fs, diroffset); 3019 offset = blkoff(fs, diroffset); 3020 dap = malloc(sizeof(struct diradd), M_DIRADD, 3021 M_SOFTDEP_FLAGS|M_ZERO); 3022 workitem_alloc(&dap->da_list, D_DIRADD, mp); 3023 dap->da_offset = offset; 3024 dap->da_newinum = newinum; 3025 dap->da_state = ATTACHED; 3026 if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) { 3027 newdirblk = malloc(sizeof(struct newdirblk), 3028 M_NEWDIRBLK, M_SOFTDEP_FLAGS); 3029 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 3030 } 3031 if (newdirbp == NULL) { 3032 dap->da_state |= DEPCOMPLETE; 3033 ACQUIRE_LOCK(&lk); 3034 } else { 3035 dap->da_state |= MKDIR_BODY | MKDIR_PARENT; 3036 mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, 3037 M_SOFTDEP_FLAGS); 3038 workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); 3039 mkdir1->md_state = MKDIR_BODY; 3040 mkdir1->md_diradd = dap; 3041 mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, 3042 M_SOFTDEP_FLAGS); 3043 workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); 3044 mkdir2->md_state = MKDIR_PARENT; 3045 mkdir2->md_diradd = dap; 3046 /* 3047 * Dependency on "." and ".." being written to disk. 3048 */ 3049 mkdir1->md_buf = newdirbp; 3050 ACQUIRE_LOCK(&lk); 3051 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); 3052 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list); 3053 FREE_LOCK(&lk); 3054 bdwrite(newdirbp); 3055 /* 3056 * Dependency on link count increase for parent directory 3057 */ 3058 ACQUIRE_LOCK(&lk); 3059 if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0 3060 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 3061 dap->da_state &= ~MKDIR_PARENT; 3062 WORKITEM_FREE(mkdir2, D_MKDIR); 3063 } else { 3064 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 3065 WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); 3066 } 3067 } 3068 /* 3069 * Link into parent directory pagedep to await its being written. 
3070 */ 3071 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) 3072 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 3073 dap->da_pagedep = pagedep; 3074 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, 3075 da_pdlist); 3076 /* 3077 * Link into its inodedep. Put it on the id_bufwait list if the inode 3078 * is not yet written. If it is written, do the post-inode write 3079 * processing to put it on the id_pendinghd list. 3080 */ 3081 (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); 3082 if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) 3083 diradd_inode_written(dap, inodedep); 3084 else 3085 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 3086 if (isnewblk) { 3087 /* 3088 * Directories growing into indirect blocks are rare 3089 * enough and the frequency of new block allocation 3090 * in those cases even more rare, that we choose not 3091 * to bother tracking them. Rather we simply force the 3092 * new directory entry to disk. 3093 */ 3094 if (lbn >= NDADDR) { 3095 FREE_LOCK(&lk); 3096 /* 3097 * We only have a new allocation when at the 3098 * beginning of a new block, not when we are 3099 * expanding into an existing block. 3100 */ 3101 if (blkoff(fs, diroffset) == 0) 3102 return (1); 3103 return (0); 3104 } 3105 /* 3106 * We only have a new allocation when at the beginning 3107 * of a new fragment, not when we are expanding into an 3108 * existing fragment. Also, there is nothing to do if we 3109 * are already tracking this block. 3110 */ 3111 if (fragoff(fs, diroffset) != 0) { 3112 FREE_LOCK(&lk); 3113 return (0); 3114 } 3115 if ((pagedep->pd_state & NEWBLOCK) != 0) { 3116 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 3117 FREE_LOCK(&lk); 3118 return (0); 3119 } 3120 /* 3121 * Find our associated allocdirect and have it track us. 3122 */ 3123 if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0) 3124 panic("softdep_setup_directory_add: lost inodedep"); 3125 adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst); 3126 if (adp == NULL || adp->ad_lbn != lbn) 3127 panic("softdep_setup_directory_add: lost entry"); 3128 pagedep->pd_state |= NEWBLOCK; 3129 newdirblk->db_pagedep = pagedep; 3130 WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list); 3131 } 3132 FREE_LOCK(&lk); 3133 return (0); 3134 } 3135 3136 /* 3137 * This procedure is called to change the offset of a directory 3138 * entry when compacting a directory block which must be owned 3139 * exclusively by the caller. Note that the actual entry movement 3140 * must be done in this procedure to ensure that no I/O completions 3141 * occur while the move is in progress. 
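 *
 * For example, if compaction slides an entry from offset 52 down to offset
 * 24 within the directory block, any diradd recorded at da_offset 52 must
 * have its offset updated and, when DIRADDHASH(24) differs from
 * DIRADDHASH(52), be moved to the proper pd_diraddhd chain before the entry
 * bytes themselves are copied to their new location.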
3142 */ 3143 void 3144 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) 3145 struct inode *dp; /* inode for directory */ 3146 caddr_t base; /* address of dp->i_offset */ 3147 caddr_t oldloc; /* address of old directory location */ 3148 caddr_t newloc; /* address of new directory location */ 3149 int entrysize; /* size of directory entry */ 3150 { 3151 int offset, oldoffset, newoffset; 3152 struct pagedep *pagedep; 3153 struct diradd *dap; 3154 ufs_lbn_t lbn; 3155 3156 ACQUIRE_LOCK(&lk); 3157 lbn = lblkno(dp->i_fs, dp->i_offset); 3158 offset = blkoff(dp->i_fs, dp->i_offset); 3159 if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0) 3160 goto done; 3161 oldoffset = offset + (oldloc - base); 3162 newoffset = offset + (newloc - base); 3163 3164 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) { 3165 if (dap->da_offset != oldoffset) 3166 continue; 3167 dap->da_offset = newoffset; 3168 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset)) 3169 break; 3170 LIST_REMOVE(dap, da_pdlist); 3171 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)], 3172 dap, da_pdlist); 3173 break; 3174 } 3175 if (dap == NULL) { 3176 3177 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) { 3178 if (dap->da_offset == oldoffset) { 3179 dap->da_offset = newoffset; 3180 break; 3181 } 3182 } 3183 } 3184 done: 3185 bcopy(oldloc, newloc, entrysize); 3186 FREE_LOCK(&lk); 3187 } 3188 3189 /* 3190 * Free a diradd dependency structure. This routine must be called 3191 * with splbio interrupts blocked. 3192 */ 3193 static void 3194 free_diradd(dap) 3195 struct diradd *dap; 3196 { 3197 struct dirrem *dirrem; 3198 struct pagedep *pagedep; 3199 struct inodedep *inodedep; 3200 struct mkdir *mkdir, *nextmd; 3201 3202 mtx_assert(&lk, MA_OWNED); 3203 WORKLIST_REMOVE(&dap->da_list); 3204 LIST_REMOVE(dap, da_pdlist); 3205 if ((dap->da_state & DIRCHG) == 0) { 3206 pagedep = dap->da_pagedep; 3207 } else { 3208 dirrem = dap->da_previous; 3209 pagedep = dirrem->dm_pagedep; 3210 dirrem->dm_dirinum = pagedep->pd_ino; 3211 add_to_worklist(&dirrem->dm_list); 3212 } 3213 if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 3214 0, &inodedep) != 0) 3215 (void) free_inodedep(inodedep); 3216 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 3217 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 3218 nextmd = LIST_NEXT(mkdir, md_mkdirs); 3219 if (mkdir->md_diradd != dap) 3220 continue; 3221 dap->da_state &= ~mkdir->md_state; 3222 WORKLIST_REMOVE(&mkdir->md_list); 3223 LIST_REMOVE(mkdir, md_mkdirs); 3224 WORKITEM_FREE(mkdir, D_MKDIR); 3225 } 3226 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 3227 panic("free_diradd: unfound ref"); 3228 } 3229 WORKITEM_FREE(dap, D_DIRADD); 3230 } 3231 3232 /* 3233 * Directory entry removal dependencies. 3234 * 3235 * When removing a directory entry, the entry's inode pointer must be 3236 * zero'ed on disk before the corresponding inode's link count is decremented 3237 * (possibly freeing the inode for re-use). This dependency is handled by 3238 * updating the directory entry but delaying the inode count reduction until 3239 * after the directory block has been written to disk. After this point, the 3240 * inode count can be decremented whenever it is convenient. 3241 */ 3242 3243 /* 3244 * This routine should be called immediately after removing 3245 * a directory entry. The inode's link count should not be 3246 * decremented by the calling procedure -- the soft updates 3247 * code will do this task when it is safe. 
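 *
 * For illustration only, the directory-removal path might use this hook
 * roughly as follows (a sketch with assumed names, not a quote of the real
 * caller):
 *
 *	ep->d_ino = 0;			(entry cleared in the dir block bp)
 *	if (DOINGSOFTDEP(dvp))
 *		softdep_setup_remove(bp, dp, ip, isrmdir);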
3248 */ 3249 void 3250 softdep_setup_remove(bp, dp, ip, isrmdir) 3251 struct buf *bp; /* buffer containing directory block */ 3252 struct inode *dp; /* inode for the directory being modified */ 3253 struct inode *ip; /* inode for directory entry being removed */ 3254 int isrmdir; /* indicates if doing RMDIR */ 3255 { 3256 struct dirrem *dirrem, *prevdirrem; 3257 3258 /* 3259 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. 3260 */ 3261 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 3262 3263 /* 3264 * If the COMPLETE flag is clear, then there were no active 3265 * entries and we want to roll back to a zeroed entry until 3266 * the new inode is committed to disk. If the COMPLETE flag is 3267 * set then we have deleted an entry that never made it to 3268 * disk. If the entry we deleted resulted from a name change, 3269 * then the old name still resides on disk. We cannot delete 3270 * its inode (returned to us in prevdirrem) until the zeroed 3271 * directory entry gets to disk. The new inode has never been 3272 * referenced on the disk, so can be deleted immediately. 3273 */ 3274 if ((dirrem->dm_state & COMPLETE) == 0) { 3275 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, 3276 dm_next); 3277 FREE_LOCK(&lk); 3278 } else { 3279 if (prevdirrem != NULL) 3280 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, 3281 prevdirrem, dm_next); 3282 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; 3283 FREE_LOCK(&lk); 3284 handle_workitem_remove(dirrem, NULL); 3285 } 3286 } 3287 3288 /* 3289 * Allocate a new dirrem if appropriate and return it along with 3290 * its associated pagedep. Called without a lock, returns with lock. 3291 */ 3292 static long num_dirrem; /* number of dirrem allocated */ 3293 static struct dirrem * 3294 newdirrem(bp, dp, ip, isrmdir, prevdirremp) 3295 struct buf *bp; /* buffer containing directory block */ 3296 struct inode *dp; /* inode for the directory being modified */ 3297 struct inode *ip; /* inode for directory entry being removed */ 3298 int isrmdir; /* indicates if doing RMDIR */ 3299 struct dirrem **prevdirremp; /* previously referenced inode, if any */ 3300 { 3301 int offset; 3302 ufs_lbn_t lbn; 3303 struct diradd *dap; 3304 struct dirrem *dirrem; 3305 struct pagedep *pagedep; 3306 3307 /* 3308 * Whiteouts have no deletion dependencies. 3309 */ 3310 if (ip == NULL) 3311 panic("newdirrem: whiteout"); 3312 /* 3313 * If we are over our limit, try to improve the situation. 3314 * Limiting the number of dirrem structures will also limit 3315 * the number of freefile and freeblks structures. 3316 */ 3317 ACQUIRE_LOCK(&lk); 3318 if (!(ip->i_flags & SF_SNAPSHOT) && num_dirrem > max_softdeps / 2) 3319 (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE); 3320 num_dirrem += 1; 3321 FREE_LOCK(&lk); 3322 dirrem = malloc(sizeof(struct dirrem), 3323 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); 3324 workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount); 3325 dirrem->dm_state = isrmdir ? RMDIR : 0; 3326 dirrem->dm_oldinum = ip->i_number; 3327 *prevdirremp = NULL; 3328 3329 ACQUIRE_LOCK(&lk); 3330 lbn = lblkno(dp->i_fs, dp->i_offset); 3331 offset = blkoff(dp->i_fs, dp->i_offset); 3332 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) 3333 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 3334 dirrem->dm_pagedep = pagedep; 3335 /* 3336 * Check for a diradd dependency for the same directory entry. 3337 * If present, then both dependencies become obsolete and can 3338 * be de-allocated. 
Check for an entry on both the pd_dirraddhd 3339 * list and the pd_pendinghd list. 3340 */ 3341 3342 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) 3343 if (dap->da_offset == offset) 3344 break; 3345 if (dap == NULL) { 3346 3347 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 3348 if (dap->da_offset == offset) 3349 break; 3350 if (dap == NULL) 3351 return (dirrem); 3352 } 3353 /* 3354 * Must be ATTACHED at this point. 3355 */ 3356 if ((dap->da_state & ATTACHED) == 0) 3357 panic("newdirrem: not ATTACHED"); 3358 if (dap->da_newinum != ip->i_number) 3359 panic("newdirrem: inum %d should be %d", 3360 ip->i_number, dap->da_newinum); 3361 /* 3362 * If we are deleting a changed name that never made it to disk, 3363 * then return the dirrem describing the previous inode (which 3364 * represents the inode currently referenced from this entry on disk). 3365 */ 3366 if ((dap->da_state & DIRCHG) != 0) { 3367 *prevdirremp = dap->da_previous; 3368 dap->da_state &= ~DIRCHG; 3369 dap->da_pagedep = pagedep; 3370 } 3371 /* 3372 * We are deleting an entry that never made it to disk. 3373 * Mark it COMPLETE so we can delete its inode immediately. 3374 */ 3375 dirrem->dm_state |= COMPLETE; 3376 free_diradd(dap); 3377 return (dirrem); 3378 } 3379 3380 /* 3381 * Directory entry change dependencies. 3382 * 3383 * Changing an existing directory entry requires that an add operation 3384 * be completed first followed by a deletion. The semantics for the addition 3385 * are identical to the description of adding a new entry above except 3386 * that the rollback is to the old inode number rather than zero. Once 3387 * the addition dependency is completed, the removal is done as described 3388 * in the removal routine above. 3389 */ 3390 3391 /* 3392 * This routine should be called immediately after changing 3393 * a directory entry. The inode's link count should not be 3394 * decremented by the calling procedure -- the soft updates 3395 * code will perform this task when it is safe. 3396 */ 3397 void 3398 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 3399 struct buf *bp; /* buffer containing directory block */ 3400 struct inode *dp; /* inode for the directory being modified */ 3401 struct inode *ip; /* inode for directory entry being removed */ 3402 ino_t newinum; /* new inode number for changed entry */ 3403 int isrmdir; /* indicates if doing RMDIR */ 3404 { 3405 int offset; 3406 struct diradd *dap = NULL; 3407 struct dirrem *dirrem, *prevdirrem; 3408 struct pagedep *pagedep; 3409 struct inodedep *inodedep; 3410 struct mount *mp; 3411 3412 offset = blkoff(dp->i_fs, dp->i_offset); 3413 mp = UFSTOVFS(dp->i_ump); 3414 3415 /* 3416 * Whiteouts do not need diradd dependencies. 3417 */ 3418 if (newinum != WINO) { 3419 dap = malloc(sizeof(struct diradd), 3420 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); 3421 workitem_alloc(&dap->da_list, D_DIRADD, mp); 3422 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; 3423 dap->da_offset = offset; 3424 dap->da_newinum = newinum; 3425 } 3426 3427 /* 3428 * Allocate a new dirrem and ACQUIRE_LOCK. 
3429 */ 3430 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 3431 pagedep = dirrem->dm_pagedep; 3432 /* 3433 * The possible values for isrmdir: 3434 * 0 - non-directory file rename 3435 * 1 - directory rename within same directory 3436 * inum - directory rename to new directory of given inode number 3437 * When renaming to a new directory, we are both deleting and 3438 * creating a new directory entry, so the link count on the new 3439 * directory should not change. Thus we do not need the followup 3440 * dirrem which is usually done in handle_workitem_remove. We set 3441 * the DIRCHG flag to tell handle_workitem_remove to skip the 3442 * followup dirrem. 3443 */ 3444 if (isrmdir > 1) 3445 dirrem->dm_state |= DIRCHG; 3446 3447 /* 3448 * Whiteouts have no additional dependencies, 3449 * so just put the dirrem on the correct list. 3450 */ 3451 if (newinum == WINO) { 3452 if ((dirrem->dm_state & COMPLETE) == 0) { 3453 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, 3454 dm_next); 3455 } else { 3456 dirrem->dm_dirinum = pagedep->pd_ino; 3457 add_to_worklist(&dirrem->dm_list); 3458 } 3459 FREE_LOCK(&lk); 3460 return; 3461 } 3462 3463 /* 3464 * If the COMPLETE flag is clear, then there were no active 3465 * entries and we want to roll back to the previous inode until 3466 * the new inode is committed to disk. If the COMPLETE flag is 3467 * set, then we have deleted an entry that never made it to disk. 3468 * If the entry we deleted resulted from a name change, then the old 3469 * inode reference still resides on disk. Any rollback that we do 3470 * needs to be to that old inode (returned to us in prevdirrem). If 3471 * the entry we deleted resulted from a create, then there is 3472 * no entry on the disk, so we want to roll back to zero rather 3473 * than the uncommitted inode. In either of the COMPLETE cases we 3474 * want to immediately free the unwritten and unreferenced inode. 3475 */ 3476 if ((dirrem->dm_state & COMPLETE) == 0) { 3477 dap->da_previous = dirrem; 3478 } else { 3479 if (prevdirrem != NULL) { 3480 dap->da_previous = prevdirrem; 3481 } else { 3482 dap->da_state &= ~DIRCHG; 3483 dap->da_pagedep = pagedep; 3484 } 3485 dirrem->dm_dirinum = pagedep->pd_ino; 3486 add_to_worklist(&dirrem->dm_list); 3487 } 3488 /* 3489 * Link into its inodedep. Put it on the id_bufwait list if the inode 3490 * is not yet written. If it is written, do the post-inode write 3491 * processing to put it on the id_pendinghd list. 3492 */ 3493 if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 || 3494 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 3495 dap->da_state |= COMPLETE; 3496 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 3497 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 3498 } else { 3499 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], 3500 dap, da_pdlist); 3501 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 3502 } 3503 FREE_LOCK(&lk); 3504 } 3505 3506 /* 3507 * Called whenever the link count on an inode is changed. 3508 * It creates an inode dependency so that the new reference(s) 3509 * to the inode cannot be committed to disk until the updated 3510 * inode has been written. 
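 *
 * For illustration only, a caller adding a link might do roughly the
 * following (an assumed sketch, not a quote of the real code):
 *
 *	ip->i_effnlink++;
 *	ip->i_nlink++;
 *	DIP_SET(ip, i_nlink, ip->i_nlink);
 *	ip->i_flag |= IN_CHANGE;
 *	if (DOINGSOFTDEP(vp))
 *		softdep_change_linkcnt(ip);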
3511 */ 3512 void 3513 softdep_change_linkcnt(ip) 3514 struct inode *ip; /* the inode with the increased link count */ 3515 { 3516 struct inodedep *inodedep; 3517 3518 ACQUIRE_LOCK(&lk); 3519 (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 3520 DEPALLOC, &inodedep); 3521 if (ip->i_nlink < ip->i_effnlink) 3522 panic("softdep_change_linkcnt: bad delta"); 3523 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 3524 FREE_LOCK(&lk); 3525 } 3526 3527 /* 3528 * Called when the effective link count and the reference count 3529 * on an inode drops to zero. At this point there are no names 3530 * referencing the file in the filesystem and no active file 3531 * references. The space associated with the file will be freed 3532 * as soon as the necessary soft dependencies are cleared. 3533 */ 3534 void 3535 softdep_releasefile(ip) 3536 struct inode *ip; /* inode with the zero effective link count */ 3537 { 3538 struct inodedep *inodedep; 3539 struct fs *fs; 3540 int extblocks; 3541 3542 if (ip->i_effnlink > 0) 3543 panic("softdep_releasefile: file still referenced"); 3544 /* 3545 * We may be called several times as the on-disk link count 3546 * drops to zero. We only want to account for the space once. 3547 */ 3548 if (ip->i_flag & IN_SPACECOUNTED) 3549 return; 3550 /* 3551 * We have to deactivate a snapshot otherwise copyonwrites may 3552 * add blocks and the cleanup may remove blocks after we have 3553 * tried to account for them. 3554 */ 3555 if ((ip->i_flags & SF_SNAPSHOT) != 0) 3556 ffs_snapremove(ITOV(ip)); 3557 /* 3558 * If we are tracking an nlinkdelta, we have to also remember 3559 * whether we accounted for the freed space yet. 3560 */ 3561 ACQUIRE_LOCK(&lk); 3562 if ((inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep))) 3563 inodedep->id_state |= SPACECOUNTED; 3564 FREE_LOCK(&lk); 3565 fs = ip->i_fs; 3566 extblocks = 0; 3567 if (fs->fs_magic == FS_UFS2_MAGIC) 3568 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); 3569 UFS_LOCK(ip->i_ump); 3570 ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks; 3571 ip->i_fs->fs_pendinginodes += 1; 3572 UFS_UNLOCK(ip->i_ump); 3573 ip->i_flag |= IN_SPACECOUNTED; 3574 } 3575 3576 /* 3577 * This workitem decrements the inode's link count. 3578 * If the link count reaches zero, the file is removed. 3579 */ 3580 static void 3581 handle_workitem_remove(dirrem, xp) 3582 struct dirrem *dirrem; 3583 struct vnode *xp; 3584 { 3585 struct thread *td = curthread; 3586 struct inodedep *inodedep; 3587 struct vnode *vp; 3588 struct inode *ip; 3589 ino_t oldinum; 3590 int error; 3591 3592 if ((vp = xp) == NULL && 3593 (error = ffs_vgetf(dirrem->dm_list.wk_mp, 3594 dirrem->dm_oldinum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)) != 0) { 3595 softdep_error("handle_workitem_remove: vget", error); 3596 return; 3597 } 3598 ip = VTOI(vp); 3599 ACQUIRE_LOCK(&lk); 3600 if ((inodedep_lookup(dirrem->dm_list.wk_mp, 3601 dirrem->dm_oldinum, 0, &inodedep)) == 0) 3602 panic("handle_workitem_remove: lost inodedep"); 3603 /* 3604 * Normal file deletion. 3605 */ 3606 if ((dirrem->dm_state & RMDIR) == 0) { 3607 ip->i_nlink--; 3608 DIP_SET(ip, i_nlink, ip->i_nlink); 3609 ip->i_flag |= IN_CHANGE; 3610 if (ip->i_nlink < ip->i_effnlink) 3611 panic("handle_workitem_remove: bad file delta"); 3612 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 3613 num_dirrem -= 1; 3614 WORKITEM_FREE(dirrem, D_DIRREM); 3615 FREE_LOCK(&lk); 3616 vput(vp); 3617 return; 3618 } 3619 /* 3620 * Directory deletion. 
Decrement reference count for both the 3621 * just deleted parent directory entry and the reference for ".". 3622 * Next truncate the directory to length zero. When the 3623 * truncation completes, arrange to have the reference count on 3624 * the parent decremented to account for the loss of "..". 3625 */ 3626 ip->i_nlink -= 2; 3627 DIP_SET(ip, i_nlink, ip->i_nlink); 3628 ip->i_flag |= IN_CHANGE; 3629 if (ip->i_nlink < ip->i_effnlink) 3630 panic("handle_workitem_remove: bad dir delta"); 3631 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 3632 FREE_LOCK(&lk); 3633 if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0) 3634 softdep_error("handle_workitem_remove: truncate", error); 3635 ACQUIRE_LOCK(&lk); 3636 /* 3637 * Rename a directory to a new parent. Since we are both deleting 3638 * and creating a new directory entry, the link count on the new 3639 * directory should not change. Thus we skip the followup dirrem. 3640 */ 3641 if (dirrem->dm_state & DIRCHG) { 3642 num_dirrem -= 1; 3643 WORKITEM_FREE(dirrem, D_DIRREM); 3644 FREE_LOCK(&lk); 3645 vput(vp); 3646 return; 3647 } 3648 /* 3649 * If the inodedep does not exist, then the zero'ed inode has 3650 * been written to disk. If the allocated inode has never been 3651 * written to disk, then the on-disk inode is zero'ed. In either 3652 * case we can remove the file immediately. 3653 */ 3654 dirrem->dm_state = 0; 3655 oldinum = dirrem->dm_oldinum; 3656 dirrem->dm_oldinum = dirrem->dm_dirinum; 3657 if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum, 3658 0, &inodedep) == 0 || check_inode_unwritten(inodedep)) { 3659 if (xp != NULL) 3660 add_to_worklist(&dirrem->dm_list); 3661 FREE_LOCK(&lk); 3662 vput(vp); 3663 if (xp == NULL) 3664 handle_workitem_remove(dirrem, NULL); 3665 return; 3666 } 3667 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); 3668 FREE_LOCK(&lk); 3669 ip->i_flag |= IN_CHANGE; 3670 ffs_update(vp, 0); 3671 vput(vp); 3672 } 3673 3674 /* 3675 * Inode de-allocation dependencies. 3676 * 3677 * When an inode's link count is reduced to zero, it can be de-allocated. We 3678 * found it convenient to postpone de-allocation until after the inode is 3679 * written to disk with its new link count (zero). At this point, all of the 3680 * on-disk inode's block pointers are nullified and, with careful dependency 3681 * list ordering, all dependencies related to the inode will be satisfied and 3682 * the corresponding dependency structures de-allocated. So, if/when the 3683 * inode is reused, there will be no mixing of old dependencies with new 3684 * ones. This artificial dependency is set up by the block de-allocation 3685 * procedure above (softdep_setup_freeblocks) and completed by the 3686 * following procedure.
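 * (handle_workitem_freefile below is that completing procedure: once the
 * zero'ed inode block is safely on disk, it calls ffs_freefile() to
 * return the inode to the cylinder group map.)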
3687 */ 3688 static void 3689 handle_workitem_freefile(freefile) 3690 struct freefile *freefile; 3691 { 3692 struct fs *fs; 3693 struct inodedep *idp; 3694 struct ufsmount *ump; 3695 int error; 3696 3697 ump = VFSTOUFS(freefile->fx_list.wk_mp); 3698 fs = ump->um_fs; 3699 #ifdef DEBUG 3700 ACQUIRE_LOCK(&lk); 3701 error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); 3702 FREE_LOCK(&lk); 3703 if (error) 3704 panic("handle_workitem_freefile: inodedep survived"); 3705 #endif 3706 UFS_LOCK(ump); 3707 fs->fs_pendinginodes -= 1; 3708 UFS_UNLOCK(ump); 3709 if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, 3710 freefile->fx_oldinum, freefile->fx_mode)) != 0) 3711 softdep_error("handle_workitem_freefile", error); 3712 ACQUIRE_LOCK(&lk); 3713 WORKITEM_FREE(freefile, D_FREEFILE); 3714 FREE_LOCK(&lk); 3715 } 3716 3717 3718 /* 3719 * Helper function which unlinks marker element from work list and returns 3720 * the next element on the list. 3721 */ 3722 static __inline struct worklist * 3723 markernext(struct worklist *marker) 3724 { 3725 struct worklist *next; 3726 3727 next = LIST_NEXT(marker, wk_list); 3728 LIST_REMOVE(marker, wk_list); 3729 return next; 3730 } 3731 3732 /* 3733 * Disk writes. 3734 * 3735 * The dependency structures constructed above are most actively used when file 3736 * system blocks are written to disk. No constraints are placed on when a 3737 * block can be written, but unsatisfied update dependencies are made safe by 3738 * modifying (or replacing) the source memory for the duration of the disk 3739 * write. When the disk write completes, the memory block is again brought 3740 * up-to-date. 3741 * 3742 * In-core inode structure reclamation. 3743 * 3744 * Because there are a finite number of "in-core" inode structures, they are 3745 * reused regularly. By transferring all inode-related dependencies to the 3746 * in-memory inode block and indexing them separately (via "inodedep"s), we 3747 * can allow "in-core" inode structures to be reused at any time and avoid 3748 * any increase in contention. 3749 * 3750 * Called just before entering the device driver to initiate a new disk I/O. 3751 * The buffer must be locked, thus, no I/O completion operations can occur 3752 * while we are manipulating its associated dependencies. 3753 */ 3754 static void 3755 softdep_disk_io_initiation(bp) 3756 struct buf *bp; /* structure describing disk write to occur */ 3757 { 3758 struct worklist *wk; 3759 struct worklist marker; 3760 struct indirdep *indirdep; 3761 struct inodedep *inodedep; 3762 3763 /* 3764 * We only care about write operations. There should never 3765 * be dependencies for reads. 3766 */ 3767 if (bp->b_iocmd != BIO_WRITE) 3768 panic("softdep_disk_io_initiation: not write"); 3769 3770 marker.wk_type = D_LAST + 1; /* Not a normal workitem */ 3771 PHOLD(curproc); /* Don't swap out kernel stack */ 3772 3773 ACQUIRE_LOCK(&lk); 3774 /* 3775 * Do any necessary pre-I/O processing. 
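 * A marker workitem is threaded through the b_dep list so that the scan
 * can resume at the correct place even though the lock is dropped, and
 * the list possibly changed, while allocating memory or releasing a
 * buffer.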
3776 */ 3777 for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; 3778 wk = markernext(&marker)) { 3779 LIST_INSERT_AFTER(wk, &marker, wk_list); 3780 switch (wk->wk_type) { 3781 3782 case D_PAGEDEP: 3783 initiate_write_filepage(WK_PAGEDEP(wk), bp); 3784 continue; 3785 3786 case D_INODEDEP: 3787 inodedep = WK_INODEDEP(wk); 3788 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) 3789 initiate_write_inodeblock_ufs1(inodedep, bp); 3790 else 3791 initiate_write_inodeblock_ufs2(inodedep, bp); 3792 continue; 3793 3794 case D_INDIRDEP: 3795 indirdep = WK_INDIRDEP(wk); 3796 if (indirdep->ir_state & GOINGAWAY) 3797 panic("disk_io_initiation: indirdep gone"); 3798 /* 3799 * If there are no remaining dependencies, this 3800 * will be writing the real pointers, so the 3801 * dependency can be freed. 3802 */ 3803 if (LIST_EMPTY(&indirdep->ir_deplisthd)) { 3804 struct buf *bp; 3805 3806 bp = indirdep->ir_savebp; 3807 bp->b_flags |= B_INVAL | B_NOCACHE; 3808 /* inline expand WORKLIST_REMOVE(wk); */ 3809 wk->wk_state &= ~ONWORKLIST; 3810 LIST_REMOVE(wk, wk_list); 3811 WORKITEM_FREE(indirdep, D_INDIRDEP); 3812 FREE_LOCK(&lk); 3813 brelse(bp); 3814 ACQUIRE_LOCK(&lk); 3815 continue; 3816 } 3817 /* 3818 * Replace up-to-date version with safe version. 3819 */ 3820 FREE_LOCK(&lk); 3821 indirdep->ir_saveddata = malloc(bp->b_bcount, 3822 M_INDIRDEP, M_SOFTDEP_FLAGS); 3823 ACQUIRE_LOCK(&lk); 3824 indirdep->ir_state &= ~ATTACHED; 3825 indirdep->ir_state |= UNDONE; 3826 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 3827 bcopy(indirdep->ir_savebp->b_data, bp->b_data, 3828 bp->b_bcount); 3829 continue; 3830 3831 case D_MKDIR: 3832 case D_BMSAFEMAP: 3833 case D_ALLOCDIRECT: 3834 case D_ALLOCINDIR: 3835 continue; 3836 3837 default: 3838 panic("handle_disk_io_initiation: Unexpected type %s", 3839 TYPENAME(wk->wk_type)); 3840 /* NOTREACHED */ 3841 } 3842 } 3843 FREE_LOCK(&lk); 3844 PRELE(curproc); /* Allow swapout of kernel stack */ 3845 } 3846 3847 /* 3848 * Called from within the procedure above to deal with unsatisfied 3849 * allocation dependencies in a directory. The buffer must be locked, 3850 * thus, no I/O completion operations can occur while we are 3851 * manipulating its associated dependencies. 3852 */ 3853 static void 3854 initiate_write_filepage(pagedep, bp) 3855 struct pagedep *pagedep; 3856 struct buf *bp; 3857 { 3858 struct diradd *dap; 3859 struct direct *ep; 3860 int i; 3861 3862 if (pagedep->pd_state & IOSTARTED) { 3863 /* 3864 * This can only happen if there is a driver that does not 3865 * understand chaining. Here biodone will reissue the call 3866 * to strategy for the incomplete buffers. 3867 */ 3868 printf("initiate_write_filepage: already started\n"); 3869 return; 3870 } 3871 pagedep->pd_state |= IOSTARTED; 3872 for (i = 0; i < DAHASHSZ; i++) { 3873 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { 3874 ep = (struct direct *) 3875 ((char *)bp->b_data + dap->da_offset); 3876 if (ep->d_ino != dap->da_newinum) 3877 panic("%s: dir inum %d != new %d", 3878 "initiate_write_filepage", 3879 ep->d_ino, dap->da_newinum); 3880 if (dap->da_state & DIRCHG) 3881 ep->d_ino = dap->da_previous->dm_oldinum; 3882 else 3883 ep->d_ino = 0; 3884 dap->da_state &= ~ATTACHED; 3885 dap->da_state |= UNDONE; 3886 } 3887 } 3888 } 3889 3890 /* 3891 * Version of initiate_write_inodeblock that handles UFS1 dinodes. 3892 * Note that any bug fixes made to this routine must be done in the 3893 * version found below. 
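 * (The UFS2 variant below additionally rolls back the extended
 * attribute block pointers kept in di_extb.)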
3894 * 3895 * Called from within the procedure above to deal with unsatisfied 3896 * allocation dependencies in an inodeblock. The buffer must be 3897 * locked, thus, no I/O completion operations can occur while we 3898 * are manipulating its associated dependencies. 3899 */ 3900 static void 3901 initiate_write_inodeblock_ufs1(inodedep, bp) 3902 struct inodedep *inodedep; 3903 struct buf *bp; /* The inode block */ 3904 { 3905 struct allocdirect *adp, *lastadp; 3906 struct ufs1_dinode *dp; 3907 struct ufs1_dinode *sip; 3908 struct fs *fs; 3909 ufs_lbn_t i; 3910 #ifdef INVARIANTS 3911 ufs_lbn_t prevlbn = 0; 3912 #endif 3913 int deplist; 3914 3915 if (inodedep->id_state & IOSTARTED) 3916 panic("initiate_write_inodeblock_ufs1: already started"); 3917 inodedep->id_state |= IOSTARTED; 3918 fs = inodedep->id_fs; 3919 dp = (struct ufs1_dinode *)bp->b_data + 3920 ino_to_fsbo(fs, inodedep->id_ino); 3921 /* 3922 * If the bitmap is not yet written, then the allocated 3923 * inode cannot be written to disk. 3924 */ 3925 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 3926 if (inodedep->id_savedino1 != NULL) 3927 panic("initiate_write_inodeblock_ufs1: I/O underway"); 3928 FREE_LOCK(&lk); 3929 sip = malloc(sizeof(struct ufs1_dinode), 3930 M_SAVEDINO, M_SOFTDEP_FLAGS); 3931 ACQUIRE_LOCK(&lk); 3932 inodedep->id_savedino1 = sip; 3933 *inodedep->id_savedino1 = *dp; 3934 bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); 3935 dp->di_gen = inodedep->id_savedino1->di_gen; 3936 return; 3937 } 3938 /* 3939 * If no dependencies, then there is nothing to roll back. 3940 */ 3941 inodedep->id_savedsize = dp->di_size; 3942 inodedep->id_savedextsize = 0; 3943 if (TAILQ_EMPTY(&inodedep->id_inoupdt)) 3944 return; 3945 /* 3946 * Set the dependencies to busy. 3947 */ 3948 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 3949 adp = TAILQ_NEXT(adp, ad_next)) { 3950 #ifdef INVARIANTS 3951 if (deplist != 0 && prevlbn >= adp->ad_lbn) 3952 panic("softdep_write_inodeblock: lbn order"); 3953 prevlbn = adp->ad_lbn; 3954 if (adp->ad_lbn < NDADDR && 3955 dp->di_db[adp->ad_lbn] != adp->ad_newblkno) 3956 panic("%s: direct pointer #%jd mismatch %d != %jd", 3957 "softdep_write_inodeblock", 3958 (intmax_t)adp->ad_lbn, 3959 dp->di_db[adp->ad_lbn], 3960 (intmax_t)adp->ad_newblkno); 3961 if (adp->ad_lbn >= NDADDR && 3962 dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) 3963 panic("%s: indirect pointer #%jd mismatch %d != %jd", 3964 "softdep_write_inodeblock", 3965 (intmax_t)adp->ad_lbn - NDADDR, 3966 dp->di_ib[adp->ad_lbn - NDADDR], 3967 (intmax_t)adp->ad_newblkno); 3968 deplist |= 1 << adp->ad_lbn; 3969 if ((adp->ad_state & ATTACHED) == 0) 3970 panic("softdep_write_inodeblock: Unknown state 0x%x", 3971 adp->ad_state); 3972 #endif /* INVARIANTS */ 3973 adp->ad_state &= ~ATTACHED; 3974 adp->ad_state |= UNDONE; 3975 } 3976 /* 3977 * The on-disk inode cannot claim to be any larger than the last 3978 * fragment that has been written. Otherwise, the on-disk inode 3979 * might have fragments that were not the last block in the file 3980 * which would corrupt the filesystem. 
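 * For example, when the rollback for a direct block at lbn N is to an
 * old fragment, di_size is trimmed to N * fs_bsize plus that fragment's
 * size and all later direct and indirect pointers are cleared for the
 * duration of the write.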
3981 */ 3982 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 3983 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 3984 if (adp->ad_lbn >= NDADDR) 3985 break; 3986 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; 3987 /* keep going until hitting a rollback to a frag */ 3988 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 3989 continue; 3990 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; 3991 for (i = adp->ad_lbn + 1; i < NDADDR; i++) { 3992 #ifdef INVARIANTS 3993 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 3994 panic("softdep_write_inodeblock: lost dep1"); 3995 #endif /* INVARIANTS */ 3996 dp->di_db[i] = 0; 3997 } 3998 for (i = 0; i < NIADDR; i++) { 3999 #ifdef INVARIANTS 4000 if (dp->di_ib[i] != 0 && 4001 (deplist & ((1 << NDADDR) << i)) == 0) 4002 panic("softdep_write_inodeblock: lost dep2"); 4003 #endif /* INVARIANTS */ 4004 dp->di_ib[i] = 0; 4005 } 4006 return; 4007 } 4008 /* 4009 * If we have zero'ed out the last allocated block of the file, 4010 * roll back the size to the last currently allocated block. 4011 * We know that this last allocated block is a full-sized as 4012 * we already checked for fragments in the loop above. 4013 */ 4014 if (lastadp != NULL && 4015 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { 4016 for (i = lastadp->ad_lbn; i >= 0; i--) 4017 if (dp->di_db[i] != 0) 4018 break; 4019 dp->di_size = (i + 1) * fs->fs_bsize; 4020 } 4021 /* 4022 * The only dependencies are for indirect blocks. 4023 * 4024 * The file size for indirect block additions is not guaranteed. 4025 * Such a guarantee would be non-trivial to achieve. The conventional 4026 * synchronous write implementation also does not make this guarantee. 4027 * Fsck should catch and fix discrepancies. Arguably, the file size 4028 * can be over-estimated without destroying integrity when the file 4029 * moves into the indirect blocks (i.e., is large). If we want to 4030 * postpone fsck, we are stuck with this argument. 4031 */ 4032 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 4033 dp->di_ib[adp->ad_lbn - NDADDR] = 0; 4034 } 4035 4036 /* 4037 * Version of initiate_write_inodeblock that handles UFS2 dinodes. 4038 * Note that any bug fixes made to this routine must be done in the 4039 * version found above. 4040 * 4041 * Called from within the procedure above to deal with unsatisfied 4042 * allocation dependencies in an inodeblock. The buffer must be 4043 * locked, thus, no I/O completion operations can occur while we 4044 * are manipulating its associated dependencies. 4045 */ 4046 static void 4047 initiate_write_inodeblock_ufs2(inodedep, bp) 4048 struct inodedep *inodedep; 4049 struct buf *bp; /* The inode block */ 4050 { 4051 struct allocdirect *adp, *lastadp; 4052 struct ufs2_dinode *dp; 4053 struct ufs2_dinode *sip; 4054 struct fs *fs; 4055 ufs_lbn_t i; 4056 #ifdef INVARIANTS 4057 ufs_lbn_t prevlbn = 0; 4058 #endif 4059 int deplist; 4060 4061 if (inodedep->id_state & IOSTARTED) 4062 panic("initiate_write_inodeblock_ufs2: already started"); 4063 inodedep->id_state |= IOSTARTED; 4064 fs = inodedep->id_fs; 4065 dp = (struct ufs2_dinode *)bp->b_data + 4066 ino_to_fsbo(fs, inodedep->id_ino); 4067 /* 4068 * If the bitmap is not yet written, then the allocated 4069 * inode cannot be written to disk. 
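 * As in the UFS1 case above, the dinode is saved in id_savedino2 and
 * the on-disk copy is zero'ed, except for the generation number,
 * until the bitmap write completes.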
4070 */ 4071 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 4072 if (inodedep->id_savedino2 != NULL) 4073 panic("initiate_write_inodeblock_ufs2: I/O underway"); 4074 FREE_LOCK(&lk); 4075 sip = malloc(sizeof(struct ufs2_dinode), 4076 M_SAVEDINO, M_SOFTDEP_FLAGS); 4077 ACQUIRE_LOCK(&lk); 4078 inodedep->id_savedino2 = sip; 4079 *inodedep->id_savedino2 = *dp; 4080 bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); 4081 dp->di_gen = inodedep->id_savedino2->di_gen; 4082 return; 4083 } 4084 /* 4085 * If no dependencies, then there is nothing to roll back. 4086 */ 4087 inodedep->id_savedsize = dp->di_size; 4088 inodedep->id_savedextsize = dp->di_extsize; 4089 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 4090 TAILQ_EMPTY(&inodedep->id_extupdt)) 4091 return; 4092 /* 4093 * Set the ext data dependencies to busy. 4094 */ 4095 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 4096 adp = TAILQ_NEXT(adp, ad_next)) { 4097 #ifdef INVARIANTS 4098 if (deplist != 0 && prevlbn >= adp->ad_lbn) 4099 panic("softdep_write_inodeblock: lbn order"); 4100 prevlbn = adp->ad_lbn; 4101 if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno) 4102 panic("%s: direct pointer #%jd mismatch %jd != %jd", 4103 "softdep_write_inodeblock", 4104 (intmax_t)adp->ad_lbn, 4105 (intmax_t)dp->di_extb[adp->ad_lbn], 4106 (intmax_t)adp->ad_newblkno); 4107 deplist |= 1 << adp->ad_lbn; 4108 if ((adp->ad_state & ATTACHED) == 0) 4109 panic("softdep_write_inodeblock: Unknown state 0x%x", 4110 adp->ad_state); 4111 #endif /* INVARIANTS */ 4112 adp->ad_state &= ~ATTACHED; 4113 adp->ad_state |= UNDONE; 4114 } 4115 /* 4116 * The on-disk inode cannot claim to be any larger than the last 4117 * fragment that has been written. Otherwise, the on-disk inode 4118 * might have fragments that were not the last block in the ext 4119 * data which would corrupt the filesystem. 4120 */ 4121 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 4122 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 4123 dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno; 4124 /* keep going until hitting a rollback to a frag */ 4125 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 4126 continue; 4127 dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; 4128 for (i = adp->ad_lbn + 1; i < NXADDR; i++) { 4129 #ifdef INVARIANTS 4130 if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) 4131 panic("softdep_write_inodeblock: lost dep1"); 4132 #endif /* INVARIANTS */ 4133 dp->di_extb[i] = 0; 4134 } 4135 lastadp = NULL; 4136 break; 4137 } 4138 /* 4139 * If we have zero'ed out the last allocated block of the ext 4140 * data, roll back the size to the last currently allocated block. 4141 * We know that this last allocated block is a full-sized as 4142 * we already checked for fragments in the loop above. 4143 */ 4144 if (lastadp != NULL && 4145 dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { 4146 for (i = lastadp->ad_lbn; i >= 0; i--) 4147 if (dp->di_extb[i] != 0) 4148 break; 4149 dp->di_extsize = (i + 1) * fs->fs_bsize; 4150 } 4151 /* 4152 * Set the file data dependencies to busy. 
4153 */ 4154 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 4155 adp = TAILQ_NEXT(adp, ad_next)) { 4156 #ifdef INVARIANTS 4157 if (deplist != 0 && prevlbn >= adp->ad_lbn) 4158 panic("softdep_write_inodeblock: lbn order"); 4159 prevlbn = adp->ad_lbn; 4160 if (adp->ad_lbn < NDADDR && 4161 dp->di_db[adp->ad_lbn] != adp->ad_newblkno) 4162 panic("%s: direct pointer #%jd mismatch %jd != %jd", 4163 "softdep_write_inodeblock", 4164 (intmax_t)adp->ad_lbn, 4165 (intmax_t)dp->di_db[adp->ad_lbn], 4166 (intmax_t)adp->ad_newblkno); 4167 if (adp->ad_lbn >= NDADDR && 4168 dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) 4169 panic("%s indirect pointer #%jd mismatch %jd != %jd", 4170 "softdep_write_inodeblock:", 4171 (intmax_t)adp->ad_lbn - NDADDR, 4172 (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR], 4173 (intmax_t)adp->ad_newblkno); 4174 deplist |= 1 << adp->ad_lbn; 4175 if ((adp->ad_state & ATTACHED) == 0) 4176 panic("softdep_write_inodeblock: Unknown state 0x%x", 4177 adp->ad_state); 4178 #endif /* INVARIANTS */ 4179 adp->ad_state &= ~ATTACHED; 4180 adp->ad_state |= UNDONE; 4181 } 4182 /* 4183 * The on-disk inode cannot claim to be any larger than the last 4184 * fragment that has been written. Otherwise, the on-disk inode 4185 * might have fragments that were not the last block in the file 4186 * which would corrupt the filesystem. 4187 */ 4188 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 4189 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 4190 if (adp->ad_lbn >= NDADDR) 4191 break; 4192 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; 4193 /* keep going until hitting a rollback to a frag */ 4194 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 4195 continue; 4196 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; 4197 for (i = adp->ad_lbn + 1; i < NDADDR; i++) { 4198 #ifdef INVARIANTS 4199 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 4200 panic("softdep_write_inodeblock: lost dep2"); 4201 #endif /* INVARIANTS */ 4202 dp->di_db[i] = 0; 4203 } 4204 for (i = 0; i < NIADDR; i++) { 4205 #ifdef INVARIANTS 4206 if (dp->di_ib[i] != 0 && 4207 (deplist & ((1 << NDADDR) << i)) == 0) 4208 panic("softdep_write_inodeblock: lost dep3"); 4209 #endif /* INVARIANTS */ 4210 dp->di_ib[i] = 0; 4211 } 4212 return; 4213 } 4214 /* 4215 * If we have zero'ed out the last allocated block of the file, 4216 * roll back the size to the last currently allocated block. 4217 * We know that this last allocated block is a full-sized as 4218 * we already checked for fragments in the loop above. 4219 */ 4220 if (lastadp != NULL && 4221 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { 4222 for (i = lastadp->ad_lbn; i >= 0; i--) 4223 if (dp->di_db[i] != 0) 4224 break; 4225 dp->di_size = (i + 1) * fs->fs_bsize; 4226 } 4227 /* 4228 * The only dependencies are for indirect blocks. 4229 * 4230 * The file size for indirect block additions is not guaranteed. 4231 * Such a guarantee would be non-trivial to achieve. The conventional 4232 * synchronous write implementation also does not make this guarantee. 4233 * Fsck should catch and fix discrepancies. Arguably, the file size 4234 * can be over-estimated without destroying integrity when the file 4235 * moves into the indirect blocks (i.e., is large). If we want to 4236 * postpone fsck, we are stuck with this argument. 
4237 */ 4238 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 4239 dp->di_ib[adp->ad_lbn - NDADDR] = 0; 4240 } 4241 4242 /* 4243 * This routine is called during the completion interrupt 4244 * service routine for a disk write (from the procedure called 4245 * by the device driver to inform the filesystem caches of 4246 * a request completion). It should be called early in this 4247 * procedure, before the block is made available to other 4248 * processes or other routines are called. 4249 */ 4250 static void 4251 softdep_disk_write_complete(bp) 4252 struct buf *bp; /* describes the completed disk write */ 4253 { 4254 struct worklist *wk; 4255 struct worklist *owk; 4256 struct workhead reattach; 4257 struct newblk *newblk; 4258 struct allocindir *aip; 4259 struct allocdirect *adp; 4260 struct indirdep *indirdep; 4261 struct inodedep *inodedep; 4262 struct bmsafemap *bmsafemap; 4263 4264 /* 4265 * If an error occurred while doing the write, then the data 4266 * has not hit the disk and the dependencies cannot be unrolled. 4267 */ 4268 if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) 4269 return; 4270 LIST_INIT(&reattach); 4271 /* 4272 * This lock must not be released anywhere in this code segment. 4273 */ 4274 ACQUIRE_LOCK(&lk); 4275 owk = NULL; 4276 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 4277 WORKLIST_REMOVE(wk); 4278 if (wk == owk) 4279 panic("duplicate worklist: %p\n", wk); 4280 owk = wk; 4281 switch (wk->wk_type) { 4282 4283 case D_PAGEDEP: 4284 if (handle_written_filepage(WK_PAGEDEP(wk), bp)) 4285 WORKLIST_INSERT(&reattach, wk); 4286 continue; 4287 4288 case D_INODEDEP: 4289 if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) 4290 WORKLIST_INSERT(&reattach, wk); 4291 continue; 4292 4293 case D_BMSAFEMAP: 4294 bmsafemap = WK_BMSAFEMAP(wk); 4295 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) { 4296 newblk->nb_state |= DEPCOMPLETE; 4297 newblk->nb_bmsafemap = NULL; 4298 LIST_REMOVE(newblk, nb_deps); 4299 } 4300 while ((adp = 4301 LIST_FIRST(&bmsafemap->sm_allocdirecthd))) { 4302 adp->ad_state |= DEPCOMPLETE; 4303 adp->ad_buf = NULL; 4304 LIST_REMOVE(adp, ad_deps); 4305 handle_allocdirect_partdone(adp); 4306 } 4307 while ((aip = 4308 LIST_FIRST(&bmsafemap->sm_allocindirhd))) { 4309 aip->ai_state |= DEPCOMPLETE; 4310 aip->ai_buf = NULL; 4311 LIST_REMOVE(aip, ai_deps); 4312 handle_allocindir_partdone(aip); 4313 } 4314 while ((inodedep = 4315 LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) { 4316 inodedep->id_state |= DEPCOMPLETE; 4317 LIST_REMOVE(inodedep, id_deps); 4318 inodedep->id_buf = NULL; 4319 } 4320 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 4321 continue; 4322 4323 case D_MKDIR: 4324 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 4325 continue; 4326 4327 case D_ALLOCDIRECT: 4328 adp = WK_ALLOCDIRECT(wk); 4329 adp->ad_state |= COMPLETE; 4330 handle_allocdirect_partdone(adp); 4331 continue; 4332 4333 case D_ALLOCINDIR: 4334 aip = WK_ALLOCINDIR(wk); 4335 aip->ai_state |= COMPLETE; 4336 handle_allocindir_partdone(aip); 4337 continue; 4338 4339 case D_INDIRDEP: 4340 indirdep = WK_INDIRDEP(wk); 4341 if (indirdep->ir_state & GOINGAWAY) 4342 panic("disk_write_complete: indirdep gone"); 4343 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); 4344 free(indirdep->ir_saveddata, M_INDIRDEP); 4345 indirdep->ir_saveddata = 0; 4346 indirdep->ir_state &= ~UNDONE; 4347 indirdep->ir_state |= ATTACHED; 4348 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { 4349 handle_allocindir_partdone(aip); 4350 if (aip == LIST_FIRST(&indirdep->ir_donehd)) 4351 
panic("disk_write_complete: not gone"); 4352 } 4353 WORKLIST_INSERT(&reattach, wk); 4354 if ((bp->b_flags & B_DELWRI) == 0) 4355 stat_indir_blk_ptrs++; 4356 bdirty(bp); 4357 continue; 4358 4359 default: 4360 panic("handle_disk_write_complete: Unknown type %s", 4361 TYPENAME(wk->wk_type)); 4362 /* NOTREACHED */ 4363 } 4364 } 4365 /* 4366 * Reattach any requests that must be redone. 4367 */ 4368 while ((wk = LIST_FIRST(&reattach)) != NULL) { 4369 WORKLIST_REMOVE(wk); 4370 WORKLIST_INSERT(&bp->b_dep, wk); 4371 } 4372 FREE_LOCK(&lk); 4373 } 4374 4375 /* 4376 * Called from within softdep_disk_write_complete above. Note that 4377 * this routine is always called from interrupt level with further 4378 * splbio interrupts blocked. 4379 */ 4380 static void 4381 handle_allocdirect_partdone(adp) 4382 struct allocdirect *adp; /* the completed allocdirect */ 4383 { 4384 struct allocdirectlst *listhead; 4385 struct allocdirect *listadp; 4386 struct inodedep *inodedep; 4387 long bsize, delay; 4388 4389 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 4390 return; 4391 if (adp->ad_buf != NULL) 4392 panic("handle_allocdirect_partdone: dangling dep"); 4393 /* 4394 * The on-disk inode cannot claim to be any larger than the last 4395 * fragment that has been written. Otherwise, the on-disk inode 4396 * might have fragments that were not the last block in the file 4397 * which would corrupt the filesystem. Thus, we cannot free any 4398 * allocdirects after one whose ad_oldblkno claims a fragment as 4399 * these blocks must be rolled back to zero before writing the inode. 4400 * We check the currently active set of allocdirects in id_inoupdt 4401 * or id_extupdt as appropriate. 4402 */ 4403 inodedep = adp->ad_inodedep; 4404 bsize = inodedep->id_fs->fs_bsize; 4405 if (adp->ad_state & EXTDATA) 4406 listhead = &inodedep->id_extupdt; 4407 else 4408 listhead = &inodedep->id_inoupdt; 4409 TAILQ_FOREACH(listadp, listhead, ad_next) { 4410 /* found our block */ 4411 if (listadp == adp) 4412 break; 4413 /* continue if ad_oldlbn is not a fragment */ 4414 if (listadp->ad_oldsize == 0 || 4415 listadp->ad_oldsize == bsize) 4416 continue; 4417 /* hit a fragment */ 4418 return; 4419 } 4420 /* 4421 * If we have reached the end of the current list without 4422 * finding the just finished dependency, then it must be 4423 * on the future dependency list. Future dependencies cannot 4424 * be freed until they are moved to the current list. 4425 */ 4426 if (listadp == NULL) { 4427 #ifdef DEBUG 4428 if (adp->ad_state & EXTDATA) 4429 listhead = &inodedep->id_newextupdt; 4430 else 4431 listhead = &inodedep->id_newinoupdt; 4432 TAILQ_FOREACH(listadp, listhead, ad_next) 4433 /* found our block */ 4434 if (listadp == adp) 4435 break; 4436 if (listadp == NULL) 4437 panic("handle_allocdirect_partdone: lost dep"); 4438 #endif /* DEBUG */ 4439 return; 4440 } 4441 /* 4442 * If we have found the just finished dependency, then free 4443 * it along with anything that follows it that is complete. 4444 * If the inode still has a bitmap dependency, then it has 4445 * never been written to disk, hence the on-disk inode cannot 4446 * reference the old fragment so we can free it without delay. 4447 */ 4448 delay = (inodedep->id_state & DEPCOMPLETE); 4449 for (; adp; adp = listadp) { 4450 listadp = TAILQ_NEXT(adp, ad_next); 4451 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 4452 return; 4453 free_allocdirect(listhead, adp, delay); 4454 } 4455 } 4456 4457 /* 4458 * Called from within softdep_disk_write_complete above. 
Note that 4459 * this routine is always called from interrupt level with further 4460 * splbio interrupts blocked. 4461 */ 4462 static void 4463 handle_allocindir_partdone(aip) 4464 struct allocindir *aip; /* the completed allocindir */ 4465 { 4466 struct indirdep *indirdep; 4467 4468 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) 4469 return; 4470 if (aip->ai_buf != NULL) 4471 panic("handle_allocindir_partdone: dangling dependency"); 4472 indirdep = aip->ai_indirdep; 4473 if (indirdep->ir_state & UNDONE) { 4474 LIST_REMOVE(aip, ai_next); 4475 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); 4476 return; 4477 } 4478 if (indirdep->ir_state & UFS1FMT) 4479 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 4480 aip->ai_newblkno; 4481 else 4482 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 4483 aip->ai_newblkno; 4484 LIST_REMOVE(aip, ai_next); 4485 if (aip->ai_freefrag != NULL) 4486 add_to_worklist(&aip->ai_freefrag->ff_list); 4487 WORKITEM_FREE(aip, D_ALLOCINDIR); 4488 } 4489 4490 /* 4491 * Called from within softdep_disk_write_complete above to restore 4492 * in-memory inode block contents to their most up-to-date state. Note 4493 * that this routine is always called from interrupt level with further 4494 * splbio interrupts blocked. 4495 */ 4496 static int 4497 handle_written_inodeblock(inodedep, bp) 4498 struct inodedep *inodedep; 4499 struct buf *bp; /* buffer containing the inode block */ 4500 { 4501 struct worklist *wk, *filefree; 4502 struct allocdirect *adp, *nextadp; 4503 struct ufs1_dinode *dp1 = NULL; 4504 struct ufs2_dinode *dp2 = NULL; 4505 int hadchanges, fstype; 4506 4507 if ((inodedep->id_state & IOSTARTED) == 0) 4508 panic("handle_written_inodeblock: not started"); 4509 inodedep->id_state &= ~IOSTARTED; 4510 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { 4511 fstype = UFS1; 4512 dp1 = (struct ufs1_dinode *)bp->b_data + 4513 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 4514 } else { 4515 fstype = UFS2; 4516 dp2 = (struct ufs2_dinode *)bp->b_data + 4517 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 4518 } 4519 /* 4520 * If we had to rollback the inode allocation because of 4521 * bitmaps being incomplete, then simply restore it. 4522 * Keep the block dirty so that it will not be reclaimed until 4523 * all associated dependencies have been cleared and the 4524 * corresponding updates written to disk. 4525 */ 4526 if (inodedep->id_savedino1 != NULL) { 4527 if (fstype == UFS1) 4528 *dp1 = *inodedep->id_savedino1; 4529 else 4530 *dp2 = *inodedep->id_savedino2; 4531 free(inodedep->id_savedino1, M_SAVEDINO); 4532 inodedep->id_savedino1 = NULL; 4533 if ((bp->b_flags & B_DELWRI) == 0) 4534 stat_inode_bitmap++; 4535 bdirty(bp); 4536 return (1); 4537 } 4538 inodedep->id_state |= COMPLETE; 4539 /* 4540 * Roll forward anything that had to be rolled back before 4541 * the inode could be updated. 
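 * Each allocdirect that was rolled back has its new block pointer
 * reinstalled and is moved from UNDONE back to ATTACHED.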
4542 */ 4543 hadchanges = 0; 4544 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { 4545 nextadp = TAILQ_NEXT(adp, ad_next); 4546 if (adp->ad_state & ATTACHED) 4547 panic("handle_written_inodeblock: new entry"); 4548 if (fstype == UFS1) { 4549 if (adp->ad_lbn < NDADDR) { 4550 if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno) 4551 panic("%s %s #%jd mismatch %d != %jd", 4552 "handle_written_inodeblock:", 4553 "direct pointer", 4554 (intmax_t)adp->ad_lbn, 4555 dp1->di_db[adp->ad_lbn], 4556 (intmax_t)adp->ad_oldblkno); 4557 dp1->di_db[adp->ad_lbn] = adp->ad_newblkno; 4558 } else { 4559 if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0) 4560 panic("%s: %s #%jd allocated as %d", 4561 "handle_written_inodeblock", 4562 "indirect pointer", 4563 (intmax_t)adp->ad_lbn - NDADDR, 4564 dp1->di_ib[adp->ad_lbn - NDADDR]); 4565 dp1->di_ib[adp->ad_lbn - NDADDR] = 4566 adp->ad_newblkno; 4567 } 4568 } else { 4569 if (adp->ad_lbn < NDADDR) { 4570 if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno) 4571 panic("%s: %s #%jd %s %jd != %jd", 4572 "handle_written_inodeblock", 4573 "direct pointer", 4574 (intmax_t)adp->ad_lbn, "mismatch", 4575 (intmax_t)dp2->di_db[adp->ad_lbn], 4576 (intmax_t)adp->ad_oldblkno); 4577 dp2->di_db[adp->ad_lbn] = adp->ad_newblkno; 4578 } else { 4579 if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0) 4580 panic("%s: %s #%jd allocated as %jd", 4581 "handle_written_inodeblock", 4582 "indirect pointer", 4583 (intmax_t)adp->ad_lbn - NDADDR, 4584 (intmax_t) 4585 dp2->di_ib[adp->ad_lbn - NDADDR]); 4586 dp2->di_ib[adp->ad_lbn - NDADDR] = 4587 adp->ad_newblkno; 4588 } 4589 } 4590 adp->ad_state &= ~UNDONE; 4591 adp->ad_state |= ATTACHED; 4592 hadchanges = 1; 4593 } 4594 for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { 4595 nextadp = TAILQ_NEXT(adp, ad_next); 4596 if (adp->ad_state & ATTACHED) 4597 panic("handle_written_inodeblock: new entry"); 4598 if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno) 4599 panic("%s: direct pointers #%jd %s %jd != %jd", 4600 "handle_written_inodeblock", 4601 (intmax_t)adp->ad_lbn, "mismatch", 4602 (intmax_t)dp2->di_extb[adp->ad_lbn], 4603 (intmax_t)adp->ad_oldblkno); 4604 dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno; 4605 adp->ad_state &= ~UNDONE; 4606 adp->ad_state |= ATTACHED; 4607 hadchanges = 1; 4608 } 4609 if (hadchanges && (bp->b_flags & B_DELWRI) == 0) 4610 stat_direct_blk_ptrs++; 4611 /* 4612 * Reset the file size to its most up-to-date value. 4613 */ 4614 if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) 4615 panic("handle_written_inodeblock: bad size"); 4616 if (fstype == UFS1) { 4617 if (dp1->di_size != inodedep->id_savedsize) { 4618 dp1->di_size = inodedep->id_savedsize; 4619 hadchanges = 1; 4620 } 4621 } else { 4622 if (dp2->di_size != inodedep->id_savedsize) { 4623 dp2->di_size = inodedep->id_savedsize; 4624 hadchanges = 1; 4625 } 4626 if (dp2->di_extsize != inodedep->id_savedextsize) { 4627 dp2->di_extsize = inodedep->id_savedextsize; 4628 hadchanges = 1; 4629 } 4630 } 4631 inodedep->id_savedsize = -1; 4632 inodedep->id_savedextsize = -1; 4633 /* 4634 * If there were any rollbacks in the inode block, then it must be 4635 * marked dirty so that it will eventually get written back in 4636 * its correct form. 4637 */ 4638 if (hadchanges) 4639 bdirty(bp); 4640 /* 4641 * Process any allocdirects that completed during the update.
4642 */ 4643 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) 4644 handle_allocdirect_partdone(adp); 4645 if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) 4646 handle_allocdirect_partdone(adp); 4647 /* 4648 * Process deallocations that were held pending until the 4649 * inode had been written to disk. Freeing of the inode 4650 * is delayed until after all blocks have been freed to 4651 * avoid creation of new <vfsid, inum, lbn> triples 4652 * before the old ones have been deleted. 4653 */ 4654 filefree = NULL; 4655 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { 4656 WORKLIST_REMOVE(wk); 4657 switch (wk->wk_type) { 4658 4659 case D_FREEFILE: 4660 /* 4661 * We defer adding filefree to the worklist until 4662 * all other additions have been made to ensure 4663 * that it will be done after all the old blocks 4664 * have been freed. 4665 */ 4666 if (filefree != NULL) 4667 panic("handle_written_inodeblock: filefree"); 4668 filefree = wk; 4669 continue; 4670 4671 case D_MKDIR: 4672 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); 4673 continue; 4674 4675 case D_DIRADD: 4676 diradd_inode_written(WK_DIRADD(wk), inodedep); 4677 continue; 4678 4679 case D_FREEBLKS: 4680 wk->wk_state |= COMPLETE; 4681 if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE) 4682 continue; 4683 /* -- fall through -- */ 4684 case D_FREEFRAG: 4685 case D_DIRREM: 4686 add_to_worklist(wk); 4687 continue; 4688 4689 case D_NEWDIRBLK: 4690 free_newdirblk(WK_NEWDIRBLK(wk)); 4691 continue; 4692 4693 default: 4694 panic("handle_written_inodeblock: Unknown type %s", 4695 TYPENAME(wk->wk_type)); 4696 /* NOTREACHED */ 4697 } 4698 } 4699 if (filefree != NULL) { 4700 if (free_inodedep(inodedep) == 0) 4701 panic("handle_written_inodeblock: live inodedep"); 4702 add_to_worklist(filefree); 4703 return (0); 4704 } 4705 4706 /* 4707 * If no outstanding dependencies, free it. 4708 */ 4709 if (free_inodedep(inodedep) || 4710 (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && 4711 TAILQ_FIRST(&inodedep->id_extupdt) == 0)) 4712 return (0); 4713 return (hadchanges); 4714 } 4715 4716 /* 4717 * Process a diradd entry after its dependent inode has been written. 4718 * This routine must be called with splbio interrupts blocked. 4719 */ 4720 static void 4721 diradd_inode_written(dap, inodedep) 4722 struct diradd *dap; 4723 struct inodedep *inodedep; 4724 { 4725 struct pagedep *pagedep; 4726 4727 dap->da_state |= COMPLETE; 4728 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 4729 if (dap->da_state & DIRCHG) 4730 pagedep = dap->da_previous->dm_pagedep; 4731 else 4732 pagedep = dap->da_pagedep; 4733 LIST_REMOVE(dap, da_pdlist); 4734 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 4735 } 4736 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 4737 } 4738 4739 /* 4740 * Handle the completion of a mkdir dependency. 
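 * Called with MKDIR_BODY from softdep_disk_write_complete when the new
 * directory's block has been written, and with MKDIR_PARENT from
 * handle_written_inodeblock when the parent's inode has been written.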
4741 */ 4742 static void 4743 handle_written_mkdir(mkdir, type) 4744 struct mkdir *mkdir; 4745 int type; 4746 { 4747 struct diradd *dap; 4748 struct pagedep *pagedep; 4749 4750 if (mkdir->md_state != type) 4751 panic("handle_written_mkdir: bad type"); 4752 dap = mkdir->md_diradd; 4753 dap->da_state &= ~type; 4754 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) 4755 dap->da_state |= DEPCOMPLETE; 4756 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 4757 if (dap->da_state & DIRCHG) 4758 pagedep = dap->da_previous->dm_pagedep; 4759 else 4760 pagedep = dap->da_pagedep; 4761 LIST_REMOVE(dap, da_pdlist); 4762 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 4763 } 4764 LIST_REMOVE(mkdir, md_mkdirs); 4765 WORKITEM_FREE(mkdir, D_MKDIR); 4766 } 4767 4768 /* 4769 * Called from within softdep_disk_write_complete above. 4770 * A write operation was just completed. Removed inodes can 4771 * now be freed and associated block pointers may be committed. 4772 * Note that this routine is always called from interrupt level 4773 * with further splbio interrupts blocked. 4774 */ 4775 static int 4776 handle_written_filepage(pagedep, bp) 4777 struct pagedep *pagedep; 4778 struct buf *bp; /* buffer containing the written page */ 4779 { 4780 struct dirrem *dirrem; 4781 struct diradd *dap, *nextdap; 4782 struct direct *ep; 4783 int i, chgs; 4784 4785 if ((pagedep->pd_state & IOSTARTED) == 0) 4786 panic("handle_written_filepage: not started"); 4787 pagedep->pd_state &= ~IOSTARTED; 4788 /* 4789 * Process any directory removals that have been committed. 4790 */ 4791 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { 4792 LIST_REMOVE(dirrem, dm_next); 4793 dirrem->dm_dirinum = pagedep->pd_ino; 4794 add_to_worklist(&dirrem->dm_list); 4795 } 4796 /* 4797 * Free any directory additions that have been committed. 4798 * If it is a newly allocated block, we have to wait until 4799 * the on-disk directory inode claims the new block. 4800 */ 4801 if ((pagedep->pd_state & NEWBLOCK) == 0) 4802 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 4803 free_diradd(dap); 4804 /* 4805 * Uncommitted directory entries must be restored. 4806 */ 4807 for (chgs = 0, i = 0; i < DAHASHSZ; i++) { 4808 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; 4809 dap = nextdap) { 4810 nextdap = LIST_NEXT(dap, da_pdlist); 4811 if (dap->da_state & ATTACHED) 4812 panic("handle_written_filepage: attached"); 4813 ep = (struct direct *) 4814 ((char *)bp->b_data + dap->da_offset); 4815 ep->d_ino = dap->da_newinum; 4816 dap->da_state &= ~UNDONE; 4817 dap->da_state |= ATTACHED; 4818 chgs = 1; 4819 /* 4820 * If the inode referenced by the directory has 4821 * been written out, then the dependency can be 4822 * moved to the pending list. 4823 */ 4824 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 4825 LIST_REMOVE(dap, da_pdlist); 4826 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, 4827 da_pdlist); 4828 } 4829 } 4830 } 4831 /* 4832 * If there were any rollbacks in the directory, then it must be 4833 * marked dirty so that it will eventually get written back in 4834 * its correct form. 4835 */ 4836 if (chgs) { 4837 if ((bp->b_flags & B_DELWRI) == 0) 4838 stat_dir_entry++; 4839 bdirty(bp); 4840 return (1); 4841 } 4842 /* 4843 * If we are not waiting for a new directory block to be 4844 * claimed by its inode, then the pagedep will be freed. 4845 * Otherwise it will remain to track any new entries on 4846 * the page in case they are fsync'ed.
4847 */ 4848 if ((pagedep->pd_state & NEWBLOCK) == 0) { 4849 LIST_REMOVE(pagedep, pd_hash); 4850 WORKITEM_FREE(pagedep, D_PAGEDEP); 4851 } 4852 return (0); 4853 } 4854 4855 /* 4856 * Writing back in-core inode structures. 4857 * 4858 * The filesystem only accesses an inode's contents when it occupies an 4859 * "in-core" inode structure. These "in-core" structures are separate from 4860 * the page frames used to cache inode blocks. Only the latter are 4861 * transferred to/from the disk. So, when the updated contents of the 4862 * "in-core" inode structure are copied to the corresponding in-memory inode 4863 * block, the dependencies are also transferred. The following procedure is 4864 * called when copying a dirty "in-core" inode to a cached inode block. 4865 */ 4866 4867 /* 4868 * Called when an inode is loaded from disk. If the effective link count 4869 * differed from the actual link count when it was last flushed, then we 4870 * need to ensure that the correct effective link count is put back. 4871 */ 4872 void 4873 softdep_load_inodeblock(ip) 4874 struct inode *ip; /* the "in_core" copy of the inode */ 4875 { 4876 struct inodedep *inodedep; 4877 4878 /* 4879 * Check for alternate nlink count. 4880 */ 4881 ip->i_effnlink = ip->i_nlink; 4882 ACQUIRE_LOCK(&lk); 4883 if (inodedep_lookup(UFSTOVFS(ip->i_ump), 4884 ip->i_number, 0, &inodedep) == 0) { 4885 FREE_LOCK(&lk); 4886 return; 4887 } 4888 ip->i_effnlink -= inodedep->id_nlinkdelta; 4889 if (inodedep->id_state & SPACECOUNTED) 4890 ip->i_flag |= IN_SPACECOUNTED; 4891 FREE_LOCK(&lk); 4892 } 4893 4894 /* 4895 * This routine is called just before the "in-core" inode 4896 * information is to be copied to the in-memory inode block. 4897 * Recall that an inode block contains several inodes. If 4898 * the force flag is set, then the dependencies will be 4899 * cleared so that the update can always be made. Note that 4900 * the buffer is locked when this routine is called, so we 4901 * will never be in the middle of writing the inode block 4902 * to disk. 4903 */ 4904 void 4905 softdep_update_inodeblock(ip, bp, waitfor) 4906 struct inode *ip; /* the "in_core" copy of the inode */ 4907 struct buf *bp; /* the buffer containing the inode block */ 4908 int waitfor; /* nonzero => update must be allowed */ 4909 { 4910 struct inodedep *inodedep; 4911 struct worklist *wk; 4912 struct mount *mp; 4913 struct buf *ibp; 4914 int error; 4915 4916 /* 4917 * If the effective link count is not equal to the actual link 4918 * count, then we must track the difference in an inodedep while 4919 * the inode is (potentially) tossed out of the cache. Otherwise, 4920 * if there is no existing inodedep, then there are no dependencies 4921 * to track. 4922 */ 4923 mp = UFSTOVFS(ip->i_ump); 4924 ACQUIRE_LOCK(&lk); 4925 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 4926 FREE_LOCK(&lk); 4927 if (ip->i_effnlink != ip->i_nlink) 4928 panic("softdep_update_inodeblock: bad link count"); 4929 return; 4930 } 4931 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) 4932 panic("softdep_update_inodeblock: bad delta"); 4933 /* 4934 * Changes have been initiated. Anything depending on these 4935 * changes cannot occur until this inode has been written. 
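 * Clearing COMPLETE and putting the inodedep on the buffer's dependency
 * list ensures that handle_written_inodeblock() runs when the write of
 * this inode block finishes.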
4936 */ 4937 inodedep->id_state &= ~COMPLETE; 4938 if ((inodedep->id_state & ONWORKLIST) == 0) 4939 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); 4940 /* 4941 * Any new dependencies associated with the incore inode must 4942 * now be moved to the list associated with the buffer holding 4943 * the in-memory copy of the inode. Once merged, process any 4944 * allocdirects that are completed by the merger. 4945 */ 4946 merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); 4947 if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) 4948 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); 4949 merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); 4950 if (!TAILQ_EMPTY(&inodedep->id_extupdt)) 4951 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt)); 4952 /* 4953 * Now that the inode has been pushed into the buffer, the 4954 * operations dependent on the inode being written to disk 4955 * can be moved to the id_bufwait so that they will be 4956 * processed when the buffer I/O completes. 4957 */ 4958 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { 4959 WORKLIST_REMOVE(wk); 4960 WORKLIST_INSERT(&inodedep->id_bufwait, wk); 4961 } 4962 /* 4963 * Newly allocated inodes cannot be written until the bitmap 4964 * that allocates them has been written (indicated by 4965 * DEPCOMPLETE being set in id_state). If we are doing a 4966 * forced sync (e.g., an fsync on a file), we force the bitmap 4967 * to be written so that the update can be done. 4968 */ 4969 if (waitfor == 0) { 4970 FREE_LOCK(&lk); 4971 return; 4972 } 4973 retry: 4974 if ((inodedep->id_state & DEPCOMPLETE) != 0) { 4975 FREE_LOCK(&lk); 4976 return; 4977 } 4978 ibp = inodedep->id_buf; 4979 ibp = getdirtybuf(ibp, &lk, MNT_WAIT); 4980 if (ibp == NULL) { 4981 /* 4982 * If ibp came back as NULL, the dependency could have been 4983 * freed while we slept. Look it up again, and check to see 4984 * that it has completed. 4985 */ 4986 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 4987 goto retry; 4988 FREE_LOCK(&lk); 4989 return; 4990 } 4991 FREE_LOCK(&lk); 4992 if ((error = bwrite(ibp)) != 0) 4993 softdep_error("softdep_update_inodeblock: bwrite", error); 4994 } 4995 4996 /* 4997 * Merge a new inode dependency list (such as id_newinoupdt) into an 4998 * old inode dependency list (such as id_inoupdt). This routine must be 4999 * called with splbio interrupts blocked. 5000 */ 5001 static void 5002 merge_inode_lists(newlisthead, oldlisthead) 5003 struct allocdirectlst *newlisthead; 5004 struct allocdirectlst *oldlisthead; 5005 { 5006 struct allocdirect *listadp, *newadp; 5007 5008 newadp = TAILQ_FIRST(newlisthead); 5009 for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { 5010 if (listadp->ad_lbn < newadp->ad_lbn) { 5011 listadp = TAILQ_NEXT(listadp, ad_next); 5012 continue; 5013 } 5014 TAILQ_REMOVE(newlisthead, newadp, ad_next); 5015 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); 5016 if (listadp->ad_lbn == newadp->ad_lbn) { 5017 allocdirect_merge(oldlisthead, newadp, 5018 listadp); 5019 listadp = newadp; 5020 } 5021 newadp = TAILQ_FIRST(newlisthead); 5022 } 5023 while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { 5024 TAILQ_REMOVE(newlisthead, newadp, ad_next); 5025 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); 5026 } 5027 } 5028 5029 /* 5030 * If we are doing an fsync, then we must ensure that any directory 5031 * entries for the inode have been written after the inode gets to disk.
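 * Each diradd left on id_pendinghd identifies a parent directory page
 * that still holds a rolled-back entry for this inode; the loop below
 * locks that parent and flushes the page (or the parent itself when a
 * MKDIR_PARENT or NEWBLOCK dependency is involved).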
5032 */ 5033 int 5034 softdep_fsync(vp) 5035 struct vnode *vp; /* the "in_core" copy of the inode */ 5036 { 5037 struct inodedep *inodedep; 5038 struct pagedep *pagedep; 5039 struct worklist *wk; 5040 struct diradd *dap; 5041 struct mount *mp; 5042 struct vnode *pvp; 5043 struct inode *ip; 5044 struct buf *bp; 5045 struct fs *fs; 5046 struct thread *td = curthread; 5047 int error, flushparent, pagedep_new_block; 5048 ino_t parentino; 5049 ufs_lbn_t lbn; 5050 5051 ip = VTOI(vp); 5052 fs = ip->i_fs; 5053 mp = vp->v_mount; 5054 ACQUIRE_LOCK(&lk); 5055 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 5056 FREE_LOCK(&lk); 5057 return (0); 5058 } 5059 if (!LIST_EMPTY(&inodedep->id_inowait) || 5060 !LIST_EMPTY(&inodedep->id_bufwait) || 5061 !TAILQ_EMPTY(&inodedep->id_extupdt) || 5062 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 5063 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 5064 !TAILQ_EMPTY(&inodedep->id_newinoupdt)) 5065 panic("softdep_fsync: pending ops"); 5066 for (error = 0, flushparent = 0; ; ) { 5067 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) 5068 break; 5069 if (wk->wk_type != D_DIRADD) 5070 panic("softdep_fsync: Unexpected type %s", 5071 TYPENAME(wk->wk_type)); 5072 dap = WK_DIRADD(wk); 5073 /* 5074 * Flush our parent if this directory entry has a MKDIR_PARENT 5075 * dependency or is contained in a newly allocated block. 5076 */ 5077 if (dap->da_state & DIRCHG) 5078 pagedep = dap->da_previous->dm_pagedep; 5079 else 5080 pagedep = dap->da_pagedep; 5081 parentino = pagedep->pd_ino; 5082 lbn = pagedep->pd_lbn; 5083 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) 5084 panic("softdep_fsync: dirty"); 5085 if ((dap->da_state & MKDIR_PARENT) || 5086 (pagedep->pd_state & NEWBLOCK)) 5087 flushparent = 1; 5088 else 5089 flushparent = 0; 5090 /* 5091 * If we are being fsync'ed as part of vgone'ing this vnode, 5092 * then we will not be able to release and recover the 5093 * vnode below, so we just have to give up on writing its 5094 * directory entry out. It will eventually be written, just 5095 * not now, but then the user was not asking to have it 5096 * written, so we are not breaking any promises. 5097 */ 5098 if (vp->v_iflag & VI_DOOMED) 5099 break; 5100 /* 5101 * We prevent deadlock by always fetching inodes from the 5102 * root, moving down the directory tree. Thus, when fetching 5103 * our parent directory, we first try to get the lock. If 5104 * that fails, we must unlock ourselves before requesting 5105 * the lock on our parent. See the comment in ufs_lookup 5106 * for details on possible races. 5107 */ 5108 FREE_LOCK(&lk); 5109 if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, 5110 FFSV_FORCEINSMQ)) { 5111 error = vfs_busy(mp, MBF_NOWAIT); 5112 if (error != 0) { 5113 vfs_ref(mp); 5114 VOP_UNLOCK(vp, 0); 5115 error = vfs_busy(mp, 0); 5116 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5117 vfs_rel(mp); 5118 if (error != 0) 5119 return (ENOENT); 5120 if (vp->v_iflag & VI_DOOMED) { 5121 vfs_unbusy(mp); 5122 return (ENOENT); 5123 } 5124 } 5125 VOP_UNLOCK(vp, 0); 5126 error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, 5127 &pvp, FFSV_FORCEINSMQ); 5128 vfs_unbusy(mp); 5129 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5130 if (vp->v_iflag & VI_DOOMED) { 5131 if (error == 0) 5132 vput(pvp); 5133 error = ENOENT; 5134 } 5135 if (error != 0) 5136 return (error); 5137 } 5138 /* 5139 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps 5140 * that are contained in direct blocks will be resolved by 5141 * doing a ffs_update. 
Pagedeps contained in indirect blocks 5142 * may require a complete sync'ing of the directory. So, we 5143 * try the cheap and fast ffs_update first, and if that fails, 5144 * then we do the slower ffs_syncvnode of the directory. 5145 */ 5146 if (flushparent) { 5147 int locked; 5148 5149 if ((error = ffs_update(pvp, 1)) != 0) { 5150 vput(pvp); 5151 return (error); 5152 } 5153 ACQUIRE_LOCK(&lk); 5154 locked = 1; 5155 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { 5156 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { 5157 if (wk->wk_type != D_DIRADD) 5158 panic("softdep_fsync: Unexpected type %s", 5159 TYPENAME(wk->wk_type)); 5160 dap = WK_DIRADD(wk); 5161 if (dap->da_state & DIRCHG) 5162 pagedep = dap->da_previous->dm_pagedep; 5163 else 5164 pagedep = dap->da_pagedep; 5165 pagedep_new_block = pagedep->pd_state & NEWBLOCK; 5166 FREE_LOCK(&lk); 5167 locked = 0; 5168 if (pagedep_new_block && 5169 (error = ffs_syncvnode(pvp, MNT_WAIT))) { 5170 vput(pvp); 5171 return (error); 5172 } 5173 } 5174 } 5175 if (locked) 5176 FREE_LOCK(&lk); 5177 } 5178 /* 5179 * Flush directory page containing the inode's name. 5180 */ 5181 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, 5182 &bp); 5183 if (error == 0) 5184 error = bwrite(bp); 5185 else 5186 brelse(bp); 5187 vput(pvp); 5188 if (error != 0) 5189 return (error); 5190 ACQUIRE_LOCK(&lk); 5191 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 5192 break; 5193 } 5194 FREE_LOCK(&lk); 5195 return (0); 5196 } 5197 5198 /* 5199 * Flush all the dirty bitmaps associated with the block device 5200 * before flushing the rest of the dirty blocks so as to reduce 5201 * the number of dependencies that will have to be rolled back. 5202 */ 5203 void 5204 softdep_fsync_mountdev(vp) 5205 struct vnode *vp; 5206 { 5207 struct buf *bp, *nbp; 5208 struct worklist *wk; 5209 struct bufobj *bo; 5210 5211 if (!vn_isdisk(vp, NULL)) 5212 panic("softdep_fsync_mountdev: vnode not a disk"); 5213 bo = &vp->v_bufobj; 5214 restart: 5215 BO_LOCK(bo); 5216 ACQUIRE_LOCK(&lk); 5217 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 5218 /* 5219 * If it is already scheduled, skip to the next buffer. 5220 */ 5221 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) 5222 continue; 5223 5224 if ((bp->b_flags & B_DELWRI) == 0) 5225 panic("softdep_fsync_mountdev: not dirty"); 5226 /* 5227 * We are only interested in bitmaps with outstanding 5228 * dependencies. 5229 */ 5230 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || 5231 wk->wk_type != D_BMSAFEMAP || 5232 (bp->b_vflags & BV_BKGRDINPROG)) { 5233 BUF_UNLOCK(bp); 5234 continue; 5235 } 5236 FREE_LOCK(&lk); 5237 BO_UNLOCK(bo); 5238 bremfree(bp); 5239 (void) bawrite(bp); 5240 goto restart; 5241 } 5242 FREE_LOCK(&lk); 5243 drain_output(vp); 5244 BO_UNLOCK(bo); 5245 } 5246 5247 /* 5248 * This routine is called when we are trying to synchronously flush a 5249 * file. This routine must eliminate any filesystem metadata dependencies 5250 * so that the syncing routine can succeed by pushing the dirty blocks 5251 * associated with the file. If any I/O errors occur, they are returned. 5252 */ 5253 int 5254 softdep_sync_metadata(struct vnode *vp) 5255 { 5256 struct pagedep *pagedep; 5257 struct allocdirect *adp; 5258 struct allocindir *aip; 5259 struct buf *bp, *nbp; 5260 struct worklist *wk; 5261 struct bufobj *bo; 5262 int i, error, waitfor; 5263 5264 if (!DOINGSOFTDEP(vp)) 5265 return (0); 5266 /* 5267 * Ensure that any direct block dependencies have been cleared. 
5268 */ 5269 ACQUIRE_LOCK(&lk); 5270 if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) { 5271 FREE_LOCK(&lk); 5272 return (error); 5273 } 5274 FREE_LOCK(&lk); 5275 /* 5276 * For most files, the only metadata dependencies are the 5277 * cylinder group maps that allocate their inode or blocks. 5278 * The block allocation dependencies can be found by traversing 5279 * the dependency lists for any buffers that remain on their 5280 * dirty buffer list. The inode allocation dependency will 5281 * be resolved when the inode is updated with MNT_WAIT. 5282 * This work is done in two passes. The first pass grabs most 5283 * of the buffers and begins asynchronously writing them. The 5284 * only way to wait for these asynchronous writes is to sleep 5285 * on the filesystem vnode which may stay busy for a long time 5286 * if the filesystem is active. So, instead, we make a second 5287 * pass over the dependencies blocking on each write. In the 5288 * usual case we will be blocking against a write that we 5289 * initiated, so when it is done the dependency will have been 5290 * resolved. Thus the second pass is expected to end quickly. 5291 */ 5292 waitfor = MNT_NOWAIT; 5293 bo = &vp->v_bufobj; 5294 5295 top: 5296 /* 5297 * We must wait for any I/O in progress to finish so that 5298 * all potential buffers on the dirty list will be visible. 5299 */ 5300 BO_LOCK(bo); 5301 drain_output(vp); 5302 while ((bp = TAILQ_FIRST(&bo->bo_dirty.bv_hd)) != NULL) { 5303 bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT); 5304 if (bp) 5305 break; 5306 } 5307 BO_UNLOCK(bo); 5308 if (bp == NULL) 5309 return (0); 5310 loop: 5311 /* While syncing snapshots, we must allow recursive lookups */ 5312 BUF_AREC(bp); 5313 ACQUIRE_LOCK(&lk); 5314 /* 5315 * As we hold the buffer locked, none of its dependencies 5316 * will disappear. 5317 */ 5318 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 5319 switch (wk->wk_type) { 5320 5321 case D_ALLOCDIRECT: 5322 adp = WK_ALLOCDIRECT(wk); 5323 if (adp->ad_state & DEPCOMPLETE) 5324 continue; 5325 nbp = adp->ad_buf; 5326 nbp = getdirtybuf(nbp, &lk, waitfor); 5327 if (nbp == NULL) 5328 continue; 5329 FREE_LOCK(&lk); 5330 if (waitfor == MNT_NOWAIT) { 5331 bawrite(nbp); 5332 } else if ((error = bwrite(nbp)) != 0) { 5333 break; 5334 } 5335 ACQUIRE_LOCK(&lk); 5336 continue; 5337 5338 case D_ALLOCINDIR: 5339 aip = WK_ALLOCINDIR(wk); 5340 if (aip->ai_state & DEPCOMPLETE) 5341 continue; 5342 nbp = aip->ai_buf; 5343 nbp = getdirtybuf(nbp, &lk, waitfor); 5344 if (nbp == NULL) 5345 continue; 5346 FREE_LOCK(&lk); 5347 if (waitfor == MNT_NOWAIT) { 5348 bawrite(nbp); 5349 } else if ((error = bwrite(nbp)) != 0) { 5350 break; 5351 } 5352 ACQUIRE_LOCK(&lk); 5353 continue; 5354 5355 case D_INDIRDEP: 5356 restart: 5357 5358 LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { 5359 if (aip->ai_state & DEPCOMPLETE) 5360 continue; 5361 nbp = aip->ai_buf; 5362 nbp = getdirtybuf(nbp, &lk, MNT_WAIT); 5363 if (nbp == NULL) 5364 goto restart; 5365 FREE_LOCK(&lk); 5366 if ((error = bwrite(nbp)) != 0) { 5367 goto loop_end; 5368 } 5369 ACQUIRE_LOCK(&lk); 5370 goto restart; 5371 } 5372 continue; 5373 5374 case D_INODEDEP: 5375 if ((error = flush_inodedep_deps(wk->wk_mp, 5376 WK_INODEDEP(wk)->id_ino)) != 0) { 5377 FREE_LOCK(&lk); 5378 break; 5379 } 5380 continue; 5381 5382 case D_PAGEDEP: 5383 /* 5384 * We are trying to sync a directory that may 5385 * have dependencies on both its own metadata 5386 * and/or dependencies on the inodes of any 5387 * recently allocated files. 
We walk its diradd 5388 * lists pushing out the associated inode. 5389 */ 5390 pagedep = WK_PAGEDEP(wk); 5391 for (i = 0; i < DAHASHSZ; i++) { 5392 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) 5393 continue; 5394 if ((error = 5395 flush_pagedep_deps(vp, wk->wk_mp, 5396 &pagedep->pd_diraddhd[i]))) { 5397 FREE_LOCK(&lk); 5398 goto loop_end; 5399 } 5400 } 5401 continue; 5402 5403 case D_MKDIR: 5404 /* 5405 * This case should never happen if the vnode has 5406 * been properly sync'ed. However, if this function 5407 * is used at a place where the vnode has not yet 5408 * been sync'ed, this dependency can show up. So, 5409 * rather than panic, just flush it. 5410 */ 5411 nbp = WK_MKDIR(wk)->md_buf; 5412 nbp = getdirtybuf(nbp, &lk, waitfor); 5413 if (nbp == NULL) 5414 continue; 5415 FREE_LOCK(&lk); 5416 if (waitfor == MNT_NOWAIT) { 5417 bawrite(nbp); 5418 } else if ((error = bwrite(nbp)) != 0) { 5419 break; 5420 } 5421 ACQUIRE_LOCK(&lk); 5422 continue; 5423 5424 case D_BMSAFEMAP: 5425 /* 5426 * This case should never happen if the vnode has 5427 * been properly sync'ed. However, if this function 5428 * is used at a place where the vnode has not yet 5429 * been sync'ed, this dependency can show up. So, 5430 * rather than panic, just flush it. 5431 */ 5432 nbp = WK_BMSAFEMAP(wk)->sm_buf; 5433 nbp = getdirtybuf(nbp, &lk, waitfor); 5434 if (nbp == NULL) 5435 continue; 5436 FREE_LOCK(&lk); 5437 if (waitfor == MNT_NOWAIT) { 5438 bawrite(nbp); 5439 } else if ((error = bwrite(nbp)) != 0) { 5440 break; 5441 } 5442 ACQUIRE_LOCK(&lk); 5443 continue; 5444 5445 default: 5446 panic("softdep_sync_metadata: Unknown type %s", 5447 TYPENAME(wk->wk_type)); 5448 /* NOTREACHED */ 5449 } 5450 loop_end: 5451 /* We reach here only in error and unlocked */ 5452 if (error == 0) 5453 panic("softdep_sync_metadata: zero error"); 5454 BUF_NOREC(bp); 5455 bawrite(bp); 5456 return (error); 5457 } 5458 FREE_LOCK(&lk); 5459 BO_LOCK(bo); 5460 while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) { 5461 nbp = getdirtybuf(nbp, BO_MTX(bo), MNT_WAIT); 5462 if (nbp) 5463 break; 5464 } 5465 BO_UNLOCK(bo); 5466 BUF_NOREC(bp); 5467 bawrite(bp); 5468 if (nbp != NULL) { 5469 bp = nbp; 5470 goto loop; 5471 } 5472 /* 5473 * The brief unlock is to allow any pent up dependency 5474 * processing to be done. Then proceed with the second pass. 5475 */ 5476 if (waitfor == MNT_NOWAIT) { 5477 waitfor = MNT_WAIT; 5478 goto top; 5479 } 5480 5481 /* 5482 * If we have managed to get rid of all the dirty buffers, 5483 * then we are done. For certain directories and block 5484 * devices, we may need to do further work. 5485 * 5486 * We must wait for any I/O in progress to finish so that 5487 * all potential buffers on the dirty list will be visible. 5488 */ 5489 BO_LOCK(bo); 5490 drain_output(vp); 5491 BO_UNLOCK(bo); 5492 return (0); 5493 } 5494 5495 /* 5496 * Flush the dependencies associated with an inodedep. 5497 * Called with splbio blocked. 5498 */ 5499 static int 5500 flush_inodedep_deps(mp, ino) 5501 struct mount *mp; 5502 ino_t ino; 5503 { 5504 struct inodedep *inodedep; 5505 int error, waitfor; 5506 5507 /* 5508 * This work is done in two passes. The first pass grabs most 5509 * of the buffers and begins asynchronously writing them. The 5510 * only way to wait for these asynchronous writes is to sleep 5511 * on the filesystem vnode which may stay busy for a long time 5512 * if the filesystem is active. So, instead, we make a second 5513 * pass over the dependencies blocking on each write. 
In the 5514 * usual case we will be blocking against a write that we 5515 * initiated, so when it is done the dependency will have been 5516 * resolved. Thus the second pass is expected to end quickly. 5517 * We give a brief window at the top of the loop to allow 5518 * any pending I/O to complete. 5519 */ 5520 for (error = 0, waitfor = MNT_NOWAIT; ; ) { 5521 if (error) 5522 return (error); 5523 FREE_LOCK(&lk); 5524 ACQUIRE_LOCK(&lk); 5525 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) 5526 return (0); 5527 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || 5528 flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || 5529 flush_deplist(&inodedep->id_extupdt, waitfor, &error) || 5530 flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) 5531 continue; 5532 /* 5533 * If pass2, we are done, otherwise do pass 2. 5534 */ 5535 if (waitfor == MNT_WAIT) 5536 break; 5537 waitfor = MNT_WAIT; 5538 } 5539 /* 5540 * Try freeing inodedep in case all dependencies have been removed. 5541 */ 5542 if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) 5543 (void) free_inodedep(inodedep); 5544 return (0); 5545 } 5546 5547 /* 5548 * Flush an inode dependency list. 5549 * Called with splbio blocked. 5550 */ 5551 static int 5552 flush_deplist(listhead, waitfor, errorp) 5553 struct allocdirectlst *listhead; 5554 int waitfor; 5555 int *errorp; 5556 { 5557 struct allocdirect *adp; 5558 struct buf *bp; 5559 5560 mtx_assert(&lk, MA_OWNED); 5561 TAILQ_FOREACH(adp, listhead, ad_next) { 5562 if (adp->ad_state & DEPCOMPLETE) 5563 continue; 5564 bp = adp->ad_buf; 5565 bp = getdirtybuf(bp, &lk, waitfor); 5566 if (bp == NULL) { 5567 if (waitfor == MNT_NOWAIT) 5568 continue; 5569 return (1); 5570 } 5571 FREE_LOCK(&lk); 5572 if (waitfor == MNT_NOWAIT) { 5573 bawrite(bp); 5574 } else if ((*errorp = bwrite(bp)) != 0) { 5575 ACQUIRE_LOCK(&lk); 5576 return (1); 5577 } 5578 ACQUIRE_LOCK(&lk); 5579 return (1); 5580 } 5581 return (0); 5582 } 5583 5584 /* 5585 * Eliminate a pagedep dependency by flushing out all its diradd dependencies. 5586 * Called with splbio blocked. 5587 */ 5588 static int 5589 flush_pagedep_deps(pvp, mp, diraddhdp) 5590 struct vnode *pvp; 5591 struct mount *mp; 5592 struct diraddhd *diraddhdp; 5593 { 5594 struct inodedep *inodedep; 5595 struct ufsmount *ump; 5596 struct diradd *dap; 5597 struct vnode *vp; 5598 struct bufobj *bo; 5599 int error = 0; 5600 struct buf *bp; 5601 ino_t inum; 5602 struct worklist *wk; 5603 5604 ump = VFSTOUFS(mp); 5605 while ((dap = LIST_FIRST(diraddhdp)) != NULL) { 5606 /* 5607 * Flush ourselves if this directory entry 5608 * has a MKDIR_PARENT dependency. 5609 */ 5610 if (dap->da_state & MKDIR_PARENT) { 5611 FREE_LOCK(&lk); 5612 if ((error = ffs_update(pvp, 1)) != 0) 5613 break; 5614 ACQUIRE_LOCK(&lk); 5615 /* 5616 * If that cleared dependencies, go on to next. 5617 */ 5618 if (dap != LIST_FIRST(diraddhdp)) 5619 continue; 5620 if (dap->da_state & MKDIR_PARENT) 5621 panic("flush_pagedep_deps: MKDIR_PARENT"); 5622 } 5623 /* 5624 * A newly allocated directory must have its "." and 5625 * ".." entries written out before its name can be 5626 * committed in its parent. We do not want or need 5627 * the full semantics of a synchronous ffs_syncvnode as 5628 * that may end up here again, once for each directory 5629 * level in the filesystem. Instead, we push the blocks 5630 * and wait for them to clear. We have to fsync twice 5631 * because the first call may choose to defer blocks 5632 * that still have dependencies, but deferral will 5633 * happen at most once. 
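 *
 *	Reduced to its essentials, the push described above looks like
 *	the following sketch (locking, write-suspension accounting, and
 *	error handling omitted; the code below is authoritative):
 *
 *		(void) ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ);
 *		(void) ffs_syncvnode(vp, MNT_NOWAIT);	first pass, may defer
 *		(void) ffs_syncvnode(vp, MNT_NOWAIT);	catches deferred blocks
 *		BO_LOCK(&vp->v_bufobj);
 *		drain_output(vp);
 *		BO_UNLOCK(&vp->v_bufobj);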
5634 */ 5635 inum = dap->da_newinum; 5636 if (dap->da_state & MKDIR_BODY) { 5637 FREE_LOCK(&lk); 5638 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 5639 FFSV_FORCEINSMQ))) 5640 break; 5641 if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) || 5642 (error=ffs_syncvnode(vp, MNT_NOWAIT))) { 5643 vput(vp); 5644 break; 5645 } 5646 bo = &vp->v_bufobj; 5647 BO_LOCK(bo); 5648 drain_output(vp); 5649 /* 5650 * If first block is still dirty with a D_MKDIR 5651 * dependency then it needs to be written now. 5652 */ 5653 for (;;) { 5654 error = 0; 5655 bp = gbincore(bo, 0); 5656 if (bp == NULL) 5657 break; /* First block not present */ 5658 error = BUF_LOCK(bp, 5659 LK_EXCLUSIVE | 5660 LK_SLEEPFAIL | 5661 LK_INTERLOCK, 5662 BO_MTX(bo)); 5663 BO_LOCK(bo); 5664 if (error == ENOLCK) 5665 continue; /* Slept, retry */ 5666 if (error != 0) 5667 break; /* Failed */ 5668 if ((bp->b_flags & B_DELWRI) == 0) { 5669 BUF_UNLOCK(bp); 5670 break; /* Buffer not dirty */ 5671 } 5672 for (wk = LIST_FIRST(&bp->b_dep); 5673 wk != NULL; 5674 wk = LIST_NEXT(wk, wk_list)) 5675 if (wk->wk_type == D_MKDIR) 5676 break; 5677 if (wk == NULL) 5678 BUF_UNLOCK(bp); /* Dependency gone */ 5679 else { 5680 /* 5681 * D_MKDIR dependency remains, 5682 * must write buffer to stable 5683 * storage. 5684 */ 5685 BO_UNLOCK(bo); 5686 bremfree(bp); 5687 error = bwrite(bp); 5688 BO_LOCK(bo); 5689 } 5690 break; 5691 } 5692 BO_UNLOCK(bo); 5693 vput(vp); 5694 if (error != 0) 5695 break; /* Flushing of first block failed */ 5696 ACQUIRE_LOCK(&lk); 5697 /* 5698 * If that cleared dependencies, go on to next. 5699 */ 5700 if (dap != LIST_FIRST(diraddhdp)) 5701 continue; 5702 if (dap->da_state & MKDIR_BODY) 5703 panic("flush_pagedep_deps: MKDIR_BODY"); 5704 } 5705 /* 5706 * Flush the inode on which the directory entry depends. 5707 * Having accounted for MKDIR_PARENT and MKDIR_BODY above, 5708 * the only remaining dependency is that the updated inode 5709 * count must get pushed to disk. The inode has already 5710 * been pushed into its inode buffer (via VOP_UPDATE) at 5711 * the time of the reference count change. So we need only 5712 * locate that buffer, ensure that there will be no rollback 5713 * caused by a bitmap dependency, then write the inode buffer. 5714 */ 5715 retry: 5716 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 5717 panic("flush_pagedep_deps: lost inode"); 5718 /* 5719 * If the inode still has bitmap dependencies, 5720 * push them to disk. 5721 */ 5722 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 5723 bp = inodedep->id_buf; 5724 bp = getdirtybuf(bp, &lk, MNT_WAIT); 5725 if (bp == NULL) 5726 goto retry; 5727 FREE_LOCK(&lk); 5728 if ((error = bwrite(bp)) != 0) 5729 break; 5730 ACQUIRE_LOCK(&lk); 5731 if (dap != LIST_FIRST(diraddhdp)) 5732 continue; 5733 } 5734 /* 5735 * If the inode is still sitting in a buffer waiting 5736 * to be written, push it to disk. 5737 */ 5738 FREE_LOCK(&lk); 5739 if ((error = bread(ump->um_devvp, 5740 fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)), 5741 (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) { 5742 brelse(bp); 5743 break; 5744 } 5745 if ((error = bwrite(bp)) != 0) 5746 break; 5747 ACQUIRE_LOCK(&lk); 5748 /* 5749 * If we have failed to get rid of all the dependencies 5750 * then something is seriously wrong. 
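 *	(The synchronous write of the inode buffer above runs the normal
 *	I/O completion processing, which should remove this diradd from
 *	the pd_diraddhd list; if the same entry is still at the head of
 *	the list, no progress was made and retrying would spin forever.)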
5751 */ 5752 if (dap == LIST_FIRST(diraddhdp)) 5753 panic("flush_pagedep_deps: flush failed"); 5754 } 5755 if (error) 5756 ACQUIRE_LOCK(&lk); 5757 return (error); 5758 } 5759 5760 /* 5761 * A large burst of file addition or deletion activity can drive the 5762 * memory load excessively high. First attempt to slow things down 5763 * using the techniques below. If that fails, this routine requests 5764 * the offending operations to fall back to running synchronously 5765 * until the memory load returns to a reasonable level. 5766 */ 5767 int 5768 softdep_slowdown(vp) 5769 struct vnode *vp; 5770 { 5771 int max_softdeps_hard; 5772 5773 ACQUIRE_LOCK(&lk); 5774 max_softdeps_hard = max_softdeps * 11 / 10; 5775 if (num_dirrem < max_softdeps_hard / 2 && 5776 num_inodedep < max_softdeps_hard && 5777 VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps && 5778 num_freeblkdep < max_softdeps_hard) { 5779 FREE_LOCK(&lk); 5780 return (0); 5781 } 5782 if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps) 5783 softdep_speedup(); 5784 stat_sync_limit_hit += 1; 5785 FREE_LOCK(&lk); 5786 return (1); 5787 } 5788 5789 /* 5790 * Called by the allocation routines when they are about to fail 5791 * in the hope that we can free up some disk space. 5792 * 5793 * First check to see if the work list has anything on it. If it has, 5794 * clean up entries until we successfully free some space. Because this 5795 * process holds inodes locked, we cannot handle any remove requests 5796 * that might block on a locked inode as that could lead to deadlock. 5797 * If the worklist yields no free space, encourage the syncer daemon 5798 * to help us. In no event will we try for longer than tickdelay seconds. 5799 */ 5800 int 5801 softdep_request_cleanup(fs, vp) 5802 struct fs *fs; 5803 struct vnode *vp; 5804 { 5805 struct ufsmount *ump; 5806 long starttime; 5807 ufs2_daddr_t needed; 5808 int error; 5809 5810 ump = VTOI(vp)->i_ump; 5811 mtx_assert(UFS_MTX(ump), MA_OWNED); 5812 needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize; 5813 starttime = time_second + tickdelay; 5814 /* 5815 * If we are being called because of a process doing a 5816 * copy-on-write, then it is not safe to update the vnode 5817 * as we may recurse into the copy-on-write routine. 5818 */ 5819 if (!(curthread->td_pflags & TDP_COWINPROGRESS)) { 5820 UFS_UNLOCK(ump); 5821 error = ffs_update(vp, 1); 5822 UFS_LOCK(ump); 5823 if (error != 0) 5824 return (0); 5825 } 5826 while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) { 5827 if (time_second > starttime) 5828 return (0); 5829 UFS_UNLOCK(ump); 5830 ACQUIRE_LOCK(&lk); 5831 if (ump->softdep_on_worklist > 0 && 5832 process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) { 5833 stat_worklist_push += 1; 5834 FREE_LOCK(&lk); 5835 UFS_LOCK(ump); 5836 continue; 5837 } 5838 request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT); 5839 FREE_LOCK(&lk); 5840 UFS_LOCK(ump); 5841 } 5842 return (1); 5843 } 5844 5845 /* 5846 * If memory utilization has gotten too high, deliberately slow things 5847 * down and speed up the I/O processing. 5848 */ 5849 extern struct thread *syncertd; 5850 static int 5851 request_cleanup(mp, resource) 5852 struct mount *mp; 5853 int resource; 5854 { 5855 struct thread *td = curthread; 5856 struct ufsmount *ump; 5857 5858 mtx_assert(&lk, MA_OWNED); 5859 /* 5860 * We never hold up the filesystem syncer or buf daemon. 
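 *	The stages below are tried in order: a badly backlogged worklist
 *	co-opts the caller into processing two items itself; otherwise we
 *	try to speed up the syncer, which suffices unless the caller has
 *	asked to wait for removal processing; as a last resort we post a
 *	cleanup request for the syncer and pause for at most tickdelay
 *	ticks.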
5861 */ 5862 if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF)) 5863 return (0); 5864 ump = VFSTOUFS(mp); 5865 /* 5866 * First check to see if the work list has gotten backlogged. 5867 * If it has, co-opt this process to help clean up two entries. 5868 * Because this process may hold inodes locked, we cannot 5869 * handle any remove requests that might block on a locked 5870 * inode as that could lead to deadlock. We set TDP_SOFTDEP 5871 * to avoid recursively processing the worklist. 5872 */ 5873 if (ump->softdep_on_worklist > max_softdeps / 10) { 5874 td->td_pflags |= TDP_SOFTDEP; 5875 process_worklist_item(mp, LK_NOWAIT); 5876 process_worklist_item(mp, LK_NOWAIT); 5877 td->td_pflags &= ~TDP_SOFTDEP; 5878 stat_worklist_push += 2; 5879 return(1); 5880 } 5881 /* 5882 * Next, we attempt to speed up the syncer process. If that 5883 * is successful, then we allow the process to continue. 5884 */ 5885 if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT) 5886 return(0); 5887 /* 5888 * If we are resource constrained on inode dependencies, try 5889 * flushing some dirty inodes. Otherwise, we are constrained 5890 * by file deletions, so try accelerating flushes of directories 5891 * with removal dependencies. We would like to do the cleanup 5892 * here, but we probably hold an inode locked at this point and 5893 * that might deadlock against one that we try to clean. So, 5894 * the best that we can do is request the syncer daemon to do 5895 * the cleanup for us. 5896 */ 5897 switch (resource) { 5898 5899 case FLUSH_INODES: 5900 stat_ino_limit_push += 1; 5901 req_clear_inodedeps += 1; 5902 stat_countp = &stat_ino_limit_hit; 5903 break; 5904 5905 case FLUSH_REMOVE: 5906 case FLUSH_REMOVE_WAIT: 5907 stat_blk_limit_push += 1; 5908 req_clear_remove += 1; 5909 stat_countp = &stat_blk_limit_hit; 5910 break; 5911 5912 default: 5913 panic("request_cleanup: unknown type"); 5914 } 5915 /* 5916 * Hopefully the syncer daemon will catch up and awaken us. 5917 * We wait at most tickdelay before proceeding in any case. 5918 */ 5919 proc_waiting += 1; 5920 if (callout_pending(&softdep_callout) == FALSE) 5921 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, 5922 pause_timer, 0); 5923 5924 msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); 5925 proc_waiting -= 1; 5926 return (1); 5927 } 5928 5929 /* 5930 * Awaken processes pausing in request_cleanup and clear proc_waiting 5931 * to indicate that there is no longer a timer running. 5932 */ 5933 static void 5934 pause_timer(arg) 5935 void *arg; 5936 { 5937 5938 /* 5939 * The callout_ API has acquired mtx and will hold it around this 5940 * function call. 5941 */ 5942 *stat_countp += 1; 5943 wakeup_one(&proc_waiting); 5944 if (proc_waiting > 0) 5945 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, 5946 pause_timer, 0); 5947 } 5948 5949 /* 5950 * Flush out a directory with at least one removal dependency in an effort to 5951 * reduce the number of dirrem, freefile, and freeblks dependency structures. 
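 *	Only one directory is flushed per call: a static cursor walks the
 *	pagedep hash chains round-robin, and the first pagedep found with
 *	a pending dirrem has its directory vnode fetched, fsync'ed with
 *	MNT_NOWAIT, and its output drained before the routine returns.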
5952 */ 5953 static void 5954 clear_remove(td) 5955 struct thread *td; 5956 { 5957 struct pagedep_hashhead *pagedephd; 5958 struct pagedep *pagedep; 5959 static int next = 0; 5960 struct mount *mp; 5961 struct vnode *vp; 5962 struct bufobj *bo; 5963 int error, cnt; 5964 ino_t ino; 5965 5966 mtx_assert(&lk, MA_OWNED); 5967 5968 for (cnt = 0; cnt < pagedep_hash; cnt++) { 5969 pagedephd = &pagedep_hashtbl[next++]; 5970 if (next >= pagedep_hash) 5971 next = 0; 5972 LIST_FOREACH(pagedep, pagedephd, pd_hash) { 5973 if (LIST_EMPTY(&pagedep->pd_dirremhd)) 5974 continue; 5975 mp = pagedep->pd_list.wk_mp; 5976 ino = pagedep->pd_ino; 5977 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) 5978 continue; 5979 FREE_LOCK(&lk); 5980 if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, 5981 FFSV_FORCEINSMQ))) { 5982 softdep_error("clear_remove: vget", error); 5983 vn_finished_write(mp); 5984 ACQUIRE_LOCK(&lk); 5985 return; 5986 } 5987 if ((error = ffs_syncvnode(vp, MNT_NOWAIT))) 5988 softdep_error("clear_remove: fsync", error); 5989 bo = &vp->v_bufobj; 5990 BO_LOCK(bo); 5991 drain_output(vp); 5992 BO_UNLOCK(bo); 5993 vput(vp); 5994 vn_finished_write(mp); 5995 ACQUIRE_LOCK(&lk); 5996 return; 5997 } 5998 } 5999 } 6000 6001 /* 6002 * Clear out a block of dirty inodes in an effort to reduce 6003 * the number of inodedep dependency structures. 6004 */ 6005 static void 6006 clear_inodedeps(td) 6007 struct thread *td; 6008 { 6009 struct inodedep_hashhead *inodedephd; 6010 struct inodedep *inodedep; 6011 static int next = 0; 6012 struct mount *mp; 6013 struct vnode *vp; 6014 struct fs *fs; 6015 int error, cnt; 6016 ino_t firstino, lastino, ino; 6017 6018 mtx_assert(&lk, MA_OWNED); 6019 /* 6020 * Pick a random inode dependency to be cleared. 6021 * We will then gather up all the inodes in its block 6022 * that have dependencies and flush them out. 6023 */ 6024 for (cnt = 0; cnt < inodedep_hash; cnt++) { 6025 inodedephd = &inodedep_hashtbl[next++]; 6026 if (next >= inodedep_hash) 6027 next = 0; 6028 if ((inodedep = LIST_FIRST(inodedephd)) != NULL) 6029 break; 6030 } 6031 if (inodedep == NULL) 6032 return; 6033 fs = inodedep->id_fs; 6034 mp = inodedep->id_list.wk_mp; 6035 /* 6036 * Find the last inode in the block with dependencies. 6037 */ 6038 firstino = inodedep->id_ino & ~(INOPB(fs) - 1); 6039 for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) 6040 if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0) 6041 break; 6042 /* 6043 * Asynchronously push all but the last inode with dependencies. 6044 * Synchronously push the last inode with dependencies to ensure 6045 * that the inode block gets written to free up the inodedeps. 
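 *
 *	In outline, the loop below is (locking, write-suspension
 *	accounting, and error handling omitted):
 *
 *		for (ino = firstino; ino <= lastino; ino++) {
 *			if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
 *				continue;
 *			(void) ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
 *			    FFSV_FORCEINSMQ);
 *			(void) ffs_syncvnode(vp,
 *			    ino == lastino ? MNT_WAIT : MNT_NOWAIT);
 *			vput(vp);
 *		}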
6046 */ 6047 for (ino = firstino; ino <= lastino; ino++) { 6048 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) 6049 continue; 6050 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) 6051 continue; 6052 FREE_LOCK(&lk); 6053 if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, 6054 FFSV_FORCEINSMQ)) != 0) { 6055 softdep_error("clear_inodedeps: vget", error); 6056 vn_finished_write(mp); 6057 ACQUIRE_LOCK(&lk); 6058 return; 6059 } 6060 if (ino == lastino) { 6061 if ((error = ffs_syncvnode(vp, MNT_WAIT))) 6062 softdep_error("clear_inodedeps: fsync1", error); 6063 } else { 6064 if ((error = ffs_syncvnode(vp, MNT_NOWAIT))) 6065 softdep_error("clear_inodedeps: fsync2", error); 6066 BO_LOCK(&vp->v_bufobj); 6067 drain_output(vp); 6068 BO_UNLOCK(&vp->v_bufobj); 6069 } 6070 vput(vp); 6071 vn_finished_write(mp); 6072 ACQUIRE_LOCK(&lk); 6073 } 6074 } 6075 6076 /* 6077 * Function to determine if the buffer has outstanding dependencies 6078 * that will cause a roll-back if the buffer is written. If wantcount 6079 * is set, return number of dependencies, otherwise just yes or no. 6080 */ 6081 static int 6082 softdep_count_dependencies(bp, wantcount) 6083 struct buf *bp; 6084 int wantcount; 6085 { 6086 struct worklist *wk; 6087 struct inodedep *inodedep; 6088 struct indirdep *indirdep; 6089 struct allocindir *aip; 6090 struct pagedep *pagedep; 6091 struct diradd *dap; 6092 int i, retval; 6093 6094 retval = 0; 6095 ACQUIRE_LOCK(&lk); 6096 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 6097 switch (wk->wk_type) { 6098 6099 case D_INODEDEP: 6100 inodedep = WK_INODEDEP(wk); 6101 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 6102 /* bitmap allocation dependency */ 6103 retval += 1; 6104 if (!wantcount) 6105 goto out; 6106 } 6107 if (TAILQ_FIRST(&inodedep->id_inoupdt)) { 6108 /* direct block pointer dependency */ 6109 retval += 1; 6110 if (!wantcount) 6111 goto out; 6112 } 6113 if (TAILQ_FIRST(&inodedep->id_extupdt)) { 6114 /* direct block pointer dependency */ 6115 retval += 1; 6116 if (!wantcount) 6117 goto out; 6118 } 6119 continue; 6120 6121 case D_INDIRDEP: 6122 indirdep = WK_INDIRDEP(wk); 6123 6124 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { 6125 /* indirect block pointer dependency */ 6126 retval += 1; 6127 if (!wantcount) 6128 goto out; 6129 } 6130 continue; 6131 6132 case D_PAGEDEP: 6133 pagedep = WK_PAGEDEP(wk); 6134 for (i = 0; i < DAHASHSZ; i++) { 6135 6136 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { 6137 /* directory entry dependency */ 6138 retval += 1; 6139 if (!wantcount) 6140 goto out; 6141 } 6142 } 6143 continue; 6144 6145 case D_BMSAFEMAP: 6146 case D_ALLOCDIRECT: 6147 case D_ALLOCINDIR: 6148 case D_MKDIR: 6149 /* never a dependency on these blocks */ 6150 continue; 6151 6152 default: 6153 panic("softdep_check_for_rollback: Unexpected type %s", 6154 TYPENAME(wk->wk_type)); 6155 /* NOTREACHED */ 6156 } 6157 } 6158 out: 6159 FREE_LOCK(&lk); 6160 return retval; 6161 } 6162 6163 /* 6164 * Acquire exclusive access to a buffer. 6165 * Must be called with a locked mtx parameter. 6166 * Return acquired buffer or NULL on failure. 
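 *
 *	Typical caller pattern elsewhere in this file (compare, e.g.,
 *	flush_deplist() above); a NULL return means the buffer could not
 *	be acquired or is no longer dirty:
 *
 *		bp = getdirtybuf(bp, &lk, waitfor);
 *		if (bp == NULL)
 *			continue;
 *		FREE_LOCK(&lk);
 *		if (waitfor == MNT_NOWAIT)
 *			bawrite(bp);
 *		else
 *			error = bwrite(bp);
 *		ACQUIRE_LOCK(&lk);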
 */
static struct buf *
getdirtybuf(bp, mtx, waitfor)
	struct buf *bp;
	struct mtx *mtx;
	int waitfor;
{
	int error;

	mtx_assert(mtx, MA_OWNED);
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
		if (waitfor != MNT_WAIT)
			return (NULL);
		error = BUF_LOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
		/*
		 * Even if we successfully acquire bp here, we have dropped
		 * mtx, which may violate our guarantee.
		 */
		if (error == 0)
			BUF_UNLOCK(bp);
		else if (error != ENOLCK)
			panic("getdirtybuf: inconsistent lock: %d", error);
		mtx_lock(mtx);
		return (NULL);
	}
	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
		if (mtx == &lk && waitfor == MNT_WAIT) {
			mtx_unlock(mtx);
			BO_LOCK(bp->b_bufobj);
			BUF_UNLOCK(bp);
			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
				bp->b_vflags |= BV_BKGRDWAIT;
				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
				    PRIBIO | PDROP, "getbuf", 0);
			} else
				BO_UNLOCK(bp->b_bufobj);
			mtx_lock(mtx);
			return (NULL);
		}
		BUF_UNLOCK(bp);
		if (waitfor != MNT_WAIT)
			return (NULL);
		/*
		 * The mtx argument must be bp->b_vp's mutex in
		 * this case.
		 */
#ifdef DEBUG_VFS_LOCKS
		if (bp->b_vp->v_type != VCHR)
			ASSERT_BO_LOCKED(bp->b_bufobj);
#endif
		bp->b_vflags |= BV_BKGRDWAIT;
		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
		return (NULL);
	}
	if ((bp->b_flags & B_DELWRI) == 0) {
		BUF_UNLOCK(bp);
		return (NULL);
	}
	bremfree(bp);
	return (bp);
}


/*
 * Check if it is safe to suspend the file system now. On entry,
 * the vnode interlock for devvp should be held. Return 0 with
 * the mount interlock held if the file system can be suspended now,
 * otherwise return EAGAIN with the mount interlock held.
 */
int
softdep_check_suspend(struct mount *mp,
    struct vnode *devvp,
    int softdep_deps,
    int softdep_accdeps,
    int secondary_writes,
    int secondary_accwrites)
{
	struct bufobj *bo;
	struct ufsmount *ump;
	int error;

	ump = VFSTOUFS(mp);
	bo = &devvp->v_bufobj;
	ASSERT_BO_LOCKED(bo);

	for (;;) {
		if (!TRY_ACQUIRE_LOCK(&lk)) {
			BO_UNLOCK(bo);
			ACQUIRE_LOCK(&lk);
			FREE_LOCK(&lk);
			BO_LOCK(bo);
			continue;
		}
		MNT_ILOCK(mp);
		if (mp->mnt_secondary_writes != 0) {
			FREE_LOCK(&lk);
			BO_UNLOCK(bo);
			msleep(&mp->mnt_secondary_writes,
			    MNT_MTX(mp),
			    (PUSER - 1) | PDROP, "secwr", 0);
			BO_LOCK(bo);
			continue;
		}
		break;
	}

	/*
	 * Reasons for needing more work before suspend:
	 * - Dirty buffers on devvp.
	 * - Softdep activity occurred after start of vnode sync loop
	 * - Secondary writes occurred after start of vnode sync loop
	 */
	error = 0;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    softdep_deps != 0 ||
	    ump->softdep_deps != 0 ||
	    softdep_accdeps != ump->softdep_accdeps ||
	    secondary_writes != 0 ||
	    mp->mnt_secondary_writes != 0 ||
	    secondary_accwrites != mp->mnt_secondary_accwrites)
		error = EAGAIN;
	FREE_LOCK(&lk);
	BO_UNLOCK(bo);
	return (error);
}


/*
 * Get the number of dependency structures for the file system, both
 * the current number and the total number allocated.
These will 6299 * later be used to detect that softdep processing has occurred. 6300 */ 6301 void 6302 softdep_get_depcounts(struct mount *mp, 6303 int *softdep_depsp, 6304 int *softdep_accdepsp) 6305 { 6306 struct ufsmount *ump; 6307 6308 ump = VFSTOUFS(mp); 6309 ACQUIRE_LOCK(&lk); 6310 *softdep_depsp = ump->softdep_deps; 6311 *softdep_accdepsp = ump->softdep_accdeps; 6312 FREE_LOCK(&lk); 6313 } 6314 6315 /* 6316 * Wait for pending output on a vnode to complete. 6317 * Must be called with vnode lock and interlock locked. 6318 * 6319 * XXX: Should just be a call to bufobj_wwait(). 6320 */ 6321 static void 6322 drain_output(vp) 6323 struct vnode *vp; 6324 { 6325 struct bufobj *bo; 6326 6327 bo = &vp->v_bufobj; 6328 ASSERT_VOP_LOCKED(vp, "drain_output"); 6329 ASSERT_BO_LOCKED(bo); 6330 6331 while (bo->bo_numoutput) { 6332 bo->bo_flag |= BO_WWAIT; 6333 msleep((caddr_t)&bo->bo_numoutput, 6334 BO_MTX(bo), PRIBIO + 1, "drainvp", 0); 6335 } 6336 } 6337 6338 /* 6339 * Called whenever a buffer that is being invalidated or reallocated 6340 * contains dependencies. This should only happen if an I/O error has 6341 * occurred. The routine is called with the buffer locked. 6342 */ 6343 static void 6344 softdep_deallocate_dependencies(bp) 6345 struct buf *bp; 6346 { 6347 6348 if ((bp->b_ioflags & BIO_ERROR) == 0) 6349 panic("softdep_deallocate_dependencies: dangling deps"); 6350 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); 6351 panic("softdep_deallocate_dependencies: unrecovered I/O error"); 6352 } 6353 6354 /* 6355 * Function to handle asynchronous write errors in the filesystem. 6356 */ 6357 static void 6358 softdep_error(func, error) 6359 char *func; 6360 int error; 6361 { 6362 6363 /* XXX should do something better! */ 6364 printf("%s: got error %d while accessing filesystem\n", func, error); 6365 } 6366 6367 #ifdef DDB 6368 6369 DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) 6370 { 6371 struct inodedep_hashhead *inodedephd; 6372 struct inodedep *inodedep; 6373 struct fs *fs; 6374 int cnt; 6375 6376 fs = have_addr ? (struct fs *)addr : NULL; 6377 for (cnt = 0; cnt < inodedep_hash; cnt++) { 6378 inodedephd = &inodedep_hashtbl[cnt]; 6379 LIST_FOREACH(inodedep, inodedephd, id_hash) { 6380 if (fs != NULL && fs != inodedep->id_fs) 6381 continue; 6382 db_printf("%p fs %p st %x ino %jd inoblk %jd\n", 6383 inodedep, inodedep->id_fs, inodedep->id_state, 6384 (intmax_t)inodedep->id_ino, 6385 (intmax_t)fsbtodb(inodedep->id_fs, 6386 ino_to_fsba(inodedep->id_fs, inodedep->id_ino))); 6387 } 6388 } 6389 } 6390 6391 #endif /* DDB */ 6392 6393 #endif /* SOFTUPDATES */ 6394
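/*
 * Illustrative sketch only, not part of the code above: how a caller
 * wishing to suspend the filesystem is expected to pair
 * softdep_get_depcounts() with softdep_check_suspend().  The vnode
 * syncing step and the secondary write counters are placeholders;
 * only the two softdep calls and the retry on EAGAIN follow from the
 * interfaces documented above.
 *
 *	for (;;) {
 *		softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps);
 *		... sync the vnodes, noting secondary_writes and
 *		    secondary_accwrites ...
 *		BO_LOCK(&devvp->v_bufobj);
 *		error = softdep_check_suspend(mp, devvp, softdep_deps,
 *		    softdep_accdeps, secondary_writes, secondary_accwrites);
 *		if (error == 0)
 *			break;		safe to suspend, mount interlock held
 *		MNT_IUNLOCK(mp);	EAGAIN, rescan and try again
 *	}
 */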