1 /* 2 * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * The soft updates code is derived from the appendix of a University 5 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, 6 * "Soft Updates: A Solution to the Metadata Update Problem in File 7 * Systems", CSE-TR-254-95, August 1995). 8 * 9 * Further information about soft updates can be obtained from: 10 * 11 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 12 * 1614 Oxford Street mckusick@mckusick.com 13 * Berkeley, CA 94709-1608 +1-510-843-9542 14 * USA 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 26 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 27 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 28 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 29 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 30 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 39 */ 40 41 #include <sys/cdefs.h> 42 __FBSDID("$FreeBSD$"); 43 44 /* 45 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide. 46 */ 47 #ifndef DIAGNOSTIC 48 #define DIAGNOSTIC 49 #endif 50 #ifndef DEBUG 51 #define DEBUG 52 #endif 53 54 #include <sys/param.h> 55 #include <sys/kernel.h> 56 #include <sys/systm.h> 57 #include <sys/bio.h> 58 #include <sys/buf.h> 59 #include <sys/malloc.h> 60 #include <sys/mount.h> 61 #include <sys/proc.h> 62 #include <sys/stat.h> 63 #include <sys/syslog.h> 64 #include <sys/vnode.h> 65 #include <sys/conf.h> 66 #include <ufs/ufs/dir.h> 67 #include <ufs/ufs/extattr.h> 68 #include <ufs/ufs/quota.h> 69 #include <ufs/ufs/inode.h> 70 #include <ufs/ufs/ufsmount.h> 71 #include <ufs/ffs/fs.h> 72 #include <ufs/ffs/softdep.h> 73 #include <ufs/ffs/ffs_extern.h> 74 #include <ufs/ufs/ufs_extern.h> 75 76 /* 77 * These definitions need to be adapted to the system to which 78 * this file is being ported. 79 */ 80 /* 81 * malloc types defined for the softdep system. 
82 */ 83 static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies"); 84 static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies"); 85 static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation"); 86 static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map"); 87 static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode"); 88 static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies"); 89 static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block"); 90 static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode"); 91 static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode"); 92 static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated"); 93 static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry"); 94 static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory"); 95 static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted"); 96 static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block"); 97 98 #define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE) 99 100 #define D_PAGEDEP 0 101 #define D_INODEDEP 1 102 #define D_NEWBLK 2 103 #define D_BMSAFEMAP 3 104 #define D_ALLOCDIRECT 4 105 #define D_INDIRDEP 5 106 #define D_ALLOCINDIR 6 107 #define D_FREEFRAG 7 108 #define D_FREEBLKS 8 109 #define D_FREEFILE 9 110 #define D_DIRADD 10 111 #define D_MKDIR 11 112 #define D_DIRREM 12 113 #define D_NEWDIRBLK 13 114 #define D_LAST D_NEWDIRBLK 115 116 /* 117 * translate from workitem type to memory type 118 * MUST match the defines above, such that memtype[D_XXX] == M_XXX 119 */ 120 static struct malloc_type *memtype[] = { 121 M_PAGEDEP, 122 M_INODEDEP, 123 M_NEWBLK, 124 M_BMSAFEMAP, 125 M_ALLOCDIRECT, 126 M_INDIRDEP, 127 M_ALLOCINDIR, 128 M_FREEFRAG, 129 M_FREEBLKS, 130 M_FREEFILE, 131 M_DIRADD, 132 M_MKDIR, 133 M_DIRREM, 134 M_NEWDIRBLK 135 }; 136 137 #define DtoM(type) (memtype[type]) 138 139 /* 140 * Names of malloc types. 141 */ 142 #define TYPENAME(type) \ 143 ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???") 144 /* 145 * End system adaptaion definitions. 146 */ 147 148 /* 149 * Internal function prototypes. 
150 */ 151 static void softdep_error(char *, int); 152 static void drain_output(struct vnode *, int); 153 static int getdirtybuf(struct buf **, int); 154 static void clear_remove(struct thread *); 155 static void clear_inodedeps(struct thread *); 156 static int flush_pagedep_deps(struct vnode *, struct mount *, 157 struct diraddhd *); 158 static int flush_inodedep_deps(struct fs *, ino_t); 159 static int handle_written_filepage(struct pagedep *, struct buf *); 160 static void diradd_inode_written(struct diradd *, struct inodedep *); 161 static int handle_written_inodeblock(struct inodedep *, struct buf *); 162 static void handle_allocdirect_partdone(struct allocdirect *); 163 static void handle_allocindir_partdone(struct allocindir *); 164 static void initiate_write_filepage(struct pagedep *, struct buf *); 165 static void handle_written_mkdir(struct mkdir *, int); 166 static void initiate_write_inodeblock(struct inodedep *, struct buf *); 167 static void handle_workitem_freefile(struct freefile *); 168 static void handle_workitem_remove(struct dirrem *, struct vnode *); 169 static struct dirrem *newdirrem(struct buf *, struct inode *, 170 struct inode *, int, struct dirrem **); 171 static void free_diradd(struct diradd *); 172 static void free_allocindir(struct allocindir *, struct inodedep *); 173 static void free_newdirblk(struct newdirblk *); 174 static int indir_trunc(struct freeblks *, ufs_daddr_t, int, ufs_lbn_t, long *); 175 static void deallocate_dependencies(struct buf *, struct inodedep *); 176 static void free_allocdirect(struct allocdirectlst *, 177 struct allocdirect *, int); 178 static int check_inode_unwritten(struct inodedep *); 179 static int free_inodedep(struct inodedep *); 180 static void handle_workitem_freeblocks(struct freeblks *, int); 181 static void merge_inode_lists(struct inodedep *); 182 static void setup_allocindir_phase2(struct buf *, struct inode *, 183 struct allocindir *); 184 static struct allocindir *newallocindir(struct inode *, int, ufs_daddr_t, 185 ufs_daddr_t); 186 static void handle_workitem_freefrag(struct freefrag *); 187 static struct freefrag *newfreefrag(struct inode *, ufs_daddr_t, long); 188 static void allocdirect_merge(struct allocdirectlst *, 189 struct allocdirect *, struct allocdirect *); 190 static struct bmsafemap *bmsafemap_lookup(struct buf *); 191 static int newblk_lookup(struct fs *, ufs_daddr_t, int, struct newblk **); 192 static int inodedep_lookup(struct fs *, ino_t, int, struct inodedep **); 193 static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **); 194 static void pause_timer(void *); 195 static int request_cleanup(int, int); 196 static int process_worklist_item(struct mount *, int); 197 static void add_to_worklist(struct worklist *); 198 199 /* 200 * Exported softdep operations. 201 */ 202 static void softdep_disk_io_initiation(struct buf *); 203 static void softdep_disk_write_complete(struct buf *); 204 static void softdep_deallocate_dependencies(struct buf *); 205 static void softdep_move_dependencies(struct buf *, struct buf *); 206 static int softdep_count_dependencies(struct buf *bp, int); 207 208 /* 209 * Locking primitives. 210 * 211 * For a uniprocessor, all we need to do is protect against disk 212 * interrupts. For a multiprocessor, this lock would have to be 213 * a mutex. A single mutex is used throughout this file, though 214 * finer grain locking could be used if contention warranted it. 
215 * 216 * For a multiprocessor, the sleep call would accept a lock and 217 * release it after the sleep processing was complete. In a uniprocessor 218 * implementation there is no such interlock, so we simple mark 219 * the places where it needs to be done with the `interlocked' form 220 * of the lock calls. Since the uniprocessor sleep already interlocks 221 * the spl, there is nothing that really needs to be done. 222 */ 223 #ifndef /* NOT */ DEBUG 224 static struct lockit { 225 int lkt_spl; 226 } lk = { 0 }; 227 #define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio() 228 #define FREE_LOCK(lk) splx((lk)->lkt_spl) 229 230 #else /* DEBUG */ 231 #define NOHOLDER ((struct thread *)-1) 232 #define SPECIAL_FLAG ((struct thread *)-2) 233 static struct lockit { 234 int lkt_spl; 235 struct thread *lkt_held; 236 } lk = { 0, NOHOLDER }; 237 static int lockcnt; 238 239 static void acquire_lock(struct lockit *); 240 static void free_lock(struct lockit *); 241 void softdep_panic(char *); 242 243 #define ACQUIRE_LOCK(lk) acquire_lock(lk) 244 #define FREE_LOCK(lk) free_lock(lk) 245 246 static void 247 acquire_lock(lk) 248 struct lockit *lk; 249 { 250 struct thread *holder; 251 252 if (lk->lkt_held != NOHOLDER) { 253 holder = lk->lkt_held; 254 FREE_LOCK(lk); 255 if (holder == curthread) 256 panic("softdep_lock: locking against myself"); 257 else 258 panic("softdep_lock: lock held by %p", holder); 259 } 260 lk->lkt_spl = splbio(); 261 lk->lkt_held = curthread; 262 lockcnt++; 263 } 264 265 static void 266 free_lock(lk) 267 struct lockit *lk; 268 { 269 270 if (lk->lkt_held == NOHOLDER) 271 panic("softdep_unlock: lock not held"); 272 lk->lkt_held = NOHOLDER; 273 splx(lk->lkt_spl); 274 } 275 276 /* 277 * Function to release soft updates lock and panic. 278 */ 279 void 280 softdep_panic(msg) 281 char *msg; 282 { 283 284 if (lk.lkt_held != NOHOLDER) 285 FREE_LOCK(&lk); 286 panic(msg); 287 } 288 #endif /* DEBUG */ 289 290 static int interlocked_sleep(struct lockit *, int, void *, int, 291 const char *, int); 292 293 /* 294 * When going to sleep, we must save our SPL so that it does 295 * not get lost if some other process uses the lock while we 296 * are sleeping. We restore it after we have slept. This routine 297 * wraps the interlocking with functions that sleep. The list 298 * below enumerates the available set of operations. 299 */ 300 #define UNKNOWN 0 301 #define SLEEP 1 302 #define LOCKBUF 2 303 304 static int 305 interlocked_sleep(lk, op, ident, flags, wmesg, timo) 306 struct lockit *lk; 307 int op; 308 void *ident; 309 int flags; 310 const char *wmesg; 311 int timo; 312 { 313 struct thread *holder; 314 int s, retval; 315 316 s = lk->lkt_spl; 317 # ifdef DEBUG 318 if (lk->lkt_held == NOHOLDER) 319 panic("interlocked_sleep: lock not held"); 320 lk->lkt_held = NOHOLDER; 321 # endif /* DEBUG */ 322 switch (op) { 323 case SLEEP: 324 retval = tsleep(ident, flags, wmesg, timo); 325 break; 326 case LOCKBUF: 327 retval = BUF_LOCK((struct buf *)ident, flags); 328 break; 329 default: 330 panic("interlocked_sleep: unknown operation"); 331 } 332 # ifdef DEBUG 333 if (lk->lkt_held != NOHOLDER) { 334 holder = lk->lkt_held; 335 FREE_LOCK(lk); 336 if (holder == curthread) 337 panic("interlocked_sleep: locking against self"); 338 else 339 panic("interlocked_sleep: lock held by %p", holder); 340 } 341 lk->lkt_held = curthread; 342 lockcnt++; 343 # endif /* DEBUG */ 344 lk->lkt_spl = s; 345 return (retval); 346 } 347 348 /* 349 * Place holder for real semaphores. 
350 */ 351 struct sema { 352 int value; 353 struct thread *holder; 354 char *name; 355 int prio; 356 int timo; 357 }; 358 static void sema_init(struct sema *, char *, int, int); 359 static int sema_get(struct sema *, struct lockit *); 360 static void sema_release(struct sema *); 361 362 static void 363 sema_init(semap, name, prio, timo) 364 struct sema *semap; 365 char *name; 366 int prio, timo; 367 { 368 369 semap->holder = NOHOLDER; 370 semap->value = 0; 371 semap->name = name; 372 semap->prio = prio; 373 semap->timo = timo; 374 } 375 376 static int 377 sema_get(semap, interlock) 378 struct sema *semap; 379 struct lockit *interlock; 380 { 381 382 if (semap->value++ > 0) { 383 if (interlock != NULL) { 384 interlocked_sleep(interlock, SLEEP, (caddr_t)semap, 385 semap->prio, semap->name, semap->timo); 386 FREE_LOCK(interlock); 387 } else { 388 tsleep((caddr_t)semap, semap->prio, semap->name, 389 semap->timo); 390 } 391 return (0); 392 } 393 semap->holder = curthread; 394 if (interlock != NULL) 395 FREE_LOCK(interlock); 396 return (1); 397 } 398 399 static void 400 sema_release(semap) 401 struct sema *semap; 402 { 403 404 if (semap->value <= 0 || semap->holder != curthread) { 405 if (lk.lkt_held != NOHOLDER) 406 FREE_LOCK(&lk); 407 panic("sema_release: not held"); 408 } 409 if (--semap->value > 0) { 410 semap->value = 0; 411 wakeup(semap); 412 } 413 semap->holder = NOHOLDER; 414 } 415 416 /* 417 * Worklist queue management. 418 * These routines require that the lock be held. 419 */ 420 #ifndef /* NOT */ DEBUG 421 #define WORKLIST_INSERT(head, item) do { \ 422 (item)->wk_state |= ONWORKLIST; \ 423 LIST_INSERT_HEAD(head, item, wk_list); \ 424 } while (0) 425 #define WORKLIST_REMOVE(item) do { \ 426 (item)->wk_state &= ~ONWORKLIST; \ 427 LIST_REMOVE(item, wk_list); \ 428 } while (0) 429 #define WORKITEM_FREE(item, type) FREE(item, DtoM(type)) 430 431 #else /* DEBUG */ 432 static void worklist_insert(struct workhead *, struct worklist *); 433 static void worklist_remove(struct worklist *); 434 static void workitem_free(struct worklist *, int); 435 436 #define WORKLIST_INSERT(head, item) worklist_insert(head, item) 437 #define WORKLIST_REMOVE(item) worklist_remove(item) 438 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type) 439 440 static void 441 worklist_insert(head, item) 442 struct workhead *head; 443 struct worklist *item; 444 { 445 446 if (lk.lkt_held == NOHOLDER) 447 panic("worklist_insert: lock not held"); 448 if (item->wk_state & ONWORKLIST) { 449 FREE_LOCK(&lk); 450 panic("worklist_insert: already on list"); 451 } 452 item->wk_state |= ONWORKLIST; 453 LIST_INSERT_HEAD(head, item, wk_list); 454 } 455 456 static void 457 worklist_remove(item) 458 struct worklist *item; 459 { 460 461 if (lk.lkt_held == NOHOLDER) 462 panic("worklist_remove: lock not held"); 463 if ((item->wk_state & ONWORKLIST) == 0) { 464 FREE_LOCK(&lk); 465 panic("worklist_remove: not on list"); 466 } 467 item->wk_state &= ~ONWORKLIST; 468 LIST_REMOVE(item, wk_list); 469 } 470 471 static void 472 workitem_free(item, type) 473 struct worklist *item; 474 int type; 475 { 476 477 if (item->wk_state & ONWORKLIST) { 478 if (lk.lkt_held != NOHOLDER) 479 FREE_LOCK(&lk); 480 panic("workitem_free: still on list"); 481 } 482 if (item->wk_type != type) { 483 if (lk.lkt_held != NOHOLDER) 484 FREE_LOCK(&lk); 485 panic("workitem_free: type mismatch"); 486 } 487 FREE(item, DtoM(type)); 488 } 489 #endif /* DEBUG */ 490 491 /* 492 * Workitem queue management 493 */ 494 static struct workhead 
softdep_workitem_pending; 495 static int num_on_worklist; /* number of worklist items to be processed */ 496 static int softdep_worklist_busy; /* 1 => trying to do unmount */ 497 static int softdep_worklist_req; /* serialized waiters */ 498 static int max_softdeps; /* maximum number of structs before slowdown */ 499 static int tickdelay = 2; /* number of ticks to pause during slowdown */ 500 static int proc_waiting; /* tracks whether we have a timeout posted */ 501 static int *stat_countp; /* statistic to count in proc_waiting timeout */ 502 static struct callout_handle handle; /* handle on posted proc_waiting timeout */ 503 static struct thread *filesys_syncer; /* proc of filesystem syncer process */ 504 static int req_clear_inodedeps; /* syncer process flush some inodedeps */ 505 #define FLUSH_INODES 1 506 static int req_clear_remove; /* syncer process flush some freeblks */ 507 #define FLUSH_REMOVE 2 508 #define FLUSH_REMOVE_WAIT 3 509 /* 510 * runtime statistics 511 */ 512 static int stat_worklist_push; /* number of worklist cleanups */ 513 static int stat_blk_limit_push; /* number of times block limit neared */ 514 static int stat_ino_limit_push; /* number of times inode limit neared */ 515 static int stat_blk_limit_hit; /* number of times block slowdown imposed */ 516 static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ 517 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ 518 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ 519 static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ 520 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ 521 static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ 522 #ifdef DEBUG 523 #include <vm/vm.h> 524 #include <sys/sysctl.h> 525 SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, ""); 526 SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, ""); 527 SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,""); 528 SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,""); 529 SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,""); 530 SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, ""); 531 SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, ""); 532 SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, ""); 533 SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, ""); 534 SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, ""); 535 SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, ""); 536 SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, ""); 537 #endif /* DEBUG */ 538 539 /* 540 * Add an item to the end of the work queue. 541 * This routine requires that the lock be held. 542 * This is the only routine that adds items to the list. 543 * The following routine is the only one that removes items 544 * and does so in order from first to last. 
545 */ 546 static void 547 add_to_worklist(wk) 548 struct worklist *wk; 549 { 550 static struct worklist *worklist_tail; 551 552 if (wk->wk_state & ONWORKLIST) { 553 if (lk.lkt_held != NOHOLDER) 554 FREE_LOCK(&lk); 555 panic("add_to_worklist: already on list"); 556 } 557 wk->wk_state |= ONWORKLIST; 558 if (LIST_FIRST(&softdep_workitem_pending) == NULL) 559 LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list); 560 else 561 LIST_INSERT_AFTER(worklist_tail, wk, wk_list); 562 worklist_tail = wk; 563 num_on_worklist += 1; 564 } 565 566 /* 567 * Process that runs once per second to handle items in the background queue. 568 * 569 * Note that we ensure that everything is done in the order in which they 570 * appear in the queue. The code below depends on this property to ensure 571 * that blocks of a file are freed before the inode itself is freed. This 572 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated 573 * until all the old ones have been purged from the dependency lists. 574 */ 575 int 576 softdep_process_worklist(matchmnt) 577 struct mount *matchmnt; 578 { 579 struct thread *td = curthread; 580 int cnt, matchcnt, loopcount; 581 long starttime; 582 583 /* 584 * Record the process identifier of our caller so that we can give 585 * this process preferential treatment in request_cleanup below. 586 */ 587 filesys_syncer = td; 588 matchcnt = 0; 589 590 /* 591 * There is no danger of having multiple processes run this 592 * code, but we have to single-thread it when softdep_flushfiles() 593 * is in operation to get an accurate count of the number of items 594 * related to its mount point that are in the list. 595 */ 596 if (matchmnt == NULL) { 597 if (softdep_worklist_busy < 0) 598 return(-1); 599 softdep_worklist_busy += 1; 600 } 601 602 /* 603 * If requested, try removing inode or removal dependencies. 604 */ 605 if (req_clear_inodedeps) { 606 clear_inodedeps(td); 607 req_clear_inodedeps -= 1; 608 wakeup_one(&proc_waiting); 609 } 610 if (req_clear_remove) { 611 clear_remove(td); 612 req_clear_remove -= 1; 613 wakeup_one(&proc_waiting); 614 } 615 loopcount = 1; 616 starttime = time_second; 617 while (num_on_worklist > 0) { 618 if ((cnt = process_worklist_item(matchmnt, 0)) == -1) 619 break; 620 else 621 matchcnt += cnt; 622 623 /* 624 * If a umount operation wants to run the worklist 625 * accurately, abort. 626 */ 627 if (softdep_worklist_req && matchmnt == NULL) { 628 matchcnt = -1; 629 break; 630 } 631 632 /* 633 * If requested, try removing inode or removal dependencies. 634 */ 635 if (req_clear_inodedeps) { 636 clear_inodedeps(td); 637 req_clear_inodedeps -= 1; 638 wakeup_one(&proc_waiting); 639 } 640 if (req_clear_remove) { 641 clear_remove(td); 642 req_clear_remove -= 1; 643 wakeup_one(&proc_waiting); 644 } 645 /* 646 * We do not generally want to stop for buffer space, but if 647 * we are really being a buffer hog, we will stop and wait. 648 */ 649 if (loopcount++ % 128 == 0) 650 bwillwrite(); 651 /* 652 * Never allow processing to run for more than one 653 * second. Otherwise the other syncer tasks may get 654 * excessively backlogged. 655 */ 656 if (starttime != time_second && matchmnt == NULL) { 657 matchcnt = -1; 658 break; 659 } 660 } 661 if (matchmnt == NULL) { 662 softdep_worklist_busy -= 1; 663 if (softdep_worklist_req && softdep_worklist_busy == 0) 664 wakeup(&softdep_worklist_req); 665 } 666 return (matchcnt); 667 } 668 669 /* 670 * Process one item on the worklist. 
671 */ 672 static int 673 process_worklist_item(matchmnt, flags) 674 struct mount *matchmnt; 675 int flags; 676 { 677 struct worklist *wk; 678 struct mount *mp; 679 struct vnode *vp; 680 int matchcnt = 0; 681 682 ACQUIRE_LOCK(&lk); 683 /* 684 * Normally we just process each item on the worklist in order. 685 * However, if we are in a situation where we cannot lock any 686 * inodes, we have to skip over any dirrem requests whose 687 * vnodes are resident and locked. 688 */ 689 vp = NULL; 690 LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) { 691 if (wk->wk_state & INPROGRESS) 692 continue; 693 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM) 694 break; 695 wk->wk_state |= INPROGRESS; 696 FREE_LOCK(&lk); 697 VFS_VGET(WK_DIRREM(wk)->dm_mnt, WK_DIRREM(wk)->dm_oldinum, 698 LK_NOWAIT | LK_EXCLUSIVE, &vp); 699 ACQUIRE_LOCK(&lk); 700 wk->wk_state &= ~INPROGRESS; 701 if (vp != NULL) 702 break; 703 } 704 if (wk == 0) { 705 FREE_LOCK(&lk); 706 return (-1); 707 } 708 WORKLIST_REMOVE(wk); 709 num_on_worklist -= 1; 710 FREE_LOCK(&lk); 711 switch (wk->wk_type) { 712 713 case D_DIRREM: 714 /* removal of a directory entry */ 715 mp = WK_DIRREM(wk)->dm_mnt; 716 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT)) 717 panic("%s: dirrem on suspended filesystem", 718 "process_worklist_item"); 719 if (mp == matchmnt) 720 matchcnt += 1; 721 handle_workitem_remove(WK_DIRREM(wk), vp); 722 break; 723 724 case D_FREEBLKS: 725 /* releasing blocks and/or fragments from a file */ 726 mp = WK_FREEBLKS(wk)->fb_mnt; 727 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT)) 728 panic("%s: freeblks on suspended filesystem", 729 "process_worklist_item"); 730 if (mp == matchmnt) 731 matchcnt += 1; 732 handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT); 733 break; 734 735 case D_FREEFRAG: 736 /* releasing a fragment when replaced as a file grows */ 737 mp = WK_FREEFRAG(wk)->ff_mnt; 738 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT)) 739 panic("%s: freefrag on suspended filesystem", 740 "process_worklist_item"); 741 if (mp == matchmnt) 742 matchcnt += 1; 743 handle_workitem_freefrag(WK_FREEFRAG(wk)); 744 break; 745 746 case D_FREEFILE: 747 /* releasing an inode when its link count drops to 0 */ 748 mp = WK_FREEFILE(wk)->fx_mnt; 749 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT)) 750 panic("%s: freefile on suspended filesystem", 751 "process_worklist_item"); 752 if (mp == matchmnt) 753 matchcnt += 1; 754 handle_workitem_freefile(WK_FREEFILE(wk)); 755 break; 756 757 default: 758 panic("%s_process_worklist: Unknown type %s", 759 "softdep", TYPENAME(wk->wk_type)); 760 /* NOTREACHED */ 761 } 762 return (matchcnt); 763 } 764 765 /* 766 * Move dependencies from one buffer to another. 767 */ 768 static void 769 softdep_move_dependencies(oldbp, newbp) 770 struct buf *oldbp; 771 struct buf *newbp; 772 { 773 struct worklist *wk, *wktail; 774 775 if (LIST_FIRST(&newbp->b_dep) != NULL) 776 panic("softdep_move_dependencies: need merge code"); 777 wktail = 0; 778 ACQUIRE_LOCK(&lk); 779 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { 780 LIST_REMOVE(wk, wk_list); 781 if (wktail == 0) 782 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); 783 else 784 LIST_INSERT_AFTER(wktail, wk, wk_list); 785 wktail = wk; 786 } 787 FREE_LOCK(&lk); 788 } 789 790 /* 791 * Purge the work list of all items associated with a particular mount point. 
792 */ 793 int 794 softdep_flushworklist(oldmnt, countp, td) 795 struct mount *oldmnt; 796 int *countp; 797 struct thread *td; 798 { 799 struct vnode *devvp; 800 int count, error = 0; 801 802 /* 803 * Await our turn to clear out the queue, then serialize access. 804 */ 805 while (softdep_worklist_busy) { 806 softdep_worklist_req += 1; 807 tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0); 808 softdep_worklist_req -= 1; 809 } 810 softdep_worklist_busy = -1; 811 /* 812 * Alternately flush the block device associated with the mount 813 * point and process any dependencies that the flushing 814 * creates. We continue until no more worklist dependencies 815 * are found. 816 */ 817 *countp = 0; 818 devvp = VFSTOUFS(oldmnt)->um_devvp; 819 while ((count = softdep_process_worklist(oldmnt)) > 0) { 820 *countp += count; 821 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); 822 error = VOP_FSYNC(devvp, td->td_ucred, MNT_WAIT, td); 823 VOP_UNLOCK(devvp, 0, td); 824 if (error) 825 break; 826 } 827 softdep_worklist_busy = 0; 828 if (softdep_worklist_req) 829 wakeup(&softdep_worklist_req); 830 return (error); 831 } 832 833 /* 834 * Flush all vnodes and worklist items associated with a specified mount point. 835 */ 836 int 837 softdep_flushfiles(oldmnt, flags, td) 838 struct mount *oldmnt; 839 int flags; 840 struct thread *td; 841 { 842 int error, count, loopcnt; 843 844 error = 0; 845 846 /* 847 * Alternately flush the vnodes associated with the mount 848 * point and process any dependencies that the flushing 849 * creates. In theory, this loop can happen at most twice, 850 * but we give it a few extra just to be sure. 851 */ 852 for (loopcnt = 10; loopcnt > 0; loopcnt--) { 853 /* 854 * Do another flush in case any vnodes were brought in 855 * as part of the cleanup operations. 856 */ 857 if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0) 858 break; 859 if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 || 860 count == 0) 861 break; 862 } 863 /* 864 * If we are unmounting then it is an error to fail. If we 865 * are simply trying to downgrade to read-only, then filesystem 866 * activity can keep us busy forever, so we just fail with EBUSY. 867 */ 868 if (loopcnt == 0) { 869 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) 870 panic("softdep_flushfiles: looping"); 871 error = EBUSY; 872 } 873 return (error); 874 } 875 876 /* 877 * Structure hashing. 878 * 879 * There are three types of structures that can be looked up: 880 * 1) pagedep structures identified by mount point, inode number, 881 * and logical block. 882 * 2) inodedep structures identified by mount point and inode number. 883 * 3) newblk structures identified by mount point and 884 * physical block number. 885 * 886 * The "pagedep" and "inodedep" dependency structures are hashed 887 * separately from the file blocks and inodes to which they correspond. 888 * This separation helps when the in-memory copy of an inode or 889 * file block must be replaced. It also obviates the need to access 890 * an inode or file page when simply updating (or de-allocating) 891 * dependency structures. Lookup of newblk structures is needed to 892 * find newly allocated blocks when trying to associate them with 893 * their allocdirect or allocindir structure. 894 * 895 * The lookup routines optionally create and hash a new instance when 896 * an existing entry is not found. 
897 */ 898 #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ 899 #define NODELAY 0x0002 /* cannot do background work */ 900 901 /* 902 * Structures and routines associated with pagedep caching. 903 */ 904 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; 905 u_long pagedep_hash; /* size of hash table - 1 */ 906 #define PAGEDEP_HASH(mp, inum, lbn) \ 907 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \ 908 pagedep_hash]) 909 static struct sema pagedep_in_progress; 910 911 /* 912 * Look up a pagedep. Return 1 if found, 0 if not found or found 913 * when asked to allocate but not associated with any buffer. 914 * If not found, allocate if DEPALLOC flag is passed. 915 * Found or allocated entry is returned in pagedeppp. 916 * This routine must be called with splbio interrupts blocked. 917 */ 918 static int 919 pagedep_lookup(ip, lbn, flags, pagedeppp) 920 struct inode *ip; 921 ufs_lbn_t lbn; 922 int flags; 923 struct pagedep **pagedeppp; 924 { 925 struct pagedep *pagedep; 926 struct pagedep_hashhead *pagedephd; 927 struct mount *mp; 928 int i; 929 930 #ifdef DEBUG 931 if (lk.lkt_held == NOHOLDER) 932 panic("pagedep_lookup: lock not held"); 933 #endif 934 mp = ITOV(ip)->v_mount; 935 pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); 936 top: 937 LIST_FOREACH(pagedep, pagedephd, pd_hash) 938 if (ip->i_number == pagedep->pd_ino && 939 lbn == pagedep->pd_lbn && 940 mp == pagedep->pd_mnt) 941 break; 942 if (pagedep) { 943 *pagedeppp = pagedep; 944 if ((flags & DEPALLOC) != 0 && 945 (pagedep->pd_state & ONWORKLIST) == 0) 946 return (0); 947 return (1); 948 } 949 if ((flags & DEPALLOC) == 0) { 950 *pagedeppp = NULL; 951 return (0); 952 } 953 if (sema_get(&pagedep_in_progress, &lk) == 0) { 954 ACQUIRE_LOCK(&lk); 955 goto top; 956 } 957 MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP, 958 M_SOFTDEP_FLAGS|M_ZERO); 959 pagedep->pd_list.wk_type = D_PAGEDEP; 960 pagedep->pd_mnt = mp; 961 pagedep->pd_ino = ip->i_number; 962 pagedep->pd_lbn = lbn; 963 LIST_INIT(&pagedep->pd_dirremhd); 964 LIST_INIT(&pagedep->pd_pendinghd); 965 for (i = 0; i < DAHASHSZ; i++) 966 LIST_INIT(&pagedep->pd_diraddhd[i]); 967 ACQUIRE_LOCK(&lk); 968 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); 969 sema_release(&pagedep_in_progress); 970 *pagedeppp = pagedep; 971 return (0); 972 } 973 974 /* 975 * Structures and routines associated with inodedep caching. 976 */ 977 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; 978 static u_long inodedep_hash; /* size of hash table - 1 */ 979 static long num_inodedep; /* number of inodedep allocated */ 980 #define INODEDEP_HASH(fs, inum) \ 981 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) 982 static struct sema inodedep_in_progress; 983 984 /* 985 * Look up a inodedep. Return 1 if found, 0 if not found. 986 * If not found, allocate if DEPALLOC flag is passed. 987 * Found or allocated entry is returned in inodedeppp. 988 * This routine must be called with splbio interrupts blocked. 
989 */ 990 static int 991 inodedep_lookup(fs, inum, flags, inodedeppp) 992 struct fs *fs; 993 ino_t inum; 994 int flags; 995 struct inodedep **inodedeppp; 996 { 997 struct inodedep *inodedep; 998 struct inodedep_hashhead *inodedephd; 999 int firsttry; 1000 1001 #ifdef DEBUG 1002 if (lk.lkt_held == NOHOLDER) 1003 panic("inodedep_lookup: lock not held"); 1004 #endif 1005 firsttry = 1; 1006 inodedephd = INODEDEP_HASH(fs, inum); 1007 top: 1008 LIST_FOREACH(inodedep, inodedephd, id_hash) 1009 if (inum == inodedep->id_ino && fs == inodedep->id_fs) 1010 break; 1011 if (inodedep) { 1012 *inodedeppp = inodedep; 1013 return (1); 1014 } 1015 if ((flags & DEPALLOC) == 0) { 1016 *inodedeppp = NULL; 1017 return (0); 1018 } 1019 /* 1020 * If we are over our limit, try to improve the situation. 1021 */ 1022 if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 && 1023 request_cleanup(FLUSH_INODES, 1)) { 1024 firsttry = 0; 1025 goto top; 1026 } 1027 if (sema_get(&inodedep_in_progress, &lk) == 0) { 1028 ACQUIRE_LOCK(&lk); 1029 goto top; 1030 } 1031 num_inodedep += 1; 1032 MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep), 1033 M_INODEDEP, M_SOFTDEP_FLAGS); 1034 inodedep->id_list.wk_type = D_INODEDEP; 1035 inodedep->id_fs = fs; 1036 inodedep->id_ino = inum; 1037 inodedep->id_state = ALLCOMPLETE; 1038 inodedep->id_nlinkdelta = 0; 1039 inodedep->id_savedino = NULL; 1040 inodedep->id_savedsize = -1; 1041 inodedep->id_buf = NULL; 1042 LIST_INIT(&inodedep->id_pendinghd); 1043 LIST_INIT(&inodedep->id_inowait); 1044 LIST_INIT(&inodedep->id_bufwait); 1045 TAILQ_INIT(&inodedep->id_inoupdt); 1046 TAILQ_INIT(&inodedep->id_newinoupdt); 1047 ACQUIRE_LOCK(&lk); 1048 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); 1049 sema_release(&inodedep_in_progress); 1050 *inodedeppp = inodedep; 1051 return (0); 1052 } 1053 1054 /* 1055 * Structures and routines associated with newblk caching. 1056 */ 1057 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; 1058 u_long newblk_hash; /* size of hash table - 1 */ 1059 #define NEWBLK_HASH(fs, inum) \ 1060 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) 1061 static struct sema newblk_in_progress; 1062 1063 /* 1064 * Look up a newblk. Return 1 if found, 0 if not found. 1065 * If not found, allocate if DEPALLOC flag is passed. 1066 * Found or allocated entry is returned in newblkpp. 1067 */ 1068 static int 1069 newblk_lookup(fs, newblkno, flags, newblkpp) 1070 struct fs *fs; 1071 ufs_daddr_t newblkno; 1072 int flags; 1073 struct newblk **newblkpp; 1074 { 1075 struct newblk *newblk; 1076 struct newblk_hashhead *newblkhd; 1077 1078 newblkhd = NEWBLK_HASH(fs, newblkno); 1079 top: 1080 LIST_FOREACH(newblk, newblkhd, nb_hash) 1081 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs) 1082 break; 1083 if (newblk) { 1084 *newblkpp = newblk; 1085 return (1); 1086 } 1087 if ((flags & DEPALLOC) == 0) { 1088 *newblkpp = NULL; 1089 return (0); 1090 } 1091 if (sema_get(&newblk_in_progress, 0) == 0) 1092 goto top; 1093 MALLOC(newblk, struct newblk *, sizeof(struct newblk), 1094 M_NEWBLK, M_SOFTDEP_FLAGS); 1095 newblk->nb_state = 0; 1096 newblk->nb_fs = fs; 1097 newblk->nb_newblkno = newblkno; 1098 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); 1099 sema_release(&newblk_in_progress); 1100 *newblkpp = newblk; 1101 return (0); 1102 } 1103 1104 /* 1105 * Executed during filesystem system initialization before 1106 * mounting any filesystems. 
1107 */ 1108 void 1109 softdep_initialize() 1110 { 1111 1112 LIST_INIT(&mkdirlisthd); 1113 LIST_INIT(&softdep_workitem_pending); 1114 max_softdeps = desiredvnodes * 8; 1115 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, 1116 &pagedep_hash); 1117 sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0); 1118 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); 1119 sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0); 1120 newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); 1121 sema_init(&newblk_in_progress, "newblk", PRIBIO, 0); 1122 1123 /* initialise bioops hack */ 1124 bioops.io_start = softdep_disk_io_initiation; 1125 bioops.io_complete = softdep_disk_write_complete; 1126 bioops.io_deallocate = softdep_deallocate_dependencies; 1127 bioops.io_movedeps = softdep_move_dependencies; 1128 bioops.io_countdeps = softdep_count_dependencies; 1129 } 1130 1131 /* 1132 * Called at mount time to notify the dependency code that a 1133 * filesystem wishes to use it. 1134 */ 1135 int 1136 softdep_mount(devvp, mp, fs, cred) 1137 struct vnode *devvp; 1138 struct mount *mp; 1139 struct fs *fs; 1140 struct ucred *cred; 1141 { 1142 struct csum cstotal; 1143 struct cg *cgp; 1144 struct buf *bp; 1145 int error, cyl; 1146 1147 mp->mnt_flag &= ~MNT_ASYNC; 1148 mp->mnt_flag |= MNT_SOFTDEP; 1149 /* 1150 * When doing soft updates, the counters in the 1151 * superblock may have gotten out of sync, so we have 1152 * to scan the cylinder groups and recalculate them. 1153 */ 1154 if (fs->fs_clean != 0) 1155 return (0); 1156 bzero(&cstotal, sizeof cstotal); 1157 for (cyl = 0; cyl < fs->fs_ncg; cyl++) { 1158 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), 1159 fs->fs_cgsize, cred, &bp)) != 0) { 1160 brelse(bp); 1161 return (error); 1162 } 1163 cgp = (struct cg *)bp->b_data; 1164 cstotal.cs_nffree += cgp->cg_cs.cs_nffree; 1165 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; 1166 cstotal.cs_nifree += cgp->cg_cs.cs_nifree; 1167 cstotal.cs_ndir += cgp->cg_cs.cs_ndir; 1168 fs->fs_cs(fs, cyl) = cgp->cg_cs; 1169 brelse(bp); 1170 } 1171 #ifdef DEBUG 1172 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) 1173 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); 1174 #endif 1175 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); 1176 return (0); 1177 } 1178 1179 /* 1180 * Protecting the freemaps (or bitmaps). 1181 * 1182 * To eliminate the need to execute fsck before mounting a filesystem 1183 * after a power failure, one must (conservatively) guarantee that the 1184 * on-disk copy of the bitmaps never indicate that a live inode or block is 1185 * free. So, when a block or inode is allocated, the bitmap should be 1186 * updated (on disk) before any new pointers. When a block or inode is 1187 * freed, the bitmap should not be updated until all pointers have been 1188 * reset. The latter dependency is handled by the delayed de-allocation 1189 * approach described below for block and inode de-allocation. The former 1190 * dependency is handled by calling the following procedure when a block or 1191 * inode is allocated. When an inode is allocated an "inodedep" is created 1192 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. 1193 * Each "inodedep" is also inserted into the hash indexing structure so 1194 * that any additional link additions can be made dependent on the inode 1195 * allocation. 
1196 * 1197 * The ufs filesystem maintains a number of free block counts (e.g., per 1198 * cylinder group, per cylinder and per <cylinder, rotational position> pair) 1199 * in addition to the bitmaps. These counts are used to improve efficiency 1200 * during allocation and therefore must be consistent with the bitmaps. 1201 * There is no convenient way to guarantee post-crash consistency of these 1202 * counts with simple update ordering, for two main reasons: (1) The counts 1203 * and bitmaps for a single cylinder group block are not in the same disk 1204 * sector. If a disk write is interrupted (e.g., by power failure), one may 1205 * be written and the other not. (2) Some of the counts are located in the 1206 * superblock rather than the cylinder group block. So, we focus our soft 1207 * updates implementation on protecting the bitmaps. When mounting a 1208 * filesystem, we recompute the auxiliary counts from the bitmaps. 1209 */ 1210 1211 /* 1212 * Called just after updating the cylinder group block to allocate an inode. 1213 */ 1214 void 1215 softdep_setup_inomapdep(bp, ip, newinum) 1216 struct buf *bp; /* buffer for cylgroup block with inode map */ 1217 struct inode *ip; /* inode related to allocation */ 1218 ino_t newinum; /* new inode number being allocated */ 1219 { 1220 struct inodedep *inodedep; 1221 struct bmsafemap *bmsafemap; 1222 1223 /* 1224 * Create a dependency for the newly allocated inode. 1225 * Panic if it already exists as something is seriously wrong. 1226 * Otherwise add it to the dependency list for the buffer holding 1227 * the cylinder group map from which it was allocated. 1228 */ 1229 ACQUIRE_LOCK(&lk); 1230 if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep))) { 1231 FREE_LOCK(&lk); 1232 panic("softdep_setup_inomapdep: found inode"); 1233 } 1234 inodedep->id_buf = bp; 1235 inodedep->id_state &= ~DEPCOMPLETE; 1236 bmsafemap = bmsafemap_lookup(bp); 1237 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); 1238 FREE_LOCK(&lk); 1239 } 1240 1241 /* 1242 * Called just after updating the cylinder group block to 1243 * allocate block or fragment. 1244 */ 1245 void 1246 softdep_setup_blkmapdep(bp, fs, newblkno) 1247 struct buf *bp; /* buffer for cylgroup block with block map */ 1248 struct fs *fs; /* filesystem doing allocation */ 1249 ufs_daddr_t newblkno; /* number of newly allocated block */ 1250 { 1251 struct newblk *newblk; 1252 struct bmsafemap *bmsafemap; 1253 1254 /* 1255 * Create a dependency for the newly allocated block. 1256 * Add it to the dependency list for the buffer holding 1257 * the cylinder group map from which it was allocated. 1258 */ 1259 if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0) 1260 panic("softdep_setup_blkmapdep: found block"); 1261 ACQUIRE_LOCK(&lk); 1262 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp); 1263 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 1264 FREE_LOCK(&lk); 1265 } 1266 1267 /* 1268 * Find the bmsafemap associated with a cylinder group buffer. 1269 * If none exists, create one. The buffer must be locked when 1270 * this routine is called and this routine must be called with 1271 * splbio interrupts blocked. 
1272 */ 1273 static struct bmsafemap * 1274 bmsafemap_lookup(bp) 1275 struct buf *bp; 1276 { 1277 struct bmsafemap *bmsafemap; 1278 struct worklist *wk; 1279 1280 #ifdef DEBUG 1281 if (lk.lkt_held == NOHOLDER) 1282 panic("bmsafemap_lookup: lock not held"); 1283 #endif 1284 LIST_FOREACH(wk, &bp->b_dep, wk_list) 1285 if (wk->wk_type == D_BMSAFEMAP) 1286 return (WK_BMSAFEMAP(wk)); 1287 FREE_LOCK(&lk); 1288 MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap), 1289 M_BMSAFEMAP, M_SOFTDEP_FLAGS); 1290 bmsafemap->sm_list.wk_type = D_BMSAFEMAP; 1291 bmsafemap->sm_list.wk_state = 0; 1292 bmsafemap->sm_buf = bp; 1293 LIST_INIT(&bmsafemap->sm_allocdirecthd); 1294 LIST_INIT(&bmsafemap->sm_allocindirhd); 1295 LIST_INIT(&bmsafemap->sm_inodedephd); 1296 LIST_INIT(&bmsafemap->sm_newblkhd); 1297 ACQUIRE_LOCK(&lk); 1298 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 1299 return (bmsafemap); 1300 } 1301 1302 /* 1303 * Direct block allocation dependencies. 1304 * 1305 * When a new block is allocated, the corresponding disk locations must be 1306 * initialized (with zeros or new data) before the on-disk inode points to 1307 * them. Also, the freemap from which the block was allocated must be 1308 * updated (on disk) before the inode's pointer. These two dependencies are 1309 * independent of each other and are needed for all file blocks and indirect 1310 * blocks that are pointed to directly by the inode. Just before the 1311 * "in-core" version of the inode is updated with a newly allocated block 1312 * number, a procedure (below) is called to setup allocation dependency 1313 * structures. These structures are removed when the corresponding 1314 * dependencies are satisfied or when the block allocation becomes obsolete 1315 * (i.e., the file is deleted, the block is de-allocated, or the block is a 1316 * fragment that gets upgraded). All of these cases are handled in 1317 * procedures described later. 1318 * 1319 * When a file extension causes a fragment to be upgraded, either to a larger 1320 * fragment or to a full block, the on-disk location may change (if the 1321 * previous fragment could not simply be extended). In this case, the old 1322 * fragment must be de-allocated, but not until after the inode's pointer has 1323 * been updated. In most cases, this is handled by later procedures, which 1324 * will construct a "freefrag" structure to be added to the workitem queue 1325 * when the inode update is complete (or obsolete). The main exception to 1326 * this is when an allocation occurs while a pending allocation dependency 1327 * (for the same block pointer) remains. This case is handled in the main 1328 * allocation dependency setup procedure by immediately freeing the 1329 * unreferenced fragments. 
1330 */ 1331 void 1332 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 1333 struct inode *ip; /* inode to which block is being added */ 1334 ufs_lbn_t lbn; /* block pointer within inode */ 1335 ufs_daddr_t newblkno; /* disk block number being added */ 1336 ufs_daddr_t oldblkno; /* previous block number, 0 unless frag */ 1337 long newsize; /* size of new block */ 1338 long oldsize; /* size of new block */ 1339 struct buf *bp; /* bp for allocated block */ 1340 { 1341 struct allocdirect *adp, *oldadp; 1342 struct allocdirectlst *adphead; 1343 struct bmsafemap *bmsafemap; 1344 struct inodedep *inodedep; 1345 struct pagedep *pagedep; 1346 struct newblk *newblk; 1347 1348 MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect), 1349 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO); 1350 adp->ad_list.wk_type = D_ALLOCDIRECT; 1351 adp->ad_lbn = lbn; 1352 adp->ad_newblkno = newblkno; 1353 adp->ad_oldblkno = oldblkno; 1354 adp->ad_newsize = newsize; 1355 adp->ad_oldsize = oldsize; 1356 adp->ad_state = ATTACHED; 1357 LIST_INIT(&adp->ad_newdirblk); 1358 if (newblkno == oldblkno) 1359 adp->ad_freefrag = NULL; 1360 else 1361 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); 1362 1363 if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) 1364 panic("softdep_setup_allocdirect: lost block"); 1365 1366 ACQUIRE_LOCK(&lk); 1367 inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep); 1368 adp->ad_inodedep = inodedep; 1369 1370 if (newblk->nb_state == DEPCOMPLETE) { 1371 adp->ad_state |= DEPCOMPLETE; 1372 adp->ad_buf = NULL; 1373 } else { 1374 bmsafemap = newblk->nb_bmsafemap; 1375 adp->ad_buf = bmsafemap->sm_buf; 1376 LIST_REMOVE(newblk, nb_deps); 1377 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); 1378 } 1379 LIST_REMOVE(newblk, nb_hash); 1380 FREE(newblk, M_NEWBLK); 1381 1382 WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); 1383 if (lbn >= NDADDR) { 1384 /* allocating an indirect block */ 1385 if (oldblkno != 0) { 1386 FREE_LOCK(&lk); 1387 panic("softdep_setup_allocdirect: non-zero indir"); 1388 } 1389 } else { 1390 /* 1391 * Allocating a direct block. 1392 * 1393 * If we are allocating a directory block, then we must 1394 * allocate an associated pagedep to track additions and 1395 * deletions. 1396 */ 1397 if ((ip->i_mode & IFMT) == IFDIR && 1398 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) 1399 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 1400 } 1401 /* 1402 * The list of allocdirects must be kept in sorted and ascending 1403 * order so that the rollback routines can quickly determine the 1404 * first uncommitted block (the size of the file stored on disk 1405 * ends at the end of the lowest committed fragment, or if there 1406 * are no fragments, at the end of the highest committed block). 1407 * Since files generally grow, the typical case is that the new 1408 * block is to be added at the end of the list. We speed this 1409 * special case by checking against the last allocdirect in the 1410 * list before laboriously traversing the list looking for the 1411 * insertion point. 
1412 */ 1413 adphead = &inodedep->id_newinoupdt; 1414 oldadp = TAILQ_LAST(adphead, allocdirectlst); 1415 if (oldadp == NULL || oldadp->ad_lbn <= lbn) { 1416 /* insert at end of list */ 1417 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 1418 if (oldadp != NULL && oldadp->ad_lbn == lbn) 1419 allocdirect_merge(adphead, adp, oldadp); 1420 FREE_LOCK(&lk); 1421 return; 1422 } 1423 TAILQ_FOREACH(oldadp, adphead, ad_next) { 1424 if (oldadp->ad_lbn >= lbn) 1425 break; 1426 } 1427 if (oldadp == NULL) { 1428 FREE_LOCK(&lk); 1429 panic("softdep_setup_allocdirect: lost entry"); 1430 } 1431 /* insert in middle of list */ 1432 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 1433 if (oldadp->ad_lbn == lbn) 1434 allocdirect_merge(adphead, adp, oldadp); 1435 FREE_LOCK(&lk); 1436 } 1437 1438 /* 1439 * Replace an old allocdirect dependency with a newer one. 1440 * This routine must be called with splbio interrupts blocked. 1441 */ 1442 static void 1443 allocdirect_merge(adphead, newadp, oldadp) 1444 struct allocdirectlst *adphead; /* head of list holding allocdirects */ 1445 struct allocdirect *newadp; /* allocdirect being added */ 1446 struct allocdirect *oldadp; /* existing allocdirect being checked */ 1447 { 1448 struct worklist *wk; 1449 struct freefrag *freefrag; 1450 struct newdirblk *newdirblk; 1451 1452 #ifdef DEBUG 1453 if (lk.lkt_held == NOHOLDER) 1454 panic("allocdirect_merge: lock not held"); 1455 #endif 1456 if (newadp->ad_oldblkno != oldadp->ad_newblkno || 1457 newadp->ad_oldsize != oldadp->ad_newsize || 1458 newadp->ad_lbn >= NDADDR) { 1459 FREE_LOCK(&lk); 1460 panic("allocdirect_merge: old %d != new %d || lbn %ld >= %d", 1461 newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn, 1462 NDADDR); 1463 } 1464 newadp->ad_oldblkno = oldadp->ad_oldblkno; 1465 newadp->ad_oldsize = oldadp->ad_oldsize; 1466 /* 1467 * If the old dependency had a fragment to free or had never 1468 * previously had a block allocated, then the new dependency 1469 * can immediately post its freefrag and adopt the old freefrag. 1470 * This action is done by swapping the freefrag dependencies. 1471 * The new dependency gains the old one's freefrag, and the 1472 * old one gets the new one and then immediately puts it on 1473 * the worklist when it is freed by free_allocdirect. It is 1474 * not possible to do this swap when the old dependency had a 1475 * non-zero size but no previous fragment to free. This condition 1476 * arises when the new block is an extension of the old block. 1477 * Here, the first part of the fragment allocated to the new 1478 * dependency is part of the block currently claimed on disk by 1479 * the old dependency, so cannot legitimately be freed until the 1480 * conditions for the new dependency are fulfilled. 1481 */ 1482 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { 1483 freefrag = newadp->ad_freefrag; 1484 newadp->ad_freefrag = oldadp->ad_freefrag; 1485 oldadp->ad_freefrag = freefrag; 1486 } 1487 /* 1488 * If we are tracking a new directory-block allocation, 1489 * move it from the old allocdirect to the new allocdirect. 1490 */ 1491 if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { 1492 newdirblk = WK_NEWDIRBLK(wk); 1493 WORKLIST_REMOVE(&newdirblk->db_list); 1494 if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL) 1495 panic("allocdirect_merge: extra newdirblk"); 1496 WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list); 1497 } 1498 free_allocdirect(adphead, oldadp, 0); 1499 } 1500 1501 /* 1502 * Allocate a new freefrag structure if needed. 
1503 */ 1504 static struct freefrag * 1505 newfreefrag(ip, blkno, size) 1506 struct inode *ip; 1507 ufs_daddr_t blkno; 1508 long size; 1509 { 1510 struct freefrag *freefrag; 1511 struct fs *fs; 1512 1513 if (blkno == 0) 1514 return (NULL); 1515 fs = ip->i_fs; 1516 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 1517 panic("newfreefrag: frag size"); 1518 MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag), 1519 M_FREEFRAG, M_SOFTDEP_FLAGS); 1520 freefrag->ff_list.wk_type = D_FREEFRAG; 1521 freefrag->ff_state = 0; 1522 freefrag->ff_inum = ip->i_number; 1523 freefrag->ff_mnt = ITOV(ip)->v_mount; 1524 freefrag->ff_devvp = ip->i_devvp; 1525 freefrag->ff_blkno = blkno; 1526 freefrag->ff_fragsize = size; 1527 return (freefrag); 1528 } 1529 1530 /* 1531 * This workitem de-allocates fragments that were replaced during 1532 * file block allocation. 1533 */ 1534 static void 1535 handle_workitem_freefrag(freefrag) 1536 struct freefrag *freefrag; 1537 { 1538 1539 ffs_blkfree(VFSTOUFS(freefrag->ff_mnt)->um_fs, freefrag->ff_devvp, 1540 freefrag->ff_blkno, freefrag->ff_fragsize, freefrag->ff_inum); 1541 FREE(freefrag, M_FREEFRAG); 1542 } 1543 1544 /* 1545 * Indirect block allocation dependencies. 1546 * 1547 * The same dependencies that exist for a direct block also exist when 1548 * a new block is allocated and pointed to by an entry in a block of 1549 * indirect pointers. The undo/redo states described above are also 1550 * used here. Because an indirect block contains many pointers that 1551 * may have dependencies, a second copy of the entire in-memory indirect 1552 * block is kept. The buffer cache copy is always completely up-to-date. 1553 * The second copy, which is used only as a source for disk writes, 1554 * contains only the safe pointers (i.e., those that have no remaining 1555 * update dependencies). The second copy is freed when all pointers 1556 * are safe. The cache is not allowed to replace indirect blocks with 1557 * pending update dependencies. If a buffer containing an indirect 1558 * block with dependencies is written, these routines will mark it 1559 * dirty again. It can only be successfully written once all the 1560 * dependencies are removed. The ffs_fsync routine in conjunction with 1561 * softdep_sync_metadata work together to get all the dependencies 1562 * removed so that a file can be successfully written to disk. Three 1563 * procedures are used when setting up indirect block pointer 1564 * dependencies. The division is necessary because of the organization 1565 * of the "balloc" routine and because of the distinction between file 1566 * pages and file metadata blocks. 1567 */ 1568 1569 /* 1570 * Allocate a new allocindir structure. 
1571 */ 1572 static struct allocindir * 1573 newallocindir(ip, ptrno, newblkno, oldblkno) 1574 struct inode *ip; /* inode for file being extended */ 1575 int ptrno; /* offset of pointer in indirect block */ 1576 ufs_daddr_t newblkno; /* disk block number being added */ 1577 ufs_daddr_t oldblkno; /* previous block number, 0 if none */ 1578 { 1579 struct allocindir *aip; 1580 1581 MALLOC(aip, struct allocindir *, sizeof(struct allocindir), 1582 M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO); 1583 aip->ai_list.wk_type = D_ALLOCINDIR; 1584 aip->ai_state = ATTACHED; 1585 aip->ai_offset = ptrno; 1586 aip->ai_newblkno = newblkno; 1587 aip->ai_oldblkno = oldblkno; 1588 aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize); 1589 return (aip); 1590 } 1591 1592 /* 1593 * Called just before setting an indirect block pointer 1594 * to a newly allocated file page. 1595 */ 1596 void 1597 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 1598 struct inode *ip; /* inode for file being extended */ 1599 ufs_lbn_t lbn; /* allocated block number within file */ 1600 struct buf *bp; /* buffer with indirect blk referencing page */ 1601 int ptrno; /* offset of pointer in indirect block */ 1602 ufs_daddr_t newblkno; /* disk block number being added */ 1603 ufs_daddr_t oldblkno; /* previous block number, 0 if none */ 1604 struct buf *nbp; /* buffer holding allocated page */ 1605 { 1606 struct allocindir *aip; 1607 struct pagedep *pagedep; 1608 1609 aip = newallocindir(ip, ptrno, newblkno, oldblkno); 1610 ACQUIRE_LOCK(&lk); 1611 /* 1612 * If we are allocating a directory page, then we must 1613 * allocate an associated pagedep to track additions and 1614 * deletions. 1615 */ 1616 if ((ip->i_mode & IFMT) == IFDIR && 1617 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) 1618 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); 1619 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); 1620 FREE_LOCK(&lk); 1621 setup_allocindir_phase2(bp, ip, aip); 1622 } 1623 1624 /* 1625 * Called just before setting an indirect block pointer to a 1626 * newly allocated indirect block. 1627 */ 1628 void 1629 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 1630 struct buf *nbp; /* newly allocated indirect block */ 1631 struct inode *ip; /* inode for file being extended */ 1632 struct buf *bp; /* indirect block referencing allocated block */ 1633 int ptrno; /* offset of pointer in indirect block */ 1634 ufs_daddr_t newblkno; /* disk block number being added */ 1635 { 1636 struct allocindir *aip; 1637 1638 aip = newallocindir(ip, ptrno, newblkno, 0); 1639 ACQUIRE_LOCK(&lk); 1640 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); 1641 FREE_LOCK(&lk); 1642 setup_allocindir_phase2(bp, ip, aip); 1643 } 1644 1645 /* 1646 * Called to finish the allocation of the "aip" allocated 1647 * by one of the two routines above. 
1648 */ 1649 static void 1650 setup_allocindir_phase2(bp, ip, aip) 1651 struct buf *bp; /* in-memory copy of the indirect block */ 1652 struct inode *ip; /* inode for file being extended */ 1653 struct allocindir *aip; /* allocindir allocated by the above routines */ 1654 { 1655 struct worklist *wk; 1656 struct indirdep *indirdep, *newindirdep; 1657 struct bmsafemap *bmsafemap; 1658 struct allocindir *oldaip; 1659 struct freefrag *freefrag; 1660 struct newblk *newblk; 1661 ufs_daddr_t blkno; 1662 1663 if (bp->b_lblkno >= 0) 1664 panic("setup_allocindir_phase2: not indir blk"); 1665 for (indirdep = NULL, newindirdep = NULL; ; ) { 1666 ACQUIRE_LOCK(&lk); 1667 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 1668 if (wk->wk_type != D_INDIRDEP) 1669 continue; 1670 indirdep = WK_INDIRDEP(wk); 1671 break; 1672 } 1673 if (indirdep == NULL && newindirdep) { 1674 indirdep = newindirdep; 1675 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); 1676 newindirdep = NULL; 1677 } 1678 FREE_LOCK(&lk); 1679 if (indirdep) { 1680 if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, 1681 &newblk) == 0) 1682 panic("setup_allocindir: lost block"); 1683 ACQUIRE_LOCK(&lk); 1684 if (newblk->nb_state == DEPCOMPLETE) { 1685 aip->ai_state |= DEPCOMPLETE; 1686 aip->ai_buf = NULL; 1687 } else { 1688 bmsafemap = newblk->nb_bmsafemap; 1689 aip->ai_buf = bmsafemap->sm_buf; 1690 LIST_REMOVE(newblk, nb_deps); 1691 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, 1692 aip, ai_deps); 1693 } 1694 LIST_REMOVE(newblk, nb_hash); 1695 FREE(newblk, M_NEWBLK); 1696 aip->ai_indirdep = indirdep; 1697 /* 1698 * Check to see if there is an existing dependency 1699 * for this block. If there is, merge the old 1700 * dependency into the new one. 1701 */ 1702 if (aip->ai_oldblkno == 0) 1703 oldaip = NULL; 1704 else 1705 1706 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) 1707 if (oldaip->ai_offset == aip->ai_offset) 1708 break; 1709 freefrag = NULL; 1710 if (oldaip != NULL) { 1711 if (oldaip->ai_newblkno != aip->ai_oldblkno) { 1712 FREE_LOCK(&lk); 1713 panic("setup_allocindir_phase2: blkno"); 1714 } 1715 aip->ai_oldblkno = oldaip->ai_oldblkno; 1716 freefrag = aip->ai_freefrag; 1717 aip->ai_freefrag = oldaip->ai_freefrag; 1718 oldaip->ai_freefrag = NULL; 1719 free_allocindir(oldaip, NULL); 1720 } 1721 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); 1722 ((ufs_daddr_t *)indirdep->ir_savebp->b_data) 1723 [aip->ai_offset] = aip->ai_oldblkno; 1724 FREE_LOCK(&lk); 1725 if (freefrag != NULL) 1726 handle_workitem_freefrag(freefrag); 1727 } 1728 if (newindirdep) { 1729 if (indirdep->ir_savebp != NULL) 1730 brelse(newindirdep->ir_savebp); 1731 WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP); 1732 } 1733 if (indirdep) 1734 break; 1735 MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep), 1736 M_INDIRDEP, M_SOFTDEP_FLAGS); 1737 newindirdep->ir_list.wk_type = D_INDIRDEP; 1738 newindirdep->ir_state = ATTACHED; 1739 LIST_INIT(&newindirdep->ir_deplisthd); 1740 LIST_INIT(&newindirdep->ir_donehd); 1741 if (bp->b_blkno == bp->b_lblkno) { 1742 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, NULL, NULL); 1743 bp->b_blkno = blkno; 1744 } 1745 newindirdep->ir_savebp = 1746 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0); 1747 BUF_KERNPROC(newindirdep->ir_savebp); 1748 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); 1749 } 1750 } 1751 1752 /* 1753 * Block de-allocation dependencies. 1754 * 1755 * When blocks are de-allocated, the on-disk pointers must be nullified before 1756 * the blocks are made available for use by other files. 
(The true
1757 * requirement is that old pointers must be nullified before new on-disk
1758 * pointers are set. We chose this slightly more stringent requirement to
1759 * reduce complexity.) Our implementation handles this dependency by updating
1760 * the inode (or indirect block) appropriately but delaying the actual block
1761 * de-allocation (i.e., freemap and free space count manipulation) until
1762 * after the updated versions reach stable storage. After the disk is
1763 * updated, the blocks can be safely de-allocated whenever it is convenient.
1764 * This implementation handles only the common case of reducing a file's
1765 * length to zero. Other cases are handled by the conventional synchronous
1766 * write approach.
1767 *
1768 * The ffs implementation with which we worked double-checks
1769 * the state of the block pointers and file size as it reduces
1770 * a file's length. Some of this code is replicated here in our
1771 * soft updates implementation. The freeblks->fb_chkcnt field is
1772 * used to transfer a part of this information to the procedure
1773 * that eventually de-allocates the blocks.
1774 *
1775 * This routine should be called from the routine that shortens
1776 * a file's length, before the inode's size or block pointers
1777 * are modified. It will save the block pointer information for
1778 * later release and zero the inode so that the calling routine
1779 * can release it.
1780 */
1781 void
1782 softdep_setup_freeblocks(ip, length)
1783 struct inode *ip; /* The inode whose length is to be reduced */
1784 off_t length; /* The new length for the file */
1785 {
1786 struct freeblks *freeblks;
1787 struct inodedep *inodedep;
1788 struct allocdirect *adp;
1789 struct vnode *vp;
1790 struct buf *bp;
1791 struct fs *fs;
1792 int i, delay, error;
1793
1794 fs = ip->i_fs;
1795 if (length != 0)
1796 panic("softdep_setup_freeblocks: non-zero length");
1797 MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
1798 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
1799 freeblks->fb_list.wk_type = D_FREEBLKS;
1800 freeblks->fb_uid = ip->i_uid;
1801 freeblks->fb_previousinum = ip->i_number;
1802 freeblks->fb_devvp = ip->i_devvp;
1803 freeblks->fb_mnt = ITOV(ip)->v_mount;
1804 freeblks->fb_oldsize = ip->i_size;
1805 freeblks->fb_newsize = length;
1806 freeblks->fb_chkcnt = ip->i_blocks;
1807 for (i = 0; i < NDADDR; i++) {
1808 freeblks->fb_dblks[i] = ip->i_db[i];
1809 ip->i_db[i] = 0;
1810 }
1811 for (i = 0; i < NIADDR; i++) {
1812 freeblks->fb_iblks[i] = ip->i_ib[i];
1813 ip->i_ib[i] = 0;
1814 }
1815 ip->i_blocks = 0;
1816 ip->i_size = 0;
1817 /*
1818 * If the file was removed, then the space being freed was
1819 * accounted for then (see softdep_releasefile()). If the
1820 * file is merely being truncated, then we account for it now.
1821 */
1822 if ((ip->i_flag & IN_SPACECOUNTED) == 0)
1823 fs->fs_pendingblocks += freeblks->fb_chkcnt;
1824 /*
1825 * Push the zero'ed inode to its disk buffer so that we are free
1826 * to delete its dependencies below. Once the dependencies are gone
1827 * the buffer can be safely released.
1828 */
1829 if ((error = bread(ip->i_devvp,
1830 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1831 (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
1832 brelse(bp);
1833 softdep_error("softdep_setup_freeblocks", error);
1834 }
1835 *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
1836 ip->i_din;
1837 /*
1838 * Find and eliminate any inode dependencies.
1839 */ 1840 ACQUIRE_LOCK(&lk); 1841 (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep); 1842 if ((inodedep->id_state & IOSTARTED) != 0) { 1843 FREE_LOCK(&lk); 1844 panic("softdep_setup_freeblocks: inode busy"); 1845 } 1846 /* 1847 * Add the freeblks structure to the list of operations that 1848 * must await the zero'ed inode being written to disk. If we 1849 * still have a bitmap dependency (delay == 0), then the inode 1850 * has never been written to disk, so we can process the 1851 * freeblks below once we have deleted the dependencies. 1852 */ 1853 delay = (inodedep->id_state & DEPCOMPLETE); 1854 if (delay) 1855 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); 1856 /* 1857 * Because the file length has been truncated to zero, any 1858 * pending block allocation dependency structures associated 1859 * with this inode are obsolete and can simply be de-allocated. 1860 * We must first merge the two dependency lists to get rid of 1861 * any duplicate freefrag structures, then purge the merged list. 1862 * If we still have a bitmap dependency, then the inode has never 1863 * been written to disk, so we can free any fragments without delay. 1864 */ 1865 merge_inode_lists(inodedep); 1866 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) 1867 free_allocdirect(&inodedep->id_inoupdt, adp, delay); 1868 FREE_LOCK(&lk); 1869 bdwrite(bp); 1870 /* 1871 * We must wait for any I/O in progress to finish so that 1872 * all potential buffers on the dirty list will be visible. 1873 * Once they are all there, walk the list and get rid of 1874 * any dependencies. 1875 */ 1876 vp = ITOV(ip); 1877 ACQUIRE_LOCK(&lk); 1878 drain_output(vp, 1); 1879 while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) { 1880 bp = TAILQ_FIRST(&vp->v_dirtyblkhd); 1881 (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep); 1882 deallocate_dependencies(bp, inodedep); 1883 bp->b_flags |= B_INVAL | B_NOCACHE; 1884 FREE_LOCK(&lk); 1885 brelse(bp); 1886 ACQUIRE_LOCK(&lk); 1887 } 1888 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0) 1889 (void) free_inodedep(inodedep); 1890 FREE_LOCK(&lk); 1891 /* 1892 * If the inode has never been written to disk (delay == 0), 1893 * then we can process the freeblks now that we have deleted 1894 * the dependencies. 1895 */ 1896 if (!delay) 1897 handle_workitem_freeblocks(freeblks, 0); 1898 } 1899 1900 /* 1901 * Reclaim any dependency structures from a buffer that is about to 1902 * be reallocated to a new vnode. The buffer must be locked, thus, 1903 * no I/O completion operations can occur while we are manipulating 1904 * its associated dependencies. The mutex is held so that other I/O's 1905 * associated with related dependencies do not occur. 1906 */ 1907 static void 1908 deallocate_dependencies(bp, inodedep) 1909 struct buf *bp; 1910 struct inodedep *inodedep; 1911 { 1912 struct worklist *wk; 1913 struct indirdep *indirdep; 1914 struct allocindir *aip; 1915 struct pagedep *pagedep; 1916 struct dirrem *dirrem; 1917 struct diradd *dap; 1918 int i; 1919 1920 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 1921 switch (wk->wk_type) { 1922 1923 case D_INDIRDEP: 1924 indirdep = WK_INDIRDEP(wk); 1925 /* 1926 * None of the indirect pointers will ever be visible, 1927 * so they can simply be tossed. GOINGAWAY ensures 1928 * that allocated pointers will be saved in the buffer 1929 * cache until they are freed. Note that they will 1930 * only be able to be found by their physical address 1931 * since the inode mapping the logical address will 1932 * be gone. 
The save buffer used for the safe copy 1933 * was allocated in setup_allocindir_phase2 using 1934 * the physical address so it could be used for this 1935 * purpose. Hence we swap the safe copy with the real 1936 * copy, allowing the safe copy to be freed and holding 1937 * on to the real copy for later use in indir_trunc. 1938 */ 1939 if (indirdep->ir_state & GOINGAWAY) { 1940 FREE_LOCK(&lk); 1941 panic("deallocate_dependencies: already gone"); 1942 } 1943 indirdep->ir_state |= GOINGAWAY; 1944 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) 1945 free_allocindir(aip, inodedep); 1946 if (bp->b_lblkno >= 0 || 1947 bp->b_blkno != indirdep->ir_savebp->b_lblkno) { 1948 FREE_LOCK(&lk); 1949 panic("deallocate_dependencies: not indir"); 1950 } 1951 bcopy(bp->b_data, indirdep->ir_savebp->b_data, 1952 bp->b_bcount); 1953 WORKLIST_REMOVE(wk); 1954 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk); 1955 continue; 1956 1957 case D_PAGEDEP: 1958 pagedep = WK_PAGEDEP(wk); 1959 /* 1960 * None of the directory additions will ever be 1961 * visible, so they can simply be tossed. 1962 */ 1963 for (i = 0; i < DAHASHSZ; i++) 1964 while ((dap = 1965 LIST_FIRST(&pagedep->pd_diraddhd[i]))) 1966 free_diradd(dap); 1967 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) 1968 free_diradd(dap); 1969 /* 1970 * Copy any directory remove dependencies to the list 1971 * to be processed after the zero'ed inode is written. 1972 * If the inode has already been written, then they 1973 * can be dumped directly onto the work list. 1974 */ 1975 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { 1976 LIST_REMOVE(dirrem, dm_next); 1977 dirrem->dm_dirinum = pagedep->pd_ino; 1978 if (inodedep == NULL || 1979 (inodedep->id_state & ALLCOMPLETE) == 1980 ALLCOMPLETE) 1981 add_to_worklist(&dirrem->dm_list); 1982 else 1983 WORKLIST_INSERT(&inodedep->id_bufwait, 1984 &dirrem->dm_list); 1985 } 1986 if ((pagedep->pd_state & NEWBLOCK) != 0) { 1987 LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list) 1988 if (wk->wk_type == D_NEWDIRBLK && 1989 WK_NEWDIRBLK(wk)->db_pagedep == 1990 pagedep) 1991 break; 1992 if (wk != NULL) { 1993 WORKLIST_REMOVE(wk); 1994 free_newdirblk(WK_NEWDIRBLK(wk)); 1995 } else { 1996 FREE_LOCK(&lk); 1997 panic("deallocate_dependencies: " 1998 "lost pagedep"); 1999 } 2000 } 2001 WORKLIST_REMOVE(&pagedep->pd_list); 2002 LIST_REMOVE(pagedep, pd_hash); 2003 WORKITEM_FREE(pagedep, D_PAGEDEP); 2004 continue; 2005 2006 case D_ALLOCINDIR: 2007 free_allocindir(WK_ALLOCINDIR(wk), inodedep); 2008 continue; 2009 2010 case D_ALLOCDIRECT: 2011 case D_INODEDEP: 2012 FREE_LOCK(&lk); 2013 panic("deallocate_dependencies: Unexpected type %s", 2014 TYPENAME(wk->wk_type)); 2015 /* NOTREACHED */ 2016 2017 default: 2018 FREE_LOCK(&lk); 2019 panic("deallocate_dependencies: Unknown type %s", 2020 TYPENAME(wk->wk_type)); 2021 /* NOTREACHED */ 2022 } 2023 } 2024 } 2025 2026 /* 2027 * Free an allocdirect. Generate a new freefrag work request if appropriate. 2028 * This routine must be called with splbio interrupts blocked. 
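 * The "delay" argument mirrors its use in softdep_setup_freeblocks()
 * above: when non-zero, any freefrag and newdirblk work items hanging
 * off the allocdirect are parked on the inodedep's id_bufwait list so
 * that they are not processed until the zero'ed inode has been written;
 * when zero they are queued on the work list (or freed) immediately.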
2029 */ 2030 static void 2031 free_allocdirect(adphead, adp, delay) 2032 struct allocdirectlst *adphead; 2033 struct allocdirect *adp; 2034 int delay; 2035 { 2036 struct newdirblk *newdirblk; 2037 struct worklist *wk; 2038 2039 #ifdef DEBUG 2040 if (lk.lkt_held == NOHOLDER) 2041 panic("free_allocdirect: lock not held"); 2042 #endif 2043 if ((adp->ad_state & DEPCOMPLETE) == 0) 2044 LIST_REMOVE(adp, ad_deps); 2045 TAILQ_REMOVE(adphead, adp, ad_next); 2046 if ((adp->ad_state & COMPLETE) == 0) 2047 WORKLIST_REMOVE(&adp->ad_list); 2048 if (adp->ad_freefrag != NULL) { 2049 if (delay) 2050 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, 2051 &adp->ad_freefrag->ff_list); 2052 else 2053 add_to_worklist(&adp->ad_freefrag->ff_list); 2054 } 2055 if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) { 2056 newdirblk = WK_NEWDIRBLK(wk); 2057 WORKLIST_REMOVE(&newdirblk->db_list); 2058 if (LIST_FIRST(&adp->ad_newdirblk) != NULL) 2059 panic("free_allocdirect: extra newdirblk"); 2060 if (delay) 2061 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, 2062 &newdirblk->db_list); 2063 else 2064 free_newdirblk(newdirblk); 2065 } 2066 WORKITEM_FREE(adp, D_ALLOCDIRECT); 2067 } 2068 2069 /* 2070 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. 2071 * This routine must be called with splbio interrupts blocked. 2072 */ 2073 static void 2074 free_newdirblk(newdirblk) 2075 struct newdirblk *newdirblk; 2076 { 2077 struct pagedep *pagedep; 2078 struct diradd *dap; 2079 int i; 2080 2081 #ifdef DEBUG 2082 if (lk.lkt_held == NOHOLDER) 2083 panic("free_newdirblk: lock not held"); 2084 #endif 2085 /* 2086 * If the pagedep is still linked onto the directory buffer 2087 * dependency chain, then some of the entries on the 2088 * pd_pendinghd list may not be committed to disk yet. In 2089 * this case, we will simply clear the NEWBLOCK flag and 2090 * let the pd_pendinghd list be processed when the pagedep 2091 * is next written. If the pagedep is no longer on the buffer 2092 * dependency chain, then all the entries on the pd_pending 2093 * list are committed to disk and we can free them here. 2094 */ 2095 pagedep = newdirblk->db_pagedep; 2096 pagedep->pd_state &= ~NEWBLOCK; 2097 if ((pagedep->pd_state & ONWORKLIST) == 0) 2098 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 2099 free_diradd(dap); 2100 /* 2101 * If no dependencies remain, the pagedep will be freed. 2102 */ 2103 for (i = 0; i < DAHASHSZ; i++) 2104 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL) 2105 break; 2106 if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) { 2107 LIST_REMOVE(pagedep, pd_hash); 2108 WORKITEM_FREE(pagedep, D_PAGEDEP); 2109 } 2110 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 2111 } 2112 2113 /* 2114 * Prepare an inode to be freed. The actual free operation is not 2115 * done until the zero'ed inode has been written to disk. 2116 */ 2117 void 2118 softdep_freefile(pvp, ino, mode) 2119 struct vnode *pvp; 2120 ino_t ino; 2121 int mode; 2122 { 2123 struct inode *ip = VTOI(pvp); 2124 struct inodedep *inodedep; 2125 struct freefile *freefile; 2126 2127 /* 2128 * This sets up the inode de-allocation dependency. 
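 * (Sketch only, not the literal source: the expected caller is the
 * inode free path, roughly
 *
 *	if (DOINGSOFTDEP(pvp)) {
 *		softdep_freefile(pvp, ino, mode);
 *		return (0);
 *	}
 *	return (ffs_freefile(fs, devvp, ino, mode));
 *
 * so the cylinder group bitmap update done by ffs_freefile() is
 * deferred until the zero'ed on-disk inode is safely written.)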
2129 */ 2130 MALLOC(freefile, struct freefile *, sizeof(struct freefile), 2131 M_FREEFILE, M_SOFTDEP_FLAGS); 2132 freefile->fx_list.wk_type = D_FREEFILE; 2133 freefile->fx_list.wk_state = 0; 2134 freefile->fx_mode = mode; 2135 freefile->fx_oldinum = ino; 2136 freefile->fx_devvp = ip->i_devvp; 2137 freefile->fx_mnt = ITOV(ip)->v_mount; 2138 if ((ip->i_flag & IN_SPACECOUNTED) == 0) 2139 ip->i_fs->fs_pendinginodes += 1; 2140 2141 /* 2142 * If the inodedep does not exist, then the zero'ed inode has 2143 * been written to disk. If the allocated inode has never been 2144 * written to disk, then the on-disk inode is zero'ed. In either 2145 * case we can free the file immediately. 2146 */ 2147 ACQUIRE_LOCK(&lk); 2148 if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 || 2149 check_inode_unwritten(inodedep)) { 2150 FREE_LOCK(&lk); 2151 handle_workitem_freefile(freefile); 2152 return; 2153 } 2154 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); 2155 FREE_LOCK(&lk); 2156 } 2157 2158 /* 2159 * Check to see if an inode has never been written to disk. If 2160 * so free the inodedep and return success, otherwise return failure. 2161 * This routine must be called with splbio interrupts blocked. 2162 * 2163 * If we still have a bitmap dependency, then the inode has never 2164 * been written to disk. Drop the dependency as it is no longer 2165 * necessary since the inode is being deallocated. We set the 2166 * ALLCOMPLETE flags since the bitmap now properly shows that the 2167 * inode is not allocated. Even if the inode is actively being 2168 * written, it has been rolled back to its zero'ed state, so we 2169 * are ensured that a zero inode is what is on the disk. For short 2170 * lived files, this change will usually result in removing all the 2171 * dependencies from the inode so that it can be freed immediately. 2172 */ 2173 static int 2174 check_inode_unwritten(inodedep) 2175 struct inodedep *inodedep; 2176 { 2177 2178 if ((inodedep->id_state & DEPCOMPLETE) != 0 || 2179 LIST_FIRST(&inodedep->id_pendinghd) != NULL || 2180 LIST_FIRST(&inodedep->id_bufwait) != NULL || 2181 LIST_FIRST(&inodedep->id_inowait) != NULL || 2182 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || 2183 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL || 2184 inodedep->id_nlinkdelta != 0) 2185 return (0); 2186 inodedep->id_state |= ALLCOMPLETE; 2187 LIST_REMOVE(inodedep, id_deps); 2188 inodedep->id_buf = NULL; 2189 if (inodedep->id_state & ONWORKLIST) 2190 WORKLIST_REMOVE(&inodedep->id_list); 2191 if (inodedep->id_savedino != NULL) { 2192 FREE(inodedep->id_savedino, M_INODEDEP); 2193 inodedep->id_savedino = NULL; 2194 } 2195 if (free_inodedep(inodedep) == 0) { 2196 FREE_LOCK(&lk); 2197 panic("check_inode_unwritten: busy inode"); 2198 } 2199 return (1); 2200 } 2201 2202 /* 2203 * Try to free an inodedep structure. Return 1 if it could be freed. 
2204 */
2205 static int
2206 free_inodedep(inodedep)
2207 struct inodedep *inodedep;
2208 {
2209
2210 if ((inodedep->id_state & ONWORKLIST) != 0 ||
2211 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2212 LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2213 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2214 LIST_FIRST(&inodedep->id_inowait) != NULL ||
2215 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2216 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2217 inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
2218 return (0);
2219 LIST_REMOVE(inodedep, id_hash);
2220 WORKITEM_FREE(inodedep, D_INODEDEP);
2221 num_inodedep -= 1;
2222 return (1);
2223 }
2224
2225 /*
2226 * This workitem routine performs the block de-allocation.
2227 * The workitem is added to the pending list after the updated
2228 * inode block has been written to disk. As mentioned above,
2229 * checks regarding the number of blocks de-allocated (compared
2230 * to the number of blocks allocated for the file) are also
2231 * performed in this function.
2232 */
2233 static void
2234 handle_workitem_freeblocks(freeblks, flags)
2235 struct freeblks *freeblks;
2236 int flags;
2237 {
2238 struct inode *ip;
2239 struct vnode *vp;
2240 ufs_daddr_t bn;
2241 struct fs *fs;
2242 int i, level, bsize;
2243 long nblocks, blocksreleased = 0;
2244 int error, allerror = 0;
2245 ufs_lbn_t baselbns[NIADDR], tmpval;
2246
2247 fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
2248 tmpval = 1;
2249 baselbns[0] = NDADDR;
2250 for (i = 1; i < NIADDR; i++) {
2251 tmpval *= NINDIR(fs);
2252 baselbns[i] = baselbns[i - 1] + tmpval;
2253 }
2254 nblocks = btodb(fs->fs_bsize);
2255 blocksreleased = 0;
2256 /*
2257 * Indirect blocks first.
2258 */
2259 for (level = (NIADDR - 1); level >= 0; level--) {
2260 if ((bn = freeblks->fb_iblks[level]) == 0)
2261 continue;
2262 if ((error = indir_trunc(freeblks, fsbtodb(fs, bn), level,
2263 baselbns[level], &blocksreleased)) != 0)
2264 allerror = error;
2265 ffs_blkfree(fs, freeblks->fb_devvp, bn, fs->fs_bsize,
2266 freeblks->fb_previousinum);
2267 fs->fs_pendingblocks -= nblocks;
2268 blocksreleased += nblocks;
2269 }
2270 /*
2271 * All direct blocks or frags.
2272 */
2273 for (i = (NDADDR - 1); i >= 0; i--) {
2274 if ((bn = freeblks->fb_dblks[i]) == 0)
2275 continue;
2276 bsize = sblksize(fs, freeblks->fb_oldsize, i);
2277 ffs_blkfree(fs, freeblks->fb_devvp, bn, bsize,
2278 freeblks->fb_previousinum);
2279 fs->fs_pendingblocks -= btodb(bsize);
2280 blocksreleased += btodb(bsize);
2281 }
2282 /*
2283 * If we still have not finished background cleanup, then check
2284 * to see if the block count needs to be adjusted.
2285 */
2286 if (freeblks->fb_chkcnt != blocksreleased &&
2287 (fs->fs_flags & FS_UNCLEAN) != 0 &&
2288 VFS_VGET(freeblks->fb_mnt, freeblks->fb_previousinum,
2289 (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) {
2290 ip = VTOI(vp);
2291 ip->i_blocks += freeblks->fb_chkcnt - blocksreleased;
2292 ip->i_flag |= IN_CHANGE;
2293 vput(vp);
2294 }
2295
2296 #ifdef DIAGNOSTIC
2297 if (freeblks->fb_chkcnt != blocksreleased &&
2298 ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
2299 printf("handle_workitem_freeblocks: block count\n");
2300 if (allerror)
2301 softdep_error("handle_workitem_freeblocks", allerror);
2302 #endif /* DIAGNOSTIC */
2303
2304 WORKITEM_FREE(freeblks, D_FREEBLKS);
2305 }
2306
2307 /*
2308 * Release blocks associated with the inode ip and stored in the indirect
2309 * block dbn.
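 * The lbn argument gives the logical block number covered by the first
 * pointer reachable through dbn; for a hypothetical UFS1 filesystem with
 * 8K blocks (NINDIR(fs) == 8192 / sizeof(ufs_daddr_t) == 2048 and
 * NDADDR == 12) the baselbns[] values computed in
 * handle_workitem_freeblocks() above are 12, 12 + 2048, and
 * 12 + 2048 + 2048*2048 for the single, double, and triple indirect
 * blocks respectively.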
If level is greater than SINGLE, the block is an indirect block 2310 * and recursive calls to indirtrunc must be used to cleanse other indirect 2311 * blocks. 2312 */ 2313 static int 2314 indir_trunc(freeblks, dbn, level, lbn, countp) 2315 struct freeblks *freeblks; 2316 ufs_daddr_t dbn; 2317 int level; 2318 ufs_lbn_t lbn; 2319 long *countp; 2320 { 2321 struct buf *bp; 2322 ufs_daddr_t *bap; 2323 ufs_daddr_t nb; 2324 struct fs *fs; 2325 struct worklist *wk; 2326 struct indirdep *indirdep; 2327 int i, lbnadd, nblocks; 2328 int error, allerror = 0; 2329 2330 fs = VFSTOUFS(freeblks->fb_mnt)->um_fs; 2331 lbnadd = 1; 2332 for (i = level; i > 0; i--) 2333 lbnadd *= NINDIR(fs); 2334 /* 2335 * Get buffer of block pointers to be freed. This routine is not 2336 * called until the zero'ed inode has been written, so it is safe 2337 * to free blocks as they are encountered. Because the inode has 2338 * been zero'ed, calls to bmap on these blocks will fail. So, we 2339 * have to use the on-disk address and the block device for the 2340 * filesystem to look them up. If the file was deleted before its 2341 * indirect blocks were all written to disk, the routine that set 2342 * us up (deallocate_dependencies) will have arranged to leave 2343 * a complete copy of the indirect block in memory for our use. 2344 * Otherwise we have to read the blocks in from the disk. 2345 */ 2346 ACQUIRE_LOCK(&lk); 2347 if ((bp = incore(freeblks->fb_devvp, dbn)) != NULL && 2348 (wk = LIST_FIRST(&bp->b_dep)) != NULL) { 2349 if (wk->wk_type != D_INDIRDEP || 2350 (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp || 2351 (indirdep->ir_state & GOINGAWAY) == 0) { 2352 FREE_LOCK(&lk); 2353 panic("indir_trunc: lost indirdep"); 2354 } 2355 WORKLIST_REMOVE(wk); 2356 WORKITEM_FREE(indirdep, D_INDIRDEP); 2357 if (LIST_FIRST(&bp->b_dep) != NULL) { 2358 FREE_LOCK(&lk); 2359 panic("indir_trunc: dangling dep"); 2360 } 2361 FREE_LOCK(&lk); 2362 } else { 2363 FREE_LOCK(&lk); 2364 error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 2365 NOCRED, &bp); 2366 if (error) { 2367 brelse(bp); 2368 return (error); 2369 } 2370 } 2371 /* 2372 * Recursively free indirect blocks. 2373 */ 2374 bap = (ufs_daddr_t *)bp->b_data; 2375 nblocks = btodb(fs->fs_bsize); 2376 for (i = NINDIR(fs) - 1; i >= 0; i--) { 2377 if ((nb = bap[i]) == 0) 2378 continue; 2379 if (level != 0) { 2380 if ((error = indir_trunc(freeblks, fsbtodb(fs, nb), 2381 level - 1, lbn + (i * lbnadd), countp)) != 0) 2382 allerror = error; 2383 } 2384 ffs_blkfree(fs, freeblks->fb_devvp, nb, fs->fs_bsize, 2385 freeblks->fb_previousinum); 2386 fs->fs_pendingblocks -= nblocks; 2387 *countp += nblocks; 2388 } 2389 bp->b_flags |= B_INVAL | B_NOCACHE; 2390 brelse(bp); 2391 return (allerror); 2392 } 2393 2394 /* 2395 * Free an allocindir. 2396 * This routine must be called with splbio interrupts blocked. 
2397 */ 2398 static void 2399 free_allocindir(aip, inodedep) 2400 struct allocindir *aip; 2401 struct inodedep *inodedep; 2402 { 2403 struct freefrag *freefrag; 2404 2405 #ifdef DEBUG 2406 if (lk.lkt_held == NOHOLDER) 2407 panic("free_allocindir: lock not held"); 2408 #endif 2409 if ((aip->ai_state & DEPCOMPLETE) == 0) 2410 LIST_REMOVE(aip, ai_deps); 2411 if (aip->ai_state & ONWORKLIST) 2412 WORKLIST_REMOVE(&aip->ai_list); 2413 LIST_REMOVE(aip, ai_next); 2414 if ((freefrag = aip->ai_freefrag) != NULL) { 2415 if (inodedep == NULL) 2416 add_to_worklist(&freefrag->ff_list); 2417 else 2418 WORKLIST_INSERT(&inodedep->id_bufwait, 2419 &freefrag->ff_list); 2420 } 2421 WORKITEM_FREE(aip, D_ALLOCINDIR); 2422 } 2423 2424 /* 2425 * Directory entry addition dependencies. 2426 * 2427 * When adding a new directory entry, the inode (with its incremented link 2428 * count) must be written to disk before the directory entry's pointer to it. 2429 * Also, if the inode is newly allocated, the corresponding freemap must be 2430 * updated (on disk) before the directory entry's pointer. These requirements 2431 * are met via undo/redo on the directory entry's pointer, which consists 2432 * simply of the inode number. 2433 * 2434 * As directory entries are added and deleted, the free space within a 2435 * directory block can become fragmented. The ufs filesystem will compact 2436 * a fragmented directory block to make space for a new entry. When this 2437 * occurs, the offsets of previously added entries change. Any "diradd" 2438 * dependency structures corresponding to these entries must be updated with 2439 * the new offsets. 2440 */ 2441 2442 /* 2443 * This routine is called after the in-memory inode's link 2444 * count has been incremented, but before the directory entry's 2445 * pointer to the inode has been set. 2446 */ 2447 int 2448 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 2449 struct buf *bp; /* buffer containing directory block */ 2450 struct inode *dp; /* inode for directory */ 2451 off_t diroffset; /* offset of new entry in directory */ 2452 long newinum; /* inode referenced by new directory entry */ 2453 struct buf *newdirbp; /* non-NULL => contents of new mkdir */ 2454 int isnewblk; /* entry is in a newly allocated block */ 2455 { 2456 int offset; /* offset of new entry within directory block */ 2457 ufs_lbn_t lbn; /* block in directory containing new entry */ 2458 struct fs *fs; 2459 struct diradd *dap; 2460 struct allocdirect *adp; 2461 struct pagedep *pagedep; 2462 struct inodedep *inodedep; 2463 struct newdirblk *newdirblk = 0; 2464 struct mkdir *mkdir1, *mkdir2; 2465 2466 /* 2467 * Whiteouts have no dependencies. 
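 * (A whiteout is the union-mount place-holder entry whose inode number
 * is the reserved WINO value; since it never points at a real allocated
 * inode, there is no update ordering to preserve for it.)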
2468 */ 2469 if (newinum == WINO) { 2470 if (newdirbp != NULL) 2471 bdwrite(newdirbp); 2472 return (0); 2473 } 2474 2475 fs = dp->i_fs; 2476 lbn = lblkno(fs, diroffset); 2477 offset = blkoff(fs, diroffset); 2478 MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, 2479 M_SOFTDEP_FLAGS|M_ZERO); 2480 dap->da_list.wk_type = D_DIRADD; 2481 dap->da_offset = offset; 2482 dap->da_newinum = newinum; 2483 dap->da_state = ATTACHED; 2484 if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) { 2485 MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk), 2486 M_NEWDIRBLK, M_SOFTDEP_FLAGS); 2487 newdirblk->db_list.wk_type = D_NEWDIRBLK; 2488 newdirblk->db_state = 0; 2489 } 2490 if (newdirbp == NULL) { 2491 dap->da_state |= DEPCOMPLETE; 2492 ACQUIRE_LOCK(&lk); 2493 } else { 2494 dap->da_state |= MKDIR_BODY | MKDIR_PARENT; 2495 MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR, 2496 M_SOFTDEP_FLAGS); 2497 mkdir1->md_list.wk_type = D_MKDIR; 2498 mkdir1->md_state = MKDIR_BODY; 2499 mkdir1->md_diradd = dap; 2500 MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR, 2501 M_SOFTDEP_FLAGS); 2502 mkdir2->md_list.wk_type = D_MKDIR; 2503 mkdir2->md_state = MKDIR_PARENT; 2504 mkdir2->md_diradd = dap; 2505 /* 2506 * Dependency on "." and ".." being written to disk. 2507 */ 2508 mkdir1->md_buf = newdirbp; 2509 ACQUIRE_LOCK(&lk); 2510 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); 2511 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list); 2512 FREE_LOCK(&lk); 2513 bdwrite(newdirbp); 2514 /* 2515 * Dependency on link count increase for parent directory 2516 */ 2517 ACQUIRE_LOCK(&lk); 2518 if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0 2519 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 2520 dap->da_state &= ~MKDIR_PARENT; 2521 WORKITEM_FREE(mkdir2, D_MKDIR); 2522 } else { 2523 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 2524 WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); 2525 } 2526 } 2527 /* 2528 * Link into parent directory pagedep to await its being written. 2529 */ 2530 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) 2531 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 2532 dap->da_pagedep = pagedep; 2533 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, 2534 da_pdlist); 2535 /* 2536 * Link into its inodedep. Put it on the id_bufwait list if the inode 2537 * is not yet written. If it is written, do the post-inode write 2538 * processing to put it on the id_pendinghd list. 2539 */ 2540 (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep); 2541 if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) 2542 diradd_inode_written(dap, inodedep); 2543 else 2544 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 2545 if (isnewblk) { 2546 /* 2547 * Directories growing into indirect blocks are rare 2548 * enough and the frequency of new block allocation 2549 * in those cases even more rare, that we choose not 2550 * to bother tracking them. Rather we simply force the 2551 * new directory entry to disk. 2552 */ 2553 if (lbn >= NDADDR) { 2554 FREE_LOCK(&lk); 2555 /* 2556 * We only have a new allocation when at the 2557 * beginning of a new block, not when we are 2558 * expanding into an existing block. 2559 */ 2560 if (blkoff(fs, diroffset) == 0) 2561 return (1); 2562 return (0); 2563 } 2564 /* 2565 * We only have a new allocation when at the beginning 2566 * of a new fragment, not when we are expanding into an 2567 * existing fragment. Also, there is nothing to do if we 2568 * are already tracking this block. 
2569 */ 2570 if (fragoff(fs, diroffset) != 0) { 2571 FREE_LOCK(&lk); 2572 return (0); 2573 } 2574 if ((pagedep->pd_state & NEWBLOCK) != 0) { 2575 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 2576 FREE_LOCK(&lk); 2577 return (0); 2578 } 2579 /* 2580 * Find our associated allocdirect and have it track us. 2581 */ 2582 if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0) 2583 panic("softdep_setup_directory_add: lost inodedep"); 2584 adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst); 2585 if (adp == NULL || adp->ad_lbn != lbn) { 2586 FREE_LOCK(&lk); 2587 panic("softdep_setup_directory_add: lost entry"); 2588 } 2589 pagedep->pd_state |= NEWBLOCK; 2590 newdirblk->db_pagedep = pagedep; 2591 WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list); 2592 } 2593 FREE_LOCK(&lk); 2594 return (0); 2595 } 2596 2597 /* 2598 * This procedure is called to change the offset of a directory 2599 * entry when compacting a directory block which must be owned 2600 * exclusively by the caller. Note that the actual entry movement 2601 * must be done in this procedure to ensure that no I/O completions 2602 * occur while the move is in progress. 2603 */ 2604 void 2605 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) 2606 struct inode *dp; /* inode for directory */ 2607 caddr_t base; /* address of dp->i_offset */ 2608 caddr_t oldloc; /* address of old directory location */ 2609 caddr_t newloc; /* address of new directory location */ 2610 int entrysize; /* size of directory entry */ 2611 { 2612 int offset, oldoffset, newoffset; 2613 struct pagedep *pagedep; 2614 struct diradd *dap; 2615 ufs_lbn_t lbn; 2616 2617 ACQUIRE_LOCK(&lk); 2618 lbn = lblkno(dp->i_fs, dp->i_offset); 2619 offset = blkoff(dp->i_fs, dp->i_offset); 2620 if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0) 2621 goto done; 2622 oldoffset = offset + (oldloc - base); 2623 newoffset = offset + (newloc - base); 2624 2625 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) { 2626 if (dap->da_offset != oldoffset) 2627 continue; 2628 dap->da_offset = newoffset; 2629 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset)) 2630 break; 2631 LIST_REMOVE(dap, da_pdlist); 2632 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)], 2633 dap, da_pdlist); 2634 break; 2635 } 2636 if (dap == NULL) { 2637 2638 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) { 2639 if (dap->da_offset == oldoffset) { 2640 dap->da_offset = newoffset; 2641 break; 2642 } 2643 } 2644 } 2645 done: 2646 bcopy(oldloc, newloc, entrysize); 2647 FREE_LOCK(&lk); 2648 } 2649 2650 /* 2651 * Free a diradd dependency structure. This routine must be called 2652 * with splbio interrupts blocked. 
2653 */ 2654 static void 2655 free_diradd(dap) 2656 struct diradd *dap; 2657 { 2658 struct dirrem *dirrem; 2659 struct pagedep *pagedep; 2660 struct inodedep *inodedep; 2661 struct mkdir *mkdir, *nextmd; 2662 2663 #ifdef DEBUG 2664 if (lk.lkt_held == NOHOLDER) 2665 panic("free_diradd: lock not held"); 2666 #endif 2667 WORKLIST_REMOVE(&dap->da_list); 2668 LIST_REMOVE(dap, da_pdlist); 2669 if ((dap->da_state & DIRCHG) == 0) { 2670 pagedep = dap->da_pagedep; 2671 } else { 2672 dirrem = dap->da_previous; 2673 pagedep = dirrem->dm_pagedep; 2674 dirrem->dm_dirinum = pagedep->pd_ino; 2675 add_to_worklist(&dirrem->dm_list); 2676 } 2677 if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum, 2678 0, &inodedep) != 0) 2679 (void) free_inodedep(inodedep); 2680 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 2681 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 2682 nextmd = LIST_NEXT(mkdir, md_mkdirs); 2683 if (mkdir->md_diradd != dap) 2684 continue; 2685 dap->da_state &= ~mkdir->md_state; 2686 WORKLIST_REMOVE(&mkdir->md_list); 2687 LIST_REMOVE(mkdir, md_mkdirs); 2688 WORKITEM_FREE(mkdir, D_MKDIR); 2689 } 2690 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 2691 FREE_LOCK(&lk); 2692 panic("free_diradd: unfound ref"); 2693 } 2694 } 2695 WORKITEM_FREE(dap, D_DIRADD); 2696 } 2697 2698 /* 2699 * Directory entry removal dependencies. 2700 * 2701 * When removing a directory entry, the entry's inode pointer must be 2702 * zero'ed on disk before the corresponding inode's link count is decremented 2703 * (possibly freeing the inode for re-use). This dependency is handled by 2704 * updating the directory entry but delaying the inode count reduction until 2705 * after the directory block has been written to disk. After this point, the 2706 * inode count can be decremented whenever it is convenient. 2707 */ 2708 2709 /* 2710 * This routine should be called immediately after removing 2711 * a directory entry. The inode's link count should not be 2712 * decremented by the calling procedure -- the soft updates 2713 * code will do this task when it is safe. 2714 */ 2715 void 2716 softdep_setup_remove(bp, dp, ip, isrmdir) 2717 struct buf *bp; /* buffer containing directory block */ 2718 struct inode *dp; /* inode for the directory being modified */ 2719 struct inode *ip; /* inode for directory entry being removed */ 2720 int isrmdir; /* indicates if doing RMDIR */ 2721 { 2722 struct dirrem *dirrem, *prevdirrem; 2723 2724 /* 2725 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. 2726 */ 2727 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 2728 2729 /* 2730 * If the COMPLETE flag is clear, then there were no active 2731 * entries and we want to roll back to a zeroed entry until 2732 * the new inode is committed to disk. If the COMPLETE flag is 2733 * set then we have deleted an entry that never made it to 2734 * disk. If the entry we deleted resulted from a name change, 2735 * then the old name still resides on disk. We cannot delete 2736 * its inode (returned to us in prevdirrem) until the zeroed 2737 * directory entry gets to disk. The new inode has never been 2738 * referenced on the disk, so can be deleted immediately. 
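 * (Put differently: the ordinary removal of a name that is already on
 * disk takes the first branch below, queueing the dirrem on its pagedep
 * until the cleared directory block has been written; only a name that
 * was created and deleted without ever reaching the disk takes the
 * second branch and is handed to handle_workitem_remove() at once.)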
2739 */ 2740 if ((dirrem->dm_state & COMPLETE) == 0) { 2741 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, 2742 dm_next); 2743 FREE_LOCK(&lk); 2744 } else { 2745 if (prevdirrem != NULL) 2746 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, 2747 prevdirrem, dm_next); 2748 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; 2749 FREE_LOCK(&lk); 2750 handle_workitem_remove(dirrem, NULL); 2751 } 2752 } 2753 2754 /* 2755 * Allocate a new dirrem if appropriate and return it along with 2756 * its associated pagedep. Called without a lock, returns with lock. 2757 */ 2758 static long num_dirrem; /* number of dirrem allocated */ 2759 static struct dirrem * 2760 newdirrem(bp, dp, ip, isrmdir, prevdirremp) 2761 struct buf *bp; /* buffer containing directory block */ 2762 struct inode *dp; /* inode for the directory being modified */ 2763 struct inode *ip; /* inode for directory entry being removed */ 2764 int isrmdir; /* indicates if doing RMDIR */ 2765 struct dirrem **prevdirremp; /* previously referenced inode, if any */ 2766 { 2767 int offset; 2768 ufs_lbn_t lbn; 2769 struct diradd *dap; 2770 struct dirrem *dirrem; 2771 struct pagedep *pagedep; 2772 2773 /* 2774 * Whiteouts have no deletion dependencies. 2775 */ 2776 if (ip == NULL) 2777 panic("newdirrem: whiteout"); 2778 /* 2779 * If we are over our limit, try to improve the situation. 2780 * Limiting the number of dirrem structures will also limit 2781 * the number of freefile and freeblks structures. 2782 */ 2783 if (num_dirrem > max_softdeps / 2) 2784 (void) request_cleanup(FLUSH_REMOVE, 0); 2785 num_dirrem += 1; 2786 MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem), 2787 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); 2788 dirrem->dm_list.wk_type = D_DIRREM; 2789 dirrem->dm_state = isrmdir ? RMDIR : 0; 2790 dirrem->dm_mnt = ITOV(ip)->v_mount; 2791 dirrem->dm_oldinum = ip->i_number; 2792 *prevdirremp = NULL; 2793 2794 ACQUIRE_LOCK(&lk); 2795 lbn = lblkno(dp->i_fs, dp->i_offset); 2796 offset = blkoff(dp->i_fs, dp->i_offset); 2797 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) 2798 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 2799 dirrem->dm_pagedep = pagedep; 2800 /* 2801 * Check for a diradd dependency for the same directory entry. 2802 * If present, then both dependencies become obsolete and can 2803 * be de-allocated. Check for an entry on both the pd_dirraddhd 2804 * list and the pd_pendinghd list. 2805 */ 2806 2807 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) 2808 if (dap->da_offset == offset) 2809 break; 2810 if (dap == NULL) { 2811 2812 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 2813 if (dap->da_offset == offset) 2814 break; 2815 if (dap == NULL) 2816 return (dirrem); 2817 } 2818 /* 2819 * Must be ATTACHED at this point. 2820 */ 2821 if ((dap->da_state & ATTACHED) == 0) { 2822 FREE_LOCK(&lk); 2823 panic("newdirrem: not ATTACHED"); 2824 } 2825 if (dap->da_newinum != ip->i_number) { 2826 FREE_LOCK(&lk); 2827 panic("newdirrem: inum %d should be %d", 2828 ip->i_number, dap->da_newinum); 2829 } 2830 /* 2831 * If we are deleting a changed name that never made it to disk, 2832 * then return the dirrem describing the previous inode (which 2833 * represents the inode currently referenced from this entry on disk). 2834 */ 2835 if ((dap->da_state & DIRCHG) != 0) { 2836 *prevdirremp = dap->da_previous; 2837 dap->da_state &= ~DIRCHG; 2838 dap->da_pagedep = pagedep; 2839 } 2840 /* 2841 * We are deleting an entry that never made it to disk. 2842 * Mark it COMPLETE so we can delete its inode immediately. 
2843 */ 2844 dirrem->dm_state |= COMPLETE; 2845 free_diradd(dap); 2846 return (dirrem); 2847 } 2848 2849 /* 2850 * Directory entry change dependencies. 2851 * 2852 * Changing an existing directory entry requires that an add operation 2853 * be completed first followed by a deletion. The semantics for the addition 2854 * are identical to the description of adding a new entry above except 2855 * that the rollback is to the old inode number rather than zero. Once 2856 * the addition dependency is completed, the removal is done as described 2857 * in the removal routine above. 2858 */ 2859 2860 /* 2861 * This routine should be called immediately after changing 2862 * a directory entry. The inode's link count should not be 2863 * decremented by the calling procedure -- the soft updates 2864 * code will perform this task when it is safe. 2865 */ 2866 void 2867 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 2868 struct buf *bp; /* buffer containing directory block */ 2869 struct inode *dp; /* inode for the directory being modified */ 2870 struct inode *ip; /* inode for directory entry being removed */ 2871 long newinum; /* new inode number for changed entry */ 2872 int isrmdir; /* indicates if doing RMDIR */ 2873 { 2874 int offset; 2875 struct diradd *dap = NULL; 2876 struct dirrem *dirrem, *prevdirrem; 2877 struct pagedep *pagedep; 2878 struct inodedep *inodedep; 2879 2880 offset = blkoff(dp->i_fs, dp->i_offset); 2881 2882 /* 2883 * Whiteouts do not need diradd dependencies. 2884 */ 2885 if (newinum != WINO) { 2886 MALLOC(dap, struct diradd *, sizeof(struct diradd), 2887 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); 2888 dap->da_list.wk_type = D_DIRADD; 2889 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; 2890 dap->da_offset = offset; 2891 dap->da_newinum = newinum; 2892 } 2893 2894 /* 2895 * Allocate a new dirrem and ACQUIRE_LOCK. 2896 */ 2897 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 2898 pagedep = dirrem->dm_pagedep; 2899 /* 2900 * The possible values for isrmdir: 2901 * 0 - non-directory file rename 2902 * 1 - directory rename within same directory 2903 * inum - directory rename to new directory of given inode number 2904 * When renaming to a new directory, we are both deleting and 2905 * creating a new directory entry, so the link count on the new 2906 * directory should not change. Thus we do not need the followup 2907 * dirrem which is usually done in handle_workitem_remove. We set 2908 * the DIRCHG flag to tell handle_workitem_remove to skip the 2909 * followup dirrem. 2910 */ 2911 if (isrmdir > 1) 2912 dirrem->dm_state |= DIRCHG; 2913 2914 /* 2915 * Whiteouts have no additional dependencies, 2916 * so just put the dirrem on the correct list. 2917 */ 2918 if (newinum == WINO) { 2919 if ((dirrem->dm_state & COMPLETE) == 0) { 2920 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, 2921 dm_next); 2922 } else { 2923 dirrem->dm_dirinum = pagedep->pd_ino; 2924 add_to_worklist(&dirrem->dm_list); 2925 } 2926 FREE_LOCK(&lk); 2927 return; 2928 } 2929 2930 /* 2931 * If the COMPLETE flag is clear, then there were no active 2932 * entries and we want to roll back to the previous inode until 2933 * the new inode is committed to disk. If the COMPLETE flag is 2934 * set, then we have deleted an entry that never made it to disk. 2935 * If the entry we deleted resulted from a name change, then the old 2936 * inode reference still resides on disk. Any rollback that we do 2937 * needs to be to that old inode (returned to us in prevdirrem). 
If 2938 * the entry we deleted resulted from a create, then there is 2939 * no entry on the disk, so we want to roll back to zero rather 2940 * than the uncommitted inode. In either of the COMPLETE cases we 2941 * want to immediately free the unwritten and unreferenced inode. 2942 */ 2943 if ((dirrem->dm_state & COMPLETE) == 0) { 2944 dap->da_previous = dirrem; 2945 } else { 2946 if (prevdirrem != NULL) { 2947 dap->da_previous = prevdirrem; 2948 } else { 2949 dap->da_state &= ~DIRCHG; 2950 dap->da_pagedep = pagedep; 2951 } 2952 dirrem->dm_dirinum = pagedep->pd_ino; 2953 add_to_worklist(&dirrem->dm_list); 2954 } 2955 /* 2956 * Link into its inodedep. Put it on the id_bufwait list if the inode 2957 * is not yet written. If it is written, do the post-inode write 2958 * processing to put it on the id_pendinghd list. 2959 */ 2960 if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 || 2961 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 2962 dap->da_state |= COMPLETE; 2963 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 2964 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 2965 } else { 2966 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], 2967 dap, da_pdlist); 2968 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 2969 } 2970 FREE_LOCK(&lk); 2971 } 2972 2973 /* 2974 * Called whenever the link count on an inode is changed. 2975 * It creates an inode dependency so that the new reference(s) 2976 * to the inode cannot be committed to disk until the updated 2977 * inode has been written. 2978 */ 2979 void 2980 softdep_change_linkcnt(ip) 2981 struct inode *ip; /* the inode with the increased link count */ 2982 { 2983 struct inodedep *inodedep; 2984 2985 ACQUIRE_LOCK(&lk); 2986 (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep); 2987 if (ip->i_nlink < ip->i_effnlink) { 2988 FREE_LOCK(&lk); 2989 panic("softdep_change_linkcnt: bad delta"); 2990 } 2991 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 2992 FREE_LOCK(&lk); 2993 } 2994 2995 /* 2996 * Called when the effective link count and the reference count 2997 * on an inode drops to zero. At this point there are no names 2998 * referencing the file in the filesystem and no active file 2999 * references. The space associated with the file will be freed 3000 * as soon as the necessary soft dependencies are cleared. 3001 */ 3002 void 3003 softdep_releasefile(ip) 3004 struct inode *ip; /* inode with the zero effective link count */ 3005 { 3006 struct inodedep *inodedep; 3007 3008 if (ip->i_effnlink > 0) 3009 panic("softdep_filerelease: file still referenced"); 3010 /* 3011 * We may be called several times as the real reference count 3012 * drops to zero. We only want to account for the space once. 3013 */ 3014 if (ip->i_flag & IN_SPACECOUNTED) 3015 return; 3016 /* 3017 * We have to deactivate a snapshot otherwise copyonwrites may 3018 * add blocks and the cleanup may remove blocks after we have 3019 * tried to account for them. 3020 */ 3021 if ((ip->i_flags & SF_SNAPSHOT) != 0) 3022 ffs_snapremove(ITOV(ip)); 3023 /* 3024 * If we are tracking an nlinkdelta, we have to also remember 3025 * whether we accounted for the freed space yet. 
3026 */ 3027 ACQUIRE_LOCK(&lk); 3028 if ((inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep))) 3029 inodedep->id_state |= SPACECOUNTED; 3030 FREE_LOCK(&lk); 3031 ip->i_fs->fs_pendingblocks += ip->i_blocks; 3032 ip->i_fs->fs_pendinginodes += 1; 3033 ip->i_flag |= IN_SPACECOUNTED; 3034 } 3035 3036 /* 3037 * This workitem decrements the inode's link count. 3038 * If the link count reaches zero, the file is removed. 3039 */ 3040 static void 3041 handle_workitem_remove(dirrem, xp) 3042 struct dirrem *dirrem; 3043 struct vnode *xp; 3044 { 3045 struct thread *td = curthread; 3046 struct inodedep *inodedep; 3047 struct vnode *vp; 3048 struct inode *ip; 3049 ino_t oldinum; 3050 int error; 3051 3052 if ((vp = xp) == NULL && 3053 (error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, LK_EXCLUSIVE, 3054 &vp)) != 0) { 3055 softdep_error("handle_workitem_remove: vget", error); 3056 return; 3057 } 3058 ip = VTOI(vp); 3059 ACQUIRE_LOCK(&lk); 3060 if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){ 3061 FREE_LOCK(&lk); 3062 panic("handle_workitem_remove: lost inodedep"); 3063 } 3064 /* 3065 * Normal file deletion. 3066 */ 3067 if ((dirrem->dm_state & RMDIR) == 0) { 3068 ip->i_nlink--; 3069 ip->i_flag |= IN_CHANGE; 3070 if (ip->i_nlink < ip->i_effnlink) { 3071 FREE_LOCK(&lk); 3072 panic("handle_workitem_remove: bad file delta"); 3073 } 3074 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 3075 FREE_LOCK(&lk); 3076 vput(vp); 3077 num_dirrem -= 1; 3078 WORKITEM_FREE(dirrem, D_DIRREM); 3079 return; 3080 } 3081 /* 3082 * Directory deletion. Decrement reference count for both the 3083 * just deleted parent directory entry and the reference for ".". 3084 * Next truncate the directory to length zero. When the 3085 * truncation completes, arrange to have the reference count on 3086 * the parent decremented to account for the loss of "..". 3087 */ 3088 ip->i_nlink -= 2; 3089 ip->i_flag |= IN_CHANGE; 3090 if (ip->i_nlink < ip->i_effnlink) { 3091 FREE_LOCK(&lk); 3092 panic("handle_workitem_remove: bad dir delta"); 3093 } 3094 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 3095 FREE_LOCK(&lk); 3096 if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, td->td_ucred, td)) != 0) 3097 softdep_error("handle_workitem_remove: truncate", error); 3098 /* 3099 * Rename a directory to a new parent. Since, we are both deleting 3100 * and creating a new directory entry, the link count on the new 3101 * directory should not change. Thus we skip the followup dirrem. 3102 */ 3103 if (dirrem->dm_state & DIRCHG) { 3104 vput(vp); 3105 num_dirrem -= 1; 3106 WORKITEM_FREE(dirrem, D_DIRREM); 3107 return; 3108 } 3109 /* 3110 * If the inodedep does not exist, then the zero'ed inode has 3111 * been written to disk. If the allocated inode has never been 3112 * written to disk, then the on-disk inode is zero'ed. In either 3113 * case we can remove the file immediately. 3114 */ 3115 ACQUIRE_LOCK(&lk); 3116 dirrem->dm_state = 0; 3117 oldinum = dirrem->dm_oldinum; 3118 dirrem->dm_oldinum = dirrem->dm_dirinum; 3119 if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 || 3120 check_inode_unwritten(inodedep)) { 3121 FREE_LOCK(&lk); 3122 vput(vp); 3123 handle_workitem_remove(dirrem, NULL); 3124 return; 3125 } 3126 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); 3127 FREE_LOCK(&lk); 3128 vput(vp); 3129 } 3130 3131 /* 3132 * Inode de-allocation dependencies. 3133 * 3134 * When an inode's link count is reduced to zero, it can be de-allocated. 
We 3135 * found it convenient to postpone de-allocation until after the inode is 3136 * written to disk with its new link count (zero). At this point, all of the 3137 * on-disk inode's block pointers are nullified and, with careful dependency 3138 * list ordering, all dependencies related to the inode will be satisfied and 3139 * the corresponding dependency structures de-allocated. So, if/when the 3140 * inode is reused, there will be no mixing of old dependencies with new 3141 * ones. This artificial dependency is set up by the block de-allocation 3142 * procedure above (softdep_setup_freeblocks) and completed by the 3143 * following procedure. 3144 */ 3145 static void 3146 handle_workitem_freefile(freefile) 3147 struct freefile *freefile; 3148 { 3149 struct fs *fs; 3150 struct inodedep *idp; 3151 int error; 3152 3153 fs = VFSTOUFS(freefile->fx_mnt)->um_fs; 3154 #ifdef DEBUG 3155 ACQUIRE_LOCK(&lk); 3156 error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp); 3157 FREE_LOCK(&lk); 3158 if (error) 3159 panic("handle_workitem_freefile: inodedep survived"); 3160 #endif 3161 fs->fs_pendinginodes -= 1; 3162 if ((error = ffs_freefile(fs, freefile->fx_devvp, freefile->fx_oldinum, 3163 freefile->fx_mode)) != 0) 3164 softdep_error("handle_workitem_freefile", error); 3165 WORKITEM_FREE(freefile, D_FREEFILE); 3166 } 3167 3168 /* 3169 * Disk writes. 3170 * 3171 * The dependency structures constructed above are most actively used when file 3172 * system blocks are written to disk. No constraints are placed on when a 3173 * block can be written, but unsatisfied update dependencies are made safe by 3174 * modifying (or replacing) the source memory for the duration of the disk 3175 * write. When the disk write completes, the memory block is again brought 3176 * up-to-date. 3177 * 3178 * In-core inode structure reclamation. 3179 * 3180 * Because there are a finite number of "in-core" inode structures, they are 3181 * reused regularly. By transferring all inode-related dependencies to the 3182 * in-memory inode block and indexing them separately (via "inodedep"s), we 3183 * can allow "in-core" inode structures to be reused at any time and avoid 3184 * any increase in contention. 3185 * 3186 * Called just before entering the device driver to initiate a new disk I/O. 3187 * The buffer must be locked, thus, no I/O completion operations can occur 3188 * while we are manipulating its associated dependencies. 3189 */ 3190 static void 3191 softdep_disk_io_initiation(bp) 3192 struct buf *bp; /* structure describing disk write to occur */ 3193 { 3194 struct worklist *wk, *nextwk; 3195 struct indirdep *indirdep; 3196 3197 /* 3198 * We only care about write operations. There should never 3199 * be dependencies for reads. 3200 */ 3201 if (bp->b_iocmd == BIO_READ) 3202 panic("softdep_disk_io_initiation: read"); 3203 /* 3204 * Do any necessary pre-I/O processing. 3205 */ 3206 for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) { 3207 nextwk = LIST_NEXT(wk, wk_list); 3208 switch (wk->wk_type) { 3209 3210 case D_PAGEDEP: 3211 initiate_write_filepage(WK_PAGEDEP(wk), bp); 3212 continue; 3213 3214 case D_INODEDEP: 3215 initiate_write_inodeblock(WK_INODEDEP(wk), bp); 3216 continue; 3217 3218 case D_INDIRDEP: 3219 indirdep = WK_INDIRDEP(wk); 3220 if (indirdep->ir_state & GOINGAWAY) 3221 panic("disk_io_initiation: indirdep gone"); 3222 /* 3223 * If there are no remaining dependencies, this 3224 * will be writing the real pointers, so the 3225 * dependency can be freed. 
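 * Otherwise the buffer contents are swapped with the saved safe copy
 * for the duration of the write below (ir_saveddata keeps the real
 * pointers while the buffer carries only safe ones), and
 * softdep_disk_write_complete() restores them when the I/O finishes.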
3226 */ 3227 if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) { 3228 indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE; 3229 brelse(indirdep->ir_savebp); 3230 /* inline expand WORKLIST_REMOVE(wk); */ 3231 wk->wk_state &= ~ONWORKLIST; 3232 LIST_REMOVE(wk, wk_list); 3233 WORKITEM_FREE(indirdep, D_INDIRDEP); 3234 continue; 3235 } 3236 /* 3237 * Replace up-to-date version with safe version. 3238 */ 3239 MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount, 3240 M_INDIRDEP, M_SOFTDEP_FLAGS); 3241 ACQUIRE_LOCK(&lk); 3242 indirdep->ir_state &= ~ATTACHED; 3243 indirdep->ir_state |= UNDONE; 3244 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 3245 bcopy(indirdep->ir_savebp->b_data, bp->b_data, 3246 bp->b_bcount); 3247 FREE_LOCK(&lk); 3248 continue; 3249 3250 case D_MKDIR: 3251 case D_BMSAFEMAP: 3252 case D_ALLOCDIRECT: 3253 case D_ALLOCINDIR: 3254 continue; 3255 3256 default: 3257 panic("handle_disk_io_initiation: Unexpected type %s", 3258 TYPENAME(wk->wk_type)); 3259 /* NOTREACHED */ 3260 } 3261 } 3262 } 3263 3264 /* 3265 * Called from within the procedure above to deal with unsatisfied 3266 * allocation dependencies in a directory. The buffer must be locked, 3267 * thus, no I/O completion operations can occur while we are 3268 * manipulating its associated dependencies. 3269 */ 3270 static void 3271 initiate_write_filepage(pagedep, bp) 3272 struct pagedep *pagedep; 3273 struct buf *bp; 3274 { 3275 struct diradd *dap; 3276 struct direct *ep; 3277 int i; 3278 3279 if (pagedep->pd_state & IOSTARTED) { 3280 /* 3281 * This can only happen if there is a driver that does not 3282 * understand chaining. Here biodone will reissue the call 3283 * to strategy for the incomplete buffers. 3284 */ 3285 printf("initiate_write_filepage: already started\n"); 3286 return; 3287 } 3288 pagedep->pd_state |= IOSTARTED; 3289 ACQUIRE_LOCK(&lk); 3290 for (i = 0; i < DAHASHSZ; i++) { 3291 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { 3292 ep = (struct direct *) 3293 ((char *)bp->b_data + dap->da_offset); 3294 if (ep->d_ino != dap->da_newinum) { 3295 FREE_LOCK(&lk); 3296 panic("%s: dir inum %d != new %d", 3297 "initiate_write_filepage", 3298 ep->d_ino, dap->da_newinum); 3299 } 3300 if (dap->da_state & DIRCHG) 3301 ep->d_ino = dap->da_previous->dm_oldinum; 3302 else 3303 ep->d_ino = 0; 3304 dap->da_state &= ~ATTACHED; 3305 dap->da_state |= UNDONE; 3306 } 3307 } 3308 FREE_LOCK(&lk); 3309 } 3310 3311 /* 3312 * Called from within the procedure above to deal with unsatisfied 3313 * allocation dependencies in an inodeblock. The buffer must be 3314 * locked, thus, no I/O completion operations can occur while we 3315 * are manipulating its associated dependencies. 3316 */ 3317 static void 3318 initiate_write_inodeblock(inodedep, bp) 3319 struct inodedep *inodedep; 3320 struct buf *bp; /* The inode block */ 3321 { 3322 struct allocdirect *adp, *lastadp; 3323 struct dinode *dp; 3324 struct fs *fs; 3325 ufs_lbn_t prevlbn = 0; 3326 int i, deplist; 3327 3328 if (inodedep->id_state & IOSTARTED) 3329 panic("initiate_write_inodeblock: already started"); 3330 inodedep->id_state |= IOSTARTED; 3331 fs = inodedep->id_fs; 3332 dp = (struct dinode *)bp->b_data + 3333 ino_to_fsbo(fs, inodedep->id_ino); 3334 /* 3335 * If the bitmap is not yet written, then the allocated 3336 * inode cannot be written to disk. 
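 * The rollback below therefore stashes the current dinode image in
 * id_savedino and writes an all-zero dinode in its place; the saved copy
 * is put back by handle_written_inodeblock() once this write completes,
 * so the inode contents are delayed, never lost.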
3337 */ 3338 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 3339 if (inodedep->id_savedino != NULL) 3340 panic("initiate_write_inodeblock: already doing I/O"); 3341 MALLOC(inodedep->id_savedino, struct dinode *, 3342 sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS); 3343 *inodedep->id_savedino = *dp; 3344 bzero((caddr_t)dp, sizeof(struct dinode)); 3345 return; 3346 } 3347 /* 3348 * If no dependencies, then there is nothing to roll back. 3349 */ 3350 inodedep->id_savedsize = dp->di_size; 3351 if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL) 3352 return; 3353 /* 3354 * Set the dependencies to busy. 3355 */ 3356 ACQUIRE_LOCK(&lk); 3357 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 3358 adp = TAILQ_NEXT(adp, ad_next)) { 3359 #ifdef DIAGNOSTIC 3360 if (deplist != 0 && prevlbn >= adp->ad_lbn) { 3361 FREE_LOCK(&lk); 3362 panic("softdep_write_inodeblock: lbn order"); 3363 } 3364 prevlbn = adp->ad_lbn; 3365 if (adp->ad_lbn < NDADDR && 3366 dp->di_db[adp->ad_lbn] != adp->ad_newblkno) { 3367 FREE_LOCK(&lk); 3368 panic("%s: direct pointer #%ld mismatch %d != %d", 3369 "softdep_write_inodeblock", adp->ad_lbn, 3370 dp->di_db[adp->ad_lbn], adp->ad_newblkno); 3371 } 3372 if (adp->ad_lbn >= NDADDR && 3373 dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) { 3374 FREE_LOCK(&lk); 3375 panic("%s: indirect pointer #%ld mismatch %d != %d", 3376 "softdep_write_inodeblock", adp->ad_lbn - NDADDR, 3377 dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno); 3378 } 3379 deplist |= 1 << adp->ad_lbn; 3380 if ((adp->ad_state & ATTACHED) == 0) { 3381 FREE_LOCK(&lk); 3382 panic("softdep_write_inodeblock: Unknown state 0x%x", 3383 adp->ad_state); 3384 } 3385 #endif /* DIAGNOSTIC */ 3386 adp->ad_state &= ~ATTACHED; 3387 adp->ad_state |= UNDONE; 3388 } 3389 /* 3390 * The on-disk inode cannot claim to be any larger than the last 3391 * fragment that has been written. Otherwise, the on-disk inode 3392 * might have fragments that were not the last block in the file 3393 * which would corrupt the filesystem. 3394 */ 3395 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 3396 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 3397 if (adp->ad_lbn >= NDADDR) 3398 break; 3399 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; 3400 /* keep going until hitting a rollback to a frag */ 3401 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 3402 continue; 3403 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; 3404 for (i = adp->ad_lbn + 1; i < NDADDR; i++) { 3405 #ifdef DIAGNOSTIC 3406 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) { 3407 FREE_LOCK(&lk); 3408 panic("softdep_write_inodeblock: lost dep1"); 3409 } 3410 #endif /* DIAGNOSTIC */ 3411 dp->di_db[i] = 0; 3412 } 3413 for (i = 0; i < NIADDR; i++) { 3414 #ifdef DIAGNOSTIC 3415 if (dp->di_ib[i] != 0 && 3416 (deplist & ((1 << NDADDR) << i)) == 0) { 3417 FREE_LOCK(&lk); 3418 panic("softdep_write_inodeblock: lost dep2"); 3419 } 3420 #endif /* DIAGNOSTIC */ 3421 dp->di_ib[i] = 0; 3422 } 3423 FREE_LOCK(&lk); 3424 return; 3425 } 3426 /* 3427 * If we have zero'ed out the last allocated block of the file, 3428 * roll back the size to the last currently allocated block. 3429 * We know that this last allocated block is a full-sized as 3430 * we already checked for fragments in the loop above. 
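 * When the recorded size does not extend beyond lastadp's block,
 * the scan below walks di_db[] backwards from lastadp->ad_lbn to
 * the last block still claimed and trims di_size to end there.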
3431 */ 3432 if (lastadp != NULL && 3433 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { 3434 for (i = lastadp->ad_lbn; i >= 0; i--) 3435 if (dp->di_db[i] != 0) 3436 break; 3437 dp->di_size = (i + 1) * fs->fs_bsize; 3438 } 3439 /* 3440 * The only dependencies are for indirect blocks. 3441 * 3442 * The file size for indirect block additions is not guaranteed. 3443 * Such a guarantee would be non-trivial to achieve. The conventional 3444 * synchronous write implementation also does not make this guarantee. 3445 * Fsck should catch and fix discrepancies. Arguably, the file size 3446 * can be over-estimated without destroying integrity when the file 3447 * moves into the indirect blocks (i.e., is large). If we want to 3448 * postpone fsck, we are stuck with this argument. 3449 */ 3450 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 3451 dp->di_ib[adp->ad_lbn - NDADDR] = 0; 3452 FREE_LOCK(&lk); 3453 } 3454 3455 /* 3456 * This routine is called during the completion interrupt 3457 * service routine for a disk write (from the procedure called 3458 * by the device driver to inform the filesystem caches of 3459 * a request completion). It should be called early in this 3460 * procedure, before the block is made available to other 3461 * processes or other routines are called. 3462 */ 3463 static void 3464 softdep_disk_write_complete(bp) 3465 struct buf *bp; /* describes the completed disk write */ 3466 { 3467 struct worklist *wk; 3468 struct workhead reattach; 3469 struct newblk *newblk; 3470 struct allocindir *aip; 3471 struct allocdirect *adp; 3472 struct indirdep *indirdep; 3473 struct inodedep *inodedep; 3474 struct bmsafemap *bmsafemap; 3475 3476 #ifdef DEBUG 3477 if (lk.lkt_held != NOHOLDER) 3478 panic("softdep_disk_write_complete: lock is held"); 3479 lk.lkt_held = SPECIAL_FLAG; 3480 #endif 3481 LIST_INIT(&reattach); 3482 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 3483 WORKLIST_REMOVE(wk); 3484 switch (wk->wk_type) { 3485 3486 case D_PAGEDEP: 3487 if (handle_written_filepage(WK_PAGEDEP(wk), bp)) 3488 WORKLIST_INSERT(&reattach, wk); 3489 continue; 3490 3491 case D_INODEDEP: 3492 if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) 3493 WORKLIST_INSERT(&reattach, wk); 3494 continue; 3495 3496 case D_BMSAFEMAP: 3497 bmsafemap = WK_BMSAFEMAP(wk); 3498 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) { 3499 newblk->nb_state |= DEPCOMPLETE; 3500 newblk->nb_bmsafemap = NULL; 3501 LIST_REMOVE(newblk, nb_deps); 3502 } 3503 while ((adp = 3504 LIST_FIRST(&bmsafemap->sm_allocdirecthd))) { 3505 adp->ad_state |= DEPCOMPLETE; 3506 adp->ad_buf = NULL; 3507 LIST_REMOVE(adp, ad_deps); 3508 handle_allocdirect_partdone(adp); 3509 } 3510 while ((aip = 3511 LIST_FIRST(&bmsafemap->sm_allocindirhd))) { 3512 aip->ai_state |= DEPCOMPLETE; 3513 aip->ai_buf = NULL; 3514 LIST_REMOVE(aip, ai_deps); 3515 handle_allocindir_partdone(aip); 3516 } 3517 while ((inodedep = 3518 LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) { 3519 inodedep->id_state |= DEPCOMPLETE; 3520 LIST_REMOVE(inodedep, id_deps); 3521 inodedep->id_buf = NULL; 3522 } 3523 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 3524 continue; 3525 3526 case D_MKDIR: 3527 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 3528 continue; 3529 3530 case D_ALLOCDIRECT: 3531 adp = WK_ALLOCDIRECT(wk); 3532 adp->ad_state |= COMPLETE; 3533 handle_allocdirect_partdone(adp); 3534 continue; 3535 3536 case D_ALLOCINDIR: 3537 aip = WK_ALLOCINDIR(wk); 3538 aip->ai_state |= COMPLETE; 3539 handle_allocindir_partdone(aip); 3540 continue; 3541 3542 case D_INDIRDEP: 3543 indirdep = 
WK_INDIRDEP(wk); 3544 if (indirdep->ir_state & GOINGAWAY) { 3545 lk.lkt_held = NOHOLDER; 3546 panic("disk_write_complete: indirdep gone"); 3547 } 3548 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); 3549 FREE(indirdep->ir_saveddata, M_INDIRDEP); 3550 indirdep->ir_saveddata = 0; 3551 indirdep->ir_state &= ~UNDONE; 3552 indirdep->ir_state |= ATTACHED; 3553 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { 3554 handle_allocindir_partdone(aip); 3555 if (aip == LIST_FIRST(&indirdep->ir_donehd)) { 3556 lk.lkt_held = NOHOLDER; 3557 panic("disk_write_complete: not gone"); 3558 } 3559 } 3560 WORKLIST_INSERT(&reattach, wk); 3561 if ((bp->b_flags & B_DELWRI) == 0) 3562 stat_indir_blk_ptrs++; 3563 bdirty(bp); 3564 continue; 3565 3566 default: 3567 lk.lkt_held = NOHOLDER; 3568 panic("handle_disk_write_complete: Unknown type %s", 3569 TYPENAME(wk->wk_type)); 3570 /* NOTREACHED */ 3571 } 3572 } 3573 /* 3574 * Reattach any requests that must be redone. 3575 */ 3576 while ((wk = LIST_FIRST(&reattach)) != NULL) { 3577 WORKLIST_REMOVE(wk); 3578 WORKLIST_INSERT(&bp->b_dep, wk); 3579 } 3580 #ifdef DEBUG 3581 if (lk.lkt_held != SPECIAL_FLAG) 3582 panic("softdep_disk_write_complete: lock lost"); 3583 lk.lkt_held = NOHOLDER; 3584 #endif 3585 } 3586 3587 /* 3588 * Called from within softdep_disk_write_complete above. Note that 3589 * this routine is always called from interrupt level with further 3590 * splbio interrupts blocked. 3591 */ 3592 static void 3593 handle_allocdirect_partdone(adp) 3594 struct allocdirect *adp; /* the completed allocdirect */ 3595 { 3596 struct allocdirect *listadp; 3597 struct inodedep *inodedep; 3598 long bsize, delay; 3599 3600 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 3601 return; 3602 if (adp->ad_buf != NULL) { 3603 lk.lkt_held = NOHOLDER; 3604 panic("handle_allocdirect_partdone: dangling dep"); 3605 } 3606 /* 3607 * The on-disk inode cannot claim to be any larger than the last 3608 * fragment that has been written. Otherwise, the on-disk inode 3609 * might have fragments that were not the last block in the file 3610 * which would corrupt the filesystem. Thus, we cannot free any 3611 * allocdirects after one whose ad_oldblkno claims a fragment as 3612 * these blocks must be rolled back to zero before writing the inode. 3613 * We check the currently active set of allocdirects in id_inoupdt. 3614 */ 3615 inodedep = adp->ad_inodedep; 3616 bsize = inodedep->id_fs->fs_bsize; 3617 TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) { 3618 /* found our block */ 3619 if (listadp == adp) 3620 break; 3621 /* continue if ad_oldlbn is not a fragment */ 3622 if (listadp->ad_oldsize == 0 || 3623 listadp->ad_oldsize == bsize) 3624 continue; 3625 /* hit a fragment */ 3626 return; 3627 } 3628 /* 3629 * If we have reached the end of the current list without 3630 * finding the just finished dependency, then it must be 3631 * on the future dependency list. Future dependencies cannot 3632 * be freed until they are moved to the current list. 3633 */ 3634 if (listadp == NULL) { 3635 #ifdef DEBUG 3636 TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next) 3637 /* found our block */ 3638 if (listadp == adp) 3639 break; 3640 if (listadp == NULL) { 3641 lk.lkt_held = NOHOLDER; 3642 panic("handle_allocdirect_partdone: lost dep"); 3643 } 3644 #endif /* DEBUG */ 3645 return; 3646 } 3647 /* 3648 * If we have found the just finished dependency, then free 3649 * it along with anything that follows it that is complete. 
3650 * If the inode still has a bitmap dependency, then it has 3651 * never been written to disk, hence the on-disk inode cannot 3652 * reference the old fragment so we can free it without delay. 3653 */ 3654 delay = (inodedep->id_state & DEPCOMPLETE); 3655 for (; adp; adp = listadp) { 3656 listadp = TAILQ_NEXT(adp, ad_next); 3657 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 3658 return; 3659 free_allocdirect(&inodedep->id_inoupdt, adp, delay); 3660 } 3661 } 3662 3663 /* 3664 * Called from within softdep_disk_write_complete above. Note that 3665 * this routine is always called from interrupt level with further 3666 * splbio interrupts blocked. 3667 */ 3668 static void 3669 handle_allocindir_partdone(aip) 3670 struct allocindir *aip; /* the completed allocindir */ 3671 { 3672 struct indirdep *indirdep; 3673 3674 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) 3675 return; 3676 if (aip->ai_buf != NULL) { 3677 lk.lkt_held = NOHOLDER; 3678 panic("handle_allocindir_partdone: dangling dependency"); 3679 } 3680 indirdep = aip->ai_indirdep; 3681 if (indirdep->ir_state & UNDONE) { 3682 LIST_REMOVE(aip, ai_next); 3683 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); 3684 return; 3685 } 3686 ((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 3687 aip->ai_newblkno; 3688 LIST_REMOVE(aip, ai_next); 3689 if (aip->ai_freefrag != NULL) 3690 add_to_worklist(&aip->ai_freefrag->ff_list); 3691 WORKITEM_FREE(aip, D_ALLOCINDIR); 3692 } 3693 3694 /* 3695 * Called from within softdep_disk_write_complete above to restore 3696 * in-memory inode block contents to their most up-to-date state. Note 3697 * that this routine is always called from interrupt level with further 3698 * splbio interrupts blocked. 3699 */ 3700 static int 3701 handle_written_inodeblock(inodedep, bp) 3702 struct inodedep *inodedep; 3703 struct buf *bp; /* buffer containing the inode block */ 3704 { 3705 struct worklist *wk, *filefree; 3706 struct allocdirect *adp, *nextadp; 3707 struct dinode *dp; 3708 int hadchanges; 3709 3710 if ((inodedep->id_state & IOSTARTED) == 0) { 3711 lk.lkt_held = NOHOLDER; 3712 panic("handle_written_inodeblock: not started"); 3713 } 3714 inodedep->id_state &= ~IOSTARTED; 3715 inodedep->id_state |= COMPLETE; 3716 dp = (struct dinode *)bp->b_data + 3717 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 3718 /* 3719 * If we had to rollback the inode allocation because of 3720 * bitmaps being incomplete, then simply restore it. 3721 * Keep the block dirty so that it will not be reclaimed until 3722 * all associated dependencies have been cleared and the 3723 * corresponding updates written to disk. 3724 */ 3725 if (inodedep->id_savedino != NULL) { 3726 *dp = *inodedep->id_savedino; 3727 FREE(inodedep->id_savedino, M_INODEDEP); 3728 inodedep->id_savedino = NULL; 3729 if ((bp->b_flags & B_DELWRI) == 0) 3730 stat_inode_bitmap++; 3731 bdirty(bp); 3732 return (1); 3733 } 3734 /* 3735 * Roll forward anything that had to be rolled back before 3736 * the inode could be updated. 
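 * Each allocdirect that was rolled back at I/O initiation gets its
 * block pointer set to ad_newblkno and is moved from UNDONE back
 * to ATTACHED; hadchanges records whether the buffer must be
 * redirtied.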
3737 */ 3738 hadchanges = 0; 3739 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { 3740 nextadp = TAILQ_NEXT(adp, ad_next); 3741 if (adp->ad_state & ATTACHED) { 3742 lk.lkt_held = NOHOLDER; 3743 panic("handle_written_inodeblock: new entry"); 3744 } 3745 if (adp->ad_lbn < NDADDR) { 3746 if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) { 3747 lk.lkt_held = NOHOLDER; 3748 panic("%s: %s #%ld mismatch %d != %d", 3749 "handle_written_inodeblock", 3750 "direct pointer", adp->ad_lbn, 3751 dp->di_db[adp->ad_lbn], adp->ad_oldblkno); 3752 } 3753 dp->di_db[adp->ad_lbn] = adp->ad_newblkno; 3754 } else { 3755 if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) { 3756 lk.lkt_held = NOHOLDER; 3757 panic("%s: %s #%ld allocated as %d", 3758 "handle_written_inodeblock", 3759 "indirect pointer", adp->ad_lbn - NDADDR, 3760 dp->di_ib[adp->ad_lbn - NDADDR]); 3761 } 3762 dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno; 3763 } 3764 adp->ad_state &= ~UNDONE; 3765 adp->ad_state |= ATTACHED; 3766 hadchanges = 1; 3767 } 3768 if (hadchanges && (bp->b_flags & B_DELWRI) == 0) 3769 stat_direct_blk_ptrs++; 3770 /* 3771 * Reset the file size to its most up-to-date value. 3772 */ 3773 if (inodedep->id_savedsize == -1) { 3774 lk.lkt_held = NOHOLDER; 3775 panic("handle_written_inodeblock: bad size"); 3776 } 3777 if (dp->di_size != inodedep->id_savedsize) { 3778 dp->di_size = inodedep->id_savedsize; 3779 hadchanges = 1; 3780 } 3781 inodedep->id_savedsize = -1; 3782 /* 3783 * If there were any rollbacks in the inode block, then it must be 3784 * marked dirty so that it will eventually get written back in 3785 * its correct form. 3786 */ 3787 if (hadchanges) 3788 bdirty(bp); 3789 /* 3790 * Process any allocdirects that completed during the update. 3791 */ 3792 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) 3793 handle_allocdirect_partdone(adp); 3794 /* 3795 * Process deallocations that were held pending until the 3796 * inode had been written to disk. Freeing of the inode 3797 * is delayed until after all blocks have been freed to 3798 * avoid creation of new <vfsid, inum, lbn> triples 3799 * before the old ones have been deleted. 3800 */ 3801 filefree = NULL; 3802 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { 3803 WORKLIST_REMOVE(wk); 3804 switch (wk->wk_type) { 3805 3806 case D_FREEFILE: 3807 /* 3808 * We defer adding filefree to the worklist until 3809 * all other additions have been made to ensure 3810 * that it will be done after all the old blocks 3811 * have been freed. 3812 */ 3813 if (filefree != NULL) { 3814 lk.lkt_held = NOHOLDER; 3815 panic("handle_written_inodeblock: filefree"); 3816 } 3817 filefree = wk; 3818 continue; 3819 3820 case D_MKDIR: 3821 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); 3822 continue; 3823 3824 case D_DIRADD: 3825 diradd_inode_written(WK_DIRADD(wk), inodedep); 3826 continue; 3827 3828 case D_FREEBLKS: 3829 case D_FREEFRAG: 3830 case D_DIRREM: 3831 add_to_worklist(wk); 3832 continue; 3833 3834 case D_NEWDIRBLK: 3835 free_newdirblk(WK_NEWDIRBLK(wk)); 3836 continue; 3837 3838 default: 3839 lk.lkt_held = NOHOLDER; 3840 panic("handle_written_inodeblock: Unknown type %s", 3841 TYPENAME(wk->wk_type)); 3842 /* NOTREACHED */ 3843 } 3844 } 3845 if (filefree != NULL) { 3846 if (free_inodedep(inodedep) == 0) { 3847 lk.lkt_held = NOHOLDER; 3848 panic("handle_written_inodeblock: live inodedep"); 3849 } 3850 add_to_worklist(filefree); 3851 return (0); 3852 } 3853 3854 /* 3855 * If no outstanding dependencies, free it.
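 * A non-zero return value causes softdep_disk_write_complete to
 * reattach this inodedep to the buffer so that the rolled-forward
 * contents are eventually written in their final form.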
3856 */ 3857 if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0) 3858 return (0); 3859 return (hadchanges); 3860 } 3861 3862 /* 3863 * Process a diradd entry after its dependent inode has been written. 3864 * This routine must be called with splbio interrupts blocked. 3865 */ 3866 static void 3867 diradd_inode_written(dap, inodedep) 3868 struct diradd *dap; 3869 struct inodedep *inodedep; 3870 { 3871 struct pagedep *pagedep; 3872 3873 dap->da_state |= COMPLETE; 3874 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 3875 if (dap->da_state & DIRCHG) 3876 pagedep = dap->da_previous->dm_pagedep; 3877 else 3878 pagedep = dap->da_pagedep; 3879 LIST_REMOVE(dap, da_pdlist); 3880 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 3881 } 3882 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 3883 } 3884 3885 /* 3886 * Handle the completion of a mkdir dependency. 3887 */ 3888 static void 3889 handle_written_mkdir(mkdir, type) 3890 struct mkdir *mkdir; 3891 int type; 3892 { 3893 struct diradd *dap; 3894 struct pagedep *pagedep; 3895 3896 if (mkdir->md_state != type) { 3897 lk.lkt_held = NOHOLDER; 3898 panic("handle_written_mkdir: bad type"); 3899 } 3900 dap = mkdir->md_diradd; 3901 dap->da_state &= ~type; 3902 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) 3903 dap->da_state |= DEPCOMPLETE; 3904 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 3905 if (dap->da_state & DIRCHG) 3906 pagedep = dap->da_previous->dm_pagedep; 3907 else 3908 pagedep = dap->da_pagedep; 3909 LIST_REMOVE(dap, da_pdlist); 3910 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 3911 } 3912 LIST_REMOVE(mkdir, md_mkdirs); 3913 WORKITEM_FREE(mkdir, D_MKDIR); 3914 } 3915 3916 /* 3917 * Called from within softdep_disk_write_complete above. 3918 * A write operation was just completed. Removed inodes can 3919 * now be freed and associated block pointers may be committed. 3920 * Note that this routine is always called from interrupt level 3921 * with further splbio interrupts blocked. 3922 */ 3923 static int 3924 handle_written_filepage(pagedep, bp) 3925 struct pagedep *pagedep; 3926 struct buf *bp; /* buffer containing the written page */ 3927 { 3928 struct dirrem *dirrem; 3929 struct diradd *dap, *nextdap; 3930 struct direct *ep; 3931 int i, chgs; 3932 3933 if ((pagedep->pd_state & IOSTARTED) == 0) { 3934 lk.lkt_held = NOHOLDER; 3935 panic("handle_written_filepage: not started"); 3936 } 3937 pagedep->pd_state &= ~IOSTARTED; 3938 /* 3939 * Process any directory removals that have been committed. 3940 */ 3941 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { 3942 LIST_REMOVE(dirrem, dm_next); 3943 dirrem->dm_dirinum = pagedep->pd_ino; 3944 add_to_worklist(&dirrem->dm_list); 3945 } 3946 /* 3947 * Free any directory additions that have been committed. 3948 * If it is a newly allocated block, we have to wait until 3949 * the on-disk directory inode claims the new block. 3950 */ 3951 if ((pagedep->pd_state & NEWBLOCK) == 0) 3952 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 3953 free_diradd(dap); 3954 /* 3955 * Uncommitted directory entries must be restored. 
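 * Each diradd still pending has its inode number written back into
 * the directory block (it was cleared or reverted at I/O
 * initiation) and is flipped from UNDONE back to ATTACHED.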
3956 */ 3957 for (chgs = 0, i = 0; i < DAHASHSZ; i++) { 3958 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; 3959 dap = nextdap) { 3960 nextdap = LIST_NEXT(dap, da_pdlist); 3961 if (dap->da_state & ATTACHED) { 3962 lk.lkt_held = NOHOLDER; 3963 panic("handle_written_filepage: attached"); 3964 } 3965 ep = (struct direct *) 3966 ((char *)bp->b_data + dap->da_offset); 3967 ep->d_ino = dap->da_newinum; 3968 dap->da_state &= ~UNDONE; 3969 dap->da_state |= ATTACHED; 3970 chgs = 1; 3971 /* 3972 * If the inode referenced by the directory has 3973 * been written out, then the dependency can be 3974 * moved to the pending list. 3975 */ 3976 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 3977 LIST_REMOVE(dap, da_pdlist); 3978 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, 3979 da_pdlist); 3980 } 3981 } 3982 } 3983 /* 3984 * If there were any rollbacks in the directory, then it must be 3985 * marked dirty so that it will eventually get written back in 3986 * its correct form. 3987 */ 3988 if (chgs) { 3989 if ((bp->b_flags & B_DELWRI) == 0) 3990 stat_dir_entry++; 3991 bdirty(bp); 3992 return (1); 3993 } 3994 /* 3995 * If we are not waiting for a new directory block to be 3996 * claimed by its inode, then the pagedep will be freed. 3997 * Otherwise it will remain to track any new entries on 3998 * the page in case they are fsync'ed. 3999 */ 4000 if ((pagedep->pd_state & NEWBLOCK) == 0) { 4001 LIST_REMOVE(pagedep, pd_hash); 4002 WORKITEM_FREE(pagedep, D_PAGEDEP); 4003 } 4004 return (0); 4005 } 4006 4007 /* 4008 * Writing back in-core inode structures. 4009 * 4010 * The filesystem only accesses an inode's contents when it occupies an 4011 * "in-core" inode structure. These "in-core" structures are separate from 4012 * the page frames used to cache inode blocks. Only the latter are 4013 * transferred to/from the disk. So, when the updated contents of the 4014 * "in-core" inode structure are copied to the corresponding in-memory inode 4015 * block, the dependencies are also transferred. The following procedure is 4016 * called when copying a dirty "in-core" inode to a cached inode block. 4017 */ 4018 4019 /* 4020 * Called when an inode is loaded from disk. If the effective link count 4021 * differed from the actual link count when it was last flushed, then we 4022 * need to ensure that the correct effective link count is put back. 4023 */ 4024 void 4025 softdep_load_inodeblock(ip) 4026 struct inode *ip; /* the "in_core" copy of the inode */ 4027 { 4028 struct inodedep *inodedep; 4029 4030 /* 4031 * Check for alternate nlink count. 4032 */ 4033 ip->i_effnlink = ip->i_nlink; 4034 ACQUIRE_LOCK(&lk); 4035 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) { 4036 FREE_LOCK(&lk); 4037 return; 4038 } 4039 ip->i_effnlink -= inodedep->id_nlinkdelta; 4040 if (inodedep->id_state & SPACECOUNTED) 4041 ip->i_flag |= IN_SPACECOUNTED; 4042 FREE_LOCK(&lk); 4043 } 4044 4045 /* 4046 * This routine is called just before the "in-core" inode 4047 * information is to be copied to the in-memory inode block. 4048 * Recall that an inode block contains several inodes. If 4049 * the force flag is set, then the dependencies will be 4050 * cleared so that the update can always be made. Note that 4051 * the buffer is locked when this routine is called, so we 4052 * will never be in the middle of writing the inode block 4053 * to disk.
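 * The bulk of the work is moving dependencies from the "in-core"
 * inode onto lists tied to the buffer: id_newinoupdt entries are
 * merged into id_inoupdt and items on id_inowait are moved to
 * id_bufwait so that they run when the buffer write completes.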
4054 */ 4055 void 4056 softdep_update_inodeblock(ip, bp, waitfor) 4057 struct inode *ip; /* the "in_core" copy of the inode */ 4058 struct buf *bp; /* the buffer containing the inode block */ 4059 int waitfor; /* nonzero => update must be allowed */ 4060 { 4061 struct inodedep *inodedep; 4062 struct worklist *wk; 4063 int error, gotit; 4064 4065 /* 4066 * If the effective link count is not equal to the actual link 4067 * count, then we must track the difference in an inodedep while 4068 * the inode is (potentially) tossed out of the cache. Otherwise, 4069 * if there is no existing inodedep, then there are no dependencies 4070 * to track. 4071 */ 4072 ACQUIRE_LOCK(&lk); 4073 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) { 4074 FREE_LOCK(&lk); 4075 if (ip->i_effnlink != ip->i_nlink) 4076 panic("softdep_update_inodeblock: bad link count"); 4077 return; 4078 } 4079 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) { 4080 FREE_LOCK(&lk); 4081 panic("softdep_update_inodeblock: bad delta"); 4082 } 4083 /* 4084 * Changes have been initiated. Anything depending on these 4085 * changes cannot occur until this inode has been written. 4086 */ 4087 inodedep->id_state &= ~COMPLETE; 4088 if ((inodedep->id_state & ONWORKLIST) == 0) 4089 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); 4090 /* 4091 * Any new dependencies associated with the incore inode must 4092 * now be moved to the list associated with the buffer holding 4093 * the in-memory copy of the inode. Once merged, process any 4094 * allocdirects that are completed by the merger. 4095 */ 4096 merge_inode_lists(inodedep); 4097 if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL) 4098 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); 4099 /* 4100 * Now that the inode has been pushed into the buffer, the 4101 * operations dependent on the inode being written to disk 4102 * can be moved to the id_bufwait so that they will be 4103 * processed when the buffer I/O completes. 4104 */ 4105 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { 4106 WORKLIST_REMOVE(wk); 4107 WORKLIST_INSERT(&inodedep->id_bufwait, wk); 4108 } 4109 /* 4110 * Newly allocated inodes cannot be written until the bitmap 4111 * that allocates them has been written (indicated by 4112 * DEPCOMPLETE being set in id_state). If we are doing a 4113 * forced sync (e.g., an fsync on a file), we force the bitmap 4114 * to be written so that the update can be done. 4115 */ 4116 if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) { 4117 FREE_LOCK(&lk); 4118 return; 4119 } 4120 gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT); 4121 FREE_LOCK(&lk); 4122 if (gotit && 4123 (error = BUF_WRITE(inodedep->id_buf)) != 0) 4124 softdep_error("softdep_update_inodeblock: bwrite", error); 4125 if ((inodedep->id_state & DEPCOMPLETE) == 0) 4126 panic("softdep_update_inodeblock: update failed"); 4127 } 4128 4129 /* 4130 * Merge the new inode dependency list (id_newinoupdt) into the old 4131 * inode dependency list (id_inoupdt). This routine must be called 4132 * with splbio interrupts blocked.
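 * Both lists are kept in logical block (lbn) order, so the merge
 * is a single pass; when an entry for the same lbn appears on both
 * lists the two allocdirects are collapsed by allocdirect_merge.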
4133 */ 4134 static void 4135 merge_inode_lists(inodedep) 4136 struct inodedep *inodedep; 4137 { 4138 struct allocdirect *listadp, *newadp; 4139 4140 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt); 4141 for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) { 4142 if (listadp->ad_lbn < newadp->ad_lbn) { 4143 listadp = TAILQ_NEXT(listadp, ad_next); 4144 continue; 4145 } 4146 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next); 4147 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); 4148 if (listadp->ad_lbn == newadp->ad_lbn) { 4149 allocdirect_merge(&inodedep->id_inoupdt, newadp, 4150 listadp); 4151 listadp = newadp; 4152 } 4153 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt); 4154 } 4155 while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) { 4156 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next); 4157 TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next); 4158 } 4159 } 4160 4161 /* 4162 * If we are doing an fsync, then we must ensure that any directory 4163 * entries for the inode have been written after the inode gets to disk. 4164 */ 4165 int 4166 softdep_fsync(vp) 4167 struct vnode *vp; /* the "in_core" copy of the inode */ 4168 { 4169 struct inodedep *inodedep; 4170 struct pagedep *pagedep; 4171 struct worklist *wk; 4172 struct diradd *dap; 4173 struct mount *mnt; 4174 struct vnode *pvp; 4175 struct inode *ip; 4176 struct buf *bp; 4177 struct fs *fs; 4178 struct thread *td = curthread; 4179 int error, flushparent; 4180 ino_t parentino; 4181 ufs_lbn_t lbn; 4182 4183 ip = VTOI(vp); 4184 fs = ip->i_fs; 4185 ACQUIRE_LOCK(&lk); 4186 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) { 4187 FREE_LOCK(&lk); 4188 return (0); 4189 } 4190 if (LIST_FIRST(&inodedep->id_inowait) != NULL || 4191 LIST_FIRST(&inodedep->id_bufwait) != NULL || 4192 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || 4193 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) { 4194 FREE_LOCK(&lk); 4195 panic("softdep_fsync: pending ops"); 4196 } 4197 for (error = 0, flushparent = 0; ; ) { 4198 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) 4199 break; 4200 if (wk->wk_type != D_DIRADD) { 4201 FREE_LOCK(&lk); 4202 panic("softdep_fsync: Unexpected type %s", 4203 TYPENAME(wk->wk_type)); 4204 } 4205 dap = WK_DIRADD(wk); 4206 /* 4207 * Flush our parent if this directory entry has a MKDIR_PARENT 4208 * dependency or is contained in a newly allocated block. 4209 */ 4210 if (dap->da_state & DIRCHG) 4211 pagedep = dap->da_previous->dm_pagedep; 4212 else 4213 pagedep = dap->da_pagedep; 4214 mnt = pagedep->pd_mnt; 4215 parentino = pagedep->pd_ino; 4216 lbn = pagedep->pd_lbn; 4217 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) { 4218 FREE_LOCK(&lk); 4219 panic("softdep_fsync: dirty"); 4220 } 4221 if ((dap->da_state & MKDIR_PARENT) || 4222 (pagedep->pd_state & NEWBLOCK)) 4223 flushparent = 1; 4224 else 4225 flushparent = 0; 4226 /* 4227 * If we are being fsync'ed as part of vgone'ing this vnode, 4228 * then we will not be able to release and recover the 4229 * vnode below, so we just have to give up on writing its 4230 * directory entry out. It will eventually be written, just 4231 * not now, but then the user was not asking to have it 4232 * written, so we are not breaking any promises. 4233 */ 4234 if (vp->v_flag & VXLOCK) 4235 break; 4236 /* 4237 * We prevent deadlock by always fetching inodes from the 4238 * root, moving down the directory tree. Thus, when fetching 4239 * our parent directory, we first try to get the lock. 
If 4240 * that fails, we must unlock ourselves before requesting 4241 * the lock on our parent. See the comment in ufs_lookup 4242 * for details on possible races. 4243 */ 4244 FREE_LOCK(&lk); 4245 if (VFS_VGET(mnt, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) { 4246 VOP_UNLOCK(vp, 0, td); 4247 error = VFS_VGET(mnt, parentino, LK_EXCLUSIVE, &pvp); 4248 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 4249 if (error != 0) 4250 return (error); 4251 } 4252 /* 4253 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps 4254 * that are contained in direct blocks will be resolved by 4255 * doing a UFS_UPDATE. Pagedeps contained in indirect blocks 4256 * may require a complete sync'ing of the directory. So, we 4257 * try the cheap and fast UFS_UPDATE first, and if that fails, 4258 * then we do the slower VOP_FSYNC of the directory. 4259 */ 4260 if (flushparent) { 4261 if ((error = UFS_UPDATE(pvp, 1)) != 0) { 4262 vput(pvp); 4263 return (error); 4264 } 4265 if ((pagedep->pd_state & NEWBLOCK) && 4266 (error = VOP_FSYNC(pvp, td->td_ucred, MNT_WAIT, td))) { 4267 vput(pvp); 4268 return (error); 4269 } 4270 } 4271 /* 4272 * Flush directory page containing the inode's name. 4273 */ 4274 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, 4275 &bp); 4276 if (error == 0) 4277 error = BUF_WRITE(bp); 4278 else 4279 brelse(bp); 4280 vput(pvp); 4281 if (error != 0) 4282 return (error); 4283 ACQUIRE_LOCK(&lk); 4284 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) 4285 break; 4286 } 4287 FREE_LOCK(&lk); 4288 return (0); 4289 } 4290 4291 /* 4292 * Flush all the dirty bitmaps associated with the block device 4293 * before flushing the rest of the dirty blocks so as to reduce 4294 * the number of dependencies that will have to be rolled back. 4295 */ 4296 void 4297 softdep_fsync_mountdev(vp) 4298 struct vnode *vp; 4299 { 4300 struct buf *bp, *nbp; 4301 struct worklist *wk; 4302 4303 if (!vn_isdisk(vp, NULL)) 4304 panic("softdep_fsync_mountdev: vnode not a disk"); 4305 ACQUIRE_LOCK(&lk); 4306 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 4307 nbp = TAILQ_NEXT(bp, b_vnbufs); 4308 /* 4309 * If it is already scheduled, skip to the next buffer. 4310 */ 4311 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) 4312 continue; 4313 if ((bp->b_flags & B_DELWRI) == 0) { 4314 FREE_LOCK(&lk); 4315 panic("softdep_fsync_mountdev: not dirty"); 4316 } 4317 /* 4318 * We are only interested in bitmaps with outstanding 4319 * dependencies. 4320 */ 4321 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || 4322 wk->wk_type != D_BMSAFEMAP || 4323 (bp->b_xflags & BX_BKGRDINPROG)) { 4324 BUF_UNLOCK(bp); 4325 continue; 4326 } 4327 bremfree(bp); 4328 FREE_LOCK(&lk); 4329 (void) bawrite(bp); 4330 ACQUIRE_LOCK(&lk); 4331 /* 4332 * Since we may have slept during the I/O, we need 4333 * to start from a known point. 4334 */ 4335 nbp = TAILQ_FIRST(&vp->v_dirtyblkhd); 4336 } 4337 drain_output(vp, 1); 4338 FREE_LOCK(&lk); 4339 } 4340 4341 /* 4342 * This routine is called when we are trying to synchronously flush a 4343 * file. This routine must eliminate any filesystem metadata dependencies 4344 * so that the syncing routine can succeed by pushing the dirty blocks 4345 * associated with the file. If any I/O errors occur, they are returned. 
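 * The vnode may be a regular file, a directory, or a filesystem
 * device vnode; in each case the dependencies attached to its
 * dirty buffers are flushed, in two passes, before returning.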
4346 */ 4347 int 4348 softdep_sync_metadata(ap) 4349 struct vop_fsync_args /* { 4350 struct vnode *a_vp; 4351 struct ucred *a_cred; 4352 int a_waitfor; 4353 struct thread *a_td; 4354 } */ *ap; 4355 { 4356 struct vnode *vp = ap->a_vp; 4357 struct pagedep *pagedep; 4358 struct allocdirect *adp; 4359 struct allocindir *aip; 4360 struct buf *bp, *nbp; 4361 struct worklist *wk; 4362 int i, error, waitfor; 4363 4364 /* 4365 * Check whether this vnode is involved in a filesystem 4366 * that is doing soft dependency processing. 4367 */ 4368 if (!vn_isdisk(vp, NULL)) { 4369 if (!DOINGSOFTDEP(vp)) 4370 return (0); 4371 } else 4372 if (vp->v_rdev->si_mountpoint == NULL || 4373 (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0) 4374 return (0); 4375 /* 4376 * Ensure that any direct block dependencies have been cleared. 4377 */ 4378 ACQUIRE_LOCK(&lk); 4379 if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) { 4380 FREE_LOCK(&lk); 4381 return (error); 4382 } 4383 /* 4384 * For most files, the only metadata dependencies are the 4385 * cylinder group maps that allocate their inode or blocks. 4386 * The block allocation dependencies can be found by traversing 4387 * the dependency lists for any buffers that remain on their 4388 * dirty buffer list. The inode allocation dependency will 4389 * be resolved when the inode is updated with MNT_WAIT. 4390 * This work is done in two passes. The first pass grabs most 4391 * of the buffers and begins asynchronously writing them. The 4392 * only way to wait for these asynchronous writes is to sleep 4393 * on the filesystem vnode which may stay busy for a long time 4394 * if the filesystem is active. So, instead, we make a second 4395 * pass over the dependencies blocking on each write. In the 4396 * usual case we will be blocking against a write that we 4397 * initiated, so when it is done the dependency will have been 4398 * resolved. Thus the second pass is expected to end quickly. 4399 */ 4400 waitfor = MNT_NOWAIT; 4401 top: 4402 /* 4403 * We must wait for any I/O in progress to finish so that 4404 * all potential buffers on the dirty list will be visible. 4405 */ 4406 drain_output(vp, 1); 4407 if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) { 4408 FREE_LOCK(&lk); 4409 return (0); 4410 } 4411 bp = TAILQ_FIRST(&vp->v_dirtyblkhd); 4412 /* While syncing snapshots, we must allow recursive lookups */ 4413 bp->b_lock.lk_flags |= LK_CANRECURSE; 4414 loop: 4415 /* 4416 * As we hold the buffer locked, none of its dependencies 4417 * will disappear. 
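 * Most dependency types below are handled by finding the buffer
 * they are waiting on and pushing it: asynchronously (bawrite)
 * when waitfor is MNT_NOWAIT, synchronously (BUF_WRITE) when it is
 * MNT_WAIT. Inodedeps and pagedeps are handed to
 * flush_inodedep_deps and flush_pagedep_deps instead.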
4418 */ 4419 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 4420 switch (wk->wk_type) { 4421 4422 case D_ALLOCDIRECT: 4423 adp = WK_ALLOCDIRECT(wk); 4424 if (adp->ad_state & DEPCOMPLETE) 4425 continue; 4426 nbp = adp->ad_buf; 4427 if (getdirtybuf(&nbp, waitfor) == 0) 4428 continue; 4429 FREE_LOCK(&lk); 4430 if (waitfor == MNT_NOWAIT) { 4431 bawrite(nbp); 4432 } else if ((error = BUF_WRITE(nbp)) != 0) { 4433 break; 4434 } 4435 ACQUIRE_LOCK(&lk); 4436 continue; 4437 4438 case D_ALLOCINDIR: 4439 aip = WK_ALLOCINDIR(wk); 4440 if (aip->ai_state & DEPCOMPLETE) 4441 continue; 4442 nbp = aip->ai_buf; 4443 if (getdirtybuf(&nbp, waitfor) == 0) 4444 continue; 4445 FREE_LOCK(&lk); 4446 if (waitfor == MNT_NOWAIT) { 4447 bawrite(nbp); 4448 } else if ((error = BUF_WRITE(nbp)) != 0) { 4449 break; 4450 } 4451 ACQUIRE_LOCK(&lk); 4452 continue; 4453 4454 case D_INDIRDEP: 4455 restart: 4456 4457 LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { 4458 if (aip->ai_state & DEPCOMPLETE) 4459 continue; 4460 nbp = aip->ai_buf; 4461 if (getdirtybuf(&nbp, MNT_WAIT) == 0) 4462 goto restart; 4463 FREE_LOCK(&lk); 4464 if ((error = BUF_WRITE(nbp)) != 0) { 4465 break; 4466 } 4467 ACQUIRE_LOCK(&lk); 4468 goto restart; 4469 } 4470 continue; 4471 4472 case D_INODEDEP: 4473 if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs, 4474 WK_INODEDEP(wk)->id_ino)) != 0) { 4475 FREE_LOCK(&lk); 4476 break; 4477 } 4478 continue; 4479 4480 case D_PAGEDEP: 4481 /* 4482 * We are trying to sync a directory that may 4483 * have dependencies on both its own metadata 4484 * and/or dependencies on the inodes of any 4485 * recently allocated files. We walk its diradd 4486 * lists pushing out the associated inode. 4487 */ 4488 pagedep = WK_PAGEDEP(wk); 4489 for (i = 0; i < DAHASHSZ; i++) { 4490 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) 4491 continue; 4492 if ((error = 4493 flush_pagedep_deps(vp, pagedep->pd_mnt, 4494 &pagedep->pd_diraddhd[i]))) { 4495 FREE_LOCK(&lk); 4496 break; 4497 } 4498 } 4499 continue; 4500 4501 case D_MKDIR: 4502 /* 4503 * This case should never happen if the vnode has 4504 * been properly sync'ed. However, if this function 4505 * is used at a place where the vnode has not yet 4506 * been sync'ed, this dependency can show up. So, 4507 * rather than panic, just flush it. 4508 */ 4509 nbp = WK_MKDIR(wk)->md_buf; 4510 if (getdirtybuf(&nbp, waitfor) == 0) 4511 continue; 4512 FREE_LOCK(&lk); 4513 if (waitfor == MNT_NOWAIT) { 4514 bawrite(nbp); 4515 } else if ((error = BUF_WRITE(nbp)) != 0) { 4516 break; 4517 } 4518 ACQUIRE_LOCK(&lk); 4519 continue; 4520 4521 case D_BMSAFEMAP: 4522 /* 4523 * This case should never happen if the vnode has 4524 * been properly sync'ed. However, if this function 4525 * is used at a place where the vnode has not yet 4526 * been sync'ed, this dependency can show up. So, 4527 * rather than panic, just flush it. 
4528 */ 4529 nbp = WK_BMSAFEMAP(wk)->sm_buf; 4530 if (getdirtybuf(&nbp, waitfor) == 0) 4531 continue; 4532 FREE_LOCK(&lk); 4533 if (waitfor == MNT_NOWAIT) { 4534 bawrite(nbp); 4535 } else if ((error = BUF_WRITE(nbp)) != 0) { 4536 break; 4537 } 4538 ACQUIRE_LOCK(&lk); 4539 continue; 4540 4541 default: 4542 FREE_LOCK(&lk); 4543 panic("softdep_sync_metadata: Unknown type %s", 4544 TYPENAME(wk->wk_type)); 4545 /* NOTREACHED */ 4546 } 4547 /* We reach here only in error and unlocked */ 4548 if (error == 0) 4549 panic("softdep_sync_metadata: zero error"); 4550 bp->b_lock.lk_flags &= ~LK_CANRECURSE; 4551 bawrite(bp); 4552 return (error); 4553 } 4554 (void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT); 4555 nbp = TAILQ_NEXT(bp, b_vnbufs); 4556 FREE_LOCK(&lk); 4557 bp->b_lock.lk_flags &= ~LK_CANRECURSE; 4558 bawrite(bp); 4559 ACQUIRE_LOCK(&lk); 4560 if (nbp != NULL) { 4561 bp = nbp; 4562 goto loop; 4563 } 4564 /* 4565 * The brief unlock is to allow any pent up dependency 4566 * processing to be done. Then proceed with the second pass. 4567 */ 4568 if (waitfor == MNT_NOWAIT) { 4569 waitfor = MNT_WAIT; 4570 FREE_LOCK(&lk); 4571 ACQUIRE_LOCK(&lk); 4572 goto top; 4573 } 4574 4575 /* 4576 * If we have managed to get rid of all the dirty buffers, 4577 * then we are done. For certain directories and block 4578 * devices, we may need to do further work. 4579 * 4580 * We must wait for any I/O in progress to finish so that 4581 * all potential buffers on the dirty list will be visible. 4582 */ 4583 drain_output(vp, 1); 4584 if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) { 4585 FREE_LOCK(&lk); 4586 return (0); 4587 } 4588 4589 FREE_LOCK(&lk); 4590 /* 4591 * If we are trying to sync a block device, some of its buffers may 4592 * contain metadata that cannot be written until the contents of some 4593 * partially written files have been written to disk. The only easy 4594 * way to accomplish this is to sync the entire filesystem (luckily 4595 * this happens rarely). 4596 */ 4597 if (vn_isdisk(vp, NULL) && 4598 vp->v_rdev->si_mountpoint && !VOP_ISLOCKED(vp, NULL) && 4599 (error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT, ap->a_cred, 4600 ap->a_td)) != 0) 4601 return (error); 4602 return (0); 4603 } 4604 4605 /* 4606 * Flush the dependencies associated with an inodedep. 4607 * Called with splbio blocked. 4608 */ 4609 static int 4610 flush_inodedep_deps(fs, ino) 4611 struct fs *fs; 4612 ino_t ino; 4613 { 4614 struct inodedep *inodedep; 4615 struct allocdirect *adp; 4616 int error, waitfor; 4617 struct buf *bp; 4618 4619 /* 4620 * This work is done in two passes. The first pass grabs most 4621 * of the buffers and begins asynchronously writing them. The 4622 * only way to wait for these asynchronous writes is to sleep 4623 * on the filesystem vnode which may stay busy for a long time 4624 * if the filesystem is active. So, instead, we make a second 4625 * pass over the dependencies blocking on each write. In the 4626 * usual case we will be blocking against a write that we 4627 * initiated, so when it is done the dependency will have been 4628 * resolved. Thus the second pass is expected to end quickly. 4629 * We give a brief window at the top of the loop to allow 4630 * any pending I/O to complete. 
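 * The FREE_LOCK/ACQUIRE_LOCK pair at the top of the loop provides
 * that window; inodedep_lookup must then be repeated because the
 * inodedep may have been freed while the lock was released.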
4631 */ 4632 for (waitfor = MNT_NOWAIT; ; ) { 4633 FREE_LOCK(&lk); 4634 ACQUIRE_LOCK(&lk); 4635 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) 4636 return (0); 4637 TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) { 4638 if (adp->ad_state & DEPCOMPLETE) 4639 continue; 4640 bp = adp->ad_buf; 4641 if (getdirtybuf(&bp, waitfor) == 0) { 4642 if (waitfor == MNT_NOWAIT) 4643 continue; 4644 break; 4645 } 4646 FREE_LOCK(&lk); 4647 if (waitfor == MNT_NOWAIT) { 4648 bawrite(bp); 4649 } else if ((error = BUF_WRITE(bp)) != 0) { 4650 ACQUIRE_LOCK(&lk); 4651 return (error); 4652 } 4653 ACQUIRE_LOCK(&lk); 4654 break; 4655 } 4656 if (adp != NULL) 4657 continue; 4658 TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) { 4659 if (adp->ad_state & DEPCOMPLETE) 4660 continue; 4661 bp = adp->ad_buf; 4662 if (getdirtybuf(&bp, waitfor) == 0) { 4663 if (waitfor == MNT_NOWAIT) 4664 continue; 4665 break; 4666 } 4667 FREE_LOCK(&lk); 4668 if (waitfor == MNT_NOWAIT) { 4669 bawrite(bp); 4670 } else if ((error = BUF_WRITE(bp)) != 0) { 4671 ACQUIRE_LOCK(&lk); 4672 return (error); 4673 } 4674 ACQUIRE_LOCK(&lk); 4675 break; 4676 } 4677 if (adp != NULL) 4678 continue; 4679 /* 4680 * If pass2, we are done, otherwise do pass 2. 4681 */ 4682 if (waitfor == MNT_WAIT) 4683 break; 4684 waitfor = MNT_WAIT; 4685 } 4686 /* 4687 * Try freeing inodedep in case all dependencies have been removed. 4688 */ 4689 if (inodedep_lookup(fs, ino, 0, &inodedep) != 0) 4690 (void) free_inodedep(inodedep); 4691 return (0); 4692 } 4693 4694 /* 4695 * Eliminate a pagedep dependency by flushing out all its diradd dependencies. 4696 * Called with splbio blocked. 4697 */ 4698 static int 4699 flush_pagedep_deps(pvp, mp, diraddhdp) 4700 struct vnode *pvp; 4701 struct mount *mp; 4702 struct diraddhd *diraddhdp; 4703 { 4704 struct thread *td = curthread; 4705 struct inodedep *inodedep; 4706 struct ufsmount *ump; 4707 struct diradd *dap; 4708 struct vnode *vp; 4709 int gotit, error = 0; 4710 struct buf *bp; 4711 ino_t inum; 4712 4713 ump = VFSTOUFS(mp); 4714 while ((dap = LIST_FIRST(diraddhdp)) != NULL) { 4715 /* 4716 * Flush ourselves if this directory entry 4717 * has a MKDIR_PARENT dependency. 4718 */ 4719 if (dap->da_state & MKDIR_PARENT) { 4720 FREE_LOCK(&lk); 4721 if ((error = UFS_UPDATE(pvp, 1)) != 0) 4722 break; 4723 ACQUIRE_LOCK(&lk); 4724 /* 4725 * If that cleared dependencies, go on to next. 4726 */ 4727 if (dap != LIST_FIRST(diraddhdp)) 4728 continue; 4729 if (dap->da_state & MKDIR_PARENT) { 4730 FREE_LOCK(&lk); 4731 panic("flush_pagedep_deps: MKDIR_PARENT"); 4732 } 4733 } 4734 /* 4735 * A newly allocated directory must have its "." and 4736 * ".." entries written out before its name can be 4737 * committed in its parent. We do not want or need 4738 * the full semantics of a synchronous VOP_FSYNC as 4739 * that may end up here again, once for each directory 4740 * level in the filesystem. Instead, we push the blocks 4741 * and wait for them to clear. We have to fsync twice 4742 * because the first call may choose to defer blocks 4743 * that still have dependencies, but deferral will 4744 * happen at most once. 
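 * Both VOP_FSYNC calls below use MNT_NOWAIT to push the blocks
 * without sleeping on the new directory's vnode; the following
 * drain_output is what waits for those writes to complete.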
4745 */ 4746 inum = dap->da_newinum; 4747 if (dap->da_state & MKDIR_BODY) { 4748 FREE_LOCK(&lk); 4749 if ((error = VFS_VGET(mp, inum, LK_EXCLUSIVE, &vp))) 4750 break; 4751 if ((error=VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td)) || 4752 (error=VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td))) { 4753 vput(vp); 4754 break; 4755 } 4756 drain_output(vp, 0); 4757 vput(vp); 4758 ACQUIRE_LOCK(&lk); 4759 /* 4760 * If that cleared dependencies, go on to next. 4761 */ 4762 if (dap != LIST_FIRST(diraddhdp)) 4763 continue; 4764 if (dap->da_state & MKDIR_BODY) { 4765 FREE_LOCK(&lk); 4766 panic("flush_pagedep_deps: MKDIR_BODY"); 4767 } 4768 } 4769 /* 4770 * Flush the inode on which the directory entry depends. 4771 * Having accounted for MKDIR_PARENT and MKDIR_BODY above, 4772 * the only remaining dependency is that the updated inode 4773 * count must get pushed to disk. The inode has already 4774 * been pushed into its inode buffer (via VOP_UPDATE) at 4775 * the time of the reference count change. So we need only 4776 * locate that buffer, ensure that there will be no rollback 4777 * caused by a bitmap dependency, then write the inode buffer. 4778 */ 4779 if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) { 4780 FREE_LOCK(&lk); 4781 panic("flush_pagedep_deps: lost inode"); 4782 } 4783 /* 4784 * If the inode still has bitmap dependencies, 4785 * push them to disk. 4786 */ 4787 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 4788 gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT); 4789 FREE_LOCK(&lk); 4790 if (gotit && 4791 (error = BUF_WRITE(inodedep->id_buf)) != 0) 4792 break; 4793 ACQUIRE_LOCK(&lk); 4794 if (dap != LIST_FIRST(diraddhdp)) 4795 continue; 4796 } 4797 /* 4798 * If the inode is still sitting in a buffer waiting 4799 * to be written, push it to disk. 4800 */ 4801 FREE_LOCK(&lk); 4802 if ((error = bread(ump->um_devvp, 4803 fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)), 4804 (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) { 4805 brelse(bp); 4806 break; 4807 } 4808 if ((error = BUF_WRITE(bp)) != 0) 4809 break; 4810 ACQUIRE_LOCK(&lk); 4811 /* 4812 * If we have failed to get rid of all the dependencies 4813 * then something is seriously wrong. 4814 */ 4815 if (dap == LIST_FIRST(diraddhdp)) { 4816 FREE_LOCK(&lk); 4817 panic("flush_pagedep_deps: flush failed"); 4818 } 4819 } 4820 if (error) 4821 ACQUIRE_LOCK(&lk); 4822 return (error); 4823 } 4824 4825 /* 4826 * A large burst of file addition or deletion activity can drive the 4827 * memory load excessively high. First attempt to slow things down 4828 * using the techniques below. If that fails, this routine requests 4829 * the offending operations to fall back to running synchronously 4830 * until the memory load returns to a reasonable level. 4831 */ 4832 int 4833 softdep_slowdown(vp) 4834 struct vnode *vp; 4835 { 4836 int max_softdeps_hard; 4837 4838 max_softdeps_hard = max_softdeps * 11 / 10; 4839 if (num_dirrem < max_softdeps_hard / 2 && 4840 num_inodedep < max_softdeps_hard) 4841 return (0); 4842 stat_sync_limit_hit += 1; 4843 return (1); 4844 } 4845 4846 /* 4847 * Called by the allocation routines when they are about to fail 4848 * in the hope that we can free up some disk space. 4849 * 4850 * First check to see if the work list has anything on it. If it has, 4851 * clean up entries until we successfully free some space. Because this 4852 * process holds inodes locked, we cannot handle any remove requests 4853 * that might block on a locked inode as that could lead to deadlock. 
4854 * If the worklist yields no free space, encourage the syncer daemon 4855 * to help us. In no event will we try for longer than tickdelay seconds. 4856 */ 4857 int 4858 softdep_request_cleanup(fs, vp) 4859 struct fs *fs; 4860 struct vnode *vp; 4861 { 4862 long starttime, needed; 4863 4864 needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize; 4865 starttime = time_second + tickdelay; 4866 if (UFS_UPDATE(vp, 1) != 0) 4867 return (0); 4868 while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) { 4869 if (time_second > starttime) 4870 return (0); 4871 if (num_on_worklist > 0 && 4872 process_worklist_item(NULL, LK_NOWAIT) != -1) { 4873 stat_worklist_push += 1; 4874 continue; 4875 } 4876 request_cleanup(FLUSH_REMOVE_WAIT, 0); 4877 } 4878 return (1); 4879 } 4880 4881 /* 4882 * If memory utilization has gotten too high, deliberately slow things 4883 * down and speed up the I/O processing. 4884 */ 4885 static int 4886 request_cleanup(resource, islocked) 4887 int resource; 4888 int islocked; 4889 { 4890 struct thread *td = curthread; 4891 4892 /* 4893 * We never hold up the filesystem syncer process. 4894 */ 4895 if (td == filesys_syncer) 4896 return (0); 4897 /* 4898 * First check to see if the work list has gotten backlogged. 4899 * If it has, co-opt this process to help clean up two entries. 4900 * Because this process may hold inodes locked, we cannot 4901 * handle any remove requests that might block on a locked 4902 * inode as that could lead to deadlock. 4903 */ 4904 if (num_on_worklist > max_softdeps / 10) { 4905 if (islocked) 4906 FREE_LOCK(&lk); 4907 process_worklist_item(NULL, LK_NOWAIT); 4908 process_worklist_item(NULL, LK_NOWAIT); 4909 stat_worklist_push += 2; 4910 if (islocked) 4911 ACQUIRE_LOCK(&lk); 4912 return(1); 4913 } 4914 /* 4915 * Next, we attempt to speed up the syncer process. If that 4916 * is successful, then we allow the process to continue. 4917 */ 4918 if (speedup_syncer() && resource != FLUSH_REMOVE_WAIT) 4919 return(0); 4920 /* 4921 * If we are resource constrained on inode dependencies, try 4922 * flushing some dirty inodes. Otherwise, we are constrained 4923 * by file deletions, so try accelerating flushes of directories 4924 * with removal dependencies. We would like to do the cleanup 4925 * here, but we probably hold an inode locked at this point and 4926 * that might deadlock against one that we try to clean. So, 4927 * the best that we can do is request the syncer daemon to do 4928 * the cleanup for us. 4929 */ 4930 switch (resource) { 4931 4932 case FLUSH_INODES: 4933 stat_ino_limit_push += 1; 4934 req_clear_inodedeps += 1; 4935 stat_countp = &stat_ino_limit_hit; 4936 break; 4937 4938 case FLUSH_REMOVE: 4939 case FLUSH_REMOVE_WAIT: 4940 stat_blk_limit_push += 1; 4941 req_clear_remove += 1; 4942 stat_countp = &stat_blk_limit_hit; 4943 break; 4944 4945 default: 4946 if (islocked) 4947 FREE_LOCK(&lk); 4948 panic("request_cleanup: unknown type"); 4949 } 4950 /* 4951 * Hopefully the syncer daemon will catch up and awaken us. 4952 * We wait at most tickdelay before proceeding in any case. 4953 */ 4954 if (islocked == 0) 4955 ACQUIRE_LOCK(&lk); 4956 proc_waiting += 1; 4957 if (handle.callout == NULL) 4958 handle = timeout(pause_timer, 0, tickdelay > 2 ? 
tickdelay : 2); 4959 interlocked_sleep(&lk, SLEEP, (caddr_t)&proc_waiting, PPAUSE, 4960 "softupdate", 0); 4961 proc_waiting -= 1; 4962 if (islocked == 0) 4963 FREE_LOCK(&lk); 4964 return (1); 4965 } 4966 4967 /* 4968 * Awaken processes pausing in request_cleanup and clear proc_waiting 4969 * to indicate that there is no longer a timer running. 4970 */ 4971 void 4972 pause_timer(arg) 4973 void *arg; 4974 { 4975 4976 *stat_countp += 1; 4977 wakeup_one(&proc_waiting); 4978 if (proc_waiting > 0) 4979 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2); 4980 else 4981 handle.callout = NULL; 4982 } 4983 4984 /* 4985 * Flush out a directory with at least one removal dependency in an effort to 4986 * reduce the number of dirrem, freefile, and freeblks dependency structures. 4987 */ 4988 static void 4989 clear_remove(td) 4990 struct thread *td; 4991 { 4992 struct pagedep_hashhead *pagedephd; 4993 struct pagedep *pagedep; 4994 static int next = 0; 4995 struct mount *mp; 4996 struct vnode *vp; 4997 int error, cnt; 4998 ino_t ino; 4999 5000 ACQUIRE_LOCK(&lk); 5001 for (cnt = 0; cnt < pagedep_hash; cnt++) { 5002 pagedephd = &pagedep_hashtbl[next++]; 5003 if (next >= pagedep_hash) 5004 next = 0; 5005 LIST_FOREACH(pagedep, pagedephd, pd_hash) { 5006 if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL) 5007 continue; 5008 mp = pagedep->pd_mnt; 5009 ino = pagedep->pd_ino; 5010 FREE_LOCK(&lk); 5011 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) 5012 continue; 5013 if ((error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &vp))) { 5014 softdep_error("clear_remove: vget", error); 5015 vn_finished_write(mp); 5016 return; 5017 } 5018 if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td))) 5019 softdep_error("clear_remove: fsync", error); 5020 drain_output(vp, 0); 5021 vput(vp); 5022 vn_finished_write(mp); 5023 return; 5024 } 5025 } 5026 FREE_LOCK(&lk); 5027 } 5028 5029 /* 5030 * Clear out a block of dirty inodes in an effort to reduce 5031 * the number of inodedep dependency structures. 5032 */ 5033 static void 5034 clear_inodedeps(td) 5035 struct thread *td; 5036 { 5037 struct inodedep_hashhead *inodedephd; 5038 struct inodedep *inodedep; 5039 static int next = 0; 5040 struct mount *mp; 5041 struct vnode *vp; 5042 struct fs *fs; 5043 int error, cnt; 5044 ino_t firstino, lastino, ino; 5045 5046 ACQUIRE_LOCK(&lk); 5047 /* 5048 * Pick a random inode dependency to be cleared. 5049 * We will then gather up all the inodes in its block 5050 * that have dependencies and flush them out. 5051 */ 5052 for (cnt = 0; cnt < inodedep_hash; cnt++) { 5053 inodedephd = &inodedep_hashtbl[next++]; 5054 if (next >= inodedep_hash) 5055 next = 0; 5056 if ((inodedep = LIST_FIRST(inodedephd)) != NULL) 5057 break; 5058 } 5059 if (inodedep == NULL) 5060 return; 5061 /* 5062 * Ugly code to find mount point given pointer to superblock. 5063 */ 5064 fs = inodedep->id_fs; 5065 TAILQ_FOREACH(mp, &mountlist, mnt_list) 5066 if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs) 5067 break; 5068 /* 5069 * Find the last inode in the block with dependencies. 5070 */ 5071 firstino = inodedep->id_ino & ~(INOPB(fs) - 1); 5072 for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) 5073 if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0) 5074 break; 5075 /* 5076 * Asynchronously push all but the last inode with dependencies. 5077 * Synchronously push the last inode with dependencies to ensure 5078 * that the inode block gets written to free up the inodedeps. 
5079 */ 5080 for (ino = firstino; ino <= lastino; ino++) { 5081 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) 5082 continue; 5083 FREE_LOCK(&lk); 5084 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) 5085 continue; 5086 if ((error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &vp)) != 0) { 5087 softdep_error("clear_inodedeps: vget", error); 5088 vn_finished_write(mp); 5089 return; 5090 } 5091 if (ino == lastino) { 5092 if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_WAIT, td))) 5093 softdep_error("clear_inodedeps: fsync1", error); 5094 } else { 5095 if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td))) 5096 softdep_error("clear_inodedeps: fsync2", error); 5097 drain_output(vp, 0); 5098 } 5099 vput(vp); 5100 vn_finished_write(mp); 5101 ACQUIRE_LOCK(&lk); 5102 } 5103 FREE_LOCK(&lk); 5104 } 5105 5106 /* 5107 * Function to determine if the buffer has outstanding dependencies 5108 * that will cause a roll-back if the buffer is written. If wantcount 5109 * is set, return number of dependencies, otherwise just yes or no. 5110 */ 5111 static int 5112 softdep_count_dependencies(bp, wantcount) 5113 struct buf *bp; 5114 int wantcount; 5115 { 5116 struct worklist *wk; 5117 struct inodedep *inodedep; 5118 struct indirdep *indirdep; 5119 struct allocindir *aip; 5120 struct pagedep *pagedep; 5121 struct diradd *dap; 5122 int i, retval; 5123 5124 retval = 0; 5125 ACQUIRE_LOCK(&lk); 5126 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 5127 switch (wk->wk_type) { 5128 5129 case D_INODEDEP: 5130 inodedep = WK_INODEDEP(wk); 5131 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 5132 /* bitmap allocation dependency */ 5133 retval += 1; 5134 if (!wantcount) 5135 goto out; 5136 } 5137 if (TAILQ_FIRST(&inodedep->id_inoupdt)) { 5138 /* direct block pointer dependency */ 5139 retval += 1; 5140 if (!wantcount) 5141 goto out; 5142 } 5143 continue; 5144 5145 case D_INDIRDEP: 5146 indirdep = WK_INDIRDEP(wk); 5147 5148 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { 5149 /* indirect block pointer dependency */ 5150 retval += 1; 5151 if (!wantcount) 5152 goto out; 5153 } 5154 continue; 5155 5156 case D_PAGEDEP: 5157 pagedep = WK_PAGEDEP(wk); 5158 for (i = 0; i < DAHASHSZ; i++) { 5159 5160 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { 5161 /* directory entry dependency */ 5162 retval += 1; 5163 if (!wantcount) 5164 goto out; 5165 } 5166 } 5167 continue; 5168 5169 case D_BMSAFEMAP: 5170 case D_ALLOCDIRECT: 5171 case D_ALLOCINDIR: 5172 case D_MKDIR: 5173 /* never a dependency on these blocks */ 5174 continue; 5175 5176 default: 5177 FREE_LOCK(&lk); 5178 panic("softdep_check_for_rollback: Unexpected type %s", 5179 TYPENAME(wk->wk_type)); 5180 /* NOTREACHED */ 5181 } 5182 } 5183 out: 5184 FREE_LOCK(&lk); 5185 return retval; 5186 } 5187 5188 /* 5189 * Acquire exclusive access to a buffer. 5190 * Must be called with splbio blocked. 5191 * Return 1 if buffer was acquired. 
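 * Zero is returned when there is no buffer, when it cannot be
 * locked without sleeping and MNT_WAIT was not requested, or when
 * it turns out not to be delayed-write (B_DELWRI clear). On
 * success the buffer is removed from its queue with bremfree.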
5192 */ 5193 static int 5194 getdirtybuf(bpp, waitfor) 5195 struct buf **bpp; 5196 int waitfor; 5197 { 5198 struct buf *bp; 5199 int error; 5200 5201 for (;;) { 5202 if ((bp = *bpp) == NULL) 5203 return (0); 5204 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { 5205 if ((bp->b_xflags & BX_BKGRDINPROG) == 0) 5206 break; 5207 BUF_UNLOCK(bp); 5208 if (waitfor != MNT_WAIT) 5209 return (0); 5210 bp->b_xflags |= BX_BKGRDWAIT; 5211 interlocked_sleep(&lk, SLEEP, &bp->b_xflags, PRIBIO, 5212 "getbuf", 0); 5213 continue; 5214 } 5215 if (waitfor != MNT_WAIT) 5216 return (0); 5217 error = interlocked_sleep(&lk, LOCKBUF, bp, 5218 LK_EXCLUSIVE | LK_SLEEPFAIL, 0, 0); 5219 if (error != ENOLCK) { 5220 FREE_LOCK(&lk); 5221 panic("getdirtybuf: inconsistent lock"); 5222 } 5223 } 5224 if ((bp->b_flags & B_DELWRI) == 0) { 5225 BUF_UNLOCK(bp); 5226 return (0); 5227 } 5228 bremfree(bp); 5229 return (1); 5230 } 5231 5232 /* 5233 * Wait for pending output on a vnode to complete. 5234 * Must be called with vnode locked. 5235 */ 5236 static void 5237 drain_output(vp, islocked) 5238 struct vnode *vp; 5239 int islocked; 5240 { 5241 5242 if (!islocked) 5243 ACQUIRE_LOCK(&lk); 5244 while (vp->v_numoutput) { 5245 vp->v_flag |= VBWAIT; 5246 interlocked_sleep(&lk, SLEEP, (caddr_t)&vp->v_numoutput, 5247 PRIBIO + 1, "drainvp", 0); 5248 } 5249 if (!islocked) 5250 FREE_LOCK(&lk); 5251 } 5252 5253 /* 5254 * Called whenever a buffer that is being invalidated or reallocated 5255 * contains dependencies. This should only happen if an I/O error has 5256 * occurred. The routine is called with the buffer locked. 5257 */ 5258 static void 5259 softdep_deallocate_dependencies(bp) 5260 struct buf *bp; 5261 { 5262 5263 if ((bp->b_ioflags & BIO_ERROR) == 0) 5264 panic("softdep_deallocate_dependencies: dangling deps"); 5265 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); 5266 panic("softdep_deallocate_dependencies: unrecovered I/O error"); 5267 } 5268 5269 /* 5270 * Function to handle asynchronous write errors in the filesystem. 5271 */ 5272 void 5273 softdep_error(func, error) 5274 char *func; 5275 int error; 5276 { 5277 5278 /* XXX should do something better! */ 5279 printf("%s: got error %d while accessing filesystem\n", func, error); 5280 } 5281