xref: /freebsd/sys/ufs/ffs/ffs_softdep.c (revision 64de80195bba295c961a4cdf96dbe0e4979bdf2a)
1 /*-
2  * Copyright 1998, 2000 Marshall Kirk McKusick.
3  * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
4  * All rights reserved.
5  *
6  * The soft updates code is derived from the appendix of a University
7  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8  * "Soft Updates: A Solution to the Metadata Update Problem in File
9  * Systems", CSE-TR-254-95, August 1995).
10  *
11  * Further information about soft updates can be obtained from:
12  *
13  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
14  *	1614 Oxford Street		mckusick@mckusick.com
15  *	Berkeley, CA 94709-1608		+1-510-843-9542
16  *	USA
17  *
18  * Redistribution and use in source and binary forms, with or without
19  * modification, are permitted provided that the following conditions
20  * are met:
21  *
22  * 1. Redistributions of source code must retain the above copyright
23  *    notice, this list of conditions and the following disclaimer.
24  * 2. Redistributions in binary form must reproduce the above copyright
25  *    notice, this list of conditions and the following disclaimer in the
26  *    documentation and/or other materials provided with the distribution.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
29  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38  *
39  *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
40  */
41 
42 #include <sys/cdefs.h>
43 __FBSDID("$FreeBSD$");
44 
45 #include "opt_ffs.h"
46 #include "opt_quota.h"
47 #include "opt_ddb.h"
48 
49 /*
50  * For now we want the safety net that the DEBUG flag provides.
51  */
52 #ifndef DEBUG
53 #define DEBUG
54 #endif
55 
56 #include <sys/param.h>
57 #include <sys/kernel.h>
58 #include <sys/systm.h>
59 #include <sys/bio.h>
60 #include <sys/buf.h>
61 #include <sys/kdb.h>
62 #include <sys/kthread.h>
63 #include <sys/ktr.h>
64 #include <sys/limits.h>
65 #include <sys/lock.h>
66 #include <sys/malloc.h>
67 #include <sys/mount.h>
68 #include <sys/mutex.h>
69 #include <sys/namei.h>
70 #include <sys/priv.h>
71 #include <sys/proc.h>
72 #include <sys/rwlock.h>
73 #include <sys/stat.h>
74 #include <sys/sysctl.h>
75 #include <sys/syslog.h>
76 #include <sys/vnode.h>
77 #include <sys/conf.h>
78 
79 #include <ufs/ufs/dir.h>
80 #include <ufs/ufs/extattr.h>
81 #include <ufs/ufs/quota.h>
82 #include <ufs/ufs/inode.h>
83 #include <ufs/ufs/ufsmount.h>
84 #include <ufs/ffs/fs.h>
85 #include <ufs/ffs/softdep.h>
86 #include <ufs/ffs/ffs_extern.h>
87 #include <ufs/ufs/ufs_extern.h>
88 
89 #include <vm/vm.h>
90 #include <vm/vm_extern.h>
91 #include <vm/vm_object.h>
92 
93 #include <geom/geom.h>
94 
95 #include <ddb/ddb.h>
96 
97 #define	KTR_SUJ	0	/* Define to KTR_SPARE. */
98 
99 #ifndef SOFTUPDATES
100 
101 int
102 softdep_flushfiles(oldmnt, flags, td)
103 	struct mount *oldmnt;
104 	int flags;
105 	struct thread *td;
106 {
107 
108 	panic("softdep_flushfiles called");
109 }
110 
111 int
112 softdep_mount(devvp, mp, fs, cred)
113 	struct vnode *devvp;
114 	struct mount *mp;
115 	struct fs *fs;
116 	struct ucred *cred;
117 {
118 
119 	return (0);
120 }
121 
122 void
123 softdep_initialize()
124 {
125 
126 	return;
127 }
128 
129 void
130 softdep_uninitialize()
131 {
132 
133 	return;
134 }
135 
136 void
137 softdep_unmount(mp)
138 	struct mount *mp;
139 {
140 
141 	panic("softdep_unmount called");
142 }
143 
144 void
145 softdep_setup_sbupdate(ump, fs, bp)
146 	struct ufsmount *ump;
147 	struct fs *fs;
148 	struct buf *bp;
149 {
150 
151 	panic("softdep_setup_sbupdate called");
152 }
153 
154 void
155 softdep_setup_inomapdep(bp, ip, newinum, mode)
156 	struct buf *bp;
157 	struct inode *ip;
158 	ino_t newinum;
159 	int mode;
160 {
161 
162 	panic("softdep_setup_inomapdep called");
163 }
164 
165 void
166 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
167 	struct buf *bp;
168 	struct mount *mp;
169 	ufs2_daddr_t newblkno;
170 	int frags;
171 	int oldfrags;
172 {
173 
174 	panic("softdep_setup_blkmapdep called");
175 }
176 
177 void
178 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
179 	struct inode *ip;
180 	ufs_lbn_t lbn;
181 	ufs2_daddr_t newblkno;
182 	ufs2_daddr_t oldblkno;
183 	long newsize;
184 	long oldsize;
185 	struct buf *bp;
186 {
187 
188 	panic("softdep_setup_allocdirect called");
189 }
190 
191 void
192 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
193 	struct inode *ip;
194 	ufs_lbn_t lbn;
195 	ufs2_daddr_t newblkno;
196 	ufs2_daddr_t oldblkno;
197 	long newsize;
198 	long oldsize;
199 	struct buf *bp;
200 {
201 
202 	panic("softdep_setup_allocext called");
203 }
204 
205 void
206 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
207 	struct inode *ip;
208 	ufs_lbn_t lbn;
209 	struct buf *bp;
210 	int ptrno;
211 	ufs2_daddr_t newblkno;
212 	ufs2_daddr_t oldblkno;
213 	struct buf *nbp;
214 {
215 
216 	panic("softdep_setup_allocindir_page called");
217 }
218 
219 void
220 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
221 	struct buf *nbp;
222 	struct inode *ip;
223 	struct buf *bp;
224 	int ptrno;
225 	ufs2_daddr_t newblkno;
226 {
227 
228 	panic("softdep_setup_allocindir_meta called");
229 }
230 
231 void
232 softdep_journal_freeblocks(ip, cred, length, flags)
233 	struct inode *ip;
234 	struct ucred *cred;
235 	off_t length;
236 	int flags;
237 {
238 
239 	panic("softdep_journal_freeblocks called");
240 }
241 
242 void
243 softdep_journal_fsync(ip)
244 	struct inode *ip;
245 {
246 
247 	panic("softdep_journal_fsync called");
248 }
249 
250 void
251 softdep_setup_freeblocks(ip, length, flags)
252 	struct inode *ip;
253 	off_t length;
254 	int flags;
255 {
256 
257 	panic("softdep_setup_freeblocks called");
258 }
259 
260 void
261 softdep_freefile(pvp, ino, mode)
262 		struct vnode *pvp;
263 		ino_t ino;
264 		int mode;
265 {
266 
267 	panic("softdep_freefile called");
268 }
269 
270 int
271 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
272 	struct buf *bp;
273 	struct inode *dp;
274 	off_t diroffset;
275 	ino_t newinum;
276 	struct buf *newdirbp;
277 	int isnewblk;
278 {
279 
280 	panic("softdep_setup_directory_add called");
281 }
282 
283 void
284 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
285 	struct buf *bp;
286 	struct inode *dp;
287 	caddr_t base;
288 	caddr_t oldloc;
289 	caddr_t newloc;
290 	int entrysize;
291 {
292 
293 	panic("softdep_change_directoryentry_offset called");
294 }
295 
296 void
297 softdep_setup_remove(bp, dp, ip, isrmdir)
298 	struct buf *bp;
299 	struct inode *dp;
300 	struct inode *ip;
301 	int isrmdir;
302 {
303 
304 	panic("softdep_setup_remove called");
305 }
306 
307 void
308 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
309 	struct buf *bp;
310 	struct inode *dp;
311 	struct inode *ip;
312 	ino_t newinum;
313 	int isrmdir;
314 {
315 
316 	panic("softdep_setup_directory_change called");
317 }
318 
319 void
320 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
321 	struct mount *mp;
322 	struct buf *bp;
323 	ufs2_daddr_t blkno;
324 	int frags;
325 	struct workhead *wkhd;
326 {
327 
328 	panic("%s called", __FUNCTION__);
329 }
330 
331 void
332 softdep_setup_inofree(mp, bp, ino, wkhd)
333 	struct mount *mp;
334 	struct buf *bp;
335 	ino_t ino;
336 	struct workhead *wkhd;
337 {
338 
339 	panic("%s called", __FUNCTION__);
340 }
341 
342 void
343 softdep_setup_unlink(dp, ip)
344 	struct inode *dp;
345 	struct inode *ip;
346 {
347 
348 	panic("%s called", __FUNCTION__);
349 }
350 
351 void
352 softdep_setup_link(dp, ip)
353 	struct inode *dp;
354 	struct inode *ip;
355 {
356 
357 	panic("%s called", __FUNCTION__);
358 }
359 
360 void
361 softdep_revert_link(dp, ip)
362 	struct inode *dp;
363 	struct inode *ip;
364 {
365 
366 	panic("%s called", __FUNCTION__);
367 }
368 
369 void
370 softdep_setup_rmdir(dp, ip)
371 	struct inode *dp;
372 	struct inode *ip;
373 {
374 
375 	panic("%s called", __FUNCTION__);
376 }
377 
378 void
379 softdep_revert_rmdir(dp, ip)
380 	struct inode *dp;
381 	struct inode *ip;
382 {
383 
384 	panic("%s called", __FUNCTION__);
385 }
386 
387 void
388 softdep_setup_create(dp, ip)
389 	struct inode *dp;
390 	struct inode *ip;
391 {
392 
393 	panic("%s called", __FUNCTION__);
394 }
395 
396 void
397 softdep_revert_create(dp, ip)
398 	struct inode *dp;
399 	struct inode *ip;
400 {
401 
402 	panic("%s called", __FUNCTION__);
403 }
404 
405 void
406 softdep_setup_mkdir(dp, ip)
407 	struct inode *dp;
408 	struct inode *ip;
409 {
410 
411 	panic("%s called", __FUNCTION__);
412 }
413 
414 void
415 softdep_revert_mkdir(dp, ip)
416 	struct inode *dp;
417 	struct inode *ip;
418 {
419 
420 	panic("%s called", __FUNCTION__);
421 }
422 
423 void
424 softdep_setup_dotdot_link(dp, ip)
425 	struct inode *dp;
426 	struct inode *ip;
427 {
428 
429 	panic("%s called", __FUNCTION__);
430 }
431 
432 int
433 softdep_prealloc(vp, waitok)
434 	struct vnode *vp;
435 	int waitok;
436 {
437 
438 	panic("%s called", __FUNCTION__);
439 }
440 
441 int
442 softdep_journal_lookup(mp, vpp)
443 	struct mount *mp;
444 	struct vnode **vpp;
445 {
446 
447 	return (ENOENT);
448 }
449 
450 void
451 softdep_change_linkcnt(ip)
452 	struct inode *ip;
453 {
454 
455 	panic("softdep_change_linkcnt called");
456 }
457 
458 void
459 softdep_load_inodeblock(ip)
460 	struct inode *ip;
461 {
462 
463 	panic("softdep_load_inodeblock called");
464 }
465 
466 void
467 softdep_update_inodeblock(ip, bp, waitfor)
468 	struct inode *ip;
469 	struct buf *bp;
470 	int waitfor;
471 {
472 
473 	panic("softdep_update_inodeblock called");
474 }
475 
476 int
477 softdep_fsync(vp)
478 	struct vnode *vp;	/* the "in_core" copy of the inode */
479 {
480 
481 	return (0);
482 }
483 
484 void
485 softdep_fsync_mountdev(vp)
486 	struct vnode *vp;
487 {
488 
489 	return;
490 }
491 
492 int
493 softdep_flushworklist(oldmnt, countp, td)
494 	struct mount *oldmnt;
495 	int *countp;
496 	struct thread *td;
497 {
498 
499 	*countp = 0;
500 	return (0);
501 }
502 
503 int
504 softdep_sync_metadata(struct vnode *vp)
505 {
506 
507 	panic("softdep_sync_metadata called");
508 }
509 
510 int
511 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
512 {
513 
514 	panic("softdep_sync_buf called");
515 }
516 
517 int
518 softdep_slowdown(vp)
519 	struct vnode *vp;
520 {
521 
522 	panic("softdep_slowdown called");
523 }
524 
525 int
526 softdep_request_cleanup(fs, vp, cred, resource)
527 	struct fs *fs;
528 	struct vnode *vp;
529 	struct ucred *cred;
530 	int resource;
531 {
532 
533 	return (0);
534 }
535 
536 int
537 softdep_check_suspend(struct mount *mp,
538 		      struct vnode *devvp,
539 		      int softdep_depcnt,
540 		      int softdep_accdepcnt,
541 		      int secondary_writes,
542 		      int secondary_accwrites)
543 {
544 	struct bufobj *bo;
545 	int error;
546 
547 	(void) softdep_depcnt,
548 	(void) softdep_accdepcnt;
549 
550 	bo = &devvp->v_bufobj;
551 	ASSERT_BO_WLOCKED(bo);
552 
553 	MNT_ILOCK(mp);
554 	while (mp->mnt_secondary_writes != 0) {
555 		BO_UNLOCK(bo);
556 		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
557 		    (PUSER - 1) | PDROP, "secwr", 0);
558 		BO_LOCK(bo);
559 		MNT_ILOCK(mp);
560 	}
561 
562 	/*
563 	 * Reasons for needing more work before suspend:
564 	 * - Dirty buffers on devvp.
565 	 * - Secondary writes occurred after start of vnode sync loop
566 	 */
567 	error = 0;
568 	if (bo->bo_numoutput > 0 ||
569 	    bo->bo_dirty.bv_cnt > 0 ||
570 	    secondary_writes != 0 ||
571 	    mp->mnt_secondary_writes != 0 ||
572 	    secondary_accwrites != mp->mnt_secondary_accwrites)
573 		error = EAGAIN;
574 	BO_UNLOCK(bo);
575 	return (error);
576 }
577 
578 void
579 softdep_get_depcounts(struct mount *mp,
580 		      int *softdepactivep,
581 		      int *softdepactiveaccp)
582 {
583 	(void) mp;
584 	*softdepactivep = 0;
585 	*softdepactiveaccp = 0;
586 }
587 
588 void
589 softdep_buf_append(bp, wkhd)
590 	struct buf *bp;
591 	struct workhead *wkhd;
592 {
593 
594 	panic("softdep_buf_appendwork called");
595 }
596 
597 void
598 softdep_inode_append(ip, cred, wkhd)
599 	struct inode *ip;
600 	struct ucred *cred;
601 	struct workhead *wkhd;
602 {
603 
604 	panic("softdep_inode_appendwork called");
605 }
606 
607 void
608 softdep_freework(wkhd)
609 	struct workhead *wkhd;
610 {
611 
612 	panic("softdep_freework called");
613 }
614 
615 #else
616 
617 FEATURE(softupdates, "FFS soft-updates support");
618 
619 static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
620     "soft updates stats");
621 static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
622     "total dependencies allocated");
623 static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
624     "high use dependencies allocated");
625 static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
626     "current dependencies allocated");
627 static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
628     "current dependencies written");
629 
630 unsigned long dep_current[D_LAST + 1];
631 unsigned long dep_highuse[D_LAST + 1];
632 unsigned long dep_total[D_LAST + 1];
633 unsigned long dep_write[D_LAST + 1];
634 
635 #define	SOFTDEP_TYPE(type, str, long)					\
636     static MALLOC_DEFINE(M_ ## type, #str, long);			\
637     SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
638 	&dep_total[D_ ## type], 0, "");					\
639     SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
640 	&dep_current[D_ ## type], 0, "");				\
641     SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, 	\
642 	&dep_highuse[D_ ## type], 0, "");				\
643     SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, 	\
644 	&dep_write[D_ ## type], 0, "");
645 
646 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
647 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
648 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
649     "Block or frag allocated from cyl group map");
650 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
651 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
652 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
653 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
654 SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
655 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
656 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
657 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
658 SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
659 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
660 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
661 SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
662 SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
663 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
664 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
665 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
666 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
667 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
668 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
669 SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
670 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
671 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
672 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
673 SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
674 
675 static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
676 
677 static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
678 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
679 static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
680 
681 #define M_SOFTDEP_FLAGS	(M_WAITOK)
682 
683 /*
684  * translate from workitem type to memory type
685  * MUST match the defines above, such that memtype[D_XXX] == M_XXX
686  */
687 static struct malloc_type *memtype[] = {
688 	M_PAGEDEP,
689 	M_INODEDEP,
690 	M_BMSAFEMAP,
691 	M_NEWBLK,
692 	M_ALLOCDIRECT,
693 	M_INDIRDEP,
694 	M_ALLOCINDIR,
695 	M_FREEFRAG,
696 	M_FREEBLKS,
697 	M_FREEFILE,
698 	M_DIRADD,
699 	M_MKDIR,
700 	M_DIRREM,
701 	M_NEWDIRBLK,
702 	M_FREEWORK,
703 	M_FREEDEP,
704 	M_JADDREF,
705 	M_JREMREF,
706 	M_JMVREF,
707 	M_JNEWBLK,
708 	M_JFREEBLK,
709 	M_JFREEFRAG,
710 	M_JSEG,
711 	M_JSEGDEP,
712 	M_SBDEP,
713 	M_JTRUNC,
714 	M_JFSYNC,
715 	M_SENTINEL
716 };
717 
718 #define DtoM(type) (memtype[type])
719 
720 /*
721  * Names of malloc types.
722  */
723 #define TYPENAME(type)  \
724 	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
725 /*
726  * End system adaptation definitions.
727  */
728 
729 #define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
730 #define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
731 
732 /*
733  * Internal function prototypes.
734  */
735 static	void check_clear_deps(struct mount *);
736 static	void softdep_error(char *, int);
737 static	int softdep_process_worklist(struct mount *, int);
738 static	int softdep_waitidle(struct mount *, int);
739 static	void drain_output(struct vnode *);
740 static	struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
741 static	void clear_remove(struct mount *);
742 static	void clear_inodedeps(struct mount *);
743 static	void unlinked_inodedep(struct mount *, struct inodedep *);
744 static	void clear_unlinked_inodedep(struct inodedep *);
745 static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
746 static	int flush_pagedep_deps(struct vnode *, struct mount *,
747 	    struct diraddhd *);
748 static	int free_pagedep(struct pagedep *);
749 static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
750 static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
751 static	int flush_deplist(struct allocdirectlst *, int, int *);
752 static	int sync_cgs(struct mount *, int);
753 static	int handle_written_filepage(struct pagedep *, struct buf *);
754 static	int handle_written_sbdep(struct sbdep *, struct buf *);
755 static	void initiate_write_sbdep(struct sbdep *);
756 static	void diradd_inode_written(struct diradd *, struct inodedep *);
757 static	int handle_written_indirdep(struct indirdep *, struct buf *,
758 	    struct buf**);
759 static	int handle_written_inodeblock(struct inodedep *, struct buf *);
760 static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
761 	    uint8_t *);
762 static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
763 static	void handle_written_jaddref(struct jaddref *);
764 static	void handle_written_jremref(struct jremref *);
765 static	void handle_written_jseg(struct jseg *, struct buf *);
766 static	void handle_written_jnewblk(struct jnewblk *);
767 static	void handle_written_jblkdep(struct jblkdep *);
768 static	void handle_written_jfreefrag(struct jfreefrag *);
769 static	void complete_jseg(struct jseg *);
770 static	void complete_jsegs(struct jseg *);
771 static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
772 static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
773 static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
774 static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
775 static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
776 static	void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
777 static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
778 static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
779 static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
780 static	inline void inoref_write(struct inoref *, struct jseg *,
781 	    struct jrefrec *);
782 static	void handle_allocdirect_partdone(struct allocdirect *,
783 	    struct workhead *);
784 static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
785 	    struct workhead *);
786 static	void indirdep_complete(struct indirdep *);
787 static	int indirblk_lookup(struct mount *, ufs2_daddr_t);
788 static	void indirblk_insert(struct freework *);
789 static	void indirblk_remove(struct freework *);
790 static	void handle_allocindir_partdone(struct allocindir *);
791 static	void initiate_write_filepage(struct pagedep *, struct buf *);
792 static	void initiate_write_indirdep(struct indirdep*, struct buf *);
793 static	void handle_written_mkdir(struct mkdir *, int);
794 static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
795 	    uint8_t *);
796 static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
797 static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
798 static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
799 static	void handle_workitem_freefile(struct freefile *);
800 static	int handle_workitem_remove(struct dirrem *, int);
801 static	struct dirrem *newdirrem(struct buf *, struct inode *,
802 	    struct inode *, int, struct dirrem **);
803 static	struct indirdep *indirdep_lookup(struct mount *, struct inode *,
804 	    struct buf *);
805 static	void cancel_indirdep(struct indirdep *, struct buf *,
806 	    struct freeblks *);
807 static	void free_indirdep(struct indirdep *);
808 static	void free_diradd(struct diradd *, struct workhead *);
809 static	void merge_diradd(struct inodedep *, struct diradd *);
810 static	void complete_diradd(struct diradd *);
811 static	struct diradd *diradd_lookup(struct pagedep *, int);
812 static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
813 	    struct jremref *);
814 static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
815 	    struct jremref *);
816 static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
817 	    struct jremref *, struct jremref *);
818 static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
819 	    struct jremref *);
820 static	void cancel_allocindir(struct allocindir *, struct buf *bp,
821 	    struct freeblks *, int);
822 static	int setup_trunc_indir(struct freeblks *, struct inode *,
823 	    ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
824 static	void complete_trunc_indir(struct freework *);
825 static	void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
826 	    int);
827 static	void complete_mkdir(struct mkdir *);
828 static	void free_newdirblk(struct newdirblk *);
829 static	void free_jremref(struct jremref *);
830 static	void free_jaddref(struct jaddref *);
831 static	void free_jsegdep(struct jsegdep *);
832 static	void free_jsegs(struct jblocks *);
833 static	void rele_jseg(struct jseg *);
834 static	void free_jseg(struct jseg *, struct jblocks *);
835 static	void free_jnewblk(struct jnewblk *);
836 static	void free_jblkdep(struct jblkdep *);
837 static	void free_jfreefrag(struct jfreefrag *);
838 static	void free_freedep(struct freedep *);
839 static	void journal_jremref(struct dirrem *, struct jremref *,
840 	    struct inodedep *);
841 static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
842 static	int cancel_jaddref(struct jaddref *, struct inodedep *,
843 	    struct workhead *);
844 static	void cancel_jfreefrag(struct jfreefrag *);
845 static	inline void setup_freedirect(struct freeblks *, struct inode *,
846 	    int, int);
847 static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
848 static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
849 	    ufs_lbn_t, int);
850 static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
851 static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
852 static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
853 static	ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
854 static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
855 static	void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
856 	    int, int);
857 static	void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
858 static 	int cancel_pagedep(struct pagedep *, struct freeblks *, int);
859 static	int deallocate_dependencies(struct buf *, struct freeblks *, int);
860 static	void newblk_freefrag(struct newblk*);
861 static	void free_newblk(struct newblk *);
862 static	void cancel_allocdirect(struct allocdirectlst *,
863 	    struct allocdirect *, struct freeblks *);
864 static	int check_inode_unwritten(struct inodedep *);
865 static	int free_inodedep(struct inodedep *);
866 static	void freework_freeblock(struct freework *);
867 static	void freework_enqueue(struct freework *);
868 static	int handle_workitem_freeblocks(struct freeblks *, int);
869 static	int handle_complete_freeblocks(struct freeblks *, int);
870 static	void handle_workitem_indirblk(struct freework *);
871 static	void handle_written_freework(struct freework *);
872 static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
873 static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
874 	    struct workhead *);
875 static	struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
876 	    struct inodedep *, struct allocindir *, ufs_lbn_t);
877 static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
878 	    ufs2_daddr_t, ufs_lbn_t);
879 static	void handle_workitem_freefrag(struct freefrag *);
880 static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
881 	    ufs_lbn_t);
882 static	void allocdirect_merge(struct allocdirectlst *,
883 	    struct allocdirect *, struct allocdirect *);
884 static	struct freefrag *allocindir_merge(struct allocindir *,
885 	    struct allocindir *);
886 static	int bmsafemap_find(struct bmsafemap_hashhead *, int,
887 	    struct bmsafemap **);
888 static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
889 	    int cg, struct bmsafemap *);
890 static	int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
891 	    struct newblk **);
892 static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
893 static	int inodedep_find(struct inodedep_hashhead *, ino_t,
894 	    struct inodedep **);
895 static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
896 static	int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
897 	    int, struct pagedep **);
898 static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
899 	    struct pagedep **);
900 static	void pause_timer(void *);
901 static	int request_cleanup(struct mount *, int);
902 static	int process_worklist_item(struct mount *, int, int);
903 static	void process_removes(struct vnode *);
904 static	void process_truncates(struct vnode *);
905 static	void jwork_move(struct workhead *, struct workhead *);
906 static	void jwork_insert(struct workhead *, struct jsegdep *);
907 static	void add_to_worklist(struct worklist *, int);
908 static	void wake_worklist(struct worklist *);
909 static	void wait_worklist(struct worklist *, char *);
910 static	void remove_from_worklist(struct worklist *);
911 static	void softdep_flush(void *);
912 static	void softdep_flushjournal(struct mount *);
913 static	int softdep_speedup(struct ufsmount *);
914 static	void worklist_speedup(struct mount *);
915 static	int journal_mount(struct mount *, struct fs *, struct ucred *);
916 static	void journal_unmount(struct ufsmount *);
917 static	int journal_space(struct ufsmount *, int);
918 static	void journal_suspend(struct ufsmount *);
919 static	int journal_unsuspend(struct ufsmount *ump);
920 static	void softdep_prelink(struct vnode *, struct vnode *);
921 static	void add_to_journal(struct worklist *);
922 static	void remove_from_journal(struct worklist *);
923 static	void softdep_process_journal(struct mount *, struct worklist *, int);
924 static	struct jremref *newjremref(struct dirrem *, struct inode *,
925 	    struct inode *ip, off_t, nlink_t);
926 static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
927 	    uint16_t);
928 static	inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
929 	    uint16_t);
930 static	inline struct jsegdep *inoref_jseg(struct inoref *);
931 static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
932 static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
933 	    ufs2_daddr_t, int);
934 static	void adjust_newfreework(struct freeblks *, int);
935 static	struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
936 static	void move_newblock_dep(struct jaddref *, struct inodedep *);
937 static	void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
938 static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
939 	    ufs2_daddr_t, long, ufs_lbn_t);
940 static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
941 	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
942 static	int jwait(struct worklist *, int);
943 static	struct inodedep *inodedep_lookup_ip(struct inode *);
944 static	int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
945 static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
946 static	void handle_jwork(struct workhead *);
947 static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
948 	    struct mkdir **);
949 static	struct jblocks *jblocks_create(void);
950 static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
951 static	void jblocks_free(struct jblocks *, struct mount *, int);
952 static	void jblocks_destroy(struct jblocks *);
953 static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
954 
955 /*
956  * Exported softdep operations.
957  */
958 static	void softdep_disk_io_initiation(struct buf *);
959 static	void softdep_disk_write_complete(struct buf *);
960 static	void softdep_deallocate_dependencies(struct buf *);
961 static	int softdep_count_dependencies(struct buf *bp, int);
962 
963 /*
964  * Global lock over all of soft updates.
965  */
966 static struct mtx lk;
967 MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF);
968 
969 #define ACQUIRE_GBLLOCK(lk)	mtx_lock(lk)
970 #define FREE_GBLLOCK(lk)	mtx_unlock(lk)
971 #define GBLLOCK_OWNED(lk)	mtx_assert((lk), MA_OWNED)
972 
973 /*
974  * Per-filesystem soft-updates locking.
975  */
976 #define LOCK_PTR(ump)		(&(ump)->um_softdep->sd_fslock)
977 #define TRY_ACQUIRE_LOCK(ump)	rw_try_wlock(&(ump)->um_softdep->sd_fslock)
978 #define ACQUIRE_LOCK(ump)	rw_wlock(&(ump)->um_softdep->sd_fslock)
979 #define FREE_LOCK(ump)		rw_wunlock(&(ump)->um_softdep->sd_fslock)
980 #define LOCK_OWNED(ump)		rw_assert(&(ump)->um_softdep->sd_fslock, \
981 				    RA_WLOCKED)
982 
983 #define	BUF_AREC(bp)		lockallowrecurse(&(bp)->b_lock)
984 #define	BUF_NOREC(bp)		lockdisablerecurse(&(bp)->b_lock)
985 
986 /*
987  * Worklist queue management.
988  * These routines require that the lock be held.
989  */
990 #ifndef /* NOT */ DEBUG
991 #define WORKLIST_INSERT(head, item) do {	\
992 	(item)->wk_state |= ONWORKLIST;		\
993 	LIST_INSERT_HEAD(head, item, wk_list);	\
994 } while (0)
995 #define WORKLIST_REMOVE(item) do {		\
996 	(item)->wk_state &= ~ONWORKLIST;	\
997 	LIST_REMOVE(item, wk_list);		\
998 } while (0)
999 #define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
1000 #define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
1001 
1002 #else /* DEBUG */
1003 static	void worklist_insert(struct workhead *, struct worklist *, int);
1004 static	void worklist_remove(struct worklist *, int);
1005 
1006 #define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
1007 #define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
1008 #define WORKLIST_REMOVE(item) worklist_remove(item, 1)
1009 #define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
1010 
1011 static void
1012 worklist_insert(head, item, locked)
1013 	struct workhead *head;
1014 	struct worklist *item;
1015 	int locked;
1016 {
1017 
1018 	if (locked)
1019 		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1020 	if (item->wk_state & ONWORKLIST)
1021 		panic("worklist_insert: %p %s(0x%X) already on list",
1022 		    item, TYPENAME(item->wk_type), item->wk_state);
1023 	item->wk_state |= ONWORKLIST;
1024 	LIST_INSERT_HEAD(head, item, wk_list);
1025 }
1026 
1027 static void
1028 worklist_remove(item, locked)
1029 	struct worklist *item;
1030 	int locked;
1031 {
1032 
1033 	if (locked)
1034 		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1035 	if ((item->wk_state & ONWORKLIST) == 0)
1036 		panic("worklist_remove: %p %s(0x%X) not on list",
1037 		    item, TYPENAME(item->wk_type), item->wk_state);
1038 	item->wk_state &= ~ONWORKLIST;
1039 	LIST_REMOVE(item, wk_list);
1040 }
1041 #endif /* DEBUG */
1042 
1043 /*
1044  * Merge two jsegdeps keeping only the oldest one as newer references
1045  * can't be discarded until after older references.
1046  */
1047 static inline struct jsegdep *
1048 jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1049 {
1050 	struct jsegdep *swp;
1051 
1052 	if (two == NULL)
1053 		return (one);
1054 
1055 	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1056 		swp = one;
1057 		one = two;
1058 		two = swp;
1059 	}
1060 	WORKLIST_REMOVE(&two->jd_list);
1061 	free_jsegdep(two);
1062 
1063 	return (one);
1064 }
1065 
1066 /*
1067  * If two freedeps are compatible free one to reduce list size.
1068  */
1069 static inline struct freedep *
1070 freedep_merge(struct freedep *one, struct freedep *two)
1071 {
1072 	if (two == NULL)
1073 		return (one);
1074 
1075 	if (one->fd_freework == two->fd_freework) {
1076 		WORKLIST_REMOVE(&two->fd_list);
1077 		free_freedep(two);
1078 	}
1079 	return (one);
1080 }
1081 
1082 /*
1083  * Move journal work from one list to another.  Duplicate freedeps and
1084  * jsegdeps are coalesced to keep the lists as small as possible.
1085  */
1086 static void
1087 jwork_move(dst, src)
1088 	struct workhead *dst;
1089 	struct workhead *src;
1090 {
1091 	struct freedep *freedep;
1092 	struct jsegdep *jsegdep;
1093 	struct worklist *wkn;
1094 	struct worklist *wk;
1095 
1096 	KASSERT(dst != src,
1097 	    ("jwork_move: dst == src"));
1098 	freedep = NULL;
1099 	jsegdep = NULL;
1100 	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1101 		if (wk->wk_type == D_JSEGDEP)
1102 			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1103 		else if (wk->wk_type == D_FREEDEP)
1104 			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1105 	}
1106 
1107 	while ((wk = LIST_FIRST(src)) != NULL) {
1108 		WORKLIST_REMOVE(wk);
1109 		WORKLIST_INSERT(dst, wk);
1110 		if (wk->wk_type == D_JSEGDEP) {
1111 			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1112 			continue;
1113 		}
1114 		if (wk->wk_type == D_FREEDEP)
1115 			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1116 	}
1117 }
1118 
1119 static void
1120 jwork_insert(dst, jsegdep)
1121 	struct workhead *dst;
1122 	struct jsegdep *jsegdep;
1123 {
1124 	struct jsegdep *jsegdepn;
1125 	struct worklist *wk;
1126 
1127 	LIST_FOREACH(wk, dst, wk_list)
1128 		if (wk->wk_type == D_JSEGDEP)
1129 			break;
1130 	if (wk == NULL) {
1131 		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1132 		return;
1133 	}
1134 	jsegdepn = WK_JSEGDEP(wk);
1135 	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1136 		WORKLIST_REMOVE(wk);
1137 		free_jsegdep(jsegdepn);
1138 		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1139 	} else
1140 		free_jsegdep(jsegdep);
1141 }
1142 
1143 /*
1144  * Routines for tracking and managing workitems.
1145  */
1146 static	void workitem_free(struct worklist *, int);
1147 static	void workitem_alloc(struct worklist *, int, struct mount *);
1148 static	void workitem_reassign(struct worklist *, int);
1149 
1150 #define	WORKITEM_FREE(item, type) \
1151 	workitem_free((struct worklist *)(item), (type))
1152 #define	WORKITEM_REASSIGN(item, type) \
1153 	workitem_reassign((struct worklist *)(item), (type))
1154 
1155 static void
1156 workitem_free(item, type)
1157 	struct worklist *item;
1158 	int type;
1159 {
1160 	struct ufsmount *ump;
1161 
1162 #ifdef DEBUG
1163 	if (item->wk_state & ONWORKLIST)
1164 		panic("workitem_free: %s(0x%X) still on list",
1165 		    TYPENAME(item->wk_type), item->wk_state);
1166 	if (item->wk_type != type && type != D_NEWBLK)
1167 		panic("workitem_free: type mismatch %s != %s",
1168 		    TYPENAME(item->wk_type), TYPENAME(type));
1169 #endif
1170 	if (item->wk_state & IOWAITING)
1171 		wakeup(item);
1172 	ump = VFSTOUFS(item->wk_mp);
1173 	LOCK_OWNED(ump);
1174 	KASSERT(ump->softdep_deps > 0,
1175 	    ("workitem_free: %s: softdep_deps going negative",
1176 	    ump->um_fs->fs_fsmnt));
1177 	if (--ump->softdep_deps == 0 && ump->softdep_req)
1178 		wakeup(&ump->softdep_deps);
1179 	KASSERT(dep_current[item->wk_type] > 0,
1180 	    ("workitem_free: %s: dep_current[%s] going negative",
1181 	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1182 	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1183 	    ("workitem_free: %s: softdep_curdeps[%s] going negative",
1184 	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1185 	atomic_subtract_long(&dep_current[item->wk_type], 1);
1186 	ump->softdep_curdeps[item->wk_type] -= 1;
1187 	free(item, DtoM(type));
1188 }
1189 
1190 static void
1191 workitem_alloc(item, type, mp)
1192 	struct worklist *item;
1193 	int type;
1194 	struct mount *mp;
1195 {
1196 	struct ufsmount *ump;
1197 
1198 	item->wk_type = type;
1199 	item->wk_mp = mp;
1200 	item->wk_state = 0;
1201 
1202 	ump = VFSTOUFS(mp);
1203 	ACQUIRE_GBLLOCK(&lk);
1204 	dep_current[type]++;
1205 	if (dep_current[type] > dep_highuse[type])
1206 		dep_highuse[type] = dep_current[type];
1207 	dep_total[type]++;
1208 	FREE_GBLLOCK(&lk);
1209 	ACQUIRE_LOCK(ump);
1210 	ump->softdep_curdeps[type] += 1;
1211 	ump->softdep_deps++;
1212 	ump->softdep_accdeps++;
1213 	FREE_LOCK(ump);
1214 }
1215 
1216 static void
1217 workitem_reassign(item, newtype)
1218 	struct worklist *item;
1219 	int newtype;
1220 {
1221 	struct ufsmount *ump;
1222 
1223 	ump = VFSTOUFS(item->wk_mp);
1224 	LOCK_OWNED(ump);
1225 	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1226 	    ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
1227 	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1228 	ump->softdep_curdeps[item->wk_type] -= 1;
1229 	ump->softdep_curdeps[newtype] += 1;
1230 	KASSERT(dep_current[item->wk_type] > 0,
1231 	    ("workitem_reassign: %s: dep_current[%s] going negative",
1232 	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1233 	ACQUIRE_GBLLOCK(&lk);
1234 	dep_current[newtype]++;
1235 	dep_current[item->wk_type]--;
1236 	if (dep_current[newtype] > dep_highuse[newtype])
1237 		dep_highuse[newtype] = dep_current[newtype];
1238 	dep_total[newtype]++;
1239 	FREE_GBLLOCK(&lk);
1240 	item->wk_type = newtype;
1241 }
1242 
1243 /*
1244  * Workitem queue management
1245  */
1246 static int max_softdeps;	/* maximum number of structs before slowdown */
1247 static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1248 static int proc_waiting;	/* tracks whether we have a timeout posted */
1249 static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1250 static struct callout softdep_callout;
1251 static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1252 static int req_clear_remove;	/* syncer process flush some freeblks */
1253 static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1254 
1255 /*
1256  * runtime statistics
1257  */
1258 static int stat_flush_threads;	/* number of softdep flushing threads */
1259 static int stat_worklist_push;	/* number of worklist cleanups */
1260 static int stat_blk_limit_push;	/* number of times block limit neared */
1261 static int stat_ino_limit_push;	/* number of times inode limit neared */
1262 static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1263 static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1264 static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1265 static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1266 static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1267 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1268 static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1269 static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1270 static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1271 static int stat_journal_min;	/* Times hit journal min threshold */
1272 static int stat_journal_low;	/* Times hit journal low threshold */
1273 static int stat_journal_wait;	/* Times blocked in jwait(). */
1274 static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1275 static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1276 static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1277 static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1278 static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1279 static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1280 static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1281 static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1282 static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1283 static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
1284 
1285 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1286     &max_softdeps, 0, "");
1287 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1288     &tickdelay, 0, "");
1289 SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
1290     &stat_flush_threads, 0, "");
1291 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1292     &stat_worklist_push, 0,"");
1293 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1294     &stat_blk_limit_push, 0,"");
1295 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1296     &stat_ino_limit_push, 0,"");
1297 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1298     &stat_blk_limit_hit, 0, "");
1299 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1300     &stat_ino_limit_hit, 0, "");
1301 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1302     &stat_sync_limit_hit, 0, "");
1303 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1304     &stat_indir_blk_ptrs, 0, "");
1305 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1306     &stat_inode_bitmap, 0, "");
1307 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1308     &stat_direct_blk_ptrs, 0, "");
1309 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1310     &stat_dir_entry, 0, "");
1311 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1312     &stat_jaddref, 0, "");
1313 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1314     &stat_jnewblk, 0, "");
1315 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1316     &stat_journal_low, 0, "");
1317 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1318     &stat_journal_min, 0, "");
1319 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1320     &stat_journal_wait, 0, "");
1321 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1322     &stat_jwait_filepage, 0, "");
1323 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1324     &stat_jwait_freeblks, 0, "");
1325 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1326     &stat_jwait_inode, 0, "");
1327 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1328     &stat_jwait_newblk, 0, "");
1329 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1330     &stat_cleanup_blkrequests, 0, "");
1331 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1332     &stat_cleanup_inorequests, 0, "");
1333 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1334     &stat_cleanup_high_delay, 0, "");
1335 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1336     &stat_cleanup_retries, 0, "");
1337 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1338     &stat_cleanup_failures, 0, "");
1339 SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1340     &softdep_flushcache, 0, "");
1341 SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
1342     &stat_emptyjblocks, 0, "");
1343 
1344 SYSCTL_DECL(_vfs_ffs);
1345 
1346 /* Whether to recompute the summary at mount time */
1347 static int compute_summary_at_mount = 0;
1348 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1349 	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1350 static int print_threads = 0;
1351 SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
1352     &print_threads, 0, "Notify flusher thread start/stop");
1353 
1354 /* List of all filesystems mounted with soft updates */
1355 static TAILQ_HEAD(, mount_softdeps) softdepmounts;
1356 
1357 /*
1358  * This function cleans the worklist for a filesystem.
1359  * Each filesystem running with soft dependencies gets its own
1360  * thread to run in this function. The thread is started up in
1361  * softdep_mount and shutdown in softdep_unmount. They show up
1362  * as part of the kernel "bufdaemon" process whose process
1363  * entry is available in bufdaemonproc.
1364  */
1365 static int searchfailed;
1366 extern struct proc *bufdaemonproc;
1367 static void
1368 softdep_flush(addr)
1369 	void *addr;
1370 {
1371 	struct mount *mp;
1372 	struct thread *td;
1373 	struct ufsmount *ump;
1374 
1375 	td = curthread;
1376 	td->td_pflags |= TDP_NORUNNINGBUF;
1377 	mp = (struct mount *)addr;
1378 	ump = VFSTOUFS(mp);
1379 	atomic_add_int(&stat_flush_threads, 1);
1380 	ACQUIRE_LOCK(ump);
1381 	ump->softdep_flags &= ~FLUSH_STARTING;
1382 	wakeup(&ump->softdep_flushtd);
1383 	FREE_LOCK(ump);
1384 	if (print_threads) {
1385 		if (stat_flush_threads == 1)
1386 			printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
1387 			    bufdaemonproc->p_pid);
1388 		printf("Start thread %s\n", td->td_name);
1389 	}
1390 	for (;;) {
1391 		while (softdep_process_worklist(mp, 0) > 0 ||
1392 		    (MOUNTEDSUJ(mp) &&
1393 		    VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
1394 			kthread_suspend_check();
1395 		ACQUIRE_LOCK(ump);
1396 		if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1397 			msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
1398 			    "sdflush", hz / 2);
1399 		ump->softdep_flags &= ~FLUSH_CLEANUP;
1400 		/*
1401 		 * Check to see if we are done and need to exit.
1402 		 */
1403 		if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
1404 			FREE_LOCK(ump);
1405 			continue;
1406 		}
1407 		ump->softdep_flags &= ~FLUSH_EXIT;
1408 		FREE_LOCK(ump);
1409 		wakeup(&ump->softdep_flags);
1410 		if (print_threads)
1411 			printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups);
1412 		atomic_subtract_int(&stat_flush_threads, 1);
1413 		kthread_exit();
1414 		panic("kthread_exit failed\n");
1415 	}
1416 }
1417 
1418 static void
1419 worklist_speedup(mp)
1420 	struct mount *mp;
1421 {
1422 	struct ufsmount *ump;
1423 
1424 	ump = VFSTOUFS(mp);
1425 	LOCK_OWNED(ump);
1426 	if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1427 		ump->softdep_flags |= FLUSH_CLEANUP;
1428 	wakeup(&ump->softdep_flushtd);
1429 }
1430 
1431 static int
1432 softdep_speedup(ump)
1433 	struct ufsmount *ump;
1434 {
1435 	struct ufsmount *altump;
1436 	struct mount_softdeps *sdp;
1437 
1438 	LOCK_OWNED(ump);
1439 	worklist_speedup(ump->um_mountp);
1440 	bd_speedup();
1441 	/*
1442 	 * If we have global shortages, then we need other
1443 	 * filesystems to help with the cleanup. Here we wakeup a
1444 	 * flusher thread for a filesystem that is over its fair
1445 	 * share of resources.
1446 	 */
1447 	if (req_clear_inodedeps || req_clear_remove) {
1448 		ACQUIRE_GBLLOCK(&lk);
1449 		TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
1450 			if ((altump = sdp->sd_ump) == ump)
1451 				continue;
1452 			if (((req_clear_inodedeps &&
1453 			    altump->softdep_curdeps[D_INODEDEP] >
1454 			    max_softdeps / stat_flush_threads) ||
1455 			    (req_clear_remove &&
1456 			    altump->softdep_curdeps[D_DIRREM] >
1457 			    (max_softdeps / 2) / stat_flush_threads)) &&
1458 			    TRY_ACQUIRE_LOCK(altump))
1459 				break;
1460 		}
1461 		if (sdp == NULL) {
1462 			searchfailed++;
1463 			FREE_GBLLOCK(&lk);
1464 		} else {
1465 			/*
1466 			 * Move to the end of the list so we pick a
1467 			 * different one on out next try.
1468 			 */
1469 			TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
1470 			TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
1471 			FREE_GBLLOCK(&lk);
1472 			if ((altump->softdep_flags &
1473 			    (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1474 				altump->softdep_flags |= FLUSH_CLEANUP;
1475 			altump->um_softdep->sd_cleanups++;
1476 			wakeup(&altump->softdep_flushtd);
1477 			FREE_LOCK(altump);
1478 		}
1479 	}
1480 	return (speedup_syncer());
1481 }
1482 
1483 /*
1484  * Add an item to the end of the work queue.
1485  * This routine requires that the lock be held.
1486  * This is the only routine that adds items to the list.
1487  * The following routine is the only one that removes items
1488  * and does so in order from first to last.
1489  */
1490 
1491 #define	WK_HEAD		0x0001	/* Add to HEAD. */
1492 #define	WK_NODELAY	0x0002	/* Process immediately. */
1493 
1494 static void
1495 add_to_worklist(wk, flags)
1496 	struct worklist *wk;
1497 	int flags;
1498 {
1499 	struct ufsmount *ump;
1500 
1501 	ump = VFSTOUFS(wk->wk_mp);
1502 	LOCK_OWNED(ump);
1503 	if (wk->wk_state & ONWORKLIST)
1504 		panic("add_to_worklist: %s(0x%X) already on list",
1505 		    TYPENAME(wk->wk_type), wk->wk_state);
1506 	wk->wk_state |= ONWORKLIST;
1507 	if (ump->softdep_on_worklist == 0) {
1508 		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1509 		ump->softdep_worklist_tail = wk;
1510 	} else if (flags & WK_HEAD) {
1511 		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1512 	} else {
1513 		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1514 		ump->softdep_worklist_tail = wk;
1515 	}
1516 	ump->softdep_on_worklist += 1;
1517 	if (flags & WK_NODELAY)
1518 		worklist_speedup(wk->wk_mp);
1519 }
1520 
1521 /*
1522  * Remove the item to be processed. If we are removing the last
1523  * item on the list, we need to recalculate the tail pointer.
1524  */
1525 static void
1526 remove_from_worklist(wk)
1527 	struct worklist *wk;
1528 {
1529 	struct ufsmount *ump;
1530 
1531 	ump = VFSTOUFS(wk->wk_mp);
1532 	WORKLIST_REMOVE(wk);
1533 	if (ump->softdep_worklist_tail == wk)
1534 		ump->softdep_worklist_tail =
1535 		    (struct worklist *)wk->wk_list.le_prev;
1536 	ump->softdep_on_worklist -= 1;
1537 }
1538 
1539 static void
1540 wake_worklist(wk)
1541 	struct worklist *wk;
1542 {
1543 	if (wk->wk_state & IOWAITING) {
1544 		wk->wk_state &= ~IOWAITING;
1545 		wakeup(wk);
1546 	}
1547 }
1548 
1549 static void
1550 wait_worklist(wk, wmesg)
1551 	struct worklist *wk;
1552 	char *wmesg;
1553 {
1554 	struct ufsmount *ump;
1555 
1556 	ump = VFSTOUFS(wk->wk_mp);
1557 	wk->wk_state |= IOWAITING;
1558 	msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
1559 }
1560 
1561 /*
1562  * Process that runs once per second to handle items in the background queue.
1563  *
1564  * Note that we ensure that everything is done in the order in which they
1565  * appear in the queue. The code below depends on this property to ensure
1566  * that blocks of a file are freed before the inode itself is freed. This
1567  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1568  * until all the old ones have been purged from the dependency lists.
1569  */
1570 static int
1571 softdep_process_worklist(mp, full)
1572 	struct mount *mp;
1573 	int full;
1574 {
1575 	int cnt, matchcnt;
1576 	struct ufsmount *ump;
1577 	long starttime;
1578 
1579 	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1580 	if (MOUNTEDSOFTDEP(mp) == 0)
1581 		return (0);
1582 	matchcnt = 0;
1583 	ump = VFSTOUFS(mp);
1584 	ACQUIRE_LOCK(ump);
1585 	starttime = time_second;
1586 	softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1587 	check_clear_deps(mp);
1588 	while (ump->softdep_on_worklist > 0) {
1589 		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1590 			break;
1591 		else
1592 			matchcnt += cnt;
1593 		check_clear_deps(mp);
1594 		/*
1595 		 * We do not generally want to stop for buffer space, but if
1596 		 * we are really being a buffer hog, we will stop and wait.
1597 		 */
1598 		if (should_yield()) {
1599 			FREE_LOCK(ump);
1600 			kern_yield(PRI_USER);
1601 			bwillwrite();
1602 			ACQUIRE_LOCK(ump);
1603 		}
1604 		/*
1605 		 * Never allow processing to run for more than one
1606 		 * second. This gives the syncer thread the opportunity
1607 		 * to pause if appropriate.
1608 		 */
1609 		if (!full && starttime != time_second)
1610 			break;
1611 	}
1612 	if (full == 0)
1613 		journal_unsuspend(ump);
1614 	FREE_LOCK(ump);
1615 	return (matchcnt);
1616 }
1617 
1618 /*
1619  * Process all removes associated with a vnode if we are running out of
1620  * journal space.  Any other process which attempts to flush these will
1621  * be unable as we have the vnodes locked.
1622  */
1623 static void
1624 process_removes(vp)
1625 	struct vnode *vp;
1626 {
1627 	struct inodedep *inodedep;
1628 	struct dirrem *dirrem;
1629 	struct ufsmount *ump;
1630 	struct mount *mp;
1631 	ino_t inum;
1632 
1633 	mp = vp->v_mount;
1634 	ump = VFSTOUFS(mp);
1635 	LOCK_OWNED(ump);
1636 	inum = VTOI(vp)->i_number;
1637 	for (;;) {
1638 top:
1639 		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1640 			return;
1641 		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1642 			/*
1643 			 * If another thread is trying to lock this vnode
1644 			 * it will fail but we must wait for it to do so
1645 			 * before we can proceed.
1646 			 */
1647 			if (dirrem->dm_state & INPROGRESS) {
1648 				wait_worklist(&dirrem->dm_list, "pwrwait");
1649 				goto top;
1650 			}
1651 			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1652 			    (COMPLETE | ONWORKLIST))
1653 				break;
1654 		}
1655 		if (dirrem == NULL)
1656 			return;
1657 		remove_from_worklist(&dirrem->dm_list);
1658 		FREE_LOCK(ump);
1659 		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1660 			panic("process_removes: suspended filesystem");
1661 		handle_workitem_remove(dirrem, 0);
1662 		vn_finished_secondary_write(mp);
1663 		ACQUIRE_LOCK(ump);
1664 	}
1665 }
1666 
1667 /*
1668  * Process all truncations associated with a vnode if we are running out
1669  * of journal space.  This is called when the vnode lock is already held
1670  * and no other process can clear the truncation.  This function returns
1671  * a value greater than zero if it did any work.
1672  */
1673 static void
1674 process_truncates(vp)
1675 	struct vnode *vp;
1676 {
1677 	struct inodedep *inodedep;
1678 	struct freeblks *freeblks;
1679 	struct ufsmount *ump;
1680 	struct mount *mp;
1681 	ino_t inum;
1682 	int cgwait;
1683 
1684 	mp = vp->v_mount;
1685 	ump = VFSTOUFS(mp);
1686 	LOCK_OWNED(ump);
1687 	inum = VTOI(vp)->i_number;
1688 	for (;;) {
1689 		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1690 			return;
1691 		cgwait = 0;
1692 		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1693 			/* Journal entries not yet written.  */
1694 			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1695 				jwait(&LIST_FIRST(
1696 				    &freeblks->fb_jblkdephd)->jb_list,
1697 				    MNT_WAIT);
1698 				break;
1699 			}
1700 			/* Another thread is executing this item. */
1701 			if (freeblks->fb_state & INPROGRESS) {
1702 				wait_worklist(&freeblks->fb_list, "ptrwait");
1703 				break;
1704 			}
1705 			/* Freeblks is waiting on a inode write. */
1706 			if ((freeblks->fb_state & COMPLETE) == 0) {
1707 				FREE_LOCK(ump);
1708 				ffs_update(vp, 1);
1709 				ACQUIRE_LOCK(ump);
1710 				break;
1711 			}
1712 			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1713 			    (ALLCOMPLETE | ONWORKLIST)) {
1714 				remove_from_worklist(&freeblks->fb_list);
1715 				freeblks->fb_state |= INPROGRESS;
1716 				FREE_LOCK(ump);
1717 				if (vn_start_secondary_write(NULL, &mp,
1718 				    V_NOWAIT))
1719 					panic("process_truncates: "
1720 					    "suspended filesystem");
1721 				handle_workitem_freeblocks(freeblks, 0);
1722 				vn_finished_secondary_write(mp);
1723 				ACQUIRE_LOCK(ump);
1724 				break;
1725 			}
1726 			if (freeblks->fb_cgwait)
1727 				cgwait++;
1728 		}
1729 		if (cgwait) {
1730 			FREE_LOCK(ump);
1731 			sync_cgs(mp, MNT_WAIT);
1732 			ffs_sync_snap(mp, MNT_WAIT);
1733 			ACQUIRE_LOCK(ump);
1734 			continue;
1735 		}
1736 		if (freeblks == NULL)
1737 			break;
1738 	}
1739 	return;
1740 }
1741 
1742 /*
1743  * Process one item on the worklist.
1744  */
1745 static int
1746 process_worklist_item(mp, target, flags)
1747 	struct mount *mp;
1748 	int target;
1749 	int flags;
1750 {
1751 	struct worklist sentinel;
1752 	struct worklist *wk;
1753 	struct ufsmount *ump;
1754 	int matchcnt;
1755 	int error;
1756 
1757 	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1758 	/*
1759 	 * If we are being called because of a process doing a
1760 	 * copy-on-write, then it is not safe to write as we may
1761 	 * recurse into the copy-on-write routine.
1762 	 */
1763 	if (curthread->td_pflags & TDP_COWINPROGRESS)
1764 		return (-1);
1765 	PHOLD(curproc);	/* Don't let the stack go away. */
1766 	ump = VFSTOUFS(mp);
1767 	LOCK_OWNED(ump);
1768 	matchcnt = 0;
1769 	sentinel.wk_mp = NULL;
1770 	sentinel.wk_type = D_SENTINEL;
1771 	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1772 	for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1773 	    wk = LIST_NEXT(&sentinel, wk_list)) {
1774 		if (wk->wk_type == D_SENTINEL) {
1775 			LIST_REMOVE(&sentinel, wk_list);
1776 			LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1777 			continue;
1778 		}
1779 		if (wk->wk_state & INPROGRESS)
1780 			panic("process_worklist_item: %p already in progress.",
1781 			    wk);
1782 		wk->wk_state |= INPROGRESS;
1783 		remove_from_worklist(wk);
1784 		FREE_LOCK(ump);
1785 		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1786 			panic("process_worklist_item: suspended filesystem");
1787 		switch (wk->wk_type) {
1788 		case D_DIRREM:
1789 			/* removal of a directory entry */
1790 			error = handle_workitem_remove(WK_DIRREM(wk), flags);
1791 			break;
1792 
1793 		case D_FREEBLKS:
1794 			/* releasing blocks and/or fragments from a file */
1795 			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1796 			    flags);
1797 			break;
1798 
1799 		case D_FREEFRAG:
1800 			/* releasing a fragment when replaced as a file grows */
1801 			handle_workitem_freefrag(WK_FREEFRAG(wk));
1802 			error = 0;
1803 			break;
1804 
1805 		case D_FREEFILE:
1806 			/* releasing an inode when its link count drops to 0 */
1807 			handle_workitem_freefile(WK_FREEFILE(wk));
1808 			error = 0;
1809 			break;
1810 
1811 		default:
1812 			panic("%s_process_worklist: Unknown type %s",
1813 			    "softdep", TYPENAME(wk->wk_type));
1814 			/* NOTREACHED */
1815 		}
1816 		vn_finished_secondary_write(mp);
1817 		ACQUIRE_LOCK(ump);
1818 		if (error == 0) {
1819 			if (++matchcnt == target)
1820 				break;
1821 			continue;
1822 		}
1823 		/*
1824 		 * We have to retry the worklist item later.  Wake up any
1825 		 * waiters who may be able to complete it immediately and
1826 		 * add the item back to the head so we don't try to execute
1827 		 * it again.
1828 		 */
1829 		wk->wk_state &= ~INPROGRESS;
1830 		wake_worklist(wk);
1831 		add_to_worklist(wk, WK_HEAD);
1832 	}
1833 	LIST_REMOVE(&sentinel, wk_list);
1834 	/* Sentinal could've become the tail from remove_from_worklist. */
1835 	if (ump->softdep_worklist_tail == &sentinel)
1836 		ump->softdep_worklist_tail =
1837 		    (struct worklist *)sentinel.wk_list.le_prev;
1838 	PRELE(curproc);
1839 	return (matchcnt);
1840 }
1841 
1842 /*
1843  * Move dependencies from one buffer to another.
1844  */
1845 int
1846 softdep_move_dependencies(oldbp, newbp)
1847 	struct buf *oldbp;
1848 	struct buf *newbp;
1849 {
1850 	struct worklist *wk, *wktail;
1851 	struct ufsmount *ump;
1852 	int dirty;
1853 
1854 	if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
1855 		return (0);
1856 	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
1857 	    ("softdep_move_dependencies called on non-softdep filesystem"));
1858 	dirty = 0;
1859 	wktail = NULL;
1860 	ump = VFSTOUFS(wk->wk_mp);
1861 	ACQUIRE_LOCK(ump);
1862 	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1863 		LIST_REMOVE(wk, wk_list);
1864 		if (wk->wk_type == D_BMSAFEMAP &&
1865 		    bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
1866 			dirty = 1;
1867 		if (wktail == 0)
1868 			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1869 		else
1870 			LIST_INSERT_AFTER(wktail, wk, wk_list);
1871 		wktail = wk;
1872 	}
1873 	FREE_LOCK(ump);
1874 
1875 	return (dirty);
1876 }
1877 
1878 /*
1879  * Purge the work list of all items associated with a particular mount point.
1880  */
1881 int
1882 softdep_flushworklist(oldmnt, countp, td)
1883 	struct mount *oldmnt;
1884 	int *countp;
1885 	struct thread *td;
1886 {
1887 	struct vnode *devvp;
1888 	int count, error = 0;
1889 	struct ufsmount *ump;
1890 
1891 	/*
1892 	 * Alternately flush the block device associated with the mount
1893 	 * point and process any dependencies that the flushing
1894 	 * creates. We continue until no more worklist dependencies
1895 	 * are found.
1896 	 */
1897 	*countp = 0;
1898 	ump = VFSTOUFS(oldmnt);
1899 	devvp = ump->um_devvp;
1900 	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1901 		*countp += count;
1902 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1903 		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1904 		VOP_UNLOCK(devvp, 0);
1905 		if (error)
1906 			break;
1907 	}
1908 	return (error);
1909 }
1910 
1911 static int
1912 softdep_waitidle(struct mount *mp, int flags __unused)
1913 {
1914 	struct ufsmount *ump;
1915 	int error;
1916 	int i;
1917 
1918 	ump = VFSTOUFS(mp);
1919 	ACQUIRE_LOCK(ump);
1920 	for (i = 0; i < 10 && ump->softdep_deps; i++) {
1921 		ump->softdep_req = 1;
1922 		KASSERT((flags & FORCECLOSE) == 0 ||
1923 		    ump->softdep_on_worklist == 0,
1924 		    ("softdep_waitidle: work added after flush"));
1925 		msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM, "softdeps", 1);
1926 	}
1927 	ump->softdep_req = 0;
1928 	FREE_LOCK(ump);
1929 	error = 0;
1930 	if (i == 10) {
1931 		error = EBUSY;
1932 		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1933 		    mp);
1934 	}
1935 
1936 	return (error);
1937 }
1938 
1939 /*
1940  * Flush all vnodes and worklist items associated with a specified mount point.
1941  */
1942 int
1943 softdep_flushfiles(oldmnt, flags, td)
1944 	struct mount *oldmnt;
1945 	int flags;
1946 	struct thread *td;
1947 {
1948 #ifdef QUOTA
1949 	struct ufsmount *ump;
1950 	int i;
1951 #endif
1952 	int error, early, depcount, loopcnt, retry_flush_count, retry;
1953 	int morework;
1954 
1955 	KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
1956 	    ("softdep_flushfiles called on non-softdep filesystem"));
1957 	loopcnt = 10;
1958 	retry_flush_count = 3;
1959 retry_flush:
1960 	error = 0;
1961 
1962 	/*
1963 	 * Alternately flush the vnodes associated with the mount
1964 	 * point and process any dependencies that the flushing
1965 	 * creates. In theory, this loop can happen at most twice,
1966 	 * but we give it a few extra just to be sure.
1967 	 */
1968 	for (; loopcnt > 0; loopcnt--) {
1969 		/*
1970 		 * Do another flush in case any vnodes were brought in
1971 		 * as part of the cleanup operations.
1972 		 */
1973 		early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
1974 		    MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
1975 		if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
1976 			break;
1977 		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1978 		    depcount == 0)
1979 			break;
1980 	}
1981 	/*
1982 	 * If we are unmounting then it is an error to fail. If we
1983 	 * are simply trying to downgrade to read-only, then filesystem
1984 	 * activity can keep us busy forever, so we just fail with EBUSY.
1985 	 */
1986 	if (loopcnt == 0) {
1987 		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1988 			panic("softdep_flushfiles: looping");
1989 		error = EBUSY;
1990 	}
1991 	if (!error)
1992 		error = softdep_waitidle(oldmnt, flags);
1993 	if (!error) {
1994 		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
1995 			retry = 0;
1996 			MNT_ILOCK(oldmnt);
1997 			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
1998 			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
1999 			morework = oldmnt->mnt_nvnodelistsize > 0;
2000 #ifdef QUOTA
2001 			ump = VFSTOUFS(oldmnt);
2002 			UFS_LOCK(ump);
2003 			for (i = 0; i < MAXQUOTAS; i++) {
2004 				if (ump->um_quotas[i] != NULLVP)
2005 					morework = 1;
2006 			}
2007 			UFS_UNLOCK(ump);
2008 #endif
2009 			if (morework) {
2010 				if (--retry_flush_count > 0) {
2011 					retry = 1;
2012 					loopcnt = 3;
2013 				} else
2014 					error = EBUSY;
2015 			}
2016 			MNT_IUNLOCK(oldmnt);
2017 			if (retry)
2018 				goto retry_flush;
2019 		}
2020 	}
2021 	return (error);
2022 }
2023 
2024 /*
2025  * Structure hashing.
2026  *
2027  * There are four types of structures that can be looked up:
2028  *	1) pagedep structures identified by mount point, inode number,
2029  *	   and logical block.
2030  *	2) inodedep structures identified by mount point and inode number.
2031  *	3) newblk structures identified by mount point and
2032  *	   physical block number.
2033  *	4) bmsafemap structures identified by mount point and
2034  *	   cylinder group number.
2035  *
2036  * The "pagedep" and "inodedep" dependency structures are hashed
2037  * separately from the file blocks and inodes to which they correspond.
2038  * This separation helps when the in-memory copy of an inode or
2039  * file block must be replaced. It also obviates the need to access
2040  * an inode or file page when simply updating (or de-allocating)
2041  * dependency structures. Lookup of newblk structures is needed to
2042  * find newly allocated blocks when trying to associate them with
2043  * their allocdirect or allocindir structure.
2044  *
2045  * The lookup routines optionally create and hash a new instance when
2046  * an existing entry is not found. The bmsafemap lookup routine always
2047  * allocates a new structure if an existing one is not found.
2048  */
2049 #define DEPALLOC	0x0001	/* allocate structure if lookup fails */
2050 #define NODELAY		0x0002	/* cannot do background work */
2051 
2052 /*
2053  * Structures and routines associated with pagedep caching.
2054  */
2055 #define	PAGEDEP_HASH(ump, inum, lbn) \
2056 	(&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
2057 
2058 static int
2059 pagedep_find(pagedephd, ino, lbn, pagedeppp)
2060 	struct pagedep_hashhead *pagedephd;
2061 	ino_t ino;
2062 	ufs_lbn_t lbn;
2063 	struct pagedep **pagedeppp;
2064 {
2065 	struct pagedep *pagedep;
2066 
2067 	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2068 		if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
2069 			*pagedeppp = pagedep;
2070 			return (1);
2071 		}
2072 	}
2073 	*pagedeppp = NULL;
2074 	return (0);
2075 }
2076 /*
2077  * Look up a pagedep. Return 1 if found, 0 otherwise.
2078  * If not found, allocate if DEPALLOC flag is passed.
2079  * Found or allocated entry is returned in pagedeppp.
2080  * This routine must be called with splbio interrupts blocked.
2081  */
2082 static int
2083 pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
2084 	struct mount *mp;
2085 	struct buf *bp;
2086 	ino_t ino;
2087 	ufs_lbn_t lbn;
2088 	int flags;
2089 	struct pagedep **pagedeppp;
2090 {
2091 	struct pagedep *pagedep;
2092 	struct pagedep_hashhead *pagedephd;
2093 	struct worklist *wk;
2094 	struct ufsmount *ump;
2095 	int ret;
2096 	int i;
2097 
2098 	ump = VFSTOUFS(mp);
2099 	LOCK_OWNED(ump);
2100 	if (bp) {
2101 		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2102 			if (wk->wk_type == D_PAGEDEP) {
2103 				*pagedeppp = WK_PAGEDEP(wk);
2104 				return (1);
2105 			}
2106 		}
2107 	}
2108 	pagedephd = PAGEDEP_HASH(ump, ino, lbn);
2109 	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2110 	if (ret) {
2111 		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2112 			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2113 		return (1);
2114 	}
2115 	if ((flags & DEPALLOC) == 0)
2116 		return (0);
2117 	FREE_LOCK(ump);
2118 	pagedep = malloc(sizeof(struct pagedep),
2119 	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2120 	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2121 	ACQUIRE_LOCK(ump);
2122 	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2123 	if (*pagedeppp) {
2124 		/*
2125 		 * This should never happen since we only create pagedeps
2126 		 * with the vnode lock held.  Could be an assert.
2127 		 */
2128 		WORKITEM_FREE(pagedep, D_PAGEDEP);
2129 		return (ret);
2130 	}
2131 	pagedep->pd_ino = ino;
2132 	pagedep->pd_lbn = lbn;
2133 	LIST_INIT(&pagedep->pd_dirremhd);
2134 	LIST_INIT(&pagedep->pd_pendinghd);
2135 	for (i = 0; i < DAHASHSZ; i++)
2136 		LIST_INIT(&pagedep->pd_diraddhd[i]);
2137 	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2138 	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2139 	*pagedeppp = pagedep;
2140 	return (0);
2141 }
2142 
2143 /*
2144  * Structures and routines associated with inodedep caching.
2145  */
2146 #define	INODEDEP_HASH(ump, inum) \
2147       (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
2148 
2149 static int
2150 inodedep_find(inodedephd, inum, inodedeppp)
2151 	struct inodedep_hashhead *inodedephd;
2152 	ino_t inum;
2153 	struct inodedep **inodedeppp;
2154 {
2155 	struct inodedep *inodedep;
2156 
2157 	LIST_FOREACH(inodedep, inodedephd, id_hash)
2158 		if (inum == inodedep->id_ino)
2159 			break;
2160 	if (inodedep) {
2161 		*inodedeppp = inodedep;
2162 		return (1);
2163 	}
2164 	*inodedeppp = NULL;
2165 
2166 	return (0);
2167 }
2168 /*
2169  * Look up an inodedep. Return 1 if found, 0 if not found.
2170  * If not found, allocate if DEPALLOC flag is passed.
2171  * Found or allocated entry is returned in inodedeppp.
2172  * This routine must be called with splbio interrupts blocked.
2173  */
2174 static int
2175 inodedep_lookup(mp, inum, flags, inodedeppp)
2176 	struct mount *mp;
2177 	ino_t inum;
2178 	int flags;
2179 	struct inodedep **inodedeppp;
2180 {
2181 	struct inodedep *inodedep;
2182 	struct inodedep_hashhead *inodedephd;
2183 	struct ufsmount *ump;
2184 	struct fs *fs;
2185 
2186 	ump = VFSTOUFS(mp);
2187 	LOCK_OWNED(ump);
2188 	fs = ump->um_fs;
2189 	inodedephd = INODEDEP_HASH(ump, inum);
2190 
2191 	if (inodedep_find(inodedephd, inum, inodedeppp))
2192 		return (1);
2193 	if ((flags & DEPALLOC) == 0)
2194 		return (0);
2195 	/*
2196 	 * If the system is over its limit and our filesystem is
2197 	 * responsible for more than our share of that usage and
2198 	 * we are not in a rush, request some inodedep cleanup.
2199 	 */
2200 	while (dep_current[D_INODEDEP] > max_softdeps &&
2201 	    (flags & NODELAY) == 0 &&
2202 	    ump->softdep_curdeps[D_INODEDEP] >
2203 	    max_softdeps / stat_flush_threads)
2204 		request_cleanup(mp, FLUSH_INODES);
2205 	FREE_LOCK(ump);
2206 	inodedep = malloc(sizeof(struct inodedep),
2207 		M_INODEDEP, M_SOFTDEP_FLAGS);
2208 	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2209 	ACQUIRE_LOCK(ump);
2210 	if (inodedep_find(inodedephd, inum, inodedeppp)) {
2211 		WORKITEM_FREE(inodedep, D_INODEDEP);
2212 		return (1);
2213 	}
2214 	inodedep->id_fs = fs;
2215 	inodedep->id_ino = inum;
2216 	inodedep->id_state = ALLCOMPLETE;
2217 	inodedep->id_nlinkdelta = 0;
2218 	inodedep->id_savedino1 = NULL;
2219 	inodedep->id_savedsize = -1;
2220 	inodedep->id_savedextsize = -1;
2221 	inodedep->id_savednlink = -1;
2222 	inodedep->id_bmsafemap = NULL;
2223 	inodedep->id_mkdiradd = NULL;
2224 	LIST_INIT(&inodedep->id_dirremhd);
2225 	LIST_INIT(&inodedep->id_pendinghd);
2226 	LIST_INIT(&inodedep->id_inowait);
2227 	LIST_INIT(&inodedep->id_bufwait);
2228 	TAILQ_INIT(&inodedep->id_inoreflst);
2229 	TAILQ_INIT(&inodedep->id_inoupdt);
2230 	TAILQ_INIT(&inodedep->id_newinoupdt);
2231 	TAILQ_INIT(&inodedep->id_extupdt);
2232 	TAILQ_INIT(&inodedep->id_newextupdt);
2233 	TAILQ_INIT(&inodedep->id_freeblklst);
2234 	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2235 	*inodedeppp = inodedep;
2236 	return (0);
2237 }
2238 
2239 /*
2240  * Structures and routines associated with newblk caching.
2241  */
2242 #define	NEWBLK_HASH(ump, inum) \
2243 	(&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
2244 
2245 static int
2246 newblk_find(newblkhd, newblkno, flags, newblkpp)
2247 	struct newblk_hashhead *newblkhd;
2248 	ufs2_daddr_t newblkno;
2249 	int flags;
2250 	struct newblk **newblkpp;
2251 {
2252 	struct newblk *newblk;
2253 
2254 	LIST_FOREACH(newblk, newblkhd, nb_hash) {
2255 		if (newblkno != newblk->nb_newblkno)
2256 			continue;
2257 		/*
2258 		 * If we're creating a new dependency don't match those that
2259 		 * have already been converted to allocdirects.  This is for
2260 		 * a frag extend.
2261 		 */
2262 		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2263 			continue;
2264 		break;
2265 	}
2266 	if (newblk) {
2267 		*newblkpp = newblk;
2268 		return (1);
2269 	}
2270 	*newblkpp = NULL;
2271 	return (0);
2272 }
2273 
2274 /*
2275  * Look up a newblk. Return 1 if found, 0 if not found.
2276  * If not found, allocate if DEPALLOC flag is passed.
2277  * Found or allocated entry is returned in newblkpp.
2278  */
2279 static int
2280 newblk_lookup(mp, newblkno, flags, newblkpp)
2281 	struct mount *mp;
2282 	ufs2_daddr_t newblkno;
2283 	int flags;
2284 	struct newblk **newblkpp;
2285 {
2286 	struct newblk *newblk;
2287 	struct newblk_hashhead *newblkhd;
2288 	struct ufsmount *ump;
2289 
2290 	ump = VFSTOUFS(mp);
2291 	LOCK_OWNED(ump);
2292 	newblkhd = NEWBLK_HASH(ump, newblkno);
2293 	if (newblk_find(newblkhd, newblkno, flags, newblkpp))
2294 		return (1);
2295 	if ((flags & DEPALLOC) == 0)
2296 		return (0);
2297 	FREE_LOCK(ump);
2298 	newblk = malloc(sizeof(union allblk), M_NEWBLK,
2299 	    M_SOFTDEP_FLAGS | M_ZERO);
2300 	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2301 	ACQUIRE_LOCK(ump);
2302 	if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
2303 		WORKITEM_FREE(newblk, D_NEWBLK);
2304 		return (1);
2305 	}
2306 	newblk->nb_freefrag = NULL;
2307 	LIST_INIT(&newblk->nb_indirdeps);
2308 	LIST_INIT(&newblk->nb_newdirblk);
2309 	LIST_INIT(&newblk->nb_jwork);
2310 	newblk->nb_state = ATTACHED;
2311 	newblk->nb_newblkno = newblkno;
2312 	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2313 	*newblkpp = newblk;
2314 	return (0);
2315 }
2316 
2317 /*
2318  * Structures and routines associated with freed indirect block caching.
2319  */
2320 #define	INDIR_HASH(ump, blkno) \
2321 	(&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
2322 
2323 /*
2324  * Lookup an indirect block in the indir hash table.  The freework is
2325  * removed and potentially freed.  The caller must do a blocking journal
2326  * write before writing to the blkno.
2327  */
2328 static int
2329 indirblk_lookup(mp, blkno)
2330 	struct mount *mp;
2331 	ufs2_daddr_t blkno;
2332 {
2333 	struct freework *freework;
2334 	struct indir_hashhead *wkhd;
2335 	struct ufsmount *ump;
2336 
2337 	ump = VFSTOUFS(mp);
2338 	wkhd = INDIR_HASH(ump, blkno);
2339 	TAILQ_FOREACH(freework, wkhd, fw_next) {
2340 		if (freework->fw_blkno != blkno)
2341 			continue;
2342 		indirblk_remove(freework);
2343 		return (1);
2344 	}
2345 	return (0);
2346 }
2347 
2348 /*
2349  * Insert an indirect block represented by freework into the indirblk
2350  * hash table so that it may prevent the block from being re-used prior
2351  * to the journal being written.
2352  */
2353 static void
2354 indirblk_insert(freework)
2355 	struct freework *freework;
2356 {
2357 	struct jblocks *jblocks;
2358 	struct jseg *jseg;
2359 	struct ufsmount *ump;
2360 
2361 	ump = VFSTOUFS(freework->fw_list.wk_mp);
2362 	jblocks = ump->softdep_jblocks;
2363 	jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2364 	if (jseg == NULL)
2365 		return;
2366 
2367 	LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2368 	TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
2369 	    fw_next);
2370 	freework->fw_state &= ~DEPCOMPLETE;
2371 }
2372 
2373 static void
2374 indirblk_remove(freework)
2375 	struct freework *freework;
2376 {
2377 	struct ufsmount *ump;
2378 
2379 	ump = VFSTOUFS(freework->fw_list.wk_mp);
2380 	LIST_REMOVE(freework, fw_segs);
2381 	TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
2382 	freework->fw_state |= DEPCOMPLETE;
2383 	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2384 		WORKITEM_FREE(freework, D_FREEWORK);
2385 }
2386 
2387 /*
2388  * Executed during filesystem system initialization before
2389  * mounting any filesystems.
2390  */
2391 void
2392 softdep_initialize()
2393 {
2394 
2395 	TAILQ_INIT(&softdepmounts);
2396 	max_softdeps = desiredvnodes * 4;
2397 
2398 	/* initialise bioops hack */
2399 	bioops.io_start = softdep_disk_io_initiation;
2400 	bioops.io_complete = softdep_disk_write_complete;
2401 	bioops.io_deallocate = softdep_deallocate_dependencies;
2402 	bioops.io_countdeps = softdep_count_dependencies;
2403 
2404 	/* Initialize the callout with an mtx. */
2405 	callout_init_mtx(&softdep_callout, &lk, 0);
2406 }
2407 
2408 /*
2409  * Executed after all filesystems have been unmounted during
2410  * filesystem module unload.
2411  */
2412 void
2413 softdep_uninitialize()
2414 {
2415 
2416 	/* clear bioops hack */
2417 	bioops.io_start = NULL;
2418 	bioops.io_complete = NULL;
2419 	bioops.io_deallocate = NULL;
2420 	bioops.io_countdeps = NULL;
2421 
2422 	callout_drain(&softdep_callout);
2423 }
2424 
2425 /*
2426  * Called at mount time to notify the dependency code that a
2427  * filesystem wishes to use it.
2428  */
2429 int
2430 softdep_mount(devvp, mp, fs, cred)
2431 	struct vnode *devvp;
2432 	struct mount *mp;
2433 	struct fs *fs;
2434 	struct ucred *cred;
2435 {
2436 	struct csum_total cstotal;
2437 	struct mount_softdeps *sdp;
2438 	struct ufsmount *ump;
2439 	struct cg *cgp;
2440 	struct buf *bp;
2441 	int i, error, cyl;
2442 
2443 	sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
2444 	    M_WAITOK | M_ZERO);
2445 	MNT_ILOCK(mp);
2446 	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2447 	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2448 		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2449 			MNTK_SOFTDEP | MNTK_NOASYNC;
2450 	}
2451 	ump = VFSTOUFS(mp);
2452 	ump->um_softdep = sdp;
2453 	MNT_IUNLOCK(mp);
2454 	rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock");
2455 	sdp->sd_ump = ump;
2456 	LIST_INIT(&ump->softdep_workitem_pending);
2457 	LIST_INIT(&ump->softdep_journal_pending);
2458 	TAILQ_INIT(&ump->softdep_unlinked);
2459 	LIST_INIT(&ump->softdep_dirtycg);
2460 	ump->softdep_worklist_tail = NULL;
2461 	ump->softdep_on_worklist = 0;
2462 	ump->softdep_deps = 0;
2463 	LIST_INIT(&ump->softdep_mkdirlisthd);
2464 	ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
2465 	    &ump->pagedep_hash_size);
2466 	ump->pagedep_nextclean = 0;
2467 	ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
2468 	    &ump->inodedep_hash_size);
2469 	ump->inodedep_nextclean = 0;
2470 	ump->newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK,
2471 	    &ump->newblk_hash_size);
2472 	ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
2473 	    &ump->bmsafemap_hash_size);
2474 	i = 1 << (ffs(desiredvnodes / 10) - 1);
2475 	ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
2476 	    M_FREEWORK, M_WAITOK);
2477 	ump->indir_hash_size = i - 1;
2478 	for (i = 0; i <= ump->indir_hash_size; i++)
2479 		TAILQ_INIT(&ump->indir_hashtbl[i]);
2480 	ACQUIRE_GBLLOCK(&lk);
2481 	TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
2482 	FREE_GBLLOCK(&lk);
2483 	if ((fs->fs_flags & FS_SUJ) &&
2484 	    (error = journal_mount(mp, fs, cred)) != 0) {
2485 		printf("Failed to start journal: %d\n", error);
2486 		softdep_unmount(mp);
2487 		return (error);
2488 	}
2489 	/*
2490 	 * Start our flushing thread in the bufdaemon process.
2491 	 */
2492 	ACQUIRE_LOCK(ump);
2493 	ump->softdep_flags |= FLUSH_STARTING;
2494 	FREE_LOCK(ump);
2495 	kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
2496 	    &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
2497 	    mp->mnt_stat.f_mntonname);
2498 	ACQUIRE_LOCK(ump);
2499 	while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
2500 		msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
2501 		    hz / 2);
2502 	}
2503 	FREE_LOCK(ump);
2504 	/*
2505 	 * When doing soft updates, the counters in the
2506 	 * superblock may have gotten out of sync. Recomputation
2507 	 * can take a long time and can be deferred for background
2508 	 * fsck.  However, the old behavior of scanning the cylinder
2509 	 * groups and recalculating them at mount time is available
2510 	 * by setting vfs.ffs.compute_summary_at_mount to one.
2511 	 */
2512 	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2513 		return (0);
2514 	bzero(&cstotal, sizeof cstotal);
2515 	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2516 		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2517 		    fs->fs_cgsize, cred, &bp)) != 0) {
2518 			brelse(bp);
2519 			softdep_unmount(mp);
2520 			return (error);
2521 		}
2522 		cgp = (struct cg *)bp->b_data;
2523 		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2524 		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2525 		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2526 		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2527 		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2528 		brelse(bp);
2529 	}
2530 #ifdef DEBUG
2531 	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2532 		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2533 #endif
2534 	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2535 	return (0);
2536 }
2537 
2538 void
2539 softdep_unmount(mp)
2540 	struct mount *mp;
2541 {
2542 	struct ufsmount *ump;
2543 #ifdef INVARIANTS
2544 	int i;
2545 #endif
2546 
2547 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
2548 	    ("softdep_unmount called on non-softdep filesystem"));
2549 	ump = VFSTOUFS(mp);
2550 	MNT_ILOCK(mp);
2551 	mp->mnt_flag &= ~MNT_SOFTDEP;
2552 	if (MOUNTEDSUJ(mp) == 0) {
2553 		MNT_IUNLOCK(mp);
2554 	} else {
2555 		mp->mnt_flag &= ~MNT_SUJ;
2556 		MNT_IUNLOCK(mp);
2557 		journal_unmount(ump);
2558 	}
2559 	/*
2560 	 * Shut down our flushing thread. Check for NULL is if
2561 	 * softdep_mount errors out before the thread has been created.
2562 	 */
2563 	if (ump->softdep_flushtd != NULL) {
2564 		ACQUIRE_LOCK(ump);
2565 		ump->softdep_flags |= FLUSH_EXIT;
2566 		wakeup(&ump->softdep_flushtd);
2567 		msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
2568 		    "sdwait", 0);
2569 		KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
2570 		    ("Thread shutdown failed"));
2571 	}
2572 	/*
2573 	 * Free up our resources.
2574 	 */
2575 	ACQUIRE_GBLLOCK(&lk);
2576 	TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
2577 	FREE_GBLLOCK(&lk);
2578 	rw_destroy(LOCK_PTR(ump));
2579 	hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
2580 	hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
2581 	hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
2582 	hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
2583 	    ump->bmsafemap_hash_size);
2584 	free(ump->indir_hashtbl, M_FREEWORK);
2585 #ifdef INVARIANTS
2586 	for (i = 0; i <= D_LAST; i++)
2587 		KASSERT(ump->softdep_curdeps[i] == 0,
2588 		    ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
2589 		    TYPENAME(i), ump->softdep_curdeps[i]));
2590 #endif
2591 	free(ump->um_softdep, M_MOUNTDATA);
2592 }
2593 
2594 static struct jblocks *
2595 jblocks_create(void)
2596 {
2597 	struct jblocks *jblocks;
2598 
2599 	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2600 	TAILQ_INIT(&jblocks->jb_segs);
2601 	jblocks->jb_avail = 10;
2602 	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2603 	    M_JBLOCKS, M_WAITOK | M_ZERO);
2604 
2605 	return (jblocks);
2606 }
2607 
2608 static ufs2_daddr_t
2609 jblocks_alloc(jblocks, bytes, actual)
2610 	struct jblocks *jblocks;
2611 	int bytes;
2612 	int *actual;
2613 {
2614 	ufs2_daddr_t daddr;
2615 	struct jextent *jext;
2616 	int freecnt;
2617 	int blocks;
2618 
2619 	blocks = bytes / DEV_BSIZE;
2620 	jext = &jblocks->jb_extent[jblocks->jb_head];
2621 	freecnt = jext->je_blocks - jblocks->jb_off;
2622 	if (freecnt == 0) {
2623 		jblocks->jb_off = 0;
2624 		if (++jblocks->jb_head > jblocks->jb_used)
2625 			jblocks->jb_head = 0;
2626 		jext = &jblocks->jb_extent[jblocks->jb_head];
2627 		freecnt = jext->je_blocks;
2628 	}
2629 	if (freecnt > blocks)
2630 		freecnt = blocks;
2631 	*actual = freecnt * DEV_BSIZE;
2632 	daddr = jext->je_daddr + jblocks->jb_off;
2633 	jblocks->jb_off += freecnt;
2634 	jblocks->jb_free -= freecnt;
2635 
2636 	return (daddr);
2637 }
2638 
2639 static void
2640 jblocks_free(jblocks, mp, bytes)
2641 	struct jblocks *jblocks;
2642 	struct mount *mp;
2643 	int bytes;
2644 {
2645 
2646 	LOCK_OWNED(VFSTOUFS(mp));
2647 	jblocks->jb_free += bytes / DEV_BSIZE;
2648 	if (jblocks->jb_suspended)
2649 		worklist_speedup(mp);
2650 	wakeup(jblocks);
2651 }
2652 
2653 static void
2654 jblocks_destroy(jblocks)
2655 	struct jblocks *jblocks;
2656 {
2657 
2658 	if (jblocks->jb_extent)
2659 		free(jblocks->jb_extent, M_JBLOCKS);
2660 	free(jblocks, M_JBLOCKS);
2661 }
2662 
2663 static void
2664 jblocks_add(jblocks, daddr, blocks)
2665 	struct jblocks *jblocks;
2666 	ufs2_daddr_t daddr;
2667 	int blocks;
2668 {
2669 	struct jextent *jext;
2670 
2671 	jblocks->jb_blocks += blocks;
2672 	jblocks->jb_free += blocks;
2673 	jext = &jblocks->jb_extent[jblocks->jb_used];
2674 	/* Adding the first block. */
2675 	if (jext->je_daddr == 0) {
2676 		jext->je_daddr = daddr;
2677 		jext->je_blocks = blocks;
2678 		return;
2679 	}
2680 	/* Extending the last extent. */
2681 	if (jext->je_daddr + jext->je_blocks == daddr) {
2682 		jext->je_blocks += blocks;
2683 		return;
2684 	}
2685 	/* Adding a new extent. */
2686 	if (++jblocks->jb_used == jblocks->jb_avail) {
2687 		jblocks->jb_avail *= 2;
2688 		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2689 		    M_JBLOCKS, M_WAITOK | M_ZERO);
2690 		memcpy(jext, jblocks->jb_extent,
2691 		    sizeof(struct jextent) * jblocks->jb_used);
2692 		free(jblocks->jb_extent, M_JBLOCKS);
2693 		jblocks->jb_extent = jext;
2694 	}
2695 	jext = &jblocks->jb_extent[jblocks->jb_used];
2696 	jext->je_daddr = daddr;
2697 	jext->je_blocks = blocks;
2698 	return;
2699 }
2700 
2701 int
2702 softdep_journal_lookup(mp, vpp)
2703 	struct mount *mp;
2704 	struct vnode **vpp;
2705 {
2706 	struct componentname cnp;
2707 	struct vnode *dvp;
2708 	ino_t sujournal;
2709 	int error;
2710 
2711 	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2712 	if (error)
2713 		return (error);
2714 	bzero(&cnp, sizeof(cnp));
2715 	cnp.cn_nameiop = LOOKUP;
2716 	cnp.cn_flags = ISLASTCN;
2717 	cnp.cn_thread = curthread;
2718 	cnp.cn_cred = curthread->td_ucred;
2719 	cnp.cn_pnbuf = SUJ_FILE;
2720 	cnp.cn_nameptr = SUJ_FILE;
2721 	cnp.cn_namelen = strlen(SUJ_FILE);
2722 	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2723 	vput(dvp);
2724 	if (error != 0)
2725 		return (error);
2726 	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2727 	return (error);
2728 }
2729 
2730 /*
2731  * Open and verify the journal file.
2732  */
2733 static int
2734 journal_mount(mp, fs, cred)
2735 	struct mount *mp;
2736 	struct fs *fs;
2737 	struct ucred *cred;
2738 {
2739 	struct jblocks *jblocks;
2740 	struct ufsmount *ump;
2741 	struct vnode *vp;
2742 	struct inode *ip;
2743 	ufs2_daddr_t blkno;
2744 	int bcount;
2745 	int error;
2746 	int i;
2747 
2748 	ump = VFSTOUFS(mp);
2749 	ump->softdep_journal_tail = NULL;
2750 	ump->softdep_on_journal = 0;
2751 	ump->softdep_accdeps = 0;
2752 	ump->softdep_req = 0;
2753 	ump->softdep_jblocks = NULL;
2754 	error = softdep_journal_lookup(mp, &vp);
2755 	if (error != 0) {
2756 		printf("Failed to find journal.  Use tunefs to create one\n");
2757 		return (error);
2758 	}
2759 	ip = VTOI(vp);
2760 	if (ip->i_size < SUJ_MIN) {
2761 		error = ENOSPC;
2762 		goto out;
2763 	}
2764 	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2765 	jblocks = jblocks_create();
2766 	for (i = 0; i < bcount; i++) {
2767 		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2768 		if (error)
2769 			break;
2770 		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2771 	}
2772 	if (error) {
2773 		jblocks_destroy(jblocks);
2774 		goto out;
2775 	}
2776 	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2777 	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2778 	ump->softdep_jblocks = jblocks;
2779 out:
2780 	if (error == 0) {
2781 		MNT_ILOCK(mp);
2782 		mp->mnt_flag |= MNT_SUJ;
2783 		mp->mnt_flag &= ~MNT_SOFTDEP;
2784 		MNT_IUNLOCK(mp);
2785 		/*
2786 		 * Only validate the journal contents if the
2787 		 * filesystem is clean, otherwise we write the logs
2788 		 * but they'll never be used.  If the filesystem was
2789 		 * still dirty when we mounted it the journal is
2790 		 * invalid and a new journal can only be valid if it
2791 		 * starts from a clean mount.
2792 		 */
2793 		if (fs->fs_clean) {
2794 			DIP_SET(ip, i_modrev, fs->fs_mtime);
2795 			ip->i_flags |= IN_MODIFIED;
2796 			ffs_update(vp, 1);
2797 		}
2798 	}
2799 	vput(vp);
2800 	return (error);
2801 }
2802 
2803 static void
2804 journal_unmount(ump)
2805 	struct ufsmount *ump;
2806 {
2807 
2808 	if (ump->softdep_jblocks)
2809 		jblocks_destroy(ump->softdep_jblocks);
2810 	ump->softdep_jblocks = NULL;
2811 }
2812 
2813 /*
2814  * Called when a journal record is ready to be written.  Space is allocated
2815  * and the journal entry is created when the journal is flushed to stable
2816  * store.
2817  */
2818 static void
2819 add_to_journal(wk)
2820 	struct worklist *wk;
2821 {
2822 	struct ufsmount *ump;
2823 
2824 	ump = VFSTOUFS(wk->wk_mp);
2825 	LOCK_OWNED(ump);
2826 	if (wk->wk_state & ONWORKLIST)
2827 		panic("add_to_journal: %s(0x%X) already on list",
2828 		    TYPENAME(wk->wk_type), wk->wk_state);
2829 	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2830 	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2831 		ump->softdep_jblocks->jb_age = ticks;
2832 		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2833 	} else
2834 		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2835 	ump->softdep_journal_tail = wk;
2836 	ump->softdep_on_journal += 1;
2837 }
2838 
2839 /*
2840  * Remove an arbitrary item for the journal worklist maintain the tail
2841  * pointer.  This happens when a new operation obviates the need to
2842  * journal an old operation.
2843  */
2844 static void
2845 remove_from_journal(wk)
2846 	struct worklist *wk;
2847 {
2848 	struct ufsmount *ump;
2849 
2850 	ump = VFSTOUFS(wk->wk_mp);
2851 	LOCK_OWNED(ump);
2852 #ifdef SUJ_DEBUG
2853 	{
2854 		struct worklist *wkn;
2855 
2856 		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2857 			if (wkn == wk)
2858 				break;
2859 		if (wkn == NULL)
2860 			panic("remove_from_journal: %p is not in journal", wk);
2861 	}
2862 #endif
2863 	/*
2864 	 * We emulate a TAILQ to save space in most structures which do not
2865 	 * require TAILQ semantics.  Here we must update the tail position
2866 	 * when removing the tail which is not the final entry. This works
2867 	 * only if the worklist linkage are at the beginning of the structure.
2868 	 */
2869 	if (ump->softdep_journal_tail == wk)
2870 		ump->softdep_journal_tail =
2871 		    (struct worklist *)wk->wk_list.le_prev;
2872 
2873 	WORKLIST_REMOVE(wk);
2874 	ump->softdep_on_journal -= 1;
2875 }
2876 
2877 /*
2878  * Check for journal space as well as dependency limits so the prelink
2879  * code can throttle both journaled and non-journaled filesystems.
2880  * Threshold is 0 for low and 1 for min.
2881  */
2882 static int
2883 journal_space(ump, thresh)
2884 	struct ufsmount *ump;
2885 	int thresh;
2886 {
2887 	struct jblocks *jblocks;
2888 	int limit, avail;
2889 
2890 	jblocks = ump->softdep_jblocks;
2891 	if (jblocks == NULL)
2892 		return (1);
2893 	/*
2894 	 * We use a tighter restriction here to prevent request_cleanup()
2895 	 * running in threads from running into locks we currently hold.
2896 	 * We have to be over the limit and our filesystem has to be
2897 	 * responsible for more than our share of that usage.
2898 	 */
2899 	limit = (max_softdeps / 10) * 9;
2900 	if (dep_current[D_INODEDEP] > limit &&
2901 	    ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
2902 		return (0);
2903 	if (thresh)
2904 		thresh = jblocks->jb_min;
2905 	else
2906 		thresh = jblocks->jb_low;
2907 	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2908 	avail = jblocks->jb_free - avail;
2909 
2910 	return (avail > thresh);
2911 }
2912 
2913 static void
2914 journal_suspend(ump)
2915 	struct ufsmount *ump;
2916 {
2917 	struct jblocks *jblocks;
2918 	struct mount *mp;
2919 
2920 	mp = UFSTOVFS(ump);
2921 	jblocks = ump->softdep_jblocks;
2922 	MNT_ILOCK(mp);
2923 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2924 		stat_journal_min++;
2925 		mp->mnt_kern_flag |= MNTK_SUSPEND;
2926 		mp->mnt_susp_owner = ump->softdep_flushtd;
2927 	}
2928 	jblocks->jb_suspended = 1;
2929 	MNT_IUNLOCK(mp);
2930 }
2931 
2932 static int
2933 journal_unsuspend(struct ufsmount *ump)
2934 {
2935 	struct jblocks *jblocks;
2936 	struct mount *mp;
2937 
2938 	mp = UFSTOVFS(ump);
2939 	jblocks = ump->softdep_jblocks;
2940 
2941 	if (jblocks != NULL && jblocks->jb_suspended &&
2942 	    journal_space(ump, jblocks->jb_min)) {
2943 		jblocks->jb_suspended = 0;
2944 		FREE_LOCK(ump);
2945 		mp->mnt_susp_owner = curthread;
2946 		vfs_write_resume(mp, 0);
2947 		ACQUIRE_LOCK(ump);
2948 		return (1);
2949 	}
2950 	return (0);
2951 }
2952 
2953 /*
2954  * Called before any allocation function to be certain that there is
2955  * sufficient space in the journal prior to creating any new records.
2956  * Since in the case of block allocation we may have multiple locked
2957  * buffers at the time of the actual allocation we can not block
2958  * when the journal records are created.  Doing so would create a deadlock
2959  * if any of these buffers needed to be flushed to reclaim space.  Instead
2960  * we require a sufficiently large amount of available space such that
2961  * each thread in the system could have passed this allocation check and
2962  * still have sufficient free space.  With 20% of a minimum journal size
2963  * of 1MB we have 6553 records available.
2964  */
2965 int
2966 softdep_prealloc(vp, waitok)
2967 	struct vnode *vp;
2968 	int waitok;
2969 {
2970 	struct ufsmount *ump;
2971 
2972 	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
2973 	    ("softdep_prealloc called on non-softdep filesystem"));
2974 	/*
2975 	 * Nothing to do if we are not running journaled soft updates.
2976 	 * If we currently hold the snapshot lock, we must avoid handling
2977 	 * other resources that could cause deadlock.
2978 	 */
2979 	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)))
2980 		return (0);
2981 	ump = VFSTOUFS(vp->v_mount);
2982 	ACQUIRE_LOCK(ump);
2983 	if (journal_space(ump, 0)) {
2984 		FREE_LOCK(ump);
2985 		return (0);
2986 	}
2987 	stat_journal_low++;
2988 	FREE_LOCK(ump);
2989 	if (waitok == MNT_NOWAIT)
2990 		return (ENOSPC);
2991 	/*
2992 	 * Attempt to sync this vnode once to flush any journal
2993 	 * work attached to it.
2994 	 */
2995 	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
2996 		ffs_syncvnode(vp, waitok, 0);
2997 	ACQUIRE_LOCK(ump);
2998 	process_removes(vp);
2999 	process_truncates(vp);
3000 	if (journal_space(ump, 0) == 0) {
3001 		softdep_speedup(ump);
3002 		if (journal_space(ump, 1) == 0)
3003 			journal_suspend(ump);
3004 	}
3005 	FREE_LOCK(ump);
3006 
3007 	return (0);
3008 }
3009 
3010 /*
3011  * Before adjusting a link count on a vnode verify that we have sufficient
3012  * journal space.  If not, process operations that depend on the currently
3013  * locked pair of vnodes to try to flush space as the syncer, buf daemon,
3014  * and softdep flush threads can not acquire these locks to reclaim space.
3015  */
3016 static void
3017 softdep_prelink(dvp, vp)
3018 	struct vnode *dvp;
3019 	struct vnode *vp;
3020 {
3021 	struct ufsmount *ump;
3022 
3023 	ump = VFSTOUFS(dvp->v_mount);
3024 	LOCK_OWNED(ump);
3025 	/*
3026 	 * Nothing to do if we have sufficient journal space.
3027 	 * If we currently hold the snapshot lock, we must avoid
3028 	 * handling other resources that could cause deadlock.
3029 	 */
3030 	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
3031 		return;
3032 	stat_journal_low++;
3033 	FREE_LOCK(ump);
3034 	if (vp)
3035 		ffs_syncvnode(vp, MNT_NOWAIT, 0);
3036 	ffs_syncvnode(dvp, MNT_WAIT, 0);
3037 	ACQUIRE_LOCK(ump);
3038 	/* Process vp before dvp as it may create .. removes. */
3039 	if (vp) {
3040 		process_removes(vp);
3041 		process_truncates(vp);
3042 	}
3043 	process_removes(dvp);
3044 	process_truncates(dvp);
3045 	softdep_speedup(ump);
3046 	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3047 	if (journal_space(ump, 0) == 0) {
3048 		softdep_speedup(ump);
3049 		if (journal_space(ump, 1) == 0)
3050 			journal_suspend(ump);
3051 	}
3052 }
3053 
3054 static void
3055 jseg_write(ump, jseg, data)
3056 	struct ufsmount *ump;
3057 	struct jseg *jseg;
3058 	uint8_t *data;
3059 {
3060 	struct jsegrec *rec;
3061 
3062 	rec = (struct jsegrec *)data;
3063 	rec->jsr_seq = jseg->js_seq;
3064 	rec->jsr_oldest = jseg->js_oldseq;
3065 	rec->jsr_cnt = jseg->js_cnt;
3066 	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
3067 	rec->jsr_crc = 0;
3068 	rec->jsr_time = ump->um_fs->fs_mtime;
3069 }
3070 
3071 static inline void
3072 inoref_write(inoref, jseg, rec)
3073 	struct inoref *inoref;
3074 	struct jseg *jseg;
3075 	struct jrefrec *rec;
3076 {
3077 
3078 	inoref->if_jsegdep->jd_seg = jseg;
3079 	rec->jr_ino = inoref->if_ino;
3080 	rec->jr_parent = inoref->if_parent;
3081 	rec->jr_nlink = inoref->if_nlink;
3082 	rec->jr_mode = inoref->if_mode;
3083 	rec->jr_diroff = inoref->if_diroff;
3084 }
3085 
3086 static void
3087 jaddref_write(jaddref, jseg, data)
3088 	struct jaddref *jaddref;
3089 	struct jseg *jseg;
3090 	uint8_t *data;
3091 {
3092 	struct jrefrec *rec;
3093 
3094 	rec = (struct jrefrec *)data;
3095 	rec->jr_op = JOP_ADDREF;
3096 	inoref_write(&jaddref->ja_ref, jseg, rec);
3097 }
3098 
3099 static void
3100 jremref_write(jremref, jseg, data)
3101 	struct jremref *jremref;
3102 	struct jseg *jseg;
3103 	uint8_t *data;
3104 {
3105 	struct jrefrec *rec;
3106 
3107 	rec = (struct jrefrec *)data;
3108 	rec->jr_op = JOP_REMREF;
3109 	inoref_write(&jremref->jr_ref, jseg, rec);
3110 }
3111 
3112 static void
3113 jmvref_write(jmvref, jseg, data)
3114 	struct jmvref *jmvref;
3115 	struct jseg *jseg;
3116 	uint8_t *data;
3117 {
3118 	struct jmvrec *rec;
3119 
3120 	rec = (struct jmvrec *)data;
3121 	rec->jm_op = JOP_MVREF;
3122 	rec->jm_ino = jmvref->jm_ino;
3123 	rec->jm_parent = jmvref->jm_parent;
3124 	rec->jm_oldoff = jmvref->jm_oldoff;
3125 	rec->jm_newoff = jmvref->jm_newoff;
3126 }
3127 
3128 static void
3129 jnewblk_write(jnewblk, jseg, data)
3130 	struct jnewblk *jnewblk;
3131 	struct jseg *jseg;
3132 	uint8_t *data;
3133 {
3134 	struct jblkrec *rec;
3135 
3136 	jnewblk->jn_jsegdep->jd_seg = jseg;
3137 	rec = (struct jblkrec *)data;
3138 	rec->jb_op = JOP_NEWBLK;
3139 	rec->jb_ino = jnewblk->jn_ino;
3140 	rec->jb_blkno = jnewblk->jn_blkno;
3141 	rec->jb_lbn = jnewblk->jn_lbn;
3142 	rec->jb_frags = jnewblk->jn_frags;
3143 	rec->jb_oldfrags = jnewblk->jn_oldfrags;
3144 }
3145 
3146 static void
3147 jfreeblk_write(jfreeblk, jseg, data)
3148 	struct jfreeblk *jfreeblk;
3149 	struct jseg *jseg;
3150 	uint8_t *data;
3151 {
3152 	struct jblkrec *rec;
3153 
3154 	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3155 	rec = (struct jblkrec *)data;
3156 	rec->jb_op = JOP_FREEBLK;
3157 	rec->jb_ino = jfreeblk->jf_ino;
3158 	rec->jb_blkno = jfreeblk->jf_blkno;
3159 	rec->jb_lbn = jfreeblk->jf_lbn;
3160 	rec->jb_frags = jfreeblk->jf_frags;
3161 	rec->jb_oldfrags = 0;
3162 }
3163 
3164 static void
3165 jfreefrag_write(jfreefrag, jseg, data)
3166 	struct jfreefrag *jfreefrag;
3167 	struct jseg *jseg;
3168 	uint8_t *data;
3169 {
3170 	struct jblkrec *rec;
3171 
3172 	jfreefrag->fr_jsegdep->jd_seg = jseg;
3173 	rec = (struct jblkrec *)data;
3174 	rec->jb_op = JOP_FREEBLK;
3175 	rec->jb_ino = jfreefrag->fr_ino;
3176 	rec->jb_blkno = jfreefrag->fr_blkno;
3177 	rec->jb_lbn = jfreefrag->fr_lbn;
3178 	rec->jb_frags = jfreefrag->fr_frags;
3179 	rec->jb_oldfrags = 0;
3180 }
3181 
3182 static void
3183 jtrunc_write(jtrunc, jseg, data)
3184 	struct jtrunc *jtrunc;
3185 	struct jseg *jseg;
3186 	uint8_t *data;
3187 {
3188 	struct jtrncrec *rec;
3189 
3190 	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3191 	rec = (struct jtrncrec *)data;
3192 	rec->jt_op = JOP_TRUNC;
3193 	rec->jt_ino = jtrunc->jt_ino;
3194 	rec->jt_size = jtrunc->jt_size;
3195 	rec->jt_extsize = jtrunc->jt_extsize;
3196 }
3197 
3198 static void
3199 jfsync_write(jfsync, jseg, data)
3200 	struct jfsync *jfsync;
3201 	struct jseg *jseg;
3202 	uint8_t *data;
3203 {
3204 	struct jtrncrec *rec;
3205 
3206 	rec = (struct jtrncrec *)data;
3207 	rec->jt_op = JOP_SYNC;
3208 	rec->jt_ino = jfsync->jfs_ino;
3209 	rec->jt_size = jfsync->jfs_size;
3210 	rec->jt_extsize = jfsync->jfs_extsize;
3211 }
3212 
3213 static void
3214 softdep_flushjournal(mp)
3215 	struct mount *mp;
3216 {
3217 	struct jblocks *jblocks;
3218 	struct ufsmount *ump;
3219 
3220 	if (MOUNTEDSUJ(mp) == 0)
3221 		return;
3222 	ump = VFSTOUFS(mp);
3223 	jblocks = ump->softdep_jblocks;
3224 	ACQUIRE_LOCK(ump);
3225 	while (ump->softdep_on_journal) {
3226 		jblocks->jb_needseg = 1;
3227 		softdep_process_journal(mp, NULL, MNT_WAIT);
3228 	}
3229 	FREE_LOCK(ump);
3230 }
3231 
3232 static void softdep_synchronize_completed(struct bio *);
3233 static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3234 
3235 static void
3236 softdep_synchronize_completed(bp)
3237         struct bio *bp;
3238 {
3239 	struct jseg *oldest;
3240 	struct jseg *jseg;
3241 	struct ufsmount *ump;
3242 
3243 	/*
3244 	 * caller1 marks the last segment written before we issued the
3245 	 * synchronize cache.
3246 	 */
3247 	jseg = bp->bio_caller1;
3248 	if (jseg == NULL) {
3249 		g_destroy_bio(bp);
3250 		return;
3251 	}
3252 	ump = VFSTOUFS(jseg->js_list.wk_mp);
3253 	ACQUIRE_LOCK(ump);
3254 	oldest = NULL;
3255 	/*
3256 	 * Mark all the journal entries waiting on the synchronize cache
3257 	 * as completed so they may continue on.
3258 	 */
3259 	while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3260 		jseg->js_state |= COMPLETE;
3261 		oldest = jseg;
3262 		jseg = TAILQ_PREV(jseg, jseglst, js_next);
3263 	}
3264 	/*
3265 	 * Restart deferred journal entry processing from the oldest
3266 	 * completed jseg.
3267 	 */
3268 	if (oldest)
3269 		complete_jsegs(oldest);
3270 
3271 	FREE_LOCK(ump);
3272 	g_destroy_bio(bp);
3273 }
3274 
3275 /*
3276  * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3277  * barriers.  The journal must be written prior to any blocks that depend
3278  * on it and the journal can not be released until the blocks have be
3279  * written.  This code handles both barriers simultaneously.
3280  */
3281 static void
3282 softdep_synchronize(bp, ump, caller1)
3283 	struct bio *bp;
3284 	struct ufsmount *ump;
3285 	void *caller1;
3286 {
3287 
3288 	bp->bio_cmd = BIO_FLUSH;
3289 	bp->bio_flags |= BIO_ORDERED;
3290 	bp->bio_data = NULL;
3291 	bp->bio_offset = ump->um_cp->provider->mediasize;
3292 	bp->bio_length = 0;
3293 	bp->bio_done = softdep_synchronize_completed;
3294 	bp->bio_caller1 = caller1;
3295 	g_io_request(bp,
3296 	    (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
3297 }
3298 
3299 /*
3300  * Flush some journal records to disk.
3301  */
3302 static void
3303 softdep_process_journal(mp, needwk, flags)
3304 	struct mount *mp;
3305 	struct worklist *needwk;
3306 	int flags;
3307 {
3308 	struct jblocks *jblocks;
3309 	struct ufsmount *ump;
3310 	struct worklist *wk;
3311 	struct jseg *jseg;
3312 	struct buf *bp;
3313 	struct bio *bio;
3314 	uint8_t *data;
3315 	struct fs *fs;
3316 	int shouldflush;
3317 	int segwritten;
3318 	int jrecmin;	/* Minimum records per block. */
3319 	int jrecmax;	/* Maximum records per block. */
3320 	int size;
3321 	int cnt;
3322 	int off;
3323 	int devbsize;
3324 
3325 	if (MOUNTEDSUJ(mp) == 0)
3326 		return;
3327 	shouldflush = softdep_flushcache;
3328 	bio = NULL;
3329 	jseg = NULL;
3330 	ump = VFSTOUFS(mp);
3331 	LOCK_OWNED(ump);
3332 	fs = ump->um_fs;
3333 	jblocks = ump->softdep_jblocks;
3334 	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3335 	/*
3336 	 * We write anywhere between a disk block and fs block.  The upper
3337 	 * bound is picked to prevent buffer cache fragmentation and limit
3338 	 * processing time per I/O.
3339 	 */
3340 	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3341 	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
3342 	segwritten = 0;
3343 	for (;;) {
3344 		cnt = ump->softdep_on_journal;
3345 		/*
3346 		 * Criteria for writing a segment:
3347 		 * 1) We have a full block.
3348 		 * 2) We're called from jwait() and haven't found the
3349 		 *    journal item yet.
3350 		 * 3) Always write if needseg is set.
3351 		 * 4) If we are called from process_worklist and have
3352 		 *    not yet written anything we write a partial block
3353 		 *    to enforce a 1 second maximum latency on journal
3354 		 *    entries.
3355 		 */
3356 		if (cnt < (jrecmax - 1) && needwk == NULL &&
3357 		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3358 			break;
3359 		cnt++;
3360 		/*
3361 		 * Verify some free journal space.  softdep_prealloc() should
3362 		 * guarantee that we don't run out so this is indicative of
3363 		 * a problem with the flow control.  Try to recover
3364 		 * gracefully in any event.
3365 		 */
3366 		while (jblocks->jb_free == 0) {
3367 			if (flags != MNT_WAIT)
3368 				break;
3369 			printf("softdep: Out of journal space!\n");
3370 			softdep_speedup(ump);
3371 			msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
3372 		}
3373 		FREE_LOCK(ump);
3374 		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3375 		workitem_alloc(&jseg->js_list, D_JSEG, mp);
3376 		LIST_INIT(&jseg->js_entries);
3377 		LIST_INIT(&jseg->js_indirs);
3378 		jseg->js_state = ATTACHED;
3379 		if (shouldflush == 0)
3380 			jseg->js_state |= COMPLETE;
3381 		else if (bio == NULL)
3382 			bio = g_alloc_bio();
3383 		jseg->js_jblocks = jblocks;
3384 		bp = geteblk(fs->fs_bsize, 0);
3385 		ACQUIRE_LOCK(ump);
3386 		/*
3387 		 * If there was a race while we were allocating the block
3388 		 * and jseg the entry we care about was likely written.
3389 		 * We bail out in both the WAIT and NOWAIT case and assume
3390 		 * the caller will loop if the entry it cares about is
3391 		 * not written.
3392 		 */
3393 		cnt = ump->softdep_on_journal;
3394 		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3395 			bp->b_flags |= B_INVAL | B_NOCACHE;
3396 			WORKITEM_FREE(jseg, D_JSEG);
3397 			FREE_LOCK(ump);
3398 			brelse(bp);
3399 			ACQUIRE_LOCK(ump);
3400 			break;
3401 		}
3402 		/*
3403 		 * Calculate the disk block size required for the available
3404 		 * records rounded to the min size.
3405 		 */
3406 		if (cnt == 0)
3407 			size = devbsize;
3408 		else if (cnt < jrecmax)
3409 			size = howmany(cnt, jrecmin) * devbsize;
3410 		else
3411 			size = fs->fs_bsize;
3412 		/*
3413 		 * Allocate a disk block for this journal data and account
3414 		 * for truncation of the requested size if enough contiguous
3415 		 * space was not available.
3416 		 */
3417 		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3418 		bp->b_lblkno = bp->b_blkno;
3419 		bp->b_offset = bp->b_blkno * DEV_BSIZE;
3420 		bp->b_bcount = size;
3421 		bp->b_flags &= ~B_INVAL;
3422 		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3423 		/*
3424 		 * Initialize our jseg with cnt records.  Assign the next
3425 		 * sequence number to it and link it in-order.
3426 		 */
3427 		cnt = MIN(cnt, (size / devbsize) * jrecmin);
3428 		jseg->js_buf = bp;
3429 		jseg->js_cnt = cnt;
3430 		jseg->js_refs = cnt + 1;	/* Self ref. */
3431 		jseg->js_size = size;
3432 		jseg->js_seq = jblocks->jb_nextseq++;
3433 		if (jblocks->jb_oldestseg == NULL)
3434 			jblocks->jb_oldestseg = jseg;
3435 		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3436 		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3437 		if (jblocks->jb_writeseg == NULL)
3438 			jblocks->jb_writeseg = jseg;
3439 		/*
3440 		 * Start filling in records from the pending list.
3441 		 */
3442 		data = bp->b_data;
3443 		off = 0;
3444 
3445 		/*
3446 		 * Always put a header on the first block.
3447 		 * XXX As with below, there might not be a chance to get
3448 		 * into the loop.  Ensure that something valid is written.
3449 		 */
3450 		jseg_write(ump, jseg, data);
3451 		off += JREC_SIZE;
3452 		data = bp->b_data + off;
3453 
3454 		/*
3455 		 * XXX Something is wrong here.  There's no work to do,
3456 		 * but we need to perform and I/O and allow it to complete
3457 		 * anyways.
3458 		 */
3459 		if (LIST_EMPTY(&ump->softdep_journal_pending))
3460 			stat_emptyjblocks++;
3461 
3462 		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3463 		    != NULL) {
3464 			if (cnt == 0)
3465 				break;
3466 			/* Place a segment header on every device block. */
3467 			if ((off % devbsize) == 0) {
3468 				jseg_write(ump, jseg, data);
3469 				off += JREC_SIZE;
3470 				data = bp->b_data + off;
3471 			}
3472 			if (wk == needwk)
3473 				needwk = NULL;
3474 			remove_from_journal(wk);
3475 			wk->wk_state |= INPROGRESS;
3476 			WORKLIST_INSERT(&jseg->js_entries, wk);
3477 			switch (wk->wk_type) {
3478 			case D_JADDREF:
3479 				jaddref_write(WK_JADDREF(wk), jseg, data);
3480 				break;
3481 			case D_JREMREF:
3482 				jremref_write(WK_JREMREF(wk), jseg, data);
3483 				break;
3484 			case D_JMVREF:
3485 				jmvref_write(WK_JMVREF(wk), jseg, data);
3486 				break;
3487 			case D_JNEWBLK:
3488 				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3489 				break;
3490 			case D_JFREEBLK:
3491 				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3492 				break;
3493 			case D_JFREEFRAG:
3494 				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3495 				break;
3496 			case D_JTRUNC:
3497 				jtrunc_write(WK_JTRUNC(wk), jseg, data);
3498 				break;
3499 			case D_JFSYNC:
3500 				jfsync_write(WK_JFSYNC(wk), jseg, data);
3501 				break;
3502 			default:
3503 				panic("process_journal: Unknown type %s",
3504 				    TYPENAME(wk->wk_type));
3505 				/* NOTREACHED */
3506 			}
3507 			off += JREC_SIZE;
3508 			data = bp->b_data + off;
3509 			cnt--;
3510 		}
3511 
3512 		/* Clear any remaining space so we don't leak kernel data */
3513 		if (size > off)
3514 			bzero(data, size - off);
3515 
3516 		/*
3517 		 * Write this one buffer and continue.
3518 		 */
3519 		segwritten = 1;
3520 		jblocks->jb_needseg = 0;
3521 		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3522 		FREE_LOCK(ump);
3523 		pbgetvp(ump->um_devvp, bp);
3524 		/*
3525 		 * We only do the blocking wait once we find the journal
3526 		 * entry we're looking for.
3527 		 */
3528 		if (needwk == NULL && flags == MNT_WAIT)
3529 			bwrite(bp);
3530 		else
3531 			bawrite(bp);
3532 		ACQUIRE_LOCK(ump);
3533 	}
3534 	/*
3535 	 * If we wrote a segment issue a synchronize cache so the journal
3536 	 * is reflected on disk before the data is written.  Since reclaiming
3537 	 * journal space also requires writing a journal record this
3538 	 * process also enforces a barrier before reclamation.
3539 	 */
3540 	if (segwritten && shouldflush) {
3541 		softdep_synchronize(bio, ump,
3542 		    TAILQ_LAST(&jblocks->jb_segs, jseglst));
3543 	} else if (bio)
3544 		g_destroy_bio(bio);
3545 	/*
3546 	 * If we've suspended the filesystem because we ran out of journal
3547 	 * space either try to sync it here to make some progress or
3548 	 * unsuspend it if we already have.
3549 	 */
3550 	if (flags == 0 && jblocks->jb_suspended) {
3551 		if (journal_unsuspend(ump))
3552 			return;
3553 		FREE_LOCK(ump);
3554 		VFS_SYNC(mp, MNT_NOWAIT);
3555 		ffs_sbupdate(ump, MNT_WAIT, 0);
3556 		ACQUIRE_LOCK(ump);
3557 	}
3558 }
3559 
3560 /*
3561  * Complete a jseg, allowing all dependencies awaiting journal writes
3562  * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3563  * structures so that the journal segment can be freed to reclaim space.
3564  */
3565 static void
3566 complete_jseg(jseg)
3567 	struct jseg *jseg;
3568 {
3569 	struct worklist *wk;
3570 	struct jmvref *jmvref;
3571 	int waiting;
3572 #ifdef INVARIANTS
3573 	int i = 0;
3574 #endif
3575 
3576 	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3577 		WORKLIST_REMOVE(wk);
3578 		waiting = wk->wk_state & IOWAITING;
3579 		wk->wk_state &= ~(INPROGRESS | IOWAITING);
3580 		wk->wk_state |= COMPLETE;
3581 		KASSERT(i++ < jseg->js_cnt,
3582 		    ("handle_written_jseg: overflow %d >= %d",
3583 		    i - 1, jseg->js_cnt));
3584 		switch (wk->wk_type) {
3585 		case D_JADDREF:
3586 			handle_written_jaddref(WK_JADDREF(wk));
3587 			break;
3588 		case D_JREMREF:
3589 			handle_written_jremref(WK_JREMREF(wk));
3590 			break;
3591 		case D_JMVREF:
3592 			rele_jseg(jseg);	/* No jsegdep. */
3593 			jmvref = WK_JMVREF(wk);
3594 			LIST_REMOVE(jmvref, jm_deps);
3595 			if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3596 				free_pagedep(jmvref->jm_pagedep);
3597 			WORKITEM_FREE(jmvref, D_JMVREF);
3598 			break;
3599 		case D_JNEWBLK:
3600 			handle_written_jnewblk(WK_JNEWBLK(wk));
3601 			break;
3602 		case D_JFREEBLK:
3603 			handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3604 			break;
3605 		case D_JTRUNC:
3606 			handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3607 			break;
3608 		case D_JFSYNC:
3609 			rele_jseg(jseg);	/* No jsegdep. */
3610 			WORKITEM_FREE(wk, D_JFSYNC);
3611 			break;
3612 		case D_JFREEFRAG:
3613 			handle_written_jfreefrag(WK_JFREEFRAG(wk));
3614 			break;
3615 		default:
3616 			panic("handle_written_jseg: Unknown type %s",
3617 			    TYPENAME(wk->wk_type));
3618 			/* NOTREACHED */
3619 		}
3620 		if (waiting)
3621 			wakeup(wk);
3622 	}
3623 	/* Release the self reference so the structure may be freed. */
3624 	rele_jseg(jseg);
3625 }
3626 
3627 /*
3628  * Determine which jsegs are ready for completion processing.  Waits for
3629  * synchronize cache to complete as well as forcing in-order completion
3630  * of journal entries.
3631  */
3632 static void
3633 complete_jsegs(jseg)
3634 	struct jseg *jseg;
3635 {
3636 	struct jblocks *jblocks;
3637 	struct jseg *jsegn;
3638 
3639 	jblocks = jseg->js_jblocks;
3640 	/*
3641 	 * Don't allow out of order completions.  If this isn't the first
3642 	 * block wait for it to write before we're done.
3643 	 */
3644 	if (jseg != jblocks->jb_writeseg)
3645 		return;
3646 	/* Iterate through available jsegs processing their entries. */
3647 	while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
3648 		jblocks->jb_oldestwrseq = jseg->js_oldseq;
3649 		jsegn = TAILQ_NEXT(jseg, js_next);
3650 		complete_jseg(jseg);
3651 		jseg = jsegn;
3652 	}
3653 	jblocks->jb_writeseg = jseg;
3654 	/*
3655 	 * Attempt to free jsegs now that oldestwrseq may have advanced.
3656 	 */
3657 	free_jsegs(jblocks);
3658 }
3659 
3660 /*
3661  * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
3662  * the final completions.
3663  */
3664 static void
3665 handle_written_jseg(jseg, bp)
3666 	struct jseg *jseg;
3667 	struct buf *bp;
3668 {
3669 
3670 	if (jseg->js_refs == 0)
3671 		panic("handle_written_jseg: No self-reference on %p", jseg);
3672 	jseg->js_state |= DEPCOMPLETE;
3673 	/*
3674 	 * We'll never need this buffer again, set flags so it will be
3675 	 * discarded.
3676 	 */
3677 	bp->b_flags |= B_INVAL | B_NOCACHE;
3678 	pbrelvp(bp);
3679 	complete_jsegs(jseg);
3680 }
3681 
3682 static inline struct jsegdep *
3683 inoref_jseg(inoref)
3684 	struct inoref *inoref;
3685 {
3686 	struct jsegdep *jsegdep;
3687 
3688 	jsegdep = inoref->if_jsegdep;
3689 	inoref->if_jsegdep = NULL;
3690 
3691 	return (jsegdep);
3692 }
3693 
3694 /*
3695  * Called once a jremref has made it to stable store.  The jremref is marked
3696  * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
3697  * for the jremref to complete will be awoken by free_jremref.
3698  */
3699 static void
3700 handle_written_jremref(jremref)
3701 	struct jremref *jremref;
3702 {
3703 	struct inodedep *inodedep;
3704 	struct jsegdep *jsegdep;
3705 	struct dirrem *dirrem;
3706 
3707 	/* Grab the jsegdep. */
3708 	jsegdep = inoref_jseg(&jremref->jr_ref);
3709 	/*
3710 	 * Remove us from the inoref list.
3711 	 */
3712 	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3713 	    0, &inodedep) == 0)
3714 		panic("handle_written_jremref: Lost inodedep");
3715 	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3716 	/*
3717 	 * Complete the dirrem.
3718 	 */
3719 	dirrem = jremref->jr_dirrem;
3720 	jremref->jr_dirrem = NULL;
3721 	LIST_REMOVE(jremref, jr_deps);
3722 	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3723 	jwork_insert(&dirrem->dm_jwork, jsegdep);
3724 	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3725 	    (dirrem->dm_state & COMPLETE) != 0)
3726 		add_to_worklist(&dirrem->dm_list, 0);
3727 	free_jremref(jremref);
3728 }
3729 
3730 /*
3731  * Called once a jaddref has made it to stable store.  The dependency is
3732  * marked complete and any dependent structures are added to the inode
3733  * bufwait list to be completed as soon as it is written.  If a bitmap write
3734  * depends on this entry we move the inode into the inodedephd of the
3735  * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3736  */
3737 static void
3738 handle_written_jaddref(jaddref)
3739 	struct jaddref *jaddref;
3740 {
3741 	struct jsegdep *jsegdep;
3742 	struct inodedep *inodedep;
3743 	struct diradd *diradd;
3744 	struct mkdir *mkdir;
3745 
3746 	/* Grab the jsegdep. */
3747 	jsegdep = inoref_jseg(&jaddref->ja_ref);
3748 	mkdir = NULL;
3749 	diradd = NULL;
3750 	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3751 	    0, &inodedep) == 0)
3752 		panic("handle_written_jaddref: Lost inodedep.");
3753 	if (jaddref->ja_diradd == NULL)
3754 		panic("handle_written_jaddref: No dependency");
3755 	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3756 		diradd = jaddref->ja_diradd;
3757 		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3758 	} else if (jaddref->ja_state & MKDIR_PARENT) {
3759 		mkdir = jaddref->ja_mkdir;
3760 		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3761 	} else if (jaddref->ja_state & MKDIR_BODY)
3762 		mkdir = jaddref->ja_mkdir;
3763 	else
3764 		panic("handle_written_jaddref: Unknown dependency %p",
3765 		    jaddref->ja_diradd);
3766 	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3767 	/*
3768 	 * Remove us from the inode list.
3769 	 */
3770 	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3771 	/*
3772 	 * The mkdir may be waiting on the jaddref to clear before freeing.
3773 	 */
3774 	if (mkdir) {
3775 		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3776 		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3777 		    TYPENAME(mkdir->md_list.wk_type)));
3778 		mkdir->md_jaddref = NULL;
3779 		diradd = mkdir->md_diradd;
3780 		mkdir->md_state |= DEPCOMPLETE;
3781 		complete_mkdir(mkdir);
3782 	}
3783 	jwork_insert(&diradd->da_jwork, jsegdep);
3784 	if (jaddref->ja_state & NEWBLOCK) {
3785 		inodedep->id_state |= ONDEPLIST;
3786 		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3787 		    inodedep, id_deps);
3788 	}
3789 	free_jaddref(jaddref);
3790 }
3791 
3792 /*
3793  * Called once a jnewblk journal is written.  The allocdirect or allocindir
3794  * is placed in the bmsafemap to await notification of a written bitmap.  If
3795  * the operation was canceled we add the segdep to the appropriate
3796  * dependency to free the journal space once the canceling operation
3797  * completes.
3798  */
3799 static void
3800 handle_written_jnewblk(jnewblk)
3801 	struct jnewblk *jnewblk;
3802 {
3803 	struct bmsafemap *bmsafemap;
3804 	struct freefrag *freefrag;
3805 	struct freework *freework;
3806 	struct jsegdep *jsegdep;
3807 	struct newblk *newblk;
3808 
3809 	/* Grab the jsegdep. */
3810 	jsegdep = jnewblk->jn_jsegdep;
3811 	jnewblk->jn_jsegdep = NULL;
3812 	if (jnewblk->jn_dep == NULL)
3813 		panic("handle_written_jnewblk: No dependency for the segdep.");
3814 	switch (jnewblk->jn_dep->wk_type) {
3815 	case D_NEWBLK:
3816 	case D_ALLOCDIRECT:
3817 	case D_ALLOCINDIR:
3818 		/*
3819 		 * Add the written block to the bmsafemap so it can
3820 		 * be notified when the bitmap is on disk.
3821 		 */
3822 		newblk = WK_NEWBLK(jnewblk->jn_dep);
3823 		newblk->nb_jnewblk = NULL;
3824 		if ((newblk->nb_state & GOINGAWAY) == 0) {
3825 			bmsafemap = newblk->nb_bmsafemap;
3826 			newblk->nb_state |= ONDEPLIST;
3827 			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3828 			    nb_deps);
3829 		}
3830 		jwork_insert(&newblk->nb_jwork, jsegdep);
3831 		break;
3832 	case D_FREEFRAG:
3833 		/*
3834 		 * A newblock being removed by a freefrag when replaced by
3835 		 * frag extension.
3836 		 */
3837 		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3838 		freefrag->ff_jdep = NULL;
3839 		jwork_insert(&freefrag->ff_jwork, jsegdep);
3840 		break;
3841 	case D_FREEWORK:
3842 		/*
3843 		 * A direct block was removed by truncate.
3844 		 */
3845 		freework = WK_FREEWORK(jnewblk->jn_dep);
3846 		freework->fw_jnewblk = NULL;
3847 		jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
3848 		break;
3849 	default:
3850 		panic("handle_written_jnewblk: Unknown type %d.",
3851 		    jnewblk->jn_dep->wk_type);
3852 	}
3853 	jnewblk->jn_dep = NULL;
3854 	free_jnewblk(jnewblk);
3855 }
3856 
3857 /*
3858  * Cancel a jfreefrag that won't be needed, probably due to colliding with
3859  * an in-flight allocation that has not yet been committed.  Divorce us
3860  * from the freefrag and mark it DEPCOMPLETE so that it may be added
3861  * to the worklist.
3862  */
3863 static void
3864 cancel_jfreefrag(jfreefrag)
3865 	struct jfreefrag *jfreefrag;
3866 {
3867 	struct freefrag *freefrag;
3868 
3869 	if (jfreefrag->fr_jsegdep) {
3870 		free_jsegdep(jfreefrag->fr_jsegdep);
3871 		jfreefrag->fr_jsegdep = NULL;
3872 	}
3873 	freefrag = jfreefrag->fr_freefrag;
3874 	jfreefrag->fr_freefrag = NULL;
3875 	free_jfreefrag(jfreefrag);
3876 	freefrag->ff_state |= DEPCOMPLETE;
3877 	CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
3878 }
3879 
3880 /*
3881  * Free a jfreefrag when the parent freefrag is rendered obsolete.
3882  */
3883 static void
3884 free_jfreefrag(jfreefrag)
3885 	struct jfreefrag *jfreefrag;
3886 {
3887 
3888 	if (jfreefrag->fr_state & INPROGRESS)
3889 		WORKLIST_REMOVE(&jfreefrag->fr_list);
3890 	else if (jfreefrag->fr_state & ONWORKLIST)
3891 		remove_from_journal(&jfreefrag->fr_list);
3892 	if (jfreefrag->fr_freefrag != NULL)
3893 		panic("free_jfreefrag:  Still attached to a freefrag.");
3894 	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3895 }
3896 
3897 /*
3898  * Called when the journal write for a jfreefrag completes.  The parent
3899  * freefrag is added to the worklist if this completes its dependencies.
3900  */
3901 static void
3902 handle_written_jfreefrag(jfreefrag)
3903 	struct jfreefrag *jfreefrag;
3904 {
3905 	struct jsegdep *jsegdep;
3906 	struct freefrag *freefrag;
3907 
3908 	/* Grab the jsegdep. */
3909 	jsegdep = jfreefrag->fr_jsegdep;
3910 	jfreefrag->fr_jsegdep = NULL;
3911 	freefrag = jfreefrag->fr_freefrag;
3912 	if (freefrag == NULL)
3913 		panic("handle_written_jfreefrag: No freefrag.");
3914 	freefrag->ff_state |= DEPCOMPLETE;
3915 	freefrag->ff_jdep = NULL;
3916 	jwork_insert(&freefrag->ff_jwork, jsegdep);
3917 	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3918 		add_to_worklist(&freefrag->ff_list, 0);
3919 	jfreefrag->fr_freefrag = NULL;
3920 	free_jfreefrag(jfreefrag);
3921 }
3922 
3923 /*
3924  * Called when the journal write for a jfreeblk completes.  The jfreeblk
3925  * is removed from the freeblks list of pending journal writes and the
3926  * jsegdep is moved to the freeblks jwork to be completed when all blocks
3927  * have been reclaimed.
3928  */
3929 static void
3930 handle_written_jblkdep(jblkdep)
3931 	struct jblkdep *jblkdep;
3932 {
3933 	struct freeblks *freeblks;
3934 	struct jsegdep *jsegdep;
3935 
3936 	/* Grab the jsegdep. */
3937 	jsegdep = jblkdep->jb_jsegdep;
3938 	jblkdep->jb_jsegdep = NULL;
3939 	freeblks = jblkdep->jb_freeblks;
3940 	LIST_REMOVE(jblkdep, jb_deps);
3941 	jwork_insert(&freeblks->fb_jwork, jsegdep);
3942 	/*
3943 	 * If the freeblks is all journaled, we can add it to the worklist.
3944 	 */
3945 	if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
3946 	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
3947 		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
3948 
3949 	free_jblkdep(jblkdep);
3950 }
3951 
3952 static struct jsegdep *
3953 newjsegdep(struct worklist *wk)
3954 {
3955 	struct jsegdep *jsegdep;
3956 
3957 	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3958 	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3959 	jsegdep->jd_seg = NULL;
3960 
3961 	return (jsegdep);
3962 }
3963 
3964 static struct jmvref *
3965 newjmvref(dp, ino, oldoff, newoff)
3966 	struct inode *dp;
3967 	ino_t ino;
3968 	off_t oldoff;
3969 	off_t newoff;
3970 {
3971 	struct jmvref *jmvref;
3972 
3973 	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3974 	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3975 	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3976 	jmvref->jm_parent = dp->i_number;
3977 	jmvref->jm_ino = ino;
3978 	jmvref->jm_oldoff = oldoff;
3979 	jmvref->jm_newoff = newoff;
3980 
3981 	return (jmvref);
3982 }
3983 
3984 /*
3985  * Allocate a new jremref that tracks the removal of ip from dp with the
3986  * directory entry offset of diroff.  Mark the entry as ATTACHED and
3987  * DEPCOMPLETE as we have all the information required for the journal write
3988  * and the directory has already been removed from the buffer.  The caller
3989  * is responsible for linking the jremref into the pagedep and adding it
3990  * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
3991  * a DOTDOT addition so handle_workitem_remove() can properly assign
3992  * the jsegdep when we're done.
3993  */
3994 static struct jremref *
3995 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
3996     off_t diroff, nlink_t nlink)
3997 {
3998 	struct jremref *jremref;
3999 
4000 	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
4001 	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
4002 	jremref->jr_state = ATTACHED;
4003 	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
4004 	   nlink, ip->i_mode);
4005 	jremref->jr_dirrem = dirrem;
4006 
4007 	return (jremref);
4008 }
4009 
4010 static inline void
4011 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
4012     nlink_t nlink, uint16_t mode)
4013 {
4014 
4015 	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
4016 	inoref->if_diroff = diroff;
4017 	inoref->if_ino = ino;
4018 	inoref->if_parent = parent;
4019 	inoref->if_nlink = nlink;
4020 	inoref->if_mode = mode;
4021 }
4022 
4023 /*
4024  * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
4025  * directory offset may not be known until later.  The caller is responsible
4026  * adding the entry to the journal when this information is available.  nlink
4027  * should be the link count prior to the addition and mode is only required
4028  * to have the correct FMT.
4029  */
4030 static struct jaddref *
4031 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
4032     uint16_t mode)
4033 {
4034 	struct jaddref *jaddref;
4035 
4036 	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
4037 	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
4038 	jaddref->ja_state = ATTACHED;
4039 	jaddref->ja_mkdir = NULL;
4040 	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
4041 
4042 	return (jaddref);
4043 }
4044 
4045 /*
4046  * Create a new free dependency for a freework.  The caller is responsible
4047  * for adjusting the reference count when it has the lock held.  The freedep
4048  * will track an outstanding bitmap write that will ultimately clear the
4049  * freework to continue.
4050  */
4051 static struct freedep *
4052 newfreedep(struct freework *freework)
4053 {
4054 	struct freedep *freedep;
4055 
4056 	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
4057 	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
4058 	freedep->fd_freework = freework;
4059 
4060 	return (freedep);
4061 }
4062 
4063 /*
4064  * Free a freedep structure once the buffer it is linked to is written.  If
4065  * this is the last reference to the freework schedule it for completion.
4066  */
4067 static void
4068 free_freedep(freedep)
4069 	struct freedep *freedep;
4070 {
4071 	struct freework *freework;
4072 
4073 	freework = freedep->fd_freework;
4074 	freework->fw_freeblks->fb_cgwait--;
4075 	if (--freework->fw_ref == 0)
4076 		freework_enqueue(freework);
4077 	WORKITEM_FREE(freedep, D_FREEDEP);
4078 }
4079 
4080 /*
4081  * Allocate a new freework structure that may be a level in an indirect
4082  * when parent is not NULL or a top level block when it is.  The top level
4083  * freework structures are allocated without the per-filesystem lock held
4084  * and before the freeblks is visible outside of softdep_setup_freeblocks().
4085  */
4086 static struct freework *
4087 newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
4088 	struct ufsmount *ump;
4089 	struct freeblks *freeblks;
4090 	struct freework *parent;
4091 	ufs_lbn_t lbn;
4092 	ufs2_daddr_t nb;
4093 	int frags;
4094 	int off;
4095 	int journal;
4096 {
4097 	struct freework *freework;
4098 
4099 	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
4100 	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
4101 	freework->fw_state = ATTACHED;
4102 	freework->fw_jnewblk = NULL;
4103 	freework->fw_freeblks = freeblks;
4104 	freework->fw_parent = parent;
4105 	freework->fw_lbn = lbn;
4106 	freework->fw_blkno = nb;
4107 	freework->fw_frags = frags;
4108 	freework->fw_indir = NULL;
4109 	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
4110 		? 0 : NINDIR(ump->um_fs) + 1;
4111 	freework->fw_start = freework->fw_off = off;
4112 	if (journal)
4113 		newjfreeblk(freeblks, lbn, nb, frags);
4114 	if (parent == NULL) {
4115 		ACQUIRE_LOCK(ump);
4116 		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
4117 		freeblks->fb_ref++;
4118 		FREE_LOCK(ump);
4119 	}
4120 
4121 	return (freework);
4122 }
4123 
4124 /*
4125  * Eliminate a jfreeblk for a block that does not need journaling.
4126  */
4127 static void
4128 cancel_jfreeblk(freeblks, blkno)
4129 	struct freeblks *freeblks;
4130 	ufs2_daddr_t blkno;
4131 {
4132 	struct jfreeblk *jfreeblk;
4133 	struct jblkdep *jblkdep;
4134 
4135 	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4136 		if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4137 			continue;
4138 		jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4139 		if (jfreeblk->jf_blkno == blkno)
4140 			break;
4141 	}
4142 	if (jblkdep == NULL)
4143 		return;
4144 	CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4145 	free_jsegdep(jblkdep->jb_jsegdep);
4146 	LIST_REMOVE(jblkdep, jb_deps);
4147 	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4148 }
4149 
4150 /*
4151  * Allocate a new jfreeblk to journal top level block pointer when truncating
4152  * a file.  The caller must add this to the worklist when the per-filesystem
4153  * lock is held.
4154  */
4155 static struct jfreeblk *
4156 newjfreeblk(freeblks, lbn, blkno, frags)
4157 	struct freeblks *freeblks;
4158 	ufs_lbn_t lbn;
4159 	ufs2_daddr_t blkno;
4160 	int frags;
4161 {
4162 	struct jfreeblk *jfreeblk;
4163 
4164 	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4165 	workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4166 	    freeblks->fb_list.wk_mp);
4167 	jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4168 	jfreeblk->jf_dep.jb_freeblks = freeblks;
4169 	jfreeblk->jf_ino = freeblks->fb_inum;
4170 	jfreeblk->jf_lbn = lbn;
4171 	jfreeblk->jf_blkno = blkno;
4172 	jfreeblk->jf_frags = frags;
4173 	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4174 
4175 	return (jfreeblk);
4176 }
4177 
4178 /*
4179  * The journal is only prepared to handle full-size block numbers, so we
4180  * have to adjust the record to reflect the change to a full-size block.
4181  * For example, suppose we have a block made up of fragments 8-15 and
4182  * want to free its last two fragments. We are given a request that says:
4183  *     FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
4184  * where frags are the number of fragments to free and oldfrags are the
4185  * number of fragments to keep. To block align it, we have to change it to
4186  * have a valid full-size blkno, so it becomes:
4187  *     FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
4188  */
4189 static void
4190 adjust_newfreework(freeblks, frag_offset)
4191 	struct freeblks *freeblks;
4192 	int frag_offset;
4193 {
4194 	struct jfreeblk *jfreeblk;
4195 
4196 	KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
4197 	    LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
4198 	    ("adjust_newfreework: Missing freeblks dependency"));
4199 
4200 	jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
4201 	jfreeblk->jf_blkno -= frag_offset;
4202 	jfreeblk->jf_frags += frag_offset;
4203 }
4204 
4205 /*
4206  * Allocate a new jtrunc to track a partial truncation.
4207  */
4208 static struct jtrunc *
4209 newjtrunc(freeblks, size, extsize)
4210 	struct freeblks *freeblks;
4211 	off_t size;
4212 	int extsize;
4213 {
4214 	struct jtrunc *jtrunc;
4215 
4216 	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4217 	workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4218 	    freeblks->fb_list.wk_mp);
4219 	jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4220 	jtrunc->jt_dep.jb_freeblks = freeblks;
4221 	jtrunc->jt_ino = freeblks->fb_inum;
4222 	jtrunc->jt_size = size;
4223 	jtrunc->jt_extsize = extsize;
4224 	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4225 
4226 	return (jtrunc);
4227 }
4228 
4229 /*
4230  * If we're canceling a new bitmap we have to search for another ref
4231  * to move into the bmsafemap dep.  This might be better expressed
4232  * with another structure.
4233  */
4234 static void
4235 move_newblock_dep(jaddref, inodedep)
4236 	struct jaddref *jaddref;
4237 	struct inodedep *inodedep;
4238 {
4239 	struct inoref *inoref;
4240 	struct jaddref *jaddrefn;
4241 
4242 	jaddrefn = NULL;
4243 	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4244 	    inoref = TAILQ_NEXT(inoref, if_deps)) {
4245 		if ((jaddref->ja_state & NEWBLOCK) &&
4246 		    inoref->if_list.wk_type == D_JADDREF) {
4247 			jaddrefn = (struct jaddref *)inoref;
4248 			break;
4249 		}
4250 	}
4251 	if (jaddrefn == NULL)
4252 		return;
4253 	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4254 	jaddrefn->ja_state |= jaddref->ja_state &
4255 	    (ATTACHED | UNDONE | NEWBLOCK);
4256 	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4257 	jaddref->ja_state |= ATTACHED;
4258 	LIST_REMOVE(jaddref, ja_bmdeps);
4259 	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4260 	    ja_bmdeps);
4261 }
4262 
4263 /*
4264  * Cancel a jaddref either before it has been written or while it is being
4265  * written.  This happens when a link is removed before the add reaches
4266  * the disk.  The jaddref dependency is kept linked into the bmsafemap
4267  * and inode to prevent the link count or bitmap from reaching the disk
4268  * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4269  * required.
4270  *
4271  * Returns 1 if the canceled addref requires journaling of the remove and
4272  * 0 otherwise.
4273  */
4274 static int
4275 cancel_jaddref(jaddref, inodedep, wkhd)
4276 	struct jaddref *jaddref;
4277 	struct inodedep *inodedep;
4278 	struct workhead *wkhd;
4279 {
4280 	struct inoref *inoref;
4281 	struct jsegdep *jsegdep;
4282 	int needsj;
4283 
4284 	KASSERT((jaddref->ja_state & COMPLETE) == 0,
4285 	    ("cancel_jaddref: Canceling complete jaddref"));
4286 	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4287 		needsj = 1;
4288 	else
4289 		needsj = 0;
4290 	if (inodedep == NULL)
4291 		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4292 		    0, &inodedep) == 0)
4293 			panic("cancel_jaddref: Lost inodedep");
4294 	/*
4295 	 * We must adjust the nlink of any reference operation that follows
4296 	 * us so that it is consistent with the in-memory reference.  This
4297 	 * ensures that inode nlink rollbacks always have the correct link.
4298 	 */
4299 	if (needsj == 0) {
4300 		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4301 		    inoref = TAILQ_NEXT(inoref, if_deps)) {
4302 			if (inoref->if_state & GOINGAWAY)
4303 				break;
4304 			inoref->if_nlink--;
4305 		}
4306 	}
4307 	jsegdep = inoref_jseg(&jaddref->ja_ref);
4308 	if (jaddref->ja_state & NEWBLOCK)
4309 		move_newblock_dep(jaddref, inodedep);
4310 	wake_worklist(&jaddref->ja_list);
4311 	jaddref->ja_mkdir = NULL;
4312 	if (jaddref->ja_state & INPROGRESS) {
4313 		jaddref->ja_state &= ~INPROGRESS;
4314 		WORKLIST_REMOVE(&jaddref->ja_list);
4315 		jwork_insert(wkhd, jsegdep);
4316 	} else {
4317 		free_jsegdep(jsegdep);
4318 		if (jaddref->ja_state & DEPCOMPLETE)
4319 			remove_from_journal(&jaddref->ja_list);
4320 	}
4321 	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4322 	/*
4323 	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4324 	 * can arrange for them to be freed with the bitmap.  Otherwise we
4325 	 * no longer need this addref attached to the inoreflst and it
4326 	 * will incorrectly adjust nlink if we leave it.
4327 	 */
4328 	if ((jaddref->ja_state & NEWBLOCK) == 0) {
4329 		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4330 		    if_deps);
4331 		jaddref->ja_state |= COMPLETE;
4332 		free_jaddref(jaddref);
4333 		return (needsj);
4334 	}
4335 	/*
4336 	 * Leave the head of the list for jsegdeps for fast merging.
4337 	 */
4338 	if (LIST_FIRST(wkhd) != NULL) {
4339 		jaddref->ja_state |= ONWORKLIST;
4340 		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4341 	} else
4342 		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4343 
4344 	return (needsj);
4345 }
4346 
4347 /*
4348  * Attempt to free a jaddref structure when some work completes.  This
4349  * should only succeed once the entry is written and all dependencies have
4350  * been notified.
4351  */
4352 static void
4353 free_jaddref(jaddref)
4354 	struct jaddref *jaddref;
4355 {
4356 
4357 	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4358 		return;
4359 	if (jaddref->ja_ref.if_jsegdep)
4360 		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4361 		    jaddref, jaddref->ja_state);
4362 	if (jaddref->ja_state & NEWBLOCK)
4363 		LIST_REMOVE(jaddref, ja_bmdeps);
4364 	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4365 		panic("free_jaddref: Bad state %p(0x%X)",
4366 		    jaddref, jaddref->ja_state);
4367 	if (jaddref->ja_mkdir != NULL)
4368 		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4369 	WORKITEM_FREE(jaddref, D_JADDREF);
4370 }
4371 
4372 /*
4373  * Free a jremref structure once it has been written or discarded.
4374  */
4375 static void
4376 free_jremref(jremref)
4377 	struct jremref *jremref;
4378 {
4379 
4380 	if (jremref->jr_ref.if_jsegdep)
4381 		free_jsegdep(jremref->jr_ref.if_jsegdep);
4382 	if (jremref->jr_state & INPROGRESS)
4383 		panic("free_jremref: IO still pending");
4384 	WORKITEM_FREE(jremref, D_JREMREF);
4385 }
4386 
4387 /*
4388  * Free a jnewblk structure.
4389  */
4390 static void
4391 free_jnewblk(jnewblk)
4392 	struct jnewblk *jnewblk;
4393 {
4394 
4395 	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4396 		return;
4397 	LIST_REMOVE(jnewblk, jn_deps);
4398 	if (jnewblk->jn_dep != NULL)
4399 		panic("free_jnewblk: Dependency still attached.");
4400 	WORKITEM_FREE(jnewblk, D_JNEWBLK);
4401 }
4402 
4403 /*
4404  * Cancel a jnewblk which has been been made redundant by frag extension.
4405  */
4406 static void
4407 cancel_jnewblk(jnewblk, wkhd)
4408 	struct jnewblk *jnewblk;
4409 	struct workhead *wkhd;
4410 {
4411 	struct jsegdep *jsegdep;
4412 
4413 	CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4414 	jsegdep = jnewblk->jn_jsegdep;
4415 	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4416 		panic("cancel_jnewblk: Invalid state");
4417 	jnewblk->jn_jsegdep  = NULL;
4418 	jnewblk->jn_dep = NULL;
4419 	jnewblk->jn_state |= GOINGAWAY;
4420 	if (jnewblk->jn_state & INPROGRESS) {
4421 		jnewblk->jn_state &= ~INPROGRESS;
4422 		WORKLIST_REMOVE(&jnewblk->jn_list);
4423 		jwork_insert(wkhd, jsegdep);
4424 	} else {
4425 		free_jsegdep(jsegdep);
4426 		remove_from_journal(&jnewblk->jn_list);
4427 	}
4428 	wake_worklist(&jnewblk->jn_list);
4429 	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4430 }
4431 
4432 static void
4433 free_jblkdep(jblkdep)
4434 	struct jblkdep *jblkdep;
4435 {
4436 
4437 	if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4438 		WORKITEM_FREE(jblkdep, D_JFREEBLK);
4439 	else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4440 		WORKITEM_FREE(jblkdep, D_JTRUNC);
4441 	else
4442 		panic("free_jblkdep: Unexpected type %s",
4443 		    TYPENAME(jblkdep->jb_list.wk_type));
4444 }
4445 
4446 /*
4447  * Free a single jseg once it is no longer referenced in memory or on
4448  * disk.  Reclaim journal blocks and dependencies waiting for the segment
4449  * to disappear.
4450  */
4451 static void
4452 free_jseg(jseg, jblocks)
4453 	struct jseg *jseg;
4454 	struct jblocks *jblocks;
4455 {
4456 	struct freework *freework;
4457 
4458 	/*
4459 	 * Free freework structures that were lingering to indicate freed
4460 	 * indirect blocks that forced journal write ordering on reallocate.
4461 	 */
4462 	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4463 		indirblk_remove(freework);
4464 	if (jblocks->jb_oldestseg == jseg)
4465 		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4466 	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4467 	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4468 	KASSERT(LIST_EMPTY(&jseg->js_entries),
4469 	    ("free_jseg: Freed jseg has valid entries."));
4470 	WORKITEM_FREE(jseg, D_JSEG);
4471 }
4472 
4473 /*
4474  * Free all jsegs that meet the criteria for being reclaimed and update
4475  * oldestseg.
4476  */
4477 static void
4478 free_jsegs(jblocks)
4479 	struct jblocks *jblocks;
4480 {
4481 	struct jseg *jseg;
4482 
4483 	/*
4484 	 * Free only those jsegs which have none allocated before them to
4485 	 * preserve the journal space ordering.
4486 	 */
4487 	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4488 		/*
4489 		 * Only reclaim space when nothing depends on this journal
4490 		 * set and another set has written that it is no longer
4491 		 * valid.
4492 		 */
4493 		if (jseg->js_refs != 0) {
4494 			jblocks->jb_oldestseg = jseg;
4495 			return;
4496 		}
4497 		if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4498 			break;
4499 		if (jseg->js_seq > jblocks->jb_oldestwrseq)
4500 			break;
4501 		/*
4502 		 * We can free jsegs that didn't write entries when
4503 		 * oldestwrseq == js_seq.
4504 		 */
4505 		if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4506 		    jseg->js_cnt != 0)
4507 			break;
4508 		free_jseg(jseg, jblocks);
4509 	}
4510 	/*
4511 	 * If we exited the loop above we still must discover the
4512 	 * oldest valid segment.
4513 	 */
4514 	if (jseg)
4515 		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4516 		     jseg = TAILQ_NEXT(jseg, js_next))
4517 			if (jseg->js_refs != 0)
4518 				break;
4519 	jblocks->jb_oldestseg = jseg;
4520 	/*
4521 	 * The journal has no valid records but some jsegs may still be
4522 	 * waiting on oldestwrseq to advance.  We force a small record
4523 	 * out to permit these lingering records to be reclaimed.
4524 	 */
4525 	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4526 		jblocks->jb_needseg = 1;
4527 }
4528 
4529 /*
4530  * Release one reference to a jseg and free it if the count reaches 0.  This
4531  * should eventually reclaim journal space as well.
4532  */
4533 static void
4534 rele_jseg(jseg)
4535 	struct jseg *jseg;
4536 {
4537 
4538 	KASSERT(jseg->js_refs > 0,
4539 	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
4540 	if (--jseg->js_refs != 0)
4541 		return;
4542 	free_jsegs(jseg->js_jblocks);
4543 }
4544 
4545 /*
4546  * Release a jsegdep and decrement the jseg count.
4547  */
4548 static void
4549 free_jsegdep(jsegdep)
4550 	struct jsegdep *jsegdep;
4551 {
4552 
4553 	if (jsegdep->jd_seg)
4554 		rele_jseg(jsegdep->jd_seg);
4555 	WORKITEM_FREE(jsegdep, D_JSEGDEP);
4556 }
4557 
4558 /*
4559  * Wait for a journal item to make it to disk.  Initiate journal processing
4560  * if required.
4561  */
4562 static int
4563 jwait(wk, waitfor)
4564 	struct worklist *wk;
4565 	int waitfor;
4566 {
4567 
4568 	LOCK_OWNED(VFSTOUFS(wk->wk_mp));
4569 	/*
4570 	 * Blocking journal waits cause slow synchronous behavior.  Record
4571 	 * stats on the frequency of these blocking operations.
4572 	 */
4573 	if (waitfor == MNT_WAIT) {
4574 		stat_journal_wait++;
4575 		switch (wk->wk_type) {
4576 		case D_JREMREF:
4577 		case D_JMVREF:
4578 			stat_jwait_filepage++;
4579 			break;
4580 		case D_JTRUNC:
4581 		case D_JFREEBLK:
4582 			stat_jwait_freeblks++;
4583 			break;
4584 		case D_JNEWBLK:
4585 			stat_jwait_newblk++;
4586 			break;
4587 		case D_JADDREF:
4588 			stat_jwait_inode++;
4589 			break;
4590 		default:
4591 			break;
4592 		}
4593 	}
4594 	/*
4595 	 * If IO has not started we process the journal.  We can't mark the
4596 	 * worklist item as IOWAITING because we drop the lock while
4597 	 * processing the journal and the worklist entry may be freed after
4598 	 * this point.  The caller may call back in and re-issue the request.
4599 	 */
4600 	if ((wk->wk_state & INPROGRESS) == 0) {
4601 		softdep_process_journal(wk->wk_mp, wk, waitfor);
4602 		if (waitfor != MNT_WAIT)
4603 			return (EBUSY);
4604 		return (0);
4605 	}
4606 	if (waitfor != MNT_WAIT)
4607 		return (EBUSY);
4608 	wait_worklist(wk, "jwait");
4609 	return (0);
4610 }
4611 
4612 /*
4613  * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4614  * appropriate.  This is a convenience function to reduce duplicate code
4615  * for the setup and revert functions below.
4616  */
4617 static struct inodedep *
4618 inodedep_lookup_ip(ip)
4619 	struct inode *ip;
4620 {
4621 	struct inodedep *inodedep;
4622 	int dflags;
4623 
4624 	KASSERT(ip->i_nlink >= ip->i_effnlink,
4625 	    ("inodedep_lookup_ip: bad delta"));
4626 	dflags = DEPALLOC;
4627 	if (IS_SNAPSHOT(ip))
4628 		dflags |= NODELAY;
4629 	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags,
4630 	    &inodedep);
4631 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4632 	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4633 
4634 	return (inodedep);
4635 }
4636 
4637 /*
4638  * Called prior to creating a new inode and linking it to a directory.  The
4639  * jaddref structure must already be allocated by softdep_setup_inomapdep
4640  * and it is discovered here so we can initialize the mode and update
4641  * nlinkdelta.
4642  */
4643 void
4644 softdep_setup_create(dp, ip)
4645 	struct inode *dp;
4646 	struct inode *ip;
4647 {
4648 	struct inodedep *inodedep;
4649 	struct jaddref *jaddref;
4650 	struct vnode *dvp;
4651 
4652 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4653 	    ("softdep_setup_create called on non-softdep filesystem"));
4654 	KASSERT(ip->i_nlink == 1,
4655 	    ("softdep_setup_create: Invalid link count."));
4656 	dvp = ITOV(dp);
4657 	ACQUIRE_LOCK(dp->i_ump);
4658 	inodedep = inodedep_lookup_ip(ip);
4659 	if (DOINGSUJ(dvp)) {
4660 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4661 		    inoreflst);
4662 		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4663 		    ("softdep_setup_create: No addref structure present."));
4664 	}
4665 	softdep_prelink(dvp, NULL);
4666 	FREE_LOCK(dp->i_ump);
4667 }
4668 
4669 /*
4670  * Create a jaddref structure to track the addition of a DOTDOT link when
4671  * we are reparenting an inode as part of a rename.  This jaddref will be
4672  * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4673  * non-journaling softdep.
4674  */
4675 void
4676 softdep_setup_dotdot_link(dp, ip)
4677 	struct inode *dp;
4678 	struct inode *ip;
4679 {
4680 	struct inodedep *inodedep;
4681 	struct jaddref *jaddref;
4682 	struct vnode *dvp;
4683 	struct vnode *vp;
4684 
4685 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4686 	    ("softdep_setup_dotdot_link called on non-softdep filesystem"));
4687 	dvp = ITOV(dp);
4688 	vp = ITOV(ip);
4689 	jaddref = NULL;
4690 	/*
4691 	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4692 	 * is used as a normal link would be.
4693 	 */
4694 	if (DOINGSUJ(dvp))
4695 		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4696 		    dp->i_effnlink - 1, dp->i_mode);
4697 	ACQUIRE_LOCK(dp->i_ump);
4698 	inodedep = inodedep_lookup_ip(dp);
4699 	if (jaddref)
4700 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4701 		    if_deps);
4702 	softdep_prelink(dvp, ITOV(ip));
4703 	FREE_LOCK(dp->i_ump);
4704 }
4705 
4706 /*
4707  * Create a jaddref structure to track a new link to an inode.  The directory
4708  * offset is not known until softdep_setup_directory_add or
4709  * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4710  * softdep.
4711  */
4712 void
4713 softdep_setup_link(dp, ip)
4714 	struct inode *dp;
4715 	struct inode *ip;
4716 {
4717 	struct inodedep *inodedep;
4718 	struct jaddref *jaddref;
4719 	struct vnode *dvp;
4720 
4721 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4722 	    ("softdep_setup_link called on non-softdep filesystem"));
4723 	dvp = ITOV(dp);
4724 	jaddref = NULL;
4725 	if (DOINGSUJ(dvp))
4726 		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4727 		    ip->i_mode);
4728 	ACQUIRE_LOCK(dp->i_ump);
4729 	inodedep = inodedep_lookup_ip(ip);
4730 	if (jaddref)
4731 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4732 		    if_deps);
4733 	softdep_prelink(dvp, ITOV(ip));
4734 	FREE_LOCK(dp->i_ump);
4735 }
4736 
4737 /*
4738  * Called to create the jaddref structures to track . and .. references as
4739  * well as lookup and further initialize the incomplete jaddref created
4740  * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4741  * nlinkdelta for non-journaling softdep.
4742  */
4743 void
4744 softdep_setup_mkdir(dp, ip)
4745 	struct inode *dp;
4746 	struct inode *ip;
4747 {
4748 	struct inodedep *inodedep;
4749 	struct jaddref *dotdotaddref;
4750 	struct jaddref *dotaddref;
4751 	struct jaddref *jaddref;
4752 	struct vnode *dvp;
4753 
4754 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4755 	    ("softdep_setup_mkdir called on non-softdep filesystem"));
4756 	dvp = ITOV(dp);
4757 	dotaddref = dotdotaddref = NULL;
4758 	if (DOINGSUJ(dvp)) {
4759 		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4760 		    ip->i_mode);
4761 		dotaddref->ja_state |= MKDIR_BODY;
4762 		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4763 		    dp->i_effnlink - 1, dp->i_mode);
4764 		dotdotaddref->ja_state |= MKDIR_PARENT;
4765 	}
4766 	ACQUIRE_LOCK(dp->i_ump);
4767 	inodedep = inodedep_lookup_ip(ip);
4768 	if (DOINGSUJ(dvp)) {
4769 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4770 		    inoreflst);
4771 		KASSERT(jaddref != NULL,
4772 		    ("softdep_setup_mkdir: No addref structure present."));
4773 		KASSERT(jaddref->ja_parent == dp->i_number,
4774 		    ("softdep_setup_mkdir: bad parent %ju",
4775 		    (uintmax_t)jaddref->ja_parent));
4776 		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4777 		    if_deps);
4778 	}
4779 	inodedep = inodedep_lookup_ip(dp);
4780 	if (DOINGSUJ(dvp))
4781 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4782 		    &dotdotaddref->ja_ref, if_deps);
4783 	softdep_prelink(ITOV(dp), NULL);
4784 	FREE_LOCK(dp->i_ump);
4785 }
4786 
4787 /*
4788  * Called to track nlinkdelta of the inode and parent directories prior to
4789  * unlinking a directory.
4790  */
4791 void
4792 softdep_setup_rmdir(dp, ip)
4793 	struct inode *dp;
4794 	struct inode *ip;
4795 {
4796 	struct vnode *dvp;
4797 
4798 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4799 	    ("softdep_setup_rmdir called on non-softdep filesystem"));
4800 	dvp = ITOV(dp);
4801 	ACQUIRE_LOCK(dp->i_ump);
4802 	(void) inodedep_lookup_ip(ip);
4803 	(void) inodedep_lookup_ip(dp);
4804 	softdep_prelink(dvp, ITOV(ip));
4805 	FREE_LOCK(dp->i_ump);
4806 }
4807 
4808 /*
4809  * Called to track nlinkdelta of the inode and parent directories prior to
4810  * unlink.
4811  */
4812 void
4813 softdep_setup_unlink(dp, ip)
4814 	struct inode *dp;
4815 	struct inode *ip;
4816 {
4817 	struct vnode *dvp;
4818 
4819 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4820 	    ("softdep_setup_unlink called on non-softdep filesystem"));
4821 	dvp = ITOV(dp);
4822 	ACQUIRE_LOCK(dp->i_ump);
4823 	(void) inodedep_lookup_ip(ip);
4824 	(void) inodedep_lookup_ip(dp);
4825 	softdep_prelink(dvp, ITOV(ip));
4826 	FREE_LOCK(dp->i_ump);
4827 }
4828 
4829 /*
4830  * Called to release the journal structures created by a failed non-directory
4831  * creation.  Adjusts nlinkdelta for non-journaling softdep.
4832  */
4833 void
4834 softdep_revert_create(dp, ip)
4835 	struct inode *dp;
4836 	struct inode *ip;
4837 {
4838 	struct inodedep *inodedep;
4839 	struct jaddref *jaddref;
4840 	struct vnode *dvp;
4841 
4842 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4843 	    ("softdep_revert_create called on non-softdep filesystem"));
4844 	dvp = ITOV(dp);
4845 	ACQUIRE_LOCK(dp->i_ump);
4846 	inodedep = inodedep_lookup_ip(ip);
4847 	if (DOINGSUJ(dvp)) {
4848 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4849 		    inoreflst);
4850 		KASSERT(jaddref->ja_parent == dp->i_number,
4851 		    ("softdep_revert_create: addref parent mismatch"));
4852 		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4853 	}
4854 	FREE_LOCK(dp->i_ump);
4855 }
4856 
4857 /*
4858  * Called to release the journal structures created by a failed link
4859  * addition.  Adjusts nlinkdelta for non-journaling softdep.
4860  */
4861 void
4862 softdep_revert_link(dp, ip)
4863 	struct inode *dp;
4864 	struct inode *ip;
4865 {
4866 	struct inodedep *inodedep;
4867 	struct jaddref *jaddref;
4868 	struct vnode *dvp;
4869 
4870 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4871 	    ("softdep_revert_link called on non-softdep filesystem"));
4872 	dvp = ITOV(dp);
4873 	ACQUIRE_LOCK(dp->i_ump);
4874 	inodedep = inodedep_lookup_ip(ip);
4875 	if (DOINGSUJ(dvp)) {
4876 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4877 		    inoreflst);
4878 		KASSERT(jaddref->ja_parent == dp->i_number,
4879 		    ("softdep_revert_link: addref parent mismatch"));
4880 		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4881 	}
4882 	FREE_LOCK(dp->i_ump);
4883 }
4884 
4885 /*
4886  * Called to release the journal structures created by a failed mkdir
4887  * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4888  */
4889 void
4890 softdep_revert_mkdir(dp, ip)
4891 	struct inode *dp;
4892 	struct inode *ip;
4893 {
4894 	struct inodedep *inodedep;
4895 	struct jaddref *jaddref;
4896 	struct jaddref *dotaddref;
4897 	struct vnode *dvp;
4898 
4899 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4900 	    ("softdep_revert_mkdir called on non-softdep filesystem"));
4901 	dvp = ITOV(dp);
4902 
4903 	ACQUIRE_LOCK(dp->i_ump);
4904 	inodedep = inodedep_lookup_ip(dp);
4905 	if (DOINGSUJ(dvp)) {
4906 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4907 		    inoreflst);
4908 		KASSERT(jaddref->ja_parent == ip->i_number,
4909 		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4910 		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4911 	}
4912 	inodedep = inodedep_lookup_ip(ip);
4913 	if (DOINGSUJ(dvp)) {
4914 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4915 		    inoreflst);
4916 		KASSERT(jaddref->ja_parent == dp->i_number,
4917 		    ("softdep_revert_mkdir: addref parent mismatch"));
4918 		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4919 		    inoreflst, if_deps);
4920 		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4921 		KASSERT(dotaddref->ja_parent == ip->i_number,
4922 		    ("softdep_revert_mkdir: dot addref parent mismatch"));
4923 		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4924 	}
4925 	FREE_LOCK(dp->i_ump);
4926 }
4927 
4928 /*
4929  * Called to correct nlinkdelta after a failed rmdir.
4930  */
4931 void
4932 softdep_revert_rmdir(dp, ip)
4933 	struct inode *dp;
4934 	struct inode *ip;
4935 {
4936 
4937 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4938 	    ("softdep_revert_rmdir called on non-softdep filesystem"));
4939 	ACQUIRE_LOCK(dp->i_ump);
4940 	(void) inodedep_lookup_ip(ip);
4941 	(void) inodedep_lookup_ip(dp);
4942 	FREE_LOCK(dp->i_ump);
4943 }
4944 
4945 /*
4946  * Protecting the freemaps (or bitmaps).
4947  *
4948  * To eliminate the need to execute fsck before mounting a filesystem
4949  * after a power failure, one must (conservatively) guarantee that the
4950  * on-disk copy of the bitmaps never indicate that a live inode or block is
4951  * free.  So, when a block or inode is allocated, the bitmap should be
4952  * updated (on disk) before any new pointers.  When a block or inode is
4953  * freed, the bitmap should not be updated until all pointers have been
4954  * reset.  The latter dependency is handled by the delayed de-allocation
4955  * approach described below for block and inode de-allocation.  The former
4956  * dependency is handled by calling the following procedure when a block or
4957  * inode is allocated. When an inode is allocated an "inodedep" is created
4958  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4959  * Each "inodedep" is also inserted into the hash indexing structure so
4960  * that any additional link additions can be made dependent on the inode
4961  * allocation.
4962  *
4963  * The ufs filesystem maintains a number of free block counts (e.g., per
4964  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4965  * in addition to the bitmaps.  These counts are used to improve efficiency
4966  * during allocation and therefore must be consistent with the bitmaps.
4967  * There is no convenient way to guarantee post-crash consistency of these
4968  * counts with simple update ordering, for two main reasons: (1) The counts
4969  * and bitmaps for a single cylinder group block are not in the same disk
4970  * sector.  If a disk write is interrupted (e.g., by power failure), one may
4971  * be written and the other not.  (2) Some of the counts are located in the
4972  * superblock rather than the cylinder group block. So, we focus our soft
4973  * updates implementation on protecting the bitmaps. When mounting a
4974  * filesystem, we recompute the auxiliary counts from the bitmaps.
4975  */
4976 
4977 /*
4978  * Called just after updating the cylinder group block to allocate an inode.
4979  */
4980 void
4981 softdep_setup_inomapdep(bp, ip, newinum, mode)
4982 	struct buf *bp;		/* buffer for cylgroup block with inode map */
4983 	struct inode *ip;	/* inode related to allocation */
4984 	ino_t newinum;		/* new inode number being allocated */
4985 	int mode;
4986 {
4987 	struct inodedep *inodedep;
4988 	struct bmsafemap *bmsafemap;
4989 	struct jaddref *jaddref;
4990 	struct mount *mp;
4991 	struct fs *fs;
4992 
4993 	mp = UFSTOVFS(ip->i_ump);
4994 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
4995 	    ("softdep_setup_inomapdep called on non-softdep filesystem"));
4996 	fs = ip->i_ump->um_fs;
4997 	jaddref = NULL;
4998 
4999 	/*
5000 	 * Allocate the journal reference add structure so that the bitmap
5001 	 * can be dependent on it.
5002 	 */
5003 	if (MOUNTEDSUJ(mp)) {
5004 		jaddref = newjaddref(ip, newinum, 0, 0, mode);
5005 		jaddref->ja_state |= NEWBLOCK;
5006 	}
5007 
5008 	/*
5009 	 * Create a dependency for the newly allocated inode.
5010 	 * Panic if it already exists as something is seriously wrong.
5011 	 * Otherwise add it to the dependency list for the buffer holding
5012 	 * the cylinder group map from which it was allocated.
5013 	 *
5014 	 * We have to preallocate a bmsafemap entry in case it is needed
5015 	 * in bmsafemap_lookup since once we allocate the inodedep, we
5016 	 * have to finish initializing it before we can FREE_LOCK().
5017 	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
5018 	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
5019 	 * creating the inodedep as it can be freed during the time
5020 	 * that we FREE_LOCK() while allocating the inodedep. We must
5021 	 * call workitem_alloc() before entering the locked section as
5022 	 * it also acquires the lock and we must avoid trying doing so
5023 	 * recursively.
5024 	 */
5025 	bmsafemap = malloc(sizeof(struct bmsafemap),
5026 	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5027 	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5028 	ACQUIRE_LOCK(ip->i_ump);
5029 	if ((inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep)))
5030 		panic("softdep_setup_inomapdep: dependency %p for new"
5031 		    "inode already exists", inodedep);
5032 	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
5033 	if (jaddref) {
5034 		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
5035 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5036 		    if_deps);
5037 	} else {
5038 		inodedep->id_state |= ONDEPLIST;
5039 		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
5040 	}
5041 	inodedep->id_bmsafemap = bmsafemap;
5042 	inodedep->id_state &= ~DEPCOMPLETE;
5043 	FREE_LOCK(ip->i_ump);
5044 }
5045 
5046 /*
5047  * Called just after updating the cylinder group block to
5048  * allocate block or fragment.
5049  */
5050 void
5051 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
5052 	struct buf *bp;		/* buffer for cylgroup block with block map */
5053 	struct mount *mp;	/* filesystem doing allocation */
5054 	ufs2_daddr_t newblkno;	/* number of newly allocated block */
5055 	int frags;		/* Number of fragments. */
5056 	int oldfrags;		/* Previous number of fragments for extend. */
5057 {
5058 	struct newblk *newblk;
5059 	struct bmsafemap *bmsafemap;
5060 	struct jnewblk *jnewblk;
5061 	struct ufsmount *ump;
5062 	struct fs *fs;
5063 
5064 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5065 	    ("softdep_setup_blkmapdep called on non-softdep filesystem"));
5066 	ump = VFSTOUFS(mp);
5067 	fs = ump->um_fs;
5068 	jnewblk = NULL;
5069 	/*
5070 	 * Create a dependency for the newly allocated block.
5071 	 * Add it to the dependency list for the buffer holding
5072 	 * the cylinder group map from which it was allocated.
5073 	 */
5074 	if (MOUNTEDSUJ(mp)) {
5075 		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
5076 		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
5077 		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
5078 		jnewblk->jn_state = ATTACHED;
5079 		jnewblk->jn_blkno = newblkno;
5080 		jnewblk->jn_frags = frags;
5081 		jnewblk->jn_oldfrags = oldfrags;
5082 #ifdef SUJ_DEBUG
5083 		{
5084 			struct cg *cgp;
5085 			uint8_t *blksfree;
5086 			long bno;
5087 			int i;
5088 
5089 			cgp = (struct cg *)bp->b_data;
5090 			blksfree = cg_blksfree(cgp);
5091 			bno = dtogd(fs, jnewblk->jn_blkno);
5092 			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
5093 			    i++) {
5094 				if (isset(blksfree, bno + i))
5095 					panic("softdep_setup_blkmapdep: "
5096 					    "free fragment %d from %d-%d "
5097 					    "state 0x%X dep %p", i,
5098 					    jnewblk->jn_oldfrags,
5099 					    jnewblk->jn_frags,
5100 					    jnewblk->jn_state,
5101 					    jnewblk->jn_dep);
5102 			}
5103 		}
5104 #endif
5105 	}
5106 
5107 	CTR3(KTR_SUJ,
5108 	    "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
5109 	    newblkno, frags, oldfrags);
5110 	ACQUIRE_LOCK(ump);
5111 	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
5112 		panic("softdep_setup_blkmapdep: found block");
5113 	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
5114 	    dtog(fs, newblkno), NULL);
5115 	if (jnewblk) {
5116 		jnewblk->jn_dep = (struct worklist *)newblk;
5117 		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
5118 	} else {
5119 		newblk->nb_state |= ONDEPLIST;
5120 		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
5121 	}
5122 	newblk->nb_bmsafemap = bmsafemap;
5123 	newblk->nb_jnewblk = jnewblk;
5124 	FREE_LOCK(ump);
5125 }
5126 
5127 #define	BMSAFEMAP_HASH(ump, cg) \
5128       (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size])
5129 
5130 static int
5131 bmsafemap_find(bmsafemaphd, cg, bmsafemapp)
5132 	struct bmsafemap_hashhead *bmsafemaphd;
5133 	int cg;
5134 	struct bmsafemap **bmsafemapp;
5135 {
5136 	struct bmsafemap *bmsafemap;
5137 
5138 	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
5139 		if (bmsafemap->sm_cg == cg)
5140 			break;
5141 	if (bmsafemap) {
5142 		*bmsafemapp = bmsafemap;
5143 		return (1);
5144 	}
5145 	*bmsafemapp = NULL;
5146 
5147 	return (0);
5148 }
5149 
5150 /*
5151  * Find the bmsafemap associated with a cylinder group buffer.
5152  * If none exists, create one. The buffer must be locked when
5153  * this routine is called and this routine must be called with
5154  * the softdep lock held. To avoid giving up the lock while
5155  * allocating a new bmsafemap, a preallocated bmsafemap may be
5156  * provided. If it is provided but not needed, it is freed.
5157  */
5158 static struct bmsafemap *
5159 bmsafemap_lookup(mp, bp, cg, newbmsafemap)
5160 	struct mount *mp;
5161 	struct buf *bp;
5162 	int cg;
5163 	struct bmsafemap *newbmsafemap;
5164 {
5165 	struct bmsafemap_hashhead *bmsafemaphd;
5166 	struct bmsafemap *bmsafemap, *collision;
5167 	struct worklist *wk;
5168 	struct ufsmount *ump;
5169 
5170 	ump = VFSTOUFS(mp);
5171 	LOCK_OWNED(ump);
5172 	KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
5173 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5174 		if (wk->wk_type == D_BMSAFEMAP) {
5175 			if (newbmsafemap)
5176 				WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5177 			return (WK_BMSAFEMAP(wk));
5178 		}
5179 	}
5180 	bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
5181 	if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
5182 		if (newbmsafemap)
5183 			WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5184 		return (bmsafemap);
5185 	}
5186 	if (newbmsafemap) {
5187 		bmsafemap = newbmsafemap;
5188 	} else {
5189 		FREE_LOCK(ump);
5190 		bmsafemap = malloc(sizeof(struct bmsafemap),
5191 			M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5192 		workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5193 		ACQUIRE_LOCK(ump);
5194 	}
5195 	bmsafemap->sm_buf = bp;
5196 	LIST_INIT(&bmsafemap->sm_inodedephd);
5197 	LIST_INIT(&bmsafemap->sm_inodedepwr);
5198 	LIST_INIT(&bmsafemap->sm_newblkhd);
5199 	LIST_INIT(&bmsafemap->sm_newblkwr);
5200 	LIST_INIT(&bmsafemap->sm_jaddrefhd);
5201 	LIST_INIT(&bmsafemap->sm_jnewblkhd);
5202 	LIST_INIT(&bmsafemap->sm_freehd);
5203 	LIST_INIT(&bmsafemap->sm_freewr);
5204 	if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
5205 		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5206 		return (collision);
5207 	}
5208 	bmsafemap->sm_cg = cg;
5209 	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5210 	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
5211 	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5212 	return (bmsafemap);
5213 }
5214 
5215 /*
5216  * Direct block allocation dependencies.
5217  *
5218  * When a new block is allocated, the corresponding disk locations must be
5219  * initialized (with zeros or new data) before the on-disk inode points to
5220  * them.  Also, the freemap from which the block was allocated must be
5221  * updated (on disk) before the inode's pointer. These two dependencies are
5222  * independent of each other and are needed for all file blocks and indirect
5223  * blocks that are pointed to directly by the inode.  Just before the
5224  * "in-core" version of the inode is updated with a newly allocated block
5225  * number, a procedure (below) is called to setup allocation dependency
5226  * structures.  These structures are removed when the corresponding
5227  * dependencies are satisfied or when the block allocation becomes obsolete
5228  * (i.e., the file is deleted, the block is de-allocated, or the block is a
5229  * fragment that gets upgraded).  All of these cases are handled in
5230  * procedures described later.
5231  *
5232  * When a file extension causes a fragment to be upgraded, either to a larger
5233  * fragment or to a full block, the on-disk location may change (if the
5234  * previous fragment could not simply be extended). In this case, the old
5235  * fragment must be de-allocated, but not until after the inode's pointer has
5236  * been updated. In most cases, this is handled by later procedures, which
5237  * will construct a "freefrag" structure to be added to the workitem queue
5238  * when the inode update is complete (or obsolete).  The main exception to
5239  * this is when an allocation occurs while a pending allocation dependency
5240  * (for the same block pointer) remains.  This case is handled in the main
5241  * allocation dependency setup procedure by immediately freeing the
5242  * unreferenced fragments.
5243  */
5244 void
5245 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5246 	struct inode *ip;	/* inode to which block is being added */
5247 	ufs_lbn_t off;		/* block pointer within inode */
5248 	ufs2_daddr_t newblkno;	/* disk block number being added */
5249 	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
5250 	long newsize;		/* size of new block */
5251 	long oldsize;		/* size of new block */
5252 	struct buf *bp;		/* bp for allocated block */
5253 {
5254 	struct allocdirect *adp, *oldadp;
5255 	struct allocdirectlst *adphead;
5256 	struct freefrag *freefrag;
5257 	struct inodedep *inodedep;
5258 	struct pagedep *pagedep;
5259 	struct jnewblk *jnewblk;
5260 	struct newblk *newblk;
5261 	struct mount *mp;
5262 	ufs_lbn_t lbn;
5263 
5264 	lbn = bp->b_lblkno;
5265 	mp = UFSTOVFS(ip->i_ump);
5266 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5267 	    ("softdep_setup_allocdirect called on non-softdep filesystem"));
5268 	if (oldblkno && oldblkno != newblkno)
5269 		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5270 	else
5271 		freefrag = NULL;
5272 
5273 	CTR6(KTR_SUJ,
5274 	    "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5275 	    "off %jd newsize %ld oldsize %d",
5276 	    ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5277 	ACQUIRE_LOCK(ip->i_ump);
5278 	if (off >= NDADDR) {
5279 		if (lbn > 0)
5280 			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5281 			    lbn, off);
5282 		/* allocating an indirect block */
5283 		if (oldblkno != 0)
5284 			panic("softdep_setup_allocdirect: non-zero indir");
5285 	} else {
5286 		if (off != lbn)
5287 			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5288 			    lbn, off);
5289 		/*
5290 		 * Allocating a direct block.
5291 		 *
5292 		 * If we are allocating a directory block, then we must
5293 		 * allocate an associated pagedep to track additions and
5294 		 * deletions.
5295 		 */
5296 		if ((ip->i_mode & IFMT) == IFDIR)
5297 			pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5298 			    &pagedep);
5299 	}
5300 	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5301 		panic("softdep_setup_allocdirect: lost block");
5302 	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5303 	    ("softdep_setup_allocdirect: newblk already initialized"));
5304 	/*
5305 	 * Convert the newblk to an allocdirect.
5306 	 */
5307 	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5308 	adp = (struct allocdirect *)newblk;
5309 	newblk->nb_freefrag = freefrag;
5310 	adp->ad_offset = off;
5311 	adp->ad_oldblkno = oldblkno;
5312 	adp->ad_newsize = newsize;
5313 	adp->ad_oldsize = oldsize;
5314 
5315 	/*
5316 	 * Finish initializing the journal.
5317 	 */
5318 	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5319 		jnewblk->jn_ino = ip->i_number;
5320 		jnewblk->jn_lbn = lbn;
5321 		add_to_journal(&jnewblk->jn_list);
5322 	}
5323 	if (freefrag && freefrag->ff_jdep != NULL &&
5324 	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5325 		add_to_journal(freefrag->ff_jdep);
5326 	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5327 	adp->ad_inodedep = inodedep;
5328 
5329 	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5330 	/*
5331 	 * The list of allocdirects must be kept in sorted and ascending
5332 	 * order so that the rollback routines can quickly determine the
5333 	 * first uncommitted block (the size of the file stored on disk
5334 	 * ends at the end of the lowest committed fragment, or if there
5335 	 * are no fragments, at the end of the highest committed block).
5336 	 * Since files generally grow, the typical case is that the new
5337 	 * block is to be added at the end of the list. We speed this
5338 	 * special case by checking against the last allocdirect in the
5339 	 * list before laboriously traversing the list looking for the
5340 	 * insertion point.
5341 	 */
5342 	adphead = &inodedep->id_newinoupdt;
5343 	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5344 	if (oldadp == NULL || oldadp->ad_offset <= off) {
5345 		/* insert at end of list */
5346 		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5347 		if (oldadp != NULL && oldadp->ad_offset == off)
5348 			allocdirect_merge(adphead, adp, oldadp);
5349 		FREE_LOCK(ip->i_ump);
5350 		return;
5351 	}
5352 	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5353 		if (oldadp->ad_offset >= off)
5354 			break;
5355 	}
5356 	if (oldadp == NULL)
5357 		panic("softdep_setup_allocdirect: lost entry");
5358 	/* insert in middle of list */
5359 	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5360 	if (oldadp->ad_offset == off)
5361 		allocdirect_merge(adphead, adp, oldadp);
5362 
5363 	FREE_LOCK(ip->i_ump);
5364 }
5365 
5366 /*
5367  * Merge a newer and older journal record to be stored either in a
5368  * newblock or freefrag.  This handles aggregating journal records for
5369  * fragment allocation into a second record as well as replacing a
5370  * journal free with an aborted journal allocation.  A segment for the
5371  * oldest record will be placed on wkhd if it has been written.  If not
5372  * the segment for the newer record will suffice.
5373  */
5374 static struct worklist *
5375 jnewblk_merge(new, old, wkhd)
5376 	struct worklist *new;
5377 	struct worklist *old;
5378 	struct workhead *wkhd;
5379 {
5380 	struct jnewblk *njnewblk;
5381 	struct jnewblk *jnewblk;
5382 
5383 	/* Handle NULLs to simplify callers. */
5384 	if (new == NULL)
5385 		return (old);
5386 	if (old == NULL)
5387 		return (new);
5388 	/* Replace a jfreefrag with a jnewblk. */
5389 	if (new->wk_type == D_JFREEFRAG) {
5390 		if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5391 			panic("jnewblk_merge: blkno mismatch: %p, %p",
5392 			    old, new);
5393 		cancel_jfreefrag(WK_JFREEFRAG(new));
5394 		return (old);
5395 	}
5396 	if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5397 		panic("jnewblk_merge: Bad type: old %d new %d\n",
5398 		    old->wk_type, new->wk_type);
5399 	/*
5400 	 * Handle merging of two jnewblk records that describe
5401 	 * different sets of fragments in the same block.
5402 	 */
5403 	jnewblk = WK_JNEWBLK(old);
5404 	njnewblk = WK_JNEWBLK(new);
5405 	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5406 		panic("jnewblk_merge: Merging disparate blocks.");
5407 	/*
5408 	 * The record may be rolled back in the cg.
5409 	 */
5410 	if (jnewblk->jn_state & UNDONE) {
5411 		jnewblk->jn_state &= ~UNDONE;
5412 		njnewblk->jn_state |= UNDONE;
5413 		njnewblk->jn_state &= ~ATTACHED;
5414 	}
5415 	/*
5416 	 * We modify the newer addref and free the older so that if neither
5417 	 * has been written the most up-to-date copy will be on disk.  If
5418 	 * both have been written but rolled back we only temporarily need
5419 	 * one of them to fix the bits when the cg write completes.
5420 	 */
5421 	jnewblk->jn_state |= ATTACHED | COMPLETE;
5422 	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5423 	cancel_jnewblk(jnewblk, wkhd);
5424 	WORKLIST_REMOVE(&jnewblk->jn_list);
5425 	free_jnewblk(jnewblk);
5426 	return (new);
5427 }
5428 
5429 /*
5430  * Replace an old allocdirect dependency with a newer one.
5431  * This routine must be called with splbio interrupts blocked.
5432  */
5433 static void
5434 allocdirect_merge(adphead, newadp, oldadp)
5435 	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
5436 	struct allocdirect *newadp;	/* allocdirect being added */
5437 	struct allocdirect *oldadp;	/* existing allocdirect being checked */
5438 {
5439 	struct worklist *wk;
5440 	struct freefrag *freefrag;
5441 
5442 	freefrag = NULL;
5443 	LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
5444 	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5445 	    newadp->ad_oldsize != oldadp->ad_newsize ||
5446 	    newadp->ad_offset >= NDADDR)
5447 		panic("%s %jd != new %jd || old size %ld != new %ld",
5448 		    "allocdirect_merge: old blkno",
5449 		    (intmax_t)newadp->ad_oldblkno,
5450 		    (intmax_t)oldadp->ad_newblkno,
5451 		    newadp->ad_oldsize, oldadp->ad_newsize);
5452 	newadp->ad_oldblkno = oldadp->ad_oldblkno;
5453 	newadp->ad_oldsize = oldadp->ad_oldsize;
5454 	/*
5455 	 * If the old dependency had a fragment to free or had never
5456 	 * previously had a block allocated, then the new dependency
5457 	 * can immediately post its freefrag and adopt the old freefrag.
5458 	 * This action is done by swapping the freefrag dependencies.
5459 	 * The new dependency gains the old one's freefrag, and the
5460 	 * old one gets the new one and then immediately puts it on
5461 	 * the worklist when it is freed by free_newblk. It is
5462 	 * not possible to do this swap when the old dependency had a
5463 	 * non-zero size but no previous fragment to free. This condition
5464 	 * arises when the new block is an extension of the old block.
5465 	 * Here, the first part of the fragment allocated to the new
5466 	 * dependency is part of the block currently claimed on disk by
5467 	 * the old dependency, so cannot legitimately be freed until the
5468 	 * conditions for the new dependency are fulfilled.
5469 	 */
5470 	freefrag = newadp->ad_freefrag;
5471 	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5472 		newadp->ad_freefrag = oldadp->ad_freefrag;
5473 		oldadp->ad_freefrag = freefrag;
5474 	}
5475 	/*
5476 	 * If we are tracking a new directory-block allocation,
5477 	 * move it from the old allocdirect to the new allocdirect.
5478 	 */
5479 	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5480 		WORKLIST_REMOVE(wk);
5481 		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5482 			panic("allocdirect_merge: extra newdirblk");
5483 		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5484 	}
5485 	TAILQ_REMOVE(adphead, oldadp, ad_next);
5486 	/*
5487 	 * We need to move any journal dependencies over to the freefrag
5488 	 * that releases this block if it exists.  Otherwise we are
5489 	 * extending an existing block and we'll wait until that is
5490 	 * complete to release the journal space and extend the
5491 	 * new journal to cover this old space as well.
5492 	 */
5493 	if (freefrag == NULL) {
5494 		if (oldadp->ad_newblkno != newadp->ad_newblkno)
5495 			panic("allocdirect_merge: %jd != %jd",
5496 			    oldadp->ad_newblkno, newadp->ad_newblkno);
5497 		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5498 		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5499 		    &oldadp->ad_block.nb_jnewblk->jn_list,
5500 		    &newadp->ad_block.nb_jwork);
5501 		oldadp->ad_block.nb_jnewblk = NULL;
5502 		cancel_newblk(&oldadp->ad_block, NULL,
5503 		    &newadp->ad_block.nb_jwork);
5504 	} else {
5505 		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5506 		    &freefrag->ff_list, &freefrag->ff_jwork);
5507 		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5508 		    &freefrag->ff_jwork);
5509 	}
5510 	free_newblk(&oldadp->ad_block);
5511 }
5512 
5513 /*
5514  * Allocate a jfreefrag structure to journal a single block free.
5515  */
5516 static struct jfreefrag *
5517 newjfreefrag(freefrag, ip, blkno, size, lbn)
5518 	struct freefrag *freefrag;
5519 	struct inode *ip;
5520 	ufs2_daddr_t blkno;
5521 	long size;
5522 	ufs_lbn_t lbn;
5523 {
5524 	struct jfreefrag *jfreefrag;
5525 	struct fs *fs;
5526 
5527 	fs = ip->i_fs;
5528 	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5529 	    M_SOFTDEP_FLAGS);
5530 	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
5531 	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5532 	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5533 	jfreefrag->fr_ino = ip->i_number;
5534 	jfreefrag->fr_lbn = lbn;
5535 	jfreefrag->fr_blkno = blkno;
5536 	jfreefrag->fr_frags = numfrags(fs, size);
5537 	jfreefrag->fr_freefrag = freefrag;
5538 
5539 	return (jfreefrag);
5540 }
5541 
5542 /*
5543  * Allocate a new freefrag structure.
5544  */
5545 static struct freefrag *
5546 newfreefrag(ip, blkno, size, lbn)
5547 	struct inode *ip;
5548 	ufs2_daddr_t blkno;
5549 	long size;
5550 	ufs_lbn_t lbn;
5551 {
5552 	struct freefrag *freefrag;
5553 	struct fs *fs;
5554 
5555 	CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5556 	    ip->i_number, blkno, size, lbn);
5557 	fs = ip->i_fs;
5558 	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5559 		panic("newfreefrag: frag size");
5560 	freefrag = malloc(sizeof(struct freefrag),
5561 	    M_FREEFRAG, M_SOFTDEP_FLAGS);
5562 	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
5563 	freefrag->ff_state = ATTACHED;
5564 	LIST_INIT(&freefrag->ff_jwork);
5565 	freefrag->ff_inum = ip->i_number;
5566 	freefrag->ff_vtype = ITOV(ip)->v_type;
5567 	freefrag->ff_blkno = blkno;
5568 	freefrag->ff_fragsize = size;
5569 
5570 	if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) {
5571 		freefrag->ff_jdep = (struct worklist *)
5572 		    newjfreefrag(freefrag, ip, blkno, size, lbn);
5573 	} else {
5574 		freefrag->ff_state |= DEPCOMPLETE;
5575 		freefrag->ff_jdep = NULL;
5576 	}
5577 
5578 	return (freefrag);
5579 }
5580 
5581 /*
5582  * This workitem de-allocates fragments that were replaced during
5583  * file block allocation.
5584  */
5585 static void
5586 handle_workitem_freefrag(freefrag)
5587 	struct freefrag *freefrag;
5588 {
5589 	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5590 	struct workhead wkhd;
5591 
5592 	CTR3(KTR_SUJ,
5593 	    "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5594 	    freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5595 	/*
5596 	 * It would be illegal to add new completion items to the
5597 	 * freefrag after it was schedule to be done so it must be
5598 	 * safe to modify the list head here.
5599 	 */
5600 	LIST_INIT(&wkhd);
5601 	ACQUIRE_LOCK(ump);
5602 	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5603 	/*
5604 	 * If the journal has not been written we must cancel it here.
5605 	 */
5606 	if (freefrag->ff_jdep) {
5607 		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5608 			panic("handle_workitem_freefrag: Unexpected type %d\n",
5609 			    freefrag->ff_jdep->wk_type);
5610 		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5611 	}
5612 	FREE_LOCK(ump);
5613 	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5614 	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
5615 	ACQUIRE_LOCK(ump);
5616 	WORKITEM_FREE(freefrag, D_FREEFRAG);
5617 	FREE_LOCK(ump);
5618 }
5619 
5620 /*
5621  * Set up a dependency structure for an external attributes data block.
5622  * This routine follows much of the structure of softdep_setup_allocdirect.
5623  * See the description of softdep_setup_allocdirect above for details.
5624  */
5625 void
5626 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5627 	struct inode *ip;
5628 	ufs_lbn_t off;
5629 	ufs2_daddr_t newblkno;
5630 	ufs2_daddr_t oldblkno;
5631 	long newsize;
5632 	long oldsize;
5633 	struct buf *bp;
5634 {
5635 	struct allocdirect *adp, *oldadp;
5636 	struct allocdirectlst *adphead;
5637 	struct freefrag *freefrag;
5638 	struct inodedep *inodedep;
5639 	struct jnewblk *jnewblk;
5640 	struct newblk *newblk;
5641 	struct mount *mp;
5642 	ufs_lbn_t lbn;
5643 
5644 	mp = UFSTOVFS(ip->i_ump);
5645 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5646 	    ("softdep_setup_allocext called on non-softdep filesystem"));
5647 	KASSERT(off < NXADDR, ("softdep_setup_allocext: lbn %lld > NXADDR",
5648 		    (long long)off));
5649 
5650 	lbn = bp->b_lblkno;
5651 	if (oldblkno && oldblkno != newblkno)
5652 		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5653 	else
5654 		freefrag = NULL;
5655 
5656 	ACQUIRE_LOCK(ip->i_ump);
5657 	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5658 		panic("softdep_setup_allocext: lost block");
5659 	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5660 	    ("softdep_setup_allocext: newblk already initialized"));
5661 	/*
5662 	 * Convert the newblk to an allocdirect.
5663 	 */
5664 	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5665 	adp = (struct allocdirect *)newblk;
5666 	newblk->nb_freefrag = freefrag;
5667 	adp->ad_offset = off;
5668 	adp->ad_oldblkno = oldblkno;
5669 	adp->ad_newsize = newsize;
5670 	adp->ad_oldsize = oldsize;
5671 	adp->ad_state |=  EXTDATA;
5672 
5673 	/*
5674 	 * Finish initializing the journal.
5675 	 */
5676 	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5677 		jnewblk->jn_ino = ip->i_number;
5678 		jnewblk->jn_lbn = lbn;
5679 		add_to_journal(&jnewblk->jn_list);
5680 	}
5681 	if (freefrag && freefrag->ff_jdep != NULL &&
5682 	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5683 		add_to_journal(freefrag->ff_jdep);
5684 	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5685 	adp->ad_inodedep = inodedep;
5686 
5687 	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5688 	/*
5689 	 * The list of allocdirects must be kept in sorted and ascending
5690 	 * order so that the rollback routines can quickly determine the
5691 	 * first uncommitted block (the size of the file stored on disk
5692 	 * ends at the end of the lowest committed fragment, or if there
5693 	 * are no fragments, at the end of the highest committed block).
5694 	 * Since files generally grow, the typical case is that the new
5695 	 * block is to be added at the end of the list. We speed this
5696 	 * special case by checking against the last allocdirect in the
5697 	 * list before laboriously traversing the list looking for the
5698 	 * insertion point.
5699 	 */
5700 	adphead = &inodedep->id_newextupdt;
5701 	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5702 	if (oldadp == NULL || oldadp->ad_offset <= off) {
5703 		/* insert at end of list */
5704 		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5705 		if (oldadp != NULL && oldadp->ad_offset == off)
5706 			allocdirect_merge(adphead, adp, oldadp);
5707 		FREE_LOCK(ip->i_ump);
5708 		return;
5709 	}
5710 	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5711 		if (oldadp->ad_offset >= off)
5712 			break;
5713 	}
5714 	if (oldadp == NULL)
5715 		panic("softdep_setup_allocext: lost entry");
5716 	/* insert in middle of list */
5717 	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5718 	if (oldadp->ad_offset == off)
5719 		allocdirect_merge(adphead, adp, oldadp);
5720 	FREE_LOCK(ip->i_ump);
5721 }
5722 
5723 /*
5724  * Indirect block allocation dependencies.
5725  *
5726  * The same dependencies that exist for a direct block also exist when
5727  * a new block is allocated and pointed to by an entry in a block of
5728  * indirect pointers. The undo/redo states described above are also
5729  * used here. Because an indirect block contains many pointers that
5730  * may have dependencies, a second copy of the entire in-memory indirect
5731  * block is kept. The buffer cache copy is always completely up-to-date.
5732  * The second copy, which is used only as a source for disk writes,
5733  * contains only the safe pointers (i.e., those that have no remaining
5734  * update dependencies). The second copy is freed when all pointers
5735  * are safe. The cache is not allowed to replace indirect blocks with
5736  * pending update dependencies. If a buffer containing an indirect
5737  * block with dependencies is written, these routines will mark it
5738  * dirty again. It can only be successfully written once all the
5739  * dependencies are removed. The ffs_fsync routine in conjunction with
5740  * softdep_sync_metadata work together to get all the dependencies
5741  * removed so that a file can be successfully written to disk. Three
5742  * procedures are used when setting up indirect block pointer
5743  * dependencies. The division is necessary because of the organization
5744  * of the "balloc" routine and because of the distinction between file
5745  * pages and file metadata blocks.
5746  */
5747 
5748 /*
5749  * Allocate a new allocindir structure.
5750  */
5751 static struct allocindir *
5752 newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
5753 	struct inode *ip;	/* inode for file being extended */
5754 	int ptrno;		/* offset of pointer in indirect block */
5755 	ufs2_daddr_t newblkno;	/* disk block number being added */
5756 	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5757 	ufs_lbn_t lbn;
5758 {
5759 	struct newblk *newblk;
5760 	struct allocindir *aip;
5761 	struct freefrag *freefrag;
5762 	struct jnewblk *jnewblk;
5763 
5764 	if (oldblkno)
5765 		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
5766 	else
5767 		freefrag = NULL;
5768 	ACQUIRE_LOCK(ip->i_ump);
5769 	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
5770 		panic("new_allocindir: lost block");
5771 	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5772 	    ("newallocindir: newblk already initialized"));
5773 	WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
5774 	newblk->nb_freefrag = freefrag;
5775 	aip = (struct allocindir *)newblk;
5776 	aip->ai_offset = ptrno;
5777 	aip->ai_oldblkno = oldblkno;
5778 	aip->ai_lbn = lbn;
5779 	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5780 		jnewblk->jn_ino = ip->i_number;
5781 		jnewblk->jn_lbn = lbn;
5782 		add_to_journal(&jnewblk->jn_list);
5783 	}
5784 	if (freefrag && freefrag->ff_jdep != NULL &&
5785 	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5786 		add_to_journal(freefrag->ff_jdep);
5787 	return (aip);
5788 }
5789 
5790 /*
5791  * Called just before setting an indirect block pointer
5792  * to a newly allocated file page.
5793  */
5794 void
5795 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
5796 	struct inode *ip;	/* inode for file being extended */
5797 	ufs_lbn_t lbn;		/* allocated block number within file */
5798 	struct buf *bp;		/* buffer with indirect blk referencing page */
5799 	int ptrno;		/* offset of pointer in indirect block */
5800 	ufs2_daddr_t newblkno;	/* disk block number being added */
5801 	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5802 	struct buf *nbp;	/* buffer holding allocated page */
5803 {
5804 	struct inodedep *inodedep;
5805 	struct freefrag *freefrag;
5806 	struct allocindir *aip;
5807 	struct pagedep *pagedep;
5808 	struct mount *mp;
5809 	int dflags;
5810 
5811 	mp = UFSTOVFS(ip->i_ump);
5812 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5813 	    ("softdep_setup_allocindir_page called on non-softdep filesystem"));
5814 	KASSERT(lbn == nbp->b_lblkno,
5815 	    ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5816 	    lbn, bp->b_lblkno));
5817 	CTR4(KTR_SUJ,
5818 	    "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
5819 	    "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
5820 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
5821 	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
5822 	dflags = DEPALLOC;
5823 	if (IS_SNAPSHOT(ip))
5824 		dflags |= NODELAY;
5825 	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
5826 	/*
5827 	 * If we are allocating a directory page, then we must
5828 	 * allocate an associated pagedep to track additions and
5829 	 * deletions.
5830 	 */
5831 	if ((ip->i_mode & IFMT) == IFDIR)
5832 		pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
5833 	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5834 	freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5835 	FREE_LOCK(ip->i_ump);
5836 	if (freefrag)
5837 		handle_workitem_freefrag(freefrag);
5838 }
5839 
5840 /*
5841  * Called just before setting an indirect block pointer to a
5842  * newly allocated indirect block.
5843  */
5844 void
5845 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
5846 	struct buf *nbp;	/* newly allocated indirect block */
5847 	struct inode *ip;	/* inode for file being extended */
5848 	struct buf *bp;		/* indirect block referencing allocated block */
5849 	int ptrno;		/* offset of pointer in indirect block */
5850 	ufs2_daddr_t newblkno;	/* disk block number being added */
5851 {
5852 	struct inodedep *inodedep;
5853 	struct allocindir *aip;
5854 	ufs_lbn_t lbn;
5855 	int dflags;
5856 
5857 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
5858 	    ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
5859 	CTR3(KTR_SUJ,
5860 	    "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
5861 	    ip->i_number, newblkno, ptrno);
5862 	lbn = nbp->b_lblkno;
5863 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
5864 	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
5865 	dflags = DEPALLOC;
5866 	if (IS_SNAPSHOT(ip))
5867 		dflags |= NODELAY;
5868 	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
5869 	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5870 	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
5871 		panic("softdep_setup_allocindir_meta: Block already existed");
5872 	FREE_LOCK(ip->i_ump);
5873 }
5874 
5875 static void
5876 indirdep_complete(indirdep)
5877 	struct indirdep *indirdep;
5878 {
5879 	struct allocindir *aip;
5880 
5881 	LIST_REMOVE(indirdep, ir_next);
5882 	indirdep->ir_state |= DEPCOMPLETE;
5883 
5884 	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5885 		LIST_REMOVE(aip, ai_next);
5886 		free_newblk(&aip->ai_block);
5887 	}
5888 	/*
5889 	 * If this indirdep is not attached to a buf it was simply waiting
5890 	 * on completion to clear completehd.  free_indirdep() asserts
5891 	 * that nothing is dangling.
5892 	 */
5893 	if ((indirdep->ir_state & ONWORKLIST) == 0)
5894 		free_indirdep(indirdep);
5895 }
5896 
5897 static struct indirdep *
5898 indirdep_lookup(mp, ip, bp)
5899 	struct mount *mp;
5900 	struct inode *ip;
5901 	struct buf *bp;
5902 {
5903 	struct indirdep *indirdep, *newindirdep;
5904 	struct newblk *newblk;
5905 	struct ufsmount *ump;
5906 	struct worklist *wk;
5907 	struct fs *fs;
5908 	ufs2_daddr_t blkno;
5909 
5910 	ump = VFSTOUFS(mp);
5911 	LOCK_OWNED(ump);
5912 	indirdep = NULL;
5913 	newindirdep = NULL;
5914 	fs = ip->i_fs;
5915 	for (;;) {
5916 		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5917 			if (wk->wk_type != D_INDIRDEP)
5918 				continue;
5919 			indirdep = WK_INDIRDEP(wk);
5920 			break;
5921 		}
5922 		/* Found on the buffer worklist, no new structure to free. */
5923 		if (indirdep != NULL && newindirdep == NULL)
5924 			return (indirdep);
5925 		if (indirdep != NULL && newindirdep != NULL)
5926 			panic("indirdep_lookup: simultaneous create");
5927 		/* None found on the buffer and a new structure is ready. */
5928 		if (indirdep == NULL && newindirdep != NULL)
5929 			break;
5930 		/* None found and no new structure available. */
5931 		FREE_LOCK(ump);
5932 		newindirdep = malloc(sizeof(struct indirdep),
5933 		    M_INDIRDEP, M_SOFTDEP_FLAGS);
5934 		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5935 		newindirdep->ir_state = ATTACHED;
5936 		if (ip->i_ump->um_fstype == UFS1)
5937 			newindirdep->ir_state |= UFS1FMT;
5938 		TAILQ_INIT(&newindirdep->ir_trunc);
5939 		newindirdep->ir_saveddata = NULL;
5940 		LIST_INIT(&newindirdep->ir_deplisthd);
5941 		LIST_INIT(&newindirdep->ir_donehd);
5942 		LIST_INIT(&newindirdep->ir_writehd);
5943 		LIST_INIT(&newindirdep->ir_completehd);
5944 		if (bp->b_blkno == bp->b_lblkno) {
5945 			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5946 			    NULL, NULL);
5947 			bp->b_blkno = blkno;
5948 		}
5949 		newindirdep->ir_freeblks = NULL;
5950 		newindirdep->ir_savebp =
5951 		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5952 		newindirdep->ir_bp = bp;
5953 		BUF_KERNPROC(newindirdep->ir_savebp);
5954 		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5955 		ACQUIRE_LOCK(ump);
5956 	}
5957 	indirdep = newindirdep;
5958 	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
5959 	/*
5960 	 * If the block is not yet allocated we don't set DEPCOMPLETE so
5961 	 * that we don't free dependencies until the pointers are valid.
5962 	 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
5963 	 * than using the hash.
5964 	 */
5965 	if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
5966 		LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
5967 	else
5968 		indirdep->ir_state |= DEPCOMPLETE;
5969 	return (indirdep);
5970 }
5971 
5972 /*
5973  * Called to finish the allocation of the "aip" allocated
5974  * by one of the two routines above.
5975  */
5976 static struct freefrag *
5977 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
5978 	struct buf *bp;		/* in-memory copy of the indirect block */
5979 	struct inode *ip;	/* inode for file being extended */
5980 	struct inodedep *inodedep; /* Inodedep for ip */
5981 	struct allocindir *aip;	/* allocindir allocated by the above routines */
5982 	ufs_lbn_t lbn;		/* Logical block number for this block. */
5983 {
5984 	struct fs *fs;
5985 	struct indirdep *indirdep;
5986 	struct allocindir *oldaip;
5987 	struct freefrag *freefrag;
5988 	struct mount *mp;
5989 
5990 	LOCK_OWNED(ip->i_ump);
5991 	mp = UFSTOVFS(ip->i_ump);
5992 	fs = ip->i_fs;
5993 	if (bp->b_lblkno >= 0)
5994 		panic("setup_allocindir_phase2: not indir blk");
5995 	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
5996 	    ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
5997 	indirdep = indirdep_lookup(mp, ip, bp);
5998 	KASSERT(indirdep->ir_savebp != NULL,
5999 	    ("setup_allocindir_phase2 NULL ir_savebp"));
6000 	aip->ai_indirdep = indirdep;
6001 	/*
6002 	 * Check for an unwritten dependency for this indirect offset.  If
6003 	 * there is, merge the old dependency into the new one.  This happens
6004 	 * as a result of reallocblk only.
6005 	 */
6006 	freefrag = NULL;
6007 	if (aip->ai_oldblkno != 0) {
6008 		LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
6009 			if (oldaip->ai_offset == aip->ai_offset) {
6010 				freefrag = allocindir_merge(aip, oldaip);
6011 				goto done;
6012 			}
6013 		}
6014 		LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
6015 			if (oldaip->ai_offset == aip->ai_offset) {
6016 				freefrag = allocindir_merge(aip, oldaip);
6017 				goto done;
6018 			}
6019 		}
6020 	}
6021 done:
6022 	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
6023 	return (freefrag);
6024 }
6025 
6026 /*
6027  * Merge two allocindirs which refer to the same block.  Move newblock
6028  * dependencies and setup the freefrags appropriately.
6029  */
6030 static struct freefrag *
6031 allocindir_merge(aip, oldaip)
6032 	struct allocindir *aip;
6033 	struct allocindir *oldaip;
6034 {
6035 	struct freefrag *freefrag;
6036 	struct worklist *wk;
6037 
6038 	if (oldaip->ai_newblkno != aip->ai_oldblkno)
6039 		panic("allocindir_merge: blkno");
6040 	aip->ai_oldblkno = oldaip->ai_oldblkno;
6041 	freefrag = aip->ai_freefrag;
6042 	aip->ai_freefrag = oldaip->ai_freefrag;
6043 	oldaip->ai_freefrag = NULL;
6044 	KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
6045 	/*
6046 	 * If we are tracking a new directory-block allocation,
6047 	 * move it from the old allocindir to the new allocindir.
6048 	 */
6049 	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
6050 		WORKLIST_REMOVE(wk);
6051 		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
6052 			panic("allocindir_merge: extra newdirblk");
6053 		WORKLIST_INSERT(&aip->ai_newdirblk, wk);
6054 	}
6055 	/*
6056 	 * We can skip journaling for this freefrag and just complete
6057 	 * any pending journal work for the allocindir that is being
6058 	 * removed after the freefrag completes.
6059 	 */
6060 	if (freefrag->ff_jdep)
6061 		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
6062 	LIST_REMOVE(oldaip, ai_next);
6063 	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
6064 	    &freefrag->ff_list, &freefrag->ff_jwork);
6065 	free_newblk(&oldaip->ai_block);
6066 
6067 	return (freefrag);
6068 }
6069 
6070 static inline void
6071 setup_freedirect(freeblks, ip, i, needj)
6072 	struct freeblks *freeblks;
6073 	struct inode *ip;
6074 	int i;
6075 	int needj;
6076 {
6077 	ufs2_daddr_t blkno;
6078 	int frags;
6079 
6080 	blkno = DIP(ip, i_db[i]);
6081 	if (blkno == 0)
6082 		return;
6083 	DIP_SET(ip, i_db[i], 0);
6084 	frags = sblksize(ip->i_fs, ip->i_size, i);
6085 	frags = numfrags(ip->i_fs, frags);
6086 	newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
6087 }
6088 
6089 static inline void
6090 setup_freeext(freeblks, ip, i, needj)
6091 	struct freeblks *freeblks;
6092 	struct inode *ip;
6093 	int i;
6094 	int needj;
6095 {
6096 	ufs2_daddr_t blkno;
6097 	int frags;
6098 
6099 	blkno = ip->i_din2->di_extb[i];
6100 	if (blkno == 0)
6101 		return;
6102 	ip->i_din2->di_extb[i] = 0;
6103 	frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
6104 	frags = numfrags(ip->i_fs, frags);
6105 	newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
6106 }
6107 
6108 static inline void
6109 setup_freeindir(freeblks, ip, i, lbn, needj)
6110 	struct freeblks *freeblks;
6111 	struct inode *ip;
6112 	int i;
6113 	ufs_lbn_t lbn;
6114 	int needj;
6115 {
6116 	ufs2_daddr_t blkno;
6117 
6118 	blkno = DIP(ip, i_ib[i]);
6119 	if (blkno == 0)
6120 		return;
6121 	DIP_SET(ip, i_ib[i], 0);
6122 	newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
6123 	    0, needj);
6124 }
6125 
6126 static inline struct freeblks *
6127 newfreeblks(mp, ip)
6128 	struct mount *mp;
6129 	struct inode *ip;
6130 {
6131 	struct freeblks *freeblks;
6132 
6133 	freeblks = malloc(sizeof(struct freeblks),
6134 		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
6135 	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
6136 	LIST_INIT(&freeblks->fb_jblkdephd);
6137 	LIST_INIT(&freeblks->fb_jwork);
6138 	freeblks->fb_ref = 0;
6139 	freeblks->fb_cgwait = 0;
6140 	freeblks->fb_state = ATTACHED;
6141 	freeblks->fb_uid = ip->i_uid;
6142 	freeblks->fb_inum = ip->i_number;
6143 	freeblks->fb_vtype = ITOV(ip)->v_type;
6144 	freeblks->fb_modrev = DIP(ip, i_modrev);
6145 	freeblks->fb_devvp = ip->i_devvp;
6146 	freeblks->fb_chkcnt = 0;
6147 	freeblks->fb_len = 0;
6148 
6149 	return (freeblks);
6150 }
6151 
6152 static void
6153 trunc_indirdep(indirdep, freeblks, bp, off)
6154 	struct indirdep *indirdep;
6155 	struct freeblks *freeblks;
6156 	struct buf *bp;
6157 	int off;
6158 {
6159 	struct allocindir *aip, *aipn;
6160 
6161 	/*
6162 	 * The first set of allocindirs won't be in savedbp.
6163 	 */
6164 	LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
6165 		if (aip->ai_offset > off)
6166 			cancel_allocindir(aip, bp, freeblks, 1);
6167 	LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
6168 		if (aip->ai_offset > off)
6169 			cancel_allocindir(aip, bp, freeblks, 1);
6170 	/*
6171 	 * These will exist in savedbp.
6172 	 */
6173 	LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
6174 		if (aip->ai_offset > off)
6175 			cancel_allocindir(aip, NULL, freeblks, 0);
6176 	LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
6177 		if (aip->ai_offset > off)
6178 			cancel_allocindir(aip, NULL, freeblks, 0);
6179 }
6180 
6181 /*
6182  * Follow the chain of indirects down to lastlbn creating a freework
6183  * structure for each.  This will be used to start indir_trunc() at
6184  * the right offset and create the journal records for the parrtial
6185  * truncation.  A second step will handle the truncated dependencies.
6186  */
6187 static int
6188 setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
6189 	struct freeblks *freeblks;
6190 	struct inode *ip;
6191 	ufs_lbn_t lbn;
6192 	ufs_lbn_t lastlbn;
6193 	ufs2_daddr_t blkno;
6194 {
6195 	struct indirdep *indirdep;
6196 	struct indirdep *indirn;
6197 	struct freework *freework;
6198 	struct newblk *newblk;
6199 	struct mount *mp;
6200 	struct buf *bp;
6201 	uint8_t *start;
6202 	uint8_t *end;
6203 	ufs_lbn_t lbnadd;
6204 	int level;
6205 	int error;
6206 	int off;
6207 
6208 
6209 	freework = NULL;
6210 	if (blkno == 0)
6211 		return (0);
6212 	mp = freeblks->fb_list.wk_mp;
6213 	bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
6214 	if ((bp->b_flags & B_CACHE) == 0) {
6215 		bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
6216 		bp->b_iocmd = BIO_READ;
6217 		bp->b_flags &= ~B_INVAL;
6218 		bp->b_ioflags &= ~BIO_ERROR;
6219 		vfs_busy_pages(bp, 0);
6220 		bp->b_iooffset = dbtob(bp->b_blkno);
6221 		bstrategy(bp);
6222 		curthread->td_ru.ru_inblock++;
6223 		error = bufwait(bp);
6224 		if (error) {
6225 			brelse(bp);
6226 			return (error);
6227 		}
6228 	}
6229 	level = lbn_level(lbn);
6230 	lbnadd = lbn_offset(ip->i_fs, level);
6231 	/*
6232 	 * Compute the offset of the last block we want to keep.  Store
6233 	 * in the freework the first block we want to completely free.
6234 	 */
6235 	off = (lastlbn - -(lbn + level)) / lbnadd;
6236 	if (off + 1 == NINDIR(ip->i_fs))
6237 		goto nowork;
6238 	freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
6239 	    0);
6240 	/*
6241 	 * Link the freework into the indirdep.  This will prevent any new
6242 	 * allocations from proceeding until we are finished with the
6243 	 * truncate and the block is written.
6244 	 */
6245 	ACQUIRE_LOCK(ip->i_ump);
6246 	indirdep = indirdep_lookup(mp, ip, bp);
6247 	if (indirdep->ir_freeblks)
6248 		panic("setup_trunc_indir: indirdep already truncated.");
6249 	TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
6250 	freework->fw_indir = indirdep;
6251 	/*
6252 	 * Cancel any allocindirs that will not make it to disk.
6253 	 * We have to do this for all copies of the indirdep that
6254 	 * live on this newblk.
6255 	 */
6256 	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
6257 		newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk);
6258 		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
6259 			trunc_indirdep(indirn, freeblks, bp, off);
6260 	} else
6261 		trunc_indirdep(indirdep, freeblks, bp, off);
6262 	FREE_LOCK(ip->i_ump);
6263 	/*
6264 	 * Creation is protected by the buf lock. The saveddata is only
6265 	 * needed if a full truncation follows a partial truncation but it
6266 	 * is difficult to allocate in that case so we fetch it anyway.
6267 	 */
6268 	if (indirdep->ir_saveddata == NULL)
6269 		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
6270 		    M_SOFTDEP_FLAGS);
6271 nowork:
6272 	/* Fetch the blkno of the child and the zero start offset. */
6273 	if (ip->i_ump->um_fstype == UFS1) {
6274 		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
6275 		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
6276 	} else {
6277 		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
6278 		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
6279 	}
6280 	if (freework) {
6281 		/* Zero the truncated pointers. */
6282 		end = bp->b_data + bp->b_bcount;
6283 		bzero(start, end - start);
6284 		bdwrite(bp);
6285 	} else
6286 		bqrelse(bp);
6287 	if (level == 0)
6288 		return (0);
6289 	lbn++; /* adjust level */
6290 	lbn -= (off * lbnadd);
6291 	return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
6292 }
6293 
6294 /*
6295  * Complete the partial truncation of an indirect block setup by
6296  * setup_trunc_indir().  This zeros the truncated pointers in the saved
6297  * copy and writes them to disk before the freeblks is allowed to complete.
6298  */
6299 static void
6300 complete_trunc_indir(freework)
6301 	struct freework *freework;
6302 {
6303 	struct freework *fwn;
6304 	struct indirdep *indirdep;
6305 	struct ufsmount *ump;
6306 	struct buf *bp;
6307 	uintptr_t start;
6308 	int count;
6309 
6310 	ump = VFSTOUFS(freework->fw_list.wk_mp);
6311 	LOCK_OWNED(ump);
6312 	indirdep = freework->fw_indir;
6313 	for (;;) {
6314 		bp = indirdep->ir_bp;
6315 		/* See if the block was discarded. */
6316 		if (bp == NULL)
6317 			break;
6318 		/* Inline part of getdirtybuf().  We dont want bremfree. */
6319 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
6320 			break;
6321 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6322 		    LOCK_PTR(ump)) == 0)
6323 			BUF_UNLOCK(bp);
6324 		ACQUIRE_LOCK(ump);
6325 	}
6326 	freework->fw_state |= DEPCOMPLETE;
6327 	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
6328 	/*
6329 	 * Zero the pointers in the saved copy.
6330 	 */
6331 	if (indirdep->ir_state & UFS1FMT)
6332 		start = sizeof(ufs1_daddr_t);
6333 	else
6334 		start = sizeof(ufs2_daddr_t);
6335 	start *= freework->fw_start;
6336 	count = indirdep->ir_savebp->b_bcount - start;
6337 	start += (uintptr_t)indirdep->ir_savebp->b_data;
6338 	bzero((char *)start, count);
6339 	/*
6340 	 * We need to start the next truncation in the list if it has not
6341 	 * been started yet.
6342 	 */
6343 	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
6344 	if (fwn != NULL) {
6345 		if (fwn->fw_freeblks == indirdep->ir_freeblks)
6346 			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
6347 		if ((fwn->fw_state & ONWORKLIST) == 0)
6348 			freework_enqueue(fwn);
6349 	}
6350 	/*
6351 	 * If bp is NULL the block was fully truncated, restore
6352 	 * the saved block list otherwise free it if it is no
6353 	 * longer needed.
6354 	 */
6355 	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
6356 		if (bp == NULL)
6357 			bcopy(indirdep->ir_saveddata,
6358 			    indirdep->ir_savebp->b_data,
6359 			    indirdep->ir_savebp->b_bcount);
6360 		free(indirdep->ir_saveddata, M_INDIRDEP);
6361 		indirdep->ir_saveddata = NULL;
6362 	}
6363 	/*
6364 	 * When bp is NULL there is a full truncation pending.  We
6365 	 * must wait for this full truncation to be journaled before
6366 	 * we can release this freework because the disk pointers will
6367 	 * never be written as zero.
6368 	 */
6369 	if (bp == NULL)  {
6370 		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
6371 			handle_written_freework(freework);
6372 		else
6373 			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
6374 			   &freework->fw_list);
6375 	} else {
6376 		/* Complete when the real copy is written. */
6377 		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
6378 		BUF_UNLOCK(bp);
6379 	}
6380 }
6381 
6382 /*
6383  * Calculate the number of blocks we are going to release where datablocks
6384  * is the current total and length is the new file size.
6385  */
6386 static ufs2_daddr_t
6387 blkcount(fs, datablocks, length)
6388 	struct fs *fs;
6389 	ufs2_daddr_t datablocks;
6390 	off_t length;
6391 {
6392 	off_t totblks, numblks;
6393 
6394 	totblks = 0;
6395 	numblks = howmany(length, fs->fs_bsize);
6396 	if (numblks <= NDADDR) {
6397 		totblks = howmany(length, fs->fs_fsize);
6398 		goto out;
6399 	}
6400         totblks = blkstofrags(fs, numblks);
6401 	numblks -= NDADDR;
6402 	/*
6403 	 * Count all single, then double, then triple indirects required.
6404 	 * Subtracting one indirects worth of blocks for each pass
6405 	 * acknowledges one of each pointed to by the inode.
6406 	 */
6407 	for (;;) {
6408 		totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6409 		numblks -= NINDIR(fs);
6410 		if (numblks <= 0)
6411 			break;
6412 		numblks = howmany(numblks, NINDIR(fs));
6413 	}
6414 out:
6415 	totblks = fsbtodb(fs, totblks);
6416 	/*
6417 	 * Handle sparse files.  We can't reclaim more blocks than the inode
6418 	 * references.  We will correct it later in handle_complete_freeblks()
6419 	 * when we know the real count.
6420 	 */
6421 	if (totblks > datablocks)
6422 		return (0);
6423 	return (datablocks - totblks);
6424 }
6425 
6426 /*
6427  * Handle freeblocks for journaled softupdate filesystems.
6428  *
6429  * Contrary to normal softupdates, we must preserve the block pointers in
6430  * indirects until their subordinates are free.  This is to avoid journaling
6431  * every block that is freed which may consume more space than the journal
6432  * itself.  The recovery program will see the free block journals at the
6433  * base of the truncated area and traverse them to reclaim space.  The
6434  * pointers in the inode may be cleared immediately after the journal
6435  * records are written because each direct and indirect pointer in the
6436  * inode is recorded in a journal.  This permits full truncation to proceed
6437  * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6438  *
6439  * The algorithm is as follows:
6440  * 1) Traverse the in-memory state and create journal entries to release
6441  *    the relevant blocks and full indirect trees.
6442  * 2) Traverse the indirect block chain adding partial truncation freework
6443  *    records to indirects in the path to lastlbn.  The freework will
6444  *    prevent new allocation dependencies from being satisfied in this
6445  *    indirect until the truncation completes.
6446  * 3) Read and lock the inode block, performing an update with the new size
6447  *    and pointers.  This prevents truncated data from becoming valid on
6448  *    disk through step 4.
6449  * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6450  *    eliminate journal work for those records that do not require it.
6451  * 5) Schedule the journal records to be written followed by the inode block.
6452  * 6) Allocate any necessary frags for the end of file.
6453  * 7) Zero any partially truncated blocks.
6454  *
6455  * From this truncation proceeds asynchronously using the freework and
6456  * indir_trunc machinery.  The file will not be extended again into a
6457  * partially truncated indirect block until all work is completed but
6458  * the normal dependency mechanism ensures that it is rolled back/forward
6459  * as appropriate.  Further truncation may occur without delay and is
6460  * serialized in indir_trunc().
6461  */
6462 void
6463 softdep_journal_freeblocks(ip, cred, length, flags)
6464 	struct inode *ip;	/* The inode whose length is to be reduced */
6465 	struct ucred *cred;
6466 	off_t length;		/* The new length for the file */
6467 	int flags;		/* IO_EXT and/or IO_NORMAL */
6468 {
6469 	struct freeblks *freeblks, *fbn;
6470 	struct worklist *wk, *wkn;
6471 	struct inodedep *inodedep;
6472 	struct jblkdep *jblkdep;
6473 	struct allocdirect *adp, *adpn;
6474 	struct ufsmount *ump;
6475 	struct fs *fs;
6476 	struct buf *bp;
6477 	struct vnode *vp;
6478 	struct mount *mp;
6479 	ufs2_daddr_t extblocks, datablocks;
6480 	ufs_lbn_t tmpval, lbn, lastlbn;
6481 	int frags, lastoff, iboff, allocblock, needj, dflags, error, i;
6482 
6483 	fs = ip->i_fs;
6484 	ump = ip->i_ump;
6485 	mp = UFSTOVFS(ump);
6486 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6487 	    ("softdep_journal_freeblocks called on non-softdep filesystem"));
6488 	vp = ITOV(ip);
6489 	needj = 1;
6490 	iboff = -1;
6491 	allocblock = 0;
6492 	extblocks = 0;
6493 	datablocks = 0;
6494 	frags = 0;
6495 	freeblks = newfreeblks(mp, ip);
6496 	ACQUIRE_LOCK(ump);
6497 	/*
6498 	 * If we're truncating a removed file that will never be written
6499 	 * we don't need to journal the block frees.  The canceled journals
6500 	 * for the allocations will suffice.
6501 	 */
6502 	dflags = DEPALLOC;
6503 	if (IS_SNAPSHOT(ip))
6504 		dflags |= NODELAY;
6505 	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6506 	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6507 	    length == 0)
6508 		needj = 0;
6509 	CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
6510 	    ip->i_number, length, needj);
6511 	FREE_LOCK(ump);
6512 	/*
6513 	 * Calculate the lbn that we are truncating to.  This results in -1
6514 	 * if we're truncating the 0 bytes.  So it is the last lbn we want
6515 	 * to keep, not the first lbn we want to truncate.
6516 	 */
6517 	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6518 	lastoff = blkoff(fs, length);
6519 	/*
6520 	 * Compute frags we are keeping in lastlbn.  0 means all.
6521 	 */
6522 	if (lastlbn >= 0 && lastlbn < NDADDR) {
6523 		frags = fragroundup(fs, lastoff);
6524 		/* adp offset of last valid allocdirect. */
6525 		iboff = lastlbn;
6526 	} else if (lastlbn > 0)
6527 		iboff = NDADDR;
6528 	if (fs->fs_magic == FS_UFS2_MAGIC)
6529 		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6530 	/*
6531 	 * Handle normal data blocks and indirects.  This section saves
6532 	 * values used after the inode update to complete frag and indirect
6533 	 * truncation.
6534 	 */
6535 	if ((flags & IO_NORMAL) != 0) {
6536 		/*
6537 		 * Handle truncation of whole direct and indirect blocks.
6538 		 */
6539 		for (i = iboff + 1; i < NDADDR; i++)
6540 			setup_freedirect(freeblks, ip, i, needj);
6541 		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6542 		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6543 			/* Release a whole indirect tree. */
6544 			if (lbn > lastlbn) {
6545 				setup_freeindir(freeblks, ip, i, -lbn -i,
6546 				    needj);
6547 				continue;
6548 			}
6549 			iboff = i + NDADDR;
6550 			/*
6551 			 * Traverse partially truncated indirect tree.
6552 			 */
6553 			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6554 				setup_trunc_indir(freeblks, ip, -lbn - i,
6555 				    lastlbn, DIP(ip, i_ib[i]));
6556 		}
6557 		/*
6558 		 * Handle partial truncation to a frag boundary.
6559 		 */
6560 		if (frags) {
6561 			ufs2_daddr_t blkno;
6562 			long oldfrags;
6563 
6564 			oldfrags = blksize(fs, ip, lastlbn);
6565 			blkno = DIP(ip, i_db[lastlbn]);
6566 			if (blkno && oldfrags != frags) {
6567 				oldfrags -= frags;
6568 				oldfrags = numfrags(ip->i_fs, oldfrags);
6569 				blkno += numfrags(ip->i_fs, frags);
6570 				newfreework(ump, freeblks, NULL, lastlbn,
6571 				    blkno, oldfrags, 0, needj);
6572 				if (needj)
6573 					adjust_newfreework(freeblks,
6574 					    numfrags(ip->i_fs, frags));
6575 			} else if (blkno == 0)
6576 				allocblock = 1;
6577 		}
6578 		/*
6579 		 * Add a journal record for partial truncate if we are
6580 		 * handling indirect blocks.  Non-indirects need no extra
6581 		 * journaling.
6582 		 */
6583 		if (length != 0 && lastlbn >= NDADDR) {
6584 			ip->i_flag |= IN_TRUNCATED;
6585 			newjtrunc(freeblks, length, 0);
6586 		}
6587 		ip->i_size = length;
6588 		DIP_SET(ip, i_size, ip->i_size);
6589 		datablocks = DIP(ip, i_blocks) - extblocks;
6590 		if (length != 0)
6591 			datablocks = blkcount(ip->i_fs, datablocks, length);
6592 		freeblks->fb_len = length;
6593 	}
6594 	if ((flags & IO_EXT) != 0) {
6595 		for (i = 0; i < NXADDR; i++)
6596 			setup_freeext(freeblks, ip, i, needj);
6597 		ip->i_din2->di_extsize = 0;
6598 		datablocks += extblocks;
6599 	}
6600 #ifdef QUOTA
6601 	/* Reference the quotas in case the block count is wrong in the end. */
6602 	quotaref(vp, freeblks->fb_quota);
6603 	(void) chkdq(ip, -datablocks, NOCRED, 0);
6604 #endif
6605 	freeblks->fb_chkcnt = -datablocks;
6606 	UFS_LOCK(ump);
6607 	fs->fs_pendingblocks += datablocks;
6608 	UFS_UNLOCK(ump);
6609 	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6610 	/*
6611 	 * Handle truncation of incomplete alloc direct dependencies.  We
6612 	 * hold the inode block locked to prevent incomplete dependencies
6613 	 * from reaching the disk while we are eliminating those that
6614 	 * have been truncated.  This is a partially inlined ffs_update().
6615 	 */
6616 	ufs_itimes(vp);
6617 	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6618 	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6619 	    (int)fs->fs_bsize, cred, &bp);
6620 	if (error) {
6621 		brelse(bp);
6622 		softdep_error("softdep_journal_freeblocks", error);
6623 		return;
6624 	}
6625 	if (bp->b_bufsize == fs->fs_bsize)
6626 		bp->b_flags |= B_CLUSTEROK;
6627 	softdep_update_inodeblock(ip, bp, 0);
6628 	if (ump->um_fstype == UFS1)
6629 		*((struct ufs1_dinode *)bp->b_data +
6630 		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6631 	else
6632 		*((struct ufs2_dinode *)bp->b_data +
6633 		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6634 	ACQUIRE_LOCK(ump);
6635 	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6636 	if ((inodedep->id_state & IOSTARTED) != 0)
6637 		panic("softdep_setup_freeblocks: inode busy");
6638 	/*
6639 	 * Add the freeblks structure to the list of operations that
6640 	 * must await the zero'ed inode being written to disk. If we
6641 	 * still have a bitmap dependency (needj), then the inode
6642 	 * has never been written to disk, so we can process the
6643 	 * freeblks below once we have deleted the dependencies.
6644 	 */
6645 	if (needj)
6646 		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6647 	else
6648 		freeblks->fb_state |= COMPLETE;
6649 	if ((flags & IO_NORMAL) != 0) {
6650 		TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6651 			if (adp->ad_offset > iboff)
6652 				cancel_allocdirect(&inodedep->id_inoupdt, adp,
6653 				    freeblks);
6654 			/*
6655 			 * Truncate the allocdirect.  We could eliminate
6656 			 * or modify journal records as well.
6657 			 */
6658 			else if (adp->ad_offset == iboff && frags)
6659 				adp->ad_newsize = frags;
6660 		}
6661 	}
6662 	if ((flags & IO_EXT) != 0)
6663 		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6664 			cancel_allocdirect(&inodedep->id_extupdt, adp,
6665 			    freeblks);
6666 	/*
6667 	 * Scan the bufwait list for newblock dependencies that will never
6668 	 * make it to disk.
6669 	 */
6670 	LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
6671 		if (wk->wk_type != D_ALLOCDIRECT)
6672 			continue;
6673 		adp = WK_ALLOCDIRECT(wk);
6674 		if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
6675 		    ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
6676 			cancel_jfreeblk(freeblks, adp->ad_newblkno);
6677 			cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
6678 			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6679 		}
6680 	}
6681 	/*
6682 	 * Add journal work.
6683 	 */
6684 	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6685 		add_to_journal(&jblkdep->jb_list);
6686 	FREE_LOCK(ump);
6687 	bdwrite(bp);
6688 	/*
6689 	 * Truncate dependency structures beyond length.
6690 	 */
6691 	trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6692 	/*
6693 	 * This is only set when we need to allocate a fragment because
6694 	 * none existed at the end of a frag-sized file.  It handles only
6695 	 * allocating a new, zero filled block.
6696 	 */
6697 	if (allocblock) {
6698 		ip->i_size = length - lastoff;
6699 		DIP_SET(ip, i_size, ip->i_size);
6700 		error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6701 		if (error != 0) {
6702 			softdep_error("softdep_journal_freeblks", error);
6703 			return;
6704 		}
6705 		ip->i_size = length;
6706 		DIP_SET(ip, i_size, length);
6707 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
6708 		allocbuf(bp, frags);
6709 		ffs_update(vp, 0);
6710 		bawrite(bp);
6711 	} else if (lastoff != 0 && vp->v_type != VDIR) {
6712 		int size;
6713 
6714 		/*
6715 		 * Zero the end of a truncated frag or block.
6716 		 */
6717 		size = sblksize(fs, length, lastlbn);
6718 		error = bread(vp, lastlbn, size, cred, &bp);
6719 		if (error) {
6720 			softdep_error("softdep_journal_freeblks", error);
6721 			return;
6722 		}
6723 		bzero((char *)bp->b_data + lastoff, size - lastoff);
6724 		bawrite(bp);
6725 
6726 	}
6727 	ACQUIRE_LOCK(ump);
6728 	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6729 	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
6730 	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
6731 	/*
6732 	 * We zero earlier truncations so they don't erroneously
6733 	 * update i_blocks.
6734 	 */
6735 	if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
6736 		TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
6737 			fbn->fb_len = 0;
6738 	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
6739 	    LIST_EMPTY(&freeblks->fb_jblkdephd))
6740 		freeblks->fb_state |= INPROGRESS;
6741 	else
6742 		freeblks = NULL;
6743 	FREE_LOCK(ump);
6744 	if (freeblks)
6745 		handle_workitem_freeblocks(freeblks, 0);
6746 	trunc_pages(ip, length, extblocks, flags);
6747 
6748 }
6749 
6750 /*
6751  * Flush a JOP_SYNC to the journal.
6752  */
6753 void
6754 softdep_journal_fsync(ip)
6755 	struct inode *ip;
6756 {
6757 	struct jfsync *jfsync;
6758 
6759 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
6760 	    ("softdep_journal_fsync called on non-softdep filesystem"));
6761 	if ((ip->i_flag & IN_TRUNCATED) == 0)
6762 		return;
6763 	ip->i_flag &= ~IN_TRUNCATED;
6764 	jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
6765 	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump));
6766 	jfsync->jfs_size = ip->i_size;
6767 	jfsync->jfs_ino = ip->i_number;
6768 	ACQUIRE_LOCK(ip->i_ump);
6769 	add_to_journal(&jfsync->jfs_list);
6770 	jwait(&jfsync->jfs_list, MNT_WAIT);
6771 	FREE_LOCK(ip->i_ump);
6772 }
6773 
6774 /*
6775  * Block de-allocation dependencies.
6776  *
6777  * When blocks are de-allocated, the on-disk pointers must be nullified before
6778  * the blocks are made available for use by other files.  (The true
6779  * requirement is that old pointers must be nullified before new on-disk
6780  * pointers are set.  We chose this slightly more stringent requirement to
6781  * reduce complexity.) Our implementation handles this dependency by updating
6782  * the inode (or indirect block) appropriately but delaying the actual block
6783  * de-allocation (i.e., freemap and free space count manipulation) until
6784  * after the updated versions reach stable storage.  After the disk is
6785  * updated, the blocks can be safely de-allocated whenever it is convenient.
6786  * This implementation handles only the common case of reducing a file's
6787  * length to zero. Other cases are handled by the conventional synchronous
6788  * write approach.
6789  *
6790  * The ffs implementation with which we worked double-checks
6791  * the state of the block pointers and file size as it reduces
6792  * a file's length.  Some of this code is replicated here in our
6793  * soft updates implementation.  The freeblks->fb_chkcnt field is
6794  * used to transfer a part of this information to the procedure
6795  * that eventually de-allocates the blocks.
6796  *
6797  * This routine should be called from the routine that shortens
6798  * a file's length, before the inode's size or block pointers
6799  * are modified. It will save the block pointer information for
6800  * later release and zero the inode so that the calling routine
6801  * can release it.
6802  */
6803 void
6804 softdep_setup_freeblocks(ip, length, flags)
6805 	struct inode *ip;	/* The inode whose length is to be reduced */
6806 	off_t length;		/* The new length for the file */
6807 	int flags;		/* IO_EXT and/or IO_NORMAL */
6808 {
6809 	struct ufs1_dinode *dp1;
6810 	struct ufs2_dinode *dp2;
6811 	struct freeblks *freeblks;
6812 	struct inodedep *inodedep;
6813 	struct allocdirect *adp;
6814 	struct ufsmount *ump;
6815 	struct buf *bp;
6816 	struct fs *fs;
6817 	ufs2_daddr_t extblocks, datablocks;
6818 	struct mount *mp;
6819 	int i, delay, error, dflags;
6820 	ufs_lbn_t tmpval;
6821 	ufs_lbn_t lbn;
6822 
6823 	ump = ip->i_ump;
6824 	mp = UFSTOVFS(ump);
6825 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6826 	    ("softdep_setup_freeblocks called on non-softdep filesystem"));
6827 	CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
6828 	    ip->i_number, length);
6829 	KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
6830 	fs = ip->i_fs;
6831 	freeblks = newfreeblks(mp, ip);
6832 	extblocks = 0;
6833 	datablocks = 0;
6834 	if (fs->fs_magic == FS_UFS2_MAGIC)
6835 		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6836 	if ((flags & IO_NORMAL) != 0) {
6837 		for (i = 0; i < NDADDR; i++)
6838 			setup_freedirect(freeblks, ip, i, 0);
6839 		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6840 		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
6841 			setup_freeindir(freeblks, ip, i, -lbn -i, 0);
6842 		ip->i_size = 0;
6843 		DIP_SET(ip, i_size, 0);
6844 		datablocks = DIP(ip, i_blocks) - extblocks;
6845 	}
6846 	if ((flags & IO_EXT) != 0) {
6847 		for (i = 0; i < NXADDR; i++)
6848 			setup_freeext(freeblks, ip, i, 0);
6849 		ip->i_din2->di_extsize = 0;
6850 		datablocks += extblocks;
6851 	}
6852 #ifdef QUOTA
6853 	/* Reference the quotas in case the block count is wrong in the end. */
6854 	quotaref(ITOV(ip), freeblks->fb_quota);
6855 	(void) chkdq(ip, -datablocks, NOCRED, 0);
6856 #endif
6857 	freeblks->fb_chkcnt = -datablocks;
6858 	UFS_LOCK(ump);
6859 	fs->fs_pendingblocks += datablocks;
6860 	UFS_UNLOCK(ump);
6861 	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6862 	/*
6863 	 * Push the zero'ed inode to to its disk buffer so that we are free
6864 	 * to delete its dependencies below. Once the dependencies are gone
6865 	 * the buffer can be safely released.
6866 	 */
6867 	if ((error = bread(ip->i_devvp,
6868 	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6869 	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
6870 		brelse(bp);
6871 		softdep_error("softdep_setup_freeblocks", error);
6872 	}
6873 	if (ump->um_fstype == UFS1) {
6874 		dp1 = ((struct ufs1_dinode *)bp->b_data +
6875 		    ino_to_fsbo(fs, ip->i_number));
6876 		ip->i_din1->di_freelink = dp1->di_freelink;
6877 		*dp1 = *ip->i_din1;
6878 	} else {
6879 		dp2 = ((struct ufs2_dinode *)bp->b_data +
6880 		    ino_to_fsbo(fs, ip->i_number));
6881 		ip->i_din2->di_freelink = dp2->di_freelink;
6882 		*dp2 = *ip->i_din2;
6883 	}
6884 	/*
6885 	 * Find and eliminate any inode dependencies.
6886 	 */
6887 	ACQUIRE_LOCK(ump);
6888 	dflags = DEPALLOC;
6889 	if (IS_SNAPSHOT(ip))
6890 		dflags |= NODELAY;
6891 	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6892 	if ((inodedep->id_state & IOSTARTED) != 0)
6893 		panic("softdep_setup_freeblocks: inode busy");
6894 	/*
6895 	 * Add the freeblks structure to the list of operations that
6896 	 * must await the zero'ed inode being written to disk. If we
6897 	 * still have a bitmap dependency (delay == 0), then the inode
6898 	 * has never been written to disk, so we can process the
6899 	 * freeblks below once we have deleted the dependencies.
6900 	 */
6901 	delay = (inodedep->id_state & DEPCOMPLETE);
6902 	if (delay)
6903 		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6904 	else
6905 		freeblks->fb_state |= COMPLETE;
6906 	/*
6907 	 * Because the file length has been truncated to zero, any
6908 	 * pending block allocation dependency structures associated
6909 	 * with this inode are obsolete and can simply be de-allocated.
6910 	 * We must first merge the two dependency lists to get rid of
6911 	 * any duplicate freefrag structures, then purge the merged list.
6912 	 * If we still have a bitmap dependency, then the inode has never
6913 	 * been written to disk, so we can free any fragments without delay.
6914 	 */
6915 	if (flags & IO_NORMAL) {
6916 		merge_inode_lists(&inodedep->id_newinoupdt,
6917 		    &inodedep->id_inoupdt);
6918 		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
6919 			cancel_allocdirect(&inodedep->id_inoupdt, adp,
6920 			    freeblks);
6921 	}
6922 	if (flags & IO_EXT) {
6923 		merge_inode_lists(&inodedep->id_newextupdt,
6924 		    &inodedep->id_extupdt);
6925 		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6926 			cancel_allocdirect(&inodedep->id_extupdt, adp,
6927 			    freeblks);
6928 	}
6929 	FREE_LOCK(ump);
6930 	bdwrite(bp);
6931 	trunc_dependencies(ip, freeblks, -1, 0, flags);
6932 	ACQUIRE_LOCK(ump);
6933 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
6934 		(void) free_inodedep(inodedep);
6935 	freeblks->fb_state |= DEPCOMPLETE;
6936 	/*
6937 	 * If the inode with zeroed block pointers is now on disk
6938 	 * we can start freeing blocks.
6939 	 */
6940 	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
6941 		freeblks->fb_state |= INPROGRESS;
6942 	else
6943 		freeblks = NULL;
6944 	FREE_LOCK(ump);
6945 	if (freeblks)
6946 		handle_workitem_freeblocks(freeblks, 0);
6947 	trunc_pages(ip, length, extblocks, flags);
6948 }
6949 
6950 /*
6951  * Eliminate pages from the page cache that back parts of this inode and
6952  * adjust the vnode pager's idea of our size.  This prevents stale data
6953  * from hanging around in the page cache.
6954  */
6955 static void
6956 trunc_pages(ip, length, extblocks, flags)
6957 	struct inode *ip;
6958 	off_t length;
6959 	ufs2_daddr_t extblocks;
6960 	int flags;
6961 {
6962 	struct vnode *vp;
6963 	struct fs *fs;
6964 	ufs_lbn_t lbn;
6965 	off_t end, extend;
6966 
6967 	vp = ITOV(ip);
6968 	fs = ip->i_fs;
6969 	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
6970 	if ((flags & IO_EXT) != 0)
6971 		vn_pages_remove(vp, extend, 0);
6972 	if ((flags & IO_NORMAL) == 0)
6973 		return;
6974 	BO_LOCK(&vp->v_bufobj);
6975 	drain_output(vp);
6976 	BO_UNLOCK(&vp->v_bufobj);
6977 	/*
6978 	 * The vnode pager eliminates file pages we eliminate indirects
6979 	 * below.
6980 	 */
6981 	vnode_pager_setsize(vp, length);
6982 	/*
6983 	 * Calculate the end based on the last indirect we want to keep.  If
6984 	 * the block extends into indirects we can just use the negative of
6985 	 * its lbn.  Doubles and triples exist at lower numbers so we must
6986 	 * be careful not to remove those, if they exist.  double and triple
6987 	 * indirect lbns do not overlap with others so it is not important
6988 	 * to verify how many levels are required.
6989 	 */
6990 	lbn = lblkno(fs, length);
6991 	if (lbn >= NDADDR) {
6992 		/* Calculate the virtual lbn of the triple indirect. */
6993 		lbn = -lbn - (NIADDR - 1);
6994 		end = OFF_TO_IDX(lblktosize(fs, lbn));
6995 	} else
6996 		end = extend;
6997 	vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
6998 }
6999 
7000 /*
7001  * See if the buf bp is in the range eliminated by truncation.
7002  */
7003 static int
7004 trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
7005 	struct buf *bp;
7006 	int *blkoffp;
7007 	ufs_lbn_t lastlbn;
7008 	int lastoff;
7009 	int flags;
7010 {
7011 	ufs_lbn_t lbn;
7012 
7013 	*blkoffp = 0;
7014 	/* Only match ext/normal blocks as appropriate. */
7015 	if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
7016 	    ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
7017 		return (0);
7018 	/* ALTDATA is always a full truncation. */
7019 	if ((bp->b_xflags & BX_ALTDATA) != 0)
7020 		return (1);
7021 	/* -1 is full truncation. */
7022 	if (lastlbn == -1)
7023 		return (1);
7024 	/*
7025 	 * If this is a partial truncate we only want those
7026 	 * blocks and indirect blocks that cover the range
7027 	 * we're after.
7028 	 */
7029 	lbn = bp->b_lblkno;
7030 	if (lbn < 0)
7031 		lbn = -(lbn + lbn_level(lbn));
7032 	if (lbn < lastlbn)
7033 		return (0);
7034 	/* Here we only truncate lblkno if it's partial. */
7035 	if (lbn == lastlbn) {
7036 		if (lastoff == 0)
7037 			return (0);
7038 		*blkoffp = lastoff;
7039 	}
7040 	return (1);
7041 }
7042 
7043 /*
7044  * Eliminate any dependencies that exist in memory beyond lblkno:off
7045  */
7046 static void
7047 trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
7048 	struct inode *ip;
7049 	struct freeblks *freeblks;
7050 	ufs_lbn_t lastlbn;
7051 	int lastoff;
7052 	int flags;
7053 {
7054 	struct bufobj *bo;
7055 	struct vnode *vp;
7056 	struct buf *bp;
7057 	struct fs *fs;
7058 	int blkoff;
7059 
7060 	/*
7061 	 * We must wait for any I/O in progress to finish so that
7062 	 * all potential buffers on the dirty list will be visible.
7063 	 * Once they are all there, walk the list and get rid of
7064 	 * any dependencies.
7065 	 */
7066 	fs = ip->i_fs;
7067 	vp = ITOV(ip);
7068 	bo = &vp->v_bufobj;
7069 	BO_LOCK(bo);
7070 	drain_output(vp);
7071 	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
7072 		bp->b_vflags &= ~BV_SCANNED;
7073 restart:
7074 	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
7075 		if (bp->b_vflags & BV_SCANNED)
7076 			continue;
7077 		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7078 			bp->b_vflags |= BV_SCANNED;
7079 			continue;
7080 		}
7081 		KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
7082 		if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
7083 			goto restart;
7084 		BO_UNLOCK(bo);
7085 		if (deallocate_dependencies(bp, freeblks, blkoff))
7086 			bqrelse(bp);
7087 		else
7088 			brelse(bp);
7089 		BO_LOCK(bo);
7090 		goto restart;
7091 	}
7092 	/*
7093 	 * Now do the work of vtruncbuf while also matching indirect blocks.
7094 	 */
7095 	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
7096 		bp->b_vflags &= ~BV_SCANNED;
7097 cleanrestart:
7098 	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
7099 		if (bp->b_vflags & BV_SCANNED)
7100 			continue;
7101 		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7102 			bp->b_vflags |= BV_SCANNED;
7103 			continue;
7104 		}
7105 		if (BUF_LOCK(bp,
7106 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
7107 		    BO_LOCKPTR(bo)) == ENOLCK) {
7108 			BO_LOCK(bo);
7109 			goto cleanrestart;
7110 		}
7111 		bp->b_vflags |= BV_SCANNED;
7112 		bremfree(bp);
7113 		if (blkoff != 0) {
7114 			allocbuf(bp, blkoff);
7115 			bqrelse(bp);
7116 		} else {
7117 			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
7118 			brelse(bp);
7119 		}
7120 		BO_LOCK(bo);
7121 		goto cleanrestart;
7122 	}
7123 	drain_output(vp);
7124 	BO_UNLOCK(bo);
7125 }
7126 
7127 static int
7128 cancel_pagedep(pagedep, freeblks, blkoff)
7129 	struct pagedep *pagedep;
7130 	struct freeblks *freeblks;
7131 	int blkoff;
7132 {
7133 	struct jremref *jremref;
7134 	struct jmvref *jmvref;
7135 	struct dirrem *dirrem, *tmp;
7136 	int i;
7137 
7138 	/*
7139 	 * Copy any directory remove dependencies to the list
7140 	 * to be processed after the freeblks proceeds.  If
7141 	 * directory entry never made it to disk they
7142 	 * can be dumped directly onto the work list.
7143 	 */
7144 	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
7145 		/* Skip this directory removal if it is intended to remain. */
7146 		if (dirrem->dm_offset < blkoff)
7147 			continue;
7148 		/*
7149 		 * If there are any dirrems we wait for the journal write
7150 		 * to complete and then restart the buf scan as the lock
7151 		 * has been dropped.
7152 		 */
7153 		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
7154 			jwait(&jremref->jr_list, MNT_WAIT);
7155 			return (ERESTART);
7156 		}
7157 		LIST_REMOVE(dirrem, dm_next);
7158 		dirrem->dm_dirinum = pagedep->pd_ino;
7159 		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
7160 	}
7161 	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
7162 		jwait(&jmvref->jm_list, MNT_WAIT);
7163 		return (ERESTART);
7164 	}
7165 	/*
7166 	 * When we're partially truncating a pagedep we just want to flush
7167 	 * journal entries and return.  There can not be any adds in the
7168 	 * truncated portion of the directory and newblk must remain if
7169 	 * part of the block remains.
7170 	 */
7171 	if (blkoff != 0) {
7172 		struct diradd *dap;
7173 
7174 		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
7175 			if (dap->da_offset > blkoff)
7176 				panic("cancel_pagedep: diradd %p off %d > %d",
7177 				    dap, dap->da_offset, blkoff);
7178 		for (i = 0; i < DAHASHSZ; i++)
7179 			LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
7180 				if (dap->da_offset > blkoff)
7181 					panic("cancel_pagedep: diradd %p off %d > %d",
7182 					    dap, dap->da_offset, blkoff);
7183 		return (0);
7184 	}
7185 	/*
7186 	 * There should be no directory add dependencies present
7187 	 * as the directory could not be truncated until all
7188 	 * children were removed.
7189 	 */
7190 	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
7191 	    ("deallocate_dependencies: pendinghd != NULL"));
7192 	for (i = 0; i < DAHASHSZ; i++)
7193 		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
7194 		    ("deallocate_dependencies: diraddhd != NULL"));
7195 	if ((pagedep->pd_state & NEWBLOCK) != 0)
7196 		free_newdirblk(pagedep->pd_newdirblk);
7197 	if (free_pagedep(pagedep) == 0)
7198 		panic("Failed to free pagedep %p", pagedep);
7199 	return (0);
7200 }
7201 
7202 /*
7203  * Reclaim any dependency structures from a buffer that is about to
7204  * be reallocated to a new vnode. The buffer must be locked, thus,
7205  * no I/O completion operations can occur while we are manipulating
7206  * its associated dependencies. The mutex is held so that other I/O's
7207  * associated with related dependencies do not occur.
7208  */
7209 static int
7210 deallocate_dependencies(bp, freeblks, off)
7211 	struct buf *bp;
7212 	struct freeblks *freeblks;
7213 	int off;
7214 {
7215 	struct indirdep *indirdep;
7216 	struct pagedep *pagedep;
7217 	struct allocdirect *adp;
7218 	struct worklist *wk, *wkn;
7219 	struct ufsmount *ump;
7220 
7221 	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
7222 		goto done;
7223 	ump = VFSTOUFS(wk->wk_mp);
7224 	ACQUIRE_LOCK(ump);
7225 	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
7226 		switch (wk->wk_type) {
7227 		case D_INDIRDEP:
7228 			indirdep = WK_INDIRDEP(wk);
7229 			if (bp->b_lblkno >= 0 ||
7230 			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
7231 				panic("deallocate_dependencies: not indir");
7232 			cancel_indirdep(indirdep, bp, freeblks);
7233 			continue;
7234 
7235 		case D_PAGEDEP:
7236 			pagedep = WK_PAGEDEP(wk);
7237 			if (cancel_pagedep(pagedep, freeblks, off)) {
7238 				FREE_LOCK(ump);
7239 				return (ERESTART);
7240 			}
7241 			continue;
7242 
7243 		case D_ALLOCINDIR:
7244 			/*
7245 			 * Simply remove the allocindir, we'll find it via
7246 			 * the indirdep where we can clear pointers if
7247 			 * needed.
7248 			 */
7249 			WORKLIST_REMOVE(wk);
7250 			continue;
7251 
7252 		case D_FREEWORK:
7253 			/*
7254 			 * A truncation is waiting for the zero'd pointers
7255 			 * to be written.  It can be freed when the freeblks
7256 			 * is journaled.
7257 			 */
7258 			WORKLIST_REMOVE(wk);
7259 			wk->wk_state |= ONDEPLIST;
7260 			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
7261 			break;
7262 
7263 		case D_ALLOCDIRECT:
7264 			adp = WK_ALLOCDIRECT(wk);
7265 			if (off != 0)
7266 				continue;
7267 			/* FALLTHROUGH */
7268 		default:
7269 			panic("deallocate_dependencies: Unexpected type %s",
7270 			    TYPENAME(wk->wk_type));
7271 			/* NOTREACHED */
7272 		}
7273 	}
7274 	FREE_LOCK(ump);
7275 done:
7276 	/*
7277 	 * Don't throw away this buf, we were partially truncating and
7278 	 * some deps may always remain.
7279 	 */
7280 	if (off) {
7281 		allocbuf(bp, off);
7282 		bp->b_vflags |= BV_SCANNED;
7283 		return (EBUSY);
7284 	}
7285 	bp->b_flags |= B_INVAL | B_NOCACHE;
7286 
7287 	return (0);
7288 }
7289 
7290 /*
7291  * An allocdirect is being canceled due to a truncate.  We must make sure
7292  * the journal entry is released in concert with the blkfree that releases
7293  * the storage.  Completed journal entries must not be released until the
7294  * space is no longer pointed to by the inode or in the bitmap.
7295  */
7296 static void
7297 cancel_allocdirect(adphead, adp, freeblks)
7298 	struct allocdirectlst *adphead;
7299 	struct allocdirect *adp;
7300 	struct freeblks *freeblks;
7301 {
7302 	struct freework *freework;
7303 	struct newblk *newblk;
7304 	struct worklist *wk;
7305 
7306 	TAILQ_REMOVE(adphead, adp, ad_next);
7307 	newblk = (struct newblk *)adp;
7308 	freework = NULL;
7309 	/*
7310 	 * Find the correct freework structure.
7311 	 */
7312 	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
7313 		if (wk->wk_type != D_FREEWORK)
7314 			continue;
7315 		freework = WK_FREEWORK(wk);
7316 		if (freework->fw_blkno == newblk->nb_newblkno)
7317 			break;
7318 	}
7319 	if (freework == NULL)
7320 		panic("cancel_allocdirect: Freework not found");
7321 	/*
7322 	 * If a newblk exists at all we still have the journal entry that
7323 	 * initiated the allocation so we do not need to journal the free.
7324 	 */
7325 	cancel_jfreeblk(freeblks, freework->fw_blkno);
7326 	/*
7327 	 * If the journal hasn't been written the jnewblk must be passed
7328 	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
7329 	 * this by linking the journal dependency into the freework to be
7330 	 * freed when freework_freeblock() is called.  If the journal has
7331 	 * been written we can simply reclaim the journal space when the
7332 	 * freeblks work is complete.
7333 	 */
7334 	freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
7335 	    &freeblks->fb_jwork);
7336 	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7337 }
7338 
7339 
7340 /*
7341  * Cancel a new block allocation.  May be an indirect or direct block.  We
7342  * remove it from various lists and return any journal record that needs to
7343  * be resolved by the caller.
7344  *
7345  * A special consideration is made for indirects which were never pointed
7346  * at on disk and will never be found once this block is released.
7347  */
7348 static struct jnewblk *
7349 cancel_newblk(newblk, wk, wkhd)
7350 	struct newblk *newblk;
7351 	struct worklist *wk;
7352 	struct workhead *wkhd;
7353 {
7354 	struct jnewblk *jnewblk;
7355 
7356 	CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
7357 
7358 	newblk->nb_state |= GOINGAWAY;
7359 	/*
7360 	 * Previously we traversed the completedhd on each indirdep
7361 	 * attached to this newblk to cancel them and gather journal
7362 	 * work.  Since we need only the oldest journal segment and
7363 	 * the lowest point on the tree will always have the oldest
7364 	 * journal segment we are free to release the segments
7365 	 * of any subordinates and may leave the indirdep list to
7366 	 * indirdep_complete() when this newblk is freed.
7367 	 */
7368 	if (newblk->nb_state & ONDEPLIST) {
7369 		newblk->nb_state &= ~ONDEPLIST;
7370 		LIST_REMOVE(newblk, nb_deps);
7371 	}
7372 	if (newblk->nb_state & ONWORKLIST)
7373 		WORKLIST_REMOVE(&newblk->nb_list);
7374 	/*
7375 	 * If the journal entry hasn't been written we save a pointer to
7376 	 * the dependency that frees it until it is written or the
7377 	 * superseding operation completes.
7378 	 */
7379 	jnewblk = newblk->nb_jnewblk;
7380 	if (jnewblk != NULL && wk != NULL) {
7381 		newblk->nb_jnewblk = NULL;
7382 		jnewblk->jn_dep = wk;
7383 	}
7384 	if (!LIST_EMPTY(&newblk->nb_jwork))
7385 		jwork_move(wkhd, &newblk->nb_jwork);
7386 	/*
7387 	 * When truncating we must free the newdirblk early to remove
7388 	 * the pagedep from the hash before returning.
7389 	 */
7390 	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7391 		free_newdirblk(WK_NEWDIRBLK(wk));
7392 	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7393 		panic("cancel_newblk: extra newdirblk");
7394 
7395 	return (jnewblk);
7396 }
7397 
7398 /*
7399  * Schedule the freefrag associated with a newblk to be released once
7400  * the pointers are written and the previous block is no longer needed.
7401  */
7402 static void
7403 newblk_freefrag(newblk)
7404 	struct newblk *newblk;
7405 {
7406 	struct freefrag *freefrag;
7407 
7408 	if (newblk->nb_freefrag == NULL)
7409 		return;
7410 	freefrag = newblk->nb_freefrag;
7411 	newblk->nb_freefrag = NULL;
7412 	freefrag->ff_state |= COMPLETE;
7413 	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7414 		add_to_worklist(&freefrag->ff_list, 0);
7415 }
7416 
7417 /*
7418  * Free a newblk. Generate a new freefrag work request if appropriate.
7419  * This must be called after the inode pointer and any direct block pointers
7420  * are valid or fully removed via truncate or frag extension.
7421  */
7422 static void
7423 free_newblk(newblk)
7424 	struct newblk *newblk;
7425 {
7426 	struct indirdep *indirdep;
7427 	struct worklist *wk;
7428 
7429 	KASSERT(newblk->nb_jnewblk == NULL,
7430 	    ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
7431 	KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
7432 	    ("free_newblk: unclaimed newblk"));
7433 	LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
7434 	newblk_freefrag(newblk);
7435 	if (newblk->nb_state & ONDEPLIST)
7436 		LIST_REMOVE(newblk, nb_deps);
7437 	if (newblk->nb_state & ONWORKLIST)
7438 		WORKLIST_REMOVE(&newblk->nb_list);
7439 	LIST_REMOVE(newblk, nb_hash);
7440 	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7441 		free_newdirblk(WK_NEWDIRBLK(wk));
7442 	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7443 		panic("free_newblk: extra newdirblk");
7444 	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7445 		indirdep_complete(indirdep);
7446 	handle_jwork(&newblk->nb_jwork);
7447 	WORKITEM_FREE(newblk, D_NEWBLK);
7448 }
7449 
7450 /*
7451  * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7452  * This routine must be called with splbio interrupts blocked.
7453  */
7454 static void
7455 free_newdirblk(newdirblk)
7456 	struct newdirblk *newdirblk;
7457 {
7458 	struct pagedep *pagedep;
7459 	struct diradd *dap;
7460 	struct worklist *wk;
7461 
7462 	LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
7463 	WORKLIST_REMOVE(&newdirblk->db_list);
7464 	/*
7465 	 * If the pagedep is still linked onto the directory buffer
7466 	 * dependency chain, then some of the entries on the
7467 	 * pd_pendinghd list may not be committed to disk yet. In
7468 	 * this case, we will simply clear the NEWBLOCK flag and
7469 	 * let the pd_pendinghd list be processed when the pagedep
7470 	 * is next written. If the pagedep is no longer on the buffer
7471 	 * dependency chain, then all the entries on the pd_pending
7472 	 * list are committed to disk and we can free them here.
7473 	 */
7474 	pagedep = newdirblk->db_pagedep;
7475 	pagedep->pd_state &= ~NEWBLOCK;
7476 	if ((pagedep->pd_state & ONWORKLIST) == 0) {
7477 		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7478 			free_diradd(dap, NULL);
7479 		/*
7480 		 * If no dependencies remain, the pagedep will be freed.
7481 		 */
7482 		free_pagedep(pagedep);
7483 	}
7484 	/* Should only ever be one item in the list. */
7485 	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7486 		WORKLIST_REMOVE(wk);
7487 		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7488 	}
7489 	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7490 }
7491 
7492 /*
7493  * Prepare an inode to be freed. The actual free operation is not
7494  * done until the zero'ed inode has been written to disk.
7495  */
7496 void
7497 softdep_freefile(pvp, ino, mode)
7498 	struct vnode *pvp;
7499 	ino_t ino;
7500 	int mode;
7501 {
7502 	struct inode *ip = VTOI(pvp);
7503 	struct inodedep *inodedep;
7504 	struct freefile *freefile;
7505 	struct freeblks *freeblks;
7506 	struct ufsmount *ump;
7507 
7508 	ump = ip->i_ump;
7509 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
7510 	    ("softdep_freefile called on non-softdep filesystem"));
7511 	/*
7512 	 * This sets up the inode de-allocation dependency.
7513 	 */
7514 	freefile = malloc(sizeof(struct freefile),
7515 		M_FREEFILE, M_SOFTDEP_FLAGS);
7516 	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7517 	freefile->fx_mode = mode;
7518 	freefile->fx_oldinum = ino;
7519 	freefile->fx_devvp = ip->i_devvp;
7520 	LIST_INIT(&freefile->fx_jwork);
7521 	UFS_LOCK(ump);
7522 	ip->i_fs->fs_pendinginodes += 1;
7523 	UFS_UNLOCK(ump);
7524 
7525 	/*
7526 	 * If the inodedep does not exist, then the zero'ed inode has
7527 	 * been written to disk. If the allocated inode has never been
7528 	 * written to disk, then the on-disk inode is zero'ed. In either
7529 	 * case we can free the file immediately.  If the journal was
7530 	 * canceled before being written the inode will never make it to
7531 	 * disk and we must send the canceled journal entrys to
7532 	 * ffs_freefile() to be cleared in conjunction with the bitmap.
7533 	 * Any blocks waiting on the inode to write can be safely freed
7534 	 * here as it will never been written.
7535 	 */
7536 	ACQUIRE_LOCK(ump);
7537 	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7538 	if (inodedep) {
7539 		/*
7540 		 * Clear out freeblks that no longer need to reference
7541 		 * this inode.
7542 		 */
7543 		while ((freeblks =
7544 		    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7545 			TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7546 			    fb_next);
7547 			freeblks->fb_state &= ~ONDEPLIST;
7548 		}
7549 		/*
7550 		 * Remove this inode from the unlinked list.
7551 		 */
7552 		if (inodedep->id_state & UNLINKED) {
7553 			/*
7554 			 * Save the journal work to be freed with the bitmap
7555 			 * before we clear UNLINKED.  Otherwise it can be lost
7556 			 * if the inode block is written.
7557 			 */
7558 			handle_bufwait(inodedep, &freefile->fx_jwork);
7559 			clear_unlinked_inodedep(inodedep);
7560 			/*
7561 			 * Re-acquire inodedep as we've dropped the
7562 			 * per-filesystem lock in clear_unlinked_inodedep().
7563 			 */
7564 			inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7565 		}
7566 	}
7567 	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7568 		FREE_LOCK(ump);
7569 		handle_workitem_freefile(freefile);
7570 		return;
7571 	}
7572 	if ((inodedep->id_state & DEPCOMPLETE) == 0)
7573 		inodedep->id_state |= GOINGAWAY;
7574 	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7575 	FREE_LOCK(ump);
7576 	if (ip->i_number == ino)
7577 		ip->i_flag |= IN_MODIFIED;
7578 }
7579 
7580 /*
7581  * Check to see if an inode has never been written to disk. If
7582  * so free the inodedep and return success, otherwise return failure.
7583  * This routine must be called with splbio interrupts blocked.
7584  *
7585  * If we still have a bitmap dependency, then the inode has never
7586  * been written to disk. Drop the dependency as it is no longer
7587  * necessary since the inode is being deallocated. We set the
7588  * ALLCOMPLETE flags since the bitmap now properly shows that the
7589  * inode is not allocated. Even if the inode is actively being
7590  * written, it has been rolled back to its zero'ed state, so we
7591  * are ensured that a zero inode is what is on the disk. For short
7592  * lived files, this change will usually result in removing all the
7593  * dependencies from the inode so that it can be freed immediately.
7594  */
7595 static int
7596 check_inode_unwritten(inodedep)
7597 	struct inodedep *inodedep;
7598 {
7599 
7600 	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7601 
7602 	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7603 	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7604 	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7605 	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7606 	    !LIST_EMPTY(&inodedep->id_inowait) ||
7607 	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7608 	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7609 	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7610 	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7611 	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7612 	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7613 	    inodedep->id_mkdiradd != NULL ||
7614 	    inodedep->id_nlinkdelta != 0)
7615 		return (0);
7616 	/*
7617 	 * Another process might be in initiate_write_inodeblock_ufs[12]
7618 	 * trying to allocate memory without holding "Softdep Lock".
7619 	 */
7620 	if ((inodedep->id_state & IOSTARTED) != 0 &&
7621 	    inodedep->id_savedino1 == NULL)
7622 		return (0);
7623 
7624 	if (inodedep->id_state & ONDEPLIST)
7625 		LIST_REMOVE(inodedep, id_deps);
7626 	inodedep->id_state &= ~ONDEPLIST;
7627 	inodedep->id_state |= ALLCOMPLETE;
7628 	inodedep->id_bmsafemap = NULL;
7629 	if (inodedep->id_state & ONWORKLIST)
7630 		WORKLIST_REMOVE(&inodedep->id_list);
7631 	if (inodedep->id_savedino1 != NULL) {
7632 		free(inodedep->id_savedino1, M_SAVEDINO);
7633 		inodedep->id_savedino1 = NULL;
7634 	}
7635 	if (free_inodedep(inodedep) == 0)
7636 		panic("check_inode_unwritten: busy inode");
7637 	return (1);
7638 }
7639 
7640 /*
7641  * Try to free an inodedep structure. Return 1 if it could be freed.
7642  */
7643 static int
7644 free_inodedep(inodedep)
7645 	struct inodedep *inodedep;
7646 {
7647 
7648 	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7649 	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7650 	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7651 	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7652 	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7653 	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7654 	    !LIST_EMPTY(&inodedep->id_inowait) ||
7655 	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7656 	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7657 	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7658 	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7659 	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7660 	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7661 	    inodedep->id_mkdiradd != NULL ||
7662 	    inodedep->id_nlinkdelta != 0 ||
7663 	    inodedep->id_savedino1 != NULL)
7664 		return (0);
7665 	if (inodedep->id_state & ONDEPLIST)
7666 		LIST_REMOVE(inodedep, id_deps);
7667 	LIST_REMOVE(inodedep, id_hash);
7668 	WORKITEM_FREE(inodedep, D_INODEDEP);
7669 	return (1);
7670 }
7671 
7672 /*
7673  * Free the block referenced by a freework structure.  The parent freeblks
7674  * structure is released and completed when the final cg bitmap reaches
7675  * the disk.  This routine may be freeing a jnewblk which never made it to
7676  * disk in which case we do not have to wait as the operation is undone
7677  * in memory immediately.
7678  */
7679 static void
7680 freework_freeblock(freework)
7681 	struct freework *freework;
7682 {
7683 	struct freeblks *freeblks;
7684 	struct jnewblk *jnewblk;
7685 	struct ufsmount *ump;
7686 	struct workhead wkhd;
7687 	struct fs *fs;
7688 	int bsize;
7689 	int needj;
7690 
7691 	ump = VFSTOUFS(freework->fw_list.wk_mp);
7692 	LOCK_OWNED(ump);
7693 	/*
7694 	 * Handle partial truncate separately.
7695 	 */
7696 	if (freework->fw_indir) {
7697 		complete_trunc_indir(freework);
7698 		return;
7699 	}
7700 	freeblks = freework->fw_freeblks;
7701 	fs = ump->um_fs;
7702 	needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
7703 	bsize = lfragtosize(fs, freework->fw_frags);
7704 	LIST_INIT(&wkhd);
7705 	/*
7706 	 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
7707 	 * on the indirblk hashtable and prevents premature freeing.
7708 	 */
7709 	freework->fw_state |= DEPCOMPLETE;
7710 	/*
7711 	 * SUJ needs to wait for the segment referencing freed indirect
7712 	 * blocks to expire so that we know the checker will not confuse
7713 	 * a re-allocated indirect block with its old contents.
7714 	 */
7715 	if (needj && freework->fw_lbn <= -NDADDR)
7716 		indirblk_insert(freework);
7717 	/*
7718 	 * If we are canceling an existing jnewblk pass it to the free
7719 	 * routine, otherwise pass the freeblk which will ultimately
7720 	 * release the freeblks.  If we're not journaling, we can just
7721 	 * free the freeblks immediately.
7722 	 */
7723 	jnewblk = freework->fw_jnewblk;
7724 	if (jnewblk != NULL) {
7725 		cancel_jnewblk(jnewblk, &wkhd);
7726 		needj = 0;
7727 	} else if (needj) {
7728 		freework->fw_state |= DELAYEDFREE;
7729 		freeblks->fb_cgwait++;
7730 		WORKLIST_INSERT(&wkhd, &freework->fw_list);
7731 	}
7732 	FREE_LOCK(ump);
7733 	freeblks_free(ump, freeblks, btodb(bsize));
7734 	CTR4(KTR_SUJ,
7735 	    "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
7736 	    freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
7737 	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
7738 	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
7739 	ACQUIRE_LOCK(ump);
7740 	/*
7741 	 * The jnewblk will be discarded and the bits in the map never
7742 	 * made it to disk.  We can immediately free the freeblk.
7743 	 */
7744 	if (needj == 0)
7745 		handle_written_freework(freework);
7746 }
7747 
7748 /*
7749  * We enqueue freework items that need processing back on the freeblks and
7750  * add the freeblks to the worklist.  This makes it easier to find all work
7751  * required to flush a truncation in process_truncates().
7752  */
7753 static void
7754 freework_enqueue(freework)
7755 	struct freework *freework;
7756 {
7757 	struct freeblks *freeblks;
7758 
7759 	freeblks = freework->fw_freeblks;
7760 	if ((freework->fw_state & INPROGRESS) == 0)
7761 		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
7762 	if ((freeblks->fb_state &
7763 	    (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
7764 	    LIST_EMPTY(&freeblks->fb_jblkdephd))
7765 		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7766 }
7767 
7768 /*
7769  * Start, continue, or finish the process of freeing an indirect block tree.
7770  * The free operation may be paused at any point with fw_off containing the
7771  * offset to restart from.  This enables us to implement some flow control
7772  * for large truncates which may fan out and generate a huge number of
7773  * dependencies.
7774  */
7775 static void
7776 handle_workitem_indirblk(freework)
7777 	struct freework *freework;
7778 {
7779 	struct freeblks *freeblks;
7780 	struct ufsmount *ump;
7781 	struct fs *fs;
7782 
7783 	freeblks = freework->fw_freeblks;
7784 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7785 	fs = ump->um_fs;
7786 	if (freework->fw_state & DEPCOMPLETE) {
7787 		handle_written_freework(freework);
7788 		return;
7789 	}
7790 	if (freework->fw_off == NINDIR(fs)) {
7791 		freework_freeblock(freework);
7792 		return;
7793 	}
7794 	freework->fw_state |= INPROGRESS;
7795 	FREE_LOCK(ump);
7796 	indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
7797 	    freework->fw_lbn);
7798 	ACQUIRE_LOCK(ump);
7799 }
7800 
7801 /*
7802  * Called when a freework structure attached to a cg buf is written.  The
7803  * ref on either the parent or the freeblks structure is released and
7804  * the freeblks is added back to the worklist if there is more work to do.
7805  */
7806 static void
7807 handle_written_freework(freework)
7808 	struct freework *freework;
7809 {
7810 	struct freeblks *freeblks;
7811 	struct freework *parent;
7812 
7813 	freeblks = freework->fw_freeblks;
7814 	parent = freework->fw_parent;
7815 	if (freework->fw_state & DELAYEDFREE)
7816 		freeblks->fb_cgwait--;
7817 	freework->fw_state |= COMPLETE;
7818 	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
7819 		WORKITEM_FREE(freework, D_FREEWORK);
7820 	if (parent) {
7821 		if (--parent->fw_ref == 0)
7822 			freework_enqueue(parent);
7823 		return;
7824 	}
7825 	if (--freeblks->fb_ref != 0)
7826 		return;
7827 	if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
7828 	    ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
7829 		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7830 }
7831 
7832 /*
7833  * This workitem routine performs the block de-allocation.
7834  * The workitem is added to the pending list after the updated
7835  * inode block has been written to disk.  As mentioned above,
7836  * checks regarding the number of blocks de-allocated (compared
7837  * to the number of blocks allocated for the file) are also
7838  * performed in this function.
7839  */
7840 static int
7841 handle_workitem_freeblocks(freeblks, flags)
7842 	struct freeblks *freeblks;
7843 	int flags;
7844 {
7845 	struct freework *freework;
7846 	struct newblk *newblk;
7847 	struct allocindir *aip;
7848 	struct ufsmount *ump;
7849 	struct worklist *wk;
7850 
7851 	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
7852 	    ("handle_workitem_freeblocks: Journal entries not written."));
7853 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7854 	ACQUIRE_LOCK(ump);
7855 	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
7856 		WORKLIST_REMOVE(wk);
7857 		switch (wk->wk_type) {
7858 		case D_DIRREM:
7859 			wk->wk_state |= COMPLETE;
7860 			add_to_worklist(wk, 0);
7861 			continue;
7862 
7863 		case D_ALLOCDIRECT:
7864 			free_newblk(WK_NEWBLK(wk));
7865 			continue;
7866 
7867 		case D_ALLOCINDIR:
7868 			aip = WK_ALLOCINDIR(wk);
7869 			freework = NULL;
7870 			if (aip->ai_state & DELAYEDFREE) {
7871 				FREE_LOCK(ump);
7872 				freework = newfreework(ump, freeblks, NULL,
7873 				    aip->ai_lbn, aip->ai_newblkno,
7874 				    ump->um_fs->fs_frag, 0, 0);
7875 				ACQUIRE_LOCK(ump);
7876 			}
7877 			newblk = WK_NEWBLK(wk);
7878 			if (newblk->nb_jnewblk) {
7879 				freework->fw_jnewblk = newblk->nb_jnewblk;
7880 				newblk->nb_jnewblk->jn_dep = &freework->fw_list;
7881 				newblk->nb_jnewblk = NULL;
7882 			}
7883 			free_newblk(newblk);
7884 			continue;
7885 
7886 		case D_FREEWORK:
7887 			freework = WK_FREEWORK(wk);
7888 			if (freework->fw_lbn <= -NDADDR)
7889 				handle_workitem_indirblk(freework);
7890 			else
7891 				freework_freeblock(freework);
7892 			continue;
7893 		default:
7894 			panic("handle_workitem_freeblocks: Unknown type %s",
7895 			    TYPENAME(wk->wk_type));
7896 		}
7897 	}
7898 	if (freeblks->fb_ref != 0) {
7899 		freeblks->fb_state &= ~INPROGRESS;
7900 		wake_worklist(&freeblks->fb_list);
7901 		freeblks = NULL;
7902 	}
7903 	FREE_LOCK(ump);
7904 	if (freeblks)
7905 		return handle_complete_freeblocks(freeblks, flags);
7906 	return (0);
7907 }
7908 
7909 /*
7910  * Handle completion of block free via truncate.  This allows fs_pending
7911  * to track the actual free block count more closely than if we only updated
7912  * it at the end.  We must be careful to handle cases where the block count
7913  * on free was incorrect.
7914  */
7915 static void
7916 freeblks_free(ump, freeblks, blocks)
7917 	struct ufsmount *ump;
7918 	struct freeblks *freeblks;
7919 	int blocks;
7920 {
7921 	struct fs *fs;
7922 	ufs2_daddr_t remain;
7923 
7924 	UFS_LOCK(ump);
7925 	remain = -freeblks->fb_chkcnt;
7926 	freeblks->fb_chkcnt += blocks;
7927 	if (remain > 0) {
7928 		if (remain < blocks)
7929 			blocks = remain;
7930 		fs = ump->um_fs;
7931 		fs->fs_pendingblocks -= blocks;
7932 	}
7933 	UFS_UNLOCK(ump);
7934 }
7935 
7936 /*
7937  * Once all of the freework workitems are complete we can retire the
7938  * freeblocks dependency and any journal work awaiting completion.  This
7939  * can not be called until all other dependencies are stable on disk.
7940  */
7941 static int
7942 handle_complete_freeblocks(freeblks, flags)
7943 	struct freeblks *freeblks;
7944 	int flags;
7945 {
7946 	struct inodedep *inodedep;
7947 	struct inode *ip;
7948 	struct vnode *vp;
7949 	struct fs *fs;
7950 	struct ufsmount *ump;
7951 	ufs2_daddr_t spare;
7952 
7953 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7954 	fs = ump->um_fs;
7955 	flags = LK_EXCLUSIVE | flags;
7956 	spare = freeblks->fb_chkcnt;
7957 
7958 	/*
7959 	 * If we did not release the expected number of blocks we may have
7960 	 * to adjust the inode block count here.  Only do so if it wasn't
7961 	 * a truncation to zero and the modrev still matches.
7962 	 */
7963 	if (spare && freeblks->fb_len != 0) {
7964 		if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7965 		    flags, &vp, FFSV_FORCEINSMQ) != 0)
7966 			return (EBUSY);
7967 		ip = VTOI(vp);
7968 		if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
7969 			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
7970 			ip->i_flag |= IN_CHANGE;
7971 			/*
7972 			 * We must wait so this happens before the
7973 			 * journal is reclaimed.
7974 			 */
7975 			ffs_update(vp, 1);
7976 		}
7977 		vput(vp);
7978 	}
7979 	if (spare < 0) {
7980 		UFS_LOCK(ump);
7981 		fs->fs_pendingblocks += spare;
7982 		UFS_UNLOCK(ump);
7983 	}
7984 #ifdef QUOTA
7985 	/* Handle spare. */
7986 	if (spare)
7987 		quotaadj(freeblks->fb_quota, ump, -spare);
7988 	quotarele(freeblks->fb_quota);
7989 #endif
7990 	ACQUIRE_LOCK(ump);
7991 	if (freeblks->fb_state & ONDEPLIST) {
7992 		inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7993 		    0, &inodedep);
7994 		TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
7995 		freeblks->fb_state &= ~ONDEPLIST;
7996 		if (TAILQ_EMPTY(&inodedep->id_freeblklst))
7997 			free_inodedep(inodedep);
7998 	}
7999 	/*
8000 	 * All of the freeblock deps must be complete prior to this call
8001 	 * so it's now safe to complete earlier outstanding journal entries.
8002 	 */
8003 	handle_jwork(&freeblks->fb_jwork);
8004 	WORKITEM_FREE(freeblks, D_FREEBLKS);
8005 	FREE_LOCK(ump);
8006 	return (0);
8007 }
8008 
8009 /*
8010  * Release blocks associated with the freeblks and stored in the indirect
8011  * block dbn. If level is greater than SINGLE, the block is an indirect block
8012  * and recursive calls to indirtrunc must be used to cleanse other indirect
8013  * blocks.
8014  *
8015  * This handles partial and complete truncation of blocks.  Partial is noted
8016  * with goingaway == 0.  In this case the freework is completed after the
8017  * zero'd indirects are written to disk.  For full truncation the freework
8018  * is completed after the block is freed.
8019  */
8020 static void
8021 indir_trunc(freework, dbn, lbn)
8022 	struct freework *freework;
8023 	ufs2_daddr_t dbn;
8024 	ufs_lbn_t lbn;
8025 {
8026 	struct freework *nfreework;
8027 	struct workhead wkhd;
8028 	struct freeblks *freeblks;
8029 	struct buf *bp;
8030 	struct fs *fs;
8031 	struct indirdep *indirdep;
8032 	struct ufsmount *ump;
8033 	ufs1_daddr_t *bap1 = 0;
8034 	ufs2_daddr_t nb, nnb, *bap2 = 0;
8035 	ufs_lbn_t lbnadd, nlbn;
8036 	int i, nblocks, ufs1fmt;
8037 	int freedblocks;
8038 	int goingaway;
8039 	int freedeps;
8040 	int needj;
8041 	int level;
8042 	int cnt;
8043 
8044 	freeblks = freework->fw_freeblks;
8045 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8046 	fs = ump->um_fs;
8047 	/*
8048 	 * Get buffer of block pointers to be freed.  There are three cases:
8049 	 *
8050 	 * 1) Partial truncate caches the indirdep pointer in the freework
8051 	 *    which provides us a back copy to the save bp which holds the
8052 	 *    pointers we want to clear.  When this completes the zero
8053 	 *    pointers are written to the real copy.
8054 	 * 2) The indirect is being completely truncated, cancel_indirdep()
8055 	 *    eliminated the real copy and placed the indirdep on the saved
8056 	 *    copy.  The indirdep and buf are discarded when this completes.
8057 	 * 3) The indirect was not in memory, we read a copy off of the disk
8058 	 *    using the devvp and drop and invalidate the buffer when we're
8059 	 *    done.
8060 	 */
8061 	goingaway = 1;
8062 	indirdep = NULL;
8063 	if (freework->fw_indir != NULL) {
8064 		goingaway = 0;
8065 		indirdep = freework->fw_indir;
8066 		bp = indirdep->ir_savebp;
8067 		if (bp == NULL || bp->b_blkno != dbn)
8068 			panic("indir_trunc: Bad saved buf %p blkno %jd",
8069 			    bp, (intmax_t)dbn);
8070 	} else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
8071 		/*
8072 		 * The lock prevents the buf dep list from changing and
8073 	 	 * indirects on devvp should only ever have one dependency.
8074 		 */
8075 		indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
8076 		if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
8077 			panic("indir_trunc: Bad indirdep %p from buf %p",
8078 			    indirdep, bp);
8079 	} else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
8080 	    NOCRED, &bp) != 0) {
8081 		brelse(bp);
8082 		return;
8083 	}
8084 	ACQUIRE_LOCK(ump);
8085 	/* Protects against a race with complete_trunc_indir(). */
8086 	freework->fw_state &= ~INPROGRESS;
8087 	/*
8088 	 * If we have an indirdep we need to enforce the truncation order
8089 	 * and discard it when it is complete.
8090 	 */
8091 	if (indirdep) {
8092 		if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
8093 		    !TAILQ_EMPTY(&indirdep->ir_trunc)) {
8094 			/*
8095 			 * Add the complete truncate to the list on the
8096 			 * indirdep to enforce in-order processing.
8097 			 */
8098 			if (freework->fw_indir == NULL)
8099 				TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
8100 				    freework, fw_next);
8101 			FREE_LOCK(ump);
8102 			return;
8103 		}
8104 		/*
8105 		 * If we're goingaway, free the indirdep.  Otherwise it will
8106 		 * linger until the write completes.
8107 		 */
8108 		if (goingaway)
8109 			free_indirdep(indirdep);
8110 	}
8111 	FREE_LOCK(ump);
8112 	/* Initialize pointers depending on block size. */
8113 	if (ump->um_fstype == UFS1) {
8114 		bap1 = (ufs1_daddr_t *)bp->b_data;
8115 		nb = bap1[freework->fw_off];
8116 		ufs1fmt = 1;
8117 	} else {
8118 		bap2 = (ufs2_daddr_t *)bp->b_data;
8119 		nb = bap2[freework->fw_off];
8120 		ufs1fmt = 0;
8121 	}
8122 	level = lbn_level(lbn);
8123 	needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
8124 	lbnadd = lbn_offset(fs, level);
8125 	nblocks = btodb(fs->fs_bsize);
8126 	nfreework = freework;
8127 	freedeps = 0;
8128 	cnt = 0;
8129 	/*
8130 	 * Reclaim blocks.  Traverses into nested indirect levels and
8131 	 * arranges for the current level to be freed when subordinates
8132 	 * are free when journaling.
8133 	 */
8134 	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
8135 		if (i != NINDIR(fs) - 1) {
8136 			if (ufs1fmt)
8137 				nnb = bap1[i+1];
8138 			else
8139 				nnb = bap2[i+1];
8140 		} else
8141 			nnb = 0;
8142 		if (nb == 0)
8143 			continue;
8144 		cnt++;
8145 		if (level != 0) {
8146 			nlbn = (lbn + 1) - (i * lbnadd);
8147 			if (needj != 0) {
8148 				nfreework = newfreework(ump, freeblks, freework,
8149 				    nlbn, nb, fs->fs_frag, 0, 0);
8150 				freedeps++;
8151 			}
8152 			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
8153 		} else {
8154 			struct freedep *freedep;
8155 
8156 			/*
8157 			 * Attempt to aggregate freedep dependencies for
8158 			 * all blocks being released to the same CG.
8159 			 */
8160 			LIST_INIT(&wkhd);
8161 			if (needj != 0 &&
8162 			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
8163 				freedep = newfreedep(freework);
8164 				WORKLIST_INSERT_UNLOCKED(&wkhd,
8165 				    &freedep->fd_list);
8166 				freedeps++;
8167 			}
8168 			CTR3(KTR_SUJ,
8169 			    "indir_trunc: ino %d blkno %jd size %ld",
8170 			    freeblks->fb_inum, nb, fs->fs_bsize);
8171 			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
8172 			    fs->fs_bsize, freeblks->fb_inum,
8173 			    freeblks->fb_vtype, &wkhd);
8174 		}
8175 	}
8176 	if (goingaway) {
8177 		bp->b_flags |= B_INVAL | B_NOCACHE;
8178 		brelse(bp);
8179 	}
8180 	freedblocks = 0;
8181 	if (level == 0)
8182 		freedblocks = (nblocks * cnt);
8183 	if (needj == 0)
8184 		freedblocks += nblocks;
8185 	freeblks_free(ump, freeblks, freedblocks);
8186 	/*
8187 	 * If we are journaling set up the ref counts and offset so this
8188 	 * indirect can be completed when its children are free.
8189 	 */
8190 	if (needj) {
8191 		ACQUIRE_LOCK(ump);
8192 		freework->fw_off = i;
8193 		freework->fw_ref += freedeps;
8194 		freework->fw_ref -= NINDIR(fs) + 1;
8195 		if (level == 0)
8196 			freeblks->fb_cgwait += freedeps;
8197 		if (freework->fw_ref == 0)
8198 			freework_freeblock(freework);
8199 		FREE_LOCK(ump);
8200 		return;
8201 	}
8202 	/*
8203 	 * If we're not journaling we can free the indirect now.
8204 	 */
8205 	dbn = dbtofsb(fs, dbn);
8206 	CTR3(KTR_SUJ,
8207 	    "indir_trunc 2: ino %d blkno %jd size %ld",
8208 	    freeblks->fb_inum, dbn, fs->fs_bsize);
8209 	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
8210 	    freeblks->fb_inum, freeblks->fb_vtype, NULL);
8211 	/* Non SUJ softdep does single-threaded truncations. */
8212 	if (freework->fw_blkno == dbn) {
8213 		freework->fw_state |= ALLCOMPLETE;
8214 		ACQUIRE_LOCK(ump);
8215 		handle_written_freework(freework);
8216 		FREE_LOCK(ump);
8217 	}
8218 	return;
8219 }
8220 
8221 /*
8222  * Cancel an allocindir when it is removed via truncation.  When bp is not
8223  * NULL the indirect never appeared on disk and is scheduled to be freed
8224  * independently of the indir so we can more easily track journal work.
8225  */
8226 static void
8227 cancel_allocindir(aip, bp, freeblks, trunc)
8228 	struct allocindir *aip;
8229 	struct buf *bp;
8230 	struct freeblks *freeblks;
8231 	int trunc;
8232 {
8233 	struct indirdep *indirdep;
8234 	struct freefrag *freefrag;
8235 	struct newblk *newblk;
8236 
8237 	newblk = (struct newblk *)aip;
8238 	LIST_REMOVE(aip, ai_next);
8239 	/*
8240 	 * We must eliminate the pointer in bp if it must be freed on its
8241 	 * own due to partial truncate or pending journal work.
8242 	 */
8243 	if (bp && (trunc || newblk->nb_jnewblk)) {
8244 		/*
8245 		 * Clear the pointer and mark the aip to be freed
8246 		 * directly if it never existed on disk.
8247 		 */
8248 		aip->ai_state |= DELAYEDFREE;
8249 		indirdep = aip->ai_indirdep;
8250 		if (indirdep->ir_state & UFS1FMT)
8251 			((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8252 		else
8253 			((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8254 	}
8255 	/*
8256 	 * When truncating the previous pointer will be freed via
8257 	 * savedbp.  Eliminate the freefrag which would dup free.
8258 	 */
8259 	if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
8260 		newblk->nb_freefrag = NULL;
8261 		if (freefrag->ff_jdep)
8262 			cancel_jfreefrag(
8263 			    WK_JFREEFRAG(freefrag->ff_jdep));
8264 		jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
8265 		WORKITEM_FREE(freefrag, D_FREEFRAG);
8266 	}
8267 	/*
8268 	 * If the journal hasn't been written the jnewblk must be passed
8269 	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
8270 	 * this by leaving the journal dependency on the newblk to be freed
8271 	 * when a freework is created in handle_workitem_freeblocks().
8272 	 */
8273 	cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
8274 	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
8275 }
8276 
8277 /*
8278  * Create the mkdir dependencies for . and .. in a new directory.  Link them
8279  * in to a newdirblk so any subsequent additions are tracked properly.  The
8280  * caller is responsible for adding the mkdir1 dependency to the journal
8281  * and updating id_mkdiradd.  This function returns with the per-filesystem
8282  * lock held.
8283  */
8284 static struct mkdir *
8285 setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
8286 	struct diradd *dap;
8287 	ino_t newinum;
8288 	ino_t dinum;
8289 	struct buf *newdirbp;
8290 	struct mkdir **mkdirp;
8291 {
8292 	struct newblk *newblk;
8293 	struct pagedep *pagedep;
8294 	struct inodedep *inodedep;
8295 	struct newdirblk *newdirblk = 0;
8296 	struct mkdir *mkdir1, *mkdir2;
8297 	struct worklist *wk;
8298 	struct jaddref *jaddref;
8299 	struct ufsmount *ump;
8300 	struct mount *mp;
8301 
8302 	mp = dap->da_list.wk_mp;
8303 	ump = VFSTOUFS(mp);
8304 	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
8305 	    M_SOFTDEP_FLAGS);
8306 	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8307 	LIST_INIT(&newdirblk->db_mkdir);
8308 	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8309 	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
8310 	mkdir1->md_state = ATTACHED | MKDIR_BODY;
8311 	mkdir1->md_diradd = dap;
8312 	mkdir1->md_jaddref = NULL;
8313 	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8314 	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
8315 	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
8316 	mkdir2->md_diradd = dap;
8317 	mkdir2->md_jaddref = NULL;
8318 	if (MOUNTEDSUJ(mp) == 0) {
8319 		mkdir1->md_state |= DEPCOMPLETE;
8320 		mkdir2->md_state |= DEPCOMPLETE;
8321 	}
8322 	/*
8323 	 * Dependency on "." and ".." being written to disk.
8324 	 */
8325 	mkdir1->md_buf = newdirbp;
8326 	ACQUIRE_LOCK(VFSTOUFS(mp));
8327 	LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs);
8328 	/*
8329 	 * We must link the pagedep, allocdirect, and newdirblk for
8330 	 * the initial file page so the pointer to the new directory
8331 	 * is not written until the directory contents are live and
8332 	 * any subsequent additions are not marked live until the
8333 	 * block is reachable via the inode.
8334 	 */
8335 	if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
8336 		panic("setup_newdir: lost pagedep");
8337 	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
8338 		if (wk->wk_type == D_ALLOCDIRECT)
8339 			break;
8340 	if (wk == NULL)
8341 		panic("setup_newdir: lost allocdirect");
8342 	if (pagedep->pd_state & NEWBLOCK)
8343 		panic("setup_newdir: NEWBLOCK already set");
8344 	newblk = WK_NEWBLK(wk);
8345 	pagedep->pd_state |= NEWBLOCK;
8346 	pagedep->pd_newdirblk = newdirblk;
8347 	newdirblk->db_pagedep = pagedep;
8348 	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8349 	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
8350 	/*
8351 	 * Look up the inodedep for the parent directory so that we
8352 	 * can link mkdir2 into the pending dotdot jaddref or
8353 	 * the inode write if there is none.  If the inode is
8354 	 * ALLCOMPLETE and no jaddref is present all dependencies have
8355 	 * been satisfied and mkdir2 can be freed.
8356 	 */
8357 	inodedep_lookup(mp, dinum, 0, &inodedep);
8358 	if (MOUNTEDSUJ(mp)) {
8359 		if (inodedep == NULL)
8360 			panic("setup_newdir: Lost parent.");
8361 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8362 		    inoreflst);
8363 		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
8364 		    (jaddref->ja_state & MKDIR_PARENT),
8365 		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
8366 		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8367 		mkdir2->md_jaddref = jaddref;
8368 		jaddref->ja_mkdir = mkdir2;
8369 	} else if (inodedep == NULL ||
8370 	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8371 		dap->da_state &= ~MKDIR_PARENT;
8372 		WORKITEM_FREE(mkdir2, D_MKDIR);
8373 		mkdir2 = NULL;
8374 	} else {
8375 		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8376 		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
8377 	}
8378 	*mkdirp = mkdir2;
8379 
8380 	return (mkdir1);
8381 }
8382 
8383 /*
8384  * Directory entry addition dependencies.
8385  *
8386  * When adding a new directory entry, the inode (with its incremented link
8387  * count) must be written to disk before the directory entry's pointer to it.
8388  * Also, if the inode is newly allocated, the corresponding freemap must be
8389  * updated (on disk) before the directory entry's pointer. These requirements
8390  * are met via undo/redo on the directory entry's pointer, which consists
8391  * simply of the inode number.
8392  *
8393  * As directory entries are added and deleted, the free space within a
8394  * directory block can become fragmented.  The ufs filesystem will compact
8395  * a fragmented directory block to make space for a new entry. When this
8396  * occurs, the offsets of previously added entries change. Any "diradd"
8397  * dependency structures corresponding to these entries must be updated with
8398  * the new offsets.
8399  */
8400 
8401 /*
8402  * This routine is called after the in-memory inode's link
8403  * count has been incremented, but before the directory entry's
8404  * pointer to the inode has been set.
8405  */
8406 int
8407 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
8408 	struct buf *bp;		/* buffer containing directory block */
8409 	struct inode *dp;	/* inode for directory */
8410 	off_t diroffset;	/* offset of new entry in directory */
8411 	ino_t newinum;		/* inode referenced by new directory entry */
8412 	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
8413 	int isnewblk;		/* entry is in a newly allocated block */
8414 {
8415 	int offset;		/* offset of new entry within directory block */
8416 	ufs_lbn_t lbn;		/* block in directory containing new entry */
8417 	struct fs *fs;
8418 	struct diradd *dap;
8419 	struct newblk *newblk;
8420 	struct pagedep *pagedep;
8421 	struct inodedep *inodedep;
8422 	struct newdirblk *newdirblk = 0;
8423 	struct mkdir *mkdir1, *mkdir2;
8424 	struct jaddref *jaddref;
8425 	struct ufsmount *ump;
8426 	struct mount *mp;
8427 	int isindir;
8428 
8429 	ump = dp->i_ump;
8430 	mp = UFSTOVFS(ump);
8431 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8432 	    ("softdep_setup_directory_add called on non-softdep filesystem"));
8433 	/*
8434 	 * Whiteouts have no dependencies.
8435 	 */
8436 	if (newinum == WINO) {
8437 		if (newdirbp != NULL)
8438 			bdwrite(newdirbp);
8439 		return (0);
8440 	}
8441 	jaddref = NULL;
8442 	mkdir1 = mkdir2 = NULL;
8443 	fs = dp->i_fs;
8444 	lbn = lblkno(fs, diroffset);
8445 	offset = blkoff(fs, diroffset);
8446 	dap = malloc(sizeof(struct diradd), M_DIRADD,
8447 		M_SOFTDEP_FLAGS|M_ZERO);
8448 	workitem_alloc(&dap->da_list, D_DIRADD, mp);
8449 	dap->da_offset = offset;
8450 	dap->da_newinum = newinum;
8451 	dap->da_state = ATTACHED;
8452 	LIST_INIT(&dap->da_jwork);
8453 	isindir = bp->b_lblkno >= NDADDR;
8454 	if (isnewblk &&
8455 	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8456 		newdirblk = malloc(sizeof(struct newdirblk),
8457 		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8458 		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8459 		LIST_INIT(&newdirblk->db_mkdir);
8460 	}
8461 	/*
8462 	 * If we're creating a new directory setup the dependencies and set
8463 	 * the dap state to wait for them.  Otherwise it's COMPLETE and
8464 	 * we can move on.
8465 	 */
8466 	if (newdirbp == NULL) {
8467 		dap->da_state |= DEPCOMPLETE;
8468 		ACQUIRE_LOCK(ump);
8469 	} else {
8470 		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8471 		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8472 		    &mkdir2);
8473 	}
8474 	/*
8475 	 * Link into parent directory pagedep to await its being written.
8476 	 */
8477 	pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8478 #ifdef DEBUG
8479 	if (diradd_lookup(pagedep, offset) != NULL)
8480 		panic("softdep_setup_directory_add: %p already at off %d\n",
8481 		    diradd_lookup(pagedep, offset), offset);
8482 #endif
8483 	dap->da_pagedep = pagedep;
8484 	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8485 	    da_pdlist);
8486 	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
8487 	/*
8488 	 * If we're journaling, link the diradd into the jaddref so it
8489 	 * may be completed after the journal entry is written.  Otherwise,
8490 	 * link the diradd into its inodedep.  If the inode is not yet
8491 	 * written place it on the bufwait list, otherwise do the post-inode
8492 	 * write processing to put it on the id_pendinghd list.
8493 	 */
8494 	if (MOUNTEDSUJ(mp)) {
8495 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8496 		    inoreflst);
8497 		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8498 		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8499 		jaddref->ja_diroff = diroffset;
8500 		jaddref->ja_diradd = dap;
8501 		add_to_journal(&jaddref->ja_list);
8502 	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8503 		diradd_inode_written(dap, inodedep);
8504 	else
8505 		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8506 	/*
8507 	 * Add the journal entries for . and .. links now that the primary
8508 	 * link is written.
8509 	 */
8510 	if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
8511 		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8512 		    inoreflst, if_deps);
8513 		KASSERT(jaddref != NULL &&
8514 		    jaddref->ja_ino == jaddref->ja_parent &&
8515 		    (jaddref->ja_state & MKDIR_BODY),
8516 		    ("softdep_setup_directory_add: bad dot jaddref %p",
8517 		    jaddref));
8518 		mkdir1->md_jaddref = jaddref;
8519 		jaddref->ja_mkdir = mkdir1;
8520 		/*
8521 		 * It is important that the dotdot journal entry
8522 		 * is added prior to the dot entry since dot writes
8523 		 * both the dot and dotdot links.  These both must
8524 		 * be added after the primary link for the journal
8525 		 * to remain consistent.
8526 		 */
8527 		add_to_journal(&mkdir2->md_jaddref->ja_list);
8528 		add_to_journal(&jaddref->ja_list);
8529 	}
8530 	/*
8531 	 * If we are adding a new directory remember this diradd so that if
8532 	 * we rename it we can keep the dot and dotdot dependencies.  If
8533 	 * we are adding a new name for an inode that has a mkdiradd we
8534 	 * must be in rename and we have to move the dot and dotdot
8535 	 * dependencies to this new name.  The old name is being orphaned
8536 	 * soon.
8537 	 */
8538 	if (mkdir1 != NULL) {
8539 		if (inodedep->id_mkdiradd != NULL)
8540 			panic("softdep_setup_directory_add: Existing mkdir");
8541 		inodedep->id_mkdiradd = dap;
8542 	} else if (inodedep->id_mkdiradd)
8543 		merge_diradd(inodedep, dap);
8544 	if (newdirblk) {
8545 		/*
8546 		 * There is nothing to do if we are already tracking
8547 		 * this block.
8548 		 */
8549 		if ((pagedep->pd_state & NEWBLOCK) != 0) {
8550 			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8551 			FREE_LOCK(ump);
8552 			return (0);
8553 		}
8554 		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8555 		    == 0)
8556 			panic("softdep_setup_directory_add: lost entry");
8557 		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8558 		pagedep->pd_state |= NEWBLOCK;
8559 		pagedep->pd_newdirblk = newdirblk;
8560 		newdirblk->db_pagedep = pagedep;
8561 		FREE_LOCK(ump);
8562 		/*
8563 		 * If we extended into an indirect signal direnter to sync.
8564 		 */
8565 		if (isindir)
8566 			return (1);
8567 		return (0);
8568 	}
8569 	FREE_LOCK(ump);
8570 	return (0);
8571 }
8572 
8573 /*
8574  * This procedure is called to change the offset of a directory
8575  * entry when compacting a directory block which must be owned
8576  * exclusively by the caller. Note that the actual entry movement
8577  * must be done in this procedure to ensure that no I/O completions
8578  * occur while the move is in progress.
8579  */
8580 void
8581 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
8582 	struct buf *bp;		/* Buffer holding directory block. */
8583 	struct inode *dp;	/* inode for directory */
8584 	caddr_t base;		/* address of dp->i_offset */
8585 	caddr_t oldloc;		/* address of old directory location */
8586 	caddr_t newloc;		/* address of new directory location */
8587 	int entrysize;		/* size of directory entry */
8588 {
8589 	int offset, oldoffset, newoffset;
8590 	struct pagedep *pagedep;
8591 	struct jmvref *jmvref;
8592 	struct diradd *dap;
8593 	struct direct *de;
8594 	struct mount *mp;
8595 	ufs_lbn_t lbn;
8596 	int flags;
8597 
8598 	mp = UFSTOVFS(dp->i_ump);
8599 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8600 	    ("softdep_change_directoryentry_offset called on "
8601 	     "non-softdep filesystem"));
8602 	de = (struct direct *)oldloc;
8603 	jmvref = NULL;
8604 	flags = 0;
8605 	/*
8606 	 * Moves are always journaled as it would be too complex to
8607 	 * determine if any affected adds or removes are present in the
8608 	 * journal.
8609 	 */
8610 	if (MOUNTEDSUJ(mp)) {
8611 		flags = DEPALLOC;
8612 		jmvref = newjmvref(dp, de->d_ino,
8613 		    dp->i_offset + (oldloc - base),
8614 		    dp->i_offset + (newloc - base));
8615 	}
8616 	lbn = lblkno(dp->i_fs, dp->i_offset);
8617 	offset = blkoff(dp->i_fs, dp->i_offset);
8618 	oldoffset = offset + (oldloc - base);
8619 	newoffset = offset + (newloc - base);
8620 	ACQUIRE_LOCK(dp->i_ump);
8621 	if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
8622 		goto done;
8623 	dap = diradd_lookup(pagedep, oldoffset);
8624 	if (dap) {
8625 		dap->da_offset = newoffset;
8626 		newoffset = DIRADDHASH(newoffset);
8627 		oldoffset = DIRADDHASH(oldoffset);
8628 		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
8629 		    newoffset != oldoffset) {
8630 			LIST_REMOVE(dap, da_pdlist);
8631 			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
8632 			    dap, da_pdlist);
8633 		}
8634 	}
8635 done:
8636 	if (jmvref) {
8637 		jmvref->jm_pagedep = pagedep;
8638 		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8639 		add_to_journal(&jmvref->jm_list);
8640 	}
8641 	bcopy(oldloc, newloc, entrysize);
8642 	FREE_LOCK(dp->i_ump);
8643 }
8644 
8645 /*
8646  * Move the mkdir dependencies and journal work from one diradd to another
8647  * when renaming a directory.  The new name must depend on the mkdir deps
8648  * completing as the old name did.  Directories can only have one valid link
8649  * at a time so one must be canonical.
8650  */
8651 static void
8652 merge_diradd(inodedep, newdap)
8653 	struct inodedep *inodedep;
8654 	struct diradd *newdap;
8655 {
8656 	struct diradd *olddap;
8657 	struct mkdir *mkdir, *nextmd;
8658 	struct ufsmount *ump;
8659 	short state;
8660 
8661 	olddap = inodedep->id_mkdiradd;
8662 	inodedep->id_mkdiradd = newdap;
8663 	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8664 		newdap->da_state &= ~DEPCOMPLETE;
8665 		ump = VFSTOUFS(inodedep->id_list.wk_mp);
8666 		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8667 		     mkdir = nextmd) {
8668 			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8669 			if (mkdir->md_diradd != olddap)
8670 				continue;
8671 			mkdir->md_diradd = newdap;
8672 			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8673 			newdap->da_state |= state;
8674 			olddap->da_state &= ~state;
8675 			if ((olddap->da_state &
8676 			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
8677 				break;
8678 		}
8679 		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8680 			panic("merge_diradd: unfound ref");
8681 	}
8682 	/*
8683 	 * Any mkdir related journal items are not safe to be freed until
8684 	 * the new name is stable.
8685 	 */
8686 	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8687 	olddap->da_state |= DEPCOMPLETE;
8688 	complete_diradd(olddap);
8689 }
8690 
8691 /*
8692  * Move the diradd to the pending list when all diradd dependencies are
8693  * complete.
8694  */
8695 static void
8696 complete_diradd(dap)
8697 	struct diradd *dap;
8698 {
8699 	struct pagedep *pagedep;
8700 
8701 	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8702 		if (dap->da_state & DIRCHG)
8703 			pagedep = dap->da_previous->dm_pagedep;
8704 		else
8705 			pagedep = dap->da_pagedep;
8706 		LIST_REMOVE(dap, da_pdlist);
8707 		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8708 	}
8709 }
8710 
8711 /*
8712  * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
8713  * add entries and conditonally journal the remove.
8714  */
8715 static void
8716 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
8717 	struct diradd *dap;
8718 	struct dirrem *dirrem;
8719 	struct jremref *jremref;
8720 	struct jremref *dotremref;
8721 	struct jremref *dotdotremref;
8722 {
8723 	struct inodedep *inodedep;
8724 	struct jaddref *jaddref;
8725 	struct inoref *inoref;
8726 	struct ufsmount *ump;
8727 	struct mkdir *mkdir;
8728 
8729 	/*
8730 	 * If no remove references were allocated we're on a non-journaled
8731 	 * filesystem and can skip the cancel step.
8732 	 */
8733 	if (jremref == NULL) {
8734 		free_diradd(dap, NULL);
8735 		return;
8736 	}
8737 	/*
8738 	 * Cancel the primary name an free it if it does not require
8739 	 * journaling.
8740 	 */
8741 	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
8742 	    0, &inodedep) != 0) {
8743 		/* Abort the addref that reference this diradd.  */
8744 		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
8745 			if (inoref->if_list.wk_type != D_JADDREF)
8746 				continue;
8747 			jaddref = (struct jaddref *)inoref;
8748 			if (jaddref->ja_diradd != dap)
8749 				continue;
8750 			if (cancel_jaddref(jaddref, inodedep,
8751 			    &dirrem->dm_jwork) == 0) {
8752 				free_jremref(jremref);
8753 				jremref = NULL;
8754 			}
8755 			break;
8756 		}
8757 	}
8758 	/*
8759 	 * Cancel subordinate names and free them if they do not require
8760 	 * journaling.
8761 	 */
8762 	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8763 		ump = VFSTOUFS(dap->da_list.wk_mp);
8764 		LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) {
8765 			if (mkdir->md_diradd != dap)
8766 				continue;
8767 			if ((jaddref = mkdir->md_jaddref) == NULL)
8768 				continue;
8769 			mkdir->md_jaddref = NULL;
8770 			if (mkdir->md_state & MKDIR_PARENT) {
8771 				if (cancel_jaddref(jaddref, NULL,
8772 				    &dirrem->dm_jwork) == 0) {
8773 					free_jremref(dotdotremref);
8774 					dotdotremref = NULL;
8775 				}
8776 			} else {
8777 				if (cancel_jaddref(jaddref, inodedep,
8778 				    &dirrem->dm_jwork) == 0) {
8779 					free_jremref(dotremref);
8780 					dotremref = NULL;
8781 				}
8782 			}
8783 		}
8784 	}
8785 
8786 	if (jremref)
8787 		journal_jremref(dirrem, jremref, inodedep);
8788 	if (dotremref)
8789 		journal_jremref(dirrem, dotremref, inodedep);
8790 	if (dotdotremref)
8791 		journal_jremref(dirrem, dotdotremref, NULL);
8792 	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
8793 	free_diradd(dap, &dirrem->dm_jwork);
8794 }
8795 
8796 /*
8797  * Free a diradd dependency structure. This routine must be called
8798  * with splbio interrupts blocked.
8799  */
8800 static void
8801 free_diradd(dap, wkhd)
8802 	struct diradd *dap;
8803 	struct workhead *wkhd;
8804 {
8805 	struct dirrem *dirrem;
8806 	struct pagedep *pagedep;
8807 	struct inodedep *inodedep;
8808 	struct mkdir *mkdir, *nextmd;
8809 	struct ufsmount *ump;
8810 
8811 	ump = VFSTOUFS(dap->da_list.wk_mp);
8812 	LOCK_OWNED(ump);
8813 	LIST_REMOVE(dap, da_pdlist);
8814 	if (dap->da_state & ONWORKLIST)
8815 		WORKLIST_REMOVE(&dap->da_list);
8816 	if ((dap->da_state & DIRCHG) == 0) {
8817 		pagedep = dap->da_pagedep;
8818 	} else {
8819 		dirrem = dap->da_previous;
8820 		pagedep = dirrem->dm_pagedep;
8821 		dirrem->dm_dirinum = pagedep->pd_ino;
8822 		dirrem->dm_state |= COMPLETE;
8823 		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8824 			add_to_worklist(&dirrem->dm_list, 0);
8825 	}
8826 	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
8827 	    0, &inodedep) != 0)
8828 		if (inodedep->id_mkdiradd == dap)
8829 			inodedep->id_mkdiradd = NULL;
8830 	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8831 		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8832 		     mkdir = nextmd) {
8833 			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8834 			if (mkdir->md_diradd != dap)
8835 				continue;
8836 			dap->da_state &=
8837 			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
8838 			LIST_REMOVE(mkdir, md_mkdirs);
8839 			if (mkdir->md_state & ONWORKLIST)
8840 				WORKLIST_REMOVE(&mkdir->md_list);
8841 			if (mkdir->md_jaddref != NULL)
8842 				panic("free_diradd: Unexpected jaddref");
8843 			WORKITEM_FREE(mkdir, D_MKDIR);
8844 			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
8845 				break;
8846 		}
8847 		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8848 			panic("free_diradd: unfound ref");
8849 	}
8850 	if (inodedep)
8851 		free_inodedep(inodedep);
8852 	/*
8853 	 * Free any journal segments waiting for the directory write.
8854 	 */
8855 	handle_jwork(&dap->da_jwork);
8856 	WORKITEM_FREE(dap, D_DIRADD);
8857 }
8858 
8859 /*
8860  * Directory entry removal dependencies.
8861  *
8862  * When removing a directory entry, the entry's inode pointer must be
8863  * zero'ed on disk before the corresponding inode's link count is decremented
8864  * (possibly freeing the inode for re-use). This dependency is handled by
8865  * updating the directory entry but delaying the inode count reduction until
8866  * after the directory block has been written to disk. After this point, the
8867  * inode count can be decremented whenever it is convenient.
8868  */
8869 
8870 /*
8871  * This routine should be called immediately after removing
8872  * a directory entry.  The inode's link count should not be
8873  * decremented by the calling procedure -- the soft updates
8874  * code will do this task when it is safe.
8875  */
8876 void
8877 softdep_setup_remove(bp, dp, ip, isrmdir)
8878 	struct buf *bp;		/* buffer containing directory block */
8879 	struct inode *dp;	/* inode for the directory being modified */
8880 	struct inode *ip;	/* inode for directory entry being removed */
8881 	int isrmdir;		/* indicates if doing RMDIR */
8882 {
8883 	struct dirrem *dirrem, *prevdirrem;
8884 	struct inodedep *inodedep;
8885 	int direct;
8886 
8887 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
8888 	    ("softdep_setup_remove called on non-softdep filesystem"));
8889 	/*
8890 	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
8891 	 * newdirrem() to setup the full directory remove which requires
8892 	 * isrmdir > 1.
8893 	 */
8894 	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8895 	/*
8896 	 * Add the dirrem to the inodedep's pending remove list for quick
8897 	 * discovery later.
8898 	 */
8899 	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8900 	    &inodedep) == 0)
8901 		panic("softdep_setup_remove: Lost inodedep.");
8902 	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
8903 	dirrem->dm_state |= ONDEPLIST;
8904 	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8905 
8906 	/*
8907 	 * If the COMPLETE flag is clear, then there were no active
8908 	 * entries and we want to roll back to a zeroed entry until
8909 	 * the new inode is committed to disk. If the COMPLETE flag is
8910 	 * set then we have deleted an entry that never made it to
8911 	 * disk. If the entry we deleted resulted from a name change,
8912 	 * then the old name still resides on disk. We cannot delete
8913 	 * its inode (returned to us in prevdirrem) until the zeroed
8914 	 * directory entry gets to disk. The new inode has never been
8915 	 * referenced on the disk, so can be deleted immediately.
8916 	 */
8917 	if ((dirrem->dm_state & COMPLETE) == 0) {
8918 		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
8919 		    dm_next);
8920 		FREE_LOCK(ip->i_ump);
8921 	} else {
8922 		if (prevdirrem != NULL)
8923 			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
8924 			    prevdirrem, dm_next);
8925 		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
8926 		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
8927 		FREE_LOCK(ip->i_ump);
8928 		if (direct)
8929 			handle_workitem_remove(dirrem, 0);
8930 	}
8931 }
8932 
8933 /*
8934  * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
8935  * pd_pendinghd list of a pagedep.
8936  */
8937 static struct diradd *
8938 diradd_lookup(pagedep, offset)
8939 	struct pagedep *pagedep;
8940 	int offset;
8941 {
8942 	struct diradd *dap;
8943 
8944 	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
8945 		if (dap->da_offset == offset)
8946 			return (dap);
8947 	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
8948 		if (dap->da_offset == offset)
8949 			return (dap);
8950 	return (NULL);
8951 }
8952 
8953 /*
8954  * Search for a .. diradd dependency in a directory that is being removed.
8955  * If the directory was renamed to a new parent we have a diradd rather
8956  * than a mkdir for the .. entry.  We need to cancel it now before
8957  * it is found in truncate().
8958  */
8959 static struct jremref *
8960 cancel_diradd_dotdot(ip, dirrem, jremref)
8961 	struct inode *ip;
8962 	struct dirrem *dirrem;
8963 	struct jremref *jremref;
8964 {
8965 	struct pagedep *pagedep;
8966 	struct diradd *dap;
8967 	struct worklist *wk;
8968 
8969 	if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0,
8970 	    &pagedep) == 0)
8971 		return (jremref);
8972 	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
8973 	if (dap == NULL)
8974 		return (jremref);
8975 	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
8976 	/*
8977 	 * Mark any journal work as belonging to the parent so it is freed
8978 	 * with the .. reference.
8979 	 */
8980 	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
8981 		wk->wk_state |= MKDIR_PARENT;
8982 	return (NULL);
8983 }
8984 
8985 /*
8986  * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
8987  * replace it with a dirrem/diradd pair as a result of re-parenting a
8988  * directory.  This ensures that we don't simultaneously have a mkdir and
8989  * a diradd for the same .. entry.
8990  */
8991 static struct jremref *
8992 cancel_mkdir_dotdot(ip, dirrem, jremref)
8993 	struct inode *ip;
8994 	struct dirrem *dirrem;
8995 	struct jremref *jremref;
8996 {
8997 	struct inodedep *inodedep;
8998 	struct jaddref *jaddref;
8999 	struct ufsmount *ump;
9000 	struct mkdir *mkdir;
9001 	struct diradd *dap;
9002 
9003 	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
9004 	    &inodedep) == 0)
9005 		return (jremref);
9006 	dap = inodedep->id_mkdiradd;
9007 	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
9008 		return (jremref);
9009 	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9010 	for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
9011 	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
9012 		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
9013 			break;
9014 	if (mkdir == NULL)
9015 		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
9016 	if ((jaddref = mkdir->md_jaddref) != NULL) {
9017 		mkdir->md_jaddref = NULL;
9018 		jaddref->ja_state &= ~MKDIR_PARENT;
9019 		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
9020 		    &inodedep) == 0)
9021 			panic("cancel_mkdir_dotdot: Lost parent inodedep");
9022 		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
9023 			journal_jremref(dirrem, jremref, inodedep);
9024 			jremref = NULL;
9025 		}
9026 	}
9027 	if (mkdir->md_state & ONWORKLIST)
9028 		WORKLIST_REMOVE(&mkdir->md_list);
9029 	mkdir->md_state |= ALLCOMPLETE;
9030 	complete_mkdir(mkdir);
9031 	return (jremref);
9032 }
9033 
9034 static void
9035 journal_jremref(dirrem, jremref, inodedep)
9036 	struct dirrem *dirrem;
9037 	struct jremref *jremref;
9038 	struct inodedep *inodedep;
9039 {
9040 
9041 	if (inodedep == NULL)
9042 		if (inodedep_lookup(jremref->jr_list.wk_mp,
9043 		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
9044 			panic("journal_jremref: Lost inodedep");
9045 	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
9046 	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
9047 	add_to_journal(&jremref->jr_list);
9048 }
9049 
9050 static void
9051 dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
9052 	struct dirrem *dirrem;
9053 	struct jremref *jremref;
9054 	struct jremref *dotremref;
9055 	struct jremref *dotdotremref;
9056 {
9057 	struct inodedep *inodedep;
9058 
9059 
9060 	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
9061 	    &inodedep) == 0)
9062 		panic("dirrem_journal: Lost inodedep");
9063 	journal_jremref(dirrem, jremref, inodedep);
9064 	if (dotremref)
9065 		journal_jremref(dirrem, dotremref, inodedep);
9066 	if (dotdotremref)
9067 		journal_jremref(dirrem, dotdotremref, NULL);
9068 }
9069 
9070 /*
9071  * Allocate a new dirrem if appropriate and return it along with
9072  * its associated pagedep. Called without a lock, returns with lock.
9073  */
9074 static struct dirrem *
9075 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
9076 	struct buf *bp;		/* buffer containing directory block */
9077 	struct inode *dp;	/* inode for the directory being modified */
9078 	struct inode *ip;	/* inode for directory entry being removed */
9079 	int isrmdir;		/* indicates if doing RMDIR */
9080 	struct dirrem **prevdirremp; /* previously referenced inode, if any */
9081 {
9082 	int offset;
9083 	ufs_lbn_t lbn;
9084 	struct diradd *dap;
9085 	struct dirrem *dirrem;
9086 	struct pagedep *pagedep;
9087 	struct jremref *jremref;
9088 	struct jremref *dotremref;
9089 	struct jremref *dotdotremref;
9090 	struct vnode *dvp;
9091 
9092 	/*
9093 	 * Whiteouts have no deletion dependencies.
9094 	 */
9095 	if (ip == NULL)
9096 		panic("newdirrem: whiteout");
9097 	dvp = ITOV(dp);
9098 	/*
9099 	 * If the system is over its limit and our filesystem is
9100 	 * responsible for more than our share of that usage and
9101 	 * we are not a snapshot, request some inodedep cleanup.
9102 	 * Limiting the number of dirrem structures will also limit
9103 	 * the number of freefile and freeblks structures.
9104 	 */
9105 	ACQUIRE_LOCK(ip->i_ump);
9106 	while (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2 &&
9107 	    ip->i_ump->softdep_curdeps[D_DIRREM] >
9108 	    (max_softdeps / 2) / stat_flush_threads)
9109 		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS);
9110 	FREE_LOCK(ip->i_ump);
9111 	dirrem = malloc(sizeof(struct dirrem),
9112 		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
9113 	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
9114 	LIST_INIT(&dirrem->dm_jremrefhd);
9115 	LIST_INIT(&dirrem->dm_jwork);
9116 	dirrem->dm_state = isrmdir ? RMDIR : 0;
9117 	dirrem->dm_oldinum = ip->i_number;
9118 	*prevdirremp = NULL;
9119 	/*
9120 	 * Allocate remove reference structures to track journal write
9121 	 * dependencies.  We will always have one for the link and
9122 	 * when doing directories we will always have one more for dot.
9123 	 * When renaming a directory we skip the dotdot link change so
9124 	 * this is not needed.
9125 	 */
9126 	jremref = dotremref = dotdotremref = NULL;
9127 	if (DOINGSUJ(dvp)) {
9128 		if (isrmdir) {
9129 			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
9130 			    ip->i_effnlink + 2);
9131 			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
9132 			    ip->i_effnlink + 1);
9133 			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
9134 			    dp->i_effnlink + 1);
9135 			dotdotremref->jr_state |= MKDIR_PARENT;
9136 		} else
9137 			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
9138 			    ip->i_effnlink + 1);
9139 	}
9140 	ACQUIRE_LOCK(ip->i_ump);
9141 	lbn = lblkno(dp->i_fs, dp->i_offset);
9142 	offset = blkoff(dp->i_fs, dp->i_offset);
9143 	pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC,
9144 	    &pagedep);
9145 	dirrem->dm_pagedep = pagedep;
9146 	dirrem->dm_offset = offset;
9147 	/*
9148 	 * If we're renaming a .. link to a new directory, cancel any
9149 	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
9150 	 * the jremref is preserved for any potential diradd in this
9151 	 * location.  This can not coincide with a rmdir.
9152 	 */
9153 	if (dp->i_offset == DOTDOT_OFFSET) {
9154 		if (isrmdir)
9155 			panic("newdirrem: .. directory change during remove?");
9156 		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
9157 	}
9158 	/*
9159 	 * If we're removing a directory search for the .. dependency now and
9160 	 * cancel it.  Any pending journal work will be added to the dirrem
9161 	 * to be completed when the workitem remove completes.
9162 	 */
9163 	if (isrmdir)
9164 		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
9165 	/*
9166 	 * Check for a diradd dependency for the same directory entry.
9167 	 * If present, then both dependencies become obsolete and can
9168 	 * be de-allocated.
9169 	 */
9170 	dap = diradd_lookup(pagedep, offset);
9171 	if (dap == NULL) {
9172 		/*
9173 		 * Link the jremref structures into the dirrem so they are
9174 		 * written prior to the pagedep.
9175 		 */
9176 		if (jremref)
9177 			dirrem_journal(dirrem, jremref, dotremref,
9178 			    dotdotremref);
9179 		return (dirrem);
9180 	}
9181 	/*
9182 	 * Must be ATTACHED at this point.
9183 	 */
9184 	if ((dap->da_state & ATTACHED) == 0)
9185 		panic("newdirrem: not ATTACHED");
9186 	if (dap->da_newinum != ip->i_number)
9187 		panic("newdirrem: inum %ju should be %ju",
9188 		    (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
9189 	/*
9190 	 * If we are deleting a changed name that never made it to disk,
9191 	 * then return the dirrem describing the previous inode (which
9192 	 * represents the inode currently referenced from this entry on disk).
9193 	 */
9194 	if ((dap->da_state & DIRCHG) != 0) {
9195 		*prevdirremp = dap->da_previous;
9196 		dap->da_state &= ~DIRCHG;
9197 		dap->da_pagedep = pagedep;
9198 	}
9199 	/*
9200 	 * We are deleting an entry that never made it to disk.
9201 	 * Mark it COMPLETE so we can delete its inode immediately.
9202 	 */
9203 	dirrem->dm_state |= COMPLETE;
9204 	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
9205 #ifdef SUJ_DEBUG
9206 	if (isrmdir == 0) {
9207 		struct worklist *wk;
9208 
9209 		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9210 			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
9211 				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
9212 	}
9213 #endif
9214 
9215 	return (dirrem);
9216 }
9217 
9218 /*
9219  * Directory entry change dependencies.
9220  *
9221  * Changing an existing directory entry requires that an add operation
9222  * be completed first followed by a deletion. The semantics for the addition
9223  * are identical to the description of adding a new entry above except
9224  * that the rollback is to the old inode number rather than zero. Once
9225  * the addition dependency is completed, the removal is done as described
9226  * in the removal routine above.
9227  */
9228 
9229 /*
9230  * This routine should be called immediately after changing
9231  * a directory entry.  The inode's link count should not be
9232  * decremented by the calling procedure -- the soft updates
9233  * code will perform this task when it is safe.
9234  */
9235 void
9236 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
9237 	struct buf *bp;		/* buffer containing directory block */
9238 	struct inode *dp;	/* inode for the directory being modified */
9239 	struct inode *ip;	/* inode for directory entry being removed */
9240 	ino_t newinum;		/* new inode number for changed entry */
9241 	int isrmdir;		/* indicates if doing RMDIR */
9242 {
9243 	int offset;
9244 	struct diradd *dap = NULL;
9245 	struct dirrem *dirrem, *prevdirrem;
9246 	struct pagedep *pagedep;
9247 	struct inodedep *inodedep;
9248 	struct jaddref *jaddref;
9249 	struct mount *mp;
9250 
9251 	offset = blkoff(dp->i_fs, dp->i_offset);
9252 	mp = UFSTOVFS(dp->i_ump);
9253 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
9254 	   ("softdep_setup_directory_change called on non-softdep filesystem"));
9255 
9256 	/*
9257 	 * Whiteouts do not need diradd dependencies.
9258 	 */
9259 	if (newinum != WINO) {
9260 		dap = malloc(sizeof(struct diradd),
9261 		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
9262 		workitem_alloc(&dap->da_list, D_DIRADD, mp);
9263 		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
9264 		dap->da_offset = offset;
9265 		dap->da_newinum = newinum;
9266 		LIST_INIT(&dap->da_jwork);
9267 	}
9268 
9269 	/*
9270 	 * Allocate a new dirrem and ACQUIRE_LOCK.
9271 	 */
9272 	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9273 	pagedep = dirrem->dm_pagedep;
9274 	/*
9275 	 * The possible values for isrmdir:
9276 	 *	0 - non-directory file rename
9277 	 *	1 - directory rename within same directory
9278 	 *   inum - directory rename to new directory of given inode number
9279 	 * When renaming to a new directory, we are both deleting and
9280 	 * creating a new directory entry, so the link count on the new
9281 	 * directory should not change. Thus we do not need the followup
9282 	 * dirrem which is usually done in handle_workitem_remove. We set
9283 	 * the DIRCHG flag to tell handle_workitem_remove to skip the
9284 	 * followup dirrem.
9285 	 */
9286 	if (isrmdir > 1)
9287 		dirrem->dm_state |= DIRCHG;
9288 
9289 	/*
9290 	 * Whiteouts have no additional dependencies,
9291 	 * so just put the dirrem on the correct list.
9292 	 */
9293 	if (newinum == WINO) {
9294 		if ((dirrem->dm_state & COMPLETE) == 0) {
9295 			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
9296 			    dm_next);
9297 		} else {
9298 			dirrem->dm_dirinum = pagedep->pd_ino;
9299 			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9300 				add_to_worklist(&dirrem->dm_list, 0);
9301 		}
9302 		FREE_LOCK(dp->i_ump);
9303 		return;
9304 	}
9305 	/*
9306 	 * Add the dirrem to the inodedep's pending remove list for quick
9307 	 * discovery later.  A valid nlinkdelta ensures that this lookup
9308 	 * will not fail.
9309 	 */
9310 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9311 		panic("softdep_setup_directory_change: Lost inodedep.");
9312 	dirrem->dm_state |= ONDEPLIST;
9313 	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9314 
9315 	/*
9316 	 * If the COMPLETE flag is clear, then there were no active
9317 	 * entries and we want to roll back to the previous inode until
9318 	 * the new inode is committed to disk. If the COMPLETE flag is
9319 	 * set, then we have deleted an entry that never made it to disk.
9320 	 * If the entry we deleted resulted from a name change, then the old
9321 	 * inode reference still resides on disk. Any rollback that we do
9322 	 * needs to be to that old inode (returned to us in prevdirrem). If
9323 	 * the entry we deleted resulted from a create, then there is
9324 	 * no entry on the disk, so we want to roll back to zero rather
9325 	 * than the uncommitted inode. In either of the COMPLETE cases we
9326 	 * want to immediately free the unwritten and unreferenced inode.
9327 	 */
9328 	if ((dirrem->dm_state & COMPLETE) == 0) {
9329 		dap->da_previous = dirrem;
9330 	} else {
9331 		if (prevdirrem != NULL) {
9332 			dap->da_previous = prevdirrem;
9333 		} else {
9334 			dap->da_state &= ~DIRCHG;
9335 			dap->da_pagedep = pagedep;
9336 		}
9337 		dirrem->dm_dirinum = pagedep->pd_ino;
9338 		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9339 			add_to_worklist(&dirrem->dm_list, 0);
9340 	}
9341 	/*
9342 	 * Lookup the jaddref for this journal entry.  We must finish
9343 	 * initializing it and make the diradd write dependent on it.
9344 	 * If we're not journaling, put it on the id_bufwait list if the
9345 	 * inode is not yet written. If it is written, do the post-inode
9346 	 * write processing to put it on the id_pendinghd list.
9347 	 */
9348 	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
9349 	if (MOUNTEDSUJ(mp)) {
9350 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
9351 		    inoreflst);
9352 		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
9353 		    ("softdep_setup_directory_change: bad jaddref %p",
9354 		    jaddref));
9355 		jaddref->ja_diroff = dp->i_offset;
9356 		jaddref->ja_diradd = dap;
9357 		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9358 		    dap, da_pdlist);
9359 		add_to_journal(&jaddref->ja_list);
9360 	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
9361 		dap->da_state |= COMPLETE;
9362 		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9363 		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9364 	} else {
9365 		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9366 		    dap, da_pdlist);
9367 		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
9368 	}
9369 	/*
9370 	 * If we're making a new name for a directory that has not been
9371 	 * committed when need to move the dot and dotdot references to
9372 	 * this new name.
9373 	 */
9374 	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
9375 		merge_diradd(inodedep, dap);
9376 	FREE_LOCK(dp->i_ump);
9377 }
9378 
9379 /*
9380  * Called whenever the link count on an inode is changed.
9381  * It creates an inode dependency so that the new reference(s)
9382  * to the inode cannot be committed to disk until the updated
9383  * inode has been written.
9384  */
9385 void
9386 softdep_change_linkcnt(ip)
9387 	struct inode *ip;	/* the inode with the increased link count */
9388 {
9389 	struct inodedep *inodedep;
9390 	int dflags;
9391 
9392 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
9393 	    ("softdep_change_linkcnt called on non-softdep filesystem"));
9394 	ACQUIRE_LOCK(ip->i_ump);
9395 	dflags = DEPALLOC;
9396 	if (IS_SNAPSHOT(ip))
9397 		dflags |= NODELAY;
9398 	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
9399 	if (ip->i_nlink < ip->i_effnlink)
9400 		panic("softdep_change_linkcnt: bad delta");
9401 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9402 	FREE_LOCK(ip->i_ump);
9403 }
9404 
9405 /*
9406  * Attach a sbdep dependency to the superblock buf so that we can keep
9407  * track of the head of the linked list of referenced but unlinked inodes.
9408  */
9409 void
9410 softdep_setup_sbupdate(ump, fs, bp)
9411 	struct ufsmount *ump;
9412 	struct fs *fs;
9413 	struct buf *bp;
9414 {
9415 	struct sbdep *sbdep;
9416 	struct worklist *wk;
9417 
9418 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9419 	    ("softdep_setup_sbupdate called on non-softdep filesystem"));
9420 	LIST_FOREACH(wk, &bp->b_dep, wk_list)
9421 		if (wk->wk_type == D_SBDEP)
9422 			break;
9423 	if (wk != NULL)
9424 		return;
9425 	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
9426 	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9427 	sbdep->sb_fs = fs;
9428 	sbdep->sb_ump = ump;
9429 	ACQUIRE_LOCK(ump);
9430 	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9431 	FREE_LOCK(ump);
9432 }
9433 
9434 /*
9435  * Return the first unlinked inodedep which is ready to be the head of the
9436  * list.  The inodedep and all those after it must have valid next pointers.
9437  */
9438 static struct inodedep *
9439 first_unlinked_inodedep(ump)
9440 	struct ufsmount *ump;
9441 {
9442 	struct inodedep *inodedep;
9443 	struct inodedep *idp;
9444 
9445 	LOCK_OWNED(ump);
9446 	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9447 	    inodedep; inodedep = idp) {
9448 		if ((inodedep->id_state & UNLINKNEXT) == 0)
9449 			return (NULL);
9450 		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9451 		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9452 			break;
9453 		if ((inodedep->id_state & UNLINKPREV) == 0)
9454 			break;
9455 	}
9456 	return (inodedep);
9457 }
9458 
9459 /*
9460  * Set the sujfree unlinked head pointer prior to writing a superblock.
9461  */
9462 static void
9463 initiate_write_sbdep(sbdep)
9464 	struct sbdep *sbdep;
9465 {
9466 	struct inodedep *inodedep;
9467 	struct fs *bpfs;
9468 	struct fs *fs;
9469 
9470 	bpfs = sbdep->sb_fs;
9471 	fs = sbdep->sb_ump->um_fs;
9472 	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9473 	if (inodedep) {
9474 		fs->fs_sujfree = inodedep->id_ino;
9475 		inodedep->id_state |= UNLINKPREV;
9476 	} else
9477 		fs->fs_sujfree = 0;
9478 	bpfs->fs_sujfree = fs->fs_sujfree;
9479 }
9480 
9481 /*
9482  * After a superblock is written determine whether it must be written again
9483  * due to a changing unlinked list head.
9484  */
9485 static int
9486 handle_written_sbdep(sbdep, bp)
9487 	struct sbdep *sbdep;
9488 	struct buf *bp;
9489 {
9490 	struct inodedep *inodedep;
9491 	struct mount *mp;
9492 	struct fs *fs;
9493 
9494 	LOCK_OWNED(sbdep->sb_ump);
9495 	fs = sbdep->sb_fs;
9496 	mp = UFSTOVFS(sbdep->sb_ump);
9497 	/*
9498 	 * If the superblock doesn't match the in-memory list start over.
9499 	 */
9500 	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9501 	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9502 	    (inodedep == NULL && fs->fs_sujfree != 0)) {
9503 		bdirty(bp);
9504 		return (1);
9505 	}
9506 	WORKITEM_FREE(sbdep, D_SBDEP);
9507 	if (fs->fs_sujfree == 0)
9508 		return (0);
9509 	/*
9510 	 * Now that we have a record of this inode in stable store allow it
9511 	 * to be written to free up pending work.  Inodes may see a lot of
9512 	 * write activity after they are unlinked which we must not hold up.
9513 	 */
9514 	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9515 		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9516 			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9517 			    inodedep, inodedep->id_state);
9518 		if (inodedep->id_state & UNLINKONLIST)
9519 			break;
9520 		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9521 	}
9522 
9523 	return (0);
9524 }
9525 
9526 /*
9527  * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9528  */
9529 static void
9530 unlinked_inodedep(mp, inodedep)
9531 	struct mount *mp;
9532 	struct inodedep *inodedep;
9533 {
9534 	struct ufsmount *ump;
9535 
9536 	ump = VFSTOUFS(mp);
9537 	LOCK_OWNED(ump);
9538 	if (MOUNTEDSUJ(mp) == 0)
9539 		return;
9540 	ump->um_fs->fs_fmod = 1;
9541 	if (inodedep->id_state & UNLINKED)
9542 		panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9543 	inodedep->id_state |= UNLINKED;
9544 	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9545 }
9546 
9547 /*
9548  * Remove an inodedep from the unlinked inodedep list.  This may require
9549  * disk writes if the inode has made it that far.
9550  */
9551 static void
9552 clear_unlinked_inodedep(inodedep)
9553 	struct inodedep *inodedep;
9554 {
9555 	struct ufsmount *ump;
9556 	struct inodedep *idp;
9557 	struct inodedep *idn;
9558 	struct fs *fs;
9559 	struct buf *bp;
9560 	ino_t ino;
9561 	ino_t nino;
9562 	ino_t pino;
9563 	int error;
9564 
9565 	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9566 	fs = ump->um_fs;
9567 	ino = inodedep->id_ino;
9568 	error = 0;
9569 	for (;;) {
9570 		LOCK_OWNED(ump);
9571 		KASSERT((inodedep->id_state & UNLINKED) != 0,
9572 		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9573 		    inodedep));
9574 		/*
9575 		 * If nothing has yet been written simply remove us from
9576 		 * the in memory list and return.  This is the most common
9577 		 * case where handle_workitem_remove() loses the final
9578 		 * reference.
9579 		 */
9580 		if ((inodedep->id_state & UNLINKLINKS) == 0)
9581 			break;
9582 		/*
9583 		 * If we have a NEXT pointer and no PREV pointer we can simply
9584 		 * clear NEXT's PREV and remove ourselves from the list.  Be
9585 		 * careful not to clear PREV if the superblock points at
9586 		 * next as well.
9587 		 */
9588 		idn = TAILQ_NEXT(inodedep, id_unlinked);
9589 		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9590 			if (idn && fs->fs_sujfree != idn->id_ino)
9591 				idn->id_state &= ~UNLINKPREV;
9592 			break;
9593 		}
9594 		/*
9595 		 * Here we have an inodedep which is actually linked into
9596 		 * the list.  We must remove it by forcing a write to the
9597 		 * link before us, whether it be the superblock or an inode.
9598 		 * Unfortunately the list may change while we're waiting
9599 		 * on the buf lock for either resource so we must loop until
9600 		 * we lock the right one.  If both the superblock and an
9601 		 * inode point to this inode we must clear the inode first
9602 		 * followed by the superblock.
9603 		 */
9604 		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9605 		pino = 0;
9606 		if (idp && (idp->id_state & UNLINKNEXT))
9607 			pino = idp->id_ino;
9608 		FREE_LOCK(ump);
9609 		if (pino == 0) {
9610 			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9611 			    (int)fs->fs_sbsize, 0, 0, 0);
9612 		} else {
9613 			error = bread(ump->um_devvp,
9614 			    fsbtodb(fs, ino_to_fsba(fs, pino)),
9615 			    (int)fs->fs_bsize, NOCRED, &bp);
9616 			if (error)
9617 				brelse(bp);
9618 		}
9619 		ACQUIRE_LOCK(ump);
9620 		if (error)
9621 			break;
9622 		/* If the list has changed restart the loop. */
9623 		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9624 		nino = 0;
9625 		if (idp && (idp->id_state & UNLINKNEXT))
9626 			nino = idp->id_ino;
9627 		if (nino != pino ||
9628 		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9629 			FREE_LOCK(ump);
9630 			brelse(bp);
9631 			ACQUIRE_LOCK(ump);
9632 			continue;
9633 		}
9634 		nino = 0;
9635 		idn = TAILQ_NEXT(inodedep, id_unlinked);
9636 		if (idn)
9637 			nino = idn->id_ino;
9638 		/*
9639 		 * Remove us from the in memory list.  After this we cannot
9640 		 * access the inodedep.
9641 		 */
9642 		KASSERT((inodedep->id_state & UNLINKED) != 0,
9643 		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9644 		    inodedep));
9645 		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9646 		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9647 		FREE_LOCK(ump);
9648 		/*
9649 		 * The predecessor's next pointer is manually updated here
9650 		 * so that the NEXT flag is never cleared for an element
9651 		 * that is in the list.
9652 		 */
9653 		if (pino == 0) {
9654 			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9655 			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9656 			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9657 			    bp);
9658 		} else if (fs->fs_magic == FS_UFS1_MAGIC)
9659 			((struct ufs1_dinode *)bp->b_data +
9660 			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9661 		else
9662 			((struct ufs2_dinode *)bp->b_data +
9663 			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9664 		/*
9665 		 * If the bwrite fails we have no recourse to recover.  The
9666 		 * filesystem is corrupted already.
9667 		 */
9668 		bwrite(bp);
9669 		ACQUIRE_LOCK(ump);
9670 		/*
9671 		 * If the superblock pointer still needs to be cleared force
9672 		 * a write here.
9673 		 */
9674 		if (fs->fs_sujfree == ino) {
9675 			FREE_LOCK(ump);
9676 			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9677 			    (int)fs->fs_sbsize, 0, 0, 0);
9678 			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9679 			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9680 			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9681 			    bp);
9682 			bwrite(bp);
9683 			ACQUIRE_LOCK(ump);
9684 		}
9685 
9686 		if (fs->fs_sujfree != ino)
9687 			return;
9688 		panic("clear_unlinked_inodedep: Failed to clear free head");
9689 	}
9690 	if (inodedep->id_ino == fs->fs_sujfree)
9691 		panic("clear_unlinked_inodedep: Freeing head of free list");
9692 	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9693 	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9694 	return;
9695 }
9696 
9697 /*
9698  * This workitem decrements the inode's link count.
9699  * If the link count reaches zero, the file is removed.
9700  */
9701 static int
9702 handle_workitem_remove(dirrem, flags)
9703 	struct dirrem *dirrem;
9704 	int flags;
9705 {
9706 	struct inodedep *inodedep;
9707 	struct workhead dotdotwk;
9708 	struct worklist *wk;
9709 	struct ufsmount *ump;
9710 	struct mount *mp;
9711 	struct vnode *vp;
9712 	struct inode *ip;
9713 	ino_t oldinum;
9714 
9715 	if (dirrem->dm_state & ONWORKLIST)
9716 		panic("handle_workitem_remove: dirrem %p still on worklist",
9717 		    dirrem);
9718 	oldinum = dirrem->dm_oldinum;
9719 	mp = dirrem->dm_list.wk_mp;
9720 	ump = VFSTOUFS(mp);
9721 	flags |= LK_EXCLUSIVE;
9722 	if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
9723 		return (EBUSY);
9724 	ip = VTOI(vp);
9725 	ACQUIRE_LOCK(ump);
9726 	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
9727 		panic("handle_workitem_remove: lost inodedep");
9728 	if (dirrem->dm_state & ONDEPLIST)
9729 		LIST_REMOVE(dirrem, dm_inonext);
9730 	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9731 	    ("handle_workitem_remove:  Journal entries not written."));
9732 
9733 	/*
9734 	 * Move all dependencies waiting on the remove to complete
9735 	 * from the dirrem to the inode inowait list to be completed
9736 	 * after the inode has been updated and written to disk.  Any
9737 	 * marked MKDIR_PARENT are saved to be completed when the .. ref
9738 	 * is removed.
9739 	 */
9740 	LIST_INIT(&dotdotwk);
9741 	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
9742 		WORKLIST_REMOVE(wk);
9743 		if (wk->wk_state & MKDIR_PARENT) {
9744 			wk->wk_state &= ~MKDIR_PARENT;
9745 			WORKLIST_INSERT(&dotdotwk, wk);
9746 			continue;
9747 		}
9748 		WORKLIST_INSERT(&inodedep->id_inowait, wk);
9749 	}
9750 	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
9751 	/*
9752 	 * Normal file deletion.
9753 	 */
9754 	if ((dirrem->dm_state & RMDIR) == 0) {
9755 		ip->i_nlink--;
9756 		DIP_SET(ip, i_nlink, ip->i_nlink);
9757 		ip->i_flag |= IN_CHANGE;
9758 		if (ip->i_nlink < ip->i_effnlink)
9759 			panic("handle_workitem_remove: bad file delta");
9760 		if (ip->i_nlink == 0)
9761 			unlinked_inodedep(mp, inodedep);
9762 		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9763 		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9764 		    ("handle_workitem_remove: worklist not empty. %s",
9765 		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
9766 		WORKITEM_FREE(dirrem, D_DIRREM);
9767 		FREE_LOCK(ump);
9768 		goto out;
9769 	}
9770 	/*
9771 	 * Directory deletion. Decrement reference count for both the
9772 	 * just deleted parent directory entry and the reference for ".".
9773 	 * Arrange to have the reference count on the parent decremented
9774 	 * to account for the loss of "..".
9775 	 */
9776 	ip->i_nlink -= 2;
9777 	DIP_SET(ip, i_nlink, ip->i_nlink);
9778 	ip->i_flag |= IN_CHANGE;
9779 	if (ip->i_nlink < ip->i_effnlink)
9780 		panic("handle_workitem_remove: bad dir delta");
9781 	if (ip->i_nlink == 0)
9782 		unlinked_inodedep(mp, inodedep);
9783 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9784 	/*
9785 	 * Rename a directory to a new parent. Since, we are both deleting
9786 	 * and creating a new directory entry, the link count on the new
9787 	 * directory should not change. Thus we skip the followup dirrem.
9788 	 */
9789 	if (dirrem->dm_state & DIRCHG) {
9790 		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9791 		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
9792 		WORKITEM_FREE(dirrem, D_DIRREM);
9793 		FREE_LOCK(ump);
9794 		goto out;
9795 	}
9796 	dirrem->dm_state = ONDEPLIST;
9797 	dirrem->dm_oldinum = dirrem->dm_dirinum;
9798 	/*
9799 	 * Place the dirrem on the parent's diremhd list.
9800 	 */
9801 	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
9802 		panic("handle_workitem_remove: lost dir inodedep");
9803 	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9804 	/*
9805 	 * If the allocated inode has never been written to disk, then
9806 	 * the on-disk inode is zero'ed and we can remove the file
9807 	 * immediately.  When journaling if the inode has been marked
9808 	 * unlinked and not DEPCOMPLETE we know it can never be written.
9809 	 */
9810 	inodedep_lookup(mp, oldinum, 0, &inodedep);
9811 	if (inodedep == NULL ||
9812 	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
9813 	    check_inode_unwritten(inodedep)) {
9814 		FREE_LOCK(ump);
9815 		vput(vp);
9816 		return handle_workitem_remove(dirrem, flags);
9817 	}
9818 	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
9819 	FREE_LOCK(ump);
9820 	ip->i_flag |= IN_CHANGE;
9821 out:
9822 	ffs_update(vp, 0);
9823 	vput(vp);
9824 	return (0);
9825 }
9826 
9827 /*
9828  * Inode de-allocation dependencies.
9829  *
9830  * When an inode's link count is reduced to zero, it can be de-allocated. We
9831  * found it convenient to postpone de-allocation until after the inode is
9832  * written to disk with its new link count (zero).  At this point, all of the
9833  * on-disk inode's block pointers are nullified and, with careful dependency
9834  * list ordering, all dependencies related to the inode will be satisfied and
9835  * the corresponding dependency structures de-allocated.  So, if/when the
9836  * inode is reused, there will be no mixing of old dependencies with new
9837  * ones.  This artificial dependency is set up by the block de-allocation
9838  * procedure above (softdep_setup_freeblocks) and completed by the
9839  * following procedure.
9840  */
9841 static void
9842 handle_workitem_freefile(freefile)
9843 	struct freefile *freefile;
9844 {
9845 	struct workhead wkhd;
9846 	struct fs *fs;
9847 	struct inodedep *idp;
9848 	struct ufsmount *ump;
9849 	int error;
9850 
9851 	ump = VFSTOUFS(freefile->fx_list.wk_mp);
9852 	fs = ump->um_fs;
9853 #ifdef DEBUG
9854 	ACQUIRE_LOCK(ump);
9855 	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
9856 	FREE_LOCK(ump);
9857 	if (error)
9858 		panic("handle_workitem_freefile: inodedep %p survived", idp);
9859 #endif
9860 	UFS_LOCK(ump);
9861 	fs->fs_pendinginodes -= 1;
9862 	UFS_UNLOCK(ump);
9863 	LIST_INIT(&wkhd);
9864 	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
9865 	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
9866 	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
9867 		softdep_error("handle_workitem_freefile", error);
9868 	ACQUIRE_LOCK(ump);
9869 	WORKITEM_FREE(freefile, D_FREEFILE);
9870 	FREE_LOCK(ump);
9871 }
9872 
9873 
9874 /*
9875  * Helper function which unlinks marker element from work list and returns
9876  * the next element on the list.
9877  */
9878 static __inline struct worklist *
9879 markernext(struct worklist *marker)
9880 {
9881 	struct worklist *next;
9882 
9883 	next = LIST_NEXT(marker, wk_list);
9884 	LIST_REMOVE(marker, wk_list);
9885 	return next;
9886 }
9887 
9888 /*
9889  * Disk writes.
9890  *
9891  * The dependency structures constructed above are most actively used when file
9892  * system blocks are written to disk.  No constraints are placed on when a
9893  * block can be written, but unsatisfied update dependencies are made safe by
9894  * modifying (or replacing) the source memory for the duration of the disk
9895  * write.  When the disk write completes, the memory block is again brought
9896  * up-to-date.
9897  *
9898  * In-core inode structure reclamation.
9899  *
9900  * Because there are a finite number of "in-core" inode structures, they are
9901  * reused regularly.  By transferring all inode-related dependencies to the
9902  * in-memory inode block and indexing them separately (via "inodedep"s), we
9903  * can allow "in-core" inode structures to be reused at any time and avoid
9904  * any increase in contention.
9905  *
9906  * Called just before entering the device driver to initiate a new disk I/O.
9907  * The buffer must be locked, thus, no I/O completion operations can occur
9908  * while we are manipulating its associated dependencies.
9909  */
9910 static void
9911 softdep_disk_io_initiation(bp)
9912 	struct buf *bp;		/* structure describing disk write to occur */
9913 {
9914 	struct worklist *wk;
9915 	struct worklist marker;
9916 	struct inodedep *inodedep;
9917 	struct freeblks *freeblks;
9918 	struct jblkdep *jblkdep;
9919 	struct newblk *newblk;
9920 	struct ufsmount *ump;
9921 
9922 	/*
9923 	 * We only care about write operations. There should never
9924 	 * be dependencies for reads.
9925 	 */
9926 	if (bp->b_iocmd != BIO_WRITE)
9927 		panic("softdep_disk_io_initiation: not write");
9928 
9929 	if (bp->b_vflags & BV_BKGRDINPROG)
9930 		panic("softdep_disk_io_initiation: Writing buffer with "
9931 		    "background write in progress: %p", bp);
9932 
9933 	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
9934 		return;
9935 	ump = VFSTOUFS(wk->wk_mp);
9936 
9937 	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
9938 	PHOLD(curproc);			/* Don't swap out kernel stack */
9939 	ACQUIRE_LOCK(ump);
9940 	/*
9941 	 * Do any necessary pre-I/O processing.
9942 	 */
9943 	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
9944 	     wk = markernext(&marker)) {
9945 		LIST_INSERT_AFTER(wk, &marker, wk_list);
9946 		switch (wk->wk_type) {
9947 
9948 		case D_PAGEDEP:
9949 			initiate_write_filepage(WK_PAGEDEP(wk), bp);
9950 			continue;
9951 
9952 		case D_INODEDEP:
9953 			inodedep = WK_INODEDEP(wk);
9954 			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
9955 				initiate_write_inodeblock_ufs1(inodedep, bp);
9956 			else
9957 				initiate_write_inodeblock_ufs2(inodedep, bp);
9958 			continue;
9959 
9960 		case D_INDIRDEP:
9961 			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
9962 			continue;
9963 
9964 		case D_BMSAFEMAP:
9965 			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
9966 			continue;
9967 
9968 		case D_JSEG:
9969 			WK_JSEG(wk)->js_buf = NULL;
9970 			continue;
9971 
9972 		case D_FREEBLKS:
9973 			freeblks = WK_FREEBLKS(wk);
9974 			jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
9975 			/*
9976 			 * We have to wait for the freeblks to be journaled
9977 			 * before we can write an inodeblock with updated
9978 			 * pointers.  Be careful to arrange the marker so
9979 			 * we revisit the freeblks if it's not removed by
9980 			 * the first jwait().
9981 			 */
9982 			if (jblkdep != NULL) {
9983 				LIST_REMOVE(&marker, wk_list);
9984 				LIST_INSERT_BEFORE(wk, &marker, wk_list);
9985 				jwait(&jblkdep->jb_list, MNT_WAIT);
9986 			}
9987 			continue;
9988 		case D_ALLOCDIRECT:
9989 		case D_ALLOCINDIR:
9990 			/*
9991 			 * We have to wait for the jnewblk to be journaled
9992 			 * before we can write to a block if the contents
9993 			 * may be confused with an earlier file's indirect
9994 			 * at recovery time.  Handle the marker as described
9995 			 * above.
9996 			 */
9997 			newblk = WK_NEWBLK(wk);
9998 			if (newblk->nb_jnewblk != NULL &&
9999 			    indirblk_lookup(newblk->nb_list.wk_mp,
10000 			    newblk->nb_newblkno)) {
10001 				LIST_REMOVE(&marker, wk_list);
10002 				LIST_INSERT_BEFORE(wk, &marker, wk_list);
10003 				jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
10004 			}
10005 			continue;
10006 
10007 		case D_SBDEP:
10008 			initiate_write_sbdep(WK_SBDEP(wk));
10009 			continue;
10010 
10011 		case D_MKDIR:
10012 		case D_FREEWORK:
10013 		case D_FREEDEP:
10014 		case D_JSEGDEP:
10015 			continue;
10016 
10017 		default:
10018 			panic("handle_disk_io_initiation: Unexpected type %s",
10019 			    TYPENAME(wk->wk_type));
10020 			/* NOTREACHED */
10021 		}
10022 	}
10023 	FREE_LOCK(ump);
10024 	PRELE(curproc);			/* Allow swapout of kernel stack */
10025 }
10026 
10027 /*
10028  * Called from within the procedure above to deal with unsatisfied
10029  * allocation dependencies in a directory. The buffer must be locked,
10030  * thus, no I/O completion operations can occur while we are
10031  * manipulating its associated dependencies.
10032  */
10033 static void
10034 initiate_write_filepage(pagedep, bp)
10035 	struct pagedep *pagedep;
10036 	struct buf *bp;
10037 {
10038 	struct jremref *jremref;
10039 	struct jmvref *jmvref;
10040 	struct dirrem *dirrem;
10041 	struct diradd *dap;
10042 	struct direct *ep;
10043 	int i;
10044 
10045 	if (pagedep->pd_state & IOSTARTED) {
10046 		/*
10047 		 * This can only happen if there is a driver that does not
10048 		 * understand chaining. Here biodone will reissue the call
10049 		 * to strategy for the incomplete buffers.
10050 		 */
10051 		printf("initiate_write_filepage: already started\n");
10052 		return;
10053 	}
10054 	pagedep->pd_state |= IOSTARTED;
10055 	/*
10056 	 * Wait for all journal remove dependencies to hit the disk.
10057 	 * We can not allow any potentially conflicting directory adds
10058 	 * to be visible before removes and rollback is too difficult.
10059 	 * The per-filesystem lock may be dropped and re-acquired, however
10060 	 * we hold the buf locked so the dependency can not go away.
10061 	 */
10062 	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
10063 		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
10064 			jwait(&jremref->jr_list, MNT_WAIT);
10065 	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
10066 		jwait(&jmvref->jm_list, MNT_WAIT);
10067 	for (i = 0; i < DAHASHSZ; i++) {
10068 		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
10069 			ep = (struct direct *)
10070 			    ((char *)bp->b_data + dap->da_offset);
10071 			if (ep->d_ino != dap->da_newinum)
10072 				panic("%s: dir inum %ju != new %ju",
10073 				    "initiate_write_filepage",
10074 				    (uintmax_t)ep->d_ino,
10075 				    (uintmax_t)dap->da_newinum);
10076 			if (dap->da_state & DIRCHG)
10077 				ep->d_ino = dap->da_previous->dm_oldinum;
10078 			else
10079 				ep->d_ino = 0;
10080 			dap->da_state &= ~ATTACHED;
10081 			dap->da_state |= UNDONE;
10082 		}
10083 	}
10084 }
10085 
10086 /*
10087  * Version of initiate_write_inodeblock that handles UFS1 dinodes.
10088  * Note that any bug fixes made to this routine must be done in the
10089  * version found below.
10090  *
10091  * Called from within the procedure above to deal with unsatisfied
10092  * allocation dependencies in an inodeblock. The buffer must be
10093  * locked, thus, no I/O completion operations can occur while we
10094  * are manipulating its associated dependencies.
10095  */
10096 static void
10097 initiate_write_inodeblock_ufs1(inodedep, bp)
10098 	struct inodedep *inodedep;
10099 	struct buf *bp;			/* The inode block */
10100 {
10101 	struct allocdirect *adp, *lastadp;
10102 	struct ufs1_dinode *dp;
10103 	struct ufs1_dinode *sip;
10104 	struct inoref *inoref;
10105 	struct ufsmount *ump;
10106 	struct fs *fs;
10107 	ufs_lbn_t i;
10108 #ifdef INVARIANTS
10109 	ufs_lbn_t prevlbn = 0;
10110 #endif
10111 	int deplist;
10112 
10113 	if (inodedep->id_state & IOSTARTED)
10114 		panic("initiate_write_inodeblock_ufs1: already started");
10115 	inodedep->id_state |= IOSTARTED;
10116 	fs = inodedep->id_fs;
10117 	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10118 	LOCK_OWNED(ump);
10119 	dp = (struct ufs1_dinode *)bp->b_data +
10120 	    ino_to_fsbo(fs, inodedep->id_ino);
10121 
10122 	/*
10123 	 * If we're on the unlinked list but have not yet written our
10124 	 * next pointer initialize it here.
10125 	 */
10126 	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10127 		struct inodedep *inon;
10128 
10129 		inon = TAILQ_NEXT(inodedep, id_unlinked);
10130 		dp->di_freelink = inon ? inon->id_ino : 0;
10131 	}
10132 	/*
10133 	 * If the bitmap is not yet written, then the allocated
10134 	 * inode cannot be written to disk.
10135 	 */
10136 	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10137 		if (inodedep->id_savedino1 != NULL)
10138 			panic("initiate_write_inodeblock_ufs1: I/O underway");
10139 		FREE_LOCK(ump);
10140 		sip = malloc(sizeof(struct ufs1_dinode),
10141 		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10142 		ACQUIRE_LOCK(ump);
10143 		inodedep->id_savedino1 = sip;
10144 		*inodedep->id_savedino1 = *dp;
10145 		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
10146 		dp->di_gen = inodedep->id_savedino1->di_gen;
10147 		dp->di_freelink = inodedep->id_savedino1->di_freelink;
10148 		return;
10149 	}
10150 	/*
10151 	 * If no dependencies, then there is nothing to roll back.
10152 	 */
10153 	inodedep->id_savedsize = dp->di_size;
10154 	inodedep->id_savedextsize = 0;
10155 	inodedep->id_savednlink = dp->di_nlink;
10156 	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10157 	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10158 		return;
10159 	/*
10160 	 * Revert the link count to that of the first unwritten journal entry.
10161 	 */
10162 	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10163 	if (inoref)
10164 		dp->di_nlink = inoref->if_nlink;
10165 	/*
10166 	 * Set the dependencies to busy.
10167 	 */
10168 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10169 	     adp = TAILQ_NEXT(adp, ad_next)) {
10170 #ifdef INVARIANTS
10171 		if (deplist != 0 && prevlbn >= adp->ad_offset)
10172 			panic("softdep_write_inodeblock: lbn order");
10173 		prevlbn = adp->ad_offset;
10174 		if (adp->ad_offset < NDADDR &&
10175 		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10176 			panic("%s: direct pointer #%jd mismatch %d != %jd",
10177 			    "softdep_write_inodeblock",
10178 			    (intmax_t)adp->ad_offset,
10179 			    dp->di_db[adp->ad_offset],
10180 			    (intmax_t)adp->ad_newblkno);
10181 		if (adp->ad_offset >= NDADDR &&
10182 		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10183 			panic("%s: indirect pointer #%jd mismatch %d != %jd",
10184 			    "softdep_write_inodeblock",
10185 			    (intmax_t)adp->ad_offset - NDADDR,
10186 			    dp->di_ib[adp->ad_offset - NDADDR],
10187 			    (intmax_t)adp->ad_newblkno);
10188 		deplist |= 1 << adp->ad_offset;
10189 		if ((adp->ad_state & ATTACHED) == 0)
10190 			panic("softdep_write_inodeblock: Unknown state 0x%x",
10191 			    adp->ad_state);
10192 #endif /* INVARIANTS */
10193 		adp->ad_state &= ~ATTACHED;
10194 		adp->ad_state |= UNDONE;
10195 	}
10196 	/*
10197 	 * The on-disk inode cannot claim to be any larger than the last
10198 	 * fragment that has been written. Otherwise, the on-disk inode
10199 	 * might have fragments that were not the last block in the file
10200 	 * which would corrupt the filesystem.
10201 	 */
10202 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10203 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10204 		if (adp->ad_offset >= NDADDR)
10205 			break;
10206 		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10207 		/* keep going until hitting a rollback to a frag */
10208 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10209 			continue;
10210 		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10211 		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10212 #ifdef INVARIANTS
10213 			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10214 				panic("softdep_write_inodeblock: lost dep1");
10215 #endif /* INVARIANTS */
10216 			dp->di_db[i] = 0;
10217 		}
10218 		for (i = 0; i < NIADDR; i++) {
10219 #ifdef INVARIANTS
10220 			if (dp->di_ib[i] != 0 &&
10221 			    (deplist & ((1 << NDADDR) << i)) == 0)
10222 				panic("softdep_write_inodeblock: lost dep2");
10223 #endif /* INVARIANTS */
10224 			dp->di_ib[i] = 0;
10225 		}
10226 		return;
10227 	}
10228 	/*
10229 	 * If we have zero'ed out the last allocated block of the file,
10230 	 * roll back the size to the last currently allocated block.
10231 	 * We know that this last allocated block is a full-sized as
10232 	 * we already checked for fragments in the loop above.
10233 	 */
10234 	if (lastadp != NULL &&
10235 	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10236 		for (i = lastadp->ad_offset; i >= 0; i--)
10237 			if (dp->di_db[i] != 0)
10238 				break;
10239 		dp->di_size = (i + 1) * fs->fs_bsize;
10240 	}
10241 	/*
10242 	 * The only dependencies are for indirect blocks.
10243 	 *
10244 	 * The file size for indirect block additions is not guaranteed.
10245 	 * Such a guarantee would be non-trivial to achieve. The conventional
10246 	 * synchronous write implementation also does not make this guarantee.
10247 	 * Fsck should catch and fix discrepancies. Arguably, the file size
10248 	 * can be over-estimated without destroying integrity when the file
10249 	 * moves into the indirect blocks (i.e., is large). If we want to
10250 	 * postpone fsck, we are stuck with this argument.
10251 	 */
10252 	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10253 		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10254 }
10255 
10256 /*
10257  * Version of initiate_write_inodeblock that handles UFS2 dinodes.
10258  * Note that any bug fixes made to this routine must be done in the
10259  * version found above.
10260  *
10261  * Called from within the procedure above to deal with unsatisfied
10262  * allocation dependencies in an inodeblock. The buffer must be
10263  * locked, thus, no I/O completion operations can occur while we
10264  * are manipulating its associated dependencies.
10265  */
10266 static void
10267 initiate_write_inodeblock_ufs2(inodedep, bp)
10268 	struct inodedep *inodedep;
10269 	struct buf *bp;			/* The inode block */
10270 {
10271 	struct allocdirect *adp, *lastadp;
10272 	struct ufs2_dinode *dp;
10273 	struct ufs2_dinode *sip;
10274 	struct inoref *inoref;
10275 	struct ufsmount *ump;
10276 	struct fs *fs;
10277 	ufs_lbn_t i;
10278 #ifdef INVARIANTS
10279 	ufs_lbn_t prevlbn = 0;
10280 #endif
10281 	int deplist;
10282 
10283 	if (inodedep->id_state & IOSTARTED)
10284 		panic("initiate_write_inodeblock_ufs2: already started");
10285 	inodedep->id_state |= IOSTARTED;
10286 	fs = inodedep->id_fs;
10287 	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10288 	LOCK_OWNED(ump);
10289 	dp = (struct ufs2_dinode *)bp->b_data +
10290 	    ino_to_fsbo(fs, inodedep->id_ino);
10291 
10292 	/*
10293 	 * If we're on the unlinked list but have not yet written our
10294 	 * next pointer initialize it here.
10295 	 */
10296 	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10297 		struct inodedep *inon;
10298 
10299 		inon = TAILQ_NEXT(inodedep, id_unlinked);
10300 		dp->di_freelink = inon ? inon->id_ino : 0;
10301 	}
10302 	/*
10303 	 * If the bitmap is not yet written, then the allocated
10304 	 * inode cannot be written to disk.
10305 	 */
10306 	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10307 		if (inodedep->id_savedino2 != NULL)
10308 			panic("initiate_write_inodeblock_ufs2: I/O underway");
10309 		FREE_LOCK(ump);
10310 		sip = malloc(sizeof(struct ufs2_dinode),
10311 		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10312 		ACQUIRE_LOCK(ump);
10313 		inodedep->id_savedino2 = sip;
10314 		*inodedep->id_savedino2 = *dp;
10315 		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10316 		dp->di_gen = inodedep->id_savedino2->di_gen;
10317 		dp->di_freelink = inodedep->id_savedino2->di_freelink;
10318 		return;
10319 	}
10320 	/*
10321 	 * If no dependencies, then there is nothing to roll back.
10322 	 */
10323 	inodedep->id_savedsize = dp->di_size;
10324 	inodedep->id_savedextsize = dp->di_extsize;
10325 	inodedep->id_savednlink = dp->di_nlink;
10326 	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10327 	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
10328 	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10329 		return;
10330 	/*
10331 	 * Revert the link count to that of the first unwritten journal entry.
10332 	 */
10333 	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10334 	if (inoref)
10335 		dp->di_nlink = inoref->if_nlink;
10336 
10337 	/*
10338 	 * Set the ext data dependencies to busy.
10339 	 */
10340 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10341 	     adp = TAILQ_NEXT(adp, ad_next)) {
10342 #ifdef INVARIANTS
10343 		if (deplist != 0 && prevlbn >= adp->ad_offset)
10344 			panic("softdep_write_inodeblock: lbn order");
10345 		prevlbn = adp->ad_offset;
10346 		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10347 			panic("%s: direct pointer #%jd mismatch %jd != %jd",
10348 			    "softdep_write_inodeblock",
10349 			    (intmax_t)adp->ad_offset,
10350 			    (intmax_t)dp->di_extb[adp->ad_offset],
10351 			    (intmax_t)adp->ad_newblkno);
10352 		deplist |= 1 << adp->ad_offset;
10353 		if ((adp->ad_state & ATTACHED) == 0)
10354 			panic("softdep_write_inodeblock: Unknown state 0x%x",
10355 			    adp->ad_state);
10356 #endif /* INVARIANTS */
10357 		adp->ad_state &= ~ATTACHED;
10358 		adp->ad_state |= UNDONE;
10359 	}
10360 	/*
10361 	 * The on-disk inode cannot claim to be any larger than the last
10362 	 * fragment that has been written. Otherwise, the on-disk inode
10363 	 * might have fragments that were not the last block in the ext
10364 	 * data which would corrupt the filesystem.
10365 	 */
10366 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10367 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10368 		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10369 		/* keep going until hitting a rollback to a frag */
10370 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10371 			continue;
10372 		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10373 		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
10374 #ifdef INVARIANTS
10375 			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10376 				panic("softdep_write_inodeblock: lost dep1");
10377 #endif /* INVARIANTS */
10378 			dp->di_extb[i] = 0;
10379 		}
10380 		lastadp = NULL;
10381 		break;
10382 	}
10383 	/*
10384 	 * If we have zero'ed out the last allocated block of the ext
10385 	 * data, roll back the size to the last currently allocated block.
10386 	 * We know that this last allocated block is a full-sized as
10387 	 * we already checked for fragments in the loop above.
10388 	 */
10389 	if (lastadp != NULL &&
10390 	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10391 		for (i = lastadp->ad_offset; i >= 0; i--)
10392 			if (dp->di_extb[i] != 0)
10393 				break;
10394 		dp->di_extsize = (i + 1) * fs->fs_bsize;
10395 	}
10396 	/*
10397 	 * Set the file data dependencies to busy.
10398 	 */
10399 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10400 	     adp = TAILQ_NEXT(adp, ad_next)) {
10401 #ifdef INVARIANTS
10402 		if (deplist != 0 && prevlbn >= adp->ad_offset)
10403 			panic("softdep_write_inodeblock: lbn order");
10404 		if ((adp->ad_state & ATTACHED) == 0)
10405 			panic("inodedep %p and adp %p not attached", inodedep, adp);
10406 		prevlbn = adp->ad_offset;
10407 		if (adp->ad_offset < NDADDR &&
10408 		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10409 			panic("%s: direct pointer #%jd mismatch %jd != %jd",
10410 			    "softdep_write_inodeblock",
10411 			    (intmax_t)adp->ad_offset,
10412 			    (intmax_t)dp->di_db[adp->ad_offset],
10413 			    (intmax_t)adp->ad_newblkno);
10414 		if (adp->ad_offset >= NDADDR &&
10415 		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10416 			panic("%s indirect pointer #%jd mismatch %jd != %jd",
10417 			    "softdep_write_inodeblock:",
10418 			    (intmax_t)adp->ad_offset - NDADDR,
10419 			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
10420 			    (intmax_t)adp->ad_newblkno);
10421 		deplist |= 1 << adp->ad_offset;
10422 		if ((adp->ad_state & ATTACHED) == 0)
10423 			panic("softdep_write_inodeblock: Unknown state 0x%x",
10424 			    adp->ad_state);
10425 #endif /* INVARIANTS */
10426 		adp->ad_state &= ~ATTACHED;
10427 		adp->ad_state |= UNDONE;
10428 	}
10429 	/*
10430 	 * The on-disk inode cannot claim to be any larger than the last
10431 	 * fragment that has been written. Otherwise, the on-disk inode
10432 	 * might have fragments that were not the last block in the file
10433 	 * which would corrupt the filesystem.
10434 	 */
10435 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10436 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10437 		if (adp->ad_offset >= NDADDR)
10438 			break;
10439 		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10440 		/* keep going until hitting a rollback to a frag */
10441 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10442 			continue;
10443 		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10444 		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10445 #ifdef INVARIANTS
10446 			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10447 				panic("softdep_write_inodeblock: lost dep2");
10448 #endif /* INVARIANTS */
10449 			dp->di_db[i] = 0;
10450 		}
10451 		for (i = 0; i < NIADDR; i++) {
10452 #ifdef INVARIANTS
10453 			if (dp->di_ib[i] != 0 &&
10454 			    (deplist & ((1 << NDADDR) << i)) == 0)
10455 				panic("softdep_write_inodeblock: lost dep3");
10456 #endif /* INVARIANTS */
10457 			dp->di_ib[i] = 0;
10458 		}
10459 		return;
10460 	}
10461 	/*
10462 	 * If we have zero'ed out the last allocated block of the file,
10463 	 * roll back the size to the last currently allocated block.
10464 	 * We know that this last allocated block is a full-sized as
10465 	 * we already checked for fragments in the loop above.
10466 	 */
10467 	if (lastadp != NULL &&
10468 	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10469 		for (i = lastadp->ad_offset; i >= 0; i--)
10470 			if (dp->di_db[i] != 0)
10471 				break;
10472 		dp->di_size = (i + 1) * fs->fs_bsize;
10473 	}
10474 	/*
10475 	 * The only dependencies are for indirect blocks.
10476 	 *
10477 	 * The file size for indirect block additions is not guaranteed.
10478 	 * Such a guarantee would be non-trivial to achieve. The conventional
10479 	 * synchronous write implementation also does not make this guarantee.
10480 	 * Fsck should catch and fix discrepancies. Arguably, the file size
10481 	 * can be over-estimated without destroying integrity when the file
10482 	 * moves into the indirect blocks (i.e., is large). If we want to
10483 	 * postpone fsck, we are stuck with this argument.
10484 	 */
10485 	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10486 		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10487 }
10488 
10489 /*
10490  * Cancel an indirdep as a result of truncation.  Release all of the
10491  * children allocindirs and place their journal work on the appropriate
10492  * list.
10493  */
10494 static void
10495 cancel_indirdep(indirdep, bp, freeblks)
10496 	struct indirdep *indirdep;
10497 	struct buf *bp;
10498 	struct freeblks *freeblks;
10499 {
10500 	struct allocindir *aip;
10501 
10502 	/*
10503 	 * None of the indirect pointers will ever be visible,
10504 	 * so they can simply be tossed. GOINGAWAY ensures
10505 	 * that allocated pointers will be saved in the buffer
10506 	 * cache until they are freed. Note that they will
10507 	 * only be able to be found by their physical address
10508 	 * since the inode mapping the logical address will
10509 	 * be gone. The save buffer used for the safe copy
10510 	 * was allocated in setup_allocindir_phase2 using
10511 	 * the physical address so it could be used for this
10512 	 * purpose. Hence we swap the safe copy with the real
10513 	 * copy, allowing the safe copy to be freed and holding
10514 	 * on to the real copy for later use in indir_trunc.
10515 	 */
10516 	if (indirdep->ir_state & GOINGAWAY)
10517 		panic("cancel_indirdep: already gone");
10518 	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10519 		indirdep->ir_state |= DEPCOMPLETE;
10520 		LIST_REMOVE(indirdep, ir_next);
10521 	}
10522 	indirdep->ir_state |= GOINGAWAY;
10523 	/*
10524 	 * Pass in bp for blocks still have journal writes
10525 	 * pending so we can cancel them on their own.
10526 	 */
10527 	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
10528 		cancel_allocindir(aip, bp, freeblks, 0);
10529 	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
10530 		cancel_allocindir(aip, NULL, freeblks, 0);
10531 	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
10532 		cancel_allocindir(aip, NULL, freeblks, 0);
10533 	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
10534 		cancel_allocindir(aip, NULL, freeblks, 0);
10535 	/*
10536 	 * If there are pending partial truncations we need to keep the
10537 	 * old block copy around until they complete.  This is because
10538 	 * the current b_data is not a perfect superset of the available
10539 	 * blocks.
10540 	 */
10541 	if (TAILQ_EMPTY(&indirdep->ir_trunc))
10542 		bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10543 	else
10544 		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10545 	WORKLIST_REMOVE(&indirdep->ir_list);
10546 	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10547 	indirdep->ir_bp = NULL;
10548 	indirdep->ir_freeblks = freeblks;
10549 }
10550 
10551 /*
10552  * Free an indirdep once it no longer has new pointers to track.
10553  */
10554 static void
10555 free_indirdep(indirdep)
10556 	struct indirdep *indirdep;
10557 {
10558 
10559 	KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10560 	    ("free_indirdep: Indir trunc list not empty."));
10561 	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10562 	    ("free_indirdep: Complete head not empty."));
10563 	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10564 	    ("free_indirdep: write head not empty."));
10565 	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10566 	    ("free_indirdep: done head not empty."));
10567 	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10568 	    ("free_indirdep: deplist head not empty."));
10569 	KASSERT((indirdep->ir_state & DEPCOMPLETE),
10570 	    ("free_indirdep: %p still on newblk list.", indirdep));
10571 	KASSERT(indirdep->ir_saveddata == NULL,
10572 	    ("free_indirdep: %p still has saved data.", indirdep));
10573 	if (indirdep->ir_state & ONWORKLIST)
10574 		WORKLIST_REMOVE(&indirdep->ir_list);
10575 	WORKITEM_FREE(indirdep, D_INDIRDEP);
10576 }
10577 
10578 /*
10579  * Called before a write to an indirdep.  This routine is responsible for
10580  * rolling back pointers to a safe state which includes only those
10581  * allocindirs which have been completed.
10582  */
10583 static void
10584 initiate_write_indirdep(indirdep, bp)
10585 	struct indirdep *indirdep;
10586 	struct buf *bp;
10587 {
10588 	struct ufsmount *ump;
10589 
10590 	indirdep->ir_state |= IOSTARTED;
10591 	if (indirdep->ir_state & GOINGAWAY)
10592 		panic("disk_io_initiation: indirdep gone");
10593 	/*
10594 	 * If there are no remaining dependencies, this will be writing
10595 	 * the real pointers.
10596 	 */
10597 	if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10598 	    TAILQ_EMPTY(&indirdep->ir_trunc))
10599 		return;
10600 	/*
10601 	 * Replace up-to-date version with safe version.
10602 	 */
10603 	if (indirdep->ir_saveddata == NULL) {
10604 		ump = VFSTOUFS(indirdep->ir_list.wk_mp);
10605 		LOCK_OWNED(ump);
10606 		FREE_LOCK(ump);
10607 		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10608 		    M_SOFTDEP_FLAGS);
10609 		ACQUIRE_LOCK(ump);
10610 	}
10611 	indirdep->ir_state &= ~ATTACHED;
10612 	indirdep->ir_state |= UNDONE;
10613 	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10614 	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10615 	    bp->b_bcount);
10616 }
10617 
10618 /*
10619  * Called when an inode has been cleared in a cg bitmap.  This finally
10620  * eliminates any canceled jaddrefs
10621  */
10622 void
10623 softdep_setup_inofree(mp, bp, ino, wkhd)
10624 	struct mount *mp;
10625 	struct buf *bp;
10626 	ino_t ino;
10627 	struct workhead *wkhd;
10628 {
10629 	struct worklist *wk, *wkn;
10630 	struct inodedep *inodedep;
10631 	struct ufsmount *ump;
10632 	uint8_t *inosused;
10633 	struct cg *cgp;
10634 	struct fs *fs;
10635 
10636 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
10637 	    ("softdep_setup_inofree called on non-softdep filesystem"));
10638 	ump = VFSTOUFS(mp);
10639 	ACQUIRE_LOCK(ump);
10640 	fs = ump->um_fs;
10641 	cgp = (struct cg *)bp->b_data;
10642 	inosused = cg_inosused(cgp);
10643 	if (isset(inosused, ino % fs->fs_ipg))
10644 		panic("softdep_setup_inofree: inode %ju not freed.",
10645 		    (uintmax_t)ino);
10646 	if (inodedep_lookup(mp, ino, 0, &inodedep))
10647 		panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
10648 		    (uintmax_t)ino, inodedep);
10649 	if (wkhd) {
10650 		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10651 			if (wk->wk_type != D_JADDREF)
10652 				continue;
10653 			WORKLIST_REMOVE(wk);
10654 			/*
10655 			 * We can free immediately even if the jaddref
10656 			 * isn't attached in a background write as now
10657 			 * the bitmaps are reconciled.
10658 			 */
10659 			wk->wk_state |= COMPLETE | ATTACHED;
10660 			free_jaddref(WK_JADDREF(wk));
10661 		}
10662 		jwork_move(&bp->b_dep, wkhd);
10663 	}
10664 	FREE_LOCK(ump);
10665 }
10666 
10667 
10668 /*
10669  * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10670  * map.  Any dependencies waiting for the write to clear are added to the
10671  * buf's list and any jnewblks that are being canceled are discarded
10672  * immediately.
10673  */
10674 void
10675 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
10676 	struct mount *mp;
10677 	struct buf *bp;
10678 	ufs2_daddr_t blkno;
10679 	int frags;
10680 	struct workhead *wkhd;
10681 {
10682 	struct bmsafemap *bmsafemap;
10683 	struct jnewblk *jnewblk;
10684 	struct ufsmount *ump;
10685 	struct worklist *wk;
10686 	struct fs *fs;
10687 #ifdef SUJ_DEBUG
10688 	uint8_t *blksfree;
10689 	struct cg *cgp;
10690 	ufs2_daddr_t jstart;
10691 	ufs2_daddr_t jend;
10692 	ufs2_daddr_t end;
10693 	long bno;
10694 	int i;
10695 #endif
10696 
10697 	CTR3(KTR_SUJ,
10698 	    "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
10699 	    blkno, frags, wkhd);
10700 
10701 	ump = VFSTOUFS(mp);
10702 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
10703 	    ("softdep_setup_blkfree called on non-softdep filesystem"));
10704 	ACQUIRE_LOCK(ump);
10705 	/* Lookup the bmsafemap so we track when it is dirty. */
10706 	fs = ump->um_fs;
10707 	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10708 	/*
10709 	 * Detach any jnewblks which have been canceled.  They must linger
10710 	 * until the bitmap is cleared again by ffs_blkfree() to prevent
10711 	 * an unjournaled allocation from hitting the disk.
10712 	 */
10713 	if (wkhd) {
10714 		while ((wk = LIST_FIRST(wkhd)) != NULL) {
10715 			CTR2(KTR_SUJ,
10716 			    "softdep_setup_blkfree: blkno %jd wk type %d",
10717 			    blkno, wk->wk_type);
10718 			WORKLIST_REMOVE(wk);
10719 			if (wk->wk_type != D_JNEWBLK) {
10720 				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
10721 				continue;
10722 			}
10723 			jnewblk = WK_JNEWBLK(wk);
10724 			KASSERT(jnewblk->jn_state & GOINGAWAY,
10725 			    ("softdep_setup_blkfree: jnewblk not canceled."));
10726 #ifdef SUJ_DEBUG
10727 			/*
10728 			 * Assert that this block is free in the bitmap
10729 			 * before we discard the jnewblk.
10730 			 */
10731 			cgp = (struct cg *)bp->b_data;
10732 			blksfree = cg_blksfree(cgp);
10733 			bno = dtogd(fs, jnewblk->jn_blkno);
10734 			for (i = jnewblk->jn_oldfrags;
10735 			    i < jnewblk->jn_frags; i++) {
10736 				if (isset(blksfree, bno + i))
10737 					continue;
10738 				panic("softdep_setup_blkfree: not free");
10739 			}
10740 #endif
10741 			/*
10742 			 * Even if it's not attached we can free immediately
10743 			 * as the new bitmap is correct.
10744 			 */
10745 			wk->wk_state |= COMPLETE | ATTACHED;
10746 			free_jnewblk(jnewblk);
10747 		}
10748 	}
10749 
10750 #ifdef SUJ_DEBUG
10751 	/*
10752 	 * Assert that we are not freeing a block which has an outstanding
10753 	 * allocation dependency.
10754 	 */
10755 	fs = VFSTOUFS(mp)->um_fs;
10756 	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10757 	end = blkno + frags;
10758 	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10759 		/*
10760 		 * Don't match against blocks that will be freed when the
10761 		 * background write is done.
10762 		 */
10763 		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
10764 		    (COMPLETE | DEPCOMPLETE))
10765 			continue;
10766 		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
10767 		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
10768 		if ((blkno >= jstart && blkno < jend) ||
10769 		    (end > jstart && end <= jend)) {
10770 			printf("state 0x%X %jd - %d %d dep %p\n",
10771 			    jnewblk->jn_state, jnewblk->jn_blkno,
10772 			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
10773 			    jnewblk->jn_dep);
10774 			panic("softdep_setup_blkfree: "
10775 			    "%jd-%jd(%d) overlaps with %jd-%jd",
10776 			    blkno, end, frags, jstart, jend);
10777 		}
10778 	}
10779 #endif
10780 	FREE_LOCK(ump);
10781 }
10782 
10783 /*
10784  * Revert a block allocation when the journal record that describes it
10785  * is not yet written.
10786  */
10787 static int
10788 jnewblk_rollback(jnewblk, fs, cgp, blksfree)
10789 	struct jnewblk *jnewblk;
10790 	struct fs *fs;
10791 	struct cg *cgp;
10792 	uint8_t *blksfree;
10793 {
10794 	ufs1_daddr_t fragno;
10795 	long cgbno, bbase;
10796 	int frags, blk;
10797 	int i;
10798 
10799 	frags = 0;
10800 	cgbno = dtogd(fs, jnewblk->jn_blkno);
10801 	/*
10802 	 * We have to test which frags need to be rolled back.  We may
10803 	 * be operating on a stale copy when doing background writes.
10804 	 */
10805 	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
10806 		if (isclr(blksfree, cgbno + i))
10807 			frags++;
10808 	if (frags == 0)
10809 		return (0);
10810 	/*
10811 	 * This is mostly ffs_blkfree() sans some validation and
10812 	 * superblock updates.
10813 	 */
10814 	if (frags == fs->fs_frag) {
10815 		fragno = fragstoblks(fs, cgbno);
10816 		ffs_setblock(fs, blksfree, fragno);
10817 		ffs_clusteracct(fs, cgp, fragno, 1);
10818 		cgp->cg_cs.cs_nbfree++;
10819 	} else {
10820 		cgbno += jnewblk->jn_oldfrags;
10821 		bbase = cgbno - fragnum(fs, cgbno);
10822 		/* Decrement the old frags.  */
10823 		blk = blkmap(fs, blksfree, bbase);
10824 		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
10825 		/* Deallocate the fragment */
10826 		for (i = 0; i < frags; i++)
10827 			setbit(blksfree, cgbno + i);
10828 		cgp->cg_cs.cs_nffree += frags;
10829 		/* Add back in counts associated with the new frags */
10830 		blk = blkmap(fs, blksfree, bbase);
10831 		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
10832 		/* If a complete block has been reassembled, account for it. */
10833 		fragno = fragstoblks(fs, bbase);
10834 		if (ffs_isblock(fs, blksfree, fragno)) {
10835 			cgp->cg_cs.cs_nffree -= fs->fs_frag;
10836 			ffs_clusteracct(fs, cgp, fragno, 1);
10837 			cgp->cg_cs.cs_nbfree++;
10838 		}
10839 	}
10840 	stat_jnewblk++;
10841 	jnewblk->jn_state &= ~ATTACHED;
10842 	jnewblk->jn_state |= UNDONE;
10843 
10844 	return (frags);
10845 }
10846 
10847 static void
10848 initiate_write_bmsafemap(bmsafemap, bp)
10849 	struct bmsafemap *bmsafemap;
10850 	struct buf *bp;			/* The cg block. */
10851 {
10852 	struct jaddref *jaddref;
10853 	struct jnewblk *jnewblk;
10854 	uint8_t *inosused;
10855 	uint8_t *blksfree;
10856 	struct cg *cgp;
10857 	struct fs *fs;
10858 	ino_t ino;
10859 
10860 	if (bmsafemap->sm_state & IOSTARTED)
10861 		return;
10862 	bmsafemap->sm_state |= IOSTARTED;
10863 	/*
10864 	 * Clear any inode allocations which are pending journal writes.
10865 	 */
10866 	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
10867 		cgp = (struct cg *)bp->b_data;
10868 		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10869 		inosused = cg_inosused(cgp);
10870 		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
10871 			ino = jaddref->ja_ino % fs->fs_ipg;
10872 			if (isset(inosused, ino)) {
10873 				if ((jaddref->ja_mode & IFMT) == IFDIR)
10874 					cgp->cg_cs.cs_ndir--;
10875 				cgp->cg_cs.cs_nifree++;
10876 				clrbit(inosused, ino);
10877 				jaddref->ja_state &= ~ATTACHED;
10878 				jaddref->ja_state |= UNDONE;
10879 				stat_jaddref++;
10880 			} else
10881 				panic("initiate_write_bmsafemap: inode %ju "
10882 				    "marked free", (uintmax_t)jaddref->ja_ino);
10883 		}
10884 	}
10885 	/*
10886 	 * Clear any block allocations which are pending journal writes.
10887 	 */
10888 	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
10889 		cgp = (struct cg *)bp->b_data;
10890 		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10891 		blksfree = cg_blksfree(cgp);
10892 		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10893 			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
10894 				continue;
10895 			panic("initiate_write_bmsafemap: block %jd "
10896 			    "marked free", jnewblk->jn_blkno);
10897 		}
10898 	}
10899 	/*
10900 	 * Move allocation lists to the written lists so they can be
10901 	 * cleared once the block write is complete.
10902 	 */
10903 	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
10904 	    inodedep, id_deps);
10905 	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
10906 	    newblk, nb_deps);
10907 	LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
10908 	    wk_list);
10909 }
10910 
10911 /*
10912  * This routine is called during the completion interrupt
10913  * service routine for a disk write (from the procedure called
10914  * by the device driver to inform the filesystem caches of
10915  * a request completion).  It should be called early in this
10916  * procedure, before the block is made available to other
10917  * processes or other routines are called.
10918  *
10919  */
10920 static void
10921 softdep_disk_write_complete(bp)
10922 	struct buf *bp;		/* describes the completed disk write */
10923 {
10924 	struct worklist *wk;
10925 	struct worklist *owk;
10926 	struct ufsmount *ump;
10927 	struct workhead reattach;
10928 	struct freeblks *freeblks;
10929 	struct buf *sbp;
10930 
10931 	/*
10932 	 * If an error occurred while doing the write, then the data
10933 	 * has not hit the disk and the dependencies cannot be unrolled.
10934 	 */
10935 	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
10936 		return;
10937 	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
10938 		return;
10939 	ump = VFSTOUFS(wk->wk_mp);
10940 	LIST_INIT(&reattach);
10941 	/*
10942 	 * This lock must not be released anywhere in this code segment.
10943 	 */
10944 	sbp = NULL;
10945 	owk = NULL;
10946 	ACQUIRE_LOCK(ump);
10947 	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
10948 		WORKLIST_REMOVE(wk);
10949 		atomic_add_long(&dep_write[wk->wk_type], 1);
10950 		if (wk == owk)
10951 			panic("duplicate worklist: %p\n", wk);
10952 		owk = wk;
10953 		switch (wk->wk_type) {
10954 
10955 		case D_PAGEDEP:
10956 			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
10957 				WORKLIST_INSERT(&reattach, wk);
10958 			continue;
10959 
10960 		case D_INODEDEP:
10961 			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
10962 				WORKLIST_INSERT(&reattach, wk);
10963 			continue;
10964 
10965 		case D_BMSAFEMAP:
10966 			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
10967 				WORKLIST_INSERT(&reattach, wk);
10968 			continue;
10969 
10970 		case D_MKDIR:
10971 			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
10972 			continue;
10973 
10974 		case D_ALLOCDIRECT:
10975 			wk->wk_state |= COMPLETE;
10976 			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
10977 			continue;
10978 
10979 		case D_ALLOCINDIR:
10980 			wk->wk_state |= COMPLETE;
10981 			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
10982 			continue;
10983 
10984 		case D_INDIRDEP:
10985 			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
10986 				WORKLIST_INSERT(&reattach, wk);
10987 			continue;
10988 
10989 		case D_FREEBLKS:
10990 			wk->wk_state |= COMPLETE;
10991 			freeblks = WK_FREEBLKS(wk);
10992 			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
10993 			    LIST_EMPTY(&freeblks->fb_jblkdephd))
10994 				add_to_worklist(wk, WK_NODELAY);
10995 			continue;
10996 
10997 		case D_FREEWORK:
10998 			handle_written_freework(WK_FREEWORK(wk));
10999 			break;
11000 
11001 		case D_JSEGDEP:
11002 			free_jsegdep(WK_JSEGDEP(wk));
11003 			continue;
11004 
11005 		case D_JSEG:
11006 			handle_written_jseg(WK_JSEG(wk), bp);
11007 			continue;
11008 
11009 		case D_SBDEP:
11010 			if (handle_written_sbdep(WK_SBDEP(wk), bp))
11011 				WORKLIST_INSERT(&reattach, wk);
11012 			continue;
11013 
11014 		case D_FREEDEP:
11015 			free_freedep(WK_FREEDEP(wk));
11016 			continue;
11017 
11018 		default:
11019 			panic("handle_disk_write_complete: Unknown type %s",
11020 			    TYPENAME(wk->wk_type));
11021 			/* NOTREACHED */
11022 		}
11023 	}
11024 	/*
11025 	 * Reattach any requests that must be redone.
11026 	 */
11027 	while ((wk = LIST_FIRST(&reattach)) != NULL) {
11028 		WORKLIST_REMOVE(wk);
11029 		WORKLIST_INSERT(&bp->b_dep, wk);
11030 	}
11031 	FREE_LOCK(ump);
11032 	if (sbp)
11033 		brelse(sbp);
11034 }
11035 
11036 /*
11037  * Called from within softdep_disk_write_complete above. Note that
11038  * this routine is always called from interrupt level with further
11039  * splbio interrupts blocked.
11040  */
11041 static void
11042 handle_allocdirect_partdone(adp, wkhd)
11043 	struct allocdirect *adp;	/* the completed allocdirect */
11044 	struct workhead *wkhd;		/* Work to do when inode is writtne. */
11045 {
11046 	struct allocdirectlst *listhead;
11047 	struct allocdirect *listadp;
11048 	struct inodedep *inodedep;
11049 	long bsize;
11050 
11051 	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11052 		return;
11053 	/*
11054 	 * The on-disk inode cannot claim to be any larger than the last
11055 	 * fragment that has been written. Otherwise, the on-disk inode
11056 	 * might have fragments that were not the last block in the file
11057 	 * which would corrupt the filesystem. Thus, we cannot free any
11058 	 * allocdirects after one whose ad_oldblkno claims a fragment as
11059 	 * these blocks must be rolled back to zero before writing the inode.
11060 	 * We check the currently active set of allocdirects in id_inoupdt
11061 	 * or id_extupdt as appropriate.
11062 	 */
11063 	inodedep = adp->ad_inodedep;
11064 	bsize = inodedep->id_fs->fs_bsize;
11065 	if (adp->ad_state & EXTDATA)
11066 		listhead = &inodedep->id_extupdt;
11067 	else
11068 		listhead = &inodedep->id_inoupdt;
11069 	TAILQ_FOREACH(listadp, listhead, ad_next) {
11070 		/* found our block */
11071 		if (listadp == adp)
11072 			break;
11073 		/* continue if ad_oldlbn is not a fragment */
11074 		if (listadp->ad_oldsize == 0 ||
11075 		    listadp->ad_oldsize == bsize)
11076 			continue;
11077 		/* hit a fragment */
11078 		return;
11079 	}
11080 	/*
11081 	 * If we have reached the end of the current list without
11082 	 * finding the just finished dependency, then it must be
11083 	 * on the future dependency list. Future dependencies cannot
11084 	 * be freed until they are moved to the current list.
11085 	 */
11086 	if (listadp == NULL) {
11087 #ifdef DEBUG
11088 		if (adp->ad_state & EXTDATA)
11089 			listhead = &inodedep->id_newextupdt;
11090 		else
11091 			listhead = &inodedep->id_newinoupdt;
11092 		TAILQ_FOREACH(listadp, listhead, ad_next)
11093 			/* found our block */
11094 			if (listadp == adp)
11095 				break;
11096 		if (listadp == NULL)
11097 			panic("handle_allocdirect_partdone: lost dep");
11098 #endif /* DEBUG */
11099 		return;
11100 	}
11101 	/*
11102 	 * If we have found the just finished dependency, then queue
11103 	 * it along with anything that follows it that is complete.
11104 	 * Since the pointer has not yet been written in the inode
11105 	 * as the dependency prevents it, place the allocdirect on the
11106 	 * bufwait list where it will be freed once the pointer is
11107 	 * valid.
11108 	 */
11109 	if (wkhd == NULL)
11110 		wkhd = &inodedep->id_bufwait;
11111 	for (; adp; adp = listadp) {
11112 		listadp = TAILQ_NEXT(adp, ad_next);
11113 		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11114 			return;
11115 		TAILQ_REMOVE(listhead, adp, ad_next);
11116 		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
11117 	}
11118 }
11119 
11120 /*
11121  * Called from within softdep_disk_write_complete above.  This routine
11122  * completes successfully written allocindirs.
11123  */
11124 static void
11125 handle_allocindir_partdone(aip)
11126 	struct allocindir *aip;		/* the completed allocindir */
11127 {
11128 	struct indirdep *indirdep;
11129 
11130 	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
11131 		return;
11132 	indirdep = aip->ai_indirdep;
11133 	LIST_REMOVE(aip, ai_next);
11134 	/*
11135 	 * Don't set a pointer while the buffer is undergoing IO or while
11136 	 * we have active truncations.
11137 	 */
11138 	if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
11139 		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
11140 		return;
11141 	}
11142 	if (indirdep->ir_state & UFS1FMT)
11143 		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11144 		    aip->ai_newblkno;
11145 	else
11146 		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11147 		    aip->ai_newblkno;
11148 	/*
11149 	 * Await the pointer write before freeing the allocindir.
11150 	 */
11151 	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
11152 }
11153 
11154 /*
11155  * Release segments held on a jwork list.
11156  */
11157 static void
11158 handle_jwork(wkhd)
11159 	struct workhead *wkhd;
11160 {
11161 	struct worklist *wk;
11162 
11163 	while ((wk = LIST_FIRST(wkhd)) != NULL) {
11164 		WORKLIST_REMOVE(wk);
11165 		switch (wk->wk_type) {
11166 		case D_JSEGDEP:
11167 			free_jsegdep(WK_JSEGDEP(wk));
11168 			continue;
11169 		case D_FREEDEP:
11170 			free_freedep(WK_FREEDEP(wk));
11171 			continue;
11172 		case D_FREEFRAG:
11173 			rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
11174 			WORKITEM_FREE(wk, D_FREEFRAG);
11175 			continue;
11176 		case D_FREEWORK:
11177 			handle_written_freework(WK_FREEWORK(wk));
11178 			continue;
11179 		default:
11180 			panic("handle_jwork: Unknown type %s\n",
11181 			    TYPENAME(wk->wk_type));
11182 		}
11183 	}
11184 }
11185 
11186 /*
11187  * Handle the bufwait list on an inode when it is safe to release items
11188  * held there.  This normally happens after an inode block is written but
11189  * may be delayed and handled later if there are pending journal items that
11190  * are not yet safe to be released.
11191  */
11192 static struct freefile *
11193 handle_bufwait(inodedep, refhd)
11194 	struct inodedep *inodedep;
11195 	struct workhead *refhd;
11196 {
11197 	struct jaddref *jaddref;
11198 	struct freefile *freefile;
11199 	struct worklist *wk;
11200 
11201 	freefile = NULL;
11202 	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
11203 		WORKLIST_REMOVE(wk);
11204 		switch (wk->wk_type) {
11205 		case D_FREEFILE:
11206 			/*
11207 			 * We defer adding freefile to the worklist
11208 			 * until all other additions have been made to
11209 			 * ensure that it will be done after all the
11210 			 * old blocks have been freed.
11211 			 */
11212 			if (freefile != NULL)
11213 				panic("handle_bufwait: freefile");
11214 			freefile = WK_FREEFILE(wk);
11215 			continue;
11216 
11217 		case D_MKDIR:
11218 			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
11219 			continue;
11220 
11221 		case D_DIRADD:
11222 			diradd_inode_written(WK_DIRADD(wk), inodedep);
11223 			continue;
11224 
11225 		case D_FREEFRAG:
11226 			wk->wk_state |= COMPLETE;
11227 			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
11228 				add_to_worklist(wk, 0);
11229 			continue;
11230 
11231 		case D_DIRREM:
11232 			wk->wk_state |= COMPLETE;
11233 			add_to_worklist(wk, 0);
11234 			continue;
11235 
11236 		case D_ALLOCDIRECT:
11237 		case D_ALLOCINDIR:
11238 			free_newblk(WK_NEWBLK(wk));
11239 			continue;
11240 
11241 		case D_JNEWBLK:
11242 			wk->wk_state |= COMPLETE;
11243 			free_jnewblk(WK_JNEWBLK(wk));
11244 			continue;
11245 
11246 		/*
11247 		 * Save freed journal segments and add references on
11248 		 * the supplied list which will delay their release
11249 		 * until the cg bitmap is cleared on disk.
11250 		 */
11251 		case D_JSEGDEP:
11252 			if (refhd == NULL)
11253 				free_jsegdep(WK_JSEGDEP(wk));
11254 			else
11255 				WORKLIST_INSERT(refhd, wk);
11256 			continue;
11257 
11258 		case D_JADDREF:
11259 			jaddref = WK_JADDREF(wk);
11260 			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
11261 			    if_deps);
11262 			/*
11263 			 * Transfer any jaddrefs to the list to be freed with
11264 			 * the bitmap if we're handling a removed file.
11265 			 */
11266 			if (refhd == NULL) {
11267 				wk->wk_state |= COMPLETE;
11268 				free_jaddref(jaddref);
11269 			} else
11270 				WORKLIST_INSERT(refhd, wk);
11271 			continue;
11272 
11273 		default:
11274 			panic("handle_bufwait: Unknown type %p(%s)",
11275 			    wk, TYPENAME(wk->wk_type));
11276 			/* NOTREACHED */
11277 		}
11278 	}
11279 	return (freefile);
11280 }
11281 /*
11282  * Called from within softdep_disk_write_complete above to restore
11283  * in-memory inode block contents to their most up-to-date state. Note
11284  * that this routine is always called from interrupt level with further
11285  * splbio interrupts blocked.
11286  */
11287 static int
11288 handle_written_inodeblock(inodedep, bp)
11289 	struct inodedep *inodedep;
11290 	struct buf *bp;		/* buffer containing the inode block */
11291 {
11292 	struct freefile *freefile;
11293 	struct allocdirect *adp, *nextadp;
11294 	struct ufs1_dinode *dp1 = NULL;
11295 	struct ufs2_dinode *dp2 = NULL;
11296 	struct workhead wkhd;
11297 	int hadchanges, fstype;
11298 	ino_t freelink;
11299 
11300 	LIST_INIT(&wkhd);
11301 	hadchanges = 0;
11302 	freefile = NULL;
11303 	if ((inodedep->id_state & IOSTARTED) == 0)
11304 		panic("handle_written_inodeblock: not started");
11305 	inodedep->id_state &= ~IOSTARTED;
11306 	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11307 		fstype = UFS1;
11308 		dp1 = (struct ufs1_dinode *)bp->b_data +
11309 		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11310 		freelink = dp1->di_freelink;
11311 	} else {
11312 		fstype = UFS2;
11313 		dp2 = (struct ufs2_dinode *)bp->b_data +
11314 		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11315 		freelink = dp2->di_freelink;
11316 	}
11317 	/*
11318 	 * Leave this inodeblock dirty until it's in the list.
11319 	 */
11320 	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) {
11321 		struct inodedep *inon;
11322 
11323 		inon = TAILQ_NEXT(inodedep, id_unlinked);
11324 		if ((inon == NULL && freelink == 0) ||
11325 		    (inon && inon->id_ino == freelink)) {
11326 			if (inon)
11327 				inon->id_state |= UNLINKPREV;
11328 			inodedep->id_state |= UNLINKNEXT;
11329 		}
11330 		hadchanges = 1;
11331 	}
11332 	/*
11333 	 * If we had to rollback the inode allocation because of
11334 	 * bitmaps being incomplete, then simply restore it.
11335 	 * Keep the block dirty so that it will not be reclaimed until
11336 	 * all associated dependencies have been cleared and the
11337 	 * corresponding updates written to disk.
11338 	 */
11339 	if (inodedep->id_savedino1 != NULL) {
11340 		hadchanges = 1;
11341 		if (fstype == UFS1)
11342 			*dp1 = *inodedep->id_savedino1;
11343 		else
11344 			*dp2 = *inodedep->id_savedino2;
11345 		free(inodedep->id_savedino1, M_SAVEDINO);
11346 		inodedep->id_savedino1 = NULL;
11347 		if ((bp->b_flags & B_DELWRI) == 0)
11348 			stat_inode_bitmap++;
11349 		bdirty(bp);
11350 		/*
11351 		 * If the inode is clear here and GOINGAWAY it will never
11352 		 * be written.  Process the bufwait and clear any pending
11353 		 * work which may include the freefile.
11354 		 */
11355 		if (inodedep->id_state & GOINGAWAY)
11356 			goto bufwait;
11357 		return (1);
11358 	}
11359 	inodedep->id_state |= COMPLETE;
11360 	/*
11361 	 * Roll forward anything that had to be rolled back before
11362 	 * the inode could be updated.
11363 	 */
11364 	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11365 		nextadp = TAILQ_NEXT(adp, ad_next);
11366 		if (adp->ad_state & ATTACHED)
11367 			panic("handle_written_inodeblock: new entry");
11368 		if (fstype == UFS1) {
11369 			if (adp->ad_offset < NDADDR) {
11370 				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11371 					panic("%s %s #%jd mismatch %d != %jd",
11372 					    "handle_written_inodeblock:",
11373 					    "direct pointer",
11374 					    (intmax_t)adp->ad_offset,
11375 					    dp1->di_db[adp->ad_offset],
11376 					    (intmax_t)adp->ad_oldblkno);
11377 				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11378 			} else {
11379 				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
11380 					panic("%s: %s #%jd allocated as %d",
11381 					    "handle_written_inodeblock",
11382 					    "indirect pointer",
11383 					    (intmax_t)adp->ad_offset - NDADDR,
11384 					    dp1->di_ib[adp->ad_offset - NDADDR]);
11385 				dp1->di_ib[adp->ad_offset - NDADDR] =
11386 				    adp->ad_newblkno;
11387 			}
11388 		} else {
11389 			if (adp->ad_offset < NDADDR) {
11390 				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11391 					panic("%s: %s #%jd %s %jd != %jd",
11392 					    "handle_written_inodeblock",
11393 					    "direct pointer",
11394 					    (intmax_t)adp->ad_offset, "mismatch",
11395 					    (intmax_t)dp2->di_db[adp->ad_offset],
11396 					    (intmax_t)adp->ad_oldblkno);
11397 				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11398 			} else {
11399 				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
11400 					panic("%s: %s #%jd allocated as %jd",
11401 					    "handle_written_inodeblock",
11402 					    "indirect pointer",
11403 					    (intmax_t)adp->ad_offset - NDADDR,
11404 					    (intmax_t)
11405 					    dp2->di_ib[adp->ad_offset - NDADDR]);
11406 				dp2->di_ib[adp->ad_offset - NDADDR] =
11407 				    adp->ad_newblkno;
11408 			}
11409 		}
11410 		adp->ad_state &= ~UNDONE;
11411 		adp->ad_state |= ATTACHED;
11412 		hadchanges = 1;
11413 	}
11414 	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11415 		nextadp = TAILQ_NEXT(adp, ad_next);
11416 		if (adp->ad_state & ATTACHED)
11417 			panic("handle_written_inodeblock: new entry");
11418 		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11419 			panic("%s: direct pointers #%jd %s %jd != %jd",
11420 			    "handle_written_inodeblock",
11421 			    (intmax_t)adp->ad_offset, "mismatch",
11422 			    (intmax_t)dp2->di_extb[adp->ad_offset],
11423 			    (intmax_t)adp->ad_oldblkno);
11424 		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11425 		adp->ad_state &= ~UNDONE;
11426 		adp->ad_state |= ATTACHED;
11427 		hadchanges = 1;
11428 	}
11429 	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11430 		stat_direct_blk_ptrs++;
11431 	/*
11432 	 * Reset the file size to its most up-to-date value.
11433 	 */
11434 	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11435 		panic("handle_written_inodeblock: bad size");
11436 	if (inodedep->id_savednlink > LINK_MAX)
11437 		panic("handle_written_inodeblock: Invalid link count "
11438 		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
11439 	if (fstype == UFS1) {
11440 		if (dp1->di_nlink != inodedep->id_savednlink) {
11441 			dp1->di_nlink = inodedep->id_savednlink;
11442 			hadchanges = 1;
11443 		}
11444 		if (dp1->di_size != inodedep->id_savedsize) {
11445 			dp1->di_size = inodedep->id_savedsize;
11446 			hadchanges = 1;
11447 		}
11448 	} else {
11449 		if (dp2->di_nlink != inodedep->id_savednlink) {
11450 			dp2->di_nlink = inodedep->id_savednlink;
11451 			hadchanges = 1;
11452 		}
11453 		if (dp2->di_size != inodedep->id_savedsize) {
11454 			dp2->di_size = inodedep->id_savedsize;
11455 			hadchanges = 1;
11456 		}
11457 		if (dp2->di_extsize != inodedep->id_savedextsize) {
11458 			dp2->di_extsize = inodedep->id_savedextsize;
11459 			hadchanges = 1;
11460 		}
11461 	}
11462 	inodedep->id_savedsize = -1;
11463 	inodedep->id_savedextsize = -1;
11464 	inodedep->id_savednlink = -1;
11465 	/*
11466 	 * If there were any rollbacks in the inode block, then it must be
11467 	 * marked dirty so that its will eventually get written back in
11468 	 * its correct form.
11469 	 */
11470 	if (hadchanges)
11471 		bdirty(bp);
11472 bufwait:
11473 	/*
11474 	 * Process any allocdirects that completed during the update.
11475 	 */
11476 	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11477 		handle_allocdirect_partdone(adp, &wkhd);
11478 	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11479 		handle_allocdirect_partdone(adp, &wkhd);
11480 	/*
11481 	 * Process deallocations that were held pending until the
11482 	 * inode had been written to disk. Freeing of the inode
11483 	 * is delayed until after all blocks have been freed to
11484 	 * avoid creation of new <vfsid, inum, lbn> triples
11485 	 * before the old ones have been deleted.  Completely
11486 	 * unlinked inodes are not processed until the unlinked
11487 	 * inode list is written or the last reference is removed.
11488 	 */
11489 	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11490 		freefile = handle_bufwait(inodedep, NULL);
11491 		if (freefile && !LIST_EMPTY(&wkhd)) {
11492 			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11493 			freefile = NULL;
11494 		}
11495 	}
11496 	/*
11497 	 * Move rolled forward dependency completions to the bufwait list
11498 	 * now that those that were already written have been processed.
11499 	 */
11500 	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11501 		panic("handle_written_inodeblock: bufwait but no changes");
11502 	jwork_move(&inodedep->id_bufwait, &wkhd);
11503 
11504 	if (freefile != NULL) {
11505 		/*
11506 		 * If the inode is goingaway it was never written.  Fake up
11507 		 * the state here so free_inodedep() can succeed.
11508 		 */
11509 		if (inodedep->id_state & GOINGAWAY)
11510 			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11511 		if (free_inodedep(inodedep) == 0)
11512 			panic("handle_written_inodeblock: live inodedep %p",
11513 			    inodedep);
11514 		add_to_worklist(&freefile->fx_list, 0);
11515 		return (0);
11516 	}
11517 
11518 	/*
11519 	 * If no outstanding dependencies, free it.
11520 	 */
11521 	if (free_inodedep(inodedep) ||
11522 	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11523 	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11524 	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11525 	     LIST_FIRST(&inodedep->id_bufwait) == 0))
11526 		return (0);
11527 	return (hadchanges);
11528 }
11529 
11530 static int
11531 handle_written_indirdep(indirdep, bp, bpp)
11532 	struct indirdep *indirdep;
11533 	struct buf *bp;
11534 	struct buf **bpp;
11535 {
11536 	struct allocindir *aip;
11537 	struct buf *sbp;
11538 	int chgs;
11539 
11540 	if (indirdep->ir_state & GOINGAWAY)
11541 		panic("handle_written_indirdep: indirdep gone");
11542 	if ((indirdep->ir_state & IOSTARTED) == 0)
11543 		panic("handle_written_indirdep: IO not started");
11544 	chgs = 0;
11545 	/*
11546 	 * If there were rollbacks revert them here.
11547 	 */
11548 	if (indirdep->ir_saveddata) {
11549 		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11550 		if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11551 			free(indirdep->ir_saveddata, M_INDIRDEP);
11552 			indirdep->ir_saveddata = NULL;
11553 		}
11554 		chgs = 1;
11555 	}
11556 	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11557 	indirdep->ir_state |= ATTACHED;
11558 	/*
11559 	 * Move allocindirs with written pointers to the completehd if
11560 	 * the indirdep's pointer is not yet written.  Otherwise
11561 	 * free them here.
11562 	 */
11563 	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
11564 		LIST_REMOVE(aip, ai_next);
11565 		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11566 			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11567 			    ai_next);
11568 			newblk_freefrag(&aip->ai_block);
11569 			continue;
11570 		}
11571 		free_newblk(&aip->ai_block);
11572 	}
11573 	/*
11574 	 * Move allocindirs that have finished dependency processing from
11575 	 * the done list to the write list after updating the pointers.
11576 	 */
11577 	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11578 		while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
11579 			handle_allocindir_partdone(aip);
11580 			if (aip == LIST_FIRST(&indirdep->ir_donehd))
11581 				panic("disk_write_complete: not gone");
11582 			chgs = 1;
11583 		}
11584 	}
11585 	/*
11586 	 * Preserve the indirdep if there were any changes or if it is not
11587 	 * yet valid on disk.
11588 	 */
11589 	if (chgs) {
11590 		stat_indir_blk_ptrs++;
11591 		bdirty(bp);
11592 		return (1);
11593 	}
11594 	/*
11595 	 * If there were no changes we can discard the savedbp and detach
11596 	 * ourselves from the buf.  We are only carrying completed pointers
11597 	 * in this case.
11598 	 */
11599 	sbp = indirdep->ir_savebp;
11600 	sbp->b_flags |= B_INVAL | B_NOCACHE;
11601 	indirdep->ir_savebp = NULL;
11602 	indirdep->ir_bp = NULL;
11603 	if (*bpp != NULL)
11604 		panic("handle_written_indirdep: bp already exists.");
11605 	*bpp = sbp;
11606 	/*
11607 	 * The indirdep may not be freed until its parent points at it.
11608 	 */
11609 	if (indirdep->ir_state & DEPCOMPLETE)
11610 		free_indirdep(indirdep);
11611 
11612 	return (0);
11613 }
11614 
11615 /*
11616  * Process a diradd entry after its dependent inode has been written.
11617  * This routine must be called with splbio interrupts blocked.
11618  */
11619 static void
11620 diradd_inode_written(dap, inodedep)
11621 	struct diradd *dap;
11622 	struct inodedep *inodedep;
11623 {
11624 
11625 	dap->da_state |= COMPLETE;
11626 	complete_diradd(dap);
11627 	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
11628 }
11629 
11630 /*
11631  * Returns true if the bmsafemap will have rollbacks when written.  Must only
11632  * be called with the per-filesystem lock and the buf lock on the cg held.
11633  */
11634 static int
11635 bmsafemap_backgroundwrite(bmsafemap, bp)
11636 	struct bmsafemap *bmsafemap;
11637 	struct buf *bp;
11638 {
11639 	int dirty;
11640 
11641 	LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp));
11642 	dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
11643 	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
11644 	/*
11645 	 * If we're initiating a background write we need to process the
11646 	 * rollbacks as they exist now, not as they exist when IO starts.
11647 	 * No other consumers will look at the contents of the shadowed
11648 	 * buf so this is safe to do here.
11649 	 */
11650 	if (bp->b_xflags & BX_BKGRDMARKER)
11651 		initiate_write_bmsafemap(bmsafemap, bp);
11652 
11653 	return (dirty);
11654 }
11655 
11656 /*
11657  * Re-apply an allocation when a cg write is complete.
11658  */
11659 static int
11660 jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
11661 	struct jnewblk *jnewblk;
11662 	struct fs *fs;
11663 	struct cg *cgp;
11664 	uint8_t *blksfree;
11665 {
11666 	ufs1_daddr_t fragno;
11667 	ufs2_daddr_t blkno;
11668 	long cgbno, bbase;
11669 	int frags, blk;
11670 	int i;
11671 
11672 	frags = 0;
11673 	cgbno = dtogd(fs, jnewblk->jn_blkno);
11674 	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
11675 		if (isclr(blksfree, cgbno + i))
11676 			panic("jnewblk_rollforward: re-allocated fragment");
11677 		frags++;
11678 	}
11679 	if (frags == fs->fs_frag) {
11680 		blkno = fragstoblks(fs, cgbno);
11681 		ffs_clrblock(fs, blksfree, (long)blkno);
11682 		ffs_clusteracct(fs, cgp, blkno, -1);
11683 		cgp->cg_cs.cs_nbfree--;
11684 	} else {
11685 		bbase = cgbno - fragnum(fs, cgbno);
11686 		cgbno += jnewblk->jn_oldfrags;
11687                 /* If a complete block had been reassembled, account for it. */
11688 		fragno = fragstoblks(fs, bbase);
11689 		if (ffs_isblock(fs, blksfree, fragno)) {
11690 			cgp->cg_cs.cs_nffree += fs->fs_frag;
11691 			ffs_clusteracct(fs, cgp, fragno, -1);
11692 			cgp->cg_cs.cs_nbfree--;
11693 		}
11694 		/* Decrement the old frags.  */
11695 		blk = blkmap(fs, blksfree, bbase);
11696 		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11697 		/* Allocate the fragment */
11698 		for (i = 0; i < frags; i++)
11699 			clrbit(blksfree, cgbno + i);
11700 		cgp->cg_cs.cs_nffree -= frags;
11701 		/* Add back in counts associated with the new frags */
11702 		blk = blkmap(fs, blksfree, bbase);
11703 		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11704 	}
11705 	return (frags);
11706 }
11707 
11708 /*
11709  * Complete a write to a bmsafemap structure.  Roll forward any bitmap
11710  * changes if it's not a background write.  Set all written dependencies
11711  * to DEPCOMPLETE and free the structure if possible.
11712  */
11713 static int
11714 handle_written_bmsafemap(bmsafemap, bp)
11715 	struct bmsafemap *bmsafemap;
11716 	struct buf *bp;
11717 {
11718 	struct newblk *newblk;
11719 	struct inodedep *inodedep;
11720 	struct jaddref *jaddref, *jatmp;
11721 	struct jnewblk *jnewblk, *jntmp;
11722 	struct ufsmount *ump;
11723 	uint8_t *inosused;
11724 	uint8_t *blksfree;
11725 	struct cg *cgp;
11726 	struct fs *fs;
11727 	ino_t ino;
11728 	int foreground;
11729 	int chgs;
11730 
11731 	if ((bmsafemap->sm_state & IOSTARTED) == 0)
11732 		panic("initiate_write_bmsafemap: Not started\n");
11733 	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
11734 	chgs = 0;
11735 	bmsafemap->sm_state &= ~IOSTARTED;
11736 	foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
11737 	/*
11738 	 * Release journal work that was waiting on the write.
11739 	 */
11740 	handle_jwork(&bmsafemap->sm_freewr);
11741 
11742 	/*
11743 	 * Restore unwritten inode allocation pending jaddref writes.
11744 	 */
11745 	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
11746 		cgp = (struct cg *)bp->b_data;
11747 		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11748 		inosused = cg_inosused(cgp);
11749 		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
11750 		    ja_bmdeps, jatmp) {
11751 			if ((jaddref->ja_state & UNDONE) == 0)
11752 				continue;
11753 			ino = jaddref->ja_ino % fs->fs_ipg;
11754 			if (isset(inosused, ino))
11755 				panic("handle_written_bmsafemap: "
11756 				    "re-allocated inode");
11757 			/* Do the roll-forward only if it's a real copy. */
11758 			if (foreground) {
11759 				if ((jaddref->ja_mode & IFMT) == IFDIR)
11760 					cgp->cg_cs.cs_ndir++;
11761 				cgp->cg_cs.cs_nifree--;
11762 				setbit(inosused, ino);
11763 				chgs = 1;
11764 			}
11765 			jaddref->ja_state &= ~UNDONE;
11766 			jaddref->ja_state |= ATTACHED;
11767 			free_jaddref(jaddref);
11768 		}
11769 	}
11770 	/*
11771 	 * Restore any block allocations which are pending journal writes.
11772 	 */
11773 	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11774 		cgp = (struct cg *)bp->b_data;
11775 		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11776 		blksfree = cg_blksfree(cgp);
11777 		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
11778 		    jntmp) {
11779 			if ((jnewblk->jn_state & UNDONE) == 0)
11780 				continue;
11781 			/* Do the roll-forward only if it's a real copy. */
11782 			if (foreground &&
11783 			    jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
11784 				chgs = 1;
11785 			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
11786 			jnewblk->jn_state |= ATTACHED;
11787 			free_jnewblk(jnewblk);
11788 		}
11789 	}
11790 	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
11791 		newblk->nb_state |= DEPCOMPLETE;
11792 		newblk->nb_state &= ~ONDEPLIST;
11793 		newblk->nb_bmsafemap = NULL;
11794 		LIST_REMOVE(newblk, nb_deps);
11795 		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
11796 			handle_allocdirect_partdone(
11797 			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
11798 		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
11799 			handle_allocindir_partdone(
11800 			    WK_ALLOCINDIR(&newblk->nb_list));
11801 		else if (newblk->nb_list.wk_type != D_NEWBLK)
11802 			panic("handle_written_bmsafemap: Unexpected type: %s",
11803 			    TYPENAME(newblk->nb_list.wk_type));
11804 	}
11805 	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
11806 		inodedep->id_state |= DEPCOMPLETE;
11807 		inodedep->id_state &= ~ONDEPLIST;
11808 		LIST_REMOVE(inodedep, id_deps);
11809 		inodedep->id_bmsafemap = NULL;
11810 	}
11811 	LIST_REMOVE(bmsafemap, sm_next);
11812 	if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
11813 	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
11814 	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
11815 	    LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
11816 	    LIST_EMPTY(&bmsafemap->sm_freehd)) {
11817 		LIST_REMOVE(bmsafemap, sm_hash);
11818 		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
11819 		return (0);
11820 	}
11821 	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
11822 	if (foreground)
11823 		bdirty(bp);
11824 	return (1);
11825 }
11826 
11827 /*
11828  * Try to free a mkdir dependency.
11829  */
11830 static void
11831 complete_mkdir(mkdir)
11832 	struct mkdir *mkdir;
11833 {
11834 	struct diradd *dap;
11835 
11836 	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
11837 		return;
11838 	LIST_REMOVE(mkdir, md_mkdirs);
11839 	dap = mkdir->md_diradd;
11840 	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
11841 	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
11842 		dap->da_state |= DEPCOMPLETE;
11843 		complete_diradd(dap);
11844 	}
11845 	WORKITEM_FREE(mkdir, D_MKDIR);
11846 }
11847 
11848 /*
11849  * Handle the completion of a mkdir dependency.
11850  */
11851 static void
11852 handle_written_mkdir(mkdir, type)
11853 	struct mkdir *mkdir;
11854 	int type;
11855 {
11856 
11857 	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
11858 		panic("handle_written_mkdir: bad type");
11859 	mkdir->md_state |= COMPLETE;
11860 	complete_mkdir(mkdir);
11861 }
11862 
11863 static int
11864 free_pagedep(pagedep)
11865 	struct pagedep *pagedep;
11866 {
11867 	int i;
11868 
11869 	if (pagedep->pd_state & NEWBLOCK)
11870 		return (0);
11871 	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
11872 		return (0);
11873 	for (i = 0; i < DAHASHSZ; i++)
11874 		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
11875 			return (0);
11876 	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
11877 		return (0);
11878 	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
11879 		return (0);
11880 	if (pagedep->pd_state & ONWORKLIST)
11881 		WORKLIST_REMOVE(&pagedep->pd_list);
11882 	LIST_REMOVE(pagedep, pd_hash);
11883 	WORKITEM_FREE(pagedep, D_PAGEDEP);
11884 
11885 	return (1);
11886 }
11887 
11888 /*
11889  * Called from within softdep_disk_write_complete above.
11890  * A write operation was just completed. Removed inodes can
11891  * now be freed and associated block pointers may be committed.
11892  * Note that this routine is always called from interrupt level
11893  * with further splbio interrupts blocked.
11894  */
11895 static int
11896 handle_written_filepage(pagedep, bp)
11897 	struct pagedep *pagedep;
11898 	struct buf *bp;		/* buffer containing the written page */
11899 {
11900 	struct dirrem *dirrem;
11901 	struct diradd *dap, *nextdap;
11902 	struct direct *ep;
11903 	int i, chgs;
11904 
11905 	if ((pagedep->pd_state & IOSTARTED) == 0)
11906 		panic("handle_written_filepage: not started");
11907 	pagedep->pd_state &= ~IOSTARTED;
11908 	/*
11909 	 * Process any directory removals that have been committed.
11910 	 */
11911 	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
11912 		LIST_REMOVE(dirrem, dm_next);
11913 		dirrem->dm_state |= COMPLETE;
11914 		dirrem->dm_dirinum = pagedep->pd_ino;
11915 		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
11916 		    ("handle_written_filepage: Journal entries not written."));
11917 		add_to_worklist(&dirrem->dm_list, 0);
11918 	}
11919 	/*
11920 	 * Free any directory additions that have been committed.
11921 	 * If it is a newly allocated block, we have to wait until
11922 	 * the on-disk directory inode claims the new block.
11923 	 */
11924 	if ((pagedep->pd_state & NEWBLOCK) == 0)
11925 		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
11926 			free_diradd(dap, NULL);
11927 	/*
11928 	 * Uncommitted directory entries must be restored.
11929 	 */
11930 	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
11931 		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
11932 		     dap = nextdap) {
11933 			nextdap = LIST_NEXT(dap, da_pdlist);
11934 			if (dap->da_state & ATTACHED)
11935 				panic("handle_written_filepage: attached");
11936 			ep = (struct direct *)
11937 			    ((char *)bp->b_data + dap->da_offset);
11938 			ep->d_ino = dap->da_newinum;
11939 			dap->da_state &= ~UNDONE;
11940 			dap->da_state |= ATTACHED;
11941 			chgs = 1;
11942 			/*
11943 			 * If the inode referenced by the directory has
11944 			 * been written out, then the dependency can be
11945 			 * moved to the pending list.
11946 			 */
11947 			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
11948 				LIST_REMOVE(dap, da_pdlist);
11949 				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
11950 				    da_pdlist);
11951 			}
11952 		}
11953 	}
11954 	/*
11955 	 * If there were any rollbacks in the directory, then it must be
11956 	 * marked dirty so that its will eventually get written back in
11957 	 * its correct form.
11958 	 */
11959 	if (chgs) {
11960 		if ((bp->b_flags & B_DELWRI) == 0)
11961 			stat_dir_entry++;
11962 		bdirty(bp);
11963 		return (1);
11964 	}
11965 	/*
11966 	 * If we are not waiting for a new directory block to be
11967 	 * claimed by its inode, then the pagedep will be freed.
11968 	 * Otherwise it will remain to track any new entries on
11969 	 * the page in case they are fsync'ed.
11970 	 */
11971 	free_pagedep(pagedep);
11972 	return (0);
11973 }
11974 
11975 /*
11976  * Writing back in-core inode structures.
11977  *
11978  * The filesystem only accesses an inode's contents when it occupies an
11979  * "in-core" inode structure.  These "in-core" structures are separate from
11980  * the page frames used to cache inode blocks.  Only the latter are
11981  * transferred to/from the disk.  So, when the updated contents of the
11982  * "in-core" inode structure are copied to the corresponding in-memory inode
11983  * block, the dependencies are also transferred.  The following procedure is
11984  * called when copying a dirty "in-core" inode to a cached inode block.
11985  */
11986 
11987 /*
11988  * Called when an inode is loaded from disk. If the effective link count
11989  * differed from the actual link count when it was last flushed, then we
11990  * need to ensure that the correct effective link count is put back.
11991  */
11992 void
11993 softdep_load_inodeblock(ip)
11994 	struct inode *ip;	/* the "in_core" copy of the inode */
11995 {
11996 	struct inodedep *inodedep;
11997 
11998 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
11999 	    ("softdep_load_inodeblock called on non-softdep filesystem"));
12000 	/*
12001 	 * Check for alternate nlink count.
12002 	 */
12003 	ip->i_effnlink = ip->i_nlink;
12004 	ACQUIRE_LOCK(ip->i_ump);
12005 	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
12006 	    &inodedep) == 0) {
12007 		FREE_LOCK(ip->i_ump);
12008 		return;
12009 	}
12010 	ip->i_effnlink -= inodedep->id_nlinkdelta;
12011 	FREE_LOCK(ip->i_ump);
12012 }
12013 
12014 /*
12015  * This routine is called just before the "in-core" inode
12016  * information is to be copied to the in-memory inode block.
12017  * Recall that an inode block contains several inodes. If
12018  * the force flag is set, then the dependencies will be
12019  * cleared so that the update can always be made. Note that
12020  * the buffer is locked when this routine is called, so we
12021  * will never be in the middle of writing the inode block
12022  * to disk.
12023  */
12024 void
12025 softdep_update_inodeblock(ip, bp, waitfor)
12026 	struct inode *ip;	/* the "in_core" copy of the inode */
12027 	struct buf *bp;		/* the buffer containing the inode block */
12028 	int waitfor;		/* nonzero => update must be allowed */
12029 {
12030 	struct inodedep *inodedep;
12031 	struct inoref *inoref;
12032 	struct ufsmount *ump;
12033 	struct worklist *wk;
12034 	struct mount *mp;
12035 	struct buf *ibp;
12036 	struct fs *fs;
12037 	int error;
12038 
12039 	ump = ip->i_ump;
12040 	mp = UFSTOVFS(ump);
12041 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
12042 	    ("softdep_update_inodeblock called on non-softdep filesystem"));
12043 	fs = ip->i_fs;
12044 	/*
12045 	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
12046 	 * does not have access to the in-core ip so must write directly into
12047 	 * the inode block buffer when setting freelink.
12048 	 */
12049 	if (fs->fs_magic == FS_UFS1_MAGIC)
12050 		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
12051 		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
12052 	else
12053 		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
12054 		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
12055 	/*
12056 	 * If the effective link count is not equal to the actual link
12057 	 * count, then we must track the difference in an inodedep while
12058 	 * the inode is (potentially) tossed out of the cache. Otherwise,
12059 	 * if there is no existing inodedep, then there are no dependencies
12060 	 * to track.
12061 	 */
12062 	ACQUIRE_LOCK(ump);
12063 again:
12064 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12065 		FREE_LOCK(ump);
12066 		if (ip->i_effnlink != ip->i_nlink)
12067 			panic("softdep_update_inodeblock: bad link count");
12068 		return;
12069 	}
12070 	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
12071 		panic("softdep_update_inodeblock: bad delta");
12072 	/*
12073 	 * If we're flushing all dependencies we must also move any waiting
12074 	 * for journal writes onto the bufwait list prior to I/O.
12075 	 */
12076 	if (waitfor) {
12077 		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12078 			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12079 			    == DEPCOMPLETE) {
12080 				jwait(&inoref->if_list, MNT_WAIT);
12081 				goto again;
12082 			}
12083 		}
12084 	}
12085 	/*
12086 	 * Changes have been initiated. Anything depending on these
12087 	 * changes cannot occur until this inode has been written.
12088 	 */
12089 	inodedep->id_state &= ~COMPLETE;
12090 	if ((inodedep->id_state & ONWORKLIST) == 0)
12091 		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
12092 	/*
12093 	 * Any new dependencies associated with the incore inode must
12094 	 * now be moved to the list associated with the buffer holding
12095 	 * the in-memory copy of the inode. Once merged process any
12096 	 * allocdirects that are completed by the merger.
12097 	 */
12098 	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
12099 	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
12100 		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
12101 		    NULL);
12102 	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
12103 	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
12104 		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
12105 		    NULL);
12106 	/*
12107 	 * Now that the inode has been pushed into the buffer, the
12108 	 * operations dependent on the inode being written to disk
12109 	 * can be moved to the id_bufwait so that they will be
12110 	 * processed when the buffer I/O completes.
12111 	 */
12112 	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
12113 		WORKLIST_REMOVE(wk);
12114 		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
12115 	}
12116 	/*
12117 	 * Newly allocated inodes cannot be written until the bitmap
12118 	 * that allocates them have been written (indicated by
12119 	 * DEPCOMPLETE being set in id_state). If we are doing a
12120 	 * forced sync (e.g., an fsync on a file), we force the bitmap
12121 	 * to be written so that the update can be done.
12122 	 */
12123 	if (waitfor == 0) {
12124 		FREE_LOCK(ump);
12125 		return;
12126 	}
12127 retry:
12128 	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
12129 		FREE_LOCK(ump);
12130 		return;
12131 	}
12132 	ibp = inodedep->id_bmsafemap->sm_buf;
12133 	ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT);
12134 	if (ibp == NULL) {
12135 		/*
12136 		 * If ibp came back as NULL, the dependency could have been
12137 		 * freed while we slept.  Look it up again, and check to see
12138 		 * that it has completed.
12139 		 */
12140 		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
12141 			goto retry;
12142 		FREE_LOCK(ump);
12143 		return;
12144 	}
12145 	FREE_LOCK(ump);
12146 	if ((error = bwrite(ibp)) != 0)
12147 		softdep_error("softdep_update_inodeblock: bwrite", error);
12148 }
12149 
12150 /*
12151  * Merge the a new inode dependency list (such as id_newinoupdt) into an
12152  * old inode dependency list (such as id_inoupdt). This routine must be
12153  * called with splbio interrupts blocked.
12154  */
12155 static void
12156 merge_inode_lists(newlisthead, oldlisthead)
12157 	struct allocdirectlst *newlisthead;
12158 	struct allocdirectlst *oldlisthead;
12159 {
12160 	struct allocdirect *listadp, *newadp;
12161 
12162 	newadp = TAILQ_FIRST(newlisthead);
12163 	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
12164 		if (listadp->ad_offset < newadp->ad_offset) {
12165 			listadp = TAILQ_NEXT(listadp, ad_next);
12166 			continue;
12167 		}
12168 		TAILQ_REMOVE(newlisthead, newadp, ad_next);
12169 		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
12170 		if (listadp->ad_offset == newadp->ad_offset) {
12171 			allocdirect_merge(oldlisthead, newadp,
12172 			    listadp);
12173 			listadp = newadp;
12174 		}
12175 		newadp = TAILQ_FIRST(newlisthead);
12176 	}
12177 	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
12178 		TAILQ_REMOVE(newlisthead, newadp, ad_next);
12179 		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
12180 	}
12181 }
12182 
12183 /*
12184  * If we are doing an fsync, then we must ensure that any directory
12185  * entries for the inode have been written after the inode gets to disk.
12186  */
12187 int
12188 softdep_fsync(vp)
12189 	struct vnode *vp;	/* the "in_core" copy of the inode */
12190 {
12191 	struct inodedep *inodedep;
12192 	struct pagedep *pagedep;
12193 	struct inoref *inoref;
12194 	struct ufsmount *ump;
12195 	struct worklist *wk;
12196 	struct diradd *dap;
12197 	struct mount *mp;
12198 	struct vnode *pvp;
12199 	struct inode *ip;
12200 	struct buf *bp;
12201 	struct fs *fs;
12202 	struct thread *td = curthread;
12203 	int error, flushparent, pagedep_new_block;
12204 	ino_t parentino;
12205 	ufs_lbn_t lbn;
12206 
12207 	ip = VTOI(vp);
12208 	fs = ip->i_fs;
12209 	ump = ip->i_ump;
12210 	mp = vp->v_mount;
12211 	if (MOUNTEDSOFTDEP(mp) == 0)
12212 		return (0);
12213 	ACQUIRE_LOCK(ump);
12214 restart:
12215 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12216 		FREE_LOCK(ump);
12217 		return (0);
12218 	}
12219 	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12220 		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12221 		    == DEPCOMPLETE) {
12222 			jwait(&inoref->if_list, MNT_WAIT);
12223 			goto restart;
12224 		}
12225 	}
12226 	if (!LIST_EMPTY(&inodedep->id_inowait) ||
12227 	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
12228 	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
12229 	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
12230 	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
12231 		panic("softdep_fsync: pending ops %p", inodedep);
12232 	for (error = 0, flushparent = 0; ; ) {
12233 		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
12234 			break;
12235 		if (wk->wk_type != D_DIRADD)
12236 			panic("softdep_fsync: Unexpected type %s",
12237 			    TYPENAME(wk->wk_type));
12238 		dap = WK_DIRADD(wk);
12239 		/*
12240 		 * Flush our parent if this directory entry has a MKDIR_PARENT
12241 		 * dependency or is contained in a newly allocated block.
12242 		 */
12243 		if (dap->da_state & DIRCHG)
12244 			pagedep = dap->da_previous->dm_pagedep;
12245 		else
12246 			pagedep = dap->da_pagedep;
12247 		parentino = pagedep->pd_ino;
12248 		lbn = pagedep->pd_lbn;
12249 		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
12250 			panic("softdep_fsync: dirty");
12251 		if ((dap->da_state & MKDIR_PARENT) ||
12252 		    (pagedep->pd_state & NEWBLOCK))
12253 			flushparent = 1;
12254 		else
12255 			flushparent = 0;
12256 		/*
12257 		 * If we are being fsync'ed as part of vgone'ing this vnode,
12258 		 * then we will not be able to release and recover the
12259 		 * vnode below, so we just have to give up on writing its
12260 		 * directory entry out. It will eventually be written, just
12261 		 * not now, but then the user was not asking to have it
12262 		 * written, so we are not breaking any promises.
12263 		 */
12264 		if (vp->v_iflag & VI_DOOMED)
12265 			break;
12266 		/*
12267 		 * We prevent deadlock by always fetching inodes from the
12268 		 * root, moving down the directory tree. Thus, when fetching
12269 		 * our parent directory, we first try to get the lock. If
12270 		 * that fails, we must unlock ourselves before requesting
12271 		 * the lock on our parent. See the comment in ufs_lookup
12272 		 * for details on possible races.
12273 		 */
12274 		FREE_LOCK(ump);
12275 		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
12276 		    FFSV_FORCEINSMQ)) {
12277 			error = vfs_busy(mp, MBF_NOWAIT);
12278 			if (error != 0) {
12279 				vfs_ref(mp);
12280 				VOP_UNLOCK(vp, 0);
12281 				error = vfs_busy(mp, 0);
12282 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12283 				vfs_rel(mp);
12284 				if (error != 0)
12285 					return (ENOENT);
12286 				if (vp->v_iflag & VI_DOOMED) {
12287 					vfs_unbusy(mp);
12288 					return (ENOENT);
12289 				}
12290 			}
12291 			VOP_UNLOCK(vp, 0);
12292 			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
12293 			    &pvp, FFSV_FORCEINSMQ);
12294 			vfs_unbusy(mp);
12295 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12296 			if (vp->v_iflag & VI_DOOMED) {
12297 				if (error == 0)
12298 					vput(pvp);
12299 				error = ENOENT;
12300 			}
12301 			if (error != 0)
12302 				return (error);
12303 		}
12304 		/*
12305 		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12306 		 * that are contained in direct blocks will be resolved by
12307 		 * doing a ffs_update. Pagedeps contained in indirect blocks
12308 		 * may require a complete sync'ing of the directory. So, we
12309 		 * try the cheap and fast ffs_update first, and if that fails,
12310 		 * then we do the slower ffs_syncvnode of the directory.
12311 		 */
12312 		if (flushparent) {
12313 			int locked;
12314 
12315 			if ((error = ffs_update(pvp, 1)) != 0) {
12316 				vput(pvp);
12317 				return (error);
12318 			}
12319 			ACQUIRE_LOCK(ump);
12320 			locked = 1;
12321 			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12322 				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12323 					if (wk->wk_type != D_DIRADD)
12324 						panic("softdep_fsync: Unexpected type %s",
12325 						      TYPENAME(wk->wk_type));
12326 					dap = WK_DIRADD(wk);
12327 					if (dap->da_state & DIRCHG)
12328 						pagedep = dap->da_previous->dm_pagedep;
12329 					else
12330 						pagedep = dap->da_pagedep;
12331 					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12332 					FREE_LOCK(ump);
12333 					locked = 0;
12334 					if (pagedep_new_block && (error =
12335 					    ffs_syncvnode(pvp, MNT_WAIT, 0))) {
12336 						vput(pvp);
12337 						return (error);
12338 					}
12339 				}
12340 			}
12341 			if (locked)
12342 				FREE_LOCK(ump);
12343 		}
12344 		/*
12345 		 * Flush directory page containing the inode's name.
12346 		 */
12347 		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12348 		    &bp);
12349 		if (error == 0)
12350 			error = bwrite(bp);
12351 		else
12352 			brelse(bp);
12353 		vput(pvp);
12354 		if (error != 0)
12355 			return (error);
12356 		ACQUIRE_LOCK(ump);
12357 		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12358 			break;
12359 	}
12360 	FREE_LOCK(ump);
12361 	return (0);
12362 }
12363 
12364 /*
12365  * Flush all the dirty bitmaps associated with the block device
12366  * before flushing the rest of the dirty blocks so as to reduce
12367  * the number of dependencies that will have to be rolled back.
12368  *
12369  * XXX Unused?
12370  */
12371 void
12372 softdep_fsync_mountdev(vp)
12373 	struct vnode *vp;
12374 {
12375 	struct buf *bp, *nbp;
12376 	struct worklist *wk;
12377 	struct bufobj *bo;
12378 
12379 	if (!vn_isdisk(vp, NULL))
12380 		panic("softdep_fsync_mountdev: vnode not a disk");
12381 	bo = &vp->v_bufobj;
12382 restart:
12383 	BO_LOCK(bo);
12384 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
12385 		/*
12386 		 * If it is already scheduled, skip to the next buffer.
12387 		 */
12388 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
12389 			continue;
12390 
12391 		if ((bp->b_flags & B_DELWRI) == 0)
12392 			panic("softdep_fsync_mountdev: not dirty");
12393 		/*
12394 		 * We are only interested in bitmaps with outstanding
12395 		 * dependencies.
12396 		 */
12397 		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
12398 		    wk->wk_type != D_BMSAFEMAP ||
12399 		    (bp->b_vflags & BV_BKGRDINPROG)) {
12400 			BUF_UNLOCK(bp);
12401 			continue;
12402 		}
12403 		BO_UNLOCK(bo);
12404 		bremfree(bp);
12405 		(void) bawrite(bp);
12406 		goto restart;
12407 	}
12408 	drain_output(vp);
12409 	BO_UNLOCK(bo);
12410 }
12411 
12412 /*
12413  * Sync all cylinder groups that were dirty at the time this function is
12414  * called.  Newly dirtied cgs will be inserted before the sentinel.  This
12415  * is used to flush freedep activity that may be holding up writes to a
12416  * indirect block.
12417  */
12418 static int
12419 sync_cgs(mp, waitfor)
12420 	struct mount *mp;
12421 	int waitfor;
12422 {
12423 	struct bmsafemap *bmsafemap;
12424 	struct bmsafemap *sentinel;
12425 	struct ufsmount *ump;
12426 	struct buf *bp;
12427 	int error;
12428 
12429 	sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
12430 	sentinel->sm_cg = -1;
12431 	ump = VFSTOUFS(mp);
12432 	error = 0;
12433 	ACQUIRE_LOCK(ump);
12434 	LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
12435 	for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
12436 	    bmsafemap = LIST_NEXT(sentinel, sm_next)) {
12437 		/* Skip sentinels and cgs with no work to release. */
12438 		if (bmsafemap->sm_cg == -1 ||
12439 		    (LIST_EMPTY(&bmsafemap->sm_freehd) &&
12440 		    LIST_EMPTY(&bmsafemap->sm_freewr))) {
12441 			LIST_REMOVE(sentinel, sm_next);
12442 			LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12443 			continue;
12444 		}
12445 		/*
12446 		 * If we don't get the lock and we're waiting try again, if
12447 		 * not move on to the next buf and try to sync it.
12448 		 */
12449 		bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
12450 		if (bp == NULL && waitfor == MNT_WAIT)
12451 			continue;
12452 		LIST_REMOVE(sentinel, sm_next);
12453 		LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12454 		if (bp == NULL)
12455 			continue;
12456 		FREE_LOCK(ump);
12457 		if (waitfor == MNT_NOWAIT)
12458 			bawrite(bp);
12459 		else
12460 			error = bwrite(bp);
12461 		ACQUIRE_LOCK(ump);
12462 		if (error)
12463 			break;
12464 	}
12465 	LIST_REMOVE(sentinel, sm_next);
12466 	FREE_LOCK(ump);
12467 	free(sentinel, M_BMSAFEMAP);
12468 	return (error);
12469 }
12470 
12471 /*
12472  * This routine is called when we are trying to synchronously flush a
12473  * file. This routine must eliminate any filesystem metadata dependencies
12474  * so that the syncing routine can succeed.
12475  */
12476 int
12477 softdep_sync_metadata(struct vnode *vp)
12478 {
12479 	struct inode *ip;
12480 	int error;
12481 
12482 	ip = VTOI(vp);
12483 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
12484 	    ("softdep_sync_metadata called on non-softdep filesystem"));
12485 	/*
12486 	 * Ensure that any direct block dependencies have been cleared,
12487 	 * truncations are started, and inode references are journaled.
12488 	 */
12489 	ACQUIRE_LOCK(ip->i_ump);
12490 	/*
12491 	 * Write all journal records to prevent rollbacks on devvp.
12492 	 */
12493 	if (vp->v_type == VCHR)
12494 		softdep_flushjournal(vp->v_mount);
12495 	error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
12496 	/*
12497 	 * Ensure that all truncates are written so we won't find deps on
12498 	 * indirect blocks.
12499 	 */
12500 	process_truncates(vp);
12501 	FREE_LOCK(ip->i_ump);
12502 
12503 	return (error);
12504 }
12505 
12506 /*
12507  * This routine is called when we are attempting to sync a buf with
12508  * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12509  * other IO it can but returns EBUSY if the buffer is not yet able to
12510  * be written.  Dependencies which will not cause rollbacks will always
12511  * return 0.
12512  */
12513 int
12514 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
12515 {
12516 	struct indirdep *indirdep;
12517 	struct pagedep *pagedep;
12518 	struct allocindir *aip;
12519 	struct newblk *newblk;
12520 	struct ufsmount *ump;
12521 	struct buf *nbp;
12522 	struct worklist *wk;
12523 	int i, error;
12524 
12525 	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12526 	    ("softdep_sync_buf called on non-softdep filesystem"));
12527 	/*
12528 	 * For VCHR we just don't want to force flush any dependencies that
12529 	 * will cause rollbacks.
12530 	 */
12531 	if (vp->v_type == VCHR) {
12532 		if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12533 			return (EBUSY);
12534 		return (0);
12535 	}
12536 	ump = VTOI(vp)->i_ump;
12537 	ACQUIRE_LOCK(ump);
12538 	/*
12539 	 * As we hold the buffer locked, none of its dependencies
12540 	 * will disappear.
12541 	 */
12542 	error = 0;
12543 top:
12544 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12545 		switch (wk->wk_type) {
12546 
12547 		case D_ALLOCDIRECT:
12548 		case D_ALLOCINDIR:
12549 			newblk = WK_NEWBLK(wk);
12550 			if (newblk->nb_jnewblk != NULL) {
12551 				if (waitfor == MNT_NOWAIT) {
12552 					error = EBUSY;
12553 					goto out_unlock;
12554 				}
12555 				jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12556 				goto top;
12557 			}
12558 			if (newblk->nb_state & DEPCOMPLETE ||
12559 			    waitfor == MNT_NOWAIT)
12560 				continue;
12561 			nbp = newblk->nb_bmsafemap->sm_buf;
12562 			nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12563 			if (nbp == NULL)
12564 				goto top;
12565 			FREE_LOCK(ump);
12566 			if ((error = bwrite(nbp)) != 0)
12567 				goto out;
12568 			ACQUIRE_LOCK(ump);
12569 			continue;
12570 
12571 		case D_INDIRDEP:
12572 			indirdep = WK_INDIRDEP(wk);
12573 			if (waitfor == MNT_NOWAIT) {
12574 				if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
12575 				    !LIST_EMPTY(&indirdep->ir_deplisthd)) {
12576 					error = EBUSY;
12577 					goto out_unlock;
12578 				}
12579 			}
12580 			if (!TAILQ_EMPTY(&indirdep->ir_trunc))
12581 				panic("softdep_sync_buf: truncation pending.");
12582 		restart:
12583 			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
12584 				newblk = (struct newblk *)aip;
12585 				if (newblk->nb_jnewblk != NULL) {
12586 					jwait(&newblk->nb_jnewblk->jn_list,
12587 					    waitfor);
12588 					goto restart;
12589 				}
12590 				if (newblk->nb_state & DEPCOMPLETE)
12591 					continue;
12592 				nbp = newblk->nb_bmsafemap->sm_buf;
12593 				nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12594 				if (nbp == NULL)
12595 					goto restart;
12596 				FREE_LOCK(ump);
12597 				if ((error = bwrite(nbp)) != 0)
12598 					goto out;
12599 				ACQUIRE_LOCK(ump);
12600 				goto restart;
12601 			}
12602 			continue;
12603 
12604 		case D_PAGEDEP:
12605 			/*
12606 			 * Only flush directory entries in synchronous passes.
12607 			 */
12608 			if (waitfor != MNT_WAIT) {
12609 				error = EBUSY;
12610 				goto out_unlock;
12611 			}
12612 			/*
12613 			 * While syncing snapshots, we must allow recursive
12614 			 * lookups.
12615 			 */
12616 			BUF_AREC(bp);
12617 			/*
12618 			 * We are trying to sync a directory that may
12619 			 * have dependencies on both its own metadata
12620 			 * and/or dependencies on the inodes of any
12621 			 * recently allocated files. We walk its diradd
12622 			 * lists pushing out the associated inode.
12623 			 */
12624 			pagedep = WK_PAGEDEP(wk);
12625 			for (i = 0; i < DAHASHSZ; i++) {
12626 				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
12627 					continue;
12628 				if ((error = flush_pagedep_deps(vp, wk->wk_mp,
12629 				    &pagedep->pd_diraddhd[i]))) {
12630 					BUF_NOREC(bp);
12631 					goto out_unlock;
12632 				}
12633 			}
12634 			BUF_NOREC(bp);
12635 			continue;
12636 
12637 		case D_FREEWORK:
12638 		case D_FREEDEP:
12639 		case D_JSEGDEP:
12640 		case D_JNEWBLK:
12641 			continue;
12642 
12643 		default:
12644 			panic("softdep_sync_buf: Unknown type %s",
12645 			    TYPENAME(wk->wk_type));
12646 			/* NOTREACHED */
12647 		}
12648 	}
12649 out_unlock:
12650 	FREE_LOCK(ump);
12651 out:
12652 	return (error);
12653 }
12654 
12655 /*
12656  * Flush the dependencies associated with an inodedep.
12657  * Called with splbio blocked.
12658  */
12659 static int
12660 flush_inodedep_deps(vp, mp, ino)
12661 	struct vnode *vp;
12662 	struct mount *mp;
12663 	ino_t ino;
12664 {
12665 	struct inodedep *inodedep;
12666 	struct inoref *inoref;
12667 	struct ufsmount *ump;
12668 	int error, waitfor;
12669 
12670 	/*
12671 	 * This work is done in two passes. The first pass grabs most
12672 	 * of the buffers and begins asynchronously writing them. The
12673 	 * only way to wait for these asynchronous writes is to sleep
12674 	 * on the filesystem vnode which may stay busy for a long time
12675 	 * if the filesystem is active. So, instead, we make a second
12676 	 * pass over the dependencies blocking on each write. In the
12677 	 * usual case we will be blocking against a write that we
12678 	 * initiated, so when it is done the dependency will have been
12679 	 * resolved. Thus the second pass is expected to end quickly.
12680 	 * We give a brief window at the top of the loop to allow
12681 	 * any pending I/O to complete.
12682 	 */
12683 	ump = VFSTOUFS(mp);
12684 	LOCK_OWNED(ump);
12685 	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
12686 		if (error)
12687 			return (error);
12688 		FREE_LOCK(ump);
12689 		ACQUIRE_LOCK(ump);
12690 restart:
12691 		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12692 			return (0);
12693 		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12694 			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12695 			    == DEPCOMPLETE) {
12696 				jwait(&inoref->if_list, MNT_WAIT);
12697 				goto restart;
12698 			}
12699 		}
12700 		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
12701 		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
12702 		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
12703 		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
12704 			continue;
12705 		/*
12706 		 * If pass2, we are done, otherwise do pass 2.
12707 		 */
12708 		if (waitfor == MNT_WAIT)
12709 			break;
12710 		waitfor = MNT_WAIT;
12711 	}
12712 	/*
12713 	 * Try freeing inodedep in case all dependencies have been removed.
12714 	 */
12715 	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
12716 		(void) free_inodedep(inodedep);
12717 	return (0);
12718 }
12719 
12720 /*
12721  * Flush an inode dependency list.
12722  * Called with splbio blocked.
12723  */
12724 static int
12725 flush_deplist(listhead, waitfor, errorp)
12726 	struct allocdirectlst *listhead;
12727 	int waitfor;
12728 	int *errorp;
12729 {
12730 	struct allocdirect *adp;
12731 	struct newblk *newblk;
12732 	struct ufsmount *ump;
12733 	struct buf *bp;
12734 
12735 	if ((adp = TAILQ_FIRST(listhead)) == NULL)
12736 		return (0);
12737 	ump = VFSTOUFS(adp->ad_list.wk_mp);
12738 	LOCK_OWNED(ump);
12739 	TAILQ_FOREACH(adp, listhead, ad_next) {
12740 		newblk = (struct newblk *)adp;
12741 		if (newblk->nb_jnewblk != NULL) {
12742 			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12743 			return (1);
12744 		}
12745 		if (newblk->nb_state & DEPCOMPLETE)
12746 			continue;
12747 		bp = newblk->nb_bmsafemap->sm_buf;
12748 		bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor);
12749 		if (bp == NULL) {
12750 			if (waitfor == MNT_NOWAIT)
12751 				continue;
12752 			return (1);
12753 		}
12754 		FREE_LOCK(ump);
12755 		if (waitfor == MNT_NOWAIT)
12756 			bawrite(bp);
12757 		else
12758 			*errorp = bwrite(bp);
12759 		ACQUIRE_LOCK(ump);
12760 		return (1);
12761 	}
12762 	return (0);
12763 }
12764 
12765 /*
12766  * Flush dependencies associated with an allocdirect block.
12767  */
12768 static int
12769 flush_newblk_dep(vp, mp, lbn)
12770 	struct vnode *vp;
12771 	struct mount *mp;
12772 	ufs_lbn_t lbn;
12773 {
12774 	struct newblk *newblk;
12775 	struct ufsmount *ump;
12776 	struct bufobj *bo;
12777 	struct inode *ip;
12778 	struct buf *bp;
12779 	ufs2_daddr_t blkno;
12780 	int error;
12781 
12782 	error = 0;
12783 	bo = &vp->v_bufobj;
12784 	ip = VTOI(vp);
12785 	blkno = DIP(ip, i_db[lbn]);
12786 	if (blkno == 0)
12787 		panic("flush_newblk_dep: Missing block");
12788 	ump = VFSTOUFS(mp);
12789 	ACQUIRE_LOCK(ump);
12790 	/*
12791 	 * Loop until all dependencies related to this block are satisfied.
12792 	 * We must be careful to restart after each sleep in case a write
12793 	 * completes some part of this process for us.
12794 	 */
12795 	for (;;) {
12796 		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
12797 			FREE_LOCK(ump);
12798 			break;
12799 		}
12800 		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
12801 			panic("flush_newblk_deps: Bad newblk %p", newblk);
12802 		/*
12803 		 * Flush the journal.
12804 		 */
12805 		if (newblk->nb_jnewblk != NULL) {
12806 			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12807 			continue;
12808 		}
12809 		/*
12810 		 * Write the bitmap dependency.
12811 		 */
12812 		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
12813 			bp = newblk->nb_bmsafemap->sm_buf;
12814 			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
12815 			if (bp == NULL)
12816 				continue;
12817 			FREE_LOCK(ump);
12818 			error = bwrite(bp);
12819 			if (error)
12820 				break;
12821 			ACQUIRE_LOCK(ump);
12822 			continue;
12823 		}
12824 		/*
12825 		 * Write the buffer.
12826 		 */
12827 		FREE_LOCK(ump);
12828 		BO_LOCK(bo);
12829 		bp = gbincore(bo, lbn);
12830 		if (bp != NULL) {
12831 			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
12832 			    LK_INTERLOCK, BO_LOCKPTR(bo));
12833 			if (error == ENOLCK) {
12834 				ACQUIRE_LOCK(ump);
12835 				continue; /* Slept, retry */
12836 			}
12837 			if (error != 0)
12838 				break;	/* Failed */
12839 			if (bp->b_flags & B_DELWRI) {
12840 				bremfree(bp);
12841 				error = bwrite(bp);
12842 				if (error)
12843 					break;
12844 			} else
12845 				BUF_UNLOCK(bp);
12846 		} else
12847 			BO_UNLOCK(bo);
12848 		/*
12849 		 * We have to wait for the direct pointers to
12850 		 * point at the newdirblk before the dependency
12851 		 * will go away.
12852 		 */
12853 		error = ffs_update(vp, 1);
12854 		if (error)
12855 			break;
12856 		ACQUIRE_LOCK(ump);
12857 	}
12858 	return (error);
12859 }
12860 
12861 /*
12862  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
12863  * Called with splbio blocked.
12864  */
12865 static int
12866 flush_pagedep_deps(pvp, mp, diraddhdp)
12867 	struct vnode *pvp;
12868 	struct mount *mp;
12869 	struct diraddhd *diraddhdp;
12870 {
12871 	struct inodedep *inodedep;
12872 	struct inoref *inoref;
12873 	struct ufsmount *ump;
12874 	struct diradd *dap;
12875 	struct vnode *vp;
12876 	int error = 0;
12877 	struct buf *bp;
12878 	ino_t inum;
12879 	struct diraddhd unfinished;
12880 
12881 	LIST_INIT(&unfinished);
12882 	ump = VFSTOUFS(mp);
12883 	LOCK_OWNED(ump);
12884 restart:
12885 	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
12886 		/*
12887 		 * Flush ourselves if this directory entry
12888 		 * has a MKDIR_PARENT dependency.
12889 		 */
12890 		if (dap->da_state & MKDIR_PARENT) {
12891 			FREE_LOCK(ump);
12892 			if ((error = ffs_update(pvp, 1)) != 0)
12893 				break;
12894 			ACQUIRE_LOCK(ump);
12895 			/*
12896 			 * If that cleared dependencies, go on to next.
12897 			 */
12898 			if (dap != LIST_FIRST(diraddhdp))
12899 				continue;
12900 			/*
12901 			 * All MKDIR_PARENT dependencies and all the
12902 			 * NEWBLOCK pagedeps that are contained in direct
12903 			 * blocks were resolved by doing above ffs_update.
12904 			 * Pagedeps contained in indirect blocks may
12905 			 * require a complete sync'ing of the directory.
12906 			 * We are in the midst of doing a complete sync,
12907 			 * so if they are not resolved in this pass we
12908 			 * defer them for now as they will be sync'ed by
12909 			 * our caller shortly.
12910 			 */
12911 			LIST_REMOVE(dap, da_pdlist);
12912 			LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
12913 			continue;
12914 		}
12915 		/*
12916 		 * A newly allocated directory must have its "." and
12917 		 * ".." entries written out before its name can be
12918 		 * committed in its parent.
12919 		 */
12920 		inum = dap->da_newinum;
12921 		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12922 			panic("flush_pagedep_deps: lost inode1");
12923 		/*
12924 		 * Wait for any pending journal adds to complete so we don't
12925 		 * cause rollbacks while syncing.
12926 		 */
12927 		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12928 			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12929 			    == DEPCOMPLETE) {
12930 				jwait(&inoref->if_list, MNT_WAIT);
12931 				goto restart;
12932 			}
12933 		}
12934 		if (dap->da_state & MKDIR_BODY) {
12935 			FREE_LOCK(ump);
12936 			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12937 			    FFSV_FORCEINSMQ)))
12938 				break;
12939 			error = flush_newblk_dep(vp, mp, 0);
12940 			/*
12941 			 * If we still have the dependency we might need to
12942 			 * update the vnode to sync the new link count to
12943 			 * disk.
12944 			 */
12945 			if (error == 0 && dap == LIST_FIRST(diraddhdp))
12946 				error = ffs_update(vp, 1);
12947 			vput(vp);
12948 			if (error != 0)
12949 				break;
12950 			ACQUIRE_LOCK(ump);
12951 			/*
12952 			 * If that cleared dependencies, go on to next.
12953 			 */
12954 			if (dap != LIST_FIRST(diraddhdp))
12955 				continue;
12956 			if (dap->da_state & MKDIR_BODY) {
12957 				inodedep_lookup(UFSTOVFS(ump), inum, 0,
12958 				    &inodedep);
12959 				panic("flush_pagedep_deps: MKDIR_BODY "
12960 				    "inodedep %p dap %p vp %p",
12961 				    inodedep, dap, vp);
12962 			}
12963 		}
12964 		/*
12965 		 * Flush the inode on which the directory entry depends.
12966 		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
12967 		 * the only remaining dependency is that the updated inode
12968 		 * count must get pushed to disk. The inode has already
12969 		 * been pushed into its inode buffer (via VOP_UPDATE) at
12970 		 * the time of the reference count change. So we need only
12971 		 * locate that buffer, ensure that there will be no rollback
12972 		 * caused by a bitmap dependency, then write the inode buffer.
12973 		 */
12974 retry:
12975 		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12976 			panic("flush_pagedep_deps: lost inode");
12977 		/*
12978 		 * If the inode still has bitmap dependencies,
12979 		 * push them to disk.
12980 		 */
12981 		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
12982 			bp = inodedep->id_bmsafemap->sm_buf;
12983 			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
12984 			if (bp == NULL)
12985 				goto retry;
12986 			FREE_LOCK(ump);
12987 			if ((error = bwrite(bp)) != 0)
12988 				break;
12989 			ACQUIRE_LOCK(ump);
12990 			if (dap != LIST_FIRST(diraddhdp))
12991 				continue;
12992 		}
12993 		/*
12994 		 * If the inode is still sitting in a buffer waiting
12995 		 * to be written or waiting for the link count to be
12996 		 * adjusted update it here to flush it to disk.
12997 		 */
12998 		if (dap == LIST_FIRST(diraddhdp)) {
12999 			FREE_LOCK(ump);
13000 			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
13001 			    FFSV_FORCEINSMQ)))
13002 				break;
13003 			error = ffs_update(vp, 1);
13004 			vput(vp);
13005 			if (error)
13006 				break;
13007 			ACQUIRE_LOCK(ump);
13008 		}
13009 		/*
13010 		 * If we have failed to get rid of all the dependencies
13011 		 * then something is seriously wrong.
13012 		 */
13013 		if (dap == LIST_FIRST(diraddhdp)) {
13014 			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
13015 			panic("flush_pagedep_deps: failed to flush "
13016 			    "inodedep %p ino %ju dap %p",
13017 			    inodedep, (uintmax_t)inum, dap);
13018 		}
13019 	}
13020 	if (error)
13021 		ACQUIRE_LOCK(ump);
13022 	while ((dap = LIST_FIRST(&unfinished)) != NULL) {
13023 		LIST_REMOVE(dap, da_pdlist);
13024 		LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
13025 	}
13026 	return (error);
13027 }
13028 
13029 /*
13030  * A large burst of file addition or deletion activity can drive the
13031  * memory load excessively high. First attempt to slow things down
13032  * using the techniques below. If that fails, this routine requests
13033  * the offending operations to fall back to running synchronously
13034  * until the memory load returns to a reasonable level.
13035  */
13036 int
13037 softdep_slowdown(vp)
13038 	struct vnode *vp;
13039 {
13040 	struct ufsmount *ump;
13041 	int jlow;
13042 	int max_softdeps_hard;
13043 
13044 	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
13045 	    ("softdep_slowdown called on non-softdep filesystem"));
13046 	ump = VFSTOUFS(vp->v_mount);
13047 	ACQUIRE_LOCK(ump);
13048 	jlow = 0;
13049 	/*
13050 	 * Check for journal space if needed.
13051 	 */
13052 	if (DOINGSUJ(vp)) {
13053 		if (journal_space(ump, 0) == 0)
13054 			jlow = 1;
13055 	}
13056 	/*
13057 	 * If the system is under its limits and our filesystem is
13058 	 * not responsible for more than our share of the usage and
13059 	 * we are not low on journal space, then no need to slow down.
13060 	 */
13061 	max_softdeps_hard = max_softdeps * 11 / 10;
13062 	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
13063 	    dep_current[D_INODEDEP] < max_softdeps_hard &&
13064 	    dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 &&
13065 	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 &&
13066 	    ump->softdep_curdeps[D_DIRREM] <
13067 	    (max_softdeps_hard / 2) / stat_flush_threads &&
13068 	    ump->softdep_curdeps[D_INODEDEP] <
13069 	    max_softdeps_hard / stat_flush_threads &&
13070 	    ump->softdep_curdeps[D_INDIRDEP] <
13071 	    (max_softdeps_hard / 1000) / stat_flush_threads &&
13072 	    ump->softdep_curdeps[D_FREEBLKS] <
13073 	    max_softdeps_hard / stat_flush_threads) {
13074 		FREE_LOCK(ump);
13075   		return (0);
13076 	}
13077 	/*
13078 	 * If the journal is low or our filesystem is over its limit
13079 	 * then speedup the cleanup.
13080 	 */
13081 	if (ump->softdep_curdeps[D_INDIRDEP] <
13082 	    (max_softdeps_hard / 1000) / stat_flush_threads || jlow)
13083 		softdep_speedup(ump);
13084 	stat_sync_limit_hit += 1;
13085 	FREE_LOCK(ump);
13086 	/*
13087 	 * We only slow down the rate at which new dependencies are
13088 	 * generated if we are not using journaling. With journaling,
13089 	 * the cleanup should always be sufficient to keep things
13090 	 * under control.
13091 	 */
13092 	if (DOINGSUJ(vp))
13093 		return (0);
13094 	return (1);
13095 }
13096 
13097 /*
13098  * Called by the allocation routines when they are about to fail
13099  * in the hope that we can free up the requested resource (inodes
13100  * or disk space).
13101  *
13102  * First check to see if the work list has anything on it. If it has,
13103  * clean up entries until we successfully free the requested resource.
13104  * Because this process holds inodes locked, we cannot handle any remove
13105  * requests that might block on a locked inode as that could lead to
13106  * deadlock. If the worklist yields none of the requested resource,
13107  * start syncing out vnodes to free up the needed space.
13108  */
13109 int
13110 softdep_request_cleanup(fs, vp, cred, resource)
13111 	struct fs *fs;
13112 	struct vnode *vp;
13113 	struct ucred *cred;
13114 	int resource;
13115 {
13116 	struct ufsmount *ump;
13117 	struct mount *mp;
13118 	struct vnode *lvp, *mvp;
13119 	long starttime;
13120 	ufs2_daddr_t needed;
13121 	int error;
13122 
13123 	/*
13124 	 * If we are being called because of a process doing a
13125 	 * copy-on-write, then it is not safe to process any
13126 	 * worklist items as we will recurse into the copyonwrite
13127 	 * routine.  This will result in an incoherent snapshot.
13128 	 * If the vnode that we hold is a snapshot, we must avoid
13129 	 * handling other resources that could cause deadlock.
13130 	 */
13131 	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
13132 		return (0);
13133 
13134 	if (resource == FLUSH_BLOCKS_WAIT)
13135 		stat_cleanup_blkrequests += 1;
13136 	else
13137 		stat_cleanup_inorequests += 1;
13138 
13139 	mp = vp->v_mount;
13140 	ump = VFSTOUFS(mp);
13141 	mtx_assert(UFS_MTX(ump), MA_OWNED);
13142 	UFS_UNLOCK(ump);
13143 	error = ffs_update(vp, 1);
13144 	if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) {
13145 		UFS_LOCK(ump);
13146 		return (0);
13147 	}
13148 	/*
13149 	 * If we are in need of resources, start by cleaning up
13150 	 * any block removals associated with our inode.
13151 	 */
13152 	ACQUIRE_LOCK(ump);
13153 	process_removes(vp);
13154 	process_truncates(vp);
13155 	FREE_LOCK(ump);
13156 	/*
13157 	 * Now clean up at least as many resources as we will need.
13158 	 *
13159 	 * When requested to clean up inodes, the number that are needed
13160 	 * is set by the number of simultaneous writers (mnt_writeopcount)
13161 	 * plus a bit of slop (2) in case some more writers show up while
13162 	 * we are cleaning.
13163 	 *
13164 	 * When requested to free up space, the amount of space that
13165 	 * we need is enough blocks to allocate a full-sized segment
13166 	 * (fs_contigsumsize). The number of such segments that will
13167 	 * be needed is set by the number of simultaneous writers
13168 	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
13169 	 * writers show up while we are cleaning.
13170 	 *
13171 	 * Additionally, if we are unpriviledged and allocating space,
13172 	 * we need to ensure that we clean up enough blocks to get the
13173 	 * needed number of blocks over the threshhold of the minimum
13174 	 * number of blocks required to be kept free by the filesystem
13175 	 * (fs_minfree).
13176 	 */
13177 	if (resource == FLUSH_INODES_WAIT) {
13178 		needed = vp->v_mount->mnt_writeopcount + 2;
13179 	} else if (resource == FLUSH_BLOCKS_WAIT) {
13180 		needed = (vp->v_mount->mnt_writeopcount + 2) *
13181 		    fs->fs_contigsumsize;
13182 		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
13183 			needed += fragstoblks(fs,
13184 			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
13185 			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
13186 	} else {
13187 		UFS_LOCK(ump);
13188 		printf("softdep_request_cleanup: Unknown resource type %d\n",
13189 		    resource);
13190 		return (0);
13191 	}
13192 	starttime = time_second;
13193 retry:
13194 	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
13195 	    fs->fs_cstotal.cs_nbfree <= needed) ||
13196 	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13197 	    fs->fs_cstotal.cs_nifree <= needed)) {
13198 		ACQUIRE_LOCK(ump);
13199 		if (ump->softdep_on_worklist > 0 &&
13200 		    process_worklist_item(UFSTOVFS(ump),
13201 		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
13202 			stat_worklist_push += 1;
13203 		FREE_LOCK(ump);
13204 	}
13205 	/*
13206 	 * If we still need resources and there are no more worklist
13207 	 * entries to process to obtain them, we have to start flushing
13208 	 * the dirty vnodes to force the release of additional requests
13209 	 * to the worklist that we can then process to reap addition
13210 	 * resources. We walk the vnodes associated with the mount point
13211 	 * until we get the needed worklist requests that we can reap.
13212 	 */
13213 	if ((resource == FLUSH_BLOCKS_WAIT &&
13214 	     fs->fs_cstotal.cs_nbfree <= needed) ||
13215 	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13216 	     fs->fs_cstotal.cs_nifree <= needed)) {
13217 		MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
13218 			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
13219 				VI_UNLOCK(lvp);
13220 				continue;
13221 			}
13222 			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
13223 			    curthread))
13224 				continue;
13225 			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
13226 				vput(lvp);
13227 				continue;
13228 			}
13229 			(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
13230 			vput(lvp);
13231 		}
13232 		lvp = ump->um_devvp;
13233 		if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
13234 			VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
13235 			VOP_UNLOCK(lvp, 0);
13236 		}
13237 		if (ump->softdep_on_worklist > 0) {
13238 			stat_cleanup_retries += 1;
13239 			goto retry;
13240 		}
13241 		stat_cleanup_failures += 1;
13242 	}
13243 	if (time_second - starttime > stat_cleanup_high_delay)
13244 		stat_cleanup_high_delay = time_second - starttime;
13245 	UFS_LOCK(ump);
13246 	return (1);
13247 }
13248 
13249 /*
13250  * If memory utilization has gotten too high, deliberately slow things
13251  * down and speed up the I/O processing.
13252  */
13253 static int
13254 request_cleanup(mp, resource)
13255 	struct mount *mp;
13256 	int resource;
13257 {
13258 	struct thread *td = curthread;
13259 	struct ufsmount *ump;
13260 
13261 	ump = VFSTOUFS(mp);
13262 	LOCK_OWNED(ump);
13263 	/*
13264 	 * We never hold up the filesystem syncer or buf daemon.
13265 	 */
13266 	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
13267 		return (0);
13268 	/*
13269 	 * First check to see if the work list has gotten backlogged.
13270 	 * If it has, co-opt this process to help clean up two entries.
13271 	 * Because this process may hold inodes locked, we cannot
13272 	 * handle any remove requests that might block on a locked
13273 	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
13274 	 * to avoid recursively processing the worklist.
13275 	 */
13276 	if (ump->softdep_on_worklist > max_softdeps / 10) {
13277 		td->td_pflags |= TDP_SOFTDEP;
13278 		process_worklist_item(mp, 2, LK_NOWAIT);
13279 		td->td_pflags &= ~TDP_SOFTDEP;
13280 		stat_worklist_push += 2;
13281 		return(1);
13282 	}
13283 	/*
13284 	 * Next, we attempt to speed up the syncer process. If that
13285 	 * is successful, then we allow the process to continue.
13286 	 */
13287 	if (softdep_speedup(ump) &&
13288 	    resource != FLUSH_BLOCKS_WAIT &&
13289 	    resource != FLUSH_INODES_WAIT)
13290 		return(0);
13291 	/*
13292 	 * If we are resource constrained on inode dependencies, try
13293 	 * flushing some dirty inodes. Otherwise, we are constrained
13294 	 * by file deletions, so try accelerating flushes of directories
13295 	 * with removal dependencies. We would like to do the cleanup
13296 	 * here, but we probably hold an inode locked at this point and
13297 	 * that might deadlock against one that we try to clean. So,
13298 	 * the best that we can do is request the syncer daemon to do
13299 	 * the cleanup for us.
13300 	 */
13301 	switch (resource) {
13302 
13303 	case FLUSH_INODES:
13304 	case FLUSH_INODES_WAIT:
13305 		ACQUIRE_GBLLOCK(&lk);
13306 		stat_ino_limit_push += 1;
13307 		req_clear_inodedeps += 1;
13308 		FREE_GBLLOCK(&lk);
13309 		stat_countp = &stat_ino_limit_hit;
13310 		break;
13311 
13312 	case FLUSH_BLOCKS:
13313 	case FLUSH_BLOCKS_WAIT:
13314 		ACQUIRE_GBLLOCK(&lk);
13315 		stat_blk_limit_push += 1;
13316 		req_clear_remove += 1;
13317 		FREE_GBLLOCK(&lk);
13318 		stat_countp = &stat_blk_limit_hit;
13319 		break;
13320 
13321 	default:
13322 		panic("request_cleanup: unknown type");
13323 	}
13324 	/*
13325 	 * Hopefully the syncer daemon will catch up and awaken us.
13326 	 * We wait at most tickdelay before proceeding in any case.
13327 	 */
13328 	ACQUIRE_GBLLOCK(&lk);
13329 	FREE_LOCK(ump);
13330 	proc_waiting += 1;
13331 	if (callout_pending(&softdep_callout) == FALSE)
13332 		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13333 		    pause_timer, 0);
13334 
13335 	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
13336 	proc_waiting -= 1;
13337 	FREE_GBLLOCK(&lk);
13338 	ACQUIRE_LOCK(ump);
13339 	return (1);
13340 }
13341 
13342 /*
13343  * Awaken processes pausing in request_cleanup and clear proc_waiting
13344  * to indicate that there is no longer a timer running. Pause_timer
13345  * will be called with the global softdep mutex (&lk) locked.
13346  */
13347 static void
13348 pause_timer(arg)
13349 	void *arg;
13350 {
13351 
13352 	GBLLOCK_OWNED(&lk);
13353 	/*
13354 	 * The callout_ API has acquired mtx and will hold it around this
13355 	 * function call.
13356 	 */
13357 	*stat_countp += proc_waiting;
13358 	wakeup(&proc_waiting);
13359 }
13360 
13361 /*
13362  * If requested, try removing inode or removal dependencies.
13363  */
13364 static void
13365 check_clear_deps(mp)
13366 	struct mount *mp;
13367 {
13368 
13369 	/*
13370 	 * If we are suspended, it may be because of our using
13371 	 * too many inodedeps, so help clear them out.
13372 	 */
13373 	if (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended)
13374 		clear_inodedeps(mp);
13375 	/*
13376 	 * General requests for cleanup of backed up dependencies
13377 	 */
13378 	ACQUIRE_GBLLOCK(&lk);
13379 	if (req_clear_inodedeps) {
13380 		req_clear_inodedeps -= 1;
13381 		FREE_GBLLOCK(&lk);
13382 		clear_inodedeps(mp);
13383 		ACQUIRE_GBLLOCK(&lk);
13384 		wakeup(&proc_waiting);
13385 	}
13386 	if (req_clear_remove) {
13387 		req_clear_remove -= 1;
13388 		FREE_GBLLOCK(&lk);
13389 		clear_remove(mp);
13390 		ACQUIRE_GBLLOCK(&lk);
13391 		wakeup(&proc_waiting);
13392 	}
13393 	FREE_GBLLOCK(&lk);
13394 }
13395 
13396 /*
13397  * Flush out a directory with at least one removal dependency in an effort to
13398  * reduce the number of dirrem, freefile, and freeblks dependency structures.
13399  */
13400 static void
13401 clear_remove(mp)
13402 	struct mount *mp;
13403 {
13404 	struct pagedep_hashhead *pagedephd;
13405 	struct pagedep *pagedep;
13406 	struct ufsmount *ump;
13407 	struct vnode *vp;
13408 	struct bufobj *bo;
13409 	int error, cnt;
13410 	ino_t ino;
13411 
13412 	ump = VFSTOUFS(mp);
13413 	LOCK_OWNED(ump);
13414 
13415 	for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
13416 		pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
13417 		if (ump->pagedep_nextclean > ump->pagedep_hash_size)
13418 			ump->pagedep_nextclean = 0;
13419 		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
13420 			if (LIST_EMPTY(&pagedep->pd_dirremhd))
13421 				continue;
13422 			ino = pagedep->pd_ino;
13423 			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13424 				continue;
13425 			FREE_LOCK(ump);
13426 
13427 			/*
13428 			 * Let unmount clear deps
13429 			 */
13430 			error = vfs_busy(mp, MBF_NOWAIT);
13431 			if (error != 0)
13432 				goto finish_write;
13433 			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13434 			     FFSV_FORCEINSMQ);
13435 			vfs_unbusy(mp);
13436 			if (error != 0) {
13437 				softdep_error("clear_remove: vget", error);
13438 				goto finish_write;
13439 			}
13440 			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13441 				softdep_error("clear_remove: fsync", error);
13442 			bo = &vp->v_bufobj;
13443 			BO_LOCK(bo);
13444 			drain_output(vp);
13445 			BO_UNLOCK(bo);
13446 			vput(vp);
13447 		finish_write:
13448 			vn_finished_write(mp);
13449 			ACQUIRE_LOCK(ump);
13450 			return;
13451 		}
13452 	}
13453 }
13454 
13455 /*
13456  * Clear out a block of dirty inodes in an effort to reduce
13457  * the number of inodedep dependency structures.
13458  */
13459 static void
13460 clear_inodedeps(mp)
13461 	struct mount *mp;
13462 {
13463 	struct inodedep_hashhead *inodedephd;
13464 	struct inodedep *inodedep;
13465 	struct ufsmount *ump;
13466 	struct vnode *vp;
13467 	struct fs *fs;
13468 	int error, cnt;
13469 	ino_t firstino, lastino, ino;
13470 
13471 	ump = VFSTOUFS(mp);
13472 	fs = ump->um_fs;
13473 	LOCK_OWNED(ump);
13474 	/*
13475 	 * Pick a random inode dependency to be cleared.
13476 	 * We will then gather up all the inodes in its block
13477 	 * that have dependencies and flush them out.
13478 	 */
13479 	for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
13480 		inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
13481 		if (ump->inodedep_nextclean > ump->inodedep_hash_size)
13482 			ump->inodedep_nextclean = 0;
13483 		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
13484 			break;
13485 	}
13486 	if (inodedep == NULL)
13487 		return;
13488 	/*
13489 	 * Find the last inode in the block with dependencies.
13490 	 */
13491 	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
13492 	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
13493 		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
13494 			break;
13495 	/*
13496 	 * Asynchronously push all but the last inode with dependencies.
13497 	 * Synchronously push the last inode with dependencies to ensure
13498 	 * that the inode block gets written to free up the inodedeps.
13499 	 */
13500 	for (ino = firstino; ino <= lastino; ino++) {
13501 		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
13502 			continue;
13503 		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13504 			continue;
13505 		FREE_LOCK(ump);
13506 		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
13507 		if (error != 0) {
13508 			vn_finished_write(mp);
13509 			ACQUIRE_LOCK(ump);
13510 			return;
13511 		}
13512 		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13513 		    FFSV_FORCEINSMQ)) != 0) {
13514 			softdep_error("clear_inodedeps: vget", error);
13515 			vfs_unbusy(mp);
13516 			vn_finished_write(mp);
13517 			ACQUIRE_LOCK(ump);
13518 			return;
13519 		}
13520 		vfs_unbusy(mp);
13521 		if (ino == lastino) {
13522 			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
13523 				softdep_error("clear_inodedeps: fsync1", error);
13524 		} else {
13525 			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13526 				softdep_error("clear_inodedeps: fsync2", error);
13527 			BO_LOCK(&vp->v_bufobj);
13528 			drain_output(vp);
13529 			BO_UNLOCK(&vp->v_bufobj);
13530 		}
13531 		vput(vp);
13532 		vn_finished_write(mp);
13533 		ACQUIRE_LOCK(ump);
13534 	}
13535 }
13536 
13537 void
13538 softdep_buf_append(bp, wkhd)
13539 	struct buf *bp;
13540 	struct workhead *wkhd;
13541 {
13542 	struct worklist *wk;
13543 	struct ufsmount *ump;
13544 
13545 	if ((wk = LIST_FIRST(wkhd)) == NULL)
13546 		return;
13547 	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13548 	    ("softdep_buf_append called on non-softdep filesystem"));
13549 	ump = VFSTOUFS(wk->wk_mp);
13550 	ACQUIRE_LOCK(ump);
13551 	while ((wk = LIST_FIRST(wkhd)) != NULL) {
13552 		WORKLIST_REMOVE(wk);
13553 		WORKLIST_INSERT(&bp->b_dep, wk);
13554 	}
13555 	FREE_LOCK(ump);
13556 
13557 }
13558 
13559 void
13560 softdep_inode_append(ip, cred, wkhd)
13561 	struct inode *ip;
13562 	struct ucred *cred;
13563 	struct workhead *wkhd;
13564 {
13565 	struct buf *bp;
13566 	struct fs *fs;
13567 	int error;
13568 
13569 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
13570 	    ("softdep_inode_append called on non-softdep filesystem"));
13571 	fs = ip->i_fs;
13572 	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
13573 	    (int)fs->fs_bsize, cred, &bp);
13574 	if (error) {
13575 		bqrelse(bp);
13576 		softdep_freework(wkhd);
13577 		return;
13578 	}
13579 	softdep_buf_append(bp, wkhd);
13580 	bqrelse(bp);
13581 }
13582 
13583 void
13584 softdep_freework(wkhd)
13585 	struct workhead *wkhd;
13586 {
13587 	struct worklist *wk;
13588 	struct ufsmount *ump;
13589 
13590 	if ((wk = LIST_FIRST(wkhd)) == NULL)
13591 		return;
13592 	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13593 	    ("softdep_freework called on non-softdep filesystem"));
13594 	ump = VFSTOUFS(wk->wk_mp);
13595 	ACQUIRE_LOCK(ump);
13596 	handle_jwork(wkhd);
13597 	FREE_LOCK(ump);
13598 }
13599 
13600 /*
13601  * Function to determine if the buffer has outstanding dependencies
13602  * that will cause a roll-back if the buffer is written. If wantcount
13603  * is set, return number of dependencies, otherwise just yes or no.
13604  */
13605 static int
13606 softdep_count_dependencies(bp, wantcount)
13607 	struct buf *bp;
13608 	int wantcount;
13609 {
13610 	struct worklist *wk;
13611 	struct ufsmount *ump;
13612 	struct bmsafemap *bmsafemap;
13613 	struct freework *freework;
13614 	struct inodedep *inodedep;
13615 	struct indirdep *indirdep;
13616 	struct freeblks *freeblks;
13617 	struct allocindir *aip;
13618 	struct pagedep *pagedep;
13619 	struct dirrem *dirrem;
13620 	struct newblk *newblk;
13621 	struct mkdir *mkdir;
13622 	struct diradd *dap;
13623 	int i, retval;
13624 
13625 	retval = 0;
13626 	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
13627 		return (0);
13628 	ump = VFSTOUFS(wk->wk_mp);
13629 	ACQUIRE_LOCK(ump);
13630 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
13631 		switch (wk->wk_type) {
13632 
13633 		case D_INODEDEP:
13634 			inodedep = WK_INODEDEP(wk);
13635 			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
13636 				/* bitmap allocation dependency */
13637 				retval += 1;
13638 				if (!wantcount)
13639 					goto out;
13640 			}
13641 			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
13642 				/* direct block pointer dependency */
13643 				retval += 1;
13644 				if (!wantcount)
13645 					goto out;
13646 			}
13647 			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
13648 				/* direct block pointer dependency */
13649 				retval += 1;
13650 				if (!wantcount)
13651 					goto out;
13652 			}
13653 			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
13654 				/* Add reference dependency. */
13655 				retval += 1;
13656 				if (!wantcount)
13657 					goto out;
13658 			}
13659 			continue;
13660 
13661 		case D_INDIRDEP:
13662 			indirdep = WK_INDIRDEP(wk);
13663 
13664 			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
13665 				/* indirect truncation dependency */
13666 				retval += 1;
13667 				if (!wantcount)
13668 					goto out;
13669 			}
13670 
13671 			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
13672 				/* indirect block pointer dependency */
13673 				retval += 1;
13674 				if (!wantcount)
13675 					goto out;
13676 			}
13677 			continue;
13678 
13679 		case D_PAGEDEP:
13680 			pagedep = WK_PAGEDEP(wk);
13681 			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
13682 				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
13683 					/* Journal remove ref dependency. */
13684 					retval += 1;
13685 					if (!wantcount)
13686 						goto out;
13687 				}
13688 			}
13689 			for (i = 0; i < DAHASHSZ; i++) {
13690 
13691 				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
13692 					/* directory entry dependency */
13693 					retval += 1;
13694 					if (!wantcount)
13695 						goto out;
13696 				}
13697 			}
13698 			continue;
13699 
13700 		case D_BMSAFEMAP:
13701 			bmsafemap = WK_BMSAFEMAP(wk);
13702 			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
13703 				/* Add reference dependency. */
13704 				retval += 1;
13705 				if (!wantcount)
13706 					goto out;
13707 			}
13708 			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
13709 				/* Allocate block dependency. */
13710 				retval += 1;
13711 				if (!wantcount)
13712 					goto out;
13713 			}
13714 			continue;
13715 
13716 		case D_FREEBLKS:
13717 			freeblks = WK_FREEBLKS(wk);
13718 			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
13719 				/* Freeblk journal dependency. */
13720 				retval += 1;
13721 				if (!wantcount)
13722 					goto out;
13723 			}
13724 			continue;
13725 
13726 		case D_ALLOCDIRECT:
13727 		case D_ALLOCINDIR:
13728 			newblk = WK_NEWBLK(wk);
13729 			if (newblk->nb_jnewblk) {
13730 				/* Journal allocate dependency. */
13731 				retval += 1;
13732 				if (!wantcount)
13733 					goto out;
13734 			}
13735 			continue;
13736 
13737 		case D_MKDIR:
13738 			mkdir = WK_MKDIR(wk);
13739 			if (mkdir->md_jaddref) {
13740 				/* Journal reference dependency. */
13741 				retval += 1;
13742 				if (!wantcount)
13743 					goto out;
13744 			}
13745 			continue;
13746 
13747 		case D_FREEWORK:
13748 		case D_FREEDEP:
13749 		case D_JSEGDEP:
13750 		case D_JSEG:
13751 		case D_SBDEP:
13752 			/* never a dependency on these blocks */
13753 			continue;
13754 
13755 		default:
13756 			panic("softdep_count_dependencies: Unexpected type %s",
13757 			    TYPENAME(wk->wk_type));
13758 			/* NOTREACHED */
13759 		}
13760 	}
13761 out:
13762 	FREE_LOCK(ump);
13763 	return retval;
13764 }
13765 
13766 /*
13767  * Acquire exclusive access to a buffer.
13768  * Must be called with a locked mtx parameter.
13769  * Return acquired buffer or NULL on failure.
13770  */
13771 static struct buf *
13772 getdirtybuf(bp, lock, waitfor)
13773 	struct buf *bp;
13774 	struct rwlock *lock;
13775 	int waitfor;
13776 {
13777 	int error;
13778 
13779 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
13780 		if (waitfor != MNT_WAIT)
13781 			return (NULL);
13782 		error = BUF_LOCK(bp,
13783 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
13784 		/*
13785 		 * Even if we sucessfully acquire bp here, we have dropped
13786 		 * lock, which may violates our guarantee.
13787 		 */
13788 		if (error == 0)
13789 			BUF_UNLOCK(bp);
13790 		else if (error != ENOLCK)
13791 			panic("getdirtybuf: inconsistent lock: %d", error);
13792 		rw_wlock(lock);
13793 		return (NULL);
13794 	}
13795 	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13796 		if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
13797 			rw_wunlock(lock);
13798 			BO_LOCK(bp->b_bufobj);
13799 			BUF_UNLOCK(bp);
13800 			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13801 				bp->b_vflags |= BV_BKGRDWAIT;
13802 				msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
13803 				       PRIBIO | PDROP, "getbuf", 0);
13804 			} else
13805 				BO_UNLOCK(bp->b_bufobj);
13806 			rw_wlock(lock);
13807 			return (NULL);
13808 		}
13809 		BUF_UNLOCK(bp);
13810 		if (waitfor != MNT_WAIT)
13811 			return (NULL);
13812 		/*
13813 		 * The lock argument must be bp->b_vp's mutex in
13814 		 * this case.
13815 		 */
13816 #ifdef	DEBUG_VFS_LOCKS
13817 		if (bp->b_vp->v_type != VCHR)
13818 			ASSERT_BO_WLOCKED(bp->b_bufobj);
13819 #endif
13820 		bp->b_vflags |= BV_BKGRDWAIT;
13821 		rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
13822 		return (NULL);
13823 	}
13824 	if ((bp->b_flags & B_DELWRI) == 0) {
13825 		BUF_UNLOCK(bp);
13826 		return (NULL);
13827 	}
13828 	bremfree(bp);
13829 	return (bp);
13830 }
13831 
13832 
13833 /*
13834  * Check if it is safe to suspend the file system now.  On entry,
13835  * the vnode interlock for devvp should be held.  Return 0 with
13836  * the mount interlock held if the file system can be suspended now,
13837  * otherwise return EAGAIN with the mount interlock held.
13838  */
13839 int
13840 softdep_check_suspend(struct mount *mp,
13841 		      struct vnode *devvp,
13842 		      int softdep_depcnt,
13843 		      int softdep_accdepcnt,
13844 		      int secondary_writes,
13845 		      int secondary_accwrites)
13846 {
13847 	struct bufobj *bo;
13848 	struct ufsmount *ump;
13849 	int error;
13850 
13851 	bo = &devvp->v_bufobj;
13852 	ASSERT_BO_WLOCKED(bo);
13853 
13854 	/*
13855 	 * If we are not running with soft updates, then we need only
13856 	 * deal with secondary writes as we try to suspend.
13857 	 */
13858 	if (MOUNTEDSOFTDEP(mp) == 0) {
13859 		MNT_ILOCK(mp);
13860 		while (mp->mnt_secondary_writes != 0) {
13861 			BO_UNLOCK(bo);
13862 			msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
13863 			    (PUSER - 1) | PDROP, "secwr", 0);
13864 			BO_LOCK(bo);
13865 			MNT_ILOCK(mp);
13866 		}
13867 
13868 		/*
13869 		 * Reasons for needing more work before suspend:
13870 		 * - Dirty buffers on devvp.
13871 		 * - Secondary writes occurred after start of vnode sync loop
13872 		 */
13873 		error = 0;
13874 		if (bo->bo_numoutput > 0 ||
13875 		    bo->bo_dirty.bv_cnt > 0 ||
13876 		    secondary_writes != 0 ||
13877 		    mp->mnt_secondary_writes != 0 ||
13878 		    secondary_accwrites != mp->mnt_secondary_accwrites)
13879 			error = EAGAIN;
13880 		BO_UNLOCK(bo);
13881 		return (error);
13882 	}
13883 
13884 	/*
13885 	 * If we are running with soft updates, then we need to coordinate
13886 	 * with them as we try to suspend.
13887 	 */
13888 	ump = VFSTOUFS(mp);
13889 	for (;;) {
13890 		if (!TRY_ACQUIRE_LOCK(ump)) {
13891 			BO_UNLOCK(bo);
13892 			ACQUIRE_LOCK(ump);
13893 			FREE_LOCK(ump);
13894 			BO_LOCK(bo);
13895 			continue;
13896 		}
13897 		MNT_ILOCK(mp);
13898 		if (mp->mnt_secondary_writes != 0) {
13899 			FREE_LOCK(ump);
13900 			BO_UNLOCK(bo);
13901 			msleep(&mp->mnt_secondary_writes,
13902 			       MNT_MTX(mp),
13903 			       (PUSER - 1) | PDROP, "secwr", 0);
13904 			BO_LOCK(bo);
13905 			continue;
13906 		}
13907 		break;
13908 	}
13909 
13910 	/*
13911 	 * Reasons for needing more work before suspend:
13912 	 * - Dirty buffers on devvp.
13913 	 * - Softdep activity occurred after start of vnode sync loop
13914 	 * - Secondary writes occurred after start of vnode sync loop
13915 	 */
13916 	error = 0;
13917 	if (bo->bo_numoutput > 0 ||
13918 	    bo->bo_dirty.bv_cnt > 0 ||
13919 	    softdep_depcnt != 0 ||
13920 	    ump->softdep_deps != 0 ||
13921 	    softdep_accdepcnt != ump->softdep_accdeps ||
13922 	    secondary_writes != 0 ||
13923 	    mp->mnt_secondary_writes != 0 ||
13924 	    secondary_accwrites != mp->mnt_secondary_accwrites)
13925 		error = EAGAIN;
13926 	FREE_LOCK(ump);
13927 	BO_UNLOCK(bo);
13928 	return (error);
13929 }
13930 
13931 
13932 /*
13933  * Get the number of dependency structures for the file system, both
13934  * the current number and the total number allocated.  These will
13935  * later be used to detect that softdep processing has occurred.
13936  */
13937 void
13938 softdep_get_depcounts(struct mount *mp,
13939 		      int *softdep_depsp,
13940 		      int *softdep_accdepsp)
13941 {
13942 	struct ufsmount *ump;
13943 
13944 	if (MOUNTEDSOFTDEP(mp) == 0) {
13945 		*softdep_depsp = 0;
13946 		*softdep_accdepsp = 0;
13947 		return;
13948 	}
13949 	ump = VFSTOUFS(mp);
13950 	ACQUIRE_LOCK(ump);
13951 	*softdep_depsp = ump->softdep_deps;
13952 	*softdep_accdepsp = ump->softdep_accdeps;
13953 	FREE_LOCK(ump);
13954 }
13955 
13956 /*
13957  * Wait for pending output on a vnode to complete.
13958  * Must be called with vnode lock and interlock locked.
13959  *
13960  * XXX: Should just be a call to bufobj_wwait().
13961  */
13962 static void
13963 drain_output(vp)
13964 	struct vnode *vp;
13965 {
13966 	struct bufobj *bo;
13967 
13968 	bo = &vp->v_bufobj;
13969 	ASSERT_VOP_LOCKED(vp, "drain_output");
13970 	ASSERT_BO_WLOCKED(bo);
13971 
13972 	while (bo->bo_numoutput) {
13973 		bo->bo_flag |= BO_WWAIT;
13974 		msleep((caddr_t)&bo->bo_numoutput,
13975 		    BO_LOCKPTR(bo), PRIBIO + 1, "drainvp", 0);
13976 	}
13977 }
13978 
13979 /*
13980  * Called whenever a buffer that is being invalidated or reallocated
13981  * contains dependencies. This should only happen if an I/O error has
13982  * occurred. The routine is called with the buffer locked.
13983  */
13984 static void
13985 softdep_deallocate_dependencies(bp)
13986 	struct buf *bp;
13987 {
13988 
13989 	if ((bp->b_ioflags & BIO_ERROR) == 0)
13990 		panic("softdep_deallocate_dependencies: dangling deps");
13991 	if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
13992 		softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
13993 	else
13994 		printf("softdep_deallocate_dependencies: "
13995 		    "got error %d while accessing filesystem\n", bp->b_error);
13996 	if (bp->b_error != ENXIO)
13997 		panic("softdep_deallocate_dependencies: unrecovered I/O error");
13998 }
13999 
14000 /*
14001  * Function to handle asynchronous write errors in the filesystem.
14002  */
14003 static void
14004 softdep_error(func, error)
14005 	char *func;
14006 	int error;
14007 {
14008 
14009 	/* XXX should do something better! */
14010 	printf("%s: got error %d while accessing filesystem\n", func, error);
14011 }
14012 
14013 #ifdef DDB
14014 
14015 static void
14016 inodedep_print(struct inodedep *inodedep, int verbose)
14017 {
14018 	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
14019 	    " saveino %p\n",
14020 	    inodedep, inodedep->id_fs, inodedep->id_state,
14021 	    (intmax_t)inodedep->id_ino,
14022 	    (intmax_t)fsbtodb(inodedep->id_fs,
14023 	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
14024 	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
14025 	    inodedep->id_savedino1);
14026 
14027 	if (verbose == 0)
14028 		return;
14029 
14030 	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
14031 	    "mkdiradd %p\n",
14032 	    LIST_FIRST(&inodedep->id_pendinghd),
14033 	    LIST_FIRST(&inodedep->id_bufwait),
14034 	    LIST_FIRST(&inodedep->id_inowait),
14035 	    TAILQ_FIRST(&inodedep->id_inoreflst),
14036 	    inodedep->id_mkdiradd);
14037 	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
14038 	    TAILQ_FIRST(&inodedep->id_inoupdt),
14039 	    TAILQ_FIRST(&inodedep->id_newinoupdt),
14040 	    TAILQ_FIRST(&inodedep->id_extupdt),
14041 	    TAILQ_FIRST(&inodedep->id_newextupdt));
14042 }
14043 
14044 DB_SHOW_COMMAND(inodedep, db_show_inodedep)
14045 {
14046 
14047 	if (have_addr == 0) {
14048 		db_printf("Address required\n");
14049 		return;
14050 	}
14051 	inodedep_print((struct inodedep*)addr, 1);
14052 }
14053 
14054 DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
14055 {
14056 	struct inodedep_hashhead *inodedephd;
14057 	struct inodedep *inodedep;
14058 	struct ufsmount *ump;
14059 	int cnt;
14060 
14061 	if (have_addr == 0) {
14062 		db_printf("Address required\n");
14063 		return;
14064 	}
14065 	ump = (struct ufsmount *)addr;
14066 	for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) {
14067 		inodedephd = &ump->inodedep_hashtbl[cnt];
14068 		LIST_FOREACH(inodedep, inodedephd, id_hash) {
14069 			inodedep_print(inodedep, 0);
14070 		}
14071 	}
14072 }
14073 
14074 DB_SHOW_COMMAND(worklist, db_show_worklist)
14075 {
14076 	struct worklist *wk;
14077 
14078 	if (have_addr == 0) {
14079 		db_printf("Address required\n");
14080 		return;
14081 	}
14082 	wk = (struct worklist *)addr;
14083 	printf("worklist: %p type %s state 0x%X\n",
14084 	    wk, TYPENAME(wk->wk_type), wk->wk_state);
14085 }
14086 
14087 DB_SHOW_COMMAND(workhead, db_show_workhead)
14088 {
14089 	struct workhead *wkhd;
14090 	struct worklist *wk;
14091 	int i;
14092 
14093 	if (have_addr == 0) {
14094 		db_printf("Address required\n");
14095 		return;
14096 	}
14097 	wkhd = (struct workhead *)addr;
14098 	wk = LIST_FIRST(wkhd);
14099 	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
14100 		db_printf("worklist: %p type %s state 0x%X",
14101 		    wk, TYPENAME(wk->wk_type), wk->wk_state);
14102 	if (i == 100)
14103 		db_printf("workhead overflow");
14104 	printf("\n");
14105 }
14106 
14107 
14108 DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
14109 {
14110 	struct mkdirlist *mkdirlisthd;
14111 	struct jaddref *jaddref;
14112 	struct diradd *diradd;
14113 	struct mkdir *mkdir;
14114 
14115 	if (have_addr == 0) {
14116 		db_printf("Address required\n");
14117 		return;
14118 	}
14119 	mkdirlisthd = (struct mkdirlist *)addr;
14120 	LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) {
14121 		diradd = mkdir->md_diradd;
14122 		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
14123 		    mkdir, mkdir->md_state, diradd, diradd->da_state);
14124 		if ((jaddref = mkdir->md_jaddref) != NULL)
14125 			db_printf(" jaddref %p jaddref state 0x%X",
14126 			    jaddref, jaddref->ja_state);
14127 		db_printf("\n");
14128 	}
14129 }
14130 
14131 /* exported to ffs_vfsops.c */
14132 extern void db_print_ffs(struct ufsmount *ump);
14133 void
14134 db_print_ffs(struct ufsmount *ump)
14135 {
14136 	db_printf("mp %p %s devvp %p fs %p su_wl %d su_deps %d su_req %d\n",
14137 	    ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname,
14138 	    ump->um_devvp, ump->um_fs, ump->softdep_on_worklist,
14139 	    ump->softdep_deps, ump->softdep_req);
14140 }
14141 
14142 #endif /* DDB */
14143 
14144 #endif /* SOFTUPDATES */
14145