xref: /freebsd/sys/ufs/ffs/ffs_softdep.c (revision 884a2a699669ec61e2366e3e358342dbc94be24a)
1 /*-
2  * Copyright 1998, 2000 Marshall Kirk McKusick.
3  * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
4  * All rights reserved.
5  *
6  * The soft updates code is derived from the appendix of a University
7  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8  * "Soft Updates: A Solution to the Metadata Update Problem in File
9  * Systems", CSE-TR-254-95, August 1995).
10  *
11  * Further information about soft updates can be obtained from:
12  *
13  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
14  *	1614 Oxford Street		mckusick@mckusick.com
15  *	Berkeley, CA 94709-1608		+1-510-843-9542
16  *	USA
17  *
18  * Redistribution and use in source and binary forms, with or without
19  * modification, are permitted provided that the following conditions
20  * are met:
21  *
22  * 1. Redistributions of source code must retain the above copyright
23  *    notice, this list of conditions and the following disclaimer.
24  * 2. Redistributions in binary form must reproduce the above copyright
25  *    notice, this list of conditions and the following disclaimer in the
26  *    documentation and/or other materials provided with the distribution.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
29  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38  *
39  *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
40  */
41 
42 #include <sys/cdefs.h>
43 __FBSDID("$FreeBSD$");
44 
45 #include "opt_ffs.h"
46 #include "opt_ddb.h"
47 
48 /*
49  * For now we want the safety net that the DEBUG flag provides.
50  */
51 #ifndef DEBUG
52 #define DEBUG
53 #endif
54 
55 #include <sys/param.h>
56 #include <sys/kernel.h>
57 #include <sys/systm.h>
58 #include <sys/bio.h>
59 #include <sys/buf.h>
60 #include <sys/kdb.h>
61 #include <sys/kthread.h>
62 #include <sys/lock.h>
63 #include <sys/malloc.h>
64 #include <sys/mount.h>
65 #include <sys/mutex.h>
66 #include <sys/namei.h>
67 #include <sys/priv.h>
68 #include <sys/proc.h>
69 #include <sys/stat.h>
70 #include <sys/sysctl.h>
71 #include <sys/syslog.h>
72 #include <sys/vnode.h>
73 #include <sys/conf.h>
74 #include <ufs/ufs/dir.h>
75 #include <ufs/ufs/extattr.h>
76 #include <ufs/ufs/quota.h>
77 #include <ufs/ufs/inode.h>
78 #include <ufs/ufs/ufsmount.h>
79 #include <ufs/ffs/fs.h>
80 #include <ufs/ffs/softdep.h>
81 #include <ufs/ffs/ffs_extern.h>
82 #include <ufs/ufs/ufs_extern.h>
83 
84 #include <vm/vm.h>
85 
86 #include <ddb/ddb.h>
87 
88 #ifndef SOFTUPDATES
89 
90 int
91 softdep_flushfiles(oldmnt, flags, td)
92 	struct mount *oldmnt;
93 	int flags;
94 	struct thread *td;
95 {
96 
97 	panic("softdep_flushfiles called");
98 }
99 
100 int
101 softdep_mount(devvp, mp, fs, cred)
102 	struct vnode *devvp;
103 	struct mount *mp;
104 	struct fs *fs;
105 	struct ucred *cred;
106 {
107 
108 	return (0);
109 }
110 
111 void
112 softdep_initialize()
113 {
114 
115 	return;
116 }
117 
118 void
119 softdep_uninitialize()
120 {
121 
122 	return;
123 }
124 
125 void
126 softdep_unmount(mp)
127 	struct mount *mp;
128 {
129 
130 }
131 
132 void
133 softdep_setup_sbupdate(ump, fs, bp)
134 	struct ufsmount *ump;
135 	struct fs *fs;
136 	struct buf *bp;
137 {
138 }
139 
140 void
141 softdep_setup_inomapdep(bp, ip, newinum)
142 	struct buf *bp;
143 	struct inode *ip;
144 	ino_t newinum;
145 {
146 
147 	panic("softdep_setup_inomapdep called");
148 }
149 
150 void
151 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
152 	struct buf *bp;
153 	struct mount *mp;
154 	ufs2_daddr_t newblkno;
155 	int frags;
156 	int oldfrags;
157 {
158 
159 	panic("softdep_setup_blkmapdep called");
160 }
161 
162 void
163 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
164 	struct inode *ip;
165 	ufs_lbn_t lbn;
166 	ufs2_daddr_t newblkno;
167 	ufs2_daddr_t oldblkno;
168 	long newsize;
169 	long oldsize;
170 	struct buf *bp;
171 {
172 
173 	panic("softdep_setup_allocdirect called");
174 }
175 
176 void
177 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
178 	struct inode *ip;
179 	ufs_lbn_t lbn;
180 	ufs2_daddr_t newblkno;
181 	ufs2_daddr_t oldblkno;
182 	long newsize;
183 	long oldsize;
184 	struct buf *bp;
185 {
186 
187 	panic("softdep_setup_allocext called");
188 }
189 
190 void
191 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
192 	struct inode *ip;
193 	ufs_lbn_t lbn;
194 	struct buf *bp;
195 	int ptrno;
196 	ufs2_daddr_t newblkno;
197 	ufs2_daddr_t oldblkno;
198 	struct buf *nbp;
199 {
200 
201 	panic("softdep_setup_allocindir_page called");
202 }
203 
204 void
205 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
206 	struct buf *nbp;
207 	struct inode *ip;
208 	struct buf *bp;
209 	int ptrno;
210 	ufs2_daddr_t newblkno;
211 {
212 
213 	panic("softdep_setup_allocindir_meta called");
214 }
215 
216 void
217 softdep_setup_freeblocks(ip, length, flags)
218 	struct inode *ip;
219 	off_t length;
220 	int flags;
221 {
222 
223 	panic("softdep_setup_freeblocks called");
224 }
225 
226 void
227 softdep_freefile(pvp, ino, mode)
228 	struct vnode *pvp;
229 	ino_t ino;
230 	int mode;
231 {
232 
233 	panic("softdep_freefile called");
234 }
235 
236 int
237 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
238 	struct buf *bp;
239 	struct inode *dp;
240 	off_t diroffset;
241 	ino_t newinum;
242 	struct buf *newdirbp;
243 	int isnewblk;
244 {
245 
246 	panic("softdep_setup_directory_add called");
247 }
248 
249 void
250 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
251 	struct buf *bp;
252 	struct inode *dp;
253 	caddr_t base;
254 	caddr_t oldloc;
255 	caddr_t newloc;
256 	int entrysize;
257 {
258 
259 	panic("softdep_change_directoryentry_offset called");
260 }
261 
262 void
263 softdep_setup_remove(bp, dp, ip, isrmdir)
264 	struct buf *bp;
265 	struct inode *dp;
266 	struct inode *ip;
267 	int isrmdir;
268 {
269 
270 	panic("softdep_setup_remove called");
271 }
272 
273 void
274 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
275 	struct buf *bp;
276 	struct inode *dp;
277 	struct inode *ip;
278 	ino_t newinum;
279 	int isrmdir;
280 {
281 
282 	panic("softdep_setup_directory_change called");
283 }
284 
285 void *
286 softdep_setup_trunc(vp, length, flags)
287 	struct vnode *vp;
288 	off_t length;
289 	int flags;
290 {
291 
292 	panic("%s called", __FUNCTION__);
293 
294 	return (NULL);
295 }
296 
297 int
298 softdep_complete_trunc(vp, cookie)
299 	struct vnode *vp;
300 	void *cookie;
301 {
302 
303 	panic("%s called", __FUNCTION__);
304 
305 	return (0);
306 }
307 
308 void
309 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
310 	struct mount *mp;
311 	struct buf *bp;
312 	ufs2_daddr_t blkno;
313 	int frags;
314 	struct workhead *wkhd;
315 {
316 
317 	panic("%s called", __FUNCTION__);
318 }
319 
320 void
321 softdep_setup_inofree(mp, bp, ino, wkhd)
322 	struct mount *mp;
323 	struct buf *bp;
324 	ino_t ino;
325 	struct workhead *wkhd;
326 {
327 
328 	panic("%s called", __FUNCTION__);
329 }
330 
331 void
332 softdep_setup_unlink(dp, ip)
333 	struct inode *dp;
334 	struct inode *ip;
335 {
336 
337 	panic("%s called", __FUNCTION__);
338 }
339 
340 void
341 softdep_setup_link(dp, ip)
342 	struct inode *dp;
343 	struct inode *ip;
344 {
345 
346 	panic("%s called", __FUNCTION__);
347 }
348 
349 void
350 softdep_revert_link(dp, ip)
351 	struct inode *dp;
352 	struct inode *ip;
353 {
354 
355 	panic("%s called", __FUNCTION__);
356 }
357 
358 void
359 softdep_setup_rmdir(dp, ip)
360 	struct inode *dp;
361 	struct inode *ip;
362 {
363 
364 	panic("%s called", __FUNCTION__);
365 }
366 
367 void
368 softdep_revert_rmdir(dp, ip)
369 	struct inode *dp;
370 	struct inode *ip;
371 {
372 
373 	panic("%s called", __FUNCTION__);
374 }
375 
376 void
377 softdep_setup_create(dp, ip)
378 	struct inode *dp;
379 	struct inode *ip;
380 {
381 
382 	panic("%s called", __FUNCTION__);
383 }
384 
385 void
386 softdep_revert_create(dp, ip)
387 	struct inode *dp;
388 	struct inode *ip;
389 {
390 
391 	panic("%s called", __FUNCTION__);
392 }
393 
394 void
395 softdep_setup_mkdir(dp, ip)
396 	struct inode *dp;
397 	struct inode *ip;
398 {
399 
400 	panic("%s called", __FUNCTION__);
401 }
402 
403 void
404 softdep_revert_mkdir(dp, ip)
405 	struct inode *dp;
406 	struct inode *ip;
407 {
408 
409 	panic("%s called", __FUNCTION__);
410 }
411 
412 void
413 softdep_setup_dotdot_link(dp, ip)
414 	struct inode *dp;
415 	struct inode *ip;
416 {
417 
418 	panic("%s called", __FUNCTION__);
419 }
420 
421 int
422 softdep_prealloc(vp, waitok)
423 	struct vnode *vp;
424 	int waitok;
425 {
426 
427 	panic("%s called", __FUNCTION__);
428 
429 	return (0);
430 }
431 
432 int
433 softdep_journal_lookup(mp, vpp)
434 	struct mount *mp;
435 	struct vnode **vpp;
436 {
437 
438 	return (ENOENT);
439 }
440 
441 void
442 softdep_change_linkcnt(ip)
443 	struct inode *ip;
444 {
445 
446 	panic("softdep_change_linkcnt called");
447 }
448 
449 void
450 softdep_load_inodeblock(ip)
451 	struct inode *ip;
452 {
453 
454 	panic("softdep_load_inodeblock called");
455 }
456 
457 void
458 softdep_update_inodeblock(ip, bp, waitfor)
459 	struct inode *ip;
460 	struct buf *bp;
461 	int waitfor;
462 {
463 
464 	panic("softdep_update_inodeblock called");
465 }
466 
467 int
468 softdep_fsync(vp)
469 	struct vnode *vp;	/* the "in_core" copy of the inode */
470 {
471 
472 	return (0);
473 }
474 
475 void
476 softdep_fsync_mountdev(vp)
477 	struct vnode *vp;
478 {
479 
480 	return;
481 }
482 
483 int
484 softdep_flushworklist(oldmnt, countp, td)
485 	struct mount *oldmnt;
486 	int *countp;
487 	struct thread *td;
488 {
489 
490 	*countp = 0;
491 	return (0);
492 }
493 
494 int
495 softdep_sync_metadata(struct vnode *vp)
496 {
497 
498 	return (0);
499 }
500 
501 int
502 softdep_slowdown(vp)
503 	struct vnode *vp;
504 {
505 
506 	panic("softdep_slowdown called");
507 }
508 
509 void
510 softdep_releasefile(ip)
511 	struct inode *ip;	/* inode with the zero effective link count */
512 {
513 
514 	panic("softdep_releasefile called");
515 }
516 
517 int
518 softdep_request_cleanup(fs, vp, cred, resource)
519 	struct fs *fs;
520 	struct vnode *vp;
521 	struct ucred *cred;
522 	int resource;
523 {
524 
525 	return (0);
526 }
527 
528 int
529 softdep_check_suspend(struct mount *mp,
530 		      struct vnode *devvp,
531 		      int softdep_deps,
532 		      int softdep_accdeps,
533 		      int secondary_writes,
534 		      int secondary_accwrites)
535 {
536 	struct bufobj *bo;
537 	int error;
538 
539 	(void) softdep_deps;
540 	(void) softdep_accdeps;
541 
542 	bo = &devvp->v_bufobj;
543 	ASSERT_BO_LOCKED(bo);
544 
545 	MNT_ILOCK(mp);
546 	while (mp->mnt_secondary_writes != 0) {
547 		BO_UNLOCK(bo);
548 		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
549 		    (PUSER - 1) | PDROP, "secwr", 0);
550 		BO_LOCK(bo);
551 		MNT_ILOCK(mp);
552 	}
553 
554 	/*
555 	 * Reasons for needing more work before suspend:
556 	 * - Dirty buffers on devvp.
557 	 * - Secondary writes occurred after start of vnode sync loop
558 	 */
559 	error = 0;
560 	if (bo->bo_numoutput > 0 ||
561 	    bo->bo_dirty.bv_cnt > 0 ||
562 	    secondary_writes != 0 ||
563 	    mp->mnt_secondary_writes != 0 ||
564 	    secondary_accwrites != mp->mnt_secondary_accwrites)
565 		error = EAGAIN;
566 	BO_UNLOCK(bo);
567 	return (error);
568 }
569 
570 void
571 softdep_get_depcounts(struct mount *mp,
572 		      int *softdepactivep,
573 		      int *softdepactiveaccp)
574 {
575 	(void) mp;
576 	*softdepactivep = 0;
577 	*softdepactiveaccp = 0;
578 }
579 
580 #else
581 
582 FEATURE(softupdates, "FFS soft-updates support");
583 
584 /*
585  * These definitions need to be adapted to the system to which
586  * this file is being ported.
587  */
588 
589 #define M_SOFTDEP_FLAGS	(M_WAITOK)
590 
591 #define	D_PAGEDEP	0
592 #define	D_INODEDEP	1
593 #define	D_BMSAFEMAP	2
594 #define	D_NEWBLK	3
595 #define	D_ALLOCDIRECT	4
596 #define	D_INDIRDEP	5
597 #define	D_ALLOCINDIR	6
598 #define	D_FREEFRAG	7
599 #define	D_FREEBLKS	8
600 #define	D_FREEFILE	9
601 #define	D_DIRADD	10
602 #define	D_MKDIR		11
603 #define	D_DIRREM	12
604 #define	D_NEWDIRBLK	13
605 #define	D_FREEWORK	14
606 #define	D_FREEDEP	15
607 #define	D_JADDREF	16
608 #define	D_JREMREF	17
609 #define	D_JMVREF	18
610 #define	D_JNEWBLK	19
611 #define	D_JFREEBLK	20
612 #define	D_JFREEFRAG	21
613 #define	D_JSEG		22
614 #define	D_JSEGDEP	23
615 #define	D_SBDEP		24
616 #define	D_JTRUNC	25
617 #define	D_LAST		D_JTRUNC
618 
619 unsigned long dep_current[D_LAST + 1];
620 unsigned long dep_total[D_LAST + 1];
621 
622 
623 SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats");
624 SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
625     "total dependencies allocated");
626 SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
627     "current dependencies allocated");
628 
629 #define	SOFTDEP_TYPE(type, str, long)					\
630     static MALLOC_DEFINE(M_ ## type, #str, long);			\
631     SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
632 	&dep_total[D_ ## type], 0, "");					\
633     SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
634 	&dep_current[D_ ## type], 0, "");
635 
636 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
637 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
638 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
639     "Block or frag allocated from cyl group map");
640 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
641 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
642 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
643 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
644 SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
645 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
646 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
647 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
648 SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
649 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
650 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
651 SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
652 SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
653 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
654 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
655 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
656 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
657 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
658 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
659 SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
660 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
661 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
662 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
663 
664 static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
665 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
666 
667 /*
668  * Translate from workitem type to memory type.
669  * MUST match the defines above, such that memtype[D_XXX] == M_XXX.
670  */
671 static struct malloc_type *memtype[] = {
672 	M_PAGEDEP,
673 	M_INODEDEP,
674 	M_BMSAFEMAP,
675 	M_NEWBLK,
676 	M_ALLOCDIRECT,
677 	M_INDIRDEP,
678 	M_ALLOCINDIR,
679 	M_FREEFRAG,
680 	M_FREEBLKS,
681 	M_FREEFILE,
682 	M_DIRADD,
683 	M_MKDIR,
684 	M_DIRREM,
685 	M_NEWDIRBLK,
686 	M_FREEWORK,
687 	M_FREEDEP,
688 	M_JADDREF,
689 	M_JREMREF,
690 	M_JMVREF,
691 	M_JNEWBLK,
692 	M_JFREEBLK,
693 	M_JFREEFRAG,
694 	M_JSEG,
695 	M_JSEGDEP,
696 	M_SBDEP,
697 	M_JTRUNC
698 };
699 
700 static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;
701 
702 #define DtoM(type) (memtype[type])
703 
704 /*
705  * Names of malloc types.
706  */
707 #define TYPENAME(type)  \
708 	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
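
/*
 * Editor's illustrative note (not part of the original source): the D_*
 * indices, the memtype[] table above, and the SOFTDEP_TYPE() short names
 * are parallel views of the same workitem type, e.g.
 *
 *	DtoM(D_PAGEDEP)      == M_PAGEDEP
 *	TYPENAME(D_PAGEDEP)  -> "pagedep"
 *	TYPENAME(D_LAST + 1) -> "???"
 *
 * A compile-time check such as
 *	CTASSERT(sizeof(memtype) / sizeof(memtype[0]) == D_LAST + 1);
 * (a hedged suggestion, not present in this revision) would catch a new
 * D_* type added without a matching memtype[] entry.
 */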
709 /*
710  * End system adaptation definitions.
711  */
712 
713 #define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
714 #define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
715 
716 /*
717  * Forward declarations.
718  */
719 struct inodedep_hashhead;
720 struct newblk_hashhead;
721 struct pagedep_hashhead;
722 struct bmsafemap_hashhead;
723 
724 /*
725  * Internal function prototypes.
726  */
727 static	void softdep_error(char *, int);
728 static	void drain_output(struct vnode *);
729 static	struct buf *getdirtybuf(struct buf *, struct mtx *, int);
730 static	void clear_remove(struct thread *);
731 static	void clear_inodedeps(struct thread *);
732 static	void unlinked_inodedep(struct mount *, struct inodedep *);
733 static	void clear_unlinked_inodedep(struct inodedep *);
734 static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
735 static	int flush_pagedep_deps(struct vnode *, struct mount *,
736 	    struct diraddhd *);
737 static	void free_pagedep(struct pagedep *);
738 static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
739 static	int flush_inodedep_deps(struct mount *, ino_t);
740 static	int flush_deplist(struct allocdirectlst *, int, int *);
741 static	int handle_written_filepage(struct pagedep *, struct buf *);
742 static	int handle_written_sbdep(struct sbdep *, struct buf *);
743 static	void initiate_write_sbdep(struct sbdep *);
744 static  void diradd_inode_written(struct diradd *, struct inodedep *);
745 static	int handle_written_indirdep(struct indirdep *, struct buf *,
746 	    struct buf**);
747 static	int handle_written_inodeblock(struct inodedep *, struct buf *);
748 static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
749 static	void handle_written_jaddref(struct jaddref *);
750 static	void handle_written_jremref(struct jremref *);
751 static	void handle_written_jseg(struct jseg *, struct buf *);
752 static	void handle_written_jnewblk(struct jnewblk *);
753 static	void handle_written_jfreeblk(struct jfreeblk *);
754 static	void handle_written_jfreefrag(struct jfreefrag *);
755 static	void complete_jseg(struct jseg *);
756 static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
757 static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
758 static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
759 static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
760 static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
761 static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
762 static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
763 static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
764 static	inline void inoref_write(struct inoref *, struct jseg *,
765 	    struct jrefrec *);
766 static	void handle_allocdirect_partdone(struct allocdirect *,
767 	    struct workhead *);
768 static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
769 	    struct workhead *);
770 static	void indirdep_complete(struct indirdep *);
771 static	int indirblk_inseg(struct mount *, ufs2_daddr_t);
772 static	void handle_allocindir_partdone(struct allocindir *);
773 static	void initiate_write_filepage(struct pagedep *, struct buf *);
774 static	void initiate_write_indirdep(struct indirdep*, struct buf *);
775 static	void handle_written_mkdir(struct mkdir *, int);
776 static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
777 static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
778 static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
779 static	void handle_workitem_freefile(struct freefile *);
780 static	void handle_workitem_remove(struct dirrem *, struct vnode *);
781 static	struct dirrem *newdirrem(struct buf *, struct inode *,
782 	    struct inode *, int, struct dirrem **);
783 static	void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *,
784 	    struct freeblks *);
785 static	void free_indirdep(struct indirdep *);
786 static	void free_diradd(struct diradd *, struct workhead *);
787 static	void merge_diradd(struct inodedep *, struct diradd *);
788 static	void complete_diradd(struct diradd *);
789 static	struct diradd *diradd_lookup(struct pagedep *, int);
790 static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
791 	    struct jremref *);
792 static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
793 	    struct jremref *);
794 static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
795 	    struct jremref *, struct jremref *);
796 static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
797 	    struct jremref *);
798 static	void cancel_allocindir(struct allocindir *, struct inodedep *,
799 	    struct freeblks *);
800 static	void complete_mkdir(struct mkdir *);
801 static	void free_newdirblk(struct newdirblk *);
802 static	void free_jremref(struct jremref *);
803 static	void free_jaddref(struct jaddref *);
804 static	void free_jsegdep(struct jsegdep *);
805 static	void free_jsegs(struct jblocks *);
806 static	void rele_jseg(struct jseg *);
807 static	void free_jseg(struct jseg *, struct jblocks *);
808 static	void free_jnewblk(struct jnewblk *);
809 static	void free_jfreeblk(struct jfreeblk *);
810 static	void free_jfreefrag(struct jfreefrag *);
811 static	void free_freedep(struct freedep *);
812 static	void journal_jremref(struct dirrem *, struct jremref *,
813 	    struct inodedep *);
814 static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
815 static	int cancel_jaddref(struct jaddref *, struct inodedep *,
816 	    struct workhead *);
817 static	void cancel_jfreefrag(struct jfreefrag *);
818 static	inline void setup_freedirect(struct freeblks *, struct inode *,
819 	    int, int);
820 static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
821 static	inline void setup_freeindir(struct freeblks *, struct inode *, int i,
822 	    ufs_lbn_t, int);
823 static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
824 static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
825 static	void softdep_trunc_deps(struct vnode *, struct freeblks *, ufs_lbn_t,
826 	    int, int);
827 static 	int cancel_pagedep(struct pagedep *, struct inodedep *,
828 	    struct freeblks *);
829 static	int deallocate_dependencies(struct buf *, struct inodedep *,
830 	    struct freeblks *, int off);
831 static	void free_newblk(struct newblk *);
832 static	void cancel_allocdirect(struct allocdirectlst *,
833 	    struct allocdirect *, struct freeblks *, int);
834 static	int check_inode_unwritten(struct inodedep *);
835 static	int free_inodedep(struct inodedep *);
836 static	void freework_freeblock(struct freework *);
837 static	void handle_workitem_freeblocks(struct freeblks *, int);
838 static	void handle_complete_freeblocks(struct freeblks *);
839 static	void handle_workitem_indirblk(struct freework *);
840 static	void handle_written_freework(struct freework *);
841 static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
842 static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
843 	    struct workhead *);
844 static	void setup_allocindir_phase2(struct buf *, struct inode *,
845 	    struct inodedep *, struct allocindir *, ufs_lbn_t);
846 static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
847 	    ufs2_daddr_t, ufs_lbn_t);
848 static	void handle_workitem_freefrag(struct freefrag *);
849 static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
850 	    ufs_lbn_t);
851 static	void allocdirect_merge(struct allocdirectlst *,
852 	    struct allocdirect *, struct allocdirect *);
853 static	struct freefrag *allocindir_merge(struct allocindir *,
854 	    struct allocindir *);
855 static	int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int,
856 	    struct bmsafemap **);
857 static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
858 	    int cg);
859 static	int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t,
860 	    int, struct newblk **);
861 static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
862 static	int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
863 	    struct inodedep **);
864 static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
865 static	int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int,
866 	    struct pagedep **);
867 static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
868 	    struct mount *mp, int, struct pagedep **);
869 static	void pause_timer(void *);
870 static	int request_cleanup(struct mount *, int);
871 static	int process_worklist_item(struct mount *, int);
872 static	void process_removes(struct vnode *);
873 static	void jwork_move(struct workhead *, struct workhead *);
874 static	void add_to_worklist(struct worklist *, int);
875 static	void remove_from_worklist(struct worklist *);
876 static	void softdep_flush(void);
877 static	int softdep_speedup(void);
878 static	void worklist_speedup(void);
879 static	int journal_mount(struct mount *, struct fs *, struct ucred *);
880 static	void journal_unmount(struct mount *);
881 static	int journal_space(struct ufsmount *, int);
882 static	void journal_suspend(struct ufsmount *);
883 static	int journal_unsuspend(struct ufsmount *ump);
884 static	void softdep_prelink(struct vnode *, struct vnode *);
885 static	void add_to_journal(struct worklist *);
886 static	void remove_from_journal(struct worklist *);
887 static	void softdep_process_journal(struct mount *, struct worklist *, int);
888 static	struct jremref *newjremref(struct dirrem *, struct inode *,
889 	    struct inode *ip, off_t, nlink_t);
890 static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
891 	    uint16_t);
892 static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
893 	    uint16_t);
894 static inline struct jsegdep *inoref_jseg(struct inoref *);
895 static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
896 static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
897 	    ufs2_daddr_t, int);
898 static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
899 	    ufs2_daddr_t, long, ufs_lbn_t);
900 static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
901 	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int);
902 static	void jwait(struct worklist *wk);
903 static	struct inodedep *inodedep_lookup_ip(struct inode *);
904 static	int bmsafemap_rollbacks(struct bmsafemap *);
905 static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
906 static	void handle_jwork(struct workhead *);
907 static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
908 	    struct mkdir **);
909 static	struct jblocks *jblocks_create(void);
910 static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
911 static	void jblocks_free(struct jblocks *, struct mount *, int);
912 static	void jblocks_destroy(struct jblocks *);
913 static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
914 
915 /*
916  * Exported softdep operations.
917  */
918 static	void softdep_disk_io_initiation(struct buf *);
919 static	void softdep_disk_write_complete(struct buf *);
920 static	void softdep_deallocate_dependencies(struct buf *);
921 static	int softdep_count_dependencies(struct buf *bp, int);
922 
923 static struct mtx lk;
924 MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
925 
926 #define TRY_ACQUIRE_LOCK(lk)		mtx_trylock(lk)
927 #define ACQUIRE_LOCK(lk)		mtx_lock(lk)
928 #define FREE_LOCK(lk)			mtx_unlock(lk)
929 
930 #define	BUF_AREC(bp)			lockallowrecurse(&(bp)->b_lock)
931 #define	BUF_NOREC(bp)			lockdisablerecurse(&(bp)->b_lock)
932 
933 /*
934  * Worklist queue management.
935  * These routines require that the lock be held.
936  */
937 #ifndef /* NOT */ DEBUG
938 #define WORKLIST_INSERT(head, item) do {	\
939 	(item)->wk_state |= ONWORKLIST;		\
940 	LIST_INSERT_HEAD(head, item, wk_list);	\
941 } while (0)
942 #define WORKLIST_REMOVE(item) do {		\
943 	(item)->wk_state &= ~ONWORKLIST;	\
944 	LIST_REMOVE(item, wk_list);		\
945 } while (0)
946 #define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
947 #define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
948 
949 #else /* DEBUG */
950 static	void worklist_insert(struct workhead *, struct worklist *, int);
951 static	void worklist_remove(struct worklist *, int);
952 
953 #define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
954 #define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
955 #define WORKLIST_REMOVE(item) worklist_remove(item, 1)
956 #define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
957 
958 static void
959 worklist_insert(head, item, locked)
960 	struct workhead *head;
961 	struct worklist *item;
962 	int locked;
963 {
964 
965 	if (locked)
966 		mtx_assert(&lk, MA_OWNED);
967 	if (item->wk_state & ONWORKLIST)
968 		panic("worklist_insert: %p %s(0x%X) already on list",
969 		    item, TYPENAME(item->wk_type), item->wk_state);
970 	item->wk_state |= ONWORKLIST;
971 	LIST_INSERT_HEAD(head, item, wk_list);
972 }
973 
974 static void
975 worklist_remove(item, locked)
976 	struct worklist *item;
977 	int locked;
978 {
979 
980 	if (locked)
981 		mtx_assert(&lk, MA_OWNED);
982 	if ((item->wk_state & ONWORKLIST) == 0)
983 		panic("worklist_remove: %p %s(0x%X) not on list",
984 		    item, TYPENAME(item->wk_type), item->wk_state);
985 	item->wk_state &= ~ONWORKLIST;
986 	LIST_REMOVE(item, wk_list);
987 }
988 #endif /* DEBUG */
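
/*
 * Editor's illustrative sketch (hypothetical usage, not original code):
 * dependencies are attached to and detached from worklists with the
 * softdep mutex held, e.g.
 *
 *	ACQUIRE_LOCK(&lk);
 *	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 *	...
 *	WORKLIST_REMOVE(&pagedep->pd_list);
 *	FREE_LOCK(&lk);
 *
 * The _UNLOCKED variants behave identically here except that, in the
 * DEBUG build, they skip the mtx_assert() on the softdep lock.
 */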
989 
990 /*
991  * Merge two jsegdeps, keeping only the oldest one, as newer references
992  * cannot be discarded until after older ones have been.
993  */
994 static inline struct jsegdep *
995 jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
996 {
997 	struct jsegdep *swp;
998 
999 	if (two == NULL)
1000 		return (one);
1001 
1002 	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1003 		swp = one;
1004 		one = two;
1005 		two = swp;
1006 	}
1007 	WORKLIST_REMOVE(&two->jd_list);
1008 	free_jsegdep(two);
1009 
1010 	return (one);
1011 }
1012 
1013 /*
1014  * If two freedeps are compatible, free one to reduce list size.
1015  */
1016 static inline struct freedep *
1017 freedep_merge(struct freedep *one, struct freedep *two)
1018 {
1019 	if (two == NULL)
1020 		return (one);
1021 
1022 	if (one->fd_freework == two->fd_freework) {
1023 		WORKLIST_REMOVE(&two->fd_list);
1024 		free_freedep(two);
1025 	}
1026 	return (one);
1027 }
1028 
1029 /*
1030  * Move journal work from one list to another.  Duplicate freedeps and
1031  * jsegdeps are coalesced to keep the lists as small as possible.
1032  */
1033 static void
1034 jwork_move(dst, src)
1035 	struct workhead *dst;
1036 	struct workhead *src;
1037 {
1038 	struct freedep *freedep;
1039 	struct jsegdep *jsegdep;
1040 	struct worklist *wkn;
1041 	struct worklist *wk;
1042 
1043 	KASSERT(dst != src,
1044 	    ("jwork_move: dst == src"));
1045 	freedep = NULL;
1046 	jsegdep = NULL;
1047 	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1048 		if (wk->wk_type == D_JSEGDEP)
1049 			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1050 		if (wk->wk_type == D_FREEDEP)
1051 			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1052 	}
1053 
1054 	mtx_assert(&lk, MA_OWNED);
1055 	while ((wk = LIST_FIRST(src)) != NULL) {
1056 		WORKLIST_REMOVE(wk);
1057 		WORKLIST_INSERT(dst, wk);
1058 		if (wk->wk_type == D_JSEGDEP) {
1059 			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1060 			continue;
1061 		}
1062 		if (wk->wk_type == D_FREEDEP)
1063 			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1064 	}
1065 }
1066 
1067 /*
1068  * Routines for tracking and managing workitems.
1069  */
1070 static	void workitem_free(struct worklist *, int);
1071 static	void workitem_alloc(struct worklist *, int, struct mount *);
1072 
1073 #define	WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
1074 
1075 static void
1076 workitem_free(item, type)
1077 	struct worklist *item;
1078 	int type;
1079 {
1080 	struct ufsmount *ump;
1081 	mtx_assert(&lk, MA_OWNED);
1082 
1083 #ifdef DEBUG
1084 	if (item->wk_state & ONWORKLIST)
1085 		panic("workitem_free: %s(0x%X) still on list",
1086 		    TYPENAME(item->wk_type), item->wk_state);
1087 	if (item->wk_type != type)
1088 		panic("workitem_free: type mismatch %s != %s",
1089 		    TYPENAME(item->wk_type), TYPENAME(type));
1090 #endif
1091 	ump = VFSTOUFS(item->wk_mp);
1092 	if (--ump->softdep_deps == 0 && ump->softdep_req)
1093 		wakeup(&ump->softdep_deps);
1094 	dep_current[type]--;
1095 	free(item, DtoM(type));
1096 }
1097 
1098 static void
1099 workitem_alloc(item, type, mp)
1100 	struct worklist *item;
1101 	int type;
1102 	struct mount *mp;
1103 {
1104 	item->wk_type = type;
1105 	item->wk_mp = mp;
1106 	item->wk_state = 0;
1107 	ACQUIRE_LOCK(&lk);
1108 	dep_current[type]++;
1109 	dep_total[type]++;
1110 	VFSTOUFS(mp)->softdep_deps++;
1111 	VFSTOUFS(mp)->softdep_accdeps++;
1112 	FREE_LOCK(&lk);
1113 }
1114 
1115 /*
1116  * Workitem queue management
1117  */
1118 static int max_softdeps;	/* maximum number of structs before slowdown */
1119 static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
1120 static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1121 static int proc_waiting;	/* tracks whether we have a timeout posted */
1122 static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1123 static struct callout softdep_callout;
1124 static int req_pending;
1125 static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1126 static int req_clear_remove;	/* syncer process flush some freeblks */
1127 
1128 /*
1129  * runtime statistics
1130  */
1131 static int stat_worklist_push;	/* number of worklist cleanups */
1132 static int stat_blk_limit_push;	/* number of times block limit neared */
1133 static int stat_ino_limit_push;	/* number of times inode limit neared */
1134 static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1135 static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1136 static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1137 static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1138 static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1139 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1140 static int stat_dir_entry;	/* bufs redirtied as dir entry cannot be written */
1141 static int stat_jaddref;	/* bufs redirtied as ino bitmap cannot be written */
1142 static int stat_jnewblk;	/* bufs redirtied as blk bitmap cannot be written */
1143 static int stat_journal_min;	/* Times hit journal min threshold */
1144 static int stat_journal_low;	/* Times hit journal low threshold */
1145 static int stat_journal_wait;	/* Times blocked in jwait(). */
1146 static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1147 static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1148 static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1149 static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1150 static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1151 static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1152 static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1153 static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1154 static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1155 
1156 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1157     &max_softdeps, 0, "");
1158 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1159     &tickdelay, 0, "");
1160 SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
1161     &maxindirdeps, 0, "");
1162 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1163     &stat_worklist_push, 0,"");
1164 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1165     &stat_blk_limit_push, 0,"");
1166 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1167     &stat_ino_limit_push, 0,"");
1168 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1169     &stat_blk_limit_hit, 0, "");
1170 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1171     &stat_ino_limit_hit, 0, "");
1172 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1173     &stat_sync_limit_hit, 0, "");
1174 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1175     &stat_indir_blk_ptrs, 0, "");
1176 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1177     &stat_inode_bitmap, 0, "");
1178 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1179     &stat_direct_blk_ptrs, 0, "");
1180 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1181     &stat_dir_entry, 0, "");
1182 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1183     &stat_jaddref, 0, "");
1184 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1185     &stat_jnewblk, 0, "");
1186 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1187     &stat_journal_low, 0, "");
1188 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1189     &stat_journal_min, 0, "");
1190 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1191     &stat_journal_wait, 0, "");
1192 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1193     &stat_jwait_filepage, 0, "");
1194 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1195     &stat_jwait_freeblks, 0, "");
1196 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1197     &stat_jwait_inode, 0, "");
1198 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1199     &stat_jwait_newblk, 0, "");
1200 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1201     &stat_cleanup_blkrequests, 0, "");
1202 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1203     &stat_cleanup_inorequests, 0, "");
1204 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1205     &stat_cleanup_high_delay, 0, "");
1206 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1207     &stat_cleanup_retries, 0, "");
1208 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1209     &stat_cleanup_failures, 0, "");
1210 
1211 SYSCTL_DECL(_vfs_ffs);
1212 
1213 LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
1214 static u_long	bmsafemap_hash;	/* size of hash table - 1 */
1215 
1216 static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
1217 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1218 	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1219 
1220 static struct proc *softdepproc;
1221 static struct kproc_desc softdep_kp = {
1222 	"softdepflush",
1223 	softdep_flush,
1224 	&softdepproc
1225 };
1226 SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
1227     &softdep_kp);
1228 
1229 static void
1230 softdep_flush(void)
1231 {
1232 	struct mount *nmp;
1233 	struct mount *mp;
1234 	struct ufsmount *ump;
1235 	struct thread *td;
1236 	int remaining;
1237 	int progress;
1238 	int vfslocked;
1239 
1240 	td = curthread;
1241 	td->td_pflags |= TDP_NORUNNINGBUF;
1242 
1243 	for (;;) {
1244 		kproc_suspend_check(softdepproc);
1245 		vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
1246 		ACQUIRE_LOCK(&lk);
1247 		/*
1248 		 * If requested, try removing inode or removal dependencies.
1249 		 */
1250 		if (req_clear_inodedeps) {
1251 			clear_inodedeps(td);
1252 			req_clear_inodedeps -= 1;
1253 			wakeup_one(&proc_waiting);
1254 		}
1255 		if (req_clear_remove) {
1256 			clear_remove(td);
1257 			req_clear_remove -= 1;
1258 			wakeup_one(&proc_waiting);
1259 		}
1260 		FREE_LOCK(&lk);
1261 		VFS_UNLOCK_GIANT(vfslocked);
1262 		remaining = progress = 0;
1263 		mtx_lock(&mountlist_mtx);
1264 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
1265 			nmp = TAILQ_NEXT(mp, mnt_list);
1266 			if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
1267 				continue;
1268 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
1269 				continue;
1270 			vfslocked = VFS_LOCK_GIANT(mp);
1271 			progress += softdep_process_worklist(mp, 0);
1272 			ump = VFSTOUFS(mp);
1273 			remaining += ump->softdep_on_worklist -
1274 				ump->softdep_on_worklist_inprogress;
1275 			VFS_UNLOCK_GIANT(vfslocked);
1276 			mtx_lock(&mountlist_mtx);
1277 			nmp = TAILQ_NEXT(mp, mnt_list);
1278 			vfs_unbusy(mp);
1279 		}
1280 		mtx_unlock(&mountlist_mtx);
1281 		if (remaining && progress)
1282 			continue;
1283 		ACQUIRE_LOCK(&lk);
1284 		if (!req_pending)
1285 			msleep(&req_pending, &lk, PVM, "sdflush", hz);
1286 		req_pending = 0;
1287 		FREE_LOCK(&lk);
1288 	}
1289 }
1290 
1291 static void
1292 worklist_speedup(void)
1293 {
1294 	mtx_assert(&lk, MA_OWNED);
1295 	if (req_pending == 0) {
1296 		req_pending = 1;
1297 		wakeup(&req_pending);
1298 	}
1299 }
1300 
1301 static int
1302 softdep_speedup(void)
1303 {
1304 
1305 	worklist_speedup();
1306 	bd_speedup();
1307 	return (speedup_syncer());
1308 }
1309 
1310 /*
1311  * Add an item to the end of the work queue.
1312  * This routine requires that the lock be held.
1313  * This is the only routine that adds items to the list.
1314  * The following routine is the only one that removes items
1315  * and does so in order from first to last.
1316  */
1317 static void
1318 add_to_worklist(wk, nodelay)
1319 	struct worklist *wk;
1320 	int nodelay;
1321 {
1322 	struct ufsmount *ump;
1323 
1324 	mtx_assert(&lk, MA_OWNED);
1325 	ump = VFSTOUFS(wk->wk_mp);
1326 	if (wk->wk_state & ONWORKLIST)
1327 		panic("add_to_worklist: %s(0x%X) already on list",
1328 		    TYPENAME(wk->wk_type), wk->wk_state);
1329 	wk->wk_state |= ONWORKLIST;
1330 	if (LIST_EMPTY(&ump->softdep_workitem_pending))
1331 		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1332 	else
1333 		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1334 	ump->softdep_worklist_tail = wk;
1335 	ump->softdep_on_worklist += 1;
1336 	if (nodelay)
1337 		worklist_speedup();
1338 }
1339 
1340 /*
1341  * Remove the item to be processed. If we are removing the last
1342  * item on the list, we need to recalculate the tail pointer.
1343  */
1344 static void
1345 remove_from_worklist(wk)
1346 	struct worklist *wk;
1347 {
1348 	struct ufsmount *ump;
1349 	struct worklist *wkend;
1350 
1351 	ump = VFSTOUFS(wk->wk_mp);
1352 	WORKLIST_REMOVE(wk);
1353 	if (wk == ump->softdep_worklist_tail) {
1354 		LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
1355 			if (LIST_NEXT(wkend, wk_list) == NULL)
1356 				break;
1357 		ump->softdep_worklist_tail = wkend;
1358 	}
1359 	ump->softdep_on_worklist -= 1;
1360 }
1361 
1362 /*
1363  * Process that runs once per second to handle items in the background queue.
1364  *
1365  * Note that we ensure that items are handled in the order in which they
1366  * appear in the queue. The code below depends on this property to ensure
1367  * that blocks of a file are freed before the inode itself is freed. This
1368  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1369  * until all the old ones have been purged from the dependency lists.
1370  */
1371 int
1372 softdep_process_worklist(mp, full)
1373 	struct mount *mp;
1374 	int full;
1375 {
1376 	struct thread *td = curthread;
1377 	int cnt, matchcnt;
1378 	struct ufsmount *ump;
1379 	long starttime;
1380 
1381 	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1382 	/*
1383 	 * Record the process identifier of our caller so that we can give
1384 	 * this process preferential treatment in request_cleanup below.
1385 	 */
1386 	matchcnt = 0;
1387 	ump = VFSTOUFS(mp);
1388 	ACQUIRE_LOCK(&lk);
1389 	starttime = time_second;
1390 	softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1391 	while (ump->softdep_on_worklist > 0) {
1392 		if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1)
1393 			break;
1394 		else
1395 			matchcnt += cnt;
1396 		/*
1397 		 * If requested, try removing inode or removal dependencies.
1398 		 */
1399 		if (req_clear_inodedeps) {
1400 			clear_inodedeps(td);
1401 			req_clear_inodedeps -= 1;
1402 			wakeup_one(&proc_waiting);
1403 		}
1404 		if (req_clear_remove) {
1405 			clear_remove(td);
1406 			req_clear_remove -= 1;
1407 			wakeup_one(&proc_waiting);
1408 		}
1409 		/*
1410 		 * We do not generally want to stop for buffer space, but if
1411 		 * we are really being a buffer hog, we will stop and wait.
1412 		 */
1413 		if (should_yield()) {
1414 			FREE_LOCK(&lk);
1415 			kern_yield(PRI_UNCHANGED);
1416 			bwillwrite();
1417 			ACQUIRE_LOCK(&lk);
1418 		}
1419 		/*
1420 		 * Never allow processing to run for more than one
1421 		 * second. Otherwise the other mountpoints may get
1422 		 * excessively backlogged.
1423 		 */
1424 		if (!full && starttime != time_second)
1425 			break;
1426 	}
1427 	if (full == 0)
1428 		journal_unsuspend(ump);
1429 	FREE_LOCK(&lk);
1430 	return (matchcnt);
1431 }
1432 
1433 /*
1434  * Process all removes associated with a vnode if we are running out of
1435  * journal space.  Any other process that attempts to flush these will
1436  * be unable to do so, as we have the vnodes locked.
1437  */
1438 static void
1439 process_removes(vp)
1440 	struct vnode *vp;
1441 {
1442 	struct inodedep *inodedep;
1443 	struct dirrem *dirrem;
1444 	struct mount *mp;
1445 	ino_t inum;
1446 
1447 	mtx_assert(&lk, MA_OWNED);
1448 
1449 	mp = vp->v_mount;
1450 	inum = VTOI(vp)->i_number;
1451 	for (;;) {
1452 		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1453 			return;
1454 		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext)
1455 			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1456 			    (COMPLETE | ONWORKLIST))
1457 				break;
1458 		if (dirrem == NULL)
1459 			return;
1460 		/*
1461 		 * If another thread is trying to lock this vnode, it will
1462 		 * fail, but we must wait for it to do so before we can
1463 		 * proceed.
1464 		 */
1465 		if (dirrem->dm_state & INPROGRESS) {
1466 			dirrem->dm_state |= IOWAITING;
1467 			msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0);
1468 			continue;
1469 		}
1470 		remove_from_worklist(&dirrem->dm_list);
1471 		FREE_LOCK(&lk);
1472 		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1473 			panic("process_removes: suspended filesystem");
1474 		handle_workitem_remove(dirrem, vp);
1475 		vn_finished_secondary_write(mp);
1476 		ACQUIRE_LOCK(&lk);
1477 	}
1478 }
1479 
1480 /*
1481  * Process one item on the worklist.
1482  */
1483 static int
1484 process_worklist_item(mp, flags)
1485 	struct mount *mp;
1486 	int flags;
1487 {
1488 	struct worklist *wk;
1489 	struct ufsmount *ump;
1490 	struct vnode *vp;
1491 	int matchcnt = 0;
1492 
1493 	mtx_assert(&lk, MA_OWNED);
1494 	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1495 	/*
1496 	 * If we are being called because of a process doing a
1497 	 * copy-on-write, then it is not safe to write as we may
1498 	 * recurse into the copy-on-write routine.
1499 	 */
1500 	if (curthread->td_pflags & TDP_COWINPROGRESS)
1501 		return (-1);
1502 	/*
1503 	 * Normally we just process each item on the worklist in order.
1504 	 * However, if we are in a situation where we cannot lock any
1505 	 * inodes, we have to skip over any dirrem requests whose
1506 	 * vnodes are resident and locked.
1507 	 */
1508 	vp = NULL;
1509 	ump = VFSTOUFS(mp);
1510 	LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
1511 		if (wk->wk_state & INPROGRESS)
1512 			continue;
1513 		if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
1514 			break;
1515 		wk->wk_state |= INPROGRESS;
1516 		ump->softdep_on_worklist_inprogress++;
1517 		FREE_LOCK(&lk);
1518 		ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum,
1519 		    LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ);
1520 		ACQUIRE_LOCK(&lk);
1521 		if (wk->wk_state & IOWAITING) {
1522 			wk->wk_state &= ~IOWAITING;
1523 			wakeup(wk);
1524 		}
1525 		wk->wk_state &= ~INPROGRESS;
1526 		ump->softdep_on_worklist_inprogress--;
1527 		if (vp != NULL)
1528 			break;
1529 	}
1530 	if (wk == NULL)
1531 		return (-1);
1532 	remove_from_worklist(wk);
1533 	FREE_LOCK(&lk);
1534 	if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1535 		panic("process_worklist_item: suspended filesystem");
1536 	matchcnt++;
1537 	switch (wk->wk_type) {
1538 
1539 	case D_DIRREM:
1540 		/* removal of a directory entry */
1541 		handle_workitem_remove(WK_DIRREM(wk), vp);
1542 		if (vp)
1543 			vput(vp);
1544 		break;
1545 
1546 	case D_FREEBLKS:
1547 		/* releasing blocks and/or fragments from a file */
1548 		handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
1549 		break;
1550 
1551 	case D_FREEFRAG:
1552 		/* releasing a fragment when replaced as a file grows */
1553 		handle_workitem_freefrag(WK_FREEFRAG(wk));
1554 		break;
1555 
1556 	case D_FREEFILE:
1557 		/* releasing an inode when its link count drops to 0 */
1558 		handle_workitem_freefile(WK_FREEFILE(wk));
1559 		break;
1560 
1561 	case D_FREEWORK:
1562 		/* Final block in an indirect was freed. */
1563 		handle_workitem_indirblk(WK_FREEWORK(wk));
1564 		break;
1565 
1566 	default:
1567 		panic("%s_process_worklist: Unknown type %s",
1568 		    "softdep", TYPENAME(wk->wk_type));
1569 		/* NOTREACHED */
1570 	}
1571 	vn_finished_secondary_write(mp);
1572 	ACQUIRE_LOCK(&lk);
1573 	return (matchcnt);
1574 }
1575 
1576 /*
1577  * Move dependencies from one buffer to another.
1578  */
1579 int
1580 softdep_move_dependencies(oldbp, newbp)
1581 	struct buf *oldbp;
1582 	struct buf *newbp;
1583 {
1584 	struct worklist *wk, *wktail;
1585 	int dirty;
1586 
1587 	dirty = 0;
1588 	wktail = NULL;
1589 	ACQUIRE_LOCK(&lk);
1590 	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1591 		LIST_REMOVE(wk, wk_list);
1592 		if (wk->wk_type == D_BMSAFEMAP &&
1593 		    bmsafemap_rollbacks(WK_BMSAFEMAP(wk)))
1594 			dirty = 1;
1595 		if (wktail == NULL)
1596 			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1597 		else
1598 			LIST_INSERT_AFTER(wktail, wk, wk_list);
1599 		wktail = wk;
1600 	}
1601 	FREE_LOCK(&lk);
1602 
1603 	return (dirty);
1604 }
1605 
1606 /*
1607  * Purge the work list of all items associated with a particular mount point.
1608  */
1609 int
1610 softdep_flushworklist(oldmnt, countp, td)
1611 	struct mount *oldmnt;
1612 	int *countp;
1613 	struct thread *td;
1614 {
1615 	struct vnode *devvp;
1616 	int count, error = 0;
1617 	struct ufsmount *ump;
1618 
1619 	/*
1620 	 * Alternately flush the block device associated with the mount
1621 	 * point and process any dependencies that the flushing
1622 	 * creates. We continue until no more worklist dependencies
1623 	 * are found.
1624 	 */
1625 	*countp = 0;
1626 	ump = VFSTOUFS(oldmnt);
1627 	devvp = ump->um_devvp;
1628 	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1629 		*countp += count;
1630 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1631 		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1632 		VOP_UNLOCK(devvp, 0);
1633 		if (error)
1634 			break;
1635 	}
1636 	return (error);
1637 }
1638 
1639 int
1640 softdep_waitidle(struct mount *mp)
1641 {
1642 	struct ufsmount *ump;
1643 	int error;
1644 	int i;
1645 
1646 	ump = VFSTOUFS(mp);
1647 	ACQUIRE_LOCK(&lk);
1648 	for (i = 0; i < 10 && ump->softdep_deps; i++) {
1649 		ump->softdep_req = 1;
1650 		if (ump->softdep_on_worklist)
1651 			panic("softdep_waitidle: work added after flush.");
1652 		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
1653 	}
1654 	ump->softdep_req = 0;
1655 	FREE_LOCK(&lk);
1656 	error = 0;
1657 	if (i == 10) {
1658 		error = EBUSY;
1659 		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1660 		    mp);
1661 	}
1662 
1663 	return (error);
1664 }
1665 
1666 /*
1667  * Flush all vnodes and worklist items associated with a specified mount point.
1668  */
1669 int
1670 softdep_flushfiles(oldmnt, flags, td)
1671 	struct mount *oldmnt;
1672 	int flags;
1673 	struct thread *td;
1674 {
1675 	int error, depcount, loopcnt, retry_flush_count, retry;
1676 
1677 	loopcnt = 10;
1678 	retry_flush_count = 3;
1679 retry_flush:
1680 	error = 0;
1681 
1682 	/*
1683 	 * Alternately flush the vnodes associated with the mount
1684 	 * point and process any dependencies that the flushing
1685 	 * creates. In theory, this loop can happen at most twice,
1686 	 * but we give it a few extra iterations just to be sure.
1687 	 */
1688 	for (; loopcnt > 0; loopcnt--) {
1689 		/*
1690 		 * Do another flush in case any vnodes were brought in
1691 		 * as part of the cleanup operations.
1692 		 */
1693 		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
1694 			break;
1695 		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1696 		    depcount == 0)
1697 			break;
1698 	}
1699 	/*
1700 	 * If we are unmounting then it is an error to fail. If we
1701 	 * are simply trying to downgrade to read-only, then filesystem
1702 	 * activity can keep us busy forever, so we just fail with EBUSY.
1703 	 */
1704 	if (loopcnt == 0) {
1705 		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1706 			panic("softdep_flushfiles: looping");
1707 		error = EBUSY;
1708 	}
1709 	if (!error)
1710 		error = softdep_waitidle(oldmnt);
1711 	if (!error) {
1712 		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
1713 			retry = 0;
1714 			MNT_ILOCK(oldmnt);
1715 			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
1716 			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
1717 			if (oldmnt->mnt_nvnodelistsize > 0) {
1718 				if (--retry_flush_count > 0) {
1719 					retry = 1;
1720 					loopcnt = 3;
1721 				} else
1722 					error = EBUSY;
1723 			}
1724 			MNT_IUNLOCK(oldmnt);
1725 			if (retry)
1726 				goto retry_flush;
1727 		}
1728 	}
1729 	return (error);
1730 }
1731 
1732 /*
1733  * Structure hashing.
1734  *
1735  * There are three types of structures that can be looked up:
1736  *	1) pagedep structures identified by mount point, inode number,
1737  *	   and logical block.
1738  *	2) inodedep structures identified by mount point and inode number.
1739  *	3) newblk structures identified by mount point and
1740  *	   physical block number.
1741  *
1742  * The "pagedep" and "inodedep" dependency structures are hashed
1743  * separately from the file blocks and inodes to which they correspond.
1744  * This separation helps when the in-memory copy of an inode or
1745  * file block must be replaced. It also obviates the need to access
1746  * an inode or file page when simply updating (or de-allocating)
1747  * dependency structures. Lookup of newblk structures is needed to
1748  * find newly allocated blocks when trying to associate them with
1749  * their allocdirect or allocindir structure.
1750  *
1751  * The lookup routines optionally create and hash a new instance when
1752  * an existing entry is not found.
1753  */
1754 #define DEPALLOC	0x0001	/* allocate structure if lookup fails */
1755 #define NODELAY		0x0002	/* cannot do background work */
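
/*
 * Editor's illustrative sketch of the lookup-or-allocate pattern described
 * above (hypothetical caller, not part of the original source):
 *
 *	struct pagedep *pagedep;
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (pagedep_lookup(mp, ino, lbn, DEPALLOC, &pagedep) == 0)
 *		the pagedep was newly allocated, or exists but is not yet
 *		on a worklist, so the caller still has to attach it;
 *	FREE_LOCK(&lk);
 *
 * NODELAY additionally tells inodedep_lookup() not to call
 * request_cleanup() while allocating, for callers that cannot block on
 * background work.
 */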
1756 
1757 /*
1758  * Structures and routines associated with pagedep caching.
1759  */
1760 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
1761 u_long	pagedep_hash;		/* size of hash table - 1 */
1762 #define	PAGEDEP_HASH(mp, inum, lbn) \
1763 	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
1764 	    pagedep_hash])
1765 
1766 static int
1767 pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
1768 	struct pagedep_hashhead *pagedephd;
1769 	ino_t ino;
1770 	ufs_lbn_t lbn;
1771 	struct mount *mp;
1772 	int flags;
1773 	struct pagedep **pagedeppp;
1774 {
1775 	struct pagedep *pagedep;
1776 
1777 	LIST_FOREACH(pagedep, pagedephd, pd_hash)
1778 		if (ino == pagedep->pd_ino &&
1779 		    lbn == pagedep->pd_lbn &&
1780 		    mp == pagedep->pd_list.wk_mp)
1781 			break;
1782 	if (pagedep) {
1783 		*pagedeppp = pagedep;
1784 		if ((flags & DEPALLOC) != 0 &&
1785 		    (pagedep->pd_state & ONWORKLIST) == 0)
1786 			return (0);
1787 		return (1);
1788 	}
1789 	*pagedeppp = NULL;
1790 	return (0);
1791 }
1792 /*
1793  * Look up a pagedep. Return 1 if found; return 0 if not found, or if it was
1794  * found but not yet associated with any buffer when asked to allocate.
1795  * If not found, allocate if DEPALLOC flag is passed.
1796  * Found or allocated entry is returned in pagedeppp.
1797  * This routine must be called with splbio interrupts blocked.
1798  */
1799 static int
1800 pagedep_lookup(mp, ino, lbn, flags, pagedeppp)
1801 	struct mount *mp;
1802 	ino_t ino;
1803 	ufs_lbn_t lbn;
1804 	int flags;
1805 	struct pagedep **pagedeppp;
1806 {
1807 	struct pagedep *pagedep;
1808 	struct pagedep_hashhead *pagedephd;
1809 	int ret;
1810 	int i;
1811 
1812 	mtx_assert(&lk, MA_OWNED);
1813 	pagedephd = PAGEDEP_HASH(mp, ino, lbn);
1814 
1815 	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
1816 	if (*pagedeppp || (flags & DEPALLOC) == 0)
1817 		return (ret);
1818 	FREE_LOCK(&lk);
1819 	pagedep = malloc(sizeof(struct pagedep),
1820 	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
1821 	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
1822 	ACQUIRE_LOCK(&lk);
1823 	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
1824 	if (*pagedeppp) {
1825 		WORKITEM_FREE(pagedep, D_PAGEDEP);
1826 		return (ret);
1827 	}
1828 	pagedep->pd_ino = ino;
1829 	pagedep->pd_lbn = lbn;
1830 	LIST_INIT(&pagedep->pd_dirremhd);
1831 	LIST_INIT(&pagedep->pd_pendinghd);
1832 	for (i = 0; i < DAHASHSZ; i++)
1833 		LIST_INIT(&pagedep->pd_diraddhd[i]);
1834 	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
1835 	*pagedeppp = pagedep;
1836 	return (0);
1837 }
1838 
1839 /*
1840  * Structures and routines associated with inodedep caching.
1841  */
1842 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
1843 static u_long	inodedep_hash;	/* size of hash table - 1 */
1844 #define	INODEDEP_HASH(fs, inum) \
1845       (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
1846 
1847 static int
1848 inodedep_find(inodedephd, fs, inum, inodedeppp)
1849 	struct inodedep_hashhead *inodedephd;
1850 	struct fs *fs;
1851 	ino_t inum;
1852 	struct inodedep **inodedeppp;
1853 {
1854 	struct inodedep *inodedep;
1855 
1856 	LIST_FOREACH(inodedep, inodedephd, id_hash)
1857 		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1858 			break;
1859 	if (inodedep) {
1860 		*inodedeppp = inodedep;
1861 		return (1);
1862 	}
1863 	*inodedeppp = NULL;
1864 
1865 	return (0);
1866 }
1867 /*
1868  * Look up an inodedep. Return 1 if found, 0 if not found.
1869  * If not found, allocate if DEPALLOC flag is passed.
1870  * Found or allocated entry is returned in inodedeppp.
1871  * This routine must be called with splbio interrupts blocked.
1872  */
1873 static int
1874 inodedep_lookup(mp, inum, flags, inodedeppp)
1875 	struct mount *mp;
1876 	ino_t inum;
1877 	int flags;
1878 	struct inodedep **inodedeppp;
1879 {
1880 	struct inodedep *inodedep;
1881 	struct inodedep_hashhead *inodedephd;
1882 	struct fs *fs;
1883 
1884 	mtx_assert(&lk, MA_OWNED);
1885 	fs = VFSTOUFS(mp)->um_fs;
1886 	inodedephd = INODEDEP_HASH(fs, inum);
1887 
1888 	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
1889 		return (1);
1890 	if ((flags & DEPALLOC) == 0)
1891 		return (0);
1892 	/*
1893 	 * If we are over our limit, try to improve the situation.
1894 	 */
1895 	if (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0)
1896 		request_cleanup(mp, FLUSH_INODES);
1897 	FREE_LOCK(&lk);
1898 	inodedep = malloc(sizeof(struct inodedep),
1899 		M_INODEDEP, M_SOFTDEP_FLAGS);
1900 	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
1901 	ACQUIRE_LOCK(&lk);
1902 	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
1903 		WORKITEM_FREE(inodedep, D_INODEDEP);
1904 		return (1);
1905 	}
1906 	inodedep->id_fs = fs;
1907 	inodedep->id_ino = inum;
1908 	inodedep->id_state = ALLCOMPLETE;
1909 	inodedep->id_nlinkdelta = 0;
1910 	inodedep->id_savedino1 = NULL;
1911 	inodedep->id_savedsize = -1;
1912 	inodedep->id_savedextsize = -1;
1913 	inodedep->id_savednlink = -1;
1914 	inodedep->id_bmsafemap = NULL;
1915 	inodedep->id_mkdiradd = NULL;
1916 	LIST_INIT(&inodedep->id_dirremhd);
1917 	LIST_INIT(&inodedep->id_pendinghd);
1918 	LIST_INIT(&inodedep->id_inowait);
1919 	LIST_INIT(&inodedep->id_bufwait);
1920 	TAILQ_INIT(&inodedep->id_inoreflst);
1921 	TAILQ_INIT(&inodedep->id_inoupdt);
1922 	TAILQ_INIT(&inodedep->id_newinoupdt);
1923 	TAILQ_INIT(&inodedep->id_extupdt);
1924 	TAILQ_INIT(&inodedep->id_newextupdt);
1925 	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1926 	*inodedeppp = inodedep;
1927 	return (0);
1928 }
1929 
1930 /*
1931  * Structures and routines associated with newblk caching.
1932  */
1933 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1934 u_long	newblk_hash;		/* size of hash table - 1 */
1935 #define	NEWBLK_HASH(fs, inum) \
1936 	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1937 
1938 static int
1939 newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
1940 	struct newblk_hashhead *newblkhd;
1941 	struct mount *mp;
1942 	ufs2_daddr_t newblkno;
1943 	int flags;
1944 	struct newblk **newblkpp;
1945 {
1946 	struct newblk *newblk;
1947 
1948 	LIST_FOREACH(newblk, newblkhd, nb_hash) {
1949 		if (newblkno != newblk->nb_newblkno)
1950 			continue;
1951 		if (mp != newblk->nb_list.wk_mp)
1952 			continue;
1953 		/*
1954 		 * If we're creating a new dependency don't match those that
1955 		 * have already been converted to allocdirects.  This is for
1956 		 * a frag extend.
1957 		 */
1958 		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
1959 			continue;
1960 		break;
1961 	}
1962 	if (newblk) {
1963 		*newblkpp = newblk;
1964 		return (1);
1965 	}
1966 	*newblkpp = NULL;
1967 	return (0);
1968 }
1969 
1970 /*
1971  * Look up a newblk. Return 1 if found, 0 if not found.
1972  * If not found, allocate if DEPALLOC flag is passed.
1973  * Found or allocated entry is returned in newblkpp.
1974  */
1975 static int
1976 newblk_lookup(mp, newblkno, flags, newblkpp)
1977 	struct mount *mp;
1978 	ufs2_daddr_t newblkno;
1979 	int flags;
1980 	struct newblk **newblkpp;
1981 {
1982 	struct newblk *newblk;
1983 	struct newblk_hashhead *newblkhd;
1984 
1985 	newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
1986 	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
1987 		return (1);
1988 	if ((flags & DEPALLOC) == 0)
1989 		return (0);
1990 	FREE_LOCK(&lk);
1991 	newblk = malloc(sizeof(union allblk), M_NEWBLK,
1992 	    M_SOFTDEP_FLAGS | M_ZERO);
1993 	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
1994 	ACQUIRE_LOCK(&lk);
1995 	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
1996 		WORKITEM_FREE(newblk, D_NEWBLK);
1997 		return (1);
1998 	}
1999 	newblk->nb_freefrag = NULL;
2000 	LIST_INIT(&newblk->nb_indirdeps);
2001 	LIST_INIT(&newblk->nb_newdirblk);
2002 	LIST_INIT(&newblk->nb_jwork);
2003 	newblk->nb_state = ATTACHED;
2004 	newblk->nb_newblkno = newblkno;
2005 	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2006 	*newblkpp = newblk;
2007 	return (0);
2008 }
2009 
2010 /*
2011  * Structures and routines associated with indir caching.
2012  */
2013 struct workhead *indir_hashtbl;
2014 u_long	indir_hash;		/* size of hash table - 1 */
2015 #define	INDIR_HASH(mp, blkno) \
2016 	(&indir_hashtbl[((((register_t)(mp)) >> 13) + (blkno)) & indir_hash])
2017 
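/*
 * Search the indir hash for a freework recording 'blkno' on mount 'mp'.
 * If one is found it is removed and freed and 1 is returned; otherwise 0
 * is returned.
 */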
2018 static int
2019 indirblk_inseg(mp, blkno)
2020 	struct mount *mp;
2021 	ufs2_daddr_t blkno;
2022 {
2023 	struct freework *freework;
2024 	struct workhead *wkhd;
2025 	struct worklist *wk;
2026 
2027 	wkhd = INDIR_HASH(mp, blkno);
2028 	LIST_FOREACH(wk, wkhd, wk_list) {
2029 		freework = WK_FREEWORK(wk);
2030 		if (freework->fw_blkno == blkno &&
2031 		    freework->fw_list.wk_mp == mp) {
2032 			LIST_REMOVE(freework, fw_next);
2033 			WORKLIST_REMOVE(&freework->fw_list);
2034 			WORKITEM_FREE(freework, D_FREEWORK);
2035 			return (1);
2036 		}
2037 	}
2038 	return (0);
2039 }
2040 
2041 /*
2042  * Executed during filesystem initialization before
2043  * mounting any filesystems.
2044  */
2045 void
2046 softdep_initialize()
2047 {
2048 
2049 	LIST_INIT(&mkdirlisthd);
2050 	max_softdeps = desiredvnodes * 4;
2051 	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
2052 	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
2053 	newblk_hashtbl = hashinit(desiredvnodes / 5,  M_NEWBLK, &newblk_hash);
2054 	bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
2055 	indir_hashtbl = hashinit(desiredvnodes / 10, M_FREEWORK, &indir_hash);
2056 
2057 	/* initialise bioops hack */
2058 	bioops.io_start = softdep_disk_io_initiation;
2059 	bioops.io_complete = softdep_disk_write_complete;
2060 	bioops.io_deallocate = softdep_deallocate_dependencies;
2061 	bioops.io_countdeps = softdep_count_dependencies;
2062 
2063 	/* Initialize the callout with an mtx. */
2064 	callout_init_mtx(&softdep_callout, &lk, 0);
2065 }
2066 
2067 /*
2068  * Executed after all filesystems have been unmounted during
2069  * filesystem module unload.
2070  */
2071 void
2072 softdep_uninitialize()
2073 {
2074 
2075 	callout_drain(&softdep_callout);
2076 	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
2077 	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
2078 	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
2079 	hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash);
2080 }
2081 
2082 /*
2083  * Called at mount time to notify the dependency code that a
2084  * filesystem wishes to use it.
2085  */
2086 int
2087 softdep_mount(devvp, mp, fs, cred)
2088 	struct vnode *devvp;
2089 	struct mount *mp;
2090 	struct fs *fs;
2091 	struct ucred *cred;
2092 {
2093 	struct csum_total cstotal;
2094 	struct ufsmount *ump;
2095 	struct cg *cgp;
2096 	struct buf *bp;
2097 	int error, cyl;
2098 
2099 	MNT_ILOCK(mp);
2100 	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2101 	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2102 		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2103 			MNTK_SOFTDEP;
2104 		mp->mnt_noasync++;
2105 	}
2106 	MNT_IUNLOCK(mp);
2107 	ump = VFSTOUFS(mp);
2108 	LIST_INIT(&ump->softdep_workitem_pending);
2109 	LIST_INIT(&ump->softdep_journal_pending);
2110 	TAILQ_INIT(&ump->softdep_unlinked);
2111 	ump->softdep_worklist_tail = NULL;
2112 	ump->softdep_on_worklist = 0;
2113 	ump->softdep_deps = 0;
2114 	if ((fs->fs_flags & FS_SUJ) &&
2115 	    (error = journal_mount(mp, fs, cred)) != 0) {
2116 		printf("Failed to start journal: %d\n", error);
2117 		return (error);
2118 	}
2119 	/*
2120 	 * When doing soft updates, the counters in the
2121 	 * superblock may have gotten out of sync. Recomputation
2122 	 * can take a long time and can be deferred for background
2123 	 * fsck.  However, the old behavior of scanning the cylinder
2124 	 * groups and recalculating them at mount time is available
2125 	 * by setting vfs.ffs.compute_summary_at_mount to one.
2126 	 */
2127 	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2128 		return (0);
2129 	bzero(&cstotal, sizeof cstotal);
2130 	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2131 		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2132 		    fs->fs_cgsize, cred, &bp)) != 0) {
2133 			brelse(bp);
2134 			return (error);
2135 		}
2136 		cgp = (struct cg *)bp->b_data;
2137 		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2138 		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2139 		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2140 		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2141 		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2142 		brelse(bp);
2143 	}
2144 #ifdef DEBUG
2145 	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2146 		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2147 #endif
2148 	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2149 	return (0);
2150 }
2151 
2152 void
2153 softdep_unmount(mp)
2154 	struct mount *mp;
2155 {
2156 
2157 	if (mp->mnt_kern_flag & MNTK_SUJ)
2158 		journal_unmount(mp);
2159 }
2160 
2161 struct jblocks {
2162 	struct jseglst	jb_segs;	/* TAILQ of current segments. */
2163 	struct jseg	*jb_writeseg;	/* Next write to complete. */
2164 	struct jseg	*jb_oldestseg;	/* Oldest segment with valid entries. */
2165 	struct jextent	*jb_extent;	/* Extent array. */
2166 	uint64_t	jb_nextseq;	/* Next sequence number. */
2167 	uint64_t	jb_oldestwrseq;	/* Oldest written sequence number. */
2168 	uint8_t		jb_needseg;	/* Need a forced segment. */
2169 	uint8_t		jb_suspended;	/* Did journal suspend writes? */
2170 	int		jb_avail;	/* Available extents. */
2171 	int		jb_used;	/* Last used extent. */
2172 	int		jb_head;	/* Allocator head. */
2173 	int		jb_off;		/* Allocator extent offset. */
2174 	int		jb_blocks;	/* Total disk blocks covered. */
2175 	int		jb_free;	/* Total disk blocks free. */
2176 	int		jb_min;		/* Minimum free space. */
2177 	int		jb_low;		/* Low on space. */
2178 	int		jb_age;		/* Insertion time of oldest rec. */
2179 };
2180 
2181 struct jextent {
2182 	ufs2_daddr_t	je_daddr;	/* Disk block address. */
2183 	int		je_blocks;	/* Disk block count. */
2184 };
2185 
2186 static struct jblocks *
2187 jblocks_create(void)
2188 {
2189 	struct jblocks *jblocks;
2190 
2191 	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2192 	TAILQ_INIT(&jblocks->jb_segs);
2193 	jblocks->jb_avail = 10;
2194 	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2195 	    M_JBLOCKS, M_WAITOK | M_ZERO);
2196 
2197 	return (jblocks);
2198 }
2199 
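/*
 * Allocate up to 'bytes' of space from the journal's current extent.  The
 * request is truncated to the contiguous space remaining in that extent
 * (advancing circularly to the next extent when the current one is used up)
 * and the number of bytes actually granted is returned via 'actual'.
 */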
2200 static ufs2_daddr_t
2201 jblocks_alloc(jblocks, bytes, actual)
2202 	struct jblocks *jblocks;
2203 	int bytes;
2204 	int *actual;
2205 {
2206 	ufs2_daddr_t daddr;
2207 	struct jextent *jext;
2208 	int freecnt;
2209 	int blocks;
2210 
2211 	blocks = bytes / DEV_BSIZE;
2212 	jext = &jblocks->jb_extent[jblocks->jb_head];
2213 	freecnt = jext->je_blocks - jblocks->jb_off;
2214 	if (freecnt == 0) {
2215 		jblocks->jb_off = 0;
2216 		if (++jblocks->jb_head > jblocks->jb_used)
2217 			jblocks->jb_head = 0;
2218 		jext = &jblocks->jb_extent[jblocks->jb_head];
2219 		freecnt = jext->je_blocks;
2220 	}
2221 	if (freecnt > blocks)
2222 		freecnt = blocks;
2223 	*actual = freecnt * DEV_BSIZE;
2224 	daddr = jext->je_daddr + jblocks->jb_off;
2225 	jblocks->jb_off += freecnt;
2226 	jblocks->jb_free -= freecnt;
2227 
2228 	return (daddr);
2229 }
2230 
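/*
 * Return 'bytes' of journal space to the free count, wake any waiters, and
 * kick the worklist if the journal had suspended writes.
 */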
2231 static void
2232 jblocks_free(jblocks, mp, bytes)
2233 	struct jblocks *jblocks;
2234 	struct mount *mp;
2235 	int bytes;
2236 {
2237 
2238 	jblocks->jb_free += bytes / DEV_BSIZE;
2239 	if (jblocks->jb_suspended)
2240 		worklist_speedup();
2241 	wakeup(jblocks);
2242 }
2243 
2244 static void
2245 jblocks_destroy(jblocks)
2246 	struct jblocks *jblocks;
2247 {
2248 
2249 	if (jblocks->jb_extent)
2250 		free(jblocks->jb_extent, M_JBLOCKS);
2251 	free(jblocks, M_JBLOCKS);
2252 }
2253 
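/*
 * Add 'blocks' disk blocks starting at 'daddr' to the journal's extent
 * array, extending the last extent when the new range is contiguous with
 * it and doubling the extent array when it fills up.
 */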
2254 static void
2255 jblocks_add(jblocks, daddr, blocks)
2256 	struct jblocks *jblocks;
2257 	ufs2_daddr_t daddr;
2258 	int blocks;
2259 {
2260 	struct jextent *jext;
2261 
2262 	jblocks->jb_blocks += blocks;
2263 	jblocks->jb_free += blocks;
2264 	jext = &jblocks->jb_extent[jblocks->jb_used];
2265 	/* Adding the first block. */
2266 	if (jext->je_daddr == 0) {
2267 		jext->je_daddr = daddr;
2268 		jext->je_blocks = blocks;
2269 		return;
2270 	}
2271 	/* Extending the last extent. */
2272 	if (jext->je_daddr + jext->je_blocks == daddr) {
2273 		jext->je_blocks += blocks;
2274 		return;
2275 	}
2276 	/* Adding a new extent. */
2277 	if (++jblocks->jb_used == jblocks->jb_avail) {
2278 		jblocks->jb_avail *= 2;
2279 		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2280 		    M_JBLOCKS, M_WAITOK | M_ZERO);
2281 		memcpy(jext, jblocks->jb_extent,
2282 		    sizeof(struct jextent) * jblocks->jb_used);
2283 		free(jblocks->jb_extent, M_JBLOCKS);
2284 		jblocks->jb_extent = jext;
2285 	}
2286 	jext = &jblocks->jb_extent[jblocks->jb_used];
2287 	jext->je_daddr = daddr;
2288 	jext->je_blocks = blocks;
2289 	return;
2290 }
2291 
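/*
 * Look up the journal file (SUJ_FILE) in the root directory of the
 * filesystem and return its vnode, exclusively locked, in 'vpp'.
 */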
2292 int
2293 softdep_journal_lookup(mp, vpp)
2294 	struct mount *mp;
2295 	struct vnode **vpp;
2296 {
2297 	struct componentname cnp;
2298 	struct vnode *dvp;
2299 	ino_t sujournal;
2300 	int error;
2301 
2302 	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2303 	if (error)
2304 		return (error);
2305 	bzero(&cnp, sizeof(cnp));
2306 	cnp.cn_nameiop = LOOKUP;
2307 	cnp.cn_flags = ISLASTCN;
2308 	cnp.cn_thread = curthread;
2309 	cnp.cn_cred = curthread->td_ucred;
2310 	cnp.cn_pnbuf = SUJ_FILE;
2311 	cnp.cn_nameptr = SUJ_FILE;
2312 	cnp.cn_namelen = strlen(SUJ_FILE);
2313 	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2314 	vput(dvp);
2315 	if (error != 0)
2316 		return (error);
2317 	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2318 	return (error);
2319 }
2320 
2321 /*
2322  * Open and verify the journal file.
2323  */
2324 static int
2325 journal_mount(mp, fs, cred)
2326 	struct mount *mp;
2327 	struct fs *fs;
2328 	struct ucred *cred;
2329 {
2330 	struct jblocks *jblocks;
2331 	struct vnode *vp;
2332 	struct inode *ip;
2333 	ufs2_daddr_t blkno;
2334 	int bcount;
2335 	int error;
2336 	int i;
2337 
2338 	error = softdep_journal_lookup(mp, &vp);
2339 	if (error != 0) {
2340 		printf("Failed to find journal.  Use tunefs to create one\n");
2341 		return (error);
2342 	}
2343 	ip = VTOI(vp);
2344 	if (ip->i_size < SUJ_MIN) {
2345 		error = ENOSPC;
2346 		goto out;
2347 	}
2348 	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2349 	jblocks = jblocks_create();
2350 	for (i = 0; i < bcount; i++) {
2351 		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2352 		if (error)
2353 			break;
2354 		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2355 	}
2356 	if (error) {
2357 		jblocks_destroy(jblocks);
2358 		goto out;
2359 	}
2360 	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2361 	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2362 	VFSTOUFS(mp)->softdep_jblocks = jblocks;
2363 out:
2364 	if (error == 0) {
2365 		MNT_ILOCK(mp);
2366 		mp->mnt_kern_flag |= MNTK_SUJ;
2367 		MNT_IUNLOCK(mp);
2368 		/*
2369 		 * Only validate the journal contents if the
2370 		 * filesystem is clean, otherwise we write the logs
2371 		 * but they'll never be used.  If the filesystem was
2372 		 * still dirty when we mounted it the journal is
2373 		 * invalid and a new journal can only be valid if it
2374 		 * starts from a clean mount.
2375 		 */
2376 		if (fs->fs_clean) {
2377 			DIP_SET(ip, i_modrev, fs->fs_mtime);
2378 			ip->i_flags |= IN_MODIFIED;
2379 			ffs_update(vp, 1);
2380 		}
2381 	}
2382 	vput(vp);
2383 	return (error);
2384 }
2385 
2386 static void
2387 journal_unmount(mp)
2388 	struct mount *mp;
2389 {
2390 	struct ufsmount *ump;
2391 
2392 	ump = VFSTOUFS(mp);
2393 	if (ump->softdep_jblocks)
2394 		jblocks_destroy(ump->softdep_jblocks);
2395 	ump->softdep_jblocks = NULL;
2396 }
2397 
2398 /*
2399  * Called when a journal record is ready to be written.  Space is allocated
2400  * and the journal entry is created when the journal is flushed to stable
2401  * store.
2402  */
2403 static void
2404 add_to_journal(wk)
2405 	struct worklist *wk;
2406 {
2407 	struct ufsmount *ump;
2408 
2409 	mtx_assert(&lk, MA_OWNED);
2410 	ump = VFSTOUFS(wk->wk_mp);
2411 	if (wk->wk_state & ONWORKLIST)
2412 		panic("add_to_journal: %s(0x%X) already on list",
2413 		    TYPENAME(wk->wk_type), wk->wk_state);
2414 	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2415 	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2416 		ump->softdep_jblocks->jb_age = ticks;
2417 		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2418 	} else
2419 		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2420 	ump->softdep_journal_tail = wk;
2421 	ump->softdep_on_journal += 1;
2422 }
2423 
2424 /*
2425  * Remove an arbitrary item from the journal worklist, maintaining the
2426  * tail pointer.  This happens when a new operation obviates the need to
2427  * journal an old operation.
2428  */
2429 static void
2430 remove_from_journal(wk)
2431 	struct worklist *wk;
2432 {
2433 	struct ufsmount *ump;
2434 
2435 	mtx_assert(&lk, MA_OWNED);
2436 	ump = VFSTOUFS(wk->wk_mp);
2437 #ifdef SUJ_DEBUG
2438 	{
2439 		struct worklist *wkn;
2440 
2441 		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2442 			if (wkn == wk)
2443 				break;
2444 		if (wkn == NULL)
2445 			panic("remove_from_journal: %p is not in journal", wk);
2446 	}
2447 #endif
2448 	/*
2449 	 * We emulate a TAILQ to save space in most structures which do not
2450 	 * require TAILQ semantics.  Here we must update the tail position when
2451 	 * removing the current tail.  Casting le_prev back to a worklist works
2452 	 * only if the worklist linkage is at the beginning of the structure.
2453 	 */
2454 	if (ump->softdep_journal_tail == wk)
2455 		ump->softdep_journal_tail =
2456 		    (struct worklist *)wk->wk_list.le_prev;
2457 
2458 	WORKLIST_REMOVE(wk);
2459 	ump->softdep_on_journal -= 1;
2460 }
2461 
2462 /*
2463  * Check for journal space as well as dependency limits so the prelink
2464  * code can throttle both journaled and non-journaled filesystems.
2465  * The thresh argument selects the watermark: 0 for low, 1 for min.
2466  */
2467 static int
2468 journal_space(ump, thresh)
2469 	struct ufsmount *ump;
2470 	int thresh;
2471 {
2472 	struct jblocks *jblocks;
2473 	int avail;
2474 
2475 	jblocks = ump->softdep_jblocks;
2476 	if (jblocks == NULL)
2477 		return (1);
2478 	/*
2479 	 * We use a tighter restriction here to prevent request_cleanup(),
2480 	 * running in other threads, from colliding with locks we currently hold.
2481 	 */
2482 	if (dep_current[D_INODEDEP] > (max_softdeps / 10) * 9)
2483 		return (0);
2484 	if (thresh)
2485 		thresh = jblocks->jb_min;
2486 	else
2487 		thresh = jblocks->jb_low;
2488 	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2489 	avail = jblocks->jb_free - avail;
2490 
2491 	return (avail > thresh);
2492 }
2493 
2494 static void
2495 journal_suspend(ump)
2496 	struct ufsmount *ump;
2497 {
2498 	struct jblocks *jblocks;
2499 	struct mount *mp;
2500 
2501 	mp = UFSTOVFS(ump);
2502 	jblocks = ump->softdep_jblocks;
2503 	MNT_ILOCK(mp);
2504 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2505 		stat_journal_min++;
2506 		mp->mnt_kern_flag |= MNTK_SUSPEND;
2507 		mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
2508 	}
2509 	jblocks->jb_suspended = 1;
2510 	MNT_IUNLOCK(mp);
2511 }
2512 
2513 static int
2514 journal_unsuspend(struct ufsmount *ump)
2515 {
2516 	struct jblocks *jblocks;
2517 	struct mount *mp;
2518 
2519 	mp = UFSTOVFS(ump);
2520 	jblocks = ump->softdep_jblocks;
2521 
2522 	if (jblocks != NULL && jblocks->jb_suspended &&
2523 	    journal_space(ump, jblocks->jb_min)) {
2524 		jblocks->jb_suspended = 0;
2525 		FREE_LOCK(&lk);
2526 		mp->mnt_susp_owner = curthread;
2527 		vfs_write_resume(mp);
2528 		ACQUIRE_LOCK(&lk);
2529 		return (1);
2530 	}
2531 	return (0);
2532 }
2533 
2534 /*
2535  * Called before any allocation function to be certain that there is
2536  * sufficient space in the journal prior to creating any new records.
2537  * Since, in the case of block allocation, we may have multiple locked
2538  * buffers at the time of the actual allocation, we can not block when the
2539  * journal records are created.  Doing so would create a deadlock if any of
2540  * these buffers needed to be flushed to reclaim space.  Instead we require
2541  * a sufficiently large amount of available space such that each thread in
2542  * the system could have passed this allocation check and still have
2543  * sufficient free space.  With 20% of a minimum journal size of 1MB
2544  * (at JREC_SIZE bytes per record) we have 6553 records available.
2545  */
2546 int
2547 softdep_prealloc(vp, waitok)
2548 	struct vnode *vp;
2549 	int waitok;
2550 {
2551 	struct ufsmount *ump;
2552 
2553 	if (DOINGSUJ(vp) == 0)
2554 		return (0);
2555 	ump = VFSTOUFS(vp->v_mount);
2556 	ACQUIRE_LOCK(&lk);
2557 	if (journal_space(ump, 0)) {
2558 		FREE_LOCK(&lk);
2559 		return (0);
2560 	}
2561 	stat_journal_low++;
2562 	FREE_LOCK(&lk);
2563 	if (waitok == MNT_NOWAIT)
2564 		return (ENOSPC);
2565 	/*
2566 	 * Attempt to sync this vnode once to flush any journal
2567 	 * work attached to it.
2568 	 */
2569 	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
2570 		ffs_syncvnode(vp, waitok);
2571 	ACQUIRE_LOCK(&lk);
2572 	process_removes(vp);
2573 	if (journal_space(ump, 0) == 0) {
2574 		softdep_speedup();
2575 		if (journal_space(ump, 1) == 0)
2576 			journal_suspend(ump);
2577 	}
2578 	FREE_LOCK(&lk);
2579 
2580 	return (0);
2581 }
2582 
2583 /*
2584  * Before adjusting a link count on a vnode verify that we have sufficient
2585  * journal space.  If not, process operations that depend on the currently
2586  * locked pair of vnodes to try to flush space as the syncer, buf daemon,
2587  * and softdep flush threads can not acquire these locks to reclaim space.
2588  */
2589 static void
2590 softdep_prelink(dvp, vp)
2591 	struct vnode *dvp;
2592 	struct vnode *vp;
2593 {
2594 	struct ufsmount *ump;
2595 
2596 	ump = VFSTOUFS(dvp->v_mount);
2597 	mtx_assert(&lk, MA_OWNED);
2598 	if (journal_space(ump, 0))
2599 		return;
2600 	stat_journal_low++;
2601 	FREE_LOCK(&lk);
2602 	if (vp)
2603 		ffs_syncvnode(vp, MNT_NOWAIT);
2604 	ffs_syncvnode(dvp, MNT_WAIT);
2605 	ACQUIRE_LOCK(&lk);
2606 	/* Process vp before dvp as it may create .. removes. */
2607 	if (vp)
2608 		process_removes(vp);
2609 	process_removes(dvp);
2610 	softdep_speedup();
2611 	process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
2612 	process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
2613 	if (journal_space(ump, 0) == 0) {
2614 		softdep_speedup();
2615 		if (journal_space(ump, 1) == 0)
2616 			journal_suspend(ump);
2617 	}
2618 }
2619 
2620 static void
2621 jseg_write(ump, jseg, data)
2622 	struct ufsmount *ump;
2623 	struct jseg *jseg;
2624 	uint8_t *data;
2625 {
2626 	struct jsegrec *rec;
2627 
2628 	rec = (struct jsegrec *)data;
2629 	rec->jsr_seq = jseg->js_seq;
2630 	rec->jsr_oldest = jseg->js_oldseq;
2631 	rec->jsr_cnt = jseg->js_cnt;
2632 	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
2633 	rec->jsr_crc = 0;
2634 	rec->jsr_time = ump->um_fs->fs_mtime;
2635 }
2636 
2637 static inline void
2638 inoref_write(inoref, jseg, rec)
2639 	struct inoref *inoref;
2640 	struct jseg *jseg;
2641 	struct jrefrec *rec;
2642 {
2643 
2644 	inoref->if_jsegdep->jd_seg = jseg;
2645 	rec->jr_ino = inoref->if_ino;
2646 	rec->jr_parent = inoref->if_parent;
2647 	rec->jr_nlink = inoref->if_nlink;
2648 	rec->jr_mode = inoref->if_mode;
2649 	rec->jr_diroff = inoref->if_diroff;
2650 }
2651 
2652 static void
2653 jaddref_write(jaddref, jseg, data)
2654 	struct jaddref *jaddref;
2655 	struct jseg *jseg;
2656 	uint8_t *data;
2657 {
2658 	struct jrefrec *rec;
2659 
2660 	rec = (struct jrefrec *)data;
2661 	rec->jr_op = JOP_ADDREF;
2662 	inoref_write(&jaddref->ja_ref, jseg, rec);
2663 }
2664 
2665 static void
2666 jremref_write(jremref, jseg, data)
2667 	struct jremref *jremref;
2668 	struct jseg *jseg;
2669 	uint8_t *data;
2670 {
2671 	struct jrefrec *rec;
2672 
2673 	rec = (struct jrefrec *)data;
2674 	rec->jr_op = JOP_REMREF;
2675 	inoref_write(&jremref->jr_ref, jseg, rec);
2676 }
2677 
2678 static void
2679 jmvref_write(jmvref, jseg, data)
2680 	struct jmvref *jmvref;
2681 	struct jseg *jseg;
2682 	uint8_t *data;
2683 {
2684 	struct jmvrec *rec;
2685 
2686 	rec = (struct jmvrec *)data;
2687 	rec->jm_op = JOP_MVREF;
2688 	rec->jm_ino = jmvref->jm_ino;
2689 	rec->jm_parent = jmvref->jm_parent;
2690 	rec->jm_oldoff = jmvref->jm_oldoff;
2691 	rec->jm_newoff = jmvref->jm_newoff;
2692 }
2693 
2694 static void
2695 jnewblk_write(jnewblk, jseg, data)
2696 	struct jnewblk *jnewblk;
2697 	struct jseg *jseg;
2698 	uint8_t *data;
2699 {
2700 	struct jblkrec *rec;
2701 
2702 	jnewblk->jn_jsegdep->jd_seg = jseg;
2703 	rec = (struct jblkrec *)data;
2704 	rec->jb_op = JOP_NEWBLK;
2705 	rec->jb_ino = jnewblk->jn_ino;
2706 	rec->jb_blkno = jnewblk->jn_blkno;
2707 	rec->jb_lbn = jnewblk->jn_lbn;
2708 	rec->jb_frags = jnewblk->jn_frags;
2709 	rec->jb_oldfrags = jnewblk->jn_oldfrags;
2710 }
2711 
2712 static void
2713 jfreeblk_write(jfreeblk, jseg, data)
2714 	struct jfreeblk *jfreeblk;
2715 	struct jseg *jseg;
2716 	uint8_t *data;
2717 {
2718 	struct jblkrec *rec;
2719 
2720 	jfreeblk->jf_jsegdep->jd_seg = jseg;
2721 	rec = (struct jblkrec *)data;
2722 	rec->jb_op = JOP_FREEBLK;
2723 	rec->jb_ino = jfreeblk->jf_ino;
2724 	rec->jb_blkno = jfreeblk->jf_blkno;
2725 	rec->jb_lbn = jfreeblk->jf_lbn;
2726 	rec->jb_frags = jfreeblk->jf_frags;
2727 	rec->jb_oldfrags = 0;
2728 }
2729 
2730 static void
2731 jfreefrag_write(jfreefrag, jseg, data)
2732 	struct jfreefrag *jfreefrag;
2733 	struct jseg *jseg;
2734 	uint8_t *data;
2735 {
2736 	struct jblkrec *rec;
2737 
2738 	jfreefrag->fr_jsegdep->jd_seg = jseg;
2739 	rec = (struct jblkrec *)data;
2740 	rec->jb_op = JOP_FREEBLK;
2741 	rec->jb_ino = jfreefrag->fr_ino;
2742 	rec->jb_blkno = jfreefrag->fr_blkno;
2743 	rec->jb_lbn = jfreefrag->fr_lbn;
2744 	rec->jb_frags = jfreefrag->fr_frags;
2745 	rec->jb_oldfrags = 0;
2746 }
2747 
2748 static void
2749 jtrunc_write(jtrunc, jseg, data)
2750 	struct jtrunc *jtrunc;
2751 	struct jseg *jseg;
2752 	uint8_t *data;
2753 {
2754 	struct jtrncrec *rec;
2755 
2756 	rec = (struct jtrncrec *)data;
2757 	rec->jt_op = JOP_TRUNC;
2758 	rec->jt_ino = jtrunc->jt_ino;
2759 	rec->jt_size = jtrunc->jt_size;
2760 	rec->jt_extsize = jtrunc->jt_extsize;
2761 }
2762 
2763 /*
2764  * Flush some journal records to disk.
2765  */
2766 static void
2767 softdep_process_journal(mp, needwk, flags)
2768 	struct mount *mp;
2769 	struct worklist *needwk;
2770 	int flags;
2771 {
2772 	struct jblocks *jblocks;
2773 	struct ufsmount *ump;
2774 	struct worklist *wk;
2775 	struct jseg *jseg;
2776 	struct buf *bp;
2777 	uint8_t *data;
2778 	struct fs *fs;
2779 	int segwritten;
2780 	int jrecmin;	/* Minimum records per block. */
2781 	int jrecmax;	/* Maximum records per block. */
2782 	int size;
2783 	int cnt;
2784 	int off;
2785 	int devbsize;
2786 
2787 	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
2788 		return;
2789 	ump = VFSTOUFS(mp);
2790 	fs = ump->um_fs;
2791 	jblocks = ump->softdep_jblocks;
2792 	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
2793 	/*
2794 	 * We write anywhere between a disk block and fs block.  The upper
2795 	 * bound is picked to prevent buffer cache fragmentation and limit
2796 	 * processing time per I/O.
2797 	 */
2798 	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
2799 	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
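	/*
	 * For illustration only (these values are assumed, not asserted by
	 * the code): a 512-byte device block with 32-byte journal records
	 * gives jrecmin = 16 - 1 = 15, and a 16K fs block then gives
	 * jrecmax = (16384 / 512) * 15 = 480 records.
	 */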
2800 	segwritten = 0;
2801 	for (;;) {
2802 		cnt = ump->softdep_on_journal;
2803 		/*
2804 		 * Criteria for writing a segment:
2805 		 * 1) We have a full block.
2806 		 * 2) We're called from jwait() and haven't found the
2807 		 *    journal item yet.
2808 		 * 3) Always write if needseg is set.
2809 		 * 4) If we are called from process_worklist and have
2810 		 *    not yet written anything we write a partial block
2811 		 *    to enforce a 1 second maximum latency on journal
2812 		 *    entries.
2813 		 */
2814 		if (cnt < (jrecmax - 1) && needwk == NULL &&
2815 		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
2816 			break;
2817 		cnt++;
2818 		/*
2819 		 * Verify some free journal space.  softdep_prealloc() should
2820 		 * guarantee that we don't run out, so this is indicative of
2821 		 * a problem with the flow control.  Try to recover
2822 		 * gracefully in any event.
2823 		 */
2824 		while (jblocks->jb_free == 0) {
2825 			if (flags != MNT_WAIT)
2826 				break;
2827 			printf("softdep: Out of journal space!\n");
2828 			softdep_speedup();
2829 			msleep(jblocks, &lk, PRIBIO, "jblocks", hz);
2830 		}
2831 		FREE_LOCK(&lk);
2832 		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
2833 		workitem_alloc(&jseg->js_list, D_JSEG, mp);
2834 		LIST_INIT(&jseg->js_entries);
2835 		LIST_INIT(&jseg->js_indirs);
2836 		jseg->js_state = ATTACHED;
2837 		jseg->js_jblocks = jblocks;
2838 		bp = geteblk(fs->fs_bsize, 0);
2839 		ACQUIRE_LOCK(&lk);
2840 		/*
2841 		 * If there was a race while we were allocating the block
2842 		 * and jseg, the entry we care about was likely written.
2843 		 * We bail out in both the WAIT and NOWAIT case and assume
2844 		 * the caller will loop if the entry it cares about is
2845 		 * not written.
2846 		 */
2847 		cnt = ump->softdep_on_journal;
2848 		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
2849 			bp->b_flags |= B_INVAL | B_NOCACHE;
2850 			WORKITEM_FREE(jseg, D_JSEG);
2851 			FREE_LOCK(&lk);
2852 			brelse(bp);
2853 			ACQUIRE_LOCK(&lk);
2854 			break;
2855 		}
2856 		/*
2857 		 * Calculate the disk block size required for the available
2858 		 * records rounded to the min size.
2859 		 */
2860 		if (cnt == 0)
2861 			size = devbsize;
2862 		else if (cnt < jrecmax)
2863 			size = howmany(cnt, jrecmin) * devbsize;
2864 		else
2865 			size = fs->fs_bsize;
2866 		/*
2867 		 * Allocate a disk block for this journal data and account
2868 		 * for truncation of the requested size if enough contiguous
2869 		 * space was not available.
2870 		 */
2871 		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
2872 		bp->b_lblkno = bp->b_blkno;
2873 		bp->b_offset = bp->b_blkno * DEV_BSIZE;
2874 		bp->b_bcount = size;
2875 		bp->b_bufobj = &ump->um_devvp->v_bufobj;
2876 		bp->b_flags &= ~B_INVAL;
2877 		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
2878 		/*
2879 		 * Initialize our jseg with cnt records.  Assign the next
2880 		 * sequence number to it and link it in-order.
2881 		 */
2882 		cnt = MIN(cnt, (size / devbsize) * jrecmin);
2883 		jseg->js_buf = bp;
2884 		jseg->js_cnt = cnt;
2885 		jseg->js_refs = cnt + 1;	/* Self ref. */
2886 		jseg->js_size = size;
2887 		jseg->js_seq = jblocks->jb_nextseq++;
2888 		if (jblocks->jb_oldestseg == NULL)
2889 			jblocks->jb_oldestseg = jseg;
2890 		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
2891 		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
2892 		if (jblocks->jb_writeseg == NULL)
2893 			jblocks->jb_writeseg = jseg;
2894 		/*
2895 		 * Start filling in records from the pending list.
2896 		 */
2897 		data = bp->b_data;
2898 		off = 0;
2899 		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
2900 		    != NULL) {
2901 			if (cnt == 0)
2902 				break;
2903 			/* Place a segment header on every device block. */
2904 			if ((off % devbsize) == 0) {
2905 				jseg_write(ump, jseg, data);
2906 				off += JREC_SIZE;
2907 				data = bp->b_data + off;
2908 			}
2909 			if (wk == needwk)
2910 				needwk = NULL;
2911 			remove_from_journal(wk);
2912 			wk->wk_state |= IOSTARTED;
2913 			WORKLIST_INSERT(&jseg->js_entries, wk);
2914 			switch (wk->wk_type) {
2915 			case D_JADDREF:
2916 				jaddref_write(WK_JADDREF(wk), jseg, data);
2917 				break;
2918 			case D_JREMREF:
2919 				jremref_write(WK_JREMREF(wk), jseg, data);
2920 				break;
2921 			case D_JMVREF:
2922 				jmvref_write(WK_JMVREF(wk), jseg, data);
2923 				break;
2924 			case D_JNEWBLK:
2925 				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
2926 				break;
2927 			case D_JFREEBLK:
2928 				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
2929 				break;
2930 			case D_JFREEFRAG:
2931 				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
2932 				break;
2933 			case D_JTRUNC:
2934 				jtrunc_write(WK_JTRUNC(wk), jseg, data);
2935 				break;
2936 			default:
2937 				panic("process_journal: Unknown type %s",
2938 				    TYPENAME(wk->wk_type));
2939 				/* NOTREACHED */
2940 			}
2941 			off += JREC_SIZE;
2942 			data = bp->b_data + off;
2943 			cnt--;
2944 		}
2945 		/*
2946 		 * Write this one buffer and continue.
2947 		 */
2948 		segwritten = 1;
2949 		jblocks->jb_needseg = 0;
2950 		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
2951 		FREE_LOCK(&lk);
2952 		BO_LOCK(bp->b_bufobj);
2953 		bgetvp(ump->um_devvp, bp);
2954 		BO_UNLOCK(bp->b_bufobj);
2955 		/*
2956 		 * We only do the blocking wait once we find the journal
2957 		 * entry we're looking for.
2958 		 */
2959 		if (needwk == NULL && flags & MNT_WAIT)
2960 			bwrite(bp);
2961 		else
2962 			bawrite(bp);
2963 		ACQUIRE_LOCK(&lk);
2964 	}
2965 	/*
2966 	 * If we've suspended the filesystem because we ran out of journal
2967 	 * space, either try to sync it here to make some progress or
2968 	 * unsuspend it if we already have.
2969 	 */
2970 	if (flags == 0 && jblocks->jb_suspended) {
2971 		if (journal_unsuspend(ump))
2972 			return;
2973 		FREE_LOCK(&lk);
2974 		VFS_SYNC(mp, MNT_NOWAIT);
2975 		ffs_sbupdate(ump, MNT_WAIT, 0);
2976 		ACQUIRE_LOCK(&lk);
2977 	}
2978 }
2979 
2980 /*
2981  * Complete a jseg, allowing all dependencies awaiting journal writes
2982  * to proceed.  Each journal dependency also attaches a jsegdep to dependent
2983  * structures so that the journal segment can be freed to reclaim space.
2984  */
2985 static void
2986 complete_jseg(jseg)
2987 	struct jseg *jseg;
2988 {
2989 	struct worklist *wk;
2990 	struct jmvref *jmvref;
2991 	int waiting;
2992 #ifdef INVARIANTS
2993 	int i = 0;
2994 #endif
2995 
2996 	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
2997 		WORKLIST_REMOVE(wk);
2998 		waiting = wk->wk_state & IOWAITING;
2999 		wk->wk_state &= ~(IOSTARTED | IOWAITING);
3000 		wk->wk_state |= COMPLETE;
3001 		KASSERT(i++ < jseg->js_cnt,
3002 		    ("handle_written_jseg: overflow %d >= %d",
3003 		    i - 1, jseg->js_cnt));
3004 		switch (wk->wk_type) {
3005 		case D_JADDREF:
3006 			handle_written_jaddref(WK_JADDREF(wk));
3007 			break;
3008 		case D_JREMREF:
3009 			handle_written_jremref(WK_JREMREF(wk));
3010 			break;
3011 		case D_JMVREF:
3012 			/* No jsegdep here. */
3013 			rele_jseg(jseg);
3014 			jmvref = WK_JMVREF(wk);
3015 			LIST_REMOVE(jmvref, jm_deps);
3016 			free_pagedep(jmvref->jm_pagedep);
3017 			WORKITEM_FREE(jmvref, D_JMVREF);
3018 			break;
3019 		case D_JNEWBLK:
3020 			handle_written_jnewblk(WK_JNEWBLK(wk));
3021 			break;
3022 		case D_JFREEBLK:
3023 			handle_written_jfreeblk(WK_JFREEBLK(wk));
3024 			break;
3025 		case D_JFREEFRAG:
3026 			handle_written_jfreefrag(WK_JFREEFRAG(wk));
3027 			break;
3028 		case D_JTRUNC:
3029 			WK_JTRUNC(wk)->jt_jsegdep->jd_seg = jseg;
3030 			WORKITEM_FREE(wk, D_JTRUNC);
3031 			break;
3032 		default:
3033 			panic("handle_written_jseg: Unknown type %s",
3034 			    TYPENAME(wk->wk_type));
3035 			/* NOTREACHED */
3036 		}
3037 		if (waiting)
3038 			wakeup(wk);
3039 	}
3040 	/* Release the self reference so the structure may be freed. */
3041 	rele_jseg(jseg);
3042 }
3043 
3044 /*
3045  * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Handle jseg
3046  * completions in order only.
3047  */
3048 static void
3049 handle_written_jseg(jseg, bp)
3050 	struct jseg *jseg;
3051 	struct buf *bp;
3052 {
3053 	struct jblocks *jblocks;
3054 	struct jseg *jsegn;
3055 
3056 	if (jseg->js_refs == 0)
3057 		panic("handle_written_jseg: No self-reference on %p", jseg);
3058 	jseg->js_state |= DEPCOMPLETE;
3059 	/*
3060 	 * We'll never need this buffer again, set flags so it will be
3061 	 * discarded.
3062 	 */
3063 	bp->b_flags |= B_INVAL | B_NOCACHE;
3064 	jblocks = jseg->js_jblocks;
3065 	/*
3066 	 * Don't allow out of order completions.  If this isn't the first
3067 	 * block, wait for it to write before we're done.
3068 	 */
3069 	if (jseg != jblocks->jb_writeseg)
3070 		return;
3071 	/* Iterate through available jsegs processing their entries. */
3072 	do {
3073 		jblocks->jb_oldestwrseq = jseg->js_oldseq;
3074 		jsegn = TAILQ_NEXT(jseg, js_next);
3075 		complete_jseg(jseg);
3076 		jseg = jsegn;
3077 	} while (jseg && jseg->js_state & DEPCOMPLETE);
3078 	jblocks->jb_writeseg = jseg;
3079 	/*
3080 	 * Attempt to free jsegs now that oldestwrseq may have advanced.
3081 	 */
3082 	free_jsegs(jblocks);
3083 }
3084 
3085 static inline struct jsegdep *
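/*
 * Detach the jsegdep from an inoref and return it, leaving the field NULL.
 */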
3086 inoref_jseg(inoref)
3087 	struct inoref *inoref;
3088 {
3089 	struct jsegdep *jsegdep;
3090 
3091 	jsegdep = inoref->if_jsegdep;
3092 	inoref->if_jsegdep = NULL;
3093 
3094 	return (jsegdep);
3095 }
3096 
3097 /*
3098  * Called once a jremref has made it to stable store.  The jremref is marked
3099  * complete and we attempt to free it.  Any pagedep writes sleeping while
3100  * waiting for the jremref to complete will be awoken by free_jremref.
3101  */
3102 static void
3103 handle_written_jremref(jremref)
3104 	struct jremref *jremref;
3105 {
3106 	struct inodedep *inodedep;
3107 	struct jsegdep *jsegdep;
3108 	struct dirrem *dirrem;
3109 
3110 	/* Grab the jsegdep. */
3111 	jsegdep = inoref_jseg(&jremref->jr_ref);
3112 	/*
3113 	 * Remove us from the inoref list.
3114 	 */
3115 	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3116 	    0, &inodedep) == 0)
3117 		panic("handle_written_jremref: Lost inodedep");
3118 	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3119 	/*
3120 	 * Complete the dirrem.
3121 	 */
3122 	dirrem = jremref->jr_dirrem;
3123 	jremref->jr_dirrem = NULL;
3124 	LIST_REMOVE(jremref, jr_deps);
3125 	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3126 	WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list);
3127 	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3128 	    (dirrem->dm_state & COMPLETE) != 0)
3129 		add_to_worklist(&dirrem->dm_list, 0);
3130 	free_jremref(jremref);
3131 }
3132 
3133 /*
3134  * Called once a jaddref has made it to stable store.  The dependency is
3135  * marked complete and any dependent structures are added to the inode
3136  * bufwait list to be completed as soon as it is written.  If a bitmap write
3137  * depends on this entry we move the inode into the inodedephd of the
3138  * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3139  */
3140 static void
3141 handle_written_jaddref(jaddref)
3142 	struct jaddref *jaddref;
3143 {
3144 	struct jsegdep *jsegdep;
3145 	struct inodedep *inodedep;
3146 	struct diradd *diradd;
3147 	struct mkdir *mkdir;
3148 
3149 	/* Grab the jsegdep. */
3150 	jsegdep = inoref_jseg(&jaddref->ja_ref);
3151 	mkdir = NULL;
3152 	diradd = NULL;
3153 	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3154 	    0, &inodedep) == 0)
3155 		panic("handle_written_jaddref: Lost inodedep.");
3156 	if (jaddref->ja_diradd == NULL)
3157 		panic("handle_written_jaddref: No dependency");
3158 	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3159 		diradd = jaddref->ja_diradd;
3160 		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3161 	} else if (jaddref->ja_state & MKDIR_PARENT) {
3162 		mkdir = jaddref->ja_mkdir;
3163 		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3164 	} else if (jaddref->ja_state & MKDIR_BODY)
3165 		mkdir = jaddref->ja_mkdir;
3166 	else
3167 		panic("handle_written_jaddref: Unknown dependency %p",
3168 		    jaddref->ja_diradd);
3169 	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3170 	/*
3171 	 * Remove us from the inode list.
3172 	 */
3173 	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3174 	/*
3175 	 * The mkdir may be waiting on the jaddref to clear before freeing.
3176 	 */
3177 	if (mkdir) {
3178 		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3179 		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3180 		    TYPENAME(mkdir->md_list.wk_type)));
3181 		mkdir->md_jaddref = NULL;
3182 		diradd = mkdir->md_diradd;
3183 		mkdir->md_state |= DEPCOMPLETE;
3184 		complete_mkdir(mkdir);
3185 	}
3186 	WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list);
3187 	if (jaddref->ja_state & NEWBLOCK) {
3188 		inodedep->id_state |= ONDEPLIST;
3189 		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3190 		    inodedep, id_deps);
3191 	}
3192 	free_jaddref(jaddref);
3193 }
3194 
3195 /*
3196  * Called once a jnewblk journal is written.  The allocdirect or allocindir
3197  * is placed in the bmsafemap to await notification of a written bitmap.  If
3198  * the operation was canceled we add the segdep to the appropriate
3199  * dependency to free the journal space once the canceling operation
3200  * completes.
3201  */
3202 static void
3203 handle_written_jnewblk(jnewblk)
3204 	struct jnewblk *jnewblk;
3205 {
3206 	struct bmsafemap *bmsafemap;
3207 	struct freefrag *freefrag;
3208 	struct jsegdep *jsegdep;
3209 	struct newblk *newblk;
3210 	struct freework *freework;
3211 	struct indirdep *indirdep;
3212 
3213 	/* Grab the jsegdep. */
3214 	jsegdep = jnewblk->jn_jsegdep;
3215 	jnewblk->jn_jsegdep = NULL;
3216 	if (jnewblk->jn_dep == NULL)
3217 		panic("handle_written_jnewblk: No dependency for the segdep.");
3218 	switch (jnewblk->jn_dep->wk_type) {
3219 	case D_NEWBLK:
3220 	case D_ALLOCDIRECT:
3221 	case D_ALLOCINDIR:
3222 		/*
3223 		 * Add the written block to the bmsafemap so it can
3224 		 * be notified when the bitmap is on disk.
3225 		 */
3226 		newblk = WK_NEWBLK(jnewblk->jn_dep);
3227 		newblk->nb_jnewblk = NULL;
3228 		bmsafemap = newblk->nb_bmsafemap;
3229 		newblk->nb_state |= ONDEPLIST;
3230 		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
3231 		WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list);
3232 		break;
3233 	case D_FREEFRAG:
3234 		/*
3235 		 * A newblock being removed by a freefrag when replaced by
3236 		 * frag extension.
3237 		 */
3238 		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3239 		freefrag->ff_jdep = NULL;
3240 		WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
3241 		break;
3242 	case D_FREEWORK:
3243 		/*
3244 		 * A direct block was removed by truncate.
3245 		 */
3246 		freework = WK_FREEWORK(jnewblk->jn_dep);
3247 		freework->fw_jnewblk = NULL;
3248 		WORKLIST_INSERT(&freework->fw_jwork, &jsegdep->jd_list);
3249 		break;
3250 	case D_INDIRDEP:
3251 		/*
3252 		 * An indirect block was removed by truncate.
3253 		 */
3254 		indirdep = WK_INDIRDEP(jnewblk->jn_dep);
3255 		LIST_REMOVE(jnewblk, jn_indirdeps);
3256 		WORKLIST_INSERT(&indirdep->ir_jwork, &jsegdep->jd_list);
3257 		break;
3258 	default:
3259 		panic("handle_written_jnewblk: Unknown type %d.",
3260 		    jnewblk->jn_dep->wk_type);
3261 	}
3262 	jnewblk->jn_dep = NULL;
3263 	free_jnewblk(jnewblk);
3264 }
3265 
3266 /*
3267  * Cancel a jfreefrag that won't be needed, probably due to colliding with
3268  * an in-flight allocation that has not yet been committed.  Divorce us
3269  * from the freefrag and mark it DEPCOMPLETE so that it may be added
3270  * to the worklist.
3271  */
3272 static void
3273 cancel_jfreefrag(jfreefrag)
3274 	struct jfreefrag *jfreefrag;
3275 {
3276 	struct freefrag *freefrag;
3277 
3278 	if (jfreefrag->fr_jsegdep) {
3279 		free_jsegdep(jfreefrag->fr_jsegdep);
3280 		jfreefrag->fr_jsegdep = NULL;
3281 	}
3282 	freefrag = jfreefrag->fr_freefrag;
3283 	jfreefrag->fr_freefrag = NULL;
3284 	free_jfreefrag(jfreefrag);
3285 	freefrag->ff_state |= DEPCOMPLETE;
3286 }
3287 
3288 /*
3289  * Free a jfreefrag when the parent freefrag is rendered obsolete.
3290  */
3291 static void
3292 free_jfreefrag(jfreefrag)
3293 	struct jfreefrag *jfreefrag;
3294 {
3295 
3296 	if (jfreefrag->fr_state & IOSTARTED)
3297 		WORKLIST_REMOVE(&jfreefrag->fr_list);
3298 	else if (jfreefrag->fr_state & ONWORKLIST)
3299 		remove_from_journal(&jfreefrag->fr_list);
3300 	if (jfreefrag->fr_freefrag != NULL)
3301 		panic("free_jfreefrag:  Still attached to a freefrag.");
3302 	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3303 }
3304 
3305 /*
3306  * Called when the journal write for a jfreefrag completes.  The parent
3307  * freefrag is added to the worklist if this completes its dependencies.
3308  */
3309 static void
3310 handle_written_jfreefrag(jfreefrag)
3311 	struct jfreefrag *jfreefrag;
3312 {
3313 	struct jsegdep *jsegdep;
3314 	struct freefrag *freefrag;
3315 
3316 	/* Grab the jsegdep. */
3317 	jsegdep = jfreefrag->fr_jsegdep;
3318 	jfreefrag->fr_jsegdep = NULL;
3319 	freefrag = jfreefrag->fr_freefrag;
3320 	if (freefrag == NULL)
3321 		panic("handle_written_jfreefrag: No freefrag.");
3322 	freefrag->ff_state |= DEPCOMPLETE;
3323 	freefrag->ff_jdep = NULL;
3324 	WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
3325 	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3326 		add_to_worklist(&freefrag->ff_list, 0);
3327 	jfreefrag->fr_freefrag = NULL;
3328 	free_jfreefrag(jfreefrag);
3329 }
3330 
3331 /*
3332  * Called when the journal write for a jfreeblk completes.  The jfreeblk
3333  * is removed from the freeblks list of pending journal writes and the
3334  * jsegdep is moved to the freeblks jwork to be completed when all blocks
3335  * have been reclaimed.
3336  */
3337 static void
3338 handle_written_jfreeblk(jfreeblk)
3339 	struct jfreeblk *jfreeblk;
3340 {
3341 	struct freeblks *freeblks;
3342 	struct jsegdep *jsegdep;
3343 
3344 	/* Grab the jsegdep. */
3345 	jsegdep = jfreeblk->jf_jsegdep;
3346 	jfreeblk->jf_jsegdep = NULL;
3347 	freeblks = jfreeblk->jf_freeblks;
3348 	LIST_REMOVE(jfreeblk, jf_deps);
3349 	WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list);
3350 	/*
3351 	 * If the freeblks is all journaled, we can add it to the worklist.
3352 	 */
3353 	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) &&
3354 	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) {
3355 		/* Remove from the b_dep that is waiting on this write. */
3356 		if (freeblks->fb_state & ONWORKLIST)
3357 			WORKLIST_REMOVE(&freeblks->fb_list);
3358 		add_to_worklist(&freeblks->fb_list, 1);
3359 	}
3360 
3361 	free_jfreeblk(jfreeblk);
3362 }
3363 
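/*
 * Allocate a new jsegdep for 'wk'.  The jd_seg field is filled in when the
 * journal record is written; the jsegdep is then attached to dependent
 * structures so the segment is not reclaimed until they complete.
 */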
3364 static struct jsegdep *
3365 newjsegdep(struct worklist *wk)
3366 {
3367 	struct jsegdep *jsegdep;
3368 
3369 	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3370 	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3371 	jsegdep->jd_seg = NULL;
3372 
3373 	return (jsegdep);
3374 }
3375 
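/*
 * Allocate a jmvref to journal the movement of inode 'ino' within directory
 * 'dp' from directory offset 'oldoff' to 'newoff'.
 */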
3376 static struct jmvref *
3377 newjmvref(dp, ino, oldoff, newoff)
3378 	struct inode *dp;
3379 	ino_t ino;
3380 	off_t oldoff;
3381 	off_t newoff;
3382 {
3383 	struct jmvref *jmvref;
3384 
3385 	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3386 	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3387 	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3388 	jmvref->jm_parent = dp->i_number;
3389 	jmvref->jm_ino = ino;
3390 	jmvref->jm_oldoff = oldoff;
3391 	jmvref->jm_newoff = newoff;
3392 
3393 	return (jmvref);
3394 }
3395 
3396 /*
3397  * Allocate a new jremref that tracks the removal of ip from dp with the
3398  * directory entry offset of diroff.  Mark the entry as ATTACHED and
3399  * DEPCOMPLETE as we have all the information required for the journal write
3400  * and the directory has already been removed from the buffer.  The caller
3401  * is responsible for linking the jremref into the pagedep and adding it
3402  * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
3403  * a DOTDOT addition so handle_workitem_remove() can properly assign
3404  * the jsegdep when we're done.
3405  */
3406 static struct jremref *
3407 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
3408     off_t diroff, nlink_t nlink)
3409 {
3410 	struct jremref *jremref;
3411 
3412 	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
3413 	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
3414 	jremref->jr_state = ATTACHED;
3415 	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
3416 	   nlink, ip->i_mode);
3417 	jremref->jr_dirrem = dirrem;
3418 
3419 	return (jremref);
3420 }
3421 
3422 static inline void
3423 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
3424     nlink_t nlink, uint16_t mode)
3425 {
3426 
3427 	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
3428 	inoref->if_diroff = diroff;
3429 	inoref->if_ino = ino;
3430 	inoref->if_parent = parent;
3431 	inoref->if_nlink = nlink;
3432 	inoref->if_mode = mode;
3433 }
3434 
3435 /*
3436  * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
3437  * directory offset may not be known until later.  The caller is responsible
3438  * for adding the entry to the journal when this information is available.  nlink
3439  * should be the link count prior to the addition and mode is only required
3440  * to have the correct FMT.
3441  */
3442 static struct jaddref *
3443 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
3444     uint16_t mode)
3445 {
3446 	struct jaddref *jaddref;
3447 
3448 	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
3449 	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
3450 	jaddref->ja_state = ATTACHED;
3451 	jaddref->ja_mkdir = NULL;
3452 	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
3453 
3454 	return (jaddref);
3455 }
3456 
3457 /*
3458  * Create a new free dependency for a freework.  The caller is responsible
3459  * for adjusting the reference count when it has the lock held.  The freedep
3460  * will track an outstanding bitmap write that will ultimately clear the
3461  * freework to continue.
3462  */
3463 static struct freedep *
3464 newfreedep(struct freework *freework)
3465 {
3466 	struct freedep *freedep;
3467 
3468 	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
3469 	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
3470 	freedep->fd_freework = freework;
3471 
3472 	return (freedep);
3473 }
3474 
3475 /*
3476  * Free a freedep structure once the buffer it is linked to is written.  If
3477  * this is the last reference to the freework schedule it for completion.
3478  */
3479 static void
3480 free_freedep(freedep)
3481 	struct freedep *freedep;
3482 {
3483 
3484 	if (--freedep->fd_freework->fw_ref == 0)
3485 		add_to_worklist(&freedep->fd_freework->fw_list, 1);
3486 	WORKITEM_FREE(freedep, D_FREEDEP);
3487 }
3488 
3489 /*
3490  * Allocate a new freework structure that may be a level in an indirect
3491  * when parent is not NULL, or a top-level block when parent is NULL.  The top level
3492  * freework structures are allocated without lk held and before the freeblks
3493  * is visible outside of softdep_setup_freeblocks().
3494  */
3495 static struct freework *
3496 newfreework(ump, freeblks, parent, lbn, nb, frags, journal)
3497 	struct ufsmount *ump;
3498 	struct freeblks *freeblks;
3499 	struct freework *parent;
3500 	ufs_lbn_t lbn;
3501 	ufs2_daddr_t nb;
3502 	int frags;
3503 	int journal;
3504 {
3505 	struct freework *freework;
3506 
3507 	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
3508 	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
3509 	freework->fw_jnewblk = NULL;
3510 	freework->fw_freeblks = freeblks;
3511 	freework->fw_parent = parent;
3512 	freework->fw_lbn = lbn;
3513 	freework->fw_blkno = nb;
3514 	freework->fw_frags = frags;
3515 	freework->fw_ref = ((UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ) == 0 ||
3516 	    lbn >= -NXADDR) ? 0 : NINDIR(ump->um_fs) + 1;
3517 	freework->fw_off = 0;
3518 	LIST_INIT(&freework->fw_jwork);
3519 
3520 	if (parent == NULL) {
3521 		WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd,
3522 		    &freework->fw_list);
3523 		freeblks->fb_ref++;
3524 	}
3525 	if (journal)
3526 		newjfreeblk(freeblks, lbn, nb, frags);
3527 
3528 	return (freework);
3529 }
3530 
3531 /*
3532  * Allocate a new jfreeblk to journal top level block pointer when truncating
3533  * a file.  The caller must add this to the worklist when lk is held.
3534  */
3535 static struct jfreeblk *
3536 newjfreeblk(freeblks, lbn, blkno, frags)
3537 	struct freeblks *freeblks;
3538 	ufs_lbn_t lbn;
3539 	ufs2_daddr_t blkno;
3540 	int frags;
3541 {
3542 	struct jfreeblk *jfreeblk;
3543 
3544 	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
3545 	workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp);
3546 	jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list);
3547 	jfreeblk->jf_state = ATTACHED | DEPCOMPLETE;
3548 	jfreeblk->jf_ino = freeblks->fb_previousinum;
3549 	jfreeblk->jf_lbn = lbn;
3550 	jfreeblk->jf_blkno = blkno;
3551 	jfreeblk->jf_frags = frags;
3552 	jfreeblk->jf_freeblks = freeblks;
3553 	LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps);
3554 
3555 	return (jfreeblk);
3556 }
3557 
3558 static void move_newblock_dep(struct jaddref *, struct inodedep *);
3559 /*
3560  * If we're canceling a new bitmap we have to search for another ref
3561  * to move into the bmsafemap dep.  This might be better expressed
3562  * with another structure.
3563  */
3564 static void
3565 move_newblock_dep(jaddref, inodedep)
3566 	struct jaddref *jaddref;
3567 	struct inodedep *inodedep;
3568 {
3569 	struct inoref *inoref;
3570 	struct jaddref *jaddrefn;
3571 
3572 	jaddrefn = NULL;
3573 	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
3574 	    inoref = TAILQ_NEXT(inoref, if_deps)) {
3575 		if ((jaddref->ja_state & NEWBLOCK) &&
3576 		    inoref->if_list.wk_type == D_JADDREF) {
3577 			jaddrefn = (struct jaddref *)inoref;
3578 			break;
3579 		}
3580 	}
3581 	if (jaddrefn == NULL)
3582 		return;
3583 	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
3584 	jaddrefn->ja_state |= jaddref->ja_state &
3585 	    (ATTACHED | UNDONE | NEWBLOCK);
3586 	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
3587 	jaddref->ja_state |= ATTACHED;
3588 	LIST_REMOVE(jaddref, ja_bmdeps);
3589 	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
3590 	    ja_bmdeps);
3591 }
3592 
3593 /*
3594  * Cancel a jaddref either before it has been written or while it is being
3595  * written.  This happens when a link is removed before the add reaches
3596  * the disk.  The jaddref dependency is kept linked into the bmsafemap
3597  * and inode to prevent the link count or bitmap from reaching the disk
3598  * until handle_workitem_remove() re-adjusts the counts and bitmaps as
3599  * required.
3600  *
3601  * Returns 1 if the canceled addref requires journaling of the remove and
3602  * 0 otherwise.
3603  */
3604 static int
3605 cancel_jaddref(jaddref, inodedep, wkhd)
3606 	struct jaddref *jaddref;
3607 	struct inodedep *inodedep;
3608 	struct workhead *wkhd;
3609 {
3610 	struct inoref *inoref;
3611 	struct jsegdep *jsegdep;
3612 	int needsj;
3613 
3614 	KASSERT((jaddref->ja_state & COMPLETE) == 0,
3615 	    ("cancel_jaddref: Canceling complete jaddref"));
3616 	if (jaddref->ja_state & (IOSTARTED | COMPLETE))
3617 		needsj = 1;
3618 	else
3619 		needsj = 0;
3620 	if (inodedep == NULL)
3621 		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3622 		    0, &inodedep) == 0)
3623 			panic("cancel_jaddref: Lost inodedep");
3624 	/*
3625 	 * We must adjust the nlink of any reference operation that follows
3626 	 * us so that it is consistent with the in-memory reference.  This
3627 	 * ensures that inode nlink rollbacks always have the correct link.
3628 	 */
3629 	if (needsj == 0) {
3630 		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
3631 		    inoref = TAILQ_NEXT(inoref, if_deps)) {
3632 			if (inoref->if_state & GOINGAWAY)
3633 				break;
3634 			inoref->if_nlink--;
3635 		}
3636 	}
3637 	jsegdep = inoref_jseg(&jaddref->ja_ref);
3638 	if (jaddref->ja_state & NEWBLOCK)
3639 		move_newblock_dep(jaddref, inodedep);
3640 	if (jaddref->ja_state & IOWAITING) {
3641 		jaddref->ja_state &= ~IOWAITING;
3642 		wakeup(&jaddref->ja_list);
3643 	}
3644 	jaddref->ja_mkdir = NULL;
3645 	if (jaddref->ja_state & IOSTARTED) {
3646 		jaddref->ja_state &= ~IOSTARTED;
3647 		WORKLIST_REMOVE(&jaddref->ja_list);
3648 		WORKLIST_INSERT(wkhd, &jsegdep->jd_list);
3649 	} else {
3650 		free_jsegdep(jsegdep);
3651 		if (jaddref->ja_state & DEPCOMPLETE)
3652 			remove_from_journal(&jaddref->ja_list);
3653 	}
3654 	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
3655 	/*
3656 	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
3657 	 * can arrange for them to be freed with the bitmap.  Otherwise we
3658 	 * no longer need this addref attached to the inoreflst and it
3659 	 * will incorrectly adjust nlink if we leave it.
3660 	 */
3661 	if ((jaddref->ja_state & NEWBLOCK) == 0) {
3662 		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
3663 		    if_deps);
3664 		jaddref->ja_state |= COMPLETE;
3665 		free_jaddref(jaddref);
3666 		return (needsj);
3667 	}
3668 	/*
3669 	 * Leave the head of the list for jsegdeps for fast merging.
3670 	 */
3671 	if (LIST_FIRST(wkhd) != NULL) {
3672 		jaddref->ja_state |= ONWORKLIST;
3673 		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
3674 	} else
3675 		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
3676 
3677 	return (needsj);
3678 }
3679 
3680 /*
3681  * Attempt to free a jaddref structure when some work completes.  This
3682  * should only succeed once the entry is written and all dependencies have
3683  * been notified.
3684  */
3685 static void
3686 free_jaddref(jaddref)
3687 	struct jaddref *jaddref;
3688 {
3689 
3690 	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
3691 		return;
3692 	if (jaddref->ja_ref.if_jsegdep)
3693 		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
3694 		    jaddref, jaddref->ja_state);
3695 	if (jaddref->ja_state & NEWBLOCK)
3696 		LIST_REMOVE(jaddref, ja_bmdeps);
3697 	if (jaddref->ja_state & (IOSTARTED | ONWORKLIST))
3698 		panic("free_jaddref: Bad state %p(0x%X)",
3699 		    jaddref, jaddref->ja_state);
3700 	if (jaddref->ja_mkdir != NULL)
3701 		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
3702 	WORKITEM_FREE(jaddref, D_JADDREF);
3703 }
3704 
3705 /*
3706  * Free a jremref structure once it has been written or discarded.
3707  */
3708 static void
3709 free_jremref(jremref)
3710 	struct jremref *jremref;
3711 {
3712 
3713 	if (jremref->jr_ref.if_jsegdep)
3714 		free_jsegdep(jremref->jr_ref.if_jsegdep);
3715 	if (jremref->jr_state & IOSTARTED)
3716 		panic("free_jremref: IO still pending");
3717 	WORKITEM_FREE(jremref, D_JREMREF);
3718 }
3719 
3720 /*
3721  * Free a jnewblk structure.
3722  */
3723 static void
3724 free_jnewblk(jnewblk)
3725 	struct jnewblk *jnewblk;
3726 {
3727 
3728 	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
3729 		return;
3730 	LIST_REMOVE(jnewblk, jn_deps);
3731 	if (jnewblk->jn_dep != NULL)
3732 		panic("free_jnewblk: Dependency still attached.");
3733 	WORKITEM_FREE(jnewblk, D_JNEWBLK);
3734 }
3735 
3736 /*
3737  * Cancel a jnewblk which has been superseded by a freeblk.  The jnewblk
3738  * is kept linked into the bmsafemap until the free completes, thus
3739  * preventing the modified state from ever reaching disk.  The free
3740  * routine must pass this structure via ffs_blkfree() to
3741  * softdep_setup_freeblks() so there is no race in releasing the space.
3742  */
3743 static void
3744 cancel_jnewblk(jnewblk, wkhd)
3745 	struct jnewblk *jnewblk;
3746 	struct workhead *wkhd;
3747 {
3748 	struct jsegdep *jsegdep;
3749 
3750 	jsegdep = jnewblk->jn_jsegdep;
3751 	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
3752 		panic("cancel_jnewblk: Invalid state");
3753 	jnewblk->jn_jsegdep = NULL;
3754 	jnewblk->jn_dep = NULL;
3755 	jnewblk->jn_state |= GOINGAWAY;
3756 	if (jnewblk->jn_state & IOSTARTED) {
3757 		jnewblk->jn_state &= ~IOSTARTED;
3758 		WORKLIST_REMOVE(&jnewblk->jn_list);
3759 		WORKLIST_INSERT(wkhd, &jsegdep->jd_list);
3760 	} else {
3761 		free_jsegdep(jsegdep);
3762 		remove_from_journal(&jnewblk->jn_list);
3763 	}
3764 	if (jnewblk->jn_state & IOWAITING) {
3765 		jnewblk->jn_state &= ~IOWAITING;
3766 		wakeup(&jnewblk->jn_list);
3767 	}
3768 	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
3769 }
3770 
3771 static void
3772 free_jfreeblk(jfreeblk)
3773 	struct jfreeblk *jfreeblk;
3774 {
3775 
3776 	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
3777 }
3778 
3779 /*
3780  * Free a single jseg once it is no longer referenced in memory or on
3781  * disk.  Reclaim journal blocks and dependencies waiting for the segment
3782  * to disappear.
3783  */
3784 static void
3785 free_jseg(jseg, jblocks)
3786 	struct jseg *jseg;
3787 	struct jblocks *jblocks;
3788 {
3789 	struct freework *freework;
3790 
3791 	/*
3792 	 * Free freework structures that were lingering to indicate freed
3793 	 * indirect blocks that forced journal write ordering on reallocation.
3794 	 */
3795 	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL) {
3796 		LIST_REMOVE(freework, fw_next);
3797 		WORKLIST_REMOVE(&freework->fw_list);
3798 		WORKITEM_FREE(freework, D_FREEWORK);
3799 	}
3800 	if (jblocks->jb_oldestseg == jseg)
3801 		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
3802 	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
3803 	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
3804 	KASSERT(LIST_EMPTY(&jseg->js_entries),
3805 	    ("free_jseg: Freed jseg has valid entries."));
3806 	WORKITEM_FREE(jseg, D_JSEG);
3807 }
3808 
3809 /*
3810  * Free all jsegs that meet the criteria for being reclaimed and update
3811  * oldestseg.
3812  */
3813 static void
3814 free_jsegs(jblocks)
3815 	struct jblocks *jblocks;
3816 {
3817 	struct jseg *jseg;
3818 
3819 	/*
3820 	 * Free only those jsegs which have no allocated segments before them to
3821 	 * preserve the journal space ordering.
3822 	 */
3823 	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
3824 		/*
3825 		 * Only reclaim space when nothing depends on this journal
3826 		 * set and another set has written that it is no longer
3827 		 * valid.
3828 		 */
3829 		if (jseg->js_refs != 0) {
3830 			jblocks->jb_oldestseg = jseg;
3831 			return;
3832 		}
3833 		if (!LIST_EMPTY(&jseg->js_indirs) &&
3834 		    jseg->js_seq >= jblocks->jb_oldestwrseq)
3835 			break;
3836 		free_jseg(jseg, jblocks);
3837 	}
3838 	/*
3839 	 * If we exited the loop above we still must discover the
3840 	 * oldest valid segment.
3841 	 */
3842 	if (jseg)
3843 		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
3844 		     jseg = TAILQ_NEXT(jseg, js_next))
3845 			if (jseg->js_refs != 0)
3846 				break;
3847 	jblocks->jb_oldestseg = jseg;
3848 	/*
3849 	 * The journal has no valid records but some jsegs may still be
3850 	 * waiting on oldestwrseq to advance.  We force a small record
3851 	 * out to permit these lingering records to be reclaimed.
3852 	 */
3853 	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
3854 		jblocks->jb_needseg = 1;
3855 }
3856 
3857 /*
3858  * Release one reference to a jseg and free it if the count reaches 0.  This
3859  * should eventually reclaim journal space as well.
3860  */
3861 static void
3862 rele_jseg(jseg)
3863 	struct jseg *jseg;
3864 {
3865 
3866 	KASSERT(jseg->js_refs > 0,
3867 	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
3868 	if (--jseg->js_refs != 0)
3869 		return;
3870 	free_jsegs(jseg->js_jblocks);
3871 }
3872 
3873 /*
3874  * Release a jsegdep and decrement the jseg count.
3875  */
3876 static void
3877 free_jsegdep(jsegdep)
3878 	struct jsegdep *jsegdep;
3879 {
3880 
3881 	if (jsegdep->jd_seg)
3882 		rele_jseg(jsegdep->jd_seg);
3883 	WORKITEM_FREE(jsegdep, D_JSEGDEP);
3884 }
3885 
3886 /*
3887  * Wait for a journal item to make it to disk.  Initiate journal processing
3888  * if required.
3889  */
3890 static void
3891 jwait(wk)
3892 	struct worklist *wk;
3893 {
3894 
3895 	stat_journal_wait++;
3896 	/*
3897 	 * If IO has not started we process the journal.  We can't mark the
3898 	 * worklist item as IOWAITING because we drop the lock while
3899 	 * processing the journal and the worklist entry may be freed after
3900 	 * this point.  The caller may call back in and re-issue the request.
3901 	 */
3902 	if ((wk->wk_state & IOSTARTED) == 0) {
3903 		softdep_process_journal(wk->wk_mp, wk, MNT_WAIT);
3904 		return;
3905 	}
3906 	wk->wk_state |= IOWAITING;
3907 	msleep(wk, &lk, PRIBIO, "jwait", 0);
3908 }
3909 
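/*
 * Illustrative sketch, not compiled into the kernel: because jwait() may
 * return after merely processing the journal, callers hold lk and loop
 * on the condition they are waiting for, as softdep_setup_trunc() does
 * below.  "wk" and item_not_yet_on_disk() are placeholders.
 */
#if 0
	struct worklist *wk;	/* journal work item, placeholder */

	ACQUIRE_LOCK(&lk);
	add_to_journal(wk);
	while (item_not_yet_on_disk(wk))
		jwait(wk);
	FREE_LOCK(&lk);
#endif
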
3910 /*
3911  * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
3912  * appropriate.  This is a convenience function to reduce duplicate code
3913  * for the setup and revert functions below.
3914  */
3915 static struct inodedep *
3916 inodedep_lookup_ip(ip)
3917 	struct inode *ip;
3918 {
3919 	struct inodedep *inodedep;
3920 
3921 	KASSERT(ip->i_nlink >= ip->i_effnlink,
3922 	    ("inodedep_lookup_ip: bad delta"));
3923 	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
3924 	    DEPALLOC, &inodedep);
3925 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3926 
3927 	return (inodedep);
3928 }
3929 
3930 /*
3931  * Create a journal entry that describes a truncate that we're about to
3932  * perform.  The inode allocations and frees between here and the completion
3933  * of the operation are done asynchronously and without journaling.  At
3934  * the end of the operation the vnode is sync'd and the journal space
3935  * is released.  Recovery will discover the partially completed truncate
3936  * and complete it.
3937  */
3938 void *
3939 softdep_setup_trunc(vp, length, flags)
3940 	struct vnode *vp;
3941 	off_t length;
3942 	int flags;
3943 {
3944 	struct jsegdep *jsegdep;
3945 	struct jtrunc *jtrunc;
3946 	struct ufsmount *ump;
3947 	struct inode *ip;
3948 
3949 	softdep_prealloc(vp, MNT_WAIT);
3950 	ip = VTOI(vp);
3951 	ump = VFSTOUFS(vp->v_mount);
3952 	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
3953 	workitem_alloc(&jtrunc->jt_list, D_JTRUNC, vp->v_mount);
3954 	jsegdep = jtrunc->jt_jsegdep = newjsegdep(&jtrunc->jt_list);
3955 	jtrunc->jt_ino = ip->i_number;
3956 	jtrunc->jt_extsize = 0;
3957 	jtrunc->jt_size = length;
3958 	if ((flags & IO_EXT) == 0 && ump->um_fstype == UFS2)
3959 		jtrunc->jt_extsize = ip->i_din2->di_extsize;
3960 	if ((flags & IO_NORMAL) == 0)
3961 		jtrunc->jt_size = DIP(ip, i_size);
3962 	ACQUIRE_LOCK(&lk);
3963 	add_to_journal(&jtrunc->jt_list);
3964 	while (jsegdep->jd_seg == NULL) {
3965 		stat_jwait_freeblks++;
3966 		jwait(&jtrunc->jt_list);
3967 	}
3968 	FREE_LOCK(&lk);
3969 
3970 	return (jsegdep);
3971 }
3972 
3973 /*
3974  * After the synchronous truncation is complete we sync the vnode and
3975  * release the jsegdep so the journal space can be freed.
3976  */
3977 int
3978 softdep_complete_trunc(vp, cookie)
3979 	struct vnode *vp;
3980 	void *cookie;
3981 {
3982 	int error;
3983 
3984 	error = ffs_syncvnode(vp, MNT_WAIT);
3985 	ACQUIRE_LOCK(&lk);
3986 	free_jsegdep((struct jsegdep *)cookie);
3987 	FREE_LOCK(&lk);
3988 
3989 	return (error);
3990 }
3991 
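/*
 * Illustrative usage sketch, not compiled into the kernel: a caller
 * brackets the synchronous truncation with the two routines above.  The
 * cookie returned by softdep_setup_trunc() is the jsegdep later released
 * by softdep_complete_trunc(); the truncation itself is elided.
 */
#if 0
	void *cookie;
	int error;

	cookie = softdep_setup_trunc(vp, length, flags);
	/* ... truncate vp to length ... */
	error = softdep_complete_trunc(vp, cookie);
#endif
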
3992 /*
3993  * Called prior to creating a new inode and linking it to a directory.  The
3994  * jaddref structure must already be allocated by softdep_setup_inomapdep
3995  * and it is discovered here so we can initialize the mode and update
3996  * nlinkdelta.
3997  */
3998 void
3999 softdep_setup_create(dp, ip)
4000 	struct inode *dp;
4001 	struct inode *ip;
4002 {
4003 	struct inodedep *inodedep;
4004 	struct jaddref *jaddref;
4005 	struct vnode *dvp;
4006 
4007 	KASSERT(ip->i_nlink == 1,
4008 	    ("softdep_setup_create: Invalid link count."));
4009 	dvp = ITOV(dp);
4010 	ACQUIRE_LOCK(&lk);
4011 	inodedep = inodedep_lookup_ip(ip);
4012 	if (DOINGSUJ(dvp)) {
4013 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4014 		    inoreflst);
4015 		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4016 		    ("softdep_setup_create: No addref structure present."));
4017 		jaddref->ja_mode = ip->i_mode;
4018 	}
4019 	softdep_prelink(dvp, NULL);
4020 	FREE_LOCK(&lk);
4021 }
4022 
4023 /*
4024  * Create a jaddref structure to track the addition of a DOTDOT link when
4025  * we are reparenting an inode as part of a rename.  This jaddref will be
4026  * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4027  * non-journaling softdep.
4028  */
4029 void
4030 softdep_setup_dotdot_link(dp, ip)
4031 	struct inode *dp;
4032 	struct inode *ip;
4033 {
4034 	struct inodedep *inodedep;
4035 	struct jaddref *jaddref;
4036 	struct vnode *dvp;
4037 	struct vnode *vp;
4038 
4039 	dvp = ITOV(dp);
4040 	vp = ITOV(ip);
4041 	jaddref = NULL;
4042 	/*
4043 	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4044 	 * is used as a normal link would be.
4045 	 */
4046 	if (DOINGSUJ(dvp))
4047 		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4048 		    dp->i_effnlink - 1, dp->i_mode);
4049 	ACQUIRE_LOCK(&lk);
4050 	inodedep = inodedep_lookup_ip(dp);
4051 	if (jaddref)
4052 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4053 		    if_deps);
4054 	softdep_prelink(dvp, ITOV(ip));
4055 	FREE_LOCK(&lk);
4056 }
4057 
4058 /*
4059  * Create a jaddref structure to track a new link to an inode.  The directory
4060  * offset is not known until softdep_setup_directory_add or
4061  * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4062  * softdep.
4063  */
4064 void
4065 softdep_setup_link(dp, ip)
4066 	struct inode *dp;
4067 	struct inode *ip;
4068 {
4069 	struct inodedep *inodedep;
4070 	struct jaddref *jaddref;
4071 	struct vnode *dvp;
4072 
4073 	dvp = ITOV(dp);
4074 	jaddref = NULL;
4075 	if (DOINGSUJ(dvp))
4076 		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4077 		    ip->i_mode);
4078 	ACQUIRE_LOCK(&lk);
4079 	inodedep = inodedep_lookup_ip(ip);
4080 	if (jaddref)
4081 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4082 		    if_deps);
4083 	softdep_prelink(dvp, ITOV(ip));
4084 	FREE_LOCK(&lk);
4085 }
4086 
4087 /*
4088  * Called to create the jaddref structures to track . and .. references as
4089  * well as lookup and further initialize the incomplete jaddref created
4090  * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4091  * nlinkdelta for non-journaling softdep.
4092  */
4093 void
4094 softdep_setup_mkdir(dp, ip)
4095 	struct inode *dp;
4096 	struct inode *ip;
4097 {
4098 	struct inodedep *inodedep;
4099 	struct jaddref *dotdotaddref;
4100 	struct jaddref *dotaddref;
4101 	struct jaddref *jaddref;
4102 	struct vnode *dvp;
4103 
4104 	dvp = ITOV(dp);
4105 	dotaddref = dotdotaddref = NULL;
4106 	if (DOINGSUJ(dvp)) {
4107 		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4108 		    ip->i_mode);
4109 		dotaddref->ja_state |= MKDIR_BODY;
4110 		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4111 		    dp->i_effnlink - 1, dp->i_mode);
4112 		dotdotaddref->ja_state |= MKDIR_PARENT;
4113 	}
4114 	ACQUIRE_LOCK(&lk);
4115 	inodedep = inodedep_lookup_ip(ip);
4116 	if (DOINGSUJ(dvp)) {
4117 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4118 		    inoreflst);
4119 		KASSERT(jaddref != NULL,
4120 		    ("softdep_setup_mkdir: No addref structure present."));
4121 		KASSERT(jaddref->ja_parent == dp->i_number,
4122 		    ("softdep_setup_mkdir: bad parent %d",
4123 		    jaddref->ja_parent));
4124 		jaddref->ja_mode = ip->i_mode;
4125 		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4126 		    if_deps);
4127 	}
4128 	inodedep = inodedep_lookup_ip(dp);
4129 	if (DOINGSUJ(dvp))
4130 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4131 		    &dotdotaddref->ja_ref, if_deps);
4132 	softdep_prelink(ITOV(dp), NULL);
4133 	FREE_LOCK(&lk);
4134 }
4135 
4136 /*
4137  * Called to track nlinkdelta of the inode and parent directories prior to
4138  * unlinking a directory.
4139  */
4140 void
4141 softdep_setup_rmdir(dp, ip)
4142 	struct inode *dp;
4143 	struct inode *ip;
4144 {
4145 	struct vnode *dvp;
4146 
4147 	dvp = ITOV(dp);
4148 	ACQUIRE_LOCK(&lk);
4149 	(void) inodedep_lookup_ip(ip);
4150 	(void) inodedep_lookup_ip(dp);
4151 	softdep_prelink(dvp, ITOV(ip));
4152 	FREE_LOCK(&lk);
4153 }
4154 
4155 /*
4156  * Called to track nlinkdelta of the inode and parent directories prior to
4157  * unlink.
4158  */
4159 void
4160 softdep_setup_unlink(dp, ip)
4161 	struct inode *dp;
4162 	struct inode *ip;
4163 {
4164 	struct vnode *dvp;
4165 
4166 	dvp = ITOV(dp);
4167 	ACQUIRE_LOCK(&lk);
4168 	(void) inodedep_lookup_ip(ip);
4169 	(void) inodedep_lookup_ip(dp);
4170 	softdep_prelink(dvp, ITOV(ip));
4171 	FREE_LOCK(&lk);
4172 }
4173 
4174 /*
4175  * Called to release the journal structures created by a failed non-directory
4176  * creation.  Adjusts nlinkdelta for non-journaling softdep.
4177  */
4178 void
4179 softdep_revert_create(dp, ip)
4180 	struct inode *dp;
4181 	struct inode *ip;
4182 {
4183 	struct inodedep *inodedep;
4184 	struct jaddref *jaddref;
4185 	struct vnode *dvp;
4186 
4187 	dvp = ITOV(dp);
4188 	ACQUIRE_LOCK(&lk);
4189 	inodedep = inodedep_lookup_ip(ip);
4190 	if (DOINGSUJ(dvp)) {
4191 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4192 		    inoreflst);
4193 		KASSERT(jaddref->ja_parent == dp->i_number,
4194 		    ("softdep_revert_create: addref parent mismatch"));
4195 		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4196 	}
4197 	FREE_LOCK(&lk);
4198 }
4199 
4200 /*
4201  * Called to release the journal structures created by a failed dotdot link
4202  * creation.  Adjusts nlinkdelta for non-journaling softdep.
4203  */
4204 void
4205 softdep_revert_dotdot_link(dp, ip)
4206 	struct inode *dp;
4207 	struct inode *ip;
4208 {
4209 	struct inodedep *inodedep;
4210 	struct jaddref *jaddref;
4211 	struct vnode *dvp;
4212 
4213 	dvp = ITOV(dp);
4214 	ACQUIRE_LOCK(&lk);
4215 	inodedep = inodedep_lookup_ip(dp);
4216 	if (DOINGSUJ(dvp)) {
4217 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4218 		    inoreflst);
4219 		KASSERT(jaddref->ja_parent == ip->i_number,
4220 		    ("softdep_revert_dotdot_link: addref parent mismatch"));
4221 		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4222 	}
4223 	FREE_LOCK(&lk);
4224 }
4225 
4226 /*
4227  * Called to release the journal structures created by a failed link
4228  * addition.  Adjusts nlinkdelta for non-journaling softdep.
4229  */
4230 void
4231 softdep_revert_link(dp, ip)
4232 	struct inode *dp;
4233 	struct inode *ip;
4234 {
4235 	struct inodedep *inodedep;
4236 	struct jaddref *jaddref;
4237 	struct vnode *dvp;
4238 
4239 	dvp = ITOV(dp);
4240 	ACQUIRE_LOCK(&lk);
4241 	inodedep = inodedep_lookup_ip(ip);
4242 	if (DOINGSUJ(dvp)) {
4243 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4244 		    inoreflst);
4245 		KASSERT(jaddref->ja_parent == dp->i_number,
4246 		    ("softdep_revert_link: addref parent mismatch"));
4247 		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4248 	}
4249 	FREE_LOCK(&lk);
4250 }
4251 
4252 /*
4253  * Called to release the journal structures created by a failed mkdir
4254  * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4255  */
4256 void
4257 softdep_revert_mkdir(dp, ip)
4258 	struct inode *dp;
4259 	struct inode *ip;
4260 {
4261 	struct inodedep *inodedep;
4262 	struct jaddref *jaddref;
4263 	struct jaddref *dotaddref;
4264 	struct vnode *dvp;
4265 
4266 	dvp = ITOV(dp);
4267 
4268 	ACQUIRE_LOCK(&lk);
4269 	inodedep = inodedep_lookup_ip(dp);
4270 	if (DOINGSUJ(dvp)) {
4271 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4272 		    inoreflst);
4273 		KASSERT(jaddref->ja_parent == ip->i_number,
4274 		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4275 		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4276 	}
4277 	inodedep = inodedep_lookup_ip(ip);
4278 	if (DOINGSUJ(dvp)) {
4279 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4280 		    inoreflst);
4281 		KASSERT(jaddref->ja_parent == dp->i_number,
4282 		    ("softdep_revert_mkdir: addref parent mismatch"));
4283 		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4284 		    inoreflst, if_deps);
4285 		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4286 		KASSERT(dotaddref->ja_parent == ip->i_number,
4287 		    ("softdep_revert_mkdir: dot addref parent mismatch"));
4288 		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4289 	}
4290 	FREE_LOCK(&lk);
4291 }
4292 
4293 /*
4294  * Called to correct nlinkdelta after a failed rmdir.
4295  */
4296 void
4297 softdep_revert_rmdir(dp, ip)
4298 	struct inode *dp;
4299 	struct inode *ip;
4300 {
4301 
4302 	ACQUIRE_LOCK(&lk);
4303 	(void) inodedep_lookup_ip(ip);
4304 	(void) inodedep_lookup_ip(dp);
4305 	FREE_LOCK(&lk);
4306 }
4307 
4308 /*
4309  * Protecting the freemaps (or bitmaps).
4310  *
4311  * To eliminate the need to execute fsck before mounting a filesystem
4312  * after a power failure, one must (conservatively) guarantee that the
4313  * on-disk copy of the bitmaps never indicate that a live inode or block is
4314  * free.  So, when a block or inode is allocated, the bitmap should be
4315  * updated (on disk) before any new pointers.  When a block or inode is
4316  * freed, the bitmap should not be updated until all pointers have been
4317  * reset.  The latter dependency is handled by the delayed de-allocation
4318  * approach described below for block and inode de-allocation.  The former
4319  * dependency is handled by calling the following procedure when a block or
4320  * inode is allocated. When an inode is allocated an "inodedep" is created
4321  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4322  * Each "inodedep" is also inserted into the hash indexing structure so
4323  * that any additional link additions can be made dependent on the inode
4324  * allocation.
4325  *
4326  * The ufs filesystem maintains a number of free block counts (e.g., per
4327  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4328  * in addition to the bitmaps.  These counts are used to improve efficiency
4329  * during allocation and therefore must be consistent with the bitmaps.
4330  * There is no convenient way to guarantee post-crash consistency of these
4331  * counts with simple update ordering, for two main reasons: (1) The counts
4332  * and bitmaps for a single cylinder group block are not in the same disk
4333  * sector.  If a disk write is interrupted (e.g., by power failure), one may
4334  * be written and the other not.  (2) Some of the counts are located in the
4335  * superblock rather than the cylinder group block. So, we focus our soft
4336  * updates implementation on protecting the bitmaps. When mounting a
4337  * filesystem, we recompute the auxiliary counts from the bitmaps.
4338  */
4339 
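/*
 * Illustrative sketch of the ordering rule above, not compiled into the
 * kernel.  The buffer names are placeholders: cg_bp holds the cylinder
 * group bitmap and ptr_bp holds the inode or indirect block pointer.
 */
#if 0
	/* Allocation: the bitmap must reach the disk before the pointer. */
	bwrite(cg_bp);		/* block/inode marked in use */
	bwrite(ptr_bp);		/* pointer to the new block/inode */

	/* Free: the pointer must be cleared on disk before the bitmap. */
	bwrite(ptr_bp);		/* pointer reset */
	bwrite(cg_bp);		/* block/inode marked free again */
#endif
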
4340 /*
4341  * Called just after updating the cylinder group block to allocate an inode.
4342  */
4343 void
4344 softdep_setup_inomapdep(bp, ip, newinum)
4345 	struct buf *bp;		/* buffer for cylgroup block with inode map */
4346 	struct inode *ip;	/* inode related to allocation */
4347 	ino_t newinum;		/* new inode number being allocated */
4348 {
4349 	struct inodedep *inodedep;
4350 	struct bmsafemap *bmsafemap;
4351 	struct jaddref *jaddref;
4352 	struct mount *mp;
4353 	struct fs *fs;
4354 
4355 	mp = UFSTOVFS(ip->i_ump);
4356 	fs = ip->i_ump->um_fs;
4357 	jaddref = NULL;
4358 
4359 	/*
4360 	 * Allocate the journal reference add structure so that the bitmap
4361 	 * can be dependent on it.
4362 	 */
4363 	if (mp->mnt_kern_flag & MNTK_SUJ) {
4364 		jaddref = newjaddref(ip, newinum, 0, 0, 0);
4365 		jaddref->ja_state |= NEWBLOCK;
4366 	}
4367 
4368 	/*
4369 	 * Create a dependency for the newly allocated inode.
4370 	 * Panic if it already exists as something is seriously wrong.
4371 	 * Otherwise add it to the dependency list for the buffer holding
4372 	 * the cylinder group map from which it was allocated.
4373 	 */
4374 	ACQUIRE_LOCK(&lk);
4375 	if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep)))
4376 		panic("softdep_setup_inomapdep: dependency %p for new"
4377 		    "inode already exists", inodedep);
4378 	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
4379 	if (jaddref) {
4380 		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
4381 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4382 		    if_deps);
4383 	} else {
4384 		inodedep->id_state |= ONDEPLIST;
4385 		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
4386 	}
4387 	inodedep->id_bmsafemap = bmsafemap;
4388 	inodedep->id_state &= ~DEPCOMPLETE;
4389 	FREE_LOCK(&lk);
4390 }
4391 
4392 /*
4393  * Called just after updating the cylinder group block to
4394  * allocate block or fragment.
4395  */
4396 void
4397 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
4398 	struct buf *bp;		/* buffer for cylgroup block with block map */
4399 	struct mount *mp;	/* filesystem doing allocation */
4400 	ufs2_daddr_t newblkno;	/* number of newly allocated block */
4401 	int frags;		/* Number of fragments. */
4402 	int oldfrags;		/* Previous number of fragments for extend. */
4403 {
4404 	struct newblk *newblk;
4405 	struct bmsafemap *bmsafemap;
4406 	struct jnewblk *jnewblk;
4407 	struct fs *fs;
4408 
4409 	fs = VFSTOUFS(mp)->um_fs;
4410 	jnewblk = NULL;
4411 	/*
4412 	 * Create a dependency for the newly allocated block.
4413 	 * Add it to the dependency list for the buffer holding
4414 	 * the cylinder group map from which it was allocated.
4415 	 */
4416 	if (mp->mnt_kern_flag & MNTK_SUJ) {
4417 		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
4418 		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
4419 		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
4420 		jnewblk->jn_state = ATTACHED;
4421 		jnewblk->jn_blkno = newblkno;
4422 		jnewblk->jn_frags = frags;
4423 		jnewblk->jn_oldfrags = oldfrags;
4424 #ifdef SUJ_DEBUG
4425 		{
4426 			struct cg *cgp;
4427 			uint8_t *blksfree;
4428 			long bno;
4429 			int i;
4430 
4431 			cgp = (struct cg *)bp->b_data;
4432 			blksfree = cg_blksfree(cgp);
4433 			bno = dtogd(fs, jnewblk->jn_blkno);
4434 			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
4435 			    i++) {
4436 				if (isset(blksfree, bno + i))
4437 					panic("softdep_setup_blkmapdep: "
4438 					    "free fragment %d from %d-%d "
4439 					    "state 0x%X dep %p", i,
4440 					    jnewblk->jn_oldfrags,
4441 					    jnewblk->jn_frags,
4442 					    jnewblk->jn_state,
4443 					    jnewblk->jn_dep);
4444 			}
4445 		}
4446 #endif
4447 	}
4448 	ACQUIRE_LOCK(&lk);
4449 	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
4450 		panic("softdep_setup_blkmapdep: found block");
4451 	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
4452 	    dtog(fs, newblkno));
4453 	if (jnewblk) {
4454 		jnewblk->jn_dep = (struct worklist *)newblk;
4455 		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
4456 	} else {
4457 		newblk->nb_state |= ONDEPLIST;
4458 		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
4459 	}
4460 	newblk->nb_bmsafemap = bmsafemap;
4461 	newblk->nb_jnewblk = jnewblk;
4462 	FREE_LOCK(&lk);
4463 }
4464 
4465 #define	BMSAFEMAP_HASH(fs, cg) \
4466       (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
4467 
4468 static int
4469 bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
4470 	struct bmsafemap_hashhead *bmsafemaphd;
4471 	struct mount *mp;
4472 	int cg;
4473 	struct bmsafemap **bmsafemapp;
4474 {
4475 	struct bmsafemap *bmsafemap;
4476 
4477 	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
4478 		if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg)
4479 			break;
4480 	if (bmsafemap) {
4481 		*bmsafemapp = bmsafemap;
4482 		return (1);
4483 	}
4484 	*bmsafemapp = NULL;
4485 
4486 	return (0);
4487 }
4488 
4489 /*
4490  * Find the bmsafemap associated with a cylinder group buffer.
4491  * If none exists, create one. The buffer must be locked when
4492  * this routine is called and this routine must be called with
4493  * splbio interrupts blocked.
4494  */
4495 static struct bmsafemap *
4496 bmsafemap_lookup(mp, bp, cg)
4497 	struct mount *mp;
4498 	struct buf *bp;
4499 	int cg;
4500 {
4501 	struct bmsafemap_hashhead *bmsafemaphd;
4502 	struct bmsafemap *bmsafemap, *collision;
4503 	struct worklist *wk;
4504 	struct fs *fs;
4505 
4506 	mtx_assert(&lk, MA_OWNED);
4507 	if (bp)
4508 		LIST_FOREACH(wk, &bp->b_dep, wk_list)
4509 			if (wk->wk_type == D_BMSAFEMAP)
4510 				return (WK_BMSAFEMAP(wk));
4511 	fs = VFSTOUFS(mp)->um_fs;
4512 	bmsafemaphd = BMSAFEMAP_HASH(fs, cg);
4513 	if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1)
4514 		return (bmsafemap);
4515 	FREE_LOCK(&lk);
4516 	bmsafemap = malloc(sizeof(struct bmsafemap),
4517 		M_BMSAFEMAP, M_SOFTDEP_FLAGS);
4518 	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
4519 	bmsafemap->sm_buf = bp;
4520 	LIST_INIT(&bmsafemap->sm_inodedephd);
4521 	LIST_INIT(&bmsafemap->sm_inodedepwr);
4522 	LIST_INIT(&bmsafemap->sm_newblkhd);
4523 	LIST_INIT(&bmsafemap->sm_newblkwr);
4524 	LIST_INIT(&bmsafemap->sm_jaddrefhd);
4525 	LIST_INIT(&bmsafemap->sm_jnewblkhd);
4526 	ACQUIRE_LOCK(&lk);
4527 	if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) {
4528 		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
4529 		return (collision);
4530 	}
4531 	bmsafemap->sm_cg = cg;
4532 	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
4533 	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
4534 	return (bmsafemap);
4535 }
4536 
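/*
 * Illustrative sketch, not compiled into the kernel: bmsafemap_lookup()
 * above follows the common softdep pattern of dropping lk around the
 * malloc() and re-checking for a racing insert once the lock is
 * reacquired.  lookup_foo(), insert_foo(), M_FOO and D_FOO are
 * placeholders for the per-structure routines and malloc/workitem types.
 */
#if 0
	ACQUIRE_LOCK(&lk);
	if (lookup_foo(key, &item))
		return (item);			/* already present */
	FREE_LOCK(&lk);
	item = malloc(sizeof(*item), M_FOO, M_SOFTDEP_FLAGS);
	/* Initialize fields that do not require the lock. */
	ACQUIRE_LOCK(&lk);
	if (lookup_foo(key, &collision)) {	/* lost the race */
		WORKITEM_FREE(item, D_FOO);
		return (collision);
	}
	insert_foo(key, item);
	return (item);
#endif
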
4537 /*
4538  * Direct block allocation dependencies.
4539  *
4540  * When a new block is allocated, the corresponding disk locations must be
4541  * initialized (with zeros or new data) before the on-disk inode points to
4542  * them.  Also, the freemap from which the block was allocated must be
4543  * updated (on disk) before the inode's pointer. These two dependencies are
4544  * independent of each other and are needed for all file blocks and indirect
4545  * blocks that are pointed to directly by the inode.  Just before the
4546  * "in-core" version of the inode is updated with a newly allocated block
4547  * number, a procedure (below) is called to setup allocation dependency
4548  * structures.  These structures are removed when the corresponding
4549  * dependencies are satisfied or when the block allocation becomes obsolete
4550  * (i.e., the file is deleted, the block is de-allocated, or the block is a
4551  * fragment that gets upgraded).  All of these cases are handled in
4552  * procedures described later.
4553  *
4554  * When a file extension causes a fragment to be upgraded, either to a larger
4555  * fragment or to a full block, the on-disk location may change (if the
4556  * previous fragment could not simply be extended). In this case, the old
4557  * fragment must be de-allocated, but not until after the inode's pointer has
4558  * been updated. In most cases, this is handled by later procedures, which
4559  * will construct a "freefrag" structure to be added to the workitem queue
4560  * when the inode update is complete (or obsolete).  The main exception to
4561  * this is when an allocation occurs while a pending allocation dependency
4562  * (for the same block pointer) remains.  This case is handled in the main
4563  * allocation dependency setup procedure by immediately freeing the
4564  * unreferenced fragments.
4565  */
4566 void
4567 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4568 	struct inode *ip;	/* inode to which block is being added */
4569 	ufs_lbn_t off;		/* block pointer within inode */
4570 	ufs2_daddr_t newblkno;	/* disk block number being added */
4571 	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
4572 	long newsize;		/* size of new block */
4573 	long oldsize;		/* size of old block */
4574 	struct buf *bp;		/* bp for allocated block */
4575 {
4576 	struct allocdirect *adp, *oldadp;
4577 	struct allocdirectlst *adphead;
4578 	struct freefrag *freefrag;
4579 	struct inodedep *inodedep;
4580 	struct pagedep *pagedep;
4581 	struct jnewblk *jnewblk;
4582 	struct newblk *newblk;
4583 	struct mount *mp;
4584 	ufs_lbn_t lbn;
4585 
4586 	lbn = bp->b_lblkno;
4587 	mp = UFSTOVFS(ip->i_ump);
4588 	if (oldblkno && oldblkno != newblkno)
4589 		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4590 	else
4591 		freefrag = NULL;
4592 
4593 	ACQUIRE_LOCK(&lk);
4594 	if (off >= NDADDR) {
4595 		if (lbn > 0)
4596 			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
4597 			    lbn, off);
4598 		/* allocating an indirect block */
4599 		if (oldblkno != 0)
4600 			panic("softdep_setup_allocdirect: non-zero indir");
4601 	} else {
4602 		if (off != lbn)
4603 			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
4604 			    lbn, off);
4605 		/*
4606 		 * Allocating a direct block.
4607 		 *
4608 		 * If we are allocating a directory block, then we must
4609 		 * allocate an associated pagedep to track additions and
4610 		 * deletions.
4611 		 */
4612 		if ((ip->i_mode & IFMT) == IFDIR &&
4613 		    pagedep_lookup(mp, ip->i_number, off, DEPALLOC,
4614 		    &pagedep) == 0)
4615 			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
4616 	}
4617 	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4618 		panic("softdep_setup_allocdirect: lost block");
4619 	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4620 	    ("softdep_setup_allocdirect: newblk already initialized"));
4621 	/*
4622 	 * Convert the newblk to an allocdirect.
4623 	 */
4624 	newblk->nb_list.wk_type = D_ALLOCDIRECT;
4625 	adp = (struct allocdirect *)newblk;
4626 	newblk->nb_freefrag = freefrag;
4627 	adp->ad_offset = off;
4628 	adp->ad_oldblkno = oldblkno;
4629 	adp->ad_newsize = newsize;
4630 	adp->ad_oldsize = oldsize;
4631 
4632 	/*
4633 	 * Finish initializing the journal.
4634 	 */
4635 	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4636 		jnewblk->jn_ino = ip->i_number;
4637 		jnewblk->jn_lbn = lbn;
4638 		add_to_journal(&jnewblk->jn_list);
4639 	}
4640 	if (freefrag && freefrag->ff_jdep != NULL &&
4641 	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
4642 		add_to_journal(freefrag->ff_jdep);
4643 	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
4644 	adp->ad_inodedep = inodedep;
4645 
4646 	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
4647 	/*
4648 	 * The list of allocdirects must be kept in sorted and ascending
4649 	 * order so that the rollback routines can quickly determine the
4650 	 * first uncommitted block (the size of the file stored on disk
4651 	 * ends at the end of the lowest committed fragment, or if there
4652 	 * are no fragments, at the end of the highest committed block).
4653 	 * Since files generally grow, the typical case is that the new
4654 	 * block is to be added at the end of the list. We speed this
4655 	 * special case by checking against the last allocdirect in the
4656 	 * list before laboriously traversing the list looking for the
4657 	 * insertion point.
4658 	 */
4659 	adphead = &inodedep->id_newinoupdt;
4660 	oldadp = TAILQ_LAST(adphead, allocdirectlst);
4661 	if (oldadp == NULL || oldadp->ad_offset <= off) {
4662 		/* insert at end of list */
4663 		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
4664 		if (oldadp != NULL && oldadp->ad_offset == off)
4665 			allocdirect_merge(adphead, adp, oldadp);
4666 		FREE_LOCK(&lk);
4667 		return;
4668 	}
4669 	TAILQ_FOREACH(oldadp, adphead, ad_next) {
4670 		if (oldadp->ad_offset >= off)
4671 			break;
4672 	}
4673 	if (oldadp == NULL)
4674 		panic("softdep_setup_allocdirect: lost entry");
4675 	/* insert in middle of list */
4676 	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
4677 	if (oldadp->ad_offset == off)
4678 		allocdirect_merge(adphead, adp, oldadp);
4679 
4680 	FREE_LOCK(&lk);
4681 }
4682 
4683 /*
4684  * Merge a newer and older journal record to be stored either in a
4685  * newblock or freefrag.  This handles aggregating journal records for
4686  * fragment allocation into a second record as well as replacing a
4687  * journal free with an aborted journal allocation.  A segment for the
4688  * oldest record will be placed on wkhd if it has been written.  If not
4689  * the segment for the newer record will suffice.
4690  */
4691 static struct worklist *
4692 jnewblk_merge(new, old, wkhd)
4693 	struct worklist *new;
4694 	struct worklist *old;
4695 	struct workhead *wkhd;
4696 {
4697 	struct jnewblk *njnewblk;
4698 	struct jnewblk *jnewblk;
4699 
4700 	/* Handle NULLs to simplify callers. */
4701 	if (new == NULL)
4702 		return (old);
4703 	if (old == NULL)
4704 		return (new);
4705 	/* Replace a jfreefrag with a jnewblk. */
4706 	if (new->wk_type == D_JFREEFRAG) {
4707 		cancel_jfreefrag(WK_JFREEFRAG(new));
4708 		return (old);
4709 	}
4710 	/*
4711 	 * Handle merging of two jnewblk records that describe
4712 	 * different sets of fragments in the same block.
4713 	 */
4714 	jnewblk = WK_JNEWBLK(old);
4715 	njnewblk = WK_JNEWBLK(new);
4716 	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
4717 		panic("jnewblk_merge: Merging disparate blocks.");
4718 	/*
4719 	 * The record may be rolled back in the cg update bits
4720 	 * appropriately.  NEWBLOCK here alerts the cg rollback code
4721 	 * that the frag bits have changed.
4722 	 */
4723 	if (jnewblk->jn_state & UNDONE) {
4724 		njnewblk->jn_state |= UNDONE | NEWBLOCK;
4725 		njnewblk->jn_state &= ~ATTACHED;
4726 		jnewblk->jn_state &= ~UNDONE;
4727 	}
4728 	/*
4729 	 * We modify the newer record and free the older so that if neither
4730 	 * has been written the most up-to-date copy will be on disk.  If
4731 	 * both have been written but rolled back we only temporarily need
4732 	 * one of them to fix the bits when the cg write completes.
4733 	 */
4734 	jnewblk->jn_state |= ATTACHED | COMPLETE;
4735 	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
4736 	cancel_jnewblk(jnewblk, wkhd);
4737 	WORKLIST_REMOVE(&jnewblk->jn_list);
4738 	free_jnewblk(jnewblk);
4739 	return (new);
4740 }
4741 
4742 /*
4743  * Replace an old allocdirect dependency with a newer one.
4744  * This routine must be called with splbio interrupts blocked.
4745  */
4746 static void
4747 allocdirect_merge(adphead, newadp, oldadp)
4748 	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
4749 	struct allocdirect *newadp;	/* allocdirect being added */
4750 	struct allocdirect *oldadp;	/* existing allocdirect being checked */
4751 {
4752 	struct worklist *wk;
4753 	struct freefrag *freefrag;
4754 	struct newdirblk *newdirblk;
4755 
4756 	freefrag = NULL;
4757 	mtx_assert(&lk, MA_OWNED);
4758 	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
4759 	    newadp->ad_oldsize != oldadp->ad_newsize ||
4760 	    newadp->ad_offset >= NDADDR)
4761 		panic("%s %jd != new %jd || old size %ld != new %ld",
4762 		    "allocdirect_merge: old blkno",
4763 		    (intmax_t)newadp->ad_oldblkno,
4764 		    (intmax_t)oldadp->ad_newblkno,
4765 		    newadp->ad_oldsize, oldadp->ad_newsize);
4766 	newadp->ad_oldblkno = oldadp->ad_oldblkno;
4767 	newadp->ad_oldsize = oldadp->ad_oldsize;
4768 	/*
4769 	 * If the old dependency had a fragment to free or had never
4770 	 * previously had a block allocated, then the new dependency
4771 	 * can immediately post its freefrag and adopt the old freefrag.
4772 	 * This action is done by swapping the freefrag dependencies.
4773 	 * The new dependency gains the old one's freefrag, and the
4774 	 * old one gets the new one and then immediately puts it on
4775 	 * the worklist when it is freed by free_newblk. It is
4776 	 * not possible to do this swap when the old dependency had a
4777 	 * non-zero size but no previous fragment to free. This condition
4778 	 * arises when the new block is an extension of the old block.
4779 	 * Here, the first part of the fragment allocated to the new
4780 	 * dependency is part of the block currently claimed on disk by
4781 	 * the old dependency, so cannot legitimately be freed until the
4782 	 * conditions for the new dependency are fulfilled.
4783 	 */
4784 	freefrag = newadp->ad_freefrag;
4785 	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
4786 		newadp->ad_freefrag = oldadp->ad_freefrag;
4787 		oldadp->ad_freefrag = freefrag;
4788 	}
4789 	/*
4790 	 * If we are tracking a new directory-block allocation,
4791 	 * move it from the old allocdirect to the new allocdirect.
4792 	 */
4793 	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
4794 		newdirblk = WK_NEWDIRBLK(wk);
4795 		WORKLIST_REMOVE(&newdirblk->db_list);
4796 		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
4797 			panic("allocdirect_merge: extra newdirblk");
4798 		WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
4799 	}
4800 	TAILQ_REMOVE(adphead, oldadp, ad_next);
4801 	/*
4802 	 * We need to move any journal dependencies over to the freefrag
4803 	 * that releases this block if it exists.  Otherwise we are
4804 	 * extending an existing block and we'll wait until that is
4805 	 * complete to release the journal space and extend the
4806 	 * new journal to cover this old space as well.
4807 	 */
4808 	if (freefrag == NULL) {
4809 		if (oldadp->ad_newblkno != newadp->ad_newblkno)
4810 			panic("allocdirect_merge: %jd != %jd",
4811 			    oldadp->ad_newblkno, newadp->ad_newblkno);
4812 		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
4813 		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
4814 		    &oldadp->ad_block.nb_jnewblk->jn_list,
4815 		    &newadp->ad_block.nb_jwork);
4816 		oldadp->ad_block.nb_jnewblk = NULL;
4817 		if (cancel_newblk(&oldadp->ad_block, NULL,
4818 		    &newadp->ad_block.nb_jwork))
4819 			panic("allocdirect_merge: Unexpected dependency.");
4820 	} else {
4821 		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
4822 		    &freefrag->ff_list, &freefrag->ff_jwork);
4823 		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
4824 		    &freefrag->ff_jwork);
4825 	}
4826 	free_newblk(&oldadp->ad_block);
4827 }
4828 
4829 /*
4830  * Allocate a jfreefrag structure to journal a single block free.
4831  */
4832 static struct jfreefrag *
4833 newjfreefrag(freefrag, ip, blkno, size, lbn)
4834 	struct freefrag *freefrag;
4835 	struct inode *ip;
4836 	ufs2_daddr_t blkno;
4837 	long size;
4838 	ufs_lbn_t lbn;
4839 {
4840 	struct jfreefrag *jfreefrag;
4841 	struct fs *fs;
4842 
4843 	fs = ip->i_fs;
4844 	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
4845 	    M_SOFTDEP_FLAGS);
4846 	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
4847 	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
4848 	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
4849 	jfreefrag->fr_ino = ip->i_number;
4850 	jfreefrag->fr_lbn = lbn;
4851 	jfreefrag->fr_blkno = blkno;
4852 	jfreefrag->fr_frags = numfrags(fs, size);
4853 	jfreefrag->fr_freefrag = freefrag;
4854 
4855 	return (jfreefrag);
4856 }
4857 
4858 /*
4859  * Allocate a new freefrag structure.
4860  */
4861 static struct freefrag *
4862 newfreefrag(ip, blkno, size, lbn)
4863 	struct inode *ip;
4864 	ufs2_daddr_t blkno;
4865 	long size;
4866 	ufs_lbn_t lbn;
4867 {
4868 	struct freefrag *freefrag;
4869 	struct fs *fs;
4870 
4871 	fs = ip->i_fs;
4872 	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
4873 		panic("newfreefrag: frag size");
4874 	freefrag = malloc(sizeof(struct freefrag),
4875 	    M_FREEFRAG, M_SOFTDEP_FLAGS);
4876 	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
4877 	freefrag->ff_state = ATTACHED;
4878 	LIST_INIT(&freefrag->ff_jwork);
4879 	freefrag->ff_inum = ip->i_number;
4880 	freefrag->ff_blkno = blkno;
4881 	freefrag->ff_fragsize = size;
4882 
4883 	if (fs->fs_flags & FS_SUJ) {
4884 		freefrag->ff_jdep = (struct worklist *)
4885 		    newjfreefrag(freefrag, ip, blkno, size, lbn);
4886 	} else {
4887 		freefrag->ff_state |= DEPCOMPLETE;
4888 		freefrag->ff_jdep = NULL;
4889 	}
4890 
4891 	return (freefrag);
4892 }
4893 
4894 /*
4895  * This workitem de-allocates fragments that were replaced during
4896  * file block allocation.
4897  */
4898 static void
4899 handle_workitem_freefrag(freefrag)
4900 	struct freefrag *freefrag;
4901 {
4902 	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
4903 	struct workhead wkhd;
4904 
4905 	/*
4906 	 * It would be illegal to add new completion items to the
4907 	 * freefrag after it was scheduled to be done, so it must be
4908 	 * safe to modify the list head here.
4909 	 */
4910 	LIST_INIT(&wkhd);
4911 	ACQUIRE_LOCK(&lk);
4912 	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
4913 	/*
4914 	 * If the journal has not been written we must cancel it here.
4915 	 */
4916 	if (freefrag->ff_jdep) {
4917 		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
4918 			panic("handle_workitem_freefrag: Unexpected type %d\n",
4919 			    freefrag->ff_jdep->wk_type);
4920 		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
4921 	}
4922 	FREE_LOCK(&lk);
4923 	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
4924 	    freefrag->ff_fragsize, freefrag->ff_inum, &wkhd);
4925 	ACQUIRE_LOCK(&lk);
4926 	WORKITEM_FREE(freefrag, D_FREEFRAG);
4927 	FREE_LOCK(&lk);
4928 }
4929 
4930 /*
4931  * Set up a dependency structure for an external attributes data block.
4932  * This routine follows much of the structure of softdep_setup_allocdirect.
4933  * See the description of softdep_setup_allocdirect above for details.
4934  */
4935 void
4936 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4937 	struct inode *ip;
4938 	ufs_lbn_t off;
4939 	ufs2_daddr_t newblkno;
4940 	ufs2_daddr_t oldblkno;
4941 	long newsize;
4942 	long oldsize;
4943 	struct buf *bp;
4944 {
4945 	struct allocdirect *adp, *oldadp;
4946 	struct allocdirectlst *adphead;
4947 	struct freefrag *freefrag;
4948 	struct inodedep *inodedep;
4949 	struct jnewblk *jnewblk;
4950 	struct newblk *newblk;
4951 	struct mount *mp;
4952 	ufs_lbn_t lbn;
4953 
4954 	if (off >= NXADDR)
4955 		panic("softdep_setup_allocext: lbn %lld > NXADDR",
4956 		    (long long)off);
4957 
4958 	lbn = bp->b_lblkno;
4959 	mp = UFSTOVFS(ip->i_ump);
4960 	if (oldblkno && oldblkno != newblkno)
4961 		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4962 	else
4963 		freefrag = NULL;
4964 
4965 	ACQUIRE_LOCK(&lk);
4966 	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4967 		panic("softdep_setup_allocext: lost block");
4968 	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4969 	    ("softdep_setup_allocext: newblk already initialized"));
4970 	/*
4971 	 * Convert the newblk to an allocdirect.
4972 	 */
4973 	newblk->nb_list.wk_type = D_ALLOCDIRECT;
4974 	adp = (struct allocdirect *)newblk;
4975 	newblk->nb_freefrag = freefrag;
4976 	adp->ad_offset = off;
4977 	adp->ad_oldblkno = oldblkno;
4978 	adp->ad_newsize = newsize;
4979 	adp->ad_oldsize = oldsize;
4980 	adp->ad_state |=  EXTDATA;
4981 
4982 	/*
4983 	 * Finish initializing the journal.
4984 	 */
4985 	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4986 		jnewblk->jn_ino = ip->i_number;
4987 		jnewblk->jn_lbn = lbn;
4988 		add_to_journal(&jnewblk->jn_list);
4989 	}
4990 	if (freefrag && freefrag->ff_jdep != NULL &&
4991 	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
4992 		add_to_journal(freefrag->ff_jdep);
4993 	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
4994 	adp->ad_inodedep = inodedep;
4995 
4996 	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
4997 	/*
4998 	 * The list of allocdirects must be kept in sorted and ascending
4999 	 * order so that the rollback routines can quickly determine the
5000 	 * first uncommitted block (the size of the file stored on disk
5001 	 * ends at the end of the lowest committed fragment, or if there
5002 	 * are no fragments, at the end of the highest committed block).
5003 	 * Since files generally grow, the typical case is that the new
5004 	 * block is to be added at the end of the list. We speed this
5005 	 * special case by checking against the last allocdirect in the
5006 	 * list before laboriously traversing the list looking for the
5007 	 * insertion point.
5008 	 */
5009 	adphead = &inodedep->id_newextupdt;
5010 	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5011 	if (oldadp == NULL || oldadp->ad_offset <= off) {
5012 		/* insert at end of list */
5013 		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5014 		if (oldadp != NULL && oldadp->ad_offset == off)
5015 			allocdirect_merge(adphead, adp, oldadp);
5016 		FREE_LOCK(&lk);
5017 		return;
5018 	}
5019 	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5020 		if (oldadp->ad_offset >= off)
5021 			break;
5022 	}
5023 	if (oldadp == NULL)
5024 		panic("softdep_setup_allocext: lost entry");
5025 	/* insert in middle of list */
5026 	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5027 	if (oldadp->ad_offset == off)
5028 		allocdirect_merge(adphead, adp, oldadp);
5029 	FREE_LOCK(&lk);
5030 }
5031 
5032 /*
5033  * Indirect block allocation dependencies.
5034  *
5035  * The same dependencies that exist for a direct block also exist when
5036  * a new block is allocated and pointed to by an entry in a block of
5037  * indirect pointers. The undo/redo states described above are also
5038  * used here. Because an indirect block contains many pointers that
5039  * may have dependencies, a second copy of the entire in-memory indirect
5040  * block is kept. The buffer cache copy is always completely up-to-date.
5041  * The second copy, which is used only as a source for disk writes,
5042  * contains only the safe pointers (i.e., those that have no remaining
5043  * update dependencies). The second copy is freed when all pointers
5044  * are safe. The cache is not allowed to replace indirect blocks with
5045  * pending update dependencies. If a buffer containing an indirect
5046  * block with dependencies is written, these routines will mark it
5047  * dirty again. It can only be successfully written once all the
5048  * dependencies are removed. The ffs_fsync routine in conjunction with
5049  * softdep_sync_metadata work together to get all the dependencies
5050  * removed so that a file can be successfully written to disk. Three
5051  * procedures are used when setting up indirect block pointer
5052  * dependencies. The division is necessary because of the organization
5053  * of the "balloc" routine and because of the distinction between file
5054  * pages and file metadata blocks.
5055  */
5056 
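/*
 * Illustrative sketch, not compiled into the kernel: the "second copy"
 * described above is the ir_savebp buffer.  When a pointer with a
 * pending dependency is installed in the cached indirect block,
 * setup_allocindir_phase2() below rolls the corresponding slot of the
 * saved copy back to the old, safe block number so that only safe
 * pointers reach the disk.  ptrno and oldblkno stand for
 * aip->ai_offset and aip->ai_oldblkno.
 */
#if 0
	if (ip->i_ump->um_fstype == UFS1)
		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[ptrno] =
		    oldblkno;
	else
		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[ptrno] =
		    oldblkno;
#endif
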
5057 /*
5058  * Allocate a new allocindir structure.
5059  */
5060 static struct allocindir *
5061 newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
5062 	struct inode *ip;	/* inode for file being extended */
5063 	int ptrno;		/* offset of pointer in indirect block */
5064 	ufs2_daddr_t newblkno;	/* disk block number being added */
5065 	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5066 	ufs_lbn_t lbn;
5067 {
5068 	struct newblk *newblk;
5069 	struct allocindir *aip;
5070 	struct freefrag *freefrag;
5071 	struct jnewblk *jnewblk;
5072 
5073 	if (oldblkno)
5074 		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
5075 	else
5076 		freefrag = NULL;
5077 	ACQUIRE_LOCK(&lk);
5078 	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
5079 		panic("new_allocindir: lost block");
5080 	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5081 	    ("newallocindir: newblk already initialized"));
5082 	newblk->nb_list.wk_type = D_ALLOCINDIR;
5083 	newblk->nb_freefrag = freefrag;
5084 	aip = (struct allocindir *)newblk;
5085 	aip->ai_offset = ptrno;
5086 	aip->ai_oldblkno = oldblkno;
5087 	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5088 		jnewblk->jn_ino = ip->i_number;
5089 		jnewblk->jn_lbn = lbn;
5090 		add_to_journal(&jnewblk->jn_list);
5091 	}
5092 	if (freefrag && freefrag->ff_jdep != NULL &&
5093 	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5094 		add_to_journal(freefrag->ff_jdep);
5095 	return (aip);
5096 }
5097 
5098 /*
5099  * Called just before setting an indirect block pointer
5100  * to a newly allocated file page.
5101  */
5102 void
5103 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
5104 	struct inode *ip;	/* inode for file being extended */
5105 	ufs_lbn_t lbn;		/* allocated block number within file */
5106 	struct buf *bp;		/* buffer with indirect blk referencing page */
5107 	int ptrno;		/* offset of pointer in indirect block */
5108 	ufs2_daddr_t newblkno;	/* disk block number being added */
5109 	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5110 	struct buf *nbp;	/* buffer holding allocated page */
5111 {
5112 	struct inodedep *inodedep;
5113 	struct allocindir *aip;
5114 	struct pagedep *pagedep;
5115 	struct mount *mp;
5116 
5117 	if (lbn != nbp->b_lblkno)
5118 		panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5119 		    lbn, nbp->b_lblkno);
5120 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
5121 	mp = UFSTOVFS(ip->i_ump);
5122 	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
5123 	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5124 	/*
5125 	 * If we are allocating a directory page, then we must
5126 	 * allocate an associated pagedep to track additions and
5127 	 * deletions.
5128 	 */
5129 	if ((ip->i_mode & IFMT) == IFDIR &&
5130 	    pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0)
5131 		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
5132 	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5133 	setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5134 	FREE_LOCK(&lk);
5135 }
5136 
5137 /*
5138  * Called just before setting an indirect block pointer to a
5139  * newly allocated indirect block.
5140  */
5141 void
5142 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
5143 	struct buf *nbp;	/* newly allocated indirect block */
5144 	struct inode *ip;	/* inode for file being extended */
5145 	struct buf *bp;		/* indirect block referencing allocated block */
5146 	int ptrno;		/* offset of pointer in indirect block */
5147 	ufs2_daddr_t newblkno;	/* disk block number being added */
5148 {
5149 	struct inodedep *inodedep;
5150 	struct allocindir *aip;
5151 	ufs_lbn_t lbn;
5152 
5153 	lbn = nbp->b_lblkno;
5154 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
5155 	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
5156 	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
5157 	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5158 	setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5159 	FREE_LOCK(&lk);
5160 }
5161 
5162 static void
5163 indirdep_complete(indirdep)
5164 	struct indirdep *indirdep;
5165 {
5166 	struct allocindir *aip;
5167 
5168 	LIST_REMOVE(indirdep, ir_next);
5169 	indirdep->ir_state &= ~ONDEPLIST;
5170 
5171 	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5172 		LIST_REMOVE(aip, ai_next);
5173 		free_newblk(&aip->ai_block);
5174 	}
5175 	/*
5176 	 * If this indirdep is not attached to a buf it was simply waiting
5177 	 * on completion to clear completehd.  free_indirdep() asserts
5178 	 * that nothing is dangling.
5179 	 */
5180 	if ((indirdep->ir_state & ONWORKLIST) == 0)
5181 		free_indirdep(indirdep);
5182 }
5183 
5184 /*
5185  * Called to finish the allocation of the "aip" allocated
5186  * by one of the two routines above.
5187  */
5188 static void
5189 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
5190 	struct buf *bp;		/* in-memory copy of the indirect block */
5191 	struct inode *ip;	/* inode for file being extended */
5192 	struct inodedep *inodedep; /* Inodedep for ip */
5193 	struct allocindir *aip;	/* allocindir allocated by the above routines */
5194 	ufs_lbn_t lbn;		/* Logical block number for this block. */
5195 {
5196 	struct worklist *wk;
5197 	struct fs *fs;
5198 	struct newblk *newblk;
5199 	struct indirdep *indirdep, *newindirdep;
5200 	struct allocindir *oldaip;
5201 	struct freefrag *freefrag;
5202 	struct mount *mp;
5203 	ufs2_daddr_t blkno;
5204 
5205 	mp = UFSTOVFS(ip->i_ump);
5206 	fs = ip->i_fs;
5207 	mtx_assert(&lk, MA_OWNED);
5208 	if (bp->b_lblkno >= 0)
5209 		panic("setup_allocindir_phase2: not indir blk");
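	/*
	 * The loop below normally runs at most twice: the first pass looks
	 * for an indirdep already attached to the buffer; if none is found,
	 * a new one is allocated with the lock dropped and the search is
	 * repeated so that a dependency set up by a racing thread is not
	 * duplicated.
	 */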
5210 	for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) {
5211 		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5212 			if (wk->wk_type != D_INDIRDEP)
5213 				continue;
5214 			indirdep = WK_INDIRDEP(wk);
5215 			break;
5216 		}
5217 		if (indirdep == NULL && newindirdep) {
5218 			indirdep = newindirdep;
5219 			newindirdep = NULL;
5220 			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
5221 			if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0,
5222 			    &newblk)) {
5223 				indirdep->ir_state |= ONDEPLIST;
5224 				LIST_INSERT_HEAD(&newblk->nb_indirdeps,
5225 				    indirdep, ir_next);
5226 			} else
5227 				indirdep->ir_state |= DEPCOMPLETE;
5228 		}
5229 		if (indirdep) {
5230 			aip->ai_indirdep = indirdep;
5231 			/*
5232 			 * Check to see if there is an existing dependency
5233 			 * for this block. If there is, merge the old
5234 			 * dependency into the new one.  This happens
5235 			 * as a result of reallocblk only.
5236 			 */
5237 			if (aip->ai_oldblkno == 0)
5238 				oldaip = NULL;
5239 			else
5240 
5241 				LIST_FOREACH(oldaip, &indirdep->ir_deplisthd,
5242 				    ai_next)
5243 					if (oldaip->ai_offset == aip->ai_offset)
5244 						break;
5245 			if (oldaip != NULL)
5246 				freefrag = allocindir_merge(aip, oldaip);
5247 			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
5248 			KASSERT(aip->ai_offset >= 0 &&
5249 			    aip->ai_offset < NINDIR(ip->i_ump->um_fs),
5250 			    ("setup_allocindir_phase2: Bad offset %d",
5251 			    aip->ai_offset));
5252 			KASSERT(indirdep->ir_savebp != NULL,
5253 			    ("setup_allocindir_phase2 NULL ir_savebp"));
5254 			if (ip->i_ump->um_fstype == UFS1)
5255 				((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
5256 				    [aip->ai_offset] = aip->ai_oldblkno;
5257 			else
5258 				((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
5259 				    [aip->ai_offset] = aip->ai_oldblkno;
5260 			FREE_LOCK(&lk);
5261 			if (freefrag != NULL)
5262 				handle_workitem_freefrag(freefrag);
5263 		} else
5264 			FREE_LOCK(&lk);
5265 		if (newindirdep) {
5266 			newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
5267 			brelse(newindirdep->ir_savebp);
5268 			ACQUIRE_LOCK(&lk);
5269 			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
5270 			if (indirdep)
5271 				break;
5272 			FREE_LOCK(&lk);
5273 		}
5274 		if (indirdep) {
5275 			ACQUIRE_LOCK(&lk);
5276 			break;
5277 		}
5278 		newindirdep = malloc(sizeof(struct indirdep),
5279 			M_INDIRDEP, M_SOFTDEP_FLAGS);
5280 		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5281 		newindirdep->ir_state = ATTACHED;
5282 		if (ip->i_ump->um_fstype == UFS1)
5283 			newindirdep->ir_state |= UFS1FMT;
5284 		newindirdep->ir_saveddata = NULL;
5285 		LIST_INIT(&newindirdep->ir_deplisthd);
5286 		LIST_INIT(&newindirdep->ir_donehd);
5287 		LIST_INIT(&newindirdep->ir_writehd);
5288 		LIST_INIT(&newindirdep->ir_completehd);
5289 		LIST_INIT(&newindirdep->ir_jwork);
5290 		LIST_INIT(&newindirdep->ir_jnewblkhd);
5291 		if (bp->b_blkno == bp->b_lblkno) {
5292 			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5293 			    NULL, NULL);
5294 			bp->b_blkno = blkno;
5295 		}
5296 		newindirdep->ir_savebp =
5297 		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5298 		BUF_KERNPROC(newindirdep->ir_savebp);
5299 		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5300 		ACQUIRE_LOCK(&lk);
5301 	}
5302 }
5303 
5304 /*
5305  * Merge two allocindirs which refer to the same block.  Move newblock
5306  * dependencies and set up the freefrags appropriately.
5307  */
5308 static struct freefrag *
5309 allocindir_merge(aip, oldaip)
5310 	struct allocindir *aip;
5311 	struct allocindir *oldaip;
5312 {
5313 	struct newdirblk *newdirblk;
5314 	struct freefrag *freefrag;
5315 	struct worklist *wk;
5316 
5317 	if (oldaip->ai_newblkno != aip->ai_oldblkno)
5318 		panic("allocindir_merge: blkno");
5319 	aip->ai_oldblkno = oldaip->ai_oldblkno;
5320 	freefrag = aip->ai_freefrag;
5321 	aip->ai_freefrag = oldaip->ai_freefrag;
5322 	oldaip->ai_freefrag = NULL;
5323 	KASSERT(freefrag != NULL, ("allocindir_merge: No freefrag"));
5324 	/*
5325 	 * If we are tracking a new directory-block allocation,
5326 	 * move it from the old allocindir to the new allocindir.
5327 	 */
5328 	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
5329 		newdirblk = WK_NEWDIRBLK(wk);
5330 		WORKLIST_REMOVE(&newdirblk->db_list);
5331 		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
5332 			panic("allocindir_merge: extra newdirblk");
5333 		WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list);
5334 	}
5335 	/*
5336 	 * We can skip journaling for this freefrag and just complete
5337 	 * any pending journal work for the allocindir that is being
5338 	 * removed after the freefrag completes.
5339 	 */
5340 	if (freefrag->ff_jdep)
5341 		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
5342 	LIST_REMOVE(oldaip, ai_next);
5343 	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
5344 	    &freefrag->ff_list, &freefrag->ff_jwork);
5345 	free_newblk(&oldaip->ai_block);
5346 
5347 	return (freefrag);
5348 }
5349 
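/*
 * Release direct block i of the inode: the block pointer is cleared
 * and a freework is queued on the freeblks so that the block (or
 * fragment) is freed once the zero'ed inode is safely on disk.
 */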
5350 static inline void
5351 setup_freedirect(freeblks, ip, i, needj)
5352 	struct freeblks *freeblks;
5353 	struct inode *ip;
5354 	int i;
5355 	int needj;
5356 {
5357 	ufs2_daddr_t blkno;
5358 	int frags;
5359 
5360 	blkno = DIP(ip, i_db[i]);
5361 	if (blkno == 0)
5362 		return;
5363 	DIP_SET(ip, i_db[i], 0);
5364 	frags = sblksize(ip->i_fs, ip->i_size, i);
5365 	frags = numfrags(ip->i_fs, frags);
5366 	newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, needj);
5367 }
5368 
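/*
 * As above, but for an external attribute block recorded in di_extb[].
 * The lbn handed to newfreework() is encoded as -1 - i so extattr
 * blocks can later be told apart from data and indirect blocks.
 */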
5369 static inline void
5370 setup_freeext(freeblks, ip, i, needj)
5371 	struct freeblks *freeblks;
5372 	struct inode *ip;
5373 	int i;
5374 	int needj;
5375 {
5376 	ufs2_daddr_t blkno;
5377 	int frags;
5378 
5379 	blkno = ip->i_din2->di_extb[i];
5380 	if (blkno == 0)
5381 		return;
5382 	ip->i_din2->di_extb[i] = 0;
5383 	frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
5384 	frags = numfrags(ip->i_fs, frags);
5385 	newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, needj);
5386 }
5387 
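/*
 * Release the indirect tree rooted at i_ib[i]: the pointer is cleared
 * and a freework is queued for the indirect block itself; the blocks
 * it references are reclaimed later via indir_trunc().
 */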
5388 static inline void
5389 setup_freeindir(freeblks, ip, i, lbn, needj)
5390 	struct freeblks *freeblks;
5391 	struct inode *ip;
5392 	int i;
5393 	ufs_lbn_t lbn;
5394 	int needj;
5395 {
5396 	ufs2_daddr_t blkno;
5397 
5398 	blkno = DIP(ip, i_ib[i]);
5399 	if (blkno == 0)
5400 		return;
5401 	DIP_SET(ip, i_ib[i], 0);
5402 	newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
5403 	    needj);
5404 }
5405 
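/*
 * Allocate and initialize a freeblks structure, recording the identity
 * of the inode whose blocks are being released so that the deferred
 * frees can be charged back to it.
 */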
5406 static inline struct freeblks *
5407 newfreeblks(mp, ip)
5408 	struct mount *mp;
5409 	struct inode *ip;
5410 {
5411 	struct freeblks *freeblks;
5412 
5413 	freeblks = malloc(sizeof(struct freeblks),
5414 		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
5415 	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
5416 	LIST_INIT(&freeblks->fb_jfreeblkhd);
5417 	LIST_INIT(&freeblks->fb_jwork);
5418 	freeblks->fb_state = ATTACHED;
5419 	freeblks->fb_uid = ip->i_uid;
5420 	freeblks->fb_previousinum = ip->i_number;
5421 	freeblks->fb_devvp = ip->i_devvp;
5422 	freeblks->fb_chkcnt = 0;
5423 
5424 	return (freeblks);
5425 }
5426 
5427 /*
5428  * Block de-allocation dependencies.
5429  *
5430  * When blocks are de-allocated, the on-disk pointers must be nullified before
5431  * the blocks are made available for use by other files.  (The true
5432  * requirement is that old pointers must be nullified before new on-disk
5433  * pointers are set.  We chose this slightly more stringent requirement to
5434  * reduce complexity.) Our implementation handles this dependency by updating
5435  * the inode (or indirect block) appropriately but delaying the actual block
5436  * de-allocation (i.e., freemap and free space count manipulation) until
5437  * after the updated versions reach stable storage.  After the disk is
5438  * updated, the blocks can be safely de-allocated whenever it is convenient.
5439  * This implementation handles only the common case of reducing a file's
5440  * length to zero. Other cases are handled by the conventional synchronous
5441  * write approach.
5442  *
5443  * The ffs implementation with which we worked double-checks
5444  * the state of the block pointers and file size as it reduces
5445  * a file's length.  Some of this code is replicated here in our
5446  * soft updates implementation.  The freeblks->fb_chkcnt field is
5447  * used to transfer a part of this information to the procedure
5448  * that eventually de-allocates the blocks.
5449  *
5450  * This routine should be called from the routine that shortens
5451  * a file's length, before the inode's size or block pointers
5452  * are modified. It will save the block pointer information for
5453  * later release and zero the inode so that the calling routine
5454  * can release it.
5455  */
5456 void
5457 softdep_setup_freeblocks(ip, length, flags)
5458 	struct inode *ip;	/* The inode whose length is to be reduced */
5459 	off_t length;		/* The new length for the file */
5460 	int flags;		/* IO_EXT and/or IO_NORMAL */
5461 {
5462 	struct ufs1_dinode *dp1;
5463 	struct ufs2_dinode *dp2;
5464 	struct freeblks *freeblks;
5465 	struct inodedep *inodedep;
5466 	struct allocdirect *adp;
5467 	struct jfreeblk *jfreeblk;
5468 	struct buf *bp;
5469 	struct fs *fs;
5470 	ufs2_daddr_t extblocks, datablocks;
5471 	struct mount *mp;
5472 	int i, delay, error;
5473 	ufs_lbn_t tmpval;
5474 	ufs_lbn_t lbn;
5475 	int needj;
5476 
5477 	fs = ip->i_fs;
5478 	mp = UFSTOVFS(ip->i_ump);
5479 	if (length != 0)
5480 		panic("softdep_setup_freeblocks: non-zero length");
5481 	freeblks = newfreeblks(mp, ip);
5482 	ACQUIRE_LOCK(&lk);
5483 	/*
5484 	 * If we're truncating a removed file that will never be written
5485 	 * we don't need to journal the block frees.  The canceled journals
5486 	 * for the allocations will suffice.
5487 	 */
5488 	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5489 	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED ||
5490 	    (fs->fs_flags & FS_SUJ) == 0)
5491 		needj = 0;
5492 	else
5493 		needj = 1;
5494 	FREE_LOCK(&lk);
5495 	extblocks = 0;
5496 	if (fs->fs_magic == FS_UFS2_MAGIC)
5497 		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
5498 	datablocks = DIP(ip, i_blocks) - extblocks;
5499 	if ((flags & IO_NORMAL) != 0) {
5500 		for (i = 0; i < NDADDR; i++)
5501 			setup_freedirect(freeblks, ip, i, needj);
5502 		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
5503 		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
5504 			setup_freeindir(freeblks, ip, i, -lbn - i, needj);
5505 		ip->i_size = 0;
5506 		DIP_SET(ip, i_size, 0);
5507 		freeblks->fb_chkcnt = datablocks;
5508 		UFS_LOCK(ip->i_ump);
5509 		fs->fs_pendingblocks += datablocks;
5510 		UFS_UNLOCK(ip->i_ump);
5511 	}
5512 	if ((flags & IO_EXT) != 0) {
5513 		for (i = 0; i < NXADDR; i++)
5514 			setup_freeext(freeblks, ip, i, needj);
5515 		ip->i_din2->di_extsize = 0;
5516 		freeblks->fb_chkcnt += extblocks;
5517 	}
5518 	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd))
5519 		needj = 0;
5520 	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
5521 	/*
5522 	 * Push the zero'ed inode to its disk buffer so that we are free
5523 	 * to delete its dependencies below. Once the dependencies are gone
5524 	 * the buffer can be safely released.
5525 	 */
5526 	if ((error = bread(ip->i_devvp,
5527 	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
5528 	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
5529 		brelse(bp);
5530 		softdep_error("softdep_setup_freeblocks", error);
5531 	}
5532 	if (ip->i_ump->um_fstype == UFS1) {
5533 		dp1 = ((struct ufs1_dinode *)bp->b_data +
5534 		    ino_to_fsbo(fs, ip->i_number));
5535 		ip->i_din1->di_freelink = dp1->di_freelink;
5536 		*dp1 = *ip->i_din1;
5537 	} else {
5538 		dp2 = ((struct ufs2_dinode *)bp->b_data +
5539 		    ino_to_fsbo(fs, ip->i_number));
5540 		ip->i_din2->di_freelink = dp2->di_freelink;
5541 		*dp2 = *ip->i_din2;
5542 	}
5543 	/*
5544 	 * Find and eliminate any inode dependencies.
5545 	 */
5546 	ACQUIRE_LOCK(&lk);
5547 	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5548 	if ((inodedep->id_state & IOSTARTED) != 0)
5549 		panic("softdep_setup_freeblocks: inode busy");
5550 	/*
5551 	 * Add the freeblks structure to the list of operations that
5552 	 * must await the zero'ed inode being written to disk. If we
5553 	 * still have a bitmap dependency (delay == 0), then the inode
5554 	 * has never been written to disk, so we can process the
5555 	 * freeblks below once we have deleted the dependencies.
5556 	 */
5557 	delay = (inodedep->id_state & DEPCOMPLETE);
5558 	if (delay)
5559 		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
5560 	else if (needj)
5561 		freeblks->fb_state |= COMPLETE;
5562 	/*
5563 	 * Because the file length has been truncated to zero, any
5564 	 * pending block allocation dependency structures associated
5565 	 * with this inode are obsolete and can simply be de-allocated.
5566 	 * We must first merge the two dependency lists to get rid of
5567 	 * any duplicate freefrag structures, then purge the merged list.
5568 	 * If we still have a bitmap dependency, then the inode has never
5569 	 * been written to disk, so we can free any fragments without delay.
5570 	 */
5571 	if (flags & IO_NORMAL) {
5572 		merge_inode_lists(&inodedep->id_newinoupdt,
5573 		    &inodedep->id_inoupdt);
5574 		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
5575 			cancel_allocdirect(&inodedep->id_inoupdt, adp,
5576 			    freeblks, delay);
5577 	}
5578 	if (flags & IO_EXT) {
5579 		merge_inode_lists(&inodedep->id_newextupdt,
5580 		    &inodedep->id_extupdt);
5581 		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
5582 			cancel_allocdirect(&inodedep->id_extupdt, adp,
5583 			    freeblks, delay);
5584 	}
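	/*
	 * With the allocation dependencies canceled, queue the journal
	 * records describing the pending block frees.
	 */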
5585 	LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps)
5586 		add_to_journal(&jfreeblk->jf_list);
5587 
5588 	FREE_LOCK(&lk);
5589 	bdwrite(bp);
5590 	softdep_trunc_deps(ITOV(ip), freeblks, 0, 0, flags);
5591 	ACQUIRE_LOCK(&lk);
5592 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
5593 		(void) free_inodedep(inodedep);
5594 
5595 	if (delay || needj)
5596 		freeblks->fb_state |= DEPCOMPLETE;
5597 	if (delay) {
5598 		/*
5599 		 * If the inode with zeroed block pointers is now on disk
5600 		 * we can start freeing blocks. Add freeblks to the worklist
5601 		 * instead of calling handle_workitem_freeblocks directly as
5602 		 * it is more likely that additional IO is needed to complete
5603 		 * the request here than in the !delay case.
5604 		 */
5605 		if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
5606 			add_to_worklist(&freeblks->fb_list, 1);
5607 	}
5608 	if (needj && LIST_EMPTY(&freeblks->fb_jfreeblkhd))
5609 		needj = 0;
5610 
5611 	FREE_LOCK(&lk);
5612 	/*
5613 	 * If the inode has never been written to disk (delay == 0) and
5614 	 * we're not waiting on any journal writes, then we can process the
5615 	 * freeblks now that we have deleted the dependencies.
5616 	 */
5617 	if (!delay && !needj)
5618 		handle_workitem_freeblocks(freeblks, 0);
5619 }
5620 
5621 /*
5622  * Eliminate any dependencies that exist in memory beyond lblkno:off
5623  */
5624 static void
5625 softdep_trunc_deps(vp, freeblks, lblkno, off, flags)
5626 	struct vnode *vp;
5627 	struct freeblks *freeblks;
5628 	ufs_lbn_t lblkno;
5629 	int off;
5630 	int flags;
5631 {
5632 	struct inodedep *inodedep;
5633 	struct bufobj *bo;
5634 	struct buf *bp;
5635 	struct mount *mp;
5636 	ino_t ino;
5637 
5638 	/*
5639 	 * We must wait for any I/O in progress to finish so that
5640 	 * all potential buffers on the dirty list will be visible.
5641 	 * Once they are all there, walk the list and get rid of
5642 	 * any dependencies.
5643 	 */
5644 	ino = VTOI(vp)->i_number;
5645 	mp = vp->v_mount;
5646 	bo = &vp->v_bufobj;
5647 	BO_LOCK(bo);
5648 	drain_output(vp);
5649 restart:
5650 	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
5651 		if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
5652 		    ((flags & IO_NORMAL) == 0 &&
5653 		      (bp->b_xflags & BX_ALTDATA) == 0))
5654 			continue;
5655 		if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL)
5656 			goto restart;
5657 		BO_UNLOCK(bo);
5658 		ACQUIRE_LOCK(&lk);
5659 		(void) inodedep_lookup(mp, ino, 0, &inodedep);
5660 		if (deallocate_dependencies(bp, inodedep, freeblks, 0))
5661 			bp->b_flags |= B_INVAL | B_NOCACHE;
5662 		FREE_LOCK(&lk);
5663 		brelse(bp);
5664 		BO_LOCK(bo);
5665 		goto restart;
5666 	}
5667 	BO_UNLOCK(bo);
5668 }
5669 
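/*
 * Cancel a pagedep found on a directory buffer that is being truncated
 * away.  Directory add dependencies must already be gone; any dirrems
 * are moved to the inodedep's bufwait list or directly to the work list,
 * and the pagedep is freed.  Returns ERESTART if we had to sleep on a
 * journal write, so the caller can restart its scan, or 0 once the
 * pagedep has been freed.
 */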
5670 static int
5671 cancel_pagedep(pagedep, inodedep, freeblks)
5672 	struct pagedep *pagedep;
5673 	struct inodedep *inodedep;
5674 	struct freeblks *freeblks;
5675 {
5676 	struct newdirblk *newdirblk;
5677 	struct jremref *jremref;
5678 	struct jmvref *jmvref;
5679 	struct dirrem *dirrem;
5680 	int i;
5681 
5682 	/*
5683 	 * There should be no directory add dependencies present
5684 	 * as the directory could not be truncated until all
5685 	 * children were removed.
5686 	 */
5687 	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
5688 	    ("deallocate_dependencies: pendinghd != NULL"));
5689 	for (i = 0; i < DAHASHSZ; i++)
5690 		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
5691 		    ("deallocate_dependencies: diraddhd != NULL"));
5692 	/*
5693 	 * Copy any directory remove dependencies to the list
5694 	 * to be processed after the zero'ed inode is written.
5695 	 * If the inode has already been written, then they
5696 	 * can be dumped directly onto the work list.
5697 	 */
5698 	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
5699 		/*
5700 		 * If there are any dirrems we wait for
5701 		 * the journal write to complete and
5702 		 * then restart the buf scan as the lock
5703 		 * has been dropped.
5704 		 */
5705 		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd))
5706 		    != NULL) {
5707 			stat_jwait_filepage++;
5708 			jwait(&jremref->jr_list);
5709 			return (ERESTART);
5710 		}
5711 		LIST_REMOVE(dirrem, dm_next);
5712 		dirrem->dm_dirinum = pagedep->pd_ino;
5713 		if (inodedep == NULL ||
5714 		    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
5715 			dirrem->dm_state |= COMPLETE;
5716 			add_to_worklist(&dirrem->dm_list, 0);
5717 		} else
5718 			WORKLIST_INSERT(&inodedep->id_bufwait,
5719 			    &dirrem->dm_list);
5720 	}
5721 	if ((pagedep->pd_state & NEWBLOCK) != 0) {
5722 		newdirblk = pagedep->pd_newdirblk;
5723 		WORKLIST_REMOVE(&newdirblk->db_list);
5724 		free_newdirblk(newdirblk);
5725 	}
5726 	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
5727 		stat_jwait_filepage++;
5728 		jwait(&jmvref->jm_list);
5729 		return (ERESTART);
5730 	}
5731 	WORKLIST_REMOVE(&pagedep->pd_list);
5732 	LIST_REMOVE(pagedep, pd_hash);
5733 	WORKITEM_FREE(pagedep, D_PAGEDEP);
5734 	return (0);
5735 }
5736 
5737 /*
5738  * Reclaim any dependency structures from a buffer that is about to
5739  * be reallocated to a new vnode. The buffer must be locked, thus,
5740  * no I/O completion operations can occur while we are manipulating
5741  * its associated dependencies. The mutex is held so that other I/O's
5742  * associated with related dependencies do not occur.  Returns 1 if
5743  * all dependencies were cleared, 0 otherwise.
5744  */
5745 static int
5746 deallocate_dependencies(bp, inodedep, freeblks, off)
5747 	struct buf *bp;
5748 	struct inodedep *inodedep;
5749 	struct freeblks *freeblks;
5750 	int off;
5751 {
5752 	struct worklist *wk;
5753 	struct indirdep *indirdep;
5754 	struct allocindir *aip;
5755 	struct pagedep *pagedep;
5756 
5757 	mtx_assert(&lk, MA_OWNED);
5758 	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
5759 		switch (wk->wk_type) {
5760 
5761 		case D_INDIRDEP:
5762 			indirdep = WK_INDIRDEP(wk);
5763 			if (bp->b_lblkno >= 0 ||
5764 			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
5765 				panic("deallocate_dependencies: not indir");
5766 			cancel_indirdep(indirdep, bp, inodedep, freeblks);
5767 			continue;
5768 
5769 		case D_PAGEDEP:
5770 			pagedep = WK_PAGEDEP(wk);
5771 			if (cancel_pagedep(pagedep, inodedep, freeblks))
5772 				return (0);
5773 			continue;
5774 
5775 		case D_ALLOCINDIR:
5776 			aip = WK_ALLOCINDIR(wk);
5777 			cancel_allocindir(aip, inodedep, freeblks);
5778 			continue;
5779 
5780 		case D_ALLOCDIRECT:
5781 		case D_INODEDEP:
5782 			panic("deallocate_dependencies: Unexpected type %s",
5783 			    TYPENAME(wk->wk_type));
5784 			/* NOTREACHED */
5785 
5786 		default:
5787 			panic("deallocate_dependencies: Unknown type %s",
5788 			    TYPENAME(wk->wk_type));
5789 			/* NOTREACHED */
5790 		}
5791 	}
5792 
5793 	return (1);
5794 }
5795 
5796 /*
5797  * An allocdirect is being canceled due to a truncate.  We must make sure
5798  * the journal entry is released in concert with the blkfree that releases
5799  * the storage.  Completed journal entries must not be released until the
5800  * space is no longer pointed to by the inode or in the bitmap.
5801  */
5802 static void
5803 cancel_allocdirect(adphead, adp, freeblks, delay)
5804 	struct allocdirectlst *adphead;
5805 	struct allocdirect *adp;
5806 	struct freeblks *freeblks;
5807 	int delay;
5808 {
5809 	struct freework *freework;
5810 	struct newblk *newblk;
5811 	struct worklist *wk;
5812 	ufs_lbn_t lbn;
5813 
5814 	TAILQ_REMOVE(adphead, adp, ad_next);
5815 	newblk = (struct newblk *)adp;
5816 	/*
5817 	 * If the journal hasn't been written the jnewblk must be passed
5818 	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
5819 	 * this by linking the journal dependency into the freework to be
5820 	 * freed when freework_freeblock() is called.  If the journal has
5821 	 * been written we can simply reclaim the journal space when the
5822 	 * freeblks work is complete.
5823 	 */
5824 	if (newblk->nb_jnewblk == NULL) {
5825 		if (cancel_newblk(newblk, NULL, &freeblks->fb_jwork) != NULL)
5826 			panic("cancel_allocdirect: Unexpected dependency");
5827 		goto found;
5828 	}
5829 	lbn = newblk->nb_jnewblk->jn_lbn;
5830 	/*
5831 	 * Find the correct freework structure so it releases the canceled
5832 	 * journal when the bitmap is cleared.  This preserves rollback
5833 	 * until the allocation is reverted.
5834 	 */
5835 	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
5836 		freework = WK_FREEWORK(wk);
5837 		if (freework->fw_lbn != lbn)
5838 			continue;
5839 		freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
5840 		    &freework->fw_jwork);
5841 		goto found;
5842 	}
5843 	panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn);
5844 found:
5845 	if (delay)
5846 		WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
5847 		    &newblk->nb_list);
5848 	else
5849 		free_newblk(newblk);
5850 	return;
5851 }
5852 
5853 
5854 /*
5855  * Cancel a new block allocation.  May be an indirect or direct block.  We
5856  * remove it from various lists and return any journal record that needs to
5857  * be resolved by the caller.
5858  *
5859  * A special consideration is made for indirects which were never pointed
5860  * at on disk and will never be found once this block is released.
5861  */
5862 static struct jnewblk *
5863 cancel_newblk(newblk, wk, wkhd)
5864 	struct newblk *newblk;
5865 	struct worklist *wk;
5866 	struct workhead *wkhd;
5867 {
5868 	struct indirdep *indirdep;
5869 	struct allocindir *aip;
5870 	struct jnewblk *jnewblk;
5871 
5872 	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
5873 		indirdep->ir_state &= ~ONDEPLIST;
5874 		LIST_REMOVE(indirdep, ir_next);
5875 		/*
5876 		 * If an indirdep is not on the buf worklist we need to
5877 		 * free it here as deallocate_dependencies() will never
5878 		 * find it.  These pointers were never visible on disk and
5879 		 * can be discarded immediately.
5880 		 */
5881 		while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5882 			LIST_REMOVE(aip, ai_next);
5883 			if (cancel_newblk(&aip->ai_block, NULL, wkhd) != NULL)
5884 				panic("cancel_newblk: aip has journal entry");
5885 			free_newblk(&aip->ai_block);
5886 		}
5887 		/*
5888 		 * If this indirdep is not attached to a buf it was simply
5889 		 * waiting on completion to clear completehd.  free_indirdep()
5890 		 * asserts that nothing is dangling.
5891 		 */
5892 		if ((indirdep->ir_state & ONWORKLIST) == 0)
5893 			free_indirdep(indirdep);
5894 	}
5895 	if (newblk->nb_state & ONDEPLIST) {
5896 		newblk->nb_state &= ~ONDEPLIST;
5897 		LIST_REMOVE(newblk, nb_deps);
5898 	}
5899 	if (newblk->nb_state & ONWORKLIST)
5900 		WORKLIST_REMOVE(&newblk->nb_list);
5901 	/*
5902 	 * If the journal entry hasn't been written we save a pointer to
5903 	 * the dependency that frees it until it is written or the
5904 	 * superseding operation completes.
5905 	 */
5906 	jnewblk = newblk->nb_jnewblk;
5907 	if (jnewblk != NULL) {
5908 		newblk->nb_jnewblk = NULL;
5909 		jnewblk->jn_dep = wk;
5910 	}
5911 	if (!LIST_EMPTY(&newblk->nb_jwork))
5912 		jwork_move(wkhd, &newblk->nb_jwork);
5913 
5914 	return (jnewblk);
5915 }
5916 
5917 /*
5918  * Free a newblk. Generate a new freefrag work request if appropriate.
5919  * This must be called after the inode pointer and any direct block pointers
5920  * are valid or fully removed via truncate or frag extension.
5921  */
5922 static void
5923 free_newblk(newblk)
5924 	struct newblk *newblk;
5925 {
5926 	struct indirdep *indirdep;
5927 	struct newdirblk *newdirblk;
5928 	struct freefrag *freefrag;
5929 	struct worklist *wk;
5930 
5931 	mtx_assert(&lk, MA_OWNED);
5932 	if (newblk->nb_state & ONDEPLIST)
5933 		LIST_REMOVE(newblk, nb_deps);
5934 	if (newblk->nb_state & ONWORKLIST)
5935 		WORKLIST_REMOVE(&newblk->nb_list);
5936 	LIST_REMOVE(newblk, nb_hash);
5937 	if ((freefrag = newblk->nb_freefrag) != NULL) {
5938 		freefrag->ff_state |= COMPLETE;
5939 		if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
5940 			add_to_worklist(&freefrag->ff_list, 0);
5941 	}
5942 	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) {
5943 		newdirblk = WK_NEWDIRBLK(wk);
5944 		WORKLIST_REMOVE(&newdirblk->db_list);
5945 		if (!LIST_EMPTY(&newblk->nb_newdirblk))
5946 			panic("free_newblk: extra newdirblk");
5947 		free_newdirblk(newdirblk);
5948 	}
5949 	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
5950 		indirdep->ir_state |= DEPCOMPLETE;
5951 		indirdep_complete(indirdep);
5952 	}
5953 	KASSERT(newblk->nb_jnewblk == NULL,
5954 	    ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
5955 	handle_jwork(&newblk->nb_jwork);
5956 	newblk->nb_list.wk_type = D_NEWBLK;
5957 	WORKITEM_FREE(newblk, D_NEWBLK);
5958 }
5959 
5960 /*
5961  * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
5962  * This routine must be called with splbio interrupts blocked.
5963  */
5964 static void
5965 free_newdirblk(newdirblk)
5966 	struct newdirblk *newdirblk;
5967 {
5968 	struct pagedep *pagedep;
5969 	struct diradd *dap;
5970 	struct worklist *wk;
5971 	int i;
5972 
5973 	mtx_assert(&lk, MA_OWNED);
5974 	/*
5975 	 * If the pagedep is still linked onto the directory buffer
5976 	 * dependency chain, then some of the entries on the
5977 	 * pd_pendinghd list may not be committed to disk yet. In
5978 	 * this case, we will simply clear the NEWBLOCK flag and
5979 	 * let the pd_pendinghd list be processed when the pagedep
5980 	 * is next written. If the pagedep is no longer on the buffer
5981 	 * dependency chain, then all the entries on the pd_pendinghd
5982 	 * list are committed to disk and we can free them here.
5983 	 */
5984 	pagedep = newdirblk->db_pagedep;
5985 	pagedep->pd_state &= ~NEWBLOCK;
5986 	if ((pagedep->pd_state & ONWORKLIST) == 0)
5987 		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
5988 			free_diradd(dap, NULL);
5989 	/*
5990 	 * If no dependencies remain, the pagedep will be freed.
5991 	 */
5992 	for (i = 0; i < DAHASHSZ; i++)
5993 		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
5994 			break;
5995 	if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 &&
5996 	    LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
5997 		KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL,
5998 		    ("free_newdirblk: Freeing non-free pagedep %p", pagedep));
5999 		LIST_REMOVE(pagedep, pd_hash);
6000 		WORKITEM_FREE(pagedep, D_PAGEDEP);
6001 	}
6002 	/* Should only ever be one item in the list. */
6003 	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
6004 		WORKLIST_REMOVE(wk);
6005 		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
6006 	}
6007 	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
6008 }
6009 
6010 /*
6011  * Prepare an inode to be freed. The actual free operation is not
6012  * done until the zero'ed inode has been written to disk.
6013  */
6014 void
6015 softdep_freefile(pvp, ino, mode)
6016 	struct vnode *pvp;
6017 	ino_t ino;
6018 	int mode;
6019 {
6020 	struct inode *ip = VTOI(pvp);
6021 	struct inodedep *inodedep;
6022 	struct freefile *freefile;
6023 
6024 	/*
6025 	 * This sets up the inode de-allocation dependency.
6026 	 */
6027 	freefile = malloc(sizeof(struct freefile),
6028 		M_FREEFILE, M_SOFTDEP_FLAGS);
6029 	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
6030 	freefile->fx_mode = mode;
6031 	freefile->fx_oldinum = ino;
6032 	freefile->fx_devvp = ip->i_devvp;
6033 	LIST_INIT(&freefile->fx_jwork);
6034 	UFS_LOCK(ip->i_ump);
6035 	ip->i_fs->fs_pendinginodes += 1;
6036 	UFS_UNLOCK(ip->i_ump);
6037 
6038 	/*
6039 	 * If the inodedep does not exist, then the zero'ed inode has
6040 	 * been written to disk. If the allocated inode has never been
6041 	 * written to disk, then the on-disk inode is zero'ed. In either
6042 	 * case we can free the file immediately.  If the journal was
6043 	 * canceled before being written the inode will never make it to
6044 	 * disk and we must send the canceled journal entries to
6045 	 * ffs_freefile() to be cleared in conjunction with the bitmap.
6046 	 * Any blocks waiting for the inode to be written can be safely freed
6047 	 * here as it will never be written.
6048 	 */
6049 	ACQUIRE_LOCK(&lk);
6050 	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
6051 	/*
6052 	 * Remove this inode from the unlinked list and set
6053 	 * GOINGAWAY as appropriate to indicate that this inode
6054 	 * will never be written.
6055 	 */
6056 	if (inodedep && inodedep->id_state & UNLINKED) {
6057 		/*
6058 		 * Save the journal work to be freed with the bitmap
6059 		 * before we clear UNLINKED.  Otherwise it can be lost
6060 		 * if the inode block is written.
6061 		 */
6062 		handle_bufwait(inodedep, &freefile->fx_jwork);
6063 		clear_unlinked_inodedep(inodedep);
6064 		/* Re-acquire inodedep as we've dropped lk. */
6065 		inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
6066 	}
6067 	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
6068 		FREE_LOCK(&lk);
6069 		handle_workitem_freefile(freefile);
6070 		return;
6071 	}
6072 	if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0)
6073 		inodedep->id_state |= GOINGAWAY;
6074 	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
6075 	FREE_LOCK(&lk);
6076 	if (ip->i_number == ino)
6077 		ip->i_flag |= IN_MODIFIED;
6078 }
6079 
6080 /*
6081  * Check to see if an inode has never been written to disk. If
6082  * so free the inodedep and return success, otherwise return failure.
6083  * This routine must be called with splbio interrupts blocked.
6084  *
6085  * If we still have a bitmap dependency, then the inode has never
6086  * been written to disk. Drop the dependency as it is no longer
6087  * necessary since the inode is being deallocated. We set the
6088  * ALLCOMPLETE flags since the bitmap now properly shows that the
6089  * inode is not allocated. Even if the inode is actively being
6090  * written, it has been rolled back to its zero'ed state, so we
6091  * are ensured that a zero inode is what is on the disk. For short
6092  * lived files, this change will usually result in removing all the
6093  * dependencies from the inode so that it can be freed immediately.
6094  */
6095 static int
6096 check_inode_unwritten(inodedep)
6097 	struct inodedep *inodedep;
6098 {
6099 
6100 	mtx_assert(&lk, MA_OWNED);
6101 
6102 	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
6103 	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
6104 	    !LIST_EMPTY(&inodedep->id_bufwait) ||
6105 	    !LIST_EMPTY(&inodedep->id_inowait) ||
6106 	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
6107 	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
6108 	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
6109 	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
6110 	    inodedep->id_mkdiradd != NULL ||
6111 	    inodedep->id_nlinkdelta != 0)
6112 		return (0);
6113 	/*
6114 	 * Another process might be in initiate_write_inodeblock_ufs[12]
6115 	 * trying to allocate memory without holding "Softdep Lock".
6116 	 */
6117 	if ((inodedep->id_state & IOSTARTED) != 0 &&
6118 	    inodedep->id_savedino1 == NULL)
6119 		return (0);
6120 
6121 	if (inodedep->id_state & ONDEPLIST)
6122 		LIST_REMOVE(inodedep, id_deps);
6123 	inodedep->id_state &= ~ONDEPLIST;
6124 	inodedep->id_state |= ALLCOMPLETE;
6125 	inodedep->id_bmsafemap = NULL;
6126 	if (inodedep->id_state & ONWORKLIST)
6127 		WORKLIST_REMOVE(&inodedep->id_list);
6128 	if (inodedep->id_savedino1 != NULL) {
6129 		free(inodedep->id_savedino1, M_SAVEDINO);
6130 		inodedep->id_savedino1 = NULL;
6131 	}
6132 	if (free_inodedep(inodedep) == 0)
6133 		panic("check_inode_unwritten: busy inode");
6134 	return (1);
6135 }
6136 
6137 /*
6138  * Try to free an inodedep structure. Return 1 if it could be freed.
6139  */
6140 static int
6141 free_inodedep(inodedep)
6142 	struct inodedep *inodedep;
6143 {
6144 
6145 	mtx_assert(&lk, MA_OWNED);
6146 	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
6147 	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
6148 	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
6149 	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
6150 	    !LIST_EMPTY(&inodedep->id_bufwait) ||
6151 	    !LIST_EMPTY(&inodedep->id_inowait) ||
6152 	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
6153 	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
6154 	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
6155 	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
6156 	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
6157 	    inodedep->id_mkdiradd != NULL ||
6158 	    inodedep->id_nlinkdelta != 0 ||
6159 	    inodedep->id_savedino1 != NULL)
6160 		return (0);
6161 	if (inodedep->id_state & ONDEPLIST)
6162 		LIST_REMOVE(inodedep, id_deps);
6163 	LIST_REMOVE(inodedep, id_hash);
6164 	WORKITEM_FREE(inodedep, D_INODEDEP);
6165 	return (1);
6166 }
6167 
6168 /*
6169  * Free the block referenced by a freework structure.  The parent freeblks
6170  * structure is released and completed when the final cg bitmap reaches
6171  * the disk.  This routine may be freeing a jnewblk which never made it to
6172  * disk in which case we do not have to wait as the operation is undone
6173  * in memory immediately.
6174  */
6175 static void
6176 freework_freeblock(freework)
6177 	struct freework *freework;
6178 {
6179 	struct freeblks *freeblks;
6180 	struct jnewblk *jnewblk;
6181 	struct ufsmount *ump;
6182 	struct workhead wkhd;
6183 	struct fs *fs;
6184 	int pending;
6185 	int bsize;
6186 	int needj;
6187 
6188 	freeblks = freework->fw_freeblks;
6189 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
6190 	fs = ump->um_fs;
6191 	needj = freeblks->fb_list.wk_mp->mnt_kern_flag & MNTK_SUJ;
6192 	bsize = lfragtosize(fs, freework->fw_frags);
6193 	pending = btodb(bsize);
6194 	LIST_INIT(&wkhd);
6195 	/*
6196 	 * If we are canceling an existing jnewblk pass it to the free
6197 	 * routine, otherwise pass the freeblk which will ultimately
6198 	 * release the freeblks.  If we're not journaling, we can just
6199 	 * free the freeblks immediately.
6200 	 */
6201 	ACQUIRE_LOCK(&lk);
6202 	LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list);
6203 	jnewblk = freework->fw_jnewblk;
6204 	if (jnewblk != NULL) {
6205 		/* Could've already been canceled in indir_trunc(). */
6206 		if ((jnewblk->jn_state & GOINGAWAY) == 0)
6207 			cancel_jnewblk(jnewblk, &wkhd);
6208 		needj = 0;
6209 	} else if (needj)
6210 		WORKLIST_INSERT(&wkhd, &freework->fw_list);
6211 	freeblks->fb_chkcnt -= pending;
6212 	FREE_LOCK(&lk);
6213 	/*
6214 	 * extattr blocks don't show up in pending blocks.  XXX why?
6215 	 */
6216 	if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) {
6217 		UFS_LOCK(ump);
6218 		fs->fs_pendingblocks -= pending;
6219 		UFS_UNLOCK(ump);
6220 	}
6221 	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno,
6222 	    bsize, freeblks->fb_previousinum, &wkhd);
6223 	if (needj)
6224 		return;
6225 	/*
6226 	 * The jnewblk will be discarded and the bits in the map never
6227 	 * made it to disk.  We can immediately free the freeblk.
6228 	 */
6229 	ACQUIRE_LOCK(&lk);
6230 	handle_written_freework(freework);
6231 	FREE_LOCK(&lk);
6232 }
6233 
6234 /*
6235  * Start, continue, or finish the process of freeing an indirect block tree.
6236  * The free operation may be paused at any point with fw_off containing the
6237  * offset to restart from.  This enables us to implement some flow control
6238  * for large truncates which may fan out and generate a huge number of
6239  * dependencies.
6240  */
6241 static void
6242 handle_workitem_indirblk(freework)
6243 	struct freework *freework;
6244 {
6245 	struct freeblks *freeblks;
6246 	struct ufsmount *ump;
6247 	struct fs *fs;
6248 
6249 
6250 	freeblks = freework->fw_freeblks;
6251 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
6252 	fs = ump->um_fs;
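	/*
	 * fw_off records how far into the indirect block the truncation has
	 * progressed.  Once every pointer has been visited the block itself
	 * can be released; otherwise continue walking its contents.
	 */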
6253 	if (freework->fw_off == NINDIR(fs))
6254 		freework_freeblock(freework);
6255 	else
6256 		indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
6257 		    freework->fw_lbn);
6258 }
6259 
6260 /*
6261  * Called when a freework structure attached to a cg buf is written.  The
6262  * ref on either the parent or the freeblks structure is released and
6263  * either may be added to the worklist if it is the final ref.
6264  */
6265 static void
6266 handle_written_freework(freework)
6267 	struct freework *freework;
6268 {
6269 	struct freeblks *freeblks;
6270 	struct freework *parent;
6271 	struct jsegdep *jsegdep;
6272 	struct worklist *wk;
6273 	int needj;
6274 
6275 	needj = 0;
6276 	freeblks = freework->fw_freeblks;
6277 	parent = freework->fw_parent;
6278 	/*
6279 	 * SUJ needs to wait for the segment referencing freed indirect
6280 	 * blocks to expire so that we know the checker will not confuse
6281 	 * a re-allocated indirect block with its old contents.
6282 	 */
6283 	if (freework->fw_lbn <= -NDADDR &&
6284 	    freework->fw_list.wk_mp->mnt_kern_flag & MNTK_SUJ) {
6285 		LIST_FOREACH(wk, &freeblks->fb_jwork, wk_list)
6286 			if (wk->wk_type == D_JSEGDEP)
6287 				break;
6288 		if (wk) {
6289 			jsegdep = WK_JSEGDEP(wk);
6290 			LIST_INSERT_HEAD(&jsegdep->jd_seg->js_indirs,
6291 			    freework, fw_next);
6292 			WORKLIST_INSERT(INDIR_HASH(freework->fw_list.wk_mp,
6293 			    freework->fw_blkno), &freework->fw_list);
6294 			needj = 1;
6295 		}
6296 	}
6297 	if (parent) {
6298 		if (--parent->fw_ref != 0)
6299 			parent = NULL;
6300 		freeblks = NULL;
6301 	} else if (--freeblks->fb_ref != 0)
6302 		freeblks = NULL;
6303 	if (needj == 0)
6304 		WORKITEM_FREE(freework, D_FREEWORK);
6305 	/*
6306 	 * Don't delay these block frees or it takes an intolerable amount
6307 	 * of time to process truncates and free their journal entries.
6308 	 */
6309 	if (freeblks)
6310 		add_to_worklist(&freeblks->fb_list, 1);
6311 	if (parent)
6312 		add_to_worklist(&parent->fw_list, 1);
6313 }
6314 
6315 /*
6316  * This workitem routine performs the block de-allocation.
6317  * The workitem is added to the pending list after the updated
6318  * inode block has been written to disk.  As mentioned above,
6319  * checks regarding the number of blocks de-allocated (compared
6320  * to the number of blocks allocated for the file) are also
6321  * performed in this function.
6322  */
6323 static void
6324 handle_workitem_freeblocks(freeblks, flags)
6325 	struct freeblks *freeblks;
6326 	int flags;
6327 {
6328 	struct freework *freework;
6329 	struct worklist *wk;
6330 
6331 	KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd),
6332 	    ("handle_workitem_freeblocks: Journal entries not written."));
6333 	if (LIST_EMPTY(&freeblks->fb_freeworkhd)) {
6334 		handle_complete_freeblocks(freeblks);
6335 		return;
6336 	}
6337 	freeblks->fb_ref++;
6338 	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
6339 		KASSERT(wk->wk_type == D_FREEWORK,
6340 		    ("handle_workitem_freeblocks: Unknown type %s",
6341 		    TYPENAME(wk->wk_type)));
6342 		WORKLIST_REMOVE_UNLOCKED(wk);
6343 		freework = WK_FREEWORK(wk);
6344 		if (freework->fw_lbn <= -NDADDR)
6345 			handle_workitem_indirblk(freework);
6346 		else
6347 			freework_freeblock(freework);
6348 	}
6349 	ACQUIRE_LOCK(&lk);
6350 	if (--freeblks->fb_ref != 0)
6351 		freeblks = NULL;
6352 	FREE_LOCK(&lk);
6353 	if (freeblks)
6354 		handle_complete_freeblocks(freeblks);
6355 }
6356 
6357 /*
6358  * Once all of the freework workitems are complete we can retire the
6359  * freeblocks dependency and any journal work awaiting completion.  This
6360  * cannot be called until all other dependencies are stable on disk.
6361  */
6362 static void
6363 handle_complete_freeblocks(freeblks)
6364 	struct freeblks *freeblks;
6365 {
6366 	struct inode *ip;
6367 	struct vnode *vp;
6368 	struct fs *fs;
6369 	struct ufsmount *ump;
6370 	int flags;
6371 
6372 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
6373 	fs = ump->um_fs;
6374 	flags = LK_NOWAIT;
6375 
6376 	/*
6377 	 * If we still have not finished background cleanup, then check
6378 	 * to see if the block count needs to be adjusted.
6379 	 */
6380 	if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 &&
6381 	    ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
6382 	    (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) {
6383 		ip = VTOI(vp);
6384 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt);
6385 		ip->i_flag |= IN_CHANGE;
6386 		vput(vp);
6387 	}
6388 
6389 	if (!(freeblks->fb_chkcnt == 0 ||
6390 	    ((fs->fs_flags & FS_UNCLEAN) != 0 && (flags & LK_NOWAIT) == 0)))
6391 		printf(
6392 	"handle_complete_freeblocks: inode %ju block count %jd\n",
6393 		   (uintmax_t)freeblks->fb_previousinum,
6394 		   (intmax_t)freeblks->fb_chkcnt);
6395 
6396 	ACQUIRE_LOCK(&lk);
6397 	/*
6398 	 * All of the freeblock deps must be complete prior to this call
6399 	 * so it's now safe to complete earlier outstanding journal entries.
6400 	 */
6401 	handle_jwork(&freeblks->fb_jwork);
6402 	WORKITEM_FREE(freeblks, D_FREEBLKS);
6403 	FREE_LOCK(&lk);
6404 }
6405 
6406 /*
6407  * Release blocks associated with the inode ip and stored in the indirect
6408  * block dbn. If level is greater than SINGLE, the block is an indirect block
6409  * and recursive calls to indirtrunc must be used to cleanse other indirect
6410  * blocks.
6411  */
6412 static void
6413 indir_trunc(freework, dbn, lbn)
6414 	struct freework *freework;
6415 	ufs2_daddr_t dbn;
6416 	ufs_lbn_t lbn;
6417 {
6418 	struct freework *nfreework;
6419 	struct workhead wkhd;
6420 	struct jnewblk *jnewblkn;
6421 	struct jnewblk *jnewblk;
6422 	struct freeblks *freeblks;
6423 	struct buf *bp;
6424 	struct fs *fs;
6425 	struct worklist *wkn;
6426 	struct worklist *wk;
6427 	struct indirdep *indirdep;
6428 	struct ufsmount *ump;
6429 	ufs1_daddr_t *bap1 = 0;
6430 	ufs2_daddr_t nb, nnb, *bap2 = 0;
6431 	ufs_lbn_t lbnadd;
6432 	int i, nblocks, ufs1fmt;
6433 	int fs_pendingblocks;
6434 	int freedeps;
6435 	int needj;
6436 	int level;
6437 	int cnt;
6438 
6439 	LIST_INIT(&wkhd);
6440 	level = lbn_level(lbn);
6441 	if (level == -1)
6442 		panic("indir_trunc: Invalid lbn %jd\n", lbn);
6443 	freeblks = freework->fw_freeblks;
6444 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
6445 	fs = ump->um_fs;
6446 	fs_pendingblocks = 0;
6447 	freedeps = 0;
6448 	needj = UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ;
6449 	lbnadd = lbn_offset(fs, level);
6450 	/*
6451 	 * Get buffer of block pointers to be freed. This routine is not
6452 	 * called until the zero'ed inode has been written, so it is safe
6453 	 * to free blocks as they are encountered. Because the inode has
6454 	 * been zero'ed, calls to bmap on these blocks will fail. So, we
6455 	 * have to use the on-disk address and the block device for the
6456 	 * filesystem to look them up. If the file was deleted before its
6457 	 * indirect blocks were all written to disk, the routine that set
6458 	 * us up (deallocate_dependencies) will have arranged to leave
6459 	 * a complete copy of the indirect block in memory for our use.
6460 	 * Otherwise we have to read the blocks in from the disk.
6461 	 */
6462 #ifdef notyet
6463 	bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
6464 	    GB_NOCREAT);
6465 #else
6466 	bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
6467 #endif
6468 	ACQUIRE_LOCK(&lk);
6469 	if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
6470 		if (wk->wk_type != D_INDIRDEP ||
6471 		    (wk->wk_state & GOINGAWAY) == 0)
6472 			panic("indir_trunc: lost indirdep %p", wk);
6473 		indirdep = WK_INDIRDEP(wk);
6474 		LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list);
6475 		LIST_FOREACH_SAFE(jnewblk, &indirdep->ir_jnewblkhd,
6476 		    jn_indirdeps, jnewblkn) {
6477 			/*
6478 			 * XXX This cancel may cause some lengthy delay
6479 			 * before the record is reclaimed below.
6480 			 */
6481 			LIST_REMOVE(jnewblk, jn_indirdeps);
6482 			cancel_jnewblk(jnewblk, &wkhd);
6483 		}
6484 
6485 		free_indirdep(indirdep);
6486 		if (!LIST_EMPTY(&bp->b_dep))
6487 			panic("indir_trunc: dangling dep %p",
6488 			    LIST_FIRST(&bp->b_dep));
6489 		ump->um_numindirdeps -= 1;
6490 		FREE_LOCK(&lk);
6491 	} else {
6492 #ifdef notyet
6493 		if (bp)
6494 			brelse(bp);
6495 #endif
6496 		FREE_LOCK(&lk);
6497 		if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
6498 		    NOCRED, &bp) != 0) {
6499 			brelse(bp);
6500 			return;
6501 		}
6502 	}
6503 	/*
6504 	 * Recursively free indirect blocks.
6505 	 */
6506 	if (ump->um_fstype == UFS1) {
6507 		ufs1fmt = 1;
6508 		bap1 = (ufs1_daddr_t *)bp->b_data;
6509 	} else {
6510 		ufs1fmt = 0;
6511 		bap2 = (ufs2_daddr_t *)bp->b_data;
6512 	}
6513 
6514 	/*
6515 	 * Reclaim indirect blocks which never made it to disk.
6516 	 */
6517 	cnt = 0;
6518 	LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) {
6519 		if (wk->wk_type != D_JNEWBLK)
6520 			continue;
6521 		/* XXX Is the lock necessary here for more than an assert? */
6522 		ACQUIRE_LOCK(&lk);
6523 		WORKLIST_REMOVE(wk);
6524 		FREE_LOCK(&lk);
6525 		jnewblk = WK_JNEWBLK(wk);
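		/*
		 * Recover the index of this journal record's block within
		 * the indirect block from its logical block number.  Data
		 * blocks (positive lbn) and indirect blocks (negative lbn)
		 * are encoded differently.
		 */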
6526 		if (jnewblk->jn_lbn > 0)
6527 			i = (jnewblk->jn_lbn - -lbn) / lbnadd;
6528 		else
6529 			i = (-(jnewblk->jn_lbn + level - 1) - -(lbn + level)) /
6530 			    lbnadd;
6531 		KASSERT(i >= 0 && i < NINDIR(fs),
6532 		    ("indir_trunc: Index out of range %d parent %jd lbn %jd level %d",
6533 		    i, lbn, jnewblk->jn_lbn, level));
6534 		/* Clear the pointer so it isn't found below. */
6535 		if (ufs1fmt) {
6536 			nb = bap1[i];
6537 			bap1[i] = 0;
6538 		} else {
6539 			nb = bap2[i];
6540 			bap2[i] = 0;
6541 		}
6542 		KASSERT(nb == jnewblk->jn_blkno,
6543 		    ("indir_trunc: Block mismatch %jd != %jd",
6544 		    nb, jnewblk->jn_blkno));
6545 		if (level != 0) {
6546 			ufs_lbn_t nlbn;
6547 
6548 			nlbn = (lbn + 1) - (i * lbnadd);
6549 			nfreework = newfreework(ump, freeblks, freework,
6550 			    nlbn, nb, fs->fs_frag, 0);
6551 			nfreework->fw_jnewblk = jnewblk;
6552 			freedeps++;
6553 			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
6554 		} else {
6555 			struct workhead freewk;
6556 
6557 			LIST_INIT(&freewk);
6558 			ACQUIRE_LOCK(&lk);
6559 			WORKLIST_INSERT(&freewk, wk);
6560 			FREE_LOCK(&lk);
6561 			ffs_blkfree(ump, fs, freeblks->fb_devvp,
6562 			    jnewblk->jn_blkno, fs->fs_bsize,
6563 			    freeblks->fb_previousinum, &freewk);
6564 		}
6565 		cnt++;
6566 	}
6567 	ACQUIRE_LOCK(&lk);
6568 	/* Any remaining journal work can be completed with freeblks. */
6569 	jwork_move(&freeblks->fb_jwork, &wkhd);
6570 	FREE_LOCK(&lk);
6571 	nblocks = btodb(fs->fs_bsize);
6572 	if (ufs1fmt)
6573 		nb = bap1[0];
6574 	else
6575 		nb = bap2[0];
6576 	nfreework = freework;
6577 	/*
6578 	 * Reclaim on-disk blocks.
6579 	 */
6580 	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
6581 		if (i != NINDIR(fs) - 1) {
6582 			if (ufs1fmt)
6583 				nnb = bap1[i+1];
6584 			else
6585 				nnb = bap2[i+1];
6586 		} else
6587 			nnb = 0;
6588 		if (nb == 0)
6589 			continue;
6590 		cnt++;
6591 		if (level != 0) {
6592 			ufs_lbn_t nlbn;
6593 
6594 			nlbn = (lbn + 1) - (i * lbnadd);
6595 			if (needj != 0) {
6596 				nfreework = newfreework(ump, freeblks, freework,
6597 				    nlbn, nb, fs->fs_frag, 0);
6598 				freedeps++;
6599 			}
6600 			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
6601 		} else {
6602 			struct freedep *freedep;
6603 
6604 			/*
6605 			 * Attempt to aggregate freedep dependencies for
6606 			 * all blocks being released to the same CG.
6607 			 */
6608 			LIST_INIT(&wkhd);
6609 			if (needj != 0 &&
6610 			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
6611 				freedep = newfreedep(freework);
6612 				WORKLIST_INSERT_UNLOCKED(&wkhd,
6613 				    &freedep->fd_list);
6614 				freedeps++;
6615 			}
6616 			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
6617 			    fs->fs_bsize, freeblks->fb_previousinum, &wkhd);
6618 		}
6619 	}
6620 	if (level == 0)
6621 		fs_pendingblocks = (nblocks * cnt);
6622 	/*
6623 	 * If we're not journaling we can free the indirect now.  Otherwise
6624 	 * set up the ref counts and offset so this indirect can be completed
6625 	 * when its children are free.
6626 	 */
6627 	if (needj == 0) {
6628 		fs_pendingblocks += nblocks;
6629 		dbn = dbtofsb(fs, dbn);
6630 		ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
6631 		    freeblks->fb_previousinum, NULL);
6632 		ACQUIRE_LOCK(&lk);
6633 		freeblks->fb_chkcnt -= fs_pendingblocks;
6634 		if (freework->fw_blkno == dbn)
6635 			handle_written_freework(freework);
6636 		FREE_LOCK(&lk);
6637 		freework = NULL;
6638 	} else {
6639 		ACQUIRE_LOCK(&lk);
6640 		freework->fw_off = i;
6641 		freework->fw_ref += freedeps;
6642 		freework->fw_ref -= NINDIR(fs) + 1;
6643 		if (freework->fw_ref != 0)
6644 			freework = NULL;
6645 		freeblks->fb_chkcnt -= fs_pendingblocks;
6646 		FREE_LOCK(&lk);
6647 	}
6648 	if (fs_pendingblocks) {
6649 		UFS_LOCK(ump);
6650 		fs->fs_pendingblocks -= fs_pendingblocks;
6651 		UFS_UNLOCK(ump);
6652 	}
6653 	bp->b_flags |= B_INVAL | B_NOCACHE;
6654 	brelse(bp);
6655 	if (freework)
6656 		handle_workitem_indirblk(freework);
6657 	return;
6658 }
6659 
6660 /*
6661  * Cancel an allocindir when it is removed via truncation.
6662  */
6663 static void
6664 cancel_allocindir(aip, inodedep, freeblks)
6665 	struct allocindir *aip;
6666 	struct inodedep *inodedep;
6667 	struct freeblks *freeblks;
6668 {
6669 	struct jnewblk *jnewblk;
6670 	struct newblk *newblk;
6671 
6672 	/*
6673 	 * If the journal hasn't been written the jnewblk must be passed
6674 	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
6675 	 * this by linking the journal dependency into the indirdep to be
6676 	 * freed when indir_trunc() is called.  If the journal has already
6677 	 * been written we can simply reclaim the journal space when the
6678 	 * freeblks work is complete.
6679 	 */
6680 	LIST_REMOVE(aip, ai_next);
6681 	newblk = (struct newblk *)aip;
6682 	if (newblk->nb_jnewblk == NULL) {
6683 		if (cancel_newblk(newblk, NULL, &freeblks->fb_jwork))
6684 			panic("cancel_allocindir: Unexpected dependency.");
6685 	} else {
6686 		jnewblk = cancel_newblk(newblk, &aip->ai_indirdep->ir_list,
6687 		    &aip->ai_indirdep->ir_jwork);
6688 		if (jnewblk)
6689 			LIST_INSERT_HEAD(&aip->ai_indirdep->ir_jnewblkhd,
6690 			    jnewblk, jn_indirdeps);
6691 	}
6692 	if (inodedep && inodedep->id_state & DEPCOMPLETE)
6693 		WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list);
6694 	else
6695 		free_newblk(newblk);
6696 }
6697 
6698 /*
6699  * Create the mkdir dependencies for . and .. in a new directory.  Link them
6700  * in to a newdirblk so any subsequent additions are tracked properly.  The
6701  * caller is responsible for adding the mkdir1 dependency to the journal
6702  * and updating id_mkdiradd.  This function returns with lk held.
6703  */
6704 static struct mkdir *
6705 setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
6706 	struct diradd *dap;
6707 	ino_t newinum;
6708 	ino_t dinum;
6709 	struct buf *newdirbp;
6710 	struct mkdir **mkdirp;
6711 {
6712 	struct newblk *newblk;
6713 	struct pagedep *pagedep;
6714 	struct inodedep *inodedep;
6715 	struct newdirblk *newdirblk = 0;
6716 	struct mkdir *mkdir1, *mkdir2;
6717 	struct worklist *wk;
6718 	struct jaddref *jaddref;
6719 	struct mount *mp;
6720 
6721 	mp = dap->da_list.wk_mp;
6722 	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
6723 	    M_SOFTDEP_FLAGS);
6724 	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
6725 	LIST_INIT(&newdirblk->db_mkdir);
6726 	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
6727 	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
6728 	mkdir1->md_state = ATTACHED | MKDIR_BODY;
6729 	mkdir1->md_diradd = dap;
6730 	mkdir1->md_jaddref = NULL;
6731 	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
6732 	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
6733 	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
6734 	mkdir2->md_diradd = dap;
6735 	mkdir2->md_jaddref = NULL;
6736 	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) {
6737 		mkdir1->md_state |= DEPCOMPLETE;
6738 		mkdir2->md_state |= DEPCOMPLETE;
6739 	}
6740 	/*
6741 	 * Dependency on "." and ".." being written to disk.
6742 	 */
6743 	mkdir1->md_buf = newdirbp;
6744 	ACQUIRE_LOCK(&lk);
6745 	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
6746 	/*
6747 	 * We must link the pagedep, allocdirect, and newdirblk for
6748 	 * the initial file page so the pointer to the new directory
6749 	 * is not written until the directory contents are live and
6750 	 * any subsequent additions are not marked live until the
6751 	 * block is reachable via the inode.
6752 	 */
6753 	if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0)
6754 		panic("setup_newdir: lost pagedep");
6755 	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
6756 		if (wk->wk_type == D_ALLOCDIRECT)
6757 			break;
6758 	if (wk == NULL)
6759 		panic("setup_newdir: lost allocdirect");
6760 	newblk = WK_NEWBLK(wk);
6761 	pagedep->pd_state |= NEWBLOCK;
6762 	pagedep->pd_newdirblk = newdirblk;
6763 	newdirblk->db_pagedep = pagedep;
6764 	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
6765 	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
6766 	/*
6767 	 * Look up the inodedep for the parent directory so that we
6768 	 * can link mkdir2 into the pending dotdot jaddref or
6769 	 * the inode write if there is none.  If the inode is
6770 	 * ALLCOMPLETE and no jaddref is present all dependencies have
6771 	 * been satisfied and mkdir2 can be freed.
6772 	 */
6773 	inodedep_lookup(mp, dinum, 0, &inodedep);
6774 	if (mp->mnt_kern_flag & MNTK_SUJ) {
6775 		if (inodedep == NULL)
6776 			panic("setup_newdir: Lost parent.");
6777 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
6778 		    inoreflst);
6779 		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
6780 		    (jaddref->ja_state & MKDIR_PARENT),
6781 		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
6782 		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
6783 		mkdir2->md_jaddref = jaddref;
6784 		jaddref->ja_mkdir = mkdir2;
6785 	} else if (inodedep == NULL ||
6786 	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
6787 		dap->da_state &= ~MKDIR_PARENT;
6788 		WORKITEM_FREE(mkdir2, D_MKDIR);
6789 	} else {
6790 		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
6791 		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
6792 	}
6793 	*mkdirp = mkdir2;
6794 
6795 	return (mkdir1);
6796 }
6797 
6798 /*
6799  * Directory entry addition dependencies.
6800  *
6801  * When adding a new directory entry, the inode (with its incremented link
6802  * count) must be written to disk before the directory entry's pointer to it.
6803  * Also, if the inode is newly allocated, the corresponding freemap must be
6804  * updated (on disk) before the directory entry's pointer. These requirements
6805  * are met via undo/redo on the directory entry's pointer, which consists
6806  * simply of the inode number.
6807  *
6808  * As directory entries are added and deleted, the free space within a
6809  * directory block can become fragmented.  The ufs filesystem will compact
6810  * a fragmented directory block to make space for a new entry. When this
6811  * occurs, the offsets of previously added entries change. Any "diradd"
6812  * dependency structures corresponding to these entries must be updated with
6813  * the new offsets.
6814  */
6815 
6816 /*
6817  * This routine is called after the in-memory inode's link
6818  * count has been incremented, but before the directory entry's
6819  * pointer to the inode has been set.
6820  */
6821 int
6822 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
6823 	struct buf *bp;		/* buffer containing directory block */
6824 	struct inode *dp;	/* inode for directory */
6825 	off_t diroffset;	/* offset of new entry in directory */
6826 	ino_t newinum;		/* inode referenced by new directory entry */
6827 	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
6828 	int isnewblk;		/* entry is in a newly allocated block */
6829 {
6830 	int offset;		/* offset of new entry within directory block */
6831 	ufs_lbn_t lbn;		/* block in directory containing new entry */
6832 	struct fs *fs;
6833 	struct diradd *dap;
6834 	struct newblk *newblk;
6835 	struct pagedep *pagedep;
6836 	struct inodedep *inodedep;
6837 	struct newdirblk *newdirblk = NULL;
6838 	struct mkdir *mkdir1, *mkdir2;
6839 	struct jaddref *jaddref;
6840 	struct mount *mp;
6841 	int isindir;
6842 
6843 	/*
6844 	 * Whiteouts have no dependencies.
6845 	 */
6846 	if (newinum == WINO) {
6847 		if (newdirbp != NULL)
6848 			bdwrite(newdirbp);
6849 		return (0);
6850 	}
6851 	jaddref = NULL;
6852 	mkdir1 = mkdir2 = NULL;
6853 	mp = UFSTOVFS(dp->i_ump);
6854 	fs = dp->i_fs;
6855 	lbn = lblkno(fs, diroffset);
6856 	offset = blkoff(fs, diroffset);
6857 	dap = malloc(sizeof(struct diradd), M_DIRADD,
6858 		M_SOFTDEP_FLAGS|M_ZERO);
6859 	workitem_alloc(&dap->da_list, D_DIRADD, mp);
6860 	dap->da_offset = offset;
6861 	dap->da_newinum = newinum;
6862 	dap->da_state = ATTACHED;
6863 	LIST_INIT(&dap->da_jwork);
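	/*
	 * If this entry begins a newly allocated directory block (or
	 * fragment, for blocks in the direct range), allocate a newdirblk
	 * so additions to the block are tracked until the block is
	 * referenced by the inode.
	 */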
6864 	isindir = bp->b_lblkno >= NDADDR;
6865 	if (isnewblk &&
6866 	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
6867 		newdirblk = malloc(sizeof(struct newdirblk),
6868 		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
6869 		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
6870 		LIST_INIT(&newdirblk->db_mkdir);
6871 	}
6872 	/*
6873 	 * If we're creating a new directory, set up the dependencies and set
6874 	 * the dap state to wait for them.  Otherwise it's DEPCOMPLETE and
6875 	 * we can move on.
6876 	 */
6877 	if (newdirbp == NULL) {
6878 		dap->da_state |= DEPCOMPLETE;
6879 		ACQUIRE_LOCK(&lk);
6880 	} else {
6881 		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
6882 		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
6883 		    &mkdir2);
6884 	}
6885 	/*
6886 	 * Link into parent directory pagedep to await its being written.
6887 	 */
6888 	if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0)
6889 		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
6890 #ifdef DEBUG
6891 	if (diradd_lookup(pagedep, offset) != NULL)
6892 		panic("softdep_setup_directory_add: %p already at off %d\n",
6893 		    diradd_lookup(pagedep, offset), offset);
6894 #endif
6895 	dap->da_pagedep = pagedep;
6896 	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
6897 	    da_pdlist);
6898 	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
6899 	/*
6900 	 * If we're journaling, link the diradd into the jaddref so it
6901 	 * may be completed after the journal entry is written.  Otherwise,
6902 	 * link the diradd into its inodedep.  If the inode is not yet
6903 	 * written, place it on the bufwait list; otherwise do the post-inode
6904 	 * write processing to put it on the id_pendinghd list.
6905 	 */
6906 	if (mp->mnt_kern_flag & MNTK_SUJ) {
6907 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
6908 		    inoreflst);
6909 		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
6910 		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
6911 		jaddref->ja_diroff = diroffset;
6912 		jaddref->ja_diradd = dap;
6913 		add_to_journal(&jaddref->ja_list);
6914 	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
6915 		diradd_inode_written(dap, inodedep);
6916 	else
6917 		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
6918 	/*
6919 	 * Add the journal entries for . and .. links now that the primary
6920 	 * link is written.
6921 	 */
6922 	if (mkdir1 != NULL && mp->mnt_kern_flag & MNTK_SUJ) {
6923 		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
6924 		    inoreflst, if_deps);
6925 		KASSERT(jaddref != NULL &&
6926 		    jaddref->ja_ino == jaddref->ja_parent &&
6927 		    (jaddref->ja_state & MKDIR_BODY),
6928 		    ("softdep_setup_directory_add: bad dot jaddref %p",
6929 		    jaddref));
6930 		mkdir1->md_jaddref = jaddref;
6931 		jaddref->ja_mkdir = mkdir1;
6932 		/*
6933 		 * It is important that the dotdot journal entry
6934 		 * is added prior to the dot entry since dot writes
6935 		 * both the dot and dotdot links.  These both must
6936 		 * be added after the primary link for the journal
6937 		 * to remain consistent.
6938 		 */
6939 		add_to_journal(&mkdir2->md_jaddref->ja_list);
6940 		add_to_journal(&jaddref->ja_list);
6941 	}
6942 	/*
6943 	 * If we are adding a new directory, remember this diradd so that if
6944 	 * we rename it we can keep the dot and dotdot dependencies.  If
6945 	 * we are adding a new name for an inode that has a mkdiradd, we
6946 	 * must be in a rename and have to move the dot and dotdot
6947 	 * dependencies to this new name.  The old name is being orphaned
6948 	 * soon.
6949 	 */
6950 	if (mkdir1 != NULL) {
6951 		if (inodedep->id_mkdiradd != NULL)
6952 			panic("softdep_setup_directory_add: Existing mkdir");
6953 		inodedep->id_mkdiradd = dap;
6954 	} else if (inodedep->id_mkdiradd)
6955 		merge_diradd(inodedep, dap);
6956 	if (newdirblk) {
6957 		/*
6958 		 * There is nothing to do if we are already tracking
6959 		 * this block.
6960 		 */
6961 		if ((pagedep->pd_state & NEWBLOCK) != 0) {
6962 			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
6963 			FREE_LOCK(&lk);
6964 			return (0);
6965 		}
6966 		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
6967 		    == 0)
6968 			panic("softdep_setup_directory_add: lost entry");
6969 		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
6970 		pagedep->pd_state |= NEWBLOCK;
6971 		pagedep->pd_newdirblk = newdirblk;
6972 		newdirblk->db_pagedep = pagedep;
6973 		FREE_LOCK(&lk);
6974 		/*
6975 		 * If we extended into an indirect signal direnter to sync.
6976 		 */
6977 		if (isindir)
6978 			return (1);
6979 		return (0);
6980 	}
6981 	FREE_LOCK(&lk);
6982 	return (0);
6983 }
6984 
6985 /*
6986  * This procedure is called to change the offset of a directory
6987  * entry when compacting a directory block, which must be owned
6988  * exclusively by the caller. Note that the actual entry movement
6989  * must be done in this procedure to ensure that no I/O completions
6990  * occur while the move is in progress.
6991  */
6992 void
6993 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
6994 	struct buf *bp;		/* Buffer holding directory block. */
6995 	struct inode *dp;	/* inode for directory */
6996 	caddr_t base;		/* address of dp->i_offset */
6997 	caddr_t oldloc;		/* address of old directory location */
6998 	caddr_t newloc;		/* address of new directory location */
6999 	int entrysize;		/* size of directory entry */
7000 {
7001 	int offset, oldoffset, newoffset;
7002 	struct pagedep *pagedep;
7003 	struct jmvref *jmvref;
7004 	struct diradd *dap;
7005 	struct direct *de;
7006 	struct mount *mp;
7007 	ufs_lbn_t lbn;
7008 	int flags;
7009 
7010 	mp = UFSTOVFS(dp->i_ump);
7011 	de = (struct direct *)oldloc;
7012 	jmvref = NULL;
7013 	flags = 0;
7014 	/*
7015 	 * Moves are always journaled as it would be too complex to
7016 	 * determine if any affected adds or removes are present in the
7017 	 * journal.
7018 	 */
7019 	if (mp->mnt_kern_flag & MNTK_SUJ)  {
7020 		flags = DEPALLOC;
7021 		jmvref = newjmvref(dp, de->d_ino,
7022 		    dp->i_offset + (oldloc - base),
7023 		    dp->i_offset + (newloc - base));
7024 	}
7025 	lbn = lblkno(dp->i_fs, dp->i_offset);
7026 	offset = blkoff(dp->i_fs, dp->i_offset);
7027 	oldoffset = offset + (oldloc - base);
7028 	newoffset = offset + (newloc - base);
7029 	ACQUIRE_LOCK(&lk);
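	/*
	 * If no pagedep exists there are no diradd dependencies to adjust.
	 * When journaling, the freshly allocated pagedep is attached to the
	 * buffer so the jmvref recorded below can reference it.
	 */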
7030 	if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) {
7031 		if (pagedep)
7032 			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
7033 		goto done;
7034 	}
7035 	dap = diradd_lookup(pagedep, oldoffset);
7036 	if (dap) {
7037 		dap->da_offset = newoffset;
7038 		newoffset = DIRADDHASH(newoffset);
7039 		oldoffset = DIRADDHASH(oldoffset);
7040 		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
7041 		    newoffset != oldoffset) {
7042 			LIST_REMOVE(dap, da_pdlist);
7043 			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
7044 			    dap, da_pdlist);
7045 		}
7046 	}
7047 done:
7048 	if (jmvref) {
7049 		jmvref->jm_pagedep = pagedep;
7050 		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
7051 		add_to_journal(&jmvref->jm_list);
7052 	}
7053 	bcopy(oldloc, newloc, entrysize);
7054 	FREE_LOCK(&lk);
7055 }
7056 
7057 /*
7058  * Move the mkdir dependencies and journal work from one diradd to another
7059  * when renaming a directory.  The new name must depend on the mkdir deps
7060  * completing as the old name did.  Directories can only have one valid link
7061  * at a time so one must be canonical.
7062  */
7063 static void
7064 merge_diradd(inodedep, newdap)
7065 	struct inodedep *inodedep;
7066 	struct diradd *newdap;
7067 {
7068 	struct diradd *olddap;
7069 	struct mkdir *mkdir, *nextmd;
7070 	short state;
7071 
7072 	olddap = inodedep->id_mkdiradd;
7073 	inodedep->id_mkdiradd = newdap;
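	/*
	 * If the old diradd is still waiting on mkdir writes, transfer the
	 * MKDIR_PARENT and MKDIR_BODY state to the new diradd so the new
	 * name waits on the same dependencies.
	 */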
7074 	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
7075 		newdap->da_state &= ~DEPCOMPLETE;
7076 		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
7077 			nextmd = LIST_NEXT(mkdir, md_mkdirs);
7078 			if (mkdir->md_diradd != olddap)
7079 				continue;
7080 			mkdir->md_diradd = newdap;
7081 			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
7082 			newdap->da_state |= state;
7083 			olddap->da_state &= ~state;
7084 			if ((olddap->da_state &
7085 			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
7086 				break;
7087 		}
7088 		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
7089 			panic("merge_diradd: unfound ref");
7090 	}
7091 	/*
7092 	 * Any mkdir related journal items are not safe to be freed until
7093 	 * the new name is stable.
7094 	 */
7095 	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
7096 	olddap->da_state |= DEPCOMPLETE;
7097 	complete_diradd(olddap);
7098 }
7099 
7100 /*
7101  * Move the diradd to the pending list when all diradd dependencies are
7102  * complete.
7103  */
7104 static void
7105 complete_diradd(dap)
7106 	struct diradd *dap;
7107 {
7108 	struct pagedep *pagedep;
7109 
7110 	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
7111 		if (dap->da_state & DIRCHG)
7112 			pagedep = dap->da_previous->dm_pagedep;
7113 		else
7114 			pagedep = dap->da_pagedep;
7115 		LIST_REMOVE(dap, da_pdlist);
7116 		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
7117 	}
7118 }
7119 
7120 /*
7121  * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
7122  * add entries and conditionally journal the remove.
7123  */
7124 static void
7125 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
7126 	struct diradd *dap;
7127 	struct dirrem *dirrem;
7128 	struct jremref *jremref;
7129 	struct jremref *dotremref;
7130 	struct jremref *dotdotremref;
7131 {
7132 	struct inodedep *inodedep;
7133 	struct jaddref *jaddref;
7134 	struct inoref *inoref;
7135 	struct mkdir *mkdir;
7136 
7137 	/*
7138 	 * If no remove references were allocated, we're on a non-journaled
7139 	 * filesystem and can skip the cancel step.
7140 	 */
7141 	if (jremref == NULL) {
7142 		free_diradd(dap, NULL);
7143 		return;
7144 	}
7145 	/*
7146 	 * Cancel the primary name and free it if it does not require
7147 	 * journaling.
7148 	 */
7149 	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
7150 	    0, &inodedep) != 0) {
7151 		/* Abort the addref that references this diradd.  */
7152 		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
7153 			if (inoref->if_list.wk_type != D_JADDREF)
7154 				continue;
7155 			jaddref = (struct jaddref *)inoref;
7156 			if (jaddref->ja_diradd != dap)
7157 				continue;
7158 			if (cancel_jaddref(jaddref, inodedep,
7159 			    &dirrem->dm_jwork) == 0) {
7160 				free_jremref(jremref);
7161 				jremref = NULL;
7162 			}
7163 			break;
7164 		}
7165 	}
7166 	/*
7167 	 * Cancel subordinate names and free them if they do not require
7168 	 * journaling.
7169 	 */
7170 	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
7171 		LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
7172 			if (mkdir->md_diradd != dap)
7173 				continue;
7174 			if ((jaddref = mkdir->md_jaddref) == NULL)
7175 				continue;
7176 			mkdir->md_jaddref = NULL;
7177 			if (mkdir->md_state & MKDIR_PARENT) {
7178 				if (cancel_jaddref(jaddref, NULL,
7179 				    &dirrem->dm_jwork) == 0) {
7180 					free_jremref(dotdotremref);
7181 					dotdotremref = NULL;
7182 				}
7183 			} else {
7184 				if (cancel_jaddref(jaddref, inodedep,
7185 				    &dirrem->dm_jwork) == 0) {
7186 					free_jremref(dotremref);
7187 					dotremref = NULL;
7188 				}
7189 			}
7190 		}
7191 	}
7192 
7193 	if (jremref)
7194 		journal_jremref(dirrem, jremref, inodedep);
7195 	if (dotremref)
7196 		journal_jremref(dirrem, dotremref, inodedep);
7197 	if (dotdotremref)
7198 		journal_jremref(dirrem, dotdotremref, NULL);
7199 	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
7200 	free_diradd(dap, &dirrem->dm_jwork);
7201 }
7202 
7203 /*
7204  * Free a diradd dependency structure. This routine must be called
7205  * with splbio interrupts blocked.
7206  */
7207 static void
7208 free_diradd(dap, wkhd)
7209 	struct diradd *dap;
7210 	struct workhead *wkhd;
7211 {
7212 	struct dirrem *dirrem;
7213 	struct pagedep *pagedep;
7214 	struct inodedep *inodedep;
7215 	struct mkdir *mkdir, *nextmd;
7216 
7217 	mtx_assert(&lk, MA_OWNED);
7218 	LIST_REMOVE(dap, da_pdlist);
7219 	if (dap->da_state & ONWORKLIST)
7220 		WORKLIST_REMOVE(&dap->da_list);
7221 	if ((dap->da_state & DIRCHG) == 0) {
7222 		pagedep = dap->da_pagedep;
7223 	} else {
7224 		dirrem = dap->da_previous;
7225 		pagedep = dirrem->dm_pagedep;
7226 		dirrem->dm_dirinum = pagedep->pd_ino;
7227 		dirrem->dm_state |= COMPLETE;
7228 		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
7229 			add_to_worklist(&dirrem->dm_list, 0);
7230 	}
7231 	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
7232 	    0, &inodedep) != 0)
7233 		if (inodedep->id_mkdiradd == dap)
7234 			inodedep->id_mkdiradd = NULL;
7235 	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
7236 		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
7237 			nextmd = LIST_NEXT(mkdir, md_mkdirs);
7238 			if (mkdir->md_diradd != dap)
7239 				continue;
7240 			dap->da_state &=
7241 			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
7242 			LIST_REMOVE(mkdir, md_mkdirs);
7243 			if (mkdir->md_state & ONWORKLIST)
7244 				WORKLIST_REMOVE(&mkdir->md_list);
7245 			if (mkdir->md_jaddref != NULL)
7246 				panic("free_diradd: Unexpected jaddref");
7247 			WORKITEM_FREE(mkdir, D_MKDIR);
7248 			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
7249 				break;
7250 		}
7251 		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
7252 			panic("free_diradd: unfound ref");
7253 	}
7254 	if (inodedep)
7255 		free_inodedep(inodedep);
7256 	/*
7257 	 * Free any journal segments waiting for the directory write.
7258 	 */
7259 	handle_jwork(&dap->da_jwork);
7260 	WORKITEM_FREE(dap, D_DIRADD);
7261 }
7262 
7263 /*
7264  * Directory entry removal dependencies.
7265  *
7266  * When removing a directory entry, the entry's inode pointer must be
7267  * zero'ed on disk before the corresponding inode's link count is decremented
7268  * (possibly freeing the inode for re-use). This dependency is handled by
7269  * updating the directory entry but delaying the inode count reduction until
7270  * after the directory block has been written to disk. After this point, the
7271  * inode count can be decremented whenever it is convenient.
7272  */
7273 
7274 /*
7275  * This routine should be called immediately after removing
7276  * a directory entry.  The inode's link count should not be
7277  * decremented by the calling procedure -- the soft updates
7278  * code will do this task when it is safe.
7279  */
7280 void
7281 softdep_setup_remove(bp, dp, ip, isrmdir)
7282 	struct buf *bp;		/* buffer containing directory block */
7283 	struct inode *dp;	/* inode for the directory being modified */
7284 	struct inode *ip;	/* inode for directory entry being removed */
7285 	int isrmdir;		/* indicates if doing RMDIR */
7286 {
7287 	struct dirrem *dirrem, *prevdirrem;
7288 	struct inodedep *inodedep;
7289 	int direct;
7290 
7291 	/*
7292 	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
7293 	 * newdirrem() to set up the full directory remove which requires
7294 	 * isrmdir > 1.
7295 	 */
7296 	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
7297 	/*
7298 	 * Add the dirrem to the inodedep's pending remove list for quick
7299 	 * discovery later.
7300 	 */
7301 	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
7302 	    &inodedep) == 0)
7303 		panic("softdep_setup_remove: Lost inodedep.");
7304 	dirrem->dm_state |= ONDEPLIST;
7305 	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
7306 
7307 	/*
7308 	 * If the COMPLETE flag is clear, then there were no active
7309 	 * entries and we want to roll back to a zeroed entry until
7310 	 * the new inode is committed to disk. If the COMPLETE flag is
7311 	 * set then we have deleted an entry that never made it to
7312 	 * disk. If the entry we deleted resulted from a name change,
7313 	 * then the old name still resides on disk. We cannot delete
7314 	 * its inode (returned to us in prevdirrem) until the zeroed
7315 	 * directory entry gets to disk. The new inode has never been
7316 	 * referenced on the disk, so can be deleted immediately.
7317 	 */
7318 	if ((dirrem->dm_state & COMPLETE) == 0) {
7319 		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
7320 		    dm_next);
7321 		FREE_LOCK(&lk);
7322 	} else {
7323 		if (prevdirrem != NULL)
7324 			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
7325 			    prevdirrem, dm_next);
7326 		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
7327 		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
7328 		FREE_LOCK(&lk);
7329 		if (direct)
7330 			handle_workitem_remove(dirrem, NULL);
7331 	}
7332 }
7333 
7334 /*
7335  * Check for an entry matching 'offset' on both the pd_diraddhd list and the
7336  * pd_pendinghd list of a pagedep.
7337  */
7338 static struct diradd *
7339 diradd_lookup(pagedep, offset)
7340 	struct pagedep *pagedep;
7341 	int offset;
7342 {
7343 	struct diradd *dap;
7344 
7345 	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
7346 		if (dap->da_offset == offset)
7347 			return (dap);
7348 	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
7349 		if (dap->da_offset == offset)
7350 			return (dap);
7351 	return (NULL);
7352 }
7353 
7354 /*
7355  * Search for a .. diradd dependency in a directory that is being removed.
7356  * If the directory was renamed to a new parent we have a diradd rather
7357  * than a mkdir for the .. entry.  We need to cancel it now before
7358  * it is found in truncate().
7359  */
7360 static struct jremref *
7361 cancel_diradd_dotdot(ip, dirrem, jremref)
7362 	struct inode *ip;
7363 	struct dirrem *dirrem;
7364 	struct jremref *jremref;
7365 {
7366 	struct pagedep *pagedep;
7367 	struct diradd *dap;
7368 	struct worklist *wk;
7369 
7370 	if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0,
7371 	    &pagedep) == 0)
7372 		return (jremref);
7373 	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
7374 	if (dap == NULL)
7375 		return (jremref);
7376 	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
7377 	/*
7378 	 * Mark any journal work as belonging to the parent so it is freed
7379 	 * with the .. reference.
7380 	 */
7381 	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
7382 		wk->wk_state |= MKDIR_PARENT;
7383 	return (NULL);
7384 }
7385 
7386 /*
7387  * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
7388  * replace it with a dirrem/diradd pair as a result of re-parenting a
7389  * directory.  This ensures that we don't simultaneously have a mkdir and
7390  * a diradd for the same .. entry.
7391  */
7392 static struct jremref *
7393 cancel_mkdir_dotdot(ip, dirrem, jremref)
7394 	struct inode *ip;
7395 	struct dirrem *dirrem;
7396 	struct jremref *jremref;
7397 {
7398 	struct inodedep *inodedep;
7399 	struct jaddref *jaddref;
7400 	struct mkdir *mkdir;
7401 	struct diradd *dap;
7402 
7403 	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
7404 	    &inodedep) == 0)
7405 		panic("cancel_mkdir_dotdot: Lost inodedep");
7406 	dap = inodedep->id_mkdiradd;
7407 	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
7408 		return (jremref);
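	/*
	 * Locate the MKDIR_PARENT mkdir that belongs to this diradd.
	 */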
7409 	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
7410 	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
7411 		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
7412 			break;
7413 	if (mkdir == NULL)
7414 		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
7415 	if ((jaddref = mkdir->md_jaddref) != NULL) {
7416 		mkdir->md_jaddref = NULL;
7417 		jaddref->ja_state &= ~MKDIR_PARENT;
7418 		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
7419 		    &inodedep) == 0)
7420 			panic("cancel_mkdir_dotdot: Lost parent inodedep");
7421 		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
7422 			journal_jremref(dirrem, jremref, inodedep);
7423 			jremref = NULL;
7424 		}
7425 	}
7426 	if (mkdir->md_state & ONWORKLIST)
7427 		WORKLIST_REMOVE(&mkdir->md_list);
7428 	mkdir->md_state |= ALLCOMPLETE;
7429 	complete_mkdir(mkdir);
7430 	return (jremref);
7431 }
7432 
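/*
 * Link a jremref into its dirrem and the inode's reference list and add
 * it to the journal.  The inodedep is looked up here when the caller
 * does not supply one.
 */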
7433 static void
7434 journal_jremref(dirrem, jremref, inodedep)
7435 	struct dirrem *dirrem;
7436 	struct jremref *jremref;
7437 	struct inodedep *inodedep;
7438 {
7439 
7440 	if (inodedep == NULL)
7441 		if (inodedep_lookup(jremref->jr_list.wk_mp,
7442 		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
7443 			panic("journal_jremref: Lost inodedep");
7444 	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
7445 	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
7446 	add_to_journal(&jremref->jr_list);
7447 }
7448 
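/*
 * Journal the remove reference for a directory entry along with the dot
 * and dotdot references when removing a directory.
 */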
7449 static void
7450 dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
7451 	struct dirrem *dirrem;
7452 	struct jremref *jremref;
7453 	struct jremref *dotremref;
7454 	struct jremref *dotdotremref;
7455 {
7456 	struct inodedep *inodedep;
7457 
7458 
7459 	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
7460 	    &inodedep) == 0)
7461 		panic("dirrem_journal: Lost inodedep");
7462 	journal_jremref(dirrem, jremref, inodedep);
7463 	if (dotremref)
7464 		journal_jremref(dirrem, dotremref, inodedep);
7465 	if (dotdotremref)
7466 		journal_jremref(dirrem, dotdotremref, NULL);
7467 }
7468 
7469 /*
7470  * Allocate a new dirrem if appropriate and return it along with
7471  * its associated pagedep. Called without a lock, returns with lock.
7472  */
7473 static struct dirrem *
7474 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
7475 	struct buf *bp;		/* buffer containing directory block */
7476 	struct inode *dp;	/* inode for the directory being modified */
7477 	struct inode *ip;	/* inode for directory entry being removed */
7478 	int isrmdir;		/* indicates if doing RMDIR */
7479 	struct dirrem **prevdirremp; /* previously referenced inode, if any */
7480 {
7481 	int offset;
7482 	ufs_lbn_t lbn;
7483 	struct diradd *dap;
7484 	struct dirrem *dirrem;
7485 	struct pagedep *pagedep;
7486 	struct jremref *jremref;
7487 	struct jremref *dotremref;
7488 	struct jremref *dotdotremref;
7489 	struct vnode *dvp;
7490 
7491 	/*
7492 	 * Whiteouts have no deletion dependencies.
7493 	 */
7494 	if (ip == NULL)
7495 		panic("newdirrem: whiteout");
7496 	dvp = ITOV(dp);
7497 	/*
7498 	 * If we are over our limit, try to improve the situation.
7499 	 * Limiting the number of dirrem structures will also limit
7500 	 * the number of freefile and freeblks structures.
7501 	 */
7502 	ACQUIRE_LOCK(&lk);
7503 	if (!(ip->i_flags & SF_SNAPSHOT) &&
7504 	    dep_current[D_DIRREM] > max_softdeps / 2)
7505 		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS);
7506 	FREE_LOCK(&lk);
7507 	dirrem = malloc(sizeof(struct dirrem),
7508 		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
7509 	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
7510 	LIST_INIT(&dirrem->dm_jremrefhd);
7511 	LIST_INIT(&dirrem->dm_jwork);
7512 	dirrem->dm_state = isrmdir ? RMDIR : 0;
7513 	dirrem->dm_oldinum = ip->i_number;
7514 	*prevdirremp = NULL;
7515 	/*
7516 	 * Allocate remove reference structures to track journal write
7517 	 * dependencies.  We will always have one for the link and
7518 	 * when doing directories we will always have one more for dot.
7519 	 * When renaming a directory we skip the dotdot link change so
7520 	 * this is not needed.
7521 	 */
7522 	jremref = dotremref = dotdotremref = NULL;
7523 	if (DOINGSUJ(dvp)) {
7524 		if (isrmdir) {
7525 			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
7526 			    ip->i_effnlink + 2);
7527 			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
7528 			    ip->i_effnlink + 1);
7529 			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
7530 			    dp->i_effnlink + 1);
7531 			dotdotremref->jr_state |= MKDIR_PARENT;
7532 		} else
7533 			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
7534 			    ip->i_effnlink + 1);
7535 	}
7536 	ACQUIRE_LOCK(&lk);
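	/*
	 * Find or allocate the pagedep for the directory block holding the
	 * entry and record it in the dirrem.
	 */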
7537 	lbn = lblkno(dp->i_fs, dp->i_offset);
7538 	offset = blkoff(dp->i_fs, dp->i_offset);
7539 	if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC,
7540 	    &pagedep) == 0)
7541 		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
7542 	dirrem->dm_pagedep = pagedep;
7543 	/*
7544 	 * If we're renaming a .. link to a new directory, cancel any
7545 	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
7546 	 * the jremref is preserved for any potential diradd in this
7547 	 * location.  This can not coincide with a rmdir.
7548 	 */
7549 	if (dp->i_offset == DOTDOT_OFFSET) {
7550 		if (isrmdir)
7551 			panic("newdirrem: .. directory change during remove?");
7552 		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
7553 	}
7554 	/*
7555 	 * If we're removing a directory search for the .. dependency now and
7556 	 * cancel it.  Any pending journal work will be added to the dirrem
7557 	 * to be completed when the workitem remove completes.
7558 	 */
7559 	if (isrmdir)
7560 		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
7561 	/*
7562 	 * Check for a diradd dependency for the same directory entry.
7563 	 * If present, then both dependencies become obsolete and can
7564 	 * be de-allocated.
7565 	 */
7566 	dap = diradd_lookup(pagedep, offset);
7567 	if (dap == NULL) {
7568 		/*
7569 		 * Link the jremref structures into the dirrem so they are
7570 		 * written prior to the pagedep.
7571 		 */
7572 		if (jremref)
7573 			dirrem_journal(dirrem, jremref, dotremref,
7574 			    dotdotremref);
7575 		return (dirrem);
7576 	}
7577 	/*
7578 	 * Must be ATTACHED at this point.
7579 	 */
7580 	if ((dap->da_state & ATTACHED) == 0)
7581 		panic("newdirrem: not ATTACHED");
7582 	if (dap->da_newinum != ip->i_number)
7583 		panic("newdirrem: inum %d should be %d",
7584 		    ip->i_number, dap->da_newinum);
7585 	/*
7586 	 * If we are deleting a changed name that never made it to disk,
7587 	 * then return the dirrem describing the previous inode (which
7588 	 * represents the inode currently referenced from this entry on disk).
7589 	 */
7590 	if ((dap->da_state & DIRCHG) != 0) {
7591 		*prevdirremp = dap->da_previous;
7592 		dap->da_state &= ~DIRCHG;
7593 		dap->da_pagedep = pagedep;
7594 	}
7595 	/*
7596 	 * We are deleting an entry that never made it to disk.
7597 	 * Mark it COMPLETE so we can delete its inode immediately.
7598 	 */
7599 	dirrem->dm_state |= COMPLETE;
7600 	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
7601 #ifdef SUJ_DEBUG
7602 	if (isrmdir == 0) {
7603 		struct worklist *wk;
7604 
7605 		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
7606 			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
7607 				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
7608 	}
7609 #endif
7610 
7611 	return (dirrem);
7612 }
7613 
7614 /*
7615  * Directory entry change dependencies.
7616  *
7617  * Changing an existing directory entry requires that an add operation
7618  * be completed first followed by a deletion. The semantics for the addition
7619  * are identical to the description of adding a new entry above except
7620  * that the rollback is to the old inode number rather than zero. Once
7621  * the addition dependency is completed, the removal is done as described
7622  * in the removal routine above.
7623  */
7624 
7625 /*
7626  * This routine should be called immediately after changing
7627  * a directory entry.  The inode's link count should not be
7628  * decremented by the calling procedure -- the soft updates
7629  * code will perform this task when it is safe.
7630  */
7631 void
7632 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
7633 	struct buf *bp;		/* buffer containing directory block */
7634 	struct inode *dp;	/* inode for the directory being modified */
7635 	struct inode *ip;	/* inode for directory entry being removed */
7636 	ino_t newinum;		/* new inode number for changed entry */
7637 	int isrmdir;		/* indicates if doing RMDIR */
7638 {
7639 	int offset;
7640 	struct diradd *dap = NULL;
7641 	struct dirrem *dirrem, *prevdirrem;
7642 	struct pagedep *pagedep;
7643 	struct inodedep *inodedep;
7644 	struct jaddref *jaddref;
7645 	struct mount *mp;
7646 
7647 	offset = blkoff(dp->i_fs, dp->i_offset);
7648 	mp = UFSTOVFS(dp->i_ump);
7649 
7650 	/*
7651 	 * Whiteouts do not need diradd dependencies.
7652 	 */
7653 	if (newinum != WINO) {
7654 		dap = malloc(sizeof(struct diradd),
7655 		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
7656 		workitem_alloc(&dap->da_list, D_DIRADD, mp);
7657 		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
7658 		dap->da_offset = offset;
7659 		dap->da_newinum = newinum;
7660 		LIST_INIT(&dap->da_jwork);
7661 	}
7662 
7663 	/*
7664 	 * Allocate a new dirrem and ACQUIRE_LOCK.
7665 	 */
7666 	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
7667 	pagedep = dirrem->dm_pagedep;
7668 	/*
7669 	 * The possible values for isrmdir:
7670 	 *	0 - non-directory file rename
7671 	 *	1 - directory rename within same directory
7672 	 *   inum - directory rename to new directory of given inode number
7673 	 * When renaming to a new directory, we are both deleting and
7674 	 * creating a new directory entry, so the link count on the new
7675 	 * directory should not change. Thus we do not need the followup
7676 	 * dirrem which is usually done in handle_workitem_remove. We set
7677 	 * the DIRCHG flag to tell handle_workitem_remove to skip the
7678 	 * followup dirrem.
7679 	 */
7680 	if (isrmdir > 1)
7681 		dirrem->dm_state |= DIRCHG;
7682 
7683 	/*
7684 	 * Whiteouts have no additional dependencies,
7685 	 * so just put the dirrem on the correct list.
7686 	 */
7687 	if (newinum == WINO) {
7688 		if ((dirrem->dm_state & COMPLETE) == 0) {
7689 			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
7690 			    dm_next);
7691 		} else {
7692 			dirrem->dm_dirinum = pagedep->pd_ino;
7693 			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
7694 				add_to_worklist(&dirrem->dm_list, 0);
7695 		}
7696 		FREE_LOCK(&lk);
7697 		return;
7698 	}
7699 	/*
7700 	 * Add the dirrem to the inodedep's pending remove list for quick
7701 	 * discovery later.  A valid nlinkdelta ensures that this lookup
7702 	 * will not fail.
7703 	 */
7704 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
7705 		panic("softdep_setup_directory_change: Lost inodedep.");
7706 	dirrem->dm_state |= ONDEPLIST;
7707 	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
7708 
7709 	/*
7710 	 * If the COMPLETE flag is clear, then there were no active
7711 	 * entries and we want to roll back to the previous inode until
7712 	 * the new inode is committed to disk. If the COMPLETE flag is
7713 	 * set, then we have deleted an entry that never made it to disk.
7714 	 * If the entry we deleted resulted from a name change, then the old
7715 	 * inode reference still resides on disk. Any rollback that we do
7716 	 * needs to be to that old inode (returned to us in prevdirrem). If
7717 	 * the entry we deleted resulted from a create, then there is
7718 	 * no entry on the disk, so we want to roll back to zero rather
7719 	 * than the uncommitted inode. In either of the COMPLETE cases we
7720 	 * want to immediately free the unwritten and unreferenced inode.
7721 	 */
7722 	if ((dirrem->dm_state & COMPLETE) == 0) {
7723 		dap->da_previous = dirrem;
7724 	} else {
7725 		if (prevdirrem != NULL) {
7726 			dap->da_previous = prevdirrem;
7727 		} else {
7728 			dap->da_state &= ~DIRCHG;
7729 			dap->da_pagedep = pagedep;
7730 		}
7731 		dirrem->dm_dirinum = pagedep->pd_ino;
7732 		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
7733 			add_to_worklist(&dirrem->dm_list, 0);
7734 	}
7735 	/*
7736 	 * Look up the jaddref for this journal entry.  We must finish
7737 	 * initializing it and make the diradd write dependent on it.
7738 	 * If we're not journaling, put it on the id_bufwait list if the inode
7739 	 * is not yet written. If it is written, do the post-inode write
7740 	 * processing to put it on the id_pendinghd list.
7741 	 */
7742 	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
7743 	if (mp->mnt_kern_flag & MNTK_SUJ) {
7744 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
7745 		    inoreflst);
7746 		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
7747 		    ("softdep_setup_directory_change: bad jaddref %p",
7748 		    jaddref));
7749 		jaddref->ja_diroff = dp->i_offset;
7750 		jaddref->ja_diradd = dap;
7751 		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
7752 		    dap, da_pdlist);
7753 		add_to_journal(&jaddref->ja_list);
7754 	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
7755 		dap->da_state |= COMPLETE;
7756 		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
7757 		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
7758 	} else {
7759 		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
7760 		    dap, da_pdlist);
7761 		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
7762 	}
7763 	/*
7764 	 * If we're making a new name for a directory that has not been
7765 	 * committed, we need to move the dot and dotdot references to
7766 	 * this new name.
7767 	 */
7768 	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
7769 		merge_diradd(inodedep, dap);
7770 	FREE_LOCK(&lk);
7771 }
7772 
7773 /*
7774  * Called whenever the link count on an inode is changed.
7775  * It creates an inode dependency so that the new reference(s)
7776  * to the inode cannot be committed to disk until the updated
7777  * inode has been written.
7778  */
7779 void
7780 softdep_change_linkcnt(ip)
7781 	struct inode *ip;	/* the inode with the increased link count */
7782 {
7783 	struct inodedep *inodedep;
7784 
7785 	ACQUIRE_LOCK(&lk);
7786 	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
7787 	if (ip->i_nlink < ip->i_effnlink)
7788 		panic("softdep_change_linkcnt: bad delta");
7789 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
7790 	FREE_LOCK(&lk);
7791 }
7792 
7793 /*
7794  * Attach a sbdep dependency to the superblock buf so that we can keep
7795  * track of the head of the linked list of referenced but unlinked inodes.
7796  */
7797 void
7798 softdep_setup_sbupdate(ump, fs, bp)
7799 	struct ufsmount *ump;
7800 	struct fs *fs;
7801 	struct buf *bp;
7802 {
7803 	struct sbdep *sbdep;
7804 	struct worklist *wk;
7805 
7806 	if ((fs->fs_flags & FS_SUJ) == 0)
7807 		return;
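	/*
	 * Only one sbdep is needed per superblock buffer; if one is already
	 * attached there is nothing more to do.
	 */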
7808 	LIST_FOREACH(wk, &bp->b_dep, wk_list)
7809 		if (wk->wk_type == D_SBDEP)
7810 			break;
7811 	if (wk != NULL)
7812 		return;
7813 	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
7814 	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
7815 	sbdep->sb_fs = fs;
7816 	sbdep->sb_ump = ump;
7817 	ACQUIRE_LOCK(&lk);
7818 	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
7819 	FREE_LOCK(&lk);
7820 }
7821 
7822 /*
7823  * Return the first unlinked inodedep which is ready to be the head of the
7824  * list.  The inodedep and all those after it must have valid next pointers.
7825  */
7826 static struct inodedep *
7827 first_unlinked_inodedep(ump)
7828 	struct ufsmount *ump;
7829 {
7830 	struct inodedep *inodedep;
7831 	struct inodedep *idp;
7832 
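	/*
	 * Walk from the tail toward the head, stopping at the first inodedep
	 * whose predecessor does not yet have a valid on-disk next pointer.
	 */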
7833 	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
7834 	    inodedep; inodedep = idp) {
7835 		if ((inodedep->id_state & UNLINKNEXT) == 0)
7836 			return (NULL);
7837 		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
7838 		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
7839 			break;
7840 		if ((inodedep->id_state & UNLINKPREV) == 0)
7841 			panic("first_unlinked_inodedep: prev != next");
7842 	}
7843 	if (inodedep == NULL)
7844 		return (NULL);
7845 
7846 	return (inodedep);
7847 }
7848 
7849 /*
7850  * Set the sujfree unlinked head pointer prior to writing a superblock.
7851  */
7852 static void
7853 initiate_write_sbdep(sbdep)
7854 	struct sbdep *sbdep;
7855 {
7856 	struct inodedep *inodedep;
7857 	struct fs *bpfs;
7858 	struct fs *fs;
7859 
7860 	bpfs = sbdep->sb_fs;
7861 	fs = sbdep->sb_ump->um_fs;
7862 	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
7863 	if (inodedep) {
7864 		fs->fs_sujfree = inodedep->id_ino;
7865 		inodedep->id_state |= UNLINKPREV;
7866 	} else
7867 		fs->fs_sujfree = 0;
7868 	bpfs->fs_sujfree = fs->fs_sujfree;
7869 }
7870 
7871 /*
7872  * After a superblock is written determine whether it must be written again
7873  * due to a changing unlinked list head.
7874  */
7875 static int
7876 handle_written_sbdep(sbdep, bp)
7877 	struct sbdep *sbdep;
7878 	struct buf *bp;
7879 {
7880 	struct inodedep *inodedep;
7881 	struct mount *mp;
7882 	struct fs *fs;
7883 
7884 	fs = sbdep->sb_fs;
7885 	mp = UFSTOVFS(sbdep->sb_ump);
7886 	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
7887 	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
7888 	    (inodedep == NULL && fs->fs_sujfree != 0)) {
7889 		bdirty(bp);
7890 		return (1);
7891 	}
7892 	WORKITEM_FREE(sbdep, D_SBDEP);
7893 	if (fs->fs_sujfree == 0)
7894 		return (0);
7895 	if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0)
7896 		panic("handle_written_sbdep: lost inodedep");
7897 	/*
7898 	 * Now that we have a record of this inode in stable store allow it
7899 	 * to be written to free up pending work.  Inodes may see a lot of
7900 	 * write activity after they are unlinked which we must not hold up.
7901 	 */
7902 	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
7903 		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
7904 			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
7905 			    inodedep, inodedep->id_state);
7906 		if (inodedep->id_state & UNLINKONLIST)
7907 			break;
7908 		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
7909 	}
7910 
7911 	return (0);
7912 }
7913 
7914 /*
7915  * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
7916  */
7917 static void
7918 unlinked_inodedep(mp, inodedep)
7919 	struct mount *mp;
7920 	struct inodedep *inodedep;
7921 {
7922 	struct ufsmount *ump;
7923 
7924 	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
7925 		return;
7926 	ump = VFSTOUFS(mp);
7927 	ump->um_fs->fs_fmod = 1;
7928 	inodedep->id_state |= UNLINKED;
7929 	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
7930 }
7931 
7932 /*
7933  * Remove an inodedep from the unlinked inodedep list.  This may require
7934  * disk writes if the inode has made it that far.
7935  */
7936 static void
7937 clear_unlinked_inodedep(inodedep)
7938 	struct inodedep *inodedep;
7939 {
7940 	struct ufsmount *ump;
7941 	struct inodedep *idp;
7942 	struct inodedep *idn;
7943 	struct fs *fs;
7944 	struct buf *bp;
7945 	ino_t ino;
7946 	ino_t nino;
7947 	ino_t pino;
7948 	int error;
7949 
7950 	ump = VFSTOUFS(inodedep->id_list.wk_mp);
7951 	fs = ump->um_fs;
7952 	ino = inodedep->id_ino;
7953 	error = 0;
7954 	for (;;) {
7955 		/*
7956 		 * If nothing has yet been written simply remove us from
7957 		 * the in memory list and return.  This is the most common
7958 		 * case where handle_workitem_remove() loses the final
7959 		 * reference.
7960 		 */
7961 		if ((inodedep->id_state & UNLINKLINKS) == 0)
7962 			break;
7963 		/*
7964 		 * If we have a NEXT pointer and no PREV pointer we can simply
7965 		 * clear NEXT's PREV and remove ourselves from the list.  Be
7966 		 * careful not to clear PREV if the superblock points at
7967 		 * next as well.
7968 		 */
7969 		idn = TAILQ_NEXT(inodedep, id_unlinked);
7970 		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
7971 			if (idn && fs->fs_sujfree != idn->id_ino)
7972 				idn->id_state &= ~UNLINKPREV;
7973 			break;
7974 		}
7975 		/*
7976 		 * Here we have an inodedep which is actually linked into
7977 		 * the list.  We must remove it by forcing a write to the
7978 		 * link before us, whether it be the superblock or an inode.
7979 		 * Unfortunately the list may change while we're waiting
7980 		 * on the buf lock for either resource so we must loop until
7981 		 * we lock the right one.  If both the superblock and an
7982 		 * inode point to this inode we must clear the inode first
7983 		 * followed by the superblock.
7984 		 */
7985 		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
7986 		pino = 0;
7987 		if (idp && (idp->id_state & UNLINKNEXT))
7988 			pino = idp->id_ino;
7989 		FREE_LOCK(&lk);
7990 		if (pino == 0)
7991 			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
7992 			    (int)fs->fs_sbsize, 0, 0, 0);
7993 		else
7994 			error = bread(ump->um_devvp,
7995 			    fsbtodb(fs, ino_to_fsba(fs, pino)),
7996 			    (int)fs->fs_bsize, NOCRED, &bp);
7997 		ACQUIRE_LOCK(&lk);
7998 		if (error)
7999 			break;
8000 		/* If the list has changed restart the loop. */
8001 		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
8002 		nino = 0;
8003 		if (idp && (idp->id_state & UNLINKNEXT))
8004 			nino = idp->id_ino;
8005 		if (nino != pino ||
8006 		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
8007 			FREE_LOCK(&lk);
8008 			brelse(bp);
8009 			ACQUIRE_LOCK(&lk);
8010 			continue;
8011 		}
8012 		/*
8013 		 * Remove us from the in memory list.  After this we cannot
8014 		 * access the inodedep.
8015 		 */
8016 		idn = TAILQ_NEXT(inodedep, id_unlinked);
8017 		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
8018 		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
8019 		/*
8020 		 * Determine the next inode number.
8021 		 */
8022 		nino = 0;
8023 		if (idn) {
8024 			/*
8025 			 * If next isn't on the list we can just clear prev's
8026 			 * state and schedule it to be fixed later.  No need
8027 			 * to synchronously write if we're not in the real
8028 			 * list.
8029 			 */
8030 			if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) {
8031 				idp->id_state &= ~UNLINKNEXT;
8032 				if ((idp->id_state & ONWORKLIST) == 0)
8033 					WORKLIST_INSERT(&bp->b_dep,
8034 					    &idp->id_list);
8035 				FREE_LOCK(&lk);
8036 				bawrite(bp);
8037 				ACQUIRE_LOCK(&lk);
8038 				return;
8039 			}
8040 			nino = idn->id_ino;
8041 		}
8042 		FREE_LOCK(&lk);
8043 		/*
8044 		 * The predecessor's next pointer is manually updated here
8045 		 * so that the NEXT flag is never cleared for an element
8046 		 * that is in the list.
8047 		 */
8048 		if (pino == 0) {
8049 			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
8050 			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
8051 			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
8052 			    bp);
8053 		} else if (fs->fs_magic == FS_UFS1_MAGIC)
8054 			((struct ufs1_dinode *)bp->b_data +
8055 			    ino_to_fsbo(fs, pino))->di_freelink = nino;
8056 		else
8057 			((struct ufs2_dinode *)bp->b_data +
8058 			    ino_to_fsbo(fs, pino))->di_freelink = nino;
8059 		/*
8060 		 * If the bwrite fails we have no recourse to recover.  The
8061 		 * filesystem is corrupted already.
8062 		 */
8063 		bwrite(bp);
8064 		ACQUIRE_LOCK(&lk);
8065 		/*
8066 		 * If the superblock pointer still needs to be cleared force
8067 		 * a write here.
8068 		 */
8069 		if (fs->fs_sujfree == ino) {
8070 			FREE_LOCK(&lk);
8071 			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
8072 			    (int)fs->fs_sbsize, 0, 0, 0);
8073 			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
8074 			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
8075 			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
8076 			    bp);
8077 			bwrite(bp);
8078 			ACQUIRE_LOCK(&lk);
8079 		}
8080 		if (fs->fs_sujfree != ino)
8081 			return;
8082 		panic("clear_unlinked_inodedep: Failed to clear free head");
8083 	}
8084 	if (inodedep->id_ino == fs->fs_sujfree)
8085 		panic("clear_unlinked_inodedep: Freeing head of free list");
8086 	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
8087 	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
8088 	return;
8089 }
8090 
8091 /*
8092  * This workitem decrements the inode's link count.
8093  * If the link count reaches zero, the file is removed.
8094  */
8095 static void
8096 handle_workitem_remove(dirrem, xp)
8097 	struct dirrem *dirrem;
8098 	struct vnode *xp;
8099 {
8100 	struct inodedep *inodedep;
8101 	struct workhead dotdotwk;
8102 	struct worklist *wk;
8103 	struct ufsmount *ump;
8104 	struct mount *mp;
8105 	struct vnode *vp;
8106 	struct inode *ip;
8107 	ino_t oldinum;
8108 	int error;
8109 
8110 	if (dirrem->dm_state & ONWORKLIST)
8111 		panic("handle_workitem_remove: dirrem %p still on worklist",
8112 		    dirrem);
8113 	oldinum = dirrem->dm_oldinum;
8114 	mp = dirrem->dm_list.wk_mp;
8115 	ump = VFSTOUFS(mp);
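	/*
	 * Fetch the vnode for the removed inode unless the caller has
	 * already supplied it.
	 */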
8116 	if ((vp = xp) == NULL &&
8117 	    (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp,
8118 	    FFSV_FORCEINSMQ)) != 0) {
8119 		softdep_error("handle_workitem_remove: vget", error);
8120 		return;
8121 	}
8122 	ip = VTOI(vp);
8123 	ACQUIRE_LOCK(&lk);
8124 	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
8125 		panic("handle_workitem_remove: lost inodedep");
8126 	if (dirrem->dm_state & ONDEPLIST)
8127 		LIST_REMOVE(dirrem, dm_inonext);
8128 	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
8129 	    ("handle_workitem_remove:  Journal entries not written."));
8130 
8131 	/*
8132 	 * Move all dependencies waiting on the remove to complete
8133 	 * from the dirrem to the inode inowait list to be completed
8134 	 * after the inode has been updated and written to disk.  Any
8135 	 * marked MKDIR_PARENT are saved to be completed when the .. ref
8136 	 * is removed.
8137 	 */
8138 	LIST_INIT(&dotdotwk);
8139 	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
8140 		WORKLIST_REMOVE(wk);
8141 		if (wk->wk_state & MKDIR_PARENT) {
8142 			wk->wk_state &= ~MKDIR_PARENT;
8143 			WORKLIST_INSERT(&dotdotwk, wk);
8144 			continue;
8145 		}
8146 		WORKLIST_INSERT(&inodedep->id_inowait, wk);
8147 	}
8148 	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
8149 	/*
8150 	 * Normal file deletion.
8151 	 */
8152 	if ((dirrem->dm_state & RMDIR) == 0) {
8153 		ip->i_nlink--;
8154 		DIP_SET(ip, i_nlink, ip->i_nlink);
8155 		ip->i_flag |= IN_CHANGE;
8156 		if (ip->i_nlink < ip->i_effnlink)
8157 			panic("handle_workitem_remove: bad file delta");
8158 		if (ip->i_nlink == 0)
8159 			unlinked_inodedep(mp, inodedep);
8160 		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
8161 		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
8162 		    ("handle_workitem_remove: worklist not empty. %s",
8163 		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
8164 		WORKITEM_FREE(dirrem, D_DIRREM);
8165 		FREE_LOCK(&lk);
8166 		goto out;
8167 	}
8168 	/*
8169 	 * Directory deletion. Decrement reference count for both the
8170 	 * just deleted parent directory entry and the reference for ".".
8171 	 * Arrange to have the reference count on the parent decremented
8172 	 * to account for the loss of "..".
8173 	 */
8174 	ip->i_nlink -= 2;
8175 	DIP_SET(ip, i_nlink, ip->i_nlink);
8176 	ip->i_flag |= IN_CHANGE;
8177 	if (ip->i_nlink < ip->i_effnlink)
8178 		panic("handle_workitem_remove: bad dir delta");
8179 	if (ip->i_nlink == 0)
8180 		unlinked_inodedep(mp, inodedep);
8181 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
8182 	/*
8183 	 * Rename a directory to a new parent. Since we are both deleting
8184 	 * and creating a new directory entry, the link count on the new
8185 	 * directory should not change. Thus we skip the followup dirrem.
8186 	 */
8187 	if (dirrem->dm_state & DIRCHG) {
8188 		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
8189 		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
8190 		WORKITEM_FREE(dirrem, D_DIRREM);
8191 		FREE_LOCK(&lk);
8192 		goto out;
8193 	}
8194 	dirrem->dm_state = ONDEPLIST;
8195 	dirrem->dm_oldinum = dirrem->dm_dirinum;
8196 	/*
8197 	 * Place the dirrem on the parent's diremhd list.
8198 	 */
8199 	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
8200 		panic("handle_workitem_remove: lost dir inodedep");
8201 	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8202 	/*
8203 	 * If the allocated inode has never been written to disk, then
8204 	 * the on-disk inode is zero'ed and we can remove the file
8205 	 * immediately.  When journaling, if the inode has been marked
8206 	 * unlinked and is not DEPCOMPLETE, we know it can never be written.
8207 	 */
8208 	inodedep_lookup(mp, oldinum, 0, &inodedep);
8209 	if (inodedep == NULL ||
8210 	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
8211 	    check_inode_unwritten(inodedep)) {
8212 		if (xp != NULL)
8213 			add_to_worklist(&dirrem->dm_list, 0);
8214 		FREE_LOCK(&lk);
8215 		if (xp == NULL) {
8216 			vput(vp);
8217 			handle_workitem_remove(dirrem, NULL);
8218 		}
8219 		return;
8220 	}
8221 	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
8222 	FREE_LOCK(&lk);
8223 	ip->i_flag |= IN_CHANGE;
8224 out:
8225 	ffs_update(vp, 0);
8226 	if (xp == NULL)
8227 		vput(vp);
8228 }
8229 
8230 /*
8231  * Inode de-allocation dependencies.
8232  *
8233  * When an inode's link count is reduced to zero, it can be de-allocated. We
8234  * found it convenient to postpone de-allocation until after the inode is
8235  * written to disk with its new link count (zero).  At this point, all of the
8236  * on-disk inode's block pointers are nullified and, with careful dependency
8237  * list ordering, all dependencies related to the inode will be satisfied and
8238  * the corresponding dependency structures de-allocated.  So, if/when the
8239  * inode is reused, there will be no mixing of old dependencies with new
8240  * ones.  This artificial dependency is set up by the block de-allocation
8241  * procedure above (softdep_setup_freeblocks) and completed by the
8242  * following procedure.
8243  */
8244 static void
8245 handle_workitem_freefile(freefile)
8246 	struct freefile *freefile;
8247 {
8248 	struct workhead wkhd;
8249 	struct fs *fs;
8250 	struct inodedep *idp;
8251 	struct ufsmount *ump;
8252 	int error;
8253 
8254 	ump = VFSTOUFS(freefile->fx_list.wk_mp);
8255 	fs = ump->um_fs;
8256 #ifdef DEBUG
8257 	ACQUIRE_LOCK(&lk);
8258 	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
8259 	FREE_LOCK(&lk);
8260 	if (error)
8261 		panic("handle_workitem_freefile: inodedep %p survived", idp);
8262 #endif
8263 	UFS_LOCK(ump);
8264 	fs->fs_pendinginodes -= 1;
8265 	UFS_UNLOCK(ump);
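	/*
	 * Move any journal work to a local list so it can be handed to
	 * ffs_freefile() along with the inode being freed.
	 */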
8266 	LIST_INIT(&wkhd);
8267 	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
8268 	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
8269 	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
8270 		softdep_error("handle_workitem_freefile", error);
8271 	ACQUIRE_LOCK(&lk);
8272 	WORKITEM_FREE(freefile, D_FREEFILE);
8273 	FREE_LOCK(&lk);
8274 }
8275 
8276 
8277 /*
8278  * Helper function which unlinks marker element from work list and returns
8279  * the next element on the list.
8280  */
8281 static __inline struct worklist *
8282 markernext(struct worklist *marker)
8283 {
8284 	struct worklist *next;
8285 
8286 	next = LIST_NEXT(marker, wk_list);
8287 	LIST_REMOVE(marker, wk_list);
8288 	return next;
8289 }
8290 
8291 /*
8292  * Disk writes.
8293  *
8294  * The dependency structures constructed above are most actively used when file
8295  * system blocks are written to disk.  No constraints are placed on when a
8296  * block can be written, but unsatisfied update dependencies are made safe by
8297  * modifying (or replacing) the source memory for the duration of the disk
8298  * write.  When the disk write completes, the memory block is again brought
8299  * up-to-date.
8300  *
8301  * In-core inode structure reclamation.
8302  *
8303  * Because there are a finite number of "in-core" inode structures, they are
8304  * reused regularly.  By transferring all inode-related dependencies to the
8305  * in-memory inode block and indexing them separately (via "inodedep"s), we
8306  * can allow "in-core" inode structures to be reused at any time and avoid
8307  * any increase in contention.
8308  *
8309  * Called just before entering the device driver to initiate a new disk I/O.
8310  * The buffer must be locked, thus, no I/O completion operations can occur
8311  * while we are manipulating its associated dependencies.
8312  */
8313 static void
8314 softdep_disk_io_initiation(bp)
8315 	struct buf *bp;		/* structure describing disk write to occur */
8316 {
8317 	struct worklist *wk;
8318 	struct worklist marker;
8319 	struct inodedep *inodedep;
8320 	struct freeblks *freeblks;
8321 	struct jfreeblk *jfreeblk;
8322 	struct newblk *newblk;
8323 
8324 	/*
8325 	 * We only care about write operations. There should never
8326 	 * be dependencies for reads.
8327 	 */
8328 	if (bp->b_iocmd != BIO_WRITE)
8329 		panic("softdep_disk_io_initiation: not write");
8330 
8331 	if (bp->b_vflags & BV_BKGRDINPROG)
8332 		panic("softdep_disk_io_initiation: Writing buffer with "
8333 		    "background write in progress: %p", bp);
8334 
8335 	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
8336 	PHOLD(curproc);			/* Don't swap out kernel stack */
8337 
8338 	ACQUIRE_LOCK(&lk);
8339 	/*
8340 	 * Do any necessary pre-I/O processing.
8341 	 */
8342 	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
8343 	     wk = markernext(&marker)) {
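		/*
		 * The marker remembers our position in b_dep so the scan can
		 * resume safely if lk is dropped (e.g. by jwait()) while a
		 * dependency is processed.
		 */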
8344 		LIST_INSERT_AFTER(wk, &marker, wk_list);
8345 		switch (wk->wk_type) {
8346 
8347 		case D_PAGEDEP:
8348 			initiate_write_filepage(WK_PAGEDEP(wk), bp);
8349 			continue;
8350 
8351 		case D_INODEDEP:
8352 			inodedep = WK_INODEDEP(wk);
8353 			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
8354 				initiate_write_inodeblock_ufs1(inodedep, bp);
8355 			else
8356 				initiate_write_inodeblock_ufs2(inodedep, bp);
8357 			continue;
8358 
8359 		case D_INDIRDEP:
8360 			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
8361 			continue;
8362 
8363 		case D_BMSAFEMAP:
8364 			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
8365 			continue;
8366 
8367 		case D_JSEG:
8368 			WK_JSEG(wk)->js_buf = NULL;
8369 			continue;
8370 
8371 		case D_FREEBLKS:
8372 			freeblks = WK_FREEBLKS(wk);
8373 			jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd);
8374 			/*
8375 			 * We have to wait for the jfreeblks to be journaled
8376 			 * before we can write an inodeblock with updated
8377 			 * pointers.  Be careful to arrange the marker so
8378 			 * we revisit the jfreeblk if it's not removed by
8379 			 * the first jwait().
8380 			 */
8381 			if (jfreeblk != NULL) {
8382 				LIST_REMOVE(&marker, wk_list);
8383 				LIST_INSERT_BEFORE(wk, &marker, wk_list);
8384 				stat_jwait_freeblks++;
8385 				jwait(&jfreeblk->jf_list);
8386 			}
8387 			continue;
8388 		case D_ALLOCDIRECT:
8389 		case D_ALLOCINDIR:
8390 			/*
8391 			 * We have to wait for the jnewblk to be journaled
8392 			 * before we can write to a block if the contents
8393 			 * may be confused with an earlier file's indirect
8394 			 * at recovery time.  Handle the marker as described
8395 			 * above.
8396 			 */
8397 			newblk = WK_NEWBLK(wk);
8398 			if (newblk->nb_jnewblk != NULL &&
8399 			    indirblk_inseg(newblk->nb_list.wk_mp,
8400 			    newblk->nb_newblkno)) {
8401 				LIST_REMOVE(&marker, wk_list);
8402 				LIST_INSERT_BEFORE(wk, &marker, wk_list);
8403 				stat_jwait_newblk++;
8404 				jwait(&newblk->nb_jnewblk->jn_list);
8405 			}
8406 			continue;
8407 
8408 		case D_SBDEP:
8409 			initiate_write_sbdep(WK_SBDEP(wk));
8410 			continue;
8411 
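		/*
		 * These dependencies need no rollback at I/O initiation;
		 * they are handled when the write completes.
		 */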
8412 		case D_MKDIR:
8413 		case D_FREEWORK:
8414 		case D_FREEDEP:
8415 		case D_JSEGDEP:
8416 			continue;
8417 
8418 		default:
8419 			panic("handle_disk_io_initiation: Unexpected type %s",
8420 			    TYPENAME(wk->wk_type));
8421 			/* NOTREACHED */
8422 		}
8423 	}
8424 	FREE_LOCK(&lk);
8425 	PRELE(curproc);			/* Allow swapout of kernel stack */
8426 }
8427 
8428 /*
8429  * Called from within the procedure above to deal with unsatisfied
8430  * allocation dependencies in a directory. The buffer must be locked,
8431  * thus, no I/O completion operations can occur while we are
8432  * manipulating its associated dependencies.
8433  */
8434 static void
8435 initiate_write_filepage(pagedep, bp)
8436 	struct pagedep *pagedep;
8437 	struct buf *bp;
8438 {
8439 	struct jremref *jremref;
8440 	struct jmvref *jmvref;
8441 	struct dirrem *dirrem;
8442 	struct diradd *dap;
8443 	struct direct *ep;
8444 	int i;
8445 
8446 	if (pagedep->pd_state & IOSTARTED) {
8447 		/*
8448 		 * This can only happen if there is a driver that does not
8449 		 * understand chaining. Here biodone will reissue the call
8450 		 * to strategy for the incomplete buffers.
8451 		 */
8452 		printf("initiate_write_filepage: already started\n");
8453 		return;
8454 	}
8455 	pagedep->pd_state |= IOSTARTED;
8456 	/*
8457 	 * Wait for all journal remove dependencies to hit the disk.
8458 	 * We cannot allow any potentially conflicting directory adds
8459 	 * to become visible before the removes, and rollback is too
8460 	 * difficult.  lk may be dropped and re-acquired, however we
8461 	 * hold the buf locked so the dependency cannot go away.
8462 	 */
8463 	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
8464 		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
8465 			stat_jwait_filepage++;
8466 			jwait(&jremref->jr_list);
8467 		}
8468 	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
8469 		stat_jwait_filepage++;
8470 		jwait(&jmvref->jm_list);
8471 	}
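	/*
	 * Roll back any directory additions that are not yet safe to
	 * appear on disk: an entry that replaced another reverts to the
	 * previous inode number, while a freshly added entry is zeroed.
	 */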
8472 	for (i = 0; i < DAHASHSZ; i++) {
8473 		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
8474 			ep = (struct direct *)
8475 			    ((char *)bp->b_data + dap->da_offset);
8476 			if (ep->d_ino != dap->da_newinum)
8477 				panic("%s: dir inum %d != new %d",
8478 				    "initiate_write_filepage",
8479 				    ep->d_ino, dap->da_newinum);
8480 			if (dap->da_state & DIRCHG)
8481 				ep->d_ino = dap->da_previous->dm_oldinum;
8482 			else
8483 				ep->d_ino = 0;
8484 			dap->da_state &= ~ATTACHED;
8485 			dap->da_state |= UNDONE;
8486 		}
8487 	}
8488 }
8489 
8490 /*
8491  * Version of initiate_write_inodeblock that handles UFS1 dinodes.
8492  * Note that any bug fixes made to this routine must be done in the
8493  * version found below.
8494  *
8495  * Called from within the procedure above to deal with unsatisfied
8496  * allocation dependencies in an inodeblock. The buffer must be
8497  * locked, thus, no I/O completion operations can occur while we
8498  * are manipulating its associated dependencies.
8499  */
8500 static void
8501 initiate_write_inodeblock_ufs1(inodedep, bp)
8502 	struct inodedep *inodedep;
8503 	struct buf *bp;			/* The inode block */
8504 {
8505 	struct allocdirect *adp, *lastadp;
8506 	struct ufs1_dinode *dp;
8507 	struct ufs1_dinode *sip;
8508 	struct inoref *inoref;
8509 	struct fs *fs;
8510 	ufs_lbn_t i;
8511 #ifdef INVARIANTS
8512 	ufs_lbn_t prevlbn = 0;
8513 #endif
8514 	int deplist;
8515 
8516 	if (inodedep->id_state & IOSTARTED)
8517 		panic("initiate_write_inodeblock_ufs1: already started");
8518 	inodedep->id_state |= IOSTARTED;
8519 	fs = inodedep->id_fs;
8520 	dp = (struct ufs1_dinode *)bp->b_data +
8521 	    ino_to_fsbo(fs, inodedep->id_ino);
8522 
8523 	/*
8524 	 * If we're on the unlinked list but have not yet written our
8525 	 * next pointer initialize it here.
8526 	 */
8527 	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
8528 		struct inodedep *inon;
8529 
8530 		inon = TAILQ_NEXT(inodedep, id_unlinked);
8531 		dp->di_freelink = inon ? inon->id_ino : 0;
8532 	}
8533 	/*
8534 	 * If the bitmap is not yet written, then the allocated
8535 	 * inode cannot be written to disk.
8536 	 */
8537 	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
8538 		if (inodedep->id_savedino1 != NULL)
8539 			panic("initiate_write_inodeblock_ufs1: I/O underway");
8540 		FREE_LOCK(&lk);
8541 		sip = malloc(sizeof(struct ufs1_dinode),
8542 		    M_SAVEDINO, M_SOFTDEP_FLAGS);
8543 		ACQUIRE_LOCK(&lk);
8544 		inodedep->id_savedino1 = sip;
8545 		*inodedep->id_savedino1 = *dp;
8546 		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
8547 		dp->di_gen = inodedep->id_savedino1->di_gen;
8548 		dp->di_freelink = inodedep->id_savedino1->di_freelink;
8549 		return;
8550 	}
8551 	/*
8552 	 * If no dependencies, then there is nothing to roll back.
8553 	 */
8554 	inodedep->id_savedsize = dp->di_size;
8555 	inodedep->id_savedextsize = 0;
8556 	inodedep->id_savednlink = dp->di_nlink;
8557 	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
8558 	    TAILQ_EMPTY(&inodedep->id_inoreflst))
8559 		return;
8560 	/*
8561 	 * Revert the link count to that of the first unwritten journal entry.
8562 	 */
8563 	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
8564 	if (inoref)
8565 		dp->di_nlink = inoref->if_nlink;
8566 	/*
8567 	 * Set the dependencies to busy.
8568 	 */
8569 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8570 	     adp = TAILQ_NEXT(adp, ad_next)) {
8571 #ifdef INVARIANTS
8572 		if (deplist != 0 && prevlbn >= adp->ad_offset)
8573 			panic("softdep_write_inodeblock: lbn order");
8574 		prevlbn = adp->ad_offset;
8575 		if (adp->ad_offset < NDADDR &&
8576 		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
8577 			panic("%s: direct pointer #%jd mismatch %d != %jd",
8578 			    "softdep_write_inodeblock",
8579 			    (intmax_t)adp->ad_offset,
8580 			    dp->di_db[adp->ad_offset],
8581 			    (intmax_t)adp->ad_newblkno);
8582 		if (adp->ad_offset >= NDADDR &&
8583 		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
8584 			panic("%s: indirect pointer #%jd mismatch %d != %jd",
8585 			    "softdep_write_inodeblock",
8586 			    (intmax_t)adp->ad_offset - NDADDR,
8587 			    dp->di_ib[adp->ad_offset - NDADDR],
8588 			    (intmax_t)adp->ad_newblkno);
8589 		deplist |= 1 << adp->ad_offset;
8590 		if ((adp->ad_state & ATTACHED) == 0)
8591 			panic("softdep_write_inodeblock: Unknown state 0x%x",
8592 			    adp->ad_state);
8593 #endif /* INVARIANTS */
8594 		adp->ad_state &= ~ATTACHED;
8595 		adp->ad_state |= UNDONE;
8596 	}
8597 	/*
8598 	 * The on-disk inode cannot claim to be any larger than the last
8599 	 * fragment that has been written. Otherwise, the on-disk inode
8600 	 * might have fragments that were not the last block in the file
8601 	 * which would corrupt the filesystem.
8602 	 */
8603 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8604 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8605 		if (adp->ad_offset >= NDADDR)
8606 			break;
8607 		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
8608 		/* keep going until hitting a rollback to a frag */
8609 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8610 			continue;
8611 		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8612 		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
8613 #ifdef INVARIANTS
8614 			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
8615 				panic("softdep_write_inodeblock: lost dep1");
8616 #endif /* INVARIANTS */
8617 			dp->di_db[i] = 0;
8618 		}
8619 		for (i = 0; i < NIADDR; i++) {
8620 #ifdef INVARIANTS
8621 			if (dp->di_ib[i] != 0 &&
8622 			    (deplist & ((1 << NDADDR) << i)) == 0)
8623 				panic("softdep_write_inodeblock: lost dep2");
8624 #endif /* INVARIANTS */
8625 			dp->di_ib[i] = 0;
8626 		}
8627 		return;
8628 	}
8629 	/*
8630 	 * If we have zero'ed out the last allocated block of the file,
8631 	 * roll back the size to the last currently allocated block.
8632 	 * We know that this last allocated block is full-sized, as
8633 	 * we already checked for fragments in the loop above.
8634 	 */
8635 	if (lastadp != NULL &&
8636 	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8637 		for (i = lastadp->ad_offset; i >= 0; i--)
8638 			if (dp->di_db[i] != 0)
8639 				break;
8640 		dp->di_size = (i + 1) * fs->fs_bsize;
8641 	}
8642 	/*
8643 	 * The only dependencies are for indirect blocks.
8644 	 *
8645 	 * The file size for indirect block additions is not guaranteed.
8646 	 * Such a guarantee would be non-trivial to achieve. The conventional
8647 	 * synchronous write implementation also does not make this guarantee.
8648 	 * Fsck should catch and fix discrepancies. Arguably, the file size
8649 	 * can be over-estimated without destroying integrity when the file
8650 	 * moves into the indirect blocks (i.e., is large). If we want to
8651 	 * postpone fsck, we are stuck with this argument.
8652 	 */
8653 	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
8654 		dp->di_ib[adp->ad_offset - NDADDR] = 0;
8655 }
8656 
8657 /*
8658  * Version of initiate_write_inodeblock that handles UFS2 dinodes.
8659  * Note that any bug fixes made to this routine must be done in the
8660  * version found above.
8661  *
8662  * Called from within the procedure above to deal with unsatisfied
8663  * allocation dependencies in an inodeblock. The buffer must be
8664  * locked, thus, no I/O completion operations can occur while we
8665  * are manipulating its associated dependencies.
8666  */
8667 static void
8668 initiate_write_inodeblock_ufs2(inodedep, bp)
8669 	struct inodedep *inodedep;
8670 	struct buf *bp;			/* The inode block */
8671 {
8672 	struct allocdirect *adp, *lastadp;
8673 	struct ufs2_dinode *dp;
8674 	struct ufs2_dinode *sip;
8675 	struct inoref *inoref;
8676 	struct fs *fs;
8677 	ufs_lbn_t i;
8678 #ifdef INVARIANTS
8679 	ufs_lbn_t prevlbn = 0;
8680 #endif
8681 	int deplist;
8682 
8683 	if (inodedep->id_state & IOSTARTED)
8684 		panic("initiate_write_inodeblock_ufs2: already started");
8685 	inodedep->id_state |= IOSTARTED;
8686 	fs = inodedep->id_fs;
8687 	dp = (struct ufs2_dinode *)bp->b_data +
8688 	    ino_to_fsbo(fs, inodedep->id_ino);
8689 
8690 	/*
8691 	 * If we're on the unlinked list but have not yet written our
8692 	 * next pointer, initialize it here.
8693 	 */
8694 	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
8695 		struct inodedep *inon;
8696 
8697 		inon = TAILQ_NEXT(inodedep, id_unlinked);
8698 		dp->di_freelink = inon ? inon->id_ino : 0;
8699 	}
8700 	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) ==
8701 	    (UNLINKED | UNLINKNEXT)) {
8702 		struct inodedep *inon;
8703 		ino_t freelink;
8704 
8705 		inon = TAILQ_NEXT(inodedep, id_unlinked);
8706 		freelink = inon ? inon->id_ino : 0;
8707 		if (freelink != dp->di_freelink)
8708 			panic("ino %p(0x%X) %d, %d != %d",
8709 			    inodedep, inodedep->id_state, inodedep->id_ino,
8710 			    freelink, dp->di_freelink);
8711 	}
8712 	/*
8713 	 * If the bitmap is not yet written, then the allocated
8714 	 * inode cannot be written to disk.
8715 	 */
8716 	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
8717 		if (inodedep->id_savedino2 != NULL)
8718 			panic("initiate_write_inodeblock_ufs2: I/O underway");
8719 		FREE_LOCK(&lk);
8720 		sip = malloc(sizeof(struct ufs2_dinode),
8721 		    M_SAVEDINO, M_SOFTDEP_FLAGS);
8722 		ACQUIRE_LOCK(&lk);
8723 		inodedep->id_savedino2 = sip;
8724 		*inodedep->id_savedino2 = *dp;
8725 		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
8726 		dp->di_gen = inodedep->id_savedino2->di_gen;
8727 		dp->di_freelink = inodedep->id_savedino2->di_freelink;
8728 		return;
8729 	}
8730 	/*
8731 	 * If no dependencies, then there is nothing to roll back.
8732 	 */
8733 	inodedep->id_savedsize = dp->di_size;
8734 	inodedep->id_savedextsize = dp->di_extsize;
8735 	inodedep->id_savednlink = dp->di_nlink;
8736 	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
8737 	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
8738 	    TAILQ_EMPTY(&inodedep->id_inoreflst))
8739 		return;
8740 	/*
8741 	 * Revert the link count to that of the first unwritten journal entry.
8742 	 */
8743 	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
8744 	if (inoref)
8745 		dp->di_nlink = inoref->if_nlink;
8746 
8747 	/*
8748 	 * Set the ext data dependencies to busy.
8749 	 */
8750 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
8751 	     adp = TAILQ_NEXT(adp, ad_next)) {
8752 #ifdef INVARIANTS
8753 		if (deplist != 0 && prevlbn >= adp->ad_offset)
8754 			panic("softdep_write_inodeblock: lbn order");
8755 		prevlbn = adp->ad_offset;
8756 		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
8757 			panic("%s: direct pointer #%jd mismatch %jd != %jd",
8758 			    "softdep_write_inodeblock",
8759 			    (intmax_t)adp->ad_offset,
8760 			    (intmax_t)dp->di_extb[adp->ad_offset],
8761 			    (intmax_t)adp->ad_newblkno);
8762 		deplist |= 1 << adp->ad_offset;
8763 		if ((adp->ad_state & ATTACHED) == 0)
8764 			panic("softdep_write_inodeblock: Unknown state 0x%x",
8765 			    adp->ad_state);
8766 #endif /* INVARIANTS */
8767 		adp->ad_state &= ~ATTACHED;
8768 		adp->ad_state |= UNDONE;
8769 	}
8770 	/*
8771 	 * The on-disk inode cannot claim to be any larger than the last
8772 	 * fragment that has been written. Otherwise, the on-disk inode
8773 	 * might have fragments that were not the last block in the ext
8774 	 * data which would corrupt the filesystem.
8775 	 */
8776 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
8777 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8778 		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
8779 		/* keep going until hitting a rollback to a frag */
8780 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8781 			continue;
8782 		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8783 		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
8784 #ifdef INVARIANTS
8785 			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
8786 				panic("softdep_write_inodeblock: lost dep1");
8787 #endif /* INVARIANTS */
8788 			dp->di_extb[i] = 0;
8789 		}
8790 		lastadp = NULL;
8791 		break;
8792 	}
8793 	/*
8794 	 * If we have zero'ed out the last allocated block of the ext
8795 	 * data, roll back the size to the last currently allocated block.
8796 	 * We know that this last allocated block is full-sized, as
8797 	 * we already checked for fragments in the loop above.
8798 	 */
8799 	if (lastadp != NULL &&
8800 	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8801 		for (i = lastadp->ad_offset; i >= 0; i--)
8802 			if (dp->di_extb[i] != 0)
8803 				break;
8804 		dp->di_extsize = (i + 1) * fs->fs_bsize;
8805 	}
8806 	/*
8807 	 * Set the file data dependencies to busy.
8808 	 */
8809 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8810 	     adp = TAILQ_NEXT(adp, ad_next)) {
8811 #ifdef INVARIANTS
8812 		if (deplist != 0 && prevlbn >= adp->ad_offset)
8813 			panic("softdep_write_inodeblock: lbn order");
8814 		prevlbn = adp->ad_offset;
8815 		if (adp->ad_offset < NDADDR &&
8816 		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
8817 			panic("%s: direct pointer #%jd mismatch %jd != %jd",
8818 			    "softdep_write_inodeblock",
8819 			    (intmax_t)adp->ad_offset,
8820 			    (intmax_t)dp->di_db[adp->ad_offset],
8821 			    (intmax_t)adp->ad_newblkno);
8822 		if (adp->ad_offset >= NDADDR &&
8823 		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
8824 			panic("%s indirect pointer #%jd mismatch %jd != %jd",
8825 			    "softdep_write_inodeblock:",
8826 			    (intmax_t)adp->ad_offset - NDADDR,
8827 			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
8828 			    (intmax_t)adp->ad_newblkno);
8829 		deplist |= 1 << adp->ad_offset;
8830 		if ((adp->ad_state & ATTACHED) == 0)
8831 			panic("softdep_write_inodeblock: Unknown state 0x%x",
8832 			    adp->ad_state);
8833 #endif /* INVARIANTS */
8834 		adp->ad_state &= ~ATTACHED;
8835 		adp->ad_state |= UNDONE;
8836 	}
8837 	/*
8838 	 * The on-disk inode cannot claim to be any larger than the last
8839 	 * fragment that has been written. Otherwise, the on-disk inode
8840 	 * might have fragments that were not the last block in the file
8841 	 * which would corrupt the filesystem.
8842 	 */
8843 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8844 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8845 		if (adp->ad_offset >= NDADDR)
8846 			break;
8847 		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
8848 		/* keep going until hitting a rollback to a frag */
8849 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8850 			continue;
8851 		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8852 		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
8853 #ifdef INVARIANTS
8854 			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
8855 				panic("softdep_write_inodeblock: lost dep2");
8856 #endif /* INVARIANTS */
8857 			dp->di_db[i] = 0;
8858 		}
8859 		for (i = 0; i < NIADDR; i++) {
8860 #ifdef INVARIANTS
8861 			if (dp->di_ib[i] != 0 &&
8862 			    (deplist & ((1 << NDADDR) << i)) == 0)
8863 				panic("softdep_write_inodeblock: lost dep3");
8864 #endif /* INVARIANTS */
8865 			dp->di_ib[i] = 0;
8866 		}
8867 		return;
8868 	}
8869 	/*
8870 	 * If we have zero'ed out the last allocated block of the file,
8871 	 * roll back the size to the last currently allocated block.
8872 	 * We know that this last allocated block is full-sized, as
8873 	 * we already checked for fragments in the loop above.
8874 	 */
8875 	if (lastadp != NULL &&
8876 	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8877 		for (i = lastadp->ad_offset; i >= 0; i--)
8878 			if (dp->di_db[i] != 0)
8879 				break;
8880 		dp->di_size = (i + 1) * fs->fs_bsize;
8881 	}
8882 	/*
8883 	 * The only dependencies are for indirect blocks.
8884 	 *
8885 	 * The file size for indirect block additions is not guaranteed.
8886 	 * Such a guarantee would be non-trivial to achieve. The conventional
8887 	 * synchronous write implementation also does not make this guarantee.
8888 	 * Fsck should catch and fix discrepancies. Arguably, the file size
8889 	 * can be over-estimated without destroying integrity when the file
8890 	 * moves into the indirect blocks (i.e., is large). If we want to
8891 	 * postpone fsck, we are stuck with this argument.
8892 	 */
8893 	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
8894 		dp->di_ib[adp->ad_offset - NDADDR] = 0;
8895 }
8896 
8897 /*
8898  * Cancel an indirdep as a result of truncation.  Release all of the
8899  * children allocindirs and place their journal work on the appropriate
8900  * list.
8901  */
8902 static void
8903 cancel_indirdep(indirdep, bp, inodedep, freeblks)
8904 	struct indirdep *indirdep;
8905 	struct buf *bp;
8906 	struct inodedep *inodedep;
8907 	struct freeblks *freeblks;
8908 {
8909 	struct allocindir *aip;
8910 
8911 	/*
8912 	 * None of the indirect pointers will ever be visible,
8913 	 * so they can simply be tossed. GOINGAWAY ensures
8914 	 * that allocated pointers will be saved in the buffer
8915 	 * cache until they are freed. Note that they can
8916 	 * only be found by their physical address since the
8917 	 * inode mapping the logical address will
8918 	 * be gone. The save buffer used for the safe copy
8919 	 * was allocated in setup_allocindir_phase2 using
8920 	 * the physical address so it could be used for this
8921 	 * purpose. Hence we swap the safe copy with the real
8922 	 * copy, allowing the safe copy to be freed and holding
8923 	 * on to the real copy for later use in indir_trunc.
8924 	 */
8925 	if (indirdep->ir_state & GOINGAWAY)
8926 		panic("cancel_indirdep: already gone");
8927 	if (indirdep->ir_state & ONDEPLIST) {
8928 		indirdep->ir_state &= ~ONDEPLIST;
8929 		LIST_REMOVE(indirdep, ir_next);
8930 	}
8931 	indirdep->ir_state |= GOINGAWAY;
8932 	VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
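	/*
	 * Cancel every allocindir tracked by this indirdep, regardless of
	 * which state list it currently sits on.
	 */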
8933 	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
8934 		cancel_allocindir(aip, inodedep, freeblks);
8935 	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
8936 		cancel_allocindir(aip, inodedep, freeblks);
8937 	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
8938 		cancel_allocindir(aip, inodedep, freeblks);
8939 	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
8940 		cancel_allocindir(aip, inodedep, freeblks);
8941 	bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
8942 	WORKLIST_REMOVE(&indirdep->ir_list);
8943 	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
8944 	indirdep->ir_savebp = NULL;
8945 }
8946 
8947 /*
8948  * Free an indirdep once it no longer has new pointers to track.
8949  */
8950 static void
8951 free_indirdep(indirdep)
8952 	struct indirdep *indirdep;
8953 {
8954 
8955 	KASSERT(LIST_EMPTY(&indirdep->ir_jwork),
8956 	    ("free_indirdep: Journal work not empty."));
8957 	KASSERT(LIST_EMPTY(&indirdep->ir_jnewblkhd),
8958 	    ("free_indirdep: Journal new block list not empty."));
8959 	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
8960 	    ("free_indirdep: Complete head not empty."));
8961 	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
8962 	    ("free_indirdep: write head not empty."));
8963 	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
8964 	    ("free_indirdep: done head not empty."));
8965 	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
8966 	    ("free_indirdep: deplist head not empty."));
8967 	KASSERT(indirdep->ir_savebp == NULL,
8968 	    ("free_indirdep: %p ir_savebp != NULL", indirdep));
8969 	KASSERT((indirdep->ir_state & ONDEPLIST) == 0,
8970 	    ("free_indirdep: %p still on deplist.", indirdep));
8971 	if (indirdep->ir_state & ONWORKLIST)
8972 		WORKLIST_REMOVE(&indirdep->ir_list);
8973 	WORKITEM_FREE(indirdep, D_INDIRDEP);
8974 }
8975 
8976 /*
8977  * Called before a write to an indirdep.  This routine is responsible for
8978  * rolling back pointers to a safe state which includes only those
8979  * allocindirs which have been completed.
8980  */
8981 static void
8982 initiate_write_indirdep(indirdep, bp)
8983 	struct indirdep *indirdep;
8984 	struct buf *bp;
8985 {
8986 
8987 	if (indirdep->ir_state & GOINGAWAY)
8988 		panic("disk_io_initiation: indirdep gone");
8989 
8990 	/*
8991 	 * If there are no remaining dependencies, this will be writing
8992 	 * the real pointers.
8993 	 */
8994 	if (LIST_EMPTY(&indirdep->ir_deplisthd))
8995 		return;
8996 	/*
8997 	 * Replace up-to-date version with safe version.
8998 	 */
8999 	FREE_LOCK(&lk);
9000 	indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
9001 	    M_SOFTDEP_FLAGS);
9002 	ACQUIRE_LOCK(&lk);
9003 	indirdep->ir_state &= ~ATTACHED;
9004 	indirdep->ir_state |= UNDONE;
9005 	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
9006 	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
9007 	    bp->b_bcount);
9008 }
9009 
9010 /*
9011  * Called when an inode has been cleared in a cg bitmap.  This finally
9012  * eliminates any canceled jaddrefs
9013  */
9014 void
9015 softdep_setup_inofree(mp, bp, ino, wkhd)
9016 	struct mount *mp;
9017 	struct buf *bp;
9018 	ino_t ino;
9019 	struct workhead *wkhd;
9020 {
9021 	struct worklist *wk, *wkn;
9022 	struct inodedep *inodedep;
9023 	uint8_t *inosused;
9024 	struct cg *cgp;
9025 	struct fs *fs;
9026 
9027 	ACQUIRE_LOCK(&lk);
9028 	fs = VFSTOUFS(mp)->um_fs;
9029 	cgp = (struct cg *)bp->b_data;
9030 	inosused = cg_inosused(cgp);
9031 	if (isset(inosused, ino % fs->fs_ipg))
9032 		panic("softdep_setup_inofree: inode %d not freed.", ino);
9033 	if (inodedep_lookup(mp, ino, 0, &inodedep))
9034 		panic("softdep_setup_inofree: ino %d has existing inodedep %p",
9035 		    ino, inodedep);
9036 	if (wkhd) {
9037 		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
9038 			if (wk->wk_type != D_JADDREF)
9039 				continue;
9040 			WORKLIST_REMOVE(wk);
9041 			/*
9042 			 * We can free immediately even if the jaddref
9043 			 * isn't attached in a background write as now
9044 			 * the bitmaps are reconciled.
9045 			 */
9046 			wk->wk_state |= COMPLETE | ATTACHED;
9047 			free_jaddref(WK_JADDREF(wk));
9048 		}
9049 		jwork_move(&bp->b_dep, wkhd);
9050 	}
9051 	FREE_LOCK(&lk);
9052 }
9053 
9054 
9055 /*
9056  * Called via ffs_blkfree() after a set of frags has been cleared from a cg
9057  * map.  Any dependencies waiting for the write to clear are added to the
9058  * buf's list and any jnewblks that are being canceled are discarded
9059  * immediately.
9060  */
9061 void
9062 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
9063 	struct mount *mp;
9064 	struct buf *bp;
9065 	ufs2_daddr_t blkno;
9066 	int frags;
9067 	struct workhead *wkhd;
9068 {
9069 	struct jnewblk *jnewblk;
9070 	struct worklist *wk, *wkn;
9071 #ifdef SUJ_DEBUG
9072 	struct bmsafemap *bmsafemap;
9073 	struct fs *fs;
9074 	uint8_t *blksfree;
9075 	struct cg *cgp;
9076 	ufs2_daddr_t jstart;
9077 	ufs2_daddr_t jend;
9078 	ufs2_daddr_t end;
9079 	long bno;
9080 	int i;
9081 #endif
9082 
9083 	ACQUIRE_LOCK(&lk);
9084 	/*
9085 	 * Detach any jnewblks which have been canceled.  They must linger
9086 	 * until the bitmap is cleared again by ffs_blkfree() to prevent
9087 	 * an unjournaled allocation from hitting the disk.
9088 	 */
9089 	if (wkhd) {
9090 		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
9091 			if (wk->wk_type != D_JNEWBLK)
9092 				continue;
9093 			jnewblk = WK_JNEWBLK(wk);
9094 			KASSERT(jnewblk->jn_state & GOINGAWAY,
9095 			    ("softdep_setup_blkfree: jnewblk not canceled."));
9096 			WORKLIST_REMOVE(wk);
9097 #ifdef SUJ_DEBUG
9098 			/*
9099 			 * Assert that this block is free in the bitmap
9100 			 * before we discard the jnewblk.
9101 			 */
9102 			fs = VFSTOUFS(mp)->um_fs;
9103 			cgp = (struct cg *)bp->b_data;
9104 			blksfree = cg_blksfree(cgp);
9105 			bno = dtogd(fs, jnewblk->jn_blkno);
9106 			for (i = jnewblk->jn_oldfrags;
9107 			    i < jnewblk->jn_frags; i++) {
9108 				if (isset(blksfree, bno + i))
9109 					continue;
9110 				panic("softdep_setup_blkfree: not free");
9111 			}
9112 #endif
9113 			/*
9114 			 * Even if it's not attached we can free immediately
9115 			 * as the new bitmap is correct.
9116 			 */
9117 			wk->wk_state |= COMPLETE | ATTACHED;
9118 			free_jnewblk(jnewblk);
9119 		}
9120 		/*
9121 		 * The buf must be locked by the caller, otherwise these could
9122 		 * be added while it's being written and the write would
9123 		 * complete them before they made it to disk.
9124 		 */
9125 		jwork_move(&bp->b_dep, wkhd);
9126 	}
9127 
9128 #ifdef SUJ_DEBUG
9129 	/*
9130 	 * Assert that we are not freeing a block which has an outstanding
9131 	 * allocation dependency.
9132 	 */
9133 	fs = VFSTOUFS(mp)->um_fs;
9134 	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno));
9135 	end = blkno + frags;
9136 	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
9137 		/*
9138 		 * Don't match against blocks that will be freed when the
9139 		 * background write is done.
9140 		 */
9141 		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
9142 		    (COMPLETE | DEPCOMPLETE))
9143 			continue;
9144 		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
9145 		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
9146 		if ((blkno >= jstart && blkno < jend) ||
9147 		    (end > jstart && end <= jend)) {
9148 			printf("state 0x%X %jd - %d %d dep %p\n",
9149 			    jnewblk->jn_state, jnewblk->jn_blkno,
9150 			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
9151 			    jnewblk->jn_dep);
9152 			panic("softdep_setup_blkfree: "
9153 			    "%jd-%jd(%d) overlaps with %jd-%jd",
9154 			    blkno, end, frags, jstart, jend);
9155 		}
9156 	}
9157 #endif
9158 	FREE_LOCK(&lk);
9159 }
9160 
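/*
 * Called just before a cg bitmap block is written.  Any inode or block
 * allocations whose journal records have not yet been written are rolled
 * back in the bitmap so an unjournaled allocation never appears on disk.
 */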
9161 static void
9162 initiate_write_bmsafemap(bmsafemap, bp)
9163 	struct bmsafemap *bmsafemap;
9164 	struct buf *bp;			/* The cg block. */
9165 {
9166 	struct jaddref *jaddref;
9167 	struct jnewblk *jnewblk;
9168 	uint8_t *inosused;
9169 	uint8_t *blksfree;
9170 	struct cg *cgp;
9171 	struct fs *fs;
9172 	int cleared;
9173 	ino_t ino;
9174 	long bno;
9175 	int i;
9176 
9177 	if (bmsafemap->sm_state & IOSTARTED)
9178 		panic("initiate_write_bmsafemap: Already started\n");
9179 	bmsafemap->sm_state |= IOSTARTED;
9180 	/*
9181 	 * Clear any inode allocations which are pending journal writes.
9182 	 */
9183 	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
9184 		cgp = (struct cg *)bp->b_data;
9185 		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
9186 		inosused = cg_inosused(cgp);
9187 		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
9188 			ino = jaddref->ja_ino % fs->fs_ipg;
9189 			/*
9190 			 * If this is a background copy the inode may not
9191 			 * be marked used yet.
9192 			 */
9193 			if (isset(inosused, ino)) {
9194 				if ((jaddref->ja_mode & IFMT) == IFDIR)
9195 					cgp->cg_cs.cs_ndir--;
9196 				cgp->cg_cs.cs_nifree++;
9197 				clrbit(inosused, ino);
9198 				jaddref->ja_state &= ~ATTACHED;
9199 				jaddref->ja_state |= UNDONE;
9200 				stat_jaddref++;
9201 			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
9202 				panic("initiate_write_bmsafemap: inode %d "
9203 				    "marked free", jaddref->ja_ino);
9204 		}
9205 	}
9206 	/*
9207 	 * Clear any block allocations which are pending journal writes.
9208 	 */
9209 	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
9210 		cgp = (struct cg *)bp->b_data;
9211 		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
9212 		blksfree = cg_blksfree(cgp);
9213 		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
9214 			bno = dtogd(fs, jnewblk->jn_blkno);
9215 			cleared = 0;
9216 			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
9217 			    i++) {
9218 				if (isclr(blksfree, bno + i)) {
9219 					cleared = 1;
9220 					setbit(blksfree, bno + i);
9221 				}
9222 			}
9223 			/*
9224 			 * We may not clear the block if it's a background
9225 			 * copy.  In that case there is no reason to detach
9226 			 * it.
9227 			 */
9228 			if (cleared) {
9229 				stat_jnewblk++;
9230 				jnewblk->jn_state &= ~ATTACHED;
9231 				jnewblk->jn_state |= UNDONE;
9232 			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
9233 				panic("initiate_write_bmsafemap: block %jd "
9234 				    "marked free", jnewblk->jn_blkno);
9235 		}
9236 	}
9237 	/*
9238 	 * Move allocation lists to the written lists so they can be
9239 	 * cleared once the block write is complete.
9240 	 */
9241 	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
9242 	    inodedep, id_deps);
9243 	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
9244 	    newblk, nb_deps);
9245 }
9246 
9247 /*
9248  * This routine is called during the completion interrupt
9249  * service routine for a disk write (from the procedure called
9250  * by the device driver to inform the filesystem caches of
9251  * a request completion).  It should be called early in this
9252  * procedure, before the block is made available to other
9253  * processes or other routines are called.
9254  *
9255  */
9256 static void
9257 softdep_disk_write_complete(bp)
9258 	struct buf *bp;		/* describes the completed disk write */
9259 {
9260 	struct worklist *wk;
9261 	struct worklist *owk;
9262 	struct workhead reattach;
9263 	struct buf *sbp;
9264 
9265 	/*
9266 	 * If an error occurred while doing the write, then the data
9267 	 * has not hit the disk and the dependencies cannot be unrolled.
9268 	 */
9269 	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
9270 		return;
9271 	LIST_INIT(&reattach);
9272 	/*
9273 	 * This lock must not be released anywhere in this code segment.
9274 	 */
9275 	sbp = NULL;
9276 	owk = NULL;
9277 	ACQUIRE_LOCK(&lk);
9278 	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
9279 		WORKLIST_REMOVE(wk);
9280 		if (wk == owk)
9281 			panic("duplicate worklist: %p\n", wk);
9282 		owk = wk;
9283 		switch (wk->wk_type) {
9284 
9285 		case D_PAGEDEP:
9286 			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
9287 				WORKLIST_INSERT(&reattach, wk);
9288 			continue;
9289 
9290 		case D_INODEDEP:
9291 			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
9292 				WORKLIST_INSERT(&reattach, wk);
9293 			continue;
9294 
9295 		case D_BMSAFEMAP:
9296 			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
9297 				WORKLIST_INSERT(&reattach, wk);
9298 			continue;
9299 
9300 		case D_MKDIR:
9301 			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
9302 			continue;
9303 
9304 		case D_ALLOCDIRECT:
9305 			wk->wk_state |= COMPLETE;
9306 			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
9307 			continue;
9308 
9309 		case D_ALLOCINDIR:
9310 			wk->wk_state |= COMPLETE;
9311 			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
9312 			continue;
9313 
9314 		case D_INDIRDEP:
9315 			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
9316 				WORKLIST_INSERT(&reattach, wk);
9317 			continue;
9318 
9319 		case D_FREEBLKS:
9320 			wk->wk_state |= COMPLETE;
9321 			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
9322 				add_to_worklist(wk, 1);
9323 			continue;
9324 
9325 		case D_FREEWORK:
9326 			handle_written_freework(WK_FREEWORK(wk));
9327 			break;
9328 
9329 		case D_FREEDEP:
9330 			free_freedep(WK_FREEDEP(wk));
9331 			continue;
9332 
9333 		case D_JSEGDEP:
9334 			free_jsegdep(WK_JSEGDEP(wk));
9335 			continue;
9336 
9337 		case D_JSEG:
9338 			handle_written_jseg(WK_JSEG(wk), bp);
9339 			continue;
9340 
9341 		case D_SBDEP:
9342 			if (handle_written_sbdep(WK_SBDEP(wk), bp))
9343 				WORKLIST_INSERT(&reattach, wk);
9344 			continue;
9345 
9346 		default:
9347 			panic("handle_disk_write_complete: Unknown type %s",
9348 			    TYPENAME(wk->wk_type));
9349 			/* NOTREACHED */
9350 		}
9351 	}
9352 	/*
9353 	 * Reattach any requests that must be redone.
9354 	 */
9355 	while ((wk = LIST_FIRST(&reattach)) != NULL) {
9356 		WORKLIST_REMOVE(wk);
9357 		WORKLIST_INSERT(&bp->b_dep, wk);
9358 	}
9359 	FREE_LOCK(&lk);
9360 	if (sbp)
9361 		brelse(sbp);
9362 }
9363 
9364 /*
9365  * Called from within softdep_disk_write_complete above. Note that
9366  * this routine is always called from interrupt level with further
9367  * splbio interrupts blocked.
9368  */
9369 static void
9370 handle_allocdirect_partdone(adp, wkhd)
9371 	struct allocdirect *adp;	/* the completed allocdirect */
9372 	struct workhead *wkhd;		/* Work to do when inode is written. */
9373 {
9374 	struct allocdirectlst *listhead;
9375 	struct allocdirect *listadp;
9376 	struct inodedep *inodedep;
9377 	long bsize;
9378 
9379 	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
9380 		return;
9381 	/*
9382 	 * The on-disk inode cannot claim to be any larger than the last
9383 	 * fragment that has been written. Otherwise, the on-disk inode
9384 	 * might have fragments that were not the last block in the file
9385 	 * which would corrupt the filesystem. Thus, we cannot free any
9386 	 * allocdirects after one whose ad_oldblkno claims a fragment as
9387 	 * these blocks must be rolled back to zero before writing the inode.
9388 	 * We check the currently active set of allocdirects in id_inoupdt
9389 	 * or id_extupdt as appropriate.
9390 	 */
9391 	inodedep = adp->ad_inodedep;
9392 	bsize = inodedep->id_fs->fs_bsize;
9393 	if (adp->ad_state & EXTDATA)
9394 		listhead = &inodedep->id_extupdt;
9395 	else
9396 		listhead = &inodedep->id_inoupdt;
9397 	TAILQ_FOREACH(listadp, listhead, ad_next) {
9398 		/* found our block */
9399 		if (listadp == adp)
9400 			break;
9401 		/* continue if the old block is not a fragment */
9402 		if (listadp->ad_oldsize == 0 ||
9403 		    listadp->ad_oldsize == bsize)
9404 			continue;
9405 		/* hit a fragment */
9406 		return;
9407 	}
9408 	/*
9409 	 * If we have reached the end of the current list without
9410 	 * finding the just finished dependency, then it must be
9411 	 * on the future dependency list. Future dependencies cannot
9412 	 * be freed until they are moved to the current list.
9413 	 */
9414 	if (listadp == NULL) {
9415 #ifdef DEBUG
9416 		if (adp->ad_state & EXTDATA)
9417 			listhead = &inodedep->id_newextupdt;
9418 		else
9419 			listhead = &inodedep->id_newinoupdt;
9420 		TAILQ_FOREACH(listadp, listhead, ad_next)
9421 			/* found our block */
9422 			if (listadp == adp)
9423 				break;
9424 		if (listadp == NULL)
9425 			panic("handle_allocdirect_partdone: lost dep");
9426 #endif /* DEBUG */
9427 		return;
9428 	}
9429 	/*
9430 	 * If we have found the just finished dependency, then queue
9431 	 * it along with anything that follows it that is complete.
9432 	 * Since the pointer has not yet been written in the inode
9433 	 * as the dependency prevents it, place the allocdirect on the
9434 	 * bufwait list where it will be freed once the pointer is
9435 	 * valid.
9436 	 */
9437 	if (wkhd == NULL)
9438 		wkhd = &inodedep->id_bufwait;
9439 	for (; adp; adp = listadp) {
9440 		listadp = TAILQ_NEXT(adp, ad_next);
9441 		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
9442 			return;
9443 		TAILQ_REMOVE(listhead, adp, ad_next);
9444 		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
9445 	}
9446 }
9447 
9448 /*
9449  * Called from within softdep_disk_write_complete above.  This routine
9450  * completes successfully written allocindirs.
9451  */
9452 static void
9453 handle_allocindir_partdone(aip)
9454 	struct allocindir *aip;		/* the completed allocindir */
9455 {
9456 	struct indirdep *indirdep;
9457 
9458 	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
9459 		return;
9460 	indirdep = aip->ai_indirdep;
9461 	LIST_REMOVE(aip, ai_next);
9462 	if (indirdep->ir_state & UNDONE) {
9463 		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
9464 		return;
9465 	}
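	/*
	 * Record the now-valid pointer in the safe copy (ir_savebp) of
	 * the indirect block; the safe copy is what gets written while
	 * dependencies remain.
	 */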
9466 	if (indirdep->ir_state & UFS1FMT)
9467 		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
9468 		    aip->ai_newblkno;
9469 	else
9470 		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
9471 		    aip->ai_newblkno;
9472 	/*
9473 	 * Await the pointer write before freeing the allocindir.
9474 	 */
9475 	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
9476 }
9477 
9478 /*
9479  * Release segments held on a jwork list.
9480  */
9481 static void
9482 handle_jwork(wkhd)
9483 	struct workhead *wkhd;
9484 {
9485 	struct worklist *wk;
9486 
9487 	while ((wk = LIST_FIRST(wkhd)) != NULL) {
9488 		WORKLIST_REMOVE(wk);
9489 		switch (wk->wk_type) {
9490 		case D_JSEGDEP:
9491 			free_jsegdep(WK_JSEGDEP(wk));
9492 			continue;
9493 		default:
9494 			panic("handle_jwork: Unknown type %s\n",
9495 			    TYPENAME(wk->wk_type));
9496 		}
9497 	}
9498 }
9499 
9500 /*
9501  * Handle the bufwait list on an inode when it is safe to release items
9502  * held there.  This normally happens after an inode block is written but
9503  * may be delayed and handled later if there are pending journal items that
9504  * are not yet safe to be released.
9505  */
9506 static struct freefile *
9507 handle_bufwait(inodedep, refhd)
9508 	struct inodedep *inodedep;
9509 	struct workhead *refhd;
9510 {
9511 	struct jaddref *jaddref;
9512 	struct freefile *freefile;
9513 	struct worklist *wk;
9514 
9515 	freefile = NULL;
9516 	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
9517 		WORKLIST_REMOVE(wk);
9518 		switch (wk->wk_type) {
9519 		case D_FREEFILE:
9520 			/*
9521 			 * We defer adding freefile to the worklist
9522 			 * until all other additions have been made to
9523 			 * ensure that it will be done after all the
9524 			 * old blocks have been freed.
9525 			 */
9526 			if (freefile != NULL)
9527 				panic("handle_bufwait: freefile");
9528 			freefile = WK_FREEFILE(wk);
9529 			continue;
9530 
9531 		case D_MKDIR:
9532 			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
9533 			continue;
9534 
9535 		case D_DIRADD:
9536 			diradd_inode_written(WK_DIRADD(wk), inodedep);
9537 			continue;
9538 
9539 		case D_FREEFRAG:
9540 			wk->wk_state |= COMPLETE;
9541 			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
9542 				add_to_worklist(wk, 0);
9543 			continue;
9544 
9545 		case D_DIRREM:
9546 			wk->wk_state |= COMPLETE;
9547 			add_to_worklist(wk, 0);
9548 			continue;
9549 
9550 		case D_ALLOCDIRECT:
9551 		case D_ALLOCINDIR:
9552 			free_newblk(WK_NEWBLK(wk));
9553 			continue;
9554 
9555 		case D_JNEWBLK:
9556 			wk->wk_state |= COMPLETE;
9557 			free_jnewblk(WK_JNEWBLK(wk));
9558 			continue;
9559 
9560 		/*
9561 		 * Save freed journal segments and add references on
9562 		 * the supplied list which will delay their release
9563 		 * until the cg bitmap is cleared on disk.
9564 		 */
9565 		case D_JSEGDEP:
9566 			if (refhd == NULL)
9567 				free_jsegdep(WK_JSEGDEP(wk));
9568 			else
9569 				WORKLIST_INSERT(refhd, wk);
9570 			continue;
9571 
9572 		case D_JADDREF:
9573 			jaddref = WK_JADDREF(wk);
9574 			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
9575 			    if_deps);
9576 			/*
9577 			 * Transfer any jaddrefs to the list to be freed with
9578 			 * the bitmap if we're handling a removed file.
9579 			 */
9580 			if (refhd == NULL) {
9581 				wk->wk_state |= COMPLETE;
9582 				free_jaddref(jaddref);
9583 			} else
9584 				WORKLIST_INSERT(refhd, wk);
9585 			continue;
9586 
9587 		default:
9588 			panic("handle_bufwait: Unknown type %p(%s)",
9589 			    wk, TYPENAME(wk->wk_type));
9590 			/* NOTREACHED */
9591 		}
9592 	}
9593 	return (freefile);
9594 }
9595 /*
9596  * Called from within softdep_disk_write_complete above to restore
9597  * in-memory inode block contents to their most up-to-date state. Note
9598  * that this routine is always called from interrupt level with further
9599  * splbio interrupts blocked.
9600  */
9601 static int
9602 handle_written_inodeblock(inodedep, bp)
9603 	struct inodedep *inodedep;
9604 	struct buf *bp;		/* buffer containing the inode block */
9605 {
9606 	struct freefile *freefile;
9607 	struct allocdirect *adp, *nextadp;
9608 	struct ufs1_dinode *dp1 = NULL;
9609 	struct ufs2_dinode *dp2 = NULL;
9610 	struct workhead wkhd;
9611 	int hadchanges, fstype;
9612 	ino_t freelink;
9613 
9614 	LIST_INIT(&wkhd);
9615 	hadchanges = 0;
9616 	freefile = NULL;
9617 	if ((inodedep->id_state & IOSTARTED) == 0)
9618 		panic("handle_written_inodeblock: not started");
9619 	inodedep->id_state &= ~IOSTARTED;
9620 	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
9621 		fstype = UFS1;
9622 		dp1 = (struct ufs1_dinode *)bp->b_data +
9623 		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
9624 		freelink = dp1->di_freelink;
9625 	} else {
9626 		fstype = UFS2;
9627 		dp2 = (struct ufs2_dinode *)bp->b_data +
9628 		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
9629 		freelink = dp2->di_freelink;
9630 	}
9631 	/*
9632 	 * If we wrote a valid freelink pointer during the last write,
9633 	 * record it here.
9634 	 */
9635 	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
9636 		struct inodedep *inon;
9637 
9638 		inon = TAILQ_NEXT(inodedep, id_unlinked);
9639 		if ((inon == NULL && freelink == 0) ||
9640 		    (inon && inon->id_ino == freelink)) {
9641 			if (inon)
9642 				inon->id_state |= UNLINKPREV;
9643 			inodedep->id_state |= UNLINKNEXT;
9644 		} else
9645 			hadchanges = 1;
9646 	}
9647 	/* Leave this inodeblock dirty until it's in the list. */
9648 	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED)
9649 		hadchanges = 1;
9650 	/*
9651 	 * If we had to roll back the inode allocation because of
9652 	 * bitmaps being incomplete, then simply restore it.
9653 	 * Keep the block dirty so that it will not be reclaimed until
9654 	 * all associated dependencies have been cleared and the
9655 	 * corresponding updates written to disk.
9656 	 */
9657 	if (inodedep->id_savedino1 != NULL) {
9658 		hadchanges = 1;
9659 		if (fstype == UFS1)
9660 			*dp1 = *inodedep->id_savedino1;
9661 		else
9662 			*dp2 = *inodedep->id_savedino2;
9663 		free(inodedep->id_savedino1, M_SAVEDINO);
9664 		inodedep->id_savedino1 = NULL;
9665 		if ((bp->b_flags & B_DELWRI) == 0)
9666 			stat_inode_bitmap++;
9667 		bdirty(bp);
9668 		/*
9669 		 * If the inode is clear here and GOINGAWAY it will never
9670 		 * be written.  Process the bufwait and clear any pending
9671 		 * work which may include the freefile.
9672 		 */
9673 		if (inodedep->id_state & GOINGAWAY)
9674 			goto bufwait;
9675 		return (1);
9676 	}
9677 	inodedep->id_state |= COMPLETE;
9678 	/*
9679 	 * Roll forward anything that had to be rolled back before
9680 	 * the inode could be updated.
9681 	 */
9682 	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
9683 		nextadp = TAILQ_NEXT(adp, ad_next);
9684 		if (adp->ad_state & ATTACHED)
9685 			panic("handle_written_inodeblock: new entry");
9686 		if (fstype == UFS1) {
9687 			if (adp->ad_offset < NDADDR) {
9688 				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
9689 					panic("%s %s #%jd mismatch %d != %jd",
9690 					    "handle_written_inodeblock:",
9691 					    "direct pointer",
9692 					    (intmax_t)adp->ad_offset,
9693 					    dp1->di_db[adp->ad_offset],
9694 					    (intmax_t)adp->ad_oldblkno);
9695 				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
9696 			} else {
9697 				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
9698 					panic("%s: %s #%jd allocated as %d",
9699 					    "handle_written_inodeblock",
9700 					    "indirect pointer",
9701 					    (intmax_t)adp->ad_offset - NDADDR,
9702 					    dp1->di_ib[adp->ad_offset - NDADDR]);
9703 				dp1->di_ib[adp->ad_offset - NDADDR] =
9704 				    adp->ad_newblkno;
9705 			}
9706 		} else {
9707 			if (adp->ad_offset < NDADDR) {
9708 				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
9709 					panic("%s: %s #%jd %s %jd != %jd",
9710 					    "handle_written_inodeblock",
9711 					    "direct pointer",
9712 					    (intmax_t)adp->ad_offset, "mismatch",
9713 					    (intmax_t)dp2->di_db[adp->ad_offset],
9714 					    (intmax_t)adp->ad_oldblkno);
9715 				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
9716 			} else {
9717 				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
9718 					panic("%s: %s #%jd allocated as %jd",
9719 					    "handle_written_inodeblock",
9720 					    "indirect pointer",
9721 					    (intmax_t)adp->ad_offset - NDADDR,
9722 					    (intmax_t)
9723 					    dp2->di_ib[adp->ad_offset - NDADDR]);
9724 				dp2->di_ib[adp->ad_offset - NDADDR] =
9725 				    adp->ad_newblkno;
9726 			}
9727 		}
9728 		adp->ad_state &= ~UNDONE;
9729 		adp->ad_state |= ATTACHED;
9730 		hadchanges = 1;
9731 	}
9732 	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
9733 		nextadp = TAILQ_NEXT(adp, ad_next);
9734 		if (adp->ad_state & ATTACHED)
9735 			panic("handle_written_inodeblock: new entry");
9736 		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
9737 			panic("%s: direct pointers #%jd %s %jd != %jd",
9738 			    "handle_written_inodeblock",
9739 			    (intmax_t)adp->ad_offset, "mismatch",
9740 			    (intmax_t)dp2->di_extb[adp->ad_offset],
9741 			    (intmax_t)adp->ad_oldblkno);
9742 		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
9743 		adp->ad_state &= ~UNDONE;
9744 		adp->ad_state |= ATTACHED;
9745 		hadchanges = 1;
9746 	}
9747 	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
9748 		stat_direct_blk_ptrs++;
9749 	/*
9750 	 * Reset the file size to its most up-to-date value.
9751 	 */
9752 	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
9753 		panic("handle_written_inodeblock: bad size");
9754 	if (inodedep->id_savednlink > LINK_MAX)
9755 		panic("handle_written_inodeblock: Invalid link count "
9756 		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
9757 	if (fstype == UFS1) {
9758 		if (dp1->di_nlink != inodedep->id_savednlink) {
9759 			dp1->di_nlink = inodedep->id_savednlink;
9760 			hadchanges = 1;
9761 		}
9762 		if (dp1->di_size != inodedep->id_savedsize) {
9763 			dp1->di_size = inodedep->id_savedsize;
9764 			hadchanges = 1;
9765 		}
9766 	} else {
9767 		if (dp2->di_nlink != inodedep->id_savednlink) {
9768 			dp2->di_nlink = inodedep->id_savednlink;
9769 			hadchanges = 1;
9770 		}
9771 		if (dp2->di_size != inodedep->id_savedsize) {
9772 			dp2->di_size = inodedep->id_savedsize;
9773 			hadchanges = 1;
9774 		}
9775 		if (dp2->di_extsize != inodedep->id_savedextsize) {
9776 			dp2->di_extsize = inodedep->id_savedextsize;
9777 			hadchanges = 1;
9778 		}
9779 	}
9780 	inodedep->id_savedsize = -1;
9781 	inodedep->id_savedextsize = -1;
9782 	inodedep->id_savednlink = -1;
9783 	/*
9784 	 * If there were any rollbacks in the inode block, then it must be
9785 	 * marked dirty so that it will eventually get written back in
9786 	 * its correct form.
9787 	 */
9788 	if (hadchanges)
9789 		bdirty(bp);
9790 bufwait:
9791 	/*
9792 	 * Process any allocdirects that completed during the update.
9793 	 */
9794 	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
9795 		handle_allocdirect_partdone(adp, &wkhd);
9796 	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
9797 		handle_allocdirect_partdone(adp, &wkhd);
9798 	/*
9799 	 * Process deallocations that were held pending until the
9800 	 * inode had been written to disk. Freeing of the inode
9801 	 * is delayed until after all blocks have been freed to
9802 	 * avoid creation of new <vfsid, inum, lbn> triples
9803 	 * before the old ones have been deleted.  Completely
9804 	 * unlinked inodes are not processed until the unlinked
9805 	 * inode list is written or the last reference is removed.
9806 	 */
9807 	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
9808 		freefile = handle_bufwait(inodedep, NULL);
9809 		if (freefile && !LIST_EMPTY(&wkhd)) {
9810 			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
9811 			freefile = NULL;
9812 		}
9813 	}
9814 	/*
9815 	 * Move rolled forward dependency completions to the bufwait list
9816 	 * now that those that were already written have been processed.
9817 	 */
9818 	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
9819 		panic("handle_written_inodeblock: bufwait but no changes");
9820 	jwork_move(&inodedep->id_bufwait, &wkhd);
9821 
9822 	if (freefile != NULL) {
9823 		/*
9824 		 * If the inode is goingaway it was never written.  Fake up
9825 		 * the state here so free_inodedep() can succeed.
9826 		 */
9827 		if (inodedep->id_state & GOINGAWAY)
9828 			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
9829 		if (free_inodedep(inodedep) == 0)
9830 			panic("handle_written_inodeblock: live inodedep %p",
9831 			    inodedep);
9832 		add_to_worklist(&freefile->fx_list, 0);
9833 		return (0);
9834 	}
9835 
9836 	/*
9837 	 * If no outstanding dependencies, free it.
9838 	 */
9839 	if (free_inodedep(inodedep) ||
9840 	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
9841 	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
9842 	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
9843 	     LIST_FIRST(&inodedep->id_bufwait) == 0))
9844 		return (0);
9845 	return (hadchanges);
9846 }
9847 
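/*
 * Complete a write of an indirect block.  Any pointers that were rolled
 * back for the write are restored, completed allocindirs are advanced,
 * and the indirdep is freed once it no longer tracks unwritten pointers.
 */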
9848 static int
9849 handle_written_indirdep(indirdep, bp, bpp)
9850 	struct indirdep *indirdep;
9851 	struct buf *bp;
9852 	struct buf **bpp;
9853 {
9854 	struct allocindir *aip;
9855 	int chgs;
9856 
9857 	if (indirdep->ir_state & GOINGAWAY)
9858 		panic("disk_write_complete: indirdep gone");
9859 	chgs = 0;
9860 	/*
9861 	 * If there were rollbacks revert them here.
9862 	 */
9863 	if (indirdep->ir_saveddata) {
9864 		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
9865 		free(indirdep->ir_saveddata, M_INDIRDEP);
9866 		indirdep->ir_saveddata = 0;
9867 		chgs = 1;
9868 	}
9869 	indirdep->ir_state &= ~UNDONE;
9870 	indirdep->ir_state |= ATTACHED;
9871 	/*
9872 	 * Move allocindirs with written pointers to the completehd if
9873 	 * the indirdep's pointer is not yet written.  Otherwise
9874 	 * free them here.
9875 	 */
9876 	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
9877 		LIST_REMOVE(aip, ai_next);
9878 		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
9879 			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
9880 			    ai_next);
9881 			continue;
9882 		}
9883 		free_newblk(&aip->ai_block);
9884 	}
9885 	/*
9886 	 * Move allocindirs that have finished dependency processing from
9887 	 * the done list to the write list after updating the pointers.
9888 	 */
9889 	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
9890 		handle_allocindir_partdone(aip);
9891 		if (aip == LIST_FIRST(&indirdep->ir_donehd))
9892 			panic("disk_write_complete: not gone");
9893 		chgs = 1;
9894 	}
9895 	/*
9896 	 * If this indirdep has been detached from its newblk during
9897 	 * I/O we need to keep this dep attached to the buffer so
9898 	 * deallocate_dependencies can find it and properly resolve
9899 	 * any outstanding dependencies.
9900 	 */
9901 	if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0)
9902 		chgs = 1;
9903 	if ((bp->b_flags & B_DELWRI) == 0)
9904 		stat_indir_blk_ptrs++;
9905 	/*
9906 	 * If there were no changes we can discard the savedbp and detach
9907 	 * ourselves from the buf.  We are only carrying completed pointers
9908 	 * in this case.
9909 	 */
9910 	if (chgs == 0) {
9911 		struct buf *sbp;
9912 
9913 		sbp = indirdep->ir_savebp;
9914 		sbp->b_flags |= B_INVAL | B_NOCACHE;
9915 		indirdep->ir_savebp = NULL;
9916 		if (*bpp != NULL)
9917 			panic("handle_written_indirdep: bp already exists.");
9918 		*bpp = sbp;
9919 	} else
9920 		bdirty(bp);
9921 	/*
9922 	 * If there are no fresh dependencies and none waiting on writes
9923 	 * we can free the indirdep.
9924 	 */
9925 	if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) {
9926 		if (indirdep->ir_state & ONDEPLIST)
9927 			LIST_REMOVE(indirdep, ir_next);
9928 		free_indirdep(indirdep);
9929 		return (0);
9930 	}
9931 
9932 	return (chgs);
9933 }
9934 
9935 /*
9936  * Process a diradd entry after its dependent inode has been written.
9937  * This routine must be called with splbio interrupts blocked.
9938  */
9939 static void
9940 diradd_inode_written(dap, inodedep)
9941 	struct diradd *dap;
9942 	struct inodedep *inodedep;
9943 {
9944 
9945 	dap->da_state |= COMPLETE;
9946 	complete_diradd(dap);
9947 	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9948 }
9949 
9950 /*
9951  * Returns true if the bmsafemap will have rollbacks when written.  Must
9952  * only be called with lk and the buf lock on the cg held.
9953  */
9954 static int
9955 bmsafemap_rollbacks(bmsafemap)
9956 	struct bmsafemap *bmsafemap;
9957 {
9958 
9959 	return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
9960 	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd));
9961 }
9962 
9963 /*
9964  * Complete a write to a bmsafemap structure.  Roll forward any bitmap
9965  * changes if it's not a background write.  Set all written dependencies
9966  * to DEPCOMPLETE and free the structure if possible.
9967  */
9968 static int
9969 handle_written_bmsafemap(bmsafemap, bp)
9970 	struct bmsafemap *bmsafemap;
9971 	struct buf *bp;
9972 {
9973 	struct newblk *newblk;
9974 	struct inodedep *inodedep;
9975 	struct jaddref *jaddref, *jatmp;
9976 	struct jnewblk *jnewblk, *jntmp;
9977 	uint8_t *inosused;
9978 	uint8_t *blksfree;
9979 	struct cg *cgp;
9980 	struct fs *fs;
9981 	ino_t ino;
9982 	long bno;
9983 	int chgs;
9984 	int i;
9985 
9986 	if ((bmsafemap->sm_state & IOSTARTED) == 0)
9987 		panic("handle_written_bmsafemap: Not started");
9988 	chgs = 0;
9989 	bmsafemap->sm_state &= ~IOSTARTED;
9990 	/*
9991 	 * Restore unwritten inode allocation pending jaddref writes.
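	 * Allocations whose journal entries are still outstanding were rolled
	 * back (UNDONE) before the write; re-apply them to the cylinder group
	 * bitmap here unless this completion is for a background-write copy.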
9992 	 */
9993 	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
9994 		cgp = (struct cg *)bp->b_data;
9995 		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
9996 		inosused = cg_inosused(cgp);
9997 		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
9998 		    ja_bmdeps, jatmp) {
9999 			if ((jaddref->ja_state & UNDONE) == 0)
10000 				continue;
10001 			ino = jaddref->ja_ino % fs->fs_ipg;
10002 			if (isset(inosused, ino))
10003 				panic("handle_written_bmsafemap: "
10004 				    "re-allocated inode");
10005 			if ((bp->b_xflags & BX_BKGRDMARKER) == 0) {
10006 				if ((jaddref->ja_mode & IFMT) == IFDIR)
10007 					cgp->cg_cs.cs_ndir++;
10008 				cgp->cg_cs.cs_nifree--;
10009 				setbit(inosused, ino);
10010 				chgs = 1;
10011 			}
10012 			jaddref->ja_state &= ~UNDONE;
10013 			jaddref->ja_state |= ATTACHED;
10014 			free_jaddref(jaddref);
10015 		}
10016 	}
10017 	/*
10018 	 * Restore any block allocations which are pending journal writes.
10019 	 */
10020 	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
10021 		cgp = (struct cg *)bp->b_data;
10022 		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10023 		blksfree = cg_blksfree(cgp);
10024 		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
10025 		    jntmp) {
10026 			if ((jnewblk->jn_state & UNDONE) == 0)
10027 				continue;
10028 			bno = dtogd(fs, jnewblk->jn_blkno);
10029 			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
10030 			    i++) {
10031 				if (bp->b_xflags & BX_BKGRDMARKER)
10032 					break;
10033 				if ((jnewblk->jn_state & NEWBLOCK) == 0 &&
10034 				    isclr(blksfree, bno + i))
10035 					panic("handle_written_bmsafemap: "
10036 					    "re-allocated fragment");
10037 				clrbit(blksfree, bno + i);
10038 				chgs = 1;
10039 			}
10040 			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
10041 			jnewblk->jn_state |= ATTACHED;
10042 			free_jnewblk(jnewblk);
10043 		}
10044 	}
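	/*
	 * The bitmap write covering these newblk dependencies has completed,
	 * so each one can be marked DEPCOMPLETE, detached from the bmsafemap,
	 * and handed to its type-specific completion handler.
	 */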
10045 	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
10046 		newblk->nb_state |= DEPCOMPLETE;
10047 		newblk->nb_state &= ~ONDEPLIST;
10048 		newblk->nb_bmsafemap = NULL;
10049 		LIST_REMOVE(newblk, nb_deps);
10050 		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
10051 			handle_allocdirect_partdone(
10052 			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
10053 		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
10054 			handle_allocindir_partdone(
10055 			    WK_ALLOCINDIR(&newblk->nb_list));
10056 		else if (newblk->nb_list.wk_type != D_NEWBLK)
10057 			panic("handle_written_bmsafemap: Unexpected type: %s",
10058 			    TYPENAME(newblk->nb_list.wk_type));
10059 	}
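	/*
	 * Likewise, mark the inodedeps tracked by this bitmap DEPCOMPLETE
	 * and detach them from the bmsafemap.
	 */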
10060 	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
10061 		inodedep->id_state |= DEPCOMPLETE;
10062 		inodedep->id_state &= ~ONDEPLIST;
10063 		LIST_REMOVE(inodedep, id_deps);
10064 		inodedep->id_bmsafemap = NULL;
10065 	}
10066 	if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
10067 	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
10068 	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
10069 	    LIST_EMPTY(&bmsafemap->sm_inodedephd)) {
10070 		if (chgs)
10071 			bdirty(bp);
10072 		LIST_REMOVE(bmsafemap, sm_hash);
10073 		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
10074 		return (0);
10075 	}
10076 	bdirty(bp);
10077 	return (1);
10078 }
10079 
10080 /*
10081  * Try to free a mkdir dependency.
10082  */
10083 static void
10084 complete_mkdir(mkdir)
10085 	struct mkdir *mkdir;
10086 {
10087 	struct diradd *dap;
10088 
10089 	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
10090 		return;
10091 	LIST_REMOVE(mkdir, md_mkdirs);
10092 	dap = mkdir->md_diradd;
10093 	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
10094 	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
10095 		dap->da_state |= DEPCOMPLETE;
10096 		complete_diradd(dap);
10097 	}
10098 	WORKITEM_FREE(mkdir, D_MKDIR);
10099 }
10100 
10101 /*
10102  * Handle the completion of a mkdir dependency.
10103  */
10104 static void
10105 handle_written_mkdir(mkdir, type)
10106 	struct mkdir *mkdir;
10107 	int type;
10108 {
10109 
10110 	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
10111 		panic("handle_written_mkdir: bad type");
10112 	mkdir->md_state |= COMPLETE;
10113 	complete_mkdir(mkdir);
10114 }
10115 
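/*
 * Free a pagedep structure once it is no longer tracking any dependencies.
 */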
10116 static void
10117 free_pagedep(pagedep)
10118 	struct pagedep *pagedep;
10119 {
10120 	int i;
10121 
10122 	if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST))
10123 		return;
10124 	for (i = 0; i < DAHASHSZ; i++)
10125 		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
10126 			return;
10127 	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
10128 		return;
10129 	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
10130 		return;
10131 	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
10132 		return;
10133 	LIST_REMOVE(pagedep, pd_hash);
10134 	WORKITEM_FREE(pagedep, D_PAGEDEP);
10135 }
10136 
10137 /*
10138  * Called from within softdep_disk_write_complete above.
10139  * A write operation was just completed. Removed inodes can
10140  * now be freed and associated block pointers may be committed.
10141  * Note that this routine is always called from interrupt level
10142  * with further splbio interrupts blocked.
10143  */
10144 static int
10145 handle_written_filepage(pagedep, bp)
10146 	struct pagedep *pagedep;
10147 	struct buf *bp;		/* buffer containing the written page */
10148 {
10149 	struct dirrem *dirrem;
10150 	struct diradd *dap, *nextdap;
10151 	struct direct *ep;
10152 	int i, chgs;
10153 
10154 	if ((pagedep->pd_state & IOSTARTED) == 0)
10155 		panic("handle_written_filepage: not started");
10156 	pagedep->pd_state &= ~IOSTARTED;
10157 	/*
10158 	 * Process any directory removals that have been committed.
10159 	 */
10160 	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
10161 		LIST_REMOVE(dirrem, dm_next);
10162 		dirrem->dm_state |= COMPLETE;
10163 		dirrem->dm_dirinum = pagedep->pd_ino;
10164 		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
10165 		    ("handle_written_filepage: Journal entries not written."));
10166 		add_to_worklist(&dirrem->dm_list, 0);
10167 	}
10168 	/*
10169 	 * Free any directory additions that have been committed.
10170 	 * If it is a newly allocated block, we have to wait until
10171 	 * the on-disk directory inode claims the new block.
10172 	 */
10173 	if ((pagedep->pd_state & NEWBLOCK) == 0)
10174 		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
10175 			free_diradd(dap, NULL);
10176 	/*
10177 	 * Uncommitted directory entries must be restored.
10178 	 */
10179 	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
10180 		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
10181 		     dap = nextdap) {
10182 			nextdap = LIST_NEXT(dap, da_pdlist);
10183 			if (dap->da_state & ATTACHED)
10184 				panic("handle_written_filepage: attached");
10185 			ep = (struct direct *)
10186 			    ((char *)bp->b_data + dap->da_offset);
10187 			ep->d_ino = dap->da_newinum;
10188 			dap->da_state &= ~UNDONE;
10189 			dap->da_state |= ATTACHED;
10190 			chgs = 1;
10191 			/*
10192 			 * If the inode referenced by the directory has
10193 			 * been written out, then the dependency can be
10194 			 * moved to the pending list.
10195 			 */
10196 			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
10197 				LIST_REMOVE(dap, da_pdlist);
10198 				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
10199 				    da_pdlist);
10200 			}
10201 		}
10202 	}
10203 	/*
10204 	 * If there were any rollbacks in the directory, then it must be
10205 	 * marked dirty so that it will eventually get written back in
10206 	 * its correct form.
10207 	 */
10208 	if (chgs) {
10209 		if ((bp->b_flags & B_DELWRI) == 0)
10210 			stat_dir_entry++;
10211 		bdirty(bp);
10212 		return (1);
10213 	}
10214 	/*
10215 	 * If we are not waiting for a new directory block to be
10216 	 * claimed by its inode, then the pagedep will be freed.
10217 	 * Otherwise it will remain to track any new entries on
10218 	 * the page in case they are fsync'ed.
10219 	 */
10220 	if ((pagedep->pd_state & NEWBLOCK) == 0 &&
10221 	    LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
10222 		LIST_REMOVE(pagedep, pd_hash);
10223 		WORKITEM_FREE(pagedep, D_PAGEDEP);
10224 	}
10225 	return (0);
10226 }
10227 
10228 /*
10229  * Writing back in-core inode structures.
10230  *
10231  * The filesystem only accesses an inode's contents when it occupies an
10232  * "in-core" inode structure.  These "in-core" structures are separate from
10233  * the page frames used to cache inode blocks.  Only the latter are
10234  * transferred to/from the disk.  So, when the updated contents of the
10235  * "in-core" inode structure are copied to the corresponding in-memory inode
10236  * block, the dependencies are also transferred.  The following procedure is
10237  * called when copying a dirty "in-core" inode to a cached inode block.
10238  */
10239 
10240 /*
10241  * Called when an inode is loaded from disk. If the effective link count
10242  * differed from the actual link count when it was last flushed, then we
10243  * need to ensure that the correct effective link count is put back.
10244  */
10245 void
10246 softdep_load_inodeblock(ip)
10247 	struct inode *ip;	/* the "in_core" copy of the inode */
10248 {
10249 	struct inodedep *inodedep;
10250 
10251 	/*
10252 	 * Check for alternate nlink count.
10253 	 */
10254 	ip->i_effnlink = ip->i_nlink;
10255 	ACQUIRE_LOCK(&lk);
10256 	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
10257 	    &inodedep) == 0) {
10258 		FREE_LOCK(&lk);
10259 		return;
10260 	}
10261 	ip->i_effnlink -= inodedep->id_nlinkdelta;
10262 	FREE_LOCK(&lk);
10263 }
10264 
10265 /*
10266  * This routine is called just before the "in-core" inode
10267  * information is to be copied to the in-memory inode block.
10268  * Recall that an inode block contains several inodes. If
10269  * the force flag is set, then the dependencies will be
10270  * cleared so that the update can always be made. Note that
10271  * the buffer is locked when this routine is called, so we
10272  * will never be in the middle of writing the inode block
10273  * to disk.
10274  */
10275 void
10276 softdep_update_inodeblock(ip, bp, waitfor)
10277 	struct inode *ip;	/* the "in_core" copy of the inode */
10278 	struct buf *bp;		/* the buffer containing the inode block */
10279 	int waitfor;		/* nonzero => update must be allowed */
10280 {
10281 	struct inodedep *inodedep;
10282 	struct inoref *inoref;
10283 	struct worklist *wk;
10284 	struct mount *mp;
10285 	struct buf *ibp;
10286 	struct fs *fs;
10287 	int error;
10288 
10289 	mp = UFSTOVFS(ip->i_ump);
10290 	fs = ip->i_fs;
10291 	/*
10292 	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
10293 	 * does not have access to the in-core ip so must write directly into
10294 	 * the inode block buffer when setting freelink.
10295 	 */
10296 	if (fs->fs_magic == FS_UFS1_MAGIC)
10297 		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
10298 		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
10299 	else
10300 		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
10301 		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
10302 	/*
10303 	 * If the effective link count is not equal to the actual link
10304 	 * count, then we must track the difference in an inodedep while
10305 	 * the inode is (potentially) tossed out of the cache. Otherwise,
10306 	 * if there is no existing inodedep, then there are no dependencies
10307 	 * to track.
10308 	 */
10309 	ACQUIRE_LOCK(&lk);
10310 again:
10311 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
10312 		FREE_LOCK(&lk);
10313 		if (ip->i_effnlink != ip->i_nlink)
10314 			panic("softdep_update_inodeblock: bad link count");
10315 		return;
10316 	}
10317 	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
10318 		panic("softdep_update_inodeblock: bad delta");
10319 	/*
10320 	 * If we're flushing all dependencies we must also wait for anything
10321 	 * still awaiting a journal write before starting the I/O.
10322 	 */
10323 	if (waitfor) {
10324 		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10325 			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10326 			    == DEPCOMPLETE) {
10327 				stat_jwait_inode++;
10328 				jwait(&inoref->if_list);
10329 				goto again;
10330 			}
10331 		}
10332 	}
10333 	/*
10334 	 * Changes have been initiated. Anything depending on these
10335 	 * changes cannot occur until this inode has been written.
10336 	 */
10337 	inodedep->id_state &= ~COMPLETE;
10338 	if ((inodedep->id_state & ONWORKLIST) == 0)
10339 		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
10340 	/*
10341 	 * Any new dependencies associated with the incore inode must
10342 	 * now be moved to the list associated with the buffer holding
10343 	 * the in-memory copy of the inode. Once merged process any
10344 	 * allocdirects that are completed by the merger.
10345 	 */
10346 	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
10347 	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
10348 		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
10349 		    NULL);
10350 	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
10351 	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
10352 		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
10353 		    NULL);
10354 	/*
10355 	 * Now that the inode has been pushed into the buffer, the
10356 	 * operations dependent on the inode being written to disk
10357 	 * can be moved to the id_bufwait so that they will be
10358 	 * processed when the buffer I/O completes.
10359 	 */
10360 	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
10361 		WORKLIST_REMOVE(wk);
10362 		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
10363 	}
10364 	/*
10365 	 * Newly allocated inodes cannot be written until the bitmap
10366 	 * that allocates them has been written (indicated by
10367 	 * DEPCOMPLETE being set in id_state). If we are doing a
10368 	 * forced sync (e.g., an fsync on a file), we force the bitmap
10369 	 * to be written so that the update can be done.
10370 	 */
10371 	if (waitfor == 0) {
10372 		FREE_LOCK(&lk);
10373 		return;
10374 	}
10375 retry:
10376 	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
10377 		FREE_LOCK(&lk);
10378 		return;
10379 	}
10380 	ibp = inodedep->id_bmsafemap->sm_buf;
10381 	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
10382 	if (ibp == NULL) {
10383 		/*
10384 		 * If ibp came back as NULL, the dependency could have been
10385 		 * freed while we slept.  Look it up again, and check to see
10386 		 * that it has completed.
10387 		 */
10388 		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
10389 			goto retry;
10390 		FREE_LOCK(&lk);
10391 		return;
10392 	}
10393 	FREE_LOCK(&lk);
10394 	if ((error = bwrite(ibp)) != 0)
10395 		softdep_error("softdep_update_inodeblock: bwrite", error);
10396 }
10397 
10398 /*
10399  * Merge a new inode dependency list (such as id_newinoupdt) into an
10400  * old inode dependency list (such as id_inoupdt). This routine must be
10401  * called with splbio interrupts blocked.
10402  */
10403 static void
10404 merge_inode_lists(newlisthead, oldlisthead)
10405 	struct allocdirectlst *newlisthead;
10406 	struct allocdirectlst *oldlisthead;
10407 {
10408 	struct allocdirect *listadp, *newadp;
10409 
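	/*
	 * Both lists are kept sorted by ad_offset.  Splice each entry from
	 * the new list in front of the first old entry whose offset is no
	 * smaller than its own; when the offsets match, the two allocdirects
	 * are collapsed by allocdirect_merge().  Anything remaining on the
	 * new list is appended at the tail of the old list.
	 */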
10410 	newadp = TAILQ_FIRST(newlisthead);
10411 	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
10412 		if (listadp->ad_offset < newadp->ad_offset) {
10413 			listadp = TAILQ_NEXT(listadp, ad_next);
10414 			continue;
10415 		}
10416 		TAILQ_REMOVE(newlisthead, newadp, ad_next);
10417 		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
10418 		if (listadp->ad_offset == newadp->ad_offset) {
10419 			allocdirect_merge(oldlisthead, newadp,
10420 			    listadp);
10421 			listadp = newadp;
10422 		}
10423 		newadp = TAILQ_FIRST(newlisthead);
10424 	}
10425 	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
10426 		TAILQ_REMOVE(newlisthead, newadp, ad_next);
10427 		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
10428 	}
10429 }
10430 
10431 /*
10432  * If we are doing an fsync, then we must ensure that any directory
10433  * entries for the inode have been written after the inode gets to disk.
10434  */
10435 int
10436 softdep_fsync(vp)
10437 	struct vnode *vp;	/* the "in_core" copy of the inode */
10438 {
10439 	struct inodedep *inodedep;
10440 	struct pagedep *pagedep;
10441 	struct inoref *inoref;
10442 	struct worklist *wk;
10443 	struct diradd *dap;
10444 	struct mount *mp;
10445 	struct vnode *pvp;
10446 	struct inode *ip;
10447 	struct buf *bp;
10448 	struct fs *fs;
10449 	struct thread *td = curthread;
10450 	int error, flushparent, pagedep_new_block;
10451 	ino_t parentino;
10452 	ufs_lbn_t lbn;
10453 
10454 	ip = VTOI(vp);
10455 	fs = ip->i_fs;
10456 	mp = vp->v_mount;
10457 	ACQUIRE_LOCK(&lk);
10458 restart:
10459 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
10460 		FREE_LOCK(&lk);
10461 		return (0);
10462 	}
10463 	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10464 		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10465 		    == DEPCOMPLETE) {
10466 			stat_jwait_inode++;
10467 			jwait(&inoref->if_list);
10468 			goto restart;
10469 		}
10470 	}
10471 	if (!LIST_EMPTY(&inodedep->id_inowait) ||
10472 	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
10473 	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
10474 	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
10475 	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
10476 		panic("softdep_fsync: pending ops %p", inodedep);
10477 	for (error = 0, flushparent = 0; ; ) {
10478 		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
10479 			break;
10480 		if (wk->wk_type != D_DIRADD)
10481 			panic("softdep_fsync: Unexpected type %s",
10482 			    TYPENAME(wk->wk_type));
10483 		dap = WK_DIRADD(wk);
10484 		/*
10485 		 * Flush our parent if this directory entry has a MKDIR_PARENT
10486 		 * dependency or is contained in a newly allocated block.
10487 		 */
10488 		if (dap->da_state & DIRCHG)
10489 			pagedep = dap->da_previous->dm_pagedep;
10490 		else
10491 			pagedep = dap->da_pagedep;
10492 		parentino = pagedep->pd_ino;
10493 		lbn = pagedep->pd_lbn;
10494 		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
10495 			panic("softdep_fsync: dirty");
10496 		if ((dap->da_state & MKDIR_PARENT) ||
10497 		    (pagedep->pd_state & NEWBLOCK))
10498 			flushparent = 1;
10499 		else
10500 			flushparent = 0;
10501 		/*
10502 		 * If we are being fsync'ed as part of vgone'ing this vnode,
10503 		 * then we will not be able to release and recover the
10504 		 * vnode below, so we just have to give up on writing its
10505 		 * directory entry out. It will eventually be written, just
10506 		 * not now, but then the user was not asking to have it
10507 		 * written, so we are not breaking any promises.
10508 		 */
10509 		if (vp->v_iflag & VI_DOOMED)
10510 			break;
10511 		/*
10512 		 * We prevent deadlock by always fetching inodes from the
10513 		 * root, moving down the directory tree. Thus, when fetching
10514 		 * our parent directory, we first try to get the lock. If
10515 		 * that fails, we must unlock ourselves before requesting
10516 		 * the lock on our parent. See the comment in ufs_lookup
10517 		 * for details on possible races.
10518 		 */
10519 		FREE_LOCK(&lk);
10520 		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
10521 		    FFSV_FORCEINSMQ)) {
10522 			error = vfs_busy(mp, MBF_NOWAIT);
10523 			if (error != 0) {
10524 				vfs_ref(mp);
10525 				VOP_UNLOCK(vp, 0);
10526 				error = vfs_busy(mp, 0);
10527 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
10528 				vfs_rel(mp);
10529 				if (error != 0)
10530 					return (ENOENT);
10531 				if (vp->v_iflag & VI_DOOMED) {
10532 					vfs_unbusy(mp);
10533 					return (ENOENT);
10534 				}
10535 			}
10536 			VOP_UNLOCK(vp, 0);
10537 			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
10538 			    &pvp, FFSV_FORCEINSMQ);
10539 			vfs_unbusy(mp);
10540 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
10541 			if (vp->v_iflag & VI_DOOMED) {
10542 				if (error == 0)
10543 					vput(pvp);
10544 				error = ENOENT;
10545 			}
10546 			if (error != 0)
10547 				return (error);
10548 		}
10549 		/*
10550 		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
10551 		 * that are contained in direct blocks will be resolved by
10552 		 * doing a ffs_update. Pagedeps contained in indirect blocks
10553 		 * may require a complete sync'ing of the directory. So, we
10554 		 * try the cheap and fast ffs_update first, and if that fails,
10555 		 * then we do the slower ffs_syncvnode of the directory.
10556 		 */
10557 		if (flushparent) {
10558 			int locked;
10559 
10560 			if ((error = ffs_update(pvp, 1)) != 0) {
10561 				vput(pvp);
10562 				return (error);
10563 			}
10564 			ACQUIRE_LOCK(&lk);
10565 			locked = 1;
10566 			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
10567 				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
10568 					if (wk->wk_type != D_DIRADD)
10569 						panic("softdep_fsync: Unexpected type %s",
10570 						      TYPENAME(wk->wk_type));
10571 					dap = WK_DIRADD(wk);
10572 					if (dap->da_state & DIRCHG)
10573 						pagedep = dap->da_previous->dm_pagedep;
10574 					else
10575 						pagedep = dap->da_pagedep;
10576 					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
10577 					FREE_LOCK(&lk);
10578 					locked = 0;
10579 					if (pagedep_new_block &&
10580 					    (error = ffs_syncvnode(pvp, MNT_WAIT))) {
10581 						vput(pvp);
10582 						return (error);
10583 					}
10584 				}
10585 			}
10586 			if (locked)
10587 				FREE_LOCK(&lk);
10588 		}
10589 		/*
10590 		 * Flush directory page containing the inode's name.
10591 		 */
10592 		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
10593 		    &bp);
10594 		if (error == 0)
10595 			error = bwrite(bp);
10596 		else
10597 			brelse(bp);
10598 		vput(pvp);
10599 		if (error != 0)
10600 			return (error);
10601 		ACQUIRE_LOCK(&lk);
10602 		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
10603 			break;
10604 	}
10605 	FREE_LOCK(&lk);
10606 	return (0);
10607 }
10608 
10609 /*
10610  * Flush all the dirty bitmaps associated with the block device
10611  * before flushing the rest of the dirty blocks so as to reduce
10612  * the number of dependencies that will have to be rolled back.
10613  */
10614 void
10615 softdep_fsync_mountdev(vp)
10616 	struct vnode *vp;
10617 {
10618 	struct buf *bp, *nbp;
10619 	struct worklist *wk;
10620 	struct bufobj *bo;
10621 
10622 	if (!vn_isdisk(vp, NULL))
10623 		panic("softdep_fsync_mountdev: vnode not a disk");
10624 	bo = &vp->v_bufobj;
10625 restart:
10626 	BO_LOCK(bo);
10627 	ACQUIRE_LOCK(&lk);
10628 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
10629 		/*
10630 		 * If it is already scheduled, skip to the next buffer.
10631 		 */
10632 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
10633 			continue;
10634 
10635 		if ((bp->b_flags & B_DELWRI) == 0)
10636 			panic("softdep_fsync_mountdev: not dirty");
10637 		/*
10638 		 * We are only interested in bitmaps with outstanding
10639 		 * dependencies.
10640 		 */
10641 		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
10642 		    wk->wk_type != D_BMSAFEMAP ||
10643 		    (bp->b_vflags & BV_BKGRDINPROG)) {
10644 			BUF_UNLOCK(bp);
10645 			continue;
10646 		}
10647 		FREE_LOCK(&lk);
10648 		BO_UNLOCK(bo);
10649 		bremfree(bp);
10650 		(void) bawrite(bp);
10651 		goto restart;
10652 	}
10653 	FREE_LOCK(&lk);
10654 	drain_output(vp);
10655 	BO_UNLOCK(bo);
10656 }
10657 
10658 /*
10659  * This routine is called when we are trying to synchronously flush a
10660  * file. This routine must eliminate any filesystem metadata dependencies
10661  * so that the syncing routine can succeed by pushing the dirty blocks
10662  * associated with the file. If any I/O errors occur, they are returned.
10663  */
10664 int
10665 softdep_sync_metadata(struct vnode *vp)
10666 {
10667 	struct pagedep *pagedep;
10668 	struct allocindir *aip;
10669 	struct newblk *newblk;
10670 	struct buf *bp, *nbp;
10671 	struct worklist *wk;
10672 	struct bufobj *bo;
10673 	int i, error, waitfor;
10674 
10675 	if (!DOINGSOFTDEP(vp))
10676 		return (0);
10677 	/*
10678 	 * Ensure that any direct block dependencies have been cleared.
10679 	 */
10680 	ACQUIRE_LOCK(&lk);
10681 	if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
10682 		FREE_LOCK(&lk);
10683 		return (error);
10684 	}
10685 	FREE_LOCK(&lk);
10686 	/*
10687 	 * For most files, the only metadata dependencies are the
10688 	 * cylinder group maps that allocate their inode or blocks.
10689 	 * The block allocation dependencies can be found by traversing
10690 	 * the dependency lists for any buffers that remain on their
10691 	 * dirty buffer list. The inode allocation dependency will
10692 	 * be resolved when the inode is updated with MNT_WAIT.
10693 	 * This work is done in two passes. The first pass grabs most
10694 	 * of the buffers and begins asynchronously writing them. The
10695 	 * only way to wait for these asynchronous writes is to sleep
10696 	 * on the filesystem vnode which may stay busy for a long time
10697 	 * if the filesystem is active. So, instead, we make a second
10698 	 * pass over the dependencies blocking on each write. In the
10699 	 * usual case we will be blocking against a write that we
10700 	 * initiated, so when it is done the dependency will have been
10701 	 * resolved. Thus the second pass is expected to end quickly.
10702 	 */
10703 	waitfor = MNT_NOWAIT;
10704 	bo = &vp->v_bufobj;
10705 
10706 top:
10707 	/*
10708 	 * We must wait for any I/O in progress to finish so that
10709 	 * all potential buffers on the dirty list will be visible.
10710 	 */
10711 	BO_LOCK(bo);
10712 	drain_output(vp);
10713 	while ((bp = TAILQ_FIRST(&bo->bo_dirty.bv_hd)) != NULL) {
10714 		bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT);
10715 		if (bp)
10716 			break;
10717 	}
10718 	BO_UNLOCK(bo);
10719 	if (bp == NULL)
10720 		return (0);
10721 loop:
10722 	/* While syncing snapshots, we must allow recursive lookups */
10723 	BUF_AREC(bp);
10724 	ACQUIRE_LOCK(&lk);
10725 	/*
10726 	 * As we hold the buffer locked, none of its dependencies
10727 	 * will disappear.
10728 	 */
10729 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
10730 		switch (wk->wk_type) {
10731 
10732 		case D_ALLOCDIRECT:
10733 		case D_ALLOCINDIR:
10734 			newblk = WK_NEWBLK(wk);
10735 			if (newblk->nb_jnewblk != NULL) {
10736 				stat_jwait_newblk++;
10737 				jwait(&newblk->nb_jnewblk->jn_list);
10738 				goto restart;
10739 			}
10740 			if (newblk->nb_state & DEPCOMPLETE)
10741 				continue;
10742 			nbp = newblk->nb_bmsafemap->sm_buf;
10743 			nbp = getdirtybuf(nbp, &lk, waitfor);
10744 			if (nbp == NULL)
10745 				continue;
10746 			FREE_LOCK(&lk);
10747 			if (waitfor == MNT_NOWAIT) {
10748 				bawrite(nbp);
10749 			} else if ((error = bwrite(nbp)) != 0) {
10750 				break;
10751 			}
10752 			ACQUIRE_LOCK(&lk);
10753 			continue;
10754 
10755 		case D_INDIRDEP:
10756 		restart:
10757 
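			/*
			 * Each time we sleep or drop lk the list may change
			 * underneath us, so rescan it from the head.
			 */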
10758 			LIST_FOREACH(aip,
10759 			    &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
10760 				newblk = (struct newblk *)aip;
10761 				if (newblk->nb_jnewblk != NULL) {
10762 					stat_jwait_newblk++;
10763 					jwait(&newblk->nb_jnewblk->jn_list);
10764 					goto restart;
10765 				}
10766 				if (newblk->nb_state & DEPCOMPLETE)
10767 					continue;
10768 				nbp = newblk->nb_bmsafemap->sm_buf;
10769 				nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
10770 				if (nbp == NULL)
10771 					goto restart;
10772 				FREE_LOCK(&lk);
10773 				if ((error = bwrite(nbp)) != 0) {
10774 					goto loop_end;
10775 				}
10776 				ACQUIRE_LOCK(&lk);
10777 				goto restart;
10778 			}
10779 			continue;
10780 
10781 		case D_PAGEDEP:
10782 			/*
10783 			 * We are trying to sync a directory that may
10784 			 * have dependencies on both its own metadata
10785 			 * and/or dependencies on the inodes of any
10786 			 * recently allocated files. We walk its diradd
10787 			 * lists pushing out the associated inode.
10788 			 */
10789 			pagedep = WK_PAGEDEP(wk);
10790 			for (i = 0; i < DAHASHSZ; i++) {
10791 				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
10792 					continue;
10793 				if ((error =
10794 				    flush_pagedep_deps(vp, wk->wk_mp,
10795 						&pagedep->pd_diraddhd[i]))) {
10796 					FREE_LOCK(&lk);
10797 					goto loop_end;
10798 				}
10799 			}
10800 			continue;
10801 
10802 		default:
10803 			panic("softdep_sync_metadata: Unknown type %s",
10804 			    TYPENAME(wk->wk_type));
10805 			/* NOTREACHED */
10806 		}
10807 	loop_end:
10808 		/* We only reach here on error, with lk already released. */
10809 		if (error == 0)
10810 			panic("softdep_sync_metadata: zero error");
10811 		BUF_NOREC(bp);
10812 		bawrite(bp);
10813 		return (error);
10814 	}
10815 	FREE_LOCK(&lk);
10816 	BO_LOCK(bo);
10817 	while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
10818 		nbp = getdirtybuf(nbp, BO_MTX(bo), MNT_WAIT);
10819 		if (nbp)
10820 			break;
10821 	}
10822 	BO_UNLOCK(bo);
10823 	BUF_NOREC(bp);
10824 	bawrite(bp);
10825 	if (nbp != NULL) {
10826 		bp = nbp;
10827 		goto loop;
10828 	}
10829 	/*
10830 	 * The brief unlock is to allow any pent up dependency
10831 	 * processing to be done. Then proceed with the second pass.
10832 	 */
10833 	if (waitfor == MNT_NOWAIT) {
10834 		waitfor = MNT_WAIT;
10835 		goto top;
10836 	}
10837 
10838 	/*
10839 	 * If we have managed to get rid of all the dirty buffers,
10840 	 * then we are done. For certain directories and block
10841 	 * devices, we may need to do further work.
10842 	 *
10843 	 * We must wait for any I/O in progress to finish so that
10844 	 * all potential buffers on the dirty list will be visible.
10845 	 */
10846 	BO_LOCK(bo);
10847 	drain_output(vp);
10848 	BO_UNLOCK(bo);
10849 	return (ffs_update(vp, 1));
10851 }
10852 
10853 /*
10854  * Flush the dependencies associated with an inodedep.
10855  * Called with splbio blocked.
10856  */
10857 static int
10858 flush_inodedep_deps(mp, ino)
10859 	struct mount *mp;
10860 	ino_t ino;
10861 {
10862 	struct inodedep *inodedep;
10863 	struct inoref *inoref;
10864 	int error, waitfor;
10865 
10866 	/*
10867 	 * This work is done in two passes. The first pass grabs most
10868 	 * of the buffers and begins asynchronously writing them. The
10869 	 * only way to wait for these asynchronous writes is to sleep
10870 	 * on the filesystem vnode which may stay busy for a long time
10871 	 * if the filesystem is active. So, instead, we make a second
10872 	 * pass over the dependencies blocking on each write. In the
10873 	 * usual case we will be blocking against a write that we
10874 	 * initiated, so when it is done the dependency will have been
10875 	 * resolved. Thus the second pass is expected to end quickly.
10876 	 * We give a brief window at the top of the loop to allow
10877 	 * any pending I/O to complete.
10878 	 */
10879 	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
10880 		if (error)
10881 			return (error);
10882 		FREE_LOCK(&lk);
10883 		ACQUIRE_LOCK(&lk);
10884 restart:
10885 		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
10886 			return (0);
10887 		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10888 			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10889 			    == DEPCOMPLETE) {
10890 				stat_jwait_inode++;
10891 				jwait(&inoref->if_list);
10892 				goto restart;
10893 			}
10894 		}
10895 		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
10896 		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
10897 		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
10898 		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
10899 			continue;
10900 		/*
10901 		 * If this was the second pass, we are done; otherwise start pass 2.
10902 		 */
10903 		if (waitfor == MNT_WAIT)
10904 			break;
10905 		waitfor = MNT_WAIT;
10906 	}
10907 	/*
10908 	 * Try freeing inodedep in case all dependencies have been removed.
10909 	 */
10910 	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
10911 		(void) free_inodedep(inodedep);
10912 	return (0);
10913 }
10914 
10915 /*
10916  * Flush an inode dependency list.
10917  * Called with splbio blocked.
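 * Returns non-zero if the list needed any flushing work (in which case the
 * lock may have been dropped and the caller must re-evaluate its state).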
10918  */
10919 static int
10920 flush_deplist(listhead, waitfor, errorp)
10921 	struct allocdirectlst *listhead;
10922 	int waitfor;
10923 	int *errorp;
10924 {
10925 	struct allocdirect *adp;
10926 	struct newblk *newblk;
10927 	struct buf *bp;
10928 
10929 	mtx_assert(&lk, MA_OWNED);
10930 	TAILQ_FOREACH(adp, listhead, ad_next) {
10931 		newblk = (struct newblk *)adp;
10932 		if (newblk->nb_jnewblk != NULL) {
10933 			stat_jwait_newblk++;
10934 			jwait(&newblk->nb_jnewblk->jn_list);
10935 			return (1);
10936 		}
10937 		if (newblk->nb_state & DEPCOMPLETE)
10938 			continue;
10939 		bp = newblk->nb_bmsafemap->sm_buf;
10940 		bp = getdirtybuf(bp, &lk, waitfor);
10941 		if (bp == NULL) {
10942 			if (waitfor == MNT_NOWAIT)
10943 				continue;
10944 			return (1);
10945 		}
10946 		FREE_LOCK(&lk);
10947 		if (waitfor == MNT_NOWAIT) {
10948 			bawrite(bp);
10949 		} else if ((*errorp = bwrite(bp)) != 0) {
10950 			ACQUIRE_LOCK(&lk);
10951 			return (1);
10952 		}
10953 		ACQUIRE_LOCK(&lk);
10954 		return (1);
10955 	}
10956 	return (0);
10957 }
10958 
10959 /*
10960  * Flush dependencies associated with an allocdirect block.
10961  */
10962 static int
10963 flush_newblk_dep(vp, mp, lbn)
10964 	struct vnode *vp;
10965 	struct mount *mp;
10966 	ufs_lbn_t lbn;
10967 {
10968 	struct newblk *newblk;
10969 	struct bufobj *bo;
10970 	struct inode *ip;
10971 	struct buf *bp;
10972 	ufs2_daddr_t blkno;
10973 	int error;
10974 
10975 	error = 0;
10976 	bo = &vp->v_bufobj;
10977 	ip = VTOI(vp);
10978 	blkno = DIP(ip, i_db[lbn]);
10979 	if (blkno == 0)
10980 		panic("flush_newblk_dep: Missing block");
10981 	ACQUIRE_LOCK(&lk);
10982 	/*
10983 	 * Loop until all dependencies related to this block are satisfied.
10984 	 * We must be careful to restart after each sleep in case a write
10985 	 * completes some part of this process for us.
10986 	 */
10987 	for (;;) {
10988 		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
10989 			FREE_LOCK(&lk);
10990 			break;
10991 		}
10992 		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
10993 			panic("flush_newblk_deps: Bad newblk %p", newblk);
10994 		/*
10995 		 * Flush the journal.
10996 		 */
10997 		if (newblk->nb_jnewblk != NULL) {
10998 			stat_jwait_newblk++;
10999 			jwait(&newblk->nb_jnewblk->jn_list);
11000 			continue;
11001 		}
11002 		/*
11003 		 * Write the bitmap dependency.
11004 		 */
11005 		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
11006 			bp = newblk->nb_bmsafemap->sm_buf;
11007 			bp = getdirtybuf(bp, &lk, MNT_WAIT);
11008 			if (bp == NULL)
11009 				continue;
11010 			FREE_LOCK(&lk);
11011 			error = bwrite(bp);
11012 			if (error)
11013 				break;
11014 			ACQUIRE_LOCK(&lk);
11015 			continue;
11016 		}
11017 		/*
11018 		 * Write the buffer.
11019 		 */
11020 		FREE_LOCK(&lk);
11021 		BO_LOCK(bo);
11022 		bp = gbincore(bo, lbn);
11023 		if (bp != NULL) {
11024 			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
11025 			    LK_INTERLOCK, BO_MTX(bo));
11026 			if (error == ENOLCK) {
11027 				ACQUIRE_LOCK(&lk);
11028 				continue; /* Slept, retry */
11029 			}
11030 			if (error != 0)
11031 				break;	/* Failed */
11032 			if (bp->b_flags & B_DELWRI) {
11033 				bremfree(bp);
11034 				error = bwrite(bp);
11035 				if (error)
11036 					break;
11037 			} else
11038 				BUF_UNLOCK(bp);
11039 		} else
11040 			BO_UNLOCK(bo);
11041 		/*
11042 		 * We have to wait for the direct pointers to
11043 		 * point at the newdirblk before the dependency
11044 		 * will go away.
11045 		 */
11046 		error = ffs_update(vp, MNT_WAIT);
11047 		if (error)
11048 			break;
11049 		ACQUIRE_LOCK(&lk);
11050 	}
11051 	return (error);
11052 }
11053 
11054 /*
11055  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
11056  * Called with splbio blocked.
11057  */
11058 static int
11059 flush_pagedep_deps(pvp, mp, diraddhdp)
11060 	struct vnode *pvp;
11061 	struct mount *mp;
11062 	struct diraddhd *diraddhdp;
11063 {
11064 	struct inodedep *inodedep;
11065 	struct inoref *inoref;
11066 	struct ufsmount *ump;
11067 	struct diradd *dap;
11068 	struct vnode *vp;
11069 	int error = 0;
11070 	struct buf *bp;
11071 	ino_t inum;
11072 
11073 	ump = VFSTOUFS(mp);
11074 restart:
11075 	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
11076 		/*
11077 		 * Flush ourselves if this directory entry
11078 		 * has a MKDIR_PARENT dependency.
11079 		 */
11080 		if (dap->da_state & MKDIR_PARENT) {
11081 			FREE_LOCK(&lk);
11082 			if ((error = ffs_update(pvp, MNT_WAIT)) != 0)
11083 				break;
11084 			ACQUIRE_LOCK(&lk);
11085 			/*
11086 			 * If that cleared dependencies, go on to next.
11087 			 */
11088 			if (dap != LIST_FIRST(diraddhdp))
11089 				continue;
11090 			if (dap->da_state & MKDIR_PARENT)
11091 				panic("flush_pagedep_deps: MKDIR_PARENT");
11092 		}
11093 		/*
11094 		 * A newly allocated directory must have its "." and
11095 		 * ".." entries written out before its name can be
11096 		 * committed in its parent.
11097 		 */
11098 		inum = dap->da_newinum;
11099 		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
11100 			panic("flush_pagedep_deps: lost inode1");
11101 		/*
11102 		 * Wait for any pending journal adds to complete so we don't
11103 		 * cause rollbacks while syncing.
11104 		 */
11105 		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
11106 			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
11107 			    == DEPCOMPLETE) {
11108 				stat_jwait_inode++;
11109 				jwait(&inoref->if_list);
11110 				goto restart;
11111 			}
11112 		}
11113 		if (dap->da_state & MKDIR_BODY) {
11114 			FREE_LOCK(&lk);
11115 			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
11116 			    FFSV_FORCEINSMQ)))
11117 				break;
11118 			error = flush_newblk_dep(vp, mp, 0);
11119 			/*
11120 			 * If we still have the dependency we might need to
11121 			 * update the vnode to sync the new link count to
11122 			 * disk.
11123 			 */
11124 			if (error == 0 && dap == LIST_FIRST(diraddhdp))
11125 				error = ffs_update(vp, MNT_WAIT);
11126 			vput(vp);
11127 			if (error != 0)
11128 				break;
11129 			ACQUIRE_LOCK(&lk);
11130 			/*
11131 			 * If that cleared dependencies, go on to next.
11132 			 */
11133 			if (dap != LIST_FIRST(diraddhdp))
11134 				continue;
11135 			if (dap->da_state & MKDIR_BODY) {
11136 				inodedep_lookup(UFSTOVFS(ump), inum, 0,
11137 				    &inodedep);
11138 				panic("flush_pagedep_deps: MKDIR_BODY "
11139 				    "inodedep %p dap %p vp %p",
11140 				    inodedep, dap, vp);
11141 			}
11142 		}
11143 		/*
11144 		 * Flush the inode on which the directory entry depends.
11145 		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
11146 		 * the only remaining dependency is that the updated inode
11147 		 * count must get pushed to disk. The inode has already
11148 		 * been pushed into its inode buffer (via VOP_UPDATE) at
11149 		 * the time of the reference count change. So we need only
11150 		 * locate that buffer, ensure that there will be no rollback
11151 		 * caused by a bitmap dependency, then write the inode buffer.
11152 		 */
11153 retry:
11154 		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
11155 			panic("flush_pagedep_deps: lost inode");
11156 		/*
11157 		 * If the inode still has bitmap dependencies,
11158 		 * push them to disk.
11159 		 */
11160 		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
11161 			bp = inodedep->id_bmsafemap->sm_buf;
11162 			bp = getdirtybuf(bp, &lk, MNT_WAIT);
11163 			if (bp == NULL)
11164 				goto retry;
11165 			FREE_LOCK(&lk);
11166 			if ((error = bwrite(bp)) != 0)
11167 				break;
11168 			ACQUIRE_LOCK(&lk);
11169 			if (dap != LIST_FIRST(diraddhdp))
11170 				continue;
11171 		}
11172 		/*
11173 		 * If the inode is still sitting in a buffer waiting
11174 		 * to be written or waiting for the link count to be
11175 	 * adjusted, update it here to flush it to disk.
11176 		 */
11177 		if (dap == LIST_FIRST(diraddhdp)) {
11178 			FREE_LOCK(&lk);
11179 			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
11180 			    FFSV_FORCEINSMQ)))
11181 				break;
11182 			error = ffs_update(vp, MNT_WAIT);
11183 			vput(vp);
11184 			if (error)
11185 				break;
11186 			ACQUIRE_LOCK(&lk);
11187 		}
11188 		/*
11189 		 * If we have failed to get rid of all the dependencies
11190 		 * then something is seriously wrong.
11191 		 */
11192 		if (dap == LIST_FIRST(diraddhdp)) {
11193 			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
11194 			panic("flush_pagedep_deps: failed to flush "
11195 			    "inodedep %p ino %d dap %p", inodedep, inum, dap);
11196 		}
11197 	}
11198 	if (error)
11199 		ACQUIRE_LOCK(&lk);
11200 	return (error);
11201 }
11202 
11203 /*
11204  * A large burst of file addition or deletion activity can drive the
11205  * memory load excessively high. First attempt to slow things down
11206  * using the techniques below. If that fails, this routine requests
11207  * the offending operations to fall back to running synchronously
11208  * until the memory load returns to a reasonable level.
11209  */
11210 int
11211 softdep_slowdown(vp)
11212 	struct vnode *vp;
11213 {
11214 	struct ufsmount *ump;
11215 	int jlow;
11216 	int max_softdeps_hard;
11217 
11218 	ACQUIRE_LOCK(&lk);
11219 	jlow = 0;
11220 	/*
11221 	 * Check for journal space if needed.
11222 	 */
11223 	if (DOINGSUJ(vp)) {
11224 		ump = VFSTOUFS(vp->v_mount);
11225 		if (journal_space(ump, 0) == 0)
11226 			jlow = 1;
11227 	}
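	/*
	 * Allow the dependency counts to exceed max_softdeps by 10% (and
	 * dirrems by half of that) before forcing callers to slow down.
	 */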
11228 	max_softdeps_hard = max_softdeps * 11 / 10;
11229 	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
11230 	    dep_current[D_INODEDEP] < max_softdeps_hard &&
11231 	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
11232 	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0) {
11233 		FREE_LOCK(&lk);
11234 		return (0);
11235 	}
11236 	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow)
11237 		softdep_speedup();
11238 	stat_sync_limit_hit += 1;
11239 	FREE_LOCK(&lk);
11240 	return (1);
11241 }
11242 
11243 /*
11244  * Called by the allocation routines when they are about to fail
11245  * in the hope that we can free up the requested resource (inodes
11246  * or disk space).
11247  *
11248  * First check to see if the work list has anything on it. If it has,
11249  * clean up entries until we successfully free the requested resource.
11250  * Because this process holds inodes locked, we cannot handle any remove
11251  * requests that might block on a locked inode as that could lead to
11252  * deadlock. If the worklist yields none of the requested resource,
11253  * start syncing out vnodes to free up the needed space.
11254  */
11255 int
11256 softdep_request_cleanup(fs, vp, cred, resource)
11257 	struct fs *fs;
11258 	struct vnode *vp;
11259 	struct ucred *cred;
11260 	int resource;
11261 {
11262 	struct ufsmount *ump;
11263 	struct mount *mp;
11264 	struct vnode *lvp, *mvp;
11265 	long starttime;
11266 	ufs2_daddr_t needed;
11267 	int error;
11268 
11269 	mp = vp->v_mount;
11270 	ump = VTOI(vp)->i_ump;
11271 	mtx_assert(UFS_MTX(ump), MA_OWNED);
11272 	if (resource == FLUSH_BLOCKS_WAIT)
11273 		stat_cleanup_blkrequests += 1;
11274 	else
11275 		stat_cleanup_inorequests += 1;
11276 	/*
11277 	 * If we are being called because of a process doing a
11278 	 * copy-on-write, then it is not safe to update the vnode
11279 	 * as we may recurse into the copy-on-write routine.
11280 	 */
11281 	if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
11282 		UFS_UNLOCK(ump);
11283 		error = ffs_update(vp, 1);
11284 		UFS_LOCK(ump);
11285 		if (error != 0)
11286 			return (0);
11287 	}
11288 	/*
11289 	 * If we are in need of resources, consider pausing for
11290 	 * tickdelay to give ourselves some breathing room.
11291 	 */
11292 	UFS_UNLOCK(ump);
11293 	ACQUIRE_LOCK(&lk);
11294 	request_cleanup(UFSTOVFS(ump), resource);
11295 	FREE_LOCK(&lk);
11296 	UFS_LOCK(ump);
11297 	/*
11298 	 * Now clean up at least as many resources as we will need.
11299 	 *
11300 	 * When requested to clean up inodes, the number that are needed
11301 	 * is set by the number of simultaneous writers (mnt_writeopcount)
11302 	 * plus a bit of slop (2) in case some more writers show up while
11303 	 * we are cleaning.
11304 	 *
11305 	 * When requested to free up space, the amount of space that
11306 	 * we need is enough blocks to allocate a full-sized segment
11307 	 * (fs_contigsumsize). The number of such segments that will
11308 	 * be needed is set by the number of simultaneous writers
11309 	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
11310 	 * writers show up while we are cleaning.
11311 	 *
11312 	 * Additionally, if we are unprivileged and allocating space,
11313 	 * we need to ensure that we clean up enough blocks to get the
11314 	 * needed number of blocks over the threshold of the minimum
11315 	 * number of blocks required to be kept free by the filesystem
11316 	 * (fs_minfree).
11317 	 */
11318 	if (resource == FLUSH_INODES_WAIT) {
11319 		needed = vp->v_mount->mnt_writeopcount + 2;
11320 	} else if (resource == FLUSH_BLOCKS_WAIT) {
11321 		needed = (vp->v_mount->mnt_writeopcount + 2) *
11322 		    fs->fs_contigsumsize;
11323 		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
11324 			needed += fragstoblks(fs,
11325 			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
11326 			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
11327 	} else {
11328 		printf("softdep_request_cleanup: Unknown resource type %d\n",
11329 		    resource);
11330 		return (0);
11331 	}
11332 	starttime = time_second;
11333 retry:
11334 	while ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
11335 		fs->fs_cstotal.cs_nbfree <= needed) ||
11336 	       (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
11337 		fs->fs_cstotal.cs_nifree <= needed)) {
11338 		UFS_UNLOCK(ump);
11339 		ACQUIRE_LOCK(&lk);
11340 		process_removes(vp);
11341 		if (ump->softdep_on_worklist > 0 &&
11342 		    process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
11343 			stat_worklist_push += 1;
11344 			FREE_LOCK(&lk);
11345 			UFS_LOCK(ump);
11346 			continue;
11347 		}
11348 		FREE_LOCK(&lk);
11349 		UFS_LOCK(ump);
11350 	}
11351 	/*
11352 	 * If we still need resources and there are no more worklist
11353 	 * entries to process to obtain them, we have to start flushing
11354 	 * the dirty vnodes to force the release of additional requests
11355 	 * to the worklist that we can then process to reap additional
11356 	 * resources. We walk the vnodes associated with the mount point
11357 	 * until we get the needed worklist requests that we can reap.
11358 	 */
11359 	if ((resource == FLUSH_BLOCKS_WAIT &&
11360 	     fs->fs_cstotal.cs_nbfree <= needed) ||
11361 	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
11362 	     fs->fs_cstotal.cs_nifree <= needed)) {
11363 		UFS_UNLOCK(ump);
11364 		MNT_ILOCK(mp);
11365 		MNT_VNODE_FOREACH(lvp, mp, mvp) {
11366 			UFS_LOCK(ump);
11367 			if (ump->softdep_on_worklist > 0) {
11368 				UFS_UNLOCK(ump);
11369 				MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
11370 				MNT_IUNLOCK(mp);
11371 				UFS_LOCK(ump);
11372 				stat_cleanup_retries += 1;
11373 				goto retry;
11374 			}
11375 			UFS_UNLOCK(ump);
11376 			VI_LOCK(lvp);
11377 			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0 ||
11378 			    VOP_ISLOCKED(lvp) != 0) {
11379 				VI_UNLOCK(lvp);
11380 				continue;
11381 			}
11382 			MNT_IUNLOCK(mp);
11383 			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) {
11384 				MNT_ILOCK(mp);
11385 				continue;
11386 			}
11387 			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
11388 				vput(lvp);
11389 				MNT_ILOCK(mp);
11390 				continue;
11391 			}
11392 			(void) ffs_syncvnode(lvp, MNT_WAIT);
11393 			vput(lvp);
11394 			MNT_ILOCK(mp);
11395 		}
11396 		MNT_IUNLOCK(mp);
11397 		stat_cleanup_failures += 1;
11398 		UFS_LOCK(ump);
11399 	}
11400 	if (time_second - starttime > stat_cleanup_high_delay)
11401 		stat_cleanup_high_delay = time_second - starttime;
11402 	return (1);
11403 }
11404 
11405 /*
11406  * If memory utilization has gotten too high, deliberately slow things
11407  * down and speed up the I/O processing.
11408  */
11409 extern struct thread *syncertd;
11410 static int
11411 request_cleanup(mp, resource)
11412 	struct mount *mp;
11413 	int resource;
11414 {
11415 	struct thread *td = curthread;
11416 	struct ufsmount *ump;
11417 
11418 	mtx_assert(&lk, MA_OWNED);
11419 	/*
11420 	 * We never hold up the filesystem syncer or buf daemon.
11421 	 */
11422 	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
11423 		return (0);
11424 	ump = VFSTOUFS(mp);
11425 	/*
11426 	 * First check to see if the work list has gotten backlogged.
11427 	 * If it has, co-opt this process to help clean up two entries.
11428 	 * Because this process may hold inodes locked, we cannot
11429 	 * handle any remove requests that might block on a locked
11430 	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
11431 	 * to avoid recursively processing the worklist.
11432 	 */
11433 	if (ump->softdep_on_worklist > max_softdeps / 10) {
11434 		td->td_pflags |= TDP_SOFTDEP;
11435 		process_worklist_item(mp, LK_NOWAIT);
11436 		process_worklist_item(mp, LK_NOWAIT);
11437 		td->td_pflags &= ~TDP_SOFTDEP;
11438 		stat_worklist_push += 2;
11439 		return(1);
11440 	}
11441 	/*
11442 	 * Next, we attempt to speed up the syncer process. If that
11443 	 * is successful, then we allow the process to continue.
11444 	 */
11445 	if (softdep_speedup() &&
11446 	    resource != FLUSH_BLOCKS_WAIT &&
11447 	    resource != FLUSH_INODES_WAIT)
11448 		return(0);
11449 	/*
11450 	 * If we are resource constrained on inode dependencies, try
11451 	 * flushing some dirty inodes. Otherwise, we are constrained
11452 	 * by file deletions, so try accelerating flushes of directories
11453 	 * with removal dependencies. We would like to do the cleanup
11454 	 * here, but we probably hold an inode locked at this point and
11455 	 * that might deadlock against one that we try to clean. So,
11456 	 * the best that we can do is request the syncer daemon to do
11457 	 * the cleanup for us.
11458 	 */
11459 	switch (resource) {
11460 
11461 	case FLUSH_INODES:
11462 	case FLUSH_INODES_WAIT:
11463 		stat_ino_limit_push += 1;
11464 		req_clear_inodedeps += 1;
11465 		stat_countp = &stat_ino_limit_hit;
11466 		break;
11467 
11468 	case FLUSH_BLOCKS:
11469 	case FLUSH_BLOCKS_WAIT:
11470 		stat_blk_limit_push += 1;
11471 		req_clear_remove += 1;
11472 		stat_countp = &stat_blk_limit_hit;
11473 		break;
11474 
11475 	default:
11476 		panic("request_cleanup: unknown type");
11477 	}
11478 	/*
11479 	 * Hopefully the syncer daemon will catch up and awaken us.
11480 	 * We wait at most tickdelay before proceeding in any case.
11481 	 */
11482 	proc_waiting += 1;
11483 	if (callout_pending(&softdep_callout) == FALSE)
11484 		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
11485 		    pause_timer, 0);
11486 
11487 	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
11488 	proc_waiting -= 1;
11489 	return (1);
11490 }
11491 
11492 /*
11493  * Awaken processes pausing in request_cleanup and clear proc_waiting
11494  * to indicate that there is no longer a timer running.
11495  */
11496 static void
11497 pause_timer(arg)
11498 	void *arg;
11499 {
11500 
11501 	/*
11502 	 * The callout_ API has acquired mtx and will hold it around this
11503 	 * function call.
11504 	 */
11505 	*stat_countp += 1;
11506 	wakeup_one(&proc_waiting);
11507 	if (proc_waiting > 0)
11508 		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
11509 		    pause_timer, 0);
11510 }
11511 
11512 /*
11513  * Flush out a directory with at least one removal dependency in an effort to
11514  * reduce the number of dirrem, freefile, and freeblks dependency structures.
11515  */
11516 static void
11517 clear_remove(td)
11518 	struct thread *td;
11519 {
11520 	struct pagedep_hashhead *pagedephd;
11521 	struct pagedep *pagedep;
11522 	static int next = 0;
11523 	struct mount *mp;
11524 	struct vnode *vp;
11525 	struct bufobj *bo;
11526 	int error, cnt;
11527 	ino_t ino;
11528 
11529 	mtx_assert(&lk, MA_OWNED);
11530 
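	/*
	 * The static index rotates through the pagedep hash buckets so that
	 * successive calls spread the flushing work across the table.
	 */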
11531 	for (cnt = 0; cnt < pagedep_hash; cnt++) {
11532 		pagedephd = &pagedep_hashtbl[next++];
11533 		if (next >= pagedep_hash)
11534 			next = 0;
11535 		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
11536 			if (LIST_EMPTY(&pagedep->pd_dirremhd))
11537 				continue;
11538 			mp = pagedep->pd_list.wk_mp;
11539 			ino = pagedep->pd_ino;
11540 			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
11541 				continue;
11542 			FREE_LOCK(&lk);
11543 
11544 			/*
11545 			 * Let unmount clear deps
11546 			 */
11547 			error = vfs_busy(mp, MBF_NOWAIT);
11548 			if (error != 0)
11549 				goto finish_write;
11550 			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
11551 			     FFSV_FORCEINSMQ);
11552 			vfs_unbusy(mp);
11553 			if (error != 0) {
11554 				softdep_error("clear_remove: vget", error);
11555 				goto finish_write;
11556 			}
11557 			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
11558 				softdep_error("clear_remove: fsync", error);
11559 			bo = &vp->v_bufobj;
11560 			BO_LOCK(bo);
11561 			drain_output(vp);
11562 			BO_UNLOCK(bo);
11563 			vput(vp);
11564 		finish_write:
11565 			vn_finished_write(mp);
11566 			ACQUIRE_LOCK(&lk);
11567 			return;
11568 		}
11569 	}
11570 }
11571 
11572 /*
11573  * Clear out a block of dirty inodes in an effort to reduce
11574  * the number of inodedep dependency structures.
11575  */
11576 static void
11577 clear_inodedeps(td)
11578 	struct thread *td;
11579 {
11580 	struct inodedep_hashhead *inodedephd;
11581 	struct inodedep *inodedep;
11582 	static int next = 0;
11583 	struct mount *mp;
11584 	struct vnode *vp;
11585 	struct fs *fs;
11586 	int error, cnt;
11587 	ino_t firstino, lastino, ino;
11588 
11589 	mtx_assert(&lk, MA_OWNED);
11590 	/*
11591 	 * Pick an arbitrary inode dependency to be cleared.
11592 	 * We will then gather up all the inodes in its block
11593 	 * that have dependencies and flush them out.
11594 	 */
11595 	for (cnt = 0; cnt < inodedep_hash; cnt++) {
11596 		inodedephd = &inodedep_hashtbl[next++];
11597 		if (next >= inodedep_hash)
11598 			next = 0;
11599 		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
11600 			break;
11601 	}
11602 	if (inodedep == NULL)
11603 		return;
11604 	fs = inodedep->id_fs;
11605 	mp = inodedep->id_list.wk_mp;
11606 	/*
11607 	 * Find the last inode in the block with dependencies.
11608 	 */
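	/*
	 * INOPB(fs) is a power of two, so this mask rounds id_ino down to
	 * the first inode of its inode block.
	 */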
11609 	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
11610 	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
11611 		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
11612 			break;
11613 	/*
11614 	 * Asynchronously push all but the last inode with dependencies.
11615 	 * Synchronously push the last inode with dependencies to ensure
11616 	 * that the inode block gets written to free up the inodedeps.
11617 	 */
11618 	for (ino = firstino; ino <= lastino; ino++) {
11619 		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
11620 			continue;
11621 		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
11622 			continue;
11623 		FREE_LOCK(&lk);
11624 		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
11625 		if (error != 0) {
11626 			vn_finished_write(mp);
11627 			ACQUIRE_LOCK(&lk);
11628 			return;
11629 		}
11630 		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
11631 		    FFSV_FORCEINSMQ)) != 0) {
11632 			softdep_error("clear_inodedeps: vget", error);
11633 			vfs_unbusy(mp);
11634 			vn_finished_write(mp);
11635 			ACQUIRE_LOCK(&lk);
11636 			return;
11637 		}
11638 		vfs_unbusy(mp);
11639 		if (ino == lastino) {
11640 			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
11641 				softdep_error("clear_inodedeps: fsync1", error);
11642 		} else {
11643 			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
11644 				softdep_error("clear_inodedeps: fsync2", error);
11645 			BO_LOCK(&vp->v_bufobj);
11646 			drain_output(vp);
11647 			BO_UNLOCK(&vp->v_bufobj);
11648 		}
11649 		vput(vp);
11650 		vn_finished_write(mp);
11651 		ACQUIRE_LOCK(&lk);
11652 	}
11653 }
11654 
11655 /*
11656  * Function to determine if the buffer has outstanding dependencies
11657  * that will cause a roll-back if the buffer is written. If wantcount
11658  * is set, return number of dependencies, otherwise just yes or no.
11659  */
11660 static int
11661 softdep_count_dependencies(bp, wantcount)
11662 	struct buf *bp;
11663 	int wantcount;
11664 {
11665 	struct worklist *wk;
11666 	struct bmsafemap *bmsafemap;
11667 	struct inodedep *inodedep;
11668 	struct indirdep *indirdep;
11669 	struct freeblks *freeblks;
11670 	struct allocindir *aip;
11671 	struct pagedep *pagedep;
11672 	struct dirrem *dirrem;
11673 	struct newblk *newblk;
11674 	struct mkdir *mkdir;
11675 	struct diradd *dap;
11676 	int i, retval;
11677 
11678 	retval = 0;
11679 	ACQUIRE_LOCK(&lk);
11680 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
11681 		switch (wk->wk_type) {
11682 
11683 		case D_INODEDEP:
11684 			inodedep = WK_INODEDEP(wk);
11685 			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
11686 				/* bitmap allocation dependency */
11687 				retval += 1;
11688 				if (!wantcount)
11689 					goto out;
11690 			}
11691 			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
11692 				/* direct block pointer dependency */
11693 				retval += 1;
11694 				if (!wantcount)
11695 					goto out;
11696 			}
11697 			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
11698 				/* ext block pointer dependency */
11699 				retval += 1;
11700 				if (!wantcount)
11701 					goto out;
11702 			}
11703 			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
11704 				/* Add reference dependency. */
11705 				retval += 1;
11706 				if (!wantcount)
11707 					goto out;
11708 			}
11709 			continue;
11710 
11711 		case D_INDIRDEP:
11712 			indirdep = WK_INDIRDEP(wk);
11713 
11714 			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
11715 				/* indirect block pointer dependency */
11716 				retval += 1;
11717 				if (!wantcount)
11718 					goto out;
11719 			}
11720 			continue;
11721 
11722 		case D_PAGEDEP:
11723 			pagedep = WK_PAGEDEP(wk);
11724 			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
11725 				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
11726 					/* Journal remove ref dependency. */
11727 					retval += 1;
11728 					if (!wantcount)
11729 						goto out;
11730 				}
11731 			}
11732 			for (i = 0; i < DAHASHSZ; i++) {
11733 
11734 				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
11735 					/* directory entry dependency */
11736 					retval += 1;
11737 					if (!wantcount)
11738 						goto out;
11739 				}
11740 			}
11741 			continue;
11742 
11743 		case D_BMSAFEMAP:
11744 			bmsafemap = WK_BMSAFEMAP(wk);
11745 			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
11746 				/* Add reference dependency. */
11747 				retval += 1;
11748 				if (!wantcount)
11749 					goto out;
11750 			}
11751 			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
11752 				/* Allocate block dependency. */
11753 				retval += 1;
11754 				if (!wantcount)
11755 					goto out;
11756 			}
11757 			continue;
11758 
11759 		case D_FREEBLKS:
11760 			freeblks = WK_FREEBLKS(wk);
11761 			if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) {
11762 				/* Freeblk journal dependency. */
11763 				retval += 1;
11764 				if (!wantcount)
11765 					goto out;
11766 			}
11767 			continue;
11768 
11769 		case D_ALLOCDIRECT:
11770 		case D_ALLOCINDIR:
11771 			newblk = WK_NEWBLK(wk);
11772 			if (newblk->nb_jnewblk) {
11773 				/* Journal allocate dependency. */
11774 				retval += 1;
11775 				if (!wantcount)
11776 					goto out;
11777 			}
11778 			continue;
11779 
11780 		case D_MKDIR:
11781 			mkdir = WK_MKDIR(wk);
11782 			if (mkdir->md_jaddref) {
11783 				/* Journal reference dependency. */
11784 				retval += 1;
11785 				if (!wantcount)
11786 					goto out;
11787 			}
11788 			continue;
11789 
11790 		case D_FREEWORK:
11791 		case D_FREEDEP:
11792 		case D_JSEGDEP:
11793 		case D_JSEG:
11794 		case D_SBDEP:
11795 			/* never a dependency on these blocks */
11796 			continue;
11797 
11798 		default:
11799 			panic("softdep_count_dependencies: Unexpected type %s",
11800 			    TYPENAME(wk->wk_type));
11801 			/* NOTREACHED */
11802 		}
11803 	}
11804 out:
11805 	FREE_LOCK(&lk);
11806 	return (retval);
11807 }
11808 
11809 /*
11810  * Acquire exclusive access to a buffer.
11811  * Must be called with a locked mtx parameter.
11812  * Return acquired buffer or NULL on failure.
11813  */
11814 static struct buf *
11815 getdirtybuf(bp, mtx, waitfor)
11816 	struct buf *bp;
11817 	struct mtx *mtx;
11818 	int waitfor;
11819 {
11820 	int error;
11821 
11822 	mtx_assert(mtx, MA_OWNED);
11823 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
11824 		if (waitfor != MNT_WAIT)
11825 			return (NULL);
11826 		error = BUF_LOCK(bp,
11827 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
11828 		/*
11829 		 * Even if we successfully acquire bp here, we have dropped
11830 		 * mtx, which may violate our guarantee.
11831 		 */
11832 		if (error == 0)
11833 			BUF_UNLOCK(bp);
11834 		else if (error != ENOLCK)
11835 			panic("getdirtybuf: inconsistent lock: %d", error);
11836 		mtx_lock(mtx);
11837 		return (NULL);
11838 	}
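	/*
	 * BV_BKGRDINPROG means a background write of a copy of this buffer
	 * is still in flight, so the buffer cannot be handed out yet.
	 * Either wait for the background write to finish (MNT_WAIT) or
	 * give up and return NULL.
	 */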
11839 	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
11840 		if (mtx == &lk && waitfor == MNT_WAIT) {
11841 			mtx_unlock(mtx);
11842 			BO_LOCK(bp->b_bufobj);
11843 			BUF_UNLOCK(bp);
11844 			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
11845 				bp->b_vflags |= BV_BKGRDWAIT;
11846 				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
11847 				       PRIBIO | PDROP, "getbuf", 0);
11848 			} else
11849 				BO_UNLOCK(bp->b_bufobj);
11850 			mtx_lock(mtx);
11851 			return (NULL);
11852 		}
11853 		BUF_UNLOCK(bp);
11854 		if (waitfor != MNT_WAIT)
11855 			return (NULL);
11856 		/*
11857 		 * The mtx argument must be bp->b_vp's mutex in
11858 		 * this case.
11859 		 */
11860 #ifdef	DEBUG_VFS_LOCKS
11861 		if (bp->b_vp->v_type != VCHR)
11862 			ASSERT_BO_LOCKED(bp->b_bufobj);
11863 #endif
11864 		bp->b_vflags |= BV_BKGRDWAIT;
11865 		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
11866 		return (NULL);
11867 	}
11868 	if ((bp->b_flags & B_DELWRI) == 0) {
11869 		BUF_UNLOCK(bp);
11870 		return (NULL);
11871 	}
11872 	bremfree(bp);
11873 	return (bp);
11874 }
11875 
11876 
11877 /*
11878  * Check if it is safe to suspend the file system now.  On entry,
11879  * the vnode interlock for devvp should be held.  Return 0 with
11880  * the mount interlock held if the file system can be suspended now,
11881  * otherwise return EAGAIN with the mount interlock held.
11882  */
11883 int
11884 softdep_check_suspend(struct mount *mp,
11885 		      struct vnode *devvp,
11886 		      int softdep_deps,
11887 		      int softdep_accdeps,
11888 		      int secondary_writes,
11889 		      int secondary_accwrites)
11890 {
11891 	struct bufobj *bo;
11892 	struct ufsmount *ump;
11893 	int error;
11894 
11895 	ump = VFSTOUFS(mp);
11896 	bo = &devvp->v_bufobj;
11897 	ASSERT_BO_LOCKED(bo);
11898 
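	/*
	 * Blocking on the soft updates lock (lk) while the bufobj lock is
	 * held would risk a lock-order reversal, so try-lock it first.  If
	 * that fails, drop the bufobj lock, wait until lk becomes
	 * available, and retry from the top.  Likewise, if secondary
	 * writes are still in progress, drop both locks, sleep until they
	 * drain, and retry.
	 */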
11899 	for (;;) {
11900 		if (!TRY_ACQUIRE_LOCK(&lk)) {
11901 			BO_UNLOCK(bo);
11902 			ACQUIRE_LOCK(&lk);
11903 			FREE_LOCK(&lk);
11904 			BO_LOCK(bo);
11905 			continue;
11906 		}
11907 		MNT_ILOCK(mp);
11908 		if (mp->mnt_secondary_writes != 0) {
11909 			FREE_LOCK(&lk);
11910 			BO_UNLOCK(bo);
11911 			msleep(&mp->mnt_secondary_writes,
11912 			       MNT_MTX(mp),
11913 			       (PUSER - 1) | PDROP, "secwr", 0);
11914 			BO_LOCK(bo);
11915 			continue;
11916 		}
11917 		break;
11918 	}
11919 
11920 	/*
11921 	 * Reasons for needing more work before suspend:
11922 	 * - Dirty buffers on devvp.
11923 	 * - Softdep activity occurred after start of vnode sync loop.
11924 	 * - Secondary writes occurred after start of vnode sync loop.
11925 	 */
11926 	error = 0;
11927 	if (bo->bo_numoutput > 0 ||
11928 	    bo->bo_dirty.bv_cnt > 0 ||
11929 	    softdep_deps != 0 ||
11930 	    ump->softdep_deps != 0 ||
11931 	    softdep_accdeps != ump->softdep_accdeps ||
11932 	    secondary_writes != 0 ||
11933 	    mp->mnt_secondary_writes != 0 ||
11934 	    secondary_accwrites != mp->mnt_secondary_accwrites)
11935 		error = EAGAIN;
11936 	FREE_LOCK(&lk);
11937 	BO_UNLOCK(bo);
11938 	return (error);
11939 }
11940 
11941 
11942 /*
11943  * Get the number of dependency structures for the file system, both
11944  * the current number and the total number allocated.  These will
11945  * later be used to detect that softdep processing has occurred.
11946  */
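/*
 * Callers sample these counts before starting their vnode sync pass and
 * later pass them to softdep_check_suspend(), which compares them with
 * the then-current counters to detect soft updates activity in between.
 */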
11947 void
11948 softdep_get_depcounts(struct mount *mp,
11949 		      int *softdep_depsp,
11950 		      int *softdep_accdepsp)
11951 {
11952 	struct ufsmount *ump;
11953 
11954 	ump = VFSTOUFS(mp);
11955 	ACQUIRE_LOCK(&lk);
11956 	*softdep_depsp = ump->softdep_deps;
11957 	*softdep_accdepsp = ump->softdep_accdeps;
11958 	FREE_LOCK(&lk);
11959 }
11960 
11961 /*
11962  * Wait for pending output on a vnode to complete.
11963  * Must be called with vnode lock and interlock locked.
11964  *
11965  * XXX: Should just be a call to bufobj_wwait().
11966  */
11967 static void
11968 drain_output(vp)
11969 	struct vnode *vp;
11970 {
11971 	struct bufobj *bo;
11972 
11973 	bo = &vp->v_bufobj;
11974 	ASSERT_VOP_LOCKED(vp, "drain_output");
11975 	ASSERT_BO_LOCKED(bo);
11976 
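	/*
	 * BO_WWAIT tells the write-completion path to wake up sleepers on
	 * &bo->bo_numoutput once the count of in-flight writes reaches
	 * zero; the loop then re-checks and exits.
	 */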
11977 	while (bo->bo_numoutput) {
11978 		bo->bo_flag |= BO_WWAIT;
11979 		msleep((caddr_t)&bo->bo_numoutput,
11980 		    BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
11981 	}
11982 }
11983 
11984 /*
11985  * Called whenever a buffer that is being invalidated or reallocated
11986  * contains dependencies. This should only happen if an I/O error has
11987  * occurred. The routine is called with the buffer locked.
11988  */
11989 static void
11990 softdep_deallocate_dependencies(bp)
11991 	struct buf *bp;
11992 {
11993 
11994 	if ((bp->b_ioflags & BIO_ERROR) == 0)
11995 		panic("softdep_deallocate_dependencies: dangling deps");
11996 	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
11997 	panic("softdep_deallocate_dependencies: unrecovered I/O error");
11998 }
11999 
12000 /*
12001  * Function to handle asynchronous write errors in the filesystem.
12002  */
12003 static void
12004 softdep_error(func, error)
12005 	char *func;
12006 	int error;
12007 {
12008 
12009 	/* XXX should do something better! */
12010 	printf("%s: got error %d while accessing filesystem\n", func, error);
12011 }
12012 
12013 #ifdef DDB
12014 
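/*
 * ddb(4) helpers.  The DB_SHOW_COMMAND() macros below register
 * "show inodedep <addr>", "show inodedeps [<fs addr>]",
 * "show worklist <addr>", "show workhead <addr>" and "show mkdirs",
 * allowing soft updates state to be inspected from the kernel debugger.
 */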
12015 static void
12016 inodedep_print(struct inodedep *inodedep, int verbose)
12017 {
12018 	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
12019 	    " saveino %p\n",
12020 	    inodedep, inodedep->id_fs, inodedep->id_state,
12021 	    (intmax_t)inodedep->id_ino,
12022 	    (intmax_t)fsbtodb(inodedep->id_fs,
12023 	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
12024 	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
12025 	    inodedep->id_savedino1);
12026 
12027 	if (verbose == 0)
12028 		return;
12029 
12030 	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
12031 	    "mkdiradd %p\n",
12032 	    LIST_FIRST(&inodedep->id_pendinghd),
12033 	    LIST_FIRST(&inodedep->id_bufwait),
12034 	    LIST_FIRST(&inodedep->id_inowait),
12035 	    TAILQ_FIRST(&inodedep->id_inoreflst),
12036 	    inodedep->id_mkdiradd);
12037 	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
12038 	    TAILQ_FIRST(&inodedep->id_inoupdt),
12039 	    TAILQ_FIRST(&inodedep->id_newinoupdt),
12040 	    TAILQ_FIRST(&inodedep->id_extupdt),
12041 	    TAILQ_FIRST(&inodedep->id_newextupdt));
12042 }
12043 
12044 DB_SHOW_COMMAND(inodedep, db_show_inodedep)
12045 {
12046 
12047 	if (have_addr == 0) {
12048 		db_printf("Address required\n");
12049 		return;
12050 	}
12051 	inodedep_print((struct inodedep*)addr, 1);
12052 }
12053 
12054 DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
12055 {
12056 	struct inodedep_hashhead *inodedephd;
12057 	struct inodedep *inodedep;
12058 	struct fs *fs;
12059 	int cnt;
12060 
12061 	fs = have_addr ? (struct fs *)addr : NULL;
12062 	for (cnt = 0; cnt < inodedep_hash; cnt++) {
12063 		inodedephd = &inodedep_hashtbl[cnt];
12064 		LIST_FOREACH(inodedep, inodedephd, id_hash) {
12065 			if (fs != NULL && fs != inodedep->id_fs)
12066 				continue;
12067 			inodedep_print(inodedep, 0);
12068 		}
12069 	}
12070 }
12071 
12072 DB_SHOW_COMMAND(worklist, db_show_worklist)
12073 {
12074 	struct worklist *wk;
12075 
12076 	if (have_addr == 0) {
12077 		db_printf("Address required\n");
12078 		return;
12079 	}
12080 	wk = (struct worklist *)addr;
12081 	db_printf("worklist: %p type %s state 0x%X\n",
12082 	    wk, TYPENAME(wk->wk_type), wk->wk_state);
12083 }
12084 
12085 DB_SHOW_COMMAND(workhead, db_show_workhead)
12086 {
12087 	struct workhead *wkhd;
12088 	struct worklist *wk;
12089 	int i;
12090 
12091 	if (have_addr == 0) {
12092 		db_printf("Address required\n");
12093 		return;
12094 	}
12095 	wkhd = (struct workhead *)addr;
12096 	wk = LIST_FIRST(wkhd);
12097 	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
12098 		db_printf("worklist: %p type %s state 0x%X\n",
12099 		    wk, TYPENAME(wk->wk_type), wk->wk_state);
12100 	if (i == 100)
12101 		db_printf("workhead overflow");
12102 	printf("\n");
12103 	db_printf("\n");
12104 
12105 
12106 DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
12107 {
12108 	struct jaddref *jaddref;
12109 	struct diradd *diradd;
12110 	struct mkdir *mkdir;
12111 
12112 	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
12113 		diradd = mkdir->md_diradd;
12114 		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
12115 		    mkdir, mkdir->md_state, diradd, diradd->da_state);
12116 		if ((jaddref = mkdir->md_jaddref) != NULL)
12117 			db_printf(" jaddref %p jaddref state 0x%X",
12118 			    jaddref, jaddref->ja_state);
12119 		db_printf("\n");
12120 	}
12121 }
12122 
12123 #endif /* DDB */
12124 
12125 #endif /* SOFTUPDATES */
12126