xref: /freebsd/sys/ufs/ffs/ffs_alloc.c (revision 87569f75a91f298c52a71823c04d41cf53c88889)
1 /*-
2  * Copyright (c) 2002 Networks Associates Technology, Inc.
3  * All rights reserved.
4  *
5  * This software was developed for the FreeBSD Project by Marshall
6  * Kirk McKusick and Network Associates Laboratories, the Security
7  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9  * research program
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  * Copyright (c) 1982, 1986, 1989, 1993
33  *	The Regents of the University of California.  All rights reserved.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 4. Neither the name of the University nor the names of its contributors
44  *    may be used to endorse or promote products derived from this software
45  *    without specific prior written permission.
46  *
47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57  * SUCH DAMAGE.
58  *
59  *	@(#)ffs_alloc.c	8.18 (Berkeley) 5/26/95
60  */
61 
62 #include <sys/cdefs.h>
63 __FBSDID("$FreeBSD$");
64 
65 #include "opt_quota.h"
66 
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/bio.h>
70 #include <sys/buf.h>
71 #include <sys/conf.h>
72 #include <sys/file.h>
73 #include <sys/filedesc.h>
74 #include <sys/proc.h>
75 #include <sys/vnode.h>
76 #include <sys/mount.h>
77 #include <sys/kernel.h>
78 #include <sys/sysctl.h>
79 #include <sys/syslog.h>
80 
81 #include <ufs/ufs/extattr.h>
82 #include <ufs/ufs/quota.h>
83 #include <ufs/ufs/inode.h>
84 #include <ufs/ufs/ufs_extern.h>
85 #include <ufs/ufs/ufsmount.h>
86 
87 #include <ufs/ffs/fs.h>
88 #include <ufs/ffs/ffs_extern.h>
89 
/*
 * Common signature of the per-cylinder-group allocation strategies
 * (ffs_alloccg, ffs_clusteralloc, ffs_nodealloccg) so that
 * ffs_hashalloc() can take any of them as its callback.
 */
typedef ufs2_daddr_t allocfcn_t(struct inode *ip, int cg, ufs2_daddr_t bpref,
				  int size);

/* Forward declarations for the file-local allocation helpers below. */
static ufs2_daddr_t ffs_alloccg(struct inode *, int, ufs2_daddr_t, int);
static ufs2_daddr_t
	      ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t);
#ifdef DIAGNOSTIC
static int	ffs_checkblk(struct inode *, ufs2_daddr_t, long);
#endif
static ufs2_daddr_t ffs_clusteralloc(struct inode *, int, ufs2_daddr_t, int);
static void	ffs_clusteracct(struct ufsmount *, struct fs *, struct cg *,
		    ufs1_daddr_t, int);
static ino_t	ffs_dirpref(struct inode *);
static ufs2_daddr_t ffs_fragextend(struct inode *, int, ufs2_daddr_t, int, int);
static void	ffs_fserr(struct fs *, ino_t, char *);
static ufs2_daddr_t	ffs_hashalloc
		(struct inode *, int, ufs2_daddr_t, int, allocfcn_t *);
static ufs2_daddr_t ffs_nodealloccg(struct inode *, int, ufs2_daddr_t, int);
static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int);
static int	ffs_reallocblks_ufs1(struct vop_reallocblks_args *);
static int	ffs_reallocblks_ufs2(struct vop_reallocblks_args *);
111 
112 /*
113  * Allocate a block in the filesystem.
114  *
115  * The size of the requested block is given, which must be some
116  * multiple of fs_fsize and <= fs_bsize.
117  * A preference may be optionally specified. If a preference is given
118  * the following hierarchy is used to allocate a block:
119  *   1) allocate the requested block.
120  *   2) allocate a rotationally optimal block in the same cylinder.
121  *   3) allocate a block in the same cylinder group.
 *   4) quadratically rehash into other cylinder groups, until an
 *      available block is located.
 * If no block preference is given the following hierarchy is used
 * to allocate a block:
 *   1) allocate a block in the cylinder group that contains the
 *      inode for the file.
 *   2) quadratically rehash into other cylinder groups, until an
 *      available block is located.
130  */
int
ffs_alloc(ip, lbn, bpref, size, cred, bnp)
	struct inode *ip;	/* inode the block is being allocated for */
	ufs2_daddr_t lbn, bpref;	/* logical block; preferred phys block */
	int size;		/* requested size, fs_fsize..fs_bsize */
	struct ucred *cred;	/* credentials charged for the allocation */
	ufs2_daddr_t *bnp;	/* out: allocated block, 0 on failure */
{
	struct fs *fs;
	struct ufsmount *ump;
	ufs2_daddr_t bno;
	int cg, reclaimed;
	static struct timeval lastfail;
	static int curfail;
#ifdef QUOTA
	int error;
#endif

	/* Caller must hold the per-mount UFS mutex on entry. */
	*bnp = 0;
	fs = ip->i_fs;
	ump = ip->i_ump;
	mtx_assert(UFS_MTX(ump), MA_OWNED);
#ifdef DIAGNOSTIC
	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
		printf("dev = %s, bsize = %ld, size = %d, fs = %s\n",
		    devtoname(ip->i_dev), (long)fs->fs_bsize, size,
		    fs->fs_fsmnt);
		panic("ffs_alloc: bad size");
	}
	if (cred == NOCRED)
		panic("ffs_alloc: missing credential");
#endif /* DIAGNOSTIC */
	reclaimed = 0;
retry:
#ifdef QUOTA
	/*
	 * Charge the quota first; chkdq() may sleep, so drop the UFS
	 * mutex around it.  The charge is undone below if allocation
	 * fails.
	 */
	UFS_UNLOCK(ump);
	error = chkdq(ip, btodb(size), cred, 0);
	if (error)
		return (error);
	UFS_LOCK(ump);
#endif
	/* A full-sized block requires at least one whole free block. */
	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
		goto nospace;
	/* Only the superuser may dip into the minfree reserve. */
	if (suser_cred(cred, SUSER_ALLOWJAIL) &&
	    freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0)
		goto nospace;
	if (bpref >= fs->fs_size)
		bpref = 0;
	/* No (valid) preference: start in the inode's own cylinder group. */
	if (bpref == 0)
		cg = ino_to_cg(fs, ip->i_number);
	else
		cg = dtog(fs, bpref);
	bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg);
	if (bno > 0) {
		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bnp = bno;
		return (0);
	}
#ifdef QUOTA
	UFS_UNLOCK(ump);
	/*
	 * Restore user's disk quota because allocation failed.
	 */
	(void) chkdq(ip, -btodb(size), cred, FORCE);
	UFS_LOCK(ump);
#endif
nospace:
	/*
	 * Before reporting ENOSPC, try once to flush blocks that soft
	 * updates is still holding pending and retry the allocation.
	 */
	if (fs->fs_pendingblocks > 0 && reclaimed == 0) {
		reclaimed = 1;
		softdep_request_cleanup(fs, ITOV(ip));
		goto retry;
	}
	UFS_UNLOCK(ump);
	/* Rate-limit the "filesystem full" console spam. */
	if (ppsratecheck(&lastfail, &curfail, 1)) {
		ffs_fserr(fs, ip->i_number, "filesystem full");
		uprintf("\n%s: write failed, filesystem is full\n",
		    fs->fs_fsmnt);
	}
	return (ENOSPC);
}
212 
213 /*
214  * Reallocate a fragment to a bigger size
215  *
216  * The number and size of the old block is given, and a preference
217  * and new size is also specified. The allocator attempts to extend
218  * the original block. Failing that, the regular block allocator is
219  * invoked to get an appropriate block.
220  */
221 int
222 ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, cred, bpp)
223 	struct inode *ip;
224 	ufs2_daddr_t lbprev;
225 	ufs2_daddr_t bprev;
226 	ufs2_daddr_t bpref;
227 	int osize, nsize;
228 	struct ucred *cred;
229 	struct buf **bpp;
230 {
231 	struct vnode *vp;
232 	struct fs *fs;
233 	struct buf *bp;
234 	struct ufsmount *ump;
235 	int cg, request, error, reclaimed;
236 	ufs2_daddr_t bno;
237 	static struct timeval lastfail;
238 	static int curfail;
239 
240 	*bpp = 0;
241 	vp = ITOV(ip);
242 	fs = ip->i_fs;
243 	bp = NULL;
244 	ump = ip->i_ump;
245 	mtx_assert(UFS_MTX(ump), MA_OWNED);
246 #ifdef DIAGNOSTIC
247 	if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
248 		panic("ffs_realloccg: allocation on suspended filesystem");
249 	if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
250 	    (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
251 		printf(
252 		"dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n",
253 		    devtoname(ip->i_dev), (long)fs->fs_bsize, osize,
254 		    nsize, fs->fs_fsmnt);
255 		panic("ffs_realloccg: bad size");
256 	}
257 	if (cred == NOCRED)
258 		panic("ffs_realloccg: missing credential");
259 #endif /* DIAGNOSTIC */
260 	reclaimed = 0;
261 retry:
262 	if (suser_cred(cred, SUSER_ALLOWJAIL) &&
263 	    freespace(fs, fs->fs_minfree) -  numfrags(fs, nsize - osize) < 0) {
264 		goto nospace;
265 	}
266 	if (bprev == 0) {
267 		printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n",
268 		    devtoname(ip->i_dev), (long)fs->fs_bsize, (intmax_t)bprev,
269 		    fs->fs_fsmnt);
270 		panic("ffs_realloccg: bad bprev");
271 	}
272 	UFS_UNLOCK(ump);
273 	/*
274 	 * Allocate the extra space in the buffer.
275 	 */
276 	error = bread(vp, lbprev, osize, NOCRED, &bp);
277 	if (error) {
278 		brelse(bp);
279 		return (error);
280 	}
281 
282 	if (bp->b_blkno == bp->b_lblkno) {
283 		if (lbprev >= NDADDR)
284 			panic("ffs_realloccg: lbprev out of range");
285 		bp->b_blkno = fsbtodb(fs, bprev);
286 	}
287 
288 #ifdef QUOTA
289 	error = chkdq(ip, btodb(nsize - osize), cred, 0);
290 	if (error) {
291 		brelse(bp);
292 		return (error);
293 	}
294 #endif
295 	/*
296 	 * Check for extension in the existing location.
297 	 */
298 	cg = dtog(fs, bprev);
299 	UFS_LOCK(ump);
300 	bno = ffs_fragextend(ip, cg, bprev, osize, nsize);
301 	if (bno) {
302 		if (bp->b_blkno != fsbtodb(fs, bno))
303 			panic("ffs_realloccg: bad blockno");
304 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(nsize - osize));
305 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
306 		allocbuf(bp, nsize);
307 		bp->b_flags |= B_DONE;
308 		if ((bp->b_flags & (B_MALLOC | B_VMIO)) != B_VMIO)
309 			bzero((char *)bp->b_data + osize, nsize - osize);
310 		else
311 			vfs_bio_clrbuf(bp);
312 		*bpp = bp;
313 		return (0);
314 	}
315 	/*
316 	 * Allocate a new disk location.
317 	 */
318 	if (bpref >= fs->fs_size)
319 		bpref = 0;
320 	switch ((int)fs->fs_optim) {
321 	case FS_OPTSPACE:
322 		/*
323 		 * Allocate an exact sized fragment. Although this makes
324 		 * best use of space, we will waste time relocating it if
325 		 * the file continues to grow. If the fragmentation is
326 		 * less than half of the minimum free reserve, we choose
327 		 * to begin optimizing for time.
328 		 */
329 		request = nsize;
330 		if (fs->fs_minfree <= 5 ||
331 		    fs->fs_cstotal.cs_nffree >
332 		    (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100))
333 			break;
334 		log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n",
335 			fs->fs_fsmnt);
336 		fs->fs_optim = FS_OPTTIME;
337 		break;
338 	case FS_OPTTIME:
339 		/*
340 		 * At this point we have discovered a file that is trying to
341 		 * grow a small fragment to a larger fragment. To save time,
342 		 * we allocate a full sized block, then free the unused portion.
343 		 * If the file continues to grow, the `ffs_fragextend' call
344 		 * above will be able to grow it in place without further
345 		 * copying. If aberrant programs cause disk fragmentation to
346 		 * grow within 2% of the free reserve, we choose to begin
347 		 * optimizing for space.
348 		 */
349 		request = fs->fs_bsize;
350 		if (fs->fs_cstotal.cs_nffree <
351 		    (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100)
352 			break;
353 		log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n",
354 			fs->fs_fsmnt);
355 		fs->fs_optim = FS_OPTSPACE;
356 		break;
357 	default:
358 		printf("dev = %s, optim = %ld, fs = %s\n",
359 		    devtoname(ip->i_dev), (long)fs->fs_optim, fs->fs_fsmnt);
360 		panic("ffs_realloccg: bad optim");
361 		/* NOTREACHED */
362 	}
363 	bno = ffs_hashalloc(ip, cg, bpref, request, ffs_alloccg);
364 	if (bno > 0) {
365 		bp->b_blkno = fsbtodb(fs, bno);
366 		if (!DOINGSOFTDEP(vp))
367 			ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize,
368 			    ip->i_number);
369 		if (nsize < request)
370 			ffs_blkfree(ump, fs, ip->i_devvp,
371 			    bno + numfrags(fs, nsize),
372 			    (long)(request - nsize), ip->i_number);
373 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(nsize - osize));
374 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
375 		allocbuf(bp, nsize);
376 		bp->b_flags |= B_DONE;
377 		if ((bp->b_flags & (B_MALLOC | B_VMIO)) != B_VMIO)
378 			bzero((char *)bp->b_data + osize, nsize - osize);
379 		else
380 			vfs_bio_clrbuf(bp);
381 		*bpp = bp;
382 		return (0);
383 	}
384 #ifdef QUOTA
385 	UFS_UNLOCK(ump);
386 	/*
387 	 * Restore user's disk quota because allocation failed.
388 	 */
389 	(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
390 	UFS_LOCK(ump);
391 #endif
392 nospace:
393 	/*
394 	 * no space available
395 	 */
396 	if (fs->fs_pendingblocks > 0 && reclaimed == 0) {
397 		reclaimed = 1;
398 		softdep_request_cleanup(fs, vp);
399 		UFS_UNLOCK(ump);
400 		if (bp)
401 			brelse(bp);
402 		UFS_LOCK(ump);
403 		goto retry;
404 	}
405 	UFS_UNLOCK(ump);
406 	if (bp)
407 		brelse(bp);
408 	if (ppsratecheck(&lastfail, &curfail, 1)) {
409 		ffs_fserr(fs, ip->i_number, "filesystem full");
410 		uprintf("\n%s: write failed, filesystem is full\n",
411 		    fs->fs_fsmnt);
412 	}
413 	return (ENOSPC);
414 }
415 
416 /*
417  * Reallocate a sequence of blocks into a contiguous sequence of blocks.
418  *
419  * The vnode and an array of buffer pointers for a range of sequential
420  * logical blocks to be made contiguous is given. The allocator attempts
421  * to find a range of sequential blocks starting as close as possible
422  * from the end of the allocation for the logical block immediately
423  * preceding the current range. If successful, the physical block numbers
424  * in the buffer pointers and in the inode are changed to reflect the new
425  * allocation. If unsuccessful, the allocation is left unchanged. The
426  * success in doing the reallocation is returned. Note that the error
427  * return is not reflected back to the user. Rather the previous block
428  * allocation will be used.
429  */
430 
SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem");

/* vfs.ffs.doasyncfree: write updated indirect blocks with bdwrite()
 * (async) rather than bwrite() during reallocation.  See the long
 * comment in ffs_reallocblks_ufs1/2 for the correctness trade-off. */
static int doasyncfree = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, "");

/* vfs.ffs.doreallocblks: master switch for cluster reallocation;
 * when 0, ffs_reallocblks() refuses immediately with ENOSPC. */
static int doreallocblks = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "");

#ifdef DEBUG
/* When set, trace each reallocation; decremented after every trace. */
static volatile int prtrealloc = 0;
#endif
442 
443 int
444 ffs_reallocblks(ap)
445 	struct vop_reallocblks_args /* {
446 		struct vnode *a_vp;
447 		struct cluster_save *a_buflist;
448 	} */ *ap;
449 {
450 
451 	if (doreallocblks == 0)
452 		return (ENOSPC);
453 	if (VTOI(ap->a_vp)->i_ump->um_fstype == UFS1)
454 		return (ffs_reallocblks_ufs1(ap));
455 	return (ffs_reallocblks_ufs2(ap));
456 }
457 
/*
 * UFS1 implementation of ffs_reallocblks(): move the blocks backing
 * the buffers in ap->a_buflist to one newly allocated contiguous run.
 * Returns 0 on success, ENOSPC if the range cannot (or should not)
 * be reallocated; the caller keeps the old allocation on failure.
 */
static int
ffs_reallocblks_ufs1(ap)
	struct vop_reallocblks_args /* {
		struct vnode *a_vp;
		struct cluster_save *a_buflist;
	} */ *ap;
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp;
	struct buf *sbp, *ebp;
	ufs1_daddr_t *bap, *sbap, *ebap = 0;
	struct cluster_save *buflist;
	struct ufsmount *ump;
	ufs_lbn_t start_lbn, end_lbn;
	ufs1_daddr_t soff, newblk, blkno;
	ufs2_daddr_t pref;
	struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp;
	int i, len, start_lvl, end_lvl, ssize;

	vp = ap->a_vp;
	ip = VTOI(vp);
	fs = ip->i_fs;
	ump = ip->i_ump;
	/* Clustering is only tracked when fs_contigsumsize > 0. */
	if (fs->fs_contigsumsize <= 0)
		return (ENOSPC);
	buflist = ap->a_buflist;
	len = buflist->bs_nchildren;
	start_lbn = buflist->bs_children[0]->b_lblkno;
	end_lbn = start_lbn + len - 1;
#ifdef DIAGNOSTIC
	for (i = 0; i < len; i++)
		if (!ffs_checkblk(ip,
		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 1");
	for (i = 1; i < len; i++)
		if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
			panic("ffs_reallocblks: non-logical cluster");
	blkno = buflist->bs_children[0]->b_blkno;
	ssize = fsbtodb(fs, fs->fs_frag);
	for (i = 1; i < len - 1; i++)
		if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
			panic("ffs_reallocblks: non-physical cluster %d", i);
#endif
	/*
	 * If the latest allocation is in a new cylinder group, assume that
	 * the filesystem has decided to move and do not force it back to
	 * the previous cylinder group.
	 */
	if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
	    dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
		return (ENOSPC);
	if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
	    ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
		return (ENOSPC);
	/*
	 * Get the starting offset and block map for the first block.
	 * Level 0 means the pointers live in the inode's direct-block
	 * array; otherwise they are in an indirect block we must read.
	 */
	if (start_lvl == 0) {
		sbap = &ip->i_din1->di_db[0];
		soff = start_lbn;
	} else {
		idp = &start_ap[start_lvl - 1];
		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
			brelse(sbp);
			return (ENOSPC);
		}
		sbap = (ufs1_daddr_t *)sbp->b_data;
		soff = idp->in_off;
	}
	/*
	 * If the block range spans two block maps, get the second map.
	 * ssize is the number of pointers updated through the first map.
	 */
	if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
		ssize = len;
	} else {
#ifdef DIAGNOSTIC
		if (start_ap[start_lvl-1].in_lbn == idp->in_lbn)
			panic("ffs_reallocblk: start == end");
#endif
		ssize = len - (idp->in_off + 1);
		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
			goto fail;
		ebap = (ufs1_daddr_t *)ebp->b_data;
	}
	/*
	 * Find the preferred location for the cluster.
	 */
	UFS_LOCK(ump);
	pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap);
	/*
	 * Search the block map looking for an allocation of the desired size.
	 */
	if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref,
	    len, ffs_clusteralloc)) == 0) {
		UFS_UNLOCK(ump);
		goto fail;
	}
	/*
	 * We have found a new contiguous block.
	 *
	 * First we have to replace the old block pointers with the new
	 * block pointers in the inode and indirect blocks associated
	 * with the file.
	 */
#ifdef DEBUG
	if (prtrealloc)
		printf("realloc: ino %d, lbns %jd-%jd\n\told:", ip->i_number,
		    (intmax_t)start_lbn, (intmax_t)end_lbn);
#endif
	blkno = newblk;
	for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
		if (i == ssize) {
			/*
			 * Crossed into the second map: bias soff so that
			 * soff + i below indexes ebap from zero.
			 */
			bap = ebap;
			soff = -i;
		}
#ifdef DIAGNOSTIC
		if (!ffs_checkblk(ip,
		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 2");
		if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
			panic("ffs_reallocblks: alloc mismatch");
#endif
#ifdef DEBUG
		if (prtrealloc)
			printf(" %d,", *bap);
#endif
		if (DOINGSOFTDEP(vp)) {
			if (sbap == &ip->i_din1->di_db[0] && i < ssize)
				softdep_setup_allocdirect(ip, start_lbn + i,
				    blkno, *bap, fs->fs_bsize, fs->fs_bsize,
				    buflist->bs_children[i]);
			else
				softdep_setup_allocindir_page(ip, start_lbn + i,
				    i < ssize ? sbp : ebp, soff + i, blkno,
				    *bap, buflist->bs_children[i]);
		}
		*bap++ = blkno;
	}
	/*
	 * Next we must write out the modified inode and indirect blocks.
	 * For strict correctness, the writes should be synchronous since
	 * the old block values may have been written to disk. In practice
	 * they are almost never written, but if we are concerned about
	 * strict correctness, the `doasyncfree' flag should be set to zero.
	 *
	 * The test on `doasyncfree' should be changed to test a flag
	 * that shows whether the associated buffers and inodes have
	 * been written. The flag should be set when the cluster is
	 * started and cleared whenever the buffer or inode is flushed.
	 * We can then check below to see if it is set, and do the
	 * synchronous write only when it has been cleared.
	 */
	if (sbap != &ip->i_din1->di_db[0]) {
		if (doasyncfree)
			bdwrite(sbp);
		else
			bwrite(sbp);
	} else {
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		if (!doasyncfree)
			ffs_update(vp, 1);
	}
	if (ssize < len) {
		if (doasyncfree)
			bdwrite(ebp);
		else
			bwrite(ebp);
	}
	/*
	 * Last, free the old blocks and assign the new blocks to the buffers.
	 */
#ifdef DEBUG
	if (prtrealloc)
		printf("\n\tnew:");
#endif
	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
		if (!DOINGSOFTDEP(vp))
			ffs_blkfree(ump, fs, ip->i_devvp,
			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
			    fs->fs_bsize, ip->i_number);
		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
#ifdef DIAGNOSTIC
		if (!ffs_checkblk(ip,
		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 3");
#endif
#ifdef DEBUG
		if (prtrealloc)
			printf(" %d,", blkno);
#endif
	}
#ifdef DEBUG
	if (prtrealloc) {
		prtrealloc--;
		printf("\n");
	}
#endif
	return (0);

fail:
	if (ssize < len)
		brelse(ebp);
	if (sbap != &ip->i_din1->di_db[0])
		brelse(sbp);
	return (ENOSPC);
}
665 
/*
 * UFS2 implementation of ffs_reallocblks(): identical in structure to
 * ffs_reallocblks_ufs1() but operating on 64-bit (ufs2_daddr_t) block
 * pointers and the UFS2 dinode.  Returns 0 on success, ENOSPC on
 * failure; the caller keeps the old allocation on failure.
 */
static int
ffs_reallocblks_ufs2(ap)
	struct vop_reallocblks_args /* {
		struct vnode *a_vp;
		struct cluster_save *a_buflist;
	} */ *ap;
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp;
	struct buf *sbp, *ebp;
	ufs2_daddr_t *bap, *sbap, *ebap = 0;
	struct cluster_save *buflist;
	struct ufsmount *ump;
	ufs_lbn_t start_lbn, end_lbn;
	ufs2_daddr_t soff, newblk, blkno, pref;
	struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp;
	int i, len, start_lvl, end_lvl, ssize;

	vp = ap->a_vp;
	ip = VTOI(vp);
	fs = ip->i_fs;
	ump = ip->i_ump;
	/* Clustering is only tracked when fs_contigsumsize > 0. */
	if (fs->fs_contigsumsize <= 0)
		return (ENOSPC);
	buflist = ap->a_buflist;
	len = buflist->bs_nchildren;
	start_lbn = buflist->bs_children[0]->b_lblkno;
	end_lbn = start_lbn + len - 1;
#ifdef DIAGNOSTIC
	for (i = 0; i < len; i++)
		if (!ffs_checkblk(ip,
		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 1");
	for (i = 1; i < len; i++)
		if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
			panic("ffs_reallocblks: non-logical cluster");
	blkno = buflist->bs_children[0]->b_blkno;
	ssize = fsbtodb(fs, fs->fs_frag);
	for (i = 1; i < len - 1; i++)
		if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
			panic("ffs_reallocblks: non-physical cluster %d", i);
#endif
	/*
	 * If the latest allocation is in a new cylinder group, assume that
	 * the filesystem has decided to move and do not force it back to
	 * the previous cylinder group.
	 */
	if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
	    dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
		return (ENOSPC);
	if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
	    ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
		return (ENOSPC);
	/*
	 * Get the starting offset and block map for the first block.
	 * Level 0 means the pointers live in the inode's direct-block
	 * array; otherwise they are in an indirect block we must read.
	 */
	if (start_lvl == 0) {
		sbap = &ip->i_din2->di_db[0];
		soff = start_lbn;
	} else {
		idp = &start_ap[start_lvl - 1];
		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
			brelse(sbp);
			return (ENOSPC);
		}
		sbap = (ufs2_daddr_t *)sbp->b_data;
		soff = idp->in_off;
	}
	/*
	 * If the block range spans two block maps, get the second map.
	 * ssize is the number of pointers updated through the first map.
	 */
	if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
		ssize = len;
	} else {
#ifdef DIAGNOSTIC
		if (start_ap[start_lvl-1].in_lbn == idp->in_lbn)
			panic("ffs_reallocblk: start == end");
#endif
		ssize = len - (idp->in_off + 1);
		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
			goto fail;
		ebap = (ufs2_daddr_t *)ebp->b_data;
	}
	/*
	 * Find the preferred location for the cluster.
	 */
	UFS_LOCK(ump);
	pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap);
	/*
	 * Search the block map looking for an allocation of the desired size.
	 */
	if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref,
	    len, ffs_clusteralloc)) == 0) {
		UFS_UNLOCK(ump);
		goto fail;
	}
	/*
	 * We have found a new contiguous block.
	 *
	 * First we have to replace the old block pointers with the new
	 * block pointers in the inode and indirect blocks associated
	 * with the file.
	 */
#ifdef DEBUG
	if (prtrealloc)
		printf("realloc: ino %d, lbns %jd-%jd\n\told:", ip->i_number,
		    (intmax_t)start_lbn, (intmax_t)end_lbn);
#endif
	blkno = newblk;
	for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
		if (i == ssize) {
			/*
			 * Crossed into the second map: bias soff so that
			 * soff + i below indexes ebap from zero.
			 */
			bap = ebap;
			soff = -i;
		}
#ifdef DIAGNOSTIC
		if (!ffs_checkblk(ip,
		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 2");
		if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
			panic("ffs_reallocblks: alloc mismatch");
#endif
#ifdef DEBUG
		if (prtrealloc)
			printf(" %jd,", (intmax_t)*bap);
#endif
		if (DOINGSOFTDEP(vp)) {
			if (sbap == &ip->i_din2->di_db[0] && i < ssize)
				softdep_setup_allocdirect(ip, start_lbn + i,
				    blkno, *bap, fs->fs_bsize, fs->fs_bsize,
				    buflist->bs_children[i]);
			else
				softdep_setup_allocindir_page(ip, start_lbn + i,
				    i < ssize ? sbp : ebp, soff + i, blkno,
				    *bap, buflist->bs_children[i]);
		}
		*bap++ = blkno;
	}
	/*
	 * Next we must write out the modified inode and indirect blocks.
	 * For strict correctness, the writes should be synchronous since
	 * the old block values may have been written to disk. In practice
	 * they are almost never written, but if we are concerned about
	 * strict correctness, the `doasyncfree' flag should be set to zero.
	 *
	 * The test on `doasyncfree' should be changed to test a flag
	 * that shows whether the associated buffers and inodes have
	 * been written. The flag should be set when the cluster is
	 * started and cleared whenever the buffer or inode is flushed.
	 * We can then check below to see if it is set, and do the
	 * synchronous write only when it has been cleared.
	 */
	if (sbap != &ip->i_din2->di_db[0]) {
		if (doasyncfree)
			bdwrite(sbp);
		else
			bwrite(sbp);
	} else {
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		if (!doasyncfree)
			ffs_update(vp, 1);
	}
	if (ssize < len) {
		if (doasyncfree)
			bdwrite(ebp);
		else
			bwrite(ebp);
	}
	/*
	 * Last, free the old blocks and assign the new blocks to the buffers.
	 */
#ifdef DEBUG
	if (prtrealloc)
		printf("\n\tnew:");
#endif
	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
		if (!DOINGSOFTDEP(vp))
			ffs_blkfree(ump, fs, ip->i_devvp,
			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
			    fs->fs_bsize, ip->i_number);
		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
#ifdef DIAGNOSTIC
		if (!ffs_checkblk(ip,
		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 3");
#endif
#ifdef DEBUG
		if (prtrealloc)
			printf(" %jd,", (intmax_t)blkno);
#endif
	}
#ifdef DEBUG
	if (prtrealloc) {
		prtrealloc--;
		printf("\n");
	}
#endif
	return (0);

fail:
	if (ssize < len)
		brelse(ebp);
	if (sbap != &ip->i_din2->di_db[0])
		brelse(sbp);
	return (ENOSPC);
}
872 
873 /*
874  * Allocate an inode in the filesystem.
875  *
876  * If allocating a directory, use ffs_dirpref to select the inode.
877  * If allocating in a directory, the following hierarchy is followed:
878  *   1) allocate the preferred inode.
879  *   2) allocate an inode in the same cylinder group.
 *   3) quadratically rehash into other cylinder groups, until an
 *      available inode is located.
 * If no inode preference is given the following hierarchy is used
 * to allocate an inode:
 *   1) allocate an inode in cylinder group 0.
 *   2) quadratically rehash into other cylinder groups, until an
 *      available inode is located.
887  */
int
ffs_valloc(pvp, mode, cred, vpp)
	struct vnode *pvp;	/* parent directory vnode */
	int mode;		/* IFMT mode of the new inode */
	struct ucred *cred;	/* credentials for the allocation */
	struct vnode **vpp;	/* out: vnode of the new inode, NULL on error */
{
	struct inode *pip;
	struct fs *fs;
	struct inode *ip;
	struct timespec ts;
	struct ufsmount *ump;
	ino_t ino, ipref;
	int cg, error;
	static struct timeval lastfail;
	static int curfail;

	*vpp = NULL;
	pip = VTOI(pvp);
	fs = pip->i_fs;
	ump = pip->i_ump;

	UFS_LOCK(ump);
	if (fs->fs_cstotal.cs_nifree == 0)
		goto noinodes;

	/*
	 * Directories get a cylinder group chosen by ffs_dirpref();
	 * other inodes prefer their parent directory's group.
	 */
	if ((mode & IFMT) == IFDIR)
		ipref = ffs_dirpref(pip);
	else
		ipref = pip->i_number;
	if (ipref >= fs->fs_ncg * fs->fs_ipg)
		ipref = 0;
	cg = ino_to_cg(fs, ipref);
	/*
	 * Track number of dirs created one after another
	 * in a same cg without intervening by files.
	 */
	if ((mode & IFMT) == IFDIR) {
		if (fs->fs_contigdirs[cg] < 255)
			fs->fs_contigdirs[cg]++;
	} else {
		if (fs->fs_contigdirs[cg] > 0)
			fs->fs_contigdirs[cg]--;
	}
	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode,
					(allocfcn_t *)ffs_nodealloccg);
	if (ino == 0)
		goto noinodes;
	error = ffs_vget(pvp->v_mount, ino, LK_EXCLUSIVE, vpp);
	if (error) {
		/* Give the inode back if we cannot get a vnode for it. */
		ffs_vfree(pvp, ino, mode);
		return (error);
	}
	ip = VTOI(*vpp);
	if (ip->i_mode) {
		printf("mode = 0%o, inum = %lu, fs = %s\n",
		    ip->i_mode, (u_long)ip->i_number, fs->fs_fsmnt);
		panic("ffs_valloc: dup alloc");
	}
	if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) {  /* XXX */
		printf("free inode %s/%lu had %ld blocks\n",
		    fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks));
		DIP_SET(ip, i_blocks, 0);
	}
	ip->i_flags = 0;
	DIP_SET(ip, i_flags, 0);
	/*
	 * Set up a new generation number for this inode.
	 * The "/ 2 + 1" keeps the generation strictly positive.
	 */
	if (ip->i_gen == 0 || ++ip->i_gen == 0)
		ip->i_gen = arc4random() / 2 + 1;
	DIP_SET(ip, i_gen, ip->i_gen);
	if (fs->fs_magic == FS_UFS2_MAGIC) {
		/* UFS2 records the inode's creation (birth) time. */
		vfs_timestamp(&ts);
		ip->i_din2->di_birthtime = ts.tv_sec;
		ip->i_din2->di_birthnsec = ts.tv_nsec;
	}
	ip->i_flag = 0;
	/* The recycled vnode starts with no type or VM object. */
	vnode_destroy_vobject(*vpp);
	(*vpp)->v_type = VNON;
	if (fs->fs_magic == FS_UFS2_MAGIC)
		(*vpp)->v_op = &ffs_vnodeops2;
	else
		(*vpp)->v_op = &ffs_vnodeops1;
	return (0);
noinodes:
	UFS_UNLOCK(ump);
	/* Rate-limit the "out of inodes" console spam. */
	if (ppsratecheck(&lastfail, &curfail, 1)) {
		ffs_fserr(fs, pip->i_number, "out of inodes");
		uprintf("\n%s: create/symlink failed, no inodes free\n",
		    fs->fs_fsmnt);
	}
	return (ENOSPC);
}
982 
983 /*
984  * Find a cylinder group to place a directory.
985  *
986  * The policy implemented by this algorithm is to allocate a
987  * directory inode in the same cylinder group as its parent
 * directory, but also to reserve space for its files' inodes
989  * and data. Restrict the number of directories which may be
990  * allocated one after another in the same cylinder group
991  * without intervening allocation of files.
992  *
993  * If we allocate a first level directory then force allocation
994  * in another cylinder group.
995  */
static ino_t
ffs_dirpref(pip)
	struct inode *pip;	/* parent directory inode */
{
	struct fs *fs;
	int cg, prefcg, dirsize, cgsize;
	int avgifree, avgbfree, avgndir, curdirsize;
	int minifree, minbfree, maxndir;
	int mincg, minndir;
	int maxcontigdirs;

	/* Caller must hold the UFS mutex; we read the cs summary arrays. */
	mtx_assert(UFS_MTX(pip->i_ump), MA_OWNED);
	fs = pip->i_fs;

	/* Per-cylinder-group averages of free inodes, free blocks, dirs. */
	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
	avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;

	/*
	 * Force allocation in another cg if creating a first level dir.
	 * Starting from a random cg, pick the group with the fewest
	 * directories among those with at least average free space.
	 */
	ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref");
	if (ITOV(pip)->v_vflag & VV_ROOT) {
		prefcg = arc4random() % fs->fs_ncg;
		mincg = prefcg;
		minndir = fs->fs_ipg;
		for (cg = prefcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				mincg = cg;
				minndir = fs->fs_cs(fs, cg).cs_ndir;
			}
		/* Wrap around and scan the groups before the start point. */
		for (cg = 0; cg < prefcg; cg++)
			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				mincg = cg;
				minndir = fs->fs_cs(fs, cg).cs_ndir;
			}
		return ((ino_t)(fs->fs_ipg * mincg));
	}

	/*
	 * Compute the limits used for optimal allocation of a
	 * directory inode.
	 */
	maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
	/* Require at least 75% of the average free inodes/blocks. */
	minifree = avgifree - avgifree / 4;
	if (minifree < 1)
		minifree = 1;
	minbfree = avgbfree - avgbfree / 4;
	if (minbfree < 1)
		minbfree = 1;
	cgsize = fs->fs_fsize * fs->fs_fpg;
	/* Expected size of a directory from the tunable fs averages. */
	dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;
	curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
	if (dirsize < curdirsize)
		dirsize = curdirsize;
	/* Cap consecutive dirs per cg (255 matches fs_contigdirs range). */
	maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255);
	if (fs->fs_avgfpdir > 0)
		maxcontigdirs = min(maxcontigdirs,
				    fs->fs_ipg / fs->fs_avgfpdir);
	if (maxcontigdirs == 0)
		maxcontigdirs = 1;

	/*
	 * Limit number of dirs in one cg and reserve space for
	 * regular files, but only if we have no deficit in
	 * inodes or space.
	 */
	prefcg = ino_to_cg(fs, pip->i_number);
	for (cg = prefcg; cg < fs->fs_ncg; cg++)
		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
	    	    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
			if (fs->fs_contigdirs[cg] < maxcontigdirs)
				return ((ino_t)(fs->fs_ipg * cg));
		}
	for (cg = 0; cg < prefcg; cg++)
		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
	    	    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
			if (fs->fs_contigdirs[cg] < maxcontigdirs)
				return ((ino_t)(fs->fs_ipg * cg));
		}
	/*
	 * This is a backstop when we have deficit in space.
	 * Any group with at least an average number of free inodes will do.
	 */
	for (cg = prefcg; cg < fs->fs_ncg; cg++)
		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
			return ((ino_t)(fs->fs_ipg * cg));
	for (cg = 0; cg < prefcg; cg++)
		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
			break;
	/* If even the backstop found nothing, cg == prefcg here. */
	return ((ino_t)(fs->fs_ipg * cg));
}
1093 
1094 /*
1095  * Select the desired position for the next block in a file.  The file is
1096  * logically divided into sections. The first section is composed of the
1097  * direct blocks. Each additional section contains fs_maxbpg blocks.
1098  *
1099  * If no blocks have been allocated in the first section, the policy is to
1100  * request a block in the same cylinder group as the inode that describes
1101  * the file. If no blocks have been allocated in any other section, the
1102  * policy is to place the section in a cylinder group with a greater than
1103  * average number of free blocks.  An appropriate cylinder group is found
1104  * by using a rotor that sweeps the cylinder groups. When a new group of
1105  * blocks is needed, the sweep begins in the cylinder group following the
1106  * cylinder group from which the previous allocation was made. The sweep
1107  * continues until a cylinder group with greater than the average number
1108  * of free blocks is found. If the allocation is for the first block in an
1109  * indirect block, the information on the previous allocation is unavailable;
1110  * here a best guess is made based upon the logical block number being
1111  * allocated.
1112  *
1113  * If a section is already partially allocated, the policy is to
1114  * contiguously allocate fs_maxcontig blocks. The end of one of these
1115  * contiguous blocks and the beginning of the next is laid out
1116  * contiguously if possible.
1117  */
ufs2_daddr_t
ffs_blkpref_ufs1(ip, lbn, indx, bap)
	struct inode *ip;	/* inode of the file being allocated to */
	ufs_lbn_t lbn;		/* logical block number being allocated */
	int indx;		/* index of lbn within the bap[] array */
	ufs1_daddr_t *bap;	/* array of previously allocated blocks */
{
	struct fs *fs;
	int cg;
	int avgbfree, startcg;

	/* Caller holds the UFS mutex; fs_cgrotor and cs arrays are shared. */
	mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED);
	fs = ip->i_fs;
	/* Start of a new fs_maxbpg section, or no previous block to follow. */
	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
		/* Early in the file: stay in the inode's cylinder group. */
		if (lbn < NDADDR + NINDIR(fs)) {
			cg = ino_to_cg(fs, ip->i_number);
			return (cgbase(fs, cg) + fs->fs_frag);
		}
		/*
		 * Find a cylinder with greater than average number of
		 * unused data blocks.
		 */
		if (indx == 0 || bap[indx - 1] == 0)
			/* No previous block known: guess from lbn. */
			startcg =
			    ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
		else
			/* Sweep starting just past the previous block's cg. */
			startcg = dtog(fs, bap[indx - 1]) + 1;
		startcg %= fs->fs_ncg;
		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		for (cg = startcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				fs->fs_cgrotor = cg;
				return (cgbase(fs, cg) + fs->fs_frag);
			}
		for (cg = 0; cg <= startcg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				fs->fs_cgrotor = cg;
				return (cgbase(fs, cg) + fs->fs_frag);
			}
		/* No group above average; let the caller pick anywhere. */
		return (0);
	}
	/*
	 * We just always try to lay things out contiguously.
	 */
	return (bap[indx - 1] + fs->fs_frag);
}
1164 
1165 /*
1166  * Same as above, but for UFS2
1167  */
ufs2_daddr_t
ffs_blkpref_ufs2(ip, lbn, indx, bap)
	struct inode *ip;	/* inode of the file being allocated to */
	ufs_lbn_t lbn;		/* logical block number being allocated */
	int indx;		/* index of lbn within the bap[] array */
	ufs2_daddr_t *bap;	/* array of previously allocated blocks */
{
	struct fs *fs;
	int cg;
	int avgbfree, startcg;

	/* Caller holds the UFS mutex; fs_cgrotor and cs arrays are shared. */
	mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED);
	fs = ip->i_fs;
	/* Start of a new fs_maxbpg section, or no previous block to follow. */
	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
		/* Early in the file: stay in the inode's cylinder group. */
		if (lbn < NDADDR + NINDIR(fs)) {
			cg = ino_to_cg(fs, ip->i_number);
			return (cgbase(fs, cg) + fs->fs_frag);
		}
		/*
		 * Find a cylinder with greater than average number of
		 * unused data blocks.
		 */
		if (indx == 0 || bap[indx - 1] == 0)
			/* No previous block known: guess from lbn. */
			startcg =
			    ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
		else
			/* Sweep starting just past the previous block's cg. */
			startcg = dtog(fs, bap[indx - 1]) + 1;
		startcg %= fs->fs_ncg;
		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		for (cg = startcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				fs->fs_cgrotor = cg;
				return (cgbase(fs, cg) + fs->fs_frag);
			}
		for (cg = 0; cg <= startcg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				fs->fs_cgrotor = cg;
				return (cgbase(fs, cg) + fs->fs_frag);
			}
		/* No group above average; let the caller pick anywhere. */
		return (0);
	}
	/*
	 * We just always try to lay things out contiguously.
	 */
	return (bap[indx - 1] + fs->fs_frag);
}
1214 
1215 /*
1216  * Implement the cylinder overflow algorithm.
1217  *
1218  * The policy implemented by this algorithm is:
1219  *   1) allocate the block in its requested cylinder group.
 *   2) quadratically rehash on the cylinder group number.
1221  *   3) brute force search for a free block.
1222  *
1223  * Must be called with the UFS lock held.  Will release the lock on success
1224  * and return with it held on failure.
1225  */
1226 /*VARARGS5*/
static ufs2_daddr_t
ffs_hashalloc(ip, cg, pref, size, allocator)
	struct inode *ip;	/* inode the allocation is for */
	int cg;			/* preferred cylinder group */
	ufs2_daddr_t pref;	/* preferred block/inode within the group */
	int size;	/* size for data blocks, mode for inodes */
	allocfcn_t *allocator;	/* per-cg allocator (blocks or inodes) */
{
	struct fs *fs;
	ufs2_daddr_t result;	/* nonzero result means success */
	int i, icg = cg;	/* icg remembers the original starting group */

	mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED);
#ifdef DIAGNOSTIC
	if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
		panic("ffs_hashalloc: allocation on suspended filesystem");
#endif
	fs = ip->i_fs;
	/*
	 * 1: preferred cylinder group
	 * A successful allocator call returns with the UFS lock released;
	 * a zero return leaves the lock held so we can keep searching.
	 */
	result = (*allocator)(ip, cg, pref, size);
	if (result)
		return (result);
	/*
	 * 2: quadratic rehash
	 * Probe groups at offsets 1, 2, 4, 8, ... from the start group.
	 */
	for (i = 1; i < fs->fs_ncg; i *= 2) {
		cg += i;
		if (cg >= fs->fs_ncg)
			cg -= fs->fs_ncg;
		result = (*allocator)(ip, cg, 0, size);
		if (result)
			return (result);
	}
	/*
	 * 3: brute force search
	 * Note that we start at i == 2, since 0 was checked initially,
	 * and 1 is always checked in the quadratic rehash.
	 */
	cg = (icg + 2) % fs->fs_ncg;
	for (i = 2; i < fs->fs_ncg; i++) {
		result = (*allocator)(ip, cg, 0, size);
		if (result)
			return (result);
		cg++;
		if (cg == fs->fs_ncg)
			cg = 0;
	}
	/* Exhausted every cylinder group; UFS lock is still held. */
	return (0);
}
1278 
1279 /*
1280  * Determine whether a fragment can be extended.
1281  *
1282  * Check to see if the necessary fragments are available, and
1283  * if they are, allocate them.
1284  */
static ufs2_daddr_t
ffs_fragextend(ip, cg, bprev, osize, nsize)
	struct inode *ip;	/* inode owning the fragment */
	int cg;			/* cylinder group containing bprev */
	ufs2_daddr_t bprev;	/* existing fragment to be extended */
	int osize, nsize;	/* old and new sizes, in bytes */
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	int nffree;		/* number of frags consumed by the extension */
	long bno;
	int frags, bbase;
	int i, error;
	u_int8_t *blksfree;

	/* Entered with the UFS lock held (ffs_hashalloc contract). */
	ump = ip->i_ump;
	fs = ip->i_fs;
	/* Not enough free frags in this group to cover the growth. */
	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
		return (0);
	frags = numfrags(fs, nsize);
	bbase = fragnum(fs, bprev);
	if (bbase > fragnum(fs, (bprev + frags - 1))) {
		/* cannot extend across a block boundary */
		return (0);
	}
	/* Drop the lock for I/O on the cylinder group block. */
	UFS_UNLOCK(ump);
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, NOCRED, &bp);
	if (error)
		goto fail;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp))
		goto fail;
	bp->b_xflags |= BX_BKGRDWRITE;
	cgp->cg_old_time = cgp->cg_time = time_second;
	bno = dtogd(fs, bprev);
	blksfree = cg_blksfree(cgp);
	/* Every frag between osize and nsize must currently be free. */
	for (i = numfrags(fs, osize); i < frags; i++)
		if (isclr(blksfree, bno + i))
			goto fail;
	/*
	 * the current fragment can be extended
	 * deduct the count on fragment being extended into
	 * increase the count on the remaining fragment (if any)
	 * allocate the extended piece
	 */
	for (i = frags; i < fs->fs_frag - bbase; i++)
		if (isclr(blksfree, bno + i))
			break;
	cgp->cg_frsum[i - numfrags(fs, osize)]--;
	if (i != frags)
		cgp->cg_frsum[i - frags]++;
	/* Mark the new frags in use and count them. */
	for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) {
		clrbit(blksfree, bno + i);
		cgp->cg_cs.cs_nffree--;
		nffree++;
	}
	/* Reacquire the lock to update the filesystem-wide summaries. */
	UFS_LOCK(ump);
	fs->fs_cstotal.cs_nffree -= nffree;
	fs->fs_cs(fs, cg).cs_nffree -= nffree;
	fs->fs_fmod = 1;
	ACTIVECLEAR(fs, cg);
	/* Success path returns with the UFS lock released. */
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev);
	bdwrite(bp);
	return (bprev);

fail:
	/* Failure path: release the buffer and return with the lock held. */
	brelse(bp);
	UFS_LOCK(ump);
	return (0);

}
1361 
1362 /*
1363  * Determine whether a block can be allocated.
1364  *
1365  * Check to see if a block of the appropriate size is available,
1366  * and if it is, allocate it.
1367  */
static ufs2_daddr_t
ffs_alloccg(ip, cg, bpref, size)
	struct inode *ip;	/* inode the allocation is for */
	int cg;			/* cylinder group to allocate in */
	ufs2_daddr_t bpref;	/* preferred block (0 = no preference) */
	int size;		/* bytes requested (full block or frags) */
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	ufs1_daddr_t bno;
	ufs2_daddr_t blkno;
	int i, allocsiz, error, frags;
	u_int8_t *blksfree;

	/* Entered with the UFS lock held (ffs_hashalloc contract). */
	ump = ip->i_ump;
	fs = ip->i_fs;
	/* A full-block request needs at least one free block here. */
	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
		return (0);
	/* Drop the lock for I/O on the cylinder group block. */
	UFS_UNLOCK(ump);
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, NOCRED, &bp);
	if (error)
		goto fail;
	cgp = (struct cg *)bp->b_data;
	/* Re-check against the on-disk copy now that we hold it. */
	if (!cg_chkmagic(cgp) ||
	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
		goto fail;
	bp->b_xflags |= BX_BKGRDWRITE;
	cgp->cg_old_time = cgp->cg_time = time_second;
	/* Full block request: delegate to ffs_alloccgblk(). */
	if (size == fs->fs_bsize) {
		UFS_LOCK(ump);
		blkno = ffs_alloccgblk(ip, bp, bpref);
		ACTIVECLEAR(fs, cg);
		/* Success returns with the UFS lock released. */
		UFS_UNLOCK(ump);
		bdwrite(bp);
		return (blkno);
	}
	/*
	 * check to see if any fragments are already available
	 * allocsiz is the size which will be allocated, hacking
	 * it down to a smaller size if necessary
	 */
	blksfree = cg_blksfree(cgp);
	frags = numfrags(fs, size);
	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
		if (cgp->cg_frsum[allocsiz] != 0)
			break;
	if (allocsiz == fs->fs_frag) {
		/*
		 * no fragments were available, so a block will be
		 * allocated, and hacked up
		 */
		if (cgp->cg_cs.cs_nbfree == 0)
			goto fail;
		UFS_LOCK(ump);
		blkno = ffs_alloccgblk(ip, bp, bpref);
		bno = dtogd(fs, blkno);
		/* Free the unused tail of the block back as fragments. */
		for (i = frags; i < fs->fs_frag; i++)
			setbit(blksfree, bno + i);
		i = fs->fs_frag - frags;
		cgp->cg_cs.cs_nffree += i;
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		fs->fs_fmod = 1;
		cgp->cg_frsum[i]++;
		ACTIVECLEAR(fs, cg);
		UFS_UNLOCK(ump);
		bdwrite(bp);
		return (blkno);
	}
	/* A suitable run of free frags exists; find and claim it. */
	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
	if (bno < 0)
		goto fail;
	for (i = 0; i < frags; i++)
		clrbit(blksfree, bno + i);
	cgp->cg_cs.cs_nffree -= frags;
	cgp->cg_frsum[allocsiz]--;
	/* The leftover of the run becomes a smaller fragment. */
	if (frags != allocsiz)
		cgp->cg_frsum[allocsiz - frags]++;
	UFS_LOCK(ump);
	fs->fs_cstotal.cs_nffree -= frags;
	fs->fs_cs(fs, cg).cs_nffree -= frags;
	fs->fs_fmod = 1;
	blkno = cgbase(fs, cg) + bno;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno);
	bdwrite(bp);
	return (blkno);

fail:
	/* Failure path: release the buffer and return with the lock held. */
	brelse(bp);
	UFS_LOCK(ump);
	return (0);
}
1466 
1467 /*
1468  * Allocate a block in a cylinder group.
1469  *
1470  * This algorithm implements the following policy:
1471  *   1) allocate the requested block.
1472  *   2) allocate a rotationally optimal block in the same cylinder.
1473  *   3) allocate the next available block on the block rotor for the
1474  *      specified cylinder group.
1475  * Note that this routine only allocates fs_bsize blocks; these
1476  * blocks may be fragmented by the routine that allocates them.
1477  */
static ufs2_daddr_t
ffs_alloccgblk(ip, bp, bpref)
	struct inode *ip;	/* inode the allocation is for */
	struct buf *bp;		/* locked buffer holding the cg map */
	ufs2_daddr_t bpref;	/* preferred block (0 = use rotor) */
{
	struct fs *fs;
	struct cg *cgp;
	struct ufsmount *ump;
	ufs1_daddr_t bno;
	ufs2_daddr_t blkno;
	u_int8_t *blksfree;

	fs = ip->i_fs;
	ump = ip->i_ump;
	/* Unlike the other allocators, this runs entirely under the lock. */
	mtx_assert(UFS_MTX(ump), MA_OWNED);
	cgp = (struct cg *)bp->b_data;
	blksfree = cg_blksfree(cgp);
	if (bpref == 0 || dtog(fs, bpref) != cgp->cg_cgx) {
		/* No usable preference: fall back to this group's rotor. */
		bpref = cgp->cg_rotor;
	} else {
		bpref = blknum(fs, bpref);
		bno = dtogd(fs, bpref);
		/*
		 * if the requested block is available, use it
		 */
		if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
			goto gotit;
	}
	/*
	 * Take the next available block in this cylinder group.
	 */
	bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
	if (bno < 0)
		return (0);
	cgp->cg_rotor = bno;
gotit:
	/* Clear the block in the free map and update all the accounting. */
	blkno = fragstoblks(fs, bno);
	ffs_clrblock(fs, blksfree, (long)blkno);
	ffs_clusteracct(ump, fs, cgp, blkno, -1);
	cgp->cg_cs.cs_nbfree--;
	fs->fs_cstotal.cs_nbfree--;
	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
	fs->fs_fmod = 1;
	blkno = cgbase(fs, cgp->cg_cgx) + bno;
	/* XXX Fixme. */
	/* Temporarily drop the lock around the softdep call. */
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno);
	UFS_LOCK(ump);
	return (blkno);
}
1530 
1531 /*
1532  * Determine whether a cluster can be allocated.
1533  *
1534  * We do not currently check for optimal rotational layout if there
1535  * are multiple choices in the same cylinder group. Instead we just
1536  * take the first one that we find following bpref.
1537  */
static ufs2_daddr_t
ffs_clusteralloc(ip, cg, bpref, len)
	struct inode *ip;	/* inode the allocation is for */
	int cg;			/* cylinder group to search */
	ufs2_daddr_t bpref;	/* preferred starting block */
	int len;		/* cluster length, in blocks */
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	int i, run, bit, map, got;
	ufs2_daddr_t bno;
	u_char *mapp;
	int32_t *lp;
	u_int8_t *blksfree;

	/* Entered with the UFS lock held. */
	fs = ip->i_fs;
	ump = ip->i_ump;
	/* Cached per-cg maximum rules out this group cheaply. */
	if (fs->fs_maxcluster[cg] < len)
		return (0);
	UFS_UNLOCK(ump);
	if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize,
	    NOCRED, &bp))
		goto fail_lock;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp))
		goto fail_lock;
	bp->b_xflags |= BX_BKGRDWRITE;
	/*
	 * Check to see if a cluster of the needed size (or bigger) is
	 * available in this cylinder group.
	 */
	lp = &cg_clustersum(cgp)[len];
	for (i = len; i <= fs->fs_contigsumsize; i++)
		if (*lp++ > 0)
			break;
	if (i > fs->fs_contigsumsize) {
		/*
		 * This is the first time looking for a cluster in this
		 * cylinder group. Update the cluster summary information
		 * to reflect the true maximum sized cluster so that
		 * future cluster allocation requests can avoid reading
		 * the cylinder group map only to find no clusters.
		 */
		lp = &cg_clustersum(cgp)[len - 1];
		for (i = len - 1; i > 0; i--)
			if (*lp-- > 0)
				break;
		UFS_LOCK(ump);
		fs->fs_maxcluster[cg] = i;
		goto fail;
	}
	/*
	 * Search the cluster map to find a big enough cluster.
	 * We take the first one that we find, even if it is larger
	 * than we need as we prefer to get one close to the previous
	 * block allocation. We do not search before the current
	 * preference point as we do not want to allocate a block
	 * that is allocated before the previous one (as we will
	 * then have to wait for another pass of the elevator
	 * algorithm before it will be read). We prefer to fail and
	 * be recalled to try an allocation in the next cylinder group.
	 */
	if (dtog(fs, bpref) != cg)
		bpref = 0;
	else
		bpref = fragstoblks(fs, dtogd(fs, blknum(fs, bpref)));
	/* Walk the per-block cluster bitmap counting the current run. */
	mapp = &cg_clustersfree(cgp)[bpref / NBBY];
	map = *mapp++;
	bit = 1 << (bpref % NBBY);
	for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) {
		if ((map & bit) == 0) {
			run = 0;
		} else {
			run++;
			if (run == len)
				break;
		}
		/* Advance the bit cursor, refilling map at byte boundaries. */
		if ((got & (NBBY - 1)) != (NBBY - 1)) {
			bit <<= 1;
		} else {
			map = *mapp++;
			bit = 1;
		}
	}
	if (got >= cgp->cg_nclusterblks)
		goto fail_lock;
	/*
	 * Allocate the cluster that we have found.
	 */
	blksfree = cg_blksfree(cgp);
	/* Cross-check the cluster map against the block free map. */
	for (i = 1; i <= len; i++)
		if (!ffs_isblock(fs, blksfree, got - run + i))
			panic("ffs_clusteralloc: map mismatch");
	bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1);
	if (dtog(fs, bno) != cg)
		panic("ffs_clusteralloc: allocated out of group");
	len = blkstofrags(fs, len);
	UFS_LOCK(ump);
	/* Claim each block of the cluster through ffs_alloccgblk(). */
	for (i = 0; i < len; i += fs->fs_frag)
		if (ffs_alloccgblk(ip, bp, bno + i) != bno + i)
			panic("ffs_clusteralloc: lost block");
	ACTIVECLEAR(fs, cg);
	/* Success path returns with the UFS lock released. */
	UFS_UNLOCK(ump);
	bdwrite(bp);
	return (bno);

fail_lock:
	/* Reacquire the lock, then fall through to release the buffer. */
	UFS_LOCK(ump);
fail:
	brelse(bp);
	return (0);
}
1652 
1653 /*
1654  * Determine whether an inode can be allocated.
1655  *
1656  * Check to see if an inode is available, and if it is,
1657  * allocate it using the following policy:
1658  *   1) allocate the requested inode.
1659  *   2) allocate the next available inode after the requested
1660  *      inode in the specified cylinder group.
1661  */
static ufs2_daddr_t
ffs_nodealloccg(ip, cg, ipref, mode)
	struct inode *ip;	/* inode of the allocating directory */
	int cg;			/* cylinder group to allocate in */
	ufs2_daddr_t ipref;	/* preferred inode (0 = no preference) */
	int mode;		/* mode of the new inode (for IFDIR check) */
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp, *ibp;
	struct ufsmount *ump;
	u_int8_t *inosused;
	struct ufs2_dinode *dp2;
	int error, start, len, loc, map, i;

	/* Entered with the UFS lock held (ffs_hashalloc contract). */
	fs = ip->i_fs;
	ump = ip->i_ump;
	if (fs->fs_cs(fs, cg).cs_nifree == 0)
		return (0);
	/* Drop the lock for I/O on the cylinder group block. */
	UFS_UNLOCK(ump);
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, NOCRED, &bp);
	if (error) {
		brelse(bp);
		UFS_LOCK(ump);
		return (0);
	}
	cgp = (struct cg *)bp->b_data;
	/* Re-check against the on-disk copy now that we hold it. */
	if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) {
		brelse(bp);
		UFS_LOCK(ump);
		return (0);
	}
	bp->b_xflags |= BX_BKGRDWRITE;
	cgp->cg_old_time = cgp->cg_time = time_second;
	inosused = cg_inosused(cgp);
	/* Use the preferred inode if it is free. */
	if (ipref) {
		ipref %= fs->fs_ipg;
		if (isclr(inosused, ipref))
			goto gotit;
	}
	/*
	 * Scan the used-inode bitmap from the rotor for a byte with a
	 * clear bit; skpc() skips over bytes that are all ones (0xff).
	 */
	start = cgp->cg_irotor / NBBY;
	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
	loc = skpc(0xff, len, &inosused[start]);
	if (loc == 0) {
		/* Wrap around and scan the bytes before the rotor. */
		len = start + 1;
		start = 0;
		loc = skpc(0xff, len, &inosused[0]);
		if (loc == 0) {
			printf("cg = %d, irotor = %ld, fs = %s\n",
			    cg, (long)cgp->cg_irotor, fs->fs_fsmnt);
			panic("ffs_nodealloccg: map corrupted");
			/* NOTREACHED */
		}
	}
	/* Find the first clear bit within the located byte. */
	i = start + len - loc;
	map = inosused[i];
	ipref = i * NBBY;
	for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
		if ((map & i) == 0) {
			cgp->cg_irotor = ipref;
			goto gotit;
		}
	}
	printf("fs = %s\n", fs->fs_fsmnt);
	panic("ffs_nodealloccg: block not in map");
	/* NOTREACHED */
gotit:
	/*
	 * Check to see if we need to initialize more inodes.
	 * UFS2 initializes on-disk inode blocks lazily; extend the
	 * initialized region when the chosen inode is near its end.
	 */
	ibp = NULL;
	if (fs->fs_magic == FS_UFS2_MAGIC &&
	    ipref + INOPB(fs) > cgp->cg_initediblk &&
	    cgp->cg_initediblk < cgp->cg_niblk) {
		ibp = getblk(ip->i_devvp, fsbtodb(fs,
		    ino_to_fsba(fs, cg * fs->fs_ipg + cgp->cg_initediblk)),
		    (int)fs->fs_bsize, 0, 0, 0);
		bzero(ibp->b_data, (int)fs->fs_bsize);
		dp2 = (struct ufs2_dinode *)(ibp->b_data);
		/* Seed each new inode with a nonzero generation number. */
		for (i = 0; i < INOPB(fs); i++) {
			dp2->di_gen = arc4random() / 2 + 1;
			dp2++;
		}
		cgp->cg_initediblk += INOPB(fs);
	}
	/* Reacquire the lock and commit the allocation accounting. */
	UFS_LOCK(ump);
	ACTIVECLEAR(fs, cg);
	setbit(inosused, ipref);
	cgp->cg_cs.cs_nifree--;
	fs->fs_cstotal.cs_nifree--;
	fs->fs_cs(fs, cg).cs_nifree--;
	fs->fs_fmod = 1;
	if ((mode & IFMT) == IFDIR) {
		cgp->cg_cs.cs_ndir++;
		fs->fs_cstotal.cs_ndir++;
		fs->fs_cs(fs, cg).cs_ndir++;
	}
	/* Success path returns with the UFS lock released. */
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref);
	bdwrite(bp);
	if (ibp != NULL)
		bawrite(ibp);
	return (cg * fs->fs_ipg + ipref);
}
1768 
1769 /*
1770  * check if a block is free
1771  */
1772 static int
1773 ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h)
1774 {
1775 
1776 	switch ((int)fs->fs_frag) {
1777 	case 8:
1778 		return (cp[h] == 0);
1779 	case 4:
1780 		return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
1781 	case 2:
1782 		return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
1783 	case 1:
1784 		return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
1785 	default:
1786 		panic("ffs_isfreeblock");
1787 	}
1788 	return (0);
1789 }
1790 
1791 /*
1792  * Free a block or fragment.
1793  *
1794  * The specified block or fragment is placed back in the
1795  * free map. If a fragment is deallocated, a possible
1796  * block reassembly is checked.
1797  */
void
ffs_blkfree(ump, fs, devvp, bno, size, inum)
	struct ufsmount *ump;	/* mount point (for the UFS lock) */
	struct fs *fs;		/* filesystem the block belongs to */
	struct vnode *devvp;	/* device vnode, or a snapshot vnode */
	ufs2_daddr_t bno;	/* block or fragment being freed */
	long size;		/* size of the freed region, in bytes */
	ino_t inum;		/* inode number the block belonged to */
{
	struct cg *cgp;
	struct buf *bp;
	ufs1_daddr_t fragno, cgbno;
	ufs2_daddr_t cgblkno;
	int i, cg, blk, frags, bbase;
	u_int8_t *blksfree;
	struct cdev *dev;

	cg = dtog(fs, bno);
	if (devvp->v_type != VCHR) {
		/* devvp is a snapshot */
		dev = VTOI(devvp)->i_devvp->v_rdev;
		cgbno = fragstoblks(fs, cgtod(fs, cg));
	} else {
		/* devvp is a normal disk device */
		dev = devvp->v_rdev;
		cgblkno = fsbtodb(fs, cgtod(fs, cg));
		ASSERT_VOP_LOCKED(devvp, "ffs_blkfree");
		/* Let an active snapshot claim the block instead of freeing. */
		if ((devvp->v_vflag & VV_COPYONWRITE) &&
		    ffs_snapblkfree(fs, devvp, bno, size, inum))
			return;
	}
#ifdef DIAGNOSTIC
	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
	    fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
		printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n",
		    devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize,
		    size, fs->fs_fsmnt);
		panic("ffs_blkfree: bad size");
	}
#endif
	/* An out-of-range block is reported, not freed. */
	if ((u_int)bno >= fs->fs_size) {
		printf("bad block %jd, ino %lu\n", (intmax_t)bno,
		    (u_long)inum);
		ffs_fserr(fs, inum, "bad block");
		return;
	}
	if (bread(devvp, cgblkno, (int)fs->fs_cgsize, NOCRED, &bp)) {
		brelse(bp);
		return;
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return;
	}
	bp->b_xflags |= BX_BKGRDWRITE;
	cgp->cg_old_time = cgp->cg_time = time_second;
	cgbno = dtogd(fs, bno);
	blksfree = cg_blksfree(cgp);
	UFS_LOCK(ump);
	if (size == fs->fs_bsize) {
		/* Freeing a whole block. */
		fragno = fragstoblks(fs, cgbno);
		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
			if (devvp->v_type != VCHR) {
				UFS_UNLOCK(ump);
				/* devvp is a snapshot */
				brelse(bp);
				return;
			}
			printf("dev = %s, block = %jd, fs = %s\n",
			    devtoname(dev), (intmax_t)bno, fs->fs_fsmnt);
			panic("ffs_blkfree: freeing free block");
		}
		ffs_setblock(fs, blksfree, fragno);
		ffs_clusteracct(ump, fs, cgp, fragno, 1);
		cgp->cg_cs.cs_nbfree++;
		fs->fs_cstotal.cs_nbfree++;
		fs->fs_cs(fs, cg).cs_nbfree++;
	} else {
		/* Freeing fragments within a block. */
		bbase = cgbno - fragnum(fs, cgbno);
		/*
		 * decrement the counts associated with the old frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
		/*
		 * deallocate the fragment
		 */
		frags = numfrags(fs, size);
		for (i = 0; i < frags; i++) {
			if (isset(blksfree, cgbno + i)) {
				printf("dev = %s, block = %jd, fs = %s\n",
				    devtoname(dev), (intmax_t)(bno + i),
				    fs->fs_fsmnt);
				panic("ffs_blkfree: freeing free frag");
			}
			setbit(blksfree, cgbno + i);
		}
		cgp->cg_cs.cs_nffree += i;
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		/*
		 * add back in counts associated with the new frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
		/*
		 * if a complete block has been reassembled, account for it
		 */
		fragno = fragstoblks(fs, bbase);
		if (ffs_isblock(fs, blksfree, fragno)) {
			/* Move the frag counts over to the block counts. */
			cgp->cg_cs.cs_nffree -= fs->fs_frag;
			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
			ffs_clusteracct(ump, fs, cgp, fragno, 1);
			cgp->cg_cs.cs_nbfree++;
			fs->fs_cstotal.cs_nbfree++;
			fs->fs_cs(fs, cg).cs_nbfree++;
		}
	}
	fs->fs_fmod = 1;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	bdwrite(bp);
}
1923 
1924 #ifdef DIAGNOSTIC
1925 /*
1926  * Verify allocation of a block or fragment. Returns true if block or
1927  * fragment is allocated, false if it is free.
1928  */
static int
ffs_checkblk(ip, bno, size)
	struct inode *ip;	/* inode used to reach the device/fs */
	ufs2_daddr_t bno;	/* block or fragment to verify */
	long size;		/* size of the region, in bytes */
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	ufs1_daddr_t cgbno;
	int i, error, frags, free;

	/* DIAGNOSTIC-only consistency check; panics on any anomaly. */
	fs = ip->i_fs;
	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
		printf("bsize = %ld, size = %ld, fs = %s\n",
		    (long)fs->fs_bsize, size, fs->fs_fsmnt);
		panic("ffs_checkblk: bad size");
	}
	if ((u_int)bno >= fs->fs_size)
		panic("ffs_checkblk: bad block %jd", (intmax_t)bno);
	/* Read the cylinder group map covering bno. */
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, dtog(fs, bno))),
		(int)fs->fs_cgsize, NOCRED, &bp);
	if (error)
		panic("ffs_checkblk: cg bread failed");
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp))
		panic("ffs_checkblk: cg magic mismatch");
	bp->b_xflags |= BX_BKGRDWRITE;
	blksfree = cg_blksfree(cgp);
	cgbno = dtogd(fs, bno);
	if (size == fs->fs_bsize) {
		/* Whole block: free iff all its frag bits are set. */
		free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno));
	} else {
		/* Fragments must be uniformly free or uniformly allocated. */
		frags = numfrags(fs, size);
		for (free = 0, i = 0; i < frags; i++)
			if (isset(blksfree, cgbno + i))
				free++;
		if (free != 0 && free != frags)
			panic("ffs_checkblk: partially free fragment");
	}
	brelse(bp);
	/* Nonzero return means "allocated". */
	return (!free);
}
1973 #endif /* DIAGNOSTIC */
1974 
1975 /*
1976  * Free an inode.
1977  */
1978 int
1979 ffs_vfree(pvp, ino, mode)
1980 	struct vnode *pvp;
1981 	ino_t ino;
1982 	int mode;
1983 {
1984 	struct inode *ip;
1985 
1986 	if (DOINGSOFTDEP(pvp)) {
1987 		softdep_freefile(pvp, ino, mode);
1988 		return (0);
1989 	}
1990 	ip = VTOI(pvp);
1991 	return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode));
1992 }
1993 
1994 /*
1995  * Do the actual free operation.
1996  * The specified inode is placed back in the free map.
1997  */
int
ffs_freefile(ump, fs, devvp, ino, mode)
	struct ufsmount *ump;
	struct fs *fs;
	struct vnode *devvp;
	ino_t ino;
	int mode;
{
	struct cg *cgp;
	struct buf *bp;
	ufs2_daddr_t cgbno;
	int error, cg;
	u_int8_t *inosused;
	struct cdev *dev;

	cg = ino_to_cg(fs, ino);
	/*
	 * The cylinder group is addressed differently depending on
	 * whether we are updating a snapshot file (filesystem block
	 * units) or the underlying disk device (disk block units).
	 */
	if (devvp->v_type != VCHR) {
		/* devvp is a snapshot */
		dev = VTOI(devvp)->i_devvp->v_rdev;
		cgbno = fragstoblks(fs, cgtod(fs, cg));
	} else {
		/* devvp is a normal disk device */
		dev = devvp->v_rdev;
		cgbno = fsbtodb(fs, cgtod(fs, cg));
	}
	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
		panic("ffs_freefile: range: dev = %s, ino = %lu, fs = %s",
		    devtoname(dev), (u_long)ino, fs->fs_fsmnt);
	if ((error = bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp))) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		/* Bad cylinder group magic; drop the buffer and give up. */
		brelse(bp);
		return (0);
	}
	bp->b_xflags |= BX_BKGRDWRITE;
	cgp->cg_old_time = cgp->cg_time = time_second;
	inosused = cg_inosused(cgp);
	ino %= fs->fs_ipg;	/* now relative to this cylinder group */
	if (isclr(inosused, ino)) {
		/* Freeing an already-free inode: complain, and panic
		 * unless the filesystem is mounted read-only. */
		printf("dev = %s, ino = %lu, fs = %s\n", devtoname(dev),
		    (u_long)ino + cg * fs->fs_ipg, fs->fs_fsmnt);
		if (fs->fs_ronly == 0)
			panic("ffs_freefile: freeing free inode");
	}
	clrbit(inosused, ino);
	/* Let the inode-allocation rotor start no later than this slot. */
	if (ino < cgp->cg_irotor)
		cgp->cg_irotor = ino;
	cgp->cg_cs.cs_nifree++;
	/* Superblock-level summary counts are protected by the UFS lock. */
	UFS_LOCK(ump);
	fs->fs_cstotal.cs_nifree++;
	fs->fs_cs(fs, cg).cs_nifree++;
	if ((mode & IFMT) == IFDIR) {
		cgp->cg_cs.cs_ndir--;
		fs->fs_cstotal.cs_ndir--;
		fs->fs_cs(fs, cg).cs_ndir--;
	}
	fs->fs_fmod = 1;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	/* Delayed write: the cg buffer will be flushed later. */
	bdwrite(bp);
	return (0);
}
2063 
2064 /*
2065  * Check to see if a file is free.
2066  */
2067 int
2068 ffs_checkfreefile(fs, devvp, ino)
2069 	struct fs *fs;
2070 	struct vnode *devvp;
2071 	ino_t ino;
2072 {
2073 	struct cg *cgp;
2074 	struct buf *bp;
2075 	ufs2_daddr_t cgbno;
2076 	int ret, cg;
2077 	u_int8_t *inosused;
2078 
2079 	cg = ino_to_cg(fs, ino);
2080 	if (devvp->v_type != VCHR) {
2081 		/* devvp is a snapshot */
2082 		cgbno = fragstoblks(fs, cgtod(fs, cg));
2083 	} else {
2084 		/* devvp is a normal disk device */
2085 		cgbno = fsbtodb(fs, cgtod(fs, cg));
2086 	}
2087 	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
2088 		return (1);
2089 	if (bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp)) {
2090 		brelse(bp);
2091 		return (1);
2092 	}
2093 	cgp = (struct cg *)bp->b_data;
2094 	if (!cg_chkmagic(cgp)) {
2095 		brelse(bp);
2096 		return (1);
2097 	}
2098 	inosused = cg_inosused(cgp);
2099 	ino %= fs->fs_ipg;
2100 	ret = isclr(inosused, ino);
2101 	brelse(bp);
2102 	return (ret);
2103 }
2104 
2105 /*
2106  * Find a block of the specified size in the specified cylinder group.
2107  *
2108  * It is a panic if a request is made to find a block if none are
2109  * available.
2110  */
static ufs1_daddr_t
ffs_mapsearch(fs, cgp, bpref, allocsiz)
	struct fs *fs;
	struct cg *cgp;
	ufs2_daddr_t bpref;
	int allocsiz;
{
	ufs1_daddr_t bno;
	int start, len, loc, i;
	int blk, field, subfield, pos;
	u_int8_t *blksfree;

	/*
	 * find the fragment by searching through the free block
	 * map for an appropriate bit pattern
	 */
	if (bpref)
		start = dtogd(fs, bpref) / NBBY;
	else
		start = cgp->cg_frotor / NBBY;
	blksfree = cg_blksfree(cgp);
	/* First pass: scan from the preferred byte to the end of the map. */
	len = howmany(fs->fs_fpg, NBBY) - start;
	loc = scanc((u_int)len, (u_char *)&blksfree[start],
		(u_char *)fragtbl[fs->fs_frag],
		(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
	if (loc == 0) {
		/* Second pass: wrap around to the start of the map. */
		len = start + 1;
		start = 0;
		loc = scanc((u_int)len, (u_char *)&blksfree[0],
			(u_char *)fragtbl[fs->fs_frag],
			(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
		if (loc == 0) {
			/* The caller guaranteed space exists; the map lied. */
			printf("start = %d, len = %d, fs = %s\n",
			    start, len, fs->fs_fsmnt);
			panic("ffs_alloccg: map corrupted");
			/* NOTREACHED */
		}
	}
	/* scanc returns the residual count; matching byte is len - loc in. */
	bno = (start + len - loc) * NBBY;
	cgp->cg_frotor = bno;
	/*
	 * found the byte in the map
	 * sift through the bits to find the selected frag
	 */
	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
		blk = blkmap(fs, blksfree, bno);
		blk <<= 1;
		field = around[allocsiz];
		subfield = inside[allocsiz];
		/* Slide the allocsiz-wide window across the block's bits. */
		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
			if ((blk & field) == subfield)
				return (bno + pos);
			field <<= 1;
			subfield <<= 1;
		}
	}
	/* The matched byte should have contained a run; map is corrupt. */
	printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt);
	panic("ffs_alloccg: block not in map");
	return (-1);
}
2171 
2172 /*
2173  * Update the cluster map because of an allocation or free.
2174  *
2175  * Cnt == 1 means free; cnt == -1 means allocating.
2176  */
void
ffs_clusteracct(ump, fs, cgp, blkno, cnt)
	struct ufsmount *ump;
	struct fs *fs;
	struct cg *cgp;
	ufs1_daddr_t blkno;
	int cnt;
{
	int32_t *sump;
	int32_t *lp;
	u_char *freemapp, *mapp;
	int i, start, end, forw, back, map, bit;

	mtx_assert(UFS_MTX(ump), MA_OWNED);

	/* Nothing to do if this filesystem keeps no cluster summaries. */
	if (fs->fs_contigsumsize <= 0)
		return;
	freemapp = cg_clustersfree(cgp);
	sump = cg_clustersum(cgp);
	/*
	 * Allocate or clear the actual block.
	 */
	if (cnt > 0)
		setbit(freemapp, blkno);
	else
		clrbit(freemapp, blkno);
	/*
	 * Find the size of the cluster going forward.
	 * The scan is bounded by fs_contigsumsize since longer runs
	 * are all accounted in the same summary bucket.
	 */
	start = blkno + 1;
	end = start + fs->fs_contigsumsize;
	if (end >= cgp->cg_nclusterblks)
		end = cgp->cg_nclusterblks;
	mapp = &freemapp[start / NBBY];
	map = *mapp++;
	bit = 1 << (start % NBBY);
	for (i = start; i < end; i++) {
		if ((map & bit) == 0)
			break;
		/* Step the bit; refetch a map byte at each NBBY boundary. */
		if ((i & (NBBY - 1)) != (NBBY - 1)) {
			bit <<= 1;
		} else {
			map = *mapp++;
			bit = 1;
		}
	}
	forw = i - start;
	/*
	 * Find the size of the cluster going backward.
	 */
	start = blkno - 1;
	end = start - fs->fs_contigsumsize;
	if (end < 0)
		end = -1;
	mapp = &freemapp[start / NBBY];
	map = *mapp--;
	bit = 1 << (start % NBBY);
	for (i = start; i > end; i--) {
		if ((map & bit) == 0)
			break;
		/* Step the bit downward, refetching at byte boundaries. */
		if ((i & (NBBY - 1)) != 0) {
			bit >>= 1;
		} else {
			map = *mapp--;
			bit = 1 << (NBBY - 1);
		}
	}
	back = start - i;
	/*
	 * Account for old cluster and the possibly new forward and
	 * back clusters.
	 */
	i = back + forw + 1;
	if (i > fs->fs_contigsumsize)
		i = fs->fs_contigsumsize;
	sump[i] += cnt;
	if (back > 0)
		sump[back] -= cnt;
	if (forw > 0)
		sump[forw] -= cnt;
	/*
	 * Update cluster summary information.
	 * Record the largest cluster size that still has a free instance.
	 */
	lp = &sump[fs->fs_contigsumsize];
	for (i = fs->fs_contigsumsize; i > 0; i--)
		if (*lp-- > 0)
			break;
	fs->fs_maxcluster[cgp->cg_cgx] = i;
}
2266 
2267 /*
2268  * Fserr prints the name of a filesystem with an error diagnostic.
2269  *
2270  * The form of the error message is:
2271  *	fs: error message
2272  */
2273 static void
2274 ffs_fserr(fs, inum, cp)
2275 	struct fs *fs;
2276 	ino_t inum;
2277 	char *cp;
2278 {
2279 	struct thread *td = curthread;	/* XXX */
2280 	struct proc *p = td->td_proc;
2281 
2282 	log(LOG_ERR, "pid %d (%s), uid %d inumber %d on %s: %s\n",
2283 	    p->p_pid, p->p_comm, td->td_ucred->cr_uid, inum, fs->fs_fsmnt, cp);
2284 }
2285 
2286 /*
2287  * This function provides the capability for the fsck program to
2288  * update an active filesystem. Eleven operations are provided:
2289  *
2290  * adjrefcnt(inode, amt) - adjusts the reference count on the
2291  *	specified inode by the specified amount. Under normal
2292  *	operation the count should always go down. Decrementing
2293  *	the count to zero will cause the inode to be freed.
 * adjblkcnt(inode, amt) - adjust the number of blocks used by the
 *	specified inode by the specified amount.
2296  * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) -
2297  *	adjust the superblock summary.
2298  * freedirs(inode, count) - directory inodes [inode..inode + count - 1]
2299  *	are marked as free. Inodes should never have to be marked
2300  *	as in use.
2301  * freefiles(inode, count) - file inodes [inode..inode + count - 1]
2302  *	are marked as free. Inodes should never have to be marked
2303  *	as in use.
2304  * freeblks(blockno, size) - blocks [blockno..blockno + size - 1]
2305  *	are marked as free. Blocks should never have to be marked
2306  *	as in use.
2307  * setflags(flags, set/clear) - the fs_flags field has the specified
2308  *	flags set (second parameter +1) or cleared (second parameter -1).
2309  */
2310 
/* Common handler for all of the vfs.ffs fsck-support sysctls below. */
static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS);

/*
 * Each sysctl below is dispatched to sysctl_ffs_fsck(), which selects
 * the operation from oidp->oid_number (the FFS_* constant).
 */
SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR|CTLTYPE_STRUCT,
	0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR,
	sysctl_ffs_fsck, "Adjust Inode Used Blocks Count");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR,
	sysctl_ffs_fsck, "Adjust number of directories");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, CTLFLAG_WR,
	sysctl_ffs_fsck, "Adjust number of free blocks");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, CTLFLAG_WR,
	sysctl_ffs_fsck, "Adjust number of free inodes");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, CTLFLAG_WR,
	sysctl_ffs_fsck, "Adjust number of free frags");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, CTLFLAG_WR,
	sysctl_ffs_fsck, "Adjust number of free clusters");

static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR,
	sysctl_ffs_fsck, "Free Range of Directory Inodes");

static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR,
	sysctl_ffs_fsck, "Free Range of File Inodes");

static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR,
	sysctl_ffs_fsck, "Free Range of Blocks");

static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR,
	sysctl_ffs_fsck, "Change Filesystem Flags");

#ifdef DEBUG
/* When non-zero, every fsck command processed is logged to the console. */
static int fsckcmds = 0;
SYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, "");
#endif /* DEBUG */
2350 
2351 static int
2352 sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
2353 {
2354 	struct fsck_cmd cmd;
2355 	struct ufsmount *ump;
2356 	struct vnode *vp;
2357 	struct inode *ip;
2358 	struct mount *mp;
2359 	struct fs *fs;
2360 	ufs2_daddr_t blkno;
2361 	long blkcnt, blksize;
2362 	struct file *fp;
2363 	int filetype, error;
2364 
2365 	if (req->newlen > sizeof cmd)
2366 		return (EBADRPC);
2367 	if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0)
2368 		return (error);
2369 	if (cmd.version != FFS_CMD_VERSION)
2370 		return (ERPCMISMATCH);
2371 	if ((error = getvnode(curproc->p_fd, cmd.handle, &fp)) != 0)
2372 		return (error);
2373 	vn_start_write(fp->f_data, &mp, V_WAIT);
2374 	if (mp == 0 || strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) {
2375 		vn_finished_write(mp);
2376 		fdrop(fp, curthread);
2377 		return (EINVAL);
2378 	}
2379 	if (mp->mnt_flag & MNT_RDONLY) {
2380 		vn_finished_write(mp);
2381 		fdrop(fp, curthread);
2382 		return (EROFS);
2383 	}
2384 	ump = VFSTOUFS(mp);
2385 	fs = ump->um_fs;
2386 	filetype = IFREG;
2387 
2388 	switch (oidp->oid_number) {
2389 
2390 	case FFS_SET_FLAGS:
2391 #ifdef DEBUG
2392 		if (fsckcmds)
2393 			printf("%s: %s flags\n", mp->mnt_stat.f_mntonname,
2394 			    cmd.size > 0 ? "set" : "clear");
2395 #endif /* DEBUG */
2396 		if (cmd.size > 0)
2397 			fs->fs_flags |= (long)cmd.value;
2398 		else
2399 			fs->fs_flags &= ~(long)cmd.value;
2400 		break;
2401 
2402 	case FFS_ADJ_REFCNT:
2403 #ifdef DEBUG
2404 		if (fsckcmds) {
2405 			printf("%s: adjust inode %jd count by %jd\n",
2406 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
2407 			    (intmax_t)cmd.size);
2408 		}
2409 #endif /* DEBUG */
2410 		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
2411 			break;
2412 		ip = VTOI(vp);
2413 		ip->i_nlink += cmd.size;
2414 		DIP_SET(ip, i_nlink, ip->i_nlink);
2415 		ip->i_effnlink += cmd.size;
2416 		ip->i_flag |= IN_CHANGE;
2417 		if (DOINGSOFTDEP(vp))
2418 			softdep_change_linkcnt(ip);
2419 		vput(vp);
2420 		break;
2421 
2422 	case FFS_ADJ_BLKCNT:
2423 #ifdef DEBUG
2424 		if (fsckcmds) {
2425 			printf("%s: adjust inode %jd block count by %jd\n",
2426 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
2427 			    (intmax_t)cmd.size);
2428 		}
2429 #endif /* DEBUG */
2430 		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
2431 			break;
2432 		ip = VTOI(vp);
2433 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size);
2434 		ip->i_flag |= IN_CHANGE;
2435 		vput(vp);
2436 		break;
2437 
2438 	case FFS_DIR_FREE:
2439 		filetype = IFDIR;
2440 		/* fall through */
2441 
2442 	case FFS_FILE_FREE:
2443 #ifdef DEBUG
2444 		if (fsckcmds) {
2445 			if (cmd.size == 1)
2446 				printf("%s: free %s inode %d\n",
2447 				    mp->mnt_stat.f_mntonname,
2448 				    filetype == IFDIR ? "directory" : "file",
2449 				    (ino_t)cmd.value);
2450 			else
2451 				printf("%s: free %s inodes %d-%d\n",
2452 				    mp->mnt_stat.f_mntonname,
2453 				    filetype == IFDIR ? "directory" : "file",
2454 				    (ino_t)cmd.value,
2455 				    (ino_t)(cmd.value + cmd.size - 1));
2456 		}
2457 #endif /* DEBUG */
2458 		while (cmd.size > 0) {
2459 			if ((error = ffs_freefile(ump, fs, ump->um_devvp,
2460 			    cmd.value, filetype)))
2461 				break;
2462 			cmd.size -= 1;
2463 			cmd.value += 1;
2464 		}
2465 		break;
2466 
2467 	case FFS_BLK_FREE:
2468 #ifdef DEBUG
2469 		if (fsckcmds) {
2470 			if (cmd.size == 1)
2471 				printf("%s: free block %jd\n",
2472 				    mp->mnt_stat.f_mntonname,
2473 				    (intmax_t)cmd.value);
2474 			else
2475 				printf("%s: free blocks %jd-%jd\n",
2476 				    mp->mnt_stat.f_mntonname,
2477 				    (intmax_t)cmd.value,
2478 				    (intmax_t)cmd.value + cmd.size - 1);
2479 		}
2480 #endif /* DEBUG */
2481 		blkno = cmd.value;
2482 		blkcnt = cmd.size;
2483 		blksize = fs->fs_frag - (blkno % fs->fs_frag);
2484 		while (blkcnt > 0) {
2485 			if (blksize > blkcnt)
2486 				blksize = blkcnt;
2487 			ffs_blkfree(ump, fs, ump->um_devvp, blkno,
2488 			    blksize * fs->fs_fsize, ROOTINO);
2489 			blkno += blksize;
2490 			blkcnt -= blksize;
2491 			blksize = fs->fs_frag;
2492 		}
2493 		break;
2494 
2495 	/*
2496 	 * Adjust superblock summaries.  fsck(8) is expected to
2497 	 * submit deltas when necessary.
2498 	 */
2499 	case FFS_ADJ_NDIR:
2500 #ifdef DEBUG
2501 		if (fsckcmds) {
2502 			printf("%s: adjust number of directories by %jd\n",
2503 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
2504 		}
2505 #endif /* DEBUG */
2506 		fs->fs_cstotal.cs_ndir += cmd.value;
2507 		break;
2508 	case FFS_ADJ_NBFREE:
2509 #ifdef DEBUG
2510 		if (fsckcmds) {
2511 			printf("%s: adjust number of free blocks by %+jd\n",
2512 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
2513 		}
2514 #endif /* DEBUG */
2515 		fs->fs_cstotal.cs_nbfree += cmd.value;
2516 		break;
2517 	case FFS_ADJ_NIFREE:
2518 #ifdef DEBUG
2519 		if (fsckcmds) {
2520 			printf("%s: adjust number of free inodes by %+jd\n",
2521 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
2522 		}
2523 #endif /* DEBUG */
2524 		fs->fs_cstotal.cs_nifree += cmd.value;
2525 		break;
2526 	case FFS_ADJ_NFFREE:
2527 #ifdef DEBUG
2528 		if (fsckcmds) {
2529 			printf("%s: adjust number of free frags by %+jd\n",
2530 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
2531 		}
2532 #endif /* DEBUG */
2533 		fs->fs_cstotal.cs_nffree += cmd.value;
2534 		break;
2535 	case FFS_ADJ_NUMCLUSTERS:
2536 #ifdef DEBUG
2537 		if (fsckcmds) {
2538 			printf("%s: adjust number of free clusters by %+jd\n",
2539 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
2540 		}
2541 #endif /* DEBUG */
2542 		fs->fs_cstotal.cs_numclusters += cmd.value;
2543 		break;
2544 
2545 	default:
2546 #ifdef DEBUG
2547 		if (fsckcmds) {
2548 			printf("Invalid request %d from fsck\n",
2549 			    oidp->oid_number);
2550 		}
2551 #endif /* DEBUG */
2552 		error = EINVAL;
2553 		break;
2554 
2555 	}
2556 	fdrop(fp, curthread);
2557 	vn_finished_write(mp);
2558 	return (error);
2559 }
2560