xref: /titanic_52/usr/src/uts/common/fs/ufs/ufs_alloc.c (revision 10db1377dafab8ba3feedef26db9c5d8539a5cd1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 
41 #pragma ident	"%Z%%M%	%I%	%E% SMI"
42 
43 #include <sys/condvar_impl.h>
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/debug.h>
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/signal.h>
50 #include <sys/cred.h>
51 #include <sys/proc.h>
52 #include <sys/disp.h>
53 #include <sys/user.h>
54 #include <sys/buf.h>
55 #include <sys/vfs.h>
56 #include <sys/vnode.h>
57 #include <sys/acl.h>
58 #include <sys/fs/ufs_fs.h>
59 #include <sys/fs/ufs_inode.h>
60 #include <sys/fs/ufs_acl.h>
61 #include <sys/fs/ufs_bio.h>
62 #include <sys/fs/ufs_quota.h>
63 #include <sys/kmem.h>
64 #include <sys/fs/ufs_trans.h>
65 #include <sys/fs/ufs_panic.h>
66 #include <sys/errno.h>
67 #include <sys/time.h>
68 #include <sys/sysmacros.h>
69 #include <sys/file.h>
70 #include <sys/fcntl.h>
71 #include <sys/flock.h>
72 #include <fs/fs_subr.h>
73 #include <sys/cmn_err.h>
74 #include <sys/policy.h>
75 
76 static ino_t	hashalloc();
77 static daddr_t	fragextend();
78 static daddr_t	alloccg();
79 static daddr_t	alloccgblk();
80 static ino_t	ialloccg();
81 static daddr_t	mapsearch();
82 
83 extern int	inside[], around[];
84 extern uchar_t	*fragtbl[];
85 void delay();
86 
87 /*
88  * Allocate a block in the file system.
89  *
90  * The size of the requested block is given, which must be some
91  * multiple of fs_fsize and <= fs_bsize.
92  * A preference may be optionally specified. If a preference is given
93  * the following hierarchy is used to allocate a block:
94  *   1) allocate the requested block.
95  *   2) allocate a rotationally optimal block in the same cylinder.
96  *   3) allocate a block in the same cylinder group.
97  *   4) quadratically rehash into other cylinder groups, until an
98  *	available block is located.
99  * If no block preference is given the following hierarchy is used
100  * to allocate a block:
101  *   1) allocate a block in the cylinder group that contains the
102  *	inode for the file.
103  *   2) quadratically rehash into other cylinder groups, until an
104  *	available block is located.
105  */
106 int
107 alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr)
108 {
109 	struct fs *fs;
110 	struct ufsvfs *ufsvfsp;
111 	daddr_t bno;
112 	int cg;
113 	int err;
114 	char *errmsg = NULL;
115 	size_t len;
116 
117 	ufsvfsp = ip->i_ufsvfs;
118 	fs = ufsvfsp->vfs_fs;
119 	if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) {
120 		err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx,"
121 		    " bsize = %d, size = %d, fs = %s\n",
122 		    ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
123 		return (err);
124 	}
125 	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
126 		goto nospace;
127 	if (freespace(fs, ufsvfsp) <= 0 &&
128 	    secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
129 		goto nospace;
130 	err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len);
131 	/* Note that may not have err, but may have errmsg */
132 	if (errmsg != NULL) {
133 		uprintf(errmsg);
134 		kmem_free(errmsg, len);
135 		errmsg = NULL;
136 	}
137 	if (err)
138 		return (err);
139 	if (bpref >= fs->fs_size)
140 		bpref = 0;
141 	if (bpref == 0)
142 		cg = (int)itog(fs, ip->i_number);
143 	else
144 		cg = dtog(fs, bpref);
145 
146 	bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size,
147 	    (ulong_t (*)())alloccg);
148 	if (bno > 0) {
149 		*bnp = bno;
150 		return (0);
151 	}
152 
153 	/*
154 	 * hashalloc() failed because some other thread grabbed
155 	 * the last block so unwind the quota operation.  We can
156 	 * ignore the return because subtractions don't fail and
157 	 * size is guaranteed to be >= zero by our caller.
158 	 */
159 	(void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL,
160 		(size_t *)NULL);
161 
162 nospace:
163 	mutex_enter(&ufsvfsp->vfs_lock);
164 	if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
165 		(!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
166 		ufsvfsp->vfs_lastwhinetime = lbolt;
167 		cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt);
168 	}
169 	mutex_exit(&ufsvfsp->vfs_lock);
170 	return (ENOSPC);
171 }
172 
173 /*
174  * Reallocate a fragment to a bigger size
175  *
176  * The number and size of the old block is given, and a preference
177  * and new size is also specified.  The allocator attempts to extend
178  * the original block.  Failing that, the regular block allocator is
179  * invoked to get an appropriate block.
180  */
181 int
182 realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize,
183     int nsize, daddr_t *bnp, cred_t *cr)
184 {
185 	daddr_t bno;
186 	struct fs *fs;
187 	struct ufsvfs *ufsvfsp;
188 	int cg, request;
189 	int err;
190 	char *errmsg = NULL;
191 	size_t len;
192 
193 	ufsvfsp = ip->i_ufsvfs;
194 	fs = ufsvfsp->vfs_fs;
195 	if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
196 	    (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
197 		err = ufs_fault(ITOV(ip),
198 		    "realloccg: bad size, dev=0x%lx, bsize=%d, "
199 		    "osize=%d, nsize=%d, fs=%s\n",
200 		    ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
201 		return (err);
202 	}
203 	if (freespace(fs, ufsvfsp) <= 0 &&
204 	    secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
205 		goto nospace;
206 	if (bprev == 0) {
207 		err = ufs_fault(ITOV(ip),
208 		    "realloccg: bad bprev, dev = 0x%lx, bsize = %d,"
209 		    " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev,
210 		    fs->fs_fsmnt);
211 		return (err);
212 	}
213 	err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len);
214 	/* Note that may not have err, but may have errmsg */
215 	if (errmsg != NULL) {
216 		uprintf(errmsg);
217 		kmem_free(errmsg, len);
218 		errmsg = NULL;
219 	}
220 	if (err)
221 		return (err);
222 	cg = dtog(fs, bprev);
223 	bno = fragextend(ip, cg, (long)bprev, osize, nsize);
224 	if (bno != 0) {
225 		*bnp = bno;
226 		return (0);
227 	}
228 	if (bpref >= fs->fs_size)
229 		bpref = 0;
230 
231 	/*
232 	 * When optimizing for time we allocate a full block and
233 	 * then only use the upper portion for this request. When
234 	 * this file grows again it will grow into the unused portion
235 	 * of the block (See fragextend() above).  This saves time
236 	 * because an extra disk write would be needed if the frags
237 	 * following the current allocation were not free. The extra
238 	 * disk write is needed to move the data from its current
239 	 * location into the newly allocated position.
240 	 *
241 	 * When optimizing for space we allocate a run of frags
242 	 * that is just the right size for this request.
243 	 */
244 	request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize;
245 	bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request,
246 		(ulong_t (*)())alloccg);
247 	if (bno > 0) {
248 		*bnp = bno;
249 		if (nsize < request)
250 			(void) free(ip, bno + numfrags(fs, nsize),
251 			    (off_t)(request - nsize), I_NOCANCEL);
252 		return (0);
253 	}
254 
255 	/*
256 	 * hashalloc() failed because some other thread grabbed
257 	 * the last block so unwind the quota operation.  We can
258 	 * ignore the return because subtractions don't fail, and
259 	 * our caller guarantees nsize >= osize.
260 	 */
261 	(void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL,
262 		(size_t *)NULL);
263 
264 nospace:
265 	mutex_enter(&ufsvfsp->vfs_lock);
266 	if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
267 		(!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
268 		ufsvfsp->vfs_lastwhinetime = lbolt;
269 		cmn_err(CE_NOTE,
270 			"realloccg %s: file system full", fs->fs_fsmnt);
271 	}
272 	mutex_exit(&ufsvfsp->vfs_lock);
273 	return (ENOSPC);
274 }
275 
276 /*
277  * Allocate an inode in the file system.
278  *
279  * A preference may be optionally specified. If a preference is given
280  * the following hierarchy is used to allocate an inode:
281  *   1) allocate the requested inode.
282  *   2) allocate an inode in the same cylinder group.
283  *   3) quadratically rehash into other cylinder groups, until an
284  *	available inode is located.
285  * If no inode preference is given the following hierarchy is used
286  * to allocate an inode:
287  *   1) allocate an inode in cylinder group 0.
288  *   2) quadratically rehash into other cylinder groups, until an
289  *	available inode is located.
290  */
291 int
292 ufs_ialloc(struct inode *pip,
293     ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr)
294 {
295 	struct inode *ip;
296 	struct fs *fs;
297 	int cg;
298 	ino_t ino;
299 	int err;
300 	int nifree;
301 	struct ufsvfs *ufsvfsp = pip->i_ufsvfs;
302 	char *errmsg = NULL;
303 	size_t len;
304 
305 	ASSERT(RW_WRITE_HELD(&pip->i_rwlock));
306 	fs = pip->i_fs;
307 loop:
308 	nifree = fs->fs_cstotal.cs_nifree;
309 
310 	if (nifree == 0)
311 		goto noinodes;
312 	/*
313 	 * Shadow inodes don't count against a user's inode allocation.
314 	 * They are an implementation method and not a resource.
315 	 */
316 	if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
317 		err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data,
318 			/* change */ 1, (struct inode *)NULL, crgetuid(cr), 0,
319 			cr, &errmsg, &len);
320 		/*
321 		 * As we haven't acquired any locks yet, dump the message
322 		 * now.
323 		 */
324 		if (errmsg != NULL) {
325 			uprintf(errmsg);
326 			kmem_free(errmsg, len);
327 			errmsg = NULL;
328 		}
329 		if (err)
330 			return (err);
331 	}
332 
333 	if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg))
334 		ipref = 0;
335 	cg = (int)itog(fs, ipref);
336 	ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode,
337 	    (ulong_t (*)())ialloccg);
338 	if (ino == 0) {
339 		if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
340 			/*
341 			 * We can safely ignore the return from chkiq()
342 			 * because deallocations can only fail if we
343 			 * can't get the user's quota info record off
344 			 * the disk due to an I/O error.  In that case,
345 			 * the quota subsystem is already messed up.
346 			 */
347 			(void) chkiq(ufsvfsp, /* change */ -1,
348 				(struct inode *)NULL, crgetuid(cr), 0, cr,
349 				(char **)NULL, (size_t *)NULL);
350 		}
351 		goto noinodes;
352 	}
353 	err = ufs_iget(pip->i_vfs, ino, ipp, cr);
354 	if (err) {
355 		if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
356 			/*
357 			 * See above comment about why it is safe to ignore an
358 			 * error return here.
359 			 */
360 			(void) chkiq(ufsvfsp, /* change */ -1,
361 				(struct inode *)NULL, crgetuid(cr), 0, cr,
362 				(char **)NULL, (size_t *)NULL);
363 		}
364 		ufs_ifree(pip, ino, 0);
365 		return (err);
366 	}
367 	ip = *ipp;
368 	ASSERT(!ip->i_ufs_acl);
369 	ASSERT(!ip->i_dquot);
370 	rw_enter(&ip->i_contents, RW_WRITER);
371 
372 	/*
373 	 * Check if we really got a free inode, if not then complain
374 	 * and mark the inode ISTALE so that it will be freed by the
375 	 * ufs idle thread eventually and will not be sent to ufs_delete().
376 	 */
377 	if (ip->i_mode || (ip->i_nlink > 0)) {
378 		ip->i_flag |= ISTALE;
379 		rw_exit(&ip->i_contents);
380 		VN_RELE(ITOV(ip));
381 		cmn_err(CE_WARN,
382 			"%s: unexpected allocated inode %d, run fsck(1M)%s",
383 			fs->fs_fsmnt, (int)ino,
384 			(TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
385 		goto loop;
386 	}
387 
388 	/*
389 	 * Check the inode has no size or data blocks.
390 	 * This could have happened if the truncation failed when
391 	 * deleting the inode. It used to be possible for this to occur
392 	 * if a block allocation failed when iteratively truncating a
393 	 * large file using logging and with a full file system.
394 	 * This was fixed with bug fix 4348738. However, truncation may
395 	 * still fail on an IO error. So in all cases for safety and
396 	 * security we clear out the size; the blocks allocated; and
397 	 * pointers to the blocks. This will ultimately cause a fsck
398 	 * error of un-accounted for blocks, but its a fairly benign error,
399 	 * and possibly the correct thing to do anyway as accesssing those
400 	 * blocks agains may lead to more IO errors.
401 	 */
402 	if (ip->i_size || ip->i_blocks) {
403 		int i;
404 
405 		if (ip->i_size) {
406 			cmn_err(CE_WARN,
407 			    "%s: free inode %d had size 0x%llx, run fsck(1M)%s",
408 			    fs->fs_fsmnt, (int)ino, ip->i_size,
409 			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
410 		}
411 		/*
412 		 * Clear any garbage left behind.
413 		 */
414 		ip->i_size = (u_offset_t)0;
415 		ip->i_blocks = 0;
416 		for (i = 0; i < NDADDR; i++)
417 			ip->i_db[i] = 0;
418 		for (i = 0; i < NIADDR; i++)
419 			ip->i_ib[i] = 0;
420 	}
421 
422 	/*
423 	 * Initialize the link count
424 	 */
425 	ip->i_nlink = 0;
426 
427 	/*
428 	 * Clear the old flags
429 	 */
430 	ip->i_flag &= IREF;
431 
432 	/*
433 	 * Access times are not really defined if the fs is mounted
434 	 * with 'noatime'. But it can cause nfs clients to fail
435 	 * open() if the atime is not a legal value. Set a legal value
436 	 * here when the inode is allocated.
437 	 */
438 	if (ufsvfsp->vfs_noatime) {
439 		mutex_enter(&ufs_iuniqtime_lock);
440 		ip->i_atime = iuniqtime;
441 		mutex_exit(&ufs_iuniqtime_lock);
442 	}
443 	rw_exit(&ip->i_contents);
444 	return (0);
445 noinodes:
446 	if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET))
447 		cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt);
448 	return (ENOSPC);
449 }
450 
451 /*
452  * Find a cylinder group to place a directory.
453  * Returns an inumber within the selected cylinder group.
454  * Note, the vfs_lock is not needed as we don't require exact cg summary info.
455  *
456  * If the switch ufs_close_dirs is set, then the policy is to use
457  * the current cg if it has more than 25% free inodes and more
458  * than 25% free blocks. Otherwise the cgs are searched from
459  * the beginning and the first cg with the same criteria is
460  * used. If that is also null then we revert to the old algorithm.
461  * This tends to cluster files at the beginning of the disk
462  * until the disk gets full.
463  *
464  * Otherwise if ufs_close_dirs is not set then the original policy is
465  * used which is to select from among those cylinder groups with
466  * above the average number of free inodes, the one with the smallest
467  * number of directories.
468  */
469 
470 int ufs_close_dirs = 1;	/* allocate directories close as possible */
471 
472 ino_t
473 dirpref(inode_t *dp)
474 {
475 	int cg, minndir, mincg, avgifree, mininode, minbpg, ifree;
476 	struct fs *fs = dp->i_fs;
477 
478 	cg = itog(fs, dp->i_number);
479 	mininode = fs->fs_ipg >> 2;
480 	minbpg = fs->fs_maxbpg >> 2;
481 	if (ufs_close_dirs &&
482 	    (fs->fs_cs(fs, cg).cs_nifree > mininode) &&
483 	    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
484 		return (dp->i_number);
485 	}
486 
487 	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
488 	minndir = fs->fs_ipg;
489 	mincg = 0;
490 	for (cg = 0; cg < fs->fs_ncg; cg++) {
491 		ifree = fs->fs_cs(fs, cg).cs_nifree;
492 		if (ufs_close_dirs &&
493 		    (ifree > mininode) &&
494 		    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
495 			return ((ino_t)(fs->fs_ipg * cg));
496 		}
497 		if ((fs->fs_cs(fs, cg).cs_ndir < minndir) &&
498 		    (ifree >= avgifree)) {
499 			mincg = cg;
500 			minndir = fs->fs_cs(fs, cg).cs_ndir;
501 		}
502 	}
503 	return ((ino_t)(fs->fs_ipg * mincg));
504 }
505 
506 /*
507  * Select the desired position for the next block in a file.  The file is
508  * logically divided into sections. The first section is composed of the
509  * direct blocks. Each additional section contains fs_maxbpg blocks.
510  *
511  * If no blocks have been allocated in the first section, the policy is to
512  * request a block in the same cylinder group as the inode that describes
513  * the file. If no blocks have been allocated in any other section, the
514  * policy is to place the section in a cylinder group with a greater than
515  * average number of free blocks.  An appropriate cylinder group is found
516  * by using a rotor that sweeps the cylinder groups. When a new group of
517  * blocks is needed, the sweep begins in the cylinder group following the
518  * cylinder group from which the previous allocation was made. The sweep
519  * continues until a cylinder group with greater than the average number
520  * of free blocks is found. If the allocation is for the first block in an
521  * indirect block, the information on the previous allocation is unavailable;
522  * here a best guess is made based upon the logical block number being
523  * allocated.
524  *
525  * If a section is already partially allocated, the policy is to
526  * contiguously allocate fs_maxcontig blocks.  The end of one of these
527  * contiguous blocks and the beginning of the next is physically separated
528  * so that the disk head will be in transit between them for at least
529  * fs_rotdelay milliseconds.  This is to allow time for the processor to
530  * schedule another I/O transfer.
531  */
532 daddr_t
533 blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap)
534 {
535 	struct fs *fs;
536 	struct ufsvfs *ufsvfsp;
537 	int cg;
538 	int avgbfree, startcg;
539 	daddr_t nextblk;
540 
541 	ufsvfsp = ip->i_ufsvfs;
542 	fs = ip->i_fs;
543 	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
544 		if (lbn < NDADDR) {
545 			cg = itog(fs, ip->i_number);
546 			return (fs->fs_fpg * cg + fs->fs_frag);
547 		}
548 		/*
549 		 * Find a cylinder with greater than average
550 		 * number of unused data blocks.
551 		 */
552 		if (indx == 0 || bap[indx - 1] == 0)
553 			startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg;
554 		else
555 			startcg = dtog(fs, bap[indx - 1]) + 1;
556 		startcg %= fs->fs_ncg;
557 
558 		mutex_enter(&ufsvfsp->vfs_lock);
559 		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
560 		/*
561 		 * used for computing log space for writes/truncs
562 		 */
563 		ufsvfsp->vfs_avgbfree = avgbfree;
564 		for (cg = startcg; cg < fs->fs_ncg; cg++)
565 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
566 				fs->fs_cgrotor = cg;
567 				mutex_exit(&ufsvfsp->vfs_lock);
568 				return (fs->fs_fpg * cg + fs->fs_frag);
569 			}
570 		for (cg = 0; cg <= startcg; cg++)
571 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
572 				fs->fs_cgrotor = cg;
573 				mutex_exit(&ufsvfsp->vfs_lock);
574 				return (fs->fs_fpg * cg + fs->fs_frag);
575 			}
576 		mutex_exit(&ufsvfsp->vfs_lock);
577 		return (NULL);
578 	}
579 	/*
580 	 * One or more previous blocks have been laid out. If less
581 	 * than fs_maxcontig previous blocks are contiguous, the
582 	 * next block is requested contiguously, otherwise it is
583 	 * requested rotationally delayed by fs_rotdelay milliseconds.
584 	 */
585 
586 	nextblk = bap[indx - 1];
587 	/*
588 	 * Provision for fallocate to return positive
589 	 * blk preference based on last allocation
590 	 */
591 	if (nextblk < 0 && nextblk != UFS_HOLE) {
592 		nextblk = (-bap[indx - 1]) + fs->fs_frag;
593 	} else {
594 		nextblk = bap[indx - 1] + fs->fs_frag;
595 	}
596 
597 	if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] +
598 	    blkstofrags(fs, fs->fs_maxcontig) != nextblk) {
599 		return (nextblk);
600 	}
601 	if (fs->fs_rotdelay != 0)
602 		/*
603 		 * Here we convert ms of delay to frags as:
604 		 * (frags) = (ms) * (rev/sec) * (sect/rev) /
605 		 * 	((sect/frag) * (ms/sec))
606 		 * then round up to the next block.
607 		 */
608 		nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
609 		    (NSPF(fs) * 1000), fs->fs_frag);
610 	return (nextblk);
611 }
612 
613 /*
614  * Free a block or fragment.
615  *
616  * The specified block or fragment is placed back in the
617  * free map. If a fragment is deallocated, a possible
618  * block reassembly is checked.
619  */
620 void
621 free(struct inode *ip, daddr_t bno, off_t size, int flags)
622 {
623 	struct fs *fs = ip->i_fs;
624 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
625 	struct ufs_q *delq = &ufsvfsp->vfs_delete;
626 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
627 	struct cg *cgp;
628 	struct buf *bp;
629 	int cg, bmap, bbase;
630 	int i;
631 	uchar_t *blksfree;
632 	int *blktot;
633 	short *blks;
634 	daddr_t blkno, cylno, rpos;
635 
636 	/*
637 	 * fallocate'd files will have negative block address.
638 	 * So negate it again to get original block address.
639 	 */
640 	if (bno < 0 && bno % fs->fs_bsize == 0 && bno != UFS_HOLE) {
641 		bno = -bno;
642 	}
643 
644 	if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) {
645 		(void) ufs_fault(ITOV(ip),
646 		    "free: bad size, dev = 0x%lx, bsize = %d, size = %d, "
647 		    "fs = %s\n", ip->i_dev, fs->fs_bsize,
648 		    (int)size, fs->fs_fsmnt);
649 		return;
650 	}
651 	cg = dtog(fs, bno);
652 	ASSERT(!ufs_badblock(ip, bno));
653 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
654 	    (int)fs->fs_cgsize);
655 
656 	cgp = bp->b_un.b_cg;
657 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
658 		brelse(bp);
659 		return;
660 	}
661 
662 	if (!(flags & I_NOCANCEL))
663 		TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags);
664 	if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) {
665 		TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size);
666 	}
667 	blksfree = cg_blksfree(cgp);
668 	blktot = cg_blktot(cgp);
669 	mutex_enter(&ufsvfsp->vfs_lock);
670 	cgp->cg_time = gethrestime_sec();
671 	bno = dtogd(fs, bno);
672 	if (size == fs->fs_bsize) {
673 		blkno = fragstoblks(fs, bno);
674 		cylno = cbtocylno(fs, bno);
675 		rpos = cbtorpos(ufsvfsp, bno);
676 		blks = cg_blks(ufsvfsp, cgp, cylno);
677 		if (!isclrblock(fs, blksfree, blkno)) {
678 			mutex_exit(&ufsvfsp->vfs_lock);
679 			brelse(bp);
680 			(void) ufs_fault(ITOV(ip), "free: freeing free block, "
681 			    "dev:0x%lx, block:%ld, ino:%lu, fs:%s",
682 			    ip->i_dev, bno, ip->i_number, fs->fs_fsmnt);
683 			return;
684 		}
685 		setblock(fs, blksfree, blkno);
686 		blks[rpos]++;
687 		blktot[cylno]++;
688 		cgp->cg_cs.cs_nbfree++;		/* Log below */
689 		fs->fs_cstotal.cs_nbfree++;
690 		fs->fs_cs(fs, cg).cs_nbfree++;
691 		if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
692 			mutex_enter(&delq->uq_mutex);
693 			delq_info->delq_unreclaimed_blocks -=
694 			    btodb(fs->fs_bsize);
695 			mutex_exit(&delq->uq_mutex);
696 		}
697 	} else {
698 		bbase = bno - fragnum(fs, bno);
699 		/*
700 		 * Decrement the counts associated with the old frags
701 		 */
702 		bmap = blkmap(fs, blksfree, bbase);
703 		fragacct(fs, bmap, cgp->cg_frsum, -1);
704 		/*
705 		 * Deallocate the fragment
706 		 */
707 		for (i = 0; i < numfrags(fs, size); i++) {
708 			if (isset(blksfree, bno + i)) {
709 				brelse(bp);
710 				mutex_exit(&ufsvfsp->vfs_lock);
711 				(void) ufs_fault(ITOV(ip),
712 				    "free: freeing free frag, "
713 				    "dev:0x%lx, blk:%ld, cg:%d, "
714 				    "ino:%lu, fs:%s",
715 				    ip->i_dev,
716 				    bno + i,
717 				    cgp->cg_cgx,
718 				    ip->i_number,
719 				    fs->fs_fsmnt);
720 				return;
721 			}
722 			setbit(blksfree, bno + i);
723 		}
724 		cgp->cg_cs.cs_nffree += i;
725 		fs->fs_cstotal.cs_nffree += i;
726 		fs->fs_cs(fs, cg).cs_nffree += i;
727 		if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
728 			mutex_enter(&delq->uq_mutex);
729 			delq_info->delq_unreclaimed_blocks -=
730 			    btodb(i * fs->fs_fsize);
731 			mutex_exit(&delq->uq_mutex);
732 		}
733 		/*
734 		 * Add back in counts associated with the new frags
735 		 */
736 		bmap = blkmap(fs, blksfree, bbase);
737 		fragacct(fs, bmap, cgp->cg_frsum, 1);
738 		/*
739 		 * If a complete block has been reassembled, account for it
740 		 */
741 		blkno = fragstoblks(fs, bbase);
742 		if (isblock(fs, blksfree, blkno)) {
743 			cylno = cbtocylno(fs, bbase);
744 			rpos = cbtorpos(ufsvfsp, bbase);
745 			blks = cg_blks(ufsvfsp, cgp, cylno);
746 			blks[rpos]++;
747 			blktot[cylno]++;
748 			cgp->cg_cs.cs_nffree -= fs->fs_frag;
749 			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
750 			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
751 			cgp->cg_cs.cs_nbfree++;
752 			fs->fs_cstotal.cs_nbfree++;
753 			fs->fs_cs(fs, cg).cs_nbfree++;
754 		}
755 	}
756 	fs->fs_fmod = 1;
757 	ufs_notclean(ufsvfsp);
758 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
759 	TRANS_SI(ufsvfsp, fs, cg);
760 	bdrwrite(bp);
761 }
762 
763 /*
764  * Free an inode.
765  *
766  * The specified inode is placed back in the free map.
767  */
768 void
769 ufs_ifree(struct inode *ip, ino_t ino, mode_t mode)
770 {
771 	struct fs *fs = ip->i_fs;
772 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
773 	struct cg *cgp;
774 	struct buf *bp;
775 	unsigned int inot;
776 	int cg;
777 	char *iused;
778 
779 	if (ip->i_number == ino && ip->i_mode != 0) {
780 		(void) ufs_fault(ITOV(ip),
781 		    "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, "
782 		    "fs = %s\n",
783 		    ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt);
784 		return;
785 	}
786 	if (ino >= fs->fs_ipg * fs->fs_ncg) {
787 		(void) ufs_fault(ITOV(ip),
788 		    "ifree: range, dev = 0x%x, ino = %d, fs = %s\n",
789 		    (int)ip->i_dev, (int)ino, fs->fs_fsmnt);
790 		return;
791 	}
792 	cg = (int)itog(fs, ino);
793 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
794 	    (int)fs->fs_cgsize);
795 
796 	cgp = bp->b_un.b_cg;
797 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
798 		brelse(bp);
799 		return;
800 	}
801 	mutex_enter(&ufsvfsp->vfs_lock);
802 	cgp->cg_time = gethrestime_sec();
803 	iused = cg_inosused(cgp);
804 	inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg);
805 	if (isclr(iused, inot)) {
806 		mutex_exit(&ufsvfsp->vfs_lock);
807 		brelse(bp);
808 		(void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, "
809 		    "mode: (imode) %o, (omode) %o, ino:%d, "
810 		    "fs:%s",
811 		    ip->i_mode, mode, (int)ino, fs->fs_fsmnt);
812 		return;
813 	}
814 	clrbit(iused, inot);
815 
816 	if (inot < (ulong_t)cgp->cg_irotor)
817 		cgp->cg_irotor = inot;
818 	cgp->cg_cs.cs_nifree++;
819 	fs->fs_cstotal.cs_nifree++;
820 	fs->fs_cs(fs, cg).cs_nifree++;
821 	if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
822 		cgp->cg_cs.cs_ndir--;
823 		fs->fs_cstotal.cs_ndir--;
824 		fs->fs_cs(fs, cg).cs_ndir--;
825 	}
826 	fs->fs_fmod = 1;
827 	ufs_notclean(ufsvfsp);
828 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
829 	TRANS_SI(ufsvfsp, fs, cg);
830 	bdrwrite(bp);
831 }
832 
833 /*
834  * Implement the cylinder overflow algorithm.
835  *
836  * The policy implemented by this algorithm is:
837  *   1) allocate the block in its requested cylinder group.
838  *   2) quadratically rehash on the cylinder group number.
839  *   3) brute force search for a free block.
840  * The size parameter means size for data blocks, mode for inodes.
841  */
842 static ino_t
843 hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)())
844 {
845 	struct fs *fs;
846 	int i;
847 	long result;
848 	int icg = cg;
849 
850 	fs = ip->i_fs;
851 	/*
852 	 * 1: preferred cylinder group
853 	 */
854 	result = (*allocator)(ip, cg, pref, size);
855 	if (result)
856 		return (result);
857 	/*
858 	 * 2: quadratic rehash
859 	 */
860 	for (i = 1; i < fs->fs_ncg; i *= 2) {
861 		cg += i;
862 		if (cg >= fs->fs_ncg)
863 			cg -= fs->fs_ncg;
864 		result = (*allocator)(ip, cg, 0, size);
865 		if (result)
866 			return (result);
867 	}
868 	/*
869 	 * 3: brute force search
870 	 * Note that we start at i == 2, since 0 was checked initially,
871 	 * and 1 is always checked in the quadratic rehash.
872 	 */
873 	cg = (icg + 2) % fs->fs_ncg;
874 	for (i = 2; i < fs->fs_ncg; i++) {
875 		result = (*allocator)(ip, cg, 0, size);
876 		if (result)
877 			return (result);
878 		cg++;
879 		if (cg == fs->fs_ncg)
880 			cg = 0;
881 	}
882 	return (NULL);
883 }
884 
885 /*
886  * Determine whether a fragment can be extended.
887  *
888  * Check to see if the necessary fragments are available, and
889  * if they are, allocate them.
890  */
891 static daddr_t
892 fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize)
893 {
894 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
895 	struct fs *fs = ip->i_fs;
896 	struct buf *bp;
897 	struct cg *cgp;
898 	uchar_t *blksfree;
899 	long bno;
900 	int frags, bbase;
901 	int i, j;
902 
903 	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
904 		return (NULL);
905 	frags = numfrags(fs, nsize);
906 	bbase = (int)fragnum(fs, bprev);
907 	if (bbase > fragnum(fs, (bprev + frags - 1))) {
908 		/* cannot extend across a block boundary */
909 		return (NULL);
910 	}
911 
912 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
913 	    (int)fs->fs_cgsize);
914 	cgp = bp->b_un.b_cg;
915 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
916 		brelse(bp);
917 		return (NULL);
918 	}
919 
920 	blksfree = cg_blksfree(cgp);
921 	mutex_enter(&ufsvfsp->vfs_lock);
922 	bno = dtogd(fs, bprev);
923 	for (i = numfrags(fs, osize); i < frags; i++) {
924 		if (isclr(blksfree, bno + i)) {
925 			mutex_exit(&ufsvfsp->vfs_lock);
926 			brelse(bp);
927 			return (NULL);
928 		}
929 		if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)),
930 			fs->fs_fsize))) {
931 			mutex_exit(&ufsvfsp->vfs_lock);
932 			brelse(bp);
933 			return (NULL);
934 		}
935 	}
936 
937 	cgp->cg_time = gethrestime_sec();
938 	/*
939 	 * The current fragment can be extended,
940 	 * deduct the count on fragment being extended into
941 	 * increase the count on the remaining fragment (if any)
942 	 * allocate the extended piece.
943 	 */
944 	for (i = frags; i < fs->fs_frag - bbase; i++)
945 		if (isclr(blksfree, bno + i))
946 			break;
947 	j = i - numfrags(fs, osize);
948 	cgp->cg_frsum[j]--;
949 	ASSERT(cgp->cg_frsum[j] >= 0);
950 	if (i != frags)
951 		cgp->cg_frsum[i - frags]++;
952 	for (i = numfrags(fs, osize); i < frags; i++) {
953 		clrbit(blksfree, bno + i);
954 		cgp->cg_cs.cs_nffree--;
955 		fs->fs_cs(fs, cg).cs_nffree--;
956 		fs->fs_cstotal.cs_nffree--;
957 	}
958 	fs->fs_fmod = 1;
959 	ufs_notclean(ufsvfsp);
960 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
961 	TRANS_SI(ufsvfsp, fs, cg);
962 	bdrwrite(bp);
963 	return ((daddr_t)bprev);
964 }
965 
966 /*
967  * Determine whether a block can be allocated.
968  *
969  * Check to see if a block of the apprpriate size
970  * is available, and if it is, allocate it.
971  */
972 static daddr_t
973 alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
974 {
975 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
976 	struct fs *fs = ip->i_fs;
977 	struct buf *bp;
978 	struct cg *cgp;
979 	uchar_t *blksfree;
980 	int bno, frags;
981 	int allocsiz;
982 	int i;
983 
984 	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
985 		return (0);
986 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
987 	    (int)fs->fs_cgsize);
988 
989 	cgp = bp->b_un.b_cg;
990 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
991 	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
992 		brelse(bp);
993 		return (0);
994 	}
995 	blksfree = cg_blksfree(cgp);
996 	mutex_enter(&ufsvfsp->vfs_lock);
997 	cgp->cg_time = gethrestime_sec();
998 	if (size == fs->fs_bsize) {
999 		if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1000 			goto errout;
1001 		fs->fs_fmod = 1;
1002 		ufs_notclean(ufsvfsp);
1003 		TRANS_SI(ufsvfsp, fs, cg);
1004 		bdrwrite(bp);
1005 		return (bno);
1006 	}
1007 	/*
1008 	 * Check to see if any fragments are already available
1009 	 * allocsiz is the size which will be allocated, hacking
1010 	 * it down to a smaller size if necessary.
1011 	 */
1012 	frags = numfrags(fs, size);
1013 	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
1014 		if (cgp->cg_frsum[allocsiz] != 0)
1015 			break;
1016 
1017 	if (allocsiz != fs->fs_frag)
1018 		bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz);
1019 
1020 	if (allocsiz == fs->fs_frag || bno < 0) {
1021 		/*
1022 		 * No fragments were available, so a block
1023 		 * will be allocated and hacked up.
1024 		 */
1025 		if (cgp->cg_cs.cs_nbfree == 0)
1026 			goto errout;
1027 		if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1028 			goto errout;
1029 		bpref = dtogd(fs, bno);
1030 		for (i = frags; i < fs->fs_frag; i++)
1031 			setbit(blksfree, bpref + i);
1032 		i = fs->fs_frag - frags;
1033 		cgp->cg_cs.cs_nffree += i;
1034 		fs->fs_cstotal.cs_nffree += i;
1035 		fs->fs_cs(fs, cg).cs_nffree += i;
1036 		cgp->cg_frsum[i]++;
1037 		fs->fs_fmod = 1;
1038 		ufs_notclean(ufsvfsp);
1039 		TRANS_SI(ufsvfsp, fs, cg);
1040 		bdrwrite(bp);
1041 		return (bno);
1042 	}
1043 
1044 	for (i = 0; i < frags; i++)
1045 		clrbit(blksfree, bno + i);
1046 	cgp->cg_cs.cs_nffree -= frags;
1047 	fs->fs_cstotal.cs_nffree -= frags;
1048 	fs->fs_cs(fs, cg).cs_nffree -= frags;
1049 	cgp->cg_frsum[allocsiz]--;
1050 	ASSERT(cgp->cg_frsum[allocsiz] >= 0);
1051 	if (frags != allocsiz) {
1052 		cgp->cg_frsum[allocsiz - frags]++;
1053 	}
1054 	fs->fs_fmod = 1;
1055 	ufs_notclean(ufsvfsp);
1056 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1057 	TRANS_SI(ufsvfsp, fs, cg);
1058 	bdrwrite(bp);
1059 	return (cg * fs->fs_fpg + bno);
1060 errout:
1061 	mutex_exit(&ufsvfsp->vfs_lock);
1062 	brelse(bp);
1063 	return (0);
1064 }
1065 
1066 /*
1067  * Allocate a block in a cylinder group.
1068  *
1069  * This algorithm implements the following policy:
1070  *   1) allocate the requested block.
1071  *   2) allocate a rotationally optimal block in the same cylinder.
1072  *   3) allocate the next available block on the block rotor for the
1073  *	specified cylinder group.
1074  * Note that this routine only allocates fs_bsize blocks; these
1075  * blocks may be fragmented by the routine that allocates them.
1076  */
1077 static daddr_t
1078 alloccgblk(
1079 	struct ufsvfs *ufsvfsp,
1080 	struct cg *cgp,
1081 	daddr_t bpref,
1082 	struct buf *bp)
1083 {
1084 	daddr_t bno;
1085 	int cylno, pos, delta, rotbl_size;
1086 	short *cylbp;
1087 	int i;
1088 	struct fs *fs;
1089 	uchar_t *blksfree;
1090 	daddr_t blkno, rpos, frag;
1091 	short *blks;
1092 	int32_t *blktot;
1093 
1094 	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
1095 	fs = ufsvfsp->vfs_fs;
1096 	blksfree = cg_blksfree(cgp);
1097 	if (bpref == 0) {
1098 		bpref = cgp->cg_rotor;
1099 		goto norot;
1100 	}
1101 	bpref = blknum(fs, bpref);
1102 	bpref = dtogd(fs, bpref);
1103 	/*
1104 	 * If the requested block is available, use it.
1105 	 */
1106 	if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) {
1107 		bno = bpref;
1108 		goto gotit;
1109 	}
1110 	/*
1111 	 * Check for a block available on the same cylinder.
1112 	 */
1113 	cylno = cbtocylno(fs, bpref);
1114 	if (cg_blktot(cgp)[cylno] == 0)
1115 		goto norot;
1116 	if (fs->fs_cpc == 0) {
1117 		/*
1118 		 * Block layout info is not available, so just
1119 		 * have to take any block in this cylinder.
1120 		 */
1121 		bpref = howmany(fs->fs_spc * cylno, NSPF(fs));
1122 		goto norot;
1123 	}
1124 	/*
1125 	 * Check the summary information to see if a block is
1126 	 * available in the requested cylinder starting at the
1127 	 * requested rotational position and proceeding around.
1128 	 */
1129 	cylbp = cg_blks(ufsvfsp, cgp, cylno);
1130 	pos = cbtorpos(ufsvfsp, bpref);
1131 	for (i = pos; i < ufsvfsp->vfs_nrpos; i++)
1132 		if (cylbp[i] > 0)
1133 			break;
1134 	if (i == ufsvfsp->vfs_nrpos)
1135 		for (i = 0; i < pos; i++)
1136 			if (cylbp[i] > 0)
1137 				break;
1138 	if (cylbp[i] > 0) {
1139 		/*
1140 		 * Found a rotational position, now find the actual
1141 		 * block.  A "panic" if none is actually there.
1142 		 */
1143 
1144 		/*
1145 		 * Up to this point, "pos" has referred to the rotational
1146 		 * position of the desired block.  From now on, it holds
1147 		 * the offset of the current cylinder within a cylinder
1148 		 * cycle.  (A cylinder cycle refers to a set of cylinders
1149 		 * which are described by a single rotational table; the
1150 		 * size of the cycle is fs_cpc.)
1151 		 *
1152 		 * bno is set to the block number of the first block within
1153 		 * the current cylinder cycle.
1154 		 */
1155 
1156 		pos = cylno % fs->fs_cpc;
1157 		bno = (cylno - pos) * fs->fs_spc / NSPB(fs);
1158 
1159 		/*
1160 		 * The blocks within a cylinder are grouped into equivalence
1161 		 * classes according to their "rotational position."  There
1162 		 * are two tables used to determine these classes.
1163 		 *
1164 		 * The positional offset table (fs_postbl) has an entry for
1165 		 * each rotational position of each cylinder in a cylinder
1166 		 * cycle.  This entry contains the relative block number
1167 		 * (counting from the start of the cylinder cycle) of the
1168 		 * first block in the equivalence class for that position
1169 		 * and that cylinder.  Positions for which no blocks exist
1170 		 * are indicated by a -1.
1171 		 *
1172 		 * The rotational delta table (fs_rotbl) has an entry for
1173 		 * each block in a cylinder cycle.  This entry contains
1174 		 * the offset from that block to the next block in the
1175 		 * same equivalence class.  The last block in the class
1176 		 * is indicated by a zero in the table.
1177 		 *
1178 		 * The following code, then, walks through all of the blocks
1179 		 * in the cylinder (cylno) which we're allocating within
1180 		 * which are in the equivalence class for the rotational
1181 		 * position (i) which we're allocating within.
1182 		 */
1183 
1184 		if (fs_postbl(ufsvfsp, pos)[i] == -1) {
1185 			(void) ufs_fault(ufsvfsp->vfs_root,
1186 			    "alloccgblk: cyl groups corrupted, pos = %d, "
1187 			    "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
1188 			return (0);
1189 		}
1190 
1191 		/*
1192 		 * There is one entry in the rotational table for each block
1193 		 * in the cylinder cycle.  These are whole blocks, not frags.
1194 		 */
1195 
1196 		rotbl_size = (fs->fs_cpc * fs->fs_spc) >>
1197 		    (fs->fs_fragshift + fs->fs_fsbtodb);
1198 
1199 		/*
1200 		 * As we start, "i" is the rotational position within which
1201 		 * we're searching.  After the next line, it will be a block
1202 		 * number (relative to the start of the cylinder cycle)
1203 		 * within the equivalence class of that rotational position.
1204 		 */
1205 
1206 		i = fs_postbl(ufsvfsp, pos)[i];
1207 
1208 		for (;;) {
1209 			if (isblock(fs, blksfree, (daddr_t)(bno + i))) {
1210 				bno = blkstofrags(fs, (bno + i));
1211 				goto gotit;
1212 			}
1213 			delta = fs_rotbl(fs)[i];
1214 			if (delta <= 0 ||		/* End of chain, or */
1215 			    delta + i > rotbl_size)	/* end of table? */
1216 				break;			/* If so, panic. */
1217 			i += delta;
1218 		}
1219 		(void) ufs_fault(ufsvfsp->vfs_root,
1220 		    "alloccgblk: can't find blk in cyl, pos:%d, i:%d, "
1221 		    "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno);
1222 		return (0);
1223 	}
1224 norot:
1225 	/*
1226 	 * No blocks in the requested cylinder, so take
1227 	 * next available one in this cylinder group.
1228 	 */
1229 	bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag);
1230 	if (bno < 0)
1231 		return (0);
1232 	cgp->cg_rotor = bno;
1233 gotit:
1234 	blkno = fragstoblks(fs, bno);
1235 	frag = (cgp->cg_cgx * fs->fs_fpg) + bno;
1236 	if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize))
1237 		goto norot;
1238 	clrblock(fs, blksfree, (long)blkno);
1239 	/*
1240 	 * the other cg/sb/si fields are TRANS'ed by the caller
1241 	 */
1242 	cgp->cg_cs.cs_nbfree--;
1243 	fs->fs_cstotal.cs_nbfree--;
1244 	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
1245 	cylno = cbtocylno(fs, bno);
1246 	blks = cg_blks(ufsvfsp, cgp, cylno);
1247 	rpos = cbtorpos(ufsvfsp, bno);
1248 	blktot = cg_blktot(cgp);
1249 	blks[rpos]--;
1250 	blktot[cylno]--;
1251 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1252 	fs->fs_fmod = 1;
1253 	return (frag);
1254 }
1255 
1256 /*
1257  * Determine whether an inode can be allocated.
1258  *
1259  * Check to see if an inode is available, and if it is,
1260  * allocate it using the following policy:
1261  *   1) allocate the requested inode.
1262  *   2) allocate the next available inode after the requested
1263  *	inode in the specified cylinder group.
1264  */
1265 static ino_t
1266 ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
1267 {
1268 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1269 	struct fs *fs = ip->i_fs;
1270 	struct cg *cgp;
1271 	struct buf *bp;
1272 	int start, len, loc, map, i;
1273 	char *iused;
1274 
1275 	if (fs->fs_cs(fs, cg).cs_nifree == 0)
1276 		return (0);
1277 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1278 		    (int)fs->fs_cgsize);
1279 
1280 	cgp = bp->b_un.b_cg;
1281 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1282 	    cgp->cg_cs.cs_nifree == 0) {
1283 		brelse(bp);
1284 		return (0);
1285 	}
1286 	iused = cg_inosused(cgp);
1287 	mutex_enter(&ufsvfsp->vfs_lock);
1288 	/*
1289 	 * While we are waiting for the mutex, someone may have taken
1290 	 * the last available inode.  Need to recheck.
1291 	 */
1292 	if (cgp->cg_cs.cs_nifree == 0) {
1293 		mutex_exit(&ufsvfsp->vfs_lock);
1294 		brelse(bp);
1295 		return (0);
1296 	}
1297 
1298 	cgp->cg_time = gethrestime_sec();
1299 	if (ipref) {
1300 		ipref %= fs->fs_ipg;
1301 		if (isclr(iused, ipref))
1302 			goto gotit;
1303 	}
1304 	start = cgp->cg_irotor / NBBY;
1305 	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
1306 	loc = skpc(0xff, (uint_t)len, &iused[start]);
1307 	if (loc == 0) {
1308 		len = start + 1;
1309 		start = 0;
1310 		loc = skpc(0xff, (uint_t)len, &iused[0]);
1311 		if (loc == 0) {
1312 			mutex_exit(&ufsvfsp->vfs_lock);
1313 			(void) ufs_fault(ITOV(ip),
1314 			    "ialloccg: map corrupted, cg = %d, irotor = %d, "
1315 			    "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt);
1316 			return (0);
1317 		}
1318 	}
1319 	i = start + len - loc;
1320 	map = iused[i];
1321 	ipref = i * NBBY;
1322 	for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
1323 		if ((map & i) == 0) {
1324 			cgp->cg_irotor = ipref;
1325 			goto gotit;
1326 		}
1327 	}
1328 
1329 	mutex_exit(&ufsvfsp->vfs_lock);
1330 	(void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s",
1331 							    fs->fs_fsmnt);
1332 	return (0);
1333 gotit:
1334 	setbit(iused, ipref);
1335 	cgp->cg_cs.cs_nifree--;
1336 	fs->fs_cstotal.cs_nifree--;
1337 	fs->fs_cs(fs, cg).cs_nifree--;
1338 	if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
1339 		cgp->cg_cs.cs_ndir++;
1340 		fs->fs_cstotal.cs_ndir++;
1341 		fs->fs_cs(fs, cg).cs_ndir++;
1342 	}
1343 	fs->fs_fmod = 1;
1344 	ufs_notclean(ufsvfsp);
1345 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1346 	TRANS_SI(ufsvfsp, fs, cg);
1347 	bdrwrite(bp);
1348 	return (cg * fs->fs_ipg + ipref);
1349 }
1350 
1351 /*
1352  * Find a block of the specified size in the specified cylinder group.
1353  *
1354  * It is a panic if a request is made to find a block if none are
1355  * available.
1356  */
1357 static daddr_t
1358 mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref,
1359 	int allocsiz)
1360 {
1361 	struct fs *fs	= ufsvfsp->vfs_fs;
1362 	daddr_t bno, cfrag;
1363 	int start, len, loc, i, last, first, secondtime;
1364 	int blk, field, subfield, pos;
1365 	int gotit;
1366 
1367 	/*
1368 	 * ufsvfs->vfs_lock is held when calling this.
1369 	 */
1370 	/*
1371 	 * Find the fragment by searching through the
1372 	 * free block map for an appropriate bit pattern.
1373 	 */
1374 	if (bpref)
1375 		start = dtogd(fs, bpref) / NBBY;
1376 	else
1377 		start = cgp->cg_frotor / NBBY;
1378 	/*
1379 	 * the following loop performs two scans -- the first scan
1380 	 * searches the bottom half of the array for a match and the
1381 	 * second scan searches the top half of the array.  The loops
1382 	 * have been merged just to make things difficult.
1383 	 */
1384 	first = start;
1385 	last = howmany(fs->fs_fpg, NBBY);
1386 	secondtime = 0;
1387 	cfrag = cgp->cg_cgx * fs->fs_fpg;
1388 	while (first < last) {
1389 		len = last - first;
1390 		/*
1391 		 * search the array for a match
1392 		 */
1393 		loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first],
1394 			(uchar_t *)fragtbl[fs->fs_frag],
1395 			(int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
1396 		/*
1397 		 * match found
1398 		 */
1399 		if (loc) {
1400 			bno = (last - loc) * NBBY;
1401 
1402 			/*
1403 			 * Found the byte in the map, sift
1404 			 * through the bits to find the selected frag
1405 			 */
1406 			cgp->cg_frotor = bno;
1407 			gotit = 0;
1408 			for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
1409 				blk = blkmap(fs, cg_blksfree(cgp), bno);
1410 				blk <<= 1;
1411 				field = around[allocsiz];
1412 				subfield = inside[allocsiz];
1413 				for (pos = 0;
1414 				    pos <= fs->fs_frag - allocsiz;
1415 				    pos++) {
1416 					if ((blk & field) == subfield) {
1417 						gotit++;
1418 						break;
1419 					}
1420 					field <<= 1;
1421 					subfield <<= 1;
1422 				}
1423 				if (gotit)
1424 					break;
1425 			}
1426 			bno += pos;
1427 
1428 			/*
1429 			 * success if block is *not* being converted from
1430 			 * metadata into userdata (harpy).  If so, ignore.
1431 			 */
1432 			if (!TRANS_ISCANCEL(ufsvfsp,
1433 			    ldbtob(fsbtodb(fs, (cfrag+bno))),
1434 			    allocsiz * fs->fs_fsize))
1435 				return (bno);
1436 
1437 			/*
1438 			 * keep looking -- this block is being converted
1439 			 */
1440 			first = (last - loc) + 1;
1441 			loc = 0;
1442 			if (first < last)
1443 				continue;
1444 		}
1445 		/*
1446 		 * no usable matches in bottom half -- now search the top half
1447 		 */
1448 		if (secondtime)
1449 			/*
1450 			 * no usable matches in top half -- all done
1451 			 */
1452 			break;
1453 		secondtime = 1;
1454 		last = start + 1;
1455 		first = 0;
1456 	}
1457 	/*
1458 	 * no usable matches
1459 	 */
1460 	return ((daddr_t)-1);
1461 }
1462 
1463 #define	UFSNADDR (NDADDR + NIADDR)	/* NADDR applies to (obsolete) S5FS */
1464 #define	IB(i)	(NDADDR + (i))	/* index of i'th indirect block ptr */
1465 #define	SINGLE	0		/* single indirect block ptr */
1466 #define	DOUBLE	1		/* double indirect block ptr */
1467 #define	TRIPLE	2		/* triple indirect block ptr */
1468 
1469 /*
1470  * Acquire a write lock, and keep trying till we get it
1471  */
1472 static int
1473 allocsp_wlockfs(struct vnode *vp, struct lockfs *lf)
1474 {
1475 	int err = 0;
1476 
1477 lockagain:
1478 	do {
1479 		err = ufs_fiolfss(vp, lf);
1480 		if (err)
1481 			return (err);
1482 	} while (!LOCKFS_IS_ULOCK(lf));
1483 
1484 	lf->lf_lock = LOCKFS_WLOCK;
1485 	lf->lf_flags = 0;
1486 	lf->lf_comment = NULL;
1487 	err = ufs__fiolfs(vp, lf, 1, 0);
1488 
1489 	if (err == EBUSY || err == EINVAL)
1490 		goto lockagain;
1491 
1492 	return (err);
1493 }
1494 
1495 /*
1496  * Release the write lock
1497  */
1498 static int
1499 allocsp_unlockfs(struct vnode *vp, struct lockfs *lf)
1500 {
1501 	int err = 0;
1502 
1503 	lf->lf_lock = LOCKFS_ULOCK;
1504 	lf->lf_flags = 0;
1505 	err = ufs__fiolfs(vp, lf, 1, 0);
1506 	return (err);
1507 }
1508 
1509 struct allocsp_undo {
1510 	daddr_t offset;
1511 	daddr_t blk;
1512 	struct allocsp_undo *next;
1513 };
1514 
1515 /*
1516  * ufs_allocsp() can be used to pre-allocate blocks for a file on a given
1517  * file system. The blocks are not initialized and are only marked as allocated.
1518  * These addresses are then stored as negative block numbers in the inode to
1519  * imply special handling. UFS has been modified where necessary to understand
1520  * this new notion. Successfully fallocated files will have IFALLOCATE cflag
1521  * set in the inode.
1522  */
1523 int
1524 ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr)
1525 {
1526 	struct lockfs lf;
1527 	int berr, err, resv, issync;
1528 	off_t start, istart, len; /* istart, special for idb */
1529 	struct inode *ip;
1530 	struct fs *fs;
1531 	struct ufsvfs *ufsvfsp;
1532 	u_offset_t resid, i;
1533 	daddr32_t db_undo[NDADDR];	/* old direct blocks */
1534 	struct allocsp_undo *ib_undo = NULL;	/* ib undo */
1535 	struct allocsp_undo *undo = NULL;
1536 	u_offset_t osz;			/* old file size */
1537 	int chunkblks = 0;		/* # of blocks in 1 allocation */
1538 	int cnt = 0;
1539 	daddr_t allocblk;
1540 	daddr_t totblks = 0;
1541 	struct ulockfs	*ulp;
1542 
1543 	ASSERT(vp->v_type == VREG);
1544 
1545 	ip = VTOI(vp);
1546 	fs = ip->i_fs;
1547 	if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
1548 		err = EIO;
1549 		goto out_allocsp;
1550 	}
1551 
1552 	istart = start = blkroundup(fs, (lp->l_start));
1553 	len = blkroundup(fs, (lp->l_len));
1554 	chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize;
1555 	ulp = &ufsvfsp->vfs_ulockfs;
1556 
1557 	if (lp->l_start < 0 || lp->l_len <= 0)
1558 		return (EINVAL);
1559 
1560 	/* Quickly check to make sure we have space before we proceed */
1561 	if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) {
1562 		if (TRANS_ISTRANS(ufsvfsp)) {
1563 			ufs_delete_drain_wait(ufsvfsp, 1);
1564 			if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree)
1565 				return (ENOSPC);
1566 		} else
1567 			return (ENOSPC);
1568 	}
1569 
1570 	/*
1571 	 * We will keep i_rwlock locked as WRITER through out the function
1572 	 * since we don't want anyone else reading or writing to the inode
1573 	 * while we are in the middle of fallocating the file.
1574 	 */
1575 	rw_enter(&ip->i_rwlock, RW_WRITER);
1576 
1577 	/* Back up the direct block list, used for undo later if necessary */
1578 	rw_enter(&ip->i_contents, RW_READER);
1579 	for (i = 0; i < NDADDR; i++)
1580 		db_undo[i] = ip->i_db[i];
1581 	osz = ip->i_size;
1582 	rw_exit(&ip->i_contents);
1583 
1584 	/* Allocate any direct blocks now before we write lock the fs */
1585 	if (lblkno(fs, start) < NDADDR) {
1586 		ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize),
1587 		    &resv, &resid);
1588 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1589 
1590 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1591 		rw_enter(&ip->i_contents, RW_WRITER);
1592 
1593 		for (i = start; (i < len) && (lblkno(fs, i) < NDADDR);
1594 		    i += fs->fs_bsize) {
1595 			berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE,
1596 			    &allocblk, cr);
1597 			/* Yikes error, quit */
1598 			if (berr) {
1599 				TRANS_INODE(ufsvfsp, ip);
1600 				rw_exit(&ip->i_contents);
1601 				rw_exit(&ufsvfsp->vfs_dqrwlock);
1602 				TRANS_END_CSYNC(ufsvfsp, err, issync,
1603 				    TOP_ALLOCSP, resv);
1604 				goto exit;
1605 			}
1606 
1607 			if (allocblk) {
1608 				totblks++;
1609 				ip->i_size += fs->fs_bsize;
1610 			}
1611 		}
1612 
1613 		TRANS_INODE(ufsvfsp, ip);
1614 		rw_exit(&ip->i_contents);
1615 		rw_exit(&ufsvfsp->vfs_dqrwlock);
1616 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1617 
1618 		istart =  i;	/* start offset for indirect allocation */
1619 	}
1620 
1621 	/* Write lock the file system */
1622 	if (err = allocsp_wlockfs(vp, &lf))
1623 		goto exit;
1624 
1625 	/* Break the transactions into vfs_iotransz units */
1626 	ufs_trans_trunc_resv(ip, ip->i_size +
1627 	    blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid);
1628 	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1629 
1630 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1631 	rw_enter(&ip->i_contents, RW_WRITER);
1632 
1633 	/* Now go about fallocating necessary indirect blocks */
1634 	for (i = istart; i < len; i += fs->fs_bsize) {
1635 		berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE,
1636 		    &allocblk, cr);
1637 		if (berr) {
1638 			TRANS_INODE(ufsvfsp, ip);
1639 			rw_exit(&ip->i_contents);
1640 			rw_exit(&ufsvfsp->vfs_dqrwlock);
1641 			TRANS_END_CSYNC(ufsvfsp, err, issync,
1642 			    TOP_ALLOCSP, resv);
1643 			err = allocsp_unlockfs(vp, &lf);
1644 			goto exit;
1645 		}
1646 
1647 		/* Update the blk counter only if new block was added */
1648 		if (allocblk) {
1649 			/* Save undo information */
1650 			undo = kmem_alloc(sizeof (struct allocsp_undo),
1651 			    KM_SLEEP);
1652 			undo->offset = i;
1653 			undo->blk = allocblk;
1654 			undo->next = ib_undo;
1655 			ib_undo = undo;
1656 			totblks++;
1657 			ip->i_size += fs->fs_bsize;
1658 		}
1659 		cnt++;
1660 
1661 		/* Being a good UFS citizen, let others get a share */
1662 		if (cnt == chunkblks) {
1663 			/*
1664 			 * If there are waiters or the fs is hard locked,
1665 			 * error locked, or read-only error locked,
1666 			 * quit with EIO
1667 			 */
1668 			if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
1669 			    ULOCKFS_IS_ROELOCK(ulp)) {
1670 				ip->i_cflags |= IFALLOCATE;
1671 				TRANS_INODE(ufsvfsp, ip);
1672 				rw_exit(&ip->i_contents);
1673 				rw_exit(&ufsvfsp->vfs_dqrwlock);
1674 
1675 				TRANS_END_CSYNC(ufsvfsp, err, issync,
1676 				    TOP_ALLOCSP, resv);
1677 				rw_exit(&ip->i_rwlock);
1678 				return (EIO);
1679 			}
1680 
1681 			TRANS_INODE(ufsvfsp, ip);
1682 			rw_exit(&ip->i_contents);
1683 			rw_exit(&ufsvfsp->vfs_dqrwlock);
1684 
1685 			/* End the current transaction */
1686 			TRANS_END_CSYNC(ufsvfsp, err, issync,
1687 			    TOP_ALLOCSP, resv);
1688 
1689 			if (CV_HAS_WAITERS(&ulp->ul_cv)) {
1690 				/* Release the write lock */
1691 				if (err = allocsp_unlockfs(vp, &lf))
1692 					goto exit;
1693 
1694 				/* Wake up others waiting to do operations */
1695 				mutex_enter(&ulp->ul_lock);
1696 				cv_broadcast(&ulp->ul_cv);
1697 				mutex_exit(&ulp->ul_lock);
1698 
1699 				/* Grab the write lock again */
1700 				if (err = allocsp_wlockfs(vp, &lf))
1701 					goto exit;
1702 			} /* end of CV_HAS_WAITERS(&ulp->ul_cv) */
1703 
1704 			/* Reserve more space in log for this file */
1705 			ufs_trans_trunc_resv(ip,
1706 			    ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz),
1707 			    &resv, &resid);
1708 			TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1709 
1710 			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1711 			rw_enter(&ip->i_contents, RW_WRITER);
1712 
1713 			cnt = 0;	/* reset cnt b/c of new transaction */
1714 		}
1715 	}
1716 
1717 	if (!err && !berr)
1718 		ip->i_cflags |= IFALLOCATE;
1719 
1720 	/* Release locks, end log transaction and unlock fs */
1721 	TRANS_INODE(ufsvfsp, ip);
1722 	rw_exit(&ip->i_contents);
1723 	rw_exit(&ufsvfsp->vfs_dqrwlock);
1724 
1725 	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1726 	err = allocsp_unlockfs(vp, &lf);
1727 
1728 	/*
1729 	 * @ exit label, we should no longer be holding the fs write lock, and
1730 	 * all logging transactions should have been ended. We still hold
1731 	 * ip->i_rwlock.
1732 	 */
1733 exit:
1734 	/*
1735 	 * File has grown larger than 2GB. Set flag
1736 	 * in superblock to indicate this, if it
1737 	 * is not already set.
1738 	 */
1739 	if ((ip->i_size > MAXOFF32_T) &&
1740 		!(fs->fs_flags & FSLARGEFILES)) {
1741 		ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1742 		mutex_enter(&ufsvfsp->vfs_lock);
1743 		fs->fs_flags |= FSLARGEFILES;
1744 		ufs_sbwrite(ufsvfsp);
1745 		mutex_exit(&ufsvfsp->vfs_lock);
1746 	}
1747 
1748 	/*
1749 	 * Since we couldn't allocate completely, we will undo the allocations.
1750 	 */
1751 	if (berr) {
1752 		ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid);
1753 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1754 
1755 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1756 		rw_enter(&ip->i_contents, RW_WRITER);
1757 
1758 		/* Direct blocks */
1759 		for (i = 0; i < NDADDR; i++) {
1760 			/*
1761 			 * Only free the block if they are not same, and
1762 			 * the old one isn't zero (the fragment was
1763 			 * re-allocated).
1764 			 */
1765 			if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) {
1766 				free(ip, ip->i_db[i], fs->fs_bsize, 0);
1767 				ip->i_db[i] = 0;
1768 			}
1769 		}
1770 
1771 		/* Undo the indirect blocks */
1772 		while (ib_undo != NULL) {
1773 			undo = ib_undo;
1774 			err = bmap_set_bn(vp, undo->offset, 0);
1775 			if (err)
1776 				cmn_err(CE_PANIC, "ufs_allocsp(): failed to "
1777 				    "undo allocation of block %ld",
1778 				    undo->offset);
1779 			free(ip, undo->blk, fs->fs_bsize, I_IBLK);
1780 			ib_undo = undo->next;
1781 			kmem_free(undo, sizeof (struct allocsp_undo));
1782 		}
1783 
1784 		ip->i_size = osz;
1785 		TRANS_INODE(ufsvfsp, ip);
1786 
1787 		rw_exit(&ip->i_contents);
1788 		rw_exit(&ufsvfsp->vfs_dqrwlock);
1789 
1790 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1791 
1792 		rw_exit(&ip->i_rwlock);
1793 		return (berr);
1794 	}
1795 
1796 	/*
1797 	 * Don't forget to free the undo chain :)
1798 	 */
1799 	while (ib_undo != NULL) {
1800 		undo = ib_undo;
1801 		ib_undo = undo->next;
1802 		kmem_free(undo, sizeof (struct allocsp_undo));
1803 	}
1804 
1805 	rw_exit(&ip->i_rwlock);
1806 
1807 out_allocsp:
1808 	return (err);
1809 }
1810 
1811 /*
1812  * Free storage space associated with the specified inode.  The portion
1813  * to be freed is specified by lp->l_start and lp->l_len (already
1814  * normalized to a "whence" of 0).
1815  *
1816  * This is an experimental facility whose continued existence is not
1817  * guaranteed.  Currently, we only support the special case
1818  * of l_len == 0, meaning free to end of file.
1819  *
1820  * Blocks are freed in reverse order.  This FILO algorithm will tend to
1821  * maintain a contiguous free list much longer than FIFO.
1822  * See also ufs_itrunc() in ufs_inode.c.
1823  *
1824  * Bug: unused bytes in the last retained block are not cleared.
1825  * This may result in a "hole" in the file that does not read as zeroes.
1826  */
1827 /* ARGSUSED */
1828 int
1829 ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr)
1830 {
1831 	int i;
1832 	struct inode *ip = VTOI(vp);
1833 	int error;
1834 
1835 	ASSERT(vp->v_type == VREG);
1836 	ASSERT(lp->l_start >= 0);	/* checked by convoff */
1837 
1838 	if (lp->l_len != 0)
1839 		return (EINVAL);
1840 
1841 	rw_enter(&ip->i_contents, RW_READER);
1842 	if (ip->i_size == (u_offset_t)lp->l_start) {
1843 		rw_exit(&ip->i_contents);
1844 		return (0);
1845 	}
1846 
1847 	/*
1848 	 * Check if there is any active mandatory lock on the
1849 	 * range that will be truncated/expanded.
1850 	 */
1851 	if (MANDLOCK(vp, ip->i_mode)) {
1852 		offset_t save_start;
1853 
1854 		save_start = lp->l_start;
1855 
1856 		if (ip->i_size < lp->l_start) {
1857 			/*
1858 			 * "Truncate up" case: need to make sure there
1859 			 * is no lock beyond current end-of-file. To
1860 			 * do so, we need to set l_start to the size
1861 			 * of the file temporarily.
1862 			 */
1863 			lp->l_start = ip->i_size;
1864 		}
1865 		lp->l_type = F_WRLCK;
1866 		lp->l_sysid = 0;
1867 		lp->l_pid = ttoproc(curthread)->p_pid;
1868 		i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK;
1869 		rw_exit(&ip->i_contents);
1870 		if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 ||
1871 		    lp->l_type != F_UNLCK) {
1872 			return (i ? i : EAGAIN);
1873 		}
1874 		rw_enter(&ip->i_contents, RW_READER);
1875 
1876 		lp->l_start = save_start;
1877 	}
1878 
1879 	/*
1880 	 * Make sure a write isn't in progress (allocating blocks)
1881 	 * by acquiring i_rwlock (we promised ufs_bmap we wouldn't
1882 	 * truncate while it was allocating blocks).
1883 	 * Grab the locks in the right order.
1884 	 */
1885 	rw_exit(&ip->i_contents);
1886 	rw_enter(&ip->i_rwlock, RW_WRITER);
1887 	error = TRANS_ITRUNC(ip, (u_offset_t)lp->l_start, 0, cr);
1888 	rw_exit(&ip->i_rwlock);
1889 	return (error);
1890 }
1891 
1892 /*
1893  * Find a cg with as close to nb contiguous bytes as possible
1894  *	THIS MAY TAKE MANY DISK READS!
1895  *
1896  * Implemented in an attempt to allocate contiguous blocks for
1897  * writing the ufs log file to, minimizing future disk head seeking
1898  */
1899 daddr_t
1900 contigpref(ufsvfs_t *ufsvfsp, size_t nb)
1901 {
1902 	struct fs	*fs	= ufsvfsp->vfs_fs;
1903 	daddr_t		nblk	= lblkno(fs, blkroundup(fs, nb));
1904 	daddr_t		savebno, curbno, cgbno;
1905 	int		cg, cgblks, savecg, savenblk, curnblk;
1906 	uchar_t		*blksfree;
1907 	buf_t		*bp;
1908 	struct cg	*cgp;
1909 
1910 	savenblk = 0;
1911 	savecg = 0;
1912 	savebno = 0;
1913 	for (cg = 0; cg < fs->fs_ncg; ++cg) {
1914 
1915 		/* not enough free blks for a contig check */
1916 		if (fs->fs_cs(fs, cg).cs_nbfree < nblk)
1917 			continue;
1918 
1919 		/*
1920 		 * find the largest contiguous range in this cg
1921 		 */
1922 		bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev,
1923 		    (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1924 		    (int)fs->fs_cgsize);
1925 		cgp = bp->b_un.b_cg;
1926 		if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
1927 			brelse(bp);
1928 			continue;
1929 		}
1930 		blksfree = cg_blksfree(cgp);	    /* free array */
1931 		cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */
1932 		cgbno = 0;
1933 		while (cgbno < cgblks && savenblk < nblk) {
1934 			/* find a free block */
1935 			for (; cgbno < cgblks; ++cgbno)
1936 				if (isblock(fs, blksfree, cgbno))
1937 					break;
1938 			curbno = cgbno;
1939 			/* count the number of free blocks */
1940 			for (curnblk = 0; cgbno < cgblks; ++cgbno) {
1941 				if (!isblock(fs, blksfree, cgbno))
1942 					break;
1943 				if (++curnblk >= nblk)
1944 					break;
1945 			}
1946 			if (curnblk > savenblk) {
1947 				savecg = cg;
1948 				savenblk = curnblk;
1949 				savebno = curbno;
1950 			}
1951 		}
1952 		brelse(bp);
1953 		if (savenblk >= nblk)
1954 			break;
1955 	}
1956 
1957 	/* convert block offset in cg to frag offset in cg */
1958 	savebno = blkstofrags(fs, savebno);
1959 
1960 	/* convert frag offset in cg to frag offset in fs */
1961 	savebno += (savecg * fs->fs_fpg);
1962 
1963 	return (savebno);
1964 }
1965