xref: /titanic_50/usr/src/uts/common/fs/ufs/ufs_alloc.c (revision 7a286c471efbab8562f7655a82931904703fffe0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #include <sys/condvar_impl.h>
40 #include <sys/types.h>
41 #include <sys/t_lock.h>
42 #include <sys/debug.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/signal.h>
46 #include <sys/cred.h>
47 #include <sys/proc.h>
48 #include <sys/disp.h>
49 #include <sys/user.h>
50 #include <sys/buf.h>
51 #include <sys/vfs.h>
52 #include <sys/vnode.h>
53 #include <sys/acl.h>
54 #include <sys/fs/ufs_fs.h>
55 #include <sys/fs/ufs_inode.h>
56 #include <sys/fs/ufs_acl.h>
57 #include <sys/fs/ufs_bio.h>
58 #include <sys/fs/ufs_quota.h>
59 #include <sys/kmem.h>
60 #include <sys/fs/ufs_trans.h>
61 #include <sys/fs/ufs_panic.h>
62 #include <sys/errno.h>
63 #include <sys/time.h>
64 #include <sys/sysmacros.h>
65 #include <sys/file.h>
66 #include <sys/fcntl.h>
67 #include <sys/flock.h>
68 #include <fs/fs_subr.h>
69 #include <sys/cmn_err.h>
70 #include <sys/policy.h>
71 #include <sys/fs/ufs_log.h>
72 
73 static ino_t	hashalloc();
74 static daddr_t	fragextend();
75 static daddr_t	alloccg();
76 static daddr_t	alloccgblk();
77 static ino_t	ialloccg();
78 static daddr_t	mapsearch();
79 static int	findlogstartcg();
80 
81 extern int	inside[], around[];
82 extern uchar_t	*fragtbl[];
83 void delay();
84 
85 /*
86  * Allocate a block in the file system.
87  *
88  * The size of the requested block is given, which must be some
89  * multiple of fs_fsize and <= fs_bsize.
90  * A preference may be optionally specified. If a preference is given
91  * the following hierarchy is used to allocate a block:
92  *   1) allocate the requested block.
93  *   2) allocate a rotationally optimal block in the same cylinder.
94  *   3) allocate a block in the same cylinder group.
95  *   4) quadratically rehash into other cylinder groups, until an
96  *	available block is located.
97  * If no block preference is given the following hierarchy is used
98  * to allocate a block:
99  *   1) allocate a block in the cylinder group that contains the
100  *	inode for the file.
101  *   2) quadratically rehash into other cylinder groups, until an
102  *	available block is located.
103  */
104 int
105 alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr)
106 {
107 	struct fs *fs;
108 	struct ufsvfs *ufsvfsp;
109 	daddr_t bno;
110 	int cg;
111 	int err;
112 	char *errmsg = NULL;
113 	size_t len;
114 
115 	ufsvfsp = ip->i_ufsvfs;
116 	fs = ufsvfsp->vfs_fs;
117 	if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) {
118 		err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx,"
119 		    " bsize = %d, size = %d, fs = %s\n",
120 		    ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
121 		return (err);
122 	}
123 	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
124 		goto nospace;
125 	if (freespace(fs, ufsvfsp) <= 0 &&
126 	    secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
127 		goto nospace;
128 	err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len);
129 	/* Note that may not have err, but may have errmsg */
130 	if (errmsg != NULL) {
131 		uprintf(errmsg);
132 		kmem_free(errmsg, len);
133 		errmsg = NULL;
134 	}
135 	if (err)
136 		return (err);
137 	if (bpref >= fs->fs_size)
138 		bpref = 0;
139 	if (bpref == 0)
140 		cg = (int)itog(fs, ip->i_number);
141 	else
142 		cg = dtog(fs, bpref);
143 
144 	bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size,
145 	    (ulong_t (*)())alloccg);
146 	if (bno > 0) {
147 		*bnp = bno;
148 		return (0);
149 	}
150 
151 	/*
152 	 * hashalloc() failed because some other thread grabbed
153 	 * the last block so unwind the quota operation.  We can
154 	 * ignore the return because subtractions don't fail and
155 	 * size is guaranteed to be >= zero by our caller.
156 	 */
157 	(void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL,
158 	    (size_t *)NULL);
159 
160 nospace:
161 	mutex_enter(&ufsvfsp->vfs_lock);
162 	if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
163 	    (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
164 		ufsvfsp->vfs_lastwhinetime = lbolt;
165 		cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt);
166 	}
167 	mutex_exit(&ufsvfsp->vfs_lock);
168 	return (ENOSPC);
169 }
170 
171 /*
172  * Reallocate a fragment to a bigger size
173  *
174  * The number and size of the old block is given, and a preference
175  * and new size is also specified.  The allocator attempts to extend
176  * the original block.  Failing that, the regular block allocator is
177  * invoked to get an appropriate block.
178  */
179 int
180 realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize,
181     int nsize, daddr_t *bnp, cred_t *cr)
182 {
183 	daddr_t bno;
184 	struct fs *fs;
185 	struct ufsvfs *ufsvfsp;
186 	int cg, request;
187 	int err;
188 	char *errmsg = NULL;
189 	size_t len;
190 
191 	ufsvfsp = ip->i_ufsvfs;
192 	fs = ufsvfsp->vfs_fs;
193 	if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
194 	    (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
195 		err = ufs_fault(ITOV(ip),
196 		    "realloccg: bad size, dev=0x%lx, bsize=%d, "
197 		    "osize=%d, nsize=%d, fs=%s\n",
198 		    ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
199 		return (err);
200 	}
201 	if (freespace(fs, ufsvfsp) <= 0 &&
202 	    secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
203 		goto nospace;
204 	if (bprev == 0) {
205 		err = ufs_fault(ITOV(ip),
206 		    "realloccg: bad bprev, dev = 0x%lx, bsize = %d,"
207 		    " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev,
208 		    fs->fs_fsmnt);
209 		return (err);
210 	}
211 	err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len);
212 	/* Note that may not have err, but may have errmsg */
213 	if (errmsg != NULL) {
214 		uprintf(errmsg);
215 		kmem_free(errmsg, len);
216 		errmsg = NULL;
217 	}
218 	if (err)
219 		return (err);
220 	cg = dtog(fs, bprev);
221 	bno = fragextend(ip, cg, (long)bprev, osize, nsize);
222 	if (bno != 0) {
223 		*bnp = bno;
224 		return (0);
225 	}
226 	if (bpref >= fs->fs_size)
227 		bpref = 0;
228 
229 	/*
230 	 * When optimizing for time we allocate a full block and
231 	 * then only use the upper portion for this request. When
232 	 * this file grows again it will grow into the unused portion
233 	 * of the block (See fragextend() above).  This saves time
234 	 * because an extra disk write would be needed if the frags
235 	 * following the current allocation were not free. The extra
236 	 * disk write is needed to move the data from its current
237 	 * location into the newly allocated position.
238 	 *
239 	 * When optimizing for space we allocate a run of frags
240 	 * that is just the right size for this request.
241 	 */
242 	request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize;
243 	bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request,
244 	    (ulong_t (*)())alloccg);
245 	if (bno > 0) {
246 		*bnp = bno;
247 		if (nsize < request)
248 			(void) free(ip, bno + numfrags(fs, nsize),
249 			    (off_t)(request - nsize), I_NOCANCEL);
250 		return (0);
251 	}
252 
253 	/*
254 	 * hashalloc() failed because some other thread grabbed
255 	 * the last block so unwind the quota operation.  We can
256 	 * ignore the return because subtractions don't fail, and
257 	 * our caller guarantees nsize >= osize.
258 	 */
259 	(void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL,
260 	    (size_t *)NULL);
261 
262 nospace:
263 	mutex_enter(&ufsvfsp->vfs_lock);
264 	if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
265 	    (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
266 		ufsvfsp->vfs_lastwhinetime = lbolt;
267 		cmn_err(CE_NOTE,
268 		    "realloccg %s: file system full", fs->fs_fsmnt);
269 	}
270 	mutex_exit(&ufsvfsp->vfs_lock);
271 	return (ENOSPC);
272 }
273 
274 /*
275  * Allocate an inode in the file system.
276  *
277  * A preference may be optionally specified. If a preference is given
278  * the following hierarchy is used to allocate an inode:
279  *   1) allocate the requested inode.
280  *   2) allocate an inode in the same cylinder group.
281  *   3) quadratically rehash into other cylinder groups, until an
282  *	available inode is located.
283  * If no inode preference is given the following hierarchy is used
284  * to allocate an inode:
285  *   1) allocate an inode in cylinder group 0.
286  *   2) quadratically rehash into other cylinder groups, until an
287  *	available inode is located.
288  */
289 int
290 ufs_ialloc(struct inode *pip,
291     ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr)
292 {
293 	struct inode *ip;
294 	struct fs *fs;
295 	int cg;
296 	ino_t ino;
297 	int err;
298 	int nifree;
299 	struct ufsvfs *ufsvfsp = pip->i_ufsvfs;
300 	char *errmsg = NULL;
301 	size_t len;
302 
303 	ASSERT(RW_WRITE_HELD(&pip->i_rwlock));
304 	fs = pip->i_fs;
305 loop:
306 	nifree = fs->fs_cstotal.cs_nifree;
307 
308 	if (nifree == 0)
309 		goto noinodes;
310 	/*
311 	 * Shadow inodes don't count against a user's inode allocation.
312 	 * They are an implementation method and not a resource.
313 	 */
314 	if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
315 		err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data,
316 		    /* change */ 1, (struct inode *)NULL, crgetuid(cr), 0,
317 		    cr, &errmsg, &len);
318 		/*
319 		 * As we haven't acquired any locks yet, dump the message
320 		 * now.
321 		 */
322 		if (errmsg != NULL) {
323 			uprintf(errmsg);
324 			kmem_free(errmsg, len);
325 			errmsg = NULL;
326 		}
327 		if (err)
328 			return (err);
329 	}
330 
331 	if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg))
332 		ipref = 0;
333 	cg = (int)itog(fs, ipref);
334 	ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode,
335 	    (ulong_t (*)())ialloccg);
336 	if (ino == 0) {
337 		if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
338 			/*
339 			 * We can safely ignore the return from chkiq()
340 			 * because deallocations can only fail if we
341 			 * can't get the user's quota info record off
342 			 * the disk due to an I/O error.  In that case,
343 			 * the quota subsystem is already messed up.
344 			 */
345 			(void) chkiq(ufsvfsp, /* change */ -1,
346 			    (struct inode *)NULL, crgetuid(cr), 0, cr,
347 			    (char **)NULL, (size_t *)NULL);
348 		}
349 		goto noinodes;
350 	}
351 	err = ufs_iget(pip->i_vfs, ino, ipp, cr);
352 	if (err) {
353 		if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
354 			/*
355 			 * See above comment about why it is safe to ignore an
356 			 * error return here.
357 			 */
358 			(void) chkiq(ufsvfsp, /* change */ -1,
359 			    (struct inode *)NULL, crgetuid(cr), 0, cr,
360 			    (char **)NULL, (size_t *)NULL);
361 		}
362 		ufs_ifree(pip, ino, 0);
363 		return (err);
364 	}
365 	ip = *ipp;
366 	ASSERT(!ip->i_ufs_acl);
367 	ASSERT(!ip->i_dquot);
368 	rw_enter(&ip->i_contents, RW_WRITER);
369 
370 	/*
371 	 * Check if we really got a free inode, if not then complain
372 	 * and mark the inode ISTALE so that it will be freed by the
373 	 * ufs idle thread eventually and will not be sent to ufs_delete().
374 	 */
375 	if (ip->i_mode || (ip->i_nlink > 0)) {
376 		ip->i_flag |= ISTALE;
377 		rw_exit(&ip->i_contents);
378 		VN_RELE(ITOV(ip));
379 		cmn_err(CE_WARN,
380 		    "%s: unexpected allocated inode %d, run fsck(1M)%s",
381 		    fs->fs_fsmnt, (int)ino,
382 		    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
383 		goto loop;
384 	}
385 
386 	/*
387 	 * Check the inode has no size or data blocks.
388 	 * This could have happened if the truncation failed when
389 	 * deleting the inode. It used to be possible for this to occur
390 	 * if a block allocation failed when iteratively truncating a
391 	 * large file using logging and with a full file system.
392 	 * This was fixed with bug fix 4348738. However, truncation may
393 	 * still fail on an IO error. So in all cases for safety and
394 	 * security we clear out the size; the blocks allocated; and
395 	 * pointers to the blocks. This will ultimately cause a fsck
396 	 * error of un-accounted for blocks, but its a fairly benign error,
397 	 * and possibly the correct thing to do anyway as accesssing those
398 	 * blocks agains may lead to more IO errors.
399 	 */
400 	if (ip->i_size || ip->i_blocks) {
401 		int i;
402 
403 		if (ip->i_size) {
404 			cmn_err(CE_WARN,
405 			    "%s: free inode %d had size 0x%llx, run fsck(1M)%s",
406 			    fs->fs_fsmnt, (int)ino, ip->i_size,
407 			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
408 		}
409 		/*
410 		 * Clear any garbage left behind.
411 		 */
412 		ip->i_size = (u_offset_t)0;
413 		ip->i_blocks = 0;
414 		for (i = 0; i < NDADDR; i++)
415 			ip->i_db[i] = 0;
416 		for (i = 0; i < NIADDR; i++)
417 			ip->i_ib[i] = 0;
418 	}
419 
420 	/*
421 	 * Initialize the link count
422 	 */
423 	ip->i_nlink = 0;
424 
425 	/*
426 	 * Clear the old flags
427 	 */
428 	ip->i_flag &= IREF;
429 
430 	/*
431 	 * Access times are not really defined if the fs is mounted
432 	 * with 'noatime'. But it can cause nfs clients to fail
433 	 * open() if the atime is not a legal value. Set a legal value
434 	 * here when the inode is allocated.
435 	 */
436 	if (ufsvfsp->vfs_noatime) {
437 		mutex_enter(&ufs_iuniqtime_lock);
438 		ip->i_atime = iuniqtime;
439 		mutex_exit(&ufs_iuniqtime_lock);
440 	}
441 	rw_exit(&ip->i_contents);
442 	return (0);
443 noinodes:
444 	if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET))
445 		cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt);
446 	return (ENOSPC);
447 }
448 
449 /*
450  * Find a cylinder group to place a directory.
451  * Returns an inumber within the selected cylinder group.
452  * Note, the vfs_lock is not needed as we don't require exact cg summary info.
453  *
454  * If the switch ufs_close_dirs is set, then the policy is to use
455  * the current cg if it has more than 25% free inodes and more
456  * than 25% free blocks. Otherwise the cgs are searched from
457  * the beginning and the first cg with the same criteria is
458  * used. If that is also null then we revert to the old algorithm.
459  * This tends to cluster files at the beginning of the disk
460  * until the disk gets full.
461  *
462  * Otherwise if ufs_close_dirs is not set then the original policy is
463  * used which is to select from among those cylinder groups with
464  * above the average number of free inodes, the one with the smallest
465  * number of directories.
466  */
467 
468 int ufs_close_dirs = 1;	/* allocate directories close as possible */
469 
470 ino_t
471 dirpref(inode_t *dp)
472 {
473 	int cg, minndir, mincg, avgifree, mininode, minbpg, ifree;
474 	struct fs *fs = dp->i_fs;
475 
476 	cg = itog(fs, dp->i_number);
477 	mininode = fs->fs_ipg >> 2;
478 	minbpg = fs->fs_maxbpg >> 2;
479 	if (ufs_close_dirs &&
480 	    (fs->fs_cs(fs, cg).cs_nifree > mininode) &&
481 	    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
482 		return (dp->i_number);
483 	}
484 
485 	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
486 	minndir = fs->fs_ipg;
487 	mincg = 0;
488 	for (cg = 0; cg < fs->fs_ncg; cg++) {
489 		ifree = fs->fs_cs(fs, cg).cs_nifree;
490 		if (ufs_close_dirs &&
491 		    (ifree > mininode) &&
492 		    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
493 			return ((ino_t)(fs->fs_ipg * cg));
494 		}
495 		if ((fs->fs_cs(fs, cg).cs_ndir < minndir) &&
496 		    (ifree >= avgifree)) {
497 			mincg = cg;
498 			minndir = fs->fs_cs(fs, cg).cs_ndir;
499 		}
500 	}
501 	return ((ino_t)(fs->fs_ipg * mincg));
502 }
503 
504 /*
505  * Select the desired position for the next block in a file.  The file is
506  * logically divided into sections. The first section is composed of the
507  * direct blocks. Each additional section contains fs_maxbpg blocks.
508  *
509  * If no blocks have been allocated in the first section, the policy is to
510  * request a block in the same cylinder group as the inode that describes
511  * the file. If no blocks have been allocated in any other section, the
512  * policy is to place the section in a cylinder group with a greater than
513  * average number of free blocks.  An appropriate cylinder group is found
514  * by using a rotor that sweeps the cylinder groups. When a new group of
515  * blocks is needed, the sweep begins in the cylinder group following the
516  * cylinder group from which the previous allocation was made. The sweep
517  * continues until a cylinder group with greater than the average number
518  * of free blocks is found. If the allocation is for the first block in an
519  * indirect block, the information on the previous allocation is unavailable;
520  * here a best guess is made based upon the logical block number being
521  * allocated.
522  *
523  * If a section is already partially allocated, the policy is to
524  * contiguously allocate fs_maxcontig blocks.  The end of one of these
525  * contiguous blocks and the beginning of the next is physically separated
526  * so that the disk head will be in transit between them for at least
527  * fs_rotdelay milliseconds.  This is to allow time for the processor to
528  * schedule another I/O transfer.
529  */
530 daddr_t
531 blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap)
532 {
533 	struct fs *fs;
534 	struct ufsvfs *ufsvfsp;
535 	int cg;
536 	int avgbfree, startcg;
537 	daddr_t nextblk;
538 
539 	ufsvfsp = ip->i_ufsvfs;
540 	fs = ip->i_fs;
541 	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
542 		if (lbn < NDADDR) {
543 			cg = itog(fs, ip->i_number);
544 			return (fs->fs_fpg * cg + fs->fs_frag);
545 		}
546 		/*
547 		 * Find a cylinder with greater than average
548 		 * number of unused data blocks.
549 		 */
550 		if (indx == 0 || bap[indx - 1] == 0)
551 			startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg;
552 		else
553 			startcg = dtog(fs, bap[indx - 1]) + 1;
554 		startcg %= fs->fs_ncg;
555 
556 		mutex_enter(&ufsvfsp->vfs_lock);
557 		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
558 		/*
559 		 * used for computing log space for writes/truncs
560 		 */
561 		ufsvfsp->vfs_avgbfree = avgbfree;
562 		for (cg = startcg; cg < fs->fs_ncg; cg++)
563 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
564 				fs->fs_cgrotor = cg;
565 				mutex_exit(&ufsvfsp->vfs_lock);
566 				return (fs->fs_fpg * cg + fs->fs_frag);
567 			}
568 		for (cg = 0; cg <= startcg; cg++)
569 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
570 				fs->fs_cgrotor = cg;
571 				mutex_exit(&ufsvfsp->vfs_lock);
572 				return (fs->fs_fpg * cg + fs->fs_frag);
573 			}
574 		mutex_exit(&ufsvfsp->vfs_lock);
575 		return (NULL);
576 	}
577 	/*
578 	 * One or more previous blocks have been laid out. If less
579 	 * than fs_maxcontig previous blocks are contiguous, the
580 	 * next block is requested contiguously, otherwise it is
581 	 * requested rotationally delayed by fs_rotdelay milliseconds.
582 	 */
583 
584 	nextblk = bap[indx - 1];
585 	/*
586 	 * Provision for fallocate to return positive
587 	 * blk preference based on last allocation
588 	 */
589 	if (nextblk < 0 && nextblk != UFS_HOLE) {
590 		nextblk = (-bap[indx - 1]) + fs->fs_frag;
591 	} else {
592 		nextblk = bap[indx - 1] + fs->fs_frag;
593 	}
594 
595 	if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] +
596 	    blkstofrags(fs, fs->fs_maxcontig) != nextblk) {
597 		return (nextblk);
598 	}
599 	if (fs->fs_rotdelay != 0)
600 		/*
601 		 * Here we convert ms of delay to frags as:
602 		 * (frags) = (ms) * (rev/sec) * (sect/rev) /
603 		 * 	((sect/frag) * (ms/sec))
604 		 * then round up to the next block.
605 		 */
606 		nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
607 		    (NSPF(fs) * 1000), fs->fs_frag);
608 	return (nextblk);
609 }
610 
611 /*
612  * Free a block or fragment.
613  *
614  * The specified block or fragment is placed back in the
615  * free map. If a fragment is deallocated, a possible
616  * block reassembly is checked.
617  */
618 void
619 free(struct inode *ip, daddr_t bno, off_t size, int flags)
620 {
621 	struct fs *fs = ip->i_fs;
622 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
623 	struct ufs_q *delq = &ufsvfsp->vfs_delete;
624 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
625 	struct cg *cgp;
626 	struct buf *bp;
627 	int cg, bmap, bbase;
628 	int i;
629 	uchar_t *blksfree;
630 	int *blktot;
631 	short *blks;
632 	daddr_t blkno, cylno, rpos;
633 
634 	/*
635 	 * fallocate'd files will have negative block address.
636 	 * So negate it again to get original block address.
637 	 */
638 	if (bno < 0 && (bno % fs->fs_frag == 0) && bno != UFS_HOLE) {
639 		bno = -bno;
640 	}
641 
642 	if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) {
643 		(void) ufs_fault(ITOV(ip),
644 		    "free: bad size, dev = 0x%lx, bsize = %d, size = %d, "
645 		    "fs = %s\n", ip->i_dev, fs->fs_bsize,
646 		    (int)size, fs->fs_fsmnt);
647 		return;
648 	}
649 	cg = dtog(fs, bno);
650 	ASSERT(!ufs_badblock(ip, bno));
651 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
652 	    (int)fs->fs_cgsize);
653 
654 	cgp = bp->b_un.b_cg;
655 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
656 		brelse(bp);
657 		return;
658 	}
659 
660 	if (!(flags & I_NOCANCEL))
661 		TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags);
662 	if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) {
663 		TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size);
664 	}
665 	blksfree = cg_blksfree(cgp);
666 	blktot = cg_blktot(cgp);
667 	mutex_enter(&ufsvfsp->vfs_lock);
668 	cgp->cg_time = gethrestime_sec();
669 	bno = dtogd(fs, bno);
670 	if (size == fs->fs_bsize) {
671 		blkno = fragstoblks(fs, bno);
672 		cylno = cbtocylno(fs, bno);
673 		rpos = cbtorpos(ufsvfsp, bno);
674 		blks = cg_blks(ufsvfsp, cgp, cylno);
675 		if (!isclrblock(fs, blksfree, blkno)) {
676 			mutex_exit(&ufsvfsp->vfs_lock);
677 			brelse(bp);
678 			(void) ufs_fault(ITOV(ip), "free: freeing free block, "
679 			    "dev:0x%lx, block:%ld, ino:%lu, fs:%s",
680 			    ip->i_dev, bno, ip->i_number, fs->fs_fsmnt);
681 			return;
682 		}
683 		setblock(fs, blksfree, blkno);
684 		blks[rpos]++;
685 		blktot[cylno]++;
686 		cgp->cg_cs.cs_nbfree++;		/* Log below */
687 		fs->fs_cstotal.cs_nbfree++;
688 		fs->fs_cs(fs, cg).cs_nbfree++;
689 		if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
690 			mutex_enter(&delq->uq_mutex);
691 			delq_info->delq_unreclaimed_blocks -=
692 			    btodb(fs->fs_bsize);
693 			mutex_exit(&delq->uq_mutex);
694 		}
695 	} else {
696 		bbase = bno - fragnum(fs, bno);
697 		/*
698 		 * Decrement the counts associated with the old frags
699 		 */
700 		bmap = blkmap(fs, blksfree, bbase);
701 		fragacct(fs, bmap, cgp->cg_frsum, -1);
702 		/*
703 		 * Deallocate the fragment
704 		 */
705 		for (i = 0; i < numfrags(fs, size); i++) {
706 			if (isset(blksfree, bno + i)) {
707 				brelse(bp);
708 				mutex_exit(&ufsvfsp->vfs_lock);
709 				(void) ufs_fault(ITOV(ip),
710 				    "free: freeing free frag, "
711 				    "dev:0x%lx, blk:%ld, cg:%d, "
712 				    "ino:%lu, fs:%s",
713 				    ip->i_dev,
714 				    bno + i,
715 				    cgp->cg_cgx,
716 				    ip->i_number,
717 				    fs->fs_fsmnt);
718 				return;
719 			}
720 			setbit(blksfree, bno + i);
721 		}
722 		cgp->cg_cs.cs_nffree += i;
723 		fs->fs_cstotal.cs_nffree += i;
724 		fs->fs_cs(fs, cg).cs_nffree += i;
725 		if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
726 			mutex_enter(&delq->uq_mutex);
727 			delq_info->delq_unreclaimed_blocks -=
728 			    btodb(i * fs->fs_fsize);
729 			mutex_exit(&delq->uq_mutex);
730 		}
731 		/*
732 		 * Add back in counts associated with the new frags
733 		 */
734 		bmap = blkmap(fs, blksfree, bbase);
735 		fragacct(fs, bmap, cgp->cg_frsum, 1);
736 		/*
737 		 * If a complete block has been reassembled, account for it
738 		 */
739 		blkno = fragstoblks(fs, bbase);
740 		if (isblock(fs, blksfree, blkno)) {
741 			cylno = cbtocylno(fs, bbase);
742 			rpos = cbtorpos(ufsvfsp, bbase);
743 			blks = cg_blks(ufsvfsp, cgp, cylno);
744 			blks[rpos]++;
745 			blktot[cylno]++;
746 			cgp->cg_cs.cs_nffree -= fs->fs_frag;
747 			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
748 			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
749 			cgp->cg_cs.cs_nbfree++;
750 			fs->fs_cstotal.cs_nbfree++;
751 			fs->fs_cs(fs, cg).cs_nbfree++;
752 		}
753 	}
754 	fs->fs_fmod = 1;
755 	ufs_notclean(ufsvfsp);
756 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
757 	TRANS_SI(ufsvfsp, fs, cg);
758 	bdrwrite(bp);
759 }
760 
761 /*
762  * Free an inode.
763  *
764  * The specified inode is placed back in the free map.
765  */
766 void
767 ufs_ifree(struct inode *ip, ino_t ino, mode_t mode)
768 {
769 	struct fs *fs = ip->i_fs;
770 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
771 	struct cg *cgp;
772 	struct buf *bp;
773 	unsigned int inot;
774 	int cg;
775 	char *iused;
776 
777 	if (ip->i_number == ino && ip->i_mode != 0) {
778 		(void) ufs_fault(ITOV(ip),
779 		    "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, "
780 		    "fs = %s\n",
781 		    ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt);
782 		return;
783 	}
784 	if (ino >= fs->fs_ipg * fs->fs_ncg) {
785 		(void) ufs_fault(ITOV(ip),
786 		    "ifree: range, dev = 0x%x, ino = %d, fs = %s\n",
787 		    (int)ip->i_dev, (int)ino, fs->fs_fsmnt);
788 		return;
789 	}
790 	cg = (int)itog(fs, ino);
791 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
792 	    (int)fs->fs_cgsize);
793 
794 	cgp = bp->b_un.b_cg;
795 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
796 		brelse(bp);
797 		return;
798 	}
799 	mutex_enter(&ufsvfsp->vfs_lock);
800 	cgp->cg_time = gethrestime_sec();
801 	iused = cg_inosused(cgp);
802 	inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg);
803 	if (isclr(iused, inot)) {
804 		mutex_exit(&ufsvfsp->vfs_lock);
805 		brelse(bp);
806 		(void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, "
807 		    "mode: (imode) %o, (omode) %o, ino:%d, "
808 		    "fs:%s",
809 		    ip->i_mode, mode, (int)ino, fs->fs_fsmnt);
810 		return;
811 	}
812 	clrbit(iused, inot);
813 
814 	if (inot < (ulong_t)cgp->cg_irotor)
815 		cgp->cg_irotor = inot;
816 	cgp->cg_cs.cs_nifree++;
817 	fs->fs_cstotal.cs_nifree++;
818 	fs->fs_cs(fs, cg).cs_nifree++;
819 	if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
820 		cgp->cg_cs.cs_ndir--;
821 		fs->fs_cstotal.cs_ndir--;
822 		fs->fs_cs(fs, cg).cs_ndir--;
823 	}
824 	fs->fs_fmod = 1;
825 	ufs_notclean(ufsvfsp);
826 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
827 	TRANS_SI(ufsvfsp, fs, cg);
828 	bdrwrite(bp);
829 }
830 
831 /*
832  * Implement the cylinder overflow algorithm.
833  *
834  * The policy implemented by this algorithm is:
835  *   1) allocate the block in its requested cylinder group.
836  *   2) quadratically rehash on the cylinder group number.
837  *   3) brute force search for a free block.
838  * The size parameter means size for data blocks, mode for inodes.
839  */
840 static ino_t
841 hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)())
842 {
843 	struct fs *fs;
844 	int i;
845 	long result;
846 	int icg = cg;
847 
848 	fs = ip->i_fs;
849 	/*
850 	 * 1: preferred cylinder group
851 	 */
852 	result = (*allocator)(ip, cg, pref, size);
853 	if (result)
854 		return (result);
855 	/*
856 	 * 2: quadratic rehash
857 	 */
858 	for (i = 1; i < fs->fs_ncg; i *= 2) {
859 		cg += i;
860 		if (cg >= fs->fs_ncg)
861 			cg -= fs->fs_ncg;
862 		result = (*allocator)(ip, cg, 0, size);
863 		if (result)
864 			return (result);
865 	}
866 	/*
867 	 * 3: brute force search
868 	 * Note that we start at i == 2, since 0 was checked initially,
869 	 * and 1 is always checked in the quadratic rehash.
870 	 */
871 	cg = (icg + 2) % fs->fs_ncg;
872 	for (i = 2; i < fs->fs_ncg; i++) {
873 		result = (*allocator)(ip, cg, 0, size);
874 		if (result)
875 			return (result);
876 		cg++;
877 		if (cg == fs->fs_ncg)
878 			cg = 0;
879 	}
880 	return (NULL);
881 }
882 
883 /*
884  * Determine whether a fragment can be extended.
885  *
886  * Check to see if the necessary fragments are available, and
887  * if they are, allocate them.
888  */
889 static daddr_t
890 fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize)
891 {
892 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
893 	struct fs *fs = ip->i_fs;
894 	struct buf *bp;
895 	struct cg *cgp;
896 	uchar_t *blksfree;
897 	long bno;
898 	int frags, bbase;
899 	int i, j;
900 
901 	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
902 		return (NULL);
903 	frags = numfrags(fs, nsize);
904 	bbase = (int)fragnum(fs, bprev);
905 	if (bbase > fragnum(fs, (bprev + frags - 1))) {
906 		/* cannot extend across a block boundary */
907 		return (NULL);
908 	}
909 
910 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
911 	    (int)fs->fs_cgsize);
912 	cgp = bp->b_un.b_cg;
913 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
914 		brelse(bp);
915 		return (NULL);
916 	}
917 
918 	blksfree = cg_blksfree(cgp);
919 	mutex_enter(&ufsvfsp->vfs_lock);
920 	bno = dtogd(fs, bprev);
921 	for (i = numfrags(fs, osize); i < frags; i++) {
922 		if (isclr(blksfree, bno + i)) {
923 			mutex_exit(&ufsvfsp->vfs_lock);
924 			brelse(bp);
925 			return (NULL);
926 		}
927 		if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)),
928 		    fs->fs_fsize))) {
929 			mutex_exit(&ufsvfsp->vfs_lock);
930 			brelse(bp);
931 			return (NULL);
932 		}
933 	}
934 
935 	cgp->cg_time = gethrestime_sec();
936 	/*
937 	 * The current fragment can be extended,
938 	 * deduct the count on fragment being extended into
939 	 * increase the count on the remaining fragment (if any)
940 	 * allocate the extended piece.
941 	 */
942 	for (i = frags; i < fs->fs_frag - bbase; i++)
943 		if (isclr(blksfree, bno + i))
944 			break;
945 	j = i - numfrags(fs, osize);
946 	cgp->cg_frsum[j]--;
947 	ASSERT(cgp->cg_frsum[j] >= 0);
948 	if (i != frags)
949 		cgp->cg_frsum[i - frags]++;
950 	for (i = numfrags(fs, osize); i < frags; i++) {
951 		clrbit(blksfree, bno + i);
952 		cgp->cg_cs.cs_nffree--;
953 		fs->fs_cs(fs, cg).cs_nffree--;
954 		fs->fs_cstotal.cs_nffree--;
955 	}
956 	fs->fs_fmod = 1;
957 	ufs_notclean(ufsvfsp);
958 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
959 	TRANS_SI(ufsvfsp, fs, cg);
960 	bdrwrite(bp);
961 	return ((daddr_t)bprev);
962 }
963 
964 /*
965  * Determine whether a block can be allocated.
966  *
967  * Check to see if a block of the apprpriate size
968  * is available, and if it is, allocate it.
969  */
970 static daddr_t
971 alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
972 {
973 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
974 	struct fs *fs = ip->i_fs;
975 	struct buf *bp;
976 	struct cg *cgp;
977 	uchar_t *blksfree;
978 	int bno, frags;
979 	int allocsiz;
980 	int i;
981 
982 	/*
983 	 * Searching for space could be time expensive so do some
984 	 * up front checking to verify that there is actually space
985 	 * available (free blocks or free frags).
986 	 */
987 	if (fs->fs_cs(fs, cg).cs_nbfree == 0) {
988 		if (size == fs->fs_bsize)
989 			return (0);
990 
991 		/*
992 		 * If there are not enough free frags then return.
993 		 */
994 		if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, size))
995 			return (0);
996 	}
997 
998 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
999 	    (int)fs->fs_cgsize);
1000 
1001 	cgp = bp->b_un.b_cg;
1002 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1003 	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
1004 		brelse(bp);
1005 		return (0);
1006 	}
1007 	blksfree = cg_blksfree(cgp);
1008 	mutex_enter(&ufsvfsp->vfs_lock);
1009 	cgp->cg_time = gethrestime_sec();
1010 	if (size == fs->fs_bsize) {
1011 		if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1012 			goto errout;
1013 		fs->fs_fmod = 1;
1014 		ufs_notclean(ufsvfsp);
1015 		TRANS_SI(ufsvfsp, fs, cg);
1016 		bdrwrite(bp);
1017 		return (bno);
1018 	}
1019 	/*
1020 	 * Check fragment bitmap to see if any fragments are already available.
1021 	 * mapsearch() may fail because the fragment that fits this request
1022 	 * might still be on the cancel list and not available for re-use yet.
1023 	 * Look for a bigger sized fragment to allocate first before we have
1024 	 * to give up and fragment a whole new block eventually.
1025 	 */
1026 	frags = numfrags(fs, size);
1027 	allocsiz = frags;
1028 next_size:
1029 	for (; allocsiz < fs->fs_frag; allocsiz++)
1030 		if (cgp->cg_frsum[allocsiz] != 0)
1031 			break;
1032 
1033 	if (allocsiz != fs->fs_frag) {
1034 		bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz);
1035 		if (bno < 0 && allocsiz < (fs->fs_frag - 1)) {
1036 			allocsiz++;
1037 			goto next_size;
1038 		}
1039 	}
1040 
1041 	if (allocsiz == fs->fs_frag || bno < 0) {
1042 		/*
1043 		 * No fragments were available, so a block
1044 		 * will be allocated and hacked up.
1045 		 */
1046 		if (cgp->cg_cs.cs_nbfree == 0)
1047 			goto errout;
1048 		if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1049 			goto errout;
1050 		bpref = dtogd(fs, bno);
1051 		for (i = frags; i < fs->fs_frag; i++)
1052 			setbit(blksfree, bpref + i);
1053 		i = fs->fs_frag - frags;
1054 		cgp->cg_cs.cs_nffree += i;
1055 		fs->fs_cstotal.cs_nffree += i;
1056 		fs->fs_cs(fs, cg).cs_nffree += i;
1057 		cgp->cg_frsum[i]++;
1058 		fs->fs_fmod = 1;
1059 		ufs_notclean(ufsvfsp);
1060 		TRANS_SI(ufsvfsp, fs, cg);
1061 		bdrwrite(bp);
1062 		return (bno);
1063 	}
1064 
1065 	for (i = 0; i < frags; i++)
1066 		clrbit(blksfree, bno + i);
1067 	cgp->cg_cs.cs_nffree -= frags;
1068 	fs->fs_cstotal.cs_nffree -= frags;
1069 	fs->fs_cs(fs, cg).cs_nffree -= frags;
1070 	cgp->cg_frsum[allocsiz]--;
1071 	ASSERT(cgp->cg_frsum[allocsiz] >= 0);
1072 	if (frags != allocsiz) {
1073 		cgp->cg_frsum[allocsiz - frags]++;
1074 	}
1075 	fs->fs_fmod = 1;
1076 	ufs_notclean(ufsvfsp);
1077 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1078 	TRANS_SI(ufsvfsp, fs, cg);
1079 	bdrwrite(bp);
1080 	return (cg * fs->fs_fpg + bno);
1081 errout:
1082 	mutex_exit(&ufsvfsp->vfs_lock);
1083 	brelse(bp);
1084 	return (0);
1085 }
1086 
1087 /*
1088  * Allocate a block in a cylinder group.
1089  *
1090  * This algorithm implements the following policy:
1091  *   1) allocate the requested block.
1092  *   2) allocate a rotationally optimal block in the same cylinder.
1093  *   3) allocate the next available block on the block rotor for the
1094  *	specified cylinder group.
1095  * Note that this routine only allocates fs_bsize blocks; these
1096  * blocks may be fragmented by the routine that allocates them.
1097  */
1098 static daddr_t
1099 alloccgblk(
1100 	struct ufsvfs *ufsvfsp,
1101 	struct cg *cgp,
1102 	daddr_t bpref,
1103 	struct buf *bp)
1104 {
1105 	daddr_t bno;
1106 	int cylno, pos, delta, rotbl_size;
1107 	short *cylbp;
1108 	int i;
1109 	struct fs *fs;
1110 	uchar_t *blksfree;
1111 	daddr_t blkno, rpos, frag;
1112 	short *blks;
1113 	int32_t *blktot;
1114 
1115 	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
1116 	fs = ufsvfsp->vfs_fs;
1117 	blksfree = cg_blksfree(cgp);
1118 	if (bpref == 0) {
1119 		bpref = cgp->cg_rotor;
1120 		goto norot;
1121 	}
1122 	bpref = blknum(fs, bpref);
1123 	bpref = dtogd(fs, bpref);
1124 	/*
1125 	 * If the requested block is available, use it.
1126 	 */
1127 	if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) {
1128 		bno = bpref;
1129 		goto gotit;
1130 	}
1131 	/*
1132 	 * Check for a block available on the same cylinder.
1133 	 */
1134 	cylno = cbtocylno(fs, bpref);
1135 	if (cg_blktot(cgp)[cylno] == 0)
1136 		goto norot;
1137 	if (fs->fs_cpc == 0) {
1138 		/*
1139 		 * Block layout info is not available, so just
1140 		 * have to take any block in this cylinder.
1141 		 */
1142 		bpref = howmany(fs->fs_spc * cylno, NSPF(fs));
1143 		goto norot;
1144 	}
1145 	/*
1146 	 * Check the summary information to see if a block is
1147 	 * available in the requested cylinder starting at the
1148 	 * requested rotational position and proceeding around.
1149 	 */
1150 	cylbp = cg_blks(ufsvfsp, cgp, cylno);
1151 	pos = cbtorpos(ufsvfsp, bpref);
1152 	for (i = pos; i < ufsvfsp->vfs_nrpos; i++)
1153 		if (cylbp[i] > 0)
1154 			break;
1155 	if (i == ufsvfsp->vfs_nrpos)
1156 		for (i = 0; i < pos; i++)
1157 			if (cylbp[i] > 0)
1158 				break;
1159 	if (cylbp[i] > 0) {
1160 		/*
1161 		 * Found a rotational position, now find the actual
1162 		 * block.  A "panic" if none is actually there.
1163 		 */
1164 
1165 		/*
1166 		 * Up to this point, "pos" has referred to the rotational
1167 		 * position of the desired block.  From now on, it holds
1168 		 * the offset of the current cylinder within a cylinder
1169 		 * cycle.  (A cylinder cycle refers to a set of cylinders
1170 		 * which are described by a single rotational table; the
1171 		 * size of the cycle is fs_cpc.)
1172 		 *
1173 		 * bno is set to the block number of the first block within
1174 		 * the current cylinder cycle.
1175 		 */
1176 
1177 		pos = cylno % fs->fs_cpc;
1178 		bno = (cylno - pos) * fs->fs_spc / NSPB(fs);
1179 
1180 		/*
1181 		 * The blocks within a cylinder are grouped into equivalence
1182 		 * classes according to their "rotational position."  There
1183 		 * are two tables used to determine these classes.
1184 		 *
1185 		 * The positional offset table (fs_postbl) has an entry for
1186 		 * each rotational position of each cylinder in a cylinder
1187 		 * cycle.  This entry contains the relative block number
1188 		 * (counting from the start of the cylinder cycle) of the
1189 		 * first block in the equivalence class for that position
1190 		 * and that cylinder.  Positions for which no blocks exist
1191 		 * are indicated by a -1.
1192 		 *
1193 		 * The rotational delta table (fs_rotbl) has an entry for
1194 		 * each block in a cylinder cycle.  This entry contains
1195 		 * the offset from that block to the next block in the
1196 		 * same equivalence class.  The last block in the class
1197 		 * is indicated by a zero in the table.
1198 		 *
1199 		 * The following code, then, walks through all of the blocks
1200 		 * in the cylinder (cylno) which we're allocating within
1201 		 * which are in the equivalence class for the rotational
1202 		 * position (i) which we're allocating within.
1203 		 */
1204 
1205 		if (fs_postbl(ufsvfsp, pos)[i] == -1) {
1206 			(void) ufs_fault(ufsvfsp->vfs_root,
1207 			    "alloccgblk: cyl groups corrupted, pos = %d, "
1208 			    "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
1209 			return (0);
1210 		}
1211 
1212 		/*
1213 		 * There is one entry in the rotational table for each block
1214 		 * in the cylinder cycle.  These are whole blocks, not frags.
1215 		 */
1216 
1217 		rotbl_size = (fs->fs_cpc * fs->fs_spc) >>
1218 		    (fs->fs_fragshift + fs->fs_fsbtodb);
1219 
1220 		/*
1221 		 * As we start, "i" is the rotational position within which
1222 		 * we're searching.  After the next line, it will be a block
1223 		 * number (relative to the start of the cylinder cycle)
1224 		 * within the equivalence class of that rotational position.
1225 		 */
1226 
1227 		i = fs_postbl(ufsvfsp, pos)[i];
1228 
1229 		for (;;) {
1230 			if (isblock(fs, blksfree, (daddr_t)(bno + i))) {
1231 				bno = blkstofrags(fs, (bno + i));
1232 				goto gotit;
1233 			}
1234 			delta = fs_rotbl(fs)[i];
1235 			if (delta <= 0 ||		/* End of chain, or */
1236 			    delta + i > rotbl_size)	/* end of table? */
1237 				break;			/* If so, panic. */
1238 			i += delta;
1239 		}
1240 		(void) ufs_fault(ufsvfsp->vfs_root,
1241 		    "alloccgblk: can't find blk in cyl, pos:%d, i:%d, "
1242 		    "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno);
1243 		return (0);
1244 	}
1245 norot:
1246 	/*
1247 	 * No blocks in the requested cylinder, so take
1248 	 * next available one in this cylinder group.
1249 	 */
1250 	bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag);
1251 	if (bno < 0)
1252 		return (0);
1253 	cgp->cg_rotor = bno;
1254 gotit:
1255 	blkno = fragstoblks(fs, bno);
1256 	frag = (cgp->cg_cgx * fs->fs_fpg) + bno;
1257 	if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize))
1258 		goto norot;
1259 	clrblock(fs, blksfree, (long)blkno);
1260 	/*
1261 	 * the other cg/sb/si fields are TRANS'ed by the caller
1262 	 */
1263 	cgp->cg_cs.cs_nbfree--;
1264 	fs->fs_cstotal.cs_nbfree--;
1265 	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
1266 	cylno = cbtocylno(fs, bno);
1267 	blks = cg_blks(ufsvfsp, cgp, cylno);
1268 	rpos = cbtorpos(ufsvfsp, bno);
1269 	blktot = cg_blktot(cgp);
1270 	blks[rpos]--;
1271 	blktot[cylno]--;
1272 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1273 	fs->fs_fmod = 1;
1274 	return (frag);
1275 }
1276 
1277 /*
1278  * Determine whether an inode can be allocated.
1279  *
1280  * Check to see if an inode is available, and if it is,
1281  * allocate it using the following policy:
1282  *   1) allocate the requested inode.
1283  *   2) allocate the next available inode after the requested
1284  *	inode in the specified cylinder group.
1285  */
1286 static ino_t
1287 ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
1288 {
1289 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1290 	struct fs *fs = ip->i_fs;
1291 	struct cg *cgp;
1292 	struct buf *bp;
1293 	int start, len, loc, map, i;
1294 	char *iused;
1295 
1296 	if (fs->fs_cs(fs, cg).cs_nifree == 0)
1297 		return (0);
1298 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1299 	    (int)fs->fs_cgsize);
1300 
1301 	cgp = bp->b_un.b_cg;
1302 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1303 	    cgp->cg_cs.cs_nifree == 0) {
1304 		brelse(bp);
1305 		return (0);
1306 	}
1307 	iused = cg_inosused(cgp);
1308 	mutex_enter(&ufsvfsp->vfs_lock);
1309 	/*
1310 	 * While we are waiting for the mutex, someone may have taken
1311 	 * the last available inode.  Need to recheck.
1312 	 */
1313 	if (cgp->cg_cs.cs_nifree == 0) {
1314 		mutex_exit(&ufsvfsp->vfs_lock);
1315 		brelse(bp);
1316 		return (0);
1317 	}
1318 
1319 	cgp->cg_time = gethrestime_sec();
1320 	if (ipref) {
1321 		ipref %= fs->fs_ipg;
1322 		if (isclr(iused, ipref))
1323 			goto gotit;
1324 	}
1325 	start = cgp->cg_irotor / NBBY;
1326 	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
1327 	loc = skpc(0xff, (uint_t)len, &iused[start]);
1328 	if (loc == 0) {
1329 		len = start + 1;
1330 		start = 0;
1331 		loc = skpc(0xff, (uint_t)len, &iused[0]);
1332 		if (loc == 0) {
1333 			mutex_exit(&ufsvfsp->vfs_lock);
1334 			(void) ufs_fault(ITOV(ip),
1335 			    "ialloccg: map corrupted, cg = %d, irotor = %d, "
1336 			    "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt);
1337 			return (0);
1338 		}
1339 	}
1340 	i = start + len - loc;
1341 	map = iused[i];
1342 	ipref = i * NBBY;
1343 	for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
1344 		if ((map & i) == 0) {
1345 			cgp->cg_irotor = ipref;
1346 			goto gotit;
1347 		}
1348 	}
1349 
1350 	mutex_exit(&ufsvfsp->vfs_lock);
1351 	(void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s",
1352 	    fs->fs_fsmnt);
1353 	return (0);
1354 gotit:
1355 	setbit(iused, ipref);
1356 	cgp->cg_cs.cs_nifree--;
1357 	fs->fs_cstotal.cs_nifree--;
1358 	fs->fs_cs(fs, cg).cs_nifree--;
1359 	if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
1360 		cgp->cg_cs.cs_ndir++;
1361 		fs->fs_cstotal.cs_ndir++;
1362 		fs->fs_cs(fs, cg).cs_ndir++;
1363 	}
1364 	fs->fs_fmod = 1;
1365 	ufs_notclean(ufsvfsp);
1366 	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1367 	TRANS_SI(ufsvfsp, fs, cg);
1368 	bdrwrite(bp);
1369 	return (cg * fs->fs_ipg + ipref);
1370 }
1371 
1372 /*
1373  * Find a block of the specified size in the specified cylinder group.
1374  *
1375  * It is a panic if a request is made to find a block if none are
1376  * available.
1377  */
1378 static daddr_t
1379 mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref,
1380 	int allocsiz)
1381 {
1382 	struct fs *fs	= ufsvfsp->vfs_fs;
1383 	daddr_t bno, cfrag;
1384 	int start, len, loc, i, last, first, secondtime;
1385 	int blk, field, subfield, pos;
1386 	int gotit;
1387 
1388 	/*
1389 	 * ufsvfs->vfs_lock is held when calling this.
1390 	 */
1391 	/*
1392 	 * Find the fragment by searching through the
1393 	 * free block map for an appropriate bit pattern.
1394 	 */
1395 	if (bpref)
1396 		start = dtogd(fs, bpref) / NBBY;
1397 	else
1398 		start = cgp->cg_frotor / NBBY;
1399 	/*
1400 	 * the following loop performs two scans -- the first scan
1401 	 * searches the bottom half of the array for a match and the
1402 	 * second scan searches the top half of the array.  The loops
1403 	 * have been merged just to make things difficult.
1404 	 */
1405 	first = start;
1406 	last = howmany(fs->fs_fpg, NBBY);
1407 	secondtime = 0;
1408 	cfrag = cgp->cg_cgx * fs->fs_fpg;
1409 	while (first < last) {
1410 		len = last - first;
1411 		/*
1412 		 * search the array for a match
1413 		 */
1414 		loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first],
1415 		    (uchar_t *)fragtbl[fs->fs_frag],
1416 		    (int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
1417 		/*
1418 		 * match found
1419 		 */
1420 		if (loc) {
1421 			bno = (last - loc) * NBBY;
1422 
1423 			/*
1424 			 * Found the byte in the map, sift
1425 			 * through the bits to find the selected frag
1426 			 */
1427 			cgp->cg_frotor = bno;
1428 			gotit = 0;
1429 			for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
1430 				blk = blkmap(fs, cg_blksfree(cgp), bno);
1431 				blk <<= 1;
1432 				field = around[allocsiz];
1433 				subfield = inside[allocsiz];
1434 				for (pos = 0;
1435 				    pos <= fs->fs_frag - allocsiz;
1436 				    pos++) {
1437 					if ((blk & field) == subfield) {
1438 						gotit++;
1439 						break;
1440 					}
1441 					field <<= 1;
1442 					subfield <<= 1;
1443 				}
1444 				if (gotit)
1445 					break;
1446 			}
1447 			bno += pos;
1448 
1449 			/*
1450 			 * success if block is *not* being converted from
1451 			 * metadata into userdata (harpy).  If so, ignore.
1452 			 */
1453 			if (!TRANS_ISCANCEL(ufsvfsp,
1454 			    ldbtob(fsbtodb(fs, (cfrag+bno))),
1455 			    allocsiz * fs->fs_fsize))
1456 				return (bno);
1457 
1458 			/*
1459 			 * keep looking -- this block is being converted
1460 			 */
1461 			first = (last - loc) + 1;
1462 			loc = 0;
1463 			if (first < last)
1464 				continue;
1465 		}
1466 		/*
1467 		 * no usable matches in bottom half -- now search the top half
1468 		 */
1469 		if (secondtime)
1470 			/*
1471 			 * no usable matches in top half -- all done
1472 			 */
1473 			break;
1474 		secondtime = 1;
1475 		last = start + 1;
1476 		first = 0;
1477 	}
1478 	/*
1479 	 * no usable matches
1480 	 */
1481 	return ((daddr_t)-1);
1482 }
1483 
1484 #define	UFSNADDR (NDADDR + NIADDR)	/* NADDR applies to (obsolete) S5FS */
1485 #define	IB(i)	(NDADDR + (i))	/* index of i'th indirect block ptr */
1486 #define	SINGLE	0		/* single indirect block ptr */
1487 #define	DOUBLE	1		/* double indirect block ptr */
1488 #define	TRIPLE	2		/* triple indirect block ptr */
1489 
1490 /*
1491  * Acquire a write lock, and keep trying till we get it
1492  */
1493 static int
1494 allocsp_wlockfs(struct vnode *vp, struct lockfs *lf)
1495 {
1496 	int err = 0;
1497 
1498 lockagain:
1499 	do {
1500 		err = ufs_fiolfss(vp, lf);
1501 		if (err)
1502 			return (err);
1503 	} while (!LOCKFS_IS_ULOCK(lf));
1504 
1505 	lf->lf_lock = LOCKFS_WLOCK;
1506 	lf->lf_flags = 0;
1507 	lf->lf_comment = NULL;
1508 	err = ufs__fiolfs(vp, lf, 1, 0);
1509 
1510 	if (err == EBUSY || err == EINVAL)
1511 		goto lockagain;
1512 
1513 	return (err);
1514 }
1515 
1516 /*
1517  * Release the write lock
1518  */
1519 static int
1520 allocsp_unlockfs(struct vnode *vp, struct lockfs *lf)
1521 {
1522 	int err = 0;
1523 
1524 	lf->lf_lock = LOCKFS_ULOCK;
1525 	lf->lf_flags = 0;
1526 	err = ufs__fiolfs(vp, lf, 1, 0);
1527 	return (err);
1528 }
1529 
1530 struct allocsp_undo {
1531 	daddr_t offset;
1532 	daddr_t blk;
1533 	struct allocsp_undo *next;
1534 };
1535 
1536 /*
1537  * ufs_allocsp() can be used to pre-allocate blocks for a file on a given
1538  * file system. For direct blocks, the blocks are allocated from the offset
1539  * requested to the block boundary, then any full blocks are allocated,
1540  * and finally any remainder.
1541  * For indirect blocks the blocks are not initialized and are
1542  * only marked as allocated. These addresses are then stored as negative
1543  * block numbers in the inode to imply special handling. UFS has been modified
1544  * where necessary to understand this new notion.
1545  * Successfully fallocated files will have IFALLOCATE cflag set in the inode.
1546  */
1547 int
1548 ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr)
1549 {
1550 	struct lockfs lf;
1551 	int berr, err, resv, issync;
1552 	off_t istart, len; /* istart, special for idb */
1553 	struct inode *ip;
1554 	struct fs *fs;
1555 	struct ufsvfs *ufsvfsp;
1556 	u_offset_t resid, i, uoff;
1557 	daddr32_t db_undo[NDADDR];	/* old direct blocks */
1558 	struct allocsp_undo *ib_undo = NULL;	/* ib undo */
1559 	struct allocsp_undo *undo = NULL;
1560 	u_offset_t osz;			/* old file size */
1561 	int chunkblks = 0;		/* # of blocks in 1 allocation */
1562 	int cnt = 0;
1563 	daddr_t allocblk;
1564 	daddr_t totblks = 0;
1565 	struct ulockfs	*ulp;
1566 	size_t done_len;
1567 	int nbytes, offsetn;
1568 
1569 
1570 	ASSERT(vp->v_type == VREG);
1571 
1572 	ip = VTOI(vp);
1573 	fs = ip->i_fs;
1574 	if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
1575 		err = EIO;
1576 		goto out_allocsp;
1577 	}
1578 
1579 	istart = blkroundup(fs, (lp->l_start));
1580 	len = blkroundup(fs, (lp->l_len));
1581 	chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize;
1582 	ulp = &ufsvfsp->vfs_ulockfs;
1583 
1584 	if (lp->l_start < 0 || lp->l_len <= 0)
1585 		return (EINVAL);
1586 
1587 	/* Quickly check to make sure we have space before we proceed */
1588 	if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) {
1589 		if (TRANS_ISTRANS(ufsvfsp)) {
1590 			ufs_delete_drain_wait(ufsvfsp, 1);
1591 			if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree)
1592 				return (ENOSPC);
1593 		} else
1594 			return (ENOSPC);
1595 	}
1596 
1597 	/*
1598 	 * We will keep i_rwlock locked as WRITER through out the function
1599 	 * since we don't want anyone else reading or writing to the inode
1600 	 * while we are in the middle of fallocating the file.
1601 	 */
1602 	rw_enter(&ip->i_rwlock, RW_WRITER);
1603 
1604 	/* Back up the direct block list, used for undo later if necessary */
1605 	rw_enter(&ip->i_contents, RW_READER);
1606 	for (i = 0; i < NDADDR; i++)
1607 		db_undo[i] = ip->i_db[i];
1608 	osz = ip->i_size;
1609 	rw_exit(&ip->i_contents);
1610 
1611 	/* Write lock the file system */
1612 	if (err = allocsp_wlockfs(vp, &lf))
1613 		goto exit;
1614 
1615 	/*
1616 	 * Allocate any direct blocks now.
1617 	 * Blocks are allocated from the offset requested to the block
1618 	 * boundary, then any full blocks are allocated, and finally any
1619 	 * remainder.
1620 	 */
1621 	if (lblkno(fs, lp->l_start) < NDADDR) {
1622 		ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize),
1623 		    &resv, &resid);
1624 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1625 
1626 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1627 		rw_enter(&ip->i_contents, RW_WRITER);
1628 
1629 		done_len = 0;
1630 		while ((done_len < lp->l_len) &&
1631 		    (lblkno(fs, lp->l_start + done_len) < NDADDR)) {
1632 			uoff = (offset_t)(lp->l_start + done_len);
1633 			offsetn = (int)blkoff(fs, uoff);
1634 			nbytes = (int)MIN(fs->fs_bsize - offsetn,
1635 			    lp->l_len - done_len);
1636 
1637 			berr = bmap_write(ip, uoff, offsetn + nbytes,
1638 			    BI_FALLOCATE, &allocblk, cr);
1639 			/* Yikes error, quit */
1640 			if (berr) {
1641 				TRANS_INODE(ufsvfsp, ip);
1642 				rw_exit(&ip->i_contents);
1643 				rw_exit(&ufsvfsp->vfs_dqrwlock);
1644 				TRANS_END_CSYNC(ufsvfsp, err, issync,
1645 				    TOP_ALLOCSP, resv);
1646 				err = allocsp_unlockfs(vp, &lf);
1647 				goto exit;
1648 			}
1649 
1650 			if (allocblk) {
1651 				totblks++;
1652 				if ((uoff + nbytes) > ip->i_size)
1653 					ip->i_size = (uoff + nbytes);
1654 			}
1655 			done_len += nbytes;
1656 		}
1657 
1658 		TRANS_INODE(ufsvfsp, ip);
1659 		rw_exit(&ip->i_contents);
1660 		rw_exit(&ufsvfsp->vfs_dqrwlock);
1661 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1662 
1663 		/* start offset for indirect allocation */
1664 		istart =  (uoff + nbytes);
1665 	}
1666 
1667 	/* Break the transactions into vfs_iotransz units */
1668 	ufs_trans_trunc_resv(ip, ip->i_size +
1669 	    blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid);
1670 	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1671 
1672 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1673 	rw_enter(&ip->i_contents, RW_WRITER);
1674 
1675 	/* Now go about fallocating necessary indirect blocks */
1676 	for (i = istart; i < (lp->l_start + lp->l_len); i += fs->fs_bsize) {
1677 		berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE,
1678 		    &allocblk, cr);
1679 		if (berr) {
1680 			TRANS_INODE(ufsvfsp, ip);
1681 			rw_exit(&ip->i_contents);
1682 			rw_exit(&ufsvfsp->vfs_dqrwlock);
1683 			TRANS_END_CSYNC(ufsvfsp, err, issync,
1684 			    TOP_ALLOCSP, resv);
1685 			err = allocsp_unlockfs(vp, &lf);
1686 			goto exit;
1687 		}
1688 
1689 		/* Update the blk counter only if new block was added */
1690 		if (allocblk) {
1691 			/* Save undo information */
1692 			undo = kmem_alloc(sizeof (struct allocsp_undo),
1693 			    KM_SLEEP);
1694 			undo->offset = i;
1695 			undo->blk = allocblk;
1696 			undo->next = ib_undo;
1697 			ib_undo = undo;
1698 			totblks++;
1699 
1700 			if (i >= ip->i_size)
1701 				ip->i_size += fs->fs_bsize;
1702 		}
1703 		cnt++;
1704 
1705 		/* Being a good UFS citizen, let others get a share */
1706 		if (cnt == chunkblks) {
1707 			/*
1708 			 * If there are waiters or the fs is hard locked,
1709 			 * error locked, or read-only error locked,
1710 			 * quit with EIO
1711 			 */
1712 			if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
1713 			    ULOCKFS_IS_ROELOCK(ulp)) {
1714 				ip->i_cflags |= IFALLOCATE;
1715 				TRANS_INODE(ufsvfsp, ip);
1716 				rw_exit(&ip->i_contents);
1717 				rw_exit(&ufsvfsp->vfs_dqrwlock);
1718 
1719 				TRANS_END_CSYNC(ufsvfsp, err, issync,
1720 				    TOP_ALLOCSP, resv);
1721 				rw_exit(&ip->i_rwlock);
1722 				(void) allocsp_unlockfs(vp, &lf);
1723 				return (EIO);
1724 			}
1725 
1726 			TRANS_INODE(ufsvfsp, ip);
1727 			rw_exit(&ip->i_contents);
1728 			rw_exit(&ufsvfsp->vfs_dqrwlock);
1729 
1730 			/* End the current transaction */
1731 			TRANS_END_CSYNC(ufsvfsp, err, issync,
1732 			    TOP_ALLOCSP, resv);
1733 
1734 			if (CV_HAS_WAITERS(&ulp->ul_cv)) {
1735 				/* Release the write lock */
1736 				if (err = allocsp_unlockfs(vp, &lf))
1737 					goto exit;
1738 
1739 				/* Wake up others waiting to do operations */
1740 				mutex_enter(&ulp->ul_lock);
1741 				cv_broadcast(&ulp->ul_cv);
1742 				mutex_exit(&ulp->ul_lock);
1743 
1744 				/* Grab the write lock again */
1745 				if (err = allocsp_wlockfs(vp, &lf))
1746 					goto exit;
1747 			} /* end of CV_HAS_WAITERS(&ulp->ul_cv) */
1748 
1749 			/* Reserve more space in log for this file */
1750 			ufs_trans_trunc_resv(ip,
1751 			    ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz),
1752 			    &resv, &resid);
1753 			TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1754 
1755 			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1756 			rw_enter(&ip->i_contents, RW_WRITER);
1757 
1758 			cnt = 0;	/* reset cnt b/c of new transaction */
1759 		}
1760 	}
1761 
1762 	if (!err && !berr)
1763 		ip->i_cflags |= IFALLOCATE;
1764 
1765 	/* If the file has grown then correct the file size */
1766 	if (osz < (lp->l_start + lp->l_len))
1767 		ip->i_size = (lp->l_start + lp->l_len);
1768 
1769 	/* Release locks, end log transaction and unlock fs */
1770 	TRANS_INODE(ufsvfsp, ip);
1771 	rw_exit(&ip->i_contents);
1772 	rw_exit(&ufsvfsp->vfs_dqrwlock);
1773 
1774 	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1775 	err = allocsp_unlockfs(vp, &lf);
1776 
1777 	/*
1778 	 * @ exit label, we should no longer be holding the fs write lock, and
1779 	 * all logging transactions should have been ended. We still hold
1780 	 * ip->i_rwlock.
1781 	 */
1782 exit:
1783 	/*
1784 	 * File has grown larger than 2GB. Set flag
1785 	 * in superblock to indicate this, if it
1786 	 * is not already set.
1787 	 */
1788 	if ((ip->i_size > MAXOFF32_T) &&
1789 	    !(fs->fs_flags & FSLARGEFILES)) {
1790 		ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1791 		mutex_enter(&ufsvfsp->vfs_lock);
1792 		fs->fs_flags |= FSLARGEFILES;
1793 		ufs_sbwrite(ufsvfsp);
1794 		mutex_exit(&ufsvfsp->vfs_lock);
1795 	}
1796 
1797 	/*
1798 	 * Since we couldn't allocate completely, we will undo the allocations.
1799 	 */
1800 	if (berr) {
1801 		ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid);
1802 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
1803 
1804 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1805 		rw_enter(&ip->i_contents, RW_WRITER);
1806 
1807 		/* Direct blocks */
1808 		for (i = 0; i < NDADDR; i++) {
1809 			/*
1810 			 * Only free the block if they are not same, and
1811 			 * the old one isn't zero (the fragment was
1812 			 * re-allocated).
1813 			 */
1814 			if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) {
1815 				free(ip, ip->i_db[i], fs->fs_bsize, 0);
1816 				ip->i_db[i] = 0;
1817 			}
1818 		}
1819 
1820 		/* Undo the indirect blocks */
1821 		while (ib_undo != NULL) {
1822 			undo = ib_undo;
1823 			err = bmap_set_bn(vp, undo->offset, 0);
1824 			if (err)
1825 				cmn_err(CE_PANIC, "ufs_allocsp(): failed to "
1826 				    "undo allocation of block %ld",
1827 				    undo->offset);
1828 			free(ip, undo->blk, fs->fs_bsize, I_IBLK);
1829 			ib_undo = undo->next;
1830 			kmem_free(undo, sizeof (struct allocsp_undo));
1831 		}
1832 
1833 		ip->i_size = osz;
1834 		TRANS_INODE(ufsvfsp, ip);
1835 
1836 		rw_exit(&ip->i_contents);
1837 		rw_exit(&ufsvfsp->vfs_dqrwlock);
1838 
1839 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
1840 
1841 		rw_exit(&ip->i_rwlock);
1842 		return (berr);
1843 	}
1844 
1845 	/*
1846 	 * Don't forget to free the undo chain :)
1847 	 */
1848 	while (ib_undo != NULL) {
1849 		undo = ib_undo;
1850 		ib_undo = undo->next;
1851 		kmem_free(undo, sizeof (struct allocsp_undo));
1852 	}
1853 
1854 	rw_exit(&ip->i_rwlock);
1855 
1856 out_allocsp:
1857 	return (err);
1858 }
1859 
1860 /*
1861  * Free storage space associated with the specified inode.  The portion
1862  * to be freed is specified by lp->l_start and lp->l_len (already
1863  * normalized to a "whence" of 0).
1864  *
1865  * This is an experimental facility whose continued existence is not
1866  * guaranteed.  Currently, we only support the special case
1867  * of l_len == 0, meaning free to end of file.
1868  *
1869  * Blocks are freed in reverse order.  This FILO algorithm will tend to
1870  * maintain a contiguous free list much longer than FIFO.
1871  * See also ufs_itrunc() in ufs_inode.c.
1872  *
1873  * Bug: unused bytes in the last retained block are not cleared.
1874  * This may result in a "hole" in the file that does not read as zeroes.
1875  */
1876 /* ARGSUSED */
1877 int
1878 ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr)
1879 {
1880 	int i;
1881 	struct inode *ip = VTOI(vp);
1882 	int error;
1883 
1884 	ASSERT(vp->v_type == VREG);
1885 	ASSERT(lp->l_start >= 0);	/* checked by convoff */
1886 
1887 	if (lp->l_len != 0)
1888 		return (EINVAL);
1889 
1890 	rw_enter(&ip->i_contents, RW_READER);
1891 	if (ip->i_size == (u_offset_t)lp->l_start) {
1892 		rw_exit(&ip->i_contents);
1893 		return (0);
1894 	}
1895 
1896 	/*
1897 	 * Check if there is any active mandatory lock on the
1898 	 * range that will be truncated/expanded.
1899 	 */
1900 	if (MANDLOCK(vp, ip->i_mode)) {
1901 		offset_t save_start;
1902 
1903 		save_start = lp->l_start;
1904 
1905 		if (ip->i_size < lp->l_start) {
1906 			/*
1907 			 * "Truncate up" case: need to make sure there
1908 			 * is no lock beyond current end-of-file. To
1909 			 * do so, we need to set l_start to the size
1910 			 * of the file temporarily.
1911 			 */
1912 			lp->l_start = ip->i_size;
1913 		}
1914 		lp->l_type = F_WRLCK;
1915 		lp->l_sysid = 0;
1916 		lp->l_pid = ttoproc(curthread)->p_pid;
1917 		i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK;
1918 		rw_exit(&ip->i_contents);
1919 		if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 ||
1920 		    lp->l_type != F_UNLCK) {
1921 			return (i ? i : EAGAIN);
1922 		}
1923 		rw_enter(&ip->i_contents, RW_READER);
1924 
1925 		lp->l_start = save_start;
1926 	}
1927 
1928 	/*
1929 	 * Make sure a write isn't in progress (allocating blocks)
1930 	 * by acquiring i_rwlock (we promised ufs_bmap we wouldn't
1931 	 * truncate while it was allocating blocks).
1932 	 * Grab the locks in the right order.
1933 	 */
1934 	rw_exit(&ip->i_contents);
1935 	rw_enter(&ip->i_rwlock, RW_WRITER);
1936 	error = TRANS_ITRUNC(ip, (u_offset_t)lp->l_start, 0, cr);
1937 	rw_exit(&ip->i_rwlock);
1938 	return (error);
1939 }
1940 
1941 /*
1942  * Find a cg with as close to nb contiguous bytes as possible
1943  *	THIS MAY TAKE MANY DISK READS!
1944  *
1945  * Implemented in an attempt to allocate contiguous blocks for
1946  * writing the ufs log file to, minimizing future disk head seeking
1947  */
1948 daddr_t
1949 contigpref(ufsvfs_t *ufsvfsp, size_t nb, size_t minb)
1950 {
1951 	struct fs	*fs	= ufsvfsp->vfs_fs;
1952 	daddr_t		nblk	= lblkno(fs, blkroundup(fs, nb));
1953 	daddr_t		minblk	= lblkno(fs, blkroundup(fs, minb));
1954 	daddr_t		savebno, curbno, cgbno;
1955 	int		cg, cgblks, savecg, savenblk, curnblk, startcg;
1956 	uchar_t		*blksfree;
1957 	buf_t		*bp;
1958 	struct cg	*cgp;
1959 
1960 	savenblk = 0;
1961 	savecg = 0;
1962 	savebno = 0;
1963 
1964 	if ((startcg = findlogstartcg(fs, nblk, minblk)) == -1)
1965 		cg = 0;	/* Nothing suitable found */
1966 	else
1967 		cg = startcg;
1968 
1969 	for (; cg < fs->fs_ncg; ++cg) {
1970 		/*
1971 		 * find the largest contiguous range in this cg
1972 		 */
1973 		bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev,
1974 		    (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1975 		    (int)fs->fs_cgsize);
1976 		cgp = bp->b_un.b_cg;
1977 		if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
1978 			brelse(bp);
1979 			continue;
1980 		}
1981 		blksfree = cg_blksfree(cgp);	    /* free array */
1982 		cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */
1983 		cgbno = 0;
1984 		while (cgbno < cgblks && savenblk < nblk) {
1985 			/* find a free block */
1986 			for (; cgbno < cgblks; ++cgbno) {
1987 				if (isblock(fs, blksfree, cgbno)) {
1988 					if (startcg != -1)
1989 						goto done;
1990 					else
1991 						break;
1992 				}
1993 			}
1994 			curbno = cgbno;
1995 			/* count the number of free blocks */
1996 			for (curnblk = 0; cgbno < cgblks; ++cgbno) {
1997 				if (!isblock(fs, blksfree, cgbno))
1998 					break;
1999 				if (++curnblk >= nblk)
2000 					break;
2001 			}
2002 			if (curnblk > savenblk) {
2003 				savecg = cg;
2004 				savenblk = curnblk;
2005 				savebno = curbno;
2006 			}
2007 		}
2008 		brelse(bp);
2009 		if (savenblk >= nblk)
2010 			break;
2011 	}
2012 
2013 done:
2014 	if (startcg != -1) {
2015 		brelse(bp);
2016 		savecg = startcg;
2017 		savebno = cgbno;
2018 	}
2019 
2020 	/* convert block offset in cg to frag offset in cg */
2021 	savebno = blkstofrags(fs, savebno);
2022 
2023 	/* convert frag offset in cg to frag offset in fs */
2024 	savebno += (savecg * fs->fs_fpg);
2025 
2026 	return (savebno);
2027 }
2028 
2029 /*
2030  * The object of this routine is to find a start point for the UFS log.
2031  * Ideally the space should be allocated from the smallest possible number
2032  * of contiguous cylinder groups. This is found by using a sliding window
2033  * technique. The smallest window of contiguous cylinder groups, which is
2034  * still able to accommodate the target, is found by moving the window
2035  * through the cylinder groups in a single pass. The end of the window is
2036  * advanced until the space is accommodated, then the start is advanced until
2037  * it no longer fits, the end is then advanced again and so on until the
2038  * final cylinder group is reached. The first suitable instance is recorded
2039  * and its starting cg number is returned.
2040  *
2041  * If we are not able to find a minimum amount of space, represented by
2042  * minblk, or to do so uses more than the available extents, then return -1.
2043  */
2044 
2045 int
2046 findlogstartcg(struct fs *fs, daddr_t requested, daddr_t minblk)
2047 {
2048 	int	 ncgs;		 /* number of cylinder groups */
2049 	daddr_t target;		 /* amount of space sought */
2050 	int	 cwidth, ctotal; /* current window width and total */
2051 	int	 bwidth, btotal; /* best window width and total so far */
2052 	int	 s;	/* index of the first element in the current window */
2053 	int	 e;	/* index of the first element + the width */
2054 			/*  (i.e. 1 + index of last element) */
2055 	int	 bs; /* index of the first element in the best window so far */
2056 	int	 header, max_extents;
2057 
2058 	target = requested;
2059 	ncgs = fs->fs_ncg;
2060 
2061 	header = sizeof (extent_block_t) - sizeof (extent_t);
2062 	max_extents = ((fs->fs_bsize)-header) / sizeof (extent_t);
2063 	cwidth = ctotal = 0;
2064 	btotal = -1;
2065 	bwidth = ncgs;
2066 	s = e = 0;
2067 	while (e < ncgs) {
2068 	/* Advance the end of the window until it accommodates the target. */
2069 		while (ctotal < target && e < ncgs) {
2070 			ctotal += fs->fs_cs(fs, e).cs_nbfree;
2071 			e++;
2072 		}
2073 
2074 		/*
2075 		 * Advance the start of the window until it no longer
2076 		 * accommodates the target.
2077 		 */
2078 		while (ctotal >= target && s < e) {
2079 			/* See if this is the smallest window so far. */
2080 			cwidth = e - s;
2081 			if (cwidth <= bwidth) {
2082 				if (cwidth == bwidth && ctotal <= btotal)
2083 					goto more;
2084 				bwidth = cwidth;
2085 				btotal = ctotal;
2086 				bs = s;
2087 			}
2088 more:
2089 			ctotal -= fs->fs_cs(fs, s).cs_nbfree;
2090 			s++;
2091 		}
2092 	}
2093 
2094 	/*
2095 	 * If we cannot allocate the minimum required or we use too many
2096 	 * extents to do so, return -1.
2097 	 */
2098 	if (btotal < minblk || bwidth > max_extents)
2099 		bs = -1;
2100 
2101 	return (bs);
2102 }
2103