xref: /titanic_41/usr/src/uts/common/fs/ufs/ufs_subr.c (revision 864221ad7169608e293fbeaa9df563afc9f345a0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 #include <sys/types.h>
43 #include <sys/t_lock.h>
44 #include <sys/param.h>
45 #include <sys/time.h>
46 #include <sys/fs/ufs_fs.h>
47 #include <sys/cmn_err.h>
48 
49 #ifdef _KERNEL
50 
51 #include <sys/systm.h>
52 #include <sys/sysmacros.h>
53 #include <sys/buf.h>
54 #include <sys/conf.h>
55 #include <sys/user.h>
56 #include <sys/var.h>
57 #include <sys/vfs.h>
58 #include <sys/vnode.h>
59 #include <sys/proc.h>
60 #include <sys/debug.h>
61 #include <sys/fssnap_if.h>
62 #include <sys/fs/ufs_inode.h>
63 #include <sys/fs/ufs_trans.h>
64 #include <sys/fs/ufs_panic.h>
65 #include <sys/fs/ufs_bio.h>
66 #include <sys/fs/ufs_log.h>
67 #include <sys/kmem.h>
68 #include <sys/policy.h>
69 #include <vm/hat.h>
70 #include <vm/as.h>
71 #include <vm/seg.h>
72 #include <vm/pvn.h>
73 #include <vm/seg_map.h>
74 #include <sys/swap.h>
75 #include <vm/seg_kmem.h>
76 
77 #else  /* _KERNEL */
78 
79 #define	ASSERT(x)		/* don't use asserts for fsck et al */
80 
81 #endif  /* _KERNEL */
82 
83 #ifdef _KERNEL
84 
85 /*
86  * Used to verify that a given entry on the ufs_instances list (see below)
87  * still refers to a mounted file system.
88  *
89  * XXX:	This is a crock that substitutes for proper locking to coordinate
90  *	updates to and uses of the entries in ufs_instances.
91  */
struct check_node {
	struct vfs *vfsp;	/* vfs pointer saved by ufs_update() */
	struct ufsvfs *ufsvfs;	/* ufsvfs saved together with vfsp */
	dev_t vfs_dev;		/* copy of vfsp->vfs_dev taken at save time */
};

/*
 * Defined elsewhere in this file.  ufs_update() treats a NULL return as
 * "no longer mounted" and otherwise uses the returned vfs, which it later
 * releases with vfs_unlock().
 */
static vfs_t *still_mounted(struct check_node *);
99 
100 /*
101  * All ufs file system instances are linked together into a list starting at
102  * ufs_instances.  The list is updated as part of mount and unmount.  It's
103  * consulted in ufs_update, to allow syncing out all ufs file system instances
104  * in a batch.
105  *
106  * ufsvfs_mutex guards access to this list and to the {,old}ufsvfslist
107  * manipulated in ufs_funmount_cleanup.  (A given ufs instance is always on
108  * exactly one of these lists except while it's being allocated or
109  * deallocated.)
110  */
/* Head of the ufs instance list; entries are chained through vfs_next. */
struct ufsvfs	*ufs_instances;
extern kmutex_t		ufsvfs_mutex;	/* XXX: move this to ufs_inode.h? */
113 
114 /*
115  * ufsvfs list manipulation routines
116  */
117 
118 /*
119  * Link ufsp in at the head of the list of ufs_instances.
120  */
121 void
122 ufs_vfs_add(struct ufsvfs *ufsp)
123 {
124 	mutex_enter(&ufsvfs_mutex);
125 	ufsp->vfs_next = ufs_instances;
126 	ufs_instances = ufsp;
127 	mutex_exit(&ufsvfs_mutex);
128 }
129 
130 /*
131  * Remove ufsp from the list of ufs_instances.
132  *
133  * Does no error checking; ufsp is assumed to actually be on the list.
134  */
135 void
136 ufs_vfs_remove(struct ufsvfs *ufsp)
137 {
138 	struct ufsvfs	**delpt = &ufs_instances;
139 
140 	mutex_enter(&ufsvfs_mutex);
141 	for (; *delpt != NULL; delpt = &((*delpt)->vfs_next)) {
142 		if (*delpt == ufsp) {
143 			*delpt = ufsp->vfs_next;
144 			ufsp->vfs_next = NULL;
145 			break;
146 		}
147 	}
148 	mutex_exit(&ufsvfs_mutex);
149 }
150 
151 /*
152  * Clean up state resulting from a forcible unmount that couldn't be handled
153  * directly during the unmount.  (See commentary in the unmount code for more
154  * info.)
155  */
156 static void
157 ufs_funmount_cleanup()
158 {
159 	struct ufsvfs		*ufsvfsp;
160 	extern struct ufsvfs	*oldufsvfslist, *ufsvfslist;
161 
162 	/*
163 	 * Assumption: it's now safe to blow away the entries on
164 	 * oldufsvfslist.
165 	 */
166 	mutex_enter(&ufsvfs_mutex);
167 	while ((ufsvfsp = oldufsvfslist) != NULL) {
168 		oldufsvfslist = ufsvfsp->vfs_next;
169 
170 		mutex_destroy(&ufsvfsp->vfs_lock);
171 		kmem_free(ufsvfsp, sizeof (struct ufsvfs));
172 	}
173 	/*
174 	 * Rotate more recent unmount entries into place in preparation for
175 	 * the next time around.
176 	 */
177 	oldufsvfslist = ufsvfslist;
178 	ufsvfslist = NULL;
179 	mutex_exit(&ufsvfs_mutex);
180 }
181 
182 
183 /*
184  * ufs_update performs the ufs part of `sync'.  It goes through the disk
185  * queues to initiate sandbagged IO; goes through the inodes to write
186  * modified nodes; and it goes through the mount table to initiate
187  * the writing of the modified super blocks.
188  */
189 extern time_t	time;
190 time_t		ufs_sync_time;
191 time_t		ufs_sync_time_secs = 1;
192 
193 extern kmutex_t	ufs_scan_lock;
194 
195 void
196 ufs_update(int flag)
197 {
198 	struct vfs *vfsp;
199 	struct fs *fs;
200 	struct ufsvfs *ufsp;
201 	struct ufsvfs *ufsnext;
202 	struct ufsvfs *update_list = NULL;
203 	int check_cnt = 0;
204 	size_t check_size;
205 	struct check_node *check_list, *ptr;
206 	int cheap = flag & SYNC_ATTR;
207 
208 	/*
209 	 * This is a hack.  A design flaw in the forced unmount protocol
210 	 * could allow a thread to attempt to use a kmem_freed ufsvfs
211 	 * structure in ufs_lockfs_begin/ufs_check_lockfs.  This window
212 	 * is difficult to hit, even during the lockfs stress tests.
213 	 * So the hacky fix is to wait awhile before kmem_free'ing the
214 	 * ufsvfs structures for forcibly unmounted file systems.  `Awhile'
215 	 * is defined as every other call from fsflush (~60 seconds).
216 	 */
217 	if (cheap)
218 		ufs_funmount_cleanup();
219 
220 	/*
221 	 * Examine all ufsvfs structures and add those that we can lock to the
222 	 * update list.  This is so that we don't hold the list lock for a
223 	 * long time.  If vfs_lock fails for a file system instance, then skip
224 	 * it because somebody is doing a unmount on it.
225 	 */
226 	mutex_enter(&ufsvfs_mutex);
227 	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
228 		vfsp = ufsp->vfs_vfs;
229 		if (vfs_lock(vfsp) != 0)
230 			continue;
231 		ufsp->vfs_wnext = update_list;
232 		update_list = ufsp;
233 		check_cnt++;
234 	}
235 	mutex_exit(&ufsvfs_mutex);
236 
237 	if (update_list == NULL)
238 		return;
239 
240 	check_size = sizeof (struct check_node) * check_cnt;
241 	check_list = ptr = kmem_alloc(check_size, KM_NOSLEEP);
242 
243 	/*
244 	 * Write back modified superblocks.
245 	 * Consistency check that the superblock of
246 	 * each file system is still in the buffer cache.
247 	 *
248 	 * Note that the update_list traversal is done without the protection
249 	 * of an overall list lock, so it's necessary to rely on the fact that
250 	 * each entry of the list is vfs_locked when moving from one entry to
251 	 * the next.  This works because a concurrent attempt to add an entry
252 	 * to another thread's update_list won't find it, since it'll already
253 	 * be locked.
254 	 */
255 	check_cnt = 0;
256 	for (ufsp = update_list; ufsp != NULL; ufsp = ufsnext) {
257 		/*
258 		 * Need to grab the next ptr before we unlock this one so
259 		 * another thread doesn't grab it and change it before we move
260 		 * on to the next vfs.  (Once we unlock it, it's ok if another
261 		 * thread finds it to add it to its own update_list; we don't
262 		 * attempt to refer to it through our list any more.)
263 		 */
264 		ufsnext = ufsp->vfs_wnext;
265 		vfsp = ufsp->vfs_vfs;
266 
267 		/*
268 		 * Seems like this can't happen, so perhaps it should become
269 		 * an ASSERT(vfsp->vfs_data != NULL).
270 		 */
271 		if (!vfsp->vfs_data) {
272 			vfs_unlock(vfsp);
273 			continue;
274 		}
275 
276 		fs = ufsp->vfs_fs;
277 
278 		/*
279 		 * don't update a locked superblock during a panic; it
280 		 * may be in an inconsistent state
281 		 */
282 		if (panicstr) {
283 			if (!mutex_tryenter(&ufsp->vfs_lock)) {
284 				vfs_unlock(vfsp);
285 				continue;
286 			}
287 		} else
288 			mutex_enter(&ufsp->vfs_lock);
289 		/*
290 		 * Build up the STABLE check list, so we can unlock the vfs
291 		 * until we do the actual checking.
292 		 */
293 		if (check_list != NULL) {
294 			if ((fs->fs_ronly == 0) &&
295 			    (fs->fs_clean != FSBAD) &&
296 			    (fs->fs_clean != FSSUSPEND)) {
297 				ptr->vfsp = vfsp;
298 				ptr->ufsvfs = ufsp;
299 				ptr->vfs_dev = vfsp->vfs_dev;
300 				ptr++;
301 				check_cnt++;
302 			}
303 		}
304 
305 		/*
306 		 * superblock is not modified
307 		 */
308 		if (fs->fs_fmod == 0) {
309 			mutex_exit(&ufsp->vfs_lock);
310 			vfs_unlock(vfsp);
311 			continue;
312 		}
313 		if (fs->fs_ronly != 0) {
314 			mutex_exit(&ufsp->vfs_lock);
315 			vfs_unlock(vfsp);
316 			(void) ufs_fault(ufsp->vfs_root,
317 			    "fs = %s update: ro fs mod\n", fs->fs_fsmnt);
318 			/*
319 			 * XXX:	Why is this a return instead of a continue?
320 			 *	This may be an attempt to replace a panic with
321 			 *	something less drastic, but there's cleanup we
322 			 *	should be doing that's not being done (e.g.,
323 			 *	unlocking the remaining entries on the list).
324 			 */
325 			return;
326 		}
327 		fs->fs_fmod = 0;
328 		mutex_exit(&ufsp->vfs_lock);
329 		TRANS_SBUPDATE(ufsp, vfsp, TOP_SBUPDATE_UPDATE);
330 		vfs_unlock(vfsp);
331 	}
332 
333 	ufs_sync_time = time;
334 
335 	/*
336 	 * Avoid racing with ufs_unmount() and ufs_sync().
337 	 */
338 	mutex_enter(&ufs_scan_lock);
339 
340 	(void) ufs_scan_inodes(1, ufs_sync_inode, (void *)(uintptr_t)cheap,
341 	    NULL);
342 
343 	mutex_exit(&ufs_scan_lock);
344 
345 	/*
346 	 * Force stale buffer cache information to be flushed,
347 	 * for all devices.  This should cause any remaining control
348 	 * information (e.g., cg and inode info) to be flushed back.
349 	 */
350 	bflush((dev_t)NODEV);
351 
352 	if (check_list == NULL)
353 		return;
354 
355 	/*
356 	 * For each UFS filesystem in the STABLE check_list, update
357 	 * the clean flag if warranted.
358 	 */
359 	for (ptr = check_list; check_cnt > 0; check_cnt--, ptr++) {
360 		int	error;
361 
362 		/*
363 		 * still_mounted() returns with vfsp and the vfs_reflock
364 		 * held if ptr refers to a vfs that is still mounted.
365 		 */
366 		if ((vfsp = still_mounted(ptr)) == NULL)
367 			continue;
368 		ufs_checkclean(vfsp);
369 		/*
370 		 * commit any outstanding async transactions
371 		 */
372 		ufsp = (struct ufsvfs *)vfsp->vfs_data;
373 		curthread->t_flag |= T_DONTBLOCK;
374 		TRANS_BEGIN_SYNC(ufsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE,
375 		    error);
376 		if (!error) {
377 			TRANS_END_SYNC(ufsp, error, TOP_COMMIT_UPDATE,
378 			    TOP_COMMIT_SIZE);
379 		}
380 		curthread->t_flag &= ~T_DONTBLOCK;
381 
382 		vfs_unlock(vfsp);
383 	}
384 
385 	kmem_free(check_list, check_size);
386 }
387 
388 int
389 ufs_sync_inode(struct inode *ip, void *arg)
390 {
391 	int cheap = (int)(uintptr_t)arg;
392 	struct ufsvfs *ufsvfsp;
393 	uint_t flag = ip->i_flag;
394 
395 	if (cheap && ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) == 0))
396 		return (0);
397 
398 	/*
399 	 * if we are panic'ing; then don't update the inode if this
400 	 * file system is FSSTABLE.  Otherwise, we would have to
401 	 * force the superblock to FSACTIVE and the superblock
402 	 * may not be in a good state.  Also, if the inode is
403 	 * IREF'ed then it may be in an inconsistent state.  Don't
404 	 * push it.  Finally, don't push the inode if the fs is
405 	 * logging; the transaction will be discarded at boot.
406 	 */
407 	if (panicstr) {
408 
409 		if (flag & IREF)
410 			return (0);
411 
412 		if (ip->i_ufsvfs == NULL ||
413 		    (ip->i_fs->fs_clean == FSSTABLE ||
414 		    ip->i_fs->fs_clean == FSLOG))
415 				return (0);
416 	}
417 
418 	ufsvfsp = ip->i_ufsvfs;
419 
420 	/*
421 	 * Limit access time only updates
422 	 */
423 	if (((flag & (IMOD|IMODACC|IUPD|ICHG|IACC)) == IMODACC) && ufsvfsp) {
424 		/*
425 		 * if file system has deferred access time turned on and there
426 		 * was no IO recently, don't bother flushing it. It will be
427 		 * flushed when I/Os start again.
428 		 */
429 		if (cheap && (ufsvfsp->vfs_dfritime & UFS_DFRATIME) &&
430 		    (ufsvfsp->vfs_iotstamp + ufs_iowait < lbolt))
431 			return (0);
432 		/*
433 		 * an app issueing a sync() can take forever on a trans device
434 		 * when NetWorker or find is running because all of the
435 		 * directorys' access times have to be updated. So, we limit
436 		 * the time we spend updating access times per sync.
437 		 */
438 		if (TRANS_ISTRANS(ufsvfsp) && ((ufs_sync_time +
439 		    ufs_sync_time_secs) < time))
440 			return (0);
441 	}
442 
443 	/*
444 	 * if we are running on behalf of the flush thread or this is
445 	 * a swap file, then simply do a delay update of the inode.
446 	 * Otherwise, push the pages and then do a delayed inode update.
447 	 */
448 	if (cheap || IS_SWAPVP(ITOV(ip))) {
449 		TRANS_IUPDAT(ip, 0);
450 	} else {
451 		(void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_SYNC);
452 	}
453 	return (0);
454 }
455 
456 /*
457  * Flush all the pages associated with an inode using the given 'flags',
458  * then force inode information to be written back using the given 'waitfor'.
459  */
460 int
461 ufs_syncip(struct inode *ip, int flags, int waitfor, top_t topid)
462 {
463 	int	error;
464 	struct vnode *vp = ITOV(ip);
465 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
466 	int dotrans = 0;
467 
468 	/*
469 	 * Return if file system has been forcibly umounted.
470 	 */
471 	if (ufsvfsp == NULL)
472 		return (EIO);
473 	/*
474 	 * don't need to VOP_PUTPAGE if there are no pages
475 	 */
476 	if (!vn_has_cached_data(vp) || vp->v_type == VCHR) {
477 		error = 0;
478 	} else {
479 		/*
480 		 * if the inode we're working on is a shadow inode
481 		 * or quota inode we need to make sure that the
482 		 * ufs_putpage call is inside a transaction as this
483 		 * could include meta data changes.
484 		 */
485 		if ((ip->i_mode & IFMT) == IFSHAD ||
486 		    ufsvfsp->vfs_qinod == ip) {
487 			dotrans = 1;
488 			curthread->t_flag |= T_DONTBLOCK;
489 			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE,
490 			    TOP_PUTPAGE_SIZE(ip));
491 		}
492 		error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
493 		    flags, CRED(), NULL);
494 		if (dotrans) {
495 			TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE,
496 			    TOP_PUTPAGE_SIZE(ip));
497 			curthread->t_flag &= ~T_DONTBLOCK;
498 			dotrans = 0;
499 		}
500 	}
501 	if (panicstr && TRANS_ISTRANS(ufsvfsp))
502 		goto out;
503 	/*
504 	 * waitfor represents two things -
505 	 * 1. whether data sync or file sync.
506 	 * 2. if file sync then ufs_iupdat should 'waitfor' disk i/o or not.
507 	 */
508 	if (waitfor == I_DSYNC) {
509 		/*
510 		 * If data sync, only IATTCHG (size/block change) requires
511 		 * inode update, fdatasync()/FDSYNC implementation.
512 		 */
513 		if (ip->i_flag & (IBDWRITE|IATTCHG)) {
514 			/*
515 			 * Enter a transaction to provide mutual exclusion
516 			 * with deltamap_push and avoid a race where
517 			 * the inode flush could get dropped.
518 			 */
519 			if ((curthread->t_flag & T_DONTBLOCK) == 0) {
520 				dotrans = 1;
521 				curthread->t_flag |= T_DONTBLOCK;
522 				TRANS_BEGIN_ASYNC(ufsvfsp, topid,
523 				    TOP_SYNCIP_SIZE);
524 			}
525 			rw_enter(&ip->i_contents, RW_READER);
526 			mutex_enter(&ip->i_tlock);
527 			ip->i_flag &= ~IMODTIME;
528 			mutex_exit(&ip->i_tlock);
529 			ufs_iupdat(ip, 1);
530 			rw_exit(&ip->i_contents);
531 			if (dotrans) {
532 				TRANS_END_ASYNC(ufsvfsp, topid,
533 				    TOP_SYNCIP_SIZE);
534 				curthread->t_flag &= ~T_DONTBLOCK;
535 			}
536 		}
537 	} else {
538 		/* For file sync, any inode change requires inode update */
539 		if (ip->i_flag & (IBDWRITE|IUPD|IACC|ICHG|IMOD|IMODACC)) {
540 			/*
541 			 * Enter a transaction to provide mutual exclusion
542 			 * with deltamap_push and avoid a race where
543 			 * the inode flush could get dropped.
544 			 */
545 			if ((curthread->t_flag & T_DONTBLOCK) == 0) {
546 				dotrans = 1;
547 				curthread->t_flag |= T_DONTBLOCK;
548 				TRANS_BEGIN_ASYNC(ufsvfsp, topid,
549 				    TOP_SYNCIP_SIZE);
550 			}
551 			rw_enter(&ip->i_contents, RW_READER);
552 			mutex_enter(&ip->i_tlock);
553 			ip->i_flag &= ~IMODTIME;
554 			mutex_exit(&ip->i_tlock);
555 			ufs_iupdat(ip, waitfor);
556 			rw_exit(&ip->i_contents);
557 			if (dotrans) {
558 				TRANS_END_ASYNC(ufsvfsp, topid,
559 				    TOP_SYNCIP_SIZE);
560 				curthread->t_flag &= ~T_DONTBLOCK;
561 			}
562 		}
563 	}
564 
565 out:
566 	return (error);
567 }
568 /*
569  * Flush all indirect blocks related to an inode.
570  * Supports triple indirect blocks also.
571  */
572 int
573 ufs_sync_indir(struct inode *ip)
574 {
575 	int i;
576 	daddr_t blkno;
577 	daddr_t lbn;	/* logical blkno of last blk in file */
578 	daddr_t clbn;	/* current logical blk */
579 	daddr32_t *bap;
580 	struct fs *fs;
581 	struct buf *bp;
582 	int bsize;
583 	struct ufsvfs *ufsvfsp;
584 	int j;
585 	daddr_t indirect_blkno;
586 	daddr32_t *indirect_bap;
587 	struct buf *indirect_bp;
588 
589 	ufsvfsp = ip->i_ufsvfs;
590 	/*
591 	 * unnecessary when logging; allocation blocks are kept up-to-date
592 	 */
593 	if (TRANS_ISTRANS(ufsvfsp))
594 		return (0);
595 
596 	fs = ufsvfsp->vfs_fs;
597 	bsize = fs->fs_bsize;
598 	lbn = (daddr_t)lblkno(fs, ip->i_size - 1);
599 	if (lbn < NDADDR)
600 		return (0);	/* No indirect blocks used */
601 	if (lbn < NDADDR + NINDIR(fs)) {
602 		/* File has one indirect block. */
603 		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, ip->i_ib[0]));
604 		return (0);
605 	}
606 
607 	/* Write out all the first level indirect blocks */
608 	for (i = 0; i <= NIADDR; i++) {
609 		if ((blkno = ip->i_ib[i]) == 0)
610 			continue;
611 		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
612 	}
613 	/* Write out second level of indirect blocks */
614 	if ((blkno = ip->i_ib[1]) == 0)
615 		return (0);
616 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
617 	if (bp->b_flags & B_ERROR) {
618 		brelse(bp);
619 		return (EIO);
620 	}
621 	bap = bp->b_un.b_daddr;
622 	clbn = NDADDR + NINDIR(fs);
623 	for (i = 0; i < NINDIR(fs); i++) {
624 		if (clbn > lbn)
625 			break;
626 		clbn += NINDIR(fs);
627 		if ((blkno = bap[i]) == 0)
628 			continue;
629 		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
630 	}
631 
632 	brelse(bp);
633 	/* write out third level indirect blocks */
634 
635 	if ((blkno = ip->i_ib[2]) == 0)
636 		return (0);
637 
638 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
639 	if (bp->b_flags & B_ERROR) {
640 		brelse(bp);
641 		return (EIO);
642 	}
643 	bap = bp->b_un.b_daddr;
644 	clbn = NDADDR + NINDIR(fs) + (NINDIR(fs) * NINDIR(fs));
645 
646 	for (i = 0; i < NINDIR(fs); i++) {
647 		if (clbn > lbn)
648 			break;
649 		if ((indirect_blkno = bap[i]) == 0)
650 			continue;
651 		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, indirect_blkno));
652 		indirect_bp = UFS_BREAD(ufsvfsp, ip->i_dev,
653 		    (daddr_t)fsbtodb(fs, indirect_blkno), bsize);
654 		if (indirect_bp->b_flags & B_ERROR) {
655 			brelse(indirect_bp);
656 			brelse(bp);
657 			return (EIO);
658 		}
659 		indirect_bap = indirect_bp->b_un.b_daddr;
660 		for (j = 0; j < NINDIR(fs); j++) {
661 			if (clbn > lbn)
662 				break;
663 			clbn += NINDIR(fs);
664 			if ((blkno = indirect_bap[j]) == 0)
665 				continue;
666 			blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
667 		}
668 		brelse(indirect_bp);
669 	}
670 	brelse(bp);
671 
672 	return (0);
673 }
674 
675 /*
676  * Flush all indirect blocks related to an offset of a file.
677  * read/write in sync mode may have to flush indirect blocks.
678  */
679 int
680 ufs_indirblk_sync(struct inode *ip, offset_t off)
681 {
682 	daddr_t	lbn;
683 	struct	fs *fs;
684 	struct	buf *bp;
685 	int	i, j, shft;
686 	daddr_t	ob, nb, tbn;
687 	daddr32_t *bap;
688 	int	nindirshift, nindiroffset;
689 	struct ufsvfs *ufsvfsp;
690 
691 	ufsvfsp = ip->i_ufsvfs;
692 	/*
693 	 * unnecessary when logging; allocation blocks are kept up-to-date
694 	 */
695 	if (TRANS_ISTRANS(ufsvfsp))
696 		return (0);
697 
698 	fs = ufsvfsp->vfs_fs;
699 
700 	lbn = (daddr_t)lblkno(fs, off);
701 	if (lbn < 0)
702 		return (EFBIG);
703 
704 	/* The first NDADDR are direct so nothing to do */
705 	if (lbn < NDADDR)
706 		return (0);
707 
708 	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
709 	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
710 
711 	/* Determine level of indirect blocks */
712 	shft = 0;
713 	tbn = lbn - NDADDR;
714 	for (j = NIADDR; j > 0; j--) {
715 		longlong_t	sh;
716 
717 		shft += nindirshift;
718 		sh = 1LL << shft;
719 		if (tbn < sh)
720 			break;
721 		tbn -= (daddr_t)sh;
722 	}
723 
724 	if (j == 0)
725 		return (EFBIG);
726 
727 	if ((nb = ip->i_ib[NIADDR - j]) == 0)
728 			return (0);		/* UFS Hole */
729 
730 	/* Flush first level indirect block */
731 	blkflush(ip->i_dev, fsbtodb(fs, nb));
732 
733 	/* Fetch through next levels */
734 	for (; j < NIADDR; j++) {
735 		ob = nb;
736 		bp = UFS_BREAD(ufsvfsp,
737 		    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
738 		if (bp->b_flags & B_ERROR) {
739 			brelse(bp);
740 			return (EIO);
741 		}
742 		bap = bp->b_un.b_daddr;
743 		shft -= nindirshift;		/* sh / nindir */
744 		i = (tbn >> shft) & nindiroffset; /* (tbn /sh) & nindir */
745 		nb = bap[i];
746 		brelse(bp);
747 		if (nb == 0) {
748 			return (0); 		/* UFS hole */
749 		}
750 		blkflush(ip->i_dev, fsbtodb(fs, nb));
751 	}
752 	return (0);
753 }
754 
755 #ifdef DEBUG
756 
/*
 * The bad block checking routines, ufs_indir_badblock() and ufs_badblock(),
 * are very expensive.  Profiling has shown 6-7% of our time being spent in
 * ufs_badblock and another 1-2% in ufs_indir_badblock.  They are only
 * called via ASSERTs (from debug kernels), and in recent years no failures
 * have been found by them.  The following tunable therefore controls
 * whether the checks are performed: while it is zero (the default), both
 * routines return 0 without checking anything; set it non-zero to enable
 * checking.
 */
int ufs_badblock_checks = 0;
766 
767 /*
768  * Check that a given indirect block contains blocks in range
769  */
770 int
771 ufs_indir_badblock(struct inode *ip, daddr32_t *bap)
772 {
773 	int i;
774 	int err = 0;
775 
776 	if (ufs_badblock_checks) {
777 		for (i = 0; i < NINDIR(ip->i_fs) - 1; i++)
778 			if (bap[i] != 0 && (err = ufs_badblock(ip, bap[i])))
779 				break;
780 	}
781 	return (err);
782 }
783 
784 /*
785  * Check that a specified block number is in range.
786  */
787 int
788 ufs_badblock(struct inode *ip, daddr_t bn)
789 {
790 	long	c;
791 	daddr_t	sum;
792 
793 	if (!ufs_badblock_checks)
794 		return (0);
795 	ASSERT(bn);
796 	if (bn <= 0 || bn > ip->i_fs->fs_size)
797 		return (bn);
798 
799 	sum = 0;
800 	c = dtog(ip->i_fs, bn);
801 	if (c == 0) {
802 		sum = howmany(ip->i_fs->fs_cssize, ip->i_fs->fs_fsize);
803 	}
804 	/*
805 	 * if block no. is below this cylinder group,
806 	 * within the space reserved for superblock, inodes, (summary data)
807 	 * or if it is above this cylinder group
808 	 * then its invalid
809 	 * It's hard to see how we'd be outside this cyl, but let's be careful.
810 	 */
811 	if ((bn < cgbase(ip->i_fs, c)) ||
812 	    (bn >= cgsblock(ip->i_fs, c) && bn < cgdmin(ip->i_fs, c)+sum) ||
813 	    (bn >= (unsigned)cgbase(ip->i_fs, c+1)))
814 		return (bn);
815 
816 	return (0);	/* not a bad block */
817 }
818 
819 #endif /* DEBUG */
820 
/*
 * When i_rwlock is write-locked or has a writer pending, the inode is
 * about to change in a way that will mark the filesystem as active, so
 * there is no need to let the filesystem be marked as stable now.
 * Also, to ensure filesystem consistency during directory operations,
 * the filesystem cannot be marked as stable while the i_rwlock of the
 * directory inode is write-locked.
 */
829 
830 /*
831  * Check for busy inodes for this filesystem.
832  * NOTE: Needs better way to do this expensive operation in the future.
833  */
834 static void
835 ufs_icheck(struct ufsvfs *ufsvfsp, int *isbusyp, int *isreclaimp)
836 {
837 	union  ihead	*ih;
838 	struct inode	*ip;
839 	int		i;
840 	int		isnottrans	= !TRANS_ISTRANS(ufsvfsp);
841 	int		isbusy		= *isbusyp;
842 	int		isreclaim	= *isreclaimp;
843 
844 	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
845 		mutex_enter(&ih_lock[i]);
846 		for (ip = ih->ih_chain[0];
847 		    ip != (struct inode *)ih;
848 		    ip = ip->i_forw) {
849 			/*
850 			 * if inode is busy/modified/deleted, filesystem is busy
851 			 */
852 			if (ip->i_ufsvfs != ufsvfsp)
853 				continue;
854 			if ((ip->i_flag & (IMOD | IUPD | ICHG)) ||
855 			    (RW_ISWRITER(&ip->i_rwlock)))
856 				isbusy = 1;
857 			if ((ip->i_nlink <= 0) && (ip->i_flag & IREF))
858 				isreclaim = 1;
859 			if (isbusy && (isreclaim || isnottrans))
860 				break;
861 		}
862 		mutex_exit(&ih_lock[i]);
863 		if (isbusy && (isreclaim || isnottrans))
864 			break;
865 	}
866 	*isbusyp = isbusy;
867 	*isreclaimp = isreclaim;
868 }
869 
870 /*
871  * As part of the ufs 'sync' operation, this routine is called to mark
872  * the filesystem as STABLE if there is no modified metadata in memory.
873  */
874 void
875 ufs_checkclean(struct vfs *vfsp)
876 {
877 	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
878 	struct fs	*fs		= ufsvfsp->vfs_fs;
879 	int		isbusy;
880 	int		isreclaim;
881 	int		updatesb;
882 
883 	ASSERT(vfs_lock_held(vfsp));
884 
885 	/*
886 	 * filesystem is stable or cleanflag processing is disabled; do nothing
887 	 *	no transitions when panic'ing
888 	 */
889 	if (fs->fs_ronly ||
890 	    fs->fs_clean == FSBAD ||
891 	    fs->fs_clean == FSSUSPEND ||
892 	    fs->fs_clean == FSSTABLE ||
893 	    panicstr)
894 		return;
895 
896 	/*
897 	 * if logging and nothing to reclaim; do nothing
898 	 */
899 	if ((fs->fs_clean == FSLOG) &&
900 	    (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
901 	    (fs->fs_reclaim & FS_RECLAIMING)))
902 		return;
903 
904 	/*
905 	 * FS_CHECKCLEAN is reset if the file system goes dirty
906 	 * FS_CHECKRECLAIM is reset if a file gets deleted
907 	 */
908 	mutex_enter(&ufsvfsp->vfs_lock);
909 	fs->fs_reclaim |= (FS_CHECKCLEAN | FS_CHECKRECLAIM);
910 	mutex_exit(&ufsvfsp->vfs_lock);
911 
912 	updatesb = 0;
913 
914 	/*
915 	 * if logging or buffers are busy; do nothing
916 	 */
917 	isbusy = isreclaim = 0;
918 	if ((fs->fs_clean == FSLOG) ||
919 	    (bcheck(vfsp->vfs_dev, ufsvfsp->vfs_bufp)))
920 		isbusy = 1;
921 
922 	/*
923 	 * isreclaim == TRUE means can't change the state of fs_reclaim
924 	 */
925 	isreclaim =
926 	    ((fs->fs_clean == FSLOG) &&
927 	    (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
928 	    (fs->fs_reclaim & FS_RECLAIMING)));
929 
930 	/*
931 	 * if fs is busy or can't change the state of fs_reclaim; do nothing
932 	 */
933 	if (isbusy && isreclaim)
934 		return;
935 
936 	/*
937 	 * look for busy or deleted inodes; (deleted == needs reclaim)
938 	 */
939 	ufs_icheck(ufsvfsp, &isbusy, &isreclaim);
940 
941 	mutex_enter(&ufsvfsp->vfs_lock);
942 
943 	/*
944 	 * IF POSSIBLE, RESET RECLAIM
945 	 */
946 	/*
947 	 * the reclaim thread is not running
948 	 */
949 	if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
950 		/*
951 		 * no files were deleted during the scan
952 		 */
953 		if (fs->fs_reclaim & FS_CHECKRECLAIM)
954 			/*
955 			 * no deleted files were found in the inode cache
956 			 */
957 			if ((isreclaim == 0) && (fs->fs_reclaim & FS_RECLAIM)) {
958 				fs->fs_reclaim &= ~FS_RECLAIM;
959 				updatesb = 1;
960 			}
961 	/*
962 	 * IF POSSIBLE, SET STABLE
963 	 */
964 	/*
965 	 * not logging
966 	 */
967 	if (fs->fs_clean != FSLOG)
968 		/*
969 		 * file system has not gone dirty since the scan began
970 		 */
971 		if (fs->fs_reclaim & FS_CHECKCLEAN)
972 			/*
973 			 * nothing dirty was found in the buffer or inode cache
974 			 */
975 			if ((isbusy == 0) && (isreclaim == 0) &&
976 			    (fs->fs_clean != FSSTABLE)) {
977 				fs->fs_clean = FSSTABLE;
978 				updatesb = 1;
979 			}
980 
981 	mutex_exit(&ufsvfsp->vfs_lock);
982 	if (updatesb) {
983 		TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
984 	}
985 }
986 
987 /*
988  * called whenever an unlink occurs
989  */
990 void
991 ufs_setreclaim(struct inode *ip)
992 {
993 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
994 	struct fs	*fs		= ufsvfsp->vfs_fs;
995 
996 	if (ip->i_nlink || fs->fs_ronly || (fs->fs_clean != FSLOG))
997 		return;
998 
999 	/*
1000 	 * reclaim-needed bit is already set or we need to tell
1001 	 * ufs_checkclean that a file has been deleted
1002 	 */
1003 	if ((fs->fs_reclaim & (FS_RECLAIM | FS_CHECKRECLAIM)) == FS_RECLAIM)
1004 		return;
1005 
1006 	mutex_enter(&ufsvfsp->vfs_lock);
1007 	/*
1008 	 * inform ufs_checkclean that the file system has gone dirty
1009 	 */
1010 	fs->fs_reclaim &= ~FS_CHECKRECLAIM;
1011 
1012 	/*
1013 	 * set the reclaim-needed bit
1014 	 */
1015 	if ((fs->fs_reclaim & FS_RECLAIM) == 0) {
1016 		fs->fs_reclaim |= FS_RECLAIM;
1017 		ufs_sbwrite(ufsvfsp);
1018 	}
1019 	mutex_exit(&ufsvfsp->vfs_lock);
1020 }
1021 
1022 /*
1023  * Before any modified metadata written back to the disk, this routine
1024  * is called to mark the filesystem as ACTIVE.
1025  */
1026 void
1027 ufs_notclean(struct ufsvfs *ufsvfsp)
1028 {
1029 	struct fs *fs = ufsvfsp->vfs_fs;
1030 
1031 	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
1032 	ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
1033 
1034 	/*
1035 	 * inform ufs_checkclean that the file system has gone dirty
1036 	 */
1037 	fs->fs_reclaim &= ~FS_CHECKCLEAN;
1038 
1039 	/*
1040 	 * ignore if active or bad or suspended or readonly or logging
1041 	 */
1042 	if ((fs->fs_clean == FSACTIVE) || (fs->fs_clean == FSLOG) ||
1043 	    (fs->fs_clean == FSBAD) || (fs->fs_clean == FSSUSPEND) ||
1044 	    (fs->fs_ronly)) {
1045 		mutex_exit(&ufsvfsp->vfs_lock);
1046 		return;
1047 	}
1048 	fs->fs_clean = FSACTIVE;
1049 	/*
1050 	 * write superblock synchronously
1051 	 */
1052 	ufs_sbwrite(ufsvfsp);
1053 	mutex_exit(&ufsvfsp->vfs_lock);
1054 }
1055 
1056 /*
1057  * ufs specific fbwrite()
1058  */
1059 int
1060 ufs_fbwrite(struct fbuf *fbp, struct inode *ip)
1061 {
1062 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
1063 
1064 	if (TRANS_ISTRANS(ufsvfsp))
1065 		return (fbwrite(fbp));
1066 	mutex_enter(&ufsvfsp->vfs_lock);
1067 	ufs_notclean(ufsvfsp);
1068 	return ((ufsvfsp->vfs_dio) ? fbdwrite(fbp) : fbwrite(fbp));
1069 }
1070 
1071 /*
1072  * ufs specific fbiwrite()
1073  */
1074 int
1075 ufs_fbiwrite(struct fbuf *fbp, struct inode *ip, daddr_t bn, long bsize)
1076 {
1077 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
1078 	o_mode_t	ifmt		= ip->i_mode & IFMT;
1079 	buf_t		*bp;
1080 	int		error;
1081 
1082 	mutex_enter(&ufsvfsp->vfs_lock);
1083 	ufs_notclean(ufsvfsp);
1084 	if (ifmt == IFDIR || ifmt == IFSHAD || ifmt == IFATTRDIR ||
1085 	    (ip->i_ufsvfs->vfs_qinod == ip)) {
1086 		TRANS_DELTA(ufsvfsp, ldbtob(bn * (offset_t)(btod(bsize))),
1087 		    fbp->fb_count, DT_FBI, 0, 0);
1088 	}
1089 	/*
1090 	 * Inlined version of fbiwrite()
1091 	 */
1092 	bp = pageio_setup((struct page *)NULL, fbp->fb_count,
1093 	    ip->i_devvp, B_WRITE);
1094 	bp->b_flags &= ~B_PAGEIO;
1095 	bp->b_un.b_addr = fbp->fb_addr;
1096 
1097 	bp->b_blkno = bn * btod(bsize);
1098 	bp->b_dev = cmpdev(ip->i_dev);	/* store in old dev format */
1099 	bp->b_edev = ip->i_dev;
1100 	bp->b_proc = NULL;			/* i.e. the kernel */
1101 	bp->b_file = ip->i_vnode;
1102 	bp->b_offset = -1;
1103 
1104 	if (ufsvfsp->vfs_log) {
1105 		lufs_write_strategy(ufsvfsp->vfs_log, bp);
1106 	} else if (ufsvfsp->vfs_snapshot) {
1107 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
1108 	} else {
1109 		ufsvfsp->vfs_iotstamp = lbolt;
1110 		ub.ub_fbiwrites.value.ul++;
1111 		(void) bdev_strategy(bp);
1112 		lwp_stat_update(LWP_STAT_OUBLK, 1);
1113 	}
1114 	error = biowait(bp);
1115 	pageio_done(bp);
1116 	fbrelse(fbp, S_OTHER);
1117 	return (error);
1118 }
1119 
1120 /*
1121  * Write the ufs superblock only.
1122  */
1123 void
1124 ufs_sbwrite(struct ufsvfs *ufsvfsp)
1125 {
1126 	char sav_fs_fmod;
1127 	struct fs *fs = ufsvfsp->vfs_fs;
1128 	struct buf *bp = ufsvfsp->vfs_bufp;
1129 
1130 	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
1131 
1132 	/*
1133 	 * for ulockfs processing, limit the superblock writes
1134 	 */
1135 	if ((ufsvfsp->vfs_ulockfs.ul_sbowner) &&
1136 	    (curthread != ufsvfsp->vfs_ulockfs.ul_sbowner)) {
1137 		/* try again later */
1138 		fs->fs_fmod = 1;
1139 		return;
1140 	}
1141 
1142 	ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
1143 	/*
1144 	 * update superblock timestamp and fs_clean checksum
1145 	 * if marked FSBAD, we always want an erroneous
1146 	 * checksum to force repair
1147 	 */
1148 	fs->fs_time = gethrestime_sec();
1149 	fs->fs_state = (fs->fs_clean != FSBAD) ?
1150 	    FSOKAY - fs->fs_time : -(FSOKAY - fs->fs_time);
1151 	switch (fs->fs_clean) {
1152 	case FSCLEAN:
1153 	case FSSTABLE:
1154 		fs->fs_reclaim &= ~FS_RECLAIM;
1155 		break;
1156 	case FSACTIVE:
1157 	case FSSUSPEND:
1158 	case FSBAD:
1159 	case FSLOG:
1160 		break;
1161 	default:
1162 		fs->fs_clean = FSACTIVE;
1163 		break;
1164 	}
1165 	/*
1166 	 * reset incore only bits
1167 	 */
1168 	fs->fs_reclaim &= ~(FS_CHECKCLEAN | FS_CHECKRECLAIM);
1169 
1170 	/*
1171 	 * delta the whole superblock
1172 	 */
1173 	TRANS_DELTA(ufsvfsp, ldbtob(SBLOCK), sizeof (struct fs),
1174 	    DT_SB, NULL, 0);
1175 	/*
1176 	 * retain the incore state of fs_fmod; set the ondisk state to 0
1177 	 */
1178 	sav_fs_fmod = fs->fs_fmod;
1179 	fs->fs_fmod = 0;
1180 
1181 	/*
1182 	 * Don't release the buffer after written to the disk
1183 	 */
1184 	UFS_BWRITE2(ufsvfsp, bp);
1185 	fs->fs_fmod = sav_fs_fmod;	/* reset fs_fmod's incore state */
1186 }
1187 
1188 /*
1189  * Returns vfs pointer if vfs still being mounted. vfs lock is held.
1190  * Otherwise, returns NULL.
1191  *
1192  * For our purposes, "still mounted" means that the file system still appears
1193  * on the list of UFS file system instances.
1194  */
1195 static vfs_t *
1196 still_mounted(struct check_node *checkp)
1197 {
1198 	struct vfs	*vfsp;
1199 	struct ufsvfs	*ufsp;
1200 
1201 	mutex_enter(&ufsvfs_mutex);
1202 	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
1203 		if (ufsp != checkp->ufsvfs)
1204 			continue;
1205 		/*
1206 		 * Tentative match:  verify it and try to lock.  (It's not at
1207 		 * all clear how the verification could fail, given that we've
1208 		 * gotten this far.  We would have had to reallocate the
1209 		 * ufsvfs struct at hand for a new incarnation; is that really
1210 		 * possible in the interval from constructing the check_node
1211 		 * to here?)
1212 		 */
1213 		vfsp = ufsp->vfs_vfs;
1214 		if (vfsp != checkp->vfsp)
1215 			continue;
1216 		if (vfsp->vfs_dev != checkp->vfs_dev)
1217 			continue;
1218 		if (vfs_lock(vfsp) != 0)
1219 			continue;
1220 
1221 		mutex_exit(&ufsvfs_mutex);
1222 		return (vfsp);
1223 	}
1224 	mutex_exit(&ufsvfs_mutex);
1225 	return (NULL);
1226 }
1227 
1228 int
1229 ufs_si_io_done(struct buf *bp)
1230 {
1231 	sema_v(&bp->b_io);
1232 	return (0);
1233 }
1234 
1235 #define	SI_BUFSZ roundup(sizeof (struct cg), DEV_BSIZE)
1236 #define	NSIBUF 32
1237 
1238 /*
1239  * ufs_construct_si()
1240  * Read each cylinder group in turn and construct the summary information
1241  */
1242 static int
1243 ufs_construct_si(dev_t dev, struct fs *fs, struct ufsvfs *ufsvfsp)
1244 {
1245 	buf_t *bps, *bp;
1246 	char *bufs;
1247 	struct csum *sip = fs->fs_u.fs_csp;
1248 	struct cg *cgp;
1249 	int i, ncg;
1250 	int error = 0, cg = 0;
1251 
1252 	bps = kmem_alloc(NSIBUF * sizeof (buf_t), KM_SLEEP);
1253 	bufs = kmem_alloc(NSIBUF * SI_BUFSZ, KM_SLEEP);
1254 
1255 	/*
1256 	 * Initialise the buffer headers
1257 	 */
1258 	for (bp = bps, i = 0; i < NSIBUF; i++, bp++) {
1259 		bioinit(bp);
1260 		bp->b_iodone = ufs_si_io_done;
1261 		bp->b_bufsize = bp->b_bcount = SI_BUFSZ;
1262 		bp->b_flags = B_READ;
1263 		bp->b_un.b_addr = bufs + (i * SI_BUFSZ);
1264 		bp->b_edev = dev;
1265 	}
1266 
1267 	/*
1268 	 * Repeat while there are cylinder groups left to read.
1269 	 */
1270 	do {
1271 		/*
1272 		 * Issue upto NSIBUF asynchronous reads
1273 		 */
1274 		ncg = MIN(NSIBUF, (fs->fs_ncg - cg));
1275 		for (bp = bps, i = 0; i < ncg; i++, bp++) {
1276 			bp->b_blkno = (daddr_t)fsbtodb(fs, cgtod(fs, cg + i));
1277 			if (ufsvfsp->vfs_log) {
1278 				lufs_read_strategy(ufsvfsp->vfs_log, bp);
1279 			} else {
1280 				(void) bdev_strategy(bp);
1281 			}
1282 		}
1283 
1284 		/*
1285 		 * wait for each read to finish;
1286 		 * check for errors and copy the csum info
1287 		 */
1288 		for (bp = bps, i = 0; i < ncg; i++, bp++) {
1289 			sema_p(&bp->b_io);
1290 			if (!error) {
1291 				cgp = bp->b_un.b_cg;
1292 				sip[cg + i] = cgp->cg_cs;
1293 				error = geterror(bp);
1294 			}
1295 		}
1296 		if (error) {
1297 			goto err;
1298 		}
1299 		cg += ncg;
1300 	} while (cg < fs->fs_ncg);
1301 
1302 err:
1303 	kmem_free(bps, NSIBUF * sizeof (buf_t));
1304 	kmem_free(bufs, NSIBUF * SI_BUFSZ);
1305 	return (error);
1306 }
1307 
1308 /*
1309  * ufs_getsummaryinfo
1310  */
1311 int
1312 ufs_getsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
1313 {
1314 	int		i;		/* `for' loop counter */
1315 	ssize_t		size;		/* bytes of summary info to read */
1316 	daddr_t		frags;		/* frags of summary info to read */
1317 	caddr_t		sip;		/* summary info */
1318 	struct buf	*tp;		/* tmp buf */
1319 
1320 	/*
1321 	 * maintain metadata map for trans device (debug only)
1322 	 */
1323 	TRANS_MATA_SI(ufsvfsp, fs);
1324 
1325 	/*
1326 	 * Compute #frags and allocate space for summary info
1327 	 */
1328 	frags = howmany(fs->fs_cssize, fs->fs_fsize);
1329 	sip = kmem_alloc((size_t)fs->fs_cssize, KM_SLEEP);
1330 	fs->fs_u.fs_csp = (struct csum *)sip;
1331 
1332 	if (fs->fs_si == FS_SI_BAD) {
1333 		/*
1334 		 * The summary information is unknown, read it in from
1335 		 * the cylinder groups.
1336 		 */
1337 		if (TRANS_ISTRANS(ufsvfsp) && !TRANS_ISERROR(ufsvfsp) &&
1338 		    ufsvfsp->vfs_log->un_logmap) {
1339 			logmap_roll_dev(ufsvfsp->vfs_log); /* flush the log */
1340 		}
1341 		bzero(sip, (size_t)fs->fs_cssize);
1342 		if (ufs_construct_si(dev, fs, ufsvfsp)) {
1343 			kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
1344 			fs->fs_u.fs_csp = NULL;
1345 			return (EIO);
1346 		}
1347 	} else {
1348 		/* Read summary info a fs block at a time */
1349 		size = fs->fs_bsize;
1350 		for (i = 0; i < frags; i += fs->fs_frag) {
1351 			if (i + fs->fs_frag > frags)
1352 				/*
1353 				 * This happens only the last iteration, so
1354 				 * don't worry about size being reset
1355 				 */
1356 				size = (frags - i) * fs->fs_fsize;
1357 			tp = UFS_BREAD(ufsvfsp, dev,
1358 			    (daddr_t)fsbtodb(fs, fs->fs_csaddr+i), size);
1359 			tp->b_flags |= B_STALE | B_AGE;
1360 			if (tp->b_flags & B_ERROR) {
1361 				kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
1362 				fs->fs_u.fs_csp = NULL;
1363 				brelse(tp);
1364 				return (EIO);
1365 			}
1366 			bcopy(tp->b_un.b_addr, sip, size);
1367 			sip += size;
1368 			brelse(tp);
1369 		}
1370 	}
1371 	bzero((caddr_t)&fs->fs_cstotal, sizeof (fs->fs_cstotal));
1372 	for (i = 0; i < fs->fs_ncg; ++i) {
1373 		fs->fs_cstotal.cs_ndir += fs->fs_cs(fs, i).cs_ndir;
1374 		fs->fs_cstotal.cs_nbfree += fs->fs_cs(fs, i).cs_nbfree;
1375 		fs->fs_cstotal.cs_nifree += fs->fs_cs(fs, i).cs_nifree;
1376 		fs->fs_cstotal.cs_nffree += fs->fs_cs(fs, i).cs_nffree;
1377 	}
1378 	return (0);
1379 }
1380 
1381 /*
1382  * ufs_putsummaryinfo() stores all the cylinder group summary information
1383  * This is only used when logging, but the file system may not
1384  * be logging at the time, eg a read-only mount to flush the log
1385  * may push the summary info out.
1386  */
1387 int
1388 ufs_putsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
1389 {
1390 	struct buf	b, *bp;		/* tmp buf */
1391 	caddr_t		sip;		/* summary info */
1392 	ssize_t		size;		/* bytes of summary info to write */
1393 	daddr_t		frags;		/* frags of summary info to write */
1394 	int		i;		/* `for' loop counter */
1395 	int		error;		/* error */
1396 
1397 	if (TRANS_ISERROR(ufsvfsp)) {
1398 		return (EIO);
1399 	}
1400 
1401 	if ((fs->fs_si != FS_SI_BAD) || !ufsvfsp->vfs_nolog_si) {
1402 		return (0);
1403 	}
1404 
1405 	bp = &b;
1406 	bioinit(bp);
1407 	bp->b_iodone = ufs_si_io_done;
1408 	bp->b_bufsize = size = fs->fs_bsize;
1409 	bp->b_flags = B_WRITE;
1410 	bp->b_un.b_addr = kmem_alloc(size, KM_SLEEP);
1411 	bp->b_edev = dev;
1412 	frags = howmany(fs->fs_cssize, fs->fs_fsize);
1413 	sip = (caddr_t)fs->fs_u.fs_csp;
1414 
1415 	/* Write summary info one fs block at a time */
1416 	for (error = 0, i = 0; (i < frags) && (error == 0); i += fs->fs_frag) {
1417 		if (i + fs->fs_frag > frags) {
1418 			/*
1419 			 * This happens only the last iteration, so
1420 			 * don't worry about size being reset
1421 			 */
1422 			size = (frags - i) * fs->fs_fsize;
1423 		}
1424 		bcopy(sip, bp->b_un.b_addr, size);
1425 		bp->b_blkno = (daddr_t)fsbtodb(fs, fs->fs_csaddr+i);
1426 		bp->b_bcount = size;
1427 		(void) bdev_strategy(bp);
1428 		sema_p(&bp->b_io); /* wait for write to complete */
1429 		error = geterror(bp);
1430 		sip += size;
1431 	}
1432 	kmem_free(bp->b_un.b_addr, fs->fs_bsize);
1433 	if (!error) {
1434 		fs->fs_si = FS_SI_OK;
1435 	}
1436 	return (error);
1437 }
1438 
1439 /*
1440  * Decide whether it is okay to remove within a sticky directory.
1441  * Two conditions need to be met:  write access to the directory
1442  * is needed.  In sticky directories, write access is not sufficient;
1443  * you can remove entries from a directory only if you own the directory,
1444  * if you are privileged, if you own the entry or if the entry is
1445  * a plain file and you have write access to that file.
1446  * Function returns 0 if remove access is granted.
1447  */
1448 int
1449 ufs_sticky_remove_access(struct inode *dp, struct inode *ip, struct cred *cr)
1450 {
1451 	uid_t uid;
1452 	if ((dp->i_mode & ISVTX) &&
1453 	    (uid = crgetuid(cr)) != dp->i_uid &&
1454 	    uid != ip->i_uid &&
1455 	    ((ip->i_mode & IFMT) != IFREG ||
1456 	    ufs_iaccess(ip, IWRITE, cr) != 0))
1457 		return (secpolicy_vnode_remove(cr));
1458 
1459 	return (0);
1460 }
1461 #endif	/* _KERNEL */
1462 
1463 extern	int around[9];
1464 extern	int inside[9];
1465 extern	uchar_t *fragtbl[];
1466 
1467 /*
1468  * Update the frsum fields to reflect addition or deletion
1469  * of some frags.
1470  */
1471 void
1472 fragacct(struct fs *fs, int fragmap, int32_t *fraglist, int cnt)
1473 {
1474 	int inblk;
1475 	int field, subfield;
1476 	int siz, pos;
1477 
1478 	/*
1479 	 * ufsvfsp->vfs_lock is held when calling this.
1480 	 */
1481 	inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
1482 	fragmap <<= 1;
1483 	for (siz = 1; siz < fs->fs_frag; siz++) {
1484 		if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
1485 			continue;
1486 		field = around[siz];
1487 		subfield = inside[siz];
1488 		for (pos = siz; pos <= fs->fs_frag; pos++) {
1489 			if ((fragmap & field) == subfield) {
1490 				fraglist[siz] += cnt;
1491 				ASSERT(fraglist[siz] >= 0);
1492 				pos += siz;
1493 				field <<= siz;
1494 				subfield <<= siz;
1495 			}
1496 			field <<= 1;
1497 			subfield <<= 1;
1498 		}
1499 	}
1500 }
1501 
1502 /*
1503  * Block operations
1504  */
1505 
1506 /*
1507  * Check if a block is available
1508  */
1509 int
1510 isblock(struct fs *fs, uchar_t *cp, daddr_t h)
1511 {
1512 	uchar_t mask;
1513 
1514 	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
1515 	    fs->fs_frag == 1);
1516 	/*
1517 	 * ufsvfsp->vfs_lock is held when calling this.
1518 	 */
1519 	switch ((int)fs->fs_frag) {
1520 	case 8:
1521 		return (cp[h] == 0xff);
1522 	case 4:
1523 		mask = 0x0f << ((h & 0x1) << 2);
1524 		return ((cp[h >> 1] & mask) == mask);
1525 	case 2:
1526 		mask = 0x03 << ((h & 0x3) << 1);
1527 		return ((cp[h >> 2] & mask) == mask);
1528 	case 1:
1529 		mask = 0x01 << (h & 0x7);
1530 		return ((cp[h >> 3] & mask) == mask);
1531 	default:
1532 #ifndef _KERNEL
1533 		cmn_err(CE_PANIC, "isblock: illegal fs->fs_frag value (%d)",
1534 		    fs->fs_frag);
1535 #endif /* _KERNEL */
1536 		return (0);
1537 	}
1538 }
1539 
1540 /*
1541  * Take a block out of the map
1542  */
1543 void
1544 clrblock(struct fs *fs, uchar_t *cp, daddr_t h)
1545 {
1546 	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
1547 	    fs->fs_frag == 1);
1548 	/*
1549 	 * ufsvfsp->vfs_lock is held when calling this.
1550 	 */
1551 	switch ((int)fs->fs_frag) {
1552 	case 8:
1553 		cp[h] = 0;
1554 		return;
1555 	case 4:
1556 		cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
1557 		return;
1558 	case 2:
1559 		cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
1560 		return;
1561 	case 1:
1562 		cp[h >> 3] &= ~(0x01 << (h & 0x7));
1563 		return;
1564 	default:
1565 #ifndef _KERNEL
1566 		cmn_err(CE_PANIC, "clrblock: illegal fs->fs_frag value (%d)",
1567 		    fs->fs_frag);
1568 #endif /* _KERNEL */
1569 		return;
1570 	}
1571 }
1572 
1573 /*
1574  * Is block allocated?
1575  */
1576 int
1577 isclrblock(struct fs *fs, uchar_t *cp, daddr_t h)
1578 {
1579 	uchar_t	mask;
1580 	int	frag;
1581 	/*
1582 	 * ufsvfsp->vfs_lock is held when calling this.
1583 	 */
1584 	frag = fs->fs_frag;
1585 	ASSERT(frag == 8 || frag == 4 || frag == 2 || frag == 1);
1586 	switch (frag) {
1587 	case 8:
1588 		return (cp[h] == 0);
1589 	case 4:
1590 		mask = ~(0x0f << ((h & 0x1) << 2));
1591 		return (cp[h >> 1] == (cp[h >> 1] & mask));
1592 	case 2:
1593 		mask =	~(0x03 << ((h & 0x3) << 1));
1594 		return (cp[h >> 2] == (cp[h >> 2] & mask));
1595 	case 1:
1596 		mask = ~(0x01 << (h & 0x7));
1597 		return (cp[h >> 3] == (cp[h >> 3] & mask));
1598 	default:
1599 #ifndef _KERNEL
1600 		cmn_err(CE_PANIC, "isclrblock: illegal fs->fs_frag value (%d)",
1601 		    fs->fs_frag);
1602 #endif /* _KERNEL */
1603 		break;
1604 	}
1605 	return (0);
1606 }
1607 
1608 /*
1609  * Put a block into the map
1610  */
1611 void
1612 setblock(struct fs *fs, uchar_t *cp, daddr_t h)
1613 {
1614 	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
1615 	    fs->fs_frag == 1);
1616 	/*
1617 	 * ufsvfsp->vfs_lock is held when calling this.
1618 	 */
1619 	switch ((int)fs->fs_frag) {
1620 	case 8:
1621 		cp[h] = 0xff;
1622 		return;
1623 	case 4:
1624 		cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
1625 		return;
1626 	case 2:
1627 		cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
1628 		return;
1629 	case 1:
1630 		cp[h >> 3] |= (0x01 << (h & 0x7));
1631 		return;
1632 	default:
1633 #ifndef _KERNEL
1634 		cmn_err(CE_PANIC, "setblock: illegal fs->fs_frag value (%d)",
1635 		    fs->fs_frag);
1636 #endif /* _KERNEL */
1637 		return;
1638 	}
1639 }
1640 
/*
 * Skip over leading bytes equal to c in the len bytes starting at cp.
 * Returns the number of bytes left, counting from the first byte that
 * differs from c; 0 if all len bytes match or len is 0.
 */
int
skpc(char c, uint_t len, char *cp)
{
	uint_t left;

	for (left = len; left != 0; left--, cp++) {
		if (*cp != c)
			break;
	}
	return (left);
}
1650