/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */


#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/fs/ufs_fs.h>
#include <sys/cmn_err.h>

#ifdef _KERNEL

#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/user.h>
#include <sys/var.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/debug.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/kmem.h>
#include <sys/vtrace.h>
#include <sys/policy.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/pvn.h>
#include <vm/seg_map.h>
#include <sys/swap.h>
#include <vm/seg_kmem.h>

#else  /* _KERNEL */

#define	ASSERT(x)		/* don't use asserts for fsck et al */

#endif  /* _KERNEL */

#ifdef _KERNEL

/*
 * Used to verify that a given entry on the ufs_instances list (see below)
 * still refers to a mounted file system.
 *
 * XXX:	This is a crock that substitutes for proper locking to coordinate
 *	updates to and uses of the entries in ufs_instances.
 */
struct check_node {
	struct vfs *vfsp;
	struct ufsvfs *ufsvfs;
	dev_t vfs_dev;
};

static vfs_t *still_mounted(struct check_node *);

/*
 * All ufs file system instances are linked together into a list starting at
 * ufs_instances.  The list is updated as part of mount and unmount.  It's
 * consulted in ufs_update, to allow syncing out all ufs file system instances
 * in a batch.
 *
 * ufsvfs_mutex guards access to this list and to the {,old}ufsvfslist
 * manipulated in ufs_funmount_cleanup.  (A given ufs instance is always on
 * exactly one of these lists except while it's being allocated or
 * deallocated.)
 */
struct ufsvfs	*ufs_instances;
extern kmutex_t		ufsvfs_mutex;	/* XXX: move this to ufs_inode.h? */

/*
 * ufsvfs list manipulation routines
 */

/*
 * Link ufsp in at the head of the list of ufs_instances.
 */
void
ufs_vfs_add(struct ufsvfs *ufsp)
{
	mutex_enter(&ufsvfs_mutex);
	ufsp->vfs_next = ufs_instances;
	ufs_instances = ufsp;
	mutex_exit(&ufsvfs_mutex);
}

/*
 * Remove ufsp from the list of ufs_instances.
 *
 * Does no error checking; ufsp is assumed to actually be on the list.
 */
void
ufs_vfs_remove(struct ufsvfs *ufsp)
{
	struct ufsvfs	**delpt = &ufs_instances;

	mutex_enter(&ufsvfs_mutex);
	for (; *delpt != NULL; delpt = &((*delpt)->vfs_next)) {
		if (*delpt == ufsp) {
			*delpt = ufsp->vfs_next;
			ufsp->vfs_next = NULL;
			break;
		}
	}
	mutex_exit(&ufsvfs_mutex);
}

/*
 * Clean up state resulting from a forcible unmount that couldn't be handled
 * directly during the unmount.  (See commentary in the unmount code for more
 * info.)
 */
static void
ufs_funmount_cleanup()
{
	struct ufsvfs		*ufsvfsp;
	extern struct ufsvfs	*oldufsvfslist, *ufsvfslist;

	/*
	 * Assumption: it's now safe to blow away the entries on
	 * oldufsvfslist.
	 */
	mutex_enter(&ufsvfs_mutex);
	while ((ufsvfsp = oldufsvfslist) != NULL) {
		oldufsvfslist = ufsvfsp->vfs_next;

		mutex_destroy(&ufsvfsp->vfs_lock);
		kmem_free(ufsvfsp, sizeof (struct ufsvfs));
	}
	/*
	 * Rotate more recent unmount entries into place in preparation for
	 * the next time around.
	 */
	oldufsvfslist = ufsvfslist;
	ufsvfslist = NULL;
	mutex_exit(&ufsvfs_mutex);
}


/*
 * ufs_update performs the ufs part of `sync'.  It goes through the disk
 * queues to initiate sandbagged IO; goes through the inodes to write
 * modified nodes; and it goes through the mount table to initiate
 * the writing of the modified super blocks.
 */
extern time_t	time;
time_t		ufs_sync_time;
time_t		ufs_sync_time_secs = 1;

extern kmutex_t	ufs_scan_lock;

void
ufs_update(int flag)
{
	struct vfs *vfsp;
	struct fs *fs;
	struct ufsvfs *ufsp;
	struct ufsvfs *ufsnext;
	struct ufsvfs *update_list = NULL;
	int check_cnt = 0;
	size_t check_size;
	struct check_node *check_list, *ptr;
	int cheap = flag & SYNC_ATTR;

	/*
	 * This is a hack.  A design flaw in the forced unmount protocol
	 * could allow a thread to attempt to use a kmem_freed ufsvfs
	 * structure in ufs_lockfs_begin/ufs_check_lockfs.  This window
	 * is difficult to hit, even during the lockfs stress tests.
	 * So the hacky fix is to wait awhile before kmem_free'ing the
	 * ufsvfs structures for forcibly unmounted file systems.  `Awhile'
	 * is defined as every other call from fsflush (~60 seconds).
	 */
	if (cheap)
		ufs_funmount_cleanup();

	/*
	 * Examine all ufsvfs structures and add those that we can lock to the
	 * update list.  This is so that we don't hold the list lock for a
	 * long time.  If vfs_lock fails for a file system instance, then skip
	 * it because somebody is doing an unmount on it.
	 */
	mutex_enter(&ufsvfs_mutex);
	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
		vfsp = ufsp->vfs_vfs;
		if (vfs_lock(vfsp) != 0)
			continue;
		ufsp->vfs_wnext = update_list;
		update_list = ufsp;
		check_cnt++;
	}
	mutex_exit(&ufsvfs_mutex);

	if (update_list == NULL)
		return;

	check_size = sizeof (struct check_node) * check_cnt;
	check_list = ptr = kmem_alloc(check_size, KM_NOSLEEP);
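	/*
	 * Note that the KM_NOSLEEP allocation above may fail, leaving
	 * check_list NULL; in that case the STABLE checking further down
	 * is simply skipped for this pass.
	 */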

	/*
	 * Write back modified superblocks.
	 * Consistency check that the superblock of
	 * each file system is still in the buffer cache.
	 *
	 * Note that the update_list traversal is done without the protection
	 * of an overall list lock, so it's necessary to rely on the fact that
	 * each entry of the list is vfs_locked when moving from one entry to
	 * the next.  This works because a concurrent attempt to add an entry
	 * to another thread's update_list won't find it, since it'll already
	 * be locked.
	 */
	check_cnt = 0;
	for (ufsp = update_list; ufsp != NULL; ufsp = ufsnext) {
		/*
		 * Need to grab the next ptr before we unlock this one so
		 * another thread doesn't grab it and change it before we move
		 * on to the next vfs.  (Once we unlock it, it's ok if another
		 * thread finds it to add it to its own update_list; we don't
		 * attempt to refer to it through our list any more.)
		 */
		ufsnext = ufsp->vfs_wnext;
		vfsp = ufsp->vfs_vfs;

		/*
		 * Seems like this can't happen, so perhaps it should become
		 * an ASSERT(vfsp->vfs_data != NULL).
		 */
		if (!vfsp->vfs_data) {
			vfs_unlock(vfsp);
			continue;
		}

		fs = ufsp->vfs_fs;

		/*
		 * don't update a locked superblock during a panic; it
		 * may be in an inconsistent state
		 */
		if (panicstr) {
			if (!mutex_tryenter(&ufsp->vfs_lock)) {
				vfs_unlock(vfsp);
				continue;
			}
		} else
			mutex_enter(&ufsp->vfs_lock);
		/*
		 * Build up the STABLE check list, so we can unlock the vfs
		 * until we do the actual checking.
		 */
		if (check_list != NULL) {
			if ((fs->fs_ronly == 0) &&
			    (fs->fs_clean != FSBAD) &&
			    (fs->fs_clean != FSSUSPEND)) {
				ptr->vfsp = vfsp;
				ptr->ufsvfs = ufsp;
				ptr->vfs_dev = vfsp->vfs_dev;
				ptr++;
				check_cnt++;
			}
		}

		/*
		 * superblock is not modified
		 */
		if (fs->fs_fmod == 0) {
			mutex_exit(&ufsp->vfs_lock);
			vfs_unlock(vfsp);
			continue;
		}
		if (fs->fs_ronly != 0) {
			mutex_exit(&ufsp->vfs_lock);
			vfs_unlock(vfsp);
			(void) ufs_fault(ufsp->vfs_root,
			    "fs = %s update: ro fs mod\n", fs->fs_fsmnt);
			/*
			 * XXX:	Why is this a return instead of a continue?
			 *	This may be an attempt to replace a panic with
			 *	something less drastic, but there's cleanup we
			 *	should be doing that's not being done (e.g.,
			 *	unlocking the remaining entries on the list).
			 */
			return;
		}
		fs->fs_fmod = 0;
		mutex_exit(&ufsp->vfs_lock);
		TRANS_SBUPDATE(ufsp, vfsp, TOP_SBUPDATE_UPDATE);
		vfs_unlock(vfsp);
	}

	ufs_sync_time = time;

	/*
	 * Avoid racing with ufs_unmount() and ufs_sync().
	 */
	mutex_enter(&ufs_scan_lock);

	(void) ufs_scan_inodes(1, ufs_sync_inode, (void *)(uintptr_t)cheap,
	    NULL);

	mutex_exit(&ufs_scan_lock);

	/*
	 * Force stale buffer cache information to be flushed,
	 * for all devices.  This should cause any remaining control
	 * information (e.g., cg and inode info) to be flushed back.
	 */
	bflush((dev_t)NODEV);

	if (check_list == NULL)
		return;

	/*
	 * For each UFS filesystem in the STABLE check_list, update
	 * the clean flag if warranted.
	 */
	for (ptr = check_list; check_cnt > 0; check_cnt--, ptr++) {
		int	error;

		/*
		 * still_mounted() returns with vfsp and the vfs_reflock
		 * held if ptr refers to a vfs that is still mounted.
		 */
		if ((vfsp = still_mounted(ptr)) == NULL)
			continue;
		ufs_checkclean(vfsp);
		/*
		 * commit any outstanding async transactions
		 */
		ufsp = (struct ufsvfs *)vfsp->vfs_data;
		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_SYNC(ufsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE,
		    error);
		if (!error) {
			TRANS_END_SYNC(ufsp, error, TOP_COMMIT_UPDATE,
			    TOP_COMMIT_SIZE);
		}
		curthread->t_flag &= ~T_DONTBLOCK;

		vfs_unlock(vfsp);
	}

	kmem_free(check_list, check_size);
}

int
ufs_sync_inode(struct inode *ip, void *arg)
{
	int cheap = (int)(uintptr_t)arg;
	struct ufsvfs *ufsvfsp;
	uint_t flag = ip->i_flag;

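	/*
	 * For fsflush's periodic (cheap) sync there is nothing to do if
	 * the inode carries no modification flags at all.
	 */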
	if (cheap && ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) == 0))
		return (0);

	/*
	 * if we are panic'ing; then don't update the inode if this
	 * file system is FSSTABLE.  Otherwise, we would have to
	 * force the superblock to FSACTIVE and the superblock
	 * may not be in a good state.  Also, if the inode is
	 * IREF'ed then it may be in an inconsistent state.  Don't
	 * push it.  Finally, don't push the inode if the fs is
	 * logging; the transaction will be discarded at boot.
	 */
	if (panicstr) {

		if (flag & IREF)
			return (0);

		if (ip->i_ufsvfs == NULL ||
		    (ip->i_fs->fs_clean == FSSTABLE ||
		    ip->i_fs->fs_clean == FSLOG))
			return (0);
	}

	ufsvfsp = ip->i_ufsvfs;

	/*
	 * Limit access time only updates
	 */
	if (((flag & (IMOD|IMODACC|IUPD|ICHG|IACC)) == IMODACC) && ufsvfsp) {
		/*
		 * if file system has deferred access time turned on and there
		 * was no IO recently, don't bother flushing it. It will be
		 * flushed when I/Os start again.
		 */
		if (cheap && (ufsvfsp->vfs_dfritime & UFS_DFRATIME) &&
		    (ufsvfsp->vfs_iotstamp + ufs_iowait < lbolt))
			return (0);
		/*
		 * an app issuing a sync() can take forever on a trans device
		 * when NetWorker or find is running because all of the
		 * directories' access times have to be updated. So, we limit
		 * the time we spend updating access times per sync.
		 */
		if (TRANS_ISTRANS(ufsvfsp) && ((ufs_sync_time +
		    ufs_sync_time_secs) < time))
			return (0);
	}

	/*
	 * if we are running on behalf of the flush thread or this is
	 * a swap file, then simply do a delay update of the inode.
	 * Otherwise, push the pages and then do a delayed inode update.
	 */
	if (cheap || IS_SWAPVP(ITOV(ip))) {
		TRANS_IUPDAT(ip, 0);
	} else {
		(void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_SYNC);
	}
	return (0);
}

/*
 * Flush all the pages associated with an inode using the given 'flags',
 * then force inode information to be written back using the given 'waitfor'.
 */
int
ufs_syncip(struct inode *ip, int flags, int waitfor, top_t topid)
{
	int	error;
	struct vnode *vp = ITOV(ip);
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	int dotrans = 0;

	TRACE_3(TR_FAC_UFS, TR_UFS_SYNCIP_START,
	    "ufs_syncip_start:vp %p flags %x waitfor %x",
	    vp, flags, waitfor);

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return (EIO);
	/*
	 * don't need to VOP_PUTPAGE if there are no pages
	 */
	if (!vn_has_cached_data(vp) || vp->v_type == VCHR) {
		error = 0;
	} else {
		/*
		 * if the inode we're working on is a shadow inode
		 * or quota inode we need to make sure that the
		 * ufs_putpage call is inside a transaction as this
		 * could include meta data changes.
		 */
		if ((ip->i_mode & IFMT) == IFSHAD ||
		    ufsvfsp->vfs_qinod == ip) {
			dotrans = 1;
			curthread->t_flag |= T_DONTBLOCK;
			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE,
			    TOP_PUTPAGE_SIZE(ip));
		}
		error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, flags, CRED());
		if (dotrans) {
			TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE,
			    TOP_PUTPAGE_SIZE(ip));
			curthread->t_flag &= ~T_DONTBLOCK;
			dotrans = 0;
		}
	}
	if (panicstr && TRANS_ISTRANS(ufsvfsp))
		goto out;
	/*
	 * waitfor represents two things -
	 * 1. whether data sync or file sync.
	 * 2. if file sync then ufs_iupdat should 'waitfor' disk i/o or not.
	 */
	if (waitfor == I_DSYNC) {
		/*
		 * If data sync, only IATTCHG (size/block change) requires
		 * inode update, fdatasync()/FDSYNC implementation.
		 */
		if (ip->i_flag & (IBDWRITE|IATTCHG)) {
			/*
			 * Enter a transaction to provide mutual exclusion
			 * with deltamap_push and avoid a race where
			 * the inode flush could get dropped.
			 */
			if ((curthread->t_flag & T_DONTBLOCK) == 0) {
				dotrans = 1;
				curthread->t_flag |= T_DONTBLOCK;
				TRANS_BEGIN_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
			}
			rw_enter(&ip->i_contents, RW_READER);
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IMODTIME;
			mutex_exit(&ip->i_tlock);
			ufs_iupdat(ip, 1);
			rw_exit(&ip->i_contents);
			if (dotrans) {
				TRANS_END_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
				curthread->t_flag &= ~T_DONTBLOCK;
			}
		}
	} else {
		/* For file sync, any inode change requires inode update */
		if (ip->i_flag & (IBDWRITE|IUPD|IACC|ICHG|IMOD|IMODACC)) {
			/*
			 * Enter a transaction to provide mutual exclusion
			 * with deltamap_push and avoid a race where
			 * the inode flush could get dropped.
			 */
			if ((curthread->t_flag & T_DONTBLOCK) == 0) {
				dotrans = 1;
				curthread->t_flag |= T_DONTBLOCK;
				TRANS_BEGIN_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
			}
			rw_enter(&ip->i_contents, RW_READER);
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IMODTIME;
			mutex_exit(&ip->i_tlock);
			ufs_iupdat(ip, waitfor);
			rw_exit(&ip->i_contents);
			if (dotrans) {
				TRANS_END_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
				curthread->t_flag &= ~T_DONTBLOCK;
			}
		}
	}

out:
	TRACE_2(TR_FAC_UFS, TR_UFS_SYNCIP_END,
	    "ufs_syncip_end:vp %p error %d",
	    vp, error);

	return (error);
}

/*
 * Flush all indirect blocks related to an inode.
 * Supports triple indirect blocks also.
 */
int
ufs_sync_indir(struct inode *ip)
{
	int i;
	daddr_t blkno;
	daddr_t lbn;	/* logical blkno of last blk in file */
	daddr_t clbn;	/* current logical blk */
	daddr32_t *bap;
	struct fs *fs;
	struct buf *bp;
	int bsize;
	struct ufsvfs *ufsvfsp;
	int j;
	daddr_t indirect_blkno;
	daddr32_t *indirect_bap;
	struct buf *indirect_bp;

	ufsvfsp = ip->i_ufsvfs;
	/*
	 * unnecessary when logging; allocation blocks are kept up-to-date
	 */
	if (TRANS_ISTRANS(ufsvfsp))
		return (0);

	fs = ufsvfsp->vfs_fs;
	bsize = fs->fs_bsize;
	lbn = (daddr_t)lblkno(fs, ip->i_size - 1);
	if (lbn < NDADDR)
		return (0);	/* No indirect blocks used */
	if (lbn < NDADDR + NINDIR(fs)) {
		/* File has one indirect block. */
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, ip->i_ib[0]));
		return (0);
	}

	/* Write out all the first level indirect blocks */
	for (i = 0; i < NIADDR; i++) {
		if ((blkno = ip->i_ib[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
	}
	/* Write out second level of indirect blocks */
	if ((blkno = ip->i_ib[1]) == 0)
		return (0);
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	bap = bp->b_un.b_daddr;
	clbn = NDADDR + NINDIR(fs);
	for (i = 0; i < NINDIR(fs); i++) {
		if (clbn > lbn)
			break;
		clbn += NINDIR(fs);
		if ((blkno = bap[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
	}

	brelse(bp);

	/* Write out third level indirect blocks */
	if ((blkno = ip->i_ib[2]) == 0)
		return (0);

	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	bap = bp->b_un.b_daddr;
	clbn = NDADDR + NINDIR(fs) + (NINDIR(fs) * NINDIR(fs));

	for (i = 0; i < NINDIR(fs); i++) {
		if (clbn > lbn)
			break;
		if ((indirect_blkno = bap[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, indirect_blkno));
		indirect_bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fs, indirect_blkno), bsize);
		if (indirect_bp->b_flags & B_ERROR) {
			brelse(indirect_bp);
			brelse(bp);
			return (EIO);
		}
		indirect_bap = indirect_bp->b_un.b_daddr;
		for (j = 0; j < NINDIR(fs); j++) {
			if (clbn > lbn)
				break;
			clbn += NINDIR(fs);
			if ((blkno = indirect_bap[j]) == 0)
				continue;
			blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
		}
		brelse(indirect_bp);
	}
	brelse(bp);

	return (0);
}

/*
 * Flush all indirect blocks related to an offset of a file.
 * read/write in sync mode may have to flush indirect blocks.
 */
int
ufs_indirblk_sync(struct inode *ip, offset_t off)
{
	daddr_t	lbn;
	struct	fs *fs;
	struct	buf *bp;
	int	i, j, shft;
	daddr_t	ob, nb, tbn;
	daddr32_t *bap;
	int	nindirshift, nindiroffset;
	struct ufsvfs *ufsvfsp;

	ufsvfsp = ip->i_ufsvfs;
	/*
	 * unnecessary when logging; allocation blocks are kept up-to-date
	 */
	if (TRANS_ISTRANS(ufsvfsp))
		return (0);

	fs = ufsvfsp->vfs_fs;

	lbn = (daddr_t)lblkno(fs, off);
	if (lbn < 0)
		return (EFBIG);

	/* The first NDADDR are direct so nothing to do */
	if (lbn < NDADDR)
		return (0);

	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;

	/* Determine level of indirect blocks */
	shft = 0;
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t	sh;

		shft += nindirshift;
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= (daddr_t)sh;
	}

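	/*
	 * The loop counts j down from NIADDR; if it never breaks out, the
	 * offset lies beyond even the triple indirect tree, so the file
	 * offset is too big.
	 */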
	if (j == 0)
		return (EFBIG);

	if ((nb = ip->i_ib[NIADDR - j]) == 0)
		return (0);		/* UFS Hole */

	/* Flush first level indirect block */
	blkflush(ip->i_dev, fsbtodb(fs, nb));

	/* Fetch through next levels */
	for (; j < NIADDR; j++) {
		ob = nb;
		bp = UFS_BREAD(ufsvfsp,
		    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			brelse(bp);
			return (EIO);
		}
		bap = bp->b_un.b_daddr;
		shft -= nindirshift;		/* sh / nindir */
		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) & nindir */
		nb = bap[i];
		brelse(bp);
		if (nb == 0) {
			return (0);		/* UFS hole */
		}
		blkflush(ip->i_dev, fsbtodb(fs, nb));
	}
	return (0);
}

#ifdef DEBUG

/*
 * The bad block checking routines ufs_indir_badblock() and ufs_badblock()
 * are very expensive.  Profiling has shown that we spend 6-7% of our time
 * in ufs_badblock, and another 1-2% in ufs_indir_badblock.  They are only
 * called via ASSERTs (from debug kernels).  In addition, no failures have
 * been found in recent years.  So the following tunable can be set to
 * enable checking.
 */
int ufs_badblock_checks = 0;

/*
 * Check that a given indirect block contains blocks in range
 */
int
ufs_indir_badblock(struct inode *ip, daddr32_t *bap)
{
	int i;
	int err = 0;

	if (ufs_badblock_checks) {
		for (i = 0; i < NINDIR(ip->i_fs) - 1; i++)
			if (bap[i] != 0 && (err = ufs_badblock(ip, bap[i])))
				break;
	}
	return (err);
}

/*
 * Check that a specified block number is in range.
 */
int
ufs_badblock(struct inode *ip, daddr_t bn)
{
	long	c;
	daddr_t	sum;

	if (!ufs_badblock_checks)
		return (0);
	ASSERT(bn);
	if (bn <= 0 || bn > ip->i_fs->fs_size)
		return (bn);

	sum = 0;
	c = dtog(ip->i_fs, bn);
	if (c == 0) {
		sum = howmany(ip->i_fs->fs_cssize, ip->i_fs->fs_fsize);
	}
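	/*
	 * Cylinder group 0 also hosts the superblock summary area, so its
	 * data area effectively begins `sum' frags past cgdmin().
	 */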
	/*
	 * If the block number is below this cylinder group, within the
	 * space reserved for the superblock, inodes, and (summary data),
	 * or above this cylinder group, then it's invalid.
	 * It's hard to see how we'd be outside this cyl, but let's be careful.
	 */
	if ((bn < cgbase(ip->i_fs, c)) ||
	    (bn >= cgsblock(ip->i_fs, c) && bn < cgdmin(ip->i_fs, c)+sum) ||
	    (bn >= (unsigned)cgbase(ip->i_fs, c+1)))
		return (bn);

	return (0);	/* not a bad block */
}

#endif /* DEBUG */

/*
 * When i_rwlock is write-locked or has a writer pending, the inode is
 * going to change in a way that will mark the filesystem as active, so
 * there is no need to mark the filesystem as stable now.  Also, to
 * ensure filesystem consistency during directory operations, the
 * filesystem cannot be marked stable while the i_rwlock of a directory
 * inode is write-locked.
 */

/*
 * Check for busy inodes for this filesystem.
 * NOTE: Needs better way to do this expensive operation in the future.
 */
static void
ufs_icheck(struct ufsvfs *ufsvfsp, int *isbusyp, int *isreclaimp)
{
	union  ihead	*ih;
	struct inode	*ip;
	int		i;
	int		isnottrans	= !TRANS_ISTRANS(ufsvfsp);
	int		isbusy		= *isbusyp;
	int		isreclaim	= *isreclaimp;

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0];
		    ip != (struct inode *)ih;
		    ip = ip->i_forw) {
			/*
			 * if inode is busy/modified/deleted, filesystem is busy
			 */
			if (ip->i_ufsvfs != ufsvfsp)
				continue;
			if ((ip->i_flag & (IMOD | IUPD | ICHG)) ||
			    (RW_ISWRITER(&ip->i_rwlock)))
				isbusy = 1;
			if ((ip->i_nlink <= 0) && (ip->i_flag & IREF))
				isreclaim = 1;
			if (isbusy && (isreclaim || isnottrans))
				break;
		}
		mutex_exit(&ih_lock[i]);
		if (isbusy && (isreclaim || isnottrans))
			break;
	}
	*isbusyp = isbusy;
	*isreclaimp = isreclaim;
}

/*
 * As part of the ufs 'sync' operation, this routine is called to mark
 * the filesystem as STABLE if there is no modified metadata in memory.
 */
void
ufs_checkclean(struct vfs *vfsp)
{
	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
	struct fs	*fs		= ufsvfsp->vfs_fs;
	int		isbusy;
	int		isreclaim;
	int		updatesb;

	ASSERT(vfs_lock_held(vfsp));

	/*
	 * filesystem is stable or cleanflag processing is disabled; do nothing
	 *	no transitions when panic'ing
	 */
	if (fs->fs_ronly ||
	    fs->fs_clean == FSBAD ||
	    fs->fs_clean == FSSUSPEND ||
	    fs->fs_clean == FSSTABLE ||
	    panicstr)
		return;

	/*
	 * if logging and nothing to reclaim; do nothing
	 */
	if ((fs->fs_clean == FSLOG) &&
	    (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
	    (fs->fs_reclaim & FS_RECLAIMING)))
		return;

	/*
	 * FS_CHECKCLEAN is reset if the file system goes dirty
	 * FS_CHECKRECLAIM is reset if a file gets deleted
	 */
	mutex_enter(&ufsvfsp->vfs_lock);
	fs->fs_reclaim |= (FS_CHECKCLEAN | FS_CHECKRECLAIM);
	mutex_exit(&ufsvfsp->vfs_lock);

	updatesb = 0;

	/*
	 * if logging or buffers are busy; do nothing
	 */
	isbusy = isreclaim = 0;
	if ((fs->fs_clean == FSLOG) ||
	    (bcheck(vfsp->vfs_dev, ufsvfsp->vfs_bufp)))
		isbusy = 1;

	/*
	 * isreclaim == TRUE means can't change the state of fs_reclaim
	 */
	isreclaim =
	    ((fs->fs_clean == FSLOG) &&
	    (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
	    (fs->fs_reclaim & FS_RECLAIMING)));

	/*
	 * if fs is busy or can't change the state of fs_reclaim; do nothing
	 */
	if (isbusy && isreclaim)
		return;

	/*
	 * look for busy or deleted inodes; (deleted == needs reclaim)
	 */
	ufs_icheck(ufsvfsp, &isbusy, &isreclaim);

	mutex_enter(&ufsvfsp->vfs_lock);

	/*
	 * IF POSSIBLE, RESET RECLAIM
	 */
	/*
	 * the reclaim thread is not running
	 */
	if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
		/*
		 * no files were deleted during the scan
		 */
		if (fs->fs_reclaim & FS_CHECKRECLAIM)
			/*
			 * no deleted files were found in the inode cache
			 */
			if ((isreclaim == 0) && (fs->fs_reclaim & FS_RECLAIM)) {
				fs->fs_reclaim &= ~FS_RECLAIM;
				updatesb = 1;
			}
	/*
	 * IF POSSIBLE, SET STABLE
	 */
	/*
	 * not logging
	 */
	if (fs->fs_clean != FSLOG)
		/*
		 * file system has not gone dirty since the scan began
		 */
		if (fs->fs_reclaim & FS_CHECKCLEAN)
			/*
			 * nothing dirty was found in the buffer or inode cache
			 */
			if ((isbusy == 0) && (isreclaim == 0) &&
			    (fs->fs_clean != FSSTABLE)) {
				fs->fs_clean = FSSTABLE;
				updatesb = 1;
			}

	mutex_exit(&ufsvfsp->vfs_lock);
	if (updatesb) {
		TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
	}
}

/*
 * called whenever an unlink occurs
 */
void
ufs_setreclaim(struct inode *ip)
{
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
	struct fs	*fs		= ufsvfsp->vfs_fs;

	if (ip->i_nlink || fs->fs_ronly || (fs->fs_clean != FSLOG))
		return;

	/*
	 * reclaim-needed bit is already set or we need to tell
	 * ufs_checkclean that a file has been deleted
	 */
	if ((fs->fs_reclaim & (FS_RECLAIM | FS_CHECKRECLAIM)) == FS_RECLAIM)
		return;

	mutex_enter(&ufsvfsp->vfs_lock);
	/*
	 * inform ufs_checkclean that the file system has gone dirty
	 */
	fs->fs_reclaim &= ~FS_CHECKRECLAIM;

	/*
	 * set the reclaim-needed bit
	 */
	if ((fs->fs_reclaim & FS_RECLAIM) == 0) {
		fs->fs_reclaim |= FS_RECLAIM;
		ufs_sbwrite(ufsvfsp);
	}
	mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * Before any modified metadata is written back to the disk, this routine
 * is called to mark the filesystem as ACTIVE.
 */
void
ufs_notclean(struct ufsvfs *ufsvfsp)
{
	struct fs *fs = ufsvfsp->vfs_fs;

	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
	ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));

	/*
	 * inform ufs_checkclean that the file system has gone dirty
	 */
	fs->fs_reclaim &= ~FS_CHECKCLEAN;

	/*
	 * ignore if active or bad or suspended or readonly or logging
	 */
	if ((fs->fs_clean == FSACTIVE) || (fs->fs_clean == FSLOG) ||
	    (fs->fs_clean == FSBAD) || (fs->fs_clean == FSSUSPEND) ||
	    (fs->fs_ronly)) {
		mutex_exit(&ufsvfsp->vfs_lock);
		return;
	}
	fs->fs_clean = FSACTIVE;
	/*
	 * write superblock synchronously
	 */
	ufs_sbwrite(ufsvfsp);
	mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * ufs specific fbwrite()
 */
int
ufs_fbwrite(struct fbuf *fbp, struct inode *ip)
{
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;

	if (TRANS_ISTRANS(ufsvfsp))
		return (fbwrite(fbp));
	mutex_enter(&ufsvfsp->vfs_lock);
	ufs_notclean(ufsvfsp);
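	/*
	 * vfs_dio is nonzero when delayed IO has been requested for this
	 * file system (presumably via the _FIOSDIO ioctl), in which case a
	 * delayed write is sufficient here.
	 */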
	return ((ufsvfsp->vfs_dio) ? fbdwrite(fbp) : fbwrite(fbp));
}

/*
 * ufs specific fbiwrite()
 */
int
ufs_fbiwrite(struct fbuf *fbp, struct inode *ip, daddr_t bn, long bsize)
{
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
	o_mode_t	ifmt		= ip->i_mode & IFMT;
	buf_t		*bp;
	int		error;

	mutex_enter(&ufsvfsp->vfs_lock);
	ufs_notclean(ufsvfsp);
	if (ifmt == IFDIR || ifmt == IFSHAD || ifmt == IFATTRDIR ||
	    (ip->i_ufsvfs->vfs_qinod == ip)) {
		TRANS_DELTA(ufsvfsp, ldbtob(bn * (offset_t)(btod(bsize))),
		    fbp->fb_count, DT_FBI, 0, 0);
	}
	/*
	 * Inlined version of fbiwrite()
	 */
	bp = pageio_setup((struct page *)NULL, fbp->fb_count,
	    ip->i_devvp, B_WRITE);
	bp->b_flags &= ~B_PAGEIO;
	bp->b_un.b_addr = fbp->fb_addr;

	bp->b_blkno = bn * btod(bsize);
	bp->b_dev = cmpdev(ip->i_dev);	/* store in old dev format */
	bp->b_edev = ip->i_dev;
	bp->b_proc = NULL;			/* i.e. the kernel */
	bp->b_file = ip->i_vnode;
	bp->b_offset = -1;

	if (ufsvfsp->vfs_log) {
		lufs_write_strategy(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = lbolt;
		ub.ub_fbiwrites.value.ul++;
		(void) bdev_strategy(bp);
		lwp_stat_update(LWP_STAT_OUBLK, 1);
	}
	error = biowait(bp);
	pageio_done(bp);
	fbrelse(fbp, S_OTHER);
	return (error);
}

/*
 * Write the ufs superblock only.
 */
void
ufs_sbwrite(struct ufsvfs *ufsvfsp)
{
	char sav_fs_fmod;
	struct fs *fs = ufsvfsp->vfs_fs;
	struct buf *bp = ufsvfsp->vfs_bufp;

	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));

	/*
	 * for ulockfs processing, limit the superblock writes
	 */
	if ((ufsvfsp->vfs_ulockfs.ul_sbowner) &&
	    (curthread != ufsvfsp->vfs_ulockfs.ul_sbowner)) {
		/* try again later */
		fs->fs_fmod = 1;
		return;
	}

	ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
	/*
	 * update superblock timestamp and fs_clean checksum
	 * if marked FSBAD, we always want an erroneous
	 * checksum to force repair
	 */
	fs->fs_time = gethrestime_sec();
	fs->fs_state = (fs->fs_clean != FSBAD) ?
	    FSOKAY - fs->fs_time : -(FSOKAY - fs->fs_time);
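	/*
	 * (At mount time fs_clean is trusted only if fs_state + fs_time
	 * equals FSOKAY, which is why an FSBAD file system is given a
	 * deliberately mismatching fs_state above.)
	 */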
	switch (fs->fs_clean) {
	case FSCLEAN:
	case FSSTABLE:
		fs->fs_reclaim &= ~FS_RECLAIM;
		break;
	case FSACTIVE:
	case FSSUSPEND:
	case FSBAD:
	case FSLOG:
		break;
	default:
		fs->fs_clean = FSACTIVE;
		break;
	}
	/*
	 * reset incore only bits
	 */
	fs->fs_reclaim &= ~(FS_CHECKCLEAN | FS_CHECKRECLAIM);

	/*
	 * delta the whole superblock
	 */
	TRANS_DELTA(ufsvfsp, ldbtob(SBLOCK), sizeof (struct fs),
	    DT_SB, NULL, 0);
	/*
	 * retain the incore state of fs_fmod; set the ondisk state to 0
	 */
	sav_fs_fmod = fs->fs_fmod;
	fs->fs_fmod = 0;

	/*
	 * Don't release the buffer after it is written to the disk
	 */
	UFS_BWRITE2(ufsvfsp, bp);
	fs->fs_fmod = sav_fs_fmod;	/* reset fs_fmod's incore state */
}

/*
 * Returns the vfs pointer if the vfs is still mounted; the vfs lock is
 * held.  Otherwise, returns NULL.
 *
 * For our purposes, "still mounted" means that the file system still appears
 * on the list of UFS file system instances.
 */
static vfs_t *
still_mounted(struct check_node *checkp)
{
	struct vfs	*vfsp;
	struct ufsvfs	*ufsp;

	mutex_enter(&ufsvfs_mutex);
	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
		if (ufsp != checkp->ufsvfs)
			continue;
		/*
		 * Tentative match:  verify it and try to lock.  (It's not at
		 * all clear how the verification could fail, given that we've
		 * gotten this far.  We would have had to reallocate the
		 * ufsvfs struct at hand for a new incarnation; is that really
		 * possible in the interval from constructing the check_node
		 * to here?)
		 */
		vfsp = ufsp->vfs_vfs;
		if (vfsp != checkp->vfsp)
			continue;
		if (vfsp->vfs_dev != checkp->vfs_dev)
			continue;
		if (vfs_lock(vfsp) != 0)
			continue;

		mutex_exit(&ufsvfs_mutex);
		return (vfsp);
	}
	mutex_exit(&ufsvfs_mutex);
	return (NULL);
}

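/*
 * biodone() substitute for the private summary-info buffers below; the
 * initiating threads wait for completion with sema_p(&bp->b_io).
 */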
int
ufs_si_io_done(struct buf *bp)
{
	sema_v(&bp->b_io);
	return (0);
}

#define	SI_BUFSZ roundup(sizeof (struct cg), DEV_BSIZE)
#define	NSIBUF 32
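
/*
 * Summary info is reconstructed NSIBUF cylinder groups at a time, each
 * read landing in its own device-block-aligned buffer of SI_BUFSZ bytes.
 */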

/*
 * ufs_construct_si()
 * Read each cylinder group in turn and construct the summary information
 */
static int
ufs_construct_si(dev_t dev, struct fs *fs, struct ufsvfs *ufsvfsp)
{
	buf_t *bps, *bp;
	char *bufs;
	struct csum *sip = fs->fs_u.fs_csp;
	struct cg *cgp;
	int i, ncg;
	int error = 0, cg = 0;

	bps = kmem_alloc(NSIBUF * sizeof (buf_t), KM_SLEEP);
	bufs = kmem_alloc(NSIBUF * SI_BUFSZ, KM_SLEEP);

	/*
	 * Initialise the buffer headers
	 */
	for (bp = bps, i = 0; i < NSIBUF; i++, bp++) {
		bioinit(bp);
		bp->b_iodone = ufs_si_io_done;
		bp->b_bufsize = bp->b_bcount = SI_BUFSZ;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = bufs + (i * SI_BUFSZ);
		bp->b_edev = dev;
	}

	/*
	 * Repeat while there are cylinder groups left to read.
	 */
	do {
		/*
		 * Issue up to NSIBUF asynchronous reads
		 */
		ncg = MIN(NSIBUF, (fs->fs_ncg - cg));
		for (bp = bps, i = 0; i < ncg; i++, bp++) {
			bp->b_blkno = (daddr_t)fsbtodb(fs, cgtod(fs, cg + i));
			if (ufsvfsp->vfs_log) {
				lufs_read_strategy(ufsvfsp->vfs_log, bp);
			} else {
				(void) bdev_strategy(bp);
			}
		}

		/*
		 * wait for each read to finish;
		 * check for errors and copy the csum info
		 */
		for (bp = bps, i = 0; i < ncg; i++, bp++) {
			sema_p(&bp->b_io);
			if (!error) {
				cgp = bp->b_un.b_cg;
				sip[cg + i] = cgp->cg_cs;
				error = geterror(bp);
			}
		}
		if (error) {
			goto err;
		}
		cg += ncg;
	} while (cg < fs->fs_ncg);

err:
	kmem_free(bps, NSIBUF * sizeof (buf_t));
	kmem_free(bufs, NSIBUF * SI_BUFSZ);
	return (error);
}

/*
 * ufs_getsummaryinfo
 */
int
ufs_getsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
	int		i;		/* `for' loop counter */
	ssize_t		size;		/* bytes of summary info to read */
	daddr_t		frags;		/* frags of summary info to read */
	caddr_t		sip;		/* summary info */
	struct buf	*tp;		/* tmp buf */

	/*
	 * maintain metadata map for trans device (debug only)
	 */
	TRANS_MATA_SI(ufsvfsp, fs);

	/*
	 * Compute #frags and allocate space for summary info
	 */
	frags = howmany(fs->fs_cssize, fs->fs_fsize);
	sip = kmem_alloc((size_t)fs->fs_cssize, KM_SLEEP);
	fs->fs_u.fs_csp = (struct csum *)sip;

	if (fs->fs_si == FS_SI_BAD) {
		/*
		 * The summary information is unknown, read it in from
		 * the cylinder groups.
		 */
		if (TRANS_ISTRANS(ufsvfsp) && !TRANS_ISERROR(ufsvfsp) &&
		    ufsvfsp->vfs_log->un_logmap) {
			logmap_roll_dev(ufsvfsp->vfs_log); /* flush the log */
		}
		bzero(sip, (size_t)fs->fs_cssize);
		if (ufs_construct_si(dev, fs, ufsvfsp)) {
			kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
			fs->fs_u.fs_csp = NULL;
			return (EIO);
		}
	} else {
		/* Read summary info a fs block at a time */
		size = fs->fs_bsize;
		for (i = 0; i < frags; i += fs->fs_frag) {
			if (i + fs->fs_frag > frags)
				/*
				 * This happens only the last iteration, so
				 * don't worry about size being reset
				 */
				size = (frags - i) * fs->fs_fsize;
			tp = UFS_BREAD(ufsvfsp, dev,
			    (daddr_t)fsbtodb(fs, fs->fs_csaddr+i), size);
			tp->b_flags |= B_STALE | B_AGE;
			if (tp->b_flags & B_ERROR) {
				kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
				fs->fs_u.fs_csp = NULL;
				brelse(tp);
				return (EIO);
			}
			bcopy(tp->b_un.b_addr, sip, size);
			sip += size;
			brelse(tp);
		}
	}
	bzero((caddr_t)&fs->fs_cstotal, sizeof (fs->fs_cstotal));
	for (i = 0; i < fs->fs_ncg; ++i) {
		fs->fs_cstotal.cs_ndir += fs->fs_cs(fs, i).cs_ndir;
		fs->fs_cstotal.cs_nbfree += fs->fs_cs(fs, i).cs_nbfree;
		fs->fs_cstotal.cs_nifree += fs->fs_cs(fs, i).cs_nifree;
		fs->fs_cstotal.cs_nffree += fs->fs_cs(fs, i).cs_nffree;
	}
	return (0);
}

/*
 * ufs_putsummaryinfo() stores all the cylinder group summary information.
 * This is only used when logging, but the file system may not be logging
 * at the time, e.g. a read-only mount to flush the log may push the
 * summary info out.
 */
int
ufs_putsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
	struct buf	b, *bp;		/* tmp buf */
	caddr_t		sip;		/* summary info */
	ssize_t		size;		/* bytes of summary info to write */
	daddr_t		frags;		/* frags of summary info to write */
	int		i;		/* `for' loop counter */
	int		error;		/* error */

	if (TRANS_ISERROR(ufsvfsp)) {
		return (EIO);
	}

	if ((fs->fs_si != FS_SI_BAD) || !ufsvfsp->vfs_nolog_si) {
		return (0);
	}

	bp = &b;
	bioinit(bp);
	bp->b_iodone = ufs_si_io_done;
	bp->b_bufsize = size = fs->fs_bsize;
	bp->b_flags = B_WRITE;
	bp->b_un.b_addr = kmem_alloc(size, KM_SLEEP);
	bp->b_edev = dev;
	frags = howmany(fs->fs_cssize, fs->fs_fsize);
	sip = (caddr_t)fs->fs_u.fs_csp;

	/* Write summary info one fs block at a time */
	for (error = 0, i = 0; (i < frags) && (error == 0); i += fs->fs_frag) {
		if (i + fs->fs_frag > frags) {
			/*
			 * This happens only the last iteration, so
			 * don't worry about size being reset
			 */
			size = (frags - i) * fs->fs_fsize;
		}
		bcopy(sip, bp->b_un.b_addr, size);
		bp->b_blkno = (daddr_t)fsbtodb(fs, fs->fs_csaddr+i);
		bp->b_bcount = size;
		(void) bdev_strategy(bp);
		sema_p(&bp->b_io); /* wait for write to complete */
		error = geterror(bp);
		sip += size;
	}
	kmem_free(bp->b_un.b_addr, fs->fs_bsize);
	if (!error) {
		fs->fs_si = FS_SI_OK;
	}
	return (error);
}

/*
 * Decide whether it is okay to remove within a sticky directory.  Write
 * access to the directory is needed, but in sticky directories write
 * access alone is not sufficient: you can remove entries from the
 * directory only if you own the directory, if you are privileged, if
 * you own the entry, or if the entry is a plain file and you have write
 * access to that file.
 * Function returns 0 if remove access is granted.
 */
int
ufs_sticky_remove_access(struct inode *dp, struct inode *ip, struct cred *cr)
{
	uid_t uid;

	if ((dp->i_mode & ISVTX) &&
	    (uid = crgetuid(cr)) != dp->i_uid &&
	    uid != ip->i_uid &&
	    ((ip->i_mode & IFMT) != IFREG ||
	    ufs_iaccess(ip, IWRITE, cr) != 0))
		return (secpolicy_vnode_remove(cr));

	return (0);
}
#endif	/* _KERNEL */

extern	int around[9];
extern	int inside[9];
extern	uchar_t *fragtbl[];

/*
 * Update the frsum fields to reflect addition or deletion
 * of some frags.
 */
void
fragacct(struct fs *fs, int fragmap, int32_t *fraglist, int cnt)
{
	int inblk;
	int field, subfield;
	int siz, pos;

	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
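	/*
	 * fragtbl (see ufs_tables.c) condenses a fragment map into one bit
	 * per free-run size present in it; the shifts here and below line
	 * those bits up with the 1-based `siz' loop.
	 */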
	fragmap <<= 1;
	for (siz = 1; siz < fs->fs_frag; siz++) {
		if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
			continue;
		field = around[siz];
		subfield = inside[siz];
		for (pos = siz; pos <= fs->fs_frag; pos++) {
			if ((fragmap & field) == subfield) {
				fraglist[siz] += cnt;
				ASSERT(fraglist[siz] >= 0);
				pos += siz;
				field <<= siz;
				subfield <<= siz;
			}
			field <<= 1;
			subfield <<= 1;
		}
	}
}

/*
 * Block operations
 */

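/*
 * In the maps operated on by the routines below, a block's worth of
 * fragments is represented by fs_frag consecutive bits: with fs_frag == 8
 * each block occupies one byte, with fs_frag == 4 two blocks share a
 * byte, and so on.  A block is free iff all of its fragment bits are set.
 */
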
/*
 * Check if a block is available
 */
int
isblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	uchar_t mask;

	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 ||
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	switch ((int)fs->fs_frag) {
	case 8:
		return (cp[h] == 0xff);
	case 4:
		mask = 0x0f << ((h & 0x1) << 2);
		return ((cp[h >> 1] & mask) == mask);
	case 2:
		mask = 0x03 << ((h & 0x3) << 1);
		return ((cp[h >> 2] & mask) == mask);
	case 1:
		mask = 0x01 << (h & 0x7);
		return ((cp[h >> 3] & mask) == mask);
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "isblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* !_KERNEL */
		return (0);
	}
}

/*
 * Take a block out of the map
 */
void
clrblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 ||
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	switch ((int)fs->fs_frag) {
	case 8:
		cp[h] = 0;
		return;
	case 4:
		cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
		return;
	case 2:
		cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
		return;
	case 1:
		cp[h >> 3] &= ~(0x01 << (h & 0x7));
		return;
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "clrblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* !_KERNEL */
		return;
	}
}

/*
 * Is block allocated?
 */
int
isclrblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	uchar_t	mask;
	int	frag;

	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	frag = fs->fs_frag;
	ASSERT(frag == 8 || frag == 4 || frag == 2 || frag == 1);
	switch (frag) {
	case 8:
		return (cp[h] == 0);
	case 4:
		mask = ~(0x0f << ((h & 0x1) << 2));
		return (cp[h >> 1] == (cp[h >> 1] & mask));
	case 2:
		mask = ~(0x03 << ((h & 0x3) << 1));
		return (cp[h >> 2] == (cp[h >> 2] & mask));
	case 1:
		mask = ~(0x01 << (h & 0x7));
		return (cp[h >> 3] == (cp[h >> 3] & mask));
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "isclrblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* !_KERNEL */
		break;
	}
	return (0);
}

/*
 * Put a block into the map
 */
void
setblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 ||
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	switch ((int)fs->fs_frag) {
	case 8:
		cp[h] = 0xff;
		return;
	case 4:
		cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
		return;
	case 2:
		cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
		return;
	case 1:
		cp[h >> 3] |= (0x01 << (h & 0x7));
		return;
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "setblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* !_KERNEL */
		return;
	}
}

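/*
 * Skip past a leading run of bytes equal to `c': returns the number of
 * bytes remaining in the buffer counting from the first mismatch, or 0
 * if all `len' bytes match.
 */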
int
skpc(char c, uint_t len, char *cp)
{
	if (len == 0)
		return (0);
	while (*cp++ == c && --len)
		;
	return (len);
}