xref: /illumos-gate/usr/src/uts/common/fs/ufs/ufs_subr.c (revision 15c07adc1c7b828006b5e3c4d528b92229d6bd23)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/fs/ufs_fs.h>
#include <sys/cmn_err.h>

#ifdef _KERNEL

#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/user.h>
#include <sys/var.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/debug.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/kmem.h>
#include <sys/policy.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/pvn.h>
#include <vm/seg_map.h>
#include <sys/swap.h>
#include <vm/seg_kmem.h>

#else  /* _KERNEL */

#define	ASSERT(x)		/* don't use asserts for fsck et al */

#endif  /* _KERNEL */

#ifdef _KERNEL

/*
 * Used to verify that a given entry on the ufs_instances list (see below)
 * still refers to a mounted file system.
 *
 * XXX:	This is a crock that substitutes for proper locking to coordinate
 *	updates to and uses of the entries in ufs_instances.
 */
struct check_node {
	struct vfs *vfsp;
	struct ufsvfs *ufsvfs;
	dev_t vfs_dev;
};
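
/*
 * A check_node is a snapshot taken while ufs_update walks ufs_instances:
 * it records the vfs pointer, the ufsvfs pointer, and the device at scan
 * time.  still_mounted() later re-verifies all three against the current
 * ufs_instances list before the entry is trusted again.
 */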

static vfs_t *still_mounted(struct check_node *);

/*
 * All ufs file system instances are linked together into a list starting at
 * ufs_instances.  The list is updated as part of mount and unmount.  It's
 * consulted in ufs_update, to allow syncing out all ufs file system instances
 * in a batch.
 *
 * ufsvfs_mutex guards access to this list and to the {,old}ufsvfslist
 * manipulated in ufs_funmount_cleanup.  (A given ufs instance is always on
 * exactly one of these lists except while it's being allocated or
 * deallocated.)
 */
struct ufsvfs	*ufs_instances;
extern kmutex_t		ufsvfs_mutex;	/* XXX: move this to ufs_inode.h? */

/*
 * ufsvfs list manipulation routines
 */

/*
 * Link ufsp in at the head of the list of ufs_instances.
 */
void
ufs_vfs_add(struct ufsvfs *ufsp)
{
	mutex_enter(&ufsvfs_mutex);
	ufsp->vfs_next = ufs_instances;
	ufs_instances = ufsp;
	mutex_exit(&ufsvfs_mutex);
}

/*
 * Remove ufsp from the list of ufs_instances.
 *
 * Does no error checking; ufsp is assumed to actually be on the list.
 */
void
ufs_vfs_remove(struct ufsvfs *ufsp)
{
	struct ufsvfs	**delpt = &ufs_instances;

	mutex_enter(&ufsvfs_mutex);
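	/*
	 * Walk the list using a pointer to the link field itself, so the
	 * head pointer and interior vfs_next fields can be rewritten with
	 * the same assignment when the match is found.
	 */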
	for (; *delpt != NULL; delpt = &((*delpt)->vfs_next)) {
		if (*delpt == ufsp) {
			*delpt = ufsp->vfs_next;
			ufsp->vfs_next = NULL;
			break;
		}
	}
	mutex_exit(&ufsvfs_mutex);
}

/*
 * Clean up state resulting from a forcible unmount that couldn't be handled
 * directly during the unmount.  (See commentary in the unmount code for more
 * info.)
 */
static void
ufs_funmount_cleanup(void)
{
	struct ufsvfs		*ufsvfsp;
	extern struct ufsvfs	*oldufsvfslist, *ufsvfslist;

	/*
	 * Assumption: it's now safe to blow away the entries on
	 * oldufsvfslist.
	 */
	mutex_enter(&ufsvfs_mutex);
	while ((ufsvfsp = oldufsvfslist) != NULL) {
		oldufsvfslist = ufsvfsp->vfs_next;

		mutex_destroy(&ufsvfsp->vfs_lock);
		kmem_free(ufsvfsp, sizeof (struct ufsvfs));
	}
	/*
	 * Rotate more recent unmount entries into place in preparation for
	 * the next time around.
	 */
	oldufsvfslist = ufsvfslist;
	ufsvfslist = NULL;
	mutex_exit(&ufsvfs_mutex);
}


/*
 * ufs_update performs the ufs part of `sync'.  It goes through the disk
 * queues to initiate sandbagged IO; goes through the inodes to write
 * modified inodes; and it goes through the mount table to initiate
 * the writing of the modified super blocks.
 */
extern time_t	time;
time_t		ufs_sync_time;
time_t		ufs_sync_time_secs = 1;

extern kmutex_t	ufs_scan_lock;

void
ufs_update(int flag)
{
	struct vfs *vfsp;
	struct fs *fs;
	struct ufsvfs *ufsp;
	struct ufsvfs *ufsnext;
	struct ufsvfs *update_list = NULL;
	int check_cnt = 0;
	size_t check_size;
	struct check_node *check_list, *ptr;
	int cheap = flag & SYNC_ATTR;

	/*
	 * This is a hack.  A design flaw in the forced unmount protocol
	 * could allow a thread to attempt to use a kmem_freed ufsvfs
	 * structure in ufs_lockfs_begin/ufs_check_lockfs.  This window
	 * is difficult to hit, even during the lockfs stress tests.
	 * So the hacky fix is to wait awhile before kmem_free'ing the
	 * ufsvfs structures for forcibly unmounted file systems.  `Awhile'
	 * is defined as every other call from fsflush (~60 seconds).
	 */
	if (cheap)
		ufs_funmount_cleanup();

	/*
	 * Examine all ufsvfs structures and add those that we can lock to the
	 * update list.  This is so that we don't hold the list lock for a
	 * long time.  If vfs_lock fails for a file system instance, then skip
	 * it because somebody is doing an unmount on it.
	 */
	mutex_enter(&ufsvfs_mutex);
	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
		vfsp = ufsp->vfs_vfs;
		if (vfs_lock(vfsp) != 0)
			continue;
		ufsp->vfs_wnext = update_list;
		update_list = ufsp;
		check_cnt++;
	}
	mutex_exit(&ufsvfs_mutex);

	if (update_list == NULL)
		return;

	check_size = sizeof (struct check_node) * check_cnt;
	check_list = ptr = kmem_alloc(check_size, KM_NOSLEEP);
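	/*
	 * The allocation is KM_NOSLEEP, so check_list may be NULL here.
	 * That is tolerated: the STABLE check pass below is simply skipped
	 * (see the check_list tests further down), and only the superblock
	 * and inode sync work is done on this call.
	 */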

	/*
	 * Write back modified superblocks.
	 * Consistency check that the superblock of
	 * each file system is still in the buffer cache.
	 *
	 * Note that the update_list traversal is done without the protection
	 * of an overall list lock, so it's necessary to rely on the fact that
	 * each entry of the list is vfs_locked when moving from one entry to
	 * the next.  This works because a concurrent attempt to add an entry
	 * to another thread's update_list won't find it, since it'll already
	 * be locked.
	 */
	check_cnt = 0;
	for (ufsp = update_list; ufsp != NULL; ufsp = ufsnext) {
		/*
		 * Need to grab the next ptr before we unlock this one so
		 * another thread doesn't grab it and change it before we move
		 * on to the next vfs.  (Once we unlock it, it's ok if another
		 * thread finds it to add it to its own update_list; we don't
		 * attempt to refer to it through our list any more.)
		 */
		ufsnext = ufsp->vfs_wnext;
		vfsp = ufsp->vfs_vfs;

		/*
		 * Seems like this can't happen, so perhaps it should become
		 * an ASSERT(vfsp->vfs_data != NULL).
		 */
		if (!vfsp->vfs_data) {
			vfs_unlock(vfsp);
			continue;
		}

		fs = ufsp->vfs_fs;

		/*
		 * don't update a locked superblock during a panic; it
		 * may be in an inconsistent state
		 */
		if (panicstr) {
			if (!mutex_tryenter(&ufsp->vfs_lock)) {
				vfs_unlock(vfsp);
				continue;
			}
		} else
			mutex_enter(&ufsp->vfs_lock);
		/*
		 * Build up the STABLE check list, so we can unlock the vfs
		 * until we do the actual checking.
		 */
		if (check_list != NULL) {
			if ((fs->fs_ronly == 0) &&
			    (fs->fs_clean != FSBAD) &&
			    (fs->fs_clean != FSSUSPEND)) {
				ptr->vfsp = vfsp;
				ptr->ufsvfs = ufsp;
				ptr->vfs_dev = vfsp->vfs_dev;
				ptr++;
				check_cnt++;
			}
		}

		/*
		 * superblock is not modified
		 */
		if (fs->fs_fmod == 0) {
			mutex_exit(&ufsp->vfs_lock);
			vfs_unlock(vfsp);
			continue;
		}
		if (fs->fs_ronly != 0) {
			mutex_exit(&ufsp->vfs_lock);
			vfs_unlock(vfsp);
			(void) ufs_fault(ufsp->vfs_root,
			    "fs = %s update: ro fs mod\n", fs->fs_fsmnt);
			/*
			 * XXX:	Why is this a return instead of a continue?
			 *	This may be an attempt to replace a panic with
			 *	something less drastic, but there's cleanup we
			 *	should be doing that's not being done (e.g.,
			 *	unlocking the remaining entries on the list).
			 */
			return;
		}
		fs->fs_fmod = 0;
		mutex_exit(&ufsp->vfs_lock);
		TRANS_SBUPDATE(ufsp, vfsp, TOP_SBUPDATE_UPDATE);
		vfs_unlock(vfsp);
	}

	ufs_sync_time = time;

	/*
	 * Avoid racing with ufs_unmount() and ufs_sync().
	 */
	mutex_enter(&ufs_scan_lock);

	(void) ufs_scan_inodes(1, ufs_sync_inode, (void *)(uintptr_t)cheap,
	    NULL);

	mutex_exit(&ufs_scan_lock);

	/*
	 * Force stale buffer cache information to be flushed,
	 * for all devices.  This should cause any remaining control
	 * information (e.g., cg and inode info) to be flushed back.
	 */
	bflush((dev_t)NODEV);

	if (check_list == NULL)
		return;

	/*
	 * For each UFS filesystem in the STABLE check_list, update
	 * the clean flag if warranted.
	 */
	for (ptr = check_list; check_cnt > 0; check_cnt--, ptr++) {
		int	error;

		/*
		 * still_mounted() returns with vfsp and the vfs_reflock
		 * held if ptr refers to a vfs that is still mounted.
		 */
		if ((vfsp = still_mounted(ptr)) == NULL)
			continue;
		ufs_checkclean(vfsp);
		/*
		 * commit any outstanding async transactions
		 */
		ufsp = (struct ufsvfs *)vfsp->vfs_data;
		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_SYNC(ufsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE,
		    error);
		if (!error) {
			TRANS_END_SYNC(ufsp, error, TOP_COMMIT_UPDATE,
			    TOP_COMMIT_SIZE);
		}
		curthread->t_flag &= ~T_DONTBLOCK;

		vfs_unlock(vfsp);
	}

	kmem_free(check_list, check_size);
}

int
ufs_sync_inode(struct inode *ip, void *arg)
{
	int cheap = (int)(uintptr_t)arg;
	struct ufsvfs *ufsvfsp;
	uint_t flag = ip->i_flag;

	if (cheap && ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) == 0))
		return (0);

	/*
	 * if we are panic'ing, then don't update the inode if this
	 * file system is FSSTABLE.  Otherwise, we would have to
	 * force the superblock to FSACTIVE and the superblock
	 * may not be in a good state.  Also, if the inode is
	 * IREF'ed then it may be in an inconsistent state.  Don't
	 * push it.  Finally, don't push the inode if the fs is
	 * logging; the transaction will be discarded at boot.
	 */
	if (panicstr) {

		if (flag & IREF)
			return (0);

		if (ip->i_ufsvfs == NULL ||
		    (ip->i_fs->fs_clean == FSSTABLE ||
		    ip->i_fs->fs_clean == FSLOG))
			return (0);
	}

	ufsvfsp = ip->i_ufsvfs;

	/*
	 * Limit access time only updates
	 */
	if (((flag & (IMOD|IMODACC|IUPD|ICHG|IACC)) == IMODACC) && ufsvfsp) {
		/*
		 * if file system has deferred access time turned on and there
		 * was no IO recently, don't bother flushing it. It will be
		 * flushed when I/Os start again.
		 */
		if (cheap && (ufsvfsp->vfs_dfritime & UFS_DFRATIME) &&
		    (ufsvfsp->vfs_iotstamp + ufs_iowait < ddi_get_lbolt()))
			return (0);
		/*
		 * an app issuing a sync() can take forever on a trans device
		 * when NetWorker or find is running because all of the
		 * directories' access times have to be updated. So, we limit
		 * the time we spend updating access times per sync.
		 */
		if (TRANS_ISTRANS(ufsvfsp) && ((ufs_sync_time +
		    ufs_sync_time_secs) < time))
			return (0);
	}

	/*
	 * if we are running on behalf of the flush thread or this is
	 * a swap file, then simply do a delayed update of the inode.
	 * Otherwise, push the pages and then do a delayed inode update.
	 */
	if (cheap || IS_SWAPVP(ITOV(ip))) {
		TRANS_IUPDAT(ip, 0);
	} else {
		(void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_SYNC);
	}
	return (0);
}

/*
 * Flush all the pages associated with an inode using the given 'flags',
 * then force inode information to be written back using the given 'waitfor'.
 */
int
ufs_syncip(struct inode *ip, int flags, int waitfor, top_t topid)
{
	int	error;
	struct vnode *vp = ITOV(ip);
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	int dotrans = 0;

	/*
	 * Return if file system has been forcibly unmounted.
	 */
	if (ufsvfsp == NULL)
		return (EIO);
	/*
	 * don't need to VOP_PUTPAGE if there are no pages
	 */
	if (!vn_has_cached_data(vp) || vp->v_type == VCHR) {
		error = 0;
	} else {
		/*
		 * if the inode we're working on is a shadow inode
		 * or quota inode we need to make sure that the
		 * ufs_putpage call is inside a transaction as this
		 * could include meta data changes.
		 */
		if ((ip->i_mode & IFMT) == IFSHAD ||
		    ufsvfsp->vfs_qinod == ip) {
			dotrans = 1;
			curthread->t_flag |= T_DONTBLOCK;
			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE,
			    TOP_PUTPAGE_SIZE(ip));
		}
		error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
		    flags, CRED(), NULL);
		if (dotrans) {
			TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE,
			    TOP_PUTPAGE_SIZE(ip));
			curthread->t_flag &= ~T_DONTBLOCK;
			dotrans = 0;
		}
	}
	if (panicstr && TRANS_ISTRANS(ufsvfsp))
		goto out;
	/*
	 * waitfor represents two things -
	 * 1. whether data sync or file sync.
	 * 2. if file sync then ufs_iupdat should 'waitfor' disk i/o or not.
	 */
	if (waitfor == I_DSYNC) {
		/*
		 * If data sync, only IATTCHG (size/block change) requires
		 * inode update, fdatasync()/FDSYNC implementation.
		 */
		if (ip->i_flag & (IBDWRITE|IATTCHG)) {
			/*
			 * Enter a transaction to provide mutual exclusion
			 * with deltamap_push and avoid a race where
			 * the inode flush could get dropped.
			 */
			if ((curthread->t_flag & T_DONTBLOCK) == 0) {
				dotrans = 1;
				curthread->t_flag |= T_DONTBLOCK;
				TRANS_BEGIN_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
			}
			rw_enter(&ip->i_contents, RW_READER);
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IMODTIME;
			mutex_exit(&ip->i_tlock);
			ufs_iupdat(ip, 1);
			rw_exit(&ip->i_contents);
			if (dotrans) {
				TRANS_END_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
				curthread->t_flag &= ~T_DONTBLOCK;
			}
		}
	} else {
		/* For file sync, any inode change requires inode update */
		if (ip->i_flag & (IBDWRITE|IUPD|IACC|ICHG|IMOD|IMODACC)) {
			/*
			 * Enter a transaction to provide mutual exclusion
			 * with deltamap_push and avoid a race where
			 * the inode flush could get dropped.
			 */
			if ((curthread->t_flag & T_DONTBLOCK) == 0) {
				dotrans = 1;
				curthread->t_flag |= T_DONTBLOCK;
				TRANS_BEGIN_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
			}
			rw_enter(&ip->i_contents, RW_READER);
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IMODTIME;
			mutex_exit(&ip->i_tlock);
			ufs_iupdat(ip, waitfor);
			rw_exit(&ip->i_contents);
			if (dotrans) {
				TRANS_END_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
				curthread->t_flag &= ~T_DONTBLOCK;
			}
		}
	}

out:
	return (error);
}

/*
 * Flush all indirect blocks related to an inode.
 * Supports triple indirect blocks also.
 */
int
ufs_sync_indir(struct inode *ip)
{
	int i;
	daddr_t blkno;
	daddr_t lbn;	/* logical blkno of last blk in file */
	daddr_t clbn;	/* current logical blk */
	daddr32_t *bap;
	struct fs *fs;
	struct buf *bp;
	int bsize;
	struct ufsvfs *ufsvfsp;
	int j;
	daddr_t indirect_blkno;
	daddr32_t *indirect_bap;
	struct buf *indirect_bp;

	ufsvfsp = ip->i_ufsvfs;
	/*
	 * unnecessary when logging; allocation blocks are kept up-to-date
	 */
	if (TRANS_ISTRANS(ufsvfsp))
		return (0);

	fs = ufsvfsp->vfs_fs;
	bsize = fs->fs_bsize;
	lbn = (daddr_t)lblkno(fs, ip->i_size - 1);
	if (lbn < NDADDR)
		return (0);	/* No indirect blocks used */
	if (lbn < NDADDR + NINDIR(fs)) {
		/* File has one indirect block. */
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, ip->i_ib[0]));
		return (0);
	}
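	/*
	 * Beyond this point the file spans at least a double indirect
	 * block.  Each single indirect block maps NINDIR(fs) logical
	 * blocks; with an 8K block size, for example, NINDIR(fs) is
	 * 8192 / sizeof (daddr32_t) == 2048 entries.
	 */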

	/* Write out all the first level indirect blocks */
	for (i = 0; i < NIADDR; i++) {
		if ((blkno = ip->i_ib[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
	}
	/* Write out second level of indirect blocks */
	if ((blkno = ip->i_ib[1]) == 0)
		return (0);
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	bap = bp->b_un.b_daddr;
	clbn = NDADDR + NINDIR(fs);
	for (i = 0; i < NINDIR(fs); i++) {
		if (clbn > lbn)
			break;
		clbn += NINDIR(fs);
		if ((blkno = bap[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
	}

	brelse(bp);

	/* write out third level indirect blocks */
	if ((blkno = ip->i_ib[2]) == 0)
		return (0);

	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	bap = bp->b_un.b_daddr;
	clbn = NDADDR + NINDIR(fs) + (NINDIR(fs) * NINDIR(fs));

	for (i = 0; i < NINDIR(fs); i++) {
		if (clbn > lbn)
			break;
		if ((indirect_blkno = bap[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, indirect_blkno));
		indirect_bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fs, indirect_blkno), bsize);
		if (indirect_bp->b_flags & B_ERROR) {
			brelse(indirect_bp);
			brelse(bp);
			return (EIO);
		}
		indirect_bap = indirect_bp->b_un.b_daddr;
		for (j = 0; j < NINDIR(fs); j++) {
			if (clbn > lbn)
				break;
			clbn += NINDIR(fs);
			if ((blkno = indirect_bap[j]) == 0)
				continue;
			blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
		}
		brelse(indirect_bp);
	}
	brelse(bp);

	return (0);
}

/*
 * Flush all indirect blocks related to an offset of a file.
 * Read/write in sync mode may have to flush indirect blocks.
 */
int
ufs_indirblk_sync(struct inode *ip, offset_t off)
{
	daddr_t	lbn;
	struct	fs *fs;
	struct	buf *bp;
	int	i, j, shft;
	daddr_t	ob, nb, tbn;
	daddr32_t *bap;
	int	nindirshift, nindiroffset;
	struct ufsvfs *ufsvfsp;

	ufsvfsp = ip->i_ufsvfs;
	/*
	 * unnecessary when logging; allocation blocks are kept up-to-date
	 */
	if (TRANS_ISTRANS(ufsvfsp))
		return (0);

	fs = ufsvfsp->vfs_fs;

	lbn = (daddr_t)lblkno(fs, off);
	if (lbn < 0)
		return (EFBIG);

	/* The first NDADDR blocks are direct blocks, so nothing to do */
	if (lbn < NDADDR)
		return (0);

	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;

	/* Determine level of indirect blocks */
	shft = 0;
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t	sh;

		shft += nindirshift;
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= (daddr_t)sh;
	}
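	/*
	 * When the loop breaks, 1 << shft is the span of the whole indirect
	 * tree that lbn falls in and tbn is the offset within that tree;
	 * j == NIADDR means the single indirect block suffices.  For
	 * example, with nindirshift == 11 (2048 entries per indirect
	 * block), tbn < 2048 selects the single indirect level and larger
	 * offsets fall through to the deeper levels.
	 */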

	if (j == 0)
		return (EFBIG);

	if ((nb = ip->i_ib[NIADDR - j]) == 0)
		return (0);		/* UFS Hole */

	/* Flush first level indirect block */
	blkflush(ip->i_dev, fsbtodb(fs, nb));

	/* Fetch through next levels */
	for (; j < NIADDR; j++) {
		ob = nb;
		bp = UFS_BREAD(ufsvfsp,
		    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			brelse(bp);
			return (EIO);
		}
		bap = bp->b_un.b_daddr;
		shft -= nindirshift;		/* sh / nindir */
		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) & nindir */
		nb = bap[i];
		brelse(bp);
		if (nb == 0) {
			return (0);		/* UFS hole */
		}
		blkflush(ip->i_dev, fsbtodb(fs, nb));
	}
	return (0);
}

#ifdef DEBUG

/*
 * The bad block checking routines ufs_indir_badblock() and ufs_badblock()
 * are very expensive.  Profiling has shown that we spend 6-7% of our time
 * in ufs_badblock, and another 1-2% in ufs_indir_badblock.  They are only
 * called via ASSERTs (from debug kernels).  In addition, no failures have
 * been found in recent years.  So the following tunable can be set to
 * enable checking.
 */
int ufs_badblock_checks = 0;

/*
 * Check that a given indirect block contains blocks in range
 */
int
ufs_indir_badblock(struct inode *ip, daddr32_t *bap)
{
	int i;
	int err = 0;

	if (ufs_badblock_checks) {
		for (i = 0; i < NINDIR(ip->i_fs) - 1; i++)
			if (bap[i] != 0 && (err = ufs_badblock(ip, bap[i])))
				break;
	}
	return (err);
}

/*
 * Check that a specified block number is in range.
 */
int
ufs_badblock(struct inode *ip, daddr_t bn)
{
	long	c;
	daddr_t	sum;

	if (!ufs_badblock_checks)
		return (0);
	ASSERT(bn);
	if (bn <= 0 || bn > ip->i_fs->fs_size)
		return (bn);

	sum = 0;
	c = dtog(ip->i_fs, bn);
	if (c == 0) {
		sum = howmany(ip->i_fs->fs_cssize, ip->i_fs->fs_fsize);
	}
	/*
	 * The block number is invalid if it lies below this cylinder group,
	 * within the space reserved for the superblock, inodes, and summary
	 * data, or above this cylinder group.
	 * It's hard to see how we'd be outside this cyl, but let's be careful.
	 */
	if ((bn < cgbase(ip->i_fs, c)) ||
	    (bn >= cgsblock(ip->i_fs, c) && bn < cgdmin(ip->i_fs, c)+sum) ||
	    (bn >= (unsigned)cgbase(ip->i_fs, c+1)))
		return (bn);

	return (0);	/* not a bad block */
}

#endif /* DEBUG */

/*
 * When i_rwlock is write-locked or has a writer pending, the inode
 * is going to change in a way that marks the filesystem as active,
 * so there is no need to mark the filesystem as stable now.
 * Also, to ensure filesystem consistency during directory
 * operations, the filesystem cannot be marked as stable if the
 * i_rwlock of the directory inode is write-locked.
 */

/*
 * Check for busy inodes for this filesystem.
 * NOTE: A better way to do this expensive operation is needed in the future.
 */
static void
ufs_icheck(struct ufsvfs *ufsvfsp, int *isbusyp, int *isreclaimp)
{
	union  ihead	*ih;
	struct inode	*ip;
	int		i;
	int		isnottrans	= !TRANS_ISTRANS(ufsvfsp);
	int		isbusy		= *isbusyp;
	int		isreclaim	= *isreclaimp;

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0];
		    ip != (struct inode *)ih;
		    ip = ip->i_forw) {
			/*
			 * if inode is busy/modified/deleted, filesystem is busy
			 */
			if (ip->i_ufsvfs != ufsvfsp)
				continue;
			if ((ip->i_flag & (IMOD | IUPD | ICHG)) ||
			    (RW_ISWRITER(&ip->i_rwlock)))
				isbusy = 1;
			if ((ip->i_nlink <= 0) && (ip->i_flag & IREF))
				isreclaim = 1;
			if (isbusy && (isreclaim || isnottrans))
				break;
		}
		mutex_exit(&ih_lock[i]);
		if (isbusy && (isreclaim || isnottrans))
			break;
	}
	*isbusyp = isbusy;
	*isreclaimp = isreclaim;
}

/*
 * As part of the ufs 'sync' operation, this routine is called to mark
 * the filesystem as STABLE if there is no modified metadata in memory.
 */
void
ufs_checkclean(struct vfs *vfsp)
{
	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
	struct fs	*fs		= ufsvfsp->vfs_fs;
	int		isbusy;
	int		isreclaim;
	int		updatesb;

	ASSERT(vfs_lock_held(vfsp));

	/*
	 * filesystem is stable or cleanflag processing is disabled; do nothing
	 *	no transitions when panic'ing
	 */
	if (fs->fs_ronly ||
	    fs->fs_clean == FSBAD ||
	    fs->fs_clean == FSSUSPEND ||
	    fs->fs_clean == FSSTABLE ||
	    panicstr)
		return;

	/*
	 * if logging and nothing to reclaim; do nothing
	 */
	if ((fs->fs_clean == FSLOG) &&
	    (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
	    (fs->fs_reclaim & FS_RECLAIMING)))
		return;

	/*
	 * FS_CHECKCLEAN is reset if the file system goes dirty
	 * FS_CHECKRECLAIM is reset if a file gets deleted
	 */
	mutex_enter(&ufsvfsp->vfs_lock);
	fs->fs_reclaim |= (FS_CHECKCLEAN | FS_CHECKRECLAIM);
	mutex_exit(&ufsvfsp->vfs_lock);

	updatesb = 0;

	/*
	 * if logging or buffers are busy; do nothing
	 */
	isbusy = isreclaim = 0;
	if ((fs->fs_clean == FSLOG) ||
	    (bcheck(vfsp->vfs_dev, ufsvfsp->vfs_bufp)))
		isbusy = 1;

	/*
	 * isreclaim == TRUE means can't change the state of fs_reclaim
	 */
	isreclaim =
	    ((fs->fs_clean == FSLOG) &&
	    (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
	    (fs->fs_reclaim & FS_RECLAIMING)));

	/*
	 * if fs is busy or can't change the state of fs_reclaim; do nothing
	 */
	if (isbusy && isreclaim)
		return;

	/*
	 * look for busy or deleted inodes; (deleted == needs reclaim)
	 */
	ufs_icheck(ufsvfsp, &isbusy, &isreclaim);

	mutex_enter(&ufsvfsp->vfs_lock);

	/*
	 * IF POSSIBLE, RESET RECLAIM
	 */
	/*
	 * the reclaim thread is not running
	 */
	if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
		/*
		 * no files were deleted during the scan
		 */
		if (fs->fs_reclaim & FS_CHECKRECLAIM)
			/*
			 * no deleted files were found in the inode cache
			 */
			if ((isreclaim == 0) && (fs->fs_reclaim & FS_RECLAIM)) {
				fs->fs_reclaim &= ~FS_RECLAIM;
				updatesb = 1;
			}
	/*
	 * IF POSSIBLE, SET STABLE
	 */
	/*
	 * not logging
	 */
	if (fs->fs_clean != FSLOG)
		/*
		 * file system has not gone dirty since the scan began
		 */
		if (fs->fs_reclaim & FS_CHECKCLEAN)
			/*
			 * nothing dirty was found in the buffer or inode cache
			 */
			if ((isbusy == 0) && (isreclaim == 0) &&
			    (fs->fs_clean != FSSTABLE)) {
				fs->fs_clean = FSSTABLE;
				updatesb = 1;
			}

	mutex_exit(&ufsvfsp->vfs_lock);
	if (updatesb) {
		TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
	}
}

/*
 * called whenever an unlink occurs
 */
void
ufs_setreclaim(struct inode *ip)
{
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
	struct fs	*fs		= ufsvfsp->vfs_fs;

	if (ip->i_nlink || fs->fs_ronly || (fs->fs_clean != FSLOG))
		return;

	/*
	 * reclaim-needed bit is already set or we need to tell
	 * ufs_checkclean that a file has been deleted
	 */
	if ((fs->fs_reclaim & (FS_RECLAIM | FS_CHECKRECLAIM)) == FS_RECLAIM)
		return;

	mutex_enter(&ufsvfsp->vfs_lock);
	/*
	 * inform ufs_checkclean that the file system has gone dirty
	 */
	fs->fs_reclaim &= ~FS_CHECKRECLAIM;

	/*
	 * set the reclaim-needed bit
	 */
	if ((fs->fs_reclaim & FS_RECLAIM) == 0) {
		fs->fs_reclaim |= FS_RECLAIM;
		ufs_sbwrite(ufsvfsp);
	}
	mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * Before any modified metadata is written back to the disk, this routine
 * is called to mark the filesystem as ACTIVE.
 */
void
ufs_notclean(struct ufsvfs *ufsvfsp)
{
	struct fs *fs = ufsvfsp->vfs_fs;

	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
	ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));

	/*
	 * inform ufs_checkclean that the file system has gone dirty
	 */
	fs->fs_reclaim &= ~FS_CHECKCLEAN;

	/*
	 * ignore if active or bad or suspended or readonly or logging
	 */
	if ((fs->fs_clean == FSACTIVE) || (fs->fs_clean == FSLOG) ||
	    (fs->fs_clean == FSBAD) || (fs->fs_clean == FSSUSPEND) ||
	    (fs->fs_ronly)) {
		mutex_exit(&ufsvfsp->vfs_lock);
		return;
	}
	fs->fs_clean = FSACTIVE;
	/*
	 * write superblock synchronously
	 */
	ufs_sbwrite(ufsvfsp);
	mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * ufs specific fbwrite()
 */
int
ufs_fbwrite(struct fbuf *fbp, struct inode *ip)
{
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;

	if (TRANS_ISTRANS(ufsvfsp))
		return (fbwrite(fbp));
	mutex_enter(&ufsvfsp->vfs_lock);
	ufs_notclean(ufsvfsp);
	return ((ufsvfsp->vfs_dio) ? fbdwrite(fbp) : fbwrite(fbp));
}

/*
 * ufs specific fbiwrite()
 */
int
ufs_fbiwrite(struct fbuf *fbp, struct inode *ip, daddr_t bn, long bsize)
{
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
	o_mode_t	ifmt		= ip->i_mode & IFMT;
	buf_t		*bp;
	int		error;

	mutex_enter(&ufsvfsp->vfs_lock);
	ufs_notclean(ufsvfsp);
	if (ifmt == IFDIR || ifmt == IFSHAD || ifmt == IFATTRDIR ||
	    (ip->i_ufsvfs->vfs_qinod == ip)) {
		TRANS_DELTA(ufsvfsp, ldbtob(bn * (offset_t)(btod(bsize))),
		    fbp->fb_count, DT_FBI, 0, 0);
	}
	/*
	 * Inlined version of fbiwrite()
	 */
	bp = pageio_setup((struct page *)NULL, fbp->fb_count,
	    ip->i_devvp, B_WRITE);
	bp->b_flags &= ~B_PAGEIO;
	bp->b_un.b_addr = fbp->fb_addr;

	bp->b_blkno = bn * btod(bsize);
	bp->b_dev = cmpdev(ip->i_dev);	/* store in old dev format */
	bp->b_edev = ip->i_dev;
	bp->b_proc = NULL;			/* i.e. the kernel */
	bp->b_file = ip->i_vnode;
	bp->b_offset = -1;

	if (ufsvfsp->vfs_log) {
		lufs_write_strategy(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
		ub.ub_fbiwrites.value.ul++;
		(void) bdev_strategy(bp);
		lwp_stat_update(LWP_STAT_OUBLK, 1);
	}
	error = biowait(bp);
	pageio_done(bp);
	fbrelse(fbp, S_OTHER);
	return (error);
}

/*
 * Write the ufs superblock only.
 */
void
ufs_sbwrite(struct ufsvfs *ufsvfsp)
{
	char sav_fs_fmod;
	struct fs *fs = ufsvfsp->vfs_fs;
	struct buf *bp = ufsvfsp->vfs_bufp;

	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));

	/*
	 * for ulockfs processing, limit the superblock writes
	 */
	if ((ufsvfsp->vfs_ulockfs.ul_sbowner) &&
	    (curthread != ufsvfsp->vfs_ulockfs.ul_sbowner)) {
		/* try again later */
		fs->fs_fmod = 1;
		return;
	}

	ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
	/*
	 * Update the superblock timestamp and the fs_clean checksum.
	 * If marked FSBAD, we always want an erroneous checksum to
	 * force repair.
	 */
	fs->fs_time = gethrestime_sec();
	fs->fs_state = (fs->fs_clean != FSBAD) ?
	    FSOKAY - fs->fs_time : -(FSOKAY - fs->fs_time);
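	/*
	 * The validity invariant checked elsewhere is fs_state + fs_time ==
	 * FSOKAY; writing -(FSOKAY - fs_time) for an FSBAD file system
	 * guarantees that check fails and a repair is forced.
	 */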
	switch (fs->fs_clean) {
	case FSCLEAN:
	case FSSTABLE:
		fs->fs_reclaim &= ~FS_RECLAIM;
		break;
	case FSACTIVE:
	case FSSUSPEND:
	case FSBAD:
	case FSLOG:
		break;
	default:
		fs->fs_clean = FSACTIVE;
		break;
	}
	/*
	 * reset incore only bits
	 */
	fs->fs_reclaim &= ~(FS_CHECKCLEAN | FS_CHECKRECLAIM);

	/*
	 * delta the whole superblock
	 */
	TRANS_DELTA(ufsvfsp, ldbtob(SBLOCK), sizeof (struct fs),
	    DT_SB, NULL, 0);
	/*
	 * retain the incore state of fs_fmod; set the ondisk state to 0
	 */
	sav_fs_fmod = fs->fs_fmod;
	fs->fs_fmod = 0;

	/*
	 * Don't release the buffer after it is written to the disk
	 */
	UFS_BWRITE2(ufsvfsp, bp);
	fs->fs_fmod = sav_fs_fmod;	/* reset fs_fmod's incore state */
}

/*
 * Returns the vfs pointer if the vfs is still mounted, with the vfs lock
 * held.  Otherwise, returns NULL.
 *
 * For our purposes, "still mounted" means that the file system still appears
 * on the list of UFS file system instances.
 */
static vfs_t *
still_mounted(struct check_node *checkp)
{
	struct vfs	*vfsp;
	struct ufsvfs	*ufsp;

	mutex_enter(&ufsvfs_mutex);
	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
		if (ufsp != checkp->ufsvfs)
			continue;
		/*
		 * Tentative match:  verify it and try to lock.  (It's not at
		 * all clear how the verification could fail, given that we've
		 * gotten this far.  We would have had to reallocate the
		 * ufsvfs struct at hand for a new incarnation; is that really
		 * possible in the interval from constructing the check_node
		 * to here?)
		 */
		vfsp = ufsp->vfs_vfs;
		if (vfsp != checkp->vfsp)
			continue;
		if (vfsp->vfs_dev != checkp->vfs_dev)
			continue;
		if (vfs_lock(vfsp) != 0)
			continue;

		mutex_exit(&ufsvfs_mutex);
		return (vfsp);
	}
	mutex_exit(&ufsvfs_mutex);
	return (NULL);
}

int
ufs_si_io_done(struct buf *bp)
{
	sema_v(&bp->b_io);
	return (0);
}

#define	SI_BUFSZ roundup(sizeof (struct cg), DEV_BSIZE)
#define	NSIBUF 32
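
/*
 * Summary info is reconstructed by reading the cylinder groups NSIBUF at
 * a time; each staging buffer holds one cg header rounded up to a disk
 * block (SI_BUFSZ bytes).
 */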

/*
 * ufs_construct_si()
 * Read each cylinder group in turn and construct the summary information
 */
static int
ufs_construct_si(dev_t dev, struct fs *fs, struct ufsvfs *ufsvfsp)
{
	buf_t *bps, *bp;
	char *bufs;
	struct csum *sip = fs->fs_u.fs_csp;
	struct cg *cgp;
	int i, ncg;
	int error = 0, cg = 0;

	bps = kmem_alloc(NSIBUF * sizeof (buf_t), KM_SLEEP);
	bufs = kmem_alloc(NSIBUF * SI_BUFSZ, KM_SLEEP);

	/*
	 * Initialise the buffer headers
	 */
	for (bp = bps, i = 0; i < NSIBUF; i++, bp++) {
		bioinit(bp);
		bp->b_iodone = ufs_si_io_done;
		bp->b_bufsize = bp->b_bcount = SI_BUFSZ;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = bufs + (i * SI_BUFSZ);
		bp->b_edev = dev;
	}

	/*
	 * Repeat while there are cylinder groups left to read.
	 */
	do {
		/*
		 * Issue up to NSIBUF asynchronous reads
		 */
		ncg = MIN(NSIBUF, (fs->fs_ncg - cg));
		for (bp = bps, i = 0; i < ncg; i++, bp++) {
			bp->b_blkno = (daddr_t)fsbtodb(fs, cgtod(fs, cg + i));
			if (ufsvfsp->vfs_log) {
				lufs_read_strategy(ufsvfsp->vfs_log, bp);
			} else {
				(void) bdev_strategy(bp);
			}
		}

		/*
		 * wait for each read to finish;
		 * check for errors and copy the csum info
		 */
		for (bp = bps, i = 0; i < ncg; i++, bp++) {
			sema_p(&bp->b_io);
			if (!error) {
				cgp = bp->b_un.b_cg;
				sip[cg + i] = cgp->cg_cs;
				error = geterror(bp);
			}
		}
		if (error) {
			goto err;
		}
		cg += ncg;
	} while (cg < fs->fs_ncg);

err:
	kmem_free(bps, NSIBUF * sizeof (buf_t));
	kmem_free(bufs, NSIBUF * SI_BUFSZ);
	return (error);
}

/*
 * ufs_getsummaryinfo
 */
int
ufs_getsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
	int		i;		/* `for' loop counter */
	ssize_t		size;		/* bytes of summary info to read */
	daddr_t		frags;		/* frags of summary info to read */
	caddr_t		sip;		/* summary info */
	struct buf	*tp;		/* tmp buf */

	/*
	 * maintain metadata map for trans device (debug only)
	 */
	TRANS_MATA_SI(ufsvfsp, fs);

	/*
	 * Compute #frags and allocate space for summary info
	 */
	frags = howmany(fs->fs_cssize, fs->fs_fsize);
	sip = kmem_alloc((size_t)fs->fs_cssize, KM_SLEEP);
	fs->fs_u.fs_csp = (struct csum *)sip;

	if (fs->fs_si == FS_SI_BAD) {
		/*
		 * The summary information is unknown, read it in from
		 * the cylinder groups.
		 */
		if (TRANS_ISTRANS(ufsvfsp) && !TRANS_ISERROR(ufsvfsp) &&
		    ufsvfsp->vfs_log->un_logmap) {
			logmap_roll_dev(ufsvfsp->vfs_log); /* flush the log */
		}
		bzero(sip, (size_t)fs->fs_cssize);
		if (ufs_construct_si(dev, fs, ufsvfsp)) {
			kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
			fs->fs_u.fs_csp = NULL;
			return (EIO);
		}
	} else {
		/* Read summary info a fs block at a time */
		size = fs->fs_bsize;
		for (i = 0; i < frags; i += fs->fs_frag) {
			if (i + fs->fs_frag > frags)
				/*
				 * This happens only on the last iteration,
				 * so don't worry about size being reset
				 */
				size = (frags - i) * fs->fs_fsize;
			tp = UFS_BREAD(ufsvfsp, dev,
			    (daddr_t)fsbtodb(fs, fs->fs_csaddr+i), size);
			tp->b_flags |= B_STALE | B_AGE;
			if (tp->b_flags & B_ERROR) {
				kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
				fs->fs_u.fs_csp = NULL;
				brelse(tp);
				return (EIO);
			}
			bcopy(tp->b_un.b_addr, sip, size);
			sip += size;
			brelse(tp);
		}
	}
	bzero((caddr_t)&fs->fs_cstotal, sizeof (fs->fs_cstotal));
	for (i = 0; i < fs->fs_ncg; ++i) {
		fs->fs_cstotal.cs_ndir += fs->fs_cs(fs, i).cs_ndir;
		fs->fs_cstotal.cs_nbfree += fs->fs_cs(fs, i).cs_nbfree;
		fs->fs_cstotal.cs_nifree += fs->fs_cs(fs, i).cs_nifree;
		fs->fs_cstotal.cs_nffree += fs->fs_cs(fs, i).cs_nffree;
	}
	return (0);
}

/*
 * ufs_putsummaryinfo() stores all the cylinder group summary information.
 * This is only used when logging, but the file system may not
 * be logging at the time; e.g., a read-only mount to flush the log
 * may push the summary info out.
 */
int
ufs_putsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
	struct buf	b, *bp;		/* tmp buf */
	caddr_t		sip;		/* summary info */
	ssize_t		size;		/* bytes of summary info to write */
	daddr_t		frags;		/* frags of summary info to write */
	int		i;		/* `for' loop counter */
	int		error;		/* error */

	if (TRANS_ISERROR(ufsvfsp)) {
		return (EIO);
	}

	if ((fs->fs_si != FS_SI_BAD) || !ufsvfsp->vfs_nolog_si) {
		return (0);
	}

	bp = &b;
	bioinit(bp);
	bp->b_iodone = ufs_si_io_done;
	bp->b_bufsize = size = fs->fs_bsize;
	bp->b_flags = B_WRITE;
	bp->b_un.b_addr = kmem_alloc(size, KM_SLEEP);
	bp->b_edev = dev;
	frags = howmany(fs->fs_cssize, fs->fs_fsize);
	sip = (caddr_t)fs->fs_u.fs_csp;

	/* Write summary info one fs block at a time */
	for (error = 0, i = 0; (i < frags) && (error == 0); i += fs->fs_frag) {
		if (i + fs->fs_frag > frags) {
			/*
			 * This happens only on the last iteration, so
			 * don't worry about size being reset
			 */
			size = (frags - i) * fs->fs_fsize;
		}
		bcopy(sip, bp->b_un.b_addr, size);
		bp->b_blkno = (daddr_t)fsbtodb(fs, fs->fs_csaddr+i);
		bp->b_bcount = size;
		(void) bdev_strategy(bp);
		sema_p(&bp->b_io); /* wait for write to complete */
		error = geterror(bp);
		sip += size;
	}
	kmem_free(bp->b_un.b_addr, fs->fs_bsize);
	if (!error) {
		fs->fs_si = FS_SI_OK;
	}
	return (error);
}

/*
 * Decide whether it is okay to remove within a sticky directory.
 * Two conditions need to be met: first, write access to the directory
 * is needed.  Second, in sticky directories, write access alone is not
 * sufficient; you can remove entries from a directory only if you own
 * the directory, if you are privileged, if you own the entry, or if the
 * entry is a plain file and you have write access to that file.
 * Function returns 0 if remove access is granted.
 * Note, the caller is responsible for holding the i_contents lock
 * at least as reader on the inquired inode 'ip'.
 */
int
ufs_sticky_remove_access(struct inode *dp, struct inode *ip, struct cred *cr)
{
	uid_t uid;

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	if ((dp->i_mode & ISVTX) &&
	    (uid = crgetuid(cr)) != dp->i_uid &&
	    uid != ip->i_uid &&
	    ((ip->i_mode & IFMT) != IFREG ||
	    ufs_iaccess(ip, IWRITE, cr, 0) != 0))
		return (secpolicy_vnode_remove(cr));

	return (0);
}
#endif	/* _KERNEL */

extern	int around[9];
extern	int inside[9];
extern	uchar_t *fragtbl[];

/*
 * Update the frsum fields to reflect addition or deletion
 * of some frags.
 */
void
fragacct(struct fs *fs, int fragmap, int32_t *fraglist, int cnt)
{
	int inblk;
	int field, subfield;
	int siz, pos;

	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
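	/*
	 * fragtbl gives a quick filter of which run sizes can occur in
	 * fragmap; the inner loop below then counts exact runs.  Example:
	 * with fs_frag == 8 and fragmap == 0x3c (a single run of four free
	 * frags bounded by allocated ones), the net effect is that only
	 * fraglist[4] is adjusted by cnt.
	 */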
	inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
	fragmap <<= 1;
	for (siz = 1; siz < fs->fs_frag; siz++) {
		if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
			continue;
		field = around[siz];
		subfield = inside[siz];
		for (pos = siz; pos <= fs->fs_frag; pos++) {
			if ((fragmap & field) == subfield) {
				fraglist[siz] += cnt;
				ASSERT(fraglist[siz] >= 0);
				pos += siz;
				field <<= siz;
				subfield <<= siz;
			}
			field <<= 1;
			subfield <<= 1;
		}
	}
}

/*
 * Block operations
 */

/*
 * Check if a block is available
 */
int
isblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	uchar_t mask;

	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
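	/*
	 * The free-block map stores fs_frag bits per file system block, so
	 * for fs_frag == 4 each byte covers two blocks and (h & 0x1) << 2
	 * selects the high or low nibble; a block is available only if all
	 * of its frag bits are set.
	 */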
	switch ((int)fs->fs_frag) {
	case 8:
		return (cp[h] == 0xff);
	case 4:
		mask = 0x0f << ((h & 0x1) << 2);
		return ((cp[h >> 1] & mask) == mask);
	case 2:
		mask = 0x03 << ((h & 0x3) << 1);
		return ((cp[h >> 2] & mask) == mask);
	case 1:
		mask = 0x01 << (h & 0x7);
		return ((cp[h >> 3] & mask) == mask);
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "isblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		return (0);
	}
}

/*
 * Take a block out of the map
 */
void
clrblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	switch ((int)fs->fs_frag) {
	case 8:
		cp[h] = 0;
		return;
	case 4:
		cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
		return;
	case 2:
		cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
		return;
	case 1:
		cp[h >> 3] &= ~(0x01 << (h & 0x7));
		return;
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "clrblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		return;
	}
}

/*
 * Is block allocated?
 */
int
isclrblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	uchar_t	mask;
	int	frag;
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	frag = fs->fs_frag;
	ASSERT(frag == 8 || frag == 4 || frag == 2 || frag == 1);
	switch (frag) {
	case 8:
		return (cp[h] == 0);
	case 4:
		mask = ~(0x0f << ((h & 0x1) << 2));
		return (cp[h >> 1] == (cp[h >> 1] & mask));
	case 2:
		mask = ~(0x03 << ((h & 0x3) << 1));
		return (cp[h >> 2] == (cp[h >> 2] & mask));
	case 1:
		mask = ~(0x01 << (h & 0x7));
		return (cp[h >> 3] == (cp[h >> 3] & mask));
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "isclrblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		break;
	}
	return (0);
}

/*
 * Put a block into the map
 */
void
setblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	switch ((int)fs->fs_frag) {
	case 8:
		cp[h] = 0xff;
		return;
	case 4:
		cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
		return;
	case 2:
		cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
		return;
	case 1:
		cp[h >> 3] |= (0x01 << (h & 0x7));
		return;
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "setblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		return;
	}
}

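/*
 * Skip over a leading run of the character 'c'.  Returns the count of
 * bytes remaining from the first byte that differs (0 if the entire
 * buffer matched).
 */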
int
skpc(char c, uint_t len, char *cp)
{
	if (len == 0)
		return (0);
	while (*cp++ == c && --len)
		;
	return (len);
}