xref: /titanic_50/usr/src/uts/common/fs/ufs/ufs_lockfs.c (revision 40b706cf0ea54322e61c57be046ae27fde135222)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/t_lock.h>
28 #include <sys/param.h>
29 #include <sys/time.h>
30 #include <sys/systm.h>
31 #include <sys/sysmacros.h>
32 #include <sys/resource.h>
33 #include <sys/signal.h>
34 #include <sys/cred.h>
35 #include <sys/user.h>
36 #include <sys/buf.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/proc.h>
40 #include <sys/disp.h>
41 #include <sys/file.h>
42 #include <sys/fcntl.h>
43 #include <sys/flock.h>
44 #include <sys/atomic.h>
45 #include <sys/kmem.h>
46 #include <sys/uio.h>
47 #include <sys/conf.h>
48 #include <sys/mman.h>
49 #include <sys/pathname.h>
50 #include <sys/debug.h>
51 #include <sys/vmmeter.h>
52 #include <sys/vmsystm.h>
53 #include <sys/cmn_err.h>
54 #include <sys/acct.h>
55 #include <sys/dnlc.h>
56 #include <sys/swap.h>
57 
58 #include <sys/fs/ufs_fs.h>
59 #include <sys/fs/ufs_inode.h>
60 #include <sys/fs/ufs_fsdir.h>
61 #include <sys/fs/ufs_trans.h>
62 #include <sys/fs/ufs_panic.h>
63 #include <sys/fs/ufs_mount.h>
64 #include <sys/fs/ufs_bio.h>
65 #include <sys/fs/ufs_log.h>
66 #include <sys/fs/ufs_quota.h>
67 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
68 #include <sys/errno.h>
69 #include <sys/sysinfo.h>
70 
71 #include <vm/hat.h>
72 #include <vm/pvn.h>
73 #include <vm/as.h>
74 #include <vm/seg.h>
75 #include <vm/seg_map.h>
76 #include <vm/seg_vn.h>
77 #include <vm/rm.h>
78 #include <vm/anon.h>
79 #include <sys/swap.h>
80 #include <sys/dnlc.h>
81 
82 extern struct vnode *common_specvp(struct vnode *vp);
83 
/*
 * error lock status
 *
 * Computed by ufs__fiolfs() from the previous and the requested lock and
 * passed down to the reconciliation routines:
 *	NO_ERRLCK	default; none of the cases below applies
 *	SET_ERRLCK	new lock is an error lock (or read-only error lock)
 *			and the previous lock was not
 *	RE_ERRLCK	new lock is an error lock (or read-only error lock)
 *			and so was the previous lock
 *	UN_ERRLCK	previous lock was an error lock and the new lock is
 *			an unlock
 */
#define	UN_ERRLCK	(-1)
#define	SET_ERRLCK	1
#define	RE_ERRLCK	2
#define	NO_ERRLCK	0
89 
/*
 * Index to be used in TSD for storing lockfs data
 */
uint_t ufs_lockfs_key;

/*
 * Per-thread lockfs record.  The head of a thread's list is fetched with
 * tsd_get(ufs_lockfs_key); IS_REC_VOP() and SEARCH_ULOCKFSP() walk the list
 * through the next pointer.
 */
typedef struct _ulockfs_info {
	struct _ulockfs_info *next;	/* next record on this list */
	struct ulockfs *ulp;		/* lockfs the record is for; */
					/* IS_REC_VOP() reports an entry */
					/* with a NULL ulp as free */
	uint_t flags;			/* ULOCK_INFO_* bits */
} ulockfs_info_t;

#define	ULOCK_INFO_FALLOCATE	0x00000001	/* fallocate thread */
102 
/*
 * Check in TSD whether we are already doing any VOP on this filesystem.
 *
 *	_found	(out) set to 1 if an entry whose ulp equals _ulp is on the
 *		list, otherwise 0
 *	_head	first ulockfs_info_t of the list to search (may be NULL)
 *	_ulp	the struct ulockfs pointer to look for
 *	_free	(out) first entry with a NULL ulp that was seen before the
 *		search stopped, or NULL if none was seen
 *
 * The macro parameters deliberately do not share a name with any
 * ulockfs_info_t member: a parameter called "ulp" would also be substituted
 * into the "_curr->ulp" member access, which only compiles when the caller's
 * argument happens to be spelled "ulp".  Arguments are parenthesized so that
 * arbitrary expressions may be passed; _ulp is evaluated once per entry.
 */
#define	IS_REC_VOP(_found, _head, _ulp, _free)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for ((_found) = 0, (_free) = NULL,		\
	    _curr = (_head);				\
	    _curr != NULL; _curr = _curr->next) {	\
		if (((_free) == NULL) &&		\
		    (_curr->ulp == NULL))		\
			(_free) = _curr;		\
		if (_curr->ulp == (_ulp)) {		\
			(_found) = 1;			\
			break;				\
		}					\
	}						\
}
121 
/*
 * Get the lockfs data from TSD so that lockfs handles the recursive VOP
 * properly.
 *
 *	_head	first ulockfs_info_t of the list to search (may be NULL)
 *	_ulp	the struct ulockfs pointer to look for
 *	_info	(out) the first entry whose ulp equals _ulp, or NULL when
 *		the list holds no such entry
 *
 * The macro parameters deliberately do not share a name with any
 * ulockfs_info_t member: a parameter called "ulp" would also be substituted
 * into the "_curr->ulp" member access, which only compiles when the caller's
 * argument happens to be spelled "ulp".  Arguments are parenthesized so that
 * arbitrary expressions may be passed; _ulp is evaluated once per entry.
 */
#define	SEARCH_ULOCKFSP(_head, _ulp, _info)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (_curr = (_head); _curr != NULL;		\
	    _curr = _curr->next) {			\
		if (_curr->ulp == (_ulp)) {		\
			break;				\
		}					\
	}						\
							\
	(_info) = _curr;				\
}
139 
140 /*
141  * Validate lockfs request
142  */
143 static int
144 ufs_getlfd(
145 	struct lockfs *lockfsp,		/* new lock request */
146 	struct lockfs *ul_lockfsp)	/* old lock state */
147 {
148 	int	error = 0;
149 
150 	/*
151 	 * no input flags defined
152 	 */
153 	if (lockfsp->lf_flags != 0) {
154 		error = EINVAL;
155 		goto errout;
156 	}
157 
158 	/*
159 	 * check key
160 	 */
161 	if (!LOCKFS_IS_ULOCK(ul_lockfsp))
162 		if (lockfsp->lf_key != ul_lockfsp->lf_key) {
163 			error = EINVAL;
164 			goto errout;
165 	}
166 
167 	lockfsp->lf_key = ul_lockfsp->lf_key + 1;
168 
169 errout:
170 	return (error);
171 }
172 
173 /*
174  * ufs_checkaccton
175  *	check if accounting is turned on on this fs
176  */
177 
178 int
179 ufs_checkaccton(struct vnode *vp)
180 {
181 	if (acct_fs_in_use(vp))
182 		return (EDEADLK);
183 	return (0);
184 }
185 
186 /*
187  * ufs_checkswapon
188  *	check if local swapping is to file on this fs
189  */
190 int
191 ufs_checkswapon(struct vnode *vp)
192 {
193 	struct swapinfo	*sip;
194 
195 	mutex_enter(&swapinfo_lock);
196 	for (sip = swapinfo; sip; sip = sip->si_next)
197 		if (sip->si_vp->v_vfsp == vp->v_vfsp) {
198 			mutex_exit(&swapinfo_lock);
199 			return (EDEADLK);
200 		}
201 	mutex_exit(&swapinfo_lock);
202 	return (0);
203 }
204 
205 /*
206  * ufs_freeze
207  *	pend future accesses for current lock and desired lock
208  */
209 void
210 ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
211 {
212 	/*
213 	 * set to new lock type
214 	 */
215 	ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
216 	ulp->ul_lockfs.lf_key = lockfsp->lf_key;
217 	ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
218 	ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;
219 
220 	ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
221 }
222 
223 /*
224  * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
225  * starting ufs_quiesce() protocol and decrement it only when a file system no
226  * longer has to be in quiescent state. This allows ufs_pageio() to detect
227  * that another thread wants to quiesce a file system. See more comments in
228  * ufs_pageio().
229  */
230 ulong_t ufs_quiesce_pend = 0;
231 
232 /*
233  * ufs_quiesce
234  *	wait for outstanding accesses to finish
235  */
236 int
237 ufs_quiesce(struct ulockfs *ulp)
238 {
239 	int error = 0;
240 	ulockfs_info_t *head;
241 	ulockfs_info_t *info;
242 
243 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
244 	SEARCH_ULOCKFSP(head, ulp, info);
245 
246 	/*
247 	 * Set a softlock to suspend future ufs_vnops so that
248 	 * this lockfs request will not be starved
249 	 */
250 	ULOCKFS_SET_SLOCK(ulp);
251 	ASSERT(ufs_quiesce_pend);
252 
253 	/* check if there is any outstanding ufs vnodeops calls */
254 	while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
255 		/*
256 		 * use timed version of cv_wait_sig() to make sure we don't
257 		 * miss a wake up call from ufs_pageio() when it doesn't use
258 		 * ul_lock.
259 		 *
260 		 * when a fallocate thread comes in, the only way it returns
261 		 * from this function is if there are no other vnode operations
262 		 * going on (remember fallocate threads are tracked using
263 		 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
264 		 * hasn't already grabbed the fs write lock.
265 		 */
266 		if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
267 			if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
268 				goto out;
269 		}
270 		if (!cv_timedwait_sig(&ulp->ul_cv, &ulp->ul_lock, lbolt + hz)) {
271 			error = EINTR;
272 			goto out;
273 		}
274 	}
275 
276 out:
277 	/*
278 	 * unlock the soft lock
279 	 */
280 	ULOCKFS_CLR_SLOCK(ulp);
281 
282 	return (error);
283 }
284 
285 /*
286  * ufs_flush_inode
287  */
288 int
289 ufs_flush_inode(struct inode *ip, void *arg)
290 {
291 	int	error;
292 	int	saverror	= 0;
293 
294 	/*
295 	 * wrong file system; keep looking
296 	 */
297 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
298 		return (0);
299 
300 	/*
301 	 * asynchronously push all the dirty pages
302 	 */
303 	if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
304 	    (error != EAGAIN))
305 		saverror = error;
306 	/*
307 	 * wait for io and discard all mappings
308 	 */
309 	if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
310 		saverror = error;
311 
312 	if (ITOV(ip)->v_type == VDIR) {
313 		dnlc_dir_purge(&ip->i_danchor);
314 	}
315 
316 	return (saverror);
317 }
318 
319 /*
320  * ufs_flush
321  *	Flush everything that is currently dirty; this includes invalidating
322  *	any mappings.
323  */
324 int
325 ufs_flush(struct vfs *vfsp)
326 {
327 	int		error;
328 	int		saverror = 0;
329 	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
330 	struct fs	*fs		= ufsvfsp->vfs_fs;
331 	int		tdontblock = 0;
332 
333 	ASSERT(vfs_lock_held(vfsp));
334 
335 	/*
336 	 * purge dnlc
337 	 */
338 	(void) dnlc_purge_vfsp(vfsp, 0);
339 
340 	/*
341 	 * drain the delete and idle threads
342 	 */
343 	ufs_delete_drain(vfsp, 0, 0);
344 	ufs_idle_drain(vfsp);
345 
346 	/*
347 	 * flush and invalidate quota records
348 	 */
349 	(void) qsync(ufsvfsp);
350 
351 	/*
352 	 * flush w/invalidate the inodes for vfsp
353 	 */
354 	if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
355 		saverror = error;
356 
357 	/*
358 	 * synchronously flush superblock and summary info
359 	 */
360 	if (fs->fs_ronly == 0 && fs->fs_fmod) {
361 		fs->fs_fmod = 0;
362 		TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
363 	}
364 	/*
365 	 * flush w/invalidate block device pages and buf cache
366 	 */
367 	if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
368 	    (offset_t)0, 0, B_INVAL, CRED(), NULL)) > 0)
369 		saverror = error;
370 
371 	(void) bflush((dev_t)vfsp->vfs_dev);
372 	(void) bfinval((dev_t)vfsp->vfs_dev, 0);
373 
374 	/*
375 	 * drain the delete and idle threads again
376 	 */
377 	ufs_delete_drain(vfsp, 0, 0);
378 	ufs_idle_drain(vfsp);
379 
380 	/*
381 	 * play with the clean flag
382 	 */
383 	if (saverror == 0)
384 		ufs_checkclean(vfsp);
385 
386 	/*
387 	 * Flush any outstanding transactions and roll the log
388 	 * only if we are supposed to do, i.e. LDL_NOROLL not set.
389 	 * We can not simply check for fs_ronly here since fsck also may
390 	 * use this code to roll the log on a read-only filesystem, e.g.
391 	 * root during early stages of boot, if other then a sanity check is
392 	 * done, it will clear LDL_NOROLL before.
393 	 * In addition we assert that the deltamap does not contain any deltas
394 	 * in case LDL_NOROLL is set since this is not supposed to happen.
395 	 */
396 	if (TRANS_ISTRANS(ufsvfsp)) {
397 		ml_unit_t	*ul	= ufsvfsp->vfs_log;
398 		mt_map_t	*mtm	= ul->un_deltamap;
399 
400 		if (ul->un_flags & LDL_NOROLL) {
401 			ASSERT(mtm->mtm_nme == 0);
402 		} else {
403 			/*
404 			 * Do not set T_DONTBLOCK if there is a
405 			 * transaction opened by caller.
406 			 */
407 			if (curthread->t_flag & T_DONTBLOCK)
408 				tdontblock = 1;
409 			else
410 				curthread->t_flag |= T_DONTBLOCK;
411 
412 			TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
413 			    TOP_COMMIT_SIZE, error);
414 
415 			if (!error) {
416 				TRANS_END_SYNC(ufsvfsp, saverror,
417 				    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
418 			}
419 
420 			if (tdontblock == 0)
421 				curthread->t_flag &= ~T_DONTBLOCK;
422 
423 			logmap_roll_dev(ufsvfsp->vfs_log);
424 		}
425 	}
426 
427 	return (saverror);
428 }
429 
430 /*
431  * ufs_thaw_wlock
432  *	special processing when thawing down to wlock
433  */
434 static int
435 ufs_thaw_wlock(struct inode *ip, void *arg)
436 {
437 	/*
438 	 * wrong file system; keep looking
439 	 */
440 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
441 		return (0);
442 
443 	/*
444 	 * iupdat refuses to clear flags if the fs is read only.  The fs
445 	 * may become read/write during the lock and we wouldn't want
446 	 * these inodes being written to disk.  So clear the flags.
447 	 */
448 	rw_enter(&ip->i_contents, RW_WRITER);
449 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
450 	rw_exit(&ip->i_contents);
451 
452 	/*
453 	 * pages are mlocked -- fail wlock
454 	 */
455 	if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
456 		return (EBUSY);
457 
458 	return (0);
459 }
460 
461 /*
462  * ufs_thaw_hlock
463  *	special processing when thawing down to hlock or elock
464  */
465 static int
466 ufs_thaw_hlock(struct inode *ip, void *arg)
467 {
468 	struct vnode	*vp	= ITOV(ip);
469 
470 	/*
471 	 * wrong file system; keep looking
472 	 */
473 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
474 		return (0);
475 
476 	/*
477 	 * blow away all pages - even if they are mlocked
478 	 */
479 	do {
480 		(void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
481 	} while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
482 	rw_enter(&ip->i_contents, RW_WRITER);
483 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
484 	rw_exit(&ip->i_contents);
485 
486 	return (0);
487 }
488 
489 /*
490  * ufs_thaw
491  *	thaw file system lock down to current value
492  */
493 int
494 ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
495 {
496 	int		error	= 0;
497 	int		noidel	= (int)(ulp->ul_flag & ULOCKFS_NOIDEL);
498 
499 	/*
500 	 * if wlock or hlock or elock
501 	 */
502 	if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
503 	    ULOCKFS_IS_ELOCK(ulp)) {
504 
505 		/*
506 		 * don't keep access times
507 		 * don't free deleted files
508 		 * if superblock writes are allowed, limit them to me for now
509 		 */
510 		ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
511 		if (ulp->ul_sbowner != (kthread_id_t)-1)
512 			ulp->ul_sbowner = curthread;
513 
514 		/*
515 		 * wait for writes for deleted files and superblock updates
516 		 */
517 		(void) ufs_flush(vfsp);
518 
519 		/*
520 		 * now make sure the quota file is up-to-date
521 		 *	expensive; but effective
522 		 */
523 		error = ufs_flush(vfsp);
524 		/*
525 		 * no one can write the superblock
526 		 */
527 		ulp->ul_sbowner = (kthread_id_t)-1;
528 
529 		/*
530 		 * special processing for wlock/hlock/elock
531 		 */
532 		if (ULOCKFS_IS_WLOCK(ulp)) {
533 			if (error)
534 				goto errout;
535 			error = bfinval(ufsvfsp->vfs_dev, 0);
536 			if (error)
537 				goto errout;
538 			error = ufs_scan_inodes(0, ufs_thaw_wlock,
539 			    (void *)ufsvfsp, ufsvfsp);
540 			if (error)
541 				goto errout;
542 		}
543 		if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
544 			error = 0;
545 			(void) ufs_scan_inodes(0, ufs_thaw_hlock,
546 			    (void *)ufsvfsp, ufsvfsp);
547 			(void) bfinval(ufsvfsp->vfs_dev, 1);
548 		}
549 	} else {
550 
551 		/*
552 		 * okay to keep access times
553 		 * okay to free deleted files
554 		 * okay to write the superblock
555 		 */
556 		ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
557 		ulp->ul_sbowner = NULL;
558 
559 		/*
560 		 * flush in case deleted files are in memory
561 		 */
562 		if (noidel) {
563 			if (error = ufs_flush(vfsp))
564 				goto errout;
565 		}
566 	}
567 
568 errout:
569 	cv_broadcast(&ulp->ul_cv);
570 	return (error);
571 }
572 
573 /*
574  * ufs_reconcile_fs
575  *	reconcile incore superblock with ondisk superblock
576  */
577 int
578 ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
579 {
580 	struct fs	*mfs; 	/* in-memory superblock */
581 	struct fs	*dfs;	/* on-disk   superblock */
582 	struct buf	*bp;	/* on-disk   superblock buf */
583 	int		 needs_unlock;
584 	char		 finished_fsclean;
585 
586 	mfs = ufsvfsp->vfs_fs;
587 
588 	/*
589 	 * get the on-disk copy of the superblock
590 	 */
591 	bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
592 	bp->b_flags |= (B_STALE|B_AGE);
593 	if (bp->b_flags & B_ERROR) {
594 		brelse(bp);
595 		return (EIO);
596 	}
597 	dfs = bp->b_un.b_fs;
598 
599 	/* error locks may only unlock after the fs has been made consistent */
600 	if (errlck == UN_ERRLCK) {
601 		if (dfs->fs_clean == FSFIX) {	/* being repaired */
602 			brelse(bp);
603 			return (EAGAIN);
604 		}
605 		/* repair not yet started? */
606 		finished_fsclean = TRANS_ISTRANS(ufsvfsp)? FSLOG: FSCLEAN;
607 		if (dfs->fs_clean != finished_fsclean) {
608 			brelse(bp);
609 			return (EBUSY);
610 		}
611 	}
612 
613 	/*
614 	 * if superblock has changed too much, abort
615 	 */
616 	if ((mfs->fs_sblkno		!= dfs->fs_sblkno) ||
617 	    (mfs->fs_cblkno		!= dfs->fs_cblkno) ||
618 	    (mfs->fs_iblkno		!= dfs->fs_iblkno) ||
619 	    (mfs->fs_dblkno		!= dfs->fs_dblkno) ||
620 	    (mfs->fs_cgoffset		!= dfs->fs_cgoffset) ||
621 	    (mfs->fs_cgmask		!= dfs->fs_cgmask) ||
622 	    (mfs->fs_bsize		!= dfs->fs_bsize) ||
623 	    (mfs->fs_fsize		!= dfs->fs_fsize) ||
624 	    (mfs->fs_frag		!= dfs->fs_frag) ||
625 	    (mfs->fs_bmask		!= dfs->fs_bmask) ||
626 	    (mfs->fs_fmask		!= dfs->fs_fmask) ||
627 	    (mfs->fs_bshift		!= dfs->fs_bshift) ||
628 	    (mfs->fs_fshift		!= dfs->fs_fshift) ||
629 	    (mfs->fs_fragshift		!= dfs->fs_fragshift) ||
630 	    (mfs->fs_fsbtodb		!= dfs->fs_fsbtodb) ||
631 	    (mfs->fs_sbsize		!= dfs->fs_sbsize) ||
632 	    (mfs->fs_nindir		!= dfs->fs_nindir) ||
633 	    (mfs->fs_nspf		!= dfs->fs_nspf) ||
634 	    (mfs->fs_trackskew		!= dfs->fs_trackskew) ||
635 	    (mfs->fs_cgsize		!= dfs->fs_cgsize) ||
636 	    (mfs->fs_ntrak		!= dfs->fs_ntrak) ||
637 	    (mfs->fs_nsect		!= dfs->fs_nsect) ||
638 	    (mfs->fs_spc		!= dfs->fs_spc) ||
639 	    (mfs->fs_cpg		!= dfs->fs_cpg) ||
640 	    (mfs->fs_ipg		!= dfs->fs_ipg) ||
641 	    (mfs->fs_fpg		!= dfs->fs_fpg) ||
642 	    (mfs->fs_postblformat	!= dfs->fs_postblformat) ||
643 	    (mfs->fs_magic		!= dfs->fs_magic)) {
644 		brelse(bp);
645 		return (EACCES);
646 	}
647 	if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
648 		if (mfs->fs_clean == FSLOG) {
649 			brelse(bp);
650 			return (EACCES);
651 		}
652 
653 	/*
654 	 * get new summary info
655 	 */
656 	if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
657 		brelse(bp);
658 		return (EIO);
659 	}
660 
661 	/*
662 	 * release old summary info and update in-memory superblock
663 	 */
664 	kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
665 	mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;	/* Only entry 0 used */
666 
667 	/*
668 	 * update fields allowed to change
669 	 */
670 	mfs->fs_size		= dfs->fs_size;
671 	mfs->fs_dsize		= dfs->fs_dsize;
672 	mfs->fs_ncg		= dfs->fs_ncg;
673 	mfs->fs_minfree		= dfs->fs_minfree;
674 	mfs->fs_rotdelay	= dfs->fs_rotdelay;
675 	mfs->fs_rps		= dfs->fs_rps;
676 	mfs->fs_maxcontig	= dfs->fs_maxcontig;
677 	mfs->fs_maxbpg		= dfs->fs_maxbpg;
678 	mfs->fs_csmask		= dfs->fs_csmask;
679 	mfs->fs_csshift		= dfs->fs_csshift;
680 	mfs->fs_optim		= dfs->fs_optim;
681 	mfs->fs_csaddr		= dfs->fs_csaddr;
682 	mfs->fs_cssize		= dfs->fs_cssize;
683 	mfs->fs_ncyl		= dfs->fs_ncyl;
684 	mfs->fs_cstotal		= dfs->fs_cstotal;
685 	mfs->fs_reclaim		= dfs->fs_reclaim;
686 
687 	if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
688 		mfs->fs_reclaim &= ~FS_RECLAIM;
689 		mfs->fs_reclaim |=  FS_RECLAIMING;
690 		ufs_thread_start(&ufsvfsp->vfs_reclaim,
691 		    ufs_thread_reclaim, vfsp);
692 	}
693 
694 	/* XXX What to do about sparecon? */
695 
696 	/* XXX need to copy volume label */
697 
698 	/*
699 	 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
700 	 * or if error-locked and ondisk is now clean
701 	 */
702 	needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
703 	if (needs_unlock)
704 		mutex_enter(&ufsvfsp->vfs_lock);
705 
706 	if (errlck == UN_ERRLCK) {
707 		if (finished_fsclean == dfs->fs_clean)
708 			mfs->fs_clean = finished_fsclean;
709 		else
710 			mfs->fs_clean = FSBAD;
711 		mfs->fs_state = FSOKAY - dfs->fs_time;
712 	}
713 
714 	if (FSOKAY != dfs->fs_state + dfs->fs_time ||
715 	    (dfs->fs_clean == FSBAD))
716 		mfs->fs_clean = FSBAD;
717 
718 	if (needs_unlock)
719 		mutex_exit(&ufsvfsp->vfs_lock);
720 
721 	brelse(bp);
722 
723 	return (0);
724 }
725 
726 /*
727  * ufs_reconcile_inode
728  *	reconcile ondisk inode with incore inode
729  */
730 static int
731 ufs_reconcile_inode(struct inode *ip, void *arg)
732 {
733 	int		i;
734 	int		ndaddr;
735 	int		niaddr;
736 	struct dinode	*dp;		/* ondisk inode */
737 	struct buf	*bp	= NULL;
738 	uid_t		d_uid;
739 	gid_t		d_gid;
740 	int		error = 0;
741 	struct fs	*fs;
742 
743 	/*
744 	 * not an inode we care about
745 	 */
746 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
747 		return (0);
748 
749 	fs = ip->i_fs;
750 
751 	/*
752 	 * Inode reconciliation fails: we made the filesystem quiescent
753 	 * and we did a ufs_flush() before calling ufs_reconcile_inode()
754 	 * and thus the inode should not have been changed inbetween.
755 	 * Any discrepancies indicate a logic error and a pretty
756 	 * significant run-state inconsistency we should complain about.
757 	 */
758 	if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
759 		cmn_err(CE_WARN, "%s: Inode reconciliation failed for"
760 		    "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
761 		return (EINVAL);
762 	}
763 
764 	/*
765 	 * get the dinode
766 	 */
767 	bp = UFS_BREAD(ip->i_ufsvfs,
768 	    ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
769 	    (int)fs->fs_bsize);
770 	if (bp->b_flags & B_ERROR) {
771 		brelse(bp);
772 		return (EIO);
773 	}
774 	dp  = bp->b_un.b_dino;
775 	dp += itoo(fs, ip->i_number);
776 
777 	/*
778 	 * handle Sun's implementation of EFT
779 	 */
780 	d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
781 	d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (uid_t)dp->di_sgid;
782 
783 	rw_enter(&ip->i_contents, RW_WRITER);
784 
785 	/*
786 	 * some fields are not allowed to change
787 	 */
788 	if ((ip->i_mode  != dp->di_mode) ||
789 	    (ip->i_gen   != dp->di_gen) ||
790 	    (ip->i_uid   != d_uid) ||
791 	    (ip->i_gid   != d_gid)) {
792 		error = EACCES;
793 		goto out;
794 	}
795 
796 	/*
797 	 * and some are allowed to change
798 	 */
799 	ip->i_size		= dp->di_size;
800 	ip->i_ic.ic_flags	= dp->di_ic.ic_flags;
801 	ip->i_blocks		= dp->di_blocks;
802 	ip->i_nlink		= dp->di_nlink;
803 	if (ip->i_flag & IFASTSYMLNK) {
804 		ndaddr = 1;
805 		niaddr = 0;
806 	} else {
807 		ndaddr = NDADDR;
808 		niaddr = NIADDR;
809 	}
810 	for (i = 0; i < ndaddr; ++i)
811 		ip->i_db[i] = dp->di_db[i];
812 	for (i = 0; i < niaddr; ++i)
813 		ip->i_ib[i] = dp->di_ib[i];
814 
815 out:
816 	rw_exit(&ip->i_contents);
817 	brelse(bp);
818 	return (error);
819 }
820 
821 /*
822  * ufs_reconcile
823  *	reconcile ondisk superblock/inodes with any incore
824  */
825 static int
826 ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
827 {
828 	int	error = 0;
829 
830 	/*
831 	 * get rid of as much inmemory data as possible
832 	 */
833 	(void) ufs_flush(vfsp);
834 
835 	/*
836 	 * reconcile the superblock and inodes
837 	 */
838 	if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
839 		return (error);
840 	if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
841 		return (error);
842 	/*
843 	 * allocation blocks may be incorrect; get rid of them
844 	 */
845 	(void) ufs_flush(vfsp);
846 
847 	return (error);
848 }
849 
/*
 * File system locking
 *
 * Thin wrapper around ufs__fiolfs() that always passes from_user as 1.
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
	int	from_user = 1;

	return (ufs__fiolfs(vp, lockfsp, from_user, from_log));
}
858 
/*
 * ufs__fiolfs
 *	kernel-internal interface, also used by fix-on-panic
 *
 *	Move the file system that vp belongs to from its current lock state
 *	to the one described by lockfsp.
 *
 *	vp		any vnode on the file system
 *	lockfsp		the lock request; its lf_key is advanced by
 *			ufs_getlfd() when the request is accepted
 *	from_user	non-zero suppresses the ufs_fault() call for a newly
 *			set error lock unless the fs is mounted with
 *			onerror=panic semantics (see the check at the end)
 *	from_log	non-zero skips the accounting/swap file checks
 *
 *	Returns 0 on success.  Errors visible in this function:
 *	EINVAL	no request, lock type out of range, read-only error lock
 *		requested, or request rejected by ufs_getlfd()
 *	EIO	no usable vnode/vfs, fs unmounted, or fs is hard locked
 *	EDEADLK	accounting or a local swap file is active on the fs
 *	EBUSY	another lockfs request is in progress, or the requested
 *		transition is not allowed from the current lock
 *	EINTR	ufs_quiesce() was interrupted
 *	plus whatever ufs_reconcile(), ufs_flush() or ufs_thaw() return.
 *
 *	On failure after the freeze, the previous lock state is restored
 *	unless the file system has become hard locked.
 */
int
ufs__fiolfs(
	struct vnode *vp,
	struct lockfs *lockfsp,
	int from_user,
	int from_log)
{
	struct ulockfs	*ulp;
	struct lockfs	lfs;		/* copy of the previous lock state */
	int		error;
	struct vfs	*vfsp;
	struct ufsvfs	*ufsvfsp;
	int		 errlck		= NO_ERRLCK;
	int		 poll_events	= POLLPRI;
	extern struct pollhead ufs_pollhd;
	ulockfs_info_t *head;
	ulockfs_info_t *info;
	int signal = 0;			/* set when ufs_quiesce() fails */

	/* check valid lock type */
	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
		return (EINVAL);

	if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
		return (EIO);

	vfsp = vp->v_vfsp;

	if (vfsp->vfs_flag & VFS_UNMOUNTED) /* has been unmounted */
		return (EIO);

	/* take the lock and check again */
	vfs_lock_wait(vfsp);
	if (vfsp->vfs_flag & VFS_UNMOUNTED) {
		vfs_unlock(vfsp);
		return (EIO);
	}

	/*
	 * Can't wlock or ro/elock fs with accounting or local swap file
	 * We need to check for this before we grab the ul_lock to avoid
	 * deadlocks with the accounting framework.
	 */
	if ((LOCKFS_IS_WLOCK(lockfsp) || LOCKFS_IS_ELOCK(lockfsp) ||
	    LOCKFS_IS_ROELOCK(lockfsp)) && !from_log) {
		if (ufs_checkaccton(vp) || ufs_checkswapon(vp)) {
			vfs_unlock(vfsp);
			return (EDEADLK);
		}
	}

	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	ulp = &ufsvfsp->vfs_ulockfs;
	/* find this thread's TSD record for ulp, if any */
	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * Suspend both the reclaim thread and the delete thread.
	 * This must be done outside the lockfs locking protocol.
	 */
	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
	ufs_thread_suspend(&ufsvfsp->vfs_delete);

	mutex_enter(&ulp->ul_lock);
	atomic_add_long(&ufs_quiesce_pend, 1);

	/*
	 * Quit if there is another lockfs request in progress
	 * that is waiting for existing ufs_vnops to complete.
	 */
	if (ULOCKFS_IS_BUSY(ulp)) {
		error = EBUSY;
		goto errexit;
	}

	/* cannot unlock or downgrade a hard lock */
	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto errexit;
	}

	/* an error lock may be unlocked or relocked, only */
	if (ULOCKFS_IS_ELOCK(ulp)) {
		if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * a read-only error lock may only be upgraded to an
	 * error lock or hard lock
	 */
	if (ULOCKFS_IS_ROELOCK(ulp)) {
		if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * until read-only error locks are fully implemented
	 * just return EINVAL
	 */
	if (LOCKFS_IS_ROELOCK(lockfsp)) {
		error = EINVAL;
		goto errexit;
	}

	/*
	 * an error lock may only be applied if the file system is
	 * unlocked or already error locked.
	 * (this is to prevent the case where a fs gets changed out from
	 * underneath a fs that is locked for backup,
	 * that is, name/delete/write-locked.)
	 */
	if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
	    !ULOCKFS_IS_ROELOCK(ulp)) &&
	    (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
		error = EBUSY;
		goto errexit;
	}

	/* get and validate the input lockfs request */
	if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
		goto errexit;

	/*
	 * save current ulockfs struct; it is needed to restore the old
	 * state on failure and to tell what the previous lock was
	 */
	bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));

	/*
	 * Freeze the file system (pend future accesses)
	 */
	ufs_freeze(ulp, lockfsp);

	/*
	 * Set locking in progress because ufs_quiesce may free the
	 * ul_lock mutex.
	 */
	ULOCKFS_SET_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_SET_BUSY(&ulp->ul_lockfs);

	/*
	 * We need to unset FWLOCK status before we call ufs_quiesce
	 * so that the thread doesn't get suspended. We do this only if
	 * this (fallocate) thread requested an unlock operation.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (!ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_CLR_FWLOCK(ulp);
	}

	/*
	 * Quiesce (wait for outstanding accesses to finish)
	 */
	if (error = ufs_quiesce(ulp)) {
		/*
		 * Interrupted due to signal. There could still be
		 * pending vnops.
		 */
		signal = 1;

		/*
		 * We do broadcast because lock-status
		 * could be reverted to old status.
		 */
		cv_broadcast(&ulp->ul_cv);
		goto errout;
	}

	/*
	 * If the fallocate thread requested a write fs lock operation
	 * then we set fwlock status in the ulp.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_SET_FWLOCK(ulp);
	}

	/*
	 * save error lock status to pass down to reconciliation
	 * routines and for later cleanup
	 */
	if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
		errlck = UN_ERRLCK;

	if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
		int needs_unlock;
		int needs_sbwrite;

		poll_events |= POLLERR;
		/* relock if the previous lock was an error lock too */
		errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ?
		    RE_ERRLCK : SET_ERRLCK;

		/* take vfs_lock only if this thread does not hold it yet */
		needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
		if (needs_unlock)
			mutex_enter(&ufsvfsp->vfs_lock);

		/* disable delayed i/o */
		needs_sbwrite = 0;

		if (errlck == SET_ERRLCK) {
			ufsvfsp->vfs_fs->fs_clean = FSBAD;
			needs_sbwrite = 1;
		}

		needs_sbwrite |= ufsvfsp->vfs_dio;
		ufsvfsp->vfs_dio = 0;

		if (needs_unlock)
			mutex_exit(&ufsvfsp->vfs_lock);

		/* push the changed superblock out, then clear fs_fmod */
		if (needs_sbwrite) {
			ulp->ul_sbowner = curthread;
			TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);

			if (needs_unlock)
				mutex_enter(&ufsvfsp->vfs_lock);

			ufsvfsp->vfs_fs->fs_fmod = 0;

			if (needs_unlock)
				mutex_exit(&ufsvfsp->vfs_lock);
		}
	}

	/*
	 * reconcile superblock and inodes if was wlocked or error locked
	 */
	if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
		if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
			goto errout;
		/*
		 * in case the fs grew; reset the metadata map for logging tests
		 */
		TRANS_MATA_UMOUNT(ufsvfsp);
		TRANS_MATA_MOUNT(ufsvfsp);
		TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
	}

	/*
	 * At least everything *currently* dirty goes out.
	 * A flush error is tolerated for hard locks and error locks.
	 */

	if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
	    !ULOCKFS_IS_ELOCK(ulp))
		goto errout;

	/*
	 * thaw file system and wakeup pended processes
	 * (again, an error is tolerated for hard locks and error locks)
	 */
	if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
		if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
			goto errout;

	/*
	 * reset modified flag if not already write locked
	 */
	if (!LOCKFS_IS_WLOCK(&lfs))
		ULOCKFS_CLR_MOD(ulp);

	/*
	 * idle the lock struct
	 */
	ULOCKFS_CLR_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

	/*
	 * free current comment (the one saved from the previous lock state)
	 */
	if (lfs.lf_comment && lfs.lf_comlen != 0) {
		kmem_free(lfs.lf_comment, lfs.lf_comlen);
		lfs.lf_comment = NULL;
		lfs.lf_comlen = 0;
	}

	/* do error lock cleanup */
	if (errlck == UN_ERRLCK)
		ufsfx_unlockfs(ufsvfsp);

	else if (errlck == RE_ERRLCK)
		ufsfx_lockfs(ufsvfsp);

	/* don't allow error lock from user to invoke panic */
	else if (from_user && errlck == SET_ERRLCK &&
	    !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
		(void) ufs_fault(ufsvfsp->vfs_root,
		    ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
		    ulp->ul_lockfs.lf_comment: "user-applied error lock");

	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
		poll_events |= POLLERR;

	pollwakeup(&ufs_pollhd, poll_events);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (0);

errout:
	/*
	 * Lock failed. Reset the old lock in ufsvfs if not hard locked.
	 */
	if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
		bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
		ulp->ul_fs_lock = (1 << lfs.lf_lock);
	}

	/*
	 * Don't call ufs_thaw() when there's a signal during
	 * ufs quiesce operation as it can lead to deadlock
	 * with getpage.
	 */
	if (signal == 0)
		(void) ufs_thaw(vfsp, ufsvfsp, ulp);

	ULOCKFS_CLR_BUSY(ulp);
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

errexit:
	/* common exit for every failure after ul_lock was taken */
	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (error);
}
1206 
1207 /*
1208  * fiolfss
1209  * 	return the current file system locking state info
1210  */
1211 int
1212 ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
1213 {
1214 	struct ulockfs	*ulp;
1215 
1216 	if (!vp || !vp->v_vfsp || !VTOI(vp))
1217 		return (EINVAL);
1218 
1219 	/* file system has been forcibly unmounted */
1220 	if (VTOI(vp)->i_ufsvfs == NULL)
1221 		return (EIO);
1222 
1223 	ulp = VTOUL(vp);
1224 
1225 	if (ULOCKFS_IS_HLOCK(ulp)) {
1226 		*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1227 		return (0);
1228 	}
1229 
1230 	mutex_enter(&ulp->ul_lock);
1231 
1232 	*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1233 
1234 	if (ULOCKFS_IS_MOD(ulp))
1235 		lockfsp->lf_flags |= LOCKFS_MOD;
1236 
1237 	mutex_exit(&ulp->ul_lock);
1238 
1239 	return (0);
1240 }
1241 
1242 /*
1243  * ufs_check_lockfs
1244  *	check whether a ufs_vnops conflicts with the file system lock
1245  */
1246 int
1247 ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
1248 {
1249 	k_sigset_t	smask;
1250 	int		sig, slock;
1251 
1252 	ASSERT(MUTEX_HELD(&ulp->ul_lock));
1253 
1254 	while (ulp->ul_fs_lock & mask) {
1255 		slock = (int)ULOCKFS_IS_SLOCK(ulp);
1256 		if ((curthread->t_flag & T_DONTPEND) && !slock) {
1257 			curthread->t_flag |= T_WOULDBLOCK;
1258 			return (EAGAIN);
1259 		}
1260 		curthread->t_flag &= ~T_WOULDBLOCK;
1261 
1262 		/*
1263 		 * In the case of an onerr umount of the fs, threads could
1264 		 * have blocked before coming into ufs_check_lockfs and
1265 		 * need to check for the special case of ELOCK and
1266 		 * vfs_dontblock being set which would indicate that the fs
1267 		 * is on its way out and will not return therefore making
1268 		 * EIO the appropriate response.
1269 		 */
1270 		if (ULOCKFS_IS_HLOCK(ulp) ||
1271 		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1272 			return (EIO);
1273 
1274 		/*
1275 		 * wait for lock status to change
1276 		 */
1277 		if (slock || ufsvfsp->vfs_nointr) {
1278 			cv_wait(&ulp->ul_cv, &ulp->ul_lock);
1279 		} else {
1280 			sigintr(&smask, 1);
1281 			sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
1282 			sigunintr(&smask);
1283 			if ((!sig && (ulp->ul_fs_lock & mask)) ||
1284 			    ufsvfsp->vfs_dontblock)
1285 				return (EINTR);
1286 		}
1287 	}
1288 
1289 	if (mask & ULOCKFS_FWLOCK) {
1290 		atomic_add_long(&ulp->ul_falloc_cnt, 1);
1291 		ULOCKFS_SET_FALLOC(ulp);
1292 	} else {
1293 		atomic_add_long(&ulp->ul_vnops_cnt, 1);
1294 	}
1295 
1296 	return (0);
1297 }
1298 
1299 /*
1300  * Check whether we came across the handcrafted lockfs protocol path. We can't
1301  * simply check for T_DONTBLOCK here as one would assume since this can also
1302  * falsely catch recursive VOP's going to a different filesystem, instead we
1303  * check if we already hold the ulockfs->ul_lock mutex.
1304  */
1305 static int
1306 ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
1307 {
1308 	return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
1309 }
1310 
1311 /*
1312  * ufs_lockfs_begin - start the lockfs locking protocol
1313  */
1314 int
1315 ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1316 {
1317 	int 		error;
1318 	int		rec_vop;
1319 	ushort_t	op_cnt_incremented = 0;
1320 	ulong_t		*ctr;
1321 	struct ulockfs *ulp;
1322 	ulockfs_info_t	*ulockfs_info;
1323 	ulockfs_info_t	*ulockfs_info_free;
1324 	ulockfs_info_t	*ulockfs_info_temp;
1325 
1326 	/*
1327 	 * file system has been forcibly unmounted
1328 	 */
1329 	if (ufsvfsp == NULL)
1330 		return (EIO);
1331 
1332 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1333 
1334 	/*
1335 	 * Do lockfs protocol
1336 	 */
1337 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1338 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1339 
1340 	/*
1341 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1342 	 * path and bail out in that case.
1343 	 */
1344 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1345 		*ulpp = NULL;
1346 		return (0);
1347 	} else {
1348 		if (ulockfs_info_free == NULL) {
1349 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1350 			    kmem_zalloc(sizeof (ulockfs_info_t),
1351 			    KM_NOSLEEP)) == NULL) {
1352 				*ulpp = NULL;
1353 				return (ENOMEM);
1354 			}
1355 		}
1356 	}
1357 
1358 	/*
1359 	 * First time VOP call
1360 	 *
1361 	 * Increment the ctr irrespective of the lockfs state. If the lockfs
1362 	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
1363 	 * before incrementing we need to check if there is a pending quiesce
1364 	 * request because if we have a continuous stream of ufs_lockfs_begin
1365 	 * requests pounding on a few cpu's then the ufs_quiesce thread might
1366 	 * never see the value of zero for ctr - a livelock kind of scenario.
1367 	 */
1368 	ctr = (mask & ULOCKFS_FWLOCK) ?
1369 	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
1370 	if (!ULOCKFS_IS_SLOCK(ulp)) {
1371 		atomic_add_long(ctr, 1);
1372 		op_cnt_incremented++;
1373 	}
1374 
1375 	/*
1376 	 * If the lockfs state (indicated by ul_fs_lock) is not just
1377 	 * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs
1378 	 * where there is a check with an appropriate mask to selectively allow
1379 	 * operations permitted for that kind of lockfs state.
1380 	 *
1381 	 * Even these selective operations should not be allowed to go through
1382 	 * if a lockfs request is in progress because that could result in inode
1383 	 * modifications during a quiesce and could hence result in inode
1384 	 * reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient,
1385 	 * so make use of ufs_quiesce_pend to disallow vnode operations when a
1386 	 * quiesce is in progress.
1387 	 */
1388 	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1389 		if (op_cnt_incremented)
1390 			if (!atomic_add_long_nv(ctr, -1))
1391 				cv_broadcast(&ulp->ul_cv);
1392 		mutex_enter(&ulp->ul_lock);
1393 		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1394 		mutex_exit(&ulp->ul_lock);
1395 		if (error) {
1396 			if (ulockfs_info_free == NULL)
1397 				kmem_free(ulockfs_info_temp,
1398 				    sizeof (ulockfs_info_t));
1399 			return (error);
1400 		}
1401 	} else {
1402 		/*
1403 		 * This is the common case of file system in a unlocked state.
1404 		 *
1405 		 * If a file system is unlocked, we would expect the ctr to have
1406 		 * been incremented by now. But this will not be true when a
1407 		 * quiesce is winding up - SLOCK was set when we checked before
1408 		 * incrementing the ctr, but by the time we checked for
1409 		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is okay
1410 		 * to take ul_lock and go through the slow path in this uncommon
1411 		 * case.
1412 		 */
1413 		if (op_cnt_incremented == 0) {
1414 			mutex_enter(&ulp->ul_lock);
1415 			error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1416 			if (error) {
1417 				mutex_exit(&ulp->ul_lock);
1418 				if (ulockfs_info_free == NULL)
1419 					kmem_free(ulockfs_info_temp,
1420 					    sizeof (ulockfs_info_t));
1421 				return (error);
1422 			}
1423 			if (mask & ULOCKFS_FWLOCK)
1424 				ULOCKFS_SET_FALLOC(ulp);
1425 			mutex_exit(&ulp->ul_lock);
1426 		} else if (mask & ULOCKFS_FWLOCK) {
1427 			mutex_enter(&ulp->ul_lock);
1428 			ULOCKFS_SET_FALLOC(ulp);
1429 			mutex_exit(&ulp->ul_lock);
1430 		}
1431 	}
1432 
1433 	if (ulockfs_info_free != NULL) {
1434 		ulockfs_info_free->ulp = ulp;
1435 		if (mask & ULOCKFS_FWLOCK)
1436 			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1437 	} else {
1438 		ulockfs_info_temp->ulp = ulp;
1439 		ulockfs_info_temp->next = ulockfs_info;
1440 		if (mask & ULOCKFS_FWLOCK)
1441 			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1442 		ASSERT(ufs_lockfs_key != 0);
1443 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1444 	}
1445 
1446 	curthread->t_flag |= T_DONTBLOCK;
1447 	return (0);
1448 }
1449 
1450 /*
1451  * Check whether we are returning from the top level VOP.
1452  */
1453 static int
1454 ufs_lockfs_top_vop_return(ulockfs_info_t *head)
1455 {
1456 	ulockfs_info_t *info;
1457 	int result = 1;
1458 
1459 	for (info = head; info != NULL; info = info->next) {
1460 		if (info->ulp != NULL) {
1461 			result = 0;
1462 			break;
1463 		}
1464 	}
1465 
1466 	return (result);
1467 }
1468 
1469 /*
1470  * ufs_lockfs_end - terminate the lockfs locking protocol
1471  */
1472 void
1473 ufs_lockfs_end(struct ulockfs *ulp)
1474 {
1475 	ulockfs_info_t *info;
1476 	ulockfs_info_t *head;
1477 
1478 	/*
1479 	 * end-of-VOP protocol
1480 	 */
1481 	if (ulp == NULL)
1482 		return;
1483 
1484 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1485 	SEARCH_ULOCKFSP(head, ulp, info);
1486 
1487 	/*
1488 	 * If we're called from a first level VOP, we have to have a
1489 	 * valid ulockfs record in the TSD.
1490 	 */
1491 	ASSERT(info != NULL);
1492 
1493 	/*
1494 	 * Invalidate the ulockfs record.
1495 	 */
1496 	info->ulp = NULL;
1497 
1498 	if (ufs_lockfs_top_vop_return(head))
1499 		curthread->t_flag &= ~T_DONTBLOCK;
1500 
1501 	/* fallocate thread */
1502 	if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) {
1503 		/* Clear the thread's fallocate state */
1504 		info->flags &= ~ULOCK_INFO_FALLOCATE;
1505 		if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1)) {
1506 			mutex_enter(&ulp->ul_lock);
1507 			ULOCKFS_CLR_FALLOC(ulp);
1508 			cv_broadcast(&ulp->ul_cv);
1509 			mutex_exit(&ulp->ul_lock);
1510 		}
1511 	} else  { /* normal thread */
1512 		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
1513 			cv_broadcast(&ulp->ul_cv);
1514 	}
1515 }
1516 
1517 /*
1518  * ufs_lockfs_trybegin - try to start the lockfs locking protocol without
1519  * blocking.
1520  */
1521 int
1522 ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1523 {
1524 	int 		error = 0;
1525 	int		rec_vop;
1526 	ushort_t	op_cnt_incremented = 0;
1527 	ulong_t		*ctr;
1528 	struct ulockfs *ulp;
1529 	ulockfs_info_t	*ulockfs_info;
1530 	ulockfs_info_t	*ulockfs_info_free;
1531 	ulockfs_info_t	*ulockfs_info_temp;
1532 
1533 	/*
1534 	 * file system has been forcibly unmounted
1535 	 */
1536 	if (ufsvfsp == NULL)
1537 		return (EIO);
1538 
1539 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1540 
1541 	/*
1542 	 * Do lockfs protocol
1543 	 */
1544 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1545 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1546 
1547 	/*
1548 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1549 	 * path and bail out in that case.
1550 	 */
1551 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1552 		*ulpp = NULL;
1553 		return (0);
1554 	} else {
1555 		if (ulockfs_info_free == NULL) {
1556 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1557 			    kmem_zalloc(sizeof (ulockfs_info_t),
1558 			    KM_NOSLEEP)) == NULL) {
1559 				*ulpp = NULL;
1560 				return (ENOMEM);
1561 			}
1562 		}
1563 	}
1564 
1565 	/*
1566 	 * First time VOP call
1567 	 *
1568 	 * Increment the ctr irrespective of the lockfs state. If the lockfs
1569 	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
1570 	 * before incrementing we need to check if there is a pending quiesce
1571 	 * request because if we have a continuous stream of ufs_lockfs_begin
1572 	 * requests pounding on a few cpu's then the ufs_quiesce thread might
1573 	 * never see the value of zero for ctr - a livelock kind of scenario.
1574 	 */
1575 	ctr = (mask & ULOCKFS_FWLOCK) ?
1576 	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
1577 	if (!ULOCKFS_IS_SLOCK(ulp)) {
1578 		atomic_add_long(ctr, 1);
1579 		op_cnt_incremented++;
1580 	}
1581 
1582 	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1583 		/*
1584 		 * Non-blocking version of ufs_check_lockfs() code.
1585 		 *
1586 		 * If the file system is not hard locked or error locked
1587 		 * and if ulp->ul_fs_lock allows this operation, increment
1588 		 * the appropriate counter and proceed (For eg., In case the
1589 		 * file system is delete locked, a mmap can still go through).
1590 		 */
1591 		if (op_cnt_incremented)
1592 			if (!atomic_add_long_nv(ctr, -1))
1593 				cv_broadcast(&ulp->ul_cv);
1594 		mutex_enter(&ulp->ul_lock);
1595 		if (ULOCKFS_IS_HLOCK(ulp) ||
1596 		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1597 			error = EIO;
1598 		else if (ulp->ul_fs_lock & mask)
1599 			error = EAGAIN;
1600 
1601 		if (error) {
1602 			mutex_exit(&ulp->ul_lock);
1603 			if (ulockfs_info_free == NULL)
1604 				kmem_free(ulockfs_info_temp,
1605 				    sizeof (ulockfs_info_t));
1606 			return (error);
1607 		}
1608 		atomic_add_long(ctr, 1);
1609 		if (mask & ULOCKFS_FWLOCK)
1610 			ULOCKFS_SET_FALLOC(ulp);
1611 		mutex_exit(&ulp->ul_lock);
1612 	} else {
1613 		/*
1614 		 * This is the common case of file system in a unlocked state.
1615 		 *
1616 		 * If a file system is unlocked, we would expect the ctr to have
1617 		 * been incremented by now. But this will not be true when a
1618 		 * quiesce is winding up - SLOCK was set when we checked before
1619 		 * incrementing the ctr, but by the time we checked for
1620 		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take
1621 		 * ul_lock and go through the non-blocking version of
1622 		 * ufs_check_lockfs() code.
1623 		 */
1624 		if (op_cnt_incremented == 0) {
1625 			mutex_enter(&ulp->ul_lock);
1626 			if (ULOCKFS_IS_HLOCK(ulp) ||
1627 			    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1628 				error = EIO;
1629 			else if (ulp->ul_fs_lock & mask)
1630 				error = EAGAIN;
1631 
1632 			if (error) {
1633 				mutex_exit(&ulp->ul_lock);
1634 				if (ulockfs_info_free == NULL)
1635 					kmem_free(ulockfs_info_temp,
1636 					    sizeof (ulockfs_info_t));
1637 				return (error);
1638 			}
1639 			atomic_add_long(ctr, 1);
1640 			if (mask & ULOCKFS_FWLOCK)
1641 				ULOCKFS_SET_FALLOC(ulp);
1642 			mutex_exit(&ulp->ul_lock);
1643 		} else if (mask & ULOCKFS_FWLOCK) {
1644 			mutex_enter(&ulp->ul_lock);
1645 			ULOCKFS_SET_FALLOC(ulp);
1646 			mutex_exit(&ulp->ul_lock);
1647 		}
1648 	}
1649 
1650 	if (ulockfs_info_free != NULL) {
1651 		ulockfs_info_free->ulp = ulp;
1652 		if (mask & ULOCKFS_FWLOCK)
1653 			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1654 	} else {
1655 		ulockfs_info_temp->ulp = ulp;
1656 		ulockfs_info_temp->next = ulockfs_info;
1657 		if (mask & ULOCKFS_FWLOCK)
1658 			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1659 		ASSERT(ufs_lockfs_key != 0);
1660 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1661 	}
1662 
1663 	curthread->t_flag |= T_DONTBLOCK;
1664 	return (0);
1665 }
1666 
1667 /*
1668  * specialized version of ufs_lockfs_begin() called by ufs_getpage().
1669  */
1670 int
1671 ufs_lockfs_begin_getpage(
1672 	struct ufsvfs	*ufsvfsp,
1673 	struct ulockfs	**ulpp,
1674 	struct seg	*seg,
1675 	int		read_access,
1676 	uint_t		*protp)
1677 {
1678 	ulong_t			mask;
1679 	int 			error;
1680 	int			rec_vop;
1681 	struct ulockfs		*ulp;
1682 	ulockfs_info_t		*ulockfs_info;
1683 	ulockfs_info_t		*ulockfs_info_free;
1684 	ulockfs_info_t		*ulockfs_info_temp;
1685 
1686 	/*
1687 	 * file system has been forcibly unmounted
1688 	 */
1689 	if (ufsvfsp == NULL)
1690 		return (EIO);
1691 
1692 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1693 
1694 	/*
1695 	 * Do lockfs protocol
1696 	 */
1697 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1698 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1699 
1700 	/*
1701 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1702 	 * path and bail out in that case.
1703 	 */
1704 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1705 		*ulpp = NULL;
1706 		return (0);
1707 	} else {
1708 		if (ulockfs_info_free == NULL) {
1709 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1710 			    kmem_zalloc(sizeof (ulockfs_info_t),
1711 			    KM_NOSLEEP)) == NULL) {
1712 				*ulpp = NULL;
1713 				return (ENOMEM);
1714 			}
1715 		}
1716 	}
1717 
1718 	/*
1719 	 * First time VOP call
1720 	 */
1721 	atomic_add_long(&ulp->ul_vnops_cnt, 1);
1722 	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1723 		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
1724 			cv_broadcast(&ulp->ul_cv);
1725 		mutex_enter(&ulp->ul_lock);
1726 		if (seg->s_ops == &segvn_ops &&
1727 		    ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
1728 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1729 		} else if (protp && read_access) {
1730 			/*
1731 			 * Restrict the mapping to readonly.
1732 			 * Writes to this mapping will cause
1733 			 * another fault which will then
1734 			 * be suspended if fs is write locked
1735 			 */
1736 			*protp &= ~PROT_WRITE;
1737 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1738 		} else
1739 			mask = (ulong_t)ULOCKFS_GETWRITE_MASK;
1740 
1741 		/*
1742 		 * will sleep if this fs is locked against this VOP
1743 		 */
1744 		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1745 		mutex_exit(&ulp->ul_lock);
1746 		if (error) {
1747 			if (ulockfs_info_free == NULL)
1748 				kmem_free(ulockfs_info_temp,
1749 				    sizeof (ulockfs_info_t));
1750 			return (error);
1751 		}
1752 	}
1753 
1754 	if (ulockfs_info_free != NULL) {
1755 		ulockfs_info_free->ulp = ulp;
1756 	} else {
1757 		ulockfs_info_temp->ulp = ulp;
1758 		ulockfs_info_temp->next = ulockfs_info;
1759 		ASSERT(ufs_lockfs_key != 0);
1760 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1761 	}
1762 
1763 	curthread->t_flag |= T_DONTBLOCK;
1764 	return (0);
1765 }
1766 
1767 void
1768 ufs_lockfs_tsd_destructor(void *head)
1769 {
1770 	ulockfs_info_t *curr = (ulockfs_info_t *)head;
1771 	ulockfs_info_t *temp;
1772 
1773 	for (; curr != NULL; ) {
1774 		/*
1775 		 * The TSD destructor is being called when the thread exits
1776 		 * (via thread_exit()). At that time it must have cleaned up
1777 		 * all VOPs via ufs_lockfs_end() and there must not be a
1778 		 * valid ulockfs record exist while a thread is exiting.
1779 		 */
1780 		temp = curr;
1781 		curr = curr->next;
1782 		ASSERT(temp->ulp == NULL);
1783 		kmem_free(temp, sizeof (ulockfs_info_t));
1784 	}
1785 }
1786