xref: /titanic_52/usr/src/uts/common/fs/ufs/ufs_lockfs.c (revision bb0ade0978a02d3fe0b0165cd4725fdcb593fbfb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/time.h>
32 #include <sys/systm.h>
33 #include <sys/sysmacros.h>
34 #include <sys/resource.h>
35 #include <sys/signal.h>
36 #include <sys/cred.h>
37 #include <sys/user.h>
38 #include <sys/buf.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/proc.h>
42 #include <sys/disp.h>
43 #include <sys/file.h>
44 #include <sys/fcntl.h>
45 #include <sys/flock.h>
46 #include <sys/atomic.h>
47 #include <sys/kmem.h>
48 #include <sys/uio.h>
49 #include <sys/conf.h>
50 #include <sys/mman.h>
51 #include <sys/pathname.h>
52 #include <sys/debug.h>
53 #include <sys/vmmeter.h>
54 #include <sys/vmsystm.h>
55 #include <sys/cmn_err.h>
56 #include <sys/acct.h>
57 #include <sys/dnlc.h>
58 #include <sys/swap.h>
59 
60 #include <sys/fs/ufs_fs.h>
61 #include <sys/fs/ufs_inode.h>
62 #include <sys/fs/ufs_fsdir.h>
63 #include <sys/fs/ufs_trans.h>
64 #include <sys/fs/ufs_panic.h>
65 #include <sys/fs/ufs_mount.h>
66 #include <sys/fs/ufs_bio.h>
67 #include <sys/fs/ufs_log.h>
68 #include <sys/fs/ufs_quota.h>
69 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
70 #include <sys/errno.h>
71 #include <sys/sysinfo.h>
72 
73 #include <vm/hat.h>
74 #include <vm/pvn.h>
75 #include <vm/as.h>
76 #include <vm/seg.h>
77 #include <vm/seg_map.h>
78 #include <vm/seg_vn.h>
79 #include <vm/rm.h>
80 #include <vm/anon.h>
81 #include <sys/swap.h>
82 #include <sys/dnlc.h>
83 
84 extern struct vnode *common_specvp(struct vnode *vp);
85 
/*
 * Error lock status.  ufs__fiolfs() computes one of these values from the
 * previous and the requested lock type and hands it to the reconciliation
 * routines and to its own cleanup code:
 *	UN_ERRLCK	an error lock is being released (elock -> ulock)
 *	SET_ERRLCK	an error lock is being applied to a fs that was not
 *			error locked
 *	RE_ERRLCK	an error lock is being applied on top of an
 *			existing error lock
 *	NO_ERRLCK	the request does not involve an error lock
 */
#define	UN_ERRLCK	(-1)
#define	SET_ERRLCK	1
#define	RE_ERRLCK	2
#define	NO_ERRLCK	0

/*
 * Index to be used in TSD for storing lockfs data.  The value stored
 * under this key is the head of a list of ulockfs_info_t entries.
 */
uint_t ufs_lockfs_key;

/*
 * One element of the per-thread lockfs list kept in TSD.
 *	next	next element of the list, NULL at the end
 *	ulp	lockfs structure this element refers to; IS_REC_VOP() treats
 *		an element whose ulp is NULL as a free (reusable) one
 *	flags	ULOCK_INFO_* flags
 */
typedef struct _ulockfs_info {
	struct _ulockfs_info *next;
	struct ulockfs *ulp;
	uint_t flags;
} ulockfs_info_t;

#define	ULOCK_INFO_FALLOCATE	0x00000001	/* fallocate thread */
104 
/*
 * Check in TSD whether this thread is already doing a VOP on this
 * file system.
 *
 * Walks the list starting at `head'.  On completion:
 *	found	is 1 if an element whose ulp equals `ulp' exists, else 0
 *	free	points to the first element with a NULL ulp that was seen
 *		before the walk ended, or is NULL if there was none
 *
 * The macro expands to a bare compound statement and evaluates its
 * arguments more than once, so pass plain variables only.
 */
#define	IS_REC_VOP(found, head, ulp, free)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (found = 0, free = NULL, _curr = head;	\
	    _curr != NULL; _curr = _curr->next) {	\
		if ((free == NULL) &&			\
		    (_curr->ulp == NULL))		\
			free = _curr;			\
		if (_curr->ulp == ulp) {		\
			found = 1;			\
			break;				\
		}					\
	}						\
}
123 
/*
 * Get the lockfs data from TSD so that lockfs handles a recursive VOP
 * properly.
 *
 * Walks the list starting at `head' and sets `info' to the first element
 * whose ulp equals `ulp', or to NULL if no element matches (including
 * the case where `head' is NULL).
 *
 * The macro expands to a bare compound statement; pass plain variables.
 */
#define	SEARCH_ULOCKFSP(head, ulp, info)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (_curr = head; _curr != NULL;		\
	    _curr = _curr->next) {			\
		if (_curr->ulp == ulp) {		\
			break;				\
		}					\
	}						\
							\
	info = _curr;					\
}
141 
142 /*
143  * Validate lockfs request
144  */
145 static int
146 ufs_getlfd(
147 	struct lockfs *lockfsp,		/* new lock request */
148 	struct lockfs *ul_lockfsp)	/* old lock state */
149 {
150 	int	error = 0;
151 
152 	/*
153 	 * no input flags defined
154 	 */
155 	if (lockfsp->lf_flags != 0) {
156 		error = EINVAL;
157 		goto errout;
158 	}
159 
160 	/*
161 	 * check key
162 	 */
163 	if (!LOCKFS_IS_ULOCK(ul_lockfsp))
164 		if (lockfsp->lf_key != ul_lockfsp->lf_key) {
165 			error = EINVAL;
166 			goto errout;
167 	}
168 
169 	lockfsp->lf_key = ul_lockfsp->lf_key + 1;
170 
171 errout:
172 	return (error);
173 }
174 
/*
 * ufs_checkaccton
 *	Report whether accounting is turned on for this fs.
 *	Returns EDEADLK when acct_fs_in_use() says so, 0 otherwise.
 */
int
ufs_checkaccton(struct vnode *vp)
{
	return (acct_fs_in_use(vp) ? EDEADLK : 0);
}
187 
188 /*
189  * ufs_checkswapon
190  *	check if local swapping is to file on this fs
191  */
192 int
193 ufs_checkswapon(struct vnode *vp)
194 {
195 	struct swapinfo	*sip;
196 
197 	mutex_enter(&swapinfo_lock);
198 	for (sip = swapinfo; sip; sip = sip->si_next)
199 		if (sip->si_vp->v_vfsp == vp->v_vfsp) {
200 			mutex_exit(&swapinfo_lock);
201 			return (EDEADLK);
202 		}
203 	mutex_exit(&swapinfo_lock);
204 	return (0);
205 }
206 
207 /*
208  * ufs_freeze
209  *	pend future accesses for current lock and desired lock
210  */
211 void
212 ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
213 {
214 	/*
215 	 * set to new lock type
216 	 */
217 	ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
218 	ulp->ul_lockfs.lf_key = lockfsp->lf_key;
219 	ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
220 	ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;
221 
222 	ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
223 }
224 
/*
 * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
 * starting the ufs_quiesce() protocol and decrement it only when a file
 * system no longer has to be in a quiescent state. This allows ufs_pageio()
 * to detect that another thread wants to quiesce a file system. See more
 * comments in ufs_pageio().
 *
 * ufs_quiesce() itself only asserts that the counter is non-zero.
 */
ulong_t ufs_quiesce_pend = 0;
233 
234 /*
235  * ufs_quiesce
236  *	wait for outstanding accesses to finish
237  */
238 int
239 ufs_quiesce(struct ulockfs *ulp)
240 {
241 	int error = 0;
242 	ulockfs_info_t *head;
243 	ulockfs_info_t *info;
244 
245 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
246 	SEARCH_ULOCKFSP(head, ulp, info);
247 
248 	/*
249 	 * Set a softlock to suspend future ufs_vnops so that
250 	 * this lockfs request will not be starved
251 	 */
252 	ULOCKFS_SET_SLOCK(ulp);
253 	ASSERT(ufs_quiesce_pend);
254 
255 	/* check if there is any outstanding ufs vnodeops calls */
256 	while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
257 		/*
258 		 * use timed version of cv_wait_sig() to make sure we don't
259 		 * miss a wake up call from ufs_pageio() when it doesn't use
260 		 * ul_lock.
261 		 *
262 		 * when a fallocate thread comes in, the only way it returns
263 		 * from this function is if there are no other vnode operations
264 		 * going on (remember fallocate threads are tracked using
265 		 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
266 		 * hasn't already grabbed the fs write lock.
267 		 */
268 		if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
269 			if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
270 				goto out;
271 		}
272 		if (!cv_timedwait_sig(&ulp->ul_cv, &ulp->ul_lock, lbolt + hz)) {
273 			error = EINTR;
274 			goto out;
275 		}
276 	}
277 
278 out:
279 	/*
280 	 * unlock the soft lock
281 	 */
282 	ULOCKFS_CLR_SLOCK(ulp);
283 
284 	return (error);
285 }
286 
287 /*
288  * ufs_flush_inode
289  */
290 int
291 ufs_flush_inode(struct inode *ip, void *arg)
292 {
293 	int	error;
294 	int	saverror	= 0;
295 
296 	/*
297 	 * wrong file system; keep looking
298 	 */
299 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
300 		return (0);
301 
302 	/*
303 	 * asynchronously push all the dirty pages
304 	 */
305 	if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
306 	    (error != EAGAIN))
307 		saverror = error;
308 	/*
309 	 * wait for io and discard all mappings
310 	 */
311 	if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
312 		saverror = error;
313 
314 	if (ITOV(ip)->v_type == VDIR) {
315 		dnlc_dir_purge(&ip->i_danchor);
316 	}
317 
318 	return (saverror);
319 }
320 
/*
 * ufs_flush
 *	Flush everything that is currently dirty; this includes invalidating
 *	any mappings.
 *
 *	The caller must hold the vfs lock (asserted below).
 *
 *	Errors from the individual steps do not stop the flush; the most
 *	recent one is remembered in saverror and returned at the end.
 *	Returns 0 if no step reported an error.
 */
int
ufs_flush(struct vfs *vfsp)
{
	int		error;
	int		saverror = 0;
	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
	struct fs	*fs		= ufsvfsp->vfs_fs;
	int		tdontblock = 0;	/* T_DONTBLOCK was set on entry */

	ASSERT(vfs_lock_held(vfsp));

	/*
	 * purge dnlc
	 */
	(void) dnlc_purge_vfsp(vfsp, 0);

	/*
	 * drain the delete and idle threads
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * flush and invalidate quota records
	 */
	(void) qsync(ufsvfsp);

	/*
	 * flush w/invalidate the inodes for vfsp
	 */
	if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
		saverror = error;

	/*
	 * synchronously flush superblock and summary info
	 */
	if (fs->fs_ronly == 0 && fs->fs_fmod) {
		fs->fs_fmod = 0;
		TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
	}
	/*
	 * flush w/invalidate block device pages and buf cache
	 */
	if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
	    (offset_t)0, 0, B_INVAL, CRED(), NULL)) > 0)
		saverror = error;

	(void) bflush((dev_t)vfsp->vfs_dev);
	(void) bfinval((dev_t)vfsp->vfs_dev, 0);

	/*
	 * drain the delete and idle threads again
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * play with the clean flag; only done when nothing failed so far
	 */
	if (saverror == 0)
		ufs_checkclean(vfsp);

	/*
	 * Flush any outstanding transactions and roll the log
	 * only if we are supposed to do so, i.e. LDL_NOROLL is not set.
	 * We cannot simply check for fs_ronly here since fsck also may
	 * use this code to roll the log on a read-only filesystem, e.g.
	 * root during early stages of boot; if other than a sanity check is
	 * done, it will clear LDL_NOROLL before.
	 * In addition we assert that the deltamap does not contain any deltas
	 * in case LDL_NOROLL is set since this is not supposed to happen.
	 */
	if (TRANS_ISTRANS(ufsvfsp)) {
		ml_unit_t	*ul	= ufsvfsp->vfs_log;
		mt_map_t	*mtm	= ul->un_deltamap;

		if (ul->un_flags & LDL_NOROLL) {
			ASSERT(mtm->mtm_nme == 0);
		} else {
			/*
			 * Do not set T_DONTBLOCK if there is a
			 * transaction opened by caller.  Remember whether
			 * the flag was already set so that it is only
			 * cleared below if it was set here.
			 */
			if (curthread->t_flag & T_DONTBLOCK)
				tdontblock = 1;
			else
				curthread->t_flag |= T_DONTBLOCK;

			TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
			    TOP_COMMIT_SIZE, error);

			/*
			 * NOTE(review): saverror is handed to TRANS_END_SYNC,
			 * presumably so the macro can store its status in
			 * it -- confirm against the macro definition.
			 */
			if (!error) {
				TRANS_END_SYNC(ufsvfsp, saverror,
				    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
			}

			if (tdontblock == 0)
				curthread->t_flag &= ~T_DONTBLOCK;

			logmap_roll_dev(ufsvfsp->vfs_log);
		}
	}

	return (saverror);
}
431 
432 /*
433  * ufs_thaw_wlock
434  *	special processing when thawing down to wlock
435  */
436 static int
437 ufs_thaw_wlock(struct inode *ip, void *arg)
438 {
439 	/*
440 	 * wrong file system; keep looking
441 	 */
442 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
443 		return (0);
444 
445 	/*
446 	 * iupdat refuses to clear flags if the fs is read only.  The fs
447 	 * may become read/write during the lock and we wouldn't want
448 	 * these inodes being written to disk.  So clear the flags.
449 	 */
450 	rw_enter(&ip->i_contents, RW_WRITER);
451 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
452 	rw_exit(&ip->i_contents);
453 
454 	/*
455 	 * pages are mlocked -- fail wlock
456 	 */
457 	if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
458 		return (EBUSY);
459 
460 	return (0);
461 }
462 
463 /*
464  * ufs_thaw_hlock
465  *	special processing when thawing down to hlock or elock
466  */
467 static int
468 ufs_thaw_hlock(struct inode *ip, void *arg)
469 {
470 	struct vnode	*vp	= ITOV(ip);
471 
472 	/*
473 	 * wrong file system; keep looking
474 	 */
475 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
476 		return (0);
477 
478 	/*
479 	 * blow away all pages - even if they are mlocked
480 	 */
481 	do {
482 		(void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
483 	} while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
484 	rw_enter(&ip->i_contents, RW_WRITER);
485 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
486 	rw_exit(&ip->i_contents);
487 
488 	return (0);
489 }
490 
/*
 * ufs_thaw
 *	thaw file system lock down to current value
 *
 *	For a write, hard or error lock the file system is flushed and the
 *	superblock is closed for writing; for every other lock type the
 *	restrictions are lifted again.  In all cases the waiters on ul_cv
 *	are woken up before returning.
 *
 *	Returns 0 or the error of the step that failed.  For hard and error
 *	locks the result is forced to 0.
 */
int
ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
{
	int		error	= 0;
	/* remember whether deletes were being held off on entry */
	int		noidel	= (int)(ulp->ul_flag & ULOCKFS_NOIDEL);

	/*
	 * if wlock or hlock or elock
	 */
	if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
	    ULOCKFS_IS_ELOCK(ulp)) {

		/*
		 * don't keep access times
		 * don't free deleted files
		 * if superblock writes are allowed, limit them to me for now
		 */
		ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
		if (ulp->ul_sbowner != (kthread_id_t)-1)
			ulp->ul_sbowner = curthread;

		/*
		 * wait for writes for deleted files and superblock updates
		 */
		(void) ufs_flush(vfsp);

		/*
		 * now make sure the quota file is up-to-date
		 *	expensive; but effective
		 * (only the result of this second flush is kept)
		 */
		error = ufs_flush(vfsp);
		/*
		 * no one can write the superblock
		 */
		ulp->ul_sbowner = (kthread_id_t)-1;

		/*
		 * special processing for wlock/hlock/elock
		 */
		if (ULOCKFS_IS_WLOCK(ulp)) {
			if (error)
				goto errout;
			error = bfinval(ufsvfsp->vfs_dev, 0);
			if (error)
				goto errout;
			error = ufs_scan_inodes(0, ufs_thaw_wlock,
			    (void *)ufsvfsp, ufsvfsp);
			if (error)
				goto errout;
		}
		if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
			/* hard and error locks cannot fail; drop any error */
			error = 0;
			(void) ufs_scan_inodes(0, ufs_thaw_hlock,
			    (void *)ufsvfsp, ufsvfsp);
			(void) bfinval(ufsvfsp->vfs_dev, 1);
		}
	} else {

		/*
		 * okay to keep access times
		 * okay to free deleted files
		 * okay to write the superblock
		 */
		ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
		ulp->ul_sbowner = NULL;

		/*
		 * flush in case deleted files are in memory
		 */
		if (noidel) {
			if (error = ufs_flush(vfsp))
				goto errout;
		}
	}

errout:
	/* reached on success as well: wake up everybody pended on the lock */
	cv_broadcast(&ulp->ul_cv);
	return (error);
}
574 
/*
 * ufs_reconcile_fs
 *	reconcile incore superblock with ondisk superblock
 *
 *	errlck is one of the *_ERRLCK values; UN_ERRLCK adds the checks
 *	that apply when an error lock is being released.
 *
 *	Returns
 *		0	in-core superblock updated
 *		EIO	the superblock or the summary info could not be read
 *		EAGAIN	error lock release while the fs is being repaired
 *		EBUSY	error lock release before the fs has been repaired
 *		EACCES	the on-disk superblock differs in a field that is
 *			not allowed to change, or it is bad/inconsistent
 *			while the in-core copy is marked FSLOG
 */
int
ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
{
	struct fs	*mfs; 	/* in-memory superblock */
	struct fs	*dfs;	/* on-disk   superblock */
	struct buf	*bp;	/* on-disk   superblock buf */
	int		 needs_unlock;
	/* only set and only read when errlck == UN_ERRLCK */
	char		 finished_fsclean;

	mfs = ufsvfsp->vfs_fs;

	/*
	 * get the on-disk copy of the superblock
	 */
	bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
	bp->b_flags |= (B_STALE|B_AGE);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	dfs = bp->b_un.b_fs;

	/* error locks may only unlock after the fs has been made consistent */
	if (errlck == UN_ERRLCK) {
		if (dfs->fs_clean == FSFIX) {	/* being repaired */
			brelse(bp);
			return (EAGAIN);
		}
		/* repair not yet started? */
		finished_fsclean = TRANS_ISTRANS(ufsvfsp)? FSLOG: FSCLEAN;
		if (dfs->fs_clean != finished_fsclean) {
			brelse(bp);
			return (EBUSY);
		}
	}

	/*
	 * if superblock has changed too much, abort
	 */
	if ((mfs->fs_sblkno		!= dfs->fs_sblkno) ||
	    (mfs->fs_cblkno		!= dfs->fs_cblkno) ||
	    (mfs->fs_iblkno		!= dfs->fs_iblkno) ||
	    (mfs->fs_dblkno		!= dfs->fs_dblkno) ||
	    (mfs->fs_cgoffset		!= dfs->fs_cgoffset) ||
	    (mfs->fs_cgmask		!= dfs->fs_cgmask) ||
	    (mfs->fs_bsize		!= dfs->fs_bsize) ||
	    (mfs->fs_fsize		!= dfs->fs_fsize) ||
	    (mfs->fs_frag		!= dfs->fs_frag) ||
	    (mfs->fs_bmask		!= dfs->fs_bmask) ||
	    (mfs->fs_fmask		!= dfs->fs_fmask) ||
	    (mfs->fs_bshift		!= dfs->fs_bshift) ||
	    (mfs->fs_fshift		!= dfs->fs_fshift) ||
	    (mfs->fs_fragshift		!= dfs->fs_fragshift) ||
	    (mfs->fs_fsbtodb		!= dfs->fs_fsbtodb) ||
	    (mfs->fs_sbsize		!= dfs->fs_sbsize) ||
	    (mfs->fs_nindir		!= dfs->fs_nindir) ||
	    (mfs->fs_nspf		!= dfs->fs_nspf) ||
	    (mfs->fs_trackskew		!= dfs->fs_trackskew) ||
	    (mfs->fs_cgsize		!= dfs->fs_cgsize) ||
	    (mfs->fs_ntrak		!= dfs->fs_ntrak) ||
	    (mfs->fs_nsect		!= dfs->fs_nsect) ||
	    (mfs->fs_spc		!= dfs->fs_spc) ||
	    (mfs->fs_cpg		!= dfs->fs_cpg) ||
	    (mfs->fs_ipg		!= dfs->fs_ipg) ||
	    (mfs->fs_fpg		!= dfs->fs_fpg) ||
	    (mfs->fs_postblformat	!= dfs->fs_postblformat) ||
	    (mfs->fs_magic		!= dfs->fs_magic)) {
		brelse(bp);
		return (EACCES);
	}
	/*
	 * a bad or inconsistent on-disk superblock is only fatal here if
	 * the in-core superblock is marked FSLOG
	 */
	if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
		if (mfs->fs_clean == FSLOG) {
			brelse(bp);
			return (EACCES);
		}

	/*
	 * get new summary info
	 */
	if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
		brelse(bp);
		return (EIO);
	}

	/*
	 * release old summary info and update in-memory superblock
	 * (the old size is still in mfs->fs_cssize at this point; it is
	 * only overwritten further down)
	 */
	kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
	mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;	/* Only entry 0 used */

	/*
	 * update fields allowed to change
	 */
	mfs->fs_size		= dfs->fs_size;
	mfs->fs_dsize		= dfs->fs_dsize;
	mfs->fs_ncg		= dfs->fs_ncg;
	mfs->fs_minfree		= dfs->fs_minfree;
	mfs->fs_rotdelay	= dfs->fs_rotdelay;
	mfs->fs_rps		= dfs->fs_rps;
	mfs->fs_maxcontig	= dfs->fs_maxcontig;
	mfs->fs_maxbpg		= dfs->fs_maxbpg;
	mfs->fs_csmask		= dfs->fs_csmask;
	mfs->fs_csshift		= dfs->fs_csshift;
	mfs->fs_optim		= dfs->fs_optim;
	mfs->fs_csaddr		= dfs->fs_csaddr;
	mfs->fs_cssize		= dfs->fs_cssize;
	mfs->fs_ncyl		= dfs->fs_ncyl;
	mfs->fs_cstotal		= dfs->fs_cstotal;
	mfs->fs_reclaim		= dfs->fs_reclaim;

	/*
	 * a pending or interrupted reclaim on disk is (re)started here and
	 * marked as running in the in-core superblock
	 */
	if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
		mfs->fs_reclaim &= ~FS_RECLAIM;
		mfs->fs_reclaim |=  FS_RECLAIMING;
		ufs_thread_start(&ufsvfsp->vfs_reclaim,
		    ufs_thread_reclaim, vfsp);
	}

	/* XXX What to do about sparecon? */

	/* XXX need to copy volume label */

	/*
	 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
	 * or if error-locked and ondisk is now clean
	 *
	 * vfs_lock is taken here only if this thread does not hold it yet.
	 */
	needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
	if (needs_unlock)
		mutex_enter(&ufsvfsp->vfs_lock);

	if (errlck == UN_ERRLCK) {
		if (finished_fsclean == dfs->fs_clean)
			mfs->fs_clean = finished_fsclean;
		else
			mfs->fs_clean = FSBAD;
		mfs->fs_state = FSOKAY - dfs->fs_time;
	}

	if (FSOKAY != dfs->fs_state + dfs->fs_time ||
	    (dfs->fs_clean == FSBAD))
		mfs->fs_clean = FSBAD;

	if (needs_unlock)
		mutex_exit(&ufsvfsp->vfs_lock);

	brelse(bp);

	return (0);
}
727 
728 /*
729  * ufs_reconcile_inode
730  *	reconcile ondisk inode with incore inode
731  */
732 static int
733 ufs_reconcile_inode(struct inode *ip, void *arg)
734 {
735 	int		i;
736 	int		ndaddr;
737 	int		niaddr;
738 	struct dinode	*dp;		/* ondisk inode */
739 	struct buf	*bp	= NULL;
740 	uid_t		d_uid;
741 	gid_t		d_gid;
742 	int		error = 0;
743 	struct fs	*fs;
744 
745 	/*
746 	 * not an inode we care about
747 	 */
748 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
749 		return (0);
750 
751 	fs = ip->i_fs;
752 
753 	/*
754 	 * Inode reconciliation fails: we made the filesystem quiescent
755 	 * and we did a ufs_flush() before calling ufs_reconcile_inode()
756 	 * and thus the inode should not have been changed inbetween.
757 	 * Any discrepancies indicate a logic error and a pretty
758 	 * significant run-state inconsistency we should complain about.
759 	 */
760 	if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
761 		cmn_err(CE_WARN, "%s: Inode reconciliation failed for"
762 		    "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
763 		return (EINVAL);
764 	}
765 
766 	/*
767 	 * get the dinode
768 	 */
769 	bp = UFS_BREAD(ip->i_ufsvfs,
770 	    ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
771 	    (int)fs->fs_bsize);
772 	if (bp->b_flags & B_ERROR) {
773 		brelse(bp);
774 		return (EIO);
775 	}
776 	dp  = bp->b_un.b_dino;
777 	dp += itoo(fs, ip->i_number);
778 
779 	/*
780 	 * handle Sun's implementation of EFT
781 	 */
782 	d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
783 	d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (uid_t)dp->di_sgid;
784 
785 	rw_enter(&ip->i_contents, RW_WRITER);
786 
787 	/*
788 	 * some fields are not allowed to change
789 	 */
790 	if ((ip->i_mode  != dp->di_mode) ||
791 	    (ip->i_gen   != dp->di_gen) ||
792 	    (ip->i_uid   != d_uid) ||
793 	    (ip->i_gid   != d_gid)) {
794 		error = EACCES;
795 		goto out;
796 	}
797 
798 	/*
799 	 * and some are allowed to change
800 	 */
801 	ip->i_size		= dp->di_size;
802 	ip->i_ic.ic_flags	= dp->di_ic.ic_flags;
803 	ip->i_blocks		= dp->di_blocks;
804 	ip->i_nlink		= dp->di_nlink;
805 	if (ip->i_flag & IFASTSYMLNK) {
806 		ndaddr = 1;
807 		niaddr = 0;
808 	} else {
809 		ndaddr = NDADDR;
810 		niaddr = NIADDR;
811 	}
812 	for (i = 0; i < ndaddr; ++i)
813 		ip->i_db[i] = dp->di_db[i];
814 	for (i = 0; i < niaddr; ++i)
815 		ip->i_ib[i] = dp->di_ib[i];
816 
817 out:
818 	rw_exit(&ip->i_contents);
819 	brelse(bp);
820 	return (error);
821 }
822 
823 /*
824  * ufs_reconcile
825  *	reconcile ondisk superblock/inodes with any incore
826  */
827 static int
828 ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
829 {
830 	int	error = 0;
831 
832 	/*
833 	 * get rid of as much inmemory data as possible
834 	 */
835 	(void) ufs_flush(vfsp);
836 
837 	/*
838 	 * reconcile the superblock and inodes
839 	 */
840 	if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
841 		return (error);
842 	if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
843 		return (error);
844 	/*
845 	 * allocation blocks may be incorrect; get rid of them
846 	 */
847 	(void) ufs_flush(vfsp);
848 
849 	return (error);
850 }
851 
/*
 * ufs_fiolfs
 *	File system locking entry point.  Forwards the request to
 *	ufs__fiolfs() with from_user set, and returns its result.
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
	int	from_user = 1;

	return (ufs__fiolfs(vp, lockfsp, from_user, from_log));
}
860 
/*
 * ufs__fiolfs
 *	kernel-internal interface, also used by fix-on-panic
 *
 *	Change the lock state of the file system vp lives on to the state
 *	requested in lockfsp.
 *
 *	from_user	non-zero for requests forwarded by ufs_fiolfs(); such
 *			a request may raise a ufs_fault() for a newly set
 *			error lock (see the end of the success path)
 *	from_log	non-zero skips the accounting and swap file checks
 *
 *	Outline: suspend the reclaim and delete threads, take the vfs lock
 *	and ul_lock, validate the transition, freeze, quiesce, reconcile
 *	(when coming from a write or error lock), flush, thaw.  On failure
 *	after the freeze the previous lock state is restored at errout.
 *
 *	Returns 0 on success, otherwise an errno value: EINVAL or EIO for a
 *	bad argument, EBUSY/EIO/EINVAL for a refused transition, EINTR when
 *	the quiesce is interrupted, EDEADLK for accounting or swap on the
 *	fs, or whatever the reconcile/flush/thaw step returned.
 */
int
ufs__fiolfs(
	struct vnode *vp,
	struct lockfs *lockfsp,
	int from_user,
	int from_log)
{
	struct ulockfs	*ulp;
	struct lockfs	lfs;		/* copy of the previous lock state */
	int		error;
	struct vfs	*vfsp;
	struct ufsvfs	*ufsvfsp;
	int		 errlck		= NO_ERRLCK;
	int		 poll_events	= POLLPRI;
	extern struct pollhead ufs_pollhd;
	ulockfs_info_t *head;
	ulockfs_info_t *info;		/* this thread's TSD entry, or NULL */
	int signal = 0;			/* quiesce was interrupted */

	/* check valid lock type */
	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
		return (EINVAL);

	if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
		return (EIO);

	vfsp = vp->v_vfsp;
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	ulp = &ufsvfsp->vfs_ulockfs;

	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * Suspend both the reclaim thread and the delete thread.
	 * This must be done outside the lockfs locking protocol.
	 */
	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
	ufs_thread_suspend(&ufsvfsp->vfs_delete);

	/*
	 * Acquire vfs_reflock around ul_lock to avoid deadlock with
	 * umount/remount/sync.
	 */
	vfs_lock_wait(vfsp);
	mutex_enter(&ulp->ul_lock);
	atomic_add_long(&ufs_quiesce_pend, 1);

	/*
	 * Quit if there is another lockfs request in progress
	 * that is waiting for existing ufs_vnops to complete.
	 */
	if (ULOCKFS_IS_BUSY(ulp)) {
		error = EBUSY;
		goto errexit;
	}

	/* cannot unlock or downgrade a hard-lock */
	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto errexit;
	}

	/* an error lock may be unlocked or relocked, only */
	if (ULOCKFS_IS_ELOCK(ulp)) {
		if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * a read-only error lock may only be upgraded to an
	 * error lock or hard lock
	 */
	if (ULOCKFS_IS_ROELOCK(ulp)) {
		if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * until read-only error locks are fully implemented
	 * just return EINVAL
	 */
	if (LOCKFS_IS_ROELOCK(lockfsp)) {
		error = EINVAL;
		goto errexit;
	}

	/*
	 * an error lock may only be applied if the file system is
	 * unlocked or already error locked.
	 * (this is to prevent the case where a fs gets changed out from
	 * underneath a fs that is locked for backup,
	 * that is, name/delete/write-locked.)
	 */
	if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
	    !ULOCKFS_IS_ROELOCK(ulp)) &&
	    (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
		error = EBUSY;
		goto errexit;
	}

	/* get and validate the input lockfs request */
	if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
		goto errexit;

	/*
	 * save current ulockfs struct; it is needed to decide what kind of
	 * transition this is and to restore the old state on failure
	 */
	bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));

	/*
	 * Freeze the file system (pend future accesses)
	 */
	ufs_freeze(ulp, lockfsp);

	/*
	 * Set locking in progress because ufs_quiesce may free the
	 * ul_lock mutex.
	 */
	ULOCKFS_SET_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_SET_BUSY(&ulp->ul_lockfs);

	/*
	 * We need to unset FWLOCK status before we call ufs_quiesce
	 * so that the thread doesn't get suspended. We do this only if
	 * this (fallocate) thread requested an unlock operation.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (!ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_CLR_FWLOCK(ulp);
	}

	/*
	 * Quiesce (wait for outstanding accesses to finish)
	 */
	if (error = ufs_quiesce(ulp)) {
		/*
		 * Interrupted due to signal. There could still be
		 * pending vnops.
		 */
		signal = 1;

		/*
		 * We do broadcast because lock-status
		 * could be reverted to old status.
		 */
		cv_broadcast(&ulp->ul_cv);
		goto errout;
	}

	/*
	 * If the fallocate thread requested a write fs lock operation
	 * then we set fwlock status in the ulp.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_SET_FWLOCK(ulp);
	}

	/*
	 * can't wlock or (ro)elock fs with accounting or local swap file
	 */
	if ((ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
	    ULOCKFS_IS_ROELOCK(ulp)) && !from_log) {
		if (error = ufs_checkaccton(vp))
			goto errout;
		if (error = ufs_checkswapon(vp))
			goto errout;
	}

	/*
	 * save error lock status to pass down to reconciliation
	 * routines and for later cleanup
	 */
	if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
		errlck = UN_ERRLCK;

	if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
		int needs_unlock;
		int needs_sbwrite;

		poll_events |= POLLERR;
		errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ?
		    RE_ERRLCK : SET_ERRLCK;

		/* take vfs_lock only if this thread does not hold it yet */
		needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
		if (needs_unlock)
			mutex_enter(&ufsvfsp->vfs_lock);

		/* disable delayed i/o */
		needs_sbwrite = 0;

		if (errlck == SET_ERRLCK) {
			ufsvfsp->vfs_fs->fs_clean = FSBAD;
			needs_sbwrite = 1;
		}

		needs_sbwrite |= ufsvfsp->vfs_dio;
		ufsvfsp->vfs_dio = 0;

		if (needs_unlock)
			mutex_exit(&ufsvfsp->vfs_lock);

		/* push the changed superblock out before going on */
		if (needs_sbwrite) {
			ulp->ul_sbowner = curthread;
			TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);

			if (needs_unlock)
				mutex_enter(&ufsvfsp->vfs_lock);

			ufsvfsp->vfs_fs->fs_fmod = 0;

			if (needs_unlock)
				mutex_exit(&ufsvfsp->vfs_lock);
		}
	}

	/*
	 * reconcile superblock and inodes if was wlocked
	 * (or error locked)
	 */
	if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
		if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
			goto errout;
		/*
		 * in case the fs grew; reset the metadata map for logging tests
		 */
		TRANS_MATA_UMOUNT(ufsvfsp);
		TRANS_MATA_MOUNT(ufsvfsp);
		TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
	}

	/*
	 * At least everything *currently* dirty goes out.
	 * A flush error does not stop a hard lock or an error lock.
	 */

	if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
	    !ULOCKFS_IS_ELOCK(ulp))
		goto errout;

	/*
	 * thaw file system and wakeup pended processes
	 * (again, an error is ignored for hard and error locks)
	 */
	if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
		if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
			goto errout;

	/*
	 * reset modified flag if not already write locked
	 */
	if (!LOCKFS_IS_WLOCK(&lfs))
		ULOCKFS_CLR_MOD(ulp);

	/*
	 * idle the lock struct
	 */
	ULOCKFS_CLR_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

	/*
	 * free current comment, i.e. the comment of the previous lock
	 * that was saved in lfs
	 */
	if (lfs.lf_comment && lfs.lf_comlen != 0) {
		kmem_free(lfs.lf_comment, lfs.lf_comlen);
		lfs.lf_comment = NULL;
		lfs.lf_comlen = 0;
	}

	/* do error lock cleanup */
	if (errlck == UN_ERRLCK)
		ufsfx_unlockfs(ufsvfsp);

	else if (errlck == RE_ERRLCK)
		ufsfx_lockfs(ufsvfsp);

	/* don't allow error lock from user to invoke panic */
	else if (from_user && errlck == SET_ERRLCK &&
	    !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
		(void) ufs_fault(ufsvfsp->vfs_root,
		    ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
		    ulp->ul_lockfs.lf_comment: "user-applied error lock");

	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
		poll_events |= POLLERR;

	pollwakeup(&ufs_pollhd, poll_events);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	/* success is always reported as 0, whatever `error' holds */
	return (0);

errout:
	/*
	 * Lock failed. Reset the old lock in ufsvfs if not hard locked.
	 */
	if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
		bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
		ulp->ul_fs_lock = (1 << lfs.lf_lock);
	}

	/*
	 * Don't call ufs_thaw() when there's a signal during
	 * ufs quiesce operation as it can lead to deadlock
	 * with getpage.
	 */
	if (signal == 0)
		(void) ufs_thaw(vfsp, ufsvfsp, ulp);

	ULOCKFS_CLR_BUSY(ulp);
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

errexit:
	/* undo, in reverse order, what was set up before the checks */
	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (error);
}
1201 
1202 /*
1203  * fiolfss
1204  * 	return the current file system locking state info
1205  */
1206 int
1207 ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
1208 {
1209 	struct ulockfs	*ulp;
1210 
1211 	if (!vp || !vp->v_vfsp || !VTOI(vp))
1212 		return (EINVAL);
1213 
1214 	/* file system has been forcibly unmounted */
1215 	if (VTOI(vp)->i_ufsvfs == NULL)
1216 		return (EIO);
1217 
1218 	ulp = VTOUL(vp);
1219 
1220 	if (ULOCKFS_IS_HLOCK(ulp)) {
1221 		*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1222 		return (0);
1223 	}
1224 
1225 	mutex_enter(&ulp->ul_lock);
1226 
1227 	*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1228 
1229 	if (ULOCKFS_IS_MOD(ulp))
1230 		lockfsp->lf_flags |= LOCKFS_MOD;
1231 
1232 	mutex_exit(&ulp->ul_lock);
1233 
1234 	return (0);
1235 }
1236 
1237 /*
1238  * ufs_check_lockfs
1239  *	check whether a ufs_vnops conflicts with the file system lock
1240  */
1241 int
1242 ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
1243 {
1244 	k_sigset_t	smask;
1245 	int		sig, slock;
1246 
1247 	ASSERT(MUTEX_HELD(&ulp->ul_lock));
1248 
1249 	while (ulp->ul_fs_lock & mask) {
1250 		slock = (int)ULOCKFS_IS_SLOCK(ulp);
1251 		if ((curthread->t_flag & T_DONTPEND) && !slock) {
1252 			curthread->t_flag |= T_WOULDBLOCK;
1253 			return (EAGAIN);
1254 		}
1255 		curthread->t_flag &= ~T_WOULDBLOCK;
1256 
1257 		/*
1258 		 * In the case of an onerr umount of the fs, threads could
1259 		 * have blocked before coming into ufs_check_lockfs and
1260 		 * need to check for the special case of ELOCK and
1261 		 * vfs_dontblock being set which would indicate that the fs
1262 		 * is on its way out and will not return therefore making
1263 		 * EIO the appropriate response.
1264 		 */
1265 		if (ULOCKFS_IS_HLOCK(ulp) ||
1266 		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1267 			return (EIO);
1268 
1269 		/*
1270 		 * wait for lock status to change
1271 		 */
1272 		if (slock || ufsvfsp->vfs_nointr) {
1273 			cv_wait(&ulp->ul_cv, &ulp->ul_lock);
1274 		} else {
1275 			sigintr(&smask, 1);
1276 			sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
1277 			sigunintr(&smask);
1278 			if ((!sig && (ulp->ul_fs_lock & mask)) ||
1279 			    ufsvfsp->vfs_dontblock)
1280 				return (EINTR);
1281 		}
1282 	}
1283 
1284 	if (mask & ULOCKFS_FWLOCK) {
1285 		atomic_add_long(&ulp->ul_falloc_cnt, 1);
1286 		ULOCKFS_SET_FALLOC(ulp);
1287 	} else {
1288 		atomic_add_long(&ulp->ul_vnops_cnt, 1);
1289 	}
1290 
1291 	return (0);
1292 }
1293 
1294 /*
1295  * Check whether we came across the handcrafted lockfs protocol path. We can't
1296  * simply check for T_DONTBLOCK here as one would assume since this can also
1297  * falsely catch recursive VOP's going to a different filesystem, instead we
1298  * check if we already hold the ulockfs->ul_lock mutex.
1299  */
1300 static int
1301 ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
1302 {
1303 	return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
1304 }
1305 
1306 /*
1307  * ufs_lockfs_begin - start the lockfs locking protocol
1308  */
1309 int
1310 ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1311 {
1312 	int 		error;
1313 	int		rec_vop;
1314 	ushort_t	op_cnt_incremented = 0;
1315 	ulong_t		*ctr;
1316 	struct ulockfs *ulp;
1317 	ulockfs_info_t	*ulockfs_info;
1318 	ulockfs_info_t	*ulockfs_info_free;
1319 	ulockfs_info_t	*ulockfs_info_temp;
1320 
1321 	/*
1322 	 * file system has been forcibly unmounted
1323 	 */
1324 	if (ufsvfsp == NULL)
1325 		return (EIO);
1326 
1327 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1328 
1329 	/*
1330 	 * Do lockfs protocol
1331 	 */
1332 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1333 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1334 
1335 	/*
1336 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1337 	 * path and bail out in that case.
1338 	 */
1339 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1340 		*ulpp = NULL;
1341 		return (0);
1342 	} else {
1343 		if (ulockfs_info_free == NULL) {
1344 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1345 			    kmem_zalloc(sizeof (ulockfs_info_t),
1346 			    KM_NOSLEEP)) == NULL) {
1347 				*ulpp = NULL;
1348 				return (ENOMEM);
1349 			}
1350 		}
1351 	}
1352 
1353 	/*
1354 	 * First time VOP call
1355 	 *
1356 	 * Increment the ctr irrespective of the lockfs state. If the lockfs
1357 	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
1358 	 * before incrementing we need to check if there is a pending quiesce
1359 	 * request because if we have a continuous stream of ufs_lockfs_begin
1360 	 * requests pounding on a few cpu's then the ufs_quiesce thread might
1361 	 * never see the value of zero for ctr - a livelock kind of scenario.
1362 	 */
1363 	ctr = (mask & ULOCKFS_FWLOCK) ?
1364 	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
1365 	if (!ULOCKFS_IS_SLOCK(ulp)) {
1366 		atomic_add_long(ctr, 1);
1367 		op_cnt_incremented++;
1368 	}
1369 
1370 	/*
1371 	 * If the lockfs state (indicated by ul_fs_lock) is not just
1372 	 * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs
1373 	 * where there is a check with an appropriate mask to selectively allow
1374 	 * operations permitted for that kind of lockfs state.
1375 	 *
1376 	 * Even these selective operations should not be allowed to go through
1377 	 * if a lockfs request is in progress because that could result in inode
1378 	 * modifications during a quiesce and could hence result in inode
1379 	 * reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient,
1380 	 * so make use of ufs_quiesce_pend to disallow vnode operations when a
1381 	 * quiesce is in progress.
1382 	 */
1383 	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1384 		if (op_cnt_incremented)
1385 			if (!atomic_add_long_nv(ctr, -1))
1386 				cv_broadcast(&ulp->ul_cv);
1387 		mutex_enter(&ulp->ul_lock);
1388 		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1389 		mutex_exit(&ulp->ul_lock);
1390 		if (error) {
1391 			if (ulockfs_info_free == NULL)
1392 				kmem_free(ulockfs_info_temp,
1393 				    sizeof (ulockfs_info_t));
1394 			return (error);
1395 		}
1396 	} else {
1397 		/*
1398 		 * This is the common case of file system in a unlocked state.
1399 		 *
1400 		 * If a file system is unlocked, we would expect the ctr to have
1401 		 * been incremented by now. But this will not be true when a
1402 		 * quiesce is winding up - SLOCK was set when we checked before
1403 		 * incrementing the ctr, but by the time we checked for
1404 		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is okay
1405 		 * to take ul_lock and go through the slow path in this uncommon
1406 		 * case.
1407 		 */
1408 		if (op_cnt_incremented == 0) {
1409 			mutex_enter(&ulp->ul_lock);
1410 			error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1411 			if (error) {
1412 				mutex_exit(&ulp->ul_lock);
1413 				if (ulockfs_info_free == NULL)
1414 					kmem_free(ulockfs_info_temp,
1415 					    sizeof (ulockfs_info_t));
1416 				return (error);
1417 			}
1418 			if (mask & ULOCKFS_FWLOCK)
1419 				ULOCKFS_SET_FALLOC(ulp);
1420 			mutex_exit(&ulp->ul_lock);
1421 		} else if (mask & ULOCKFS_FWLOCK) {
1422 			mutex_enter(&ulp->ul_lock);
1423 			ULOCKFS_SET_FALLOC(ulp);
1424 			mutex_exit(&ulp->ul_lock);
1425 		}
1426 	}
1427 
1428 	if (ulockfs_info_free != NULL) {
1429 		ulockfs_info_free->ulp = ulp;
1430 		if (mask & ULOCKFS_FWLOCK)
1431 			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1432 	} else {
1433 		ulockfs_info_temp->ulp = ulp;
1434 		ulockfs_info_temp->next = ulockfs_info;
1435 		if (mask & ULOCKFS_FWLOCK)
1436 			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1437 		ASSERT(ufs_lockfs_key != 0);
1438 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1439 	}
1440 
1441 	curthread->t_flag |= T_DONTBLOCK;
1442 	return (0);
1443 }
1444 
1445 /*
1446  * Check whether we are returning from the top level VOP.
1447  */
1448 static int
1449 ufs_lockfs_top_vop_return(ulockfs_info_t *head)
1450 {
1451 	ulockfs_info_t *info;
1452 	int result = 1;
1453 
1454 	for (info = head; info != NULL; info = info->next) {
1455 		if (info->ulp != NULL) {
1456 			result = 0;
1457 			break;
1458 		}
1459 	}
1460 
1461 	return (result);
1462 }
1463 
1464 /*
1465  * ufs_lockfs_end - terminate the lockfs locking protocol
1466  */
1467 void
1468 ufs_lockfs_end(struct ulockfs *ulp)
1469 {
1470 	ulockfs_info_t *info;
1471 	ulockfs_info_t *head;
1472 
1473 	/*
1474 	 * end-of-VOP protocol
1475 	 */
1476 	if (ulp == NULL)
1477 		return;
1478 
1479 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1480 	SEARCH_ULOCKFSP(head, ulp, info);
1481 
1482 	/*
1483 	 * If we're called from a first level VOP, we have to have a
1484 	 * valid ulockfs record in the TSD.
1485 	 */
1486 	ASSERT(info != NULL);
1487 
1488 	/*
1489 	 * Invalidate the ulockfs record.
1490 	 */
1491 	info->ulp = NULL;
1492 
1493 	if (ufs_lockfs_top_vop_return(head))
1494 		curthread->t_flag &= ~T_DONTBLOCK;
1495 
1496 	/* fallocate thread */
1497 	if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) {
1498 		/* Clear the thread's fallocate state */
1499 		info->flags &= ~ULOCK_INFO_FALLOCATE;
1500 		if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1)) {
1501 			mutex_enter(&ulp->ul_lock);
1502 			ULOCKFS_CLR_FALLOC(ulp);
1503 			cv_broadcast(&ulp->ul_cv);
1504 			mutex_exit(&ulp->ul_lock);
1505 		}
1506 	} else  { /* normal thread */
1507 		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
1508 			cv_broadcast(&ulp->ul_cv);
1509 	}
1510 }
1511 
1512 /*
1513  * ufs_lockfs_trybegin - try to start the lockfs locking protocol without
1514  * blocking.
1515  */
1516 int
1517 ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1518 {
1519 	int 		error = 0;
1520 	int		rec_vop;
1521 	ushort_t	op_cnt_incremented = 0;
1522 	ulong_t		*ctr;
1523 	struct ulockfs *ulp;
1524 	ulockfs_info_t	*ulockfs_info;
1525 	ulockfs_info_t	*ulockfs_info_free;
1526 	ulockfs_info_t	*ulockfs_info_temp;
1527 
1528 	/*
1529 	 * file system has been forcibly unmounted
1530 	 */
1531 	if (ufsvfsp == NULL)
1532 		return (EIO);
1533 
1534 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1535 
1536 	/*
1537 	 * Do lockfs protocol
1538 	 */
1539 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1540 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1541 
1542 	/*
1543 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1544 	 * path and bail out in that case.
1545 	 */
1546 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1547 		*ulpp = NULL;
1548 		return (0);
1549 	} else {
1550 		if (ulockfs_info_free == NULL) {
1551 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1552 			    kmem_zalloc(sizeof (ulockfs_info_t),
1553 			    KM_NOSLEEP)) == NULL) {
1554 				*ulpp = NULL;
1555 				return (ENOMEM);
1556 			}
1557 		}
1558 	}
1559 
1560 	/*
1561 	 * First time VOP call
1562 	 *
1563 	 * Increment the ctr irrespective of the lockfs state. If the lockfs
1564 	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
1565 	 * before incrementing we need to check if there is a pending quiesce
1566 	 * request because if we have a continuous stream of ufs_lockfs_begin
1567 	 * requests pounding on a few cpu's then the ufs_quiesce thread might
1568 	 * never see the value of zero for ctr - a livelock kind of scenario.
1569 	 */
1570 	ctr = (mask & ULOCKFS_FWLOCK) ?
1571 	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
1572 	if (!ULOCKFS_IS_SLOCK(ulp)) {
1573 		atomic_add_long(ctr, 1);
1574 		op_cnt_incremented++;
1575 	}
1576 
1577 	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1578 		/*
1579 		 * Non-blocking version of ufs_check_lockfs() code.
1580 		 *
1581 		 * If the file system is not hard locked or error locked
1582 		 * and if ulp->ul_fs_lock allows this operation, increment
1583 		 * the appropriate counter and proceed (For eg., In case the
1584 		 * file system is delete locked, a mmap can still go through).
1585 		 */
1586 		if (op_cnt_incremented)
1587 			if (!atomic_add_long_nv(ctr, -1))
1588 				cv_broadcast(&ulp->ul_cv);
1589 		mutex_enter(&ulp->ul_lock);
1590 		if (ULOCKFS_IS_HLOCK(ulp) ||
1591 		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1592 			error = EIO;
1593 		else if (ulp->ul_fs_lock & mask)
1594 			error = EAGAIN;
1595 
1596 		if (error) {
1597 			mutex_exit(&ulp->ul_lock);
1598 			if (ulockfs_info_free == NULL)
1599 				kmem_free(ulockfs_info_temp,
1600 				    sizeof (ulockfs_info_t));
1601 			return (error);
1602 		}
1603 		atomic_add_long(ctr, 1);
1604 		if (mask & ULOCKFS_FWLOCK)
1605 			ULOCKFS_SET_FALLOC(ulp);
1606 		mutex_exit(&ulp->ul_lock);
1607 	} else {
1608 		/*
1609 		 * This is the common case of file system in a unlocked state.
1610 		 *
1611 		 * If a file system is unlocked, we would expect the ctr to have
1612 		 * been incremented by now. But this will not be true when a
1613 		 * quiesce is winding up - SLOCK was set when we checked before
1614 		 * incrementing the ctr, but by the time we checked for
1615 		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take
1616 		 * ul_lock and go through the non-blocking version of
1617 		 * ufs_check_lockfs() code.
1618 		 */
1619 		if (op_cnt_incremented == 0) {
1620 			mutex_enter(&ulp->ul_lock);
1621 			if (ULOCKFS_IS_HLOCK(ulp) ||
1622 			    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1623 				error = EIO;
1624 			else if (ulp->ul_fs_lock & mask)
1625 				error = EAGAIN;
1626 
1627 			if (error) {
1628 				mutex_exit(&ulp->ul_lock);
1629 				if (ulockfs_info_free == NULL)
1630 					kmem_free(ulockfs_info_temp,
1631 					    sizeof (ulockfs_info_t));
1632 				return (error);
1633 			}
1634 			atomic_add_long(ctr, 1);
1635 			if (mask & ULOCKFS_FWLOCK)
1636 				ULOCKFS_SET_FALLOC(ulp);
1637 			mutex_exit(&ulp->ul_lock);
1638 		} else if (mask & ULOCKFS_FWLOCK) {
1639 			mutex_enter(&ulp->ul_lock);
1640 			ULOCKFS_SET_FALLOC(ulp);
1641 			mutex_exit(&ulp->ul_lock);
1642 		}
1643 	}
1644 
1645 	if (ulockfs_info_free != NULL) {
1646 		ulockfs_info_free->ulp = ulp;
1647 		if (mask & ULOCKFS_FWLOCK)
1648 			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1649 	} else {
1650 		ulockfs_info_temp->ulp = ulp;
1651 		ulockfs_info_temp->next = ulockfs_info;
1652 		if (mask & ULOCKFS_FWLOCK)
1653 			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1654 		ASSERT(ufs_lockfs_key != 0);
1655 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1656 	}
1657 
1658 	curthread->t_flag |= T_DONTBLOCK;
1659 	return (0);
1660 }
1661 
1662 /*
1663  * specialized version of ufs_lockfs_begin() called by ufs_getpage().
1664  */
1665 int
1666 ufs_lockfs_begin_getpage(
1667 	struct ufsvfs	*ufsvfsp,
1668 	struct ulockfs	**ulpp,
1669 	struct seg	*seg,
1670 	int		read_access,
1671 	uint_t		*protp)
1672 {
1673 	ulong_t			mask;
1674 	int 			error;
1675 	int			rec_vop;
1676 	struct ulockfs		*ulp;
1677 	ulockfs_info_t		*ulockfs_info;
1678 	ulockfs_info_t		*ulockfs_info_free;
1679 	ulockfs_info_t		*ulockfs_info_temp;
1680 
1681 	/*
1682 	 * file system has been forcibly unmounted
1683 	 */
1684 	if (ufsvfsp == NULL)
1685 		return (EIO);
1686 
1687 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1688 
1689 	/*
1690 	 * Do lockfs protocol
1691 	 */
1692 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1693 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1694 
1695 	/*
1696 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1697 	 * path and bail out in that case.
1698 	 */
1699 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1700 		*ulpp = NULL;
1701 		return (0);
1702 	} else {
1703 		if (ulockfs_info_free == NULL) {
1704 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1705 			    kmem_zalloc(sizeof (ulockfs_info_t),
1706 			    KM_NOSLEEP)) == NULL) {
1707 				*ulpp = NULL;
1708 				return (ENOMEM);
1709 			}
1710 		}
1711 	}
1712 
1713 	/*
1714 	 * First time VOP call
1715 	 */
1716 	atomic_add_long(&ulp->ul_vnops_cnt, 1);
1717 	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1718 		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
1719 			cv_broadcast(&ulp->ul_cv);
1720 		mutex_enter(&ulp->ul_lock);
1721 		if (seg->s_ops == &segvn_ops &&
1722 		    ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
1723 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1724 		} else if (protp && read_access) {
1725 			/*
1726 			 * Restrict the mapping to readonly.
1727 			 * Writes to this mapping will cause
1728 			 * another fault which will then
1729 			 * be suspended if fs is write locked
1730 			 */
1731 			*protp &= ~PROT_WRITE;
1732 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1733 		} else
1734 			mask = (ulong_t)ULOCKFS_GETWRITE_MASK;
1735 
1736 		/*
1737 		 * will sleep if this fs is locked against this VOP
1738 		 */
1739 		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1740 		mutex_exit(&ulp->ul_lock);
1741 		if (error) {
1742 			if (ulockfs_info_free == NULL)
1743 				kmem_free(ulockfs_info_temp,
1744 				    sizeof (ulockfs_info_t));
1745 			return (error);
1746 		}
1747 	}
1748 
1749 	if (ulockfs_info_free != NULL) {
1750 		ulockfs_info_free->ulp = ulp;
1751 	} else {
1752 		ulockfs_info_temp->ulp = ulp;
1753 		ulockfs_info_temp->next = ulockfs_info;
1754 		ASSERT(ufs_lockfs_key != 0);
1755 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1756 	}
1757 
1758 	curthread->t_flag |= T_DONTBLOCK;
1759 	return (0);
1760 }
1761 
1762 void
1763 ufs_lockfs_tsd_destructor(void *head)
1764 {
1765 	ulockfs_info_t *curr = (ulockfs_info_t *)head;
1766 	ulockfs_info_t *temp;
1767 
1768 	for (; curr != NULL; ) {
1769 		/*
1770 		 * The TSD destructor is being called when the thread exits
1771 		 * (via thread_exit()). At that time it must have cleaned up
1772 		 * all VOPs via ufs_lockfs_end() and there must not be a
1773 		 * valid ulockfs record exist while a thread is exiting.
1774 		 */
1775 		temp = curr;
1776 		curr = curr->next;
1777 		ASSERT(temp->ulp == NULL);
1778 		kmem_free(temp, sizeof (ulockfs_info_t));
1779 	}
1780 }
1781