xref: /titanic_41/usr/src/uts/common/fs/ufs/ufs_lockfs.c (revision 0b6016e6ff70af39f99c9cc28e0c2207c8f5413c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/time.h>
32 #include <sys/systm.h>
33 #include <sys/sysmacros.h>
34 #include <sys/resource.h>
35 #include <sys/signal.h>
36 #include <sys/cred.h>
37 #include <sys/user.h>
38 #include <sys/buf.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/proc.h>
42 #include <sys/disp.h>
43 #include <sys/file.h>
44 #include <sys/fcntl.h>
45 #include <sys/flock.h>
46 #include <sys/atomic.h>
47 #include <sys/kmem.h>
48 #include <sys/uio.h>
49 #include <sys/conf.h>
50 #include <sys/mman.h>
51 #include <sys/pathname.h>
52 #include <sys/debug.h>
53 #include <sys/vmmeter.h>
54 #include <sys/vmsystm.h>
55 #include <sys/cmn_err.h>
56 #include <sys/vtrace.h>
57 #include <sys/acct.h>
58 #include <sys/dnlc.h>
59 #include <sys/swap.h>
60 
61 #include <sys/fs/ufs_fs.h>
62 #include <sys/fs/ufs_inode.h>
63 #include <sys/fs/ufs_fsdir.h>
64 #include <sys/fs/ufs_trans.h>
65 #include <sys/fs/ufs_panic.h>
66 #include <sys/fs/ufs_mount.h>
67 #include <sys/fs/ufs_bio.h>
68 #include <sys/fs/ufs_log.h>
69 #include <sys/fs/ufs_quota.h>
70 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
71 #include <sys/errno.h>
72 #include <sys/sysinfo.h>
73 
74 #include <vm/hat.h>
75 #include <vm/pvn.h>
76 #include <vm/as.h>
77 #include <vm/seg.h>
78 #include <vm/seg_map.h>
79 #include <vm/seg_vn.h>
80 #include <vm/rm.h>
81 #include <vm/anon.h>
82 #include <sys/swap.h>
83 #include <sys/dnlc.h>
84 
/* defined outside this file; used by ufs_flush() to find the device vnode */
extern struct vnode *common_specvp(struct vnode *vp);

/*
 * error lock status
 *
 * Computed by ufs__fiolfs() and handed to the reconciliation routines:
 *	NO_ERRLCK	no error lock is involved in this request
 *	SET_ERRLCK	an error lock is being applied for the first time
 *	RE_ERRLCK	an error lock is being re-applied over an error lock
 *	UN_ERRLCK	an error-locked file system is being unlocked
 */
#define	UN_ERRLCK	(-1)
#define	SET_ERRLCK	1
#define	RE_ERRLCK	2
#define	NO_ERRLCK	0

/*
 * Index to be used in TSD for storing lockfs data
 * (passed to tsd_get() to fetch the head of the per-thread list below)
 */
uint_t ufs_lockfs_key;

/*
 * One element of the per-thread (TSD) list of lockfs data.
 */
typedef struct _ulockfs_info {
	struct _ulockfs_info *next;	/* next element, NULL at end of list */
	struct ulockfs *ulp;		/* lockfs state this element refers to */
	uint_t flags;			/* ULOCK_INFO_* bits */
} ulockfs_info_t;

#define	ULOCK_INFO_FALLOCATE	0x00000001	/* fallocate thread */
105 
/*
 * IS_REC_VOP(found, head, key, free)
 *
 * Check in TSD whether we are already doing any VOP on this filesystem:
 * walk the list that starts at `head' looking for an element whose ulp
 * member equals `key'.
 *
 *	found	lvalue; set to 1 when a matching element exists, else 0
 *	head	first ulockfs_info_t of the list (may be NULL)
 *	key	the struct ulockfs pointer being searched for
 *	free	lvalue; set to the first element with a NULL ulp member that
 *		was visited before the walk stopped, or NULL if there was none
 *
 * The third formal parameter is intentionally NOT named `ulp'.  Macro
 * parameters are substituted token by token, so a parameter called `ulp'
 * would also rewrite the member name in `_curr->ulp' and the macro would
 * only compile when the caller's argument happened to be spelled `ulp'.
 * All arguments are parenthesized so that arbitrary expressions are safe.
 */
#define	IS_REC_VOP(found, head, key, free)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for ((found) = 0, (free) = NULL, _curr = (head);	\
	    _curr != NULL; _curr = _curr->next) {	\
		if (((free) == NULL) &&			\
		    (_curr->ulp == NULL))		\
			(free) = _curr;			\
		if (_curr->ulp == (key)) {		\
			(found) = 1;			\
			break;				\
		}					\
	}						\
}
124 
/*
 * SEARCH_ULOCKFSP(head, key, info)
 *
 * Get the lockfs data from TSD so that lockfs handles the recursive VOP
 * properly: walk the list that starts at `head' and store in `info' the
 * first element whose ulp member equals `key', or NULL when the list is
 * empty or holds no such element.
 *
 *	head	first ulockfs_info_t of the list (may be NULL)
 *	key	the struct ulockfs pointer being searched for
 *	info	lvalue of type ulockfs_info_t * receiving the result
 *
 * The second formal parameter is intentionally NOT named `ulp': a parameter
 * with that name would also replace the member name in `_curr->ulp', which
 * makes the macro compile only for an argument spelled exactly `ulp'.
 * All arguments are parenthesized so that arbitrary expressions are safe.
 */
#define	SEARCH_ULOCKFSP(head, key, info)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (_curr = (head); _curr != NULL;		\
	    _curr = _curr->next) {			\
		if (_curr->ulp == (key)) {		\
			break;				\
		}					\
	}						\
							\
	(info) = _curr;					\
}
142 
143 /*
144  * Validate lockfs request
145  */
146 static int
147 ufs_getlfd(
148 	struct lockfs *lockfsp,		/* new lock request */
149 	struct lockfs *ul_lockfsp)	/* old lock state */
150 {
151 	int	error = 0;
152 
153 	/*
154 	 * no input flags defined
155 	 */
156 	if (lockfsp->lf_flags != 0) {
157 		error = EINVAL;
158 		goto errout;
159 	}
160 
161 	/*
162 	 * check key
163 	 */
164 	if (!LOCKFS_IS_ULOCK(ul_lockfsp))
165 		if (lockfsp->lf_key != ul_lockfsp->lf_key) {
166 			error = EINVAL;
167 			goto errout;
168 	}
169 
170 	lockfsp->lf_key = ul_lockfsp->lf_key + 1;
171 
172 errout:
173 	return (error);
174 }
175 
176 /*
177  * ufs_checkaccton
178  *	check if accounting is turned on on this fs
179  */
180 
181 int
182 ufs_checkaccton(struct vnode *vp)
183 {
184 	if (acct_fs_in_use(vp))
185 		return (EDEADLK);
186 	return (0);
187 }
188 
189 /*
190  * ufs_checkswapon
191  *	check if local swapping is to file on this fs
192  */
193 int
194 ufs_checkswapon(struct vnode *vp)
195 {
196 	struct swapinfo	*sip;
197 
198 	mutex_enter(&swapinfo_lock);
199 	for (sip = swapinfo; sip; sip = sip->si_next)
200 		if (sip->si_vp->v_vfsp == vp->v_vfsp) {
201 			mutex_exit(&swapinfo_lock);
202 			return (EDEADLK);
203 		}
204 	mutex_exit(&swapinfo_lock);
205 	return (0);
206 }
207 
208 /*
209  * ufs_freeze
210  *	pend future accesses for current lock and desired lock
211  */
212 void
213 ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
214 {
215 	/*
216 	 * set to new lock type
217 	 */
218 	ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
219 	ulp->ul_lockfs.lf_key = lockfsp->lf_key;
220 	ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
221 	ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;
222 
223 	ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
224 }
225 
/*
 * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
 * starting ufs_quiesce() protocol and decrement it only when a file system no
 * longer has to be in quiescent state. This allows ufs_pageio() to detect
 * that another thread wants to quiesce a file system. See more comments in
 * ufs_pageio().
 *
 * In this file ufs__fiolfs() increments the counter with atomic_add_long()
 * once it holds ul_lock and decrements it on both its success and its error
 * exit; ufs_quiesce() asserts that the counter is non-zero.
 */
ulong_t ufs_quiesce_pend = 0;
234 
235 /*
236  * ufs_quiesce
237  *	wait for outstanding accesses to finish
238  */
239 int
240 ufs_quiesce(struct ulockfs *ulp)
241 {
242 	int error = 0;
243 	ulockfs_info_t *head;
244 	ulockfs_info_t *info;
245 
246 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
247 	SEARCH_ULOCKFSP(head, ulp, info);
248 
249 	/*
250 	 * Set a softlock to suspend future ufs_vnops so that
251 	 * this lockfs request will not be starved
252 	 */
253 	ULOCKFS_SET_SLOCK(ulp);
254 	ASSERT(ufs_quiesce_pend);
255 
256 	/* check if there is any outstanding ufs vnodeops calls */
257 	while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
258 		/*
259 		 * use timed version of cv_wait_sig() to make sure we don't
260 		 * miss a wake up call from ufs_pageio() when it doesn't use
261 		 * ul_lock.
262 		 *
263 		 * when a fallocate thread comes in, the only way it returns
264 		 * from this function is if there are no other vnode operations
265 		 * going on (remember fallocate threads are tracked using
266 		 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
267 		 * hasn't already grabbed the fs write lock.
268 		 */
269 		if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
270 			if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
271 				goto out;
272 		}
273 		if (!cv_timedwait_sig(&ulp->ul_cv, &ulp->ul_lock, lbolt + hz)) {
274 			error = EINTR;
275 			goto out;
276 		}
277 	}
278 
279 out:
280 	/*
281 	 * unlock the soft lock
282 	 */
283 	ULOCKFS_CLR_SLOCK(ulp);
284 
285 	return (error);
286 }
287 
288 /*
289  * ufs_flush_inode
290  */
291 int
292 ufs_flush_inode(struct inode *ip, void *arg)
293 {
294 	int	error;
295 	int	saverror	= 0;
296 
297 	/*
298 	 * wrong file system; keep looking
299 	 */
300 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
301 		return (0);
302 
303 	/*
304 	 * asynchronously push all the dirty pages
305 	 */
306 	if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
307 	    (error != EAGAIN))
308 		saverror = error;
309 	/*
310 	 * wait for io and discard all mappings
311 	 */
312 	if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
313 		saverror = error;
314 
315 	if (ITOV(ip)->v_type == VDIR) {
316 		dnlc_dir_purge(&ip->i_danchor);
317 	}
318 
319 	return (saverror);
320 }
321 
322 /*
323  * ufs_flush
324  *	Flush everything that is currently dirty; this includes invalidating
325  *	any mappings.
326  */
327 int
328 ufs_flush(struct vfs *vfsp)
329 {
330 	int		error;
331 	int		saverror = 0;
332 	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
333 	struct fs	*fs		= ufsvfsp->vfs_fs;
334 
335 	ASSERT(vfs_lock_held(vfsp));
336 
337 	/*
338 	 * purge dnlc
339 	 */
340 	(void) dnlc_purge_vfsp(vfsp, 0);
341 
342 	/*
343 	 * drain the delete and idle threads
344 	 */
345 	ufs_delete_drain(vfsp, 0, 0);
346 	ufs_idle_drain(vfsp);
347 
348 	/*
349 	 * flush and invalidate quota records
350 	 */
351 	(void) qsync(ufsvfsp);
352 
353 	/*
354 	 * flush w/invalidate the inodes for vfsp
355 	 */
356 	if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
357 		saverror = error;
358 
359 	/*
360 	 * synchronously flush superblock and summary info
361 	 */
362 	if (fs->fs_ronly == 0 && fs->fs_fmod) {
363 		fs->fs_fmod = 0;
364 		TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
365 	}
366 	/*
367 	 * flush w/invalidate block device pages and buf cache
368 	 */
369 	if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
370 	    (offset_t)0, 0, B_INVAL, CRED())) > 0)
371 		saverror = error;
372 
373 	(void) bflush((dev_t)vfsp->vfs_dev);
374 	(void) bfinval((dev_t)vfsp->vfs_dev, 0);
375 
376 	/*
377 	 * drain the delete and idle threads again
378 	 */
379 	ufs_delete_drain(vfsp, 0, 0);
380 	ufs_idle_drain(vfsp);
381 
382 	/*
383 	 * play with the clean flag
384 	 */
385 	if (saverror == 0)
386 		ufs_checkclean(vfsp);
387 
388 	/*
389 	 * Flush any outstanding transactions and roll the log
390 	 * only if we are supposed to do, i.e. LDL_NOROLL not set.
391 	 * We can not simply check for fs_ronly here since fsck also may
392 	 * use this code to roll the log on a read-only filesystem, e.g.
393 	 * root during early stages of boot, if other then a sanity check is
394 	 * done, it will clear LDL_NOROLL before.
395 	 * In addition we assert that the deltamap does not contain any deltas
396 	 * in case LDL_NOROLL is set since this is not supposed to happen.
397 	 */
398 	if (TRANS_ISTRANS(ufsvfsp)) {
399 		ml_unit_t	*ul	= ufsvfsp->vfs_log;
400 		mt_map_t	*mtm	= ul->un_deltamap;
401 
402 		if (ul->un_flags & LDL_NOROLL) {
403 			ASSERT(mtm->mtm_nme == 0);
404 		} else {
405 			curthread->t_flag |= T_DONTBLOCK;
406 			TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
407 			    TOP_COMMIT_SIZE, error);
408 			if (!error) {
409 				TRANS_END_SYNC(ufsvfsp, saverror,
410 				    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
411 			}
412 			curthread->t_flag &= ~T_DONTBLOCK;
413 			logmap_roll_dev(ufsvfsp->vfs_log);
414 		}
415 	}
416 
417 	return (saverror);
418 }
419 
420 /*
421  * ufs_thaw_wlock
422  *	special processing when thawing down to wlock
423  */
424 static int
425 ufs_thaw_wlock(struct inode *ip, void *arg)
426 {
427 	/*
428 	 * wrong file system; keep looking
429 	 */
430 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
431 		return (0);
432 
433 	/*
434 	 * iupdat refuses to clear flags if the fs is read only.  The fs
435 	 * may become read/write during the lock and we wouldn't want
436 	 * these inodes being written to disk.  So clear the flags.
437 	 */
438 	rw_enter(&ip->i_contents, RW_WRITER);
439 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
440 	rw_exit(&ip->i_contents);
441 
442 	/*
443 	 * pages are mlocked -- fail wlock
444 	 */
445 	if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
446 		return (EBUSY);
447 
448 	return (0);
449 }
450 
451 /*
452  * ufs_thaw_hlock
453  *	special processing when thawing down to hlock or elock
454  */
455 static int
456 ufs_thaw_hlock(struct inode *ip, void *arg)
457 {
458 	struct vnode	*vp	= ITOV(ip);
459 
460 	/*
461 	 * wrong file system; keep looking
462 	 */
463 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
464 		return (0);
465 
466 	/*
467 	 * blow away all pages - even if they are mlocked
468 	 */
469 	do {
470 		(void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
471 	} while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
472 	rw_enter(&ip->i_contents, RW_WRITER);
473 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
474 	rw_exit(&ip->i_contents);
475 
476 	return (0);
477 }
478 
479 /*
480  * ufs_thaw
481  *	thaw file system lock down to current value
482  */
483 int
484 ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
485 {
486 	int		error	= 0;
487 	int		noidel	= (int)(ulp->ul_flag & ULOCKFS_NOIDEL);
488 
489 	/*
490 	 * if wlock or hlock or elock
491 	 */
492 	if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
493 	    ULOCKFS_IS_ELOCK(ulp)) {
494 
495 		/*
496 		 * don't keep access times
497 		 * don't free deleted files
498 		 * if superblock writes are allowed, limit them to me for now
499 		 */
500 		ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
501 		if (ulp->ul_sbowner != (kthread_id_t)-1)
502 			ulp->ul_sbowner = curthread;
503 
504 		/*
505 		 * wait for writes for deleted files and superblock updates
506 		 */
507 		(void) ufs_flush(vfsp);
508 
509 		/*
510 		 * now make sure the quota file is up-to-date
511 		 *	expensive; but effective
512 		 */
513 		error = ufs_flush(vfsp);
514 		/*
515 		 * no one can write the superblock
516 		 */
517 		ulp->ul_sbowner = (kthread_id_t)-1;
518 
519 		/*
520 		 * special processing for wlock/hlock/elock
521 		 */
522 		if (ULOCKFS_IS_WLOCK(ulp)) {
523 			if (error)
524 				goto errout;
525 			error = bfinval(ufsvfsp->vfs_dev, 0);
526 			if (error)
527 				goto errout;
528 			error = ufs_scan_inodes(0, ufs_thaw_wlock,
529 					(void *)ufsvfsp, ufsvfsp);
530 			if (error)
531 				goto errout;
532 		}
533 		if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
534 			error = 0;
535 			(void) ufs_scan_inodes(0, ufs_thaw_hlock,
536 					(void *)ufsvfsp, ufsvfsp);
537 			(void) bfinval(ufsvfsp->vfs_dev, 1);
538 		}
539 	} else {
540 
541 		/*
542 		 * okay to keep access times
543 		 * okay to free deleted files
544 		 * okay to write the superblock
545 		 */
546 		ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
547 		ulp->ul_sbowner = NULL;
548 
549 		/*
550 		 * flush in case deleted files are in memory
551 		 */
552 		if (noidel) {
553 			if (error = ufs_flush(vfsp))
554 				goto errout;
555 		}
556 	}
557 
558 errout:
559 	cv_broadcast(&ulp->ul_cv);
560 	return (error);
561 }
562 
563 /*
564  * ufs_reconcile_fs
565  *	reconcile incore superblock with ondisk superblock
566  */
567 int
568 ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
569 {
570 	struct fs	*mfs; 	/* in-memory superblock */
571 	struct fs	*dfs;	/* on-disk   superblock */
572 	struct buf	*bp;	/* on-disk   superblock buf */
573 	int		 needs_unlock;
574 	char		 finished_fsclean;
575 
576 	mfs = ufsvfsp->vfs_fs;
577 
578 	/*
579 	 * get the on-disk copy of the superblock
580 	 */
581 	bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
582 	bp->b_flags |= (B_STALE|B_AGE);
583 	if (bp->b_flags & B_ERROR) {
584 		brelse(bp);
585 		return (EIO);
586 	}
587 	dfs = bp->b_un.b_fs;
588 
589 	/* error locks may only unlock after the fs has been made consistent */
590 	if (errlck == UN_ERRLCK) {
591 		if (dfs->fs_clean == FSFIX) {	/* being repaired */
592 			brelse(bp);
593 			return (EAGAIN);
594 		}
595 		/* repair not yet started? */
596 		finished_fsclean = TRANS_ISTRANS(ufsvfsp)? FSLOG: FSCLEAN;
597 		if (dfs->fs_clean != finished_fsclean) {
598 			brelse(bp);
599 			return (EBUSY);
600 		}
601 	}
602 
603 	/*
604 	 * if superblock has changed too much, abort
605 	 */
606 	if ((mfs->fs_sblkno		!= dfs->fs_sblkno) ||
607 	    (mfs->fs_cblkno		!= dfs->fs_cblkno) ||
608 	    (mfs->fs_iblkno		!= dfs->fs_iblkno) ||
609 	    (mfs->fs_dblkno		!= dfs->fs_dblkno) ||
610 	    (mfs->fs_cgoffset		!= dfs->fs_cgoffset) ||
611 	    (mfs->fs_cgmask		!= dfs->fs_cgmask) ||
612 	    (mfs->fs_bsize		!= dfs->fs_bsize) ||
613 	    (mfs->fs_fsize		!= dfs->fs_fsize) ||
614 	    (mfs->fs_frag		!= dfs->fs_frag) ||
615 	    (mfs->fs_bmask		!= dfs->fs_bmask) ||
616 	    (mfs->fs_fmask		!= dfs->fs_fmask) ||
617 	    (mfs->fs_bshift		!= dfs->fs_bshift) ||
618 	    (mfs->fs_fshift		!= dfs->fs_fshift) ||
619 	    (mfs->fs_fragshift		!= dfs->fs_fragshift) ||
620 	    (mfs->fs_fsbtodb		!= dfs->fs_fsbtodb) ||
621 	    (mfs->fs_sbsize		!= dfs->fs_sbsize) ||
622 	    (mfs->fs_nindir		!= dfs->fs_nindir) ||
623 	    (mfs->fs_nspf		!= dfs->fs_nspf) ||
624 	    (mfs->fs_trackskew		!= dfs->fs_trackskew) ||
625 	    (mfs->fs_cgsize		!= dfs->fs_cgsize) ||
626 	    (mfs->fs_ntrak		!= dfs->fs_ntrak) ||
627 	    (mfs->fs_nsect		!= dfs->fs_nsect) ||
628 	    (mfs->fs_spc		!= dfs->fs_spc) ||
629 	    (mfs->fs_cpg		!= dfs->fs_cpg) ||
630 	    (mfs->fs_ipg		!= dfs->fs_ipg) ||
631 	    (mfs->fs_fpg		!= dfs->fs_fpg) ||
632 	    (mfs->fs_postblformat	!= dfs->fs_postblformat) ||
633 	    (mfs->fs_magic		!= dfs->fs_magic)) {
634 		brelse(bp);
635 		return (EACCES);
636 	}
637 	if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
638 		if (mfs->fs_clean == FSLOG) {
639 			brelse(bp);
640 			return (EACCES);
641 		}
642 
643 	/*
644 	 * get new summary info
645 	 */
646 	if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
647 		brelse(bp);
648 		return (EIO);
649 	}
650 
651 	/*
652 	 * release old summary info and update in-memory superblock
653 	 */
654 	kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
655 	mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;	/* Only entry 0 used */
656 
657 	/*
658 	 * update fields allowed to change
659 	 */
660 	mfs->fs_size		= dfs->fs_size;
661 	mfs->fs_dsize		= dfs->fs_dsize;
662 	mfs->fs_ncg		= dfs->fs_ncg;
663 	mfs->fs_minfree		= dfs->fs_minfree;
664 	mfs->fs_rotdelay	= dfs->fs_rotdelay;
665 	mfs->fs_rps		= dfs->fs_rps;
666 	mfs->fs_maxcontig	= dfs->fs_maxcontig;
667 	mfs->fs_maxbpg		= dfs->fs_maxbpg;
668 	mfs->fs_csmask		= dfs->fs_csmask;
669 	mfs->fs_csshift		= dfs->fs_csshift;
670 	mfs->fs_optim		= dfs->fs_optim;
671 	mfs->fs_csaddr		= dfs->fs_csaddr;
672 	mfs->fs_cssize		= dfs->fs_cssize;
673 	mfs->fs_ncyl		= dfs->fs_ncyl;
674 	mfs->fs_cstotal		= dfs->fs_cstotal;
675 	mfs->fs_reclaim		= dfs->fs_reclaim;
676 
677 	if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
678 		mfs->fs_reclaim &= ~FS_RECLAIM;
679 		mfs->fs_reclaim |=  FS_RECLAIMING;
680 		ufs_thread_start(&ufsvfsp->vfs_reclaim,
681 			ufs_thread_reclaim, vfsp);
682 	}
683 
684 	/* XXX What to do about sparecon? */
685 
686 	/* XXX need to copy volume label */
687 
688 	/*
689 	 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
690 	 * or if error-locked and ondisk is now clean
691 	 */
692 	needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
693 	if (needs_unlock)
694 		mutex_enter(&ufsvfsp->vfs_lock);
695 
696 	if (errlck == UN_ERRLCK) {
697 		if (finished_fsclean == dfs->fs_clean)
698 			mfs->fs_clean = finished_fsclean;
699 		else
700 			mfs->fs_clean = FSBAD;
701 		mfs->fs_state = FSOKAY - dfs->fs_time;
702 	}
703 
704 	if (FSOKAY != dfs->fs_state + dfs->fs_time ||
705 	    (dfs->fs_clean == FSBAD))
706 		mfs->fs_clean = FSBAD;
707 
708 	if (needs_unlock)
709 		mutex_exit(&ufsvfsp->vfs_lock);
710 
711 	brelse(bp);
712 
713 	return (0);
714 }
715 
716 /*
717  * ufs_reconcile_inode
718  *	reconcile ondisk inode with incore inode
719  */
720 static int
721 ufs_reconcile_inode(struct inode *ip, void *arg)
722 {
723 	int		i;
724 	int		ndaddr;
725 	int		niaddr;
726 	struct dinode	*dp;		/* ondisk inode */
727 	struct buf	*bp	= NULL;
728 	uid_t		d_uid;
729 	gid_t		d_gid;
730 	int		error = 0;
731 	struct fs	*fs;
732 
733 	/*
734 	 * not an inode we care about
735 	 */
736 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
737 		return (0);
738 
739 	fs = ip->i_fs;
740 
741 	/*
742 	 * Inode reconciliation fails: we made the filesystem quiescent
743 	 * and we did a ufs_flush() before calling ufs_reconcile_inode()
744 	 * and thus the inode should not have been changed inbetween.
745 	 * Any discrepancies indicate a logic error and a pretty
746 	 * significant run-state inconsistency we should complain about.
747 	 */
748 	if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
749 		cmn_err(CE_WARN, "%s: Inode reconciliation failed for"
750 		    "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
751 		return (EINVAL);
752 	}
753 
754 	/*
755 	 * get the dinode
756 	 */
757 	bp = UFS_BREAD(ip->i_ufsvfs,
758 			ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
759 	    (int)fs->fs_bsize);
760 	if (bp->b_flags & B_ERROR) {
761 		brelse(bp);
762 		return (EIO);
763 	}
764 	dp  = bp->b_un.b_dino;
765 	dp += itoo(fs, ip->i_number);
766 
767 	/*
768 	 * handle Sun's implementation of EFT
769 	 */
770 	d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
771 	d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (uid_t)dp->di_sgid;
772 
773 	rw_enter(&ip->i_contents, RW_WRITER);
774 
775 	/*
776 	 * some fields are not allowed to change
777 	 */
778 	if ((ip->i_mode  != dp->di_mode) ||
779 	    (ip->i_gen   != dp->di_gen) ||
780 	    (ip->i_uid   != d_uid) ||
781 	    (ip->i_gid   != d_gid)) {
782 		error = EACCES;
783 		goto out;
784 	}
785 
786 	/*
787 	 * and some are allowed to change
788 	 */
789 	ip->i_size		= dp->di_size;
790 	ip->i_ic.ic_flags	= dp->di_ic.ic_flags;
791 	ip->i_blocks		= dp->di_blocks;
792 	ip->i_nlink		= dp->di_nlink;
793 	if (ip->i_flag & IFASTSYMLNK) {
794 		ndaddr = 1;
795 		niaddr = 0;
796 	} else {
797 		ndaddr = NDADDR;
798 		niaddr = NIADDR;
799 	}
800 	for (i = 0; i < ndaddr; ++i)
801 		ip->i_db[i] = dp->di_db[i];
802 	for (i = 0; i < niaddr; ++i)
803 		ip->i_ib[i] = dp->di_ib[i];
804 
805 out:
806 	rw_exit(&ip->i_contents);
807 	brelse(bp);
808 	return (error);
809 }
810 
811 /*
812  * ufs_reconcile
813  *	reconcile ondisk superblock/inodes with any incore
814  */
815 static int
816 ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
817 {
818 	int	error = 0;
819 
820 	/*
821 	 * get rid of as much inmemory data as possible
822 	 */
823 	(void) ufs_flush(vfsp);
824 
825 	/*
826 	 * reconcile the superblock and inodes
827 	 */
828 	if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
829 		return (error);
830 	if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
831 		return (error);
832 	/*
833 	 * allocation blocks may be incorrect; get rid of them
834 	 */
835 	(void) ufs_flush(vfsp);
836 
837 	return (error);
838 }
839 
/*
 * ufs_fiolfs
 *	File system locking; forwards to ufs__fiolfs() with the request
 *	marked as coming from a user.
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
	const int	from_user = 1;

	return (ufs__fiolfs(vp, lockfsp, from_user, from_log));
}
848 
/*
 * ufs__fiolfs
 *	kernel-internal interface, also used by fix-on-panic
 *
 *	Change the lock state of the file system that vp belongs to into the
 *	state described by lockfsp.
 *
 *	vp		any vnode of the file system to lock
 *	lockfsp		the lock request; on success lf_key has been advanced
 *			by ufs_getlfd()
 *	from_user	non-zero when the request originates from a user; only
 *			consulted when deciding whether a freshly applied
 *			error lock calls ufs_fault()
 *	from_log	non-zero skips the accounting and swap file checks
 *
 *	Returns 0 on success.  EINVAL and EIO are returned for a bad request
 *	or vnode before any lock is taken.  After that, every failure goes
 *	through errexit (nothing changed yet) or errout (the previous lock
 *	state is restored unless the file system is now hard locked), and the
 *	error of the failing step is returned.
 */
int
ufs__fiolfs(
	struct vnode *vp,
	struct lockfs *lockfsp,
	int from_user,
	int from_log)
{
	struct ulockfs	*ulp;
	struct lockfs	lfs;		/* lock state saved before the change */
	int		error;
	struct vfs	*vfsp;
	struct ufsvfs	*ufsvfsp;
	int		 errlck		= NO_ERRLCK;
	int		 poll_events	= POLLPRI;
	extern struct pollhead ufs_pollhd;
	ulockfs_info_t *head;
	ulockfs_info_t *info;		/* this thread's TSD entry, or NULL */

	/* check valid lock type */
	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
		return (EINVAL);

	if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
		return (EIO);

	vfsp = vp->v_vfsp;
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	ulp = &ufsvfsp->vfs_ulockfs;

	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * Suspend both the reclaim thread and the delete thread.
	 * This must be done outside the lockfs locking protocol.
	 */
	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
	ufs_thread_suspend(&ufsvfsp->vfs_delete);

	/*
	 * Acquire vfs_reflock around ul_lock to avoid deadlock with
	 * umount/remount/sync.
	 */
	vfs_lock_wait(vfsp);
	mutex_enter(&ulp->ul_lock);
	atomic_add_long(&ufs_quiesce_pend, 1);

	/*
	 * Quit if there is another lockfs request in progress
	 * that is waiting for existing ufs_vnops to complete.
	 */
	if (ULOCKFS_IS_BUSY(ulp)) {
		error = EBUSY;
		goto errexit;
	}

	/* cannot unlock or downgrade a hard lock */
	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto errexit;
	}

	/* an error lock may be unlocked or relocked, only */
	if (ULOCKFS_IS_ELOCK(ulp)) {
		if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * a read-only error lock may only be upgraded to an
	 * error lock or hard lock
	 */
	if (ULOCKFS_IS_ROELOCK(ulp)) {
		if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * until read-only error locks are fully implemented
	 * just return EINVAL
	 */
	if (LOCKFS_IS_ROELOCK(lockfsp)) {
		error = EINVAL;
		goto errexit;
	}

	/*
	 * an error lock may only be applied if the file system is
	 * unlocked or already error locked.
	 * (this is to prevent the case where a fs gets changed out from
	 * underneath a fs that is locked for backup,
	 * that is, name/delete/write-locked.)
	 */
	if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
	    !ULOCKFS_IS_ROELOCK(ulp)) &&
	    (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
		error = EBUSY;
		goto errexit;
	}

	/* get and validate the input lockfs request */
	if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
		goto errexit;

	/*
	 * save current ulockfs struct so that errout can restore it
	 */
	bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));

	/*
	 * Freeze the file system (pend future accesses)
	 */
	ufs_freeze(ulp, lockfsp);

	/*
	 * Set locking in progress because ufs_quiesce may free the
	 * ul_lock mutex.
	 */
	ULOCKFS_SET_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_SET_BUSY(&ulp->ul_lockfs);

	/*
	 * We need to unset FWLOCK status before we call ufs_quiesce
	 * so that the thread doesn't get suspended. We do this only if
	 * this (fallocate) thread requested an unlock operation.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (!ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_CLR_FWLOCK(ulp);
	}

	/*
	 * Quiesce (wait for outstanding accesses to finish)
	 */
	if (error = ufs_quiesce(ulp))
		goto errout;

	/*
	 * If the fallocate thread requested a write fs lock operation
	 * then we set fwlock status in the ulp.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_SET_FWLOCK(ulp);
	}

	/*
	 * can't wlock or (ro)elock fs with accounting or local swap file
	 */
	if ((ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
	    ULOCKFS_IS_ROELOCK(ulp)) && !from_log) {
		if (error = ufs_checkaccton(vp))
			goto errout;
		if (error = ufs_checkswapon(vp))
			goto errout;
	}

	/*
	 * save error lock status to pass down to reconciliation
	 * routines and for later cleanup
	 */
	if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
		errlck = UN_ERRLCK;

	if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
		int needs_unlock;
		int needs_sbwrite;

		poll_events |= POLLERR;
		/* re-lock if the old state was an error lock, else first lock */
		errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs)?
							RE_ERRLCK: SET_ERRLCK;

		/* take vfs_lock only if this thread does not hold it yet */
		needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
		if (needs_unlock)
			mutex_enter(&ufsvfsp->vfs_lock);

		/* disable delayed i/o */
		needs_sbwrite = 0;

		if (errlck == SET_ERRLCK) {
			ufsvfsp->vfs_fs->fs_clean = FSBAD;
			needs_sbwrite = 1;
		}

		/* the superblock is also written if delayed i/o was enabled */
		needs_sbwrite |= ufsvfsp->vfs_dio;
		ufsvfsp->vfs_dio = 0;

		if (needs_unlock)
			mutex_exit(&ufsvfsp->vfs_lock);

		if (needs_sbwrite) {
			ulp->ul_sbowner = curthread;
			TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);

			if (needs_unlock)
				mutex_enter(&ufsvfsp->vfs_lock);

			ufsvfsp->vfs_fs->fs_fmod = 0;

			if (needs_unlock)
				mutex_exit(&ufsvfsp->vfs_lock);
		}
	}

	/*
	 * reconcile superblock and inodes if the fs was write locked or
	 * error locked before this request
	 */
	if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
		if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
			goto errout;
		/*
		 * in case the fs grew; reset the metadata map for logging tests
		 */
		TRANS_MATA_UMOUNT(ufsvfsp);
		TRANS_MATA_MOUNT(ufsvfsp);
		TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
	}

	/*
	 * At least everything *currently* dirty goes out.
	 * A flush failure does not abort a hard lock or an error lock.
	 */

	if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
	    !ULOCKFS_IS_ELOCK(ulp))
		goto errout;

	/*
	 * thaw file system and wakeup pended processes
	 * (again, a failure does not abort a hard lock or an error lock)
	 */
	if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
		if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
			goto errout;

	/*
	 * reset modified flag if not already write locked
	 */
	if (!LOCKFS_IS_WLOCK(&lfs))
		ULOCKFS_CLR_MOD(ulp);

	/*
	 * idle the lock struct
	 */
	ULOCKFS_CLR_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

	/*
	 * free the comment of the previous lock; the new lock's comment
	 * was installed by ufs_freeze()
	 */
	if (lfs.lf_comment && lfs.lf_comlen != 0) {
		kmem_free(lfs.lf_comment, lfs.lf_comlen);
		lfs.lf_comment = NULL;
		lfs.lf_comlen = 0;
	}

	/* do error lock cleanup */
	if (errlck == UN_ERRLCK)
		ufsfx_unlockfs(ufsvfsp);

	else if (errlck == RE_ERRLCK)
		ufsfx_lockfs(ufsvfsp);

	/* don't allow error lock from user to invoke panic */
	else if (from_user && errlck == SET_ERRLCK &&
		!(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
		(void) ufs_fault(ufsvfsp->vfs_root,
		    ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
		    ulp->ul_lockfs.lf_comment: "user-applied error lock");

	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
		poll_events |= POLLERR;

	/* wake up pollers; POLLERR is included for error and hard locks */
	pollwakeup(&ufs_pollhd, poll_events);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (0);

errout:
	/*
	 * Lock failed. Reset the old lock in ufsvfs if not hard locked.
	 */
	if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
		bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
		ulp->ul_fs_lock = (1 << lfs.lf_lock);
	}
	(void) ufs_thaw(vfsp, ufsvfsp, ulp);
	ULOCKFS_CLR_BUSY(ulp);
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

errexit:
	/* undo what was taken at entry, in reverse order */
	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (error);
}
1168 
1169 /*
1170  * fiolfss
1171  * 	return the current file system locking state info
1172  */
1173 int
1174 ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
1175 {
1176 	struct ulockfs	*ulp;
1177 
1178 	if (!vp || !vp->v_vfsp || !VTOI(vp))
1179 		return (EINVAL);
1180 
1181 	/* file system has been forcibly unmounted */
1182 	if (VTOI(vp)->i_ufsvfs == NULL)
1183 		return (EIO);
1184 
1185 	ulp = VTOUL(vp);
1186 
1187 	if (ULOCKFS_IS_HLOCK(ulp)) {
1188 		*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1189 		return (0);
1190 	}
1191 
1192 	mutex_enter(&ulp->ul_lock);
1193 
1194 	*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1195 
1196 	if (ULOCKFS_IS_MOD(ulp))
1197 		lockfsp->lf_flags |= LOCKFS_MOD;
1198 
1199 	mutex_exit(&ulp->ul_lock);
1200 
1201 	return (0);
1202 }
1203 
1204 /*
1205  * ufs_check_lockfs
1206  *	check whether a ufs_vnops conflicts with the file system lock
1207  */
1208 int
1209 ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
1210 {
1211 	k_sigset_t	smask;
1212 	int		sig, slock;
1213 
1214 	ASSERT(MUTEX_HELD(&ulp->ul_lock));
1215 
1216 	while (ulp->ul_fs_lock & mask) {
1217 		slock = (int)ULOCKFS_IS_SLOCK(ulp);
1218 		if ((curthread->t_flag & T_DONTPEND) && !slock) {
1219 			curthread->t_flag |= T_WOULDBLOCK;
1220 			return (EAGAIN);
1221 		}
1222 		curthread->t_flag &= ~T_WOULDBLOCK;
1223 
1224 		/*
1225 		 * In the case of an onerr umount of the fs, threads could
1226 		 * have blocked before coming into ufs_check_lockfs and
1227 		 * need to check for the special case of ELOCK and
1228 		 * vfs_dontblock being set which would indicate that the fs
1229 		 * is on its way out and will not return therefore making
1230 		 * EIO the appropriate response.
1231 		 */
1232 		if (ULOCKFS_IS_HLOCK(ulp) ||
1233 		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1234 			return (EIO);
1235 
1236 		/*
1237 		 * wait for lock status to change
1238 		 */
1239 		if (slock || ufsvfsp->vfs_nointr) {
1240 			cv_wait(&ulp->ul_cv, &ulp->ul_lock);
1241 		} else {
1242 			sigintr(&smask, 1);
1243 			sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
1244 			sigunintr(&smask);
1245 			if ((!sig && (ulp->ul_fs_lock & mask)) ||
1246 				ufsvfsp->vfs_dontblock)
1247 				return (EINTR);
1248 		}
1249 	}
1250 
1251 	if (mask & ULOCKFS_FWLOCK) {
1252 		atomic_add_long(&ulp->ul_falloc_cnt, 1);
1253 		ULOCKFS_SET_FALLOC(ulp);
1254 	} else {
1255 		atomic_add_long(&ulp->ul_vnops_cnt, 1);
1256 	}
1257 
1258 	return (0);
1259 }
1260 
1261 /*
1262  * Check whether we came across the handcrafted lockfs protocol path. We can't
1263  * simply check for T_DONTBLOCK here as one would assume since this can also
1264  * falsely catch recursive VOP's going to a different filesystem, instead we
1265  * check if we already hold the ulockfs->ul_lock mutex.
1266  */
1267 static int
1268 ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
1269 {
1270 	return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
1271 }
1272 
1273 /*
1274  * ufs_lockfs_begin - start the lockfs locking protocol
1275  */
1276 int
1277 ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1278 {
1279 	int 		error;
1280 	int		rec_vop;
1281 	struct ulockfs *ulp;
1282 	ulockfs_info_t	*ulockfs_info;
1283 	ulockfs_info_t	*ulockfs_info_free;
1284 	ulockfs_info_t	*ulockfs_info_temp;
1285 
1286 	/*
1287 	 * file system has been forcibly unmounted
1288 	 */
1289 	if (ufsvfsp == NULL)
1290 		return (EIO);
1291 
1292 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1293 
1294 	/*
1295 	 * Do lockfs protocol
1296 	 */
1297 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1298 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1299 
1300 	/*
1301 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1302 	 * path and bail out in that case.
1303 	 */
1304 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1305 		*ulpp = NULL;
1306 		return (0);
1307 	} else {
1308 		if (ulockfs_info_free == NULL) {
1309 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1310 			    kmem_zalloc(sizeof (ulockfs_info_t),
1311 			    KM_NOSLEEP)) == NULL) {
1312 				*ulpp = NULL;
1313 				return (ENOMEM);
1314 			}
1315 		}
1316 	}
1317 
1318 	/*
1319 	 * First time VOP call
1320 	 */
1321 	mutex_enter(&ulp->ul_lock);
1322 	if (ULOCKFS_IS_JUSTULOCK(ulp)) {
1323 		if (mask & ULOCKFS_FWLOCK) {
1324 			atomic_add_long(&ulp->ul_falloc_cnt, 1);
1325 			ULOCKFS_SET_FALLOC(ulp);
1326 		} else {
1327 			atomic_add_long(&ulp->ul_vnops_cnt, 1);
1328 		}
1329 	} else {
1330 		if (error = ufs_check_lockfs(ufsvfsp, ulp, mask)) {
1331 			mutex_exit(&ulp->ul_lock);
1332 			if (ulockfs_info_free == NULL)
1333 				kmem_free(ulockfs_info_temp,
1334 				    sizeof (ulockfs_info_t));
1335 			return (error);
1336 		}
1337 	}
1338 	mutex_exit(&ulp->ul_lock);
1339 
1340 	if (ulockfs_info_free != NULL) {
1341 		ulockfs_info_free->ulp = ulp;
1342 		if (mask & ULOCKFS_FWLOCK)
1343 			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1344 	} else {
1345 		ulockfs_info_temp->ulp = ulp;
1346 		ulockfs_info_temp->next = ulockfs_info;
1347 		if (mask & ULOCKFS_FWLOCK)
1348 			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1349 		ASSERT(ufs_lockfs_key != 0);
1350 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1351 	}
1352 
1353 	curthread->t_flag |= T_DONTBLOCK;
1354 	return (0);
1355 }
1356 
1357 /*
1358  * Check whether we are returning from the top level VOP.
1359  */
1360 static int
1361 ufs_lockfs_top_vop_return(ulockfs_info_t *head)
1362 {
1363 	ulockfs_info_t *info;
1364 	int result = 1;
1365 
1366 	for (info = head; info != NULL; info = info->next) {
1367 		if (info->ulp != NULL) {
1368 			result = 0;
1369 			break;
1370 		}
1371 	}
1372 
1373 	return (result);
1374 }
1375 
1376 /*
1377  * ufs_lockfs_end - terminate the lockfs locking protocol
1378  */
1379 void
1380 ufs_lockfs_end(struct ulockfs *ulp)
1381 {
1382 	ulockfs_info_t *info;
1383 	ulockfs_info_t *head;
1384 
1385 	/*
1386 	 * end-of-VOP protocol
1387 	 */
1388 	if (ulp == NULL)
1389 		return;
1390 
1391 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1392 	SEARCH_ULOCKFSP(head, ulp, info);
1393 
1394 	/*
1395 	 * If we're called from a first level VOP, we have to have a
1396 	 * valid ulockfs record in the TSD.
1397 	 */
1398 	ASSERT(info != NULL);
1399 
1400 	/*
1401 	 * Invalidate the ulockfs record.
1402 	 */
1403 	info->ulp = NULL;
1404 
1405 	if (ufs_lockfs_top_vop_return(head))
1406 		curthread->t_flag &= ~T_DONTBLOCK;
1407 
1408 	mutex_enter(&ulp->ul_lock);
1409 
1410 	/* fallocate thread */
1411 	if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) {
1412 		if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1))
1413 			ULOCKFS_CLR_FALLOC(ulp);
1414 	} else  { /* normal thread */
1415 		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
1416 			cv_broadcast(&ulp->ul_cv);
1417 	}
1418 
1419 	/* Clear the thread's fallocate state */
1420 	if (info->flags & ULOCK_INFO_FALLOCATE)
1421 		info->flags &= ~ULOCK_INFO_FALLOCATE;
1422 
1423 	if (ulp->ul_vnops_cnt == 0 && ulp->ul_falloc_cnt)
1424 		cv_broadcast(&ulp->ul_cv);
1425 
1426 	mutex_exit(&ulp->ul_lock);
1427 }
1428 
1429 /*
1430  * specialized version of ufs_lockfs_begin() called by ufs_getpage().
1431  */
1432 int
1433 ufs_lockfs_begin_getpage(
1434 	struct ufsvfs	*ufsvfsp,
1435 	struct ulockfs	**ulpp,
1436 	struct seg	*seg,
1437 	int		read_access,
1438 	uint_t		*protp)
1439 {
1440 	ulong_t			mask;
1441 	int 			error;
1442 	int			rec_vop;
1443 	struct ulockfs		*ulp;
1444 	ulockfs_info_t		*ulockfs_info;
1445 	ulockfs_info_t		*ulockfs_info_free;
1446 	ulockfs_info_t		*ulockfs_info_temp;
1447 
1448 	/*
1449 	 * file system has been forcibly unmounted
1450 	 */
1451 	if (ufsvfsp == NULL)
1452 		return (EIO);
1453 
1454 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1455 
1456 	/*
1457 	 * Do lockfs protocol
1458 	 */
1459 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1460 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1461 
1462 	/*
1463 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1464 	 * path and bail out in that case.
1465 	 */
1466 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1467 		*ulpp = NULL;
1468 		return (0);
1469 	} else {
1470 		if (ulockfs_info_free == NULL) {
1471 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1472 			    kmem_zalloc(sizeof (ulockfs_info_t),
1473 			    KM_NOSLEEP)) == NULL) {
1474 				*ulpp = NULL;
1475 				return (ENOMEM);
1476 			}
1477 		}
1478 	}
1479 
1480 	/*
1481 	 * First time VOP call
1482 	 */
1483 	mutex_enter(&ulp->ul_lock);
1484 	if (ULOCKFS_IS_JUSTULOCK(ulp))
1485 		/*
1486 		 * fs is not locked, simply inc the active-ops counter
1487 		 */
1488 		atomic_add_long(&ulp->ul_vnops_cnt, 1);
1489 	else {
1490 		if (seg->s_ops == &segvn_ops &&
1491 		    ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
1492 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1493 		} else if (protp && read_access) {
1494 			/*
1495 			 * Restrict the mapping to readonly.
1496 			 * Writes to this mapping will cause
1497 			 * another fault which will then
1498 			 * be suspended if fs is write locked
1499 			 */
1500 			*protp &= ~PROT_WRITE;
1501 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1502 		} else
1503 			mask = (ulong_t)ULOCKFS_GETWRITE_MASK;
1504 
1505 		/*
1506 		 * will sleep if this fs is locked against this VOP
1507 		 */
1508 		if (error = ufs_check_lockfs(ufsvfsp, ulp, mask)) {
1509 			mutex_exit(&ulp->ul_lock);
1510 			if (ulockfs_info_free == NULL)
1511 				kmem_free(ulockfs_info_temp,
1512 				    sizeof (ulockfs_info_t));
1513 			return (error);
1514 		}
1515 	}
1516 	mutex_exit(&ulp->ul_lock);
1517 
1518 	if (ulockfs_info_free != NULL) {
1519 		ulockfs_info_free->ulp = ulp;
1520 	} else {
1521 		ulockfs_info_temp->ulp = ulp;
1522 		ulockfs_info_temp->next = ulockfs_info;
1523 		ASSERT(ufs_lockfs_key != 0);
1524 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1525 	}
1526 
1527 	curthread->t_flag |= T_DONTBLOCK;
1528 	return (0);
1529 }
1530 
1531 void
1532 ufs_lockfs_tsd_destructor(void *head)
1533 {
1534 	ulockfs_info_t *curr = (ulockfs_info_t *)head;
1535 	ulockfs_info_t *temp;
1536 
1537 	for (; curr != NULL; ) {
1538 		/*
1539 		 * The TSD destructor is being called when the thread exits
1540 		 * (via thread_exit()). At that time it must have cleaned up
1541 		 * all VOPs via ufs_lockfs_end() and there must not be a
1542 		 * valid ulockfs record exist while a thread is exiting.
1543 		 */
1544 		temp = curr;
1545 		curr = curr->next;
1546 		ASSERT(temp->ulp == NULL);
1547 		kmem_free(temp, sizeof (ulockfs_info_t));
1548 	}
1549 }
1550