xref: /illumos-gate/usr/src/uts/common/fs/ufs/ufs_lockfs.c (revision 7f7322febbcfe774b7270abc3b191c094bfcc517)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/time.h>
33 #include <sys/systm.h>
34 #include <sys/sysmacros.h>
35 #include <sys/resource.h>
36 #include <sys/signal.h>
37 #include <sys/cred.h>
38 #include <sys/user.h>
39 #include <sys/buf.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/proc.h>
43 #include <sys/disp.h>
44 #include <sys/file.h>
45 #include <sys/fcntl.h>
46 #include <sys/flock.h>
47 #include <sys/atomic.h>
48 #include <sys/kmem.h>
49 #include <sys/uio.h>
50 #include <sys/conf.h>
51 #include <sys/mman.h>
52 #include <sys/pathname.h>
53 #include <sys/debug.h>
54 #include <sys/vmmeter.h>
55 #include <sys/vmsystm.h>
56 #include <sys/cmn_err.h>
57 #include <sys/vtrace.h>
58 #include <sys/acct.h>
59 #include <sys/dnlc.h>
60 #include <sys/swap.h>
61 
62 #include <sys/fs/ufs_fs.h>
63 #include <sys/fs/ufs_inode.h>
64 #include <sys/fs/ufs_fsdir.h>
65 #include <sys/fs/ufs_trans.h>
66 #include <sys/fs/ufs_panic.h>
67 #include <sys/fs/ufs_mount.h>
68 #include <sys/fs/ufs_bio.h>
69 #include <sys/fs/ufs_log.h>
70 #include <sys/fs/ufs_quota.h>
71 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
72 #include <sys/errno.h>
73 #include <sys/sysinfo.h>
74 
75 #include <vm/hat.h>
76 #include <vm/pvn.h>
77 #include <vm/as.h>
78 #include <vm/seg.h>
79 #include <vm/seg_map.h>
80 #include <vm/seg_vn.h>
81 #include <vm/rm.h>
82 #include <vm/anon.h>
83 #include <sys/swap.h>
84 #include <sys/dnlc.h>
85 
86 extern struct vnode *common_specvp(struct vnode *vp);
87 
88 /*
 * error lock status
 *	Describes how a lockfs request changes the error-lock state.
 *	ufs__fiolfs() derives one of these values from the old and the new
 *	lock and hands it to ufs_reconcile(); it also selects the error-lock
 *	cleanup performed once the request has succeeded.
 */
89 #define	UN_ERRLCK	(-1)	/* old lock was an error lock, fs is now unlocked */
90 #define	SET_ERRLCK	1	/* error lock applied to a non error-locked fs */
91 #define	RE_ERRLCK	2	/* error lock re-applied over an error lock */
92 #define	NO_ERRLCK	0	/* request does not involve an error lock */
93 
94 /*
95  * Index to be used in TSD for storing lockfs data.
 * The value fetched with tsd_get() under this key is used in this file as
 * the head of a list of ulockfs_info_t entries.
96  */
97 uint_t ufs_lockfs_key;
98 
/*
 * Per-thread lockfs bookkeeping entry.  Entries are chained through `next'
 * and the head of the chain is kept in TSD under ufs_lockfs_key.
 */
99 typedef struct _ulockfs_info {
100 	struct _ulockfs_info *next;	/* next entry, NULL ends the list */
101 	struct ulockfs *ulp;		/* fs lock state; NULL is treated as a */
					/* free slot by IS_REC_VOP() */
102 	uint_t flags;			/* ULOCK_INFO_* bits */
103 } ulockfs_info_t;
104 
105 #define	ULOCK_INFO_FALLOCATE	0x00000001	/* fallocate thread */
106 
/*
 * Check in TSD whether we are already doing any VOP on this filesystem.
 *
 *	found	(out) lvalue set to 1 if an entry for `ulockp' is on the list,
 *		0 otherwise
 *	head	first ulockfs_info_t of the list (may be NULL)
 *	ulockp	struct ulockfs pointer to look for; evaluated on every
 *		iteration, so it must be free of side effects
 *	free	(out) lvalue set to the first entry whose ulp is NULL that
 *		was seen before the search stopped, or NULL if there was none
 *
 * The third parameter is deliberately not called `ulp': a macro parameter
 * with the same name as the structure member would also be substituted in
 * `_curr->ulp', so the macro would only compile when the caller's variable
 * happened to be called `ulp' as well.
 */
#define	IS_REC_VOP(found, head, ulockp, free)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for ((found) = 0, (free) = NULL, _curr = (head);	\
	    _curr != NULL; _curr = _curr->next) {	\
		if (((free) == NULL) &&			\
		    (_curr->ulp == NULL))		\
			(free) = _curr;			\
		if (_curr->ulp == (ulockp)) {		\
			(found) = 1;			\
			break;				\
		}					\
	}						\
}
125 
/*
 * Get the lockfs data from TSD so that lockfs handles the recursive VOP
 * properly.
 *
 *	head	first ulockfs_info_t of the list (may be NULL)
 *	ulockp	struct ulockfs pointer to look for; evaluated on every
 *		iteration, so it must be free of side effects
 *	info	(out) lvalue set to the matching entry, or NULL if the list
 *		holds no entry for `ulockp'
 *
 * The second parameter is deliberately not called `ulp': a macro parameter
 * with the same name as the structure member would also be substituted in
 * `_curr->ulp', so the macro would only compile when the caller's variable
 * happened to be called `ulp' as well.
 */
#define	SEARCH_ULOCKFSP(head, ulockp, info)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (_curr = (head); _curr != NULL;		\
	    _curr = _curr->next) {			\
		if (_curr->ulp == (ulockp)) {		\
			break;				\
		}					\
	}						\
							\
	(info) = _curr;					\
}
143 
144 /*
145  * Validate lockfs request
146  */
147 static int
148 ufs_getlfd(
149 	struct lockfs *lockfsp,		/* new lock request */
150 	struct lockfs *ul_lockfsp)	/* old lock state */
151 {
152 	int	error = 0;
153 
154 	/*
155 	 * no input flags defined
156 	 */
157 	if (lockfsp->lf_flags != 0) {
158 		error = EINVAL;
159 		goto errout;
160 	}
161 
162 	/*
163 	 * check key
164 	 */
165 	if (!LOCKFS_IS_ULOCK(ul_lockfsp))
166 		if (lockfsp->lf_key != ul_lockfsp->lf_key) {
167 			error = EINVAL;
168 			goto errout;
169 	}
170 
171 	lockfsp->lf_key = ul_lockfsp->lf_key + 1;
172 
173 errout:
174 	return (error);
175 }
176 
177 /*
178  * ufs_checkaccton
179  *	check if accounting is turned on on this fs
180  */
181 
182 int
183 ufs_checkaccton(struct vnode *vp)
184 {
185 	if (acct_fs_in_use(vp))
186 		return (EDEADLK);
187 	return (0);
188 }
189 
190 /*
191  * ufs_checkswapon
192  *	check if local swapping is to file on this fs
193  */
194 int
195 ufs_checkswapon(struct vnode *vp)
196 {
197 	struct swapinfo	*sip;
198 
199 	mutex_enter(&swapinfo_lock);
200 	for (sip = swapinfo; sip; sip = sip->si_next)
201 		if (sip->si_vp->v_vfsp == vp->v_vfsp) {
202 			mutex_exit(&swapinfo_lock);
203 			return (EDEADLK);
204 		}
205 	mutex_exit(&swapinfo_lock);
206 	return (0);
207 }
208 
209 /*
210  * ufs_freeze
211  *	pend future accesses for current lock and desired lock
212  */
213 void
214 ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
215 {
216 	/*
217 	 * set to new lock type
218 	 */
219 	ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
220 	ulp->ul_lockfs.lf_key = lockfsp->lf_key;
221 	ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
222 	ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;
223 
224 	ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
225 }
226 
227 /*
228  * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
229  * starting ufs_quiesce() protocol and decrement it only when a file system no
230  * longer has to be in quiescent state. This allows ufs_pageio() to detect
231  * that another thread wants to quiesce a file system. See more comments in
232  * ufs_pageio().
 *
 * In this file ufs__fiolfs() adjusts the counter with atomic_add_long()
 * around each lock request, and ufs_quiesce() asserts that it is non-zero.
233  */
234 ulong_t ufs_quiesce_pend = 0;
235 
236 /*
237  * ufs_quiesce
238  *	wait for outstanding accesses to finish
239  */
240 int
241 ufs_quiesce(struct ulockfs *ulp)
242 {
243 	int error = 0;
244 	ulockfs_info_t *head;
245 	ulockfs_info_t *info;
246 
247 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
248 	SEARCH_ULOCKFSP(head, ulp, info);
249 
250 	/*
251 	 * Set a softlock to suspend future ufs_vnops so that
252 	 * this lockfs request will not be starved
253 	 */
254 	ULOCKFS_SET_SLOCK(ulp);
255 	ASSERT(ufs_quiesce_pend);
256 
257 	/* check if there is any outstanding ufs vnodeops calls */
258 	while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
259 		/*
260 		 * use timed version of cv_wait_sig() to make sure we don't
261 		 * miss a wake up call from ufs_pageio() when it doesn't use
262 		 * ul_lock.
263 		 *
264 		 * when a fallocate thread comes in, the only way it returns
265 		 * from this function is if there are no other vnode operations
266 		 * going on (remember fallocate threads are tracked using
267 		 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
268 		 * hasn't already grabbed the fs write lock.
269 		 */
270 		if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
271 			if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
272 				goto out;
273 		}
274 		if (!cv_timedwait_sig(&ulp->ul_cv, &ulp->ul_lock, lbolt + hz)) {
275 			error = EINTR;
276 			goto out;
277 		}
278 	}
279 
280 out:
281 	/*
282 	 * unlock the soft lock
283 	 */
284 	ULOCKFS_CLR_SLOCK(ulp);
285 
286 	return (error);
287 }
288 
289 /*
290  * ufs_flush_inode
291  */
292 int
293 ufs_flush_inode(struct inode *ip, void *arg)
294 {
295 	int	error;
296 	int	saverror	= 0;
297 
298 	/*
299 	 * wrong file system; keep looking
300 	 */
301 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
302 		return (0);
303 
304 	/*
305 	 * asynchronously push all the dirty pages
306 	 */
307 	if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
308 	    (error != EAGAIN))
309 		saverror = error;
310 	/*
311 	 * wait for io and discard all mappings
312 	 */
313 	if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
314 		saverror = error;
315 
316 	if (ITOV(ip)->v_type == VDIR) {
317 		dnlc_dir_purge(&ip->i_danchor);
318 	}
319 
320 	return (saverror);
321 }
322 
323 /*
324  * ufs_flush
325  *	Flush everything that is currently dirty; this includes invalidating
326  *	any mappings.
327  */
328 int
329 ufs_flush(struct vfs *vfsp)
330 {
331 	int		error;
332 	int		saverror = 0;
333 	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
334 	struct fs	*fs		= ufsvfsp->vfs_fs;
335 
336 	ASSERT(vfs_lock_held(vfsp));
337 
338 	/*
339 	 * purge dnlc
340 	 */
341 	(void) dnlc_purge_vfsp(vfsp, 0);
342 
343 	/*
344 	 * drain the delete and idle threads
345 	 */
346 	ufs_delete_drain(vfsp, 0, 0);
347 	ufs_idle_drain(vfsp);
348 
349 	/*
350 	 * flush and invalidate quota records
351 	 */
352 	(void) qsync(ufsvfsp);
353 
354 	/*
355 	 * flush w/invalidate the inodes for vfsp
356 	 */
357 	if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
358 		saverror = error;
359 
360 	/*
361 	 * synchronously flush superblock and summary info
362 	 */
363 	if (fs->fs_ronly == 0 && fs->fs_fmod) {
364 		fs->fs_fmod = 0;
365 		TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
366 	}
367 	/*
368 	 * flush w/invalidate block device pages and buf cache
369 	 */
370 	if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
371 	    (offset_t)0, 0, B_INVAL, CRED())) > 0)
372 		saverror = error;
373 
374 	(void) bflush((dev_t)vfsp->vfs_dev);
375 	(void) bfinval((dev_t)vfsp->vfs_dev, 0);
376 
377 	/*
378 	 * drain the delete and idle threads again
379 	 */
380 	ufs_delete_drain(vfsp, 0, 0);
381 	ufs_idle_drain(vfsp);
382 
383 	/*
384 	 * play with the clean flag
385 	 */
386 	if (saverror == 0)
387 		ufs_checkclean(vfsp);
388 
389 	/*
390 	 * Flush any outstanding transactions and roll the log
391 	 * only if we are supposed to do, i.e. LDL_NOROLL not set.
392 	 * We can not simply check for fs_ronly here since fsck also may
393 	 * use this code to roll the log on a read-only filesystem, e.g.
394 	 * root during early stages of boot, if other then a sanity check is
395 	 * done, it will clear LDL_NOROLL before.
396 	 * In addition we assert that the deltamap does not contain any deltas
397 	 * in case LDL_NOROLL is set since this is not supposed to happen.
398 	 */
399 	if (TRANS_ISTRANS(ufsvfsp)) {
400 		ml_unit_t	*ul	= ufsvfsp->vfs_log;
401 		mt_map_t	*mtm	= ul->un_deltamap;
402 
403 		if (ul->un_flags & LDL_NOROLL) {
404 			ASSERT(mtm->mtm_nme == 0);
405 		} else {
406 			curthread->t_flag |= T_DONTBLOCK;
407 			TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
408 			    TOP_COMMIT_SIZE, error);
409 			if (!error) {
410 				TRANS_END_SYNC(ufsvfsp, saverror,
411 				    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
412 			}
413 			curthread->t_flag &= ~T_DONTBLOCK;
414 			logmap_roll_dev(ufsvfsp->vfs_log);
415 		}
416 	}
417 
418 	return (saverror);
419 }
420 
421 /*
422  * ufs_thaw_wlock
423  *	special processing when thawing down to wlock
424  */
425 static int
426 ufs_thaw_wlock(struct inode *ip, void *arg)
427 {
428 	/*
429 	 * wrong file system; keep looking
430 	 */
431 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
432 		return (0);
433 
434 	/*
435 	 * iupdat refuses to clear flags if the fs is read only.  The fs
436 	 * may become read/write during the lock and we wouldn't want
437 	 * these inodes being written to disk.  So clear the flags.
438 	 */
439 	rw_enter(&ip->i_contents, RW_WRITER);
440 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
441 	rw_exit(&ip->i_contents);
442 
443 	/*
444 	 * pages are mlocked -- fail wlock
445 	 */
446 	if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
447 		return (EBUSY);
448 
449 	return (0);
450 }
451 
452 /*
453  * ufs_thaw_hlock
454  *	special processing when thawing down to hlock or elock
455  */
456 static int
457 ufs_thaw_hlock(struct inode *ip, void *arg)
458 {
459 	struct vnode	*vp	= ITOV(ip);
460 
461 	/*
462 	 * wrong file system; keep looking
463 	 */
464 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
465 		return (0);
466 
467 	/*
468 	 * blow away all pages - even if they are mlocked
469 	 */
470 	do {
471 		(void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
472 	} while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
473 	rw_enter(&ip->i_contents, RW_WRITER);
474 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
475 	rw_exit(&ip->i_contents);
476 
477 	return (0);
478 }
479 
480 /*
481  * ufs_thaw
482  *	thaw file system lock down to current value
483  */
484 int
485 ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
486 {
487 	int		error	= 0;
488 	int		noidel	= (int)(ulp->ul_flag & ULOCKFS_NOIDEL);
489 
490 	/*
491 	 * if wlock or hlock or elock
492 	 */
493 	if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
494 	    ULOCKFS_IS_ELOCK(ulp)) {
495 
496 		/*
497 		 * don't keep access times
498 		 * don't free deleted files
499 		 * if superblock writes are allowed, limit them to me for now
500 		 */
501 		ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
502 		if (ulp->ul_sbowner != (kthread_id_t)-1)
503 			ulp->ul_sbowner = curthread;
504 
505 		/*
506 		 * wait for writes for deleted files and superblock updates
507 		 */
508 		(void) ufs_flush(vfsp);
509 
510 		/*
511 		 * now make sure the quota file is up-to-date
512 		 *	expensive; but effective
513 		 */
514 		error = ufs_flush(vfsp);
515 		/*
516 		 * no one can write the superblock
517 		 */
518 		ulp->ul_sbowner = (kthread_id_t)-1;
519 
520 		/*
521 		 * special processing for wlock/hlock/elock
522 		 */
523 		if (ULOCKFS_IS_WLOCK(ulp)) {
524 			if (error)
525 				goto errout;
526 			error = bfinval(ufsvfsp->vfs_dev, 0);
527 			if (error)
528 				goto errout;
529 			error = ufs_scan_inodes(0, ufs_thaw_wlock,
530 					(void *)ufsvfsp, ufsvfsp);
531 			if (error)
532 				goto errout;
533 		}
534 		if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
535 			error = 0;
536 			(void) ufs_scan_inodes(0, ufs_thaw_hlock,
537 					(void *)ufsvfsp, ufsvfsp);
538 			(void) bfinval(ufsvfsp->vfs_dev, 1);
539 		}
540 	} else {
541 
542 		/*
543 		 * okay to keep access times
544 		 * okay to free deleted files
545 		 * okay to write the superblock
546 		 */
547 		ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
548 		ulp->ul_sbowner = NULL;
549 
550 		/*
551 		 * flush in case deleted files are in memory
552 		 */
553 		if (noidel) {
554 			if (error = ufs_flush(vfsp))
555 				goto errout;
556 		}
557 	}
558 
559 errout:
560 	cv_broadcast(&ulp->ul_cv);
561 	return (error);
562 }
563 
564 /*
565  * ufs_reconcile_fs
566  *	reconcile incore superblock with ondisk superblock
567  */
568 int
569 ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
570 {
571 	struct fs	*mfs; 	/* in-memory superblock */
572 	struct fs	*dfs;	/* on-disk   superblock */
573 	struct buf	*bp;	/* on-disk   superblock buf */
574 	int		 needs_unlock;
575 	char		 finished_fsclean;
576 
577 	mfs = ufsvfsp->vfs_fs;
578 
579 	/*
580 	 * get the on-disk copy of the superblock
581 	 */
582 	bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
583 	bp->b_flags |= (B_STALE|B_AGE);
584 	if (bp->b_flags & B_ERROR) {
585 		brelse(bp);
586 		return (EIO);
587 	}
588 	dfs = bp->b_un.b_fs;
589 
590 	/* error locks may only unlock after the fs has been made consistent */
591 	if (errlck == UN_ERRLCK) {
592 		if (dfs->fs_clean == FSFIX) {	/* being repaired */
593 			brelse(bp);
594 			return (EAGAIN);
595 		}
596 		/* repair not yet started? */
597 		finished_fsclean = TRANS_ISTRANS(ufsvfsp)? FSLOG: FSCLEAN;
598 		if (dfs->fs_clean != finished_fsclean) {
599 			brelse(bp);
600 			return (EBUSY);
601 		}
602 	}
603 
604 	/*
605 	 * if superblock has changed too much, abort
606 	 */
607 	if ((mfs->fs_sblkno		!= dfs->fs_sblkno) ||
608 	    (mfs->fs_cblkno		!= dfs->fs_cblkno) ||
609 	    (mfs->fs_iblkno		!= dfs->fs_iblkno) ||
610 	    (mfs->fs_dblkno		!= dfs->fs_dblkno) ||
611 	    (mfs->fs_cgoffset		!= dfs->fs_cgoffset) ||
612 	    (mfs->fs_cgmask		!= dfs->fs_cgmask) ||
613 	    (mfs->fs_bsize		!= dfs->fs_bsize) ||
614 	    (mfs->fs_fsize		!= dfs->fs_fsize) ||
615 	    (mfs->fs_frag		!= dfs->fs_frag) ||
616 	    (mfs->fs_bmask		!= dfs->fs_bmask) ||
617 	    (mfs->fs_fmask		!= dfs->fs_fmask) ||
618 	    (mfs->fs_bshift		!= dfs->fs_bshift) ||
619 	    (mfs->fs_fshift		!= dfs->fs_fshift) ||
620 	    (mfs->fs_fragshift		!= dfs->fs_fragshift) ||
621 	    (mfs->fs_fsbtodb		!= dfs->fs_fsbtodb) ||
622 	    (mfs->fs_sbsize		!= dfs->fs_sbsize) ||
623 	    (mfs->fs_nindir		!= dfs->fs_nindir) ||
624 	    (mfs->fs_nspf		!= dfs->fs_nspf) ||
625 	    (mfs->fs_trackskew		!= dfs->fs_trackskew) ||
626 	    (mfs->fs_cgsize		!= dfs->fs_cgsize) ||
627 	    (mfs->fs_ntrak		!= dfs->fs_ntrak) ||
628 	    (mfs->fs_nsect		!= dfs->fs_nsect) ||
629 	    (mfs->fs_spc		!= dfs->fs_spc) ||
630 	    (mfs->fs_cpg		!= dfs->fs_cpg) ||
631 	    (mfs->fs_ipg		!= dfs->fs_ipg) ||
632 	    (mfs->fs_fpg		!= dfs->fs_fpg) ||
633 	    (mfs->fs_postblformat	!= dfs->fs_postblformat) ||
634 	    (mfs->fs_magic		!= dfs->fs_magic)) {
635 		brelse(bp);
636 		return (EACCES);
637 	}
638 	if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
639 		if (mfs->fs_clean == FSLOG) {
640 			brelse(bp);
641 			return (EACCES);
642 		}
643 
644 	/*
645 	 * get new summary info
646 	 */
647 	if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
648 		brelse(bp);
649 		return (EIO);
650 	}
651 
652 	/*
653 	 * release old summary info and update in-memory superblock
654 	 */
655 	kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
656 	mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;	/* Only entry 0 used */
657 
658 	/*
659 	 * update fields allowed to change
660 	 */
661 	mfs->fs_size		= dfs->fs_size;
662 	mfs->fs_dsize		= dfs->fs_dsize;
663 	mfs->fs_ncg		= dfs->fs_ncg;
664 	mfs->fs_minfree		= dfs->fs_minfree;
665 	mfs->fs_rotdelay	= dfs->fs_rotdelay;
666 	mfs->fs_rps		= dfs->fs_rps;
667 	mfs->fs_maxcontig	= dfs->fs_maxcontig;
668 	mfs->fs_maxbpg		= dfs->fs_maxbpg;
669 	mfs->fs_csmask		= dfs->fs_csmask;
670 	mfs->fs_csshift		= dfs->fs_csshift;
671 	mfs->fs_optim		= dfs->fs_optim;
672 	mfs->fs_csaddr		= dfs->fs_csaddr;
673 	mfs->fs_cssize		= dfs->fs_cssize;
674 	mfs->fs_ncyl		= dfs->fs_ncyl;
675 	mfs->fs_cstotal		= dfs->fs_cstotal;
676 	mfs->fs_reclaim		= dfs->fs_reclaim;
677 
678 	if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
679 		mfs->fs_reclaim &= ~FS_RECLAIM;
680 		mfs->fs_reclaim |=  FS_RECLAIMING;
681 		ufs_thread_start(&ufsvfsp->vfs_reclaim,
682 			ufs_thread_reclaim, vfsp);
683 	}
684 
685 	/* XXX What to do about sparecon? */
686 
687 	/* XXX need to copy volume label */
688 
689 	/*
690 	 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
691 	 * or if error-locked and ondisk is now clean
692 	 */
693 	needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
694 	if (needs_unlock)
695 		mutex_enter(&ufsvfsp->vfs_lock);
696 
697 	if (errlck == UN_ERRLCK) {
698 		if (finished_fsclean == dfs->fs_clean)
699 			mfs->fs_clean = finished_fsclean;
700 		else
701 			mfs->fs_clean = FSBAD;
702 		mfs->fs_state = FSOKAY - dfs->fs_time;
703 	}
704 
705 	if (FSOKAY != dfs->fs_state + dfs->fs_time ||
706 	    (dfs->fs_clean == FSBAD))
707 		mfs->fs_clean = FSBAD;
708 
709 	if (needs_unlock)
710 		mutex_exit(&ufsvfsp->vfs_lock);
711 
712 	brelse(bp);
713 
714 	return (0);
715 }
716 
717 /*
718  * ufs_reconcile_inode
719  *	reconcile ondisk inode with incore inode
720  */
721 static int
722 ufs_reconcile_inode(struct inode *ip, void *arg)
723 {
724 	int		i;
725 	int		ndaddr;
726 	int		niaddr;
727 	struct dinode	*dp;		/* ondisk inode */
728 	struct buf	*bp	= NULL;
729 	uid_t		d_uid;
730 	gid_t		d_gid;
731 	int		error = 0;
732 	struct fs	*fs;
733 
734 	/*
735 	 * not an inode we care about
736 	 */
737 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
738 		return (0);
739 
740 	fs = ip->i_fs;
741 
742 	/*
743 	 * Inode reconciliation fails: we made the filesystem quiescent
744 	 * and we did a ufs_flush() before calling ufs_reconcile_inode()
745 	 * and thus the inode should not have been changed inbetween.
746 	 * Any discrepancies indicate a logic error and a pretty
747 	 * significant run-state inconsistency we should complain about.
748 	 */
749 	if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
750 		cmn_err(CE_WARN, "%s: Inode reconciliation failed for"
751 		    "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
752 		return (EINVAL);
753 	}
754 
755 	/*
756 	 * get the dinode
757 	 */
758 	bp = UFS_BREAD(ip->i_ufsvfs,
759 			ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
760 	    (int)fs->fs_bsize);
761 	if (bp->b_flags & B_ERROR) {
762 		brelse(bp);
763 		return (EIO);
764 	}
765 	dp  = bp->b_un.b_dino;
766 	dp += itoo(fs, ip->i_number);
767 
768 	/*
769 	 * handle Sun's implementation of EFT
770 	 */
771 	d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
772 	d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (uid_t)dp->di_sgid;
773 
774 	rw_enter(&ip->i_contents, RW_WRITER);
775 
776 	/*
777 	 * some fields are not allowed to change
778 	 */
779 	if ((ip->i_mode  != dp->di_mode) ||
780 	    (ip->i_gen   != dp->di_gen) ||
781 	    (ip->i_uid   != d_uid) ||
782 	    (ip->i_gid   != d_gid)) {
783 		error = EACCES;
784 		goto out;
785 	}
786 
787 	/*
788 	 * and some are allowed to change
789 	 */
790 	ip->i_size		= dp->di_size;
791 	ip->i_ic.ic_flags	= dp->di_ic.ic_flags;
792 	ip->i_blocks		= dp->di_blocks;
793 	ip->i_nlink		= dp->di_nlink;
794 	if (ip->i_flag & IFASTSYMLNK) {
795 		ndaddr = 1;
796 		niaddr = 0;
797 	} else {
798 		ndaddr = NDADDR;
799 		niaddr = NIADDR;
800 	}
801 	for (i = 0; i < ndaddr; ++i)
802 		ip->i_db[i] = dp->di_db[i];
803 	for (i = 0; i < niaddr; ++i)
804 		ip->i_ib[i] = dp->di_ib[i];
805 
806 out:
807 	rw_exit(&ip->i_contents);
808 	brelse(bp);
809 	return (error);
810 }
811 
812 /*
813  * ufs_reconcile
814  *	reconcile ondisk superblock/inodes with any incore
815  */
816 static int
817 ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
818 {
819 	int	error = 0;
820 
821 	/*
822 	 * get rid of as much inmemory data as possible
823 	 */
824 	(void) ufs_flush(vfsp);
825 
826 	/*
827 	 * reconcile the superblock and inodes
828 	 */
829 	if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
830 		return (error);
831 	if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
832 		return (error);
833 	/*
834 	 * allocation blocks may be incorrect; get rid of them
835 	 */
836 	(void) ufs_flush(vfsp);
837 
838 	return (error);
839 }
840 
/*
 * ufs_fiolfs
 *	File system locking.  Forwards the request to ufs__fiolfs() with
 *	from_user set to 1 and returns its result.
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
	int	from_user = 1;

	return (ufs__fiolfs(vp, lockfsp, from_user, from_log));
}
849 
850 /*
 * ufs__fiolfs
 *	kernel-internal interface, also used by fix-on-panic
 *
 *	Applies the lock request in lockfsp to the file system holding vp.
 *
 *	vp		any vnode of the file system
 *	lockfsp		requested lock; its lf_key is updated by ufs_getlfd()
 *	from_user	non-zero lets a newly set error lock call ufs_fault()
 *			unless the fs is marked panic-on-error
 *	from_log	non-zero skips the accounting and swap file checks
 *
 *	Returns 0 on success.  EINVAL/EIO are returned for bad arguments
 *	before anything is touched; every later failure leaves through
 *	errout or errexit, which drop the locks taken here and let the
 *	delete and reclaim threads continue.  Once the fs has been frozen,
 *	a failure also restores the previous lock state (unless the fs is
 *	hard locked) and thaws the fs again.
 */
851 int
852 ufs__fiolfs(
853 	struct vnode *vp,
854 	struct lockfs *lockfsp,
855 	int from_user,
856 	int from_log)
857 {
858 	struct ulockfs	*ulp;
859 	struct lockfs	lfs;		/* copy of the lock state being replaced */
860 	int		error;
861 	struct vfs	*vfsp;
862 	struct ufsvfs	*ufsvfsp;
863 	int		 errlck		= NO_ERRLCK;
864 	int		 poll_events	= POLLPRI;
865 	extern struct pollhead ufs_pollhd;
866 	ulockfs_info_t *head;
867 	ulockfs_info_t *info;		/* this thread's TSD entry for ulp, if any */
868 
869 	/* check valid lock type */
870 	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
871 		return (EINVAL);
872 
873 	if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
874 		return (EIO);
875 
876 	vfsp = vp->v_vfsp;
877 	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
878 	ulp = &ufsvfsp->vfs_ulockfs;
879 
880 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
881 	SEARCH_ULOCKFSP(head, ulp, info);
882 
883 	/*
884 	 * Suspend both the reclaim thread and the delete thread.
885 	 * This must be done outside the lockfs locking protocol.
886 	 */
887 	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
888 	ufs_thread_suspend(&ufsvfsp->vfs_delete);
889 
890 	/*
891 	 * Acquire vfs_reflock around ul_lock to avoid deadlock with
892 	 * umount/remount/sync.
893 	 */
894 	vfs_lock_wait(vfsp);
895 	mutex_enter(&ulp->ul_lock);
896 	atomic_add_long(&ufs_quiesce_pend, 1);
897 
898 	/*
899 	 * Quit if there is another lockfs request in progress
900 	 * that is waiting for existing ufs_vnops to complete.
901 	 */
902 	if (ULOCKFS_IS_BUSY(ulp)) {
903 		error = EBUSY;
904 		goto errexit;
905 	}
906 
907 	/* cannot unlock or downgrade a hard-lock */
908 	if (ULOCKFS_IS_HLOCK(ulp)) {
909 		error = EIO;
910 		goto errexit;
911 	}
912 
913 	/* an error lock may be unlocked or relocked, only */
914 	if (ULOCKFS_IS_ELOCK(ulp)) {
915 		if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
916 			error = EBUSY;
917 			goto errexit;
918 		}
919 	}
920 
921 	/*
922 	 * a read-only error lock may only be upgraded to an
923 	 * error lock or hard lock
924 	 */
925 	if (ULOCKFS_IS_ROELOCK(ulp)) {
926 		if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
927 			error = EBUSY;
928 			goto errexit;
929 		}
930 	}
931 
932 	/*
933 	 * until read-only error locks are fully implemented
934 	 * just return EINVAL
935 	 */
936 	if (LOCKFS_IS_ROELOCK(lockfsp)) {
937 		error = EINVAL;
938 		goto errexit;
939 	}
940 
941 	/*
942 	 * an error lock may only be applied if the file system is
943 	 * unlocked or already error locked.
944 	 * (this is to prevent the case where a fs gets changed out from
945 	 * underneath a fs that is locked for backup,
946 	 * that is, name/delete/write-locked.)
947 	 */
948 	if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
949 	    !ULOCKFS_IS_ROELOCK(ulp)) &&
950 	    (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
951 		error = EBUSY;
952 		goto errexit;
953 	}
954 
955 	/* get and validate the input lockfs request */
956 	if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
957 		goto errexit;
958 
959 	/*
960 	 * save current ulockfs struct so that it can be restored at errout
961 	 */
962 	bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));
963 
964 	/*
965 	 * Freeze the file system (pend future accesses)
966 	 */
967 	ufs_freeze(ulp, lockfsp);
968 
969 	/*
970 	 * Set locking in progress because ufs_quiesce may free the
971 	 * ul_lock mutex.
972 	 */
973 	ULOCKFS_SET_BUSY(ulp);
974 	/* update the ioctl copy */
975 	LOCKFS_SET_BUSY(&ulp->ul_lockfs);
976 
977 	/*
978 	 * We need to unset FWLOCK status before we call ufs_quiesce
979 	 * so that the thread doesn't get suspended. We do this only if
980 	 * this (fallocate) thread requested an unlock operation.
981 	 */
982 	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
983 		if (!ULOCKFS_IS_WLOCK(ulp))
984 			ULOCKFS_CLR_FWLOCK(ulp);
985 	}
986 
987 	/*
988 	 * Quiesce (wait for outstanding accesses to finish)
989 	 */
990 	if (error = ufs_quiesce(ulp))
991 		goto errout;
992 
993 	/*
994 	 * If the fallocate thread requested a write fs lock operation
995 	 * then we set fwlock status in the ulp.
996 	 */
997 	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
998 		if (ULOCKFS_IS_WLOCK(ulp))
999 			ULOCKFS_SET_FWLOCK(ulp);
1000 	}
1001 
1002 	/*
1003 	 * can't wlock or (ro)elock fs with accounting or local swap file
1004 	 */
1005 	if ((ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
1006 	    ULOCKFS_IS_ROELOCK(ulp)) && !from_log) {
1007 		if (error = ufs_checkaccton(vp))
1008 			goto errout;
1009 		if (error = ufs_checkswapon(vp))
1010 			goto errout;
1011 	}
1012 
1013 	/*
1014 	 * save error lock status to pass down to reconciliation
1015 	 * routines and for later cleanup
1016 	 */
1017 	if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
1018 		errlck = UN_ERRLCK;
1019 
1020 	if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
1021 		int needs_unlock;
1022 		int needs_sbwrite;
1023 
1024 		poll_events |= POLLERR;
		/* relock if the old state was an error lock too, else a new one */
1025 		errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs)?
1026 							RE_ERRLCK: SET_ERRLCK;
1027 
		/* vfs_lock is only taken here if this thread does not hold it */
1028 		needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
1029 		if (needs_unlock)
1030 			mutex_enter(&ufsvfsp->vfs_lock);
1031 
1032 		/* disable delayed i/o */
1033 		needs_sbwrite = 0;
1034 
1035 		if (errlck == SET_ERRLCK) {
1036 			ufsvfsp->vfs_fs->fs_clean = FSBAD;
1037 			needs_sbwrite = 1;
1038 		}
1039 
1040 		needs_sbwrite |= ufsvfsp->vfs_dio;
1041 		ufsvfsp->vfs_dio = 0;
1042 
1043 		if (needs_unlock)
1044 			mutex_exit(&ufsvfsp->vfs_lock);
1045 
1046 		if (needs_sbwrite) {
1047 			ulp->ul_sbowner = curthread;
1048 			TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
1049 
1050 			if (needs_unlock)
1051 				mutex_enter(&ufsvfsp->vfs_lock);
1052 
1053 			ufsvfsp->vfs_fs->fs_fmod = 0;
1054 
1055 			if (needs_unlock)
1056 				mutex_exit(&ufsvfsp->vfs_lock);
1057 		}
1058 	}
1059 
1060 	/*
1061 	 * reconcile superblock and inodes if was wlocked or error locked
1062 	 */
1063 	if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
1064 		if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
1065 			goto errout;
1066 		/*
1067 		 * in case the fs grew; reset the metadata map for logging tests
1068 		 */
1069 		TRANS_MATA_UMOUNT(ufsvfsp);
1070 		TRANS_MATA_MOUNT(ufsvfsp);
1071 		TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
1072 	}
1073 
1074 	/*
1075 	 * At least everything *currently* dirty goes out.
	 * A flush error does not stop a hard lock or an error lock.
1076 	 */
1077 
1078 	if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
1079 	    !ULOCKFS_IS_ELOCK(ulp))
1080 		goto errout;
1081 
1082 	/*
1083 	 * thaw file system and wakeup pended processes
	 * (again, errors are ignored for a hard lock or an error lock)
1084 	 */
1085 	if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
1086 		if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
1087 			goto errout;
1088 
1089 	/*
1090 	 * reset modified flag if not already write locked
1091 	 */
1092 	if (!LOCKFS_IS_WLOCK(&lfs))
1093 		ULOCKFS_CLR_MOD(ulp);
1094 
1095 	/*
1096 	 * idle the lock struct
1097 	 */
1098 	ULOCKFS_CLR_BUSY(ulp);
1099 	/* update the ioctl copy */
1100 	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);
1101 
1102 	/*
1103 	 * free current comment (the one saved from the replaced lock state)
1104 	 */
1105 	if (lfs.lf_comment && lfs.lf_comlen != 0) {
1106 		kmem_free(lfs.lf_comment, lfs.lf_comlen);
1107 		lfs.lf_comment = NULL;
1108 		lfs.lf_comlen = 0;
1109 	}
1110 
1111 	/* do error lock cleanup */
1112 	if (errlck == UN_ERRLCK)
1113 		ufsfx_unlockfs(ufsvfsp);
1114 
1115 	else if (errlck == RE_ERRLCK)
1116 		ufsfx_lockfs(ufsvfsp);
1117 
1118 	/* don't allow error lock from user to invoke panic */
1119 	else if (from_user && errlck == SET_ERRLCK &&
1120 		!(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
1121 		(void) ufs_fault(ufsvfsp->vfs_root,
1122 		    ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
1123 		    ulp->ul_lockfs.lf_comment: "user-applied error lock");
1124 
1125 	atomic_add_long(&ufs_quiesce_pend, -1);
1126 	mutex_exit(&ulp->ul_lock);
1127 	vfs_unlock(vfsp);
1128 
1129 	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
1130 		poll_events |= POLLERR;
1131 
1132 	pollwakeup(&ufs_pollhd, poll_events);
1133 
1134 	/*
1135 	 * Allow both the delete thread and the reclaim thread to
1136 	 * continue.
1137 	 */
1138 	ufs_thread_continue(&ufsvfsp->vfs_delete);
1139 	ufs_thread_continue(&ufsvfsp->vfs_reclaim);
1140 
1141 	return (0);
1142 
1143 errout:
1144 	/*
1145 	 * Lock failed. Reset the old lock in ufsvfs if not hard locked.
1146 	 */
1147 	if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
1148 		bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
1149 		ulp->ul_fs_lock = (1 << lfs.lf_lock);
1150 	}
1151 	(void) ufs_thaw(vfsp, ufsvfsp, ulp);
1152 	ULOCKFS_CLR_BUSY(ulp);
1153 	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);
1154 
	/* failures detected before the fs was frozen enter here */
1155 errexit:
1156 	atomic_add_long(&ufs_quiesce_pend, -1);
1157 	mutex_exit(&ulp->ul_lock);
1158 	vfs_unlock(vfsp);
1159 
1160 	/*
1161 	 * Allow both the delete thread and the reclaim thread to
1162 	 * continue.
1163 	 */
1164 	ufs_thread_continue(&ufsvfsp->vfs_delete);
1165 	ufs_thread_continue(&ufsvfsp->vfs_reclaim);
1166 
1167 	return (error);
1168 }
1169 
1170 /*
1171  * fiolfss
1172  * 	return the current file system locking state info
1173  */
1174 int
1175 ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
1176 {
1177 	struct ulockfs	*ulp;
1178 
1179 	if (!vp || !vp->v_vfsp || !VTOI(vp))
1180 		return (EINVAL);
1181 
1182 	/* file system has been forcibly unmounted */
1183 	if (VTOI(vp)->i_ufsvfs == NULL)
1184 		return (EIO);
1185 
1186 	ulp = VTOUL(vp);
1187 
1188 	if (ULOCKFS_IS_HLOCK(ulp)) {
1189 		*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1190 		return (0);
1191 	}
1192 
1193 	mutex_enter(&ulp->ul_lock);
1194 
1195 	*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1196 
1197 	if (ULOCKFS_IS_MOD(ulp))
1198 		lockfsp->lf_flags |= LOCKFS_MOD;
1199 
1200 	mutex_exit(&ulp->ul_lock);
1201 
1202 	return (0);
1203 }
1204 
1205 /*
1206  * ufs_check_lockfs
1207  *	check whether a ufs_vnops conflicts with the file system lock
1208  */
1209 int
1210 ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
1211 {
1212 	k_sigset_t	smask;
1213 	int		sig, slock;
1214 
1215 	ASSERT(MUTEX_HELD(&ulp->ul_lock));
1216 
1217 	while (ulp->ul_fs_lock & mask) {
1218 		slock = (int)ULOCKFS_IS_SLOCK(ulp);
1219 		if ((curthread->t_flag & T_DONTPEND) && !slock) {
1220 			curthread->t_flag |= T_WOULDBLOCK;
1221 			return (EAGAIN);
1222 		}
1223 		curthread->t_flag &= ~T_WOULDBLOCK;
1224 
1225 		if (ULOCKFS_IS_HLOCK(ulp))
1226 			return (EIO);
1227 
1228 		/*
1229 		 * wait for lock status to change
1230 		 */
1231 		if (slock || ufsvfsp->vfs_nointr) {
1232 			cv_wait(&ulp->ul_cv, &ulp->ul_lock);
1233 		} else {
1234 			sigintr(&smask, 1);
1235 			sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
1236 			sigunintr(&smask);
1237 			if ((!sig && (ulp->ul_fs_lock & mask)) ||
1238 				ufsvfsp->vfs_dontblock)
1239 				return (EINTR);
1240 		}
1241 	}
1242 
1243 	if (mask & ULOCKFS_FWLOCK) {
1244 		atomic_add_long(&ulp->ul_falloc_cnt, 1);
1245 		ULOCKFS_SET_FALLOC(ulp);
1246 	} else {
1247 		atomic_add_long(&ulp->ul_vnops_cnt, 1);
1248 	}
1249 
1250 	return (0);
1251 }
1252 
1253 /*
1254  * Check whether we came across the handcrafted lockfs protocol path. We can't
1255  * simply check for T_DONTBLOCK here as one would assume since this can also
1256  * falsely catch recursive VOP's going to a different filesystem, instead we
1257  * check if we already hold the ulockfs->ul_lock mutex.
1258  */
1259 static int
1260 ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
1261 {
1262 	return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
1263 }
1264 
1265 /*
1266  * ufs_lockfs_begin - start the lockfs locking protocol
1267  */
1268 int
1269 ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1270 {
1271 	int 		error;
1272 	int		rec_vop;
1273 	struct ulockfs *ulp;
1274 	ulockfs_info_t	*ulockfs_info;
1275 	ulockfs_info_t	*ulockfs_info_free;
1276 	ulockfs_info_t	*ulockfs_info_temp;
1277 
1278 	/*
1279 	 * file system has been forcibly unmounted
1280 	 */
1281 	if (ufsvfsp == NULL)
1282 		return (EIO);
1283 
1284 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1285 
1286 	/*
1287 	 * Do lockfs protocol
1288 	 */
1289 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1290 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1291 
1292 	/*
1293 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1294 	 * path and bail out in that case.
1295 	 */
1296 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1297 		*ulpp = NULL;
1298 		return (0);
1299 	} else {
1300 		if (ulockfs_info_free == NULL) {
1301 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1302 			    kmem_zalloc(sizeof (ulockfs_info_t),
1303 			    KM_NOSLEEP)) == NULL) {
1304 				*ulpp = NULL;
1305 				return (ENOMEM);
1306 			}
1307 		}
1308 	}
1309 
1310 	/*
1311 	 * First time VOP call
1312 	 */
1313 	mutex_enter(&ulp->ul_lock);
1314 	if (ULOCKFS_IS_JUSTULOCK(ulp)) {
1315 		if (mask & ULOCKFS_FWLOCK) {
1316 			atomic_add_long(&ulp->ul_falloc_cnt, 1);
1317 			ULOCKFS_SET_FALLOC(ulp);
1318 		} else {
1319 			atomic_add_long(&ulp->ul_vnops_cnt, 1);
1320 		}
1321 	} else {
1322 		if (error = ufs_check_lockfs(ufsvfsp, ulp, mask)) {
1323 			mutex_exit(&ulp->ul_lock);
1324 			if (ulockfs_info_free == NULL)
1325 				kmem_free(ulockfs_info_temp,
1326 				    sizeof (ulockfs_info_t));
1327 			return (error);
1328 		}
1329 	}
1330 	mutex_exit(&ulp->ul_lock);
1331 
1332 	if (ulockfs_info_free != NULL) {
1333 		ulockfs_info_free->ulp = ulp;
1334 		if (mask & ULOCKFS_FWLOCK)
1335 			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1336 	} else {
1337 		ulockfs_info_temp->ulp = ulp;
1338 		ulockfs_info_temp->next = ulockfs_info;
1339 		if (mask & ULOCKFS_FWLOCK)
1340 			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1341 		ASSERT(ufs_lockfs_key != 0);
1342 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1343 	}
1344 
1345 	curthread->t_flag |= T_DONTBLOCK;
1346 	return (0);
1347 }
1348 
1349 /*
1350  * Check whether we are returning from the top level VOP.
1351  */
1352 static int
1353 ufs_lockfs_top_vop_return(ulockfs_info_t *head)
1354 {
1355 	ulockfs_info_t *info;
1356 	int result = 1;
1357 
1358 	for (info = head; info != NULL; info = info->next) {
1359 		if (info->ulp != NULL) {
1360 			result = 0;
1361 			break;
1362 		}
1363 	}
1364 
1365 	return (result);
1366 }
1367 
1368 /*
1369  * ufs_lockfs_end - terminate the lockfs locking protocol
1370  */
1371 void
1372 ufs_lockfs_end(struct ulockfs *ulp)
1373 {
1374 	ulockfs_info_t *info;
1375 	ulockfs_info_t *head;
1376 
1377 	/*
1378 	 * end-of-VOP protocol
1379 	 */
1380 	if (ulp == NULL)
1381 		return;
1382 
1383 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1384 	SEARCH_ULOCKFSP(head, ulp, info);
1385 
1386 	/*
1387 	 * If we're called from a first level VOP, we have to have a
1388 	 * valid ulockfs record in the TSD.
1389 	 */
1390 	ASSERT(info != NULL);
1391 
1392 	/*
1393 	 * Invalidate the ulockfs record.
1394 	 */
1395 	info->ulp = NULL;
1396 
1397 	if (ufs_lockfs_top_vop_return(head))
1398 		curthread->t_flag &= ~T_DONTBLOCK;
1399 
1400 	mutex_enter(&ulp->ul_lock);
1401 
1402 	/* fallocate thread */
1403 	if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) {
1404 		if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1))
1405 			ULOCKFS_CLR_FALLOC(ulp);
1406 	} else  { /* normal thread */
1407 		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
1408 			cv_broadcast(&ulp->ul_cv);
1409 	}
1410 
1411 	/* Clear the thread's fallocate state */
1412 	if (info->flags & ULOCK_INFO_FALLOCATE)
1413 		info->flags &= ~ULOCK_INFO_FALLOCATE;
1414 
1415 	if (ulp->ul_vnops_cnt == 0 && ulp->ul_falloc_cnt)
1416 		cv_broadcast(&ulp->ul_cv);
1417 
1418 	mutex_exit(&ulp->ul_lock);
1419 }
1420 
1421 /*
1422  * specialized version of ufs_lockfs_begin() called by ufs_getpage().
1423  */
1424 int
1425 ufs_lockfs_begin_getpage(
1426 	struct ufsvfs	*ufsvfsp,
1427 	struct ulockfs	**ulpp,
1428 	struct seg	*seg,
1429 	int		read_access,
1430 	uint_t		*protp)
1431 {
1432 	ulong_t			mask;
1433 	int 			error;
1434 	int			rec_vop;
1435 	struct ulockfs		*ulp;
1436 	ulockfs_info_t		*ulockfs_info;
1437 	ulockfs_info_t		*ulockfs_info_free;
1438 	ulockfs_info_t		*ulockfs_info_temp;
1439 
1440 	/*
1441 	 * file system has been forcibly unmounted
1442 	 */
1443 	if (ufsvfsp == NULL)
1444 		return (EIO);
1445 
1446 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1447 
1448 	/*
1449 	 * Do lockfs protocol
1450 	 */
1451 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1452 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1453 
1454 	/*
1455 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1456 	 * path and bail out in that case.
1457 	 */
1458 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1459 		*ulpp = NULL;
1460 		return (0);
1461 	} else {
1462 		if (ulockfs_info_free == NULL) {
1463 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1464 			    kmem_zalloc(sizeof (ulockfs_info_t),
1465 			    KM_NOSLEEP)) == NULL) {
1466 				*ulpp = NULL;
1467 				return (ENOMEM);
1468 			}
1469 		}
1470 	}
1471 
1472 	/*
1473 	 * First time VOP call
1474 	 */
1475 	mutex_enter(&ulp->ul_lock);
1476 	if (ULOCKFS_IS_JUSTULOCK(ulp))
1477 		/*
1478 		 * fs is not locked, simply inc the active-ops counter
1479 		 */
1480 		atomic_add_long(&ulp->ul_vnops_cnt, 1);
1481 	else {
1482 		if (seg->s_ops == &segvn_ops &&
1483 		    ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
1484 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1485 		} else if (protp && read_access) {
1486 			/*
1487 			 * Restrict the mapping to readonly.
1488 			 * Writes to this mapping will cause
1489 			 * another fault which will then
1490 			 * be suspended if fs is write locked
1491 			 */
1492 			*protp &= ~PROT_WRITE;
1493 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1494 		} else
1495 			mask = (ulong_t)ULOCKFS_GETWRITE_MASK;
1496 
1497 		/*
1498 		 * will sleep if this fs is locked against this VOP
1499 		 */
1500 		if (error = ufs_check_lockfs(ufsvfsp, ulp, mask)) {
1501 			mutex_exit(&ulp->ul_lock);
1502 			if (ulockfs_info_free == NULL)
1503 				kmem_free(ulockfs_info_temp,
1504 				    sizeof (ulockfs_info_t));
1505 			return (error);
1506 		}
1507 	}
1508 	mutex_exit(&ulp->ul_lock);
1509 
1510 	if (ulockfs_info_free != NULL) {
1511 		ulockfs_info_free->ulp = ulp;
1512 	} else {
1513 		ulockfs_info_temp->ulp = ulp;
1514 		ulockfs_info_temp->next = ulockfs_info;
1515 		ASSERT(ufs_lockfs_key != 0);
1516 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1517 	}
1518 
1519 	curthread->t_flag |= T_DONTBLOCK;
1520 	return (0);
1521 }
1522 
1523 void
1524 ufs_lockfs_tsd_destructor(void *head)
1525 {
1526 	ulockfs_info_t *curr = (ulockfs_info_t *)head;
1527 	ulockfs_info_t *temp;
1528 
1529 	for (; curr != NULL; ) {
1530 		/*
1531 		 * The TSD destructor is being called when the thread exits
1532 		 * (via thread_exit()). At that time it must have cleaned up
1533 		 * all VOPs via ufs_lockfs_end() and there must not be a
1534 		 * valid ulockfs record exist while a thread is exiting.
1535 		 */
1536 		temp = curr;
1537 		curr = curr->next;
1538 		ASSERT(temp->ulp == NULL);
1539 		kmem_free(temp, sizeof (ulockfs_info_t));
1540 	}
1541 }
1542