xref: /illumos-gate/usr/src/uts/common/fs/ufs/ufs_lockfs.c (revision 4e93fb0f6383eaac21897dcdae56b87118131e4d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/time.h>
32 #include <sys/systm.h>
33 #include <sys/sysmacros.h>
34 #include <sys/resource.h>
35 #include <sys/signal.h>
36 #include <sys/cred.h>
37 #include <sys/user.h>
38 #include <sys/buf.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/proc.h>
42 #include <sys/disp.h>
43 #include <sys/file.h>
44 #include <sys/fcntl.h>
45 #include <sys/flock.h>
46 #include <sys/atomic.h>
47 #include <sys/kmem.h>
48 #include <sys/uio.h>
49 #include <sys/conf.h>
50 #include <sys/mman.h>
51 #include <sys/pathname.h>
52 #include <sys/debug.h>
53 #include <sys/vmmeter.h>
54 #include <sys/vmsystm.h>
55 #include <sys/cmn_err.h>
56 #include <sys/vtrace.h>
57 #include <sys/acct.h>
58 #include <sys/dnlc.h>
59 #include <sys/swap.h>
60 
61 #include <sys/fs/ufs_fs.h>
62 #include <sys/fs/ufs_inode.h>
63 #include <sys/fs/ufs_fsdir.h>
64 #include <sys/fs/ufs_trans.h>
65 #include <sys/fs/ufs_panic.h>
66 #include <sys/fs/ufs_mount.h>
67 #include <sys/fs/ufs_bio.h>
68 #include <sys/fs/ufs_log.h>
69 #include <sys/fs/ufs_quota.h>
70 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
71 #include <sys/errno.h>
72 #include <sys/sysinfo.h>
73 
74 #include <vm/hat.h>
75 #include <vm/pvn.h>
76 #include <vm/as.h>
77 #include <vm/seg.h>
78 #include <vm/seg_map.h>
79 #include <vm/seg_vn.h>
80 #include <vm/rm.h>
81 #include <vm/anon.h>
82 #include <sys/swap.h>
83 #include <sys/dnlc.h>
84 
85 extern struct vnode *common_specvp(struct vnode *vp);
86 
87 /*
 * Error lock status.  ufs__fiolfs() derives one of these values from the
 * previous lock and the lock being applied and passes it down to the
 * reconcile routines:
 *	NO_ERRLCK	initial value; no error lock transition was detected
 *	UN_ERRLCK	the previous lock was an error lock and the new
 *			state is unlocked
 *	SET_ERRLCK	the new lock is a (read-only) error lock and the
 *			previous lock was not
 *	RE_ERRLCK	the new lock is a (read-only) error lock and so was
 *			the previous one
 */
88 #define	UN_ERRLCK	(-1)
89 #define	SET_ERRLCK	1
90 #define	RE_ERRLCK	2
91 #define	NO_ERRLCK	0
92 
93 /*
94  * Index to be used in TSD for storing lockfs data
 * (tsd_get(ufs_lockfs_key) yields the head of a ulockfs_info_t list)
95  */
96 uint_t ufs_lockfs_key;
97 
/*
 * One entry of the per-thread list kept in TSD under ufs_lockfs_key.
 */
98 typedef struct _ulockfs_info {
99 	struct _ulockfs_info *next;	/* next entry, NULL at end of list */
100 	struct ulockfs *ulp;	/* fs this entry is for; NULL is treated */
				/* as a free entry by IS_REC_VOP() */
101 	uint_t flags;		/* ULOCK_INFO_* bits */
102 } ulockfs_info_t;
103 
104 #define	ULOCK_INFO_FALLOCATE	0x00000001	/* fallocate thread */
105 
/*
 * IS_REC_VOP(is_found, list_head, want_ulp, free_slot)
 *
 * Walk the per-thread (TSD) list of ulockfs_info_t entries to find out
 * whether this thread is already doing a VOP on the given file system.
 *
 *	is_found	lvalue; set to 1 if an entry whose ulp equals
 *			want_ulp exists, otherwise set to 0
 *	list_head	first entry of the list (may be NULL)
 *	want_ulp	the struct ulockfs pointer to look for
 *	free_slot	lvalue; set to the first entry with a NULL ulp that
 *			was seen before the scan stopped, or to NULL if no
 *			such entry was seen
 *
 * None of the parameter names may match a structure member used in the
 * body.  The preprocessor substitutes parameters everywhere, including
 * after "->", so a parameter called "ulp" would rewrite "_curr->ulp" and
 * the macro would only compile when the caller's argument happened to be
 * spelled "ulp" as well.
 */
#define	IS_REC_VOP(is_found, list_head, want_ulp, free_slot)	\
{								\
	ulockfs_info_t *_curr;					\
								\
	for ((is_found) = 0, (free_slot) = NULL,		\
	    _curr = (list_head);				\
	    _curr != NULL; _curr = _curr->next) {		\
		if (((free_slot) == NULL) &&			\
		    (_curr->ulp == NULL))			\
			(free_slot) = _curr;			\
		if (_curr->ulp == (want_ulp)) {			\
			(is_found) = 1;				\
			break;					\
		}						\
	}							\
}
124 
/*
 * SEARCH_ULOCKFSP(list_head, want_ulp, result)
 *
 * Get the lockfs data from TSD so that lockfs handles the recursive VOP
 * properly.
 *
 *	list_head	first entry of the per-thread ulockfs_info_t list
 *			(may be NULL)
 *	want_ulp	the struct ulockfs pointer to look for
 *	result		lvalue; set to the first entry whose ulp equals
 *			want_ulp, or to NULL if there is none
 *
 * The parameter names deliberately differ from the "ulp" and "next"
 * structure members: parameters are substituted even after "->", so a
 * parameter called "ulp" would make the macro compile only for callers
 * whose argument is itself spelled "ulp".
 */
#define	SEARCH_ULOCKFSP(list_head, want_ulp, result)		\
{								\
	ulockfs_info_t *_curr;					\
								\
	for (_curr = (list_head); _curr != NULL;		\
	    _curr = _curr->next) {				\
		if (_curr->ulp == (want_ulp)) {			\
			break;					\
		}						\
	}							\
								\
	(result) = _curr;					\
}
142 
143 /*
144  * Validate lockfs request
145  */
146 static int
147 ufs_getlfd(
148 	struct lockfs *lockfsp,		/* new lock request */
149 	struct lockfs *ul_lockfsp)	/* old lock state */
150 {
151 	int	error = 0;
152 
153 	/*
154 	 * no input flags defined
155 	 */
156 	if (lockfsp->lf_flags != 0) {
157 		error = EINVAL;
158 		goto errout;
159 	}
160 
161 	/*
162 	 * check key
163 	 */
164 	if (!LOCKFS_IS_ULOCK(ul_lockfsp))
165 		if (lockfsp->lf_key != ul_lockfsp->lf_key) {
166 			error = EINVAL;
167 			goto errout;
168 	}
169 
170 	lockfsp->lf_key = ul_lockfsp->lf_key + 1;
171 
172 errout:
173 	return (error);
174 }
175 
/*
 * ufs_checkaccton
 *	Check whether accounting is turned on on this fs: returns EDEADLK
 *	when acct_fs_in_use(vp) is nonzero and 0 otherwise.
 */

int
ufs_checkaccton(struct vnode *vp)
{
	return (acct_fs_in_use(vp) ? EDEADLK : 0);
}
188 
189 /*
190  * ufs_checkswapon
191  *	check if local swapping is to file on this fs
192  */
193 int
194 ufs_checkswapon(struct vnode *vp)
195 {
196 	struct swapinfo	*sip;
197 
198 	mutex_enter(&swapinfo_lock);
199 	for (sip = swapinfo; sip; sip = sip->si_next)
200 		if (sip->si_vp->v_vfsp == vp->v_vfsp) {
201 			mutex_exit(&swapinfo_lock);
202 			return (EDEADLK);
203 		}
204 	mutex_exit(&swapinfo_lock);
205 	return (0);
206 }
207 
208 /*
209  * ufs_freeze
210  *	pend future accesses for current lock and desired lock
211  */
212 void
213 ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
214 {
215 	/*
216 	 * set to new lock type
217 	 */
218 	ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
219 	ulp->ul_lockfs.lf_key = lockfsp->lf_key;
220 	ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
221 	ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;
222 
223 	ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
224 }
225 
226 /*
227  * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
228  * starting ufs_quiesce() protocol and decrement it only when a file system no
229  * longer has to be in quiescent state. This allows ufs_pageio() to detect
230  * that another thread wants to quiesce a file system. See more comments in
231  * ufs_pageio().
 *
 * ufs__fiolfs() does both updates with atomic_add_long(), and ufs_quiesce()
 * itself asserts that the counter is nonzero.
232  */
233 ulong_t ufs_quiesce_pend = 0;
234 
235 /*
236  * ufs_quiesce
237  *	wait for outstanding accesses to finish
238  */
239 int
240 ufs_quiesce(struct ulockfs *ulp)
241 {
242 	int error = 0;
243 	ulockfs_info_t *head;
244 	ulockfs_info_t *info;
245 
246 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
247 	SEARCH_ULOCKFSP(head, ulp, info);
248 
249 	/*
250 	 * Set a softlock to suspend future ufs_vnops so that
251 	 * this lockfs request will not be starved
252 	 */
253 	ULOCKFS_SET_SLOCK(ulp);
254 	ASSERT(ufs_quiesce_pend);
255 
256 	/* check if there is any outstanding ufs vnodeops calls */
257 	while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
258 		/*
259 		 * use timed version of cv_wait_sig() to make sure we don't
260 		 * miss a wake up call from ufs_pageio() when it doesn't use
261 		 * ul_lock.
262 		 *
263 		 * when a fallocate thread comes in, the only way it returns
264 		 * from this function is if there are no other vnode operations
265 		 * going on (remember fallocate threads are tracked using
266 		 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
267 		 * hasn't already grabbed the fs write lock.
268 		 */
269 		if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
270 			if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
271 				goto out;
272 		}
273 		if (!cv_timedwait_sig(&ulp->ul_cv, &ulp->ul_lock, lbolt + hz)) {
274 			error = EINTR;
275 			goto out;
276 		}
277 	}
278 
279 out:
280 	/*
281 	 * unlock the soft lock
282 	 */
283 	ULOCKFS_CLR_SLOCK(ulp);
284 
285 	return (error);
286 }
287 
288 /*
289  * ufs_flush_inode
290  */
291 int
292 ufs_flush_inode(struct inode *ip, void *arg)
293 {
294 	int	error;
295 	int	saverror	= 0;
296 
297 	/*
298 	 * wrong file system; keep looking
299 	 */
300 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
301 		return (0);
302 
303 	/*
304 	 * asynchronously push all the dirty pages
305 	 */
306 	if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
307 	    (error != EAGAIN))
308 		saverror = error;
309 	/*
310 	 * wait for io and discard all mappings
311 	 */
312 	if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
313 		saverror = error;
314 
315 	if (ITOV(ip)->v_type == VDIR) {
316 		dnlc_dir_purge(&ip->i_danchor);
317 	}
318 
319 	return (saverror);
320 }
321 
322 /*
323  * ufs_flush
324  *	Flush everything that is currently dirty; this includes invalidating
325  *	any mappings.
 *
 *	vfsp	file system to flush; the ASSERT below expects the caller
 *		to hold the vfs lock (vfs_lock_held())
 *
 *	Returns 0, or the most recent error recorded in saverror.  Only the
 *	inode scan, the VOP_PUTPAGE() on the block device and
 *	TRANS_END_SYNC() store into saverror; the results of the other steps
 *	are not reported to the caller.
326  */
327 int
328 ufs_flush(struct vfs *vfsp)
329 {
330 	int		error;
331 	int		saverror = 0;
332 	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
333 	struct fs	*fs		= ufsvfsp->vfs_fs;
334 	int		tdontblock = 0;	/* T_DONTBLOCK was already set */
335 
336 	ASSERT(vfs_lock_held(vfsp));
337 
338 	/*
339 	 * purge dnlc
340 	 */
341 	(void) dnlc_purge_vfsp(vfsp, 0);
342 
343 	/*
344 	 * drain the delete and idle threads
345 	 */
346 	ufs_delete_drain(vfsp, 0, 0);
347 	ufs_idle_drain(vfsp);
348 
349 	/*
350 	 * flush and invalidate quota records
351 	 */
352 	(void) qsync(ufsvfsp);
353 
354 	/*
355 	 * flush w/invalidate the inodes for vfsp
356 	 */
357 	if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
358 		saverror = error;
359 
360 	/*
361 	 * synchronously flush superblock and summary info
362 	 */
363 	if (fs->fs_ronly == 0 && fs->fs_fmod) {
364 		fs->fs_fmod = 0;
365 		TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
366 	}
367 	/*
368 	 * flush w/invalidate block device pages and buf cache
	 * (only a positive VOP_PUTPAGE() result is recorded as an error)
369 	 */
370 	if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
371 	    (offset_t)0, 0, B_INVAL, CRED())) > 0)
372 		saverror = error;
373 
374 	(void) bflush((dev_t)vfsp->vfs_dev);
375 	(void) bfinval((dev_t)vfsp->vfs_dev, 0);
376 
377 	/*
378 	 * drain the delete and idle threads again
379 	 */
380 	ufs_delete_drain(vfsp, 0, 0);
381 	ufs_idle_drain(vfsp);
382 
383 	/*
384 	 * play with the clean flag
	 * (skipped when any of the steps above recorded an error)
385 	 */
386 	if (saverror == 0)
387 		ufs_checkclean(vfsp);
388 
389 	/*
390 	 * Flush any outstanding transactions and roll the log
391 	 * only if we are supposed to do, i.e. LDL_NOROLL not set.
392 	 * We can not simply check for fs_ronly here since fsck also may
393 	 * use this code to roll the log on a read-only filesystem, e.g.
394 	 * root during early stages of boot, if other than a sanity check is
395 	 * done, it will clear LDL_NOROLL before.
396 	 * In addition we assert that the deltamap does not contain any deltas
397 	 * in case LDL_NOROLL is set since this is not supposed to happen.
398 	 */
399 	if (TRANS_ISTRANS(ufsvfsp)) {
400 		ml_unit_t	*ul	= ufsvfsp->vfs_log;
401 		mt_map_t	*mtm	= ul->un_deltamap;
402 
403 		if (ul->un_flags & LDL_NOROLL) {
404 			ASSERT(mtm->mtm_nme == 0);
405 		} else {
406 			/*
407 			 * Do not set T_DONTBLOCK if there is a
408 			 * transaction opened by caller.
409 			 */
410 			if (curthread->t_flag & T_DONTBLOCK)
411 				tdontblock = 1;
412 			else
413 				curthread->t_flag |= T_DONTBLOCK;
414 
			/*
			 * The error from TRANS_BEGIN_SYNC only decides
			 * whether TRANS_END_SYNC runs; it is not copied
			 * into saverror.
			 */
415 			TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
416 			    TOP_COMMIT_SIZE, error);
417 
418 			if (!error) {
419 				TRANS_END_SYNC(ufsvfsp, saverror,
420 				    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
421 			}
422 
			/* clear T_DONTBLOCK only if it was set here */
423 			if (tdontblock == 0)
424 				curthread->t_flag &= ~T_DONTBLOCK;
425 
426 			logmap_roll_dev(ufsvfsp->vfs_log);
427 		}
428 	}
429 
430 	return (saverror);
431 }
432 
433 /*
434  * ufs_thaw_wlock
435  *	special processing when thawing down to wlock
436  */
437 static int
438 ufs_thaw_wlock(struct inode *ip, void *arg)
439 {
440 	/*
441 	 * wrong file system; keep looking
442 	 */
443 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
444 		return (0);
445 
446 	/*
447 	 * iupdat refuses to clear flags if the fs is read only.  The fs
448 	 * may become read/write during the lock and we wouldn't want
449 	 * these inodes being written to disk.  So clear the flags.
450 	 */
451 	rw_enter(&ip->i_contents, RW_WRITER);
452 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
453 	rw_exit(&ip->i_contents);
454 
455 	/*
456 	 * pages are mlocked -- fail wlock
457 	 */
458 	if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
459 		return (EBUSY);
460 
461 	return (0);
462 }
463 
464 /*
465  * ufs_thaw_hlock
466  *	special processing when thawing down to hlock or elock
467  */
468 static int
469 ufs_thaw_hlock(struct inode *ip, void *arg)
470 {
471 	struct vnode	*vp	= ITOV(ip);
472 
473 	/*
474 	 * wrong file system; keep looking
475 	 */
476 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
477 		return (0);
478 
479 	/*
480 	 * blow away all pages - even if they are mlocked
481 	 */
482 	do {
483 		(void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
484 	} while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
485 	rw_enter(&ip->i_contents, RW_WRITER);
486 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
487 	rw_exit(&ip->i_contents);
488 
489 	return (0);
490 }
491 
492 /*
493  * ufs_thaw
494  *	thaw file system lock down to current value
495  */
496 int
497 ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
498 {
499 	int		error	= 0;
500 	int		noidel	= (int)(ulp->ul_flag & ULOCKFS_NOIDEL);
501 
502 	/*
503 	 * if wlock or hlock or elock
504 	 */
505 	if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
506 	    ULOCKFS_IS_ELOCK(ulp)) {
507 
508 		/*
509 		 * don't keep access times
510 		 * don't free deleted files
511 		 * if superblock writes are allowed, limit them to me for now
512 		 */
513 		ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
514 		if (ulp->ul_sbowner != (kthread_id_t)-1)
515 			ulp->ul_sbowner = curthread;
516 
517 		/*
518 		 * wait for writes for deleted files and superblock updates
519 		 */
520 		(void) ufs_flush(vfsp);
521 
522 		/*
523 		 * now make sure the quota file is up-to-date
524 		 *	expensive; but effective
525 		 */
526 		error = ufs_flush(vfsp);
527 		/*
528 		 * no one can write the superblock
529 		 */
530 		ulp->ul_sbowner = (kthread_id_t)-1;
531 
532 		/*
533 		 * special processing for wlock/hlock/elock
534 		 */
535 		if (ULOCKFS_IS_WLOCK(ulp)) {
536 			if (error)
537 				goto errout;
538 			error = bfinval(ufsvfsp->vfs_dev, 0);
539 			if (error)
540 				goto errout;
541 			error = ufs_scan_inodes(0, ufs_thaw_wlock,
542 					(void *)ufsvfsp, ufsvfsp);
543 			if (error)
544 				goto errout;
545 		}
546 		if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
547 			error = 0;
548 			(void) ufs_scan_inodes(0, ufs_thaw_hlock,
549 					(void *)ufsvfsp, ufsvfsp);
550 			(void) bfinval(ufsvfsp->vfs_dev, 1);
551 		}
552 	} else {
553 
554 		/*
555 		 * okay to keep access times
556 		 * okay to free deleted files
557 		 * okay to write the superblock
558 		 */
559 		ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
560 		ulp->ul_sbowner = NULL;
561 
562 		/*
563 		 * flush in case deleted files are in memory
564 		 */
565 		if (noidel) {
566 			if (error = ufs_flush(vfsp))
567 				goto errout;
568 		}
569 	}
570 
571 errout:
572 	cv_broadcast(&ulp->ul_cv);
573 	return (error);
574 }
575 
576 /*
577  * ufs_reconcile_fs
578  *	reconcile incore superblock with ondisk superblock
579  */
580 int
581 ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
582 {
583 	struct fs	*mfs; 	/* in-memory superblock */
584 	struct fs	*dfs;	/* on-disk   superblock */
585 	struct buf	*bp;	/* on-disk   superblock buf */
586 	int		 needs_unlock;
587 	char		 finished_fsclean;
588 
589 	mfs = ufsvfsp->vfs_fs;
590 
591 	/*
592 	 * get the on-disk copy of the superblock
593 	 */
594 	bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
595 	bp->b_flags |= (B_STALE|B_AGE);
596 	if (bp->b_flags & B_ERROR) {
597 		brelse(bp);
598 		return (EIO);
599 	}
600 	dfs = bp->b_un.b_fs;
601 
602 	/* error locks may only unlock after the fs has been made consistent */
603 	if (errlck == UN_ERRLCK) {
604 		if (dfs->fs_clean == FSFIX) {	/* being repaired */
605 			brelse(bp);
606 			return (EAGAIN);
607 		}
608 		/* repair not yet started? */
609 		finished_fsclean = TRANS_ISTRANS(ufsvfsp)? FSLOG: FSCLEAN;
610 		if (dfs->fs_clean != finished_fsclean) {
611 			brelse(bp);
612 			return (EBUSY);
613 		}
614 	}
615 
616 	/*
617 	 * if superblock has changed too much, abort
618 	 */
619 	if ((mfs->fs_sblkno		!= dfs->fs_sblkno) ||
620 	    (mfs->fs_cblkno		!= dfs->fs_cblkno) ||
621 	    (mfs->fs_iblkno		!= dfs->fs_iblkno) ||
622 	    (mfs->fs_dblkno		!= dfs->fs_dblkno) ||
623 	    (mfs->fs_cgoffset		!= dfs->fs_cgoffset) ||
624 	    (mfs->fs_cgmask		!= dfs->fs_cgmask) ||
625 	    (mfs->fs_bsize		!= dfs->fs_bsize) ||
626 	    (mfs->fs_fsize		!= dfs->fs_fsize) ||
627 	    (mfs->fs_frag		!= dfs->fs_frag) ||
628 	    (mfs->fs_bmask		!= dfs->fs_bmask) ||
629 	    (mfs->fs_fmask		!= dfs->fs_fmask) ||
630 	    (mfs->fs_bshift		!= dfs->fs_bshift) ||
631 	    (mfs->fs_fshift		!= dfs->fs_fshift) ||
632 	    (mfs->fs_fragshift		!= dfs->fs_fragshift) ||
633 	    (mfs->fs_fsbtodb		!= dfs->fs_fsbtodb) ||
634 	    (mfs->fs_sbsize		!= dfs->fs_sbsize) ||
635 	    (mfs->fs_nindir		!= dfs->fs_nindir) ||
636 	    (mfs->fs_nspf		!= dfs->fs_nspf) ||
637 	    (mfs->fs_trackskew		!= dfs->fs_trackskew) ||
638 	    (mfs->fs_cgsize		!= dfs->fs_cgsize) ||
639 	    (mfs->fs_ntrak		!= dfs->fs_ntrak) ||
640 	    (mfs->fs_nsect		!= dfs->fs_nsect) ||
641 	    (mfs->fs_spc		!= dfs->fs_spc) ||
642 	    (mfs->fs_cpg		!= dfs->fs_cpg) ||
643 	    (mfs->fs_ipg		!= dfs->fs_ipg) ||
644 	    (mfs->fs_fpg		!= dfs->fs_fpg) ||
645 	    (mfs->fs_postblformat	!= dfs->fs_postblformat) ||
646 	    (mfs->fs_magic		!= dfs->fs_magic)) {
647 		brelse(bp);
648 		return (EACCES);
649 	}
650 	if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
651 		if (mfs->fs_clean == FSLOG) {
652 			brelse(bp);
653 			return (EACCES);
654 		}
655 
656 	/*
657 	 * get new summary info
658 	 */
659 	if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
660 		brelse(bp);
661 		return (EIO);
662 	}
663 
664 	/*
665 	 * release old summary info and update in-memory superblock
666 	 */
667 	kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
668 	mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;	/* Only entry 0 used */
669 
670 	/*
671 	 * update fields allowed to change
672 	 */
673 	mfs->fs_size		= dfs->fs_size;
674 	mfs->fs_dsize		= dfs->fs_dsize;
675 	mfs->fs_ncg		= dfs->fs_ncg;
676 	mfs->fs_minfree		= dfs->fs_minfree;
677 	mfs->fs_rotdelay	= dfs->fs_rotdelay;
678 	mfs->fs_rps		= dfs->fs_rps;
679 	mfs->fs_maxcontig	= dfs->fs_maxcontig;
680 	mfs->fs_maxbpg		= dfs->fs_maxbpg;
681 	mfs->fs_csmask		= dfs->fs_csmask;
682 	mfs->fs_csshift		= dfs->fs_csshift;
683 	mfs->fs_optim		= dfs->fs_optim;
684 	mfs->fs_csaddr		= dfs->fs_csaddr;
685 	mfs->fs_cssize		= dfs->fs_cssize;
686 	mfs->fs_ncyl		= dfs->fs_ncyl;
687 	mfs->fs_cstotal		= dfs->fs_cstotal;
688 	mfs->fs_reclaim		= dfs->fs_reclaim;
689 
690 	if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
691 		mfs->fs_reclaim &= ~FS_RECLAIM;
692 		mfs->fs_reclaim |=  FS_RECLAIMING;
693 		ufs_thread_start(&ufsvfsp->vfs_reclaim,
694 			ufs_thread_reclaim, vfsp);
695 	}
696 
697 	/* XXX What to do about sparecon? */
698 
699 	/* XXX need to copy volume label */
700 
701 	/*
702 	 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
703 	 * or if error-locked and ondisk is now clean
704 	 */
705 	needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
706 	if (needs_unlock)
707 		mutex_enter(&ufsvfsp->vfs_lock);
708 
709 	if (errlck == UN_ERRLCK) {
710 		if (finished_fsclean == dfs->fs_clean)
711 			mfs->fs_clean = finished_fsclean;
712 		else
713 			mfs->fs_clean = FSBAD;
714 		mfs->fs_state = FSOKAY - dfs->fs_time;
715 	}
716 
717 	if (FSOKAY != dfs->fs_state + dfs->fs_time ||
718 	    (dfs->fs_clean == FSBAD))
719 		mfs->fs_clean = FSBAD;
720 
721 	if (needs_unlock)
722 		mutex_exit(&ufsvfsp->vfs_lock);
723 
724 	brelse(bp);
725 
726 	return (0);
727 }
728 
729 /*
730  * ufs_reconcile_inode
731  *	reconcile ondisk inode with incore inode
732  */
733 static int
734 ufs_reconcile_inode(struct inode *ip, void *arg)
735 {
736 	int		i;
737 	int		ndaddr;
738 	int		niaddr;
739 	struct dinode	*dp;		/* ondisk inode */
740 	struct buf	*bp	= NULL;
741 	uid_t		d_uid;
742 	gid_t		d_gid;
743 	int		error = 0;
744 	struct fs	*fs;
745 
746 	/*
747 	 * not an inode we care about
748 	 */
749 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
750 		return (0);
751 
752 	fs = ip->i_fs;
753 
754 	/*
755 	 * Inode reconciliation fails: we made the filesystem quiescent
756 	 * and we did a ufs_flush() before calling ufs_reconcile_inode()
757 	 * and thus the inode should not have been changed inbetween.
758 	 * Any discrepancies indicate a logic error and a pretty
759 	 * significant run-state inconsistency we should complain about.
760 	 */
761 	if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
762 		cmn_err(CE_WARN, "%s: Inode reconciliation failed for"
763 		    "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
764 		return (EINVAL);
765 	}
766 
767 	/*
768 	 * get the dinode
769 	 */
770 	bp = UFS_BREAD(ip->i_ufsvfs,
771 			ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
772 	    (int)fs->fs_bsize);
773 	if (bp->b_flags & B_ERROR) {
774 		brelse(bp);
775 		return (EIO);
776 	}
777 	dp  = bp->b_un.b_dino;
778 	dp += itoo(fs, ip->i_number);
779 
780 	/*
781 	 * handle Sun's implementation of EFT
782 	 */
783 	d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
784 	d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (uid_t)dp->di_sgid;
785 
786 	rw_enter(&ip->i_contents, RW_WRITER);
787 
788 	/*
789 	 * some fields are not allowed to change
790 	 */
791 	if ((ip->i_mode  != dp->di_mode) ||
792 	    (ip->i_gen   != dp->di_gen) ||
793 	    (ip->i_uid   != d_uid) ||
794 	    (ip->i_gid   != d_gid)) {
795 		error = EACCES;
796 		goto out;
797 	}
798 
799 	/*
800 	 * and some are allowed to change
801 	 */
802 	ip->i_size		= dp->di_size;
803 	ip->i_ic.ic_flags	= dp->di_ic.ic_flags;
804 	ip->i_blocks		= dp->di_blocks;
805 	ip->i_nlink		= dp->di_nlink;
806 	if (ip->i_flag & IFASTSYMLNK) {
807 		ndaddr = 1;
808 		niaddr = 0;
809 	} else {
810 		ndaddr = NDADDR;
811 		niaddr = NIADDR;
812 	}
813 	for (i = 0; i < ndaddr; ++i)
814 		ip->i_db[i] = dp->di_db[i];
815 	for (i = 0; i < niaddr; ++i)
816 		ip->i_ib[i] = dp->di_ib[i];
817 
818 out:
819 	rw_exit(&ip->i_contents);
820 	brelse(bp);
821 	return (error);
822 }
823 
824 /*
825  * ufs_reconcile
826  *	reconcile ondisk superblock/inodes with any incore
827  */
828 static int
829 ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
830 {
831 	int	error = 0;
832 
833 	/*
834 	 * get rid of as much inmemory data as possible
835 	 */
836 	(void) ufs_flush(vfsp);
837 
838 	/*
839 	 * reconcile the superblock and inodes
840 	 */
841 	if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
842 		return (error);
843 	if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
844 		return (error);
845 	/*
846 	 * allocation blocks may be incorrect; get rid of them
847 	 */
848 	(void) ufs_flush(vfsp);
849 
850 	return (error);
851 }
852 
/*
 * File system locking
 *
 * Entry point that forwards to ufs__fiolfs() with from_user set to 1.
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
	const int	from_user = 1;

	return (ufs__fiolfs(vp, lockfsp, from_user, from_log));
}
861 
862 /*
 * ufs__fiolfs
 *	kernel-internal interface, also used by fix-on-panic
 *
 *	Change the lock state of the file system that vp belongs to into
 *	the state described by lockfsp.
 *
 *	vp		any vnode of the file system
 *	lockfsp		the lock request; its lf_key is advanced by
 *			ufs_getlfd()
 *	from_user	nonzero for requests coming through ufs_fiolfs();
 *			only consulted when a new error lock has been set,
 *			to decide whether ufs_fault() is called
 *	from_log	nonzero skips the accounting and swap file checks
 *
 *	The reclaim and delete threads are suspended for the duration of
 *	the call.  vfs_lock_wait(), ul_lock and ufs_quiesce_pend are taken
 *	in that order and released again on every exit path.
 *
 *	Returns 0 on success.  EINVAL and EIO are returned for a bad
 *	request or vnode before anything is changed; EBUSY, EIO or EINVAL
 *	when the current lock state does not permit the transition; any
 *	other value is the error of the step that failed.  When a step
 *	fails after the file system was frozen, the previous lock is put
 *	back unless the new lock is a hard lock.
 */
863 int
864 ufs__fiolfs(
865 	struct vnode *vp,
866 	struct lockfs *lockfsp,
867 	int from_user,
868 	int from_log)
869 {
870 	struct ulockfs	*ulp;
871 	struct lockfs	lfs;		/* copy of the previous lock */
872 	int		error;
873 	struct vfs	*vfsp;
874 	struct ufsvfs	*ufsvfsp;
875 	int		 errlck		= NO_ERRLCK;
876 	int		 poll_events	= POLLPRI;
877 	extern struct pollhead ufs_pollhd;
878 	ulockfs_info_t *head;
879 	ulockfs_info_t *info;	/* this thread's TSD entry for ulp, or NULL */
880 	int signal = 0;		/* set when ufs_quiesce() was interrupted */
881 
882 	/* check valid lock type */
883 	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
884 		return (EINVAL);
885 
886 	if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
887 		return (EIO);
888 
889 	vfsp = vp->v_vfsp;
890 	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
891 	ulp = &ufsvfsp->vfs_ulockfs;
892 
893 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
894 	SEARCH_ULOCKFSP(head, ulp, info);
895 
896 	/*
897 	 * Suspend both the reclaim thread and the delete thread.
898 	 * This must be done outside the lockfs locking protocol.
899 	 */
900 	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
901 	ufs_thread_suspend(&ufsvfsp->vfs_delete);
902 
903 	/*
904 	 * Acquire vfs_reflock around ul_lock to avoid deadlock with
905 	 * umount/remount/sync.
906 	 */
907 	vfs_lock_wait(vfsp);
908 	mutex_enter(&ulp->ul_lock);
909 	atomic_add_long(&ufs_quiesce_pend, 1);
910 
911 	/*
912 	 * Quit if there is another lockfs request in progress
913 	 * that is waiting for existing ufs_vnops to complete.
914 	 */
915 	if (ULOCKFS_IS_BUSY(ulp)) {
916 		error = EBUSY;
917 		goto errexit;
918 	}
919 
920 	/* cannot unlock or downgrade a hard-lock */
921 	if (ULOCKFS_IS_HLOCK(ulp)) {
922 		error = EIO;
923 		goto errexit;
924 	}
925 
926 	/* an error lock may be unlocked or relocked, only */
927 	if (ULOCKFS_IS_ELOCK(ulp)) {
928 		if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
929 			error = EBUSY;
930 			goto errexit;
931 		}
932 	}
933 
934 	/*
935 	 * a read-only error lock may only be upgraded to an
936 	 * error lock or hard lock
937 	 */
938 	if (ULOCKFS_IS_ROELOCK(ulp)) {
939 		if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
940 			error = EBUSY;
941 			goto errexit;
942 		}
943 	}
944 
945 	/*
946 	 * until read-only error locks are fully implemented
947 	 * just return EINVAL
948 	 */
949 	if (LOCKFS_IS_ROELOCK(lockfsp)) {
950 		error = EINVAL;
951 		goto errexit;
952 	}
953 
954 	/*
955 	 * an error lock may only be applied if the file system is
956 	 * unlocked or already error locked.
957 	 * (this is to prevent the case where a fs gets changed out from
958 	 * underneath a fs that is locked for backup,
959 	 * that is, name/delete/write-locked.)
960 	 */
961 	if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
962 	    !ULOCKFS_IS_ROELOCK(ulp)) &&
963 	    (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
964 		error = EBUSY;
965 		goto errexit;
966 	}
967 
968 	/* get and validate the input lockfs request */
969 	if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
970 		goto errexit;
971 
972 	/*
973 	 * save current ulockfs struct
	 * (from here on lfs is the old lock and ulp holds the new one)
974 	 */
975 	bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));
976 
977 	/*
978 	 * Freeze the file system (pend future accesses)
979 	 */
980 	ufs_freeze(ulp, lockfsp);
981 
982 	/*
983 	 * Set locking in progress because ufs_quiesce may free the
984 	 * ul_lock mutex.
985 	 */
986 	ULOCKFS_SET_BUSY(ulp);
987 	/* update the ioctl copy */
988 	LOCKFS_SET_BUSY(&ulp->ul_lockfs);
989 
990 	/*
991 	 * We  need to unset FWLOCK status before we call ufs_quiesce
992 	 * so that the thread doesn't get suspended. We do this only if
993 	 * this (fallocate) thread requested an unlock operation.
994 	 */
995 	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
996 		if (!ULOCKFS_IS_WLOCK(ulp))
997 			ULOCKFS_CLR_FWLOCK(ulp);
998 	}
999 
1000 	/*
1001 	 * Quiesce (wait for outstanding accesses to finish)
1002 	 */
1003 	if (error = ufs_quiesce(ulp)) {
1004 		/*
1005 		 * Interrupted due to signal. There could still be
1006 		 * pending vnops.
1007 		 */
1008 		signal = 1;
1009 
1010 		/*
1011 		 * We do broadcast because lock-status
1012 		 * could be reverted to old status.
1013 		 */
1014 		cv_broadcast(&ulp->ul_cv);
1015 		goto errout;
1016 	}
1017 
1018 	/*
1019 	 * If the fallocate thread requested a write fs lock operation
1020 	 * then we set fwlock status in the ulp.
1021 	 */
1022 	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
1023 		if (ULOCKFS_IS_WLOCK(ulp))
1024 			ULOCKFS_SET_FWLOCK(ulp);
1025 	}
1026 
1027 	/*
1028 	 * can't wlock or (ro)elock fs with accounting or local swap file
1029 	 */
1030 	if ((ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
1031 	    ULOCKFS_IS_ROELOCK(ulp)) && !from_log) {
1032 		if (error = ufs_checkaccton(vp))
1033 			goto errout;
1034 		if (error = ufs_checkswapon(vp))
1035 			goto errout;
1036 	}
1037 
1038 	/*
1039 	 * save error lock status to pass down to reconciliation
1040 	 * routines and for later cleanup
1041 	 */
1042 	if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
1043 		errlck = UN_ERRLCK;
1044 
1045 	if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
1046 		int needs_unlock;
1047 		int needs_sbwrite;
1048 
1049 		poll_events |= POLLERR;
1050 		errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs)?
1051 							RE_ERRLCK: SET_ERRLCK;
1052 
		/* vfs_lock is only taken here if this thread lacks it */
1053 		needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
1054 		if (needs_unlock)
1055 			mutex_enter(&ufsvfsp->vfs_lock);
1056 
1057 		/* disable delayed i/o */
1058 		needs_sbwrite = 0;
1059 
1060 		if (errlck == SET_ERRLCK) {
1061 			ufsvfsp->vfs_fs->fs_clean = FSBAD;
1062 			needs_sbwrite = 1;
1063 		}
1064 
1065 		needs_sbwrite |= ufsvfsp->vfs_dio;
1066 		ufsvfsp->vfs_dio = 0;
1067 
1068 		if (needs_unlock)
1069 			mutex_exit(&ufsvfsp->vfs_lock);
1070 
1071 		if (needs_sbwrite) {
1072 			ulp->ul_sbowner = curthread;
1073 			TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
1074 
1075 			if (needs_unlock)
1076 				mutex_enter(&ufsvfsp->vfs_lock);
1077 
1078 			ufsvfsp->vfs_fs->fs_fmod = 0;
1079 
1080 			if (needs_unlock)
1081 				mutex_exit(&ufsvfsp->vfs_lock);
1082 		}
1083 	}
1084 
1085 	/*
1086 	 * reconcile superblock and inodes if was wlocked
	 * (the test is on the old lock and also covers an old error lock)
1087 	 */
1088 	if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
1089 		if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
1090 			goto errout;
1091 		/*
1092 		 * in case the fs grew; reset the metadata map for logging tests
1093 		 */
1094 		TRANS_MATA_UMOUNT(ufsvfsp);
1095 		TRANS_MATA_MOUNT(ufsvfsp);
1096 		TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
1097 	}
1098 
1099 	/*
1100 	 * At least everything *currently* dirty goes out.
	 * A flush error does not stop a hard lock or an error lock.
1101 	 */
1102 
1103 	if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
1104 	    !ULOCKFS_IS_ELOCK(ulp))
1105 		goto errout;
1106 
1107 	/*
1108 	 * thaw file system and wakeup pended processes
1109 	 */
1110 	if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
1111 		if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
1112 			goto errout;
1113 
1114 	/*
1115 	 * reset modified flag if not already write locked
1116 	 */
1117 	if (!LOCKFS_IS_WLOCK(&lfs))
1118 		ULOCKFS_CLR_MOD(ulp);
1119 
1120 	/*
1121 	 * idle the lock struct
1122 	 */
1123 	ULOCKFS_CLR_BUSY(ulp);
1124 	/* update the ioctl copy */
1125 	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);
1126 
1127 	/*
1128 	 * free current comment
	 * (the comment of the old lock, which is kept in lfs)
1129 	 */
1130 	if (lfs.lf_comment && lfs.lf_comlen != 0) {
1131 		kmem_free(lfs.lf_comment, lfs.lf_comlen);
1132 		lfs.lf_comment = NULL;
1133 		lfs.lf_comlen = 0;
1134 	}
1135 
1136 	/* do error lock cleanup */
1137 	if (errlck == UN_ERRLCK)
1138 		ufsfx_unlockfs(ufsvfsp);
1139 
1140 	else if (errlck == RE_ERRLCK)
1141 		ufsfx_lockfs(ufsvfsp);
1142 
1143 	/* don't allow error lock from user to invoke panic */
1144 	else if (from_user && errlck == SET_ERRLCK &&
1145 		!(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
1146 		(void) ufs_fault(ufsvfsp->vfs_root,
1147 		    ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
1148 		    ulp->ul_lockfs.lf_comment: "user-applied error lock");
1149 
1150 	atomic_add_long(&ufs_quiesce_pend, -1);
1151 	mutex_exit(&ulp->ul_lock);
1152 	vfs_unlock(vfsp);
1153 
1154 	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
1155 		poll_events |= POLLERR;
1156 
1157 	pollwakeup(&ufs_pollhd, poll_events);
1158 
1159 	/*
1160 	 * Allow both the delete thread and the reclaim thread to
1161 	 * continue.
1162 	 */
1163 	ufs_thread_continue(&ufsvfsp->vfs_delete);
1164 	ufs_thread_continue(&ufsvfsp->vfs_reclaim);
1165 
1166 	return (0);
1167 
1168 errout:
1169 	/*
1170 	 * Lock failed. Reset the old lock in ufsvfs if not hard locked.
1171 	 */
1172 	if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
1173 		bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
1174 		ulp->ul_fs_lock = (1 << lfs.lf_lock);
1175 	}
1176 
1177 	/*
1178 	 * Don't call ufs_thaw() when there's a signal during
1179 	 * ufs quiesce operation as it can lead to deadlock
1180 	 * with getpage.
1181 	 */
1182 	if (signal == 0)
1183 		(void) ufs_thaw(vfsp, ufsvfsp, ulp);
1184 
1185 	ULOCKFS_CLR_BUSY(ulp);
1186 	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);
1187 
	/* entered directly by the checks that fail before the freeze */
1188 errexit:
1189 	atomic_add_long(&ufs_quiesce_pend, -1);
1190 	mutex_exit(&ulp->ul_lock);
1191 	vfs_unlock(vfsp);
1192 
1193 	/*
1194 	 * Allow both the delete thread and the reclaim thread to
1195 	 * continue.
1196 	 */
1197 	ufs_thread_continue(&ufsvfsp->vfs_delete);
1198 	ufs_thread_continue(&ufsvfsp->vfs_reclaim);
1199 
1200 	return (error);
1201 }
1202 
1203 /*
1204  * fiolfss
1205  * 	return the current file system locking state info
1206  */
1207 int
1208 ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
1209 {
1210 	struct ulockfs	*ulp;
1211 
1212 	if (!vp || !vp->v_vfsp || !VTOI(vp))
1213 		return (EINVAL);
1214 
1215 	/* file system has been forcibly unmounted */
1216 	if (VTOI(vp)->i_ufsvfs == NULL)
1217 		return (EIO);
1218 
1219 	ulp = VTOUL(vp);
1220 
1221 	if (ULOCKFS_IS_HLOCK(ulp)) {
1222 		*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1223 		return (0);
1224 	}
1225 
1226 	mutex_enter(&ulp->ul_lock);
1227 
1228 	*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1229 
1230 	if (ULOCKFS_IS_MOD(ulp))
1231 		lockfsp->lf_flags |= LOCKFS_MOD;
1232 
1233 	mutex_exit(&ulp->ul_lock);
1234 
1235 	return (0);
1236 }
1237 
1238 /*
1239  * ufs_check_lockfs
1240  *	check whether a ufs_vnops conflicts with the file system lock
1241  */
1242 int
1243 ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
1244 {
1245 	k_sigset_t	smask;
1246 	int		sig, slock;
1247 
1248 	ASSERT(MUTEX_HELD(&ulp->ul_lock));
1249 
1250 	while (ulp->ul_fs_lock & mask) {
1251 		slock = (int)ULOCKFS_IS_SLOCK(ulp);
1252 		if ((curthread->t_flag & T_DONTPEND) && !slock) {
1253 			curthread->t_flag |= T_WOULDBLOCK;
1254 			return (EAGAIN);
1255 		}
1256 		curthread->t_flag &= ~T_WOULDBLOCK;
1257 
1258 		/*
1259 		 * In the case of an onerr umount of the fs, threads could
1260 		 * have blocked before coming into ufs_check_lockfs and
1261 		 * need to check for the special case of ELOCK and
1262 		 * vfs_dontblock being set which would indicate that the fs
1263 		 * is on its way out and will not return therefore making
1264 		 * EIO the appropriate response.
1265 		 */
1266 		if (ULOCKFS_IS_HLOCK(ulp) ||
1267 		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1268 			return (EIO);
1269 
1270 		/*
1271 		 * wait for lock status to change
1272 		 */
1273 		if (slock || ufsvfsp->vfs_nointr) {
1274 			cv_wait(&ulp->ul_cv, &ulp->ul_lock);
1275 		} else {
1276 			sigintr(&smask, 1);
1277 			sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
1278 			sigunintr(&smask);
1279 			if ((!sig && (ulp->ul_fs_lock & mask)) ||
1280 				ufsvfsp->vfs_dontblock)
1281 				return (EINTR);
1282 		}
1283 	}
1284 
1285 	if (mask & ULOCKFS_FWLOCK) {
1286 		atomic_add_long(&ulp->ul_falloc_cnt, 1);
1287 		ULOCKFS_SET_FALLOC(ulp);
1288 	} else {
1289 		atomic_add_long(&ulp->ul_vnops_cnt, 1);
1290 	}
1291 
1292 	return (0);
1293 }
1294 
1295 /*
1296  * Check whether we came across the handcrafted lockfs protocol path. We can't
1297  * simply check for T_DONTBLOCK here as one would assume since this can also
1298  * falsely catch recursive VOP's going to a different filesystem, instead we
1299  * check if we already hold the ulockfs->ul_lock mutex.
1300  */
1301 static int
1302 ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
1303 {
1304 	return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
1305 }
1306 
1307 /*
1308  * ufs_lockfs_begin - start the lockfs locking protocol
1309  */
1310 int
1311 ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1312 {
1313 	int 		error;
1314 	int		rec_vop;
1315 	struct ulockfs *ulp;
1316 	ulockfs_info_t	*ulockfs_info;
1317 	ulockfs_info_t	*ulockfs_info_free;
1318 	ulockfs_info_t	*ulockfs_info_temp;
1319 
1320 	/*
1321 	 * file system has been forcibly unmounted
1322 	 */
1323 	if (ufsvfsp == NULL)
1324 		return (EIO);
1325 
1326 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1327 
1328 	/*
1329 	 * Do lockfs protocol
1330 	 */
1331 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1332 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1333 
1334 	/*
1335 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1336 	 * path and bail out in that case.
1337 	 */
1338 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1339 		*ulpp = NULL;
1340 		return (0);
1341 	} else {
1342 		if (ulockfs_info_free == NULL) {
1343 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1344 			    kmem_zalloc(sizeof (ulockfs_info_t),
1345 			    KM_NOSLEEP)) == NULL) {
1346 				*ulpp = NULL;
1347 				return (ENOMEM);
1348 			}
1349 		}
1350 	}
1351 
1352 	/*
1353 	 * First time VOP call
1354 	 */
1355 	mutex_enter(&ulp->ul_lock);
1356 	if (ULOCKFS_IS_JUSTULOCK(ulp)) {
1357 		if (mask & ULOCKFS_FWLOCK) {
1358 			atomic_add_long(&ulp->ul_falloc_cnt, 1);
1359 			ULOCKFS_SET_FALLOC(ulp);
1360 		} else {
1361 			atomic_add_long(&ulp->ul_vnops_cnt, 1);
1362 		}
1363 	} else {
1364 		if (error = ufs_check_lockfs(ufsvfsp, ulp, mask)) {
1365 			mutex_exit(&ulp->ul_lock);
1366 			if (ulockfs_info_free == NULL)
1367 				kmem_free(ulockfs_info_temp,
1368 				    sizeof (ulockfs_info_t));
1369 			return (error);
1370 		}
1371 	}
1372 	mutex_exit(&ulp->ul_lock);
1373 
1374 	if (ulockfs_info_free != NULL) {
1375 		ulockfs_info_free->ulp = ulp;
1376 		if (mask & ULOCKFS_FWLOCK)
1377 			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1378 	} else {
1379 		ulockfs_info_temp->ulp = ulp;
1380 		ulockfs_info_temp->next = ulockfs_info;
1381 		if (mask & ULOCKFS_FWLOCK)
1382 			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1383 		ASSERT(ufs_lockfs_key != 0);
1384 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1385 	}
1386 
1387 	curthread->t_flag |= T_DONTBLOCK;
1388 	return (0);
1389 }
1390 
1391 /*
1392  * Check whether we are returning from the top level VOP.
1393  */
1394 static int
1395 ufs_lockfs_top_vop_return(ulockfs_info_t *head)
1396 {
1397 	ulockfs_info_t *info;
1398 	int result = 1;
1399 
1400 	for (info = head; info != NULL; info = info->next) {
1401 		if (info->ulp != NULL) {
1402 			result = 0;
1403 			break;
1404 		}
1405 	}
1406 
1407 	return (result);
1408 }
1409 
1410 /*
1411  * ufs_lockfs_end - terminate the lockfs locking protocol
1412  */
1413 void
1414 ufs_lockfs_end(struct ulockfs *ulp)
1415 {
1416 	ulockfs_info_t *info;
1417 	ulockfs_info_t *head;
1418 
1419 	/*
1420 	 * end-of-VOP protocol
1421 	 */
1422 	if (ulp == NULL)
1423 		return;
1424 
1425 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1426 	SEARCH_ULOCKFSP(head, ulp, info);
1427 
1428 	/*
1429 	 * If we're called from a first level VOP, we have to have a
1430 	 * valid ulockfs record in the TSD.
1431 	 */
1432 	ASSERT(info != NULL);
1433 
1434 	/*
1435 	 * Invalidate the ulockfs record.
1436 	 */
1437 	info->ulp = NULL;
1438 
1439 	if (ufs_lockfs_top_vop_return(head))
1440 		curthread->t_flag &= ~T_DONTBLOCK;
1441 
1442 	mutex_enter(&ulp->ul_lock);
1443 
1444 	/* fallocate thread */
1445 	if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) {
1446 		if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1))
1447 			ULOCKFS_CLR_FALLOC(ulp);
1448 	} else  { /* normal thread */
1449 		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
1450 			cv_broadcast(&ulp->ul_cv);
1451 	}
1452 
1453 	/* Clear the thread's fallocate state */
1454 	if (info->flags & ULOCK_INFO_FALLOCATE)
1455 		info->flags &= ~ULOCK_INFO_FALLOCATE;
1456 
1457 	if (ulp->ul_vnops_cnt == 0 && ulp->ul_falloc_cnt)
1458 		cv_broadcast(&ulp->ul_cv);
1459 
1460 	mutex_exit(&ulp->ul_lock);
1461 }
1462 
1463 /*
1464  * specialized version of ufs_lockfs_begin() called by ufs_getpage().
1465  */
1466 int
1467 ufs_lockfs_begin_getpage(
1468 	struct ufsvfs	*ufsvfsp,
1469 	struct ulockfs	**ulpp,
1470 	struct seg	*seg,
1471 	int		read_access,
1472 	uint_t		*protp)
1473 {
1474 	ulong_t			mask;
1475 	int 			error;
1476 	int			rec_vop;
1477 	struct ulockfs		*ulp;
1478 	ulockfs_info_t		*ulockfs_info;
1479 	ulockfs_info_t		*ulockfs_info_free;
1480 	ulockfs_info_t		*ulockfs_info_temp;
1481 
1482 	/*
1483 	 * file system has been forcibly unmounted
1484 	 */
1485 	if (ufsvfsp == NULL)
1486 		return (EIO);
1487 
1488 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1489 
1490 	/*
1491 	 * Do lockfs protocol
1492 	 */
1493 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1494 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1495 
1496 	/*
1497 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1498 	 * path and bail out in that case.
1499 	 */
1500 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1501 		*ulpp = NULL;
1502 		return (0);
1503 	} else {
1504 		if (ulockfs_info_free == NULL) {
1505 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1506 			    kmem_zalloc(sizeof (ulockfs_info_t),
1507 			    KM_NOSLEEP)) == NULL) {
1508 				*ulpp = NULL;
1509 				return (ENOMEM);
1510 			}
1511 		}
1512 	}
1513 
1514 	/*
1515 	 * First time VOP call
1516 	 */
1517 	mutex_enter(&ulp->ul_lock);
1518 	if (ULOCKFS_IS_JUSTULOCK(ulp))
1519 		/*
1520 		 * fs is not locked, simply inc the active-ops counter
1521 		 */
1522 		atomic_add_long(&ulp->ul_vnops_cnt, 1);
1523 	else {
1524 		if (seg->s_ops == &segvn_ops &&
1525 		    ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
1526 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1527 		} else if (protp && read_access) {
1528 			/*
1529 			 * Restrict the mapping to readonly.
1530 			 * Writes to this mapping will cause
1531 			 * another fault which will then
1532 			 * be suspended if fs is write locked
1533 			 */
1534 			*protp &= ~PROT_WRITE;
1535 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1536 		} else
1537 			mask = (ulong_t)ULOCKFS_GETWRITE_MASK;
1538 
1539 		/*
1540 		 * will sleep if this fs is locked against this VOP
1541 		 */
1542 		if (error = ufs_check_lockfs(ufsvfsp, ulp, mask)) {
1543 			mutex_exit(&ulp->ul_lock);
1544 			if (ulockfs_info_free == NULL)
1545 				kmem_free(ulockfs_info_temp,
1546 				    sizeof (ulockfs_info_t));
1547 			return (error);
1548 		}
1549 	}
1550 	mutex_exit(&ulp->ul_lock);
1551 
1552 	if (ulockfs_info_free != NULL) {
1553 		ulockfs_info_free->ulp = ulp;
1554 	} else {
1555 		ulockfs_info_temp->ulp = ulp;
1556 		ulockfs_info_temp->next = ulockfs_info;
1557 		ASSERT(ufs_lockfs_key != 0);
1558 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1559 	}
1560 
1561 	curthread->t_flag |= T_DONTBLOCK;
1562 	return (0);
1563 }
1564 
1565 void
1566 ufs_lockfs_tsd_destructor(void *head)
1567 {
1568 	ulockfs_info_t *curr = (ulockfs_info_t *)head;
1569 	ulockfs_info_t *temp;
1570 
1571 	for (; curr != NULL; ) {
1572 		/*
1573 		 * The TSD destructor is being called when the thread exits
1574 		 * (via thread_exit()). At that time it must have cleaned up
1575 		 * all VOPs via ufs_lockfs_end() and there must not be a
1576 		 * valid ulockfs record exist while a thread is exiting.
1577 		 */
1578 		temp = curr;
1579 		curr = curr->next;
1580 		ASSERT(temp->ulp == NULL);
1581 		kmem_free(temp, sizeof (ulockfs_info_t));
1582 	}
1583 }
1584