xref: /titanic_52/usr/src/uts/common/fs/ufs/ufs_lockfs.c (revision a0130b69c3952b827264ca8cb19bcbeecb67b463)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/time.h>
33 #include <sys/systm.h>
34 #include <sys/sysmacros.h>
35 #include <sys/resource.h>
36 #include <sys/signal.h>
37 #include <sys/cred.h>
38 #include <sys/user.h>
39 #include <sys/buf.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/proc.h>
43 #include <sys/disp.h>
44 #include <sys/file.h>
45 #include <sys/fcntl.h>
46 #include <sys/flock.h>
47 #include <sys/atomic.h>
48 #include <sys/kmem.h>
49 #include <sys/uio.h>
50 #include <sys/conf.h>
51 #include <sys/mman.h>
52 #include <sys/pathname.h>
53 #include <sys/debug.h>
54 #include <sys/vmmeter.h>
55 #include <sys/vmsystm.h>
56 #include <sys/cmn_err.h>
57 #include <sys/vtrace.h>
58 #include <sys/acct.h>
59 #include <sys/dnlc.h>
60 #include <sys/swap.h>
61 
62 #include <sys/fs/ufs_fs.h>
63 #include <sys/fs/ufs_inode.h>
64 #include <sys/fs/ufs_fsdir.h>
65 #include <sys/fs/ufs_trans.h>
66 #include <sys/fs/ufs_panic.h>
67 #include <sys/fs/ufs_mount.h>
68 #include <sys/fs/ufs_bio.h>
69 #include <sys/fs/ufs_log.h>
70 #include <sys/fs/ufs_quota.h>
71 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
72 #include <sys/errno.h>
73 #include <sys/sysinfo.h>
74 
75 #include <vm/hat.h>
76 #include <vm/pvn.h>
77 #include <vm/as.h>
78 #include <vm/seg.h>
79 #include <vm/seg_map.h>
80 #include <vm/seg_vn.h>
81 #include <vm/rm.h>
82 #include <vm/anon.h>
83 #include <sys/swap.h>
84 #include <sys/dnlc.h>
85 
/* defined outside this file; ufs_flush() hands its result to VOP_PUTPAGE() */
extern struct vnode *common_specvp(struct vnode *vp);

/*
 * error lock status
 *
 * ufs__fiolfs() computes one of these values and passes it down to the
 * reconciliation routines:
 *	NO_ERRLCK	initial value; no error-lock transition detected
 *	UN_ERRLCK	the previous lock was an error lock and the fs is now
 *			being unlocked
 *	SET_ERRLCK	an error lock is being applied and the previous lock
 *			was not an error lock
 *	RE_ERRLCK	an error lock is being applied on top of an existing
 *			error lock
 */
#define	UN_ERRLCK	(-1)
#define	SET_ERRLCK	1
#define	RE_ERRLCK	2
#define	NO_ERRLCK	0
93 
/*
 * Index to be used in TSD for storing lockfs data.
 * ufs_lockfs_begin() passes it to tsd_get() to fetch the head of the
 * calling thread's ulockfs_info_t list.
 */
uint_t ufs_lockfs_key;

/*
 * One node of the singly linked list stored in TSD under ufs_lockfs_key.
 * The IS_REC_VOP() and SEARCH_ULOCKFSP() macros walk this list.
 */
typedef struct _ulockfs_info {
	struct _ulockfs_info *next;	/* next node; NULL ends the list */
	struct ulockfs *ulp;		/* NULL is treated as a free node */
} ulockfs_info_t;
103 
/*
 * Check in TSD whether we are already doing any VOP on this filesystem.
 *
 *	found		(out) lvalue; set to 1 if a list node refers to
 *			ulockfsp, otherwise 0
 *	head		first ulockfs_info_t of the list (may be NULL)
 *	ulockfsp	the struct ulockfs pointer to look for
 *	free		(out) lvalue; set to the first node with a NULL ulp
 *			seen before the scan stopped, or NULL if none
 *
 * The third parameter is deliberately NOT spelled "ulp": the preprocessor
 * substitutes parameters everywhere in the body, including after "->", so
 * a parameter named like the struct member would rewrite "_curr->ulp" and
 * the macro would only compile when the argument is literally "ulp".
 */
#define	IS_REC_VOP(found, head, ulockfsp, free)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for ((found) = 0, (free) = NULL, _curr = (head);	\
	    _curr != NULL; _curr = _curr->next) {	\
		if (((free) == NULL) &&			\
		    (_curr->ulp == NULL))		\
			(free) = _curr;			\
		if (_curr->ulp == (ulockfsp)) {		\
			(found) = 1;			\
			break;				\
		}					\
	}						\
}
122 
/*
 * Get the lockfs data from TSD so that lockfs handles the recursive VOP
 * properly.
 *
 *	head		first ulockfs_info_t of the list (may be NULL)
 *	ulockfsp	the struct ulockfs pointer to look for
 *	info		(out) lvalue; set to the first node whose ulp equals
 *			ulockfsp, or NULL when no node matches
 *
 * The second parameter is deliberately NOT spelled "ulp": parameters are
 * substituted everywhere in the body, so a parameter named like the struct
 * member would rewrite "_curr->ulp" and the macro would only compile when
 * the argument is literally "ulp".
 */
#define	SEARCH_ULOCKFSP(head, ulockfsp, info)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (_curr = (head); _curr != NULL;		\
	    _curr = _curr->next) {			\
		if (_curr->ulp == (ulockfsp)) {		\
			break;				\
		}					\
	}						\
							\
	(info) = _curr;					\
}
140 
141 /*
142  * Validate lockfs request
143  */
144 static int
145 ufs_getlfd(
146 	struct lockfs *lockfsp,		/* new lock request */
147 	struct lockfs *ul_lockfsp)	/* old lock state */
148 {
149 	int	error = 0;
150 
151 	/*
152 	 * no input flags defined
153 	 */
154 	if (lockfsp->lf_flags != 0) {
155 		error = EINVAL;
156 		goto errout;
157 	}
158 
159 	/*
160 	 * check key
161 	 */
162 	if (!LOCKFS_IS_ULOCK(ul_lockfsp))
163 		if (lockfsp->lf_key != ul_lockfsp->lf_key) {
164 			error = EINVAL;
165 			goto errout;
166 	}
167 
168 	lockfsp->lf_key = ul_lockfsp->lf_key + 1;
169 
170 errout:
171 	return (error);
172 }
173 
174 /*
175  * ufs_checkaccton
176  *	check if accounting is turned on on this fs
177  */
178 
179 int
180 ufs_checkaccton(struct vnode *vp)
181 {
182 	if (acct_fs_in_use(vp))
183 		return (EDEADLK);
184 	return (0);
185 }
186 
187 /*
188  * ufs_checkswapon
189  *	check if local swapping is to file on this fs
190  */
191 int
192 ufs_checkswapon(struct vnode *vp)
193 {
194 	struct swapinfo	*sip;
195 
196 	mutex_enter(&swapinfo_lock);
197 	for (sip = swapinfo; sip; sip = sip->si_next)
198 		if (sip->si_vp->v_vfsp == vp->v_vfsp) {
199 			mutex_exit(&swapinfo_lock);
200 			return (EDEADLK);
201 		}
202 	mutex_exit(&swapinfo_lock);
203 	return (0);
204 }
205 
206 /*
207  * ufs_freeze
208  *	pend future accesses for current lock and desired lock
209  */
210 void
211 ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
212 {
213 	/*
214 	 * set to new lock type
215 	 */
216 	ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
217 	ulp->ul_lockfs.lf_key = lockfsp->lf_key;
218 	ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
219 	ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;
220 
221 	ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
222 }
223 
/*
 * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
 * starting ufs_quiesce() protocol and decrement it only when a file system no
 * longer has to be in quiescent state. This allows ufs_pageio() to detect
 * that another thread wants to quiesce a file system. See more comments in
 * ufs_pageio().
 *
 * In this file ufs__fiolfs() does the increment right after taking ul_lock
 * and the decrement on every exit path; ufs_quiesce() asserts it is nonzero.
 */
ulong_t ufs_quiesce_pend = 0;
232 
233 /*
234  * ufs_quiesce
235  *	wait for outstanding accesses to finish
236  */
237 int
238 ufs_quiesce(struct ulockfs *ulp)
239 {
240 	int error = 0;
241 
242 	/*
243 	 * Set a softlock to suspend future ufs_vnops so that
244 	 * this lockfs request will not be starved
245 	 */
246 	ULOCKFS_SET_SLOCK(ulp);
247 	ASSERT(ufs_quiesce_pend);
248 
249 	/* check if there is any outstanding ufs vnodeops calls */
250 	while (ulp->ul_vnops_cnt)
251 		/*
252 		 * use timed version of cv_wait_sig() to make sure we don't
253 		 * miss a wake up call from ufs_pageio() when it doesn't use
254 		 * ul_lock.
255 		 */
256 		if (!cv_timedwait_sig(&ulp->ul_cv, &ulp->ul_lock, lbolt + hz)) {
257 			error = EINTR;
258 			goto out;
259 		}
260 
261 out:
262 	/*
263 	 * unlock the soft lock
264 	 */
265 	ULOCKFS_CLR_SLOCK(ulp);
266 
267 	return (error);
268 }
269 /*
270  * ufs_flush_inode
271  */
272 int
273 ufs_flush_inode(struct inode *ip, void *arg)
274 {
275 	int	error;
276 	int	saverror	= 0;
277 
278 	/*
279 	 * wrong file system; keep looking
280 	 */
281 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
282 		return (0);
283 
284 	/*
285 	 * asynchronously push all the dirty pages
286 	 */
287 	if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
288 	    (error != EAGAIN))
289 		saverror = error;
290 	/*
291 	 * wait for io and discard all mappings
292 	 */
293 	if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
294 		saverror = error;
295 
296 	if (ITOV(ip)->v_type == VDIR) {
297 		dnlc_dir_purge(&ip->i_danchor);
298 	}
299 
300 	return (saverror);
301 }
302 
303 /*
304  * ufs_flush
305  *	Flush everything that is currently dirty; this includes invalidating
306  *	any mappings.
307  */
308 int
309 ufs_flush(struct vfs *vfsp)
310 {
311 	int		error;
312 	int		saverror = 0;
313 	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
314 	struct fs	*fs		= ufsvfsp->vfs_fs;
315 	ml_unit_t	*ul	= ufsvfsp->vfs_log;
316 	mt_map_t	*mtm	= ul->un_deltamap;
317 
318 	ASSERT(vfs_lock_held(vfsp));
319 
320 	/*
321 	 * purge dnlc
322 	 */
323 	(void) dnlc_purge_vfsp(vfsp, 0);
324 
325 	/*
326 	 * drain the delete and idle threads
327 	 */
328 	ufs_delete_drain(vfsp, 0, 0);
329 	ufs_idle_drain(vfsp);
330 
331 	/*
332 	 * flush and invalidate quota records
333 	 */
334 	(void) qsync(ufsvfsp);
335 
336 	/*
337 	 * flush w/invalidate the inodes for vfsp
338 	 */
339 	if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
340 		saverror = error;
341 
342 	/*
343 	 * synchronously flush superblock and summary info
344 	 */
345 	if (fs->fs_ronly == 0 && fs->fs_fmod) {
346 		fs->fs_fmod = 0;
347 		TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
348 	}
349 	/*
350 	 * flush w/invalidate block device pages and buf cache
351 	 */
352 	if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
353 	    (offset_t)0, 0, B_INVAL, CRED())) > 0)
354 		saverror = error;
355 
356 	(void) bflush((dev_t)vfsp->vfs_dev);
357 	(void) bfinval((dev_t)vfsp->vfs_dev, 0);
358 
359 	/*
360 	 * drain the delete and idle threads again
361 	 */
362 	ufs_delete_drain(vfsp, 0, 0);
363 	ufs_idle_drain(vfsp);
364 
365 	/*
366 	 * play with the clean flag
367 	 */
368 	if (saverror == 0)
369 		ufs_checkclean(vfsp);
370 
371 	/*
372 	 * Flush any outstanding transactions and roll the log
373 	 * only if we are supposed to do, i.e. LDL_NOROLL not set.
374 	 * We can not simply check for fs_ronly here since fsck also may
375 	 * use this code to roll the log on a read-only filesystem, e.g.
376 	 * root during early stages of boot, if other then a sanity check is
377 	 * done, it will clear LDL_NOROLL before.
378 	 * In addition we assert that the deltamap does not contain any deltas
379 	 * in case LDL_NOROLL is set since this is not supposed to happen.
380 	 */
381 	if (TRANS_ISTRANS(ufsvfsp)) {
382 		if (ul->un_flags & LDL_NOROLL) {
383 			ASSERT(mtm->mtm_nme == 0);
384 		} else {
385 			curthread->t_flag |= T_DONTBLOCK;
386 			TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
387 			    TOP_COMMIT_SIZE, error);
388 			if (!error) {
389 				TRANS_END_SYNC(ufsvfsp, saverror,
390 				    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
391 			}
392 			curthread->t_flag &= ~T_DONTBLOCK;
393 
394 			logmap_roll_dev(ufsvfsp->vfs_log);
395 		}
396 	}
397 
398 	return (saverror);
399 }
400 
401 /*
402  * ufs_thaw_wlock
403  *	special processing when thawing down to wlock
404  */
405 static int
406 ufs_thaw_wlock(struct inode *ip, void *arg)
407 {
408 	/*
409 	 * wrong file system; keep looking
410 	 */
411 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
412 		return (0);
413 
414 	/*
415 	 * iupdat refuses to clear flags if the fs is read only.  The fs
416 	 * may become read/write during the lock and we wouldn't want
417 	 * these inodes being written to disk.  So clear the flags.
418 	 */
419 	rw_enter(&ip->i_contents, RW_WRITER);
420 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
421 	rw_exit(&ip->i_contents);
422 
423 	/*
424 	 * pages are mlocked -- fail wlock
425 	 */
426 	if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
427 		return (EBUSY);
428 
429 	return (0);
430 }
431 
432 /*
433  * ufs_thaw_hlock
434  *	special processing when thawing down to hlock or elock
435  */
436 static int
437 ufs_thaw_hlock(struct inode *ip, void *arg)
438 {
439 	struct vnode	*vp	= ITOV(ip);
440 
441 	/*
442 	 * wrong file system; keep looking
443 	 */
444 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
445 		return (0);
446 
447 	/*
448 	 * blow away all pages - even if they are mlocked
449 	 */
450 	do {
451 		(void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
452 	} while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
453 	rw_enter(&ip->i_contents, RW_WRITER);
454 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
455 	rw_exit(&ip->i_contents);
456 
457 	return (0);
458 }
459 
460 /*
461  * ufs_thaw
462  *	thaw file system lock down to current value
463  */
464 int
465 ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
466 {
467 	int		error	= 0;
468 	int		noidel	= (int)(ulp->ul_flag & ULOCKFS_NOIDEL);
469 
470 	/*
471 	 * if wlock or hlock or elock
472 	 */
473 	if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
474 	    ULOCKFS_IS_ELOCK(ulp)) {
475 
476 		/*
477 		 * don't keep access times
478 		 * don't free deleted files
479 		 * if superblock writes are allowed, limit them to me for now
480 		 */
481 		ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
482 		if (ulp->ul_sbowner != (kthread_id_t)-1)
483 			ulp->ul_sbowner = curthread;
484 
485 		/*
486 		 * wait for writes for deleted files and superblock updates
487 		 */
488 		(void) ufs_flush(vfsp);
489 
490 		/*
491 		 * now make sure the quota file is up-to-date
492 		 *	expensive; but effective
493 		 */
494 		error = ufs_flush(vfsp);
495 		/*
496 		 * no one can write the superblock
497 		 */
498 		ulp->ul_sbowner = (kthread_id_t)-1;
499 
500 		/*
501 		 * special processing for wlock/hlock/elock
502 		 */
503 		if (ULOCKFS_IS_WLOCK(ulp)) {
504 			if (error)
505 				goto errout;
506 			error = bfinval(ufsvfsp->vfs_dev, 0);
507 			if (error)
508 				goto errout;
509 			error = ufs_scan_inodes(0, ufs_thaw_wlock,
510 					(void *)ufsvfsp, ufsvfsp);
511 			if (error)
512 				goto errout;
513 		}
514 		if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
515 			error = 0;
516 			(void) ufs_scan_inodes(0, ufs_thaw_hlock,
517 					(void *)ufsvfsp, ufsvfsp);
518 			(void) bfinval(ufsvfsp->vfs_dev, 1);
519 		}
520 	} else {
521 
522 		/*
523 		 * okay to keep access times
524 		 * okay to free deleted files
525 		 * okay to write the superblock
526 		 */
527 		ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
528 		ulp->ul_sbowner = NULL;
529 
530 		/*
531 		 * flush in case deleted files are in memory
532 		 */
533 		if (noidel) {
534 			if (error = ufs_flush(vfsp))
535 				goto errout;
536 		}
537 	}
538 
539 errout:
540 	cv_broadcast(&ulp->ul_cv);
541 	return (error);
542 }
543 
544 /*
545  * ufs_reconcile_fs
546  *	reconcile incore superblock with ondisk superblock
547  */
548 int
549 ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
550 {
551 	struct fs	*mfs; 	/* in-memory superblock */
552 	struct fs	*dfs;	/* on-disk   superblock */
553 	struct buf	*bp;	/* on-disk   superblock buf */
554 	int		 needs_unlock;
555 	char		 finished_fsclean;
556 
557 	mfs = ufsvfsp->vfs_fs;
558 
559 	/*
560 	 * get the on-disk copy of the superblock
561 	 */
562 	bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
563 	bp->b_flags |= (B_STALE|B_AGE);
564 	if (bp->b_flags & B_ERROR) {
565 		brelse(bp);
566 		return (EIO);
567 	}
568 	dfs = bp->b_un.b_fs;
569 
570 	/* error locks may only unlock after the fs has been made consistent */
571 	if (errlck == UN_ERRLCK) {
572 		if (dfs->fs_clean == FSFIX) {	/* being repaired */
573 			brelse(bp);
574 			return (EAGAIN);
575 		}
576 		/* repair not yet started? */
577 		finished_fsclean = TRANS_ISTRANS(ufsvfsp)? FSLOG: FSCLEAN;
578 		if (dfs->fs_clean != finished_fsclean) {
579 			brelse(bp);
580 			return (EBUSY);
581 		}
582 	}
583 
584 	/*
585 	 * if superblock has changed too much, abort
586 	 */
587 	if ((mfs->fs_sblkno		!= dfs->fs_sblkno) ||
588 	    (mfs->fs_cblkno		!= dfs->fs_cblkno) ||
589 	    (mfs->fs_iblkno		!= dfs->fs_iblkno) ||
590 	    (mfs->fs_dblkno		!= dfs->fs_dblkno) ||
591 	    (mfs->fs_cgoffset		!= dfs->fs_cgoffset) ||
592 	    (mfs->fs_cgmask		!= dfs->fs_cgmask) ||
593 	    (mfs->fs_bsize		!= dfs->fs_bsize) ||
594 	    (mfs->fs_fsize		!= dfs->fs_fsize) ||
595 	    (mfs->fs_frag		!= dfs->fs_frag) ||
596 	    (mfs->fs_bmask		!= dfs->fs_bmask) ||
597 	    (mfs->fs_fmask		!= dfs->fs_fmask) ||
598 	    (mfs->fs_bshift		!= dfs->fs_bshift) ||
599 	    (mfs->fs_fshift		!= dfs->fs_fshift) ||
600 	    (mfs->fs_fragshift		!= dfs->fs_fragshift) ||
601 	    (mfs->fs_fsbtodb		!= dfs->fs_fsbtodb) ||
602 	    (mfs->fs_sbsize		!= dfs->fs_sbsize) ||
603 	    (mfs->fs_nindir		!= dfs->fs_nindir) ||
604 	    (mfs->fs_nspf		!= dfs->fs_nspf) ||
605 	    (mfs->fs_trackskew		!= dfs->fs_trackskew) ||
606 	    (mfs->fs_cgsize		!= dfs->fs_cgsize) ||
607 	    (mfs->fs_ntrak		!= dfs->fs_ntrak) ||
608 	    (mfs->fs_nsect		!= dfs->fs_nsect) ||
609 	    (mfs->fs_spc		!= dfs->fs_spc) ||
610 	    (mfs->fs_cpg		!= dfs->fs_cpg) ||
611 	    (mfs->fs_ipg		!= dfs->fs_ipg) ||
612 	    (mfs->fs_fpg		!= dfs->fs_fpg) ||
613 	    (mfs->fs_postblformat	!= dfs->fs_postblformat) ||
614 	    (mfs->fs_magic		!= dfs->fs_magic)) {
615 		brelse(bp);
616 		return (EACCES);
617 	}
618 	if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
619 		if (mfs->fs_clean == FSLOG) {
620 			brelse(bp);
621 			return (EACCES);
622 		}
623 
624 	/*
625 	 * get new summary info
626 	 */
627 	if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
628 		brelse(bp);
629 		return (EIO);
630 	}
631 
632 	/*
633 	 * release old summary info and update in-memory superblock
634 	 */
635 	kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
636 	mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;	/* Only entry 0 used */
637 
638 	/*
639 	 * update fields allowed to change
640 	 */
641 	mfs->fs_size		= dfs->fs_size;
642 	mfs->fs_dsize		= dfs->fs_dsize;
643 	mfs->fs_ncg		= dfs->fs_ncg;
644 	mfs->fs_minfree		= dfs->fs_minfree;
645 	mfs->fs_rotdelay	= dfs->fs_rotdelay;
646 	mfs->fs_rps		= dfs->fs_rps;
647 	mfs->fs_maxcontig	= dfs->fs_maxcontig;
648 	mfs->fs_maxbpg		= dfs->fs_maxbpg;
649 	mfs->fs_csmask		= dfs->fs_csmask;
650 	mfs->fs_csshift		= dfs->fs_csshift;
651 	mfs->fs_optim		= dfs->fs_optim;
652 	mfs->fs_csaddr		= dfs->fs_csaddr;
653 	mfs->fs_cssize		= dfs->fs_cssize;
654 	mfs->fs_ncyl		= dfs->fs_ncyl;
655 	mfs->fs_cstotal		= dfs->fs_cstotal;
656 	mfs->fs_reclaim		= dfs->fs_reclaim;
657 
658 	if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
659 		mfs->fs_reclaim &= ~FS_RECLAIM;
660 		mfs->fs_reclaim |=  FS_RECLAIMING;
661 		ufs_thread_start(&ufsvfsp->vfs_reclaim,
662 			ufs_thread_reclaim, vfsp);
663 	}
664 
665 	/* XXX What to do about sparecon? */
666 
667 	/* XXX need to copy volume label */
668 
669 	/*
670 	 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
671 	 * or if error-locked and ondisk is now clean
672 	 */
673 	needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
674 	if (needs_unlock)
675 		mutex_enter(&ufsvfsp->vfs_lock);
676 
677 	if (errlck == UN_ERRLCK) {
678 		if (finished_fsclean == dfs->fs_clean)
679 			mfs->fs_clean = finished_fsclean;
680 		else
681 			mfs->fs_clean = FSBAD;
682 		mfs->fs_state = FSOKAY - dfs->fs_time;
683 	}
684 
685 	if (FSOKAY != dfs->fs_state + dfs->fs_time ||
686 	    (dfs->fs_clean == FSBAD))
687 		mfs->fs_clean = FSBAD;
688 
689 	if (needs_unlock)
690 		mutex_exit(&ufsvfsp->vfs_lock);
691 
692 	brelse(bp);
693 
694 	return (0);
695 }
696 
697 /*
698  * ufs_reconcile_inode
699  *	reconcile ondisk inode with incore inode
700  */
701 static int
702 ufs_reconcile_inode(struct inode *ip, void *arg)
703 {
704 	int		i;
705 	int		ndaddr;
706 	int		niaddr;
707 	struct dinode	*dp;		/* ondisk inode */
708 	struct buf	*bp	= NULL;
709 	uid_t		d_uid;
710 	gid_t		d_gid;
711 	int		error = 0;
712 	struct fs	*fs;
713 
714 	/*
715 	 * not an inode we care about
716 	 */
717 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
718 		return (0);
719 
720 	fs = ip->i_fs;
721 
722 	/*
723 	 * Inode reconciliation fails: we made the filesystem quiescent
724 	 * and we did a ufs_flush() before calling ufs_reconcile_inode()
725 	 * and thus the inode should not have been changed inbetween.
726 	 * Any discrepancies indicate a logic error and a pretty
727 	 * significant run-state inconsistency we should complain about.
728 	 */
729 	if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
730 		cmn_err(CE_WARN, "%s: Inode reconciliation failed for"
731 		    "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
732 		return (EINVAL);
733 	}
734 
735 	/*
736 	 * get the dinode
737 	 */
738 	bp = UFS_BREAD(ip->i_ufsvfs,
739 			ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
740 	    (int)fs->fs_bsize);
741 	if (bp->b_flags & B_ERROR) {
742 		brelse(bp);
743 		return (EIO);
744 	}
745 	dp  = bp->b_un.b_dino;
746 	dp += itoo(fs, ip->i_number);
747 
748 	/*
749 	 * handle Sun's implementation of EFT
750 	 */
751 	d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
752 	d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (uid_t)dp->di_sgid;
753 
754 	rw_enter(&ip->i_contents, RW_WRITER);
755 
756 	/*
757 	 * some fields are not allowed to change
758 	 */
759 	if ((ip->i_mode  != dp->di_mode) ||
760 	    (ip->i_gen   != dp->di_gen) ||
761 	    (ip->i_uid   != d_uid) ||
762 	    (ip->i_gid   != d_gid)) {
763 		error = EACCES;
764 		goto out;
765 	}
766 
767 	/*
768 	 * and some are allowed to change
769 	 */
770 	ip->i_size		= dp->di_size;
771 	ip->i_ic.ic_flags	= dp->di_ic.ic_flags;
772 	ip->i_blocks		= dp->di_blocks;
773 	ip->i_nlink		= dp->di_nlink;
774 	if (ip->i_flag & IFASTSYMLNK) {
775 		ndaddr = 1;
776 		niaddr = 0;
777 	} else {
778 		ndaddr = NDADDR;
779 		niaddr = NIADDR;
780 	}
781 	for (i = 0; i < ndaddr; ++i)
782 		ip->i_db[i] = dp->di_db[i];
783 	for (i = 0; i < niaddr; ++i)
784 		ip->i_ib[i] = dp->di_ib[i];
785 
786 out:
787 	rw_exit(&ip->i_contents);
788 	brelse(bp);
789 	return (error);
790 }
791 
792 /*
793  * ufs_reconcile
794  *	reconcile ondisk superblock/inodes with any incore
795  */
796 static int
797 ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
798 {
799 	int	error = 0;
800 
801 	/*
802 	 * get rid of as much inmemory data as possible
803 	 */
804 	(void) ufs_flush(vfsp);
805 
806 	/*
807 	 * reconcile the superblock and inodes
808 	 */
809 	if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
810 		return (error);
811 	if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
812 		return (error);
813 	/*
814 	 * allocation blocks may be incorrect; get rid of them
815 	 */
816 	(void) ufs_flush(vfsp);
817 
818 	return (error);
819 }
820 
/*
 * File system locking
 *
 * Entry point that forwards to ufs__fiolfs() with from_user set to 1.
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
	const int	from_user = 1;

	return (ufs__fiolfs(vp, lockfsp, from_user, from_log));
}
829 
/*
 * kernel-internal interface, also used by fix-on-panic
 *
 * Change the lock state of the file system that vp belongs to.
 *
 *	vp		any vnode of the target file system
 *	lockfsp		the new lock request; on success its lf_key has been
 *			advanced by ufs_getlfd()
 *	from_user	nonzero for the ufs_fiolfs() path; only consulted
 *			when deciding whether a freshly set error lock
 *			should call ufs_fault()
 *	from_log	nonzero skips the accounting and swap-file checks
 *
 * Returns 0 on success.  EINVAL and EIO are returned before any state is
 * touched; all later failures restore the previous lock (unless the new
 * lock is a hard lock), thaw, and return the error of the failing step.
 * For hard and error locks, failures of the final flush and thaw are
 * tolerated and 0 is returned.
 */
int
ufs__fiolfs(
	struct vnode *vp,
	struct lockfs *lockfsp,
	int from_user,
	int from_log)
{
	struct ulockfs	*ulp;
	struct lockfs	lfs;		/* copy of the previous lock state */
	int		error;
	struct vfs	*vfsp;
	struct ufsvfs	*ufsvfsp;
	int		 errlck		= NO_ERRLCK;
	int		 poll_events	= POLLPRI;
	extern struct pollhead ufs_pollhd;

	/* check valid lock type */
	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
		return (EINVAL);

	if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
		return (EIO);

	vfsp = vp->v_vfsp;
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * Suspend both the reclaim thread and the delete thread.
	 * This must be done outside the lockfs locking protocol.
	 */
	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
	ufs_thread_suspend(&ufsvfsp->vfs_delete);

	/*
	 * Acquire vfs_reflock around ul_lock to avoid deadlock with
	 * umount/remount/sync.
	 */
	vfs_lock_wait(vfsp);
	mutex_enter(&ulp->ul_lock);
	atomic_add_long(&ufs_quiesce_pend, 1);

	/*
	 * Quit if there is another lockfs request in progress
	 * that is waiting for existing ufs_vnops to complete.
	 */
	if (ULOCKFS_IS_BUSY(ulp)) {
		error = EBUSY;
		goto errexit;
	}

	/* cannot unlock or downgrade a hard-lock */
	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto errexit;
	}

	/* an error lock may be unlocked or relocked, only */
	if (ULOCKFS_IS_ELOCK(ulp)) {
		if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * a read-only error lock may only be upgraded to an
	 * error lock or hard lock
	 */
	if (ULOCKFS_IS_ROELOCK(ulp)) {
		if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * until read-only error locks are fully implemented
	 * just return EINVAL
	 */
	if (LOCKFS_IS_ROELOCK(lockfsp)) {
		error = EINVAL;
		goto errexit;
	}

	/*
	 * an error lock may only be applied if the file system is
	 * unlocked or already error locked.
	 * (this is to prevent the case where a fs gets changed out from
	 * underneath a fs that is locked for backup,
	 * that is, name/delete/write-locked.)
	 */
	if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
	    !ULOCKFS_IS_ROELOCK(ulp)) &&
	    (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
		error = EBUSY;
		goto errexit;
	}

	/* get and validate the input lockfs request */
	if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
		goto errexit;

	/*
	 * save current ulockfs struct
	 * (from here on "lfs" is the old lock, "ulp" the new one)
	 */
	bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));

	/*
	 * Freeze the file system (pend future accesses)
	 */
	ufs_freeze(ulp, lockfsp);

	/*
	 * Set locking in progress because ufs_quiesce may free the
	 * ul_lock mutex.
	 */
	ULOCKFS_SET_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_SET_BUSY(&ulp->ul_lockfs);

	/*
	 * Quiesce (wait for outstanding accesses to finish)
	 */
	if (error = ufs_quiesce(ulp))
		goto errout;

	/*
	 * can't wlock or (ro)elock fs with accounting or local swap file
	 */
	if ((ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
	    ULOCKFS_IS_ROELOCK(ulp)) && !from_log) {
		if (error = ufs_checkaccton(vp))
			goto errout;
		if (error = ufs_checkswapon(vp))
			goto errout;
	}

	/*
	 * save error lock status to pass down to reconciliation
	 * routines and for later cleanup
	 */
	if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
		errlck = UN_ERRLCK;

	if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
		int needs_unlock;
		int needs_sbwrite;

		poll_events |= POLLERR;
		/* old lock already an error lock: relock, else a new one */
		errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs)?
							RE_ERRLCK: SET_ERRLCK;

		/* take vfs_lock only if this thread does not hold it yet */
		needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
		if (needs_unlock)
			mutex_enter(&ufsvfsp->vfs_lock);

		/* disable delayed i/o */
		needs_sbwrite = 0;

		if (errlck == SET_ERRLCK) {
			ufsvfsp->vfs_fs->fs_clean = FSBAD;
			needs_sbwrite = 1;
		}

		/* a superblock write is also needed if vfs_dio was set */
		needs_sbwrite |= ufsvfsp->vfs_dio;
		ufsvfsp->vfs_dio = 0;

		if (needs_unlock)
			mutex_exit(&ufsvfsp->vfs_lock);

		if (needs_sbwrite) {
			ulp->ul_sbowner = curthread;
			TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);

			if (needs_unlock)
				mutex_enter(&ufsvfsp->vfs_lock);

			ufsvfsp->vfs_fs->fs_fmod = 0;

			if (needs_unlock)
				mutex_exit(&ufsvfsp->vfs_lock);
		}
	}

	/*
	 * reconcile superblock and inodes if was wlocked
	 * (or error locked)
	 */
	if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
		if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
			goto errout;
		/*
		 * in case the fs grew; reset the metadata map for logging tests
		 */
		TRANS_MATA_UMOUNT(ufsvfsp);
		TRANS_MATA_MOUNT(ufsvfsp);
		TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
	}

	/*
	 * At least everything *currently* dirty goes out.
	 * A flush failure is fatal unless the new lock is a hard or
	 * error lock.
	 */

	if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
	    !ULOCKFS_IS_ELOCK(ulp))
		goto errout;

	/*
	 * thaw file system and wakeup pended processes
	 * (same tolerance for hard and error locks as above)
	 */
	if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
		if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
			goto errout;

	/*
	 * reset modified flag if not already write locked
	 */
	if (!LOCKFS_IS_WLOCK(&lfs))
		ULOCKFS_CLR_MOD(ulp);

	/*
	 * idle the lock struct
	 */
	ULOCKFS_CLR_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

	/*
	 * free current comment
	 * (the comment buffer of the old lock, saved in lfs)
	 */
	if (lfs.lf_comment && lfs.lf_comlen != 0) {
		kmem_free(lfs.lf_comment, lfs.lf_comlen);
		lfs.lf_comment = NULL;
		lfs.lf_comlen = 0;
	}

	/* do error lock cleanup */
	if (errlck == UN_ERRLCK)
		ufsfx_unlockfs(ufsvfsp);

	else if (errlck == RE_ERRLCK)
		ufsfx_lockfs(ufsvfsp);

	/* don't allow error lock from user to invoke panic */
	else if (from_user && errlck == SET_ERRLCK &&
		!(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
		(void) ufs_fault(ufsvfsp->vfs_root,
		    ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
		    ulp->ul_lockfs.lf_comment: "user-applied error lock");

	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	/* a hard lock is reported to pollers as an error as well */
	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
		poll_events |= POLLERR;

	pollwakeup(&ufs_pollhd, poll_events);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (0);

errout:
	/*
	 * Lock failed. Reset the old lock in ufsvfs if not hard locked.
	 */
	if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
		bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
		ulp->ul_fs_lock = (1 << lfs.lf_lock);
	}
	(void) ufs_thaw(vfsp, ufsvfsp, ulp);
	ULOCKFS_CLR_BUSY(ulp);
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

errexit:
	/* reached directly when nothing has been changed yet */
	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (error);
}
1125 
1126 /*
1127  * fiolfss
1128  * 	return the current file system locking state info
1129  */
1130 int
1131 ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
1132 {
1133 	struct ulockfs	*ulp;
1134 
1135 	if (!vp || !vp->v_vfsp || !VTOI(vp))
1136 		return (EINVAL);
1137 
1138 	/* file system has been forcibly unmounted */
1139 	if (VTOI(vp)->i_ufsvfs == NULL)
1140 		return (EIO);
1141 
1142 	ulp = VTOUL(vp);
1143 
1144 	if (ULOCKFS_IS_HLOCK(ulp)) {
1145 		*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1146 		return (0);
1147 	}
1148 
1149 	mutex_enter(&ulp->ul_lock);
1150 
1151 	*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1152 
1153 	if (ULOCKFS_IS_MOD(ulp))
1154 		lockfsp->lf_flags |= LOCKFS_MOD;
1155 
1156 	mutex_exit(&ulp->ul_lock);
1157 
1158 	return (0);
1159 }
1160 
1161 /*
1162  * ufs_check_lockfs
1163  *	check whether a ufs_vnops conflicts with the file system lock
1164  */
1165 int
1166 ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
1167 {
1168 	k_sigset_t	smask;
1169 	int		sig, slock;
1170 
1171 	ASSERT(MUTEX_HELD(&ulp->ul_lock));
1172 
1173 	while (ulp->ul_fs_lock & mask) {
1174 		slock = (int)ULOCKFS_IS_SLOCK(ulp);
1175 		if ((curthread->t_flag & T_DONTPEND) && !slock) {
1176 			curthread->t_flag |= T_WOULDBLOCK;
1177 			return (EAGAIN);
1178 		}
1179 		curthread->t_flag &= ~T_WOULDBLOCK;
1180 
1181 		if (ULOCKFS_IS_HLOCK(ulp))
1182 			return (EIO);
1183 
1184 		/*
1185 		 * wait for lock status to change
1186 		 */
1187 		if (slock || ufsvfsp->vfs_nointr) {
1188 			cv_wait(&ulp->ul_cv, &ulp->ul_lock);
1189 		} else {
1190 			sigintr(&smask, 1);
1191 			sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
1192 			sigunintr(&smask);
1193 			if ((!sig && (ulp->ul_fs_lock & mask)) ||
1194 				ufsvfsp->vfs_dontblock)
1195 				return (EINTR);
1196 		}
1197 	}
1198 	atomic_add_long(&ulp->ul_vnops_cnt, 1);
1199 	return (0);
1200 }
1201 
1202 /*
1203  * Check whether we came across the handcrafted lockfs protocol path. We can't
1204  * simply check for T_DONTBLOCK here as one would assume since this can also
1205  * falsely catch recursive VOP's going to a different filesystem, instead we
1206  * check if we already hold the ulockfs->ul_lock mutex.
1207  */
1208 static int
1209 ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
1210 {
1211 	return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
1212 }
1213 
1214 /*
1215  * ufs_lockfs_begin - start the lockfs locking protocol
1216  */
1217 int
1218 ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1219 {
1220 	int 		error;
1221 	int		rec_vop;
1222 	struct ulockfs *ulp;
1223 	ulockfs_info_t	*ulockfs_info;
1224 	ulockfs_info_t	*ulockfs_info_free;
1225 	ulockfs_info_t	*ulockfs_info_temp;
1226 
1227 	/*
1228 	 * file system has been forcibly unmounted
1229 	 */
1230 	if (ufsvfsp == NULL)
1231 		return (EIO);
1232 
1233 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1234 
1235 	/*
1236 	 * Do lockfs protocol
1237 	 */
1238 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1239 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1240 
1241 	/*
1242 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1243 	 * path and bail out in that case.
1244 	 */
1245 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1246 		*ulpp = NULL;
1247 		return (0);
1248 	} else {
1249 		if (ulockfs_info_free == NULL) {
1250 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1251 			    kmem_zalloc(sizeof (ulockfs_info_t),
1252 			    KM_NOSLEEP)) == NULL) {
1253 				*ulpp = NULL;
1254 				return (ENOMEM);
1255 			}
1256 		}
1257 	}
1258 
1259 	/*
1260 	 * First time VOP call
1261 	 */
1262 	mutex_enter(&ulp->ul_lock);
1263 	if (ULOCKFS_IS_JUSTULOCK(ulp))
1264 		atomic_add_long(&ulp->ul_vnops_cnt, 1);
1265 	else {
1266 		if (error = ufs_check_lockfs(ufsvfsp, ulp, mask)) {
1267 			mutex_exit(&ulp->ul_lock);
1268 			if (ulockfs_info_free == NULL)
1269 				kmem_free(ulockfs_info_temp,
1270 				    sizeof (ulockfs_info_t));
1271 			return (error);
1272 		}
1273 	}
1274 	mutex_exit(&ulp->ul_lock);
1275 
1276 	if (ulockfs_info_free != NULL) {
1277 		ulockfs_info_free->ulp = ulp;
1278 	} else {
1279 		ulockfs_info_temp->ulp = ulp;
1280 		ulockfs_info_temp->next = ulockfs_info;
1281 		ASSERT(ufs_lockfs_key != 0);
1282 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1283 	}
1284 
1285 	curthread->t_flag |= T_DONTBLOCK;
1286 	return (0);
1287 }
1288 
1289 /*
1290  * Check whether we are returning from the top level VOP.
1291  */
1292 static int
1293 ufs_lockfs_top_vop_return(ulockfs_info_t *head)
1294 {
1295 	ulockfs_info_t *info;
1296 	int result = 1;
1297 
1298 	for (info = head; info != NULL; info = info->next) {
1299 		if (info->ulp != NULL) {
1300 			result = 0;
1301 			break;
1302 		}
1303 	}
1304 
1305 	return (result);
1306 }
1307 
1308 /*
1309  * ufs_lockfs_end - terminate the lockfs locking protocol
1310  */
1311 void
1312 ufs_lockfs_end(struct ulockfs *ulp)
1313 {
1314 	ulockfs_info_t *info;
1315 	ulockfs_info_t *head;
1316 
1317 	/*
1318 	 * end-of-VOP protocol
1319 	 */
1320 	if (ulp == NULL)
1321 		return;
1322 
1323 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1324 	SEARCH_ULOCKFSP(head, ulp, info);
1325 
1326 	/*
1327 	 * If we're called from a first level VOP, we have to have a
1328 	 * valid ulockfs record in the TSD.
1329 	 */
1330 	ASSERT(info != NULL);
1331 
1332 	/*
1333 	 * Invalidate the ulockfs record.
1334 	 */
1335 	info->ulp = NULL;
1336 
1337 	if (ufs_lockfs_top_vop_return(head))
1338 		curthread->t_flag &= ~T_DONTBLOCK;
1339 
1340 	mutex_enter(&ulp->ul_lock);
1341 
1342 	if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
1343 		cv_broadcast(&ulp->ul_cv);
1344 
1345 	mutex_exit(&ulp->ul_lock);
1346 }
1347 
1348 /*
1349  * specialized version of ufs_lockfs_begin() called by ufs_getpage().
1350  */
1351 int
1352 ufs_lockfs_begin_getpage(
1353 	struct ufsvfs	*ufsvfsp,
1354 	struct ulockfs	**ulpp,
1355 	struct seg	*seg,
1356 	int		read_access,
1357 	uint_t		*protp)
1358 {
1359 	ulong_t			mask;
1360 	int 			error;
1361 	int			rec_vop;
1362 	struct ulockfs		*ulp;
1363 	ulockfs_info_t		*ulockfs_info;
1364 	ulockfs_info_t		*ulockfs_info_free;
1365 	ulockfs_info_t		*ulockfs_info_temp;
1366 
1367 	/*
1368 	 * file system has been forcibly unmounted
1369 	 */
1370 	if (ufsvfsp == NULL)
1371 		return (EIO);
1372 
1373 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1374 
1375 	/*
1376 	 * Do lockfs protocol
1377 	 */
1378 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1379 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1380 
1381 	/*
1382 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1383 	 * path and bail out in that case.
1384 	 */
1385 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1386 		*ulpp = NULL;
1387 		return (0);
1388 	} else {
1389 		if (ulockfs_info_free == NULL) {
1390 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1391 			    kmem_zalloc(sizeof (ulockfs_info_t),
1392 			    KM_NOSLEEP)) == NULL) {
1393 				*ulpp = NULL;
1394 				return (ENOMEM);
1395 			}
1396 		}
1397 	}
1398 
1399 	/*
1400 	 * First time VOP call
1401 	 */
1402 	mutex_enter(&ulp->ul_lock);
1403 	if (ULOCKFS_IS_JUSTULOCK(ulp))
1404 		/*
1405 		 * fs is not locked, simply inc the active-ops counter
1406 		 */
1407 		atomic_add_long(&ulp->ul_vnops_cnt, 1);
1408 	else {
1409 		if (seg->s_ops == &segvn_ops &&
1410 		    ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
1411 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1412 		} else if (protp && read_access) {
1413 			/*
1414 			 * Restrict the mapping to readonly.
1415 			 * Writes to this mapping will cause
1416 			 * another fault which will then
1417 			 * be suspended if fs is write locked
1418 			 */
1419 			*protp &= ~PROT_WRITE;
1420 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1421 		} else
1422 			mask = (ulong_t)ULOCKFS_GETWRITE_MASK;
1423 
1424 		/*
1425 		 * will sleep if this fs is locked against this VOP
1426 		 */
1427 		if (error = ufs_check_lockfs(ufsvfsp, ulp, mask)) {
1428 			mutex_exit(&ulp->ul_lock);
1429 			if (ulockfs_info_free == NULL)
1430 				kmem_free(ulockfs_info_temp,
1431 				    sizeof (ulockfs_info_t));
1432 			return (error);
1433 		}
1434 	}
1435 	mutex_exit(&ulp->ul_lock);
1436 
1437 	if (ulockfs_info_free != NULL) {
1438 		ulockfs_info_free->ulp = ulp;
1439 	} else {
1440 		ulockfs_info_temp->ulp = ulp;
1441 		ulockfs_info_temp->next = ulockfs_info;
1442 		ASSERT(ufs_lockfs_key != 0);
1443 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1444 	}
1445 
1446 	curthread->t_flag |= T_DONTBLOCK;
1447 	return (0);
1448 }
1449 
1450 void
1451 ufs_lockfs_tsd_destructor(void *head)
1452 {
1453 	ulockfs_info_t *curr = (ulockfs_info_t *)head;
1454 	ulockfs_info_t *temp;
1455 
1456 	for (; curr != NULL; ) {
1457 		/*
1458 		 * The TSD destructor is being called when the thread exits
1459 		 * (via thread_exit()). At that time it must have cleaned up
1460 		 * all VOPs via ufs_lockfs_end() and there must not be a
1461 		 * valid ulockfs record exist while a thread is exiting.
1462 		 */
1463 		temp = curr;
1464 		curr = curr->next;
1465 		ASSERT(temp->ulp == NULL);
1466 		kmem_free(temp, sizeof (ulockfs_info_t));
1467 	}
1468 }
1469