xref: /titanic_41/usr/src/uts/common/fs/ufs/ufs_lockfs.c (revision 5203bc321053fb87d7073c7640548fab73634793)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/types.h>
26 #include <sys/t_lock.h>
27 #include <sys/param.h>
28 #include <sys/time.h>
29 #include <sys/systm.h>
30 #include <sys/sysmacros.h>
31 #include <sys/resource.h>
32 #include <sys/signal.h>
33 #include <sys/cred.h>
34 #include <sys/user.h>
35 #include <sys/buf.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/proc.h>
39 #include <sys/disp.h>
40 #include <sys/file.h>
41 #include <sys/fcntl.h>
42 #include <sys/flock.h>
43 #include <sys/atomic.h>
44 #include <sys/kmem.h>
45 #include <sys/uio.h>
46 #include <sys/conf.h>
47 #include <sys/mman.h>
48 #include <sys/pathname.h>
49 #include <sys/debug.h>
50 #include <sys/vmsystm.h>
51 #include <sys/cmn_err.h>
52 #include <sys/acct.h>
53 #include <sys/dnlc.h>
54 #include <sys/swap.h>
55 
56 #include <sys/fs/ufs_fs.h>
57 #include <sys/fs/ufs_inode.h>
58 #include <sys/fs/ufs_fsdir.h>
59 #include <sys/fs/ufs_trans.h>
60 #include <sys/fs/ufs_panic.h>
61 #include <sys/fs/ufs_mount.h>
62 #include <sys/fs/ufs_bio.h>
63 #include <sys/fs/ufs_log.h>
64 #include <sys/fs/ufs_quota.h>
65 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
66 #include <sys/errno.h>
67 #include <sys/sysinfo.h>
68 
69 #include <vm/hat.h>
70 #include <vm/pvn.h>
71 #include <vm/as.h>
72 #include <vm/seg.h>
73 #include <vm/seg_map.h>
74 #include <vm/seg_vn.h>
75 #include <vm/rm.h>
76 #include <vm/anon.h>
77 #include <sys/swap.h>
78 #include <sys/dnlc.h>
79 
80 extern struct vnode *common_specvp(struct vnode *vp);
81 
82 /* error lock status */
83 #define	UN_ERRLCK	(-1)
84 #define	SET_ERRLCK	1
85 #define	RE_ERRLCK	2
86 #define	NO_ERRLCK	0
87 
88 /*
89  * Index to be used in TSD for storing lockfs data
90  */
91 uint_t ufs_lockfs_key;
92 
93 typedef struct _ulockfs_info {
94 	struct _ulockfs_info *next;
95 	struct ulockfs *ulp;
96 	uint_t flags;
97 } ulockfs_info_t;
98 
99 #define	ULOCK_INFO_FALLOCATE	0x00000001	/* fallocate thread */
100 
/*
 * IS_REC_VOP
 *	Check in TSD whether we are already doing any VOP on this filesystem,
 *	i.e. whether the list starting at `head' has an entry for `ulockp'.
 *
 *	found	(out) set to 1 when a matching entry exists, 0 otherwise
 *	head	(in)  first ulockfs_info_t of the list; may be NULL
 *	ulockp	(in)  struct ulockfs pointer to look for
 *	free	(out) first entry with a NULL ulp that was seen before the
 *		      walk stopped, or NULL when none was seen
 *
 * The lockfs parameter is deliberately not named `ulp'.  A macro parameter
 * of that name is also substituted into the `_curr->ulp' member reference,
 * which made the macro compile only when the caller's argument happened to
 * be spelled `ulp'.
 *
 * The expansion is a bare compound statement.  `ulockp' is evaluated once
 * per list element, so it must not have side effects.
 */
#define	IS_REC_VOP(found, head, ulockp, free)			\
{								\
	ulockfs_info_t *_curr;					\
								\
	for ((found) = 0, (free) = NULL, _curr = (head);	\
	    _curr != NULL; _curr = _curr->next) {		\
		if (((free) == NULL) &&				\
		    (_curr->ulp == NULL))			\
			(free) = _curr;				\
		if (_curr->ulp == (ulockp)) {			\
			(found) = 1;				\
			break;					\
		}						\
	}							\
}
119 
/*
 * SEARCH_ULOCKFSP
 *	Get the lockfs data from TSD so that lockfs handles the recursive
 *	VOP properly.
 *
 *	head	(in)  first ulockfs_info_t of the list; may be NULL
 *	ulockp	(in)  struct ulockfs pointer to look for
 *	info	(out) the first list entry whose ulp equals `ulockp', or
 *		      NULL when the list holds no such entry
 *
 * The lockfs parameter is deliberately not named `ulp'.  A macro parameter
 * of that name is also substituted into the `_curr->ulp' member reference,
 * which made the macro compile only when the caller's argument happened to
 * be spelled `ulp'.
 *
 * The expansion is a bare compound statement.  `ulockp' is evaluated once
 * per list element, so it must not have side effects.
 */
#define	SEARCH_ULOCKFSP(head, ulockp, info)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (_curr = (head); _curr != NULL;		\
	    _curr = _curr->next) {			\
		if (_curr->ulp == (ulockp)) {		\
			break;				\
		}					\
	}						\
							\
	(info) = _curr;					\
}
137 
138 /*
139  * Validate lockfs request
140  */
141 static int
142 ufs_getlfd(
143 	struct lockfs *lockfsp,		/* new lock request */
144 	struct lockfs *ul_lockfsp)	/* old lock state */
145 {
146 	int	error = 0;
147 
148 	/*
149 	 * no input flags defined
150 	 */
151 	if (lockfsp->lf_flags != 0) {
152 		error = EINVAL;
153 		goto errout;
154 	}
155 
156 	/*
157 	 * check key
158 	 */
159 	if (!LOCKFS_IS_ULOCK(ul_lockfsp))
160 		if (lockfsp->lf_key != ul_lockfsp->lf_key) {
161 			error = EINVAL;
162 			goto errout;
163 	}
164 
165 	lockfsp->lf_key = ul_lockfsp->lf_key + 1;
166 
167 errout:
168 	return (error);
169 }
170 
/*
 * ufs_checkaccton
 *	Check if accounting is turned on on this fs.
 *
 *	Returns EDEADLK when acct_fs_in_use() reports the vnode's file
 *	system as in use, 0 otherwise.
 */

int
ufs_checkaccton(struct vnode *vp)
{
	return (acct_fs_in_use(vp) ? EDEADLK : 0);
}
183 
184 /*
185  * ufs_checkswapon
186  *	check if local swapping is to file on this fs
187  */
188 int
189 ufs_checkswapon(struct vnode *vp)
190 {
191 	struct swapinfo	*sip;
192 
193 	mutex_enter(&swapinfo_lock);
194 	for (sip = swapinfo; sip; sip = sip->si_next)
195 		if (sip->si_vp->v_vfsp == vp->v_vfsp) {
196 			mutex_exit(&swapinfo_lock);
197 			return (EDEADLK);
198 		}
199 	mutex_exit(&swapinfo_lock);
200 	return (0);
201 }
202 
203 /*
204  * ufs_freeze
205  *	pend future accesses for current lock and desired lock
206  */
207 void
208 ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
209 {
210 	/*
211 	 * set to new lock type
212 	 */
213 	ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
214 	ulp->ul_lockfs.lf_key = lockfsp->lf_key;
215 	ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
216 	ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;
217 
218 	ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
219 }
220 
221 /*
222  * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
223  * starting ufs_quiesce() protocol and decrement it only when a file system no
224  * longer has to be in quiescent state. This allows ufs_pageio() to detect
225  * that another thread wants to quiesce a file system. See more comments in
226  * ufs_pageio().
227  */
228 ulong_t ufs_quiesce_pend = 0;
229 
230 /*
231  * ufs_quiesce
232  *	wait for outstanding accesses to finish
233  */
234 int
235 ufs_quiesce(struct ulockfs *ulp)
236 {
237 	int error = 0;
238 	ulockfs_info_t *head;
239 	ulockfs_info_t *info;
240 	klwp_t *lwp = ttolwp(curthread);
241 
242 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
243 	SEARCH_ULOCKFSP(head, ulp, info);
244 
245 	/*
246 	 * We have to keep /proc away from stopping us after we applied
247 	 * the softlock but before we got a chance to clear it again.
248 	 * prstop() may pagefault and become stuck on the softlock still
249 	 * pending.
250 	 */
251 	if (lwp != NULL)
252 		lwp->lwp_nostop++;
253 
254 	/*
255 	 * Set a softlock to suspend future ufs_vnops so that
256 	 * this lockfs request will not be starved
257 	 */
258 	ULOCKFS_SET_SLOCK(ulp);
259 	ASSERT(ufs_quiesce_pend);
260 
261 	/* check if there is any outstanding ufs vnodeops calls */
262 	while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
263 		/*
264 		 * use timed version of cv_wait_sig() to make sure we don't
265 		 * miss a wake up call from ufs_pageio() when it doesn't use
266 		 * ul_lock.
267 		 *
268 		 * when a fallocate thread comes in, the only way it returns
269 		 * from this function is if there are no other vnode operations
270 		 * going on (remember fallocate threads are tracked using
271 		 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
272 		 * hasn't already grabbed the fs write lock.
273 		 */
274 		if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
275 			if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
276 				goto out;
277 		}
278 		if (!cv_reltimedwait_sig(&ulp->ul_cv, &ulp->ul_lock, hz,
279 		    TR_CLOCK_TICK)) {
280 			error = EINTR;
281 			goto out;
282 		}
283 	}
284 
285 out:
286 	/*
287 	 * unlock the soft lock
288 	 */
289 	ULOCKFS_CLR_SLOCK(ulp);
290 
291 	if (lwp != NULL)
292 		lwp->lwp_nostop--;
293 
294 	return (error);
295 }
296 
297 /*
298  * ufs_flush_inode
299  */
300 int
301 ufs_flush_inode(struct inode *ip, void *arg)
302 {
303 	int	error;
304 	int	saverror	= 0;
305 
306 	/*
307 	 * wrong file system; keep looking
308 	 */
309 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
310 		return (0);
311 
312 	/*
313 	 * asynchronously push all the dirty pages
314 	 */
315 	if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
316 	    (error != EAGAIN))
317 		saverror = error;
318 	/*
319 	 * wait for io and discard all mappings
320 	 */
321 	if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
322 		saverror = error;
323 
324 	if (ITOV(ip)->v_type == VDIR) {
325 		dnlc_dir_purge(&ip->i_danchor);
326 	}
327 
328 	return (saverror);
329 }
330 
331 /*
332  * ufs_flush
333  *	Flush everything that is currently dirty; this includes invalidating
334  *	any mappings.
335  */
336 int
337 ufs_flush(struct vfs *vfsp)
338 {
339 	int		error;
340 	int		saverror = 0;
341 	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
342 	struct fs	*fs		= ufsvfsp->vfs_fs;
343 	int		tdontblock = 0;
344 
345 	ASSERT(vfs_lock_held(vfsp));
346 
347 	/*
348 	 * purge dnlc
349 	 */
350 	(void) dnlc_purge_vfsp(vfsp, 0);
351 
352 	/*
353 	 * drain the delete and idle threads
354 	 */
355 	ufs_delete_drain(vfsp, 0, 0);
356 	ufs_idle_drain(vfsp);
357 
358 	/*
359 	 * flush and invalidate quota records
360 	 */
361 	(void) qsync(ufsvfsp);
362 
363 	/*
364 	 * flush w/invalidate the inodes for vfsp
365 	 */
366 	if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
367 		saverror = error;
368 
369 	/*
370 	 * synchronously flush superblock and summary info
371 	 */
372 	if (fs->fs_ronly == 0 && fs->fs_fmod) {
373 		fs->fs_fmod = 0;
374 		TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
375 	}
376 	/*
377 	 * flush w/invalidate block device pages and buf cache
378 	 */
379 	if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
380 	    (offset_t)0, 0, B_INVAL, CRED(), NULL)) > 0)
381 		saverror = error;
382 
383 	(void) bflush((dev_t)vfsp->vfs_dev);
384 	(void) bfinval((dev_t)vfsp->vfs_dev, 0);
385 
386 	/*
387 	 * drain the delete and idle threads again
388 	 */
389 	ufs_delete_drain(vfsp, 0, 0);
390 	ufs_idle_drain(vfsp);
391 
392 	/*
393 	 * play with the clean flag
394 	 */
395 	if (saverror == 0)
396 		ufs_checkclean(vfsp);
397 
398 	/*
399 	 * Flush any outstanding transactions and roll the log
400 	 * only if we are supposed to do, i.e. LDL_NOROLL not set.
401 	 * We can not simply check for fs_ronly here since fsck also may
402 	 * use this code to roll the log on a read-only filesystem, e.g.
403 	 * root during early stages of boot, if other then a sanity check is
404 	 * done, it will clear LDL_NOROLL before.
405 	 * In addition we assert that the deltamap does not contain any deltas
406 	 * in case LDL_NOROLL is set since this is not supposed to happen.
407 	 */
408 	if (TRANS_ISTRANS(ufsvfsp)) {
409 		ml_unit_t	*ul	= ufsvfsp->vfs_log;
410 		mt_map_t	*mtm	= ul->un_deltamap;
411 
412 		if (ul->un_flags & LDL_NOROLL) {
413 			ASSERT(mtm->mtm_nme == 0);
414 		} else {
415 			/*
416 			 * Do not set T_DONTBLOCK if there is a
417 			 * transaction opened by caller.
418 			 */
419 			if (curthread->t_flag & T_DONTBLOCK)
420 				tdontblock = 1;
421 			else
422 				curthread->t_flag |= T_DONTBLOCK;
423 
424 			TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
425 			    TOP_COMMIT_SIZE, error);
426 
427 			if (!error) {
428 				TRANS_END_SYNC(ufsvfsp, saverror,
429 				    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
430 			}
431 
432 			if (tdontblock == 0)
433 				curthread->t_flag &= ~T_DONTBLOCK;
434 
435 			logmap_roll_dev(ufsvfsp->vfs_log);
436 		}
437 	}
438 
439 	return (saverror);
440 }
441 
442 /*
443  * ufs_thaw_wlock
444  *	special processing when thawing down to wlock
445  */
446 static int
447 ufs_thaw_wlock(struct inode *ip, void *arg)
448 {
449 	/*
450 	 * wrong file system; keep looking
451 	 */
452 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
453 		return (0);
454 
455 	/*
456 	 * iupdat refuses to clear flags if the fs is read only.  The fs
457 	 * may become read/write during the lock and we wouldn't want
458 	 * these inodes being written to disk.  So clear the flags.
459 	 */
460 	rw_enter(&ip->i_contents, RW_WRITER);
461 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
462 	rw_exit(&ip->i_contents);
463 
464 	/*
465 	 * pages are mlocked -- fail wlock
466 	 */
467 	if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
468 		return (EBUSY);
469 
470 	return (0);
471 }
472 
473 /*
474  * ufs_thaw_hlock
475  *	special processing when thawing down to hlock or elock
476  */
477 static int
478 ufs_thaw_hlock(struct inode *ip, void *arg)
479 {
480 	struct vnode	*vp	= ITOV(ip);
481 
482 	/*
483 	 * wrong file system; keep looking
484 	 */
485 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
486 		return (0);
487 
488 	/*
489 	 * blow away all pages - even if they are mlocked
490 	 */
491 	do {
492 		(void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
493 	} while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
494 	rw_enter(&ip->i_contents, RW_WRITER);
495 	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
496 	rw_exit(&ip->i_contents);
497 
498 	return (0);
499 }
500 
501 /*
502  * ufs_thaw
503  *	thaw file system lock down to current value
504  */
505 int
506 ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
507 {
508 	int		error	= 0;
509 	int		noidel	= (int)(ulp->ul_flag & ULOCKFS_NOIDEL);
510 
511 	/*
512 	 * if wlock or hlock or elock
513 	 */
514 	if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
515 	    ULOCKFS_IS_ELOCK(ulp)) {
516 
517 		/*
518 		 * don't keep access times
519 		 * don't free deleted files
520 		 * if superblock writes are allowed, limit them to me for now
521 		 */
522 		ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
523 		if (ulp->ul_sbowner != (kthread_id_t)-1)
524 			ulp->ul_sbowner = curthread;
525 
526 		/*
527 		 * wait for writes for deleted files and superblock updates
528 		 */
529 		(void) ufs_flush(vfsp);
530 
531 		/*
532 		 * now make sure the quota file is up-to-date
533 		 *	expensive; but effective
534 		 */
535 		error = ufs_flush(vfsp);
536 		/*
537 		 * no one can write the superblock
538 		 */
539 		ulp->ul_sbowner = (kthread_id_t)-1;
540 
541 		/*
542 		 * special processing for wlock/hlock/elock
543 		 */
544 		if (ULOCKFS_IS_WLOCK(ulp)) {
545 			if (error)
546 				goto errout;
547 			error = bfinval(ufsvfsp->vfs_dev, 0);
548 			if (error)
549 				goto errout;
550 			error = ufs_scan_inodes(0, ufs_thaw_wlock,
551 			    (void *)ufsvfsp, ufsvfsp);
552 			if (error)
553 				goto errout;
554 		}
555 		if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
556 			error = 0;
557 			(void) ufs_scan_inodes(0, ufs_thaw_hlock,
558 			    (void *)ufsvfsp, ufsvfsp);
559 			(void) bfinval(ufsvfsp->vfs_dev, 1);
560 		}
561 	} else {
562 
563 		/*
564 		 * okay to keep access times
565 		 * okay to free deleted files
566 		 * okay to write the superblock
567 		 */
568 		ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
569 		ulp->ul_sbowner = NULL;
570 
571 		/*
572 		 * flush in case deleted files are in memory
573 		 */
574 		if (noidel) {
575 			if (error = ufs_flush(vfsp))
576 				goto errout;
577 		}
578 	}
579 
580 errout:
581 	cv_broadcast(&ulp->ul_cv);
582 	return (error);
583 }
584 
585 /*
586  * ufs_reconcile_fs
587  *	reconcile incore superblock with ondisk superblock
588  */
589 int
590 ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
591 {
592 	struct fs	*mfs; 	/* in-memory superblock */
593 	struct fs	*dfs;	/* on-disk   superblock */
594 	struct buf	*bp;	/* on-disk   superblock buf */
595 	int		 needs_unlock;
596 	char		 finished_fsclean;
597 
598 	mfs = ufsvfsp->vfs_fs;
599 
600 	/*
601 	 * get the on-disk copy of the superblock
602 	 */
603 	bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
604 	bp->b_flags |= (B_STALE|B_AGE);
605 	if (bp->b_flags & B_ERROR) {
606 		brelse(bp);
607 		return (EIO);
608 	}
609 	dfs = bp->b_un.b_fs;
610 
611 	/* error locks may only unlock after the fs has been made consistent */
612 	if (errlck == UN_ERRLCK) {
613 		if (dfs->fs_clean == FSFIX) {	/* being repaired */
614 			brelse(bp);
615 			return (EAGAIN);
616 		}
617 		/* repair not yet started? */
618 		finished_fsclean = TRANS_ISTRANS(ufsvfsp)? FSLOG: FSCLEAN;
619 		if (dfs->fs_clean != finished_fsclean) {
620 			brelse(bp);
621 			return (EBUSY);
622 		}
623 	}
624 
625 	/*
626 	 * if superblock has changed too much, abort
627 	 */
628 	if ((mfs->fs_sblkno		!= dfs->fs_sblkno) ||
629 	    (mfs->fs_cblkno		!= dfs->fs_cblkno) ||
630 	    (mfs->fs_iblkno		!= dfs->fs_iblkno) ||
631 	    (mfs->fs_dblkno		!= dfs->fs_dblkno) ||
632 	    (mfs->fs_cgoffset		!= dfs->fs_cgoffset) ||
633 	    (mfs->fs_cgmask		!= dfs->fs_cgmask) ||
634 	    (mfs->fs_bsize		!= dfs->fs_bsize) ||
635 	    (mfs->fs_fsize		!= dfs->fs_fsize) ||
636 	    (mfs->fs_frag		!= dfs->fs_frag) ||
637 	    (mfs->fs_bmask		!= dfs->fs_bmask) ||
638 	    (mfs->fs_fmask		!= dfs->fs_fmask) ||
639 	    (mfs->fs_bshift		!= dfs->fs_bshift) ||
640 	    (mfs->fs_fshift		!= dfs->fs_fshift) ||
641 	    (mfs->fs_fragshift		!= dfs->fs_fragshift) ||
642 	    (mfs->fs_fsbtodb		!= dfs->fs_fsbtodb) ||
643 	    (mfs->fs_sbsize		!= dfs->fs_sbsize) ||
644 	    (mfs->fs_nindir		!= dfs->fs_nindir) ||
645 	    (mfs->fs_nspf		!= dfs->fs_nspf) ||
646 	    (mfs->fs_trackskew		!= dfs->fs_trackskew) ||
647 	    (mfs->fs_cgsize		!= dfs->fs_cgsize) ||
648 	    (mfs->fs_ntrak		!= dfs->fs_ntrak) ||
649 	    (mfs->fs_nsect		!= dfs->fs_nsect) ||
650 	    (mfs->fs_spc		!= dfs->fs_spc) ||
651 	    (mfs->fs_cpg		!= dfs->fs_cpg) ||
652 	    (mfs->fs_ipg		!= dfs->fs_ipg) ||
653 	    (mfs->fs_fpg		!= dfs->fs_fpg) ||
654 	    (mfs->fs_postblformat	!= dfs->fs_postblformat) ||
655 	    (mfs->fs_magic		!= dfs->fs_magic)) {
656 		brelse(bp);
657 		return (EACCES);
658 	}
659 	if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
660 		if (mfs->fs_clean == FSLOG) {
661 			brelse(bp);
662 			return (EACCES);
663 		}
664 
665 	/*
666 	 * get new summary info
667 	 */
668 	if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
669 		brelse(bp);
670 		return (EIO);
671 	}
672 
673 	/*
674 	 * release old summary info and update in-memory superblock
675 	 */
676 	kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
677 	mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;	/* Only entry 0 used */
678 
679 	/*
680 	 * update fields allowed to change
681 	 */
682 	mfs->fs_size		= dfs->fs_size;
683 	mfs->fs_dsize		= dfs->fs_dsize;
684 	mfs->fs_ncg		= dfs->fs_ncg;
685 	mfs->fs_minfree		= dfs->fs_minfree;
686 	mfs->fs_rotdelay	= dfs->fs_rotdelay;
687 	mfs->fs_rps		= dfs->fs_rps;
688 	mfs->fs_maxcontig	= dfs->fs_maxcontig;
689 	mfs->fs_maxbpg		= dfs->fs_maxbpg;
690 	mfs->fs_csmask		= dfs->fs_csmask;
691 	mfs->fs_csshift		= dfs->fs_csshift;
692 	mfs->fs_optim		= dfs->fs_optim;
693 	mfs->fs_csaddr		= dfs->fs_csaddr;
694 	mfs->fs_cssize		= dfs->fs_cssize;
695 	mfs->fs_ncyl		= dfs->fs_ncyl;
696 	mfs->fs_cstotal		= dfs->fs_cstotal;
697 	mfs->fs_reclaim		= dfs->fs_reclaim;
698 
699 	if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
700 		mfs->fs_reclaim &= ~FS_RECLAIM;
701 		mfs->fs_reclaim |=  FS_RECLAIMING;
702 		ufs_thread_start(&ufsvfsp->vfs_reclaim,
703 		    ufs_thread_reclaim, vfsp);
704 	}
705 
706 	/* XXX What to do about sparecon? */
707 
708 	/* XXX need to copy volume label */
709 
710 	/*
711 	 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
712 	 * or if error-locked and ondisk is now clean
713 	 */
714 	needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
715 	if (needs_unlock)
716 		mutex_enter(&ufsvfsp->vfs_lock);
717 
718 	if (errlck == UN_ERRLCK) {
719 		if (finished_fsclean == dfs->fs_clean)
720 			mfs->fs_clean = finished_fsclean;
721 		else
722 			mfs->fs_clean = FSBAD;
723 		mfs->fs_state = FSOKAY - dfs->fs_time;
724 	}
725 
726 	if (FSOKAY != dfs->fs_state + dfs->fs_time ||
727 	    (dfs->fs_clean == FSBAD))
728 		mfs->fs_clean = FSBAD;
729 
730 	if (needs_unlock)
731 		mutex_exit(&ufsvfsp->vfs_lock);
732 
733 	brelse(bp);
734 
735 	return (0);
736 }
737 
738 /*
739  * ufs_reconcile_inode
740  *	reconcile ondisk inode with incore inode
741  */
742 static int
743 ufs_reconcile_inode(struct inode *ip, void *arg)
744 {
745 	int		i;
746 	int		ndaddr;
747 	int		niaddr;
748 	struct dinode	*dp;		/* ondisk inode */
749 	struct buf	*bp	= NULL;
750 	uid_t		d_uid;
751 	gid_t		d_gid;
752 	int		error = 0;
753 	struct fs	*fs;
754 
755 	/*
756 	 * not an inode we care about
757 	 */
758 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
759 		return (0);
760 
761 	fs = ip->i_fs;
762 
763 	/*
764 	 * Inode reconciliation fails: we made the filesystem quiescent
765 	 * and we did a ufs_flush() before calling ufs_reconcile_inode()
766 	 * and thus the inode should not have been changed inbetween.
767 	 * Any discrepancies indicate a logic error and a pretty
768 	 * significant run-state inconsistency we should complain about.
769 	 */
770 	if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
771 		cmn_err(CE_WARN, "%s: Inode reconciliation failed for"
772 		    "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
773 		return (EINVAL);
774 	}
775 
776 	/*
777 	 * get the dinode
778 	 */
779 	bp = UFS_BREAD(ip->i_ufsvfs,
780 	    ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
781 	    (int)fs->fs_bsize);
782 	if (bp->b_flags & B_ERROR) {
783 		brelse(bp);
784 		return (EIO);
785 	}
786 	dp  = bp->b_un.b_dino;
787 	dp += itoo(fs, ip->i_number);
788 
789 	/*
790 	 * handle Sun's implementation of EFT
791 	 */
792 	d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
793 	d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (uid_t)dp->di_sgid;
794 
795 	rw_enter(&ip->i_contents, RW_WRITER);
796 
797 	/*
798 	 * some fields are not allowed to change
799 	 */
800 	if ((ip->i_mode  != dp->di_mode) ||
801 	    (ip->i_gen   != dp->di_gen) ||
802 	    (ip->i_uid   != d_uid) ||
803 	    (ip->i_gid   != d_gid)) {
804 		error = EACCES;
805 		goto out;
806 	}
807 
808 	/*
809 	 * and some are allowed to change
810 	 */
811 	ip->i_size		= dp->di_size;
812 	ip->i_ic.ic_flags	= dp->di_ic.ic_flags;
813 	ip->i_blocks		= dp->di_blocks;
814 	ip->i_nlink		= dp->di_nlink;
815 	if (ip->i_flag & IFASTSYMLNK) {
816 		ndaddr = 1;
817 		niaddr = 0;
818 	} else {
819 		ndaddr = NDADDR;
820 		niaddr = NIADDR;
821 	}
822 	for (i = 0; i < ndaddr; ++i)
823 		ip->i_db[i] = dp->di_db[i];
824 	for (i = 0; i < niaddr; ++i)
825 		ip->i_ib[i] = dp->di_ib[i];
826 
827 out:
828 	rw_exit(&ip->i_contents);
829 	brelse(bp);
830 	return (error);
831 }
832 
833 /*
834  * ufs_reconcile
835  *	reconcile ondisk superblock/inodes with any incore
836  */
837 static int
838 ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
839 {
840 	int	error = 0;
841 
842 	/*
843 	 * get rid of as much inmemory data as possible
844 	 */
845 	(void) ufs_flush(vfsp);
846 
847 	/*
848 	 * reconcile the superblock and inodes
849 	 */
850 	if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
851 		return (error);
852 	if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
853 		return (error);
854 	/*
855 	 * allocation blocks may be incorrect; get rid of them
856 	 */
857 	(void) ufs_flush(vfsp);
858 
859 	return (error);
860 }
861 
/*
 * ufs_fiolfs
 *	File system locking.  Forwards the request to ufs__fiolfs() with
 *	from_user set, and returns whatever ufs__fiolfs() returns.
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
	const int	from_user = 1;

	return (ufs__fiolfs(vp, lockfsp, from_user, from_log));
}
870 
871 /* kernel-internal interface, also used by fix-on-panic */
872 int
873 ufs__fiolfs(
874 	struct vnode *vp,
875 	struct lockfs *lockfsp,
876 	int from_user,
877 	int from_log)
878 {
879 	struct ulockfs	*ulp;
880 	struct lockfs	lfs;
881 	int		error;
882 	struct vfs	*vfsp;
883 	struct ufsvfs	*ufsvfsp;
884 	int		 errlck		= NO_ERRLCK;
885 	int		 poll_events	= POLLPRI;
886 	extern struct pollhead ufs_pollhd;
887 	ulockfs_info_t *head;
888 	ulockfs_info_t *info;
889 	int signal = 0;
890 
891 	/* check valid lock type */
892 	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
893 		return (EINVAL);
894 
895 	if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
896 		return (EIO);
897 
898 	vfsp = vp->v_vfsp;
899 
900 	if (vfsp->vfs_flag & VFS_UNMOUNTED) /* has been unmounted */
901 		return (EIO);
902 
903 	/* take the lock and check again */
904 	vfs_lock_wait(vfsp);
905 	if (vfsp->vfs_flag & VFS_UNMOUNTED) {
906 		vfs_unlock(vfsp);
907 		return (EIO);
908 	}
909 
910 	/*
911 	 * Can't wlock or ro/elock fs with accounting or local swap file
912 	 * We need to check for this before we grab the ul_lock to avoid
913 	 * deadlocks with the accounting framework.
914 	 */
915 	if ((LOCKFS_IS_WLOCK(lockfsp) || LOCKFS_IS_ELOCK(lockfsp) ||
916 	    LOCKFS_IS_ROELOCK(lockfsp)) && !from_log) {
917 		if (ufs_checkaccton(vp) || ufs_checkswapon(vp)) {
918 			vfs_unlock(vfsp);
919 			return (EDEADLK);
920 		}
921 	}
922 
923 	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
924 	ulp = &ufsvfsp->vfs_ulockfs;
925 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
926 	SEARCH_ULOCKFSP(head, ulp, info);
927 
928 	/*
929 	 * Suspend both the reclaim thread and the delete thread.
930 	 * This must be done outside the lockfs locking protocol.
931 	 */
932 	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
933 	ufs_thread_suspend(&ufsvfsp->vfs_delete);
934 
935 	mutex_enter(&ulp->ul_lock);
936 	atomic_add_long(&ufs_quiesce_pend, 1);
937 
938 	/*
939 	 * Quit if there is another lockfs request in progress
940 	 * that is waiting for existing ufs_vnops to complete.
941 	 */
942 	if (ULOCKFS_IS_BUSY(ulp)) {
943 		error = EBUSY;
944 		goto errexit;
945 	}
946 
947 	/* cannot ulocked or downgrade a hard-lock */
948 	if (ULOCKFS_IS_HLOCK(ulp)) {
949 		error = EIO;
950 		goto errexit;
951 	}
952 
953 	/* an error lock may be unlocked or relocked, only */
954 	if (ULOCKFS_IS_ELOCK(ulp)) {
955 		if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
956 			error = EBUSY;
957 			goto errexit;
958 		}
959 	}
960 
961 	/*
962 	 * a read-only error lock may only be upgraded to an
963 	 * error lock or hard lock
964 	 */
965 	if (ULOCKFS_IS_ROELOCK(ulp)) {
966 		if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
967 			error = EBUSY;
968 			goto errexit;
969 		}
970 	}
971 
972 	/*
973 	 * until read-only error locks are fully implemented
974 	 * just return EINVAL
975 	 */
976 	if (LOCKFS_IS_ROELOCK(lockfsp)) {
977 		error = EINVAL;
978 		goto errexit;
979 	}
980 
981 	/*
982 	 * an error lock may only be applied if the file system is
983 	 * unlocked or already error locked.
984 	 * (this is to prevent the case where a fs gets changed out from
985 	 * underneath a fs that is locked for backup,
986 	 * that is, name/delete/write-locked.)
987 	 */
988 	if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
989 	    !ULOCKFS_IS_ROELOCK(ulp)) &&
990 	    (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
991 		error = EBUSY;
992 		goto errexit;
993 	}
994 
995 	/* get and validate the input lockfs request */
996 	if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
997 		goto errexit;
998 
999 	/*
1000 	 * save current ulockfs struct
1001 	 */
1002 	bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));
1003 
1004 	/*
1005 	 * Freeze the file system (pend future accesses)
1006 	 */
1007 	ufs_freeze(ulp, lockfsp);
1008 
1009 	/*
1010 	 * Set locking in progress because ufs_quiesce may free the
1011 	 * ul_lock mutex.
1012 	 */
1013 	ULOCKFS_SET_BUSY(ulp);
1014 	/* update the ioctl copy */
1015 	LOCKFS_SET_BUSY(&ulp->ul_lockfs);
1016 
1017 	/*
1018 	 * We  need to unset FWLOCK status before we call ufs_quiesce
1019 	 * so that the thread doesnt get suspended. We do this only if
1020 	 * this (fallocate) thread requested an unlock operation.
1021 	 */
1022 	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
1023 		if (!ULOCKFS_IS_WLOCK(ulp))
1024 			ULOCKFS_CLR_FWLOCK(ulp);
1025 	}
1026 
1027 	/*
1028 	 * Quiesce (wait for outstanding accesses to finish)
1029 	 */
1030 	if (error = ufs_quiesce(ulp)) {
1031 		/*
1032 		 * Interrupted due to signal. There could still be
1033 		 * pending vnops.
1034 		 */
1035 		signal = 1;
1036 
1037 		/*
1038 		 * We do broadcast because lock-status
1039 		 * could be reverted to old status.
1040 		 */
1041 		cv_broadcast(&ulp->ul_cv);
1042 		goto errout;
1043 	}
1044 
1045 	/*
1046 	 * If the fallocate thread requested a write fs lock operation
1047 	 * then we set fwlock status in the ulp.
1048 	 */
1049 	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
1050 		if (ULOCKFS_IS_WLOCK(ulp))
1051 			ULOCKFS_SET_FWLOCK(ulp);
1052 	}
1053 
1054 	/*
1055 	 * save error lock status to pass down to reconcilation
1056 	 * routines and for later cleanup
1057 	 */
1058 	if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
1059 		errlck = UN_ERRLCK;
1060 
1061 	if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
1062 		int needs_unlock;
1063 		int needs_sbwrite;
1064 
1065 		poll_events |= POLLERR;
1066 		errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ?
1067 		    RE_ERRLCK : SET_ERRLCK;
1068 
1069 		needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
1070 		if (needs_unlock)
1071 			mutex_enter(&ufsvfsp->vfs_lock);
1072 
1073 		/* disable delayed i/o */
1074 		needs_sbwrite = 0;
1075 
1076 		if (errlck == SET_ERRLCK) {
1077 			ufsvfsp->vfs_fs->fs_clean = FSBAD;
1078 			needs_sbwrite = 1;
1079 		}
1080 
1081 		needs_sbwrite |= ufsvfsp->vfs_dio;
1082 		ufsvfsp->vfs_dio = 0;
1083 
1084 		if (needs_unlock)
1085 			mutex_exit(&ufsvfsp->vfs_lock);
1086 
1087 		if (needs_sbwrite) {
1088 			ulp->ul_sbowner = curthread;
1089 			TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
1090 
1091 			if (needs_unlock)
1092 				mutex_enter(&ufsvfsp->vfs_lock);
1093 
1094 			ufsvfsp->vfs_fs->fs_fmod = 0;
1095 
1096 			if (needs_unlock)
1097 				mutex_exit(&ufsvfsp->vfs_lock);
1098 		}
1099 	}
1100 
1101 	/*
1102 	 * reconcile superblock and inodes if was wlocked
1103 	 */
1104 	if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
1105 		if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
1106 			goto errout;
1107 		/*
1108 		 * in case the fs grew; reset the metadata map for logging tests
1109 		 */
1110 		TRANS_MATA_UMOUNT(ufsvfsp);
1111 		TRANS_MATA_MOUNT(ufsvfsp);
1112 		TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
1113 	}
1114 
1115 	/*
1116 	 * At least everything *currently* dirty goes out.
1117 	 */
1118 
1119 	if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
1120 	    !ULOCKFS_IS_ELOCK(ulp))
1121 		goto errout;
1122 
1123 	/*
1124 	 * thaw file system and wakeup pended processes
1125 	 */
1126 	if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
1127 		if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
1128 			goto errout;
1129 
1130 	/*
1131 	 * reset modified flag if not already write locked
1132 	 */
1133 	if (!LOCKFS_IS_WLOCK(&lfs))
1134 		ULOCKFS_CLR_MOD(ulp);
1135 
1136 	/*
1137 	 * idle the lock struct
1138 	 */
1139 	ULOCKFS_CLR_BUSY(ulp);
1140 	/* update the ioctl copy */
1141 	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);
1142 
1143 	/*
1144 	 * free current comment
1145 	 */
1146 	if (lfs.lf_comment && lfs.lf_comlen != 0) {
1147 		kmem_free(lfs.lf_comment, lfs.lf_comlen);
1148 		lfs.lf_comment = NULL;
1149 		lfs.lf_comlen = 0;
1150 	}
1151 
1152 	/* do error lock cleanup */
1153 	if (errlck == UN_ERRLCK)
1154 		ufsfx_unlockfs(ufsvfsp);
1155 
1156 	else if (errlck == RE_ERRLCK)
1157 		ufsfx_lockfs(ufsvfsp);
1158 
1159 	/* don't allow error lock from user to invoke panic */
1160 	else if (from_user && errlck == SET_ERRLCK &&
1161 	    !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
1162 		(void) ufs_fault(ufsvfsp->vfs_root,
1163 		    ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
1164 		    ulp->ul_lockfs.lf_comment: "user-applied error lock");
1165 
1166 	atomic_add_long(&ufs_quiesce_pend, -1);
1167 	mutex_exit(&ulp->ul_lock);
1168 	vfs_unlock(vfsp);
1169 
1170 	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
1171 		poll_events |= POLLERR;
1172 
1173 	pollwakeup(&ufs_pollhd, poll_events);
1174 
1175 	/*
1176 	 * Allow both the delete thread and the reclaim thread to
1177 	 * continue.
1178 	 */
1179 	ufs_thread_continue(&ufsvfsp->vfs_delete);
1180 	ufs_thread_continue(&ufsvfsp->vfs_reclaim);
1181 
1182 	return (0);
1183 
1184 errout:
1185 	/*
1186 	 * Lock failed. Reset the old lock in ufsvfs if not hard locked.
1187 	 */
1188 	if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
1189 		bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
1190 		ulp->ul_fs_lock = (1 << lfs.lf_lock);
1191 	}
1192 
1193 	/*
1194 	 * Don't call ufs_thaw() when there's a signal during
1195 	 * ufs quiesce operation as it can lead to deadlock
1196 	 * with getpage.
1197 	 */
1198 	if (signal == 0)
1199 		(void) ufs_thaw(vfsp, ufsvfsp, ulp);
1200 
1201 	ULOCKFS_CLR_BUSY(ulp);
1202 	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);
1203 
1204 errexit:
1205 	atomic_add_long(&ufs_quiesce_pend, -1);
1206 	mutex_exit(&ulp->ul_lock);
1207 	vfs_unlock(vfsp);
1208 
1209 	/*
1210 	 * Allow both the delete thread and the reclaim thread to
1211 	 * continue.
1212 	 */
1213 	ufs_thread_continue(&ufsvfsp->vfs_delete);
1214 	ufs_thread_continue(&ufsvfsp->vfs_reclaim);
1215 
1216 	return (error);
1217 }
1218 
1219 /*
1220  * fiolfss
1221  * 	return the current file system locking state info
1222  */
1223 int
1224 ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
1225 {
1226 	struct ulockfs	*ulp;
1227 
1228 	if (!vp || !vp->v_vfsp || !VTOI(vp))
1229 		return (EINVAL);
1230 
1231 	/* file system has been forcibly unmounted */
1232 	if (VTOI(vp)->i_ufsvfs == NULL)
1233 		return (EIO);
1234 
1235 	ulp = VTOUL(vp);
1236 
1237 	if (ULOCKFS_IS_HLOCK(ulp)) {
1238 		*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1239 		return (0);
1240 	}
1241 
1242 	mutex_enter(&ulp->ul_lock);
1243 
1244 	*lockfsp = ulp->ul_lockfs;	/* structure assignment */
1245 
1246 	if (ULOCKFS_IS_MOD(ulp))
1247 		lockfsp->lf_flags |= LOCKFS_MOD;
1248 
1249 	mutex_exit(&ulp->ul_lock);
1250 
1251 	return (0);
1252 }
1253 
1254 /*
1255  * ufs_check_lockfs
1256  *	check whether a ufs_vnops conflicts with the file system lock
1257  */
1258 int
1259 ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
1260 {
1261 	k_sigset_t	smask;
1262 	int		sig, slock;
1263 
1264 	ASSERT(MUTEX_HELD(&ulp->ul_lock));
1265 
1266 	while (ulp->ul_fs_lock & mask) {
1267 		slock = (int)ULOCKFS_IS_SLOCK(ulp);
1268 		if ((curthread->t_flag & T_DONTPEND) && !slock) {
1269 			curthread->t_flag |= T_WOULDBLOCK;
1270 			return (EAGAIN);
1271 		}
1272 		curthread->t_flag &= ~T_WOULDBLOCK;
1273 
1274 		/*
1275 		 * In the case of an onerr umount of the fs, threads could
1276 		 * have blocked before coming into ufs_check_lockfs and
1277 		 * need to check for the special case of ELOCK and
1278 		 * vfs_dontblock being set which would indicate that the fs
1279 		 * is on its way out and will not return therefore making
1280 		 * EIO the appropriate response.
1281 		 */
1282 		if (ULOCKFS_IS_HLOCK(ulp) ||
1283 		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1284 			return (EIO);
1285 
1286 		/*
1287 		 * wait for lock status to change
1288 		 */
1289 		if (slock || ufsvfsp->vfs_nointr) {
1290 			cv_wait(&ulp->ul_cv, &ulp->ul_lock);
1291 		} else {
1292 			sigintr(&smask, 1);
1293 			sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
1294 			sigunintr(&smask);
1295 			if ((!sig && (ulp->ul_fs_lock & mask)) ||
1296 			    ufsvfsp->vfs_dontblock)
1297 				return (EINTR);
1298 		}
1299 	}
1300 
1301 	if (mask & ULOCKFS_FWLOCK) {
1302 		atomic_add_long(&ulp->ul_falloc_cnt, 1);
1303 		ULOCKFS_SET_FALLOC(ulp);
1304 	} else {
1305 		atomic_add_long(&ulp->ul_vnops_cnt, 1);
1306 	}
1307 
1308 	return (0);
1309 }
1310 
1311 /*
1312  * Check whether we came across the handcrafted lockfs protocol path. We can't
1313  * simply check for T_DONTBLOCK here as one would assume since this can also
1314  * falsely catch recursive VOP's going to a different filesystem, instead we
1315  * check if we already hold the ulockfs->ul_lock mutex.
1316  */
1317 static int
1318 ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
1319 {
1320 	return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
1321 }
1322 
1323 /*
1324  * ufs_lockfs_begin - start the lockfs locking protocol
1325  */
1326 int
1327 ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1328 {
1329 	int 		error;
1330 	int		rec_vop;
1331 	ushort_t	op_cnt_incremented = 0;
1332 	ulong_t		*ctr;
1333 	struct ulockfs *ulp;
1334 	ulockfs_info_t	*ulockfs_info;
1335 	ulockfs_info_t	*ulockfs_info_free;
1336 	ulockfs_info_t	*ulockfs_info_temp;
1337 
1338 	/*
1339 	 * file system has been forcibly unmounted
1340 	 */
1341 	if (ufsvfsp == NULL)
1342 		return (EIO);
1343 
1344 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1345 
1346 	/*
1347 	 * Do lockfs protocol
1348 	 */
1349 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1350 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1351 
1352 	/*
1353 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1354 	 * path and bail out in that case.
1355 	 */
1356 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1357 		*ulpp = NULL;
1358 		return (0);
1359 	} else {
1360 		if (ulockfs_info_free == NULL) {
1361 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1362 			    kmem_zalloc(sizeof (ulockfs_info_t),
1363 			    KM_NOSLEEP)) == NULL) {
1364 				*ulpp = NULL;
1365 				return (ENOMEM);
1366 			}
1367 		}
1368 	}
1369 
1370 	/*
1371 	 * First time VOP call
1372 	 *
1373 	 * Increment the ctr irrespective of the lockfs state. If the lockfs
1374 	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
1375 	 * before incrementing we need to check if there is a pending quiesce
1376 	 * request because if we have a continuous stream of ufs_lockfs_begin
1377 	 * requests pounding on a few cpu's then the ufs_quiesce thread might
1378 	 * never see the value of zero for ctr - a livelock kind of scenario.
1379 	 */
1380 	ctr = (mask & ULOCKFS_FWLOCK) ?
1381 	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
1382 	if (!ULOCKFS_IS_SLOCK(ulp)) {
1383 		atomic_add_long(ctr, 1);
1384 		op_cnt_incremented++;
1385 	}
1386 
1387 	/*
1388 	 * If the lockfs state (indicated by ul_fs_lock) is not just
1389 	 * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs
1390 	 * where there is a check with an appropriate mask to selectively allow
1391 	 * operations permitted for that kind of lockfs state.
1392 	 *
1393 	 * Even these selective operations should not be allowed to go through
1394 	 * if a lockfs request is in progress because that could result in inode
1395 	 * modifications during a quiesce and could hence result in inode
1396 	 * reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient,
1397 	 * so make use of ufs_quiesce_pend to disallow vnode operations when a
1398 	 * quiesce is in progress.
1399 	 */
1400 	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1401 		if (op_cnt_incremented)
1402 			if (!atomic_add_long_nv(ctr, -1))
1403 				cv_broadcast(&ulp->ul_cv);
1404 		mutex_enter(&ulp->ul_lock);
1405 		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1406 		mutex_exit(&ulp->ul_lock);
1407 		if (error) {
1408 			if (ulockfs_info_free == NULL)
1409 				kmem_free(ulockfs_info_temp,
1410 				    sizeof (ulockfs_info_t));
1411 			return (error);
1412 		}
1413 	} else {
1414 		/*
1415 		 * This is the common case of file system in a unlocked state.
1416 		 *
1417 		 * If a file system is unlocked, we would expect the ctr to have
1418 		 * been incremented by now. But this will not be true when a
1419 		 * quiesce is winding up - SLOCK was set when we checked before
1420 		 * incrementing the ctr, but by the time we checked for
1421 		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is okay
1422 		 * to take ul_lock and go through the slow path in this uncommon
1423 		 * case.
1424 		 */
1425 		if (op_cnt_incremented == 0) {
1426 			mutex_enter(&ulp->ul_lock);
1427 			error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1428 			if (error) {
1429 				mutex_exit(&ulp->ul_lock);
1430 				if (ulockfs_info_free == NULL)
1431 					kmem_free(ulockfs_info_temp,
1432 					    sizeof (ulockfs_info_t));
1433 				return (error);
1434 			}
1435 			if (mask & ULOCKFS_FWLOCK)
1436 				ULOCKFS_SET_FALLOC(ulp);
1437 			mutex_exit(&ulp->ul_lock);
1438 		} else if (mask & ULOCKFS_FWLOCK) {
1439 			mutex_enter(&ulp->ul_lock);
1440 			ULOCKFS_SET_FALLOC(ulp);
1441 			mutex_exit(&ulp->ul_lock);
1442 		}
1443 	}
1444 
1445 	if (ulockfs_info_free != NULL) {
1446 		ulockfs_info_free->ulp = ulp;
1447 		if (mask & ULOCKFS_FWLOCK)
1448 			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1449 	} else {
1450 		ulockfs_info_temp->ulp = ulp;
1451 		ulockfs_info_temp->next = ulockfs_info;
1452 		if (mask & ULOCKFS_FWLOCK)
1453 			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1454 		ASSERT(ufs_lockfs_key != 0);
1455 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1456 	}
1457 
1458 	curthread->t_flag |= T_DONTBLOCK;
1459 	return (0);
1460 }
1461 
1462 /*
1463  * Check whether we are returning from the top level VOP.
1464  */
1465 static int
1466 ufs_lockfs_top_vop_return(ulockfs_info_t *head)
1467 {
1468 	ulockfs_info_t *info;
1469 	int result = 1;
1470 
1471 	for (info = head; info != NULL; info = info->next) {
1472 		if (info->ulp != NULL) {
1473 			result = 0;
1474 			break;
1475 		}
1476 	}
1477 
1478 	return (result);
1479 }
1480 
1481 /*
1482  * ufs_lockfs_end - terminate the lockfs locking protocol
1483  */
1484 void
1485 ufs_lockfs_end(struct ulockfs *ulp)
1486 {
1487 	ulockfs_info_t *info;
1488 	ulockfs_info_t *head;
1489 
1490 	/*
1491 	 * end-of-VOP protocol
1492 	 */
1493 	if (ulp == NULL)
1494 		return;
1495 
1496 	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1497 	SEARCH_ULOCKFSP(head, ulp, info);
1498 
1499 	/*
1500 	 * If we're called from a first level VOP, we have to have a
1501 	 * valid ulockfs record in the TSD.
1502 	 */
1503 	ASSERT(info != NULL);
1504 
1505 	/*
1506 	 * Invalidate the ulockfs record.
1507 	 */
1508 	info->ulp = NULL;
1509 
1510 	if (ufs_lockfs_top_vop_return(head))
1511 		curthread->t_flag &= ~T_DONTBLOCK;
1512 
1513 	/* fallocate thread */
1514 	if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) {
1515 		/* Clear the thread's fallocate state */
1516 		info->flags &= ~ULOCK_INFO_FALLOCATE;
1517 		if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1)) {
1518 			mutex_enter(&ulp->ul_lock);
1519 			ULOCKFS_CLR_FALLOC(ulp);
1520 			cv_broadcast(&ulp->ul_cv);
1521 			mutex_exit(&ulp->ul_lock);
1522 		}
1523 	} else  { /* normal thread */
1524 		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
1525 			cv_broadcast(&ulp->ul_cv);
1526 	}
1527 }
1528 
1529 /*
1530  * ufs_lockfs_trybegin - try to start the lockfs locking protocol without
1531  * blocking.
1532  */
1533 int
1534 ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1535 {
1536 	int 		error = 0;
1537 	int		rec_vop;
1538 	ushort_t	op_cnt_incremented = 0;
1539 	ulong_t		*ctr;
1540 	struct ulockfs *ulp;
1541 	ulockfs_info_t	*ulockfs_info;
1542 	ulockfs_info_t	*ulockfs_info_free;
1543 	ulockfs_info_t	*ulockfs_info_temp;
1544 
1545 	/*
1546 	 * file system has been forcibly unmounted
1547 	 */
1548 	if (ufsvfsp == NULL)
1549 		return (EIO);
1550 
1551 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1552 
1553 	/*
1554 	 * Do lockfs protocol
1555 	 */
1556 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1557 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1558 
1559 	/*
1560 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1561 	 * path and bail out in that case.
1562 	 */
1563 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1564 		*ulpp = NULL;
1565 		return (0);
1566 	} else {
1567 		if (ulockfs_info_free == NULL) {
1568 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1569 			    kmem_zalloc(sizeof (ulockfs_info_t),
1570 			    KM_NOSLEEP)) == NULL) {
1571 				*ulpp = NULL;
1572 				return (ENOMEM);
1573 			}
1574 		}
1575 	}
1576 
1577 	/*
1578 	 * First time VOP call
1579 	 *
1580 	 * Increment the ctr irrespective of the lockfs state. If the lockfs
1581 	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
1582 	 * before incrementing we need to check if there is a pending quiesce
1583 	 * request because if we have a continuous stream of ufs_lockfs_begin
1584 	 * requests pounding on a few cpu's then the ufs_quiesce thread might
1585 	 * never see the value of zero for ctr - a livelock kind of scenario.
1586 	 */
1587 	ctr = (mask & ULOCKFS_FWLOCK) ?
1588 	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
1589 	if (!ULOCKFS_IS_SLOCK(ulp)) {
1590 		atomic_add_long(ctr, 1);
1591 		op_cnt_incremented++;
1592 	}
1593 
1594 	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1595 		/*
1596 		 * Non-blocking version of ufs_check_lockfs() code.
1597 		 *
1598 		 * If the file system is not hard locked or error locked
1599 		 * and if ulp->ul_fs_lock allows this operation, increment
1600 		 * the appropriate counter and proceed (For eg., In case the
1601 		 * file system is delete locked, a mmap can still go through).
1602 		 */
1603 		if (op_cnt_incremented)
1604 			if (!atomic_add_long_nv(ctr, -1))
1605 				cv_broadcast(&ulp->ul_cv);
1606 		mutex_enter(&ulp->ul_lock);
1607 		if (ULOCKFS_IS_HLOCK(ulp) ||
1608 		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1609 			error = EIO;
1610 		else if (ulp->ul_fs_lock & mask)
1611 			error = EAGAIN;
1612 
1613 		if (error) {
1614 			mutex_exit(&ulp->ul_lock);
1615 			if (ulockfs_info_free == NULL)
1616 				kmem_free(ulockfs_info_temp,
1617 				    sizeof (ulockfs_info_t));
1618 			return (error);
1619 		}
1620 		atomic_add_long(ctr, 1);
1621 		if (mask & ULOCKFS_FWLOCK)
1622 			ULOCKFS_SET_FALLOC(ulp);
1623 		mutex_exit(&ulp->ul_lock);
1624 	} else {
1625 		/*
1626 		 * This is the common case of file system in a unlocked state.
1627 		 *
1628 		 * If a file system is unlocked, we would expect the ctr to have
1629 		 * been incremented by now. But this will not be true when a
1630 		 * quiesce is winding up - SLOCK was set when we checked before
1631 		 * incrementing the ctr, but by the time we checked for
1632 		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take
1633 		 * ul_lock and go through the non-blocking version of
1634 		 * ufs_check_lockfs() code.
1635 		 */
1636 		if (op_cnt_incremented == 0) {
1637 			mutex_enter(&ulp->ul_lock);
1638 			if (ULOCKFS_IS_HLOCK(ulp) ||
1639 			    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1640 				error = EIO;
1641 			else if (ulp->ul_fs_lock & mask)
1642 				error = EAGAIN;
1643 
1644 			if (error) {
1645 				mutex_exit(&ulp->ul_lock);
1646 				if (ulockfs_info_free == NULL)
1647 					kmem_free(ulockfs_info_temp,
1648 					    sizeof (ulockfs_info_t));
1649 				return (error);
1650 			}
1651 			atomic_add_long(ctr, 1);
1652 			if (mask & ULOCKFS_FWLOCK)
1653 				ULOCKFS_SET_FALLOC(ulp);
1654 			mutex_exit(&ulp->ul_lock);
1655 		} else if (mask & ULOCKFS_FWLOCK) {
1656 			mutex_enter(&ulp->ul_lock);
1657 			ULOCKFS_SET_FALLOC(ulp);
1658 			mutex_exit(&ulp->ul_lock);
1659 		}
1660 	}
1661 
1662 	if (ulockfs_info_free != NULL) {
1663 		ulockfs_info_free->ulp = ulp;
1664 		if (mask & ULOCKFS_FWLOCK)
1665 			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1666 	} else {
1667 		ulockfs_info_temp->ulp = ulp;
1668 		ulockfs_info_temp->next = ulockfs_info;
1669 		if (mask & ULOCKFS_FWLOCK)
1670 			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1671 		ASSERT(ufs_lockfs_key != 0);
1672 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1673 	}
1674 
1675 	curthread->t_flag |= T_DONTBLOCK;
1676 	return (0);
1677 }
1678 
1679 /*
1680  * specialized version of ufs_lockfs_begin() called by ufs_getpage().
1681  */
1682 int
1683 ufs_lockfs_begin_getpage(
1684 	struct ufsvfs	*ufsvfsp,
1685 	struct ulockfs	**ulpp,
1686 	struct seg	*seg,
1687 	int		read_access,
1688 	uint_t		*protp)
1689 {
1690 	ulong_t			mask;
1691 	int 			error;
1692 	int			rec_vop;
1693 	struct ulockfs		*ulp;
1694 	ulockfs_info_t		*ulockfs_info;
1695 	ulockfs_info_t		*ulockfs_info_free;
1696 	ulockfs_info_t		*ulockfs_info_temp;
1697 
1698 	/*
1699 	 * file system has been forcibly unmounted
1700 	 */
1701 	if (ufsvfsp == NULL)
1702 		return (EIO);
1703 
1704 	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1705 
1706 	/*
1707 	 * Do lockfs protocol
1708 	 */
1709 	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1710 	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1711 
1712 	/*
1713 	 * Detect recursive VOP call or handcrafted internal lockfs protocol
1714 	 * path and bail out in that case.
1715 	 */
1716 	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1717 		*ulpp = NULL;
1718 		return (0);
1719 	} else {
1720 		if (ulockfs_info_free == NULL) {
1721 			if ((ulockfs_info_temp = (ulockfs_info_t *)
1722 			    kmem_zalloc(sizeof (ulockfs_info_t),
1723 			    KM_NOSLEEP)) == NULL) {
1724 				*ulpp = NULL;
1725 				return (ENOMEM);
1726 			}
1727 		}
1728 	}
1729 
1730 	/*
1731 	 * First time VOP call
1732 	 */
1733 	atomic_add_long(&ulp->ul_vnops_cnt, 1);
1734 	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1735 		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
1736 			cv_broadcast(&ulp->ul_cv);
1737 		mutex_enter(&ulp->ul_lock);
1738 		if (seg->s_ops == &segvn_ops &&
1739 		    ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
1740 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1741 		} else if (protp && read_access) {
1742 			/*
1743 			 * Restrict the mapping to readonly.
1744 			 * Writes to this mapping will cause
1745 			 * another fault which will then
1746 			 * be suspended if fs is write locked
1747 			 */
1748 			*protp &= ~PROT_WRITE;
1749 			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1750 		} else
1751 			mask = (ulong_t)ULOCKFS_GETWRITE_MASK;
1752 
1753 		/*
1754 		 * will sleep if this fs is locked against this VOP
1755 		 */
1756 		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1757 		mutex_exit(&ulp->ul_lock);
1758 		if (error) {
1759 			if (ulockfs_info_free == NULL)
1760 				kmem_free(ulockfs_info_temp,
1761 				    sizeof (ulockfs_info_t));
1762 			return (error);
1763 		}
1764 	}
1765 
1766 	if (ulockfs_info_free != NULL) {
1767 		ulockfs_info_free->ulp = ulp;
1768 	} else {
1769 		ulockfs_info_temp->ulp = ulp;
1770 		ulockfs_info_temp->next = ulockfs_info;
1771 		ASSERT(ufs_lockfs_key != 0);
1772 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1773 	}
1774 
1775 	curthread->t_flag |= T_DONTBLOCK;
1776 	return (0);
1777 }
1778 
1779 void
1780 ufs_lockfs_tsd_destructor(void *head)
1781 {
1782 	ulockfs_info_t *curr = (ulockfs_info_t *)head;
1783 	ulockfs_info_t *temp;
1784 
1785 	for (; curr != NULL; ) {
1786 		/*
1787 		 * The TSD destructor is being called when the thread exits
1788 		 * (via thread_exit()). At that time it must have cleaned up
1789 		 * all VOPs via ufs_lockfs_end() and there must not be a
1790 		 * valid ulockfs record exist while a thread is exiting.
1791 		 */
1792 		temp = curr;
1793 		curr = curr->next;
1794 		ASSERT(temp->ulp == NULL);
1795 		kmem_free(temp, sizeof (ulockfs_info_t));
1796 	}
1797 }
1798