1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 #include <sys/types.h>
26 #include <sys/t_lock.h>
27 #include <sys/param.h>
28 #include <sys/time.h>
29 #include <sys/systm.h>
30 #include <sys/sysmacros.h>
31 #include <sys/resource.h>
32 #include <sys/signal.h>
33 #include <sys/cred.h>
34 #include <sys/user.h>
35 #include <sys/buf.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/proc.h>
39 #include <sys/disp.h>
40 #include <sys/file.h>
41 #include <sys/fcntl.h>
42 #include <sys/flock.h>
43 #include <sys/atomic.h>
44 #include <sys/kmem.h>
45 #include <sys/uio.h>
46 #include <sys/conf.h>
47 #include <sys/mman.h>
48 #include <sys/pathname.h>
49 #include <sys/debug.h>
50 #include <sys/vmsystm.h>
51 #include <sys/cmn_err.h>
52 #include <sys/acct.h>
53 #include <sys/dnlc.h>
54 #include <sys/swap.h>
55
56 #include <sys/fs/ufs_fs.h>
57 #include <sys/fs/ufs_inode.h>
58 #include <sys/fs/ufs_fsdir.h>
59 #include <sys/fs/ufs_trans.h>
60 #include <sys/fs/ufs_panic.h>
61 #include <sys/fs/ufs_mount.h>
62 #include <sys/fs/ufs_bio.h>
63 #include <sys/fs/ufs_log.h>
64 #include <sys/fs/ufs_quota.h>
65 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! */
66 #include <sys/errno.h>
67 #include <sys/sysinfo.h>
68
69 #include <vm/hat.h>
70 #include <vm/pvn.h>
71 #include <vm/as.h>
72 #include <vm/seg.h>
73 #include <vm/seg_map.h>
74 #include <vm/seg_vn.h>
75 #include <vm/rm.h>
76 #include <vm/anon.h>
77 #include <sys/swap.h>
78 #include <sys/dnlc.h>
79
80 extern struct vnode *common_specvp(struct vnode *vp);
81
82 /* error lock status */
83 #define UN_ERRLCK (-1)
84 #define SET_ERRLCK 1
85 #define RE_ERRLCK 2
86 #define NO_ERRLCK 0
87
88 /*
89 * Index to be used in TSD for storing lockfs data
90 */
91 uint_t ufs_lockfs_key;
92
93 typedef struct _ulockfs_info {
94 struct _ulockfs_info *next;
95 struct ulockfs *ulp;
96 uint_t flags;
97 } ulockfs_info_t;
98
99 #define ULOCK_INFO_FALLOCATE 0x00000001 /* fallocate thread */
100
101 /*
102 * Check in TSD whether we are already doing any VOP on this filesystem
103 */
104 #define IS_REC_VOP(found, head, ulp, free) \
105 { \
106 ulockfs_info_t *_curr; \
107 \
108 for (found = 0, free = NULL, _curr = head; \
109 _curr != NULL; _curr = _curr->next) { \
110 if ((free == NULL) && \
111 (_curr->ulp == NULL)) \
112 free = _curr; \
113 if (_curr->ulp == ulp) { \
114 found = 1; \
115 break; \
116 } \
117 } \
118 }
119
120 /*
121 * Get the lockfs data from TSD so that lockfs handles recursive VOPs
122 * properly
123 */
124 #define SEARCH_ULOCKFSP(head, ulp, info) \
125 { \
126 ulockfs_info_t *_curr; \
127 \
128 for (_curr = head; _curr != NULL; \
129 _curr = _curr->next) { \
130 if (_curr->ulp == ulp) { \
131 break; \
132 } \
133 } \
134 \
135 info = _curr; \
136 }
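/*
 * Usage sketch for the two TSD macros above (an illustrative comment
 * only, modeled on their callers later in this file; "found",
 * "free_slot" and "info" are names local to the example):
 *
 *	struct ulockfs *ulp = &ufsvfsp->vfs_ulockfs;
 *	ulockfs_info_t *head, *free_slot, *info;
 *	int found;
 *
 *	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
 *	IS_REC_VOP(found, head, ulp, free_slot);
 *	if (found)
 *		return (0);	-- recursive VOP on this fs; bail out
 *	SEARCH_ULOCKFSP(head, ulp, info);	-- info is NULL if absent
 */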
137
138 /*
139 * Validate lockfs request
140 */
141 static int
142 ufs_getlfd(
143 struct lockfs *lockfsp, /* new lock request */
144 struct lockfs *ul_lockfsp) /* old lock state */
145 {
146 int error = 0;
147
148 /*
149 * no input flags defined
150 */
151 if (lockfsp->lf_flags != 0) {
152 error = EINVAL;
153 goto errout;
154 }
155
156 /*
157 * check key
158 */
159 if (!LOCKFS_IS_ULOCK(ul_lockfsp))
160 if (lockfsp->lf_key != ul_lockfsp->lf_key) {
161 error = EINVAL;
162 goto errout;
163 }
164
165 lockfsp->lf_key = ul_lockfsp->lf_key + 1;
166
167 errout:
168 return (error);
169 }
170
171 /*
172 * ufs_checkaccton
173 * check if accounting is turned on for this fs
174 */
175
176 int
177 ufs_checkaccton(struct vnode *vp)
178 {
179 if (acct_fs_in_use(vp))
180 return (EDEADLK);
181 return (0);
182 }
183
184 /*
185 * ufs_checkswapon
186 * check if local swapping is to a file on this fs
187 */
188 int
189 ufs_checkswapon(struct vnode *vp)
190 {
191 struct swapinfo *sip;
192
193 mutex_enter(&swapinfo_lock);
194 for (sip = swapinfo; sip; sip = sip->si_next)
195 if (sip->si_vp->v_vfsp == vp->v_vfsp) {
196 mutex_exit(&swapinfo_lock);
197 return (EDEADLK);
198 }
199 mutex_exit(&swapinfo_lock);
200 return (0);
201 }
202
203 /*
204 * ufs_freeze
205 * pend future accesses for current lock and desired lock
206 */
207 void
208 ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
209 {
210 /*
211 * set to new lock type
212 */
213 ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
214 ulp->ul_lockfs.lf_key = lockfsp->lf_key;
215 ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
216 ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;
217
218 ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
219 }
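/*
 * Note the encoding above: ul_fs_lock is kept as a one-bit mask,
 * (1 << lf_lock), so the per-operation masks passed to
 * ufs_check_lockfs() below can detect a conflicting lock with a
 * single bitwise AND (sketch):
 *
 *	if (ulp->ul_fs_lock & mask)
 *		-- conflict: wait for, or fail, the vnode operation
 */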
220
221 /*
222 * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
223 * starting ufs_quiesce() protocol and decrement it only when a file system no
224 * longer has to be in quiescent state. This allows ufs_pageio() to detect
225 * that another thread wants to quiesce a file system. See more comments in
226 * ufs_pageio().
227 */
228 ulong_t ufs_quiesce_pend = 0;
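/*
 * Sketch of that caller protocol, modeled on ufs__fiolfs() below
 * (error handling omitted):
 *
 *	mutex_enter(&ulp->ul_lock);
 *	atomic_inc_ulong(&ufs_quiesce_pend);
 *	error = ufs_quiesce(ulp);
 *	-- file system is now quiescent: flush/reconcile/relock --
 *	atomic_dec_ulong(&ufs_quiesce_pend);
 *	mutex_exit(&ulp->ul_lock);
 */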
229
230 /*
231 * ufs_quiesce
232 * wait for outstanding accesses to finish
233 */
234 int
235 ufs_quiesce(struct ulockfs *ulp)
236 {
237 int error = 0;
238 ulockfs_info_t *head;
239 ulockfs_info_t *info;
240 klwp_t *lwp = ttolwp(curthread);
241
242 head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
243 SEARCH_ULOCKFSP(head, ulp, info);
244
245 /*
246 * We have to keep /proc away from stopping us after we applied
247 * the softlock but before we got a chance to clear it again.
248 * prstop() may pagefault and become stuck on the softlock still
249 * pending.
250 */
251 if (lwp != NULL)
252 lwp->lwp_nostop++;
253
254 /*
255 * Set a softlock to suspend future ufs_vnops so that
256 * this lockfs request will not be starved
257 */
258 ULOCKFS_SET_SLOCK(ulp);
259 ASSERT(ufs_quiesce_pend);
260
261 /* check if there are any outstanding ufs vnodeop calls */
262 while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
263 /*
264 * use timed version of cv_wait_sig() to make sure we don't
265 * miss a wake up call from ufs_pageio() when it doesn't use
266 * ul_lock.
267 *
268 * when a fallocate thread comes in, the only way it returns
269 * from this function is if there are no other vnode operations
270 * going on (remember fallocate threads are tracked using
271 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
272 * hasn't already grabbed the fs write lock.
273 */
274 if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
275 if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
276 goto out;
277 }
278 if (!cv_reltimedwait_sig(&ulp->ul_cv, &ulp->ul_lock, hz,
279 TR_CLOCK_TICK)) {
280 error = EINTR;
281 goto out;
282 }
283 }
284
285 out:
286 /*
287 * unlock the soft lock
288 */
289 ULOCKFS_CLR_SLOCK(ulp);
290
291 if (lwp != NULL)
292 lwp->lwp_nostop--;
293
294 return (error);
295 }
296
297 /*
298 * ufs_flush_inode
299 */
300 int
301 ufs_flush_inode(struct inode *ip, void *arg)
302 {
303 int error;
304 int saverror = 0;
305
306 /*
307 * wrong file system; keep looking
308 */
309 if (ip->i_ufsvfs != (struct ufsvfs *)arg)
310 return (0);
311
312 /*
313 * asynchronously push all the dirty pages
314 */
315 if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
316 (error != EAGAIN))
317 saverror = error;
318 /*
319 * wait for io and discard all mappings
320 */
321 if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
322 saverror = error;
323
324 if (ITOV(ip)->v_type == VDIR) {
325 dnlc_dir_purge(&ip->i_danchor);
326 }
327
328 return (saverror);
329 }
330
331 /*
332 * ufs_flush
333 * Flush everything that is currently dirty; this includes invalidating
334 * any mappings.
335 */
336 int
337 ufs_flush(struct vfs *vfsp)
338 {
339 int error;
340 int saverror = 0;
341 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
342 struct fs *fs = ufsvfsp->vfs_fs;
343 int tdontblock = 0;
344
345 ASSERT(vfs_lock_held(vfsp));
346
347 /*
348 * purge dnlc
349 */
350 (void) dnlc_purge_vfsp(vfsp, 0);
351
352 /*
353 * drain the delete and idle threads
354 */
355 ufs_delete_drain(vfsp, 0, 0);
356 ufs_idle_drain(vfsp);
357
358 /*
359 * flush and invalidate quota records
360 */
361 (void) qsync(ufsvfsp);
362
363 /*
364 * flush w/invalidate the inodes for vfsp
365 */
366 if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
367 saverror = error;
368
369 /*
370 * synchronously flush superblock and summary info
371 */
372 if (fs->fs_ronly == 0 && fs->fs_fmod) {
373 fs->fs_fmod = 0;
374 TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
375 }
376 /*
377 * flush w/invalidate block device pages and buf cache
378 */
379 if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
380 (offset_t)0, 0, B_INVAL, CRED(), NULL)) > 0)
381 saverror = error;
382
383 (void) bflush((dev_t)vfsp->vfs_dev);
384 (void) bfinval((dev_t)vfsp->vfs_dev, 0);
385
386 /*
387 * drain the delete and idle threads again
388 */
389 ufs_delete_drain(vfsp, 0, 0);
390 ufs_idle_drain(vfsp);
391
392 /*
393 * play with the clean flag
394 */
395 if (saverror == 0)
396 ufs_checkclean(vfsp);
397
398 /*
399 * Flush any outstanding transactions and roll the log
400 * only if we are supposed to, i.e. LDL_NOROLL is not set.
401 * We cannot simply check for fs_ronly here since fsck may also
402 * use this code to roll the log on a read-only filesystem, e.g.
403 * the root during early stages of boot; if anything other than a
404 * sanity check is done, fsck will have cleared LDL_NOROLL before.
405 * In addition we assert that the deltamap does not contain any deltas
406 * in case LDL_NOROLL is set since this is not supposed to happen.
407 */
408 if (TRANS_ISTRANS(ufsvfsp)) {
409 ml_unit_t *ul = ufsvfsp->vfs_log;
410 mt_map_t *mtm = ul->un_deltamap;
411
412 if (ul->un_flags & LDL_NOROLL) {
413 ASSERT(mtm->mtm_nme == 0);
414 } else {
415 /*
416 * Do not set T_DONTBLOCK if there is a
417 * transaction opened by caller.
418 */
419 if (curthread->t_flag & T_DONTBLOCK)
420 tdontblock = 1;
421 else
422 curthread->t_flag |= T_DONTBLOCK;
423
424 TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
425 TOP_COMMIT_SIZE, error);
426
427 if (!error) {
428 TRANS_END_SYNC(ufsvfsp, saverror,
429 TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
430 }
431
432 if (tdontblock == 0)
433 curthread->t_flag &= ~T_DONTBLOCK;
434
435 logmap_roll_dev(ufsvfsp->vfs_log);
436 }
437 }
438
439 return (saverror);
440 }
441
442 /*
443 * ufs_thaw_wlock
444 * special processing when thawing down to wlock
445 */
446 static int
447 ufs_thaw_wlock(struct inode *ip, void *arg)
448 {
449 /*
450 * wrong file system; keep looking
451 */
452 if (ip->i_ufsvfs != (struct ufsvfs *)arg)
453 return (0);
454
455 /*
456 * iupdat refuses to clear flags if the fs is read only. The fs
457 * may become read/write during the lock and we wouldn't want
458 * these inodes being written to disk. So clear the flags.
459 */
460 rw_enter(&ip->i_contents, RW_WRITER);
461 ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
462 rw_exit(&ip->i_contents);
463
464 /*
465 * pages are mlocked -- fail wlock
466 */
467 if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
468 return (EBUSY);
469
470 return (0);
471 }
472
473 /*
474 * ufs_thaw_hlock
475 * special processing when thawing down to hlock or elock
476 */
477 static int
478 ufs_thaw_hlock(struct inode *ip, void *arg)
479 {
480 struct vnode *vp = ITOV(ip);
481
482 /*
483 * wrong file system; keep looking
484 */
485 if (ip->i_ufsvfs != (struct ufsvfs *)arg)
486 return (0);
487
488 /*
489 * blow away all pages - even if they are mlocked
490 */
491 do {
492 (void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
493 } while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
494 rw_enter(&ip->i_contents, RW_WRITER);
495 ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
496 rw_exit(&ip->i_contents);
497
498 return (0);
499 }
500
501 /*
502 * ufs_thaw
503 * thaw file system lock down to current value
504 */
505 int
506 ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
507 {
508 int error = 0;
509 int noidel = (int)(ulp->ul_flag & ULOCKFS_NOIDEL);
510
511 /*
512 * if wlock or hlock or elock
513 */
514 if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
515 ULOCKFS_IS_ELOCK(ulp)) {
516
517 /*
518 * don't keep access times
519 * don't free deleted files
520 * if superblock writes are allowed, limit them to this thread for now
521 */
522 ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
523 if (ulp->ul_sbowner != (kthread_id_t)-1)
524 ulp->ul_sbowner = curthread;
525
526 /*
527 * wait for writes for deleted files and superblock updates
528 */
529 (void) ufs_flush(vfsp);
530
531 /*
532 * now make sure the quota file is up-to-date
533 * expensive, but effective
534 */
535 error = ufs_flush(vfsp);
536 /*
537 * no one can write the superblock
538 */
539 ulp->ul_sbowner = (kthread_id_t)-1;
540
541 /*
542 * special processing for wlock/hlock/elock
543 */
544 if (ULOCKFS_IS_WLOCK(ulp)) {
545 if (error)
546 goto errout;
547 error = bfinval(ufsvfsp->vfs_dev, 0);
548 if (error)
549 goto errout;
550 error = ufs_scan_inodes(0, ufs_thaw_wlock,
551 (void *)ufsvfsp, ufsvfsp);
552 if (error)
553 goto errout;
554 }
555 if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
556 error = 0;
557 (void) ufs_scan_inodes(0, ufs_thaw_hlock,
558 (void *)ufsvfsp, ufsvfsp);
559 (void) bfinval(ufsvfsp->vfs_dev, 1);
560 }
561 } else {
562
563 /*
564 * okay to keep access times
565 * okay to free deleted files
566 * okay to write the superblock
567 */
568 ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
569 ulp->ul_sbowner = NULL;
570
571 /*
572 * flush in case deleted files are in memory
573 */
574 if (noidel) {
575 if (error = ufs_flush(vfsp))
576 goto errout;
577 }
578 }
579
580 errout:
581 cv_broadcast(&ulp->ul_cv);
582 return (error);
583 }
584
585 /*
586 * ufs_reconcile_fs
587 * reconcile incore superblock with ondisk superblock
588 */
589 int
590 ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
591 {
592 struct fs *mfs; /* in-memory superblock */
593 struct fs *dfs; /* on-disk superblock */
594 struct buf *bp; /* on-disk superblock buf */
595 int needs_unlock;
596 char finished_fsclean;
597
598 mfs = ufsvfsp->vfs_fs;
599
600 /*
601 * get the on-disk copy of the superblock
602 */
603 bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
604 bp->b_flags |= (B_STALE|B_AGE);
605 if (bp->b_flags & B_ERROR) {
606 brelse(bp);
607 return (EIO);
608 }
609 dfs = bp->b_un.b_fs;
610
611 /* error locks may only unlock after the fs has been made consistent */
612 if (errlck == UN_ERRLCK) {
613 if (dfs->fs_clean == FSFIX) { /* being repaired */
614 brelse(bp);
615 return (EAGAIN);
616 }
617 /* repair not yet started? */
618 finished_fsclean = TRANS_ISTRANS(ufsvfsp) ? FSLOG : FSCLEAN;
619 if (dfs->fs_clean != finished_fsclean) {
620 brelse(bp);
621 return (EBUSY);
622 }
623 }
624
625 /*
626 * if superblock has changed too much, abort
627 */
628 if ((mfs->fs_sblkno != dfs->fs_sblkno) ||
629 (mfs->fs_cblkno != dfs->fs_cblkno) ||
630 (mfs->fs_iblkno != dfs->fs_iblkno) ||
631 (mfs->fs_dblkno != dfs->fs_dblkno) ||
632 (mfs->fs_cgoffset != dfs->fs_cgoffset) ||
633 (mfs->fs_cgmask != dfs->fs_cgmask) ||
634 (mfs->fs_bsize != dfs->fs_bsize) ||
635 (mfs->fs_fsize != dfs->fs_fsize) ||
636 (mfs->fs_frag != dfs->fs_frag) ||
637 (mfs->fs_bmask != dfs->fs_bmask) ||
638 (mfs->fs_fmask != dfs->fs_fmask) ||
639 (mfs->fs_bshift != dfs->fs_bshift) ||
640 (mfs->fs_fshift != dfs->fs_fshift) ||
641 (mfs->fs_fragshift != dfs->fs_fragshift) ||
642 (mfs->fs_fsbtodb != dfs->fs_fsbtodb) ||
643 (mfs->fs_sbsize != dfs->fs_sbsize) ||
644 (mfs->fs_nindir != dfs->fs_nindir) ||
645 (mfs->fs_nspf != dfs->fs_nspf) ||
646 (mfs->fs_trackskew != dfs->fs_trackskew) ||
647 (mfs->fs_cgsize != dfs->fs_cgsize) ||
648 (mfs->fs_ntrak != dfs->fs_ntrak) ||
649 (mfs->fs_nsect != dfs->fs_nsect) ||
650 (mfs->fs_spc != dfs->fs_spc) ||
651 (mfs->fs_cpg != dfs->fs_cpg) ||
652 (mfs->fs_ipg != dfs->fs_ipg) ||
653 (mfs->fs_fpg != dfs->fs_fpg) ||
654 (mfs->fs_postblformat != dfs->fs_postblformat) ||
655 (mfs->fs_magic != dfs->fs_magic)) {
656 brelse(bp);
657 return (EACCES);
658 }
659 if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
660 if (mfs->fs_clean == FSLOG) {
661 brelse(bp);
662 return (EACCES);
663 }
664
665 /*
666 * get new summary info
667 */
668 if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
669 brelse(bp);
670 return (EIO);
671 }
672
673 /*
674 * release old summary info and update in-memory superblock
675 */
676 kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
677 mfs->fs_u.fs_csp = dfs->fs_u.fs_csp; /* Only entry 0 used */
678
679 /*
680 * update fields allowed to change
681 */
682 mfs->fs_size = dfs->fs_size;
683 mfs->fs_dsize = dfs->fs_dsize;
684 mfs->fs_ncg = dfs->fs_ncg;
685 mfs->fs_minfree = dfs->fs_minfree;
686 mfs->fs_rotdelay = dfs->fs_rotdelay;
687 mfs->fs_rps = dfs->fs_rps;
688 mfs->fs_maxcontig = dfs->fs_maxcontig;
689 mfs->fs_maxbpg = dfs->fs_maxbpg;
690 mfs->fs_csmask = dfs->fs_csmask;
691 mfs->fs_csshift = dfs->fs_csshift;
692 mfs->fs_optim = dfs->fs_optim;
693 mfs->fs_csaddr = dfs->fs_csaddr;
694 mfs->fs_cssize = dfs->fs_cssize;
695 mfs->fs_ncyl = dfs->fs_ncyl;
696 mfs->fs_cstotal = dfs->fs_cstotal;
697 mfs->fs_reclaim = dfs->fs_reclaim;
698
699 if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
700 mfs->fs_reclaim &= ~FS_RECLAIM;
701 mfs->fs_reclaim |= FS_RECLAIMING;
702 ufs_thread_start(&ufsvfsp->vfs_reclaim,
703 ufs_thread_reclaim, vfsp);
704 }
705
706 /* XXX What to do about sparecon? */
707
708 /* XXX need to copy volume label */
709
710 /*
711 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
712 * or if error-locked and ondisk is now clean
713 */
714 needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
715 if (needs_unlock)
716 mutex_enter(&ufsvfsp->vfs_lock);
717
718 if (errlck == UN_ERRLCK) {
719 if (finished_fsclean == dfs->fs_clean)
720 mfs->fs_clean = finished_fsclean;
721 else
722 mfs->fs_clean = FSBAD;
723 mfs->fs_state = FSOKAY - dfs->fs_time;
724 }
725
726 if (FSOKAY != dfs->fs_state + dfs->fs_time ||
727 (dfs->fs_clean == FSBAD))
728 mfs->fs_clean = FSBAD;
729
730 if (needs_unlock)
731 mutex_exit(&ufsvfsp->vfs_lock);
732
733 brelse(bp);
734
735 return (0);
736 }
737
738 /*
739 * ufs_reconcile_inode
740 * reconcile ondisk inode with incore inode
741 */
742 static int
743 ufs_reconcile_inode(struct inode *ip, void *arg)
744 {
745 int i;
746 int ndaddr;
747 int niaddr;
748 struct dinode *dp; /* ondisk inode */
749 struct buf *bp = NULL;
750 uid_t d_uid;
751 gid_t d_gid;
752 int error = 0;
753 struct fs *fs;
754
755 /*
756 * not an inode we care about
757 */
758 if (ip->i_ufsvfs != (struct ufsvfs *)arg)
759 return (0);
760
761 fs = ip->i_fs;
762
763 /*
764 * Inode reconciliation should not fail here: we made the filesystem
765 * quiescent and did a ufs_flush() before calling ufs_reconcile_inode(),
766 * so the inode should not have changed in between.
767 * Any discrepancies indicate a logic error and a pretty
768 * significant run-state inconsistency we should complain about.
769 */
770 if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
771 cmn_err(CE_WARN, "%s: Inode reconciliation failed for "
772 "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
773 return (EINVAL);
774 }
775
776 /*
777 * get the dinode
778 */
779 bp = UFS_BREAD(ip->i_ufsvfs,
780 ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
781 (int)fs->fs_bsize);
782 if (bp->b_flags & B_ERROR) {
783 brelse(bp);
784 return (EIO);
785 }
786 dp = bp->b_un.b_dino;
787 dp += itoo(fs, ip->i_number);
788
789 /*
790 * handle Sun's implementation of EFT
791 */
792 d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
793 d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (gid_t)dp->di_sgid;
794
795 rw_enter(&ip->i_contents, RW_WRITER);
796
797 /*
798 * some fields are not allowed to change
799 */
800 if ((ip->i_mode != dp->di_mode) ||
801 (ip->i_gen != dp->di_gen) ||
802 (ip->i_uid != d_uid) ||
803 (ip->i_gid != d_gid)) {
804 error = EACCES;
805 goto out;
806 }
807
808 /*
809 * and some are allowed to change
810 */
811 ip->i_size = dp->di_size;
812 ip->i_ic.ic_flags = dp->di_ic.ic_flags;
813 ip->i_blocks = dp->di_blocks;
814 ip->i_nlink = dp->di_nlink;
815 if (ip->i_flag & IFASTSYMLNK) {
816 ndaddr = 1;
817 niaddr = 0;
818 } else {
819 ndaddr = NDADDR;
820 niaddr = NIADDR;
821 }
822 for (i = 0; i < ndaddr; ++i)
823 ip->i_db[i] = dp->di_db[i];
824 for (i = 0; i < niaddr; ++i)
825 ip->i_ib[i] = dp->di_ib[i];
826
827 out:
828 rw_exit(&ip->i_contents);
829 brelse(bp);
830 return (error);
831 }
832
833 /*
834 * ufs_reconcile
835 * reconcile ondisk superblock/inodes with any incore
836 */
837 static int
838 ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
839 {
840 int error = 0;
841
842 /*
843 * get rid of as much inmemory data as possible
844 */
845 (void) ufs_flush(vfsp);
846
847 /*
848 * reconcile the superblock and inodes
849 */
850 if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
851 return (error);
852 if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
853 return (error);
854 /*
855 * allocation blocks may be incorrect; get rid of them
856 */
857 (void) ufs_flush(vfsp);
858
859 return (error);
860 }
861
862 /*
863 * File system locking
864 */
865 int
866 ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
867 {
868 return (ufs__fiolfs(vp, lockfsp, /* from_user */ 1, from_log));
869 }
870
871 /* kernel-internal interface, also used by fix-on-panic */
872 int
873 ufs__fiolfs(
874 struct vnode *vp,
875 struct lockfs *lockfsp,
876 int from_user,
877 int from_log)
878 {
879 struct ulockfs *ulp;
880 struct lockfs lfs;
881 int error;
882 struct vfs *vfsp;
883 struct ufsvfs *ufsvfsp;
884 int errlck = NO_ERRLCK;
885 int poll_events = POLLPRI;
886 extern struct pollhead ufs_pollhd;
887 ulockfs_info_t *head;
888 ulockfs_info_t *info;
889 int signal = 0;
890
891 /* check valid lock type */
892 if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
893 return (EINVAL);
894
895 if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
896 return (EIO);
897
898 vfsp = vp->v_vfsp;
899
900 if (vfsp->vfs_flag & VFS_UNMOUNTED) /* has been unmounted */
901 return (EIO);
902
903 /* take the lock and check again */
904 vfs_lock_wait(vfsp);
905 if (vfsp->vfs_flag & VFS_UNMOUNTED) {
906 vfs_unlock(vfsp);
907 return (EIO);
908 }
909
910 /*
911 * Can't wlock or ro/elock fs with accounting or local swap file
912 * We need to check for this before we grab the ul_lock to avoid
913 * deadlocks with the accounting framework.
914 */
915 if ((LOCKFS_IS_WLOCK(lockfsp) || LOCKFS_IS_ELOCK(lockfsp) ||
916 LOCKFS_IS_ROELOCK(lockfsp)) && !from_log) {
917 if (ufs_checkaccton(vp) || ufs_checkswapon(vp)) {
918 vfs_unlock(vfsp);
919 return (EDEADLK);
920 }
921 }
922
923 ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
924 ulp = &ufsvfsp->vfs_ulockfs;
925 head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
926 SEARCH_ULOCKFSP(head, ulp, info);
927
928 /*
929 * Suspend both the reclaim thread and the delete thread.
930 * This must be done outside the lockfs locking protocol.
931 */
932 ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
933 ufs_thread_suspend(&ufsvfsp->vfs_delete);
934
935 mutex_enter(&ulp->ul_lock);
936 atomic_inc_ulong(&ufs_quiesce_pend);
937
938 /*
939 * Quit if there is another lockfs request in progress
940 * that is waiting for existing ufs_vnops to complete.
941 */
942 if (ULOCKFS_IS_BUSY(ulp)) {
943 error = EBUSY;
944 goto errexit;
945 }
946
947 /* cannot unlock or downgrade a hard-lock */
948 if (ULOCKFS_IS_HLOCK(ulp)) {
949 error = EIO;
950 goto errexit;
951 }
952
953 /* an error lock may only be unlocked or relocked */
954 if (ULOCKFS_IS_ELOCK(ulp)) {
955 if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
956 error = EBUSY;
957 goto errexit;
958 }
959 }
960
961 /*
962 * a read-only error lock may only be upgraded to an
963 * error lock or hard lock
964 */
965 if (ULOCKFS_IS_ROELOCK(ulp)) {
966 if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
967 error = EBUSY;
968 goto errexit;
969 }
970 }
971
972 /*
973 * until read-only error locks are fully implemented
974 * just return EINVAL
975 */
976 if (LOCKFS_IS_ROELOCK(lockfsp)) {
977 error = EINVAL;
978 goto errexit;
979 }
980
981 /*
982 * an error lock may only be applied if the file system is
983 * unlocked or already error locked.
984 * (this is to prevent the case where a fs gets changed out from
985 * underneath a fs that is locked for backup,
986 * that is, name/delete/write-locked.)
987 */
988 if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
989 !ULOCKFS_IS_ROELOCK(ulp)) &&
990 (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
991 error = EBUSY;
992 goto errexit;
993 }
994
995 /* get and validate the input lockfs request */
996 if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
997 goto errexit;
998
999 /*
1000 * save current ulockfs struct
1001 */
1002 bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));
1003
1004 /*
1005 * Freeze the file system (pend future accesses)
1006 */
1007 ufs_freeze(ulp, lockfsp);
1008
1009 /*
1010 * Set locking in progress because ufs_quiesce may drop and
1011 * reacquire the ul_lock mutex.
1012 */
1013 ULOCKFS_SET_BUSY(ulp);
1014 /* update the ioctl copy */
1015 LOCKFS_SET_BUSY(&ulp->ul_lockfs);
1016
1017 /*
1018 * We need to unset FWLOCK status before we call ufs_quiesce
1019 * so that the thread doesn't get suspended. We do this only if
1020 * this (fallocate) thread requested an unlock operation.
1021 */
1022 if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
1023 if (!ULOCKFS_IS_WLOCK(ulp))
1024 ULOCKFS_CLR_FWLOCK(ulp);
1025 }
1026
1027 /*
1028 * Quiesce (wait for outstanding accesses to finish)
1029 */
1030 if (error = ufs_quiesce(ulp)) {
1031 /*
1032 * Interrupted due to signal. There could still be
1033 * pending vnops.
1034 */
1035 signal = 1;
1036
1037 /*
1038 * We broadcast because the lock status
1039 * could be reverted to the old status.
1040 */
1041 cv_broadcast(&ulp->ul_cv);
1042 goto errout;
1043 }
1044
1045 /*
1046 * If the fallocate thread requested a write fs lock operation
1047 * then we set fwlock status in the ulp.
1048 */
1049 if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
1050 if (ULOCKFS_IS_WLOCK(ulp))
1051 ULOCKFS_SET_FWLOCK(ulp);
1052 }
1053
1054 /*
1055 * save error lock status to pass down to reconciliation
1056 * routines and for later cleanup
1057 */
1058 if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
1059 errlck = UN_ERRLCK;
1060
1061 if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
1062 int needs_unlock;
1063 int needs_sbwrite;
1064
1065 poll_events |= POLLERR;
1066 errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ?
1067 RE_ERRLCK : SET_ERRLCK;
1068
1069 needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
1070 if (needs_unlock)
1071 mutex_enter(&ufsvfsp->vfs_lock);
1072
1073 /* disable delayed i/o */
1074 needs_sbwrite = 0;
1075
1076 if (errlck == SET_ERRLCK) {
1077 ufsvfsp->vfs_fs->fs_clean = FSBAD;
1078 needs_sbwrite = 1;
1079 }
1080
1081 needs_sbwrite |= ufsvfsp->vfs_dio;
1082 ufsvfsp->vfs_dio = 0;
1083
1084 if (needs_unlock)
1085 mutex_exit(&ufsvfsp->vfs_lock);
1086
1087 if (needs_sbwrite) {
1088 ulp->ul_sbowner = curthread;
1089 TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
1090
1091 if (needs_unlock)
1092 mutex_enter(&ufsvfsp->vfs_lock);
1093
1094 ufsvfsp->vfs_fs->fs_fmod = 0;
1095
1096 if (needs_unlock)
1097 mutex_exit(&ufsvfsp->vfs_lock);
1098 }
1099 }
1100
1101 /*
1102 * reconcile superblock and inodes if the fs was wlocked
1103 */
1104 if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
1105 if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
1106 goto errout;
1107 /*
1108 * in case the fs grew, reset the metadata map for logging tests
1109 */
1110 TRANS_MATA_UMOUNT(ufsvfsp);
1111 TRANS_MATA_MOUNT(ufsvfsp);
1112 TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
1113 }
1114
1115 /*
1116 * At least everything *currently* dirty goes out.
1117 */
1118
1119 if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
1120 !ULOCKFS_IS_ELOCK(ulp))
1121 goto errout;
1122
1123 /*
1124 * thaw file system and wake up pended processes
1125 */
1126 if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
1127 if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
1128 goto errout;
1129
1130 /*
1131 * reset modified flag if not already write locked
1132 */
1133 if (!LOCKFS_IS_WLOCK(&lfs))
1134 ULOCKFS_CLR_MOD(ulp);
1135
1136 /*
1137 * idle the lock struct
1138 */
1139 ULOCKFS_CLR_BUSY(ulp);
1140 /* update the ioctl copy */
1141 LOCKFS_CLR_BUSY(&ulp->ul_lockfs);
1142
1143 /*
1144 * free current comment
1145 */
1146 if (lfs.lf_comment && lfs.lf_comlen != 0) {
1147 kmem_free(lfs.lf_comment, lfs.lf_comlen);
1148 lfs.lf_comment = NULL;
1149 lfs.lf_comlen = 0;
1150 }
1151
1152 /* do error lock cleanup */
1153 if (errlck == UN_ERRLCK)
1154 ufsfx_unlockfs(ufsvfsp);
1155
1156 else if (errlck == RE_ERRLCK)
1157 ufsfx_lockfs(ufsvfsp);
1158
1159 /* don't allow error lock from user to invoke panic */
1160 else if (from_user && errlck == SET_ERRLCK &&
1161 !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
1162 (void) ufs_fault(ufsvfsp->vfs_root,
1163 ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
1164 ulp->ul_lockfs.lf_comment : "user-applied error lock");
1165
1166 atomic_dec_ulong(&ufs_quiesce_pend);
1167 mutex_exit(&ulp->ul_lock);
1168 vfs_unlock(vfsp);
1169
1170 if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
1171 poll_events |= POLLERR;
1172
1173 pollwakeup(&ufs_pollhd, poll_events);
1174
1175 /*
1176 * Allow both the delete thread and the reclaim thread to
1177 * continue.
1178 */
1179 ufs_thread_continue(&ufsvfsp->vfs_delete);
1180 ufs_thread_continue(&ufsvfsp->vfs_reclaim);
1181
1182 return (0);
1183
1184 errout:
1185 /*
1186 * Lock failed. Reset the old lock in ufsvfs if not hard locked.
1187 */
1188 if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
1189 bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
1190 ulp->ul_fs_lock = (1 << lfs.lf_lock);
1191 }
1192
1193 /*
1194 * Don't call ufs_thaw() when there's a signal during
1195 * ufs quiesce operation as it can lead to deadlock
1196 * with getpage.
1197 */
1198 if (signal == 0)
1199 (void) ufs_thaw(vfsp, ufsvfsp, ulp);
1200
1201 ULOCKFS_CLR_BUSY(ulp);
1202 LOCKFS_CLR_BUSY(&ulp->ul_lockfs);
1203
1204 errexit:
1205 atomic_dec_ulong(&ufs_quiesce_pend);
1206 mutex_exit(&ulp->ul_lock);
1207 vfs_unlock(vfsp);
1208
1209 /*
1210 * Allow both the delete thread and the reclaim thread to
1211 * continue.
1212 */
1213 ufs_thread_continue(&ufsvfsp->vfs_delete);
1214 ufs_thread_continue(&ufsvfsp->vfs_reclaim);
1215
1216 return (error);
1217 }
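/*
 * Minimal userland sketch of driving this path through the _FIOLFS
 * ioctl, roughly what lockfs(1M) does.  It assumes the standard
 * <sys/lockfs.h> and <sys/filio.h> definitions; "fd" (an open file
 * on the target fs) and "prev_key" are placeholders:
 *
 *	struct lockfs lf;
 *
 *	(void) memset(&lf, 0, sizeof (lf));
 *	lf.lf_lock = LOCKFS_WLOCK;	-- write-lock the fs
 *	lf.lf_flags = 0;		-- no input flags are defined
 *	lf.lf_key = prev_key;		-- checked by ufs_getlfd() above
 *	if (ioctl(fd, _FIOLFS, &lf) == -1)
 *		err(1, "_FIOLFS");
 */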
1218
1219 /*
1220 * fiolfss
1221 * return the current file system locking state info
1222 */
1223 int
1224 ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
1225 {
1226 struct ulockfs *ulp;
1227
1228 if (!vp || !vp->v_vfsp || !VTOI(vp))
1229 return (EINVAL);
1230
1231 /* file system has been forcibly unmounted */
1232 if (VTOI(vp)->i_ufsvfs == NULL)
1233 return (EIO);
1234
1235 ulp = VTOUL(vp);
1236
1237 if (ULOCKFS_IS_HLOCK(ulp)) {
1238 *lockfsp = ulp->ul_lockfs; /* structure assignment */
1239 return (0);
1240 }
1241
1242 mutex_enter(&ulp->ul_lock);
1243
1244 *lockfsp = ulp->ul_lockfs; /* structure assignment */
1245
1246 if (ULOCKFS_IS_MOD(ulp))
1247 lockfsp->lf_flags |= LOCKFS_MOD;
1248
1249 mutex_exit(&ulp->ul_lock);
1250
1251 return (0);
1252 }
1253
1254 /*
1255 * ufs_check_lockfs
1256 * check whether a ufs vnode op conflicts with the file system lock
1257 */
1258 int
1259 ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
1260 {
1261 k_sigset_t smask;
1262 int sig, slock;
1263
1264 ASSERT(MUTEX_HELD(&ulp->ul_lock));
1265
1266 while (ulp->ul_fs_lock & mask) {
1267 slock = (int)ULOCKFS_IS_SLOCK(ulp);
1268 if ((curthread->t_flag & T_DONTPEND) && !slock) {
1269 curthread->t_flag |= T_WOULDBLOCK;
1270 return (EAGAIN);
1271 }
1272 curthread->t_flag &= ~T_WOULDBLOCK;
1273
1274 /*
1275 * In the case of an onerr umount of the fs, threads could
1276 * have blocked before coming into ufs_check_lockfs and
1277 * need to check for the special case of ELOCK and
1278 * vfs_dontblock being set, which would indicate that the fs
1279 * is on its way out and will not return, therefore making
1280 * EIO the appropriate response.
1281 */
1282 if (ULOCKFS_IS_HLOCK(ulp) ||
1283 (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1284 return (EIO);
1285
1286 /*
1287 * wait for lock status to change
1288 */
1289 if (slock || ufsvfsp->vfs_nointr) {
1290 cv_wait(&ulp->ul_cv, &ulp->ul_lock);
1291 } else {
1292 sigintr(&smask, 1);
1293 sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
1294 sigunintr(&smask);
1295 if ((!sig && (ulp->ul_fs_lock & mask)) ||
1296 ufsvfsp->vfs_dontblock)
1297 return (EINTR);
1298 }
1299 }
1300
1301 if (mask & ULOCKFS_FWLOCK) {
1302 atomic_inc_ulong(&ulp->ul_falloc_cnt);
1303 ULOCKFS_SET_FALLOC(ulp);
1304 } else {
1305 atomic_inc_ulong(&ulp->ul_vnops_cnt);
1306 }
1307
1308 return (0);
1309 }
1310
1311 /*
1312 * Check whether we came in through the handcrafted lockfs protocol path.
1313 * We can't simply check for T_DONTBLOCK here, as one might assume, since
1314 * that can also falsely catch recursive VOPs going to a different
1315 * filesystem; instead we check whether we already hold the ul_lock mutex.
1316 */
1317 static int
1318 ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
1319 {
1320 return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
1321 }
1322
1323 /*
1324 * ufs_lockfs_begin - start the lockfs locking protocol
1325 */
1326 int
1327 ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1328 {
1329 int error;
1330 int rec_vop;
1331 ushort_t op_cnt_incremented = 0;
1332 ulong_t *ctr;
1333 struct ulockfs *ulp;
1334 ulockfs_info_t *ulockfs_info;
1335 ulockfs_info_t *ulockfs_info_free;
1336 ulockfs_info_t *ulockfs_info_temp;
1337
1338 /*
1339 * file system has been forcibly unmounted
1340 */
1341 if (ufsvfsp == NULL)
1342 return (EIO);
1343
1344 *ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1345
1346 /*
1347 * Do lockfs protocol
1348 */
1349 ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1350 IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1351
1352 /*
1353 * Detect recursive VOP call or handcrafted internal lockfs protocol
1354 * path and bail out in that case.
1355 */
1356 if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1357 *ulpp = NULL;
1358 return (0);
1359 } else {
1360 if (ulockfs_info_free == NULL) {
1361 if ((ulockfs_info_temp = (ulockfs_info_t *)
1362 kmem_zalloc(sizeof (ulockfs_info_t),
1363 KM_NOSLEEP)) == NULL) {
1364 *ulpp = NULL;
1365 return (ENOMEM);
1366 }
1367 }
1368 }
1369
1370 /*
1371 * First time VOP call
1372 *
1373 * Increment the ctr irrespective of the lockfs state. If the lockfs
1374 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
1375 * before incrementing we need to check if there is a pending quiesce
1376 * request because if we have a continuous stream of ufs_lockfs_begin
1377 * requests pounding on a few CPUs, then the ufs_quiesce thread might
1378 * never see the value of zero for ctr - a livelock kind of scenario.
1379 */
1380 ctr = (mask & ULOCKFS_FWLOCK) ?
1381 &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
1382 if (!ULOCKFS_IS_SLOCK(ulp)) {
1383 atomic_inc_ulong(ctr);
1384 op_cnt_incremented++;
1385 }
1386
1387 /*
1388 * If the lockfs state (indicated by ul_fs_lock) is not just
1389 * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs
1390 * where there is a check with an appropriate mask to selectively allow
1391 * operations permitted for that kind of lockfs state.
1392 *
1393 * Even these selective operations should not be allowed to go through
1394 * if a lockfs request is in progress, because that could allow inode
1395 * modifications during a quiesce and hence cause inode
1396 * reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient,
1397 * so make use of ufs_quiesce_pend to disallow vnode operations when a
1398 * quiesce is in progress.
1399 */
1400 if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1401 if (op_cnt_incremented)
1402 if (!atomic_dec_ulong_nv(ctr))
1403 cv_broadcast(&ulp->ul_cv);
1404 mutex_enter(&ulp->ul_lock);
1405 error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1406 mutex_exit(&ulp->ul_lock);
1407 if (error) {
1408 if (ulockfs_info_free == NULL)
1409 kmem_free(ulockfs_info_temp,
1410 sizeof (ulockfs_info_t));
1411 return (error);
1412 }
1413 } else {
1414 /*
1415 * This is the common case of a file system in an unlocked state.
1416 *
1417 * If a file system is unlocked, we would expect the ctr to have
1418 * been incremented by now. But this will not be true when a
1419 * quiesce is winding up - SLOCK was set when we checked before
1420 * incrementing the ctr, but by the time we checked for
1421 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is okay
1422 * to take ul_lock and go through the slow path in this uncommon
1423 * case.
1424 */
1425 if (op_cnt_incremented == 0) {
1426 mutex_enter(&ulp->ul_lock);
1427 error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1428 if (error) {
1429 mutex_exit(&ulp->ul_lock);
1430 if (ulockfs_info_free == NULL)
1431 kmem_free(ulockfs_info_temp,
1432 sizeof (ulockfs_info_t));
1433 return (error);
1434 }
1435 if (mask & ULOCKFS_FWLOCK)
1436 ULOCKFS_SET_FALLOC(ulp);
1437 mutex_exit(&ulp->ul_lock);
1438 } else if (mask & ULOCKFS_FWLOCK) {
1439 mutex_enter(&ulp->ul_lock);
1440 ULOCKFS_SET_FALLOC(ulp);
1441 mutex_exit(&ulp->ul_lock);
1442 }
1443 }
1444
1445 if (ulockfs_info_free != NULL) {
1446 ulockfs_info_free->ulp = ulp;
1447 if (mask & ULOCKFS_FWLOCK)
1448 ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1449 } else {
1450 ulockfs_info_temp->ulp = ulp;
1451 ulockfs_info_temp->next = ulockfs_info;
1452 if (mask & ULOCKFS_FWLOCK)
1453 ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1454 ASSERT(ufs_lockfs_key != 0);
1455 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1456 }
1457
1458 curthread->t_flag |= T_DONTBLOCK;
1459 return (0);
1460 }
1461
1462 /*
1463 * Check whether we are returning from the top level VOP.
1464 */
1465 static int
1466 ufs_lockfs_top_vop_return(ulockfs_info_t *head)
1467 {
1468 ulockfs_info_t *info;
1469 int result = 1;
1470
1471 for (info = head; info != NULL; info = info->next) {
1472 if (info->ulp != NULL) {
1473 result = 0;
1474 break;
1475 }
1476 }
1477
1478 return (result);
1479 }
1480
1481 /*
1482 * ufs_lockfs_end - terminate the lockfs locking protocol
1483 */
1484 void
1485 ufs_lockfs_end(struct ulockfs *ulp)
1486 {
1487 ulockfs_info_t *info;
1488 ulockfs_info_t *head;
1489
1490 /*
1491 * end-of-VOP protocol
1492 */
1493 if (ulp == NULL)
1494 return;
1495
1496 head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1497 SEARCH_ULOCKFSP(head, ulp, info);
1498
1499 /*
1500 * If we're called from a first level VOP, we have to have a
1501 * valid ulockfs record in the TSD.
1502 */
1503 ASSERT(info != NULL);
1504
1505 /*
1506 * Invalidate the ulockfs record.
1507 */
1508 info->ulp = NULL;
1509
1510 if (ufs_lockfs_top_vop_return(head))
1511 curthread->t_flag &= ~T_DONTBLOCK;
1512
1513 /* fallocate thread */
1514 if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) {
1515 /* Clear the thread's fallocate state */
1516 info->flags &= ~ULOCK_INFO_FALLOCATE;
1517 if (!atomic_dec_ulong_nv(&ulp->ul_falloc_cnt)) {
1518 mutex_enter(&ulp->ul_lock);
1519 ULOCKFS_CLR_FALLOC(ulp);
1520 cv_broadcast(&ulp->ul_cv);
1521 mutex_exit(&ulp->ul_lock);
1522 }
1523 } else { /* normal thread */
1524 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
1525 cv_broadcast(&ulp->ul_cv);
1526 }
1527 }
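/*
 * Typical VOP-level use of the begin/end protocol, a sketch modeled on
 * the callers in ufs_vnops.c (ULOCKFS_WRITE_MASK stands in for one of
 * the per-operation masks from ufs_lockfs.h; error handling trimmed):
 *
 *	struct ulockfs *ulp;
 *	int error;
 *
 *	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 *	if (error)
 *		return (error);
 *	-- perform the vnode operation --
 *	if (ulp)			-- NULL for recursive VOPs
 *		ufs_lockfs_end(ulp);
 */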
1528
1529 /*
1530 * ufs_lockfs_trybegin - try to start the lockfs locking protocol without
1531 * blocking.
1532 */
1533 int
1534 ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1535 {
1536 int error = 0;
1537 int rec_vop;
1538 ushort_t op_cnt_incremented = 0;
1539 ulong_t *ctr;
1540 struct ulockfs *ulp;
1541 ulockfs_info_t *ulockfs_info;
1542 ulockfs_info_t *ulockfs_info_free;
1543 ulockfs_info_t *ulockfs_info_temp;
1544
1545 /*
1546 * file system has been forcibly unmounted
1547 */
1548 if (ufsvfsp == NULL)
1549 return (EIO);
1550
1551 *ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1552
1553 /*
1554 * Do lockfs protocol
1555 */
1556 ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1557 IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1558
1559 /*
1560 * Detect recursive VOP call or handcrafted internal lockfs protocol
1561 * path and bail out in that case.
1562 */
1563 if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1564 *ulpp = NULL;
1565 return (0);
1566 } else {
1567 if (ulockfs_info_free == NULL) {
1568 if ((ulockfs_info_temp = (ulockfs_info_t *)
1569 kmem_zalloc(sizeof (ulockfs_info_t),
1570 KM_NOSLEEP)) == NULL) {
1571 *ulpp = NULL;
1572 return (ENOMEM);
1573 }
1574 }
1575 }
1576
1577 /*
1578 * First time VOP call
1579 *
1580 * Increment the ctr irrespective of the lockfs state. If the lockfs
1581 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
1582 * before incrementing we need to check if there is a pending quiesce
1583 * request because if we have a continuous stream of ufs_lockfs_begin
1584 * requests pounding on a few CPUs, then the ufs_quiesce thread might
1585 * never see the value of zero for ctr - a livelock kind of scenario.
1586 */
1587 ctr = (mask & ULOCKFS_FWLOCK) ?
1588 &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
1589 if (!ULOCKFS_IS_SLOCK(ulp)) {
1590 atomic_inc_ulong(ctr);
1591 op_cnt_incremented++;
1592 }
1593
1594 if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1595 /*
1596 * Non-blocking version of ufs_check_lockfs() code.
1597 *
1598 * If the file system is not hard locked or error locked
1599 * and if ulp->ul_fs_lock allows this operation, increment
1600 * the appropriate counter and proceed (e.g., if the
1601 * file system is delete-locked, an mmap can still go through).
1602 */
1603 if (op_cnt_incremented)
1604 if (!atomic_dec_ulong_nv(ctr))
1605 cv_broadcast(&ulp->ul_cv);
1606 mutex_enter(&ulp->ul_lock);
1607 if (ULOCKFS_IS_HLOCK(ulp) ||
1608 (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1609 error = EIO;
1610 else if (ulp->ul_fs_lock & mask)
1611 error = EAGAIN;
1612
1613 if (error) {
1614 mutex_exit(&ulp->ul_lock);
1615 if (ulockfs_info_free == NULL)
1616 kmem_free(ulockfs_info_temp,
1617 sizeof (ulockfs_info_t));
1618 return (error);
1619 }
1620 atomic_inc_ulong(ctr);
1621 if (mask & ULOCKFS_FWLOCK)
1622 ULOCKFS_SET_FALLOC(ulp);
1623 mutex_exit(&ulp->ul_lock);
1624 } else {
1625 /*
1626 * This is the common case of a file system in an unlocked state.
1627 *
1628 * If a file system is unlocked, we would expect the ctr to have
1629 * been incremented by now. But this will not be true when a
1630 * quiesce is winding up - SLOCK was set when we checked before
1631 * incrementing the ctr, but by the time we checked for
1632 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take
1633 * ul_lock and go through the non-blocking version of
1634 * ufs_check_lockfs() code.
1635 */
1636 if (op_cnt_incremented == 0) {
1637 mutex_enter(&ulp->ul_lock);
1638 if (ULOCKFS_IS_HLOCK(ulp) ||
1639 (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1640 error = EIO;
1641 else if (ulp->ul_fs_lock & mask)
1642 error = EAGAIN;
1643
1644 if (error) {
1645 mutex_exit(&ulp->ul_lock);
1646 if (ulockfs_info_free == NULL)
1647 kmem_free(ulockfs_info_temp,
1648 sizeof (ulockfs_info_t));
1649 return (error);
1650 }
1651 atomic_inc_ulong(ctr);
1652 if (mask & ULOCKFS_FWLOCK)
1653 ULOCKFS_SET_FALLOC(ulp);
1654 mutex_exit(&ulp->ul_lock);
1655 } else if (mask & ULOCKFS_FWLOCK) {
1656 mutex_enter(&ulp->ul_lock);
1657 ULOCKFS_SET_FALLOC(ulp);
1658 mutex_exit(&ulp->ul_lock);
1659 }
1660 }
1661
1662 if (ulockfs_info_free != NULL) {
1663 ulockfs_info_free->ulp = ulp;
1664 if (mask & ULOCKFS_FWLOCK)
1665 ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1666 } else {
1667 ulockfs_info_temp->ulp = ulp;
1668 ulockfs_info_temp->next = ulockfs_info;
1669 if (mask & ULOCKFS_FWLOCK)
1670 ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1671 ASSERT(ufs_lockfs_key != 0);
1672 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1673 }
1674
1675 curthread->t_flag |= T_DONTBLOCK;
1676 return (0);
1677 }
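/*
 * Illustrative note: callers that must not block (for example a thread
 * running with T_DONTPEND set, as tested in ufs_check_lockfs() above)
 * would use this variant and treat EAGAIN as "the fs is currently
 * locked against this operation" (sketch; the mask name stands in for
 * an appropriate per-operation mask):
 *
 *	error = ufs_lockfs_trybegin(ufsvfsp, &ulp, ULOCKFS_READ_MASK);
 *	if (error == EAGAIN)
 *		-- back off and retry later rather than sleeping
 */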
1678
1679 /*
1680 * specialized version of ufs_lockfs_begin() called by ufs_getpage().
1681 */
1682 int
1683 ufs_lockfs_begin_getpage(
1684 struct ufsvfs *ufsvfsp,
1685 struct ulockfs **ulpp,
1686 struct seg *seg,
1687 int read_access,
1688 uint_t *protp)
1689 {
1690 ulong_t mask;
1691 int error;
1692 int rec_vop;
1693 struct ulockfs *ulp;
1694 ulockfs_info_t *ulockfs_info;
1695 ulockfs_info_t *ulockfs_info_free;
1696 ulockfs_info_t *ulockfs_info_temp;
1697
1698 /*
1699 * file system has been forcibly unmounted
1700 */
1701 if (ufsvfsp == NULL)
1702 return (EIO);
1703
1704 *ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1705
1706 /*
1707 * Do lockfs protocol
1708 */
1709 ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1710 IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1711
1712 /*
1713 * Detect recursive VOP call or handcrafted internal lockfs protocol
1714 * path and bail out in that case.
1715 */
1716 if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1717 *ulpp = NULL;
1718 return (0);
1719 } else {
1720 if (ulockfs_info_free == NULL) {
1721 if ((ulockfs_info_temp = (ulockfs_info_t *)
1722 kmem_zalloc(sizeof (ulockfs_info_t),
1723 KM_NOSLEEP)) == NULL) {
1724 *ulpp = NULL;
1725 return (ENOMEM);
1726 }
1727 }
1728 }
1729
1730 /*
1731 * First time VOP call
1732 */
1733 atomic_inc_ulong(&ulp->ul_vnops_cnt);
1734 if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1735 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
1736 cv_broadcast(&ulp->ul_cv);
1737 mutex_enter(&ulp->ul_lock);
1738 if (seg->s_ops == &segvn_ops &&
1739 ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
1740 mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1741 } else if (protp && read_access) {
1742 /*
1743 * Restrict the mapping to readonly.
1744 * Writes to this mapping will cause
1745 * another fault which will then
1746 * be suspended if fs is write locked
1747 */
1748 *protp &= ~PROT_WRITE;
1749 mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1750 } else
1751 mask = (ulong_t)ULOCKFS_GETWRITE_MASK;
1752
1753 /*
1754 * will sleep if this fs is locked against this VOP
1755 */
1756 error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1757 mutex_exit(&ulp->ul_lock);
1758 if (error) {
1759 if (ulockfs_info_free == NULL)
1760 kmem_free(ulockfs_info_temp,
1761 sizeof (ulockfs_info_t));
1762 return (error);
1763 }
1764 }
1765
1766 if (ulockfs_info_free != NULL) {
1767 ulockfs_info_free->ulp = ulp;
1768 } else {
1769 ulockfs_info_temp->ulp = ulp;
1770 ulockfs_info_temp->next = ulockfs_info;
1771 ASSERT(ufs_lockfs_key != 0);
1772 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1773 }
1774
1775 curthread->t_flag |= T_DONTBLOCK;
1776 return (0);
1777 }
1778
1779 void
1780 ufs_lockfs_tsd_destructor(void *head)
1781 {
1782 ulockfs_info_t *curr = (ulockfs_info_t *)head;
1783 ulockfs_info_t *temp;
1784
1785 for (; curr != NULL; ) {
1786 /*
1787 * The TSD destructor is being called when the thread exits
1788 * (via thread_exit()). At that time it must have cleaned up
1789 * all VOPs via ufs_lockfs_end(), and no valid ulockfs record
1790 * may still exist while the thread is exiting.
1791 */
1792 temp = curr;
1793 curr = curr->next;
1794 ASSERT(temp->ulp == NULL);
1795 kmem_free(temp, sizeof (ulockfs_info_t));
1796 }
1797 }
1798