1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 #include <sys/types.h> 26 #include <sys/t_lock.h> 27 #include <sys/param.h> 28 #include <sys/time.h> 29 #include <sys/systm.h> 30 #include <sys/sysmacros.h> 31 #include <sys/resource.h> 32 #include <sys/signal.h> 33 #include <sys/cred.h> 34 #include <sys/user.h> 35 #include <sys/buf.h> 36 #include <sys/vfs.h> 37 #include <sys/vnode.h> 38 #include <sys/proc.h> 39 #include <sys/disp.h> 40 #include <sys/file.h> 41 #include <sys/fcntl.h> 42 #include <sys/flock.h> 43 #include <sys/atomic.h> 44 #include <sys/kmem.h> 45 #include <sys/uio.h> 46 #include <sys/conf.h> 47 #include <sys/mman.h> 48 #include <sys/pathname.h> 49 #include <sys/debug.h> 50 #include <sys/vmsystm.h> 51 #include <sys/cmn_err.h> 52 #include <sys/acct.h> 53 #include <sys/dnlc.h> 54 #include <sys/swap.h> 55 56 #include <sys/fs/ufs_fs.h> 57 #include <sys/fs/ufs_inode.h> 58 #include <sys/fs/ufs_fsdir.h> 59 #include <sys/fs/ufs_trans.h> 60 #include <sys/fs/ufs_panic.h> 61 #include <sys/fs/ufs_mount.h> 62 #include <sys/fs/ufs_bio.h> 63 #include <sys/fs/ufs_log.h> 64 #include <sys/fs/ufs_quota.h> 65 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! */ 66 #include <sys/errno.h> 67 #include <sys/sysinfo.h> 68 69 #include <vm/hat.h> 70 #include <vm/pvn.h> 71 #include <vm/as.h> 72 #include <vm/seg.h> 73 #include <vm/seg_map.h> 74 #include <vm/seg_vn.h> 75 #include <vm/rm.h> 76 #include <vm/anon.h> 77 #include <sys/swap.h> 78 #include <sys/dnlc.h> 79 80 extern struct vnode *common_specvp(struct vnode *vp); 81 82 /* error lock status */ 83 #define UN_ERRLCK (-1) 84 #define SET_ERRLCK 1 85 #define RE_ERRLCK 2 86 #define NO_ERRLCK 0 87 88 /* 89 * Index to be used in TSD for storing lockfs data 90 */ 91 uint_t ufs_lockfs_key; 92 93 typedef struct _ulockfs_info { 94 struct _ulockfs_info *next; 95 struct ulockfs *ulp; 96 uint_t flags; 97 } ulockfs_info_t; 98 99 #define ULOCK_INFO_FALLOCATE 0x00000001 /* fallocate thread */ 100 101 /* 102 * Check in TSD that whether we are already doing any VOP on this filesystem 103 */ 104 #define IS_REC_VOP(found, head, ulp, free) \ 105 { \ 106 ulockfs_info_t *_curr; \ 107 \ 108 for (found = 0, free = NULL, _curr = head; \ 109 _curr != NULL; _curr = _curr->next) { \ 110 if ((free == NULL) && \ 111 (_curr->ulp == NULL)) \ 112 free = _curr; \ 113 if (_curr->ulp == ulp) { \ 114 found = 1; \ 115 break; \ 116 } \ 117 } \ 118 } 119 120 /* 121 * Get the lockfs data from TSD so that lockfs handles the recursive VOP 122 * properly 123 */ 124 #define SEARCH_ULOCKFSP(head, ulp, info) \ 125 { \ 126 ulockfs_info_t *_curr; \ 127 \ 128 for (_curr = head; _curr != NULL; \ 129 _curr = _curr->next) { \ 130 if (_curr->ulp == ulp) { \ 131 break; \ 132 } \ 133 } \ 134 \ 135 info = _curr; \ 136 } 137 138 /* 139 * Validate lockfs request 140 */ 141 static int 142 ufs_getlfd( 143 struct lockfs *lockfsp, /* new lock request */ 144 struct lockfs *ul_lockfsp) /* old lock state */ 145 { 146 int error = 0; 147 148 /* 149 * no input flags defined 150 */ 151 if (lockfsp->lf_flags != 0) { 152 error = EINVAL; 153 goto errout; 154 } 155 156 /* 157 * check key 158 */ 159 if (!LOCKFS_IS_ULOCK(ul_lockfsp)) 160 if (lockfsp->lf_key != ul_lockfsp->lf_key) { 161 error = EINVAL; 162 goto errout; 163 } 164 165 lockfsp->lf_key = ul_lockfsp->lf_key + 1; 166 167 errout: 168 return (error); 169 } 170 171 /* 172 * ufs_checkaccton 173 * check if accounting is turned on on this fs 174 */ 175 176 int 177 ufs_checkaccton(struct vnode *vp) 178 { 179 if (acct_fs_in_use(vp)) 180 return (EDEADLK); 181 return (0); 182 } 183 184 /* 185 * ufs_checkswapon 186 * check if local swapping is to file on this fs 187 */ 188 int 189 ufs_checkswapon(struct vnode *vp) 190 { 191 struct swapinfo *sip; 192 193 mutex_enter(&swapinfo_lock); 194 for (sip = swapinfo; sip; sip = sip->si_next) 195 if (sip->si_vp->v_vfsp == vp->v_vfsp) { 196 mutex_exit(&swapinfo_lock); 197 return (EDEADLK); 198 } 199 mutex_exit(&swapinfo_lock); 200 return (0); 201 } 202 203 /* 204 * ufs_freeze 205 * pend future accesses for current lock and desired lock 206 */ 207 void 208 ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp) 209 { 210 /* 211 * set to new lock type 212 */ 213 ulp->ul_lockfs.lf_lock = lockfsp->lf_lock; 214 ulp->ul_lockfs.lf_key = lockfsp->lf_key; 215 ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen; 216 ulp->ul_lockfs.lf_comment = lockfsp->lf_comment; 217 218 ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock); 219 } 220 221 /* 222 * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before 223 * starting ufs_quiesce() protocol and decrement it only when a file system no 224 * longer has to be in quiescent state. This allows ufs_pageio() to detect 225 * that another thread wants to quiesce a file system. See more comments in 226 * ufs_pageio(). 227 */ 228 ulong_t ufs_quiesce_pend = 0; 229 230 /* 231 * ufs_quiesce 232 * wait for outstanding accesses to finish 233 */ 234 int 235 ufs_quiesce(struct ulockfs *ulp) 236 { 237 int error = 0; 238 ulockfs_info_t *head; 239 ulockfs_info_t *info; 240 klwp_t *lwp = ttolwp(curthread); 241 242 head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 243 SEARCH_ULOCKFSP(head, ulp, info); 244 245 /* 246 * We have to keep /proc away from stopping us after we applied 247 * the softlock but before we got a chance to clear it again. 248 * prstop() may pagefault and become stuck on the softlock still 249 * pending. 250 */ 251 if (lwp != NULL) 252 lwp->lwp_nostop++; 253 254 /* 255 * Set a softlock to suspend future ufs_vnops so that 256 * this lockfs request will not be starved 257 */ 258 ULOCKFS_SET_SLOCK(ulp); 259 ASSERT(ufs_quiesce_pend); 260 261 /* check if there is any outstanding ufs vnodeops calls */ 262 while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) { 263 /* 264 * use timed version of cv_wait_sig() to make sure we don't 265 * miss a wake up call from ufs_pageio() when it doesn't use 266 * ul_lock. 267 * 268 * when a fallocate thread comes in, the only way it returns 269 * from this function is if there are no other vnode operations 270 * going on (remember fallocate threads are tracked using 271 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread 272 * hasn't already grabbed the fs write lock. 273 */ 274 if (info && (info->flags & ULOCK_INFO_FALLOCATE)) { 275 if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp)) 276 goto out; 277 } 278 if (!cv_reltimedwait_sig(&ulp->ul_cv, &ulp->ul_lock, hz, 279 TR_CLOCK_TICK)) { 280 error = EINTR; 281 goto out; 282 } 283 } 284 285 out: 286 /* 287 * unlock the soft lock 288 */ 289 ULOCKFS_CLR_SLOCK(ulp); 290 291 if (lwp != NULL) 292 lwp->lwp_nostop--; 293 294 return (error); 295 } 296 297 /* 298 * ufs_flush_inode 299 */ 300 int 301 ufs_flush_inode(struct inode *ip, void *arg) 302 { 303 int error; 304 int saverror = 0; 305 306 /* 307 * wrong file system; keep looking 308 */ 309 if (ip->i_ufsvfs != (struct ufsvfs *)arg) 310 return (0); 311 312 /* 313 * asynchronously push all the dirty pages 314 */ 315 if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) && 316 (error != EAGAIN)) 317 saverror = error; 318 /* 319 * wait for io and discard all mappings 320 */ 321 if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI)) 322 saverror = error; 323 324 if (ITOV(ip)->v_type == VDIR) { 325 dnlc_dir_purge(&ip->i_danchor); 326 } 327 328 return (saverror); 329 } 330 331 /* 332 * ufs_flush 333 * Flush everything that is currently dirty; this includes invalidating 334 * any mappings. 335 */ 336 int 337 ufs_flush(struct vfs *vfsp) 338 { 339 int error; 340 int saverror = 0; 341 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 342 struct fs *fs = ufsvfsp->vfs_fs; 343 int tdontblock = 0; 344 345 ASSERT(vfs_lock_held(vfsp)); 346 347 /* 348 * purge dnlc 349 */ 350 (void) dnlc_purge_vfsp(vfsp, 0); 351 352 /* 353 * drain the delete and idle threads 354 */ 355 ufs_delete_drain(vfsp, 0, 0); 356 ufs_idle_drain(vfsp); 357 358 /* 359 * flush and invalidate quota records 360 */ 361 (void) qsync(ufsvfsp); 362 363 /* 364 * flush w/invalidate the inodes for vfsp 365 */ 366 if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp)) 367 saverror = error; 368 369 /* 370 * synchronously flush superblock and summary info 371 */ 372 if (fs->fs_ronly == 0 && fs->fs_fmod) { 373 fs->fs_fmod = 0; 374 TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH); 375 } 376 /* 377 * flush w/invalidate block device pages and buf cache 378 */ 379 if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp), 380 (offset_t)0, 0, B_INVAL, CRED(), NULL)) > 0) 381 saverror = error; 382 383 (void) bflush((dev_t)vfsp->vfs_dev); 384 (void) bfinval((dev_t)vfsp->vfs_dev, 0); 385 386 /* 387 * drain the delete and idle threads again 388 */ 389 ufs_delete_drain(vfsp, 0, 0); 390 ufs_idle_drain(vfsp); 391 392 /* 393 * play with the clean flag 394 */ 395 if (saverror == 0) 396 ufs_checkclean(vfsp); 397 398 /* 399 * Flush any outstanding transactions and roll the log 400 * only if we are supposed to do, i.e. LDL_NOROLL not set. 401 * We can not simply check for fs_ronly here since fsck also may 402 * use this code to roll the log on a read-only filesystem, e.g. 403 * root during early stages of boot, if other then a sanity check is 404 * done, it will clear LDL_NOROLL before. 405 * In addition we assert that the deltamap does not contain any deltas 406 * in case LDL_NOROLL is set since this is not supposed to happen. 407 */ 408 if (TRANS_ISTRANS(ufsvfsp)) { 409 ml_unit_t *ul = ufsvfsp->vfs_log; 410 mt_map_t *mtm = ul->un_deltamap; 411 412 if (ul->un_flags & LDL_NOROLL) { 413 ASSERT(mtm->mtm_nme == 0); 414 } else { 415 /* 416 * Do not set T_DONTBLOCK if there is a 417 * transaction opened by caller. 418 */ 419 if (curthread->t_flag & T_DONTBLOCK) 420 tdontblock = 1; 421 else 422 curthread->t_flag |= T_DONTBLOCK; 423 424 TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH, 425 TOP_COMMIT_SIZE, error); 426 427 if (!error) { 428 TRANS_END_SYNC(ufsvfsp, saverror, 429 TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE); 430 } 431 432 if (tdontblock == 0) 433 curthread->t_flag &= ~T_DONTBLOCK; 434 435 logmap_roll_dev(ufsvfsp->vfs_log); 436 } 437 } 438 439 return (saverror); 440 } 441 442 /* 443 * ufs_thaw_wlock 444 * special processing when thawing down to wlock 445 */ 446 static int 447 ufs_thaw_wlock(struct inode *ip, void *arg) 448 { 449 /* 450 * wrong file system; keep looking 451 */ 452 if (ip->i_ufsvfs != (struct ufsvfs *)arg) 453 return (0); 454 455 /* 456 * iupdat refuses to clear flags if the fs is read only. The fs 457 * may become read/write during the lock and we wouldn't want 458 * these inodes being written to disk. So clear the flags. 459 */ 460 rw_enter(&ip->i_contents, RW_WRITER); 461 ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG); 462 rw_exit(&ip->i_contents); 463 464 /* 465 * pages are mlocked -- fail wlock 466 */ 467 if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip))) 468 return (EBUSY); 469 470 return (0); 471 } 472 473 /* 474 * ufs_thaw_hlock 475 * special processing when thawing down to hlock or elock 476 */ 477 static int 478 ufs_thaw_hlock(struct inode *ip, void *arg) 479 { 480 struct vnode *vp = ITOV(ip); 481 482 /* 483 * wrong file system; keep looking 484 */ 485 if (ip->i_ufsvfs != (struct ufsvfs *)arg) 486 return (0); 487 488 /* 489 * blow away all pages - even if they are mlocked 490 */ 491 do { 492 (void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK); 493 } while ((vp->v_type != VCHR) && vn_has_cached_data(vp)); 494 rw_enter(&ip->i_contents, RW_WRITER); 495 ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG); 496 rw_exit(&ip->i_contents); 497 498 return (0); 499 } 500 501 /* 502 * ufs_thaw 503 * thaw file system lock down to current value 504 */ 505 int 506 ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp) 507 { 508 int error = 0; 509 int noidel = (int)(ulp->ul_flag & ULOCKFS_NOIDEL); 510 511 /* 512 * if wlock or hlock or elock 513 */ 514 if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) || 515 ULOCKFS_IS_ELOCK(ulp)) { 516 517 /* 518 * don't keep access times 519 * don't free deleted files 520 * if superblock writes are allowed, limit them to me for now 521 */ 522 ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL); 523 if (ulp->ul_sbowner != (kthread_id_t)-1) 524 ulp->ul_sbowner = curthread; 525 526 /* 527 * wait for writes for deleted files and superblock updates 528 */ 529 (void) ufs_flush(vfsp); 530 531 /* 532 * now make sure the quota file is up-to-date 533 * expensive; but effective 534 */ 535 error = ufs_flush(vfsp); 536 /* 537 * no one can write the superblock 538 */ 539 ulp->ul_sbowner = (kthread_id_t)-1; 540 541 /* 542 * special processing for wlock/hlock/elock 543 */ 544 if (ULOCKFS_IS_WLOCK(ulp)) { 545 if (error) 546 goto errout; 547 error = bfinval(ufsvfsp->vfs_dev, 0); 548 if (error) 549 goto errout; 550 error = ufs_scan_inodes(0, ufs_thaw_wlock, 551 (void *)ufsvfsp, ufsvfsp); 552 if (error) 553 goto errout; 554 } 555 if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) { 556 error = 0; 557 (void) ufs_scan_inodes(0, ufs_thaw_hlock, 558 (void *)ufsvfsp, ufsvfsp); 559 (void) bfinval(ufsvfsp->vfs_dev, 1); 560 } 561 } else { 562 563 /* 564 * okay to keep access times 565 * okay to free deleted files 566 * okay to write the superblock 567 */ 568 ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL); 569 ulp->ul_sbowner = NULL; 570 571 /* 572 * flush in case deleted files are in memory 573 */ 574 if (noidel) { 575 if (error = ufs_flush(vfsp)) 576 goto errout; 577 } 578 } 579 580 errout: 581 cv_broadcast(&ulp->ul_cv); 582 return (error); 583 } 584 585 /* 586 * ufs_reconcile_fs 587 * reconcile incore superblock with ondisk superblock 588 */ 589 int 590 ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck) 591 { 592 struct fs *mfs; /* in-memory superblock */ 593 struct fs *dfs; /* on-disk superblock */ 594 struct buf *bp; /* on-disk superblock buf */ 595 int needs_unlock; 596 char finished_fsclean; 597 598 mfs = ufsvfsp->vfs_fs; 599 600 /* 601 * get the on-disk copy of the superblock 602 */ 603 bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE); 604 bp->b_flags |= (B_STALE|B_AGE); 605 if (bp->b_flags & B_ERROR) { 606 brelse(bp); 607 return (EIO); 608 } 609 dfs = bp->b_un.b_fs; 610 611 /* error locks may only unlock after the fs has been made consistent */ 612 if (errlck == UN_ERRLCK) { 613 if (dfs->fs_clean == FSFIX) { /* being repaired */ 614 brelse(bp); 615 return (EAGAIN); 616 } 617 /* repair not yet started? */ 618 finished_fsclean = TRANS_ISTRANS(ufsvfsp)? FSLOG: FSCLEAN; 619 if (dfs->fs_clean != finished_fsclean) { 620 brelse(bp); 621 return (EBUSY); 622 } 623 } 624 625 /* 626 * if superblock has changed too much, abort 627 */ 628 if ((mfs->fs_sblkno != dfs->fs_sblkno) || 629 (mfs->fs_cblkno != dfs->fs_cblkno) || 630 (mfs->fs_iblkno != dfs->fs_iblkno) || 631 (mfs->fs_dblkno != dfs->fs_dblkno) || 632 (mfs->fs_cgoffset != dfs->fs_cgoffset) || 633 (mfs->fs_cgmask != dfs->fs_cgmask) || 634 (mfs->fs_bsize != dfs->fs_bsize) || 635 (mfs->fs_fsize != dfs->fs_fsize) || 636 (mfs->fs_frag != dfs->fs_frag) || 637 (mfs->fs_bmask != dfs->fs_bmask) || 638 (mfs->fs_fmask != dfs->fs_fmask) || 639 (mfs->fs_bshift != dfs->fs_bshift) || 640 (mfs->fs_fshift != dfs->fs_fshift) || 641 (mfs->fs_fragshift != dfs->fs_fragshift) || 642 (mfs->fs_fsbtodb != dfs->fs_fsbtodb) || 643 (mfs->fs_sbsize != dfs->fs_sbsize) || 644 (mfs->fs_nindir != dfs->fs_nindir) || 645 (mfs->fs_nspf != dfs->fs_nspf) || 646 (mfs->fs_trackskew != dfs->fs_trackskew) || 647 (mfs->fs_cgsize != dfs->fs_cgsize) || 648 (mfs->fs_ntrak != dfs->fs_ntrak) || 649 (mfs->fs_nsect != dfs->fs_nsect) || 650 (mfs->fs_spc != dfs->fs_spc) || 651 (mfs->fs_cpg != dfs->fs_cpg) || 652 (mfs->fs_ipg != dfs->fs_ipg) || 653 (mfs->fs_fpg != dfs->fs_fpg) || 654 (mfs->fs_postblformat != dfs->fs_postblformat) || 655 (mfs->fs_magic != dfs->fs_magic)) { 656 brelse(bp); 657 return (EACCES); 658 } 659 if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time) 660 if (mfs->fs_clean == FSLOG) { 661 brelse(bp); 662 return (EACCES); 663 } 664 665 /* 666 * get new summary info 667 */ 668 if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) { 669 brelse(bp); 670 return (EIO); 671 } 672 673 /* 674 * release old summary info and update in-memory superblock 675 */ 676 kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize); 677 mfs->fs_u.fs_csp = dfs->fs_u.fs_csp; /* Only entry 0 used */ 678 679 /* 680 * update fields allowed to change 681 */ 682 mfs->fs_size = dfs->fs_size; 683 mfs->fs_dsize = dfs->fs_dsize; 684 mfs->fs_ncg = dfs->fs_ncg; 685 mfs->fs_minfree = dfs->fs_minfree; 686 mfs->fs_rotdelay = dfs->fs_rotdelay; 687 mfs->fs_rps = dfs->fs_rps; 688 mfs->fs_maxcontig = dfs->fs_maxcontig; 689 mfs->fs_maxbpg = dfs->fs_maxbpg; 690 mfs->fs_csmask = dfs->fs_csmask; 691 mfs->fs_csshift = dfs->fs_csshift; 692 mfs->fs_optim = dfs->fs_optim; 693 mfs->fs_csaddr = dfs->fs_csaddr; 694 mfs->fs_cssize = dfs->fs_cssize; 695 mfs->fs_ncyl = dfs->fs_ncyl; 696 mfs->fs_cstotal = dfs->fs_cstotal; 697 mfs->fs_reclaim = dfs->fs_reclaim; 698 699 if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) { 700 mfs->fs_reclaim &= ~FS_RECLAIM; 701 mfs->fs_reclaim |= FS_RECLAIMING; 702 ufs_thread_start(&ufsvfsp->vfs_reclaim, 703 ufs_thread_reclaim, vfsp); 704 } 705 706 /* XXX What to do about sparecon? */ 707 708 /* XXX need to copy volume label */ 709 710 /* 711 * ondisk clean flag overrides inmemory clean flag iff == FSBAD 712 * or if error-locked and ondisk is now clean 713 */ 714 needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock); 715 if (needs_unlock) 716 mutex_enter(&ufsvfsp->vfs_lock); 717 718 if (errlck == UN_ERRLCK) { 719 if (finished_fsclean == dfs->fs_clean) 720 mfs->fs_clean = finished_fsclean; 721 else 722 mfs->fs_clean = FSBAD; 723 mfs->fs_state = FSOKAY - dfs->fs_time; 724 } 725 726 if (FSOKAY != dfs->fs_state + dfs->fs_time || 727 (dfs->fs_clean == FSBAD)) 728 mfs->fs_clean = FSBAD; 729 730 if (needs_unlock) 731 mutex_exit(&ufsvfsp->vfs_lock); 732 733 brelse(bp); 734 735 return (0); 736 } 737 738 /* 739 * ufs_reconcile_inode 740 * reconcile ondisk inode with incore inode 741 */ 742 static int 743 ufs_reconcile_inode(struct inode *ip, void *arg) 744 { 745 int i; 746 int ndaddr; 747 int niaddr; 748 struct dinode *dp; /* ondisk inode */ 749 struct buf *bp = NULL; 750 uid_t d_uid; 751 gid_t d_gid; 752 int error = 0; 753 struct fs *fs; 754 755 /* 756 * not an inode we care about 757 */ 758 if (ip->i_ufsvfs != (struct ufsvfs *)arg) 759 return (0); 760 761 fs = ip->i_fs; 762 763 /* 764 * Inode reconciliation fails: we made the filesystem quiescent 765 * and we did a ufs_flush() before calling ufs_reconcile_inode() 766 * and thus the inode should not have been changed inbetween. 767 * Any discrepancies indicate a logic error and a pretty 768 * significant run-state inconsistency we should complain about. 769 */ 770 if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) { 771 cmn_err(CE_WARN, "%s: Inode reconciliation failed for" 772 "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number); 773 return (EINVAL); 774 } 775 776 /* 777 * get the dinode 778 */ 779 bp = UFS_BREAD(ip->i_ufsvfs, 780 ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)), 781 (int)fs->fs_bsize); 782 if (bp->b_flags & B_ERROR) { 783 brelse(bp); 784 return (EIO); 785 } 786 dp = bp->b_un.b_dino; 787 dp += itoo(fs, ip->i_number); 788 789 /* 790 * handle Sun's implementation of EFT 791 */ 792 d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid; 793 d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (uid_t)dp->di_sgid; 794 795 rw_enter(&ip->i_contents, RW_WRITER); 796 797 /* 798 * some fields are not allowed to change 799 */ 800 if ((ip->i_mode != dp->di_mode) || 801 (ip->i_gen != dp->di_gen) || 802 (ip->i_uid != d_uid) || 803 (ip->i_gid != d_gid)) { 804 error = EACCES; 805 goto out; 806 } 807 808 /* 809 * and some are allowed to change 810 */ 811 ip->i_size = dp->di_size; 812 ip->i_ic.ic_flags = dp->di_ic.ic_flags; 813 ip->i_blocks = dp->di_blocks; 814 ip->i_nlink = dp->di_nlink; 815 if (ip->i_flag & IFASTSYMLNK) { 816 ndaddr = 1; 817 niaddr = 0; 818 } else { 819 ndaddr = NDADDR; 820 niaddr = NIADDR; 821 } 822 for (i = 0; i < ndaddr; ++i) 823 ip->i_db[i] = dp->di_db[i]; 824 for (i = 0; i < niaddr; ++i) 825 ip->i_ib[i] = dp->di_ib[i]; 826 827 out: 828 rw_exit(&ip->i_contents); 829 brelse(bp); 830 return (error); 831 } 832 833 /* 834 * ufs_reconcile 835 * reconcile ondisk superblock/inodes with any incore 836 */ 837 static int 838 ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck) 839 { 840 int error = 0; 841 842 /* 843 * get rid of as much inmemory data as possible 844 */ 845 (void) ufs_flush(vfsp); 846 847 /* 848 * reconcile the superblock and inodes 849 */ 850 if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck)) 851 return (error); 852 if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp)) 853 return (error); 854 /* 855 * allocation blocks may be incorrect; get rid of them 856 */ 857 (void) ufs_flush(vfsp); 858 859 return (error); 860 } 861 862 /* 863 * File system locking 864 */ 865 int 866 ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log) 867 { 868 return (ufs__fiolfs(vp, lockfsp, /* from_user */ 1, from_log)); 869 } 870 871 /* kernel-internal interface, also used by fix-on-panic */ 872 int 873 ufs__fiolfs( 874 struct vnode *vp, 875 struct lockfs *lockfsp, 876 int from_user, 877 int from_log) 878 { 879 struct ulockfs *ulp; 880 struct lockfs lfs; 881 int error; 882 struct vfs *vfsp; 883 struct ufsvfs *ufsvfsp; 884 int errlck = NO_ERRLCK; 885 int poll_events = POLLPRI; 886 extern struct pollhead ufs_pollhd; 887 ulockfs_info_t *head; 888 ulockfs_info_t *info; 889 int signal = 0; 890 891 /* check valid lock type */ 892 if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK) 893 return (EINVAL); 894 895 if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data) 896 return (EIO); 897 898 vfsp = vp->v_vfsp; 899 900 if (vfsp->vfs_flag & VFS_UNMOUNTED) /* has been unmounted */ 901 return (EIO); 902 903 /* take the lock and check again */ 904 vfs_lock_wait(vfsp); 905 if (vfsp->vfs_flag & VFS_UNMOUNTED) { 906 vfs_unlock(vfsp); 907 return (EIO); 908 } 909 910 /* 911 * Can't wlock or ro/elock fs with accounting or local swap file 912 * We need to check for this before we grab the ul_lock to avoid 913 * deadlocks with the accounting framework. 914 */ 915 if ((LOCKFS_IS_WLOCK(lockfsp) || LOCKFS_IS_ELOCK(lockfsp) || 916 LOCKFS_IS_ROELOCK(lockfsp)) && !from_log) { 917 if (ufs_checkaccton(vp) || ufs_checkswapon(vp)) { 918 vfs_unlock(vfsp); 919 return (EDEADLK); 920 } 921 } 922 923 ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 924 ulp = &ufsvfsp->vfs_ulockfs; 925 head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 926 SEARCH_ULOCKFSP(head, ulp, info); 927 928 /* 929 * Suspend both the reclaim thread and the delete thread. 930 * This must be done outside the lockfs locking protocol. 931 */ 932 ufs_thread_suspend(&ufsvfsp->vfs_reclaim); 933 ufs_thread_suspend(&ufsvfsp->vfs_delete); 934 935 mutex_enter(&ulp->ul_lock); 936 atomic_add_long(&ufs_quiesce_pend, 1); 937 938 /* 939 * Quit if there is another lockfs request in progress 940 * that is waiting for existing ufs_vnops to complete. 941 */ 942 if (ULOCKFS_IS_BUSY(ulp)) { 943 error = EBUSY; 944 goto errexit; 945 } 946 947 /* cannot ulocked or downgrade a hard-lock */ 948 if (ULOCKFS_IS_HLOCK(ulp)) { 949 error = EIO; 950 goto errexit; 951 } 952 953 /* an error lock may be unlocked or relocked, only */ 954 if (ULOCKFS_IS_ELOCK(ulp)) { 955 if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) { 956 error = EBUSY; 957 goto errexit; 958 } 959 } 960 961 /* 962 * a read-only error lock may only be upgraded to an 963 * error lock or hard lock 964 */ 965 if (ULOCKFS_IS_ROELOCK(ulp)) { 966 if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) { 967 error = EBUSY; 968 goto errexit; 969 } 970 } 971 972 /* 973 * until read-only error locks are fully implemented 974 * just return EINVAL 975 */ 976 if (LOCKFS_IS_ROELOCK(lockfsp)) { 977 error = EINVAL; 978 goto errexit; 979 } 980 981 /* 982 * an error lock may only be applied if the file system is 983 * unlocked or already error locked. 984 * (this is to prevent the case where a fs gets changed out from 985 * underneath a fs that is locked for backup, 986 * that is, name/delete/write-locked.) 987 */ 988 if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) && 989 !ULOCKFS_IS_ROELOCK(ulp)) && 990 (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) { 991 error = EBUSY; 992 goto errexit; 993 } 994 995 /* get and validate the input lockfs request */ 996 if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs)) 997 goto errexit; 998 999 /* 1000 * save current ulockfs struct 1001 */ 1002 bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs)); 1003 1004 /* 1005 * Freeze the file system (pend future accesses) 1006 */ 1007 ufs_freeze(ulp, lockfsp); 1008 1009 /* 1010 * Set locking in progress because ufs_quiesce may free the 1011 * ul_lock mutex. 1012 */ 1013 ULOCKFS_SET_BUSY(ulp); 1014 /* update the ioctl copy */ 1015 LOCKFS_SET_BUSY(&ulp->ul_lockfs); 1016 1017 /* 1018 * We need to unset FWLOCK status before we call ufs_quiesce 1019 * so that the thread doesnt get suspended. We do this only if 1020 * this (fallocate) thread requested an unlock operation. 1021 */ 1022 if (info && (info->flags & ULOCK_INFO_FALLOCATE)) { 1023 if (!ULOCKFS_IS_WLOCK(ulp)) 1024 ULOCKFS_CLR_FWLOCK(ulp); 1025 } 1026 1027 /* 1028 * Quiesce (wait for outstanding accesses to finish) 1029 */ 1030 if (error = ufs_quiesce(ulp)) { 1031 /* 1032 * Interrupted due to signal. There could still be 1033 * pending vnops. 1034 */ 1035 signal = 1; 1036 1037 /* 1038 * We do broadcast because lock-status 1039 * could be reverted to old status. 1040 */ 1041 cv_broadcast(&ulp->ul_cv); 1042 goto errout; 1043 } 1044 1045 /* 1046 * If the fallocate thread requested a write fs lock operation 1047 * then we set fwlock status in the ulp. 1048 */ 1049 if (info && (info->flags & ULOCK_INFO_FALLOCATE)) { 1050 if (ULOCKFS_IS_WLOCK(ulp)) 1051 ULOCKFS_SET_FWLOCK(ulp); 1052 } 1053 1054 /* 1055 * save error lock status to pass down to reconcilation 1056 * routines and for later cleanup 1057 */ 1058 if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp)) 1059 errlck = UN_ERRLCK; 1060 1061 if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) { 1062 int needs_unlock; 1063 int needs_sbwrite; 1064 1065 poll_events |= POLLERR; 1066 errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ? 1067 RE_ERRLCK : SET_ERRLCK; 1068 1069 needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock); 1070 if (needs_unlock) 1071 mutex_enter(&ufsvfsp->vfs_lock); 1072 1073 /* disable delayed i/o */ 1074 needs_sbwrite = 0; 1075 1076 if (errlck == SET_ERRLCK) { 1077 ufsvfsp->vfs_fs->fs_clean = FSBAD; 1078 needs_sbwrite = 1; 1079 } 1080 1081 needs_sbwrite |= ufsvfsp->vfs_dio; 1082 ufsvfsp->vfs_dio = 0; 1083 1084 if (needs_unlock) 1085 mutex_exit(&ufsvfsp->vfs_lock); 1086 1087 if (needs_sbwrite) { 1088 ulp->ul_sbowner = curthread; 1089 TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE); 1090 1091 if (needs_unlock) 1092 mutex_enter(&ufsvfsp->vfs_lock); 1093 1094 ufsvfsp->vfs_fs->fs_fmod = 0; 1095 1096 if (needs_unlock) 1097 mutex_exit(&ufsvfsp->vfs_lock); 1098 } 1099 } 1100 1101 /* 1102 * reconcile superblock and inodes if was wlocked 1103 */ 1104 if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) { 1105 if (error = ufs_reconcile(vfsp, ufsvfsp, errlck)) 1106 goto errout; 1107 /* 1108 * in case the fs grew; reset the metadata map for logging tests 1109 */ 1110 TRANS_MATA_UMOUNT(ufsvfsp); 1111 TRANS_MATA_MOUNT(ufsvfsp); 1112 TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs); 1113 } 1114 1115 /* 1116 * At least everything *currently* dirty goes out. 1117 */ 1118 1119 if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) && 1120 !ULOCKFS_IS_ELOCK(ulp)) 1121 goto errout; 1122 1123 /* 1124 * thaw file system and wakeup pended processes 1125 */ 1126 if (error = ufs_thaw(vfsp, ufsvfsp, ulp)) 1127 if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp)) 1128 goto errout; 1129 1130 /* 1131 * reset modified flag if not already write locked 1132 */ 1133 if (!LOCKFS_IS_WLOCK(&lfs)) 1134 ULOCKFS_CLR_MOD(ulp); 1135 1136 /* 1137 * idle the lock struct 1138 */ 1139 ULOCKFS_CLR_BUSY(ulp); 1140 /* update the ioctl copy */ 1141 LOCKFS_CLR_BUSY(&ulp->ul_lockfs); 1142 1143 /* 1144 * free current comment 1145 */ 1146 if (lfs.lf_comment && lfs.lf_comlen != 0) { 1147 kmem_free(lfs.lf_comment, lfs.lf_comlen); 1148 lfs.lf_comment = NULL; 1149 lfs.lf_comlen = 0; 1150 } 1151 1152 /* do error lock cleanup */ 1153 if (errlck == UN_ERRLCK) 1154 ufsfx_unlockfs(ufsvfsp); 1155 1156 else if (errlck == RE_ERRLCK) 1157 ufsfx_lockfs(ufsvfsp); 1158 1159 /* don't allow error lock from user to invoke panic */ 1160 else if (from_user && errlck == SET_ERRLCK && 1161 !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4))) 1162 (void) ufs_fault(ufsvfsp->vfs_root, 1163 ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ? 1164 ulp->ul_lockfs.lf_comment: "user-applied error lock"); 1165 1166 atomic_add_long(&ufs_quiesce_pend, -1); 1167 mutex_exit(&ulp->ul_lock); 1168 vfs_unlock(vfsp); 1169 1170 if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) 1171 poll_events |= POLLERR; 1172 1173 pollwakeup(&ufs_pollhd, poll_events); 1174 1175 /* 1176 * Allow both the delete thread and the reclaim thread to 1177 * continue. 1178 */ 1179 ufs_thread_continue(&ufsvfsp->vfs_delete); 1180 ufs_thread_continue(&ufsvfsp->vfs_reclaim); 1181 1182 return (0); 1183 1184 errout: 1185 /* 1186 * Lock failed. Reset the old lock in ufsvfs if not hard locked. 1187 */ 1188 if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) { 1189 bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs)); 1190 ulp->ul_fs_lock = (1 << lfs.lf_lock); 1191 } 1192 1193 /* 1194 * Don't call ufs_thaw() when there's a signal during 1195 * ufs quiesce operation as it can lead to deadlock 1196 * with getpage. 1197 */ 1198 if (signal == 0) 1199 (void) ufs_thaw(vfsp, ufsvfsp, ulp); 1200 1201 ULOCKFS_CLR_BUSY(ulp); 1202 LOCKFS_CLR_BUSY(&ulp->ul_lockfs); 1203 1204 errexit: 1205 atomic_add_long(&ufs_quiesce_pend, -1); 1206 mutex_exit(&ulp->ul_lock); 1207 vfs_unlock(vfsp); 1208 1209 /* 1210 * Allow both the delete thread and the reclaim thread to 1211 * continue. 1212 */ 1213 ufs_thread_continue(&ufsvfsp->vfs_delete); 1214 ufs_thread_continue(&ufsvfsp->vfs_reclaim); 1215 1216 return (error); 1217 } 1218 1219 /* 1220 * fiolfss 1221 * return the current file system locking state info 1222 */ 1223 int 1224 ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp) 1225 { 1226 struct ulockfs *ulp; 1227 1228 if (!vp || !vp->v_vfsp || !VTOI(vp)) 1229 return (EINVAL); 1230 1231 /* file system has been forcibly unmounted */ 1232 if (VTOI(vp)->i_ufsvfs == NULL) 1233 return (EIO); 1234 1235 ulp = VTOUL(vp); 1236 1237 if (ULOCKFS_IS_HLOCK(ulp)) { 1238 *lockfsp = ulp->ul_lockfs; /* structure assignment */ 1239 return (0); 1240 } 1241 1242 mutex_enter(&ulp->ul_lock); 1243 1244 *lockfsp = ulp->ul_lockfs; /* structure assignment */ 1245 1246 if (ULOCKFS_IS_MOD(ulp)) 1247 lockfsp->lf_flags |= LOCKFS_MOD; 1248 1249 mutex_exit(&ulp->ul_lock); 1250 1251 return (0); 1252 } 1253 1254 /* 1255 * ufs_check_lockfs 1256 * check whether a ufs_vnops conflicts with the file system lock 1257 */ 1258 int 1259 ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask) 1260 { 1261 k_sigset_t smask; 1262 int sig, slock; 1263 1264 ASSERT(MUTEX_HELD(&ulp->ul_lock)); 1265 1266 while (ulp->ul_fs_lock & mask) { 1267 slock = (int)ULOCKFS_IS_SLOCK(ulp); 1268 if ((curthread->t_flag & T_DONTPEND) && !slock) { 1269 curthread->t_flag |= T_WOULDBLOCK; 1270 return (EAGAIN); 1271 } 1272 curthread->t_flag &= ~T_WOULDBLOCK; 1273 1274 /* 1275 * In the case of an onerr umount of the fs, threads could 1276 * have blocked before coming into ufs_check_lockfs and 1277 * need to check for the special case of ELOCK and 1278 * vfs_dontblock being set which would indicate that the fs 1279 * is on its way out and will not return therefore making 1280 * EIO the appropriate response. 1281 */ 1282 if (ULOCKFS_IS_HLOCK(ulp) || 1283 (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock)) 1284 return (EIO); 1285 1286 /* 1287 * wait for lock status to change 1288 */ 1289 if (slock || ufsvfsp->vfs_nointr) { 1290 cv_wait(&ulp->ul_cv, &ulp->ul_lock); 1291 } else { 1292 sigintr(&smask, 1); 1293 sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock); 1294 sigunintr(&smask); 1295 if ((!sig && (ulp->ul_fs_lock & mask)) || 1296 ufsvfsp->vfs_dontblock) 1297 return (EINTR); 1298 } 1299 } 1300 1301 if (mask & ULOCKFS_FWLOCK) { 1302 atomic_add_long(&ulp->ul_falloc_cnt, 1); 1303 ULOCKFS_SET_FALLOC(ulp); 1304 } else { 1305 atomic_add_long(&ulp->ul_vnops_cnt, 1); 1306 } 1307 1308 return (0); 1309 } 1310 1311 /* 1312 * Check whether we came across the handcrafted lockfs protocol path. We can't 1313 * simply check for T_DONTBLOCK here as one would assume since this can also 1314 * falsely catch recursive VOP's going to a different filesystem, instead we 1315 * check if we already hold the ulockfs->ul_lock mutex. 1316 */ 1317 static int 1318 ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp) 1319 { 1320 return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1); 1321 } 1322 1323 /* 1324 * ufs_lockfs_begin - start the lockfs locking protocol 1325 */ 1326 int 1327 ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask) 1328 { 1329 int error; 1330 int rec_vop; 1331 ushort_t op_cnt_incremented = 0; 1332 ulong_t *ctr; 1333 struct ulockfs *ulp; 1334 ulockfs_info_t *ulockfs_info; 1335 ulockfs_info_t *ulockfs_info_free; 1336 ulockfs_info_t *ulockfs_info_temp; 1337 1338 /* 1339 * file system has been forcibly unmounted 1340 */ 1341 if (ufsvfsp == NULL) 1342 return (EIO); 1343 1344 *ulpp = ulp = &ufsvfsp->vfs_ulockfs; 1345 1346 /* 1347 * Do lockfs protocol 1348 */ 1349 ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 1350 IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free); 1351 1352 /* 1353 * Detect recursive VOP call or handcrafted internal lockfs protocol 1354 * path and bail out in that case. 1355 */ 1356 if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) { 1357 *ulpp = NULL; 1358 return (0); 1359 } else { 1360 if (ulockfs_info_free == NULL) { 1361 if ((ulockfs_info_temp = (ulockfs_info_t *) 1362 kmem_zalloc(sizeof (ulockfs_info_t), 1363 KM_NOSLEEP)) == NULL) { 1364 *ulpp = NULL; 1365 return (ENOMEM); 1366 } 1367 } 1368 } 1369 1370 /* 1371 * First time VOP call 1372 * 1373 * Increment the ctr irrespective of the lockfs state. If the lockfs 1374 * state is not ULOCKFS_ULOCK, we can decrement it later. However, 1375 * before incrementing we need to check if there is a pending quiesce 1376 * request because if we have a continuous stream of ufs_lockfs_begin 1377 * requests pounding on a few cpu's then the ufs_quiesce thread might 1378 * never see the value of zero for ctr - a livelock kind of scenario. 1379 */ 1380 ctr = (mask & ULOCKFS_FWLOCK) ? 1381 &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt; 1382 if (!ULOCKFS_IS_SLOCK(ulp)) { 1383 atomic_add_long(ctr, 1); 1384 op_cnt_incremented++; 1385 } 1386 1387 /* 1388 * If the lockfs state (indicated by ul_fs_lock) is not just 1389 * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs 1390 * where there is a check with an appropriate mask to selectively allow 1391 * operations permitted for that kind of lockfs state. 1392 * 1393 * Even these selective operations should not be allowed to go through 1394 * if a lockfs request is in progress because that could result in inode 1395 * modifications during a quiesce and could hence result in inode 1396 * reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient, 1397 * so make use of ufs_quiesce_pend to disallow vnode operations when a 1398 * quiesce is in progress. 1399 */ 1400 if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) { 1401 if (op_cnt_incremented) 1402 if (!atomic_add_long_nv(ctr, -1)) 1403 cv_broadcast(&ulp->ul_cv); 1404 mutex_enter(&ulp->ul_lock); 1405 error = ufs_check_lockfs(ufsvfsp, ulp, mask); 1406 mutex_exit(&ulp->ul_lock); 1407 if (error) { 1408 if (ulockfs_info_free == NULL) 1409 kmem_free(ulockfs_info_temp, 1410 sizeof (ulockfs_info_t)); 1411 return (error); 1412 } 1413 } else { 1414 /* 1415 * This is the common case of file system in a unlocked state. 1416 * 1417 * If a file system is unlocked, we would expect the ctr to have 1418 * been incremented by now. But this will not be true when a 1419 * quiesce is winding up - SLOCK was set when we checked before 1420 * incrementing the ctr, but by the time we checked for 1421 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is okay 1422 * to take ul_lock and go through the slow path in this uncommon 1423 * case. 1424 */ 1425 if (op_cnt_incremented == 0) { 1426 mutex_enter(&ulp->ul_lock); 1427 error = ufs_check_lockfs(ufsvfsp, ulp, mask); 1428 if (error) { 1429 mutex_exit(&ulp->ul_lock); 1430 if (ulockfs_info_free == NULL) 1431 kmem_free(ulockfs_info_temp, 1432 sizeof (ulockfs_info_t)); 1433 return (error); 1434 } 1435 if (mask & ULOCKFS_FWLOCK) 1436 ULOCKFS_SET_FALLOC(ulp); 1437 mutex_exit(&ulp->ul_lock); 1438 } else if (mask & ULOCKFS_FWLOCK) { 1439 mutex_enter(&ulp->ul_lock); 1440 ULOCKFS_SET_FALLOC(ulp); 1441 mutex_exit(&ulp->ul_lock); 1442 } 1443 } 1444 1445 if (ulockfs_info_free != NULL) { 1446 ulockfs_info_free->ulp = ulp; 1447 if (mask & ULOCKFS_FWLOCK) 1448 ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE; 1449 } else { 1450 ulockfs_info_temp->ulp = ulp; 1451 ulockfs_info_temp->next = ulockfs_info; 1452 if (mask & ULOCKFS_FWLOCK) 1453 ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE; 1454 ASSERT(ufs_lockfs_key != 0); 1455 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp); 1456 } 1457 1458 curthread->t_flag |= T_DONTBLOCK; 1459 return (0); 1460 } 1461 1462 /* 1463 * Check whether we are returning from the top level VOP. 1464 */ 1465 static int 1466 ufs_lockfs_top_vop_return(ulockfs_info_t *head) 1467 { 1468 ulockfs_info_t *info; 1469 int result = 1; 1470 1471 for (info = head; info != NULL; info = info->next) { 1472 if (info->ulp != NULL) { 1473 result = 0; 1474 break; 1475 } 1476 } 1477 1478 return (result); 1479 } 1480 1481 /* 1482 * ufs_lockfs_end - terminate the lockfs locking protocol 1483 */ 1484 void 1485 ufs_lockfs_end(struct ulockfs *ulp) 1486 { 1487 ulockfs_info_t *info; 1488 ulockfs_info_t *head; 1489 1490 /* 1491 * end-of-VOP protocol 1492 */ 1493 if (ulp == NULL) 1494 return; 1495 1496 head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 1497 SEARCH_ULOCKFSP(head, ulp, info); 1498 1499 /* 1500 * If we're called from a first level VOP, we have to have a 1501 * valid ulockfs record in the TSD. 1502 */ 1503 ASSERT(info != NULL); 1504 1505 /* 1506 * Invalidate the ulockfs record. 1507 */ 1508 info->ulp = NULL; 1509 1510 if (ufs_lockfs_top_vop_return(head)) 1511 curthread->t_flag &= ~T_DONTBLOCK; 1512 1513 /* fallocate thread */ 1514 if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) { 1515 /* Clear the thread's fallocate state */ 1516 info->flags &= ~ULOCK_INFO_FALLOCATE; 1517 if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1)) { 1518 mutex_enter(&ulp->ul_lock); 1519 ULOCKFS_CLR_FALLOC(ulp); 1520 cv_broadcast(&ulp->ul_cv); 1521 mutex_exit(&ulp->ul_lock); 1522 } 1523 } else { /* normal thread */ 1524 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 1525 cv_broadcast(&ulp->ul_cv); 1526 } 1527 } 1528 1529 /* 1530 * ufs_lockfs_trybegin - try to start the lockfs locking protocol without 1531 * blocking. 1532 */ 1533 int 1534 ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask) 1535 { 1536 int error = 0; 1537 int rec_vop; 1538 ushort_t op_cnt_incremented = 0; 1539 ulong_t *ctr; 1540 struct ulockfs *ulp; 1541 ulockfs_info_t *ulockfs_info; 1542 ulockfs_info_t *ulockfs_info_free; 1543 ulockfs_info_t *ulockfs_info_temp; 1544 1545 /* 1546 * file system has been forcibly unmounted 1547 */ 1548 if (ufsvfsp == NULL) 1549 return (EIO); 1550 1551 *ulpp = ulp = &ufsvfsp->vfs_ulockfs; 1552 1553 /* 1554 * Do lockfs protocol 1555 */ 1556 ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 1557 IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free); 1558 1559 /* 1560 * Detect recursive VOP call or handcrafted internal lockfs protocol 1561 * path and bail out in that case. 1562 */ 1563 if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) { 1564 *ulpp = NULL; 1565 return (0); 1566 } else { 1567 if (ulockfs_info_free == NULL) { 1568 if ((ulockfs_info_temp = (ulockfs_info_t *) 1569 kmem_zalloc(sizeof (ulockfs_info_t), 1570 KM_NOSLEEP)) == NULL) { 1571 *ulpp = NULL; 1572 return (ENOMEM); 1573 } 1574 } 1575 } 1576 1577 /* 1578 * First time VOP call 1579 * 1580 * Increment the ctr irrespective of the lockfs state. If the lockfs 1581 * state is not ULOCKFS_ULOCK, we can decrement it later. However, 1582 * before incrementing we need to check if there is a pending quiesce 1583 * request because if we have a continuous stream of ufs_lockfs_begin 1584 * requests pounding on a few cpu's then the ufs_quiesce thread might 1585 * never see the value of zero for ctr - a livelock kind of scenario. 1586 */ 1587 ctr = (mask & ULOCKFS_FWLOCK) ? 1588 &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt; 1589 if (!ULOCKFS_IS_SLOCK(ulp)) { 1590 atomic_add_long(ctr, 1); 1591 op_cnt_incremented++; 1592 } 1593 1594 if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) { 1595 /* 1596 * Non-blocking version of ufs_check_lockfs() code. 1597 * 1598 * If the file system is not hard locked or error locked 1599 * and if ulp->ul_fs_lock allows this operation, increment 1600 * the appropriate counter and proceed (For eg., In case the 1601 * file system is delete locked, a mmap can still go through). 1602 */ 1603 if (op_cnt_incremented) 1604 if (!atomic_add_long_nv(ctr, -1)) 1605 cv_broadcast(&ulp->ul_cv); 1606 mutex_enter(&ulp->ul_lock); 1607 if (ULOCKFS_IS_HLOCK(ulp) || 1608 (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock)) 1609 error = EIO; 1610 else if (ulp->ul_fs_lock & mask) 1611 error = EAGAIN; 1612 1613 if (error) { 1614 mutex_exit(&ulp->ul_lock); 1615 if (ulockfs_info_free == NULL) 1616 kmem_free(ulockfs_info_temp, 1617 sizeof (ulockfs_info_t)); 1618 return (error); 1619 } 1620 atomic_add_long(ctr, 1); 1621 if (mask & ULOCKFS_FWLOCK) 1622 ULOCKFS_SET_FALLOC(ulp); 1623 mutex_exit(&ulp->ul_lock); 1624 } else { 1625 /* 1626 * This is the common case of file system in a unlocked state. 1627 * 1628 * If a file system is unlocked, we would expect the ctr to have 1629 * been incremented by now. But this will not be true when a 1630 * quiesce is winding up - SLOCK was set when we checked before 1631 * incrementing the ctr, but by the time we checked for 1632 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take 1633 * ul_lock and go through the non-blocking version of 1634 * ufs_check_lockfs() code. 1635 */ 1636 if (op_cnt_incremented == 0) { 1637 mutex_enter(&ulp->ul_lock); 1638 if (ULOCKFS_IS_HLOCK(ulp) || 1639 (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock)) 1640 error = EIO; 1641 else if (ulp->ul_fs_lock & mask) 1642 error = EAGAIN; 1643 1644 if (error) { 1645 mutex_exit(&ulp->ul_lock); 1646 if (ulockfs_info_free == NULL) 1647 kmem_free(ulockfs_info_temp, 1648 sizeof (ulockfs_info_t)); 1649 return (error); 1650 } 1651 atomic_add_long(ctr, 1); 1652 if (mask & ULOCKFS_FWLOCK) 1653 ULOCKFS_SET_FALLOC(ulp); 1654 mutex_exit(&ulp->ul_lock); 1655 } else if (mask & ULOCKFS_FWLOCK) { 1656 mutex_enter(&ulp->ul_lock); 1657 ULOCKFS_SET_FALLOC(ulp); 1658 mutex_exit(&ulp->ul_lock); 1659 } 1660 } 1661 1662 if (ulockfs_info_free != NULL) { 1663 ulockfs_info_free->ulp = ulp; 1664 if (mask & ULOCKFS_FWLOCK) 1665 ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE; 1666 } else { 1667 ulockfs_info_temp->ulp = ulp; 1668 ulockfs_info_temp->next = ulockfs_info; 1669 if (mask & ULOCKFS_FWLOCK) 1670 ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE; 1671 ASSERT(ufs_lockfs_key != 0); 1672 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp); 1673 } 1674 1675 curthread->t_flag |= T_DONTBLOCK; 1676 return (0); 1677 } 1678 1679 /* 1680 * specialized version of ufs_lockfs_begin() called by ufs_getpage(). 1681 */ 1682 int 1683 ufs_lockfs_begin_getpage( 1684 struct ufsvfs *ufsvfsp, 1685 struct ulockfs **ulpp, 1686 struct seg *seg, 1687 int read_access, 1688 uint_t *protp) 1689 { 1690 ulong_t mask; 1691 int error; 1692 int rec_vop; 1693 struct ulockfs *ulp; 1694 ulockfs_info_t *ulockfs_info; 1695 ulockfs_info_t *ulockfs_info_free; 1696 ulockfs_info_t *ulockfs_info_temp; 1697 1698 /* 1699 * file system has been forcibly unmounted 1700 */ 1701 if (ufsvfsp == NULL) 1702 return (EIO); 1703 1704 *ulpp = ulp = &ufsvfsp->vfs_ulockfs; 1705 1706 /* 1707 * Do lockfs protocol 1708 */ 1709 ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 1710 IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free); 1711 1712 /* 1713 * Detect recursive VOP call or handcrafted internal lockfs protocol 1714 * path and bail out in that case. 1715 */ 1716 if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) { 1717 *ulpp = NULL; 1718 return (0); 1719 } else { 1720 if (ulockfs_info_free == NULL) { 1721 if ((ulockfs_info_temp = (ulockfs_info_t *) 1722 kmem_zalloc(sizeof (ulockfs_info_t), 1723 KM_NOSLEEP)) == NULL) { 1724 *ulpp = NULL; 1725 return (ENOMEM); 1726 } 1727 } 1728 } 1729 1730 /* 1731 * First time VOP call 1732 */ 1733 atomic_add_long(&ulp->ul_vnops_cnt, 1); 1734 if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) { 1735 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 1736 cv_broadcast(&ulp->ul_cv); 1737 mutex_enter(&ulp->ul_lock); 1738 if (seg->s_ops == &segvn_ops && 1739 ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) { 1740 mask = (ulong_t)ULOCKFS_GETREAD_MASK; 1741 } else if (protp && read_access) { 1742 /* 1743 * Restrict the mapping to readonly. 1744 * Writes to this mapping will cause 1745 * another fault which will then 1746 * be suspended if fs is write locked 1747 */ 1748 *protp &= ~PROT_WRITE; 1749 mask = (ulong_t)ULOCKFS_GETREAD_MASK; 1750 } else 1751 mask = (ulong_t)ULOCKFS_GETWRITE_MASK; 1752 1753 /* 1754 * will sleep if this fs is locked against this VOP 1755 */ 1756 error = ufs_check_lockfs(ufsvfsp, ulp, mask); 1757 mutex_exit(&ulp->ul_lock); 1758 if (error) { 1759 if (ulockfs_info_free == NULL) 1760 kmem_free(ulockfs_info_temp, 1761 sizeof (ulockfs_info_t)); 1762 return (error); 1763 } 1764 } 1765 1766 if (ulockfs_info_free != NULL) { 1767 ulockfs_info_free->ulp = ulp; 1768 } else { 1769 ulockfs_info_temp->ulp = ulp; 1770 ulockfs_info_temp->next = ulockfs_info; 1771 ASSERT(ufs_lockfs_key != 0); 1772 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp); 1773 } 1774 1775 curthread->t_flag |= T_DONTBLOCK; 1776 return (0); 1777 } 1778 1779 void 1780 ufs_lockfs_tsd_destructor(void *head) 1781 { 1782 ulockfs_info_t *curr = (ulockfs_info_t *)head; 1783 ulockfs_info_t *temp; 1784 1785 for (; curr != NULL; ) { 1786 /* 1787 * The TSD destructor is being called when the thread exits 1788 * (via thread_exit()). At that time it must have cleaned up 1789 * all VOPs via ufs_lockfs_end() and there must not be a 1790 * valid ulockfs record exist while a thread is exiting. 1791 */ 1792 temp = curr; 1793 curr = curr->next; 1794 ASSERT(temp->ulp == NULL); 1795 kmem_free(temp, sizeof (ulockfs_info_t)); 1796 } 1797 } 1798