/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmmeter.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/acct.h>
#include <sys/dnlc.h>
#include <sys/swap.h>

#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_mount.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_quota.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>
#include <sys/sysinfo.h>

#include <vm/hat.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <vm/anon.h>
#include <sys/swap.h>
#include <sys/dnlc.h>

extern struct vnode *common_specvp(struct vnode *vp);

/* error lock status */
#define	UN_ERRLCK	(-1)
#define	SET_ERRLCK	1
#define	RE_ERRLCK	2
#define	NO_ERRLCK	0

/*
 * Index to be used in TSD for storing lockfs data
 */
uint_t ufs_lockfs_key;

typedef struct _ulockfs_info {
	struct _ulockfs_info *next;
	struct ulockfs *ulp;
	uint_t flags;
} ulockfs_info_t;

#define	ULOCK_INFO_FALLOCATE	0x00000001	/* fallocate thread */

/*
 * Check in TSD whether we are already doing any VOP on this filesystem
 */
#define	IS_REC_VOP(found, head, ulp, free)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (found = 0, free = NULL, _curr = head;	\
	    _curr != NULL; _curr = _curr->next) {	\
		if ((free == NULL) &&			\
		    (_curr->ulp == NULL))		\
			free = _curr;			\
		if (_curr->ulp == ulp) {		\
			found = 1;			\
			break;				\
		}					\
	}						\
}

/*
 * Get the lockfs data from TSD so that lockfs handles the recursive VOP
 * properly
 */
#define	SEARCH_ULOCKFSP(head, ulp, info)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (_curr = head; _curr != NULL;		\
	    _curr = _curr->next) {			\
		if (_curr->ulp == ulp) {		\
			break;				\
		}					\
	}						\
							\
	info = _curr;					\
}

/*
 * Validate lockfs request
 */
static int
ufs_getlfd(
	struct lockfs *lockfsp,		/* new lock request */
	struct lockfs *ul_lockfsp)	/* old lock state */
{
	int error = 0;

	/*
	 * no input flags defined
	 */
	if (lockfsp->lf_flags != 0) {
		error = EINVAL;
		goto errout;
	}

	/*
	 * check key
	 */
	if (!LOCKFS_IS_ULOCK(ul_lockfsp))
		if (lockfsp->lf_key != ul_lockfsp->lf_key) {
			error = EINVAL;
			goto errout;
		}

	lockfsp->lf_key = ul_lockfsp->lf_key + 1;

errout:
	return (error);
}

/*
 * ufs_checkaccton
 *	check whether accounting is enabled on this fs
 */
int
ufs_checkaccton(struct vnode *vp)
{
	if (acct_fs_in_use(vp))
		return (EDEADLK);
	return (0);
}

/*
 * ufs_checkswapon
 *	check whether local swapping is to a file on this fs
 */
int
ufs_checkswapon(struct vnode *vp)
{
	struct swapinfo *sip;

	mutex_enter(&swapinfo_lock);
	for (sip = swapinfo; sip; sip = sip->si_next)
		if (sip->si_vp->v_vfsp == vp->v_vfsp) {
			mutex_exit(&swapinfo_lock);
			return (EDEADLK);
		}
	mutex_exit(&swapinfo_lock);
	return (0);
}

/*
 * ufs_freeze
 *	pend future accesses for current lock and desired lock
 */
void
ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
{
	/*
	 * set to new lock type
	 */
	ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
	ulp->ul_lockfs.lf_key = lockfsp->lf_key;
	ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
	ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;

	ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
}

/*
 * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
 * starting the ufs_quiesce() protocol and decrement it only when the file
 * system no longer has to be in a quiescent state.  This allows ufs_pageio()
 * to detect that another thread wants to quiesce a file system.  See more
 * comments in ufs_pageio().
 */
ulong_t ufs_quiesce_pend = 0;
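
/*
 * A condensed sketch of the caller protocol described above, following
 * what ufs__fiolfs() below actually does (not a standalone example):
 *
 *	atomic_add_long(&ufs_quiesce_pend, 1);
 *	...
 *	error = ufs_quiesce(ulp);	(called with ul_lock held)
 *	...
 *	atomic_add_long(&ufs_quiesce_pend, -1);
 *
 * ufs_lockfs_begin() and friends read ufs_quiesce_pend without holding
 * ul_lock so that vnode operations take the slow path while a quiesce
 * is pending.
 */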

/*
 * ufs_quiesce
 *	wait for outstanding accesses to finish
 */
int
ufs_quiesce(struct ulockfs *ulp)
{
	int error = 0;
	ulockfs_info_t *head;
	ulockfs_info_t *info;
	klwp_t *lwp = ttolwp(curthread);

	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * We have to keep /proc away from stopping us after we applied
	 * the softlock but before we got a chance to clear it again.
	 * prstop() may pagefault and become stuck on the softlock still
	 * pending.
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	/*
	 * Set a softlock to suspend future ufs_vnops so that
	 * this lockfs request will not be starved
	 */
	ULOCKFS_SET_SLOCK(ulp);
	ASSERT(ufs_quiesce_pend);

	/* check if there are any outstanding ufs vnodeop calls */
	while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
		/*
		 * use the timed version of cv_wait_sig() to make sure we
		 * don't miss a wake up call from ufs_pageio() when it
		 * doesn't use ul_lock.
		 *
		 * when a fallocate thread comes in, the only way it returns
		 * from this function is if there are no other vnode
		 * operations going on (remember fallocate threads are
		 * tracked using ul_falloc_cnt not ul_vnops_cnt), and another
		 * fallocate thread hasn't already grabbed the fs write lock.
		 */
		if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
			if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
				goto out;
		}
		if (!cv_reltimedwait_sig(&ulp->ul_cv, &ulp->ul_lock, hz,
		    TR_CLOCK_TICK)) {
			error = EINTR;
			goto out;
		}
	}

out:
	/*
	 * unlock the soft lock
	 */
	ULOCKFS_CLR_SLOCK(ulp);

	if (lwp != NULL)
		lwp->lwp_nostop--;

	return (error);
}

/*
 * ufs_flush_inode
 */
int
ufs_flush_inode(struct inode *ip, void *arg)
{
	int error;
	int saverror = 0;

	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * asynchronously push all the dirty pages
	 */
	if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
	    (error != EAGAIN))
		saverror = error;
	/*
	 * wait for io and discard all mappings
	 */
	if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
		saverror = error;

	if (ITOV(ip)->v_type == VDIR) {
		dnlc_dir_purge(&ip->i_danchor);
	}

	return (saverror);
}
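
/*
 * Note: ufs_flush_inode() is written as a callback for ufs_scan_inodes(),
 * which visits every incore inode; returning 0 for an inode that belongs
 * to another file system simply keeps the scan going, while a nonzero
 * return is reported back to the caller (see the ufs_scan_inodes() call
 * in ufs_flush() below).
 */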

/*
 * ufs_flush
 *	Flush everything that is currently dirty; this includes invalidating
 *	any mappings.
 */
int
ufs_flush(struct vfs *vfsp)
{
	int error;
	int saverror = 0;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	struct fs *fs = ufsvfsp->vfs_fs;
	int tdontblock = 0;

	ASSERT(vfs_lock_held(vfsp));

	/*
	 * purge dnlc
	 */
	(void) dnlc_purge_vfsp(vfsp, 0);

	/*
	 * drain the delete and idle threads
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * flush and invalidate quota records
	 */
	(void) qsync(ufsvfsp);

	/*
	 * flush w/invalidate the inodes for vfsp
	 */
	if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
		saverror = error;

	/*
	 * synchronously flush superblock and summary info
	 */
	if (fs->fs_ronly == 0 && fs->fs_fmod) {
		fs->fs_fmod = 0;
		TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
	}
	/*
	 * flush w/invalidate block device pages and buf cache
	 */
	if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
	    (offset_t)0, 0, B_INVAL, CRED(), NULL)) > 0)
		saverror = error;

	(void) bflush((dev_t)vfsp->vfs_dev);
	(void) bfinval((dev_t)vfsp->vfs_dev, 0);

	/*
	 * drain the delete and idle threads again
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * play with the clean flag
	 */
	if (saverror == 0)
		ufs_checkclean(vfsp);

	/*
	 * Flush any outstanding transactions and roll the log, but only
	 * if we are supposed to, i.e. LDL_NOROLL is not set.  We can not
	 * simply check for fs_ronly here since fsck may also use this code
	 * to roll the log on a read-only filesystem, e.g. root during early
	 * stages of boot; if anything other than a sanity check is done,
	 * fsck clears LDL_NOROLL beforehand.  In addition we assert that
	 * the deltamap does not contain any deltas in case LDL_NOROLL is
	 * set, since this is not supposed to happen.
	 */
	if (TRANS_ISTRANS(ufsvfsp)) {
		ml_unit_t *ul = ufsvfsp->vfs_log;
		mt_map_t *mtm = ul->un_deltamap;

		if (ul->un_flags & LDL_NOROLL) {
			ASSERT(mtm->mtm_nme == 0);
		} else {
			/*
			 * Do not set T_DONTBLOCK if there is a
			 * transaction opened by the caller.
			 */
			if (curthread->t_flag & T_DONTBLOCK)
				tdontblock = 1;
			else
				curthread->t_flag |= T_DONTBLOCK;

			TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
			    TOP_COMMIT_SIZE, error);

			if (!error) {
				TRANS_END_SYNC(ufsvfsp, saverror,
				    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
			}

			if (tdontblock == 0)
				curthread->t_flag &= ~T_DONTBLOCK;

			logmap_roll_dev(ufsvfsp->vfs_log);
		}
	}

	return (saverror);
}
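
/*
 * The T_DONTBLOCK handling above is the usual pattern for opening a
 * synchronous transaction from a context that may already be inside one:
 * remember whether the caller had T_DONTBLOCK set, set it if not, and
 * restore the original state when done.  Condensed (assuming the caller
 * holds no transaction):
 *
 *	curthread->t_flag |= T_DONTBLOCK;
 *	TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE, error);
 *	if (!error)
 *		TRANS_END_SYNC(ufsvfsp, saverror, TOP_COMMIT_FLUSH,
 *		    TOP_COMMIT_SIZE);
 *	curthread->t_flag &= ~T_DONTBLOCK;
 */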

/*
 * ufs_thaw_wlock
 *	special processing when thawing down to wlock
 */
static int
ufs_thaw_wlock(struct inode *ip, void *arg)
{
	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * iupdat refuses to clear flags if the fs is read only.  The fs
	 * may become read/write during the lock and we wouldn't want
	 * these inodes being written to disk.  So clear the flags.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
	rw_exit(&ip->i_contents);

	/*
	 * pages are mlocked -- fail wlock
	 */
	if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
		return (EBUSY);

	return (0);
}

/*
 * ufs_thaw_hlock
 *	special processing when thawing down to hlock or elock
 */
static int
ufs_thaw_hlock(struct inode *ip, void *arg)
{
	struct vnode *vp = ITOV(ip);

	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * blow away all pages - even if they are mlocked
	 */
	do {
		(void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0,
		    TOP_SYNCIP_HLOCK);
	} while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
	rw_enter(&ip->i_contents, RW_WRITER);
	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * ufs_thaw
 *	thaw file system lock down to current value
 */
int
ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
{
	int error = 0;
	int noidel = (int)(ulp->ul_flag & ULOCKFS_NOIDEL);

	/*
	 * if wlock or hlock or elock
	 */
	if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
	    ULOCKFS_IS_ELOCK(ulp)) {

		/*
		 * don't keep access times
		 * don't free deleted files
		 * if superblock writes are allowed, limit them to me for now
		 */
		ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
		if (ulp->ul_sbowner != (kthread_id_t)-1)
			ulp->ul_sbowner = curthread;

		/*
		 * wait for writes for deleted files and superblock updates
		 */
		(void) ufs_flush(vfsp);

		/*
		 * now make sure the quota file is up-to-date
		 *	expensive; but effective
		 */
		error = ufs_flush(vfsp);
		/*
		 * no one can write the superblock
		 */
		ulp->ul_sbowner = (kthread_id_t)-1;

		/*
		 * special processing for wlock/hlock/elock
		 */
		if (ULOCKFS_IS_WLOCK(ulp)) {
			if (error)
				goto errout;
			error = bfinval(ufsvfsp->vfs_dev, 0);
			if (error)
				goto errout;
			error = ufs_scan_inodes(0, ufs_thaw_wlock,
			    (void *)ufsvfsp, ufsvfsp);
			if (error)
				goto errout;
		}
		if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
			error = 0;
			(void) ufs_scan_inodes(0, ufs_thaw_hlock,
			    (void *)ufsvfsp, ufsvfsp);
			(void) bfinval(ufsvfsp->vfs_dev, 1);
		}
	} else {

		/*
		 * okay to keep access times
		 * okay to free deleted files
		 * okay to write the superblock
		 */
		ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
		ulp->ul_sbowner = NULL;

		/*
		 * flush in case deleted files are in memory
		 */
		if (noidel) {
			if (error = ufs_flush(vfsp))
				goto errout;
		}
	}

errout:
	cv_broadcast(&ulp->ul_cv);
	return (error);
}
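
/*
 * A note on the ul_sbowner convention used by ufs_thaw() above: NULL
 * means any thread may write the superblock, a specific thread pointer
 * restricts superblock writes to that thread, and (kthread_id_t)-1
 * means no thread may write it at all.
 */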

/*
 * ufs_reconcile_fs
 *	reconcile incore superblock with ondisk superblock
 */
int
ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
{
	struct fs *mfs;		/* in-memory superblock */
	struct fs *dfs;		/* on-disk superblock */
	struct buf *bp;		/* on-disk superblock buf */
	int needs_unlock;
	char finished_fsclean;

	mfs = ufsvfsp->vfs_fs;

	/*
	 * get the on-disk copy of the superblock
	 */
	bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
	bp->b_flags |= (B_STALE|B_AGE);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	dfs = bp->b_un.b_fs;

	/* error locks may only unlock after the fs has been made consistent */
	if (errlck == UN_ERRLCK) {
		if (dfs->fs_clean == FSFIX) {	/* being repaired */
			brelse(bp);
			return (EAGAIN);
		}
		/* repair not yet started? */
		finished_fsclean = TRANS_ISTRANS(ufsvfsp) ? FSLOG : FSCLEAN;
		if (dfs->fs_clean != finished_fsclean) {
			brelse(bp);
			return (EBUSY);
		}
	}

	/*
	 * if the superblock has changed too much, abort
	 */
	if ((mfs->fs_sblkno != dfs->fs_sblkno) ||
	    (mfs->fs_cblkno != dfs->fs_cblkno) ||
	    (mfs->fs_iblkno != dfs->fs_iblkno) ||
	    (mfs->fs_dblkno != dfs->fs_dblkno) ||
	    (mfs->fs_cgoffset != dfs->fs_cgoffset) ||
	    (mfs->fs_cgmask != dfs->fs_cgmask) ||
	    (mfs->fs_bsize != dfs->fs_bsize) ||
	    (mfs->fs_fsize != dfs->fs_fsize) ||
	    (mfs->fs_frag != dfs->fs_frag) ||
	    (mfs->fs_bmask != dfs->fs_bmask) ||
	    (mfs->fs_fmask != dfs->fs_fmask) ||
	    (mfs->fs_bshift != dfs->fs_bshift) ||
	    (mfs->fs_fshift != dfs->fs_fshift) ||
	    (mfs->fs_fragshift != dfs->fs_fragshift) ||
	    (mfs->fs_fsbtodb != dfs->fs_fsbtodb) ||
	    (mfs->fs_sbsize != dfs->fs_sbsize) ||
	    (mfs->fs_nindir != dfs->fs_nindir) ||
	    (mfs->fs_nspf != dfs->fs_nspf) ||
	    (mfs->fs_trackskew != dfs->fs_trackskew) ||
	    (mfs->fs_cgsize != dfs->fs_cgsize) ||
	    (mfs->fs_ntrak != dfs->fs_ntrak) ||
	    (mfs->fs_nsect != dfs->fs_nsect) ||
	    (mfs->fs_spc != dfs->fs_spc) ||
	    (mfs->fs_cpg != dfs->fs_cpg) ||
	    (mfs->fs_ipg != dfs->fs_ipg) ||
	    (mfs->fs_fpg != dfs->fs_fpg) ||
	    (mfs->fs_postblformat != dfs->fs_postblformat) ||
	    (mfs->fs_magic != dfs->fs_magic)) {
		brelse(bp);
		return (EACCES);
	}
	if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
		if (mfs->fs_clean == FSLOG) {
			brelse(bp);
			return (EACCES);
		}

	/*
	 * get new summary info
	 */
	if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
		brelse(bp);
		return (EIO);
	}

	/*
	 * release old summary info and update in-memory superblock
	 */
	kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
	mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;	/* Only entry 0 used */

	/*
	 * update fields allowed to change
	 */
	mfs->fs_size = dfs->fs_size;
	mfs->fs_dsize = dfs->fs_dsize;
	mfs->fs_ncg = dfs->fs_ncg;
	mfs->fs_minfree = dfs->fs_minfree;
	mfs->fs_rotdelay = dfs->fs_rotdelay;
	mfs->fs_rps = dfs->fs_rps;
	mfs->fs_maxcontig = dfs->fs_maxcontig;
	mfs->fs_maxbpg = dfs->fs_maxbpg;
	mfs->fs_csmask = dfs->fs_csmask;
	mfs->fs_csshift = dfs->fs_csshift;
	mfs->fs_optim = dfs->fs_optim;
	mfs->fs_csaddr = dfs->fs_csaddr;
	mfs->fs_cssize = dfs->fs_cssize;
	mfs->fs_ncyl = dfs->fs_ncyl;
	mfs->fs_cstotal = dfs->fs_cstotal;
	mfs->fs_reclaim = dfs->fs_reclaim;

	if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
		mfs->fs_reclaim &= ~FS_RECLAIM;
		mfs->fs_reclaim |= FS_RECLAIMING;
		ufs_thread_start(&ufsvfsp->vfs_reclaim,
		    ufs_thread_reclaim, vfsp);
	}

	/* XXX What to do about sparecon? */

	/* XXX need to copy volume label */
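
	/*
	 * The fs_state tests below rely on the UFS superblock invariant
	 * that a valid superblock satisfies fs_state + fs_time == FSOKAY;
	 * any drift from that sum marks the superblock (and hence the
	 * file system) bad.
	 */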
	/*
	 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
	 * or if error-locked and ondisk is now clean
	 */
	needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
	if (needs_unlock)
		mutex_enter(&ufsvfsp->vfs_lock);

	if (errlck == UN_ERRLCK) {
		if (finished_fsclean == dfs->fs_clean)
			mfs->fs_clean = finished_fsclean;
		else
			mfs->fs_clean = FSBAD;
		mfs->fs_state = FSOKAY - dfs->fs_time;
	}

	if (FSOKAY != dfs->fs_state + dfs->fs_time ||
	    (dfs->fs_clean == FSBAD))
		mfs->fs_clean = FSBAD;

	if (needs_unlock)
		mutex_exit(&ufsvfsp->vfs_lock);

	brelse(bp);

	return (0);
}

/*
 * ufs_reconcile_inode
 *	reconcile ondisk inode with incore inode
 */
static int
ufs_reconcile_inode(struct inode *ip, void *arg)
{
	int i;
	int ndaddr;
	int niaddr;
	struct dinode *dp;	/* ondisk inode */
	struct buf *bp = NULL;
	uid_t d_uid;
	gid_t d_gid;
	int error = 0;
	struct fs *fs;

	/*
	 * not an inode we care about
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	fs = ip->i_fs;

	/*
	 * This is where inode reconciliation fails: we made the filesystem
	 * quiescent and we did a ufs_flush() before calling
	 * ufs_reconcile_inode(), and thus the inode should not have been
	 * changed in between.  Any discrepancies indicate a logic error and
	 * a pretty significant run-state inconsistency we should complain
	 * about.
	 */
	if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
		cmn_err(CE_WARN, "%s: Inode reconciliation failed for "
		    "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
		return (EINVAL);
	}

	/*
	 * get the dinode
	 */
	bp = UFS_BREAD(ip->i_ufsvfs,
	    ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
	    (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	dp = bp->b_un.b_dino;
	dp += itoo(fs, ip->i_number);

	/*
	 * handle Sun's implementation of EFT
	 */
	d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
	d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (gid_t)dp->di_sgid;
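
	/*
	 * Sun's EFT encoding, undone by the two ternaries above: the short
	 * di_suid/di_sgid fields hold the id directly unless they are
	 * pegged at UID_LONG/GID_LONG, in which case the full 32-bit id
	 * lives in di_uid/di_gid instead.
	 */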

	rw_enter(&ip->i_contents, RW_WRITER);

	/*
	 * some fields are not allowed to change
	 */
	if ((ip->i_mode != dp->di_mode) ||
	    (ip->i_gen != dp->di_gen) ||
	    (ip->i_uid != d_uid) ||
	    (ip->i_gid != d_gid)) {
		error = EACCES;
		goto out;
	}

	/*
	 * and some are allowed to change
	 */
	ip->i_size = dp->di_size;
	ip->i_ic.ic_flags = dp->di_ic.ic_flags;
	ip->i_blocks = dp->di_blocks;
	ip->i_nlink = dp->di_nlink;
	if (ip->i_flag & IFASTSYMLNK) {
		ndaddr = 1;
		niaddr = 0;
	} else {
		ndaddr = NDADDR;
		niaddr = NIADDR;
	}
	for (i = 0; i < ndaddr; ++i)
		ip->i_db[i] = dp->di_db[i];
	for (i = 0; i < niaddr; ++i)
		ip->i_ib[i] = dp->di_ib[i];

out:
	rw_exit(&ip->i_contents);
	brelse(bp);
	return (error);
}

/*
 * ufs_reconcile
 *	reconcile ondisk superblock/inodes with any incore
 */
static int
ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
{
	int error = 0;

	/*
	 * get rid of as much inmemory data as possible
	 */
	(void) ufs_flush(vfsp);

	/*
	 * reconcile the superblock and inodes
	 */
	if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
		return (error);
	if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
		return (error);
	/*
	 * allocation blocks may be incorrect; get rid of them
	 */
	(void) ufs_flush(vfsp);

	return (error);
}

/*
 * File system locking
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
	return (ufs__fiolfs(vp, lockfsp, /* from_user */ 1, from_log));
}

/* kernel-internal interface, also used by fix-on-panic */
int
ufs__fiolfs(
	struct vnode *vp,
	struct lockfs *lockfsp,
	int from_user,
	int from_log)
{
	struct ulockfs *ulp;
	struct lockfs lfs;
	int error;
	struct vfs *vfsp;
	struct ufsvfs *ufsvfsp;
	int errlck = NO_ERRLCK;
	int poll_events = POLLPRI;
	extern struct pollhead ufs_pollhd;
	ulockfs_info_t *head;
	ulockfs_info_t *info;
	int signal = 0;

	/* check for a valid lock type */
	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
		return (EINVAL);

	if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
		return (EIO);

	vfsp = vp->v_vfsp;

	if (vfsp->vfs_flag & VFS_UNMOUNTED)	/* has been unmounted */
		return (EIO);

	/* take the lock and check again */
	vfs_lock_wait(vfsp);
	if (vfsp->vfs_flag & VFS_UNMOUNTED) {
		vfs_unlock(vfsp);
		return (EIO);
	}

	/*
	 * Can't wlock or ro/elock an fs with accounting or a local swap
	 * file.  We need to check for this before we grab the ul_lock to
	 * avoid deadlocks with the accounting framework.
	 */
	if ((LOCKFS_IS_WLOCK(lockfsp) || LOCKFS_IS_ELOCK(lockfsp) ||
	    LOCKFS_IS_ROELOCK(lockfsp)) && !from_log) {
		if (ufs_checkaccton(vp) || ufs_checkswapon(vp)) {
			vfs_unlock(vfsp);
			return (EDEADLK);
		}
	}

	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	ulp = &ufsvfsp->vfs_ulockfs;
	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);
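
	/*
	 * If this thread is already inside a VOP on this file system,
	 * "info" is non-NULL here; the only use made of it below is the
	 * fallocate special-casing keyed on ULOCK_INFO_FALLOCATE.
	 */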

	/*
	 * Suspend both the reclaim thread and the delete thread.
	 * This must be done outside the lockfs locking protocol.
	 */
	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
	ufs_thread_suspend(&ufsvfsp->vfs_delete);

	mutex_enter(&ulp->ul_lock);
	atomic_add_long(&ufs_quiesce_pend, 1);

	/*
	 * Quit if there is another lockfs request in progress
	 * that is waiting for existing ufs_vnops to complete.
	 */
	if (ULOCKFS_IS_BUSY(ulp)) {
		error = EBUSY;
		goto errexit;
	}

	/* cannot unlock or downgrade a hard-lock */
	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto errexit;
	}

	/* an error lock may be unlocked or relocked, only */
	if (ULOCKFS_IS_ELOCK(ulp)) {
		if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * a read-only error lock may only be upgraded to an
	 * error lock or hard lock
	 */
	if (ULOCKFS_IS_ROELOCK(ulp)) {
		if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * until read-only error locks are fully implemented
	 * just return EINVAL
	 */
	if (LOCKFS_IS_ROELOCK(lockfsp)) {
		error = EINVAL;
		goto errexit;
	}

	/*
	 * an error lock may only be applied if the file system is
	 * unlocked or already error locked.
	 * (this is to prevent the case where a fs gets changed out from
	 * underneath a fs that is locked for backup,
	 * that is, name/delete/write-locked.)
	 */
	if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
	    !ULOCKFS_IS_ROELOCK(ulp)) &&
	    (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
		error = EBUSY;
		goto errexit;
	}

	/* get and validate the input lockfs request */
	if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
		goto errexit;

	/*
	 * save the current ulockfs struct
	 */
	bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));

	/*
	 * Freeze the file system (pend future accesses)
	 */
	ufs_freeze(ulp, lockfsp);

	/*
	 * Set locking in progress because ufs_quiesce may free the
	 * ul_lock mutex.
	 */
	ULOCKFS_SET_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_SET_BUSY(&ulp->ul_lockfs);

	/*
	 * We need to unset FWLOCK status before we call ufs_quiesce
	 * so that the thread doesn't get suspended.  We do this only if
	 * this (fallocate) thread requested an unlock operation.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (!ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_CLR_FWLOCK(ulp);
	}

	/*
	 * Quiesce (wait for outstanding accesses to finish)
	 */
	if (error = ufs_quiesce(ulp)) {
		/*
		 * Interrupted due to signal.  There could still be
		 * pending vnops.
		 */
		signal = 1;

		/*
		 * We do broadcast because lock-status
		 * could be reverted to old status.
		 */
		cv_broadcast(&ulp->ul_cv);
		goto errout;
	}

	/*
	 * If the fallocate thread requested a write fs lock operation
	 * then we set fwlock status in the ulp.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_SET_FWLOCK(ulp);
	}
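
	/*
	 * From here on, errlck records what error-lock cleanup is owed at
	 * the end of this call: UN_ERRLCK means an existing error lock is
	 * being unlocked (the on-disk state must be reconciled first),
	 * RE_ERRLCK means an error lock is being reapplied, SET_ERRLCK
	 * means a fresh error lock is being applied, and NO_ERRLCK (the
	 * initial value) means no cleanup is needed.
	 */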

	/*
	 * save error lock status to pass down to the reconciliation
	 * routines and for later cleanup
	 */
	if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
		errlck = UN_ERRLCK;

	if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
		int needs_unlock;
		int needs_sbwrite;

		poll_events |= POLLERR;
		errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ?
		    RE_ERRLCK : SET_ERRLCK;

		needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
		if (needs_unlock)
			mutex_enter(&ufsvfsp->vfs_lock);

		/* disable delayed i/o */
		needs_sbwrite = 0;

		if (errlck == SET_ERRLCK) {
			ufsvfsp->vfs_fs->fs_clean = FSBAD;
			needs_sbwrite = 1;
		}

		needs_sbwrite |= ufsvfsp->vfs_dio;
		ufsvfsp->vfs_dio = 0;

		if (needs_unlock)
			mutex_exit(&ufsvfsp->vfs_lock);

		if (needs_sbwrite) {
			ulp->ul_sbowner = curthread;
			TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);

			if (needs_unlock)
				mutex_enter(&ufsvfsp->vfs_lock);

			ufsvfsp->vfs_fs->fs_fmod = 0;

			if (needs_unlock)
				mutex_exit(&ufsvfsp->vfs_lock);
		}
	}

	/*
	 * reconcile superblock and inodes if it was wlocked
	 */
	if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
		if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
			goto errout;
		/*
		 * in case the fs grew; reset the metadata map for logging
		 * tests
		 */
		TRANS_MATA_UMOUNT(ufsvfsp);
		TRANS_MATA_MOUNT(ufsvfsp);
		TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
	}

	/*
	 * At least everything *currently* dirty goes out.
	 */
	if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
	    !ULOCKFS_IS_ELOCK(ulp))
		goto errout;

	/*
	 * thaw the file system and wake up pended processes
	 */
	if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
		if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
			goto errout;

	/*
	 * reset modified flag if not already write locked
	 */
	if (!LOCKFS_IS_WLOCK(&lfs))
		ULOCKFS_CLR_MOD(ulp);

	/*
	 * idle the lock struct
	 */
	ULOCKFS_CLR_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

	/*
	 * free the current comment
	 */
	if (lfs.lf_comment && lfs.lf_comlen != 0) {
		kmem_free(lfs.lf_comment, lfs.lf_comlen);
		lfs.lf_comment = NULL;
		lfs.lf_comlen = 0;
	}

	/* do error lock cleanup */
	if (errlck == UN_ERRLCK)
		ufsfx_unlockfs(ufsvfsp);

	else if (errlck == RE_ERRLCK)
		ufsfx_lockfs(ufsvfsp);

	/* don't allow an error lock from user to invoke panic */
	else if (from_user && errlck == SET_ERRLCK &&
	    !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
		(void) ufs_fault(ufsvfsp->vfs_root,
		    ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
		    ulp->ul_lockfs.lf_comment : "user-applied error lock");

	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
		poll_events |= POLLERR;

	pollwakeup(&ufs_pollhd, poll_events);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (0);
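
	/*
	 * Two unwind paths follow: "errout" is reached after the file
	 * system has been frozen and marked busy, so it restores the saved
	 * lock state and thaws; "errexit" is reached before that point and
	 * only drops the quiesce-pending count and the locks.
	 */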
errout:
	/*
	 * Lock failed.  Reset the old lock in ufsvfs if not hard locked.
	 */
	if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
		bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
		ulp->ul_fs_lock = (1 << lfs.lf_lock);
	}

	/*
	 * Don't call ufs_thaw() when there's a signal during
	 * the ufs quiesce operation as it can lead to deadlock
	 * with getpage.
	 */
	if (signal == 0)
		(void) ufs_thaw(vfsp, ufsvfsp, ulp);

	ULOCKFS_CLR_BUSY(ulp);
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

errexit:
	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (error);
}

/*
 * fiolfss
 *	return the current file system locking state info
 */
int
ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
{
	struct ulockfs *ulp;

	if (!vp || !vp->v_vfsp || !VTOI(vp))
		return (EINVAL);

	/* file system has been forcibly unmounted */
	if (VTOI(vp)->i_ufsvfs == NULL)
		return (EIO);

	ulp = VTOUL(vp);

	if (ULOCKFS_IS_HLOCK(ulp)) {
		*lockfsp = ulp->ul_lockfs;	/* structure assignment */
		return (0);
	}

	mutex_enter(&ulp->ul_lock);

	*lockfsp = ulp->ul_lockfs;	/* structure assignment */

	if (ULOCKFS_IS_MOD(ulp))
		lockfsp->lf_flags |= LOCKFS_MOD;

	mutex_exit(&ulp->ul_lock);

	return (0);
}

/*
 * ufs_check_lockfs
 *	check whether a ufs vnode operation conflicts with the file system
 *	lock
 */
int
ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
{
	k_sigset_t smask;
	int sig, slock;

	ASSERT(MUTEX_HELD(&ulp->ul_lock));

	while (ulp->ul_fs_lock & mask) {
		slock = (int)ULOCKFS_IS_SLOCK(ulp);
		if ((curthread->t_flag & T_DONTPEND) && !slock) {
			curthread->t_flag |= T_WOULDBLOCK;
			return (EAGAIN);
		}
		curthread->t_flag &= ~T_WOULDBLOCK;

		/*
		 * In the case of an onerr umount of the fs, threads could
		 * have blocked before coming into ufs_check_lockfs and
		 * need to check for the special case of ELOCK and
		 * vfs_dontblock being set, which would indicate that the fs
		 * is on its way out and will not return, therefore making
		 * EIO the appropriate response.
		 */
		if (ULOCKFS_IS_HLOCK(ulp) ||
		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
			return (EIO);

		/*
		 * wait for the lock status to change
		 */
		if (slock || ufsvfsp->vfs_nointr) {
			cv_wait(&ulp->ul_cv, &ulp->ul_lock);
		} else {
			sigintr(&smask, 1);
			sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
			sigunintr(&smask);
			if ((!sig && (ulp->ul_fs_lock & mask)) ||
			    ufsvfsp->vfs_dontblock)
				return (EINTR);
		}
	}

	if (mask & ULOCKFS_FWLOCK) {
		atomic_add_long(&ulp->ul_falloc_cnt, 1);
		ULOCKFS_SET_FALLOC(ulp);
	} else {
		atomic_add_long(&ulp->ul_vnops_cnt, 1);
	}

	return (0);
}
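
/*
 * ufs_check_lockfs() above must be called with ul_lock held.  A minimal
 * caller sketch (this is the pattern ufs_lockfs_begin() uses below):
 *
 *	mutex_enter(&ulp->ul_lock);
 *	error = ufs_check_lockfs(ufsvfsp, ulp, mask);
 *	mutex_exit(&ulp->ul_lock);
 *	if (error)
 *		return (error);
 */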

/*
 * Check whether we came across the handcrafted lockfs protocol path.  We
 * can't simply check for T_DONTBLOCK here as one would assume, since that
 * can also falsely catch recursive VOPs going to a different filesystem.
 * Instead we check whether we already hold the ulockfs->ul_lock mutex.
 */
static int
ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
{
	return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
}

/*
 * ufs_lockfs_begin - start the lockfs locking protocol
 */
int
ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
{
	int error;
	int rec_vop;
	ushort_t op_cnt_incremented = 0;
	ulong_t *ctr;
	struct ulockfs *ulp;
	ulockfs_info_t *ulockfs_info;
	ulockfs_info_t *ulockfs_info_free;
	ulockfs_info_t *ulockfs_info_temp;

	/*
	 * file system has been forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * Do the lockfs protocol
	 */
	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

	/*
	 * Detect a recursive VOP call or a handcrafted internal lockfs
	 * protocol path and bail out in that case.
	 */
	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
		*ulpp = NULL;
		return (0);
	} else {
		if (ulockfs_info_free == NULL) {
			if ((ulockfs_info_temp = (ulockfs_info_t *)
			    kmem_zalloc(sizeof (ulockfs_info_t),
			    KM_NOSLEEP)) == NULL) {
				*ulpp = NULL;
				return (ENOMEM);
			}
		}
	}

	/*
	 * First time VOP call
	 *
	 * Increment the ctr irrespective of the lockfs state.  If the lockfs
	 * state is not ULOCKFS_ULOCK, we can decrement it later.  However,
	 * before incrementing we need to check if there is a pending quiesce
	 * request, because if we have a continuous stream of
	 * ufs_lockfs_begin requests pounding on a few cpus then the
	 * ufs_quiesce thread might never see the value of zero for ctr -
	 * a livelock kind of scenario.
	 */
	ctr = (mask & ULOCKFS_FWLOCK) ?
	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
	if (!ULOCKFS_IS_SLOCK(ulp)) {
		atomic_add_long(ctr, 1);
		op_cnt_incremented++;
	}

	/*
	 * If the lockfs state (indicated by ul_fs_lock) is not just
	 * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs
	 * where there is a check with an appropriate mask to selectively
	 * allow operations permitted for that kind of lockfs state.
	 *
	 * Even these selective operations should not be allowed to go
	 * through if a lockfs request is in progress, because that could
	 * result in inode modifications during a quiesce and could hence
	 * result in inode reconciliation failures.  ULOCKFS_SLOCK alone
	 * would not be sufficient, so make use of ufs_quiesce_pend to
	 * disallow vnode operations when a quiesce is in progress.
	 */
	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
		if (op_cnt_incremented)
			if (!atomic_add_long_nv(ctr, -1))
				cv_broadcast(&ulp->ul_cv);
		mutex_enter(&ulp->ul_lock);
		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
		mutex_exit(&ulp->ul_lock);
		if (error) {
			if (ulockfs_info_free == NULL)
				kmem_free(ulockfs_info_temp,
				    sizeof (ulockfs_info_t));
			return (error);
		}
	} else {
		/*
		 * This is the common case of a file system in an unlocked
		 * state.
		 *
		 * If a file system is unlocked, we would expect the ctr to
		 * have been incremented by now.  But this will not be true
		 * when a quiesce is winding up - SLOCK was set when we
		 * checked before incrementing the ctr, but by the time we
		 * checked for ULOCKFS_IS_JUSTULOCK, the quiesce thread was
		 * gone.  It is okay to take ul_lock and go through the slow
		 * path in this uncommon case.
		 */
		if (op_cnt_incremented == 0) {
			mutex_enter(&ulp->ul_lock);
			error = ufs_check_lockfs(ufsvfsp, ulp, mask);
			if (error) {
				mutex_exit(&ulp->ul_lock);
				if (ulockfs_info_free == NULL)
					kmem_free(ulockfs_info_temp,
					    sizeof (ulockfs_info_t));
				return (error);
			}
			if (mask & ULOCKFS_FWLOCK)
				ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		} else if (mask & ULOCKFS_FWLOCK) {
			mutex_enter(&ulp->ul_lock);
			ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		}
	}

	if (ulockfs_info_free != NULL) {
		ulockfs_info_free->ulp = ulp;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
	} else {
		ulockfs_info_temp->ulp = ulp;
		ulockfs_info_temp->next = ulockfs_info;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
		ASSERT(ufs_lockfs_key != 0);
		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
	}

	curthread->t_flag |= T_DONTBLOCK;
	return (0);
}

/*
 * Check whether we are returning from the top level VOP.
 */
static int
ufs_lockfs_top_vop_return(ulockfs_info_t *head)
{
	ulockfs_info_t *info;
	int result = 1;

	for (info = head; info != NULL; info = info->next) {
		if (info->ulp != NULL) {
			result = 0;
			break;
		}
	}

	return (result);
}

/*
 * ufs_lockfs_end - terminate the lockfs locking protocol
 */
void
ufs_lockfs_end(struct ulockfs *ulp)
{
	ulockfs_info_t *info;
	ulockfs_info_t *head;

	/*
	 * end-of-VOP protocol
	 */
	if (ulp == NULL)
		return;

	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * If we're called from a first level VOP, we have to have a
	 * valid ulockfs record in the TSD.
	 */
	ASSERT(info != NULL);

	/*
	 * Invalidate the ulockfs record.
	 */
	info->ulp = NULL;

	if (ufs_lockfs_top_vop_return(head))
		curthread->t_flag &= ~T_DONTBLOCK;

	/* fallocate thread */
	if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) {
		/* Clear the thread's fallocate state */
		info->flags &= ~ULOCK_INFO_FALLOCATE;
		if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1)) {
			mutex_enter(&ulp->ul_lock);
			ULOCKFS_CLR_FALLOC(ulp);
			cv_broadcast(&ulp->ul_cv);
			mutex_exit(&ulp->ul_lock);
		}
	} else {	/* normal thread */
		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
			cv_broadcast(&ulp->ul_cv);
	}
}
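
/*
 * A hedged sketch of how a vnode operation is expected to bracket its
 * work with ufs_lockfs_begin()/ufs_lockfs_end() (the mask names are
 * those used by the callers in ufs_vnops.c):
 *
 *	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK);
 *	if (error)
 *		return (error);
 *	...do the operation...
 *	if (ulp)
 *		ufs_lockfs_end(ulp);
 *
 * ulp is NULL on a recursive VOP, which is why the end call is guarded.
 */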

/*
 * ufs_lockfs_trybegin - try to start the lockfs locking protocol without
 * blocking.
 */
int
ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
{
	int error = 0;
	int rec_vop;
	ushort_t op_cnt_incremented = 0;
	ulong_t *ctr;
	struct ulockfs *ulp;
	ulockfs_info_t *ulockfs_info;
	ulockfs_info_t *ulockfs_info_free;
	ulockfs_info_t *ulockfs_info_temp;

	/*
	 * file system has been forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * Do the lockfs protocol
	 */
	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

	/*
	 * Detect a recursive VOP call or a handcrafted internal lockfs
	 * protocol path and bail out in that case.
	 */
	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
		*ulpp = NULL;
		return (0);
	} else {
		if (ulockfs_info_free == NULL) {
			if ((ulockfs_info_temp = (ulockfs_info_t *)
			    kmem_zalloc(sizeof (ulockfs_info_t),
			    KM_NOSLEEP)) == NULL) {
				*ulpp = NULL;
				return (ENOMEM);
			}
		}
	}

	/*
	 * First time VOP call
	 *
	 * Increment the ctr irrespective of the lockfs state.  If the lockfs
	 * state is not ULOCKFS_ULOCK, we can decrement it later.  However,
	 * before incrementing we need to check if there is a pending quiesce
	 * request, because if we have a continuous stream of
	 * ufs_lockfs_begin requests pounding on a few cpus then the
	 * ufs_quiesce thread might never see the value of zero for ctr -
	 * a livelock kind of scenario.
	 */
	ctr = (mask & ULOCKFS_FWLOCK) ?
	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
	if (!ULOCKFS_IS_SLOCK(ulp)) {
		atomic_add_long(ctr, 1);
		op_cnt_incremented++;
	}

	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
		/*
		 * Non-blocking version of the ufs_check_lockfs() code.
		 *
		 * If the file system is not hard locked or error locked
		 * and if ulp->ul_fs_lock allows this operation, increment
		 * the appropriate counter and proceed (for example, in case
		 * the file system is delete locked, a mmap can still go
		 * through).
		 */
		if (op_cnt_incremented)
			if (!atomic_add_long_nv(ctr, -1))
				cv_broadcast(&ulp->ul_cv);
		mutex_enter(&ulp->ul_lock);
		if (ULOCKFS_IS_HLOCK(ulp) ||
		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
			error = EIO;
		else if (ulp->ul_fs_lock & mask)
			error = EAGAIN;

		if (error) {
			mutex_exit(&ulp->ul_lock);
			if (ulockfs_info_free == NULL)
				kmem_free(ulockfs_info_temp,
				    sizeof (ulockfs_info_t));
			return (error);
		}
		atomic_add_long(ctr, 1);
		if (mask & ULOCKFS_FWLOCK)
			ULOCKFS_SET_FALLOC(ulp);
		mutex_exit(&ulp->ul_lock);
	} else {
		/*
		 * This is the common case of a file system in an unlocked
		 * state.
		 *
		 * If a file system is unlocked, we would expect the ctr to
		 * have been incremented by now.  But this will not be true
		 * when a quiesce is winding up - SLOCK was set when we
		 * checked before incrementing the ctr, but by the time we
		 * checked for ULOCKFS_IS_JUSTULOCK, the quiesce thread was
		 * gone.  Take ul_lock and go through the non-blocking
		 * version of the ufs_check_lockfs() code.
		 */
		if (op_cnt_incremented == 0) {
			mutex_enter(&ulp->ul_lock);
			if (ULOCKFS_IS_HLOCK(ulp) ||
			    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
				error = EIO;
			else if (ulp->ul_fs_lock & mask)
				error = EAGAIN;

			if (error) {
				mutex_exit(&ulp->ul_lock);
				if (ulockfs_info_free == NULL)
					kmem_free(ulockfs_info_temp,
					    sizeof (ulockfs_info_t));
				return (error);
			}
			atomic_add_long(ctr, 1);
			if (mask & ULOCKFS_FWLOCK)
				ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		} else if (mask & ULOCKFS_FWLOCK) {
			mutex_enter(&ulp->ul_lock);
			ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		}
	}

	if (ulockfs_info_free != NULL) {
		ulockfs_info_free->ulp = ulp;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
	} else {
		ulockfs_info_temp->ulp = ulp;
		ulockfs_info_temp->next = ulockfs_info;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
		ASSERT(ufs_lockfs_key != 0);
		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
	}

	curthread->t_flag |= T_DONTBLOCK;
	return (0);
}

/*
 * specialized version of ufs_lockfs_begin() called by ufs_getpage().
 */
int
ufs_lockfs_begin_getpage(
	struct ufsvfs *ufsvfsp,
	struct ulockfs **ulpp,
	struct seg *seg,
	int read_access,
	uint_t *protp)
{
	ulong_t mask;
	int error;
	int rec_vop;
	struct ulockfs *ulp;
	ulockfs_info_t *ulockfs_info;
	ulockfs_info_t *ulockfs_info_free;
	ulockfs_info_t *ulockfs_info_temp;

	/*
	 * file system has been forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * Do the lockfs protocol
	 */
	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

	/*
	 * Detect a recursive VOP call or a handcrafted internal lockfs
	 * protocol path and bail out in that case.
	 */
	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
		*ulpp = NULL;
		return (0);
	} else {
		if (ulockfs_info_free == NULL) {
			if ((ulockfs_info_temp = (ulockfs_info_t *)
			    kmem_zalloc(sizeof (ulockfs_info_t),
			    KM_NOSLEEP)) == NULL) {
				*ulpp = NULL;
				return (ENOMEM);
			}
		}
	}

	/*
	 * First time VOP call
	 */
	atomic_add_long(&ulp->ul_vnops_cnt, 1);
	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
			cv_broadcast(&ulp->ul_cv);
		mutex_enter(&ulp->ul_lock);
		if (seg->s_ops == &segvn_ops &&
		    ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
		} else if (protp && read_access) {
			/*
			 * Restrict the mapping to readonly.
			 * Writes to this mapping will cause
			 * another fault which will then
			 * be suspended if the fs is write locked.
			 */
			*protp &= ~PROT_WRITE;
			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
		} else
			mask = (ulong_t)ULOCKFS_GETWRITE_MASK;

		/*
		 * will sleep if this fs is locked against this VOP
		 */
		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
		mutex_exit(&ulp->ul_lock);
		if (error) {
			if (ulockfs_info_free == NULL)
				kmem_free(ulockfs_info_temp,
				    sizeof (ulockfs_info_t));
			return (error);
		}
	}

	if (ulockfs_info_free != NULL) {
		ulockfs_info_free->ulp = ulp;
	} else {
		ulockfs_info_temp->ulp = ulp;
		ulockfs_info_temp->next = ulockfs_info;
		ASSERT(ufs_lockfs_key != 0);
		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
	}

	curthread->t_flag |= T_DONTBLOCK;
	return (0);
}

void
ufs_lockfs_tsd_destructor(void *head)
{
	ulockfs_info_t *curr = (ulockfs_info_t *)head;
	ulockfs_info_t *temp;

	for (; curr != NULL; ) {
		/*
		 * The TSD destructor is being called when the thread exits
		 * (via thread_exit()).  At that time it must have cleaned
		 * up all VOPs via ufs_lockfs_end(), and no valid ulockfs
		 * record may exist while a thread is exiting.
		 */
		temp = curr;
		curr = curr->next;
		ASSERT(temp->ulp == NULL);
		kmem_free(temp, sizeof (ulockfs_info_t));
	}
}
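
/*
 * The destructor above is expected to be registered once at
 * initialization time, along the lines of (sketch):
 *
 *	tsd_create(&ufs_lockfs_key, ufs_lockfs_tsd_destructor);
 */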