1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/t_lock.h> 30 #include <sys/param.h> 31 #include <sys/time.h> 32 #include <sys/systm.h> 33 #include <sys/sysmacros.h> 34 #include <sys/resource.h> 35 #include <sys/signal.h> 36 #include <sys/cred.h> 37 #include <sys/user.h> 38 #include <sys/buf.h> 39 #include <sys/vfs.h> 40 #include <sys/vnode.h> 41 #include <sys/proc.h> 42 #include <sys/disp.h> 43 #include <sys/file.h> 44 #include <sys/fcntl.h> 45 #include <sys/flock.h> 46 #include <sys/atomic.h> 47 #include <sys/kmem.h> 48 #include <sys/uio.h> 49 #include <sys/conf.h> 50 #include <sys/mman.h> 51 #include <sys/pathname.h> 52 #include <sys/debug.h> 53 #include <sys/vmmeter.h> 54 #include <sys/vmsystm.h> 55 #include <sys/cmn_err.h> 56 #include <sys/acct.h> 57 #include <sys/dnlc.h> 58 #include <sys/swap.h> 59 60 #include <sys/fs/ufs_fs.h> 61 #include <sys/fs/ufs_inode.h> 62 #include <sys/fs/ufs_fsdir.h> 63 #include <sys/fs/ufs_trans.h> 64 #include <sys/fs/ufs_panic.h> 65 #include 
<sys/fs/ufs_mount.h> 66 #include <sys/fs/ufs_bio.h> 67 #include <sys/fs/ufs_log.h> 68 #include <sys/fs/ufs_quota.h> 69 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! */ 70 #include <sys/errno.h> 71 #include <sys/sysinfo.h> 72 73 #include <vm/hat.h> 74 #include <vm/pvn.h> 75 #include <vm/as.h> 76 #include <vm/seg.h> 77 #include <vm/seg_map.h> 78 #include <vm/seg_vn.h> 79 #include <vm/rm.h> 80 #include <vm/anon.h> 81 #include <sys/swap.h> 82 #include <sys/dnlc.h> 83 84 extern struct vnode *common_specvp(struct vnode *vp); 85 86 /* error lock status */ 87 #define UN_ERRLCK (-1) 88 #define SET_ERRLCK 1 89 #define RE_ERRLCK 2 90 #define NO_ERRLCK 0 91 92 /* 93 * Index to be used in TSD for storing lockfs data 94 */ 95 uint_t ufs_lockfs_key; 96 97 typedef struct _ulockfs_info { 98 struct _ulockfs_info *next; 99 struct ulockfs *ulp; 100 uint_t flags; 101 } ulockfs_info_t; 102 103 #define ULOCK_INFO_FALLOCATE 0x00000001 /* fallocate thread */ 104 105 /* 106 * Check in TSD that whether we are already doing any VOP on this filesystem 107 */ 108 #define IS_REC_VOP(found, head, ulp, free) \ 109 { \ 110 ulockfs_info_t *_curr; \ 111 \ 112 for (found = 0, free = NULL, _curr = head; \ 113 _curr != NULL; _curr = _curr->next) { \ 114 if ((free == NULL) && \ 115 (_curr->ulp == NULL)) \ 116 free = _curr; \ 117 if (_curr->ulp == ulp) { \ 118 found = 1; \ 119 break; \ 120 } \ 121 } \ 122 } 123 124 /* 125 * Get the lockfs data from TSD so that lockfs handles the recursive VOP 126 * properly 127 */ 128 #define SEARCH_ULOCKFSP(head, ulp, info) \ 129 { \ 130 ulockfs_info_t *_curr; \ 131 \ 132 for (_curr = head; _curr != NULL; \ 133 _curr = _curr->next) { \ 134 if (_curr->ulp == ulp) { \ 135 break; \ 136 } \ 137 } \ 138 \ 139 info = _curr; \ 140 } 141 142 /* 143 * Validate lockfs request 144 */ 145 static int 146 ufs_getlfd( 147 struct lockfs *lockfsp, /* new lock request */ 148 struct lockfs *ul_lockfsp) /* old lock state */ 149 { 150 int error = 0; 151 152 /* 153 * no input 
flags defined 154 */ 155 if (lockfsp->lf_flags != 0) { 156 error = EINVAL; 157 goto errout; 158 } 159 160 /* 161 * check key 162 */ 163 if (!LOCKFS_IS_ULOCK(ul_lockfsp)) 164 if (lockfsp->lf_key != ul_lockfsp->lf_key) { 165 error = EINVAL; 166 goto errout; 167 } 168 169 lockfsp->lf_key = ul_lockfsp->lf_key + 1; 170 171 errout: 172 return (error); 173 } 174 175 /* 176 * ufs_checkaccton 177 * check if accounting is turned on on this fs 178 */ 179 180 int 181 ufs_checkaccton(struct vnode *vp) 182 { 183 if (acct_fs_in_use(vp)) 184 return (EDEADLK); 185 return (0); 186 } 187 188 /* 189 * ufs_checkswapon 190 * check if local swapping is to file on this fs 191 */ 192 int 193 ufs_checkswapon(struct vnode *vp) 194 { 195 struct swapinfo *sip; 196 197 mutex_enter(&swapinfo_lock); 198 for (sip = swapinfo; sip; sip = sip->si_next) 199 if (sip->si_vp->v_vfsp == vp->v_vfsp) { 200 mutex_exit(&swapinfo_lock); 201 return (EDEADLK); 202 } 203 mutex_exit(&swapinfo_lock); 204 return (0); 205 } 206 207 /* 208 * ufs_freeze 209 * pend future accesses for current lock and desired lock 210 */ 211 void 212 ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp) 213 { 214 /* 215 * set to new lock type 216 */ 217 ulp->ul_lockfs.lf_lock = lockfsp->lf_lock; 218 ulp->ul_lockfs.lf_key = lockfsp->lf_key; 219 ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen; 220 ulp->ul_lockfs.lf_comment = lockfsp->lf_comment; 221 222 ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock); 223 } 224 225 /* 226 * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before 227 * starting ufs_quiesce() protocol and decrement it only when a file system no 228 * longer has to be in quiescent state. This allows ufs_pageio() to detect 229 * that another thread wants to quiesce a file system. See more comments in 230 * ufs_pageio(). 
231 */ 232 ulong_t ufs_quiesce_pend = 0; 233 234 /* 235 * ufs_quiesce 236 * wait for outstanding accesses to finish 237 */ 238 int 239 ufs_quiesce(struct ulockfs *ulp) 240 { 241 int error = 0; 242 ulockfs_info_t *head; 243 ulockfs_info_t *info; 244 245 head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 246 SEARCH_ULOCKFSP(head, ulp, info); 247 248 /* 249 * Set a softlock to suspend future ufs_vnops so that 250 * this lockfs request will not be starved 251 */ 252 ULOCKFS_SET_SLOCK(ulp); 253 ASSERT(ufs_quiesce_pend); 254 255 /* check if there is any outstanding ufs vnodeops calls */ 256 while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) { 257 /* 258 * use timed version of cv_wait_sig() to make sure we don't 259 * miss a wake up call from ufs_pageio() when it doesn't use 260 * ul_lock. 261 * 262 * when a fallocate thread comes in, the only way it returns 263 * from this function is if there are no other vnode operations 264 * going on (remember fallocate threads are tracked using 265 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread 266 * hasn't already grabbed the fs write lock. 
267 */ 268 if (info && (info->flags & ULOCK_INFO_FALLOCATE)) { 269 if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp)) 270 goto out; 271 } 272 if (!cv_timedwait_sig(&ulp->ul_cv, &ulp->ul_lock, lbolt + hz)) { 273 error = EINTR; 274 goto out; 275 } 276 } 277 278 out: 279 /* 280 * unlock the soft lock 281 */ 282 ULOCKFS_CLR_SLOCK(ulp); 283 284 return (error); 285 } 286 287 /* 288 * ufs_flush_inode 289 */ 290 int 291 ufs_flush_inode(struct inode *ip, void *arg) 292 { 293 int error; 294 int saverror = 0; 295 296 /* 297 * wrong file system; keep looking 298 */ 299 if (ip->i_ufsvfs != (struct ufsvfs *)arg) 300 return (0); 301 302 /* 303 * asynchronously push all the dirty pages 304 */ 305 if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) && 306 (error != EAGAIN)) 307 saverror = error; 308 /* 309 * wait for io and discard all mappings 310 */ 311 if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI)) 312 saverror = error; 313 314 if (ITOV(ip)->v_type == VDIR) { 315 dnlc_dir_purge(&ip->i_danchor); 316 } 317 318 return (saverror); 319 } 320 321 /* 322 * ufs_flush 323 * Flush everything that is currently dirty; this includes invalidating 324 * any mappings. 
/*
 * ufs_flush
 *	Flush everything that is currently dirty for vfsp; this includes
 *	invalidating any mappings.
 *
 *	The caller must hold the vfs lock (asserted below).
 *
 *	Returns 0, or the most recent error recorded in saverror.  Errors
 *	are recorded from the inode scan and from VOP_PUTPAGE() on the
 *	block device (positive values only); saverror is also handed to
 *	TRANS_END_SYNC().  The results of dnlc_purge_vfsp(), qsync(),
 *	bflush() and bfinval() are deliberately ignored.
 */
int
ufs_flush(struct vfs *vfsp)
{
	int		error;
	int		saverror = 0;
	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
	struct fs	*fs		= ufsvfsp->vfs_fs;
	int		tdontblock = 0;	/* set if T_DONTBLOCK was already on */

	ASSERT(vfs_lock_held(vfsp));

	/*
	 * purge dnlc
	 */
	(void) dnlc_purge_vfsp(vfsp, 0);

	/*
	 * drain the delete and idle threads
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * flush and invalidate quota records
	 */
	(void) qsync(ufsvfsp);

	/*
	 * flush w/invalidate the inodes for vfsp
	 */
	if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
		saverror = error;

	/*
	 * synchronously flush superblock and summary info
	 * (only for a writable fs whose superblock is marked modified)
	 */
	if (fs->fs_ronly == 0 && fs->fs_fmod) {
		fs->fs_fmod = 0;
		TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
	}
	/*
	 * flush w/invalidate block device pages and buf cache
	 * (only a positive return value is treated as an error)
	 */
	if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
	    (offset_t)0, 0, B_INVAL, CRED(), NULL)) > 0)
		saverror = error;

	(void) bflush((dev_t)vfsp->vfs_dev);
	(void) bfinval((dev_t)vfsp->vfs_dev, 0);

	/*
	 * drain the delete and idle threads again
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * play with the clean flag
	 * (skipped if anything above recorded an error)
	 */
	if (saverror == 0)
		ufs_checkclean(vfsp);

	/*
	 * Flush any outstanding transactions and roll the log
	 * only if we are supposed to do, i.e. LDL_NOROLL not set.
	 * We can not simply check for fs_ronly here since fsck also may
	 * use this code to roll the log on a read-only filesystem, e.g.
	 * root during early stages of boot; if other than a sanity check is
	 * done, it will clear LDL_NOROLL before.
	 * In addition we assert that the deltamap does not contain any deltas
	 * in case LDL_NOROLL is set since this is not supposed to happen.
	 */
	if (TRANS_ISTRANS(ufsvfsp)) {
		ml_unit_t	*ul	= ufsvfsp->vfs_log;
		mt_map_t	*mtm	= ul->un_deltamap;

		if (ul->un_flags & LDL_NOROLL) {
			ASSERT(mtm->mtm_nme == 0);
		} else {
			/*
			 * Do not set T_DONTBLOCK if there is a
			 * transaction opened by caller.
			 */
			if (curthread->t_flag & T_DONTBLOCK)
				tdontblock = 1;
			else
				curthread->t_flag |= T_DONTBLOCK;

			TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
			    TOP_COMMIT_SIZE, error);

			/*
			 * NOTE(review): TRANS_END_SYNC is given saverror,
			 * presumably so a failed commit is reported to the
			 * caller -- confirm against the macro definition.
			 */
			if (!error) {
				TRANS_END_SYNC(ufsvfsp, saverror,
				    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
			}

			/* only clear the flag if it was set here */
			if (tdontblock == 0)
				curthread->t_flag &= ~T_DONTBLOCK;

			logmap_roll_dev(ufsvfsp->vfs_log);
		}
	}

	return (saverror);
}
449 */ 450 rw_enter(&ip->i_contents, RW_WRITER); 451 ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG); 452 rw_exit(&ip->i_contents); 453 454 /* 455 * pages are mlocked -- fail wlock 456 */ 457 if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip))) 458 return (EBUSY); 459 460 return (0); 461 } 462 463 /* 464 * ufs_thaw_hlock 465 * special processing when thawing down to hlock or elock 466 */ 467 static int 468 ufs_thaw_hlock(struct inode *ip, void *arg) 469 { 470 struct vnode *vp = ITOV(ip); 471 472 /* 473 * wrong file system; keep looking 474 */ 475 if (ip->i_ufsvfs != (struct ufsvfs *)arg) 476 return (0); 477 478 /* 479 * blow away all pages - even if they are mlocked 480 */ 481 do { 482 (void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK); 483 } while ((vp->v_type != VCHR) && vn_has_cached_data(vp)); 484 rw_enter(&ip->i_contents, RW_WRITER); 485 ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG); 486 rw_exit(&ip->i_contents); 487 488 return (0); 489 } 490 491 /* 492 * ufs_thaw 493 * thaw file system lock down to current value 494 */ 495 int 496 ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp) 497 { 498 int error = 0; 499 int noidel = (int)(ulp->ul_flag & ULOCKFS_NOIDEL); 500 501 /* 502 * if wlock or hlock or elock 503 */ 504 if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) || 505 ULOCKFS_IS_ELOCK(ulp)) { 506 507 /* 508 * don't keep access times 509 * don't free deleted files 510 * if superblock writes are allowed, limit them to me for now 511 */ 512 ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL); 513 if (ulp->ul_sbowner != (kthread_id_t)-1) 514 ulp->ul_sbowner = curthread; 515 516 /* 517 * wait for writes for deleted files and superblock updates 518 */ 519 (void) ufs_flush(vfsp); 520 521 /* 522 * now make sure the quota file is up-to-date 523 * expensive; but effective 524 */ 525 error = ufs_flush(vfsp); 526 /* 527 * no one can write the superblock 528 */ 529 ulp->ul_sbowner = (kthread_id_t)-1; 530 531 /* 532 * 
special processing for wlock/hlock/elock 533 */ 534 if (ULOCKFS_IS_WLOCK(ulp)) { 535 if (error) 536 goto errout; 537 error = bfinval(ufsvfsp->vfs_dev, 0); 538 if (error) 539 goto errout; 540 error = ufs_scan_inodes(0, ufs_thaw_wlock, 541 (void *)ufsvfsp, ufsvfsp); 542 if (error) 543 goto errout; 544 } 545 if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) { 546 error = 0; 547 (void) ufs_scan_inodes(0, ufs_thaw_hlock, 548 (void *)ufsvfsp, ufsvfsp); 549 (void) bfinval(ufsvfsp->vfs_dev, 1); 550 } 551 } else { 552 553 /* 554 * okay to keep access times 555 * okay to free deleted files 556 * okay to write the superblock 557 */ 558 ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL); 559 ulp->ul_sbowner = NULL; 560 561 /* 562 * flush in case deleted files are in memory 563 */ 564 if (noidel) { 565 if (error = ufs_flush(vfsp)) 566 goto errout; 567 } 568 } 569 570 errout: 571 cv_broadcast(&ulp->ul_cv); 572 return (error); 573 } 574 575 /* 576 * ufs_reconcile_fs 577 * reconcile incore superblock with ondisk superblock 578 */ 579 int 580 ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck) 581 { 582 struct fs *mfs; /* in-memory superblock */ 583 struct fs *dfs; /* on-disk superblock */ 584 struct buf *bp; /* on-disk superblock buf */ 585 int needs_unlock; 586 char finished_fsclean; 587 588 mfs = ufsvfsp->vfs_fs; 589 590 /* 591 * get the on-disk copy of the superblock 592 */ 593 bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE); 594 bp->b_flags |= (B_STALE|B_AGE); 595 if (bp->b_flags & B_ERROR) { 596 brelse(bp); 597 return (EIO); 598 } 599 dfs = bp->b_un.b_fs; 600 601 /* error locks may only unlock after the fs has been made consistent */ 602 if (errlck == UN_ERRLCK) { 603 if (dfs->fs_clean == FSFIX) { /* being repaired */ 604 brelse(bp); 605 return (EAGAIN); 606 } 607 /* repair not yet started? */ 608 finished_fsclean = TRANS_ISTRANS(ufsvfsp)? 
FSLOG: FSCLEAN; 609 if (dfs->fs_clean != finished_fsclean) { 610 brelse(bp); 611 return (EBUSY); 612 } 613 } 614 615 /* 616 * if superblock has changed too much, abort 617 */ 618 if ((mfs->fs_sblkno != dfs->fs_sblkno) || 619 (mfs->fs_cblkno != dfs->fs_cblkno) || 620 (mfs->fs_iblkno != dfs->fs_iblkno) || 621 (mfs->fs_dblkno != dfs->fs_dblkno) || 622 (mfs->fs_cgoffset != dfs->fs_cgoffset) || 623 (mfs->fs_cgmask != dfs->fs_cgmask) || 624 (mfs->fs_bsize != dfs->fs_bsize) || 625 (mfs->fs_fsize != dfs->fs_fsize) || 626 (mfs->fs_frag != dfs->fs_frag) || 627 (mfs->fs_bmask != dfs->fs_bmask) || 628 (mfs->fs_fmask != dfs->fs_fmask) || 629 (mfs->fs_bshift != dfs->fs_bshift) || 630 (mfs->fs_fshift != dfs->fs_fshift) || 631 (mfs->fs_fragshift != dfs->fs_fragshift) || 632 (mfs->fs_fsbtodb != dfs->fs_fsbtodb) || 633 (mfs->fs_sbsize != dfs->fs_sbsize) || 634 (mfs->fs_nindir != dfs->fs_nindir) || 635 (mfs->fs_nspf != dfs->fs_nspf) || 636 (mfs->fs_trackskew != dfs->fs_trackskew) || 637 (mfs->fs_cgsize != dfs->fs_cgsize) || 638 (mfs->fs_ntrak != dfs->fs_ntrak) || 639 (mfs->fs_nsect != dfs->fs_nsect) || 640 (mfs->fs_spc != dfs->fs_spc) || 641 (mfs->fs_cpg != dfs->fs_cpg) || 642 (mfs->fs_ipg != dfs->fs_ipg) || 643 (mfs->fs_fpg != dfs->fs_fpg) || 644 (mfs->fs_postblformat != dfs->fs_postblformat) || 645 (mfs->fs_magic != dfs->fs_magic)) { 646 brelse(bp); 647 return (EACCES); 648 } 649 if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time) 650 if (mfs->fs_clean == FSLOG) { 651 brelse(bp); 652 return (EACCES); 653 } 654 655 /* 656 * get new summary info 657 */ 658 if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) { 659 brelse(bp); 660 return (EIO); 661 } 662 663 /* 664 * release old summary info and update in-memory superblock 665 */ 666 kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize); 667 mfs->fs_u.fs_csp = dfs->fs_u.fs_csp; /* Only entry 0 used */ 668 669 /* 670 * update fields allowed to change 671 */ 672 mfs->fs_size = dfs->fs_size; 673 mfs->fs_dsize = dfs->fs_dsize; 674 
mfs->fs_ncg = dfs->fs_ncg; 675 mfs->fs_minfree = dfs->fs_minfree; 676 mfs->fs_rotdelay = dfs->fs_rotdelay; 677 mfs->fs_rps = dfs->fs_rps; 678 mfs->fs_maxcontig = dfs->fs_maxcontig; 679 mfs->fs_maxbpg = dfs->fs_maxbpg; 680 mfs->fs_csmask = dfs->fs_csmask; 681 mfs->fs_csshift = dfs->fs_csshift; 682 mfs->fs_optim = dfs->fs_optim; 683 mfs->fs_csaddr = dfs->fs_csaddr; 684 mfs->fs_cssize = dfs->fs_cssize; 685 mfs->fs_ncyl = dfs->fs_ncyl; 686 mfs->fs_cstotal = dfs->fs_cstotal; 687 mfs->fs_reclaim = dfs->fs_reclaim; 688 689 if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) { 690 mfs->fs_reclaim &= ~FS_RECLAIM; 691 mfs->fs_reclaim |= FS_RECLAIMING; 692 ufs_thread_start(&ufsvfsp->vfs_reclaim, 693 ufs_thread_reclaim, vfsp); 694 } 695 696 /* XXX What to do about sparecon? */ 697 698 /* XXX need to copy volume label */ 699 700 /* 701 * ondisk clean flag overrides inmemory clean flag iff == FSBAD 702 * or if error-locked and ondisk is now clean 703 */ 704 needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock); 705 if (needs_unlock) 706 mutex_enter(&ufsvfsp->vfs_lock); 707 708 if (errlck == UN_ERRLCK) { 709 if (finished_fsclean == dfs->fs_clean) 710 mfs->fs_clean = finished_fsclean; 711 else 712 mfs->fs_clean = FSBAD; 713 mfs->fs_state = FSOKAY - dfs->fs_time; 714 } 715 716 if (FSOKAY != dfs->fs_state + dfs->fs_time || 717 (dfs->fs_clean == FSBAD)) 718 mfs->fs_clean = FSBAD; 719 720 if (needs_unlock) 721 mutex_exit(&ufsvfsp->vfs_lock); 722 723 brelse(bp); 724 725 return (0); 726 } 727 728 /* 729 * ufs_reconcile_inode 730 * reconcile ondisk inode with incore inode 731 */ 732 static int 733 ufs_reconcile_inode(struct inode *ip, void *arg) 734 { 735 int i; 736 int ndaddr; 737 int niaddr; 738 struct dinode *dp; /* ondisk inode */ 739 struct buf *bp = NULL; 740 uid_t d_uid; 741 gid_t d_gid; 742 int error = 0; 743 struct fs *fs; 744 745 /* 746 * not an inode we care about 747 */ 748 if (ip->i_ufsvfs != (struct ufsvfs *)arg) 749 return (0); 750 751 fs = ip->i_fs; 752 753 /* 754 * Inode 
reconciliation fails: we made the filesystem quiescent 755 * and we did a ufs_flush() before calling ufs_reconcile_inode() 756 * and thus the inode should not have been changed inbetween. 757 * Any discrepancies indicate a logic error and a pretty 758 * significant run-state inconsistency we should complain about. 759 */ 760 if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) { 761 cmn_err(CE_WARN, "%s: Inode reconciliation failed for" 762 "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number); 763 return (EINVAL); 764 } 765 766 /* 767 * get the dinode 768 */ 769 bp = UFS_BREAD(ip->i_ufsvfs, 770 ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)), 771 (int)fs->fs_bsize); 772 if (bp->b_flags & B_ERROR) { 773 brelse(bp); 774 return (EIO); 775 } 776 dp = bp->b_un.b_dino; 777 dp += itoo(fs, ip->i_number); 778 779 /* 780 * handle Sun's implementation of EFT 781 */ 782 d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid; 783 d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (uid_t)dp->di_sgid; 784 785 rw_enter(&ip->i_contents, RW_WRITER); 786 787 /* 788 * some fields are not allowed to change 789 */ 790 if ((ip->i_mode != dp->di_mode) || 791 (ip->i_gen != dp->di_gen) || 792 (ip->i_uid != d_uid) || 793 (ip->i_gid != d_gid)) { 794 error = EACCES; 795 goto out; 796 } 797 798 /* 799 * and some are allowed to change 800 */ 801 ip->i_size = dp->di_size; 802 ip->i_ic.ic_flags = dp->di_ic.ic_flags; 803 ip->i_blocks = dp->di_blocks; 804 ip->i_nlink = dp->di_nlink; 805 if (ip->i_flag & IFASTSYMLNK) { 806 ndaddr = 1; 807 niaddr = 0; 808 } else { 809 ndaddr = NDADDR; 810 niaddr = NIADDR; 811 } 812 for (i = 0; i < ndaddr; ++i) 813 ip->i_db[i] = dp->di_db[i]; 814 for (i = 0; i < niaddr; ++i) 815 ip->i_ib[i] = dp->di_ib[i]; 816 817 out: 818 rw_exit(&ip->i_contents); 819 brelse(bp); 820 return (error); 821 } 822 823 /* 824 * ufs_reconcile 825 * reconcile ondisk superblock/inodes with any incore 826 */ 827 static int 828 ufs_reconcile(struct vfs *vfsp, struct 
/*
 * File system locking
 *
 * ufs_fiolfs
 *	Entry point that forwards to ufs__fiolfs() with from_user set to 1.
 *	Returns whatever ufs__fiolfs() returns.
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
	return (ufs__fiolfs(vp, lockfsp, /* from_user */ 1, from_log));
}

/*
 * ufs__fiolfs
 *	kernel-internal interface, also used by fix-on-panic
 *
 *	Change the lock state of the file system holding vp to the state
 *	requested in lockfsp.
 *
 *	vp        - any vnode on the file system to be locked
 *	lockfsp   - the requested lock (type, key, comment)
 *	from_user - non-zero when the request came in through ufs_fiolfs();
 *		    only consulted when a new error lock has been set
 *	from_log  - non-zero skips the accounting/swap checks
 *
 *	Returns 0 on success, otherwise:
 *	EINVAL	no request, lock type out of range, a read-only error
 *		lock was requested, or ufs_getlfd() rejected the request
 *	EIO	vp has no vfs data, or the fs is already hard locked
 *	EBUSY	another lockfs request is in progress, or the requested
 *		transition is not permitted from the current lock
 *	other	errors from ufs_quiesce(), ufs_checkaccton(),
 *		ufs_checkswapon(), ufs_reconcile(), ufs_flush() and
 *		ufs_thaw()
 *
 *	Exit paths:
 *	success	the new lock stays in place
 *	errout	the previous lock is restored unless the fs is hard locked
 *	errexit	nothing has been changed yet
 *	All three drop ufs_quiesce_pend, ul_lock and the vfs lock and let
 *	the delete and reclaim threads continue.
 */
int
ufs__fiolfs(
	struct vnode *vp,
	struct lockfs *lockfsp,
	int from_user,
	int from_log)
{
	struct ulockfs	*ulp;
	struct lockfs	lfs;		/* saved copy of the previous lock */
	int		error;
	struct vfs	*vfsp;
	struct ufsvfs	*ufsvfsp;
	int		 errlck		= NO_ERRLCK;
	int		 poll_events	= POLLPRI;
	extern struct pollhead ufs_pollhd;
	ulockfs_info_t *head;
	ulockfs_info_t *info;		/* this thread's TSD entry for ulp */
	int signal = 0;			/* set when ufs_quiesce() fails */

	/* check valid lock type */
	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
		return (EINVAL);

	if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
		return (EIO);

	vfsp = vp->v_vfsp;
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	ulp = &ufsvfsp->vfs_ulockfs;

	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * Suspend both the reclaim thread and the delete thread.
	 * This must be done outside the lockfs locking protocol.
	 */
	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
	ufs_thread_suspend(&ufsvfsp->vfs_delete);

	/*
	 * Acquire vfs_reflock around ul_lock to avoid deadlock with
	 * umount/remount/sync.
	 */
	vfs_lock_wait(vfsp);
	mutex_enter(&ulp->ul_lock);
	atomic_add_long(&ufs_quiesce_pend, 1);

	/*
	 * Quit if there is another lockfs request in progress
	 * that is waiting for existing ufs_vnops to complete.
	 */
	if (ULOCKFS_IS_BUSY(ulp)) {
		error = EBUSY;
		goto errexit;
	}

	/* a hard lock cannot be unlocked or downgraded */
	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto errexit;
	}

	/* an error lock may be unlocked or relocked, only */
	if (ULOCKFS_IS_ELOCK(ulp)) {
		if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * a read-only error lock may only be upgraded to an
	 * error lock or hard lock
	 */
	if (ULOCKFS_IS_ROELOCK(ulp)) {
		if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * until read-only error locks are fully implemented
	 * just return EINVAL
	 */
	if (LOCKFS_IS_ROELOCK(lockfsp)) {
		error = EINVAL;
		goto errexit;
	}

	/*
	 * an error lock may only be applied if the file system is
	 * unlocked or already error locked.
	 * (this is to prevent the case where a fs gets changed out from
	 * underneath a fs that is locked for backup,
	 * that is, name/delete/write-locked.)
	 */
	if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
	    !ULOCKFS_IS_ROELOCK(ulp)) &&
	    (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
		error = EBUSY;
		goto errexit;
	}

	/* get and validate the input lockfs request */
	if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
		goto errexit;

	/*
	 * save current ulockfs struct
	 * (restored at errout if the request fails later on)
	 */
	bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));

	/*
	 * Freeze the file system (pend future accesses)
	 */
	ufs_freeze(ulp, lockfsp);

	/*
	 * Set locking in progress because ufs_quiesce may free the
	 * ul_lock mutex.
	 */
	ULOCKFS_SET_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_SET_BUSY(&ulp->ul_lockfs);

	/*
	 * We need to unset FWLOCK status before we call ufs_quiesce
	 * so that the thread doesn't get suspended. We do this only if
	 * this (fallocate) thread requested an unlock operation.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (!ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_CLR_FWLOCK(ulp);
	}

	/*
	 * Quiesce (wait for outstanding accesses to finish)
	 */
	if (error = ufs_quiesce(ulp)) {
		/*
		 * Interrupted due to signal. There could still be
		 * pending vnops.
		 */
		signal = 1;

		/*
		 * We do broadcast because lock-status
		 * could be reverted to old status.
		 */
		cv_broadcast(&ulp->ul_cv);
		goto errout;
	}

	/*
	 * If the fallocate thread requested a write fs lock operation
	 * then we set fwlock status in the ulp.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_SET_FWLOCK(ulp);
	}

	/*
	 * can't wlock or (ro)elock fs with accounting or local swap file
	 */
	if ((ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
	    ULOCKFS_IS_ROELOCK(ulp)) && !from_log) {
		if (error = ufs_checkaccton(vp))
			goto errout;
		if (error = ufs_checkswapon(vp))
			goto errout;
	}

	/*
	 * save error lock status to pass down to reconciliation
	 * routines and for later cleanup
	 */
	if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
		errlck = UN_ERRLCK;

	if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
		int needs_unlock;
		int needs_sbwrite;

		poll_events |= POLLERR;
		/* relock if the previous state was an error lock too */
		errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ?
		    RE_ERRLCK : SET_ERRLCK;

		/* take vfs_lock only if this thread does not hold it yet */
		needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
		if (needs_unlock)
			mutex_enter(&ufsvfsp->vfs_lock);

		/* disable delayed i/o */
		needs_sbwrite = 0;

		/* a freshly applied error lock marks the fs bad */
		if (errlck == SET_ERRLCK) {
			ufsvfsp->vfs_fs->fs_clean = FSBAD;
			needs_sbwrite = 1;
		}

		needs_sbwrite |= ufsvfsp->vfs_dio;
		ufsvfsp->vfs_dio = 0;

		if (needs_unlock)
			mutex_exit(&ufsvfsp->vfs_lock);

		if (needs_sbwrite) {
			ulp->ul_sbowner = curthread;
			TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);

			if (needs_unlock)
				mutex_enter(&ufsvfsp->vfs_lock);

			ufsvfsp->vfs_fs->fs_fmod = 0;

			if (needs_unlock)
				mutex_exit(&ufsvfsp->vfs_lock);
		}
	}

	/*
	 * reconcile superblock and inodes if was wlocked
	 * (or error locked)
	 */
	if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
		if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
			goto errout;
		/*
		 * in case the fs grew; reset the metadata map for logging tests
		 */
		TRANS_MATA_UMOUNT(ufsvfsp);
		TRANS_MATA_MOUNT(ufsvfsp);
		TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
	}

	/*
	 * At least everything *currently* dirty goes out.
	 * A flush error is fatal unless the new lock is a hard lock or
	 * an error lock.
	 */

	if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
	    !ULOCKFS_IS_ELOCK(ulp))
		goto errout;

	/*
	 * thaw file system and wakeup pended processes
	 * (a thaw error is likewise ignored for hard and error locks)
	 */
	if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
		if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
			goto errout;

	/*
	 * reset modified flag if not already write locked
	 */
	if (!LOCKFS_IS_WLOCK(&lfs))
		ULOCKFS_CLR_MOD(ulp);

	/*
	 * idle the lock struct
	 */
	ULOCKFS_CLR_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

	/*
	 * free current comment
	 * (the one belonging to the lock that has just been replaced)
	 */
	if (lfs.lf_comment && lfs.lf_comlen != 0) {
		kmem_free(lfs.lf_comment, lfs.lf_comlen);
		lfs.lf_comment = NULL;
		lfs.lf_comlen = 0;
	}

	/* do error lock cleanup */
	if (errlck == UN_ERRLCK)
		ufsfx_unlockfs(ufsvfsp);

	else if (errlck == RE_ERRLCK)
		ufsfx_lockfs(ufsvfsp);

	/* don't allow error lock from user to invoke panic */
	else if (from_user && errlck == SET_ERRLCK &&
	    !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
		(void) ufs_fault(ufsvfsp->vfs_root,
		    ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
		    ulp->ul_lockfs.lf_comment: "user-applied error lock");

	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
		poll_events |= POLLERR;

	pollwakeup(&ufs_pollhd, poll_events);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (0);

errout:
	/*
	 * Lock failed. Reset the old lock in ufsvfs if not hard locked.
	 */
	if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
		bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
		ulp->ul_fs_lock = (1 << lfs.lf_lock);
	}

	/*
	 * Don't call ufs_thaw() when there's a signal during
	 * ufs quiesce operation as it can lead to deadlock
	 * with getpage.
	 */
	if (signal == 0)
		(void) ufs_thaw(vfsp, ufsvfsp, ulp);

	ULOCKFS_CLR_BUSY(ulp);
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

errexit:
	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (error);
}

/*
 * fiolfss
 *	return the current file system locking state info
 *
 *	The lock state of the file system holding vp is copied into
 *	*lockfsp.  When the fs is hard locked the copy is made without
 *	taking ul_lock and LOCKFS_MOD is not reported; otherwise the copy
 *	is made under ul_lock and LOCKFS_MOD is added to lf_flags if the
 *	modified flag is set.
 *
 *	Returns 0 on success, EINVAL if vp, its vfs or its inode is
 *	missing, and EIO if the inode has no ufsvfs.
 */
int
ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
{
	struct ulockfs	*ulp;

	if (!vp || !vp->v_vfsp || !VTOI(vp))
		return (EINVAL);

	/* file system has been forcibly unmounted */
	if (VTOI(vp)->i_ufsvfs == NULL)
		return (EIO);

	ulp = VTOUL(vp);

	if (ULOCKFS_IS_HLOCK(ulp)) {
		*lockfsp = ulp->ul_lockfs;	/* structure assignment */
		return (0);
	}

	mutex_enter(&ulp->ul_lock);

	*lockfsp = ulp->ul_lockfs;	/* structure assignment */

	if (ULOCKFS_IS_MOD(ulp))
		lockfsp->lf_flags |= LOCKFS_MOD;

	mutex_exit(&ulp->ul_lock);

	return (0);
}
!slock) { 1252 curthread->t_flag |= T_WOULDBLOCK; 1253 return (EAGAIN); 1254 } 1255 curthread->t_flag &= ~T_WOULDBLOCK; 1256 1257 /* 1258 * In the case of an onerr umount of the fs, threads could 1259 * have blocked before coming into ufs_check_lockfs and 1260 * need to check for the special case of ELOCK and 1261 * vfs_dontblock being set which would indicate that the fs 1262 * is on its way out and will not return therefore making 1263 * EIO the appropriate response. 1264 */ 1265 if (ULOCKFS_IS_HLOCK(ulp) || 1266 (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock)) 1267 return (EIO); 1268 1269 /* 1270 * wait for lock status to change 1271 */ 1272 if (slock || ufsvfsp->vfs_nointr) { 1273 cv_wait(&ulp->ul_cv, &ulp->ul_lock); 1274 } else { 1275 sigintr(&smask, 1); 1276 sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock); 1277 sigunintr(&smask); 1278 if ((!sig && (ulp->ul_fs_lock & mask)) || 1279 ufsvfsp->vfs_dontblock) 1280 return (EINTR); 1281 } 1282 } 1283 1284 if (mask & ULOCKFS_FWLOCK) { 1285 atomic_add_long(&ulp->ul_falloc_cnt, 1); 1286 ULOCKFS_SET_FALLOC(ulp); 1287 } else { 1288 atomic_add_long(&ulp->ul_vnops_cnt, 1); 1289 } 1290 1291 return (0); 1292 } 1293 1294 /* 1295 * Check whether we came across the handcrafted lockfs protocol path. We can't 1296 * simply check for T_DONTBLOCK here as one would assume since this can also 1297 * falsely catch recursive VOP's going to a different filesystem, instead we 1298 * check if we already hold the ulockfs->ul_lock mutex. 1299 */ 1300 static int 1301 ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp) 1302 { 1303 return ((mutex_owner(&ulp->ul_lock) != curthread) ? 
0 : 1); 1304 } 1305 1306 /* 1307 * ufs_lockfs_begin - start the lockfs locking protocol 1308 */ 1309 int 1310 ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask) 1311 { 1312 int error; 1313 int rec_vop; 1314 ushort_t op_cnt_incremented = 0; 1315 ulong_t *ctr; 1316 struct ulockfs *ulp; 1317 ulockfs_info_t *ulockfs_info; 1318 ulockfs_info_t *ulockfs_info_free; 1319 ulockfs_info_t *ulockfs_info_temp; 1320 1321 /* 1322 * file system has been forcibly unmounted 1323 */ 1324 if (ufsvfsp == NULL) 1325 return (EIO); 1326 1327 *ulpp = ulp = &ufsvfsp->vfs_ulockfs; 1328 1329 /* 1330 * Do lockfs protocol 1331 */ 1332 ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 1333 IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free); 1334 1335 /* 1336 * Detect recursive VOP call or handcrafted internal lockfs protocol 1337 * path and bail out in that case. 1338 */ 1339 if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) { 1340 *ulpp = NULL; 1341 return (0); 1342 } else { 1343 if (ulockfs_info_free == NULL) { 1344 if ((ulockfs_info_temp = (ulockfs_info_t *) 1345 kmem_zalloc(sizeof (ulockfs_info_t), 1346 KM_NOSLEEP)) == NULL) { 1347 *ulpp = NULL; 1348 return (ENOMEM); 1349 } 1350 } 1351 } 1352 1353 /* 1354 * First time VOP call 1355 * 1356 * Increment the ctr irrespective of the lockfs state. If the lockfs 1357 * state is not ULOCKFS_ULOCK, we can decrement it later. However, 1358 * before incrementing we need to check if there is a pending quiesce 1359 * request because if we have a continuous stream of ufs_lockfs_begin 1360 * requests pounding on a few cpu's then the ufs_quiesce thread might 1361 * never see the value of zero for ctr - a livelock kind of scenario. 1362 */ 1363 ctr = (mask & ULOCKFS_FWLOCK) ? 
1364 &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt; 1365 if (!ULOCKFS_IS_SLOCK(ulp)) { 1366 atomic_add_long(ctr, 1); 1367 op_cnt_incremented++; 1368 } 1369 1370 /* 1371 * If the lockfs state (indicated by ul_fs_lock) is not just 1372 * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs 1373 * where there is a check with an appropriate mask to selectively allow 1374 * operations permitted for that kind of lockfs state. 1375 * 1376 * Even these selective operations should not be allowed to go through 1377 * if a lockfs request is in progress because that could result in inode 1378 * modifications during a quiesce and could hence result in inode 1379 * reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient, 1380 * so make use of ufs_quiesce_pend to disallow vnode operations when a 1381 * quiesce is in progress. 1382 */ 1383 if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) { 1384 if (op_cnt_incremented) 1385 if (!atomic_add_long_nv(ctr, -1)) 1386 cv_broadcast(&ulp->ul_cv); 1387 mutex_enter(&ulp->ul_lock); 1388 error = ufs_check_lockfs(ufsvfsp, ulp, mask); 1389 mutex_exit(&ulp->ul_lock); 1390 if (error) { 1391 if (ulockfs_info_free == NULL) 1392 kmem_free(ulockfs_info_temp, 1393 sizeof (ulockfs_info_t)); 1394 return (error); 1395 } 1396 } else { 1397 /* 1398 * This is the common case of file system in a unlocked state. 1399 * 1400 * If a file system is unlocked, we would expect the ctr to have 1401 * been incremented by now. But this will not be true when a 1402 * quiesce is winding up - SLOCK was set when we checked before 1403 * incrementing the ctr, but by the time we checked for 1404 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is okay 1405 * to take ul_lock and go through the slow path in this uncommon 1406 * case. 
1407 */ 1408 if (op_cnt_incremented == 0) { 1409 mutex_enter(&ulp->ul_lock); 1410 error = ufs_check_lockfs(ufsvfsp, ulp, mask); 1411 if (error) { 1412 mutex_exit(&ulp->ul_lock); 1413 if (ulockfs_info_free == NULL) 1414 kmem_free(ulockfs_info_temp, 1415 sizeof (ulockfs_info_t)); 1416 return (error); 1417 } 1418 if (mask & ULOCKFS_FWLOCK) 1419 ULOCKFS_SET_FALLOC(ulp); 1420 mutex_exit(&ulp->ul_lock); 1421 } else if (mask & ULOCKFS_FWLOCK) { 1422 mutex_enter(&ulp->ul_lock); 1423 ULOCKFS_SET_FALLOC(ulp); 1424 mutex_exit(&ulp->ul_lock); 1425 } 1426 } 1427 1428 if (ulockfs_info_free != NULL) { 1429 ulockfs_info_free->ulp = ulp; 1430 if (mask & ULOCKFS_FWLOCK) 1431 ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE; 1432 } else { 1433 ulockfs_info_temp->ulp = ulp; 1434 ulockfs_info_temp->next = ulockfs_info; 1435 if (mask & ULOCKFS_FWLOCK) 1436 ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE; 1437 ASSERT(ufs_lockfs_key != 0); 1438 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp); 1439 } 1440 1441 curthread->t_flag |= T_DONTBLOCK; 1442 return (0); 1443 } 1444 1445 /* 1446 * Check whether we are returning from the top level VOP. 1447 */ 1448 static int 1449 ufs_lockfs_top_vop_return(ulockfs_info_t *head) 1450 { 1451 ulockfs_info_t *info; 1452 int result = 1; 1453 1454 for (info = head; info != NULL; info = info->next) { 1455 if (info->ulp != NULL) { 1456 result = 0; 1457 break; 1458 } 1459 } 1460 1461 return (result); 1462 } 1463 1464 /* 1465 * ufs_lockfs_end - terminate the lockfs locking protocol 1466 */ 1467 void 1468 ufs_lockfs_end(struct ulockfs *ulp) 1469 { 1470 ulockfs_info_t *info; 1471 ulockfs_info_t *head; 1472 1473 /* 1474 * end-of-VOP protocol 1475 */ 1476 if (ulp == NULL) 1477 return; 1478 1479 head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 1480 SEARCH_ULOCKFSP(head, ulp, info); 1481 1482 /* 1483 * If we're called from a first level VOP, we have to have a 1484 * valid ulockfs record in the TSD. 
1485 */ 1486 ASSERT(info != NULL); 1487 1488 /* 1489 * Invalidate the ulockfs record. 1490 */ 1491 info->ulp = NULL; 1492 1493 if (ufs_lockfs_top_vop_return(head)) 1494 curthread->t_flag &= ~T_DONTBLOCK; 1495 1496 /* fallocate thread */ 1497 if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) { 1498 /* Clear the thread's fallocate state */ 1499 info->flags &= ~ULOCK_INFO_FALLOCATE; 1500 if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1)) { 1501 mutex_enter(&ulp->ul_lock); 1502 ULOCKFS_CLR_FALLOC(ulp); 1503 cv_broadcast(&ulp->ul_cv); 1504 mutex_exit(&ulp->ul_lock); 1505 } 1506 } else { /* normal thread */ 1507 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 1508 cv_broadcast(&ulp->ul_cv); 1509 } 1510 } 1511 1512 /* 1513 * ufs_lockfs_trybegin - try to start the lockfs locking protocol without 1514 * blocking. 1515 */ 1516 int 1517 ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask) 1518 { 1519 int error = 0; 1520 int rec_vop; 1521 ushort_t op_cnt_incremented = 0; 1522 ulong_t *ctr; 1523 struct ulockfs *ulp; 1524 ulockfs_info_t *ulockfs_info; 1525 ulockfs_info_t *ulockfs_info_free; 1526 ulockfs_info_t *ulockfs_info_temp; 1527 1528 /* 1529 * file system has been forcibly unmounted 1530 */ 1531 if (ufsvfsp == NULL) 1532 return (EIO); 1533 1534 *ulpp = ulp = &ufsvfsp->vfs_ulockfs; 1535 1536 /* 1537 * Do lockfs protocol 1538 */ 1539 ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 1540 IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free); 1541 1542 /* 1543 * Detect recursive VOP call or handcrafted internal lockfs protocol 1544 * path and bail out in that case. 
1545 */ 1546 if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) { 1547 *ulpp = NULL; 1548 return (0); 1549 } else { 1550 if (ulockfs_info_free == NULL) { 1551 if ((ulockfs_info_temp = (ulockfs_info_t *) 1552 kmem_zalloc(sizeof (ulockfs_info_t), 1553 KM_NOSLEEP)) == NULL) { 1554 *ulpp = NULL; 1555 return (ENOMEM); 1556 } 1557 } 1558 } 1559 1560 /* 1561 * First time VOP call 1562 * 1563 * Increment the ctr irrespective of the lockfs state. If the lockfs 1564 * state is not ULOCKFS_ULOCK, we can decrement it later. However, 1565 * before incrementing we need to check if there is a pending quiesce 1566 * request because if we have a continuous stream of ufs_lockfs_begin 1567 * requests pounding on a few cpu's then the ufs_quiesce thread might 1568 * never see the value of zero for ctr - a livelock kind of scenario. 1569 */ 1570 ctr = (mask & ULOCKFS_FWLOCK) ? 1571 &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt; 1572 if (!ULOCKFS_IS_SLOCK(ulp)) { 1573 atomic_add_long(ctr, 1); 1574 op_cnt_incremented++; 1575 } 1576 1577 if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) { 1578 /* 1579 * Non-blocking version of ufs_check_lockfs() code. 1580 * 1581 * If the file system is not hard locked or error locked 1582 * and if ulp->ul_fs_lock allows this operation, increment 1583 * the appropriate counter and proceed (For eg., In case the 1584 * file system is delete locked, a mmap can still go through). 
1585 */ 1586 if (op_cnt_incremented) 1587 if (!atomic_add_long_nv(ctr, -1)) 1588 cv_broadcast(&ulp->ul_cv); 1589 mutex_enter(&ulp->ul_lock); 1590 if (ULOCKFS_IS_HLOCK(ulp) || 1591 (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock)) 1592 error = EIO; 1593 else if (ulp->ul_fs_lock & mask) 1594 error = EAGAIN; 1595 1596 if (error) { 1597 mutex_exit(&ulp->ul_lock); 1598 if (ulockfs_info_free == NULL) 1599 kmem_free(ulockfs_info_temp, 1600 sizeof (ulockfs_info_t)); 1601 return (error); 1602 } 1603 atomic_add_long(ctr, 1); 1604 if (mask & ULOCKFS_FWLOCK) 1605 ULOCKFS_SET_FALLOC(ulp); 1606 mutex_exit(&ulp->ul_lock); 1607 } else { 1608 /* 1609 * This is the common case of file system in a unlocked state. 1610 * 1611 * If a file system is unlocked, we would expect the ctr to have 1612 * been incremented by now. But this will not be true when a 1613 * quiesce is winding up - SLOCK was set when we checked before 1614 * incrementing the ctr, but by the time we checked for 1615 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take 1616 * ul_lock and go through the non-blocking version of 1617 * ufs_check_lockfs() code. 
1618 */ 1619 if (op_cnt_incremented == 0) { 1620 mutex_enter(&ulp->ul_lock); 1621 if (ULOCKFS_IS_HLOCK(ulp) || 1622 (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock)) 1623 error = EIO; 1624 else if (ulp->ul_fs_lock & mask) 1625 error = EAGAIN; 1626 1627 if (error) { 1628 mutex_exit(&ulp->ul_lock); 1629 if (ulockfs_info_free == NULL) 1630 kmem_free(ulockfs_info_temp, 1631 sizeof (ulockfs_info_t)); 1632 return (error); 1633 } 1634 atomic_add_long(ctr, 1); 1635 if (mask & ULOCKFS_FWLOCK) 1636 ULOCKFS_SET_FALLOC(ulp); 1637 mutex_exit(&ulp->ul_lock); 1638 } else if (mask & ULOCKFS_FWLOCK) { 1639 mutex_enter(&ulp->ul_lock); 1640 ULOCKFS_SET_FALLOC(ulp); 1641 mutex_exit(&ulp->ul_lock); 1642 } 1643 } 1644 1645 if (ulockfs_info_free != NULL) { 1646 ulockfs_info_free->ulp = ulp; 1647 if (mask & ULOCKFS_FWLOCK) 1648 ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE; 1649 } else { 1650 ulockfs_info_temp->ulp = ulp; 1651 ulockfs_info_temp->next = ulockfs_info; 1652 if (mask & ULOCKFS_FWLOCK) 1653 ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE; 1654 ASSERT(ufs_lockfs_key != 0); 1655 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp); 1656 } 1657 1658 curthread->t_flag |= T_DONTBLOCK; 1659 return (0); 1660 } 1661 1662 /* 1663 * specialized version of ufs_lockfs_begin() called by ufs_getpage(). 
1664 */ 1665 int 1666 ufs_lockfs_begin_getpage( 1667 struct ufsvfs *ufsvfsp, 1668 struct ulockfs **ulpp, 1669 struct seg *seg, 1670 int read_access, 1671 uint_t *protp) 1672 { 1673 ulong_t mask; 1674 int error; 1675 int rec_vop; 1676 struct ulockfs *ulp; 1677 ulockfs_info_t *ulockfs_info; 1678 ulockfs_info_t *ulockfs_info_free; 1679 ulockfs_info_t *ulockfs_info_temp; 1680 1681 /* 1682 * file system has been forcibly unmounted 1683 */ 1684 if (ufsvfsp == NULL) 1685 return (EIO); 1686 1687 *ulpp = ulp = &ufsvfsp->vfs_ulockfs; 1688 1689 /* 1690 * Do lockfs protocol 1691 */ 1692 ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 1693 IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free); 1694 1695 /* 1696 * Detect recursive VOP call or handcrafted internal lockfs protocol 1697 * path and bail out in that case. 1698 */ 1699 if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) { 1700 *ulpp = NULL; 1701 return (0); 1702 } else { 1703 if (ulockfs_info_free == NULL) { 1704 if ((ulockfs_info_temp = (ulockfs_info_t *) 1705 kmem_zalloc(sizeof (ulockfs_info_t), 1706 KM_NOSLEEP)) == NULL) { 1707 *ulpp = NULL; 1708 return (ENOMEM); 1709 } 1710 } 1711 } 1712 1713 /* 1714 * First time VOP call 1715 */ 1716 atomic_add_long(&ulp->ul_vnops_cnt, 1); 1717 if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) { 1718 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 1719 cv_broadcast(&ulp->ul_cv); 1720 mutex_enter(&ulp->ul_lock); 1721 if (seg->s_ops == &segvn_ops && 1722 ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) { 1723 mask = (ulong_t)ULOCKFS_GETREAD_MASK; 1724 } else if (protp && read_access) { 1725 /* 1726 * Restrict the mapping to readonly. 
1727 * Writes to this mapping will cause 1728 * another fault which will then 1729 * be suspended if fs is write locked 1730 */ 1731 *protp &= ~PROT_WRITE; 1732 mask = (ulong_t)ULOCKFS_GETREAD_MASK; 1733 } else 1734 mask = (ulong_t)ULOCKFS_GETWRITE_MASK; 1735 1736 /* 1737 * will sleep if this fs is locked against this VOP 1738 */ 1739 error = ufs_check_lockfs(ufsvfsp, ulp, mask); 1740 mutex_exit(&ulp->ul_lock); 1741 if (error) { 1742 if (ulockfs_info_free == NULL) 1743 kmem_free(ulockfs_info_temp, 1744 sizeof (ulockfs_info_t)); 1745 return (error); 1746 } 1747 } 1748 1749 if (ulockfs_info_free != NULL) { 1750 ulockfs_info_free->ulp = ulp; 1751 } else { 1752 ulockfs_info_temp->ulp = ulp; 1753 ulockfs_info_temp->next = ulockfs_info; 1754 ASSERT(ufs_lockfs_key != 0); 1755 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp); 1756 } 1757 1758 curthread->t_flag |= T_DONTBLOCK; 1759 return (0); 1760 } 1761 1762 void 1763 ufs_lockfs_tsd_destructor(void *head) 1764 { 1765 ulockfs_info_t *curr = (ulockfs_info_t *)head; 1766 ulockfs_info_t *temp; 1767 1768 for (; curr != NULL; ) { 1769 /* 1770 * The TSD destructor is being called when the thread exits 1771 * (via thread_exit()). At that time it must have cleaned up 1772 * all VOPs via ufs_lockfs_end() and there must not be a 1773 * valid ulockfs record exist while a thread is exiting. 1774 */ 1775 temp = curr; 1776 curr = curr->next; 1777 ASSERT(temp->ulp == NULL); 1778 kmem_free(temp, sizeof (ulockfs_info_t)); 1779 } 1780 } 1781