1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/t_lock.h> 28 #include <sys/param.h> 29 #include <sys/time.h> 30 #include <sys/systm.h> 31 #include <sys/sysmacros.h> 32 #include <sys/resource.h> 33 #include <sys/signal.h> 34 #include <sys/cred.h> 35 #include <sys/user.h> 36 #include <sys/buf.h> 37 #include <sys/vfs.h> 38 #include <sys/vnode.h> 39 #include <sys/proc.h> 40 #include <sys/disp.h> 41 #include <sys/file.h> 42 #include <sys/fcntl.h> 43 #include <sys/flock.h> 44 #include <sys/atomic.h> 45 #include <sys/kmem.h> 46 #include <sys/uio.h> 47 #include <sys/conf.h> 48 #include <sys/mman.h> 49 #include <sys/pathname.h> 50 #include <sys/debug.h> 51 #include <sys/vmmeter.h> 52 #include <sys/vmsystm.h> 53 #include <sys/cmn_err.h> 54 #include <sys/acct.h> 55 #include <sys/dnlc.h> 56 #include <sys/swap.h> 57 58 #include <sys/fs/ufs_fs.h> 59 #include <sys/fs/ufs_inode.h> 60 #include <sys/fs/ufs_fsdir.h> 61 #include <sys/fs/ufs_trans.h> 62 #include <sys/fs/ufs_panic.h> 63 #include <sys/fs/ufs_mount.h> 64 #include <sys/fs/ufs_bio.h> 65 #include <sys/fs/ufs_log.h> 66 #include <sys/fs/ufs_quota.h> 67 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! */ 68 #include <sys/errno.h> 69 #include <sys/sysinfo.h> 70 71 #include <vm/hat.h> 72 #include <vm/pvn.h> 73 #include <vm/as.h> 74 #include <vm/seg.h> 75 #include <vm/seg_map.h> 76 #include <vm/seg_vn.h> 77 #include <vm/rm.h> 78 #include <vm/anon.h> 79 #include <sys/swap.h> 80 #include <sys/dnlc.h> 81 82 extern struct vnode *common_specvp(struct vnode *vp); 83 84 /* error lock status */ 85 #define UN_ERRLCK (-1) 86 #define SET_ERRLCK 1 87 #define RE_ERRLCK 2 88 #define NO_ERRLCK 0 89 90 /* 91 * Index to be used in TSD for storing lockfs data 92 */ 93 uint_t ufs_lockfs_key; 94 95 typedef struct _ulockfs_info { 96 struct _ulockfs_info *next; 97 struct ulockfs *ulp; 98 uint_t flags; 99 } ulockfs_info_t; 100 101 #define ULOCK_INFO_FALLOCATE 0x00000001 /* fallocate thread */ 102 103 /* 104 * Check in TSD that whether we are already doing any VOP on this filesystem 105 */ 106 #define IS_REC_VOP(found, head, ulp, free) \ 107 { \ 108 ulockfs_info_t *_curr; \ 109 \ 110 for (found = 0, free = NULL, _curr = head; \ 111 _curr != NULL; _curr = _curr->next) { \ 112 if ((free == NULL) && \ 113 (_curr->ulp == NULL)) \ 114 free = _curr; \ 115 if (_curr->ulp == ulp) { \ 116 found = 1; \ 117 break; \ 118 } \ 119 } \ 120 } 121 122 /* 123 * Get the lockfs data from TSD so that lockfs handles the recursive VOP 124 * properly 125 */ 126 #define SEARCH_ULOCKFSP(head, ulp, info) \ 127 { \ 128 ulockfs_info_t *_curr; \ 129 \ 130 for (_curr = head; _curr != NULL; \ 131 _curr = _curr->next) { \ 132 if (_curr->ulp == ulp) { \ 133 break; \ 134 } \ 135 } \ 136 \ 137 info = _curr; \ 138 } 139 140 /* 141 * Validate lockfs request 142 */ 143 static int 144 ufs_getlfd( 145 struct lockfs *lockfsp, /* new lock request */ 146 struct lockfs *ul_lockfsp) /* old lock state */ 147 { 148 int error = 0; 149 150 /* 151 * no input flags defined 152 */ 153 if (lockfsp->lf_flags != 0) { 154 error = EINVAL; 155 goto errout; 156 } 157 158 /* 159 * check key 160 */ 161 if (!LOCKFS_IS_ULOCK(ul_lockfsp)) 162 if (lockfsp->lf_key != ul_lockfsp->lf_key) { 163 error = EINVAL; 164 goto errout; 165 } 166 167 lockfsp->lf_key = ul_lockfsp->lf_key + 1; 168 169 errout: 170 return (error); 171 } 172 173 /* 174 * ufs_checkaccton 175 * check if accounting is turned on on this fs 176 */ 177 178 int 179 ufs_checkaccton(struct vnode *vp) 180 { 181 if (acct_fs_in_use(vp)) 182 return (EDEADLK); 183 return (0); 184 } 185 186 /* 187 * ufs_checkswapon 188 * check if local swapping is to file on this fs 189 */ 190 int 191 ufs_checkswapon(struct vnode *vp) 192 { 193 struct swapinfo *sip; 194 195 mutex_enter(&swapinfo_lock); 196 for (sip = swapinfo; sip; sip = sip->si_next) 197 if (sip->si_vp->v_vfsp == vp->v_vfsp) { 198 mutex_exit(&swapinfo_lock); 199 return (EDEADLK); 200 } 201 mutex_exit(&swapinfo_lock); 202 return (0); 203 } 204 205 /* 206 * ufs_freeze 207 * pend future accesses for current lock and desired lock 208 */ 209 void 210 ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp) 211 { 212 /* 213 * set to new lock type 214 */ 215 ulp->ul_lockfs.lf_lock = lockfsp->lf_lock; 216 ulp->ul_lockfs.lf_key = lockfsp->lf_key; 217 ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen; 218 ulp->ul_lockfs.lf_comment = lockfsp->lf_comment; 219 220 ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock); 221 } 222 223 /* 224 * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before 225 * starting ufs_quiesce() protocol and decrement it only when a file system no 226 * longer has to be in quiescent state. This allows ufs_pageio() to detect 227 * that another thread wants to quiesce a file system. See more comments in 228 * ufs_pageio(). 229 */ 230 ulong_t ufs_quiesce_pend = 0; 231 232 /* 233 * ufs_quiesce 234 * wait for outstanding accesses to finish 235 */ 236 int 237 ufs_quiesce(struct ulockfs *ulp) 238 { 239 int error = 0; 240 ulockfs_info_t *head; 241 ulockfs_info_t *info; 242 243 head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 244 SEARCH_ULOCKFSP(head, ulp, info); 245 246 /* 247 * Set a softlock to suspend future ufs_vnops so that 248 * this lockfs request will not be starved 249 */ 250 ULOCKFS_SET_SLOCK(ulp); 251 ASSERT(ufs_quiesce_pend); 252 253 /* check if there is any outstanding ufs vnodeops calls */ 254 while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) { 255 /* 256 * use timed version of cv_wait_sig() to make sure we don't 257 * miss a wake up call from ufs_pageio() when it doesn't use 258 * ul_lock. 259 * 260 * when a fallocate thread comes in, the only way it returns 261 * from this function is if there are no other vnode operations 262 * going on (remember fallocate threads are tracked using 263 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread 264 * hasn't already grabbed the fs write lock. 265 */ 266 if (info && (info->flags & ULOCK_INFO_FALLOCATE)) { 267 if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp)) 268 goto out; 269 } 270 if (!cv_timedwait_sig(&ulp->ul_cv, &ulp->ul_lock, lbolt + hz)) { 271 error = EINTR; 272 goto out; 273 } 274 } 275 276 out: 277 /* 278 * unlock the soft lock 279 */ 280 ULOCKFS_CLR_SLOCK(ulp); 281 282 return (error); 283 } 284 285 /* 286 * ufs_flush_inode 287 */ 288 int 289 ufs_flush_inode(struct inode *ip, void *arg) 290 { 291 int error; 292 int saverror = 0; 293 294 /* 295 * wrong file system; keep looking 296 */ 297 if (ip->i_ufsvfs != (struct ufsvfs *)arg) 298 return (0); 299 300 /* 301 * asynchronously push all the dirty pages 302 */ 303 if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) && 304 (error != EAGAIN)) 305 saverror = error; 306 /* 307 * wait for io and discard all mappings 308 */ 309 if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI)) 310 saverror = error; 311 312 if (ITOV(ip)->v_type == VDIR) { 313 dnlc_dir_purge(&ip->i_danchor); 314 } 315 316 return (saverror); 317 } 318 319 /* 320 * ufs_flush 321 * Flush everything that is currently dirty; this includes invalidating 322 * any mappings. 323 */ 324 int 325 ufs_flush(struct vfs *vfsp) 326 { 327 int error; 328 int saverror = 0; 329 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 330 struct fs *fs = ufsvfsp->vfs_fs; 331 int tdontblock = 0; 332 333 ASSERT(vfs_lock_held(vfsp)); 334 335 /* 336 * purge dnlc 337 */ 338 (void) dnlc_purge_vfsp(vfsp, 0); 339 340 /* 341 * drain the delete and idle threads 342 */ 343 ufs_delete_drain(vfsp, 0, 0); 344 ufs_idle_drain(vfsp); 345 346 /* 347 * flush and invalidate quota records 348 */ 349 (void) qsync(ufsvfsp); 350 351 /* 352 * flush w/invalidate the inodes for vfsp 353 */ 354 if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp)) 355 saverror = error; 356 357 /* 358 * synchronously flush superblock and summary info 359 */ 360 if (fs->fs_ronly == 0 && fs->fs_fmod) { 361 fs->fs_fmod = 0; 362 TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH); 363 } 364 /* 365 * flush w/invalidate block device pages and buf cache 366 */ 367 if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp), 368 (offset_t)0, 0, B_INVAL, CRED(), NULL)) > 0) 369 saverror = error; 370 371 (void) bflush((dev_t)vfsp->vfs_dev); 372 (void) bfinval((dev_t)vfsp->vfs_dev, 0); 373 374 /* 375 * drain the delete and idle threads again 376 */ 377 ufs_delete_drain(vfsp, 0, 0); 378 ufs_idle_drain(vfsp); 379 380 /* 381 * play with the clean flag 382 */ 383 if (saverror == 0) 384 ufs_checkclean(vfsp); 385 386 /* 387 * Flush any outstanding transactions and roll the log 388 * only if we are supposed to do, i.e. LDL_NOROLL not set. 389 * We can not simply check for fs_ronly here since fsck also may 390 * use this code to roll the log on a read-only filesystem, e.g. 391 * root during early stages of boot, if other then a sanity check is 392 * done, it will clear LDL_NOROLL before. 393 * In addition we assert that the deltamap does not contain any deltas 394 * in case LDL_NOROLL is set since this is not supposed to happen. 395 */ 396 if (TRANS_ISTRANS(ufsvfsp)) { 397 ml_unit_t *ul = ufsvfsp->vfs_log; 398 mt_map_t *mtm = ul->un_deltamap; 399 400 if (ul->un_flags & LDL_NOROLL) { 401 ASSERT(mtm->mtm_nme == 0); 402 } else { 403 /* 404 * Do not set T_DONTBLOCK if there is a 405 * transaction opened by caller. 406 */ 407 if (curthread->t_flag & T_DONTBLOCK) 408 tdontblock = 1; 409 else 410 curthread->t_flag |= T_DONTBLOCK; 411 412 TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH, 413 TOP_COMMIT_SIZE, error); 414 415 if (!error) { 416 TRANS_END_SYNC(ufsvfsp, saverror, 417 TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE); 418 } 419 420 if (tdontblock == 0) 421 curthread->t_flag &= ~T_DONTBLOCK; 422 423 logmap_roll_dev(ufsvfsp->vfs_log); 424 } 425 } 426 427 return (saverror); 428 } 429 430 /* 431 * ufs_thaw_wlock 432 * special processing when thawing down to wlock 433 */ 434 static int 435 ufs_thaw_wlock(struct inode *ip, void *arg) 436 { 437 /* 438 * wrong file system; keep looking 439 */ 440 if (ip->i_ufsvfs != (struct ufsvfs *)arg) 441 return (0); 442 443 /* 444 * iupdat refuses to clear flags if the fs is read only. The fs 445 * may become read/write during the lock and we wouldn't want 446 * these inodes being written to disk. So clear the flags. 447 */ 448 rw_enter(&ip->i_contents, RW_WRITER); 449 ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG); 450 rw_exit(&ip->i_contents); 451 452 /* 453 * pages are mlocked -- fail wlock 454 */ 455 if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip))) 456 return (EBUSY); 457 458 return (0); 459 } 460 461 /* 462 * ufs_thaw_hlock 463 * special processing when thawing down to hlock or elock 464 */ 465 static int 466 ufs_thaw_hlock(struct inode *ip, void *arg) 467 { 468 struct vnode *vp = ITOV(ip); 469 470 /* 471 * wrong file system; keep looking 472 */ 473 if (ip->i_ufsvfs != (struct ufsvfs *)arg) 474 return (0); 475 476 /* 477 * blow away all pages - even if they are mlocked 478 */ 479 do { 480 (void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK); 481 } while ((vp->v_type != VCHR) && vn_has_cached_data(vp)); 482 rw_enter(&ip->i_contents, RW_WRITER); 483 ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG); 484 rw_exit(&ip->i_contents); 485 486 return (0); 487 } 488 489 /* 490 * ufs_thaw 491 * thaw file system lock down to current value 492 */ 493 int 494 ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp) 495 { 496 int error = 0; 497 int noidel = (int)(ulp->ul_flag & ULOCKFS_NOIDEL); 498 499 /* 500 * if wlock or hlock or elock 501 */ 502 if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) || 503 ULOCKFS_IS_ELOCK(ulp)) { 504 505 /* 506 * don't keep access times 507 * don't free deleted files 508 * if superblock writes are allowed, limit them to me for now 509 */ 510 ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL); 511 if (ulp->ul_sbowner != (kthread_id_t)-1) 512 ulp->ul_sbowner = curthread; 513 514 /* 515 * wait for writes for deleted files and superblock updates 516 */ 517 (void) ufs_flush(vfsp); 518 519 /* 520 * now make sure the quota file is up-to-date 521 * expensive; but effective 522 */ 523 error = ufs_flush(vfsp); 524 /* 525 * no one can write the superblock 526 */ 527 ulp->ul_sbowner = (kthread_id_t)-1; 528 529 /* 530 * special processing for wlock/hlock/elock 531 */ 532 if (ULOCKFS_IS_WLOCK(ulp)) { 533 if (error) 534 goto errout; 535 error = bfinval(ufsvfsp->vfs_dev, 0); 536 if (error) 537 goto errout; 538 error = ufs_scan_inodes(0, ufs_thaw_wlock, 539 (void *)ufsvfsp, ufsvfsp); 540 if (error) 541 goto errout; 542 } 543 if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) { 544 error = 0; 545 (void) ufs_scan_inodes(0, ufs_thaw_hlock, 546 (void *)ufsvfsp, ufsvfsp); 547 (void) bfinval(ufsvfsp->vfs_dev, 1); 548 } 549 } else { 550 551 /* 552 * okay to keep access times 553 * okay to free deleted files 554 * okay to write the superblock 555 */ 556 ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL); 557 ulp->ul_sbowner = NULL; 558 559 /* 560 * flush in case deleted files are in memory 561 */ 562 if (noidel) { 563 if (error = ufs_flush(vfsp)) 564 goto errout; 565 } 566 } 567 568 errout: 569 cv_broadcast(&ulp->ul_cv); 570 return (error); 571 } 572 573 /* 574 * ufs_reconcile_fs 575 * reconcile incore superblock with ondisk superblock 576 */ 577 int 578 ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck) 579 { 580 struct fs *mfs; /* in-memory superblock */ 581 struct fs *dfs; /* on-disk superblock */ 582 struct buf *bp; /* on-disk superblock buf */ 583 int needs_unlock; 584 char finished_fsclean; 585 586 mfs = ufsvfsp->vfs_fs; 587 588 /* 589 * get the on-disk copy of the superblock 590 */ 591 bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE); 592 bp->b_flags |= (B_STALE|B_AGE); 593 if (bp->b_flags & B_ERROR) { 594 brelse(bp); 595 return (EIO); 596 } 597 dfs = bp->b_un.b_fs; 598 599 /* error locks may only unlock after the fs has been made consistent */ 600 if (errlck == UN_ERRLCK) { 601 if (dfs->fs_clean == FSFIX) { /* being repaired */ 602 brelse(bp); 603 return (EAGAIN); 604 } 605 /* repair not yet started? */ 606 finished_fsclean = TRANS_ISTRANS(ufsvfsp)? FSLOG: FSCLEAN; 607 if (dfs->fs_clean != finished_fsclean) { 608 brelse(bp); 609 return (EBUSY); 610 } 611 } 612 613 /* 614 * if superblock has changed too much, abort 615 */ 616 if ((mfs->fs_sblkno != dfs->fs_sblkno) || 617 (mfs->fs_cblkno != dfs->fs_cblkno) || 618 (mfs->fs_iblkno != dfs->fs_iblkno) || 619 (mfs->fs_dblkno != dfs->fs_dblkno) || 620 (mfs->fs_cgoffset != dfs->fs_cgoffset) || 621 (mfs->fs_cgmask != dfs->fs_cgmask) || 622 (mfs->fs_bsize != dfs->fs_bsize) || 623 (mfs->fs_fsize != dfs->fs_fsize) || 624 (mfs->fs_frag != dfs->fs_frag) || 625 (mfs->fs_bmask != dfs->fs_bmask) || 626 (mfs->fs_fmask != dfs->fs_fmask) || 627 (mfs->fs_bshift != dfs->fs_bshift) || 628 (mfs->fs_fshift != dfs->fs_fshift) || 629 (mfs->fs_fragshift != dfs->fs_fragshift) || 630 (mfs->fs_fsbtodb != dfs->fs_fsbtodb) || 631 (mfs->fs_sbsize != dfs->fs_sbsize) || 632 (mfs->fs_nindir != dfs->fs_nindir) || 633 (mfs->fs_nspf != dfs->fs_nspf) || 634 (mfs->fs_trackskew != dfs->fs_trackskew) || 635 (mfs->fs_cgsize != dfs->fs_cgsize) || 636 (mfs->fs_ntrak != dfs->fs_ntrak) || 637 (mfs->fs_nsect != dfs->fs_nsect) || 638 (mfs->fs_spc != dfs->fs_spc) || 639 (mfs->fs_cpg != dfs->fs_cpg) || 640 (mfs->fs_ipg != dfs->fs_ipg) || 641 (mfs->fs_fpg != dfs->fs_fpg) || 642 (mfs->fs_postblformat != dfs->fs_postblformat) || 643 (mfs->fs_magic != dfs->fs_magic)) { 644 brelse(bp); 645 return (EACCES); 646 } 647 if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time) 648 if (mfs->fs_clean == FSLOG) { 649 brelse(bp); 650 return (EACCES); 651 } 652 653 /* 654 * get new summary info 655 */ 656 if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) { 657 brelse(bp); 658 return (EIO); 659 } 660 661 /* 662 * release old summary info and update in-memory superblock 663 */ 664 kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize); 665 mfs->fs_u.fs_csp = dfs->fs_u.fs_csp; /* Only entry 0 used */ 666 667 /* 668 * update fields allowed to change 669 */ 670 mfs->fs_size = dfs->fs_size; 671 mfs->fs_dsize = dfs->fs_dsize; 672 mfs->fs_ncg = dfs->fs_ncg; 673 mfs->fs_minfree = dfs->fs_minfree; 674 mfs->fs_rotdelay = dfs->fs_rotdelay; 675 mfs->fs_rps = dfs->fs_rps; 676 mfs->fs_maxcontig = dfs->fs_maxcontig; 677 mfs->fs_maxbpg = dfs->fs_maxbpg; 678 mfs->fs_csmask = dfs->fs_csmask; 679 mfs->fs_csshift = dfs->fs_csshift; 680 mfs->fs_optim = dfs->fs_optim; 681 mfs->fs_csaddr = dfs->fs_csaddr; 682 mfs->fs_cssize = dfs->fs_cssize; 683 mfs->fs_ncyl = dfs->fs_ncyl; 684 mfs->fs_cstotal = dfs->fs_cstotal; 685 mfs->fs_reclaim = dfs->fs_reclaim; 686 687 if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) { 688 mfs->fs_reclaim &= ~FS_RECLAIM; 689 mfs->fs_reclaim |= FS_RECLAIMING; 690 ufs_thread_start(&ufsvfsp->vfs_reclaim, 691 ufs_thread_reclaim, vfsp); 692 } 693 694 /* XXX What to do about sparecon? */ 695 696 /* XXX need to copy volume label */ 697 698 /* 699 * ondisk clean flag overrides inmemory clean flag iff == FSBAD 700 * or if error-locked and ondisk is now clean 701 */ 702 needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock); 703 if (needs_unlock) 704 mutex_enter(&ufsvfsp->vfs_lock); 705 706 if (errlck == UN_ERRLCK) { 707 if (finished_fsclean == dfs->fs_clean) 708 mfs->fs_clean = finished_fsclean; 709 else 710 mfs->fs_clean = FSBAD; 711 mfs->fs_state = FSOKAY - dfs->fs_time; 712 } 713 714 if (FSOKAY != dfs->fs_state + dfs->fs_time || 715 (dfs->fs_clean == FSBAD)) 716 mfs->fs_clean = FSBAD; 717 718 if (needs_unlock) 719 mutex_exit(&ufsvfsp->vfs_lock); 720 721 brelse(bp); 722 723 return (0); 724 } 725 726 /* 727 * ufs_reconcile_inode 728 * reconcile ondisk inode with incore inode 729 */ 730 static int 731 ufs_reconcile_inode(struct inode *ip, void *arg) 732 { 733 int i; 734 int ndaddr; 735 int niaddr; 736 struct dinode *dp; /* ondisk inode */ 737 struct buf *bp = NULL; 738 uid_t d_uid; 739 gid_t d_gid; 740 int error = 0; 741 struct fs *fs; 742 743 /* 744 * not an inode we care about 745 */ 746 if (ip->i_ufsvfs != (struct ufsvfs *)arg) 747 return (0); 748 749 fs = ip->i_fs; 750 751 /* 752 * Inode reconciliation fails: we made the filesystem quiescent 753 * and we did a ufs_flush() before calling ufs_reconcile_inode() 754 * and thus the inode should not have been changed inbetween. 755 * Any discrepancies indicate a logic error and a pretty 756 * significant run-state inconsistency we should complain about. 757 */ 758 if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) { 759 cmn_err(CE_WARN, "%s: Inode reconciliation failed for" 760 "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number); 761 return (EINVAL); 762 } 763 764 /* 765 * get the dinode 766 */ 767 bp = UFS_BREAD(ip->i_ufsvfs, 768 ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)), 769 (int)fs->fs_bsize); 770 if (bp->b_flags & B_ERROR) { 771 brelse(bp); 772 return (EIO); 773 } 774 dp = bp->b_un.b_dino; 775 dp += itoo(fs, ip->i_number); 776 777 /* 778 * handle Sun's implementation of EFT 779 */ 780 d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid; 781 d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (uid_t)dp->di_sgid; 782 783 rw_enter(&ip->i_contents, RW_WRITER); 784 785 /* 786 * some fields are not allowed to change 787 */ 788 if ((ip->i_mode != dp->di_mode) || 789 (ip->i_gen != dp->di_gen) || 790 (ip->i_uid != d_uid) || 791 (ip->i_gid != d_gid)) { 792 error = EACCES; 793 goto out; 794 } 795 796 /* 797 * and some are allowed to change 798 */ 799 ip->i_size = dp->di_size; 800 ip->i_ic.ic_flags = dp->di_ic.ic_flags; 801 ip->i_blocks = dp->di_blocks; 802 ip->i_nlink = dp->di_nlink; 803 if (ip->i_flag & IFASTSYMLNK) { 804 ndaddr = 1; 805 niaddr = 0; 806 } else { 807 ndaddr = NDADDR; 808 niaddr = NIADDR; 809 } 810 for (i = 0; i < ndaddr; ++i) 811 ip->i_db[i] = dp->di_db[i]; 812 for (i = 0; i < niaddr; ++i) 813 ip->i_ib[i] = dp->di_ib[i]; 814 815 out: 816 rw_exit(&ip->i_contents); 817 brelse(bp); 818 return (error); 819 } 820 821 /* 822 * ufs_reconcile 823 * reconcile ondisk superblock/inodes with any incore 824 */ 825 static int 826 ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck) 827 { 828 int error = 0; 829 830 /* 831 * get rid of as much inmemory data as possible 832 */ 833 (void) ufs_flush(vfsp); 834 835 /* 836 * reconcile the superblock and inodes 837 */ 838 if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck)) 839 return (error); 840 if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp)) 841 return (error); 842 /* 843 * allocation blocks may be incorrect; get rid of them 844 */ 845 (void) ufs_flush(vfsp); 846 847 return (error); 848 } 849 850 /* 851 * File system locking 852 */ 853 int 854 ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log) 855 { 856 return (ufs__fiolfs(vp, lockfsp, /* from_user */ 1, from_log)); 857 } 858 859 /* kernel-internal interface, also used by fix-on-panic */ 860 int 861 ufs__fiolfs( 862 struct vnode *vp, 863 struct lockfs *lockfsp, 864 int from_user, 865 int from_log) 866 { 867 struct ulockfs *ulp; 868 struct lockfs lfs; 869 int error; 870 struct vfs *vfsp; 871 struct ufsvfs *ufsvfsp; 872 int errlck = NO_ERRLCK; 873 int poll_events = POLLPRI; 874 extern struct pollhead ufs_pollhd; 875 ulockfs_info_t *head; 876 ulockfs_info_t *info; 877 int signal = 0; 878 879 /* check valid lock type */ 880 if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK) 881 return (EINVAL); 882 883 if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data) 884 return (EIO); 885 886 vfsp = vp->v_vfsp; 887 888 if (vfsp->vfs_flag & VFS_UNMOUNTED) /* has been unmounted */ 889 return (EIO); 890 891 /* take the lock and check again */ 892 vfs_lock_wait(vfsp); 893 if (vfsp->vfs_flag & VFS_UNMOUNTED) { 894 vfs_unlock(vfsp); 895 return (EIO); 896 } 897 898 /* 899 * Can't wlock or ro/elock fs with accounting or local swap file 900 * We need to check for this before we grab the ul_lock to avoid 901 * deadlocks with the accounting framework. 902 */ 903 if ((LOCKFS_IS_WLOCK(lockfsp) || LOCKFS_IS_ELOCK(lockfsp) || 904 LOCKFS_IS_ROELOCK(lockfsp)) && !from_log) { 905 if (ufs_checkaccton(vp) || ufs_checkswapon(vp)) { 906 vfs_unlock(vfsp); 907 return (EDEADLK); 908 } 909 } 910 911 ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 912 ulp = &ufsvfsp->vfs_ulockfs; 913 head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 914 SEARCH_ULOCKFSP(head, ulp, info); 915 916 /* 917 * Suspend both the reclaim thread and the delete thread. 918 * This must be done outside the lockfs locking protocol. 919 */ 920 ufs_thread_suspend(&ufsvfsp->vfs_reclaim); 921 ufs_thread_suspend(&ufsvfsp->vfs_delete); 922 923 mutex_enter(&ulp->ul_lock); 924 atomic_add_long(&ufs_quiesce_pend, 1); 925 926 /* 927 * Quit if there is another lockfs request in progress 928 * that is waiting for existing ufs_vnops to complete. 929 */ 930 if (ULOCKFS_IS_BUSY(ulp)) { 931 error = EBUSY; 932 goto errexit; 933 } 934 935 /* cannot ulocked or downgrade a hard-lock */ 936 if (ULOCKFS_IS_HLOCK(ulp)) { 937 error = EIO; 938 goto errexit; 939 } 940 941 /* an error lock may be unlocked or relocked, only */ 942 if (ULOCKFS_IS_ELOCK(ulp)) { 943 if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) { 944 error = EBUSY; 945 goto errexit; 946 } 947 } 948 949 /* 950 * a read-only error lock may only be upgraded to an 951 * error lock or hard lock 952 */ 953 if (ULOCKFS_IS_ROELOCK(ulp)) { 954 if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) { 955 error = EBUSY; 956 goto errexit; 957 } 958 } 959 960 /* 961 * until read-only error locks are fully implemented 962 * just return EINVAL 963 */ 964 if (LOCKFS_IS_ROELOCK(lockfsp)) { 965 error = EINVAL; 966 goto errexit; 967 } 968 969 /* 970 * an error lock may only be applied if the file system is 971 * unlocked or already error locked. 972 * (this is to prevent the case where a fs gets changed out from 973 * underneath a fs that is locked for backup, 974 * that is, name/delete/write-locked.) 975 */ 976 if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) && 977 !ULOCKFS_IS_ROELOCK(ulp)) && 978 (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) { 979 error = EBUSY; 980 goto errexit; 981 } 982 983 /* get and validate the input lockfs request */ 984 if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs)) 985 goto errexit; 986 987 /* 988 * save current ulockfs struct 989 */ 990 bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs)); 991 992 /* 993 * Freeze the file system (pend future accesses) 994 */ 995 ufs_freeze(ulp, lockfsp); 996 997 /* 998 * Set locking in progress because ufs_quiesce may free the 999 * ul_lock mutex. 1000 */ 1001 ULOCKFS_SET_BUSY(ulp); 1002 /* update the ioctl copy */ 1003 LOCKFS_SET_BUSY(&ulp->ul_lockfs); 1004 1005 /* 1006 * We need to unset FWLOCK status before we call ufs_quiesce 1007 * so that the thread doesnt get suspended. We do this only if 1008 * this (fallocate) thread requested an unlock operation. 1009 */ 1010 if (info && (info->flags & ULOCK_INFO_FALLOCATE)) { 1011 if (!ULOCKFS_IS_WLOCK(ulp)) 1012 ULOCKFS_CLR_FWLOCK(ulp); 1013 } 1014 1015 /* 1016 * Quiesce (wait for outstanding accesses to finish) 1017 */ 1018 if (error = ufs_quiesce(ulp)) { 1019 /* 1020 * Interrupted due to signal. There could still be 1021 * pending vnops. 1022 */ 1023 signal = 1; 1024 1025 /* 1026 * We do broadcast because lock-status 1027 * could be reverted to old status. 1028 */ 1029 cv_broadcast(&ulp->ul_cv); 1030 goto errout; 1031 } 1032 1033 /* 1034 * If the fallocate thread requested a write fs lock operation 1035 * then we set fwlock status in the ulp. 1036 */ 1037 if (info && (info->flags & ULOCK_INFO_FALLOCATE)) { 1038 if (ULOCKFS_IS_WLOCK(ulp)) 1039 ULOCKFS_SET_FWLOCK(ulp); 1040 } 1041 1042 /* 1043 * save error lock status to pass down to reconcilation 1044 * routines and for later cleanup 1045 */ 1046 if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp)) 1047 errlck = UN_ERRLCK; 1048 1049 if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) { 1050 int needs_unlock; 1051 int needs_sbwrite; 1052 1053 poll_events |= POLLERR; 1054 errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ? 1055 RE_ERRLCK : SET_ERRLCK; 1056 1057 needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock); 1058 if (needs_unlock) 1059 mutex_enter(&ufsvfsp->vfs_lock); 1060 1061 /* disable delayed i/o */ 1062 needs_sbwrite = 0; 1063 1064 if (errlck == SET_ERRLCK) { 1065 ufsvfsp->vfs_fs->fs_clean = FSBAD; 1066 needs_sbwrite = 1; 1067 } 1068 1069 needs_sbwrite |= ufsvfsp->vfs_dio; 1070 ufsvfsp->vfs_dio = 0; 1071 1072 if (needs_unlock) 1073 mutex_exit(&ufsvfsp->vfs_lock); 1074 1075 if (needs_sbwrite) { 1076 ulp->ul_sbowner = curthread; 1077 TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE); 1078 1079 if (needs_unlock) 1080 mutex_enter(&ufsvfsp->vfs_lock); 1081 1082 ufsvfsp->vfs_fs->fs_fmod = 0; 1083 1084 if (needs_unlock) 1085 mutex_exit(&ufsvfsp->vfs_lock); 1086 } 1087 } 1088 1089 /* 1090 * reconcile superblock and inodes if was wlocked 1091 */ 1092 if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) { 1093 if (error = ufs_reconcile(vfsp, ufsvfsp, errlck)) 1094 goto errout; 1095 /* 1096 * in case the fs grew; reset the metadata map for logging tests 1097 */ 1098 TRANS_MATA_UMOUNT(ufsvfsp); 1099 TRANS_MATA_MOUNT(ufsvfsp); 1100 TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs); 1101 } 1102 1103 /* 1104 * At least everything *currently* dirty goes out. 1105 */ 1106 1107 if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) && 1108 !ULOCKFS_IS_ELOCK(ulp)) 1109 goto errout; 1110 1111 /* 1112 * thaw file system and wakeup pended processes 1113 */ 1114 if (error = ufs_thaw(vfsp, ufsvfsp, ulp)) 1115 if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp)) 1116 goto errout; 1117 1118 /* 1119 * reset modified flag if not already write locked 1120 */ 1121 if (!LOCKFS_IS_WLOCK(&lfs)) 1122 ULOCKFS_CLR_MOD(ulp); 1123 1124 /* 1125 * idle the lock struct 1126 */ 1127 ULOCKFS_CLR_BUSY(ulp); 1128 /* update the ioctl copy */ 1129 LOCKFS_CLR_BUSY(&ulp->ul_lockfs); 1130 1131 /* 1132 * free current comment 1133 */ 1134 if (lfs.lf_comment && lfs.lf_comlen != 0) { 1135 kmem_free(lfs.lf_comment, lfs.lf_comlen); 1136 lfs.lf_comment = NULL; 1137 lfs.lf_comlen = 0; 1138 } 1139 1140 /* do error lock cleanup */ 1141 if (errlck == UN_ERRLCK) 1142 ufsfx_unlockfs(ufsvfsp); 1143 1144 else if (errlck == RE_ERRLCK) 1145 ufsfx_lockfs(ufsvfsp); 1146 1147 /* don't allow error lock from user to invoke panic */ 1148 else if (from_user && errlck == SET_ERRLCK && 1149 !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4))) 1150 (void) ufs_fault(ufsvfsp->vfs_root, 1151 ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ? 1152 ulp->ul_lockfs.lf_comment: "user-applied error lock"); 1153 1154 atomic_add_long(&ufs_quiesce_pend, -1); 1155 mutex_exit(&ulp->ul_lock); 1156 vfs_unlock(vfsp); 1157 1158 if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) 1159 poll_events |= POLLERR; 1160 1161 pollwakeup(&ufs_pollhd, poll_events); 1162 1163 /* 1164 * Allow both the delete thread and the reclaim thread to 1165 * continue. 1166 */ 1167 ufs_thread_continue(&ufsvfsp->vfs_delete); 1168 ufs_thread_continue(&ufsvfsp->vfs_reclaim); 1169 1170 return (0); 1171 1172 errout: 1173 /* 1174 * Lock failed. Reset the old lock in ufsvfs if not hard locked. 1175 */ 1176 if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) { 1177 bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs)); 1178 ulp->ul_fs_lock = (1 << lfs.lf_lock); 1179 } 1180 1181 /* 1182 * Don't call ufs_thaw() when there's a signal during 1183 * ufs quiesce operation as it can lead to deadlock 1184 * with getpage. 1185 */ 1186 if (signal == 0) 1187 (void) ufs_thaw(vfsp, ufsvfsp, ulp); 1188 1189 ULOCKFS_CLR_BUSY(ulp); 1190 LOCKFS_CLR_BUSY(&ulp->ul_lockfs); 1191 1192 errexit: 1193 atomic_add_long(&ufs_quiesce_pend, -1); 1194 mutex_exit(&ulp->ul_lock); 1195 vfs_unlock(vfsp); 1196 1197 /* 1198 * Allow both the delete thread and the reclaim thread to 1199 * continue. 1200 */ 1201 ufs_thread_continue(&ufsvfsp->vfs_delete); 1202 ufs_thread_continue(&ufsvfsp->vfs_reclaim); 1203 1204 return (error); 1205 } 1206 1207 /* 1208 * fiolfss 1209 * return the current file system locking state info 1210 */ 1211 int 1212 ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp) 1213 { 1214 struct ulockfs *ulp; 1215 1216 if (!vp || !vp->v_vfsp || !VTOI(vp)) 1217 return (EINVAL); 1218 1219 /* file system has been forcibly unmounted */ 1220 if (VTOI(vp)->i_ufsvfs == NULL) 1221 return (EIO); 1222 1223 ulp = VTOUL(vp); 1224 1225 if (ULOCKFS_IS_HLOCK(ulp)) { 1226 *lockfsp = ulp->ul_lockfs; /* structure assignment */ 1227 return (0); 1228 } 1229 1230 mutex_enter(&ulp->ul_lock); 1231 1232 *lockfsp = ulp->ul_lockfs; /* structure assignment */ 1233 1234 if (ULOCKFS_IS_MOD(ulp)) 1235 lockfsp->lf_flags |= LOCKFS_MOD; 1236 1237 mutex_exit(&ulp->ul_lock); 1238 1239 return (0); 1240 } 1241 1242 /* 1243 * ufs_check_lockfs 1244 * check whether a ufs_vnops conflicts with the file system lock 1245 */ 1246 int 1247 ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask) 1248 { 1249 k_sigset_t smask; 1250 int sig, slock; 1251 1252 ASSERT(MUTEX_HELD(&ulp->ul_lock)); 1253 1254 while (ulp->ul_fs_lock & mask) { 1255 slock = (int)ULOCKFS_IS_SLOCK(ulp); 1256 if ((curthread->t_flag & T_DONTPEND) && !slock) { 1257 curthread->t_flag |= T_WOULDBLOCK; 1258 return (EAGAIN); 1259 } 1260 curthread->t_flag &= ~T_WOULDBLOCK; 1261 1262 /* 1263 * In the case of an onerr umount of the fs, threads could 1264 * have blocked before coming into ufs_check_lockfs and 1265 * need to check for the special case of ELOCK and 1266 * vfs_dontblock being set which would indicate that the fs 1267 * is on its way out and will not return therefore making 1268 * EIO the appropriate response. 1269 */ 1270 if (ULOCKFS_IS_HLOCK(ulp) || 1271 (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock)) 1272 return (EIO); 1273 1274 /* 1275 * wait for lock status to change 1276 */ 1277 if (slock || ufsvfsp->vfs_nointr) { 1278 cv_wait(&ulp->ul_cv, &ulp->ul_lock); 1279 } else { 1280 sigintr(&smask, 1); 1281 sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock); 1282 sigunintr(&smask); 1283 if ((!sig && (ulp->ul_fs_lock & mask)) || 1284 ufsvfsp->vfs_dontblock) 1285 return (EINTR); 1286 } 1287 } 1288 1289 if (mask & ULOCKFS_FWLOCK) { 1290 atomic_add_long(&ulp->ul_falloc_cnt, 1); 1291 ULOCKFS_SET_FALLOC(ulp); 1292 } else { 1293 atomic_add_long(&ulp->ul_vnops_cnt, 1); 1294 } 1295 1296 return (0); 1297 } 1298 1299 /* 1300 * Check whether we came across the handcrafted lockfs protocol path. We can't 1301 * simply check for T_DONTBLOCK here as one would assume since this can also 1302 * falsely catch recursive VOP's going to a different filesystem, instead we 1303 * check if we already hold the ulockfs->ul_lock mutex. 1304 */ 1305 static int 1306 ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp) 1307 { 1308 return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1); 1309 } 1310 1311 /* 1312 * ufs_lockfs_begin - start the lockfs locking protocol 1313 */ 1314 int 1315 ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask) 1316 { 1317 int error; 1318 int rec_vop; 1319 ushort_t op_cnt_incremented = 0; 1320 ulong_t *ctr; 1321 struct ulockfs *ulp; 1322 ulockfs_info_t *ulockfs_info; 1323 ulockfs_info_t *ulockfs_info_free; 1324 ulockfs_info_t *ulockfs_info_temp; 1325 1326 /* 1327 * file system has been forcibly unmounted 1328 */ 1329 if (ufsvfsp == NULL) 1330 return (EIO); 1331 1332 *ulpp = ulp = &ufsvfsp->vfs_ulockfs; 1333 1334 /* 1335 * Do lockfs protocol 1336 */ 1337 ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 1338 IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free); 1339 1340 /* 1341 * Detect recursive VOP call or handcrafted internal lockfs protocol 1342 * path and bail out in that case. 1343 */ 1344 if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) { 1345 *ulpp = NULL; 1346 return (0); 1347 } else { 1348 if (ulockfs_info_free == NULL) { 1349 if ((ulockfs_info_temp = (ulockfs_info_t *) 1350 kmem_zalloc(sizeof (ulockfs_info_t), 1351 KM_NOSLEEP)) == NULL) { 1352 *ulpp = NULL; 1353 return (ENOMEM); 1354 } 1355 } 1356 } 1357 1358 /* 1359 * First time VOP call 1360 * 1361 * Increment the ctr irrespective of the lockfs state. If the lockfs 1362 * state is not ULOCKFS_ULOCK, we can decrement it later. However, 1363 * before incrementing we need to check if there is a pending quiesce 1364 * request because if we have a continuous stream of ufs_lockfs_begin 1365 * requests pounding on a few cpu's then the ufs_quiesce thread might 1366 * never see the value of zero for ctr - a livelock kind of scenario. 1367 */ 1368 ctr = (mask & ULOCKFS_FWLOCK) ? 1369 &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt; 1370 if (!ULOCKFS_IS_SLOCK(ulp)) { 1371 atomic_add_long(ctr, 1); 1372 op_cnt_incremented++; 1373 } 1374 1375 /* 1376 * If the lockfs state (indicated by ul_fs_lock) is not just 1377 * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs 1378 * where there is a check with an appropriate mask to selectively allow 1379 * operations permitted for that kind of lockfs state. 1380 * 1381 * Even these selective operations should not be allowed to go through 1382 * if a lockfs request is in progress because that could result in inode 1383 * modifications during a quiesce and could hence result in inode 1384 * reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient, 1385 * so make use of ufs_quiesce_pend to disallow vnode operations when a 1386 * quiesce is in progress. 1387 */ 1388 if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) { 1389 if (op_cnt_incremented) 1390 if (!atomic_add_long_nv(ctr, -1)) 1391 cv_broadcast(&ulp->ul_cv); 1392 mutex_enter(&ulp->ul_lock); 1393 error = ufs_check_lockfs(ufsvfsp, ulp, mask); 1394 mutex_exit(&ulp->ul_lock); 1395 if (error) { 1396 if (ulockfs_info_free == NULL) 1397 kmem_free(ulockfs_info_temp, 1398 sizeof (ulockfs_info_t)); 1399 return (error); 1400 } 1401 } else { 1402 /* 1403 * This is the common case of file system in a unlocked state. 1404 * 1405 * If a file system is unlocked, we would expect the ctr to have 1406 * been incremented by now. But this will not be true when a 1407 * quiesce is winding up - SLOCK was set when we checked before 1408 * incrementing the ctr, but by the time we checked for 1409 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is okay 1410 * to take ul_lock and go through the slow path in this uncommon 1411 * case. 1412 */ 1413 if (op_cnt_incremented == 0) { 1414 mutex_enter(&ulp->ul_lock); 1415 error = ufs_check_lockfs(ufsvfsp, ulp, mask); 1416 if (error) { 1417 mutex_exit(&ulp->ul_lock); 1418 if (ulockfs_info_free == NULL) 1419 kmem_free(ulockfs_info_temp, 1420 sizeof (ulockfs_info_t)); 1421 return (error); 1422 } 1423 if (mask & ULOCKFS_FWLOCK) 1424 ULOCKFS_SET_FALLOC(ulp); 1425 mutex_exit(&ulp->ul_lock); 1426 } else if (mask & ULOCKFS_FWLOCK) { 1427 mutex_enter(&ulp->ul_lock); 1428 ULOCKFS_SET_FALLOC(ulp); 1429 mutex_exit(&ulp->ul_lock); 1430 } 1431 } 1432 1433 if (ulockfs_info_free != NULL) { 1434 ulockfs_info_free->ulp = ulp; 1435 if (mask & ULOCKFS_FWLOCK) 1436 ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE; 1437 } else { 1438 ulockfs_info_temp->ulp = ulp; 1439 ulockfs_info_temp->next = ulockfs_info; 1440 if (mask & ULOCKFS_FWLOCK) 1441 ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE; 1442 ASSERT(ufs_lockfs_key != 0); 1443 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp); 1444 } 1445 1446 curthread->t_flag |= T_DONTBLOCK; 1447 return (0); 1448 } 1449 1450 /* 1451 * Check whether we are returning from the top level VOP. 1452 */ 1453 static int 1454 ufs_lockfs_top_vop_return(ulockfs_info_t *head) 1455 { 1456 ulockfs_info_t *info; 1457 int result = 1; 1458 1459 for (info = head; info != NULL; info = info->next) { 1460 if (info->ulp != NULL) { 1461 result = 0; 1462 break; 1463 } 1464 } 1465 1466 return (result); 1467 } 1468 1469 /* 1470 * ufs_lockfs_end - terminate the lockfs locking protocol 1471 */ 1472 void 1473 ufs_lockfs_end(struct ulockfs *ulp) 1474 { 1475 ulockfs_info_t *info; 1476 ulockfs_info_t *head; 1477 1478 /* 1479 * end-of-VOP protocol 1480 */ 1481 if (ulp == NULL) 1482 return; 1483 1484 head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 1485 SEARCH_ULOCKFSP(head, ulp, info); 1486 1487 /* 1488 * If we're called from a first level VOP, we have to have a 1489 * valid ulockfs record in the TSD. 1490 */ 1491 ASSERT(info != NULL); 1492 1493 /* 1494 * Invalidate the ulockfs record. 1495 */ 1496 info->ulp = NULL; 1497 1498 if (ufs_lockfs_top_vop_return(head)) 1499 curthread->t_flag &= ~T_DONTBLOCK; 1500 1501 /* fallocate thread */ 1502 if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) { 1503 /* Clear the thread's fallocate state */ 1504 info->flags &= ~ULOCK_INFO_FALLOCATE; 1505 if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1)) { 1506 mutex_enter(&ulp->ul_lock); 1507 ULOCKFS_CLR_FALLOC(ulp); 1508 cv_broadcast(&ulp->ul_cv); 1509 mutex_exit(&ulp->ul_lock); 1510 } 1511 } else { /* normal thread */ 1512 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 1513 cv_broadcast(&ulp->ul_cv); 1514 } 1515 } 1516 1517 /* 1518 * ufs_lockfs_trybegin - try to start the lockfs locking protocol without 1519 * blocking. 1520 */ 1521 int 1522 ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask) 1523 { 1524 int error = 0; 1525 int rec_vop; 1526 ushort_t op_cnt_incremented = 0; 1527 ulong_t *ctr; 1528 struct ulockfs *ulp; 1529 ulockfs_info_t *ulockfs_info; 1530 ulockfs_info_t *ulockfs_info_free; 1531 ulockfs_info_t *ulockfs_info_temp; 1532 1533 /* 1534 * file system has been forcibly unmounted 1535 */ 1536 if (ufsvfsp == NULL) 1537 return (EIO); 1538 1539 *ulpp = ulp = &ufsvfsp->vfs_ulockfs; 1540 1541 /* 1542 * Do lockfs protocol 1543 */ 1544 ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 1545 IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free); 1546 1547 /* 1548 * Detect recursive VOP call or handcrafted internal lockfs protocol 1549 * path and bail out in that case. 1550 */ 1551 if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) { 1552 *ulpp = NULL; 1553 return (0); 1554 } else { 1555 if (ulockfs_info_free == NULL) { 1556 if ((ulockfs_info_temp = (ulockfs_info_t *) 1557 kmem_zalloc(sizeof (ulockfs_info_t), 1558 KM_NOSLEEP)) == NULL) { 1559 *ulpp = NULL; 1560 return (ENOMEM); 1561 } 1562 } 1563 } 1564 1565 /* 1566 * First time VOP call 1567 * 1568 * Increment the ctr irrespective of the lockfs state. If the lockfs 1569 * state is not ULOCKFS_ULOCK, we can decrement it later. However, 1570 * before incrementing we need to check if there is a pending quiesce 1571 * request because if we have a continuous stream of ufs_lockfs_begin 1572 * requests pounding on a few cpu's then the ufs_quiesce thread might 1573 * never see the value of zero for ctr - a livelock kind of scenario. 1574 */ 1575 ctr = (mask & ULOCKFS_FWLOCK) ? 1576 &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt; 1577 if (!ULOCKFS_IS_SLOCK(ulp)) { 1578 atomic_add_long(ctr, 1); 1579 op_cnt_incremented++; 1580 } 1581 1582 if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) { 1583 /* 1584 * Non-blocking version of ufs_check_lockfs() code. 1585 * 1586 * If the file system is not hard locked or error locked 1587 * and if ulp->ul_fs_lock allows this operation, increment 1588 * the appropriate counter and proceed (For eg., In case the 1589 * file system is delete locked, a mmap can still go through). 1590 */ 1591 if (op_cnt_incremented) 1592 if (!atomic_add_long_nv(ctr, -1)) 1593 cv_broadcast(&ulp->ul_cv); 1594 mutex_enter(&ulp->ul_lock); 1595 if (ULOCKFS_IS_HLOCK(ulp) || 1596 (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock)) 1597 error = EIO; 1598 else if (ulp->ul_fs_lock & mask) 1599 error = EAGAIN; 1600 1601 if (error) { 1602 mutex_exit(&ulp->ul_lock); 1603 if (ulockfs_info_free == NULL) 1604 kmem_free(ulockfs_info_temp, 1605 sizeof (ulockfs_info_t)); 1606 return (error); 1607 } 1608 atomic_add_long(ctr, 1); 1609 if (mask & ULOCKFS_FWLOCK) 1610 ULOCKFS_SET_FALLOC(ulp); 1611 mutex_exit(&ulp->ul_lock); 1612 } else { 1613 /* 1614 * This is the common case of file system in a unlocked state. 1615 * 1616 * If a file system is unlocked, we would expect the ctr to have 1617 * been incremented by now. But this will not be true when a 1618 * quiesce is winding up - SLOCK was set when we checked before 1619 * incrementing the ctr, but by the time we checked for 1620 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take 1621 * ul_lock and go through the non-blocking version of 1622 * ufs_check_lockfs() code. 1623 */ 1624 if (op_cnt_incremented == 0) { 1625 mutex_enter(&ulp->ul_lock); 1626 if (ULOCKFS_IS_HLOCK(ulp) || 1627 (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock)) 1628 error = EIO; 1629 else if (ulp->ul_fs_lock & mask) 1630 error = EAGAIN; 1631 1632 if (error) { 1633 mutex_exit(&ulp->ul_lock); 1634 if (ulockfs_info_free == NULL) 1635 kmem_free(ulockfs_info_temp, 1636 sizeof (ulockfs_info_t)); 1637 return (error); 1638 } 1639 atomic_add_long(ctr, 1); 1640 if (mask & ULOCKFS_FWLOCK) 1641 ULOCKFS_SET_FALLOC(ulp); 1642 mutex_exit(&ulp->ul_lock); 1643 } else if (mask & ULOCKFS_FWLOCK) { 1644 mutex_enter(&ulp->ul_lock); 1645 ULOCKFS_SET_FALLOC(ulp); 1646 mutex_exit(&ulp->ul_lock); 1647 } 1648 } 1649 1650 if (ulockfs_info_free != NULL) { 1651 ulockfs_info_free->ulp = ulp; 1652 if (mask & ULOCKFS_FWLOCK) 1653 ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE; 1654 } else { 1655 ulockfs_info_temp->ulp = ulp; 1656 ulockfs_info_temp->next = ulockfs_info; 1657 if (mask & ULOCKFS_FWLOCK) 1658 ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE; 1659 ASSERT(ufs_lockfs_key != 0); 1660 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp); 1661 } 1662 1663 curthread->t_flag |= T_DONTBLOCK; 1664 return (0); 1665 } 1666 1667 /* 1668 * specialized version of ufs_lockfs_begin() called by ufs_getpage(). 1669 */ 1670 int 1671 ufs_lockfs_begin_getpage( 1672 struct ufsvfs *ufsvfsp, 1673 struct ulockfs **ulpp, 1674 struct seg *seg, 1675 int read_access, 1676 uint_t *protp) 1677 { 1678 ulong_t mask; 1679 int error; 1680 int rec_vop; 1681 struct ulockfs *ulp; 1682 ulockfs_info_t *ulockfs_info; 1683 ulockfs_info_t *ulockfs_info_free; 1684 ulockfs_info_t *ulockfs_info_temp; 1685 1686 /* 1687 * file system has been forcibly unmounted 1688 */ 1689 if (ufsvfsp == NULL) 1690 return (EIO); 1691 1692 *ulpp = ulp = &ufsvfsp->vfs_ulockfs; 1693 1694 /* 1695 * Do lockfs protocol 1696 */ 1697 ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); 1698 IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free); 1699 1700 /* 1701 * Detect recursive VOP call or handcrafted internal lockfs protocol 1702 * path and bail out in that case. 1703 */ 1704 if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) { 1705 *ulpp = NULL; 1706 return (0); 1707 } else { 1708 if (ulockfs_info_free == NULL) { 1709 if ((ulockfs_info_temp = (ulockfs_info_t *) 1710 kmem_zalloc(sizeof (ulockfs_info_t), 1711 KM_NOSLEEP)) == NULL) { 1712 *ulpp = NULL; 1713 return (ENOMEM); 1714 } 1715 } 1716 } 1717 1718 /* 1719 * First time VOP call 1720 */ 1721 atomic_add_long(&ulp->ul_vnops_cnt, 1); 1722 if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) { 1723 if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) 1724 cv_broadcast(&ulp->ul_cv); 1725 mutex_enter(&ulp->ul_lock); 1726 if (seg->s_ops == &segvn_ops && 1727 ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) { 1728 mask = (ulong_t)ULOCKFS_GETREAD_MASK; 1729 } else if (protp && read_access) { 1730 /* 1731 * Restrict the mapping to readonly. 1732 * Writes to this mapping will cause 1733 * another fault which will then 1734 * be suspended if fs is write locked 1735 */ 1736 *protp &= ~PROT_WRITE; 1737 mask = (ulong_t)ULOCKFS_GETREAD_MASK; 1738 } else 1739 mask = (ulong_t)ULOCKFS_GETWRITE_MASK; 1740 1741 /* 1742 * will sleep if this fs is locked against this VOP 1743 */ 1744 error = ufs_check_lockfs(ufsvfsp, ulp, mask); 1745 mutex_exit(&ulp->ul_lock); 1746 if (error) { 1747 if (ulockfs_info_free == NULL) 1748 kmem_free(ulockfs_info_temp, 1749 sizeof (ulockfs_info_t)); 1750 return (error); 1751 } 1752 } 1753 1754 if (ulockfs_info_free != NULL) { 1755 ulockfs_info_free->ulp = ulp; 1756 } else { 1757 ulockfs_info_temp->ulp = ulp; 1758 ulockfs_info_temp->next = ulockfs_info; 1759 ASSERT(ufs_lockfs_key != 0); 1760 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp); 1761 } 1762 1763 curthread->t_flag |= T_DONTBLOCK; 1764 return (0); 1765 } 1766 1767 void 1768 ufs_lockfs_tsd_destructor(void *head) 1769 { 1770 ulockfs_info_t *curr = (ulockfs_info_t *)head; 1771 ulockfs_info_t *temp; 1772 1773 for (; curr != NULL; ) { 1774 /* 1775 * The TSD destructor is being called when the thread exits 1776 * (via thread_exit()). At that time it must have cleaned up 1777 * all VOPs via ufs_lockfs_end() and there must not be a 1778 * valid ulockfs record exist while a thread is exiting. 1779 */ 1780 temp = curr; 1781 curr = curr->next; 1782 ASSERT(temp->ulp == NULL); 1783 kmem_free(temp, sizeof (ulockfs_info_t)); 1784 } 1785 } 1786