/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmmeter.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/acct.h>
#include <sys/dnlc.h>
#include <sys/swap.h>

#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_mount.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_quota.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>
#include <sys/sysinfo.h>

#include <vm/hat.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <vm/anon.h>
#include <sys/swap.h>
#include <sys/dnlc.h>

extern struct vnode *common_specvp(struct vnode *vp);

/* error lock status */
#define	UN_ERRLCK	(-1)
#define	SET_ERRLCK	1
#define	RE_ERRLCK	2
#define	NO_ERRLCK	0

/*
 * Index to be used in TSD for storing lockfs data
 */
uint_t ufs_lockfs_key;

typedef struct _ulockfs_info {
	struct _ulockfs_info *next;
	struct ulockfs *ulp;
	uint_t flags;
} ulockfs_info_t;

#define	ULOCK_INFO_FALLOCATE	0x00000001	/* fallocate thread */

/*
 * Check in TSD whether we are already doing any VOP on this filesystem
 */
#define	IS_REC_VOP(found, head, ulp, free)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (found = 0, free = NULL, _curr = head;	\
	    _curr != NULL; _curr = _curr->next) {	\
		if ((free == NULL) &&			\
		    (_curr->ulp == NULL))		\
			free = _curr;			\
		if (_curr->ulp == ulp) {		\
			found = 1;			\
			break;				\
		}					\
	}						\
}

/*
 * Get the lockfs data from TSD so that lockfs handles the recursive VOP
 * properly
 */
#define	SEARCH_ULOCKFSP(head, ulp, info)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (_curr = head; _curr != NULL;		\
	    _curr = _curr->next) {			\
		if (_curr->ulp == ulp) {		\
			break;				\
		}					\
	}						\
							\
	info = _curr;					\
}
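/*
 * Usage sketch (illustrative only, mirroring ufs_lockfs_begin() and
 * ufs_quiesce() below): the head of the per-thread list lives in TSD
 * under ufs_lockfs_key, so both macros start from tsd_get():
 *
 *	ulockfs_info_t *head, *info, *free;
 *	int rec_vop;
 *
 *	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
 *	IS_REC_VOP(rec_vop, head, ulp, free);
 *		(rec_vop != 0 means this thread already holds a VOP on
 *		 ulp's fs; free points at a reusable empty record)
 *	SEARCH_ULOCKFSP(head, ulp, info);
 *		(info is this fs's record, or NULL)
 */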
/*
 * Validate lockfs request
 */
static int
ufs_getlfd(
	struct lockfs *lockfsp,		/* new lock request */
	struct lockfs *ul_lockfsp)	/* old lock state */
{
	int	error = 0;

	/*
	 * no input flags defined
	 */
	if (lockfsp->lf_flags != 0) {
		error = EINVAL;
		goto errout;
	}

	/*
	 * check key
	 */
	if (!LOCKFS_IS_ULOCK(ul_lockfsp))
		if (lockfsp->lf_key != ul_lockfsp->lf_key) {
			error = EINVAL;
			goto errout;
		}

	lockfsp->lf_key = ul_lockfsp->lf_key + 1;

errout:
	return (error);
}

/*
 * ufs_checkaccton
 *	check if accounting is turned on for this fs
 */

int
ufs_checkaccton(struct vnode *vp)
{
	if (acct_fs_in_use(vp))
		return (EDEADLK);
	return (0);
}

/*
 * ufs_checkswapon
 *	check if local swapping is to a file on this fs
 */
int
ufs_checkswapon(struct vnode *vp)
{
	struct swapinfo	*sip;

	mutex_enter(&swapinfo_lock);
	for (sip = swapinfo; sip; sip = sip->si_next)
		if (sip->si_vp->v_vfsp == vp->v_vfsp) {
			mutex_exit(&swapinfo_lock);
			return (EDEADLK);
		}
	mutex_exit(&swapinfo_lock);
	return (0);
}

/*
 * ufs_freeze
 *	pend future accesses for current lock and desired lock
 */
void
ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
{
	/*
	 * set to new lock type
	 */
	ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
	ulp->ul_lockfs.lf_key = lockfsp->lf_key;
	ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
	ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;

	ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
}
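/*
 * For example (a sketch; LOCKFS_WLOCK is the lock value from
 * sys/lockfs.h, and ufs_lockfs.h is assumed to name the resulting bit
 * ULOCKFS_WLOCK): freezing to a write lock stores LOCKFS_WLOCK in
 * ul_lockfs.lf_lock, so the assignment above yields
 *
 *	ulp->ul_fs_lock = (1 << LOCKFS_WLOCK);
 *
 * which is the bit each VOP's mask is tested against in
 * ufs_check_lockfs() below.
 */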
/*
 * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
 * starting ufs_quiesce() protocol and decrement it only when a file system no
 * longer has to be in quiescent state. This allows ufs_pageio() to detect
 * that another thread wants to quiesce a file system. See more comments in
 * ufs_pageio().
 */
ulong_t ufs_quiesce_pend = 0;
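/*
 * The pairing, as it appears in ufs__fiolfs() below (sketch):
 *
 *	mutex_enter(&ulp->ul_lock);
 *	atomic_add_long(&ufs_quiesce_pend, 1);
 *	...
 *	error = ufs_quiesce(ulp);
 *	...
 *	atomic_add_long(&ufs_quiesce_pend, -1);
 *	mutex_exit(&ulp->ul_lock);
 */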
/*
 * ufs_quiesce
 *	wait for outstanding accesses to finish
 */
int
ufs_quiesce(struct ulockfs *ulp)
{
	int error = 0;
	ulockfs_info_t *head;
	ulockfs_info_t *info;
	klwp_t *lwp = ttolwp(curthread);

	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * We have to keep /proc away from stopping us after we applied
	 * the softlock but before we got a chance to clear it again.
	 * prstop() may pagefault and become stuck on the softlock still
	 * pending.
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	/*
	 * Set a softlock to suspend future ufs_vnops so that
	 * this lockfs request will not be starved
	 */
	ULOCKFS_SET_SLOCK(ulp);
	ASSERT(ufs_quiesce_pend);

	/* check if there is any outstanding ufs vnodeops calls */
	while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
		/*
		 * use timed version of cv_wait_sig() to make sure we don't
		 * miss a wake up call from ufs_pageio() when it doesn't use
		 * ul_lock.
		 *
		 * when a fallocate thread comes in, the only way it returns
		 * from this function is if there are no other vnode operations
		 * going on (remember fallocate threads are tracked using
		 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
		 * hasn't already grabbed the fs write lock.
		 */
		if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
			if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
				goto out;
		}
		if (!cv_timedwait_sig(&ulp->ul_cv, &ulp->ul_lock, lbolt + hz)) {
			error = EINTR;
			goto out;
		}
	}

out:
	/*
	 * unlock the soft lock
	 */
	ULOCKFS_CLR_SLOCK(ulp);

	if (lwp != NULL)
		lwp->lwp_nostop--;

	return (error);
}
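/*
 * Sketch of the other half of the handshake (the real code lives in
 * ufs_pageio() in ufs_vnops.c; the exact shape shown here is assumed):
 * the pageio path may drop its hold and wake the quiescer without
 * taking ul_lock, which is why the wait above is timed rather than
 * open-ended:
 *
 *	if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1) &&
 *	    ufs_quiesce_pend)
 *		cv_broadcast(&ulp->ul_cv);
 */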
/*
 * ufs_flush_inode
 */
int
ufs_flush_inode(struct inode *ip, void *arg)
{
	int	error;
	int	saverror = 0;

	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * asynchronously push all the dirty pages
	 */
	if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
	    (error != EAGAIN))
		saverror = error;
	/*
	 * wait for io and discard all mappings
	 */
	if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
		saverror = error;

	if (ITOV(ip)->v_type == VDIR) {
		dnlc_dir_purge(&ip->i_danchor);
	}

	return (saverror);
}

/*
 * ufs_flush
 *	Flush everything that is currently dirty; this includes invalidating
 *	any mappings.
 */
int
ufs_flush(struct vfs *vfsp)
{
	int error;
	int saverror = 0;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	struct fs *fs = ufsvfsp->vfs_fs;
	int tdontblock = 0;

	ASSERT(vfs_lock_held(vfsp));

	/*
	 * purge dnlc
	 */
	(void) dnlc_purge_vfsp(vfsp, 0);

	/*
	 * drain the delete and idle threads
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * flush and invalidate quota records
	 */
	(void) qsync(ufsvfsp);

	/*
	 * flush w/invalidate the inodes for vfsp
	 */
	if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
		saverror = error;

	/*
	 * synchronously flush superblock and summary info
	 */
	if (fs->fs_ronly == 0 && fs->fs_fmod) {
		fs->fs_fmod = 0;
		TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
	}
	/*
	 * flush w/invalidate block device pages and buf cache
	 */
	if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
	    (offset_t)0, 0, B_INVAL, CRED(), NULL)) > 0)
		saverror = error;

	(void) bflush((dev_t)vfsp->vfs_dev);
	(void) bfinval((dev_t)vfsp->vfs_dev, 0);

	/*
	 * drain the delete and idle threads again
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * play with the clean flag
	 */
	if (saverror == 0)
		ufs_checkclean(vfsp);

	/*
	 * Flush any outstanding transactions and roll the log, but
	 * only if we are supposed to, i.e. LDL_NOROLL is not set.
	 * We cannot simply check for fs_ronly here since fsck also may
	 * use this code to roll the log on a read-only filesystem, e.g.
	 * root during early stages of boot; if fsck does anything other
	 * than a sanity check, it will have cleared LDL_NOROLL beforehand.
	 * In addition we assert that the deltamap does not contain any deltas
	 * in case LDL_NOROLL is set since this is not supposed to happen.
	 */
	if (TRANS_ISTRANS(ufsvfsp)) {
		ml_unit_t *ul = ufsvfsp->vfs_log;
		mt_map_t *mtm = ul->un_deltamap;

		if (ul->un_flags & LDL_NOROLL) {
			ASSERT(mtm->mtm_nme == 0);
		} else {
			/*
			 * Do not set T_DONTBLOCK if there is a
			 * transaction opened by caller.
			 */
			if (curthread->t_flag & T_DONTBLOCK)
				tdontblock = 1;
			else
				curthread->t_flag |= T_DONTBLOCK;

			TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
			    TOP_COMMIT_SIZE, error);

			if (!error) {
				TRANS_END_SYNC(ufsvfsp, saverror,
				    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
			}

			if (tdontblock == 0)
				curthread->t_flag &= ~T_DONTBLOCK;

			logmap_roll_dev(ufsvfsp->vfs_log);
		}
	}

	return (saverror);
}
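/*
 * Caller sketch (illustrative): ufs_flush() asserts that the vfs lock
 * is held, so callers bracket it the way ufs__fiolfs() below does:
 *
 *	vfs_lock_wait(vfsp);
 *	...
 *	error = ufs_flush(vfsp);
 *	...
 *	vfs_unlock(vfsp);
 */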
/*
 * ufs_thaw_wlock
 *	special processing when thawing down to wlock
 */
static int
ufs_thaw_wlock(struct inode *ip, void *arg)
{
	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * iupdat refuses to clear flags if the fs is read only.  The fs
	 * may become read/write during the lock and we wouldn't want
	 * these inodes being written to disk.  So clear the flags.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
	rw_exit(&ip->i_contents);

	/*
	 * pages are mlocked -- fail wlock
	 */
	if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
		return (EBUSY);

	return (0);
}

/*
 * ufs_thaw_hlock
 *	special processing when thawing down to hlock or elock
 */
static int
ufs_thaw_hlock(struct inode *ip, void *arg)
{
	struct vnode *vp = ITOV(ip);

	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * blow away all pages - even if they are mlocked
	 */
	do {
		(void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
	} while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
	rw_enter(&ip->i_contents, RW_WRITER);
	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * ufs_thaw
 *	thaw file system lock down to current value
 */
int
ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
{
	int error = 0;
	int noidel = (int)(ulp->ul_flag & ULOCKFS_NOIDEL);

	/*
	 * if wlock or hlock or elock
	 */
	if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
	    ULOCKFS_IS_ELOCK(ulp)) {

		/*
		 * don't keep access times
		 * don't free deleted files
		 * if superblock writes are allowed, limit them to me for now
		 */
		ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
		if (ulp->ul_sbowner != (kthread_id_t)-1)
			ulp->ul_sbowner = curthread;

		/*
		 * wait for writes for deleted files and superblock updates
		 */
		(void) ufs_flush(vfsp);

		/*
		 * now make sure the quota file is up-to-date
		 *	expensive; but effective
		 */
		error = ufs_flush(vfsp);
		/*
		 * no one can write the superblock
		 */
		ulp->ul_sbowner = (kthread_id_t)-1;

		/*
		 * special processing for wlock/hlock/elock
		 */
		if (ULOCKFS_IS_WLOCK(ulp)) {
			if (error)
				goto errout;
			error = bfinval(ufsvfsp->vfs_dev, 0);
			if (error)
				goto errout;
			error = ufs_scan_inodes(0, ufs_thaw_wlock,
			    (void *)ufsvfsp, ufsvfsp);
			if (error)
				goto errout;
		}
		if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
			error = 0;
			(void) ufs_scan_inodes(0, ufs_thaw_hlock,
			    (void *)ufsvfsp, ufsvfsp);
			(void) bfinval(ufsvfsp->vfs_dev, 1);
		}
	} else {

		/*
		 * okay to keep access times
		 * okay to free deleted files
		 * okay to write the superblock
		 */
		ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
		ulp->ul_sbowner = NULL;

		/*
		 * flush in case deleted files are in memory
		 */
		if (noidel) {
			if (error = ufs_flush(vfsp))
				goto errout;
		}
	}

errout:
	cv_broadcast(&ulp->ul_cv);
	return (error);
}
/*
 * ufs_reconcile_fs
 *	reconcile incore superblock with ondisk superblock
 */
int
ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
{
	struct fs	*mfs;	/* in-memory superblock */
	struct fs	*dfs;	/* on-disk   superblock */
	struct buf	*bp;	/* on-disk   superblock buf */
	int		needs_unlock;
	char		finished_fsclean;

	mfs = ufsvfsp->vfs_fs;

	/*
	 * get the on-disk copy of the superblock
	 */
	bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
	bp->b_flags |= (B_STALE|B_AGE);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	dfs = bp->b_un.b_fs;

	/* error locks may only unlock after the fs has been made consistent */
	if (errlck == UN_ERRLCK) {
		if (dfs->fs_clean == FSFIX) {	/* being repaired */
			brelse(bp);
			return (EAGAIN);
		}
		/* repair not yet started? */
		finished_fsclean = TRANS_ISTRANS(ufsvfsp)? FSLOG: FSCLEAN;
		if (dfs->fs_clean != finished_fsclean) {
			brelse(bp);
			return (EBUSY);
		}
	}

	/*
	 * if superblock has changed too much, abort
	 */
	if ((mfs->fs_sblkno != dfs->fs_sblkno) ||
	    (mfs->fs_cblkno != dfs->fs_cblkno) ||
	    (mfs->fs_iblkno != dfs->fs_iblkno) ||
	    (mfs->fs_dblkno != dfs->fs_dblkno) ||
	    (mfs->fs_cgoffset != dfs->fs_cgoffset) ||
	    (mfs->fs_cgmask != dfs->fs_cgmask) ||
	    (mfs->fs_bsize != dfs->fs_bsize) ||
	    (mfs->fs_fsize != dfs->fs_fsize) ||
	    (mfs->fs_frag != dfs->fs_frag) ||
	    (mfs->fs_bmask != dfs->fs_bmask) ||
	    (mfs->fs_fmask != dfs->fs_fmask) ||
	    (mfs->fs_bshift != dfs->fs_bshift) ||
	    (mfs->fs_fshift != dfs->fs_fshift) ||
	    (mfs->fs_fragshift != dfs->fs_fragshift) ||
	    (mfs->fs_fsbtodb != dfs->fs_fsbtodb) ||
	    (mfs->fs_sbsize != dfs->fs_sbsize) ||
	    (mfs->fs_nindir != dfs->fs_nindir) ||
	    (mfs->fs_nspf != dfs->fs_nspf) ||
	    (mfs->fs_trackskew != dfs->fs_trackskew) ||
	    (mfs->fs_cgsize != dfs->fs_cgsize) ||
	    (mfs->fs_ntrak != dfs->fs_ntrak) ||
	    (mfs->fs_nsect != dfs->fs_nsect) ||
	    (mfs->fs_spc != dfs->fs_spc) ||
	    (mfs->fs_cpg != dfs->fs_cpg) ||
	    (mfs->fs_ipg != dfs->fs_ipg) ||
	    (mfs->fs_fpg != dfs->fs_fpg) ||
	    (mfs->fs_postblformat != dfs->fs_postblformat) ||
	    (mfs->fs_magic != dfs->fs_magic)) {
		brelse(bp);
		return (EACCES);
	}
	if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
		if (mfs->fs_clean == FSLOG) {
			brelse(bp);
			return (EACCES);
		}

	/*
	 * get new summary info
	 */
	if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
		brelse(bp);
		return (EIO);
	}

	/*
	 * release old summary info and update in-memory superblock
	 */
	kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
	mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;	/* Only entry 0 used */

	/*
	 * update fields allowed to change
	 */
	mfs->fs_size = dfs->fs_size;
	mfs->fs_dsize = dfs->fs_dsize;
	mfs->fs_ncg = dfs->fs_ncg;
	mfs->fs_minfree = dfs->fs_minfree;
	mfs->fs_rotdelay = dfs->fs_rotdelay;
	mfs->fs_rps = dfs->fs_rps;
	mfs->fs_maxcontig = dfs->fs_maxcontig;
	mfs->fs_maxbpg = dfs->fs_maxbpg;
	mfs->fs_csmask = dfs->fs_csmask;
	mfs->fs_csshift = dfs->fs_csshift;
	mfs->fs_optim = dfs->fs_optim;
	mfs->fs_csaddr = dfs->fs_csaddr;
	mfs->fs_cssize = dfs->fs_cssize;
	mfs->fs_ncyl = dfs->fs_ncyl;
	mfs->fs_cstotal = dfs->fs_cstotal;
	mfs->fs_reclaim = dfs->fs_reclaim;

	if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
		mfs->fs_reclaim &= ~FS_RECLAIM;
		mfs->fs_reclaim |= FS_RECLAIMING;
		ufs_thread_start(&ufsvfsp->vfs_reclaim,
		    ufs_thread_reclaim, vfsp);
	}

	/* XXX What to do about sparecon? */

	/* XXX need to copy volume label */

	/*
	 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
	 * or if error-locked and ondisk is now clean
	 */
	needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
	if (needs_unlock)
		mutex_enter(&ufsvfsp->vfs_lock);

	if (errlck == UN_ERRLCK) {
		if (finished_fsclean == dfs->fs_clean)
			mfs->fs_clean = finished_fsclean;
		else
			mfs->fs_clean = FSBAD;
		mfs->fs_state = FSOKAY - dfs->fs_time;
	}

	if (FSOKAY != dfs->fs_state + dfs->fs_time ||
	    (dfs->fs_clean == FSBAD))
		mfs->fs_clean = FSBAD;

	if (needs_unlock)
		mutex_exit(&ufsvfsp->vfs_lock);

	brelse(bp);

	return (0);
}

/*
 * ufs_reconcile_inode
 *	reconcile ondisk inode with incore inode
 */
static int
ufs_reconcile_inode(struct inode *ip, void *arg)
{
	int		i;
	int		ndaddr;
	int		niaddr;
	struct dinode	*dp;		/* ondisk inode */
	struct buf	*bp = NULL;
	uid_t		d_uid;
	gid_t		d_gid;
	int		error = 0;
	struct fs	*fs;

	/*
	 * not an inode we care about
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	fs = ip->i_fs;

	/*
	 * Inode reconciliation should never fail here: we made the file
	 * system quiescent and we did a ufs_flush() before calling
	 * ufs_reconcile_inode(), so the inode must not have changed in
	 * between. Any discrepancy indicates a logic error and a pretty
	 * significant run-state inconsistency we should complain about.
	 */
	if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
		cmn_err(CE_WARN, "%s: Inode reconciliation failed for "
		    "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
		return (EINVAL);
	}

	/*
	 * get the dinode
	 */
	bp = UFS_BREAD(ip->i_ufsvfs,
	    ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
	    (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	dp = bp->b_un.b_dino;
	dp += itoo(fs, ip->i_number);

	/*
	 * handle Sun's implementation of EFT
	 */
	d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
	d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (gid_t)dp->di_sgid;

	rw_enter(&ip->i_contents, RW_WRITER);

	/*
	 * some fields are not allowed to change
	 */
	if ((ip->i_mode != dp->di_mode) ||
	    (ip->i_gen != dp->di_gen) ||
	    (ip->i_uid != d_uid) ||
	    (ip->i_gid != d_gid)) {
		error = EACCES;
		goto out;
	}

	/*
	 * and some are allowed to change
	 */
	ip->i_size = dp->di_size;
	ip->i_ic.ic_flags = dp->di_ic.ic_flags;
	ip->i_blocks = dp->di_blocks;
	ip->i_nlink = dp->di_nlink;
	if (ip->i_flag & IFASTSYMLNK) {
		ndaddr = 1;
		niaddr = 0;
	} else {
		ndaddr = NDADDR;
		niaddr = NIADDR;
	}
	for (i = 0; i < ndaddr; ++i)
		ip->i_db[i] = dp->di_db[i];
	for (i = 0; i < niaddr; ++i)
		ip->i_ib[i] = dp->di_ib[i];

out:
	rw_exit(&ip->i_contents);
	brelse(bp);
	return (error);
}

/*
 * ufs_reconcile
 *	reconcile ondisk superblock/inodes with any incore
 */
static int
ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
{
	int	error = 0;

	/*
	 * get rid of as much inmemory data as possible
	 */
	(void) ufs_flush(vfsp);

	/*
	 * reconcile the superblock and inodes
	 */
	if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
		return (error);
	if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
		return (error);
	/*
	 * allocation blocks may be incorrect; get rid of them
	 */
	(void) ufs_flush(vfsp);

	return (error);
}
/*
 * File system locking
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
	return (ufs__fiolfs(vp, lockfsp, /* from_user */ 1, from_log));
}

/* kernel-internal interface, also used by fix-on-panic */
int
ufs__fiolfs(
	struct vnode *vp,
	struct lockfs *lockfsp,
	int from_user,
	int from_log)
{
	struct ulockfs	*ulp;
	struct lockfs	lfs;
	int		error;
	struct vfs	*vfsp;
	struct ufsvfs	*ufsvfsp;
	int		errlck = NO_ERRLCK;
	int		poll_events = POLLPRI;
	extern struct pollhead ufs_pollhd;
	ulockfs_info_t	*head;
	ulockfs_info_t	*info;
	int		signal = 0;

	/* check valid lock type */
	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
		return (EINVAL);

	if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
		return (EIO);

	vfsp = vp->v_vfsp;

	if (vfsp->vfs_flag & VFS_UNMOUNTED) /* has been unmounted */
		return (EIO);

	/* take the lock and check again */
	vfs_lock_wait(vfsp);
	if (vfsp->vfs_flag & VFS_UNMOUNTED) {
		vfs_unlock(vfsp);
		return (EIO);
	}

	/*
	 * Can't wlock or ro/elock fs with accounting or local swap file.
	 * We need to check for this before we grab the ul_lock to avoid
	 * deadlocks with the accounting framework.
	 */
	if ((LOCKFS_IS_WLOCK(lockfsp) || LOCKFS_IS_ELOCK(lockfsp) ||
	    LOCKFS_IS_ROELOCK(lockfsp)) && !from_log) {
		if (ufs_checkaccton(vp) || ufs_checkswapon(vp)) {
			vfs_unlock(vfsp);
			return (EDEADLK);
		}
	}

	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	ulp = &ufsvfsp->vfs_ulockfs;
	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * Suspend both the reclaim thread and the delete thread.
	 * This must be done outside the lockfs locking protocol.
	 */
	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
	ufs_thread_suspend(&ufsvfsp->vfs_delete);

	mutex_enter(&ulp->ul_lock);
	atomic_add_long(&ufs_quiesce_pend, 1);

	/*
	 * Quit if there is another lockfs request in progress
	 * that is waiting for existing ufs_vnops to complete.
	 */
	if (ULOCKFS_IS_BUSY(ulp)) {
		error = EBUSY;
		goto errexit;
	}

	/* cannot unlock or downgrade a hard-lock */
	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto errexit;
	}

	/* an error lock may only be unlocked or relocked */
	if (ULOCKFS_IS_ELOCK(ulp)) {
		if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * a read-only error lock may only be upgraded to an
	 * error lock or hard lock
	 */
	if (ULOCKFS_IS_ROELOCK(ulp)) {
		if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * until read-only error locks are fully implemented
	 * just return EINVAL
	 */
	if (LOCKFS_IS_ROELOCK(lockfsp)) {
		error = EINVAL;
		goto errexit;
	}

	/*
	 * an error lock may only be applied if the file system is
	 * unlocked or already error locked.
	 * (this is to prevent the case where a fs gets changed out from
	 * underneath a fs that is locked for backup,
	 * that is, name/delete/write-locked.)
	 */
	if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
	    !ULOCKFS_IS_ROELOCK(ulp)) &&
	    (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
		error = EBUSY;
		goto errexit;
	}

	/* get and validate the input lockfs request */
	if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
		goto errexit;

	/*
	 * save current ulockfs struct
	 */
	bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));

	/*
	 * Freeze the file system (pend future accesses)
	 */
	ufs_freeze(ulp, lockfsp);

	/*
	 * Set locking in progress because ufs_quiesce may free the
	 * ul_lock mutex.
	 */
	ULOCKFS_SET_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_SET_BUSY(&ulp->ul_lockfs);

	/*
	 * We need to unset FWLOCK status before we call ufs_quiesce
	 * so that the thread doesn't get suspended. We do this only if
	 * this (fallocate) thread requested an unlock operation.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (!ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_CLR_FWLOCK(ulp);
	}

	/*
	 * Quiesce (wait for outstanding accesses to finish)
	 */
	if (error = ufs_quiesce(ulp)) {
		/*
		 * Interrupted due to signal. There could still be
		 * pending vnops.
		 */
		signal = 1;

		/*
		 * We do broadcast because lock-status
		 * could be reverted to old status.
		 */
		cv_broadcast(&ulp->ul_cv);
		goto errout;
	}

	/*
	 * If the fallocate thread requested a write fs lock operation
	 * then we set fwlock status in the ulp.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_SET_FWLOCK(ulp);
	}

	/*
	 * save error lock status to pass down to reconciliation
	 * routines and for later cleanup
	 */
	if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
		errlck = UN_ERRLCK;

	if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
		int needs_unlock;
		int needs_sbwrite;

		poll_events |= POLLERR;
		errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ?
		    RE_ERRLCK : SET_ERRLCK;

		needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
		if (needs_unlock)
			mutex_enter(&ufsvfsp->vfs_lock);

		/* disable delayed i/o */
		needs_sbwrite = 0;

		if (errlck == SET_ERRLCK) {
			ufsvfsp->vfs_fs->fs_clean = FSBAD;
			needs_sbwrite = 1;
		}

		needs_sbwrite |= ufsvfsp->vfs_dio;
		ufsvfsp->vfs_dio = 0;

		if (needs_unlock)
			mutex_exit(&ufsvfsp->vfs_lock);

		if (needs_sbwrite) {
			ulp->ul_sbowner = curthread;
			TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);

			if (needs_unlock)
				mutex_enter(&ufsvfsp->vfs_lock);

			ufsvfsp->vfs_fs->fs_fmod = 0;

			if (needs_unlock)
				mutex_exit(&ufsvfsp->vfs_lock);
		}
	}

	/*
	 * reconcile superblock and inodes if the fs was wlocked
	 */
	if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
		if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
			goto errout;
		/*
		 * in case the fs grew; reset the metadata map for logging tests
		 */
		TRANS_MATA_UMOUNT(ufsvfsp);
		TRANS_MATA_MOUNT(ufsvfsp);
		TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
	}

	/*
	 * At least everything *currently* dirty goes out.
	 */

	if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
	    !ULOCKFS_IS_ELOCK(ulp))
		goto errout;

	/*
	 * thaw file system and wake up pended processes
	 */
	if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
		if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
			goto errout;

	/*
	 * reset modified flag if not already write locked
	 */
	if (!LOCKFS_IS_WLOCK(&lfs))
		ULOCKFS_CLR_MOD(ulp);

	/*
	 * idle the lock struct
	 */
	ULOCKFS_CLR_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

	/*
	 * free current comment
	 */
	if (lfs.lf_comment && lfs.lf_comlen != 0) {
		kmem_free(lfs.lf_comment, lfs.lf_comlen);
		lfs.lf_comment = NULL;
		lfs.lf_comlen = 0;
	}

	/* do error lock cleanup */
	if (errlck == UN_ERRLCK)
		ufsfx_unlockfs(ufsvfsp);

	else if (errlck == RE_ERRLCK)
		ufsfx_lockfs(ufsvfsp);

	/* don't allow error lock from user to invoke panic */
	else if (from_user && errlck == SET_ERRLCK &&
	    !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
		(void) ufs_fault(ufsvfsp->vfs_root,
		    ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
		    ulp->ul_lockfs.lf_comment: "user-applied error lock");

	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
		poll_events |= POLLERR;

	pollwakeup(&ufs_pollhd, poll_events);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (0);

errout:
	/*
	 * Lock failed. Reset the old lock in ufsvfs if not hard locked.
	 */
	if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
		bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
		ulp->ul_fs_lock = (1 << lfs.lf_lock);
	}

	/*
	 * Don't call ufs_thaw() when there's a signal during
	 * ufs quiesce operation as it can lead to deadlock
	 * with getpage.
	 */
	if (signal == 0)
		(void) ufs_thaw(vfsp, ufsvfsp, ulp);

	ULOCKFS_CLR_BUSY(ulp);
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

errexit:
	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (error);
}
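/*
 * Illustrative sketch (not from this file): ufs_fiolfs() is reached via
 * the _FIOLFS ioctl (sys/filio.h), which is how lockfs(1M)-style tools
 * drive it from user space, e.g.:
 *
 *	struct lockfs lf = { 0 };
 *	lf.lf_lock = LOCKFS_WLOCK;		(write-lock the fs)
 *	(void) ioctl(fd, _FIOLFS, &lf);
 *
 * The 32/64-bit copyin and comment-buffer handling happen in the ioctl
 * path in ufs_vnops.c before this function is called.
 */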
/*
 * ufs_fiolfss
 *	return the current file system locking state info
 */
int
ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
{
	struct ulockfs	*ulp;

	if (!vp || !vp->v_vfsp || !VTOI(vp))
		return (EINVAL);

	/* file system has been forcibly unmounted */
	if (VTOI(vp)->i_ufsvfs == NULL)
		return (EIO);

	ulp = VTOUL(vp);

	if (ULOCKFS_IS_HLOCK(ulp)) {
		*lockfsp = ulp->ul_lockfs;	/* structure assignment */
		return (0);
	}

	mutex_enter(&ulp->ul_lock);

	*lockfsp = ulp->ul_lockfs;	/* structure assignment */

	if (ULOCKFS_IS_MOD(ulp))
		lockfsp->lf_flags |= LOCKFS_MOD;

	mutex_exit(&ulp->ul_lock);

	return (0);
}
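/*
 * Illustrative sketch: this backs the matching _FIOLFSS ioctl, so a
 * status query from user space is roughly:
 *
 *	struct lockfs lf;
 *	(void) ioctl(fd, _FIOLFSS, &lf);
 *		(then inspect lf.lf_lock and lf.lf_flags, e.g. for
 *		 LOCKFS_MOD set above)
 */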
/*
 * ufs_check_lockfs
 *	check whether a ufs_vnops call conflicts with the file system lock
 */
int
ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
{
	k_sigset_t	smask;
	int		sig, slock;

	ASSERT(MUTEX_HELD(&ulp->ul_lock));

	while (ulp->ul_fs_lock & mask) {
		slock = (int)ULOCKFS_IS_SLOCK(ulp);
		if ((curthread->t_flag & T_DONTPEND) && !slock) {
			curthread->t_flag |= T_WOULDBLOCK;
			return (EAGAIN);
		}
		curthread->t_flag &= ~T_WOULDBLOCK;

		/*
		 * In the case of an onerr umount of the fs, threads could
		 * have blocked before coming into ufs_check_lockfs and
		 * need to check for the special case of ELOCK and
		 * vfs_dontblock being set, which would indicate that the fs
		 * is on its way out and will not return, therefore making
		 * EIO the appropriate response.
		 */
		if (ULOCKFS_IS_HLOCK(ulp) ||
		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
			return (EIO);

		/*
		 * wait for lock status to change
		 */
		if (slock || ufsvfsp->vfs_nointr) {
			cv_wait(&ulp->ul_cv, &ulp->ul_lock);
		} else {
			sigintr(&smask, 1);
			sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
			sigunintr(&smask);
			if ((!sig && (ulp->ul_fs_lock & mask)) ||
			    ufsvfsp->vfs_dontblock)
				return (EINTR);
		}
	}

	if (mask & ULOCKFS_FWLOCK) {
		atomic_add_long(&ulp->ul_falloc_cnt, 1);
		ULOCKFS_SET_FALLOC(ulp);
	} else {
		atomic_add_long(&ulp->ul_vnops_cnt, 1);
	}

	return (0);
}
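/*
 * Example of the gating above (mask names from ufs_lockfs.h; which VOP
 * passes which mask is assumed to match ufs_vnops.c): ufs_write() passes
 * ULOCKFS_WRITE_MASK, so with the fs write-locked
 *
 *	ulp->ul_fs_lock == (1 << LOCKFS_WLOCK)
 *	(ulp->ul_fs_lock & ULOCKFS_WRITE_MASK) != 0	-> wait, or EAGAIN
 *							   for T_DONTPEND
 *
 * while a read VOP's mask does not include the write-lock bit and falls
 * straight through the while loop.
 */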
/*
 * Check whether we came across the handcrafted lockfs protocol path. We can't
 * simply check for T_DONTBLOCK here, as one might assume, since that can also
 * falsely catch recursive VOPs going to a different filesystem; instead we
 * check whether we already hold the ulockfs->ul_lock mutex.
 */
static int
ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
{
	return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
}

/*
 * ufs_lockfs_begin - start the lockfs locking protocol
 */
int
ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
{
	int		error;
	int		rec_vop;
	ushort_t	op_cnt_incremented = 0;
	ulong_t		*ctr;
	struct ulockfs	*ulp;
	ulockfs_info_t	*ulockfs_info;
	ulockfs_info_t	*ulockfs_info_free;
	ulockfs_info_t	*ulockfs_info_temp;

	/*
	 * file system has been forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * Do lockfs protocol
	 */
	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

	/*
	 * Detect recursive VOP call or handcrafted internal lockfs protocol
	 * path and bail out in that case.
	 */
	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
		*ulpp = NULL;
		return (0);
	} else {
		if (ulockfs_info_free == NULL) {
			if ((ulockfs_info_temp = (ulockfs_info_t *)
			    kmem_zalloc(sizeof (ulockfs_info_t),
			    KM_NOSLEEP)) == NULL) {
				*ulpp = NULL;
				return (ENOMEM);
			}
		}
	}

	/*
	 * First time VOP call
	 *
	 * Increment the ctr irrespective of the lockfs state. If the lockfs
	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
	 * before incrementing we need to check if there is a pending quiesce
	 * request, because if we have a continuous stream of ufs_lockfs_begin
	 * requests pounding on a few CPUs then the ufs_quiesce thread might
	 * never see the value of zero for ctr - a livelock kind of scenario.
	 */
	ctr = (mask & ULOCKFS_FWLOCK) ?
	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
	if (!ULOCKFS_IS_SLOCK(ulp)) {
		atomic_add_long(ctr, 1);
		op_cnt_incremented++;
	}

	/*
	 * If the lockfs state (indicated by ul_fs_lock) is not just
	 * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs
	 * where there is a check with an appropriate mask to selectively allow
	 * operations permitted for that kind of lockfs state.
	 *
	 * Even these selective operations should not be allowed to go through
	 * if a lockfs request is in progress, because that could result in
	 * inode modifications during a quiesce and could hence result in inode
	 * reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient,
	 * so make use of ufs_quiesce_pend to disallow vnode operations when a
	 * quiesce is in progress.
	 */
	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
		if (op_cnt_incremented)
			if (!atomic_add_long_nv(ctr, -1))
				cv_broadcast(&ulp->ul_cv);
		mutex_enter(&ulp->ul_lock);
		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
		mutex_exit(&ulp->ul_lock);
		if (error) {
			if (ulockfs_info_free == NULL)
				kmem_free(ulockfs_info_temp,
				    sizeof (ulockfs_info_t));
			return (error);
		}
	} else {
		/*
		 * This is the common case of a file system in an unlocked
		 * state.
		 *
		 * If a file system is unlocked, we would expect the ctr to have
		 * been incremented by now. But this will not be true when a
		 * quiesce is winding up - SLOCK was set when we checked before
		 * incrementing the ctr, but by the time we checked for
		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is okay
		 * to take ul_lock and go through the slow path in this uncommon
		 * case.
		 */
		if (op_cnt_incremented == 0) {
			mutex_enter(&ulp->ul_lock);
			error = ufs_check_lockfs(ufsvfsp, ulp, mask);
			if (error) {
				mutex_exit(&ulp->ul_lock);
				if (ulockfs_info_free == NULL)
					kmem_free(ulockfs_info_temp,
					    sizeof (ulockfs_info_t));
				return (error);
			}
			if (mask & ULOCKFS_FWLOCK)
				ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		} else if (mask & ULOCKFS_FWLOCK) {
			mutex_enter(&ulp->ul_lock);
			ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		}
	}

	if (ulockfs_info_free != NULL) {
		ulockfs_info_free->ulp = ulp;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
	} else {
		ulockfs_info_temp->ulp = ulp;
		ulockfs_info_temp->next = ulockfs_info;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
		ASSERT(ufs_lockfs_key != 0);
		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
	}

	curthread->t_flag |= T_DONTBLOCK;
	return (0);
}
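/*
 * Caller shape (illustrative; modeled on the VOPs in ufs_vnops.c, where
 * wrip() is assumed to be the write worker used by ufs_write()):
 *
 *	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 *	if (error)
 *		goto out;		(protocol never started)
 *	error = wrip(ip, uiop, ioflag, cr);
 *	if (ulp)
 *		ufs_lockfs_end(ulp);	(ulp == NULL means a recursive
 *					 VOP; nothing to undo)
 * out:
 *	return (error);
 */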
/*
 * Check whether we are returning from the top level VOP.
 */
static int
ufs_lockfs_top_vop_return(ulockfs_info_t *head)
{
	ulockfs_info_t *info;
	int result = 1;

	for (info = head; info != NULL; info = info->next) {
		if (info->ulp != NULL) {
			result = 0;
			break;
		}
	}

	return (result);
}

/*
 * ufs_lockfs_end - terminate the lockfs locking protocol
 */
void
ufs_lockfs_end(struct ulockfs *ulp)
{
	ulockfs_info_t *info;
	ulockfs_info_t *head;

	/*
	 * end-of-VOP protocol
	 */
	if (ulp == NULL)
		return;

	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * If we're called from a first level VOP, we have to have a
	 * valid ulockfs record in the TSD.
	 */
	ASSERT(info != NULL);

	/*
	 * Invalidate the ulockfs record.
	 */
	info->ulp = NULL;

	if (ufs_lockfs_top_vop_return(head))
		curthread->t_flag &= ~T_DONTBLOCK;

	/* fallocate thread */
	if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) {
		/* Clear the thread's fallocate state */
		info->flags &= ~ULOCK_INFO_FALLOCATE;
		if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1)) {
			mutex_enter(&ulp->ul_lock);
			ULOCKFS_CLR_FALLOC(ulp);
			cv_broadcast(&ulp->ul_cv);
			mutex_exit(&ulp->ul_lock);
		}
	} else { /* normal thread */
		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
			cv_broadcast(&ulp->ul_cv);
	}
}
/*
 * ufs_lockfs_trybegin - try to start the lockfs locking protocol without
 * blocking.
 */
int
ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
{
	int		error = 0;
	int		rec_vop;
	ushort_t	op_cnt_incremented = 0;
	ulong_t		*ctr;
	struct ulockfs	*ulp;
	ulockfs_info_t	*ulockfs_info;
	ulockfs_info_t	*ulockfs_info_free;
	ulockfs_info_t	*ulockfs_info_temp;

	/*
	 * file system has been forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * Do lockfs protocol
	 */
	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

	/*
	 * Detect recursive VOP call or handcrafted internal lockfs protocol
	 * path and bail out in that case.
	 */
	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
		*ulpp = NULL;
		return (0);
	} else {
		if (ulockfs_info_free == NULL) {
			if ((ulockfs_info_temp = (ulockfs_info_t *)
			    kmem_zalloc(sizeof (ulockfs_info_t),
			    KM_NOSLEEP)) == NULL) {
				*ulpp = NULL;
				return (ENOMEM);
			}
		}
	}

	/*
	 * First time VOP call
	 *
	 * Increment the ctr irrespective of the lockfs state. If the lockfs
	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
	 * before incrementing we need to check if there is a pending quiesce
	 * request, because if we have a continuous stream of ufs_lockfs_begin
	 * requests pounding on a few CPUs then the ufs_quiesce thread might
	 * never see the value of zero for ctr - a livelock kind of scenario.
	 */
	ctr = (mask & ULOCKFS_FWLOCK) ?
	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
	if (!ULOCKFS_IS_SLOCK(ulp)) {
		atomic_add_long(ctr, 1);
		op_cnt_incremented++;
	}

	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
		/*
		 * Non-blocking version of ufs_check_lockfs() code.
		 *
		 * If the file system is not hard locked or error locked
		 * and if ulp->ul_fs_lock allows this operation, increment
		 * the appropriate counter and proceed (for example, if the
		 * file system is delete locked, an mmap can still go through).
		 */
		if (op_cnt_incremented)
			if (!atomic_add_long_nv(ctr, -1))
				cv_broadcast(&ulp->ul_cv);
		mutex_enter(&ulp->ul_lock);
		if (ULOCKFS_IS_HLOCK(ulp) ||
		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
			error = EIO;
		else if (ulp->ul_fs_lock & mask)
			error = EAGAIN;

		if (error) {
			mutex_exit(&ulp->ul_lock);
			if (ulockfs_info_free == NULL)
				kmem_free(ulockfs_info_temp,
				    sizeof (ulockfs_info_t));
			return (error);
		}
		atomic_add_long(ctr, 1);
		if (mask & ULOCKFS_FWLOCK)
			ULOCKFS_SET_FALLOC(ulp);
		mutex_exit(&ulp->ul_lock);
	} else {
		/*
		 * This is the common case of a file system in an unlocked
		 * state.
		 *
		 * If a file system is unlocked, we would expect the ctr to have
		 * been incremented by now. But this will not be true when a
		 * quiesce is winding up - SLOCK was set when we checked before
		 * incrementing the ctr, but by the time we checked for
		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take
		 * ul_lock and go through the non-blocking version of
		 * ufs_check_lockfs() code.
		 */
		if (op_cnt_incremented == 0) {
			mutex_enter(&ulp->ul_lock);
			if (ULOCKFS_IS_HLOCK(ulp) ||
			    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
				error = EIO;
			else if (ulp->ul_fs_lock & mask)
				error = EAGAIN;

			if (error) {
				mutex_exit(&ulp->ul_lock);
				if (ulockfs_info_free == NULL)
					kmem_free(ulockfs_info_temp,
					    sizeof (ulockfs_info_t));
				return (error);
			}
			atomic_add_long(ctr, 1);
			if (mask & ULOCKFS_FWLOCK)
				ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		} else if (mask & ULOCKFS_FWLOCK) {
			mutex_enter(&ulp->ul_lock);
			ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		}
	}

	if (ulockfs_info_free != NULL) {
		ulockfs_info_free->ulp = ulp;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
	} else {
		ulockfs_info_temp->ulp = ulp;
		ulockfs_info_temp->next = ulockfs_info;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
		ASSERT(ufs_lockfs_key != 0);
		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
	}

	curthread->t_flag |= T_DONTBLOCK;
	return (0);
}
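/*
 * Illustrative use (assumed caller shape, not from this file): paths
 * that already hold locks they cannot drop call the trybegin variant
 * and treat EAGAIN as "back off and retry from a blockable context":
 *
 *	error = ufs_lockfs_trybegin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 *	if (error == EAGAIN) {
 *		... release local locks, then retry via the blocking
 *		    ufs_lockfs_begin() ...
 *	}
 */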
/*
 * specialized version of ufs_lockfs_begin() called by ufs_getpage().
 */
int
ufs_lockfs_begin_getpage(
	struct ufsvfs	*ufsvfsp,
	struct ulockfs	**ulpp,
	struct seg	*seg,
	int		read_access,
	uint_t		*protp)
{
	ulong_t		mask;
	int		error;
	int		rec_vop;
	struct ulockfs	*ulp;
	ulockfs_info_t	*ulockfs_info;
	ulockfs_info_t	*ulockfs_info_free;
	ulockfs_info_t	*ulockfs_info_temp;

	/*
	 * file system has been forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * Do lockfs protocol
	 */
	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

	/*
	 * Detect recursive VOP call or handcrafted internal lockfs protocol
	 * path and bail out in that case.
	 */
	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
		*ulpp = NULL;
		return (0);
	} else {
		if (ulockfs_info_free == NULL) {
			if ((ulockfs_info_temp = (ulockfs_info_t *)
			    kmem_zalloc(sizeof (ulockfs_info_t),
			    KM_NOSLEEP)) == NULL) {
				*ulpp = NULL;
				return (ENOMEM);
			}
		}
	}

	/*
	 * First time VOP call
	 */
	atomic_add_long(&ulp->ul_vnops_cnt, 1);
	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
			cv_broadcast(&ulp->ul_cv);
		mutex_enter(&ulp->ul_lock);
		if (seg->s_ops == &segvn_ops &&
		    ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
		} else if (protp && read_access) {
			/*
			 * Restrict the mapping to readonly.
			 * Writes to this mapping will cause
			 * another fault which will then
			 * be suspended if fs is write locked
			 */
			*protp &= ~PROT_WRITE;
			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
		} else
			mask = (ulong_t)ULOCKFS_GETWRITE_MASK;

		/*
		 * will sleep if this fs is locked against this VOP
		 */
		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
		mutex_exit(&ulp->ul_lock);
		if (error) {
			if (ulockfs_info_free == NULL)
				kmem_free(ulockfs_info_temp,
				    sizeof (ulockfs_info_t));
			return (error);
		}
	}

	if (ulockfs_info_free != NULL) {
		ulockfs_info_free->ulp = ulp;
	} else {
		ulockfs_info_temp->ulp = ulp;
		ulockfs_info_temp->next = ulockfs_info;
		ASSERT(ufs_lockfs_key != 0);
		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
	}

	curthread->t_flag |= T_DONTBLOCK;
	return (0);
}

void
ufs_lockfs_tsd_destructor(void *head)
{
	ulockfs_info_t *curr = (ulockfs_info_t *)head;
	ulockfs_info_t *temp;

	for (; curr != NULL; ) {
		/*
		 * The TSD destructor is being called when the thread exits
		 * (via thread_exit()). At that time it must have cleaned up
		 * all VOPs via ufs_lockfs_end(), and no valid ulockfs record
		 * may exist while a thread is exiting.
		 */
		temp = curr;
		curr = curr->next;
		ASSERT(temp->ulp == NULL);
		kmem_free(temp, sizeof (ulockfs_info_t));
	}
}
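/*
 * Illustrative sketch (the real call is made once at UFS module
 * initialization, outside this file): the destructor above is paired
 * with the creation of the key, roughly
 *
 *	tsd_create(&ufs_lockfs_key, ufs_lockfs_tsd_destructor);
 *
 * so any ulockfs_info_t records still chained to an exiting thread are
 * freed automatically.
 */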