1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Copyright (c) 2017 by Delphix. All rights reserved. 31 */ 32 33 /* 34 * Portions of this source code were derived from Berkeley 4.3 BSD 35 * under license from the Regents of the University of California. 36 */ 37 38 #include <sys/types.h> 39 #include <sys/systm.h> 40 #include <sys/errno.h> 41 #include <sys/kmem.h> 42 #include <sys/buf.h> 43 #include <sys/vnode.h> 44 #include <sys/vfs.h> 45 #include <sys/user.h> 46 #include <sys/callb.h> 47 #include <sys/cpuvar.h> 48 #include <sys/fs/ufs_inode.h> 49 #include <sys/fs/ufs_log.h> 50 #include <sys/fs/ufs_trans.h> 51 #include <sys/fs/ufs_acl.h> 52 #include <sys/fs/ufs_bio.h> 53 #include <sys/fs/ufs_fsdir.h> 54 #include <sys/debug.h> 55 #include <sys/cmn_err.h> 56 #include <sys/sysmacros.h> 57 #include <vm/pvn.h> 58 59 extern pri_t minclsyspri; 60 extern int hash2ints(); 61 extern struct kmem_cache *inode_cache; /* cache of free inodes */ 62 extern int ufs_idle_waiters; 63 extern struct instats ins; 64 65 static void ufs_attr_purge(struct inode *); 66 67 /* 68 * initialize a thread's queue struct 69 */ 70 void 71 ufs_thread_init(struct ufs_q *uq, int lowat) 72 { 73 bzero((caddr_t)uq, sizeof (*uq)); 74 cv_init(&uq->uq_cv, NULL, CV_DEFAULT, NULL); 75 mutex_init(&uq->uq_mutex, NULL, MUTEX_DEFAULT, NULL); 76 uq->uq_lowat = lowat; 77 uq->uq_hiwat = 2 * lowat; 78 uq->uq_threadp = NULL; 79 } 80 81 /* 82 * start a thread for a queue (assumes success) 83 */ 84 void 85 ufs_thread_start(struct ufs_q *uq, void (*func)(), struct vfs *vfsp) 86 { 87 mutex_enter(&uq->uq_mutex); 88 if (uq->uq_threadp == NULL) { 89 uq->uq_threadp = thread_create(NULL, 0, func, vfsp, 0, &p0, 90 TS_RUN, minclsyspri); 91 uq->uq_flags = 0; 92 } 93 mutex_exit(&uq->uq_mutex); 94 } 95 96 /* 97 * wait for the thread to exit 98 */ 99 void 100 ufs_thread_exit(struct ufs_q *uq) 101 { 102 kt_did_t ufs_thread_did = 0; 103 104 mutex_enter(&uq->uq_mutex); 105 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED); 106 if (uq->uq_threadp != NULL) { 107 ufs_thread_did = uq->uq_threadp->t_did; 108 uq->uq_flags |= (UQ_EXIT|UQ_WAIT); 109 cv_broadcast(&uq->uq_cv); 110 } 111 mutex_exit(&uq->uq_mutex); 112 113 /* 114 * It's safe to call thread_join() with an already-gone 115 * t_did, but we have to obtain it before the kernel 116 * thread structure is freed. We do so above under the 117 * protection of the uq_mutex when we're sure the thread 118 * still exists and it's save to de-reference it. 119 * We also have to check if ufs_thread_did is != 0 120 * before calling thread_join() since thread 0 in the system 121 * gets a t_did of 0. 122 */ 123 if (ufs_thread_did) 124 thread_join(ufs_thread_did); 125 } 126 127 /* 128 * wait for a thread to suspend itself on the caller's behalf 129 * the caller is responsible for continuing the thread 130 */ 131 void 132 ufs_thread_suspend(struct ufs_q *uq) 133 { 134 mutex_enter(&uq->uq_mutex); 135 if (uq->uq_threadp != NULL) { 136 /* 137 * wait while another thread is suspending this thread. 138 * no need to do a cv_broadcast(), as whoever suspended 139 * the thread must continue it at some point. 140 */ 141 while ((uq->uq_flags & UQ_SUSPEND) && 142 (uq->uq_threadp != NULL)) { 143 /* 144 * We can't use cv_signal() because if our 145 * signal doesn't happen to hit the desired 146 * thread but instead some other waiter like 147 * ourselves, we'll wait forever for a 148 * response. Well, at least an indeterminate 149 * amount of time until we just happen to get 150 * lucky from whomever did get signalled doing 151 * a cv_signal() of their own. This is an 152 * unfortunate performance lossage. 153 */ 154 uq->uq_flags |= UQ_WAIT; 155 cv_wait(&uq->uq_cv, &uq->uq_mutex); 156 } 157 158 uq->uq_flags |= (UQ_SUSPEND | UQ_WAIT); 159 160 /* 161 * wait for the thread to suspend itself 162 */ 163 if ((uq->uq_flags & UQ_SUSPENDED) == 0 && 164 (uq->uq_threadp != NULL)) { 165 cv_broadcast(&uq->uq_cv); 166 } 167 168 while (((uq->uq_flags & UQ_SUSPENDED) == 0) && 169 (uq->uq_threadp != NULL)) { 170 cv_wait(&uq->uq_cv, &uq->uq_mutex); 171 } 172 } 173 mutex_exit(&uq->uq_mutex); 174 } 175 176 /* 177 * allow a thread to continue from a ufs_thread_suspend() 178 * This thread must be the same as the thread that called 179 * ufs_thread_suspend. 180 */ 181 void 182 ufs_thread_continue(struct ufs_q *uq) 183 { 184 mutex_enter(&uq->uq_mutex); 185 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED); 186 cv_broadcast(&uq->uq_cv); 187 mutex_exit(&uq->uq_mutex); 188 } 189 190 /* 191 * some common code for managing a threads execution 192 * uq is locked at entry and return 193 * may sleep 194 * may exit 195 */ 196 /* 197 * Kind of a hack passing in the callb_cpr_t * here. 198 * It should really be part of the ufs_q structure. 199 * I did not put it in there because we are already in beta 200 * and I was concerned that changing ufs_inode.h to include 201 * callb.h might break something. 202 */ 203 int 204 ufs_thread_run(struct ufs_q *uq, callb_cpr_t *cprinfop) 205 { 206 again: 207 ASSERT(uq->uq_ne >= 0); 208 209 if (uq->uq_flags & UQ_SUSPEND) { 210 uq->uq_flags |= UQ_SUSPENDED; 211 } else if (uq->uq_flags & UQ_EXIT) { 212 /* 213 * exiting; empty the queue (may infinite loop) 214 */ 215 if (uq->uq_ne) 216 return (uq->uq_ne); 217 uq->uq_threadp = NULL; 218 if (uq->uq_flags & UQ_WAIT) { 219 cv_broadcast(&uq->uq_cv); 220 } 221 uq->uq_flags &= ~(UQ_EXIT | UQ_WAIT); 222 CALLB_CPR_EXIT(cprinfop); 223 thread_exit(); 224 } else if (uq->uq_ne >= uq->uq_lowat) { 225 /* 226 * process a block of entries until below high water mark 227 */ 228 return (uq->uq_ne - (uq->uq_lowat >> 1)); 229 } 230 if (uq->uq_flags & UQ_WAIT) { 231 uq->uq_flags &= ~UQ_WAIT; 232 cv_broadcast(&uq->uq_cv); 233 } 234 CALLB_CPR_SAFE_BEGIN(cprinfop); 235 cv_wait(&uq->uq_cv, &uq->uq_mutex); 236 CALLB_CPR_SAFE_END(cprinfop, &uq->uq_mutex); 237 goto again; 238 } 239 240 /* 241 * DELETE INODE 242 * The following routines implement the protocol for freeing the resources 243 * held by an idle and deleted inode. 244 */ 245 void 246 ufs_delete(struct ufsvfs *ufsvfsp, struct inode *ip, int dolockfs) 247 { 248 ushort_t mode; 249 struct vnode *vp = ITOV(ip); 250 struct ulockfs *ulp; 251 int trans_size; 252 int dorwlock = ((ip->i_mode & IFMT) == IFREG); 253 int issync; 254 int err; 255 struct inode *dp; 256 struct ufs_q *delq = &ufsvfsp->vfs_delete; 257 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 258 259 /* 260 * Ignore if deletes are not allowed (wlock/hlock) 261 */ 262 if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) { 263 mutex_enter(&delq->uq_mutex); 264 delq_info->delq_unreclaimed_blocks -= ip->i_blocks; 265 delq_info->delq_unreclaimed_files--; 266 mutex_exit(&delq->uq_mutex); 267 VN_RELE(vp); 268 return; 269 } 270 271 if ((vp->v_count > 1) || (ip->i_mode == 0)) { 272 mutex_enter(&delq->uq_mutex); 273 delq_info->delq_unreclaimed_blocks -= ip->i_blocks; 274 delq_info->delq_unreclaimed_files--; 275 mutex_exit(&delq->uq_mutex); 276 VN_RELE(vp); 277 return; 278 } 279 /* 280 * If we are called as part of setting a fs lock, then only 281 * do part of the lockfs protocol. In other words, don't hang. 282 */ 283 if (dolockfs) { 284 if (ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_DELETE_MASK)) 285 return; 286 } else { 287 /* 288 * check for recursive VOP call 289 */ 290 if (curthread->t_flag & T_DONTBLOCK) { 291 ulp = NULL; 292 } else { 293 ulp = &ufsvfsp->vfs_ulockfs; 294 curthread->t_flag |= T_DONTBLOCK; 295 } 296 } 297 298 /* 299 * Hold rwlock to synchronize with (nfs) writes 300 */ 301 if (dorwlock) 302 rw_enter(&ip->i_rwlock, RW_WRITER); 303 304 /* 305 * Delete the attribute directory. 306 */ 307 if (ip->i_oeftflag != 0) { 308 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 309 trans_size = (int)TOP_REMOVE_SIZE(ip)); 310 rw_enter(&ip->i_contents, RW_WRITER); 311 err = ufs_iget(ip->i_vfs, ip->i_oeftflag, 312 &dp, CRED()); 313 if (err == 0) { 314 rw_enter(&dp->i_rwlock, RW_WRITER); 315 rw_enter(&dp->i_contents, RW_WRITER); 316 dp->i_flag |= IUPD|ICHG; 317 dp->i_seq++; 318 TRANS_INODE(dp->i_ufsvfs, dp); 319 dp->i_nlink -= 2; 320 ufs_setreclaim(dp); 321 /* 322 * Should get rid of any negative cache entries that 323 * might be lingering, as well as ``.'' and 324 * ``..''. If we don't, the VN_RELE() below 325 * won't actually put dp on the delete queue 326 * and it'll hang out until someone forces it 327 * (lockfs -f, umount, ...). The only reliable 328 * way of doing this at the moment is to call 329 * dnlc_purge_vp(ITOV(dp)), which is unacceptably 330 * slow, so we'll just note the problem in this 331 * comment for now. 332 */ 333 dnlc_remove(ITOV(dp), "."); 334 dnlc_remove(ITOV(dp), ".."); 335 ITIMES_NOLOCK(dp); 336 if (!TRANS_ISTRANS(ufsvfsp)) { 337 ufs_iupdat(dp, I_SYNC); 338 } 339 rw_exit(&dp->i_contents); 340 rw_exit(&dp->i_rwlock); 341 VN_RELE(ITOV(dp)); 342 } 343 /* 344 * Clear out attribute pointer 345 */ 346 ip->i_oeftflag = 0; 347 rw_exit(&ip->i_contents); 348 TRANS_END_CSYNC(ufsvfsp, err, issync, 349 TOP_REMOVE, trans_size); 350 dnlc_remove(ITOV(ip), XATTR_DIR_NAME); 351 } 352 353 if ((ip->i_mode & IFMT) == IFATTRDIR) { 354 ufs_attr_purge(ip); 355 } 356 357 (void) TRANS_ITRUNC(ip, (u_offset_t)0, I_FREE | I_ACCT, CRED()); 358 359 /* 360 * the inode's space has been freed; now free the inode 361 */ 362 if (ulp) { 363 trans_size = TOP_IFREE_SIZE(ip); 364 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IFREE, trans_size); 365 } 366 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 367 rw_enter(&ip->i_contents, RW_WRITER); 368 TRANS_INODE(ufsvfsp, ip); 369 mode = ip->i_mode; 370 ip->i_mode = 0; 371 ip->i_rdev = 0; 372 ip->i_ordev = 0; 373 ip->i_flag |= IMOD; 374 if (ip->i_ufs_acl) { 375 (void) ufs_si_free(ip->i_ufs_acl, vp->v_vfsp, CRED()); 376 ip->i_ufs_acl = NULL; 377 ip->i_shadow = 0; 378 } 379 380 /* 381 * This inode is torn down but still retains it's identity 382 * (inode number). It could get recycled soon so it's best 383 * to clean up the vnode just in case. 384 */ 385 mutex_enter(&vp->v_lock); 386 vn_recycle(vp); 387 mutex_exit(&vp->v_lock); 388 389 /* 390 * free the inode 391 */ 392 ufs_ifree(ip, ip->i_number, mode); 393 /* 394 * release quota resources; can't fail 395 */ 396 (void) chkiq((struct ufsvfs *)vp->v_vfsp->vfs_data, 397 /* change */ -1, ip, (uid_t)ip->i_uid, 0, CRED(), 398 (char **)NULL, (size_t *)NULL); 399 dqrele(ip->i_dquot); 400 ip->i_dquot = NULL; 401 ip->i_flag &= ~(IDEL | IDIRECTIO); 402 ip->i_cflags = 0; 403 if (!TRANS_ISTRANS(ufsvfsp)) { 404 ufs_iupdat(ip, I_SYNC); 405 } else { 406 mutex_enter(&delq->uq_mutex); 407 delq_info->delq_unreclaimed_files--; 408 mutex_exit(&delq->uq_mutex); 409 } 410 rw_exit(&ip->i_contents); 411 rw_exit(&ufsvfsp->vfs_dqrwlock); 412 if (dorwlock) 413 rw_exit(&ip->i_rwlock); 414 VN_RELE(vp); 415 416 /* 417 * End of transaction 418 */ 419 if (ulp) { 420 TRANS_END_ASYNC(ufsvfsp, TOP_IFREE, trans_size); 421 if (dolockfs) 422 ufs_lockfs_end(ulp); 423 else 424 curthread->t_flag &= ~T_DONTBLOCK; 425 } 426 } 427 428 /* 429 * Create the delete thread and init the delq_info for this fs 430 */ 431 void 432 ufs_delete_init(struct ufsvfs *ufsvfsp, int lowat) 433 { 434 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 435 436 ufs_thread_init(&ufsvfsp->vfs_delete, lowat); 437 (void) memset((void *)delq_info, 0, sizeof (*delq_info)); 438 } 439 440 /* 441 * thread that frees up deleted inodes 442 */ 443 void 444 ufs_thread_delete(struct vfs *vfsp) 445 { 446 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 447 struct ufs_q *uq = &ufsvfsp->vfs_delete; 448 struct inode *ip; 449 long ne; 450 callb_cpr_t cprinfo; 451 452 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr, 453 "ufsdelete"); 454 455 mutex_enter(&uq->uq_mutex); 456 again: 457 /* 458 * Sleep until there is work to do. Only do one entry at 459 * a time, to reduce the wait time for checking for a suspend 460 * request. The ?: is for pedantic portability. 461 */ 462 ne = ufs_thread_run(uq, &cprinfo) ? 1 : 0; 463 464 /* 465 * process an entry, if there are any 466 */ 467 if (ne && (ip = uq->uq_ihead)) { 468 /* 469 * process first entry on queue. Assumed conditions are: 470 * ip is held (v_count >= 1) 471 * ip is referenced (i_flag & IREF) 472 * ip is free (i_nlink <= 0) 473 */ 474 if ((uq->uq_ihead = ip->i_freef) == ip) 475 uq->uq_ihead = NULL; 476 ip->i_freef->i_freeb = ip->i_freeb; 477 ip->i_freeb->i_freef = ip->i_freef; 478 ip->i_freef = ip; 479 ip->i_freeb = ip; 480 uq->uq_ne--; 481 mutex_exit(&uq->uq_mutex); 482 ufs_delete(ufsvfsp, ip, 1); 483 mutex_enter(&uq->uq_mutex); 484 } 485 goto again; 486 } 487 488 /* 489 * drain ne entries off the delete queue. As new queue entries may 490 * be added while we're working, ne is interpreted as follows: 491 * 492 * ne > 0 => remove up to ne entries 493 * ne == 0 => remove all entries currently on the queue 494 * ne == -1 => remove entries until the queue is empty 495 */ 496 void 497 ufs_delete_drain(struct vfs *vfsp, int ne, int dolockfs) 498 { 499 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 500 struct ufs_q *uq; 501 struct inode *ip; 502 int drain_cnt = 0; 503 int done; 504 505 /* 506 * if forcibly unmounted; ignore 507 */ 508 if (ufsvfsp == NULL) 509 return; 510 511 uq = &ufsvfsp->vfs_delete; 512 mutex_enter(&uq->uq_mutex); 513 if (ne == 0) 514 drain_cnt = uq->uq_ne; 515 else if (ne > 0) 516 drain_cnt = ne; 517 518 /* 519 * process up to ne entries 520 */ 521 522 done = 0; 523 while (!done && (ip = uq->uq_ihead)) { 524 if (ne != -1) 525 drain_cnt--; 526 if (ne != -1 && drain_cnt == 0) 527 done = 1; 528 if ((uq->uq_ihead = ip->i_freef) == ip) 529 uq->uq_ihead = NULL; 530 ip->i_freef->i_freeb = ip->i_freeb; 531 ip->i_freeb->i_freef = ip->i_freef; 532 ip->i_freef = ip; 533 ip->i_freeb = ip; 534 uq->uq_ne--; 535 mutex_exit(&uq->uq_mutex); 536 ufs_delete(ufsvfsp, ip, dolockfs); 537 mutex_enter(&uq->uq_mutex); 538 } 539 mutex_exit(&uq->uq_mutex); 540 } 541 542 void 543 ufs_sync_with_thread(struct ufs_q *uq) 544 { 545 mutex_enter(&uq->uq_mutex); 546 547 /* 548 * Wake up delete thread to free up space. 549 */ 550 if ((uq->uq_flags & UQ_WAIT) == 0) { 551 uq->uq_flags |= UQ_WAIT; 552 cv_broadcast(&uq->uq_cv); 553 } 554 555 while ((uq->uq_threadp != NULL) && (uq->uq_flags & UQ_WAIT)) { 556 cv_wait(&uq->uq_cv, &uq->uq_mutex); 557 } 558 559 mutex_exit(&uq->uq_mutex); 560 } 561 562 /* 563 * Get rid of everything that's currently in the delete queue, 564 * plus whatever the delete thread is working on at the moment. 565 * 566 * This ability is required for providing true POSIX semantics 567 * regarding close(2), unlink(2), etc, even when logging is enabled. 568 * The standard requires that the released space be immediately 569 * observable (statvfs(2)) and allocatable (e.g., write(2)). 570 */ 571 void 572 ufs_delete_drain_wait(struct ufsvfs *ufsvfsp, int dolockfs) 573 { 574 struct ufs_q *uq = &ufsvfsp->vfs_delete; 575 int error; 576 struct ufs_q *delq = &ufsvfsp->vfs_delete; 577 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 578 579 /* 580 * If there is something on delq or delete thread 581 * working on delq. 582 */ 583 mutex_enter(&delq->uq_mutex); 584 if (delq_info->delq_unreclaimed_files > 0) { 585 mutex_exit(&delq->uq_mutex); 586 (void) ufs_delete_drain(ufsvfsp->vfs_vfs, 0, dolockfs); 587 ufs_sync_with_thread(uq); 588 } else { 589 ASSERT(delq_info->delq_unreclaimed_files == 0); 590 mutex_exit(&delq->uq_mutex); 591 return; 592 } 593 594 /* 595 * Commit any outstanding transactions to make sure 596 * any canceled freed blocks are available for allocation. 597 */ 598 curthread->t_flag |= T_DONTBLOCK; 599 TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE, error); 600 if (!error) { 601 TRANS_END_SYNC(ufsvfsp, error, TOP_COMMIT_UPDATE, 602 TOP_COMMIT_SIZE); 603 } 604 curthread->t_flag &= ~T_DONTBLOCK; 605 } 606 607 /* 608 * Adjust the resource usage in a struct statvfs based on 609 * what's in the delete queue. 610 * 611 * We do not consider the impact of ACLs or extended attributes 612 * that may be deleted as a side-effect of deleting a file. 613 * Those are metadata, and their sizes aren't reflected in the 614 * sizes returned by stat(), so this is not a problem. 615 */ 616 void 617 ufs_delete_adjust_stats(struct ufsvfs *ufsvfsp, struct statvfs64 *sp) 618 { 619 struct ufs_q *uq = &ufsvfsp->vfs_delete; 620 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 621 622 mutex_enter(&uq->uq_mutex); 623 /* 624 * The blocks accounted for in the delete queue info are 625 * counted in DEV_BSIZE chunks, but ufs_statvfs counts in 626 * filesystem fragments, so a conversion is required here. 627 */ 628 sp->f_bfree += dbtofsb(ufsvfsp->vfs_fs, 629 delq_info->delq_unreclaimed_blocks); 630 sp->f_ffree += delq_info->delq_unreclaimed_files; 631 mutex_exit(&uq->uq_mutex); 632 } 633 634 /* 635 * IDLE INODE 636 * The following routines implement the protocol for maintaining an 637 * LRU list of idle inodes and for moving the idle inodes to the 638 * reuse list when the number of allocated inodes exceeds the user 639 * tunable high-water mark (ufs_ninode). 640 */ 641 642 /* 643 * clean an idle inode and move it to the reuse list 644 */ 645 static void 646 ufs_idle_free(struct inode *ip) 647 { 648 int pages; 649 int hno; 650 kmutex_t *ihm; 651 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 652 struct vnode *vp = ITOV(ip); 653 int vn_has_data, vn_modified; 654 655 /* 656 * inode is held 657 */ 658 659 /* 660 * remember `pages' for stats below 661 */ 662 pages = (ip->i_mode && vn_has_cached_data(vp) && vp->v_type != VCHR); 663 664 /* 665 * start the dirty pages to disk and then invalidate them 666 * unless the inode is invalid (ISTALE) 667 */ 668 if ((ip->i_flag & ISTALE) == 0) { 669 (void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_FREE); 670 (void) TRANS_SYNCIP(ip, 671 (TRANS_ISERROR(ufsvfsp)) ? B_INVAL | B_FORCE : B_INVAL, 672 I_ASYNC, TOP_SYNCIP_FREE); 673 } 674 675 /* 676 * wait for any current ufs_iget to finish and block future ufs_igets 677 */ 678 ASSERT(ip->i_number != 0); 679 hno = INOHASH(ip->i_number); 680 ihm = &ih_lock[hno]; 681 mutex_enter(ihm); 682 683 /* 684 * It must be guaranteed that v_count >= 2, otherwise 685 * something must be wrong with this vnode already. 686 * That is why we use VN_RELE_LOCKED() instead of VN_RELE(). 687 * Acquire the vnode lock in case another thread is in 688 * VN_RELE(). 689 */ 690 mutex_enter(&vp->v_lock); 691 692 VERIFY3U(vp->v_count, >=, 2); 693 694 VN_RELE_LOCKED(vp); 695 696 vn_has_data = (vp->v_type != VCHR && vn_has_cached_data(vp)); 697 vn_modified = (ip->i_flag & (IMOD|IMODACC|IACC|ICHG|IUPD|IATTCHG)); 698 699 if (vp->v_count != 1 || 700 ((vn_has_data || vn_modified) && 701 ((ip->i_flag & ISTALE) == 0))) { 702 /* 703 * Another thread has referenced this inode while 704 * we are trying to free it. Call VN_RELE() to 705 * release our reference, if v_count > 1 data is 706 * present or one of the modified etc. flags was 707 * set, whereby ISTALE wasn't set. 708 * If we'd proceed with ISTALE set here, we might 709 * get ourselves into a deadlock situation. 710 */ 711 mutex_exit(&vp->v_lock); 712 mutex_exit(ihm); 713 VN_RELE(vp); 714 } else { 715 /* 716 * The inode is currently unreferenced and can not 717 * acquire further references because it has no pages 718 * and the hash is locked. Inodes acquire references 719 * via the hash list or via their pages. 720 */ 721 722 mutex_exit(&vp->v_lock); 723 724 /* 725 * remove it from the cache 726 */ 727 remque(ip); 728 mutex_exit(ihm); 729 /* 730 * Stale inodes have no valid ufsvfs 731 */ 732 if ((ip->i_flag & ISTALE) == 0 && ip->i_dquot) { 733 TRANS_DQRELE(ufsvfsp, ip->i_dquot); 734 ip->i_dquot = NULL; 735 } 736 if ((ip->i_flag & ISTALE) && 737 vn_has_data) { 738 /* 739 * ISTALE inodes may have data 740 * and this data needs to be 741 * cleaned up. 742 */ 743 (void) pvn_vplist_dirty(vp, (u_offset_t)0, 744 ufs_putapage, B_INVAL | B_TRUNC, 745 (struct cred *)NULL); 746 } 747 ufs_si_del(ip); 748 if (pages) { 749 CPU_STATS_ADDQ(CPU, sys, ufsipage, 1); 750 } else { 751 CPU_STATS_ADDQ(CPU, sys, ufsinopage, 1); 752 } 753 ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp)); 754 755 /* 756 * We had better not have a vnode reference count > 1 757 * at this point, if we do then something is broken as 758 * this inode/vnode acquired a reference underneath of us. 759 */ 760 ASSERT(vp->v_count == 1); 761 762 ufs_free_inode(ip); 763 } 764 } 765 766 /* 767 * this thread processes the global idle queue 768 */ 769 iqhead_t *ufs_junk_iq; 770 iqhead_t *ufs_useful_iq; 771 int ufs_njunk_iq = 0; 772 int ufs_nuseful_iq = 0; 773 int ufs_niqhash; 774 int ufs_iqhashmask; 775 struct ufs_q ufs_idle_q; 776 777 void 778 ufs_thread_idle(void) 779 { 780 callb_cpr_t cprinfo; 781 int i; 782 int ne; 783 784 ufs_niqhash = (ufs_idle_q.uq_lowat >> 1) / IQHASHQLEN; 785 ufs_niqhash = 1 << highbit(ufs_niqhash); /* round up to power of 2 */ 786 ufs_iqhashmask = ufs_niqhash - 1; 787 ufs_junk_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_junk_iq), 788 KM_SLEEP); 789 ufs_useful_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_useful_iq), 790 KM_SLEEP); 791 792 /* Initialize hash queue headers */ 793 for (i = 0; i < ufs_niqhash; i++) { 794 ufs_junk_iq[i].i_freef = (inode_t *)&ufs_junk_iq[i]; 795 ufs_junk_iq[i].i_freeb = (inode_t *)&ufs_junk_iq[i]; 796 ufs_useful_iq[i].i_freef = (inode_t *)&ufs_useful_iq[i]; 797 ufs_useful_iq[i].i_freeb = (inode_t *)&ufs_useful_iq[i]; 798 } 799 800 CALLB_CPR_INIT(&cprinfo, &ufs_idle_q.uq_mutex, callb_generic_cpr, 801 "ufsidle"); 802 again: 803 /* 804 * Whenever the idle thread is awakened, it repeatedly gives 805 * back half of the idle queue until the idle queue falls 806 * below lowat. 807 */ 808 mutex_enter(&ufs_idle_q.uq_mutex); 809 if (ufs_idle_q.uq_ne < ufs_idle_q.uq_lowat) { 810 CALLB_CPR_SAFE_BEGIN(&cprinfo); 811 cv_wait(&ufs_idle_q.uq_cv, &ufs_idle_q.uq_mutex); 812 CALLB_CPR_SAFE_END(&cprinfo, &ufs_idle_q.uq_mutex); 813 } 814 mutex_exit(&ufs_idle_q.uq_mutex); 815 816 /* 817 * Give back 1/2 of the idle queue 818 */ 819 ne = ufs_idle_q.uq_ne >> 1; 820 ins.in_tidles.value.ul += ne; 821 ufs_idle_some(ne); 822 goto again; 823 } 824 825 /* 826 * Reclaim callback for ufs inode cache. 827 * Invoked by the kernel memory allocator when memory gets tight. 828 */ 829 /*ARGSUSED*/ 830 void 831 ufs_inode_cache_reclaim(void *cdrarg) 832 { 833 /* 834 * If we are low on memory and the idle queue is over its 835 * halfway mark, then free 50% of the idle q 836 * 837 * We don't free all of the idle inodes because the inodes 838 * for popular NFS files may have been kicked from the dnlc. 839 * The inodes for these files will end up on the idle queue 840 * after every NFS access. 841 * 842 * If we repeatedly push them from the idle queue then 843 * NFS users may be unhappy as an extra buf cache operation 844 * is incurred for every NFS operation to these files. 845 * 846 * It's not common, but I have seen it happen. 847 * 848 */ 849 if (ufs_idle_q.uq_ne < (ufs_idle_q.uq_lowat >> 1)) 850 return; 851 mutex_enter(&ufs_idle_q.uq_mutex); 852 cv_broadcast(&ufs_idle_q.uq_cv); 853 mutex_exit(&ufs_idle_q.uq_mutex); 854 } 855 856 /* 857 * Free up some idle inodes 858 */ 859 void 860 ufs_idle_some(int ne) 861 { 862 int i; 863 struct inode *ip; 864 struct vnode *vp; 865 static int junk_rotor = 0; 866 static int useful_rotor = 0; 867 868 for (i = 0; i < ne; ++i) { 869 mutex_enter(&ufs_idle_q.uq_mutex); 870 871 if (ufs_njunk_iq) { 872 while (ufs_junk_iq[junk_rotor].i_freef == 873 (inode_t *)&ufs_junk_iq[junk_rotor]) { 874 junk_rotor = IQNEXT(junk_rotor); 875 } 876 ip = ufs_junk_iq[junk_rotor].i_freef; 877 ASSERT(ip->i_flag & IJUNKIQ); 878 } else if (ufs_nuseful_iq) { 879 while (ufs_useful_iq[useful_rotor].i_freef == 880 (inode_t *)&ufs_useful_iq[useful_rotor]) { 881 useful_rotor = IQNEXT(useful_rotor); 882 } 883 ip = ufs_useful_iq[useful_rotor].i_freef; 884 ASSERT(!(ip->i_flag & IJUNKIQ)); 885 } else { 886 mutex_exit(&ufs_idle_q.uq_mutex); 887 return; 888 } 889 890 /* 891 * emulate ufs_iget 892 */ 893 vp = ITOV(ip); 894 VN_HOLD(vp); 895 mutex_exit(&ufs_idle_q.uq_mutex); 896 rw_enter(&ip->i_contents, RW_WRITER); 897 /* 898 * VN_RELE should not be called if 899 * ufs_rmidle returns true, as it will 900 * effectively be done in ufs_idle_free. 901 */ 902 if (ufs_rmidle(ip)) { 903 rw_exit(&ip->i_contents); 904 ufs_idle_free(ip); 905 } else { 906 rw_exit(&ip->i_contents); 907 VN_RELE(vp); 908 } 909 } 910 } 911 912 /* 913 * drain entries for vfsp from the idle queue 914 * vfsp == NULL means drain the entire thing 915 */ 916 void 917 ufs_idle_drain(struct vfs *vfsp) 918 { 919 struct inode *ip, *nip; 920 struct inode *ianchor = NULL; 921 int i; 922 923 mutex_enter(&ufs_idle_q.uq_mutex); 924 if (ufs_njunk_iq) { 925 /* for each hash q */ 926 for (i = 0; i < ufs_niqhash; i++) { 927 /* search down the hash q */ 928 for (ip = ufs_junk_iq[i].i_freef; 929 ip != (inode_t *)&ufs_junk_iq[i]; 930 ip = ip->i_freef) { 931 if (ip->i_vfs == vfsp || vfsp == NULL) { 932 /* found a matching entry */ 933 VN_HOLD(ITOV(ip)); 934 mutex_exit(&ufs_idle_q.uq_mutex); 935 rw_enter(&ip->i_contents, RW_WRITER); 936 /* 937 * See comments in ufs_idle_some() 938 * as we will call ufs_idle_free() 939 * after scanning both queues. 940 */ 941 if (ufs_rmidle(ip)) { 942 rw_exit(&ip->i_contents); 943 ip->i_freef = ianchor; 944 ianchor = ip; 945 } else { 946 rw_exit(&ip->i_contents); 947 VN_RELE(ITOV(ip)); 948 } 949 /* restart this hash q */ 950 ip = (inode_t *)&ufs_junk_iq[i]; 951 mutex_enter(&ufs_idle_q.uq_mutex); 952 } 953 } 954 } 955 } 956 if (ufs_nuseful_iq) { 957 /* for each hash q */ 958 for (i = 0; i < ufs_niqhash; i++) { 959 /* search down the hash q */ 960 for (ip = ufs_useful_iq[i].i_freef; 961 ip != (inode_t *)&ufs_useful_iq[i]; 962 ip = ip->i_freef) { 963 if (ip->i_vfs == vfsp || vfsp == NULL) { 964 /* found a matching entry */ 965 VN_HOLD(ITOV(ip)); 966 mutex_exit(&ufs_idle_q.uq_mutex); 967 rw_enter(&ip->i_contents, RW_WRITER); 968 /* 969 * See comments in ufs_idle_some() 970 * as we will call ufs_idle_free() 971 * after scanning both queues. 972 */ 973 if (ufs_rmidle(ip)) { 974 rw_exit(&ip->i_contents); 975 ip->i_freef = ianchor; 976 ianchor = ip; 977 } else { 978 rw_exit(&ip->i_contents); 979 VN_RELE(ITOV(ip)); 980 } 981 /* restart this hash q */ 982 ip = (inode_t *)&ufs_useful_iq[i]; 983 mutex_enter(&ufs_idle_q.uq_mutex); 984 } 985 } 986 } 987 } 988 989 mutex_exit(&ufs_idle_q.uq_mutex); 990 /* no more matching entries, release those we have found (if any) */ 991 for (ip = ianchor; ip; ip = nip) { 992 nip = ip->i_freef; 993 ip->i_freef = ip; 994 ufs_idle_free(ip); 995 } 996 } 997 998 /* 999 * RECLAIM DELETED INODES 1000 * The following thread scans the file system once looking for deleted files 1001 */ 1002 void 1003 ufs_thread_reclaim(struct vfs *vfsp) 1004 { 1005 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 1006 struct ufs_q *uq = &ufsvfsp->vfs_reclaim; 1007 struct fs *fs = ufsvfsp->vfs_fs; 1008 struct buf *bp = 0; 1009 int err = 0; 1010 daddr_t bno; 1011 ino_t ino; 1012 struct dinode *dp; 1013 struct inode *ip; 1014 callb_cpr_t cprinfo; 1015 1016 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr, 1017 "ufsreclaim"); 1018 1019 /* 1020 * mount decided that we don't need a reclaim thread 1021 */ 1022 if ((fs->fs_reclaim & FS_RECLAIMING) == 0) 1023 err++; 1024 1025 /* 1026 * don't reclaim if readonly 1027 */ 1028 if (fs->fs_ronly) 1029 err++; 1030 1031 for (ino = 0; ino < (fs->fs_ncg * fs->fs_ipg) && !err; ++ino) { 1032 1033 /* 1034 * Check whether we are the target of another 1035 * thread having called ufs_thread_exit() or 1036 * ufs_thread_suspend(). 1037 */ 1038 mutex_enter(&uq->uq_mutex); 1039 again: 1040 if (uq->uq_flags & UQ_EXIT) { 1041 err++; 1042 mutex_exit(&uq->uq_mutex); 1043 break; 1044 } else if (uq->uq_flags & UQ_SUSPEND) { 1045 uq->uq_flags |= UQ_SUSPENDED; 1046 /* 1047 * Release the buf before we cv_wait() 1048 * otherwise we may deadlock with the 1049 * thread that called ufs_thread_suspend(). 1050 */ 1051 if (bp) { 1052 brelse(bp); 1053 bp = 0; 1054 } 1055 if (uq->uq_flags & UQ_WAIT) { 1056 uq->uq_flags &= ~UQ_WAIT; 1057 cv_broadcast(&uq->uq_cv); 1058 } 1059 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1060 cv_wait(&uq->uq_cv, &uq->uq_mutex); 1061 CALLB_CPR_SAFE_END(&cprinfo, &uq->uq_mutex); 1062 goto again; 1063 } 1064 mutex_exit(&uq->uq_mutex); 1065 1066 /* 1067 * if we don't already have the buf; get it 1068 */ 1069 bno = fsbtodb(fs, itod(fs, ino)); 1070 if ((bp == 0) || (bp->b_blkno != bno)) { 1071 if (bp) 1072 brelse(bp); 1073 bp = UFS_BREAD(ufsvfsp, 1074 ufsvfsp->vfs_dev, bno, fs->fs_bsize); 1075 bp->b_flags |= B_AGE; 1076 } 1077 if (bp->b_flags & B_ERROR) { 1078 err++; 1079 continue; 1080 } 1081 /* 1082 * nlink <= 0 and mode != 0 means deleted 1083 */ 1084 dp = (struct dinode *)bp->b_un.b_addr + itoo(fs, ino); 1085 if ((dp->di_nlink <= 0) && (dp->di_mode != 0)) { 1086 /* 1087 * can't hold the buf (deadlock) 1088 */ 1089 brelse(bp); 1090 bp = 0; 1091 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1092 /* 1093 * iget/iput sequence will put inode on ifree 1094 * thread queue if it is idle. This is a nop 1095 * for busy (open, deleted) inodes 1096 */ 1097 if (ufs_iget(vfsp, ino, &ip, CRED())) 1098 err++; 1099 else 1100 VN_RELE(ITOV(ip)); 1101 rw_exit(&ufsvfsp->vfs_dqrwlock); 1102 } 1103 } 1104 1105 if (bp) 1106 brelse(bp); 1107 if (!err) { 1108 /* 1109 * reset the reclaiming-bit 1110 */ 1111 mutex_enter(&ufsvfsp->vfs_lock); 1112 fs->fs_reclaim &= ~FS_RECLAIMING; 1113 mutex_exit(&ufsvfsp->vfs_lock); 1114 TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_RECLAIM); 1115 } 1116 1117 /* 1118 * exit the reclaim thread 1119 */ 1120 mutex_enter(&uq->uq_mutex); 1121 uq->uq_threadp = NULL; 1122 uq->uq_flags &= ~UQ_WAIT; 1123 cv_broadcast(&uq->uq_cv); 1124 CALLB_CPR_EXIT(&cprinfo); 1125 thread_exit(); 1126 } 1127 /* 1128 * HLOCK FILE SYSTEM 1129 * hlock the file system's whose logs have device errors 1130 */ 1131 struct ufs_q ufs_hlock; 1132 /*ARGSUSED*/ 1133 void 1134 ufs_thread_hlock(void *ignore) 1135 { 1136 int retry; 1137 callb_cpr_t cprinfo; 1138 1139 CALLB_CPR_INIT(&cprinfo, &ufs_hlock.uq_mutex, callb_generic_cpr, 1140 "ufshlock"); 1141 1142 for (;;) { 1143 /* 1144 * sleep until there is work to do 1145 */ 1146 mutex_enter(&ufs_hlock.uq_mutex); 1147 (void) ufs_thread_run(&ufs_hlock, &cprinfo); 1148 ufs_hlock.uq_ne = 0; 1149 mutex_exit(&ufs_hlock.uq_mutex); 1150 /* 1151 * hlock the error'ed fs's 1152 * retry after a bit if another app is doing lockfs stuff 1153 */ 1154 do { 1155 retry = ufs_trans_hlock(); 1156 if (retry) { 1157 mutex_enter(&ufs_hlock.uq_mutex); 1158 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1159 (void) cv_reltimedwait(&ufs_hlock.uq_cv, 1160 &ufs_hlock.uq_mutex, hz, TR_CLOCK_TICK); 1161 CALLB_CPR_SAFE_END(&cprinfo, 1162 &ufs_hlock.uq_mutex); 1163 mutex_exit(&ufs_hlock.uq_mutex); 1164 } 1165 } while (retry); 1166 } 1167 } 1168 1169 static void 1170 ufs_attr_purge(struct inode *dp) 1171 { 1172 int err; 1173 int error; 1174 off_t dirsize; /* size of the directory */ 1175 off_t offset; /* offset in the directory */ 1176 int entryoffsetinblk; /* offset of ep in fbp's buffer */ 1177 struct inode *tp; 1178 struct fbuf *fbp; /* pointer to directory block */ 1179 struct direct *ep; /* directory entry */ 1180 int trans_size; 1181 int issync; 1182 struct ufsvfs *ufsvfsp = dp->i_ufsvfs; 1183 1184 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1185 1186 fbp = NULL; 1187 dirsize = roundup(dp->i_size, DIRBLKSIZ); 1188 offset = 0; 1189 entryoffsetinblk = 0; 1190 1191 /* 1192 * Purge directory cache 1193 */ 1194 1195 dnlc_dir_purge(&dp->i_danchor); 1196 1197 while (offset < dirsize) { 1198 /* 1199 * If offset is on a block boundary, 1200 * read the next directory block. 1201 * Release previous if it exists. 1202 */ 1203 if (blkoff(dp->i_fs, offset) == 0) { 1204 if (fbp != NULL) { 1205 fbrelse(fbp, S_OTHER); 1206 } 1207 1208 err = blkatoff(dp, offset, (char **)0, &fbp); 1209 if (err) { 1210 goto out; 1211 } 1212 entryoffsetinblk = 0; 1213 } 1214 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk); 1215 if (ep->d_ino == 0 || (ep->d_name[0] == '.' && 1216 ep->d_name[1] == '\0') || 1217 (ep->d_name[0] == '.' && ep->d_name[1] == '.' && 1218 ep->d_name[2] == '\0')) { 1219 1220 entryoffsetinblk += ep->d_reclen; 1221 1222 } else { 1223 1224 if ((err = ufs_iget(dp->i_vfs, ep->d_ino, 1225 &tp, CRED())) != 0) { 1226 goto out; 1227 } 1228 1229 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 1230 trans_size = (int)TOP_REMOVE_SIZE(tp)); 1231 1232 /* 1233 * Delete inode. 1234 */ 1235 1236 dnlc_remove(ITOV(dp), ep->d_name); 1237 1238 rw_enter(&tp->i_contents, RW_WRITER); 1239 tp->i_flag |= ICHG; 1240 tp->i_seq++; 1241 TRANS_INODE(tp->i_ufsvfs, tp); 1242 tp->i_nlink--; 1243 ufs_setreclaim(tp); 1244 ITIMES_NOLOCK(tp); 1245 rw_exit(&tp->i_contents); 1246 1247 VN_RELE(ITOV(tp)); 1248 entryoffsetinblk += ep->d_reclen; 1249 TRANS_END_CSYNC(ufsvfsp, error, 1250 issync, TOP_REMOVE, trans_size); 1251 1252 } 1253 offset += ep->d_reclen; 1254 } 1255 1256 if (fbp) { 1257 fbrelse(fbp, S_OTHER); 1258 } 1259 1260 out: 1261 rw_exit(&ufsvfsp->vfs_dqrwlock); 1262 } 1263