1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 #pragma ident "%Z%%M% %I% %E% SMI" 35 36 #include <sys/types.h> 37 #include <sys/systm.h> 38 #include <sys/errno.h> 39 #include <sys/kmem.h> 40 #include <sys/buf.h> 41 #include <sys/vnode.h> 42 #include <sys/vfs.h> 43 #include <sys/user.h> 44 #include <sys/callb.h> 45 #include <sys/cpuvar.h> 46 #include <sys/fs/ufs_inode.h> 47 #include <sys/fs/ufs_log.h> 48 #include <sys/fs/ufs_trans.h> 49 #include <sys/fs/ufs_acl.h> 50 #include <sys/fs/ufs_bio.h> 51 #include <sys/fs/ufs_fsdir.h> 52 #include <sys/debug.h> 53 #include <sys/cmn_err.h> 54 #include <sys/sysmacros.h> 55 56 extern pri_t minclsyspri; 57 extern int hash2ints(); 58 extern struct kmem_cache *inode_cache; /* cache of free inodes */ 59 extern int ufs_idle_waiters; 60 extern struct instats ins; 61 62 static void ufs_attr_purge(struct inode *); 63 64 /* 65 * initialize a thread's queue struct 66 */ 67 void 68 ufs_thread_init(struct ufs_q *uq, int lowat) 69 { 70 bzero((caddr_t)uq, sizeof (*uq)); 71 cv_init(&uq->uq_cv, NULL, CV_DEFAULT, NULL); 72 mutex_init(&uq->uq_mutex, NULL, MUTEX_DEFAULT, NULL); 73 uq->uq_lowat = lowat; 74 uq->uq_hiwat = 2 * lowat; 75 uq->uq_threadp = NULL; 76 } 77 78 /* 79 * start a thread for a queue (assumes success) 80 */ 81 void 82 ufs_thread_start(struct ufs_q *uq, void (*func)(), struct vfs *vfsp) 83 { 84 mutex_enter(&uq->uq_mutex); 85 if (uq->uq_threadp == NULL) { 86 uq->uq_threadp = thread_create(NULL, 0, func, vfsp, 0, &p0, 87 TS_RUN, minclsyspri); 88 uq->uq_flags = 0; 89 } 90 mutex_exit(&uq->uq_mutex); 91 } 92 93 /* 94 * wait for the thread to exit 95 */ 96 void 97 ufs_thread_exit(struct ufs_q *uq) 98 { 99 kt_did_t ufs_thread_did = 0; 100 101 mutex_enter(&uq->uq_mutex); 102 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED); 103 if (uq->uq_threadp != NULL) { 104 ufs_thread_did = uq->uq_threadp->t_did; 105 uq->uq_flags |= (UQ_EXIT|UQ_WAIT); 106 cv_broadcast(&uq->uq_cv); 107 } 108 mutex_exit(&uq->uq_mutex); 109 110 /* 111 * It's safe to call thread_join() with an already-gone 112 * t_did, but we have to obtain it before the kernel 113 * thread structure is freed. We do so above under the 114 * protection of the uq_mutex when we're sure the thread 115 * still exists and it's save to de-reference it. 116 * We also have to check if ufs_thread_did is != 0 117 * before calling thread_join() since thread 0 in the system 118 * gets a t_did of 0. 119 */ 120 if (ufs_thread_did) 121 thread_join(ufs_thread_did); 122 } 123 124 /* 125 * wait for a thread to suspend itself on the caller's behalf 126 * the caller is responsible for continuing the thread 127 */ 128 void 129 ufs_thread_suspend(struct ufs_q *uq) 130 { 131 mutex_enter(&uq->uq_mutex); 132 if (uq->uq_threadp != NULL) { 133 /* 134 * wait while another thread is suspending this thread. 135 * no need to do a cv_broadcast(), as whoever suspended 136 * the thread must continue it at some point. 137 */ 138 while ((uq->uq_flags & UQ_SUSPEND) && 139 (uq->uq_threadp != NULL)) { 140 /* 141 * We can't use cv_signal() because if our 142 * signal doesn't happen to hit the desired 143 * thread but instead some other waiter like 144 * ourselves, we'll wait forever for a 145 * response. Well, at least an indeterminate 146 * amount of time until we just happen to get 147 * lucky from whomever did get signalled doing 148 * a cv_signal() of their own. This is an 149 * unfortunate performance lossage. 150 */ 151 uq->uq_flags |= UQ_WAIT; 152 cv_wait(&uq->uq_cv, &uq->uq_mutex); 153 } 154 155 uq->uq_flags |= (UQ_SUSPEND | UQ_WAIT); 156 157 /* 158 * wait for the thread to suspend itself 159 */ 160 if ((uq->uq_flags & UQ_SUSPENDED) == 0 && 161 (uq->uq_threadp != NULL)) { 162 cv_broadcast(&uq->uq_cv); 163 } 164 165 while (((uq->uq_flags & UQ_SUSPENDED) == 0) && 166 (uq->uq_threadp != NULL)) { 167 cv_wait(&uq->uq_cv, &uq->uq_mutex); 168 } 169 } 170 mutex_exit(&uq->uq_mutex); 171 } 172 173 /* 174 * allow a thread to continue from a ufs_thread_suspend() 175 * This thread must be the same as the thread that called 176 * ufs_thread_suspend. 177 */ 178 void 179 ufs_thread_continue(struct ufs_q *uq) 180 { 181 mutex_enter(&uq->uq_mutex); 182 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED); 183 cv_broadcast(&uq->uq_cv); 184 mutex_exit(&uq->uq_mutex); 185 } 186 187 /* 188 * some common code for managing a threads execution 189 * uq is locked at entry and return 190 * may sleep 191 * may exit 192 */ 193 /* 194 * Kind of a hack passing in the callb_cpr_t * here. 195 * It should really be part of the ufs_q structure. 196 * I did not put it in there because we are already in beta 197 * and I was concerned that changing ufs_inode.h to include 198 * callb.h might break something. 199 */ 200 int 201 ufs_thread_run(struct ufs_q *uq, callb_cpr_t *cprinfop) 202 { 203 again: 204 ASSERT(uq->uq_ne >= 0); 205 206 if (uq->uq_flags & UQ_SUSPEND) { 207 uq->uq_flags |= UQ_SUSPENDED; 208 } else if (uq->uq_flags & UQ_EXIT) { 209 /* 210 * exiting; empty the queue (may infinite loop) 211 */ 212 if (uq->uq_ne) 213 return (uq->uq_ne); 214 uq->uq_threadp = NULL; 215 if (uq->uq_flags & UQ_WAIT) { 216 cv_broadcast(&uq->uq_cv); 217 } 218 uq->uq_flags &= ~(UQ_EXIT | UQ_WAIT); 219 CALLB_CPR_EXIT(cprinfop); 220 thread_exit(); 221 } else if (uq->uq_ne >= uq->uq_lowat) { 222 /* 223 * process a block of entries until below high water mark 224 */ 225 return (uq->uq_ne - (uq->uq_lowat >> 1)); 226 } 227 if (uq->uq_flags & UQ_WAIT) { 228 uq->uq_flags &= ~UQ_WAIT; 229 cv_broadcast(&uq->uq_cv); 230 } 231 CALLB_CPR_SAFE_BEGIN(cprinfop); 232 cv_wait(&uq->uq_cv, &uq->uq_mutex); 233 CALLB_CPR_SAFE_END(cprinfop, &uq->uq_mutex); 234 goto again; 235 } 236 237 /* 238 * DELETE INODE 239 * The following routines implement the protocol for freeing the resources 240 * held by an idle and deleted inode. 241 */ 242 void 243 ufs_delete(struct ufsvfs *ufsvfsp, struct inode *ip, int dolockfs) 244 { 245 ushort_t mode; 246 struct vnode *vp = ITOV(ip); 247 struct ulockfs *ulp; 248 int trans_size; 249 int dorwlock = ((ip->i_mode & IFMT) == IFREG); 250 int issync; 251 int err; 252 struct inode *dp; 253 struct ufs_q *delq = &ufsvfsp->vfs_delete; 254 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 255 256 /* 257 * Ignore if deletes are not allowed (wlock/hlock) 258 */ 259 if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) { 260 mutex_enter(&delq->uq_mutex); 261 delq_info->delq_unreclaimed_blocks -= ip->i_blocks; 262 delq_info->delq_unreclaimed_files--; 263 mutex_exit(&delq->uq_mutex); 264 VN_RELE(vp); 265 return; 266 } 267 268 if ((vp->v_count > 1) || (ip->i_mode == 0)) { 269 mutex_enter(&delq->uq_mutex); 270 delq_info->delq_unreclaimed_blocks -= ip->i_blocks; 271 delq_info->delq_unreclaimed_files--; 272 mutex_exit(&delq->uq_mutex); 273 VN_RELE(vp); 274 return; 275 } 276 /* 277 * If we are called as part of setting a fs lock, then only 278 * do part of the lockfs protocol. In other words, don't hang. 279 */ 280 if (dolockfs) { 281 if (ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_DELETE_MASK)) 282 return; 283 } else { 284 /* 285 * check for recursive VOP call 286 */ 287 if (curthread->t_flag & T_DONTBLOCK) { 288 ulp = NULL; 289 } else { 290 ulp = &ufsvfsp->vfs_ulockfs; 291 curthread->t_flag |= T_DONTBLOCK; 292 } 293 } 294 295 /* 296 * Hold rwlock to synchronize with (nfs) writes 297 */ 298 if (dorwlock) 299 rw_enter(&ip->i_rwlock, RW_WRITER); 300 301 /* 302 * Delete the attribute directory. 303 */ 304 if (ip->i_oeftflag != 0) { 305 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 306 trans_size = (int)TOP_REMOVE_SIZE(ip)); 307 rw_enter(&ip->i_contents, RW_WRITER); 308 err = ufs_iget(ip->i_vfs, ip->i_oeftflag, 309 &dp, CRED()); 310 if (err == 0) { 311 rw_enter(&dp->i_rwlock, RW_WRITER); 312 rw_enter(&dp->i_contents, RW_WRITER); 313 dp->i_flag |= IUPD|ICHG; 314 dp->i_seq++; 315 TRANS_INODE(dp->i_ufsvfs, dp); 316 dp->i_nlink -= 2; 317 ufs_setreclaim(dp); 318 /* 319 * Should get rid of any negative cache entries that 320 * might be lingering, as well as ``.'' and 321 * ``..''. If we don't, the VN_RELE() below 322 * won't actually put dp on the delete queue 323 * and it'll hang out until someone forces it 324 * (lockfs -f, umount, ...). The only reliable 325 * way of doing this at the moment is to call 326 * dnlc_purge_vp(ITOV(dp)), which is unacceptably 327 * slow, so we'll just note the problem in this 328 * comment for now. 329 */ 330 dnlc_remove(ITOV(dp), "."); 331 dnlc_remove(ITOV(dp), ".."); 332 ITIMES_NOLOCK(dp); 333 if (!TRANS_ISTRANS(ufsvfsp)) { 334 ufs_iupdat(dp, I_SYNC); 335 } 336 rw_exit(&dp->i_contents); 337 rw_exit(&dp->i_rwlock); 338 VN_RELE(ITOV(dp)); 339 } 340 /* 341 * Clear out attribute pointer 342 */ 343 ip->i_oeftflag = 0; 344 rw_exit(&ip->i_contents); 345 TRANS_END_CSYNC(ufsvfsp, err, issync, 346 TOP_REMOVE, trans_size); 347 dnlc_remove(ITOV(ip), XATTR_DIR_NAME); 348 } 349 350 if ((ip->i_mode & IFMT) == IFATTRDIR) { 351 ufs_attr_purge(ip); 352 } 353 354 (void) TRANS_ITRUNC(ip, (u_offset_t)0, I_FREE | I_ACCT, CRED()); 355 356 /* 357 * the inode's space has been freed; now free the inode 358 */ 359 if (ulp) { 360 trans_size = TOP_IFREE_SIZE(ip); 361 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IFREE, trans_size); 362 } 363 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 364 rw_enter(&ip->i_contents, RW_WRITER); 365 TRANS_INODE(ufsvfsp, ip); 366 mode = ip->i_mode; 367 ip->i_mode = 0; 368 ip->i_rdev = 0; 369 ip->i_ordev = 0; 370 ip->i_flag |= IMOD; 371 if (ip->i_ufs_acl) { 372 (void) ufs_si_free(ip->i_ufs_acl, vp->v_vfsp, CRED()); 373 ip->i_ufs_acl = NULL; 374 ip->i_shadow = 0; 375 } 376 377 /* 378 * This inode is torn down but still retains it's identity 379 * (inode number). It could get recycled soon so it's best 380 * to clean up the vnode just in case. 381 */ 382 mutex_enter(&vp->v_lock); 383 vn_recycle(vp); 384 mutex_exit(&vp->v_lock); 385 386 /* 387 * free the inode 388 */ 389 ufs_ifree(ip, ip->i_number, mode); 390 /* 391 * release quota resources; can't fail 392 */ 393 (void) chkiq((struct ufsvfs *)vp->v_vfsp->vfs_data, 394 /* change */ -1, ip, (uid_t)ip->i_uid, 0, CRED(), 395 (char **)NULL, (size_t *)NULL); 396 dqrele(ip->i_dquot); 397 ip->i_dquot = NULL; 398 ip->i_flag &= ~(IDEL | IDIRECTIO); 399 ip->i_cflags = 0; 400 if (!TRANS_ISTRANS(ufsvfsp)) { 401 ufs_iupdat(ip, I_SYNC); 402 } else { 403 mutex_enter(&delq->uq_mutex); 404 delq_info->delq_unreclaimed_files--; 405 mutex_exit(&delq->uq_mutex); 406 } 407 rw_exit(&ip->i_contents); 408 rw_exit(&ufsvfsp->vfs_dqrwlock); 409 if (dorwlock) 410 rw_exit(&ip->i_rwlock); 411 VN_RELE(vp); 412 413 /* 414 * End of transaction 415 */ 416 if (ulp) { 417 TRANS_END_ASYNC(ufsvfsp, TOP_IFREE, trans_size); 418 if (dolockfs) 419 ufs_lockfs_end(ulp); 420 else 421 curthread->t_flag &= ~T_DONTBLOCK; 422 } 423 } 424 425 /* 426 * Create the delete thread and init the delq_info for this fs 427 */ 428 void 429 ufs_delete_init(struct ufsvfs *ufsvfsp, int lowat) 430 { 431 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 432 433 ufs_thread_init(&ufsvfsp->vfs_delete, lowat); 434 (void) memset((void *)delq_info, 0, sizeof (*delq_info)); 435 } 436 437 /* 438 * thread that frees up deleted inodes 439 */ 440 void 441 ufs_thread_delete(struct vfs *vfsp) 442 { 443 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 444 struct ufs_q *uq = &ufsvfsp->vfs_delete; 445 struct inode *ip; 446 long ne; 447 callb_cpr_t cprinfo; 448 449 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr, 450 "ufsdelete"); 451 452 mutex_enter(&uq->uq_mutex); 453 again: 454 /* 455 * Sleep until there is work to do. Only do one entry at 456 * a time, to reduce the wait time for checking for a suspend 457 * request. The ?: is for pedantic portability. 458 */ 459 ne = ufs_thread_run(uq, &cprinfo) ? 1 : 0; 460 461 /* 462 * process an entry, if there are any 463 */ 464 if (ne && (ip = uq->uq_ihead)) { 465 /* 466 * process first entry on queue. Assumed conditions are: 467 * ip is held (v_count >= 1) 468 * ip is referenced (i_flag & IREF) 469 * ip is free (i_nlink <= 0) 470 */ 471 if ((uq->uq_ihead = ip->i_freef) == ip) 472 uq->uq_ihead = NULL; 473 ip->i_freef->i_freeb = ip->i_freeb; 474 ip->i_freeb->i_freef = ip->i_freef; 475 ip->i_freef = ip; 476 ip->i_freeb = ip; 477 uq->uq_ne--; 478 mutex_exit(&uq->uq_mutex); 479 ufs_delete(ufsvfsp, ip, 1); 480 mutex_enter(&uq->uq_mutex); 481 } 482 goto again; 483 } 484 485 /* 486 * drain ne entries off the delete queue. As new queue entries may 487 * be added while we're working, ne is interpreted as follows: 488 * 489 * ne > 0 => remove up to ne entries 490 * ne == 0 => remove all entries currently on the queue 491 * ne == -1 => remove entries until the queue is empty 492 */ 493 void 494 ufs_delete_drain(struct vfs *vfsp, int ne, int dolockfs) 495 { 496 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 497 struct ufs_q *uq; 498 struct inode *ip; 499 int drain_cnt = 0; 500 int done; 501 502 /* 503 * if forcibly unmounted; ignore 504 */ 505 if (ufsvfsp == NULL) 506 return; 507 508 uq = &ufsvfsp->vfs_delete; 509 mutex_enter(&uq->uq_mutex); 510 if (ne == 0) 511 drain_cnt = uq->uq_ne; 512 else if (ne > 0) 513 drain_cnt = ne; 514 515 /* 516 * process up to ne entries 517 */ 518 519 done = 0; 520 while (!done && (ip = uq->uq_ihead)) { 521 if (ne != -1) 522 drain_cnt--; 523 if (ne != -1 && drain_cnt == 0) 524 done = 1; 525 if ((uq->uq_ihead = ip->i_freef) == ip) 526 uq->uq_ihead = NULL; 527 ip->i_freef->i_freeb = ip->i_freeb; 528 ip->i_freeb->i_freef = ip->i_freef; 529 ip->i_freef = ip; 530 ip->i_freeb = ip; 531 uq->uq_ne--; 532 mutex_exit(&uq->uq_mutex); 533 ufs_delete(ufsvfsp, ip, dolockfs); 534 mutex_enter(&uq->uq_mutex); 535 } 536 mutex_exit(&uq->uq_mutex); 537 } 538 539 void 540 ufs_sync_with_thread(struct ufs_q *uq) 541 { 542 mutex_enter(&uq->uq_mutex); 543 544 /* 545 * Wake up delete thread to free up space. 546 */ 547 if ((uq->uq_flags & UQ_WAIT) == 0) { 548 uq->uq_flags |= UQ_WAIT; 549 cv_broadcast(&uq->uq_cv); 550 } 551 552 while ((uq->uq_threadp != NULL) && (uq->uq_flags & UQ_WAIT)) { 553 cv_wait(&uq->uq_cv, &uq->uq_mutex); 554 } 555 556 mutex_exit(&uq->uq_mutex); 557 } 558 559 /* 560 * Get rid of everything that's currently in the delete queue, 561 * plus whatever the delete thread is working on at the moment. 562 * 563 * This ability is required for providing true POSIX semantics 564 * regarding close(2), unlink(2), etc, even when logging is enabled. 565 * The standard requires that the released space be immediately 566 * observable (statvfs(2)) and allocatable (e.g., write(2)). 567 */ 568 void 569 ufs_delete_drain_wait(struct ufsvfs *ufsvfsp, int dolockfs) 570 { 571 struct ufs_q *uq = &ufsvfsp->vfs_delete; 572 int error; 573 struct ufs_q *delq = &ufsvfsp->vfs_delete; 574 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 575 576 /* 577 * If there is something on delq or delete thread 578 * working on delq. 579 */ 580 mutex_enter(&delq->uq_mutex); 581 if (delq_info->delq_unreclaimed_files > 0) { 582 mutex_exit(&delq->uq_mutex); 583 (void) ufs_delete_drain(ufsvfsp->vfs_vfs, 0, dolockfs); 584 ufs_sync_with_thread(uq); 585 } else { 586 ASSERT(delq_info->delq_unreclaimed_files == 0); 587 mutex_exit(&delq->uq_mutex); 588 return; 589 } 590 591 /* 592 * Commit any outstanding transactions to make sure 593 * any canceled freed blocks are available for allocation. 594 */ 595 curthread->t_flag |= T_DONTBLOCK; 596 TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE, error); 597 if (!error) { 598 TRANS_END_SYNC(ufsvfsp, error, TOP_COMMIT_UPDATE, 599 TOP_COMMIT_SIZE); 600 } 601 curthread->t_flag &= ~T_DONTBLOCK; 602 } 603 604 /* 605 * Adjust the resource usage in a struct statvfs based on 606 * what's in the delete queue. 607 * 608 * We do not consider the impact of ACLs or extended attributes 609 * that may be deleted as a side-effect of deleting a file. 610 * Those are metadata, and their sizes aren't reflected in the 611 * sizes returned by stat(), so this is not a problem. 612 */ 613 void 614 ufs_delete_adjust_stats(struct ufsvfs *ufsvfsp, struct statvfs64 *sp) 615 { 616 struct ufs_q *uq = &ufsvfsp->vfs_delete; 617 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 618 619 mutex_enter(&uq->uq_mutex); 620 /* 621 * The blocks accounted for in the delete queue info are 622 * counted in DEV_BSIZE chunks, but ufs_statvfs counts in 623 * filesystem fragments, so a conversion is required here. 624 */ 625 sp->f_bfree += dbtofsb(ufsvfsp->vfs_fs, 626 delq_info->delq_unreclaimed_blocks); 627 sp->f_ffree += delq_info->delq_unreclaimed_files; 628 mutex_exit(&uq->uq_mutex); 629 } 630 631 /* 632 * IDLE INODE 633 * The following routines implement the protocol for maintaining an 634 * LRU list of idle inodes and for moving the idle inodes to the 635 * reuse list when the number of allocated inodes exceeds the user 636 * tunable high-water mark (ufs_ninode). 637 */ 638 639 /* 640 * clean an idle inode and move it to the reuse list 641 */ 642 static void 643 ufs_idle_free(struct inode *ip) 644 { 645 int pages; 646 int hno; 647 kmutex_t *ihm; 648 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 649 struct vnode *vp = ITOV(ip); 650 651 /* 652 * inode is held 653 */ 654 655 /* 656 * remember `pages' for stats below 657 */ 658 pages = (ip->i_mode && vn_has_cached_data(vp) && vp->v_type != VCHR); 659 660 /* 661 * start the dirty pages to disk and then invalidate them 662 * unless the inode is invalid (ISTALE) 663 */ 664 if ((ip->i_flag & ISTALE) == 0) { 665 (void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_FREE); 666 (void) TRANS_SYNCIP(ip, 667 (TRANS_ISERROR(ufsvfsp)) ? B_INVAL | B_FORCE : B_INVAL, 668 I_ASYNC, TOP_SYNCIP_FREE); 669 } 670 671 /* 672 * wait for any current ufs_iget to finish and block future ufs_igets 673 */ 674 ASSERT(ip->i_number != 0); 675 hno = INOHASH(ip->i_number); 676 ihm = &ih_lock[hno]; 677 mutex_enter(ihm); 678 679 /* 680 * It must be guaranteed that v_count >= 2, otherwise 681 * something must be wrong with this vnode already. 682 * That is why we use v_count-- instead of VN_RELE(). 683 * Acquire the vnode lock in case another thread is in 684 * VN_RELE(). 685 */ 686 mutex_enter(&vp->v_lock); 687 688 if (vp->v_count < 2) 689 cmn_err(CE_PANIC, 690 "ufs_idle_free: vnode ref count is less than 2"); 691 692 vp->v_count--; 693 if ((vp->v_type != VCHR && vn_has_cached_data(vp)) || 694 vp->v_count != 1 || 695 ip->i_flag & (IMOD|IMODACC|IACC|ICHG|IUPD|IATTCHG)) { 696 /* 697 * Another thread has referenced this inode while 698 * we are trying to free it. Call VN_RELE() to 699 * release our reference. 700 */ 701 mutex_exit(&vp->v_lock); 702 mutex_exit(ihm); 703 VN_RELE(vp); 704 } else { 705 /* 706 * The inode is currently unreferenced and can not 707 * acquire further references because it has no pages 708 * and the hash is locked. Inodes acquire references 709 * via the hash list or via their pages. 710 */ 711 712 mutex_exit(&vp->v_lock); 713 714 /* 715 * remove it from the cache 716 */ 717 remque(ip); 718 mutex_exit(ihm); 719 /* 720 * Stale inodes have no valid ufsvfs 721 */ 722 if ((ip->i_flag & ISTALE) == 0 && ip->i_dquot) { 723 TRANS_DQRELE(ufsvfsp, ip->i_dquot); 724 ip->i_dquot = NULL; 725 } 726 ufs_si_del(ip); 727 if (pages) { 728 CPU_STATS_ADDQ(CPU, sys, ufsipage, 1); 729 } else { 730 CPU_STATS_ADDQ(CPU, sys, ufsinopage, 1); 731 } 732 ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp)); 733 734 /* 735 * We had better not have a vnode reference count > 1 736 * at this point, if we do then something is broken as 737 * this inode/vnode acquired a reference underneath of us. 738 */ 739 ASSERT(vp->v_count == 1); 740 741 ufs_free_inode(ip); 742 } 743 } 744 745 /* 746 * this thread processes the global idle queue 747 */ 748 iqhead_t *ufs_junk_iq; 749 iqhead_t *ufs_useful_iq; 750 int ufs_njunk_iq = 0; 751 int ufs_nuseful_iq = 0; 752 int ufs_niqhash; 753 int ufs_iqhashmask; 754 struct ufs_q ufs_idle_q; 755 756 void 757 ufs_thread_idle(void) 758 { 759 callb_cpr_t cprinfo; 760 int i; 761 int ne; 762 763 ufs_niqhash = (ufs_idle_q.uq_lowat >> 1) / IQHASHQLEN; 764 ufs_niqhash = 1 << highbit(ufs_niqhash); /* round up to power of 2 */ 765 ufs_iqhashmask = ufs_niqhash - 1; 766 ufs_junk_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_junk_iq), 767 KM_SLEEP); 768 ufs_useful_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_useful_iq), 769 KM_SLEEP); 770 771 /* Initialize hash queue headers */ 772 for (i = 0; i < ufs_niqhash; i++) { 773 ufs_junk_iq[i].i_freef = (inode_t *)&ufs_junk_iq[i]; 774 ufs_junk_iq[i].i_freeb = (inode_t *)&ufs_junk_iq[i]; 775 ufs_useful_iq[i].i_freef = (inode_t *)&ufs_useful_iq[i]; 776 ufs_useful_iq[i].i_freeb = (inode_t *)&ufs_useful_iq[i]; 777 } 778 779 CALLB_CPR_INIT(&cprinfo, &ufs_idle_q.uq_mutex, callb_generic_cpr, 780 "ufsidle"); 781 again: 782 /* 783 * Whenever the idle thread is awakened, it repeatedly gives 784 * back half of the idle queue until the idle queue falls 785 * below lowat. 786 */ 787 mutex_enter(&ufs_idle_q.uq_mutex); 788 if (ufs_idle_q.uq_ne < ufs_idle_q.uq_lowat) { 789 CALLB_CPR_SAFE_BEGIN(&cprinfo); 790 cv_wait(&ufs_idle_q.uq_cv, &ufs_idle_q.uq_mutex); 791 CALLB_CPR_SAFE_END(&cprinfo, &ufs_idle_q.uq_mutex); 792 } 793 mutex_exit(&ufs_idle_q.uq_mutex); 794 795 /* 796 * Give back 1/2 of the idle queue 797 */ 798 ne = ufs_idle_q.uq_ne >> 1; 799 ins.in_tidles.value.ul += ne; 800 ufs_idle_some(ne); 801 goto again; 802 } 803 804 /* 805 * Reclaim callback for ufs inode cache. 806 * Invoked by the kernel memory allocator when memory gets tight. 807 */ 808 /*ARGSUSED*/ 809 void 810 ufs_inode_cache_reclaim(void *cdrarg) 811 { 812 /* 813 * If we are low on memory and the idle queue is over its 814 * halfway mark, then free 50% of the idle q 815 * 816 * We don't free all of the idle inodes because the inodes 817 * for popular NFS files may have been kicked from the dnlc. 818 * The inodes for these files will end up on the idle queue 819 * after every NFS access. 820 * 821 * If we repeatedly push them from the idle queue then 822 * NFS users may be unhappy as an extra buf cache operation 823 * is incurred for every NFS operation to these files. 824 * 825 * It's not common, but I have seen it happen. 826 * 827 */ 828 if (ufs_idle_q.uq_ne < (ufs_idle_q.uq_lowat >> 1)) 829 return; 830 mutex_enter(&ufs_idle_q.uq_mutex); 831 cv_broadcast(&ufs_idle_q.uq_cv); 832 mutex_exit(&ufs_idle_q.uq_mutex); 833 } 834 835 /* 836 * Free up some idle inodes 837 */ 838 void 839 ufs_idle_some(int ne) 840 { 841 int i; 842 struct inode *ip; 843 struct vnode *vp; 844 static int junk_rotor = 0; 845 static int useful_rotor = 0; 846 847 for (i = 0; i < ne; ++i) { 848 mutex_enter(&ufs_idle_q.uq_mutex); 849 850 if (ufs_njunk_iq) { 851 while (ufs_junk_iq[junk_rotor].i_freef == 852 (inode_t *)&ufs_junk_iq[junk_rotor]) { 853 junk_rotor = IQNEXT(junk_rotor); 854 } 855 ip = ufs_junk_iq[junk_rotor].i_freef; 856 ASSERT(ip->i_flag & IJUNKIQ); 857 } else if (ufs_nuseful_iq) { 858 while (ufs_useful_iq[useful_rotor].i_freef == 859 (inode_t *)&ufs_useful_iq[useful_rotor]) { 860 useful_rotor = IQNEXT(useful_rotor); 861 } 862 ip = ufs_useful_iq[useful_rotor].i_freef; 863 ASSERT(!(ip->i_flag & IJUNKIQ)); 864 } else { 865 mutex_exit(&ufs_idle_q.uq_mutex); 866 return; 867 } 868 869 /* 870 * emulate ufs_iget 871 */ 872 vp = ITOV(ip); 873 VN_HOLD(vp); 874 mutex_exit(&ufs_idle_q.uq_mutex); 875 rw_enter(&ip->i_contents, RW_WRITER); 876 /* 877 * VN_RELE should not be called if 878 * ufs_rmidle returns true, as it will 879 * effectively be done in ufs_idle_free. 880 */ 881 if (ufs_rmidle(ip)) { 882 rw_exit(&ip->i_contents); 883 ufs_idle_free(ip); 884 } else { 885 rw_exit(&ip->i_contents); 886 VN_RELE(vp); 887 } 888 } 889 } 890 891 /* 892 * drain entries for vfsp from the idle queue 893 * vfsp == NULL means drain the entire thing 894 */ 895 void 896 ufs_idle_drain(struct vfs *vfsp) 897 { 898 struct inode *ip, *nip; 899 struct inode *ianchor = NULL; 900 int i; 901 902 mutex_enter(&ufs_idle_q.uq_mutex); 903 if (ufs_njunk_iq) { 904 /* for each hash q */ 905 for (i = 0; i < ufs_niqhash; i++) { 906 /* search down the hash q */ 907 for (ip = ufs_junk_iq[i].i_freef; 908 ip != (inode_t *)&ufs_junk_iq[i]; 909 ip = ip->i_freef) { 910 if (ip->i_vfs == vfsp || vfsp == NULL) { 911 /* found a matching entry */ 912 VN_HOLD(ITOV(ip)); 913 mutex_exit(&ufs_idle_q.uq_mutex); 914 rw_enter(&ip->i_contents, RW_WRITER); 915 /* 916 * See comments in ufs_idle_some() 917 * as we will call ufs_idle_free() 918 * after scanning both queues. 919 */ 920 if (ufs_rmidle(ip)) { 921 rw_exit(&ip->i_contents); 922 ip->i_freef = ianchor; 923 ianchor = ip; 924 } else { 925 rw_exit(&ip->i_contents); 926 VN_RELE(ITOV(ip)); 927 } 928 /* restart this hash q */ 929 ip = (inode_t *)&ufs_junk_iq[i]; 930 mutex_enter(&ufs_idle_q.uq_mutex); 931 } 932 } 933 } 934 } 935 if (ufs_nuseful_iq) { 936 /* for each hash q */ 937 for (i = 0; i < ufs_niqhash; i++) { 938 /* search down the hash q */ 939 for (ip = ufs_useful_iq[i].i_freef; 940 ip != (inode_t *)&ufs_useful_iq[i]; 941 ip = ip->i_freef) { 942 if (ip->i_vfs == vfsp || vfsp == NULL) { 943 /* found a matching entry */ 944 VN_HOLD(ITOV(ip)); 945 mutex_exit(&ufs_idle_q.uq_mutex); 946 rw_enter(&ip->i_contents, RW_WRITER); 947 /* 948 * See comments in ufs_idle_some() 949 * as we will call ufs_idle_free() 950 * after scanning both queues. 951 */ 952 if (ufs_rmidle(ip)) { 953 rw_exit(&ip->i_contents); 954 ip->i_freef = ianchor; 955 ianchor = ip; 956 } else { 957 rw_exit(&ip->i_contents); 958 VN_RELE(ITOV(ip)); 959 } 960 /* restart this hash q */ 961 ip = (inode_t *)&ufs_useful_iq[i]; 962 mutex_enter(&ufs_idle_q.uq_mutex); 963 } 964 } 965 } 966 } 967 968 mutex_exit(&ufs_idle_q.uq_mutex); 969 /* no more matching entries, release those we have found (if any) */ 970 for (ip = ianchor; ip; ip = nip) { 971 nip = ip->i_freef; 972 ip->i_freef = ip; 973 ufs_idle_free(ip); 974 } 975 } 976 977 /* 978 * RECLAIM DELETED INODES 979 * The following thread scans the file system once looking for deleted files 980 */ 981 void 982 ufs_thread_reclaim(struct vfs *vfsp) 983 { 984 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 985 struct ufs_q *uq = &ufsvfsp->vfs_reclaim; 986 struct fs *fs = ufsvfsp->vfs_fs; 987 struct buf *bp = 0; 988 int err = 0; 989 daddr_t bno; 990 ino_t ino; 991 struct dinode *dp; 992 struct inode *ip; 993 callb_cpr_t cprinfo; 994 995 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr, 996 "ufsreclaim"); 997 998 /* 999 * mount decided that we don't need a reclaim thread 1000 */ 1001 if ((fs->fs_reclaim & FS_RECLAIMING) == 0) 1002 err++; 1003 1004 /* 1005 * don't reclaim if readonly 1006 */ 1007 if (fs->fs_ronly) 1008 err++; 1009 1010 for (ino = 0; ino < (fs->fs_ncg * fs->fs_ipg) && !err; ++ino) { 1011 1012 /* 1013 * Check whether we are the target of another 1014 * thread having called ufs_thread_exit() or 1015 * ufs_thread_suspend(). 1016 */ 1017 mutex_enter(&uq->uq_mutex); 1018 again: 1019 if (uq->uq_flags & UQ_EXIT) { 1020 err++; 1021 mutex_exit(&uq->uq_mutex); 1022 break; 1023 } else if (uq->uq_flags & UQ_SUSPEND) { 1024 uq->uq_flags |= UQ_SUSPENDED; 1025 /* 1026 * Release the buf before we cv_wait() 1027 * otherwise we may deadlock with the 1028 * thread that called ufs_thread_suspend(). 1029 */ 1030 if (bp) { 1031 brelse(bp); 1032 bp = 0; 1033 } 1034 if (uq->uq_flags & UQ_WAIT) { 1035 uq->uq_flags &= ~UQ_WAIT; 1036 cv_broadcast(&uq->uq_cv); 1037 } 1038 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1039 cv_wait(&uq->uq_cv, &uq->uq_mutex); 1040 CALLB_CPR_SAFE_END(&cprinfo, &uq->uq_mutex); 1041 goto again; 1042 } 1043 mutex_exit(&uq->uq_mutex); 1044 1045 /* 1046 * if we don't already have the buf; get it 1047 */ 1048 bno = fsbtodb(fs, itod(fs, ino)); 1049 if ((bp == 0) || (bp->b_blkno != bno)) { 1050 if (bp) 1051 brelse(bp); 1052 bp = UFS_BREAD(ufsvfsp, 1053 ufsvfsp->vfs_dev, bno, fs->fs_bsize); 1054 bp->b_flags |= B_AGE; 1055 } 1056 if (bp->b_flags & B_ERROR) { 1057 err++; 1058 continue; 1059 } 1060 /* 1061 * nlink <= 0 and mode != 0 means deleted 1062 */ 1063 dp = (struct dinode *)bp->b_un.b_addr + itoo(fs, ino); 1064 if ((dp->di_nlink <= 0) && (dp->di_mode != 0)) { 1065 /* 1066 * can't hold the buf (deadlock) 1067 */ 1068 brelse(bp); 1069 bp = 0; 1070 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1071 /* 1072 * iget/iput sequence will put inode on ifree 1073 * thread queue if it is idle. This is a nop 1074 * for busy (open, deleted) inodes 1075 */ 1076 if (ufs_iget(vfsp, ino, &ip, CRED())) 1077 err++; 1078 else 1079 VN_RELE(ITOV(ip)); 1080 rw_exit(&ufsvfsp->vfs_dqrwlock); 1081 } 1082 } 1083 1084 if (bp) 1085 brelse(bp); 1086 if (!err) { 1087 /* 1088 * reset the reclaiming-bit 1089 */ 1090 mutex_enter(&ufsvfsp->vfs_lock); 1091 fs->fs_reclaim &= ~FS_RECLAIMING; 1092 mutex_exit(&ufsvfsp->vfs_lock); 1093 TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_RECLAIM); 1094 } 1095 1096 /* 1097 * exit the reclaim thread 1098 */ 1099 mutex_enter(&uq->uq_mutex); 1100 uq->uq_threadp = NULL; 1101 uq->uq_flags &= ~UQ_WAIT; 1102 cv_broadcast(&uq->uq_cv); 1103 CALLB_CPR_EXIT(&cprinfo); 1104 thread_exit(); 1105 } 1106 /* 1107 * HLOCK FILE SYSTEM 1108 * hlock the file system's whose logs have device errors 1109 */ 1110 struct ufs_q ufs_hlock; 1111 /*ARGSUSED*/ 1112 void 1113 ufs_thread_hlock(void *ignore) 1114 { 1115 int retry; 1116 callb_cpr_t cprinfo; 1117 1118 CALLB_CPR_INIT(&cprinfo, &ufs_hlock.uq_mutex, callb_generic_cpr, 1119 "ufshlock"); 1120 1121 for (;;) { 1122 /* 1123 * sleep until there is work to do 1124 */ 1125 mutex_enter(&ufs_hlock.uq_mutex); 1126 (void) ufs_thread_run(&ufs_hlock, &cprinfo); 1127 ufs_hlock.uq_ne = 0; 1128 mutex_exit(&ufs_hlock.uq_mutex); 1129 /* 1130 * hlock the error'ed fs's 1131 * retry after a bit if another app is doing lockfs stuff 1132 */ 1133 do { 1134 retry = ufs_trans_hlock(); 1135 if (retry) { 1136 mutex_enter(&ufs_hlock.uq_mutex); 1137 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1138 (void) cv_timedwait(&ufs_hlock.uq_cv, 1139 &ufs_hlock.uq_mutex, lbolt + hz); 1140 CALLB_CPR_SAFE_END(&cprinfo, 1141 &ufs_hlock.uq_mutex); 1142 mutex_exit(&ufs_hlock.uq_mutex); 1143 } 1144 } while (retry); 1145 } 1146 } 1147 1148 static void 1149 ufs_attr_purge(struct inode *dp) 1150 { 1151 int err; 1152 int error; 1153 off_t dirsize; /* size of the directory */ 1154 off_t offset; /* offset in the directory */ 1155 int entryoffsetinblk; /* offset of ep in fbp's buffer */ 1156 struct inode *tp; 1157 struct fbuf *fbp; /* pointer to directory block */ 1158 struct direct *ep; /* directory entry */ 1159 int trans_size; 1160 int issync; 1161 struct ufsvfs *ufsvfsp = dp->i_ufsvfs; 1162 1163 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1164 1165 fbp = NULL; 1166 dirsize = roundup(dp->i_size, DIRBLKSIZ); 1167 offset = 0; 1168 entryoffsetinblk = 0; 1169 1170 /* 1171 * Purge directory cache 1172 */ 1173 1174 dnlc_dir_purge(&dp->i_danchor); 1175 1176 while (offset < dirsize) { 1177 /* 1178 * If offset is on a block boundary, 1179 * read the next directory block. 1180 * Release previous if it exists. 1181 */ 1182 if (blkoff(dp->i_fs, offset) == 0) { 1183 if (fbp != NULL) { 1184 fbrelse(fbp, S_OTHER); 1185 } 1186 1187 err = blkatoff(dp, offset, (char **)0, &fbp); 1188 if (err) { 1189 goto out; 1190 } 1191 entryoffsetinblk = 0; 1192 } 1193 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk); 1194 if (ep->d_ino == 0 || (ep->d_name[0] == '.' && 1195 ep->d_name[1] == '\0') || 1196 (ep->d_name[0] == '.' && ep->d_name[1] == '.' && 1197 ep->d_name[2] == '\0')) { 1198 1199 entryoffsetinblk += ep->d_reclen; 1200 1201 } else { 1202 1203 if ((err = ufs_iget(dp->i_vfs, ep->d_ino, 1204 &tp, CRED())) != 0) { 1205 goto out; 1206 } 1207 1208 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 1209 trans_size = (int)TOP_REMOVE_SIZE(tp)); 1210 1211 /* 1212 * Delete inode. 1213 */ 1214 1215 dnlc_remove(ITOV(dp), ep->d_name); 1216 1217 rw_enter(&tp->i_contents, RW_WRITER); 1218 tp->i_flag |= ICHG; 1219 tp->i_seq++; 1220 TRANS_INODE(tp->i_ufsvfs, tp); 1221 tp->i_nlink--; 1222 ufs_setreclaim(tp); 1223 ITIMES_NOLOCK(tp); 1224 rw_exit(&tp->i_contents); 1225 1226 VN_RELE(ITOV(tp)); 1227 entryoffsetinblk += ep->d_reclen; 1228 TRANS_END_CSYNC(ufsvfsp, error, 1229 issync, TOP_REMOVE, trans_size); 1230 1231 } 1232 offset += ep->d_reclen; 1233 } 1234 1235 if (fbp) { 1236 fbrelse(fbp, S_OTHER); 1237 } 1238 1239 out: 1240 rw_exit(&ufsvfsp->vfs_dqrwlock); 1241 } 1242