/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/user.h>
#include <sys/callb.h>
#include <sys/cpuvar.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>

extern pri_t			minclsyspri;
extern int			hash2ints();
extern struct kmem_cache	*inode_cache;	/* cache of free inodes */
extern int			ufs_idle_waiters;
extern struct instats		ins;

static void ufs_attr_purge(struct inode *);

/*
 * initialize a thread's queue struct
 */
void
ufs_thread_init(struct ufs_q *uq, int lowat)
{
	bzero((caddr_t)uq, sizeof (*uq));
	cv_init(&uq->uq_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&uq->uq_mutex, NULL, MUTEX_DEFAULT, NULL);
	uq->uq_lowat = lowat;
	uq->uq_hiwat = 2 * lowat;
	uq->uq_threadp = NULL;
}

/*
 * start a thread for a queue (assumes success)
 */
void
ufs_thread_start(struct ufs_q *uq, void (*func)(), struct vfs *vfsp)
{
	mutex_enter(&uq->uq_mutex);
	if (uq->uq_threadp == NULL) {
		uq->uq_threadp = thread_create(NULL, 0, func, vfsp, 0, &p0,
		    TS_RUN, minclsyspri);
		uq->uq_flags = 0;
	}
	mutex_exit(&uq->uq_mutex);
}

/*
 * wait for the thread to exit
 */
void
ufs_thread_exit(struct ufs_q *uq)
{
	kt_did_t ufs_thread_did = 0;

	mutex_enter(&uq->uq_mutex);
	uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
	if (uq->uq_threadp != NULL) {
		ufs_thread_did = uq->uq_threadp->t_did;
		uq->uq_flags |= (UQ_EXIT|UQ_WAIT);
		cv_broadcast(&uq->uq_cv);
	}
	mutex_exit(&uq->uq_mutex);

	/*
	 * It's safe to call thread_join() with an already-gone
	 * t_did, but we have to obtain it before the kernel
	 * thread structure is freed.  We do so above under the
	 * protection of the uq_mutex when we're sure the thread
	 * still exists and it's safe to dereference it.
	 * We also have to check if ufs_thread_did is != 0
	 * before calling thread_join() since thread 0 in the system
	 * gets a t_did of 0.
	 */
	if (ufs_thread_did)
		thread_join(ufs_thread_did);
}

/*
 * wait for a thread to suspend itself on the caller's behalf
 *	the caller is responsible for continuing the thread
 */
void
ufs_thread_suspend(struct ufs_q *uq)
{
	mutex_enter(&uq->uq_mutex);
	if (uq->uq_threadp != NULL) {
		/*
		 * wait while another thread is suspending this thread.
		 * no need to do a cv_broadcast(), as whoever suspended
		 * the thread must continue it at some point.
		 */
		while ((uq->uq_flags & UQ_SUSPEND) &&
		    (uq->uq_threadp != NULL)) {
			/*
			 * We can't use cv_signal() because if our
			 * signal doesn't happen to hit the desired
			 * thread but instead some other waiter like
			 * ourselves, we'll wait forever for a
			 * response.  Well, at least an indeterminate
			 * amount of time until we just happen to get
			 * lucky from whoever did get signalled doing
			 * a cv_signal() of their own.  This is an
			 * unfortunate performance lossage.
			 */
			uq->uq_flags |= UQ_WAIT;
			cv_wait(&uq->uq_cv, &uq->uq_mutex);
		}

		/*
		 * wait for the thread to suspend itself
		 */
		uq->uq_flags |= UQ_SUSPEND;
		while (((uq->uq_flags & UQ_SUSPENDED) == 0) &&
		    (uq->uq_threadp != NULL)) {
			uq->uq_flags |= UQ_WAIT;
			cv_broadcast(&uq->uq_cv);
			cv_wait(&uq->uq_cv, &uq->uq_mutex);
		}
	}
	mutex_exit(&uq->uq_mutex);
}

/*
 * allow a thread to continue from a ufs_thread_suspend()
 *	This thread must be the same as the thread that called
 *	ufs_thread_suspend.
 */
void
ufs_thread_continue(struct ufs_q *uq)
{
	mutex_enter(&uq->uq_mutex);
	uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
	cv_broadcast(&uq->uq_cv);
	mutex_exit(&uq->uq_mutex);
}

/*
 * some common code for managing a thread's execution
 *	uq is locked at entry and return
 *	may sleep
 *	may exit
 */
/*
 * Kind of a hack passing in the callb_cpr_t * here.
 * It should really be part of the ufs_q structure.
 * I did not put it in there because we are already in beta
 * and I was concerned that changing ufs_inode.h to include
 * callb.h might break something.
 */
int
ufs_thread_run(struct ufs_q *uq, callb_cpr_t *cprinfop)
{
again:
	ASSERT(uq->uq_ne >= 0);

	if (uq->uq_flags & UQ_SUSPEND) {
		uq->uq_flags |= UQ_SUSPENDED;
	} else if (uq->uq_flags & UQ_EXIT) {
		/*
		 * exiting; empty the queue (may infinite loop)
		 */
		if (uq->uq_ne)
			return (uq->uq_ne);
		uq->uq_threadp = NULL;
		if (uq->uq_flags & UQ_WAIT) {
			cv_broadcast(&uq->uq_cv);
		}
		uq->uq_flags &= ~(UQ_EXIT | UQ_WAIT);
		CALLB_CPR_EXIT(cprinfop);
		thread_exit();
	} else if (uq->uq_ne >= uq->uq_lowat) {
		/*
		 * process a block of entries until below high water mark
		 */
		return (uq->uq_ne - (uq->uq_lowat >> 1));
	} else if (uq->uq_flags & UQ_FASTCLIENTS) {
		/*
		 * Let the fast acting clients through
		 */
		return (0);
	}
	if (uq->uq_flags & UQ_WAIT) {
		uq->uq_flags &= ~UQ_WAIT;
		cv_broadcast(&uq->uq_cv);
	}
	CALLB_CPR_SAFE_BEGIN(cprinfop);
	cv_wait(&uq->uq_cv, &uq->uq_mutex);
	CALLB_CPR_SAFE_END(cprinfop, &uq->uq_mutex);
	goto again;
}

/*
 * DELETE INODE
 * The following routines implement the protocol for freeing the resources
 * held by an idle and deleted inode.
 */
void
ufs_delete(struct ufsvfs *ufsvfsp, struct inode *ip, int dolockfs)
{
	ushort_t mode;
	struct vnode *vp = ITOV(ip);
	struct ulockfs *ulp;
	int trans_size;
	int dorwlock = ((ip->i_mode & IFMT) == IFREG);
	int issync;
	int err;
	struct inode *dp;
	struct ufs_q *delq = &ufsvfsp->vfs_delete;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	/*
	 * not on a trans device or not part of a transaction
	 */
	ASSERT(!TRANS_ISTRANS(ufsvfsp) ||
	    ((curthread->t_flag & T_DONTBLOCK) == 0));

	/*
	 * Ignore if deletes are not allowed (wlock/hlock)
	 */
	if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
		VN_RELE(vp);
		return;
	}

	if ((vp->v_count > 1) || (ip->i_mode == 0)) {
		VN_RELE(vp);
		return;
	}
	/*
	 * If we are called as part of setting a fs lock, then only
	 * do part of the lockfs protocol.  In other words, don't hang.
	 */
	if (dolockfs) {
		if (ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_DELETE_MASK))
			return;
	} else {
		/*
		 * check for recursive VOP call
		 */
		if (curthread->t_flag & T_DONTBLOCK) {
			ulp = NULL;
		} else {
			ulp = &ufsvfsp->vfs_ulockfs;
			curthread->t_flag |= T_DONTBLOCK;
		}
	}

	/*
	 * Hold rwlock to synchronize with (nfs) writes
	 */
	if (dorwlock)
		rw_enter(&ip->i_rwlock, RW_WRITER);

	/*
	 * Delete the attribute directory.
	 */
	if (ip->i_oeftflag != 0) {
		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
		    trans_size = (int)TOP_REMOVE_SIZE(ip));
		rw_enter(&ip->i_contents, RW_WRITER);
		err = ufs_iget(ip->i_vfs, ip->i_oeftflag,
		    &dp, CRED());
		if (err == 0) {
			rw_enter(&dp->i_rwlock, RW_WRITER);
			rw_enter(&dp->i_contents, RW_WRITER);
			dp->i_flag |= IUPD|ICHG;
			dp->i_seq++;
			TRANS_INODE(dp->i_ufsvfs, dp);
			dp->i_nlink -= 2;
			ufs_setreclaim(dp);
			/*
			 * Should get rid of any negative cache entries that
			 * might be lingering, as well as ``.'' and
			 * ``..''.  If we don't, the VN_RELE() below
			 * won't actually put dp on the delete queue
			 * and it'll hang out until someone forces it
			 * (lockfs -f, umount, ...).
			 * The only reliable way of doing this at the
			 * moment is to call dnlc_purge_vp(ITOV(dp)),
			 * which is unacceptably slow, so we'll just
			 * note the problem in this comment for now.
			 */
			dnlc_remove(ITOV(dp), ".");
			dnlc_remove(ITOV(dp), "..");
			ITIMES_NOLOCK(dp);
			if (!TRANS_ISTRANS(ufsvfsp)) {
				ufs_iupdat(dp, I_SYNC);
			}
			rw_exit(&dp->i_contents);
			rw_exit(&dp->i_rwlock);
			VN_RELE(ITOV(dp));
		}
		/*
		 * Clear out attribute pointer
		 */
		ip->i_oeftflag = 0;
		rw_exit(&ip->i_contents);
		TRANS_END_CSYNC(ufsvfsp, err, issync,
		    TOP_REMOVE, trans_size);
		dnlc_remove(ITOV(ip), XATTR_DIR_NAME);
	}

	if ((ip->i_mode & IFMT) == IFATTRDIR) {
		ufs_attr_purge(ip);
	}

	(void) TRANS_ITRUNC(ip, (u_offset_t)0, I_FREE | I_ACCT, CRED());

	/*
	 * the inode's space has been freed; now free the inode
	 */
	if (ulp) {
		trans_size = TOP_IFREE_SIZE(ip);
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
	}
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	TRANS_INODE(ufsvfsp, ip);
	mode = ip->i_mode;
	ip->i_mode = 0;
	ip->i_rdev = 0;
	ip->i_ordev = 0;
	ip->i_flag |= IMOD;
	if (ip->i_ufs_acl) {
		(void) ufs_si_free(ip->i_ufs_acl, vp->v_vfsp, CRED());
		ip->i_ufs_acl = NULL;
		ip->i_shadow = 0;
	}

	/*
	 * This inode is torn down but still retains its identity
	 * (inode number).  It could get recycled soon so it's best
	 * to clean up the vnode just in case.
	 */
	mutex_enter(&vp->v_lock);
	vn_recycle(vp);
	mutex_exit(&vp->v_lock);

	/*
	 * free the inode
	 */
	ufs_ifree(ip, ip->i_number, mode);
	/*
	 * release quota resources; can't fail
	 */
	(void) chkiq((struct ufsvfs *)vp->v_vfsp->vfs_data,
	    /* change */ -1, ip, (uid_t)ip->i_uid, 0, CRED(),
	    (char **)NULL, (size_t *)NULL);
	dqrele(ip->i_dquot);
	ip->i_dquot = NULL;
	ip->i_flag &= ~(IDEL | IDIRECTIO);
	ip->i_cflags = 0;
	if (!TRANS_ISTRANS(ufsvfsp)) {
		ufs_iupdat(ip, I_SYNC);
	} else {
		mutex_enter(&delq->uq_mutex);
		delq_info->delq_unreclaimed_files--;
		mutex_exit(&delq->uq_mutex);
	}
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (dorwlock)
		rw_exit(&ip->i_rwlock);
	VN_RELE(vp);

	/*
	 * End of transaction
	 */
	if (ulp) {
		TRANS_END_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
		if (dolockfs)
			ufs_lockfs_end(ulp);
		else
			curthread->t_flag &= ~T_DONTBLOCK;
	}
}

/*
 * Create the delete thread and init the delq_info for this fs
 */
void
ufs_delete_init(struct ufsvfs *ufsvfsp, int lowat)
{
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	ufs_thread_init(&ufsvfsp->vfs_delete, lowat);
	(void) memset((void *)delq_info, 0, sizeof (*delq_info));
	cv_init(&delq_info->delq_fast_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * thread that frees up deleted inodes
 */
void
ufs_thread_delete(struct vfs *vfsp)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	struct ufs_q *uq = &ufsvfsp->vfs_delete;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
	struct inode *ip;
	long ne;
	callb_cpr_t cprinfo;

	CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
	    "ufsdelete");

	mutex_enter(&uq->uq_mutex);
again:
	/*
	 * Sleep until there is work to do.
	 * Only do one entry at a time, to reduce the wait time for
	 * checking for a suspend or fast-client request.  The ?: is
	 * for pedantic portability.
	 */
	ne = ufs_thread_run(uq, &cprinfo) ? 1 : 0;

	/*
	 * process an entry, if there are any
	 */
	if (ne && (ip = uq->uq_ihead)) {
		/*
		 * process first entry on queue.  Assumed conditions are:
		 *	ip is held (v_count >= 1)
		 *	ip is referenced (i_flag & IREF)
		 *	ip is free (i_nlink <= 0)
		 */
		if ((uq->uq_ihead = ip->i_freef) == ip)
			uq->uq_ihead = NULL;
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		uq->uq_ne--;
		mutex_exit(&uq->uq_mutex);
		ufs_delete(ufsvfsp, ip, 1);
		mutex_enter(&uq->uq_mutex);
	}

	/*
	 * If there are any fast clients, let all of them through.
	 * Mainly intended for statvfs(), which doesn't need to do
	 * anything except look at the number of bytes/inodes that
	 * are in the queue.
	 */
	if (uq->uq_flags & UQ_FASTCLIENTS) {
		uq->uq_flags &= ~UQ_FASTCLIENTS;
		/*
		 * Give clients a chance.  The lock exit/entry
		 * allows waiting statvfs threads through.
		 */
		cv_broadcast(&delq_info->delq_fast_cv);
		mutex_exit(&uq->uq_mutex);
		mutex_enter(&uq->uq_mutex);
	}
	goto again;
}

/*
 * drain ne entries off the delete queue.  As new queue entries may
 * be added while we're working, ne is interpreted as follows:
 *
 * ne > 0   => remove up to ne entries
 * ne == 0  => remove all entries currently on the queue
 * ne == -1 => remove entries until the queue is empty
 */
void
ufs_delete_drain(struct vfs *vfsp, int ne, int dolockfs)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	struct ufs_q *uq;
	struct inode *ip;
	int drain_cnt = 0;
	int done;

	/*
	 * if forcibly unmounted; ignore
	 */
	if (ufsvfsp == NULL)
		return;

	uq = &ufsvfsp->vfs_delete;
	mutex_enter(&uq->uq_mutex);
	if (ne == 0)
		drain_cnt = uq->uq_ne;
	else if (ne > 0)
		drain_cnt = ne;

	/*
	 * process up to ne entries
	 */

	done = 0;
	while (!done && (ip = uq->uq_ihead)) {
		if (ne != -1)
			drain_cnt--;
		if (ne != -1 && drain_cnt == 0)
			done = 1;
		if ((uq->uq_ihead = ip->i_freef) == ip)
			uq->uq_ihead = NULL;
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		uq->uq_ne--;
		mutex_exit(&uq->uq_mutex);
		ufs_delete(ufsvfsp, ip, dolockfs);
		mutex_enter(&uq->uq_mutex);
	}
	mutex_exit(&uq->uq_mutex);
}

void
ufs_sync_with_thread(struct ufs_q *uq)
{
	mutex_enter(&uq->uq_mutex);
	uq->uq_flags |= UQ_WAIT;
	/*
	 * Someone other than the thread we're interested in might
	 * send a signal, so make sure the thread's given an
	 * acknowledgement.
	 */
	while ((uq->uq_threadp != NULL) && (uq->uq_flags & UQ_WAIT)) {
		cv_broadcast(&uq->uq_cv);
		cv_wait(&uq->uq_cv, &uq->uq_mutex);
	}
	mutex_exit(&uq->uq_mutex);
}

/*
 * Get rid of everything that's currently in the delete queue,
 * plus whatever the delete thread is working on at the moment.
 *
 * This ability is required for providing true POSIX semantics
 * regarding close(2), unlink(2), etc, even when logging is enabled.
 * The standard requires that the released space be immediately
 * observable (statvfs(2)) and allocatable (e.g., write(2)).
 */
void
ufs_delete_drain_wait(struct ufsvfs *ufsvfsp, int dolockfs)
{
	struct ufs_q *uq = &ufsvfsp->vfs_delete;
	int error;

	(void) ufs_delete_drain(ufsvfsp->vfs_vfs, 0, dolockfs);
	ufs_sync_with_thread(uq);

	/*
	 * Commit any outstanding transactions to make sure
	 * any canceled freed blocks are available for allocation.
	 */
	curthread->t_flag |= T_DONTBLOCK;
	TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE, error);
	if (!error) {
		TRANS_END_SYNC(ufsvfsp, error, TOP_COMMIT_UPDATE,
		    TOP_COMMIT_SIZE);
	}
	curthread->t_flag &= ~T_DONTBLOCK;
}

/*
 * Adjust the resource usage in a struct statvfs based on
 * what's in the delete queue.
 *
 * We do not consider the impact of ACLs or extended attributes
 * that may be deleted as a side-effect of deleting a file.
 * Those are metadata, and their sizes aren't reflected in the
 * sizes returned by stat(), so this is not a problem.
 */
void
ufs_delete_adjust_stats(struct ufsvfs *ufsvfsp, struct statvfs64 *sp)
{
	struct ufs_q *uq = &ufsvfsp->vfs_delete;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	/*
	 * We'll get signalled when it's our turn.  However, if there's
	 * nothing going on, there's no point in waking up the delete
	 * thread and waiting for it to tell us to continue.
	 */
	mutex_enter(&uq->uq_mutex);

	if ((uq->uq_flags & UQ_FASTCLIENTS) || (uq->uq_ne != 0)) {
		uq->uq_flags |= UQ_FASTCLIENTS;
		cv_broadcast(&uq->uq_cv);
		cv_wait(&delq_info->delq_fast_cv, &uq->uq_mutex);
	}

	sp->f_bfree += delq_info->delq_unreclaimed_blocks;
	sp->f_ffree += delq_info->delq_unreclaimed_files;
	mutex_exit(&uq->uq_mutex);
}

/*
 * IDLE INODE
 * The following routines implement the protocol for maintaining an
 * LRU list of idle inodes and for moving the idle inodes to the
 * reuse list when the number of allocated inodes exceeds the user
 * tunable high-water mark (ufs_ninode).
 */

/*
 * clean an idle inode and move it to the reuse list
 */
static void
ufs_idle_free(struct inode *ip)
{
	int			pages;
	int			hno;
	kmutex_t		*ihm;
	struct ufsvfs		*ufsvfsp = ip->i_ufsvfs;
	struct vnode		*vp = ITOV(ip);

	/*
	 * inode is held
	 */

	/*
	 * remember `pages' for stats below
	 */
	pages = (ip->i_mode && vn_has_cached_data(vp) && vp->v_type != VCHR);

	/*
	 * start the dirty pages to disk and then invalidate them
	 * unless the inode is invalid (ISTALE)
	 */
	if ((ip->i_flag & ISTALE) == 0) {
		(void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_FREE);
		(void) TRANS_SYNCIP(ip,
		    (TRANS_ISERROR(ufsvfsp)) ?
		    B_INVAL | B_FORCE : B_INVAL,
		    I_ASYNC, TOP_SYNCIP_FREE);
	}

	/*
	 * wait for any current ufs_iget to finish and block future ufs_igets
	 */
	ASSERT(ip->i_number != 0);
	hno = INOHASH(ip->i_number);
	ihm = &ih_lock[hno];
	mutex_enter(ihm);

	/*
	 * It must be guaranteed that v_count >= 2, otherwise
	 * something must be wrong with this vnode already.
	 * That is why we use v_count-- instead of VN_RELE().
	 * Acquire the vnode lock in case another thread is in
	 * VN_RELE().
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count < 2)
		cmn_err(CE_PANIC,
		    "ufs_idle_free: vnode ref count is less than 2");

	vp->v_count--;
	if ((vp->v_type != VCHR && vn_has_cached_data(vp)) ||
	    vp->v_count != 1 ||
	    ip->i_flag & (IMOD|IMODACC|IACC|ICHG|IUPD|IATTCHG)) {
		/*
		 * Another thread has referenced this inode while
		 * we are trying to free it.  Call VN_RELE() to
		 * release our reference.
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(ihm);
		VN_RELE(vp);
	} else {
		/*
		 * The inode is currently unreferenced and can not
		 * acquire further references because it has no pages
		 * and the hash is locked.  Inodes acquire references
		 * via the hash list or via their pages.
		 */

		mutex_exit(&vp->v_lock);

		/*
		 * remove it from the cache
		 */
		remque(ip);
		mutex_exit(ihm);
		/*
		 * Stale inodes have no valid ufsvfs
		 */
		if ((ip->i_flag & ISTALE) == 0 && ip->i_dquot) {
			TRANS_DQRELE(ufsvfsp, ip->i_dquot);
			ip->i_dquot = NULL;
		}
		ufs_si_del(ip);
		if (pages) {
			CPU_STATS_ADDQ(CPU, sys, ufsipage, 1);
		} else {
			CPU_STATS_ADDQ(CPU, sys, ufsinopage, 1);
		}
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
		ufs_free_inode(ip);
	}
}

/*
 * this thread processes the global idle queue
 */
iqhead_t *ufs_junk_iq;
iqhead_t *ufs_useful_iq;
int ufs_njunk_iq = 0;
int ufs_nuseful_iq = 0;
int ufs_niqhash;
int ufs_iqhashmask;
struct ufs_q ufs_idle_q;

void
ufs_thread_idle(void)
{
	callb_cpr_t cprinfo;
	int i;
	int ne;

	ufs_niqhash = (ufs_idle_q.uq_lowat >> 1) / IQHASHQLEN;
	ufs_niqhash = 1 << highbit(ufs_niqhash); /* round up to power of 2 */
	ufs_iqhashmask = ufs_niqhash - 1;
	ufs_junk_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_junk_iq),
	    KM_SLEEP);
	ufs_useful_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_useful_iq),
	    KM_SLEEP);

	/* Initialize hash queue headers */
	for (i = 0; i < ufs_niqhash; i++) {
		ufs_junk_iq[i].i_freef = (inode_t *)&ufs_junk_iq[i];
		ufs_junk_iq[i].i_freeb = (inode_t *)&ufs_junk_iq[i];
		ufs_useful_iq[i].i_freef = (inode_t *)&ufs_useful_iq[i];
		ufs_useful_iq[i].i_freeb = (inode_t *)&ufs_useful_iq[i];
	}

	CALLB_CPR_INIT(&cprinfo, &ufs_idle_q.uq_mutex, callb_generic_cpr,
	    "ufsidle");
again:
	/*
	 * Whenever the idle thread is awakened, it repeatedly gives
	 * back half of the idle queue until the idle queue falls
	 * below lowat.
	 */
	mutex_enter(&ufs_idle_q.uq_mutex);
	if (ufs_idle_q.uq_ne < ufs_idle_q.uq_lowat) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&ufs_idle_q.uq_cv, &ufs_idle_q.uq_mutex);
		CALLB_CPR_SAFE_END(&cprinfo, &ufs_idle_q.uq_mutex);
	}
	mutex_exit(&ufs_idle_q.uq_mutex);

	/*
	 * Give back 1/2 of the idle queue
	 */
	ne = ufs_idle_q.uq_ne >> 1;
	ins.in_tidles.value.ul += ne;
	ufs_idle_some(ne);
	goto again;
}

/*
 * Reclaim callback for ufs inode cache.
 * Invoked by the kernel memory allocator when memory gets tight.
 */
/*ARGSUSED*/
void
ufs_inode_cache_reclaim(void *cdrarg)
{
	/*
	 * If we are low on memory and the idle queue is over its
	 * halfway mark, then free 50% of the idle q
	 *
	 * We don't free all of the idle inodes because the inodes
	 * for popular NFS files may have been kicked from the dnlc.
	 * The inodes for these files will end up on the idle queue
	 * after every NFS access.
	 *
	 * If we repeatedly push them from the idle queue then
	 * NFS users may be unhappy as an extra buf cache operation
	 * is incurred for every NFS operation to these files.
	 *
	 * It's not common, but I have seen it happen.
	 *
	 */
	if (ufs_idle_q.uq_ne < (ufs_idle_q.uq_lowat >> 1))
		return;
	mutex_enter(&ufs_idle_q.uq_mutex);
	cv_broadcast(&ufs_idle_q.uq_cv);
	mutex_exit(&ufs_idle_q.uq_mutex);
}

/*
 * Free up some idle inodes
 */
void
ufs_idle_some(int ne)
{
	int i;
	struct inode *ip;
	struct vnode *vp;
	static int junk_rotor = 0;
	static int useful_rotor = 0;

	for (i = 0; i < ne; ++i) {
		mutex_enter(&ufs_idle_q.uq_mutex);

		if (ufs_njunk_iq) {
			while (ufs_junk_iq[junk_rotor].i_freef ==
			    (inode_t *)&ufs_junk_iq[junk_rotor]) {
				junk_rotor = IQNEXT(junk_rotor);
			}
			ip = ufs_junk_iq[junk_rotor].i_freef;
			ASSERT(ip->i_flag & IJUNKIQ);
		} else if (ufs_nuseful_iq) {
			while (ufs_useful_iq[useful_rotor].i_freef ==
			    (inode_t *)&ufs_useful_iq[useful_rotor]) {
				useful_rotor = IQNEXT(useful_rotor);
			}
			ip = ufs_useful_iq[useful_rotor].i_freef;
			ASSERT(!(ip->i_flag & IJUNKIQ));
		} else {
			mutex_exit(&ufs_idle_q.uq_mutex);
			return;
		}

		/*
		 * emulate ufs_iget
		 */
		vp = ITOV(ip);
		VN_HOLD(vp);
		mutex_exit(&ufs_idle_q.uq_mutex);
		rw_enter(&ip->i_contents, RW_WRITER);
		/*
		 * VN_RELE should not be called if
		 * ufs_rmidle returns true, as it will
		 * effectively be done in ufs_idle_free.
		 */
		if (ufs_rmidle(ip)) {
			rw_exit(&ip->i_contents);
			ufs_idle_free(ip);
		} else {
			rw_exit(&ip->i_contents);
			VN_RELE(vp);
		}
	}
}

/*
 * drain entries for vfsp from the idle queue
 * vfsp == NULL means drain the entire thing
 */
void
ufs_idle_drain(struct vfs *vfsp)
{
	struct inode	*ip, *nip;
	struct inode	*ianchor = NULL;
	int		i;

	mutex_enter(&ufs_idle_q.uq_mutex);
	if (ufs_njunk_iq) {
		/* for each hash q */
		for (i = 0; i < ufs_niqhash; i++) {
			/* search down the hash q */
			for (ip = ufs_junk_iq[i].i_freef;
			    ip != (inode_t *)&ufs_junk_iq[i];
			    ip = ip->i_freef) {
				if (ip->i_vfs == vfsp || vfsp == NULL) {
					/* found a matching entry */
					VN_HOLD(ITOV(ip));
					mutex_exit(&ufs_idle_q.uq_mutex);
					rw_enter(&ip->i_contents, RW_WRITER);
					/*
					 * See comments in ufs_idle_some()
					 * as we will call ufs_idle_free()
					 * after scanning both queues.
					 */
					if (ufs_rmidle(ip)) {
						rw_exit(&ip->i_contents);
						ip->i_freef = ianchor;
						ianchor = ip;
					} else {
						rw_exit(&ip->i_contents);
						VN_RELE(ITOV(ip));
					}
					/* restart this hash q */
					ip = (inode_t *)&ufs_junk_iq[i];
					mutex_enter(&ufs_idle_q.uq_mutex);
				}
			}
		}
	}
	if (ufs_nuseful_iq) {
		/* for each hash q */
		for (i = 0; i < ufs_niqhash; i++) {
			/* search down the hash q */
			for (ip = ufs_useful_iq[i].i_freef;
			    ip != (inode_t *)&ufs_useful_iq[i];
			    ip = ip->i_freef) {
				if (ip->i_vfs == vfsp || vfsp == NULL) {
					/* found a matching entry */
					VN_HOLD(ITOV(ip));
					mutex_exit(&ufs_idle_q.uq_mutex);
					rw_enter(&ip->i_contents, RW_WRITER);
					/*
					 * See comments in ufs_idle_some()
					 * as we will call ufs_idle_free()
					 * after scanning both queues.
					 */
					if (ufs_rmidle(ip)) {
						rw_exit(&ip->i_contents);
						ip->i_freef = ianchor;
						ianchor = ip;
					} else {
						rw_exit(&ip->i_contents);
						VN_RELE(ITOV(ip));
					}
					/* restart this hash q */
					ip = (inode_t *)&ufs_useful_iq[i];
					mutex_enter(&ufs_idle_q.uq_mutex);
				}
			}
		}
	}

	mutex_exit(&ufs_idle_q.uq_mutex);
	/* no more matching entries, release those we have found (if any) */
	for (ip = ianchor; ip; ip = nip) {
		nip = ip->i_freef;
		ip->i_freef = ip;
		ufs_idle_free(ip);
	}
}

/*
 * RECLAIM DELETED INODES
 * The following thread scans the file system once looking for deleted files
 */
void
ufs_thread_reclaim(struct vfs *vfsp)
{
	struct ufsvfs		*ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	struct ufs_q		*uq = &ufsvfsp->vfs_reclaim;
	struct fs		*fs = ufsvfsp->vfs_fs;
	struct buf		*bp = 0;
	int			err = 0;
	daddr_t			bno;
	ino_t			ino;
	struct dinode		*dp;
	struct inode		*ip;
	callb_cpr_t		cprinfo;

	CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
	    "ufsreclaim");

	/*
	 * mount decided that we don't need a reclaim thread
	 */
	if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
		err++;

	/*
	 * don't reclaim if readonly
	 */
	if (fs->fs_ronly)
		err++;

	for (ino = 0; ino < (fs->fs_ncg * fs->fs_ipg) && !err; ++ino) {

		/*
		 * Check whether we are the target of another
		 * thread having called ufs_thread_exit() or
		 * ufs_thread_suspend().
		 */
		mutex_enter(&uq->uq_mutex);
again:
		if (uq->uq_flags & UQ_EXIT) {
			err++;
			mutex_exit(&uq->uq_mutex);
			break;
		} else if (uq->uq_flags & UQ_SUSPEND) {
			uq->uq_flags |= UQ_SUSPENDED;
			/*
			 * Release the buf before we cv_wait()
			 * otherwise we may deadlock with the
			 * thread that called ufs_thread_suspend().
			 */
			if (bp) {
				brelse(bp);
				bp = 0;
			}
			if (uq->uq_flags & UQ_WAIT) {
				uq->uq_flags &= ~UQ_WAIT;
				cv_broadcast(&uq->uq_cv);
			}
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&uq->uq_cv, &uq->uq_mutex);
			CALLB_CPR_SAFE_END(&cprinfo, &uq->uq_mutex);
			goto again;
		}
		mutex_exit(&uq->uq_mutex);

		/*
		 * if we don't already have the buf; get it
		 */
		bno = fsbtodb(fs, itod(fs, ino));
		if ((bp == 0) || (bp->b_blkno != bno)) {
			if (bp)
				brelse(bp);
			bp = UFS_BREAD(ufsvfsp,
			    ufsvfsp->vfs_dev, bno, fs->fs_bsize);
			bp->b_flags |= B_AGE;
		}
		if (bp->b_flags & B_ERROR) {
			err++;
			continue;
		}
		/*
		 * nlink <= 0 and mode != 0 means deleted
		 */
		dp = (struct dinode *)bp->b_un.b_addr + itoo(fs, ino);
		if ((dp->di_nlink <= 0) && (dp->di_mode != 0)) {
			/*
			 * can't hold the buf (deadlock)
			 */
			brelse(bp);
			bp = 0;
			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
			/*
			 * iget/iput sequence will put inode on ifree
			 * thread queue if it is idle.
			 * This is a nop for busy (open, deleted) inodes
			 */
			if (ufs_iget(vfsp, ino, &ip, CRED()))
				err++;
			else
				VN_RELE(ITOV(ip));
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		}
	}

	if (bp)
		brelse(bp);
	if (!err) {
		/*
		 * reset the reclaiming-bit
		 */
		mutex_enter(&ufsvfsp->vfs_lock);
		fs->fs_reclaim &= ~FS_RECLAIMING;
		mutex_exit(&ufsvfsp->vfs_lock);
		TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_RECLAIM);
	}

	/*
	 * exit the reclaim thread
	 */
	mutex_enter(&uq->uq_mutex);
	uq->uq_threadp = NULL;
	uq->uq_flags &= ~UQ_WAIT;
	cv_broadcast(&uq->uq_cv);
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}
/*
 * HLOCK FILE SYSTEM
 *	hlock the file systems whose logs have device errors
 */
struct ufs_q	ufs_hlock;
/*ARGSUSED*/
void
ufs_thread_hlock(void *ignore)
{
	int	retry;
	callb_cpr_t cprinfo;

	CALLB_CPR_INIT(&cprinfo, &ufs_hlock.uq_mutex, callb_generic_cpr,
	    "ufshlock");

	for (;;) {
		/*
		 * sleep until there is work to do
		 */
		mutex_enter(&ufs_hlock.uq_mutex);
		(void) ufs_thread_run(&ufs_hlock, &cprinfo);
		ufs_hlock.uq_ne = 0;
		mutex_exit(&ufs_hlock.uq_mutex);
		/*
		 * hlock the error'ed fs's
		 *	retry after a bit if another app is doing lockfs stuff
		 */
		do {
			retry = ufs_trans_hlock();
			if (retry) {
				mutex_enter(&ufs_hlock.uq_mutex);
				CALLB_CPR_SAFE_BEGIN(&cprinfo);
				(void) cv_timedwait(&ufs_hlock.uq_cv,
				    &ufs_hlock.uq_mutex,
				    lbolt + hz);
				CALLB_CPR_SAFE_END(&cprinfo,
				    &ufs_hlock.uq_mutex);
				mutex_exit(&ufs_hlock.uq_mutex);
			}
		} while (retry);
	}
}

static void
ufs_attr_purge(struct inode *dp)
{
	int		err;
	int		error;
	off_t		dirsize;	/* size of the directory */
	off_t		offset;		/* offset in the directory */
	int		entryoffsetinblk; /* offset of ep in fbp's buffer */
	struct inode	*tp;
	struct fbuf	*fbp;		/* pointer to directory block */
	struct direct	*ep;		/* directory entry */
	int		trans_size;
	int		issync;
	struct ufsvfs	*ufsvfsp = dp->i_ufsvfs;

	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);

	fbp = NULL;
	dirsize = roundup(dp->i_size, DIRBLKSIZ);
	offset = 0;
	entryoffsetinblk = 0;

	/*
	 * Purge directory cache
	 */

	dnlc_dir_purge(&dp->i_danchor);

	while (offset < dirsize) {
		/*
		 * If offset is on a block boundary,
		 * read the next directory block.
		 * Release previous if it exists.
		 */
		if (blkoff(dp->i_fs, offset) == 0) {
			if (fbp != NULL) {
				fbrelse(fbp, S_OTHER);
			}

			err = blkatoff(dp, offset, (char **)0, &fbp);
			if (err) {
				goto out;
			}
			entryoffsetinblk = 0;
		}
		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
		if (ep->d_ino == 0 || (ep->d_name[0] == '.' &&
		    ep->d_name[1] == '\0') ||
		    (ep->d_name[0] == '.' && ep->d_name[1] == '.' &&
		    ep->d_name[2] == '\0')) {

			entryoffsetinblk += ep->d_reclen;

		} else {

			if ((err = ufs_iget(dp->i_vfs, ep->d_ino,
			    &tp, CRED())) != 0) {
				goto out;
			}

			TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
			    trans_size = (int)TOP_REMOVE_SIZE(tp));

			/*
			 * Delete inode.
			 */

			dnlc_remove(ITOV(dp), ep->d_name);

			rw_enter(&tp->i_contents, RW_WRITER);
			tp->i_flag |= ICHG;
			tp->i_seq++;
			TRANS_INODE(tp->i_ufsvfs, tp);
			tp->i_nlink--;
			ufs_setreclaim(tp);
			ITIMES_NOLOCK(tp);
			rw_exit(&tp->i_contents);

			VN_RELE(ITOV(tp));
			entryoffsetinblk += ep->d_reclen;
			TRANS_END_CSYNC(ufsvfsp, error,
			    issync, TOP_REMOVE, trans_size);

		}
		offset += ep->d_reclen;
	}

	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}

out:
	rw_exit(&ufsvfsp->vfs_dqrwlock);
}