1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/user.h>
#include <sys/callb.h>
#include <sys/cpuvar.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>

extern pri_t minclsyspri;
extern int hash2ints();
extern struct kmem_cache *inode_cache; /* cache of free inodes */
extern int ufs_idle_waiters;
extern struct instats ins;

static void ufs_attr_purge(struct inode *);

/*
 * initialize a thread's queue struct
 *
 * Zeroes *uq, initializes its condition variable and mutex, records
 * lowat as the low-water mark and twice lowat as the high-water mark,
 * and leaves the queue with no service thread attached.
 */
void
ufs_thread_init(struct ufs_q *uq, int lowat)
{
	bzero((caddr_t)uq, sizeof (*uq));
	cv_init(&uq->uq_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&uq->uq_mutex, NULL, MUTEX_DEFAULT, NULL);
	uq->uq_lowat = lowat;
	uq->uq_hiwat = 2 * lowat;
	uq->uq_threadp = NULL;
}

/*
 * start a thread for a queue (assumes success)
 *
 * If the queue has no thread attached, a new one is created that runs
 * func with vfsp as its argument, and the queue flags are cleared.
 * A queue that already has a thread is left untouched.
 */
void
ufs_thread_start(struct ufs_q *uq, void (*func)(), struct vfs *vfsp)
{
	mutex_enter(&uq->uq_mutex);
	if (uq->uq_threadp == NULL) {
		uq->uq_threadp = thread_create(NULL, 0, func, vfsp, 0, &p0,
		    TS_RUN, minclsyspri);
		uq->uq_flags = 0;
	}
	mutex_exit(&uq->uq_mutex);
}

/*
 * wait for the thread to exit
 *
 * Any suspend request on the queue is cancelled.  If a thread is
 * attached, UQ_EXIT and UQ_WAIT are set, all sleepers on uq_cv are
 * woken, and the thread is joined after the queue mutex is dropped.
 */
void
ufs_thread_exit(struct ufs_q *uq)
{
	kt_did_t ufs_thread_did = 0;

	mutex_enter(&uq->uq_mutex);
	uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
	if (uq->uq_threadp != NULL) {
		ufs_thread_did = uq->uq_threadp->t_did;
		uq->uq_flags |= (UQ_EXIT|UQ_WAIT);
		cv_broadcast(&uq->uq_cv);
	}
	mutex_exit(&uq->uq_mutex);

	/*
	 * It's safe to call thread_join() with an already-gone
	 * t_did, but we have to obtain it before the kernel
	 * thread structure is freed. We do so above under the
	 * protection of the uq_mutex when we're sure the thread
	 * still exists and it's safe to de-reference it.
	 * We also have to check if ufs_thread_did is != 0
	 * before calling thread_join() since thread 0 in the system
	 * gets a t_did of 0.
	 */
	if (ufs_thread_did)
		thread_join(ufs_thread_did);
}

/*
 * wait for a thread to suspend itself on the caller's behalf
 * the caller is responsible for continuing the thread
 *
 * Nothing is done when the queue has no thread.  Both wait loops also
 * terminate if the thread goes away (uq_threadp becomes NULL) while
 * we are sleeping.
 */
void
ufs_thread_suspend(struct ufs_q *uq)
{
	mutex_enter(&uq->uq_mutex);
	if (uq->uq_threadp != NULL) {
		/*
		 * wait while another thread is suspending this thread.
		 * no need to do a cv_broadcast(), as whoever suspended
		 * the thread must continue it at some point.
		 */
		while ((uq->uq_flags & UQ_SUSPEND) &&
		    (uq->uq_threadp != NULL)) {
			/*
			 * We can't use cv_signal() because if our
			 * signal doesn't happen to hit the desired
			 * thread but instead some other waiter like
			 * ourselves, we'll wait forever for a
			 * response.  Well, at least an indeterminate
			 * amount of time until we just happen to get
			 * lucky from whomever did get signalled doing
			 * a cv_signal() of their own.  This is an
			 * unfortunate performance lossage.
			 */
			uq->uq_flags |= UQ_WAIT;
			cv_wait(&uq->uq_cv, &uq->uq_mutex);
		}

		/*
		 * wait for the thread to suspend itself
		 */
		uq->uq_flags |= UQ_SUSPEND;
		while (((uq->uq_flags & UQ_SUSPENDED) == 0) &&
		    (uq->uq_threadp != NULL)) {
			/*
			 * UQ_WAIT asks the thread to broadcast back once
			 * it has noticed the request; the broadcast here
			 * wakes the thread if it is sleeping on uq_cv.
			 */
			uq->uq_flags |= UQ_WAIT;
			cv_broadcast(&uq->uq_cv);
			cv_wait(&uq->uq_cv, &uq->uq_mutex);
		}
	}
	mutex_exit(&uq->uq_mutex);
}

/*
 * allow a thread to continue from a ufs_thread_suspend()
 * This thread must be the same as the thread that called
 * ufs_thread_suspend.
173 */ 174 void 175 ufs_thread_continue(struct ufs_q *uq) 176 { 177 mutex_enter(&uq->uq_mutex); 178 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED); 179 cv_broadcast(&uq->uq_cv); 180 mutex_exit(&uq->uq_mutex); 181 } 182 183 /* 184 * some common code for managing a threads execution 185 * uq is locked at entry and return 186 * may sleep 187 * may exit 188 */ 189 /* 190 * Kind of a hack passing in the callb_cpr_t * here. 191 * It should really be part of the ufs_q structure. 192 * I did not put it in there because we are already in beta 193 * and I was concerned that changing ufs_inode.h to include 194 * callb.h might break something. 195 */ 196 int 197 ufs_thread_run(struct ufs_q *uq, callb_cpr_t *cprinfop) 198 { 199 again: 200 ASSERT(uq->uq_ne >= 0); 201 202 if (uq->uq_flags & UQ_SUSPEND) { 203 uq->uq_flags |= UQ_SUSPENDED; 204 } else if (uq->uq_flags & UQ_EXIT) { 205 /* 206 * exiting; empty the queue (may infinite loop) 207 */ 208 if (uq->uq_ne) 209 return (uq->uq_ne); 210 uq->uq_threadp = NULL; 211 if (uq->uq_flags & UQ_WAIT) { 212 cv_broadcast(&uq->uq_cv); 213 } 214 uq->uq_flags &= ~(UQ_EXIT | UQ_WAIT); 215 CALLB_CPR_EXIT(cprinfop); 216 thread_exit(); 217 } else if (uq->uq_ne >= uq->uq_lowat) { 218 /* 219 * process a block of entries until below high water mark 220 */ 221 return (uq->uq_ne - (uq->uq_lowat >> 1)); 222 } else if (uq->uq_flags & UQ_FASTCLIENTS) { 223 /* 224 * Let the fast acting clients through 225 */ 226 return (0); 227 } 228 if (uq->uq_flags & UQ_WAIT) { 229 uq->uq_flags &= ~UQ_WAIT; 230 cv_broadcast(&uq->uq_cv); 231 } 232 CALLB_CPR_SAFE_BEGIN(cprinfop); 233 cv_wait(&uq->uq_cv, &uq->uq_mutex); 234 CALLB_CPR_SAFE_END(cprinfop, &uq->uq_mutex); 235 goto again; 236 } 237 238 /* 239 * DELETE INODE 240 * The following routines implement the protocol for freeing the resources 241 * held by an idle and deleted inode. 
 */

/*
 * Free the resources of the deleted inode ip and drop the hold on its
 * vnode.
 *
 *	ufsvfsp		file system the inode belongs to
 *	ip		inode taken off the delete queue; its vnode hold is
 *			released with VN_RELE() on the paths shown below
 *	dolockfs	non-zero: bracket the work with ufs_lockfs_begin()
 *			and ufs_lockfs_end(); zero: skip that protocol and
 *			only set T_DONTBLOCK if it is not already set
 *
 * The inode is left alone (apart from the accounting and the VN_RELE())
 * when deletes are disallowed, when somebody else holds the vnode, or
 * when its mode is already 0.
 *
 * NOTE(review): when ufs_lockfs_begin() fails this returns without
 * VN_RELE(vp) and without adjusting delq_info -- confirm that is the
 * intended behavior.
 */
void
ufs_delete(struct ufsvfs *ufsvfsp, struct inode *ip, int dolockfs)
{
	ushort_t mode;
	struct vnode *vp = ITOV(ip);
	struct ulockfs *ulp;
	int trans_size;
	int dorwlock = ((ip->i_mode & IFMT) == IFREG);
	int issync;
	int err;
	struct inode *dp;
	struct ufs_q *delq = &ufsvfsp->vfs_delete;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	/*
	 * not on a trans device or not part of a transaction
	 */
	ASSERT(!TRANS_ISTRANS(ufsvfsp) ||
	    ((curthread->t_flag & T_DONTBLOCK) == 0));

	/*
	 * Ignore if deletes are not allowed (wlock/hlock)
	 */
	if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
		mutex_enter(&delq->uq_mutex);
		delq_info->delq_unreclaimed_blocks -= ip->i_blocks;
		delq_info->delq_unreclaimed_files--;
		mutex_exit(&delq->uq_mutex);
		VN_RELE(vp);
		return;
	}

	/*
	 * Somebody else holds the vnode, or the inode has already been
	 * torn down (mode 0): take it out of the accounting and let go.
	 */
	if ((vp->v_count > 1) || (ip->i_mode == 0)) {
		mutex_enter(&delq->uq_mutex);
		delq_info->delq_unreclaimed_blocks -= ip->i_blocks;
		delq_info->delq_unreclaimed_files--;
		mutex_exit(&delq->uq_mutex);
		VN_RELE(vp);
		return;
	}
	/*
	 * If we are called as part of setting a fs lock, then only
	 * do part of the lockfs protocol.  In other words, don't hang.
	 */
	if (dolockfs) {
		if (ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_DELETE_MASK))
			return;
	} else {
		/*
		 * check for recursive VOP call
		 */
		if (curthread->t_flag & T_DONTBLOCK) {
			ulp = NULL;
		} else {
			ulp = &ufsvfsp->vfs_ulockfs;
			curthread->t_flag |= T_DONTBLOCK;
		}
	}

	/*
	 * Hold rwlock to synchronize with (nfs) writes
	 */
	if (dorwlock)
		rw_enter(&ip->i_rwlock, RW_WRITER);

	/*
	 * Delete the attribute directory.
	 */
	if (ip->i_oeftflag != 0) {
		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
		    trans_size = (int)TOP_REMOVE_SIZE(ip));
		rw_enter(&ip->i_contents, RW_WRITER);
		err = ufs_iget(ip->i_vfs, ip->i_oeftflag,
		    &dp, CRED());
		if (err == 0) {
			rw_enter(&dp->i_rwlock, RW_WRITER);
			rw_enter(&dp->i_contents, RW_WRITER);
			dp->i_flag |= IUPD|ICHG;
			dp->i_seq++;
			TRANS_INODE(dp->i_ufsvfs, dp);
			dp->i_nlink -= 2;
			ufs_setreclaim(dp);
			/*
			 * Should get rid of any negative cache entries that
			 * might be lingering, as well as ``.'' and
			 * ``..''.  If we don't, the VN_RELE() below
			 * won't actually put dp on the delete queue
			 * and it'll hang out until someone forces it
			 * (lockfs -f, umount, ...).  The only reliable
			 * way of doing this at the moment is to call
			 * dnlc_purge_vp(ITOV(dp)), which is unacceptably
			 * slow, so we'll just note the problem in this
			 * comment for now.
			 */
			dnlc_remove(ITOV(dp), ".");
			dnlc_remove(ITOV(dp), "..");
			ITIMES_NOLOCK(dp);
			if (!TRANS_ISTRANS(ufsvfsp)) {
				ufs_iupdat(dp, I_SYNC);
			}
			rw_exit(&dp->i_contents);
			rw_exit(&dp->i_rwlock);
			VN_RELE(ITOV(dp));
		}
		/*
		 * Clear out attribute pointer
		 */
		ip->i_oeftflag = 0;
		rw_exit(&ip->i_contents);
		TRANS_END_CSYNC(ufsvfsp, err, issync,
		    TOP_REMOVE, trans_size);
		dnlc_remove(ITOV(ip), XATTR_DIR_NAME);
	}

	/*
	 * An attribute directory has its entries unlinked first.
	 */
	if ((ip->i_mode & IFMT) == IFATTRDIR) {
		ufs_attr_purge(ip);
	}

	(void) TRANS_ITRUNC(ip, (u_offset_t)0, I_FREE | I_ACCT, CRED());

	/*
	 * the inode's space has been freed; now free the inode
	 */
	if (ulp) {
		trans_size = TOP_IFREE_SIZE(ip);
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
	}
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	TRANS_INODE(ufsvfsp, ip);
	/* remember the mode for ufs_ifree() before wiping it */
	mode = ip->i_mode;
	ip->i_mode = 0;
	ip->i_rdev = 0;
	ip->i_ordev = 0;
	ip->i_flag |= IMOD;
	if (ip->i_ufs_acl) {
		(void) ufs_si_free(ip->i_ufs_acl, vp->v_vfsp, CRED());
		ip->i_ufs_acl = NULL;
		ip->i_shadow = 0;
	}

	/*
	 * This inode is torn down but still retains its identity
	 * (inode number).  It could get recycled soon so it's best
	 * to clean up the vnode just in case.
	 */
	mutex_enter(&vp->v_lock);
	vn_recycle(vp);
	mutex_exit(&vp->v_lock);

	/*
	 * free the inode
	 */
	ufs_ifree(ip, ip->i_number, mode);
	/*
	 * release quota resources; can't fail
	 */
	(void) chkiq((struct ufsvfs *)vp->v_vfsp->vfs_data,
	    /* change */ -1, ip, (uid_t)ip->i_uid, 0, CRED(),
	    (char **)NULL, (size_t *)NULL);
	dqrele(ip->i_dquot);
	ip->i_dquot = NULL;
	ip->i_flag &= ~(IDEL | IDIRECTIO);
	ip->i_cflags = 0;
	/*
	 * Without a transaction device the inode is written out
	 * synchronously here; with one, only the unreclaimed-file
	 * count is adjusted at this point.
	 */
	if (!TRANS_ISTRANS(ufsvfsp)) {
		ufs_iupdat(ip, I_SYNC);
	} else {
		mutex_enter(&delq->uq_mutex);
		delq_info->delq_unreclaimed_files--;
		mutex_exit(&delq->uq_mutex);
	}
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (dorwlock)
		rw_exit(&ip->i_rwlock);
	VN_RELE(vp);

	/*
	 * End of transaction
	 */
	if (ulp) {
		TRANS_END_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
		if (dolockfs)
			ufs_lockfs_end(ulp);
		else
			curthread->t_flag &= ~T_DONTBLOCK;
	}
}

/*
 * Create the delete thread and init the delq_info for this fs
 *
 * NOTE(review): the code visible here only initializes the queue
 * (via ufs_thread_init()), zeroes the delq_info and sets up its
 * condition variable; no thread is created in this routine.
 */
void
ufs_delete_init(struct ufsvfs *ufsvfsp, int lowat)
{
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	ufs_thread_init(&ufsvfsp->vfs_delete, lowat);
	(void) memset((void *)delq_info, 0, sizeof (*delq_info));
	cv_init(&delq_info->delq_fast_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * thread that frees up deleted inodes
 *
 * Loops forever: ufs_thread_run() decides when there is work (and is
 * where the thread exits when asked to), one inode at a time is taken
 * off the head of the delete queue and handed to ufs_delete(), and any
 * waiting fast clients are released after each pass.
 */
void
ufs_thread_delete(struct vfs *vfsp)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	struct ufs_q *uq = &ufsvfsp->vfs_delete;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
	struct inode *ip;
	long ne;
	callb_cpr_t cprinfo;

	CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
	    "ufsdelete");

	mutex_enter(&uq->uq_mutex);
again:
	/*
	 * Sleep until there is work to do.  Only do one entry at
	 * a time, to reduce the wait time for checking for a suspend
	 * or fast-client request.  The ?: is for pedantic portability.
	 */
	ne = ufs_thread_run(uq, &cprinfo) ? 1 : 0;

	/*
	 * process an entry, if there are any
	 */
	if (ne && (ip = uq->uq_ihead)) {
		/*
		 * process first entry on queue.  Assumed conditions are:
		 *	ip is held (v_count >= 1)
		 *	ip is referenced (i_flag & IREF)
		 *	ip is free (i_nlink <= 0)
		 */
		/* unlink ip from the circular list and self-link it */
		if ((uq->uq_ihead = ip->i_freef) == ip)
			uq->uq_ihead = NULL;
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		uq->uq_ne--;
		/* the queue mutex is not held across ufs_delete() */
		mutex_exit(&uq->uq_mutex);
		ufs_delete(ufsvfsp, ip, 1);
		mutex_enter(&uq->uq_mutex);
	}

	/*
	 * If there are any fast clients, let all of them through.
	 * Mainly intended for statvfs(), which doesn't need to do
	 * anything except look at the number of bytes/inodes that
	 * are in the queue.
	 */
	if (uq->uq_flags & UQ_FASTCLIENTS) {
		uq->uq_flags &= ~UQ_FASTCLIENTS;
		/*
		 * Give clients a chance.  The lock exit/entry
		 * allows waiting statvfs threads through.
		 */
		cv_broadcast(&delq_info->delq_fast_cv);
		mutex_exit(&uq->uq_mutex);
		mutex_enter(&uq->uq_mutex);
	}
	goto again;
}

/*
 * drain ne entries off the delete queue.
As new queue entries may 513 * be added while we're working, ne is interpreted as follows: 514 * 515 * ne > 0 => remove up to ne entries 516 * ne == 0 => remove all entries currently on the queue 517 * ne == -1 => remove entries until the queue is empty 518 */ 519 void 520 ufs_delete_drain(struct vfs *vfsp, int ne, int dolockfs) 521 { 522 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 523 struct ufs_q *uq; 524 struct inode *ip; 525 int drain_cnt = 0; 526 int done; 527 528 /* 529 * if forcibly unmounted; ignore 530 */ 531 if (ufsvfsp == NULL) 532 return; 533 534 uq = &ufsvfsp->vfs_delete; 535 mutex_enter(&uq->uq_mutex); 536 if (ne == 0) 537 drain_cnt = uq->uq_ne; 538 else if (ne > 0) 539 drain_cnt = ne; 540 541 /* 542 * process up to ne entries 543 */ 544 545 done = 0; 546 while (!done && (ip = uq->uq_ihead)) { 547 if (ne != -1) 548 drain_cnt--; 549 if (ne != -1 && drain_cnt == 0) 550 done = 1; 551 if ((uq->uq_ihead = ip->i_freef) == ip) 552 uq->uq_ihead = NULL; 553 ip->i_freef->i_freeb = ip->i_freeb; 554 ip->i_freeb->i_freef = ip->i_freef; 555 ip->i_freef = ip; 556 ip->i_freeb = ip; 557 uq->uq_ne--; 558 mutex_exit(&uq->uq_mutex); 559 ufs_delete(ufsvfsp, ip, dolockfs); 560 mutex_enter(&uq->uq_mutex); 561 } 562 mutex_exit(&uq->uq_mutex); 563 } 564 565 void 566 ufs_sync_with_thread(struct ufs_q *uq) 567 { 568 mutex_enter(&uq->uq_mutex); 569 uq->uq_flags |= UQ_WAIT; 570 /* 571 * Someone other than the thread we're interested in might 572 * send a signal, so make sure the thread's given an 573 * acknowledgement. 574 */ 575 while ((uq->uq_threadp != NULL) && (uq->uq_flags & UQ_WAIT)) { 576 cv_broadcast(&uq->uq_cv); 577 cv_wait(&uq->uq_cv, &uq->uq_mutex); 578 } 579 mutex_exit(&uq->uq_mutex); 580 } 581 582 /* 583 * Get rid of everything that's currently in the delete queue, 584 * plus whatever the delete thread is working on at the moment. 
 *
 * This ability is required for providing true POSIX semantics
 * regarding close(2), unlink(2), etc, even when logging is enabled.
 * The standard requires that the released space be immediately
 * observable (statvfs(2)) and allocatable (e.g., write(2)).
 *
 * The queue is drained in the calling thread (ne == 0, i.e. what is
 * queued right now), then we wait for the delete thread to acknowledge
 * us, and finally a synchronous commit transaction is issued.
 */
void
ufs_delete_drain_wait(struct ufsvfs *ufsvfsp, int dolockfs)
{
	struct ufs_q *uq = &ufsvfsp->vfs_delete;
	int error;

	(void) ufs_delete_drain(ufsvfsp->vfs_vfs, 0, dolockfs);
	ufs_sync_with_thread(uq);

	/*
	 * Commit any outstanding transactions to make sure
	 * any canceled freed blocks are available for allocation.
	 */
	curthread->t_flag |= T_DONTBLOCK;
	TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE, error);
	if (!error) {
		TRANS_END_SYNC(ufsvfsp, error, TOP_COMMIT_UPDATE,
		    TOP_COMMIT_SIZE);
	}
	curthread->t_flag &= ~T_DONTBLOCK;
}

/*
 * Adjust the resource usage in a struct statvfs based on
 * what's in the delete queue.
 *
 * We do not consider the impact of ACLs or extended attributes
 * that may be deleted as a side-effect of deleting a file.
 * Those are metadata, and their sizes aren't reflected in the
 * sizes returned by stat(), so this is not a problem.
 *
 * On return sp->f_bfree and sp->f_ffree have been increased by the
 * blocks and files that are still accounted as unreclaimed.
 */
void
ufs_delete_adjust_stats(struct ufsvfs *ufsvfsp, struct statvfs64 *sp)
{
	struct ufs_q *uq = &ufsvfsp->vfs_delete;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	/*
	 * We'll get signalled when it's our turn.  However, if there's
	 * nothing going on, there's no point in waking up the delete
	 * thread and waiting for it to tell us to continue.
	 */
	mutex_enter(&uq->uq_mutex);

	if ((uq->uq_flags & UQ_FASTCLIENTS) || (uq->uq_ne != 0)) {
		/* register as a fast client and wait to be let through */
		uq->uq_flags |= UQ_FASTCLIENTS;
		cv_broadcast(&uq->uq_cv);
		cv_wait(&delq_info->delq_fast_cv, &uq->uq_mutex);
	}

	/*
	 * The blocks accounted for in the delete queue info are
	 * counted in DEV_BSIZE chunks, but ufs_statvfs counts in
	 * filesystem fragments, so a conversion is required here.
	 */
	sp->f_bfree += dbtofsb(ufsvfsp->vfs_fs,
	    delq_info->delq_unreclaimed_blocks);
	sp->f_ffree += delq_info->delq_unreclaimed_files;
	mutex_exit(&uq->uq_mutex);
}

/*
 * IDLE INODE
 * The following routines implement the protocol for maintaining an
 * LRU list of idle inodes and for moving the idle inodes to the
 * reuse list when the number of allocated inodes exceeds the user
 * tunable high-water mark (ufs_ninode).
 */

/*
 * clean an idle inode and move it to the reuse list
 *
 * The caller passes in an inode whose vnode it holds; that hold is
 * consumed here, either by the v_count-- plus VN_RELE() on the "still
 * in use" path or by the v_count-- on the path that ends in
 * ufs_free_inode().
 */
static void
ufs_idle_free(struct inode *ip)
{
	int pages;
	int hno;
	kmutex_t *ihm;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct vnode *vp = ITOV(ip);

	/*
	 * inode is held
	 */

	/*
	 * remember `pages' for stats below
	 */
	pages = (ip->i_mode && vn_has_cached_data(vp) && vp->v_type != VCHR);

	/*
	 * start the dirty pages to disk and then invalidate them
	 * unless the inode is invalid (ISTALE)
	 */
	if ((ip->i_flag & ISTALE) == 0) {
		(void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_FREE);
		(void) TRANS_SYNCIP(ip,
		    (TRANS_ISERROR(ufsvfsp)) ?
		    B_INVAL | B_FORCE : B_INVAL,
		    I_ASYNC, TOP_SYNCIP_FREE);
	}

	/*
	 * wait for any current ufs_iget to finish and block future ufs_igets
	 */
	ASSERT(ip->i_number != 0);
	hno = INOHASH(ip->i_number);
	ihm = &ih_lock[hno];
	mutex_enter(ihm);

	/*
	 * It must be guaranteed that v_count >= 2, otherwise
	 * something must be wrong with this vnode already.
	 * That is why we use v_count-- instead of VN_RELE().
	 * Acquire the vnode lock in case another thread is in
	 * VN_RELE().
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count < 2)
		cmn_err(CE_PANIC,
		    "ufs_idle_free: vnode ref count is less than 2");

	vp->v_count--;
	if ((vp->v_type != VCHR && vn_has_cached_data(vp)) ||
	    vp->v_count != 1 ||
	    ip->i_flag & (IMOD|IMODACC|IACC|ICHG|IUPD|IATTCHG)) {
		/*
		 * Another thread has referenced this inode while
		 * we are trying to free it.  Call VN_RELE() to
		 * release our reference.
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(ihm);
		VN_RELE(vp);
	} else {
		/*
		 * The inode is currently unreferenced and can not
		 * acquire further references because it has no pages
		 * and the hash is locked.  Inodes acquire references
		 * via the hash list or via their pages.
		 */

		mutex_exit(&vp->v_lock);

		/*
		 * remove it from the cache
		 */
		remque(ip);
		mutex_exit(ihm);
		/*
		 * Stale inodes have no valid ufsvfs
		 */
		if ((ip->i_flag & ISTALE) == 0 && ip->i_dquot) {
			TRANS_DQRELE(ufsvfsp, ip->i_dquot);
			ip->i_dquot = NULL;
		}
		ufs_si_del(ip);
		if (pages) {
			CPU_STATS_ADDQ(CPU, sys, ufsipage, 1);
		} else {
			CPU_STATS_ADDQ(CPU, sys, ufsinopage, 1);
		}
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));

		/*
		 * We had better not have a vnode reference count > 1
		 * at this point, if we do then something is broken as
		 * this inode/vnode acquired a reference underneath of us.
		 */
		ASSERT(vp->v_count == 1);

		ufs_free_inode(ip);
	}
}

/*
 * this thread processes the global idle queue
 */
iqhead_t *ufs_junk_iq;		/* hash queue heads for the "junk" inodes */
iqhead_t *ufs_useful_iq;	/* hash queue heads for the "useful" inodes */
int ufs_njunk_iq = 0;
int ufs_nuseful_iq = 0;
int ufs_niqhash;		/* hash queue heads per array; power of 2 */
int ufs_iqhashmask;		/* ufs_niqhash - 1 */
struct ufs_q ufs_idle_q;

/*
 * Body of the idle thread.  Sizes and allocates the two hash-queue
 * head arrays from ufs_idle_q.uq_lowat, makes every head an empty
 * circular list, and then loops forever giving back half of the idle
 * queue each time it runs.
 */
void
ufs_thread_idle(void)
{
	callb_cpr_t cprinfo;
	int i;
	int ne;

	ufs_niqhash = (ufs_idle_q.uq_lowat >> 1) / IQHASHQLEN;
	ufs_niqhash = 1 << highbit(ufs_niqhash); /* round up to power of 2 */
	ufs_iqhashmask = ufs_niqhash - 1;
	ufs_junk_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_junk_iq),
	    KM_SLEEP);
	ufs_useful_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_useful_iq),
	    KM_SLEEP);

	/* Initialize hash queue headers */
	for (i = 0; i < ufs_niqhash; i++) {
		ufs_junk_iq[i].i_freef = (inode_t *)&ufs_junk_iq[i];
		ufs_junk_iq[i].i_freeb = (inode_t *)&ufs_junk_iq[i];
		ufs_useful_iq[i].i_freef = (inode_t *)&ufs_useful_iq[i];
		ufs_useful_iq[i].i_freeb = (inode_t *)&ufs_useful_iq[i];
	}

	CALLB_CPR_INIT(&cprinfo, &ufs_idle_q.uq_mutex, callb_generic_cpr,
	    "ufsidle");
again:
	/*
	 * Whenever the idle thread is awakened, it repeatedly gives
	 * back half of the idle queue until the idle queue falls
	 * below lowat.
	 */
	mutex_enter(&ufs_idle_q.uq_mutex);
	if (ufs_idle_q.uq_ne < ufs_idle_q.uq_lowat) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&ufs_idle_q.uq_cv, &ufs_idle_q.uq_mutex);
		CALLB_CPR_SAFE_END(&cprinfo, &ufs_idle_q.uq_mutex);
	}
	mutex_exit(&ufs_idle_q.uq_mutex);

	/*
	 * Give back 1/2 of the idle queue
	 */
	/* uq_ne is sampled here after the queue mutex has been dropped */
	ne = ufs_idle_q.uq_ne >> 1;
	ins.in_tidles.value.ul += ne;
	ufs_idle_some(ne);
	goto again;
}

/*
 * Reclaim callback for ufs inode cache.
 * Invoked by the kernel memory allocator when memory gets tight.
829 */ 830 /*ARGSUSED*/ 831 void 832 ufs_inode_cache_reclaim(void *cdrarg) 833 { 834 /* 835 * If we are low on memory and the idle queue is over its 836 * halfway mark, then free 50% of the idle q 837 * 838 * We don't free all of the idle inodes because the inodes 839 * for popular NFS files may have been kicked from the dnlc. 840 * The inodes for these files will end up on the idle queue 841 * after every NFS access. 842 * 843 * If we repeatedly push them from the idle queue then 844 * NFS users may be unhappy as an extra buf cache operation 845 * is incurred for every NFS operation to these files. 846 * 847 * It's not common, but I have seen it happen. 848 * 849 */ 850 if (ufs_idle_q.uq_ne < (ufs_idle_q.uq_lowat >> 1)) 851 return; 852 mutex_enter(&ufs_idle_q.uq_mutex); 853 cv_broadcast(&ufs_idle_q.uq_cv); 854 mutex_exit(&ufs_idle_q.uq_mutex); 855 } 856 857 /* 858 * Free up some idle inodes 859 */ 860 void 861 ufs_idle_some(int ne) 862 { 863 int i; 864 struct inode *ip; 865 struct vnode *vp; 866 static int junk_rotor = 0; 867 static int useful_rotor = 0; 868 869 for (i = 0; i < ne; ++i) { 870 mutex_enter(&ufs_idle_q.uq_mutex); 871 872 if (ufs_njunk_iq) { 873 while (ufs_junk_iq[junk_rotor].i_freef == 874 (inode_t *)&ufs_junk_iq[junk_rotor]) { 875 junk_rotor = IQNEXT(junk_rotor); 876 } 877 ip = ufs_junk_iq[junk_rotor].i_freef; 878 ASSERT(ip->i_flag & IJUNKIQ); 879 } else if (ufs_nuseful_iq) { 880 while (ufs_useful_iq[useful_rotor].i_freef == 881 (inode_t *)&ufs_useful_iq[useful_rotor]) { 882 useful_rotor = IQNEXT(useful_rotor); 883 } 884 ip = ufs_useful_iq[useful_rotor].i_freef; 885 ASSERT(!(ip->i_flag & IJUNKIQ)); 886 } else { 887 mutex_exit(&ufs_idle_q.uq_mutex); 888 return; 889 } 890 891 /* 892 * emulate ufs_iget 893 */ 894 vp = ITOV(ip); 895 VN_HOLD(vp); 896 mutex_exit(&ufs_idle_q.uq_mutex); 897 rw_enter(&ip->i_contents, RW_WRITER); 898 /* 899 * VN_RELE should not be called if 900 * ufs_rmidle returns true, as it will 901 * effectively be done in ufs_idle_free. 
902 */ 903 if (ufs_rmidle(ip)) { 904 rw_exit(&ip->i_contents); 905 ufs_idle_free(ip); 906 } else { 907 rw_exit(&ip->i_contents); 908 VN_RELE(vp); 909 } 910 } 911 } 912 913 /* 914 * drain entries for vfsp from the idle queue 915 * vfsp == NULL means drain the entire thing 916 */ 917 void 918 ufs_idle_drain(struct vfs *vfsp) 919 { 920 struct inode *ip, *nip; 921 struct inode *ianchor = NULL; 922 int i; 923 924 mutex_enter(&ufs_idle_q.uq_mutex); 925 if (ufs_njunk_iq) { 926 /* for each hash q */ 927 for (i = 0; i < ufs_niqhash; i++) { 928 /* search down the hash q */ 929 for (ip = ufs_junk_iq[i].i_freef; 930 ip != (inode_t *)&ufs_junk_iq[i]; 931 ip = ip->i_freef) { 932 if (ip->i_vfs == vfsp || vfsp == NULL) { 933 /* found a matching entry */ 934 VN_HOLD(ITOV(ip)); 935 mutex_exit(&ufs_idle_q.uq_mutex); 936 rw_enter(&ip->i_contents, RW_WRITER); 937 /* 938 * See comments in ufs_idle_some() 939 * as we will call ufs_idle_free() 940 * after scanning both queues. 941 */ 942 if (ufs_rmidle(ip)) { 943 rw_exit(&ip->i_contents); 944 ip->i_freef = ianchor; 945 ianchor = ip; 946 } else { 947 rw_exit(&ip->i_contents); 948 VN_RELE(ITOV(ip)); 949 } 950 /* restart this hash q */ 951 ip = (inode_t *)&ufs_junk_iq[i]; 952 mutex_enter(&ufs_idle_q.uq_mutex); 953 } 954 } 955 } 956 } 957 if (ufs_nuseful_iq) { 958 /* for each hash q */ 959 for (i = 0; i < ufs_niqhash; i++) { 960 /* search down the hash q */ 961 for (ip = ufs_useful_iq[i].i_freef; 962 ip != (inode_t *)&ufs_useful_iq[i]; 963 ip = ip->i_freef) { 964 if (ip->i_vfs == vfsp || vfsp == NULL) { 965 /* found a matching entry */ 966 VN_HOLD(ITOV(ip)); 967 mutex_exit(&ufs_idle_q.uq_mutex); 968 rw_enter(&ip->i_contents, RW_WRITER); 969 /* 970 * See comments in ufs_idle_some() 971 * as we will call ufs_idle_free() 972 * after scanning both queues. 
973 */ 974 if (ufs_rmidle(ip)) { 975 rw_exit(&ip->i_contents); 976 ip->i_freef = ianchor; 977 ianchor = ip; 978 } else { 979 rw_exit(&ip->i_contents); 980 VN_RELE(ITOV(ip)); 981 } 982 /* restart this hash q */ 983 ip = (inode_t *)&ufs_useful_iq[i]; 984 mutex_enter(&ufs_idle_q.uq_mutex); 985 } 986 } 987 } 988 } 989 990 mutex_exit(&ufs_idle_q.uq_mutex); 991 /* no more matching entries, release those we have found (if any) */ 992 for (ip = ianchor; ip; ip = nip) { 993 nip = ip->i_freef; 994 ip->i_freef = ip; 995 ufs_idle_free(ip); 996 } 997 } 998 999 /* 1000 * RECLAIM DELETED INODES 1001 * The following thread scans the file system once looking for deleted files 1002 */ 1003 void 1004 ufs_thread_reclaim(struct vfs *vfsp) 1005 { 1006 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 1007 struct ufs_q *uq = &ufsvfsp->vfs_reclaim; 1008 struct fs *fs = ufsvfsp->vfs_fs; 1009 struct buf *bp = 0; 1010 int err = 0; 1011 daddr_t bno; 1012 ino_t ino; 1013 struct dinode *dp; 1014 struct inode *ip; 1015 callb_cpr_t cprinfo; 1016 1017 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr, 1018 "ufsreclaim"); 1019 1020 /* 1021 * mount decided that we don't need a reclaim thread 1022 */ 1023 if ((fs->fs_reclaim & FS_RECLAIMING) == 0) 1024 err++; 1025 1026 /* 1027 * don't reclaim if readonly 1028 */ 1029 if (fs->fs_ronly) 1030 err++; 1031 1032 for (ino = 0; ino < (fs->fs_ncg * fs->fs_ipg) && !err; ++ino) { 1033 1034 /* 1035 * Check whether we are the target of another 1036 * thread having called ufs_thread_exit() or 1037 * ufs_thread_suspend(). 1038 */ 1039 mutex_enter(&uq->uq_mutex); 1040 again: 1041 if (uq->uq_flags & UQ_EXIT) { 1042 err++; 1043 mutex_exit(&uq->uq_mutex); 1044 break; 1045 } else if (uq->uq_flags & UQ_SUSPEND) { 1046 uq->uq_flags |= UQ_SUSPENDED; 1047 /* 1048 * Release the buf before we cv_wait() 1049 * otherwise we may deadlock with the 1050 * thread that called ufs_thread_suspend(). 
1051 */ 1052 if (bp) { 1053 brelse(bp); 1054 bp = 0; 1055 } 1056 if (uq->uq_flags & UQ_WAIT) { 1057 uq->uq_flags &= ~UQ_WAIT; 1058 cv_broadcast(&uq->uq_cv); 1059 } 1060 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1061 cv_wait(&uq->uq_cv, &uq->uq_mutex); 1062 CALLB_CPR_SAFE_END(&cprinfo, &uq->uq_mutex); 1063 goto again; 1064 } 1065 mutex_exit(&uq->uq_mutex); 1066 1067 /* 1068 * if we don't already have the buf; get it 1069 */ 1070 bno = fsbtodb(fs, itod(fs, ino)); 1071 if ((bp == 0) || (bp->b_blkno != bno)) { 1072 if (bp) 1073 brelse(bp); 1074 bp = UFS_BREAD(ufsvfsp, 1075 ufsvfsp->vfs_dev, bno, fs->fs_bsize); 1076 bp->b_flags |= B_AGE; 1077 } 1078 if (bp->b_flags & B_ERROR) { 1079 err++; 1080 continue; 1081 } 1082 /* 1083 * nlink <= 0 and mode != 0 means deleted 1084 */ 1085 dp = (struct dinode *)bp->b_un.b_addr + itoo(fs, ino); 1086 if ((dp->di_nlink <= 0) && (dp->di_mode != 0)) { 1087 /* 1088 * can't hold the buf (deadlock) 1089 */ 1090 brelse(bp); 1091 bp = 0; 1092 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1093 /* 1094 * iget/iput sequence will put inode on ifree 1095 * thread queue if it is idle. 
This is a nop 1096 * for busy (open, deleted) inodes 1097 */ 1098 if (ufs_iget(vfsp, ino, &ip, CRED())) 1099 err++; 1100 else 1101 VN_RELE(ITOV(ip)); 1102 rw_exit(&ufsvfsp->vfs_dqrwlock); 1103 } 1104 } 1105 1106 if (bp) 1107 brelse(bp); 1108 if (!err) { 1109 /* 1110 * reset the reclaiming-bit 1111 */ 1112 mutex_enter(&ufsvfsp->vfs_lock); 1113 fs->fs_reclaim &= ~FS_RECLAIMING; 1114 mutex_exit(&ufsvfsp->vfs_lock); 1115 TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_RECLAIM); 1116 } 1117 1118 /* 1119 * exit the reclaim thread 1120 */ 1121 mutex_enter(&uq->uq_mutex); 1122 uq->uq_threadp = NULL; 1123 uq->uq_flags &= ~UQ_WAIT; 1124 cv_broadcast(&uq->uq_cv); 1125 CALLB_CPR_EXIT(&cprinfo); 1126 thread_exit(); 1127 } 1128 /* 1129 * HLOCK FILE SYSTEM 1130 * hlock the file system's whose logs have device errors 1131 */ 1132 struct ufs_q ufs_hlock; 1133 /*ARGSUSED*/ 1134 void 1135 ufs_thread_hlock(void *ignore) 1136 { 1137 int retry; 1138 callb_cpr_t cprinfo; 1139 1140 CALLB_CPR_INIT(&cprinfo, &ufs_hlock.uq_mutex, callb_generic_cpr, 1141 "ufshlock"); 1142 1143 for (;;) { 1144 /* 1145 * sleep until there is work to do 1146 */ 1147 mutex_enter(&ufs_hlock.uq_mutex); 1148 (void) ufs_thread_run(&ufs_hlock, &cprinfo); 1149 ufs_hlock.uq_ne = 0; 1150 mutex_exit(&ufs_hlock.uq_mutex); 1151 /* 1152 * hlock the error'ed fs's 1153 * retry after a bit if another app is doing lockfs stuff 1154 */ 1155 do { 1156 retry = ufs_trans_hlock(); 1157 if (retry) { 1158 mutex_enter(&ufs_hlock.uq_mutex); 1159 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1160 (void) cv_timedwait(&ufs_hlock.uq_cv, 1161 &ufs_hlock.uq_mutex, 1162 lbolt + hz); 1163 CALLB_CPR_SAFE_END(&cprinfo, 1164 &ufs_hlock.uq_mutex); 1165 mutex_exit(&ufs_hlock.uq_mutex); 1166 } 1167 } while (retry); 1168 } 1169 } 1170 1171 static void 1172 ufs_attr_purge(struct inode *dp) 1173 { 1174 int err; 1175 int error; 1176 off_t dirsize; /* size of the directory */ 1177 off_t offset; /* offset in the directory */ 1178 int entryoffsetinblk; /* offset of ep in fbp's buffer 
*/ 1179 struct inode *tp; 1180 struct fbuf *fbp; /* pointer to directory block */ 1181 struct direct *ep; /* directory entry */ 1182 int trans_size; 1183 int issync; 1184 struct ufsvfs *ufsvfsp = dp->i_ufsvfs; 1185 1186 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1187 1188 fbp = NULL; 1189 dirsize = roundup(dp->i_size, DIRBLKSIZ); 1190 offset = 0; 1191 entryoffsetinblk = 0; 1192 1193 /* 1194 * Purge directory cache 1195 */ 1196 1197 dnlc_dir_purge(&dp->i_danchor); 1198 1199 while (offset < dirsize) { 1200 /* 1201 * If offset is on a block boundary, 1202 * read the next directory block. 1203 * Release previous if it exists. 1204 */ 1205 if (blkoff(dp->i_fs, offset) == 0) { 1206 if (fbp != NULL) { 1207 fbrelse(fbp, S_OTHER); 1208 } 1209 1210 err = blkatoff(dp, offset, (char **)0, &fbp); 1211 if (err) { 1212 goto out; 1213 } 1214 entryoffsetinblk = 0; 1215 } 1216 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk); 1217 if (ep->d_ino == 0 || (ep->d_name[0] == '.' && 1218 ep->d_name[1] == '\0') || 1219 (ep->d_name[0] == '.' && ep->d_name[1] == '.' && 1220 ep->d_name[2] == '\0')) { 1221 1222 entryoffsetinblk += ep->d_reclen; 1223 1224 } else { 1225 1226 if ((err = ufs_iget(dp->i_vfs, ep->d_ino, 1227 &tp, CRED())) != 0) { 1228 goto out; 1229 } 1230 1231 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 1232 trans_size = (int)TOP_REMOVE_SIZE(tp)); 1233 1234 /* 1235 * Delete inode. 1236 */ 1237 1238 dnlc_remove(ITOV(dp), ep->d_name); 1239 1240 rw_enter(&tp->i_contents, RW_WRITER); 1241 tp->i_flag |= ICHG; 1242 tp->i_seq++; 1243 TRANS_INODE(tp->i_ufsvfs, tp); 1244 tp->i_nlink--; 1245 ufs_setreclaim(tp); 1246 ITIMES_NOLOCK(tp); 1247 rw_exit(&tp->i_contents); 1248 1249 VN_RELE(ITOV(tp)); 1250 entryoffsetinblk += ep->d_reclen; 1251 TRANS_END_CSYNC(ufsvfsp, error, 1252 issync, TOP_REMOVE, trans_size); 1253 1254 } 1255 offset += ep->d_reclen; 1256 } 1257 1258 if (fbp) { 1259 fbrelse(fbp, S_OTHER); 1260 } 1261 1262 out: 1263 rw_exit(&ufsvfsp->vfs_dqrwlock); 1264 } 1265