1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 #pragma ident "%Z%%M% %I% %E% SMI" 35 36 #include <sys/types.h> 37 #include <sys/systm.h> 38 #include <sys/errno.h> 39 #include <sys/kmem.h> 40 #include <sys/buf.h> 41 #include <sys/vnode.h> 42 #include <sys/vfs.h> 43 #include <sys/user.h> 44 #include <sys/callb.h> 45 #include <sys/cpuvar.h> 46 #include <sys/fs/ufs_inode.h> 47 #include <sys/fs/ufs_log.h> 48 #include <sys/fs/ufs_trans.h> 49 #include <sys/fs/ufs_acl.h> 50 #include <sys/fs/ufs_bio.h> 51 #include <sys/fs/ufs_fsdir.h> 52 #include <sys/debug.h> 53 #include <sys/cmn_err.h> 54 #include <sys/sysmacros.h> 55 56 extern pri_t minclsyspri; 57 extern int hash2ints(); 58 extern struct kmem_cache *inode_cache; /* cache of free inodes */ 59 extern int ufs_idle_waiters; 60 extern struct instats ins; 61 62 static void ufs_attr_purge(struct inode *); 63 64 /* 65 * initialize a thread's queue struct 66 */ 67 void 68 ufs_thread_init(struct ufs_q *uq, int lowat) 69 { 70 bzero((caddr_t)uq, sizeof (*uq)); 71 cv_init(&uq->uq_cv, NULL, CV_DEFAULT, NULL); 72 mutex_init(&uq->uq_mutex, NULL, MUTEX_DEFAULT, NULL); 73 uq->uq_lowat = lowat; 74 uq->uq_hiwat = 2 * lowat; 75 uq->uq_threadp = NULL; 76 } 77 78 /* 79 * start a thread for a queue (assumes success) 80 */ 81 void 82 ufs_thread_start(struct ufs_q *uq, void (*func)(), struct vfs *vfsp) 83 { 84 mutex_enter(&uq->uq_mutex); 85 if (uq->uq_threadp == NULL) { 86 uq->uq_threadp = thread_create(NULL, 0, func, vfsp, 0, &p0, 87 TS_RUN, minclsyspri); 88 uq->uq_flags = 0; 89 } 90 mutex_exit(&uq->uq_mutex); 91 } 92 93 /* 94 * wait for the thread to exit 95 */ 96 void 97 ufs_thread_exit(struct ufs_q *uq) 98 { 99 kt_did_t ufs_thread_did = 0; 100 101 mutex_enter(&uq->uq_mutex); 102 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED); 103 if (uq->uq_threadp != NULL) { 104 ufs_thread_did = uq->uq_threadp->t_did; 105 uq->uq_flags |= (UQ_EXIT|UQ_WAIT); 106 cv_broadcast(&uq->uq_cv); 107 } 108 mutex_exit(&uq->uq_mutex); 109 110 /* 111 * It's safe to call thread_join() with an already-gone 112 * t_did, but we have to obtain it before the kernel 113 * thread structure is freed. We do so above under the 114 * protection of the uq_mutex when we're sure the thread 115 * still exists and it's save to de-reference it. 116 * We also have to check if ufs_thread_did is != 0 117 * before calling thread_join() since thread 0 in the system 118 * gets a t_did of 0. 119 */ 120 if (ufs_thread_did) 121 thread_join(ufs_thread_did); 122 } 123 124 /* 125 * wait for a thread to suspend itself on the caller's behalf 126 * the caller is responsible for continuing the thread 127 */ 128 void 129 ufs_thread_suspend(struct ufs_q *uq) 130 { 131 mutex_enter(&uq->uq_mutex); 132 if (uq->uq_threadp != NULL) { 133 /* 134 * wait while another thread is suspending this thread. 135 * no need to do a cv_broadcast(), as whoever suspended 136 * the thread must continue it at some point. 137 */ 138 while ((uq->uq_flags & UQ_SUSPEND) && 139 (uq->uq_threadp != NULL)) { 140 /* 141 * We can't use cv_signal() because if our 142 * signal doesn't happen to hit the desired 143 * thread but instead some other waiter like 144 * ourselves, we'll wait forever for a 145 * response. Well, at least an indeterminate 146 * amount of time until we just happen to get 147 * lucky from whomever did get signalled doing 148 * a cv_signal() of their own. This is an 149 * unfortunate performance lossage. 150 */ 151 uq->uq_flags |= UQ_WAIT; 152 cv_wait(&uq->uq_cv, &uq->uq_mutex); 153 } 154 155 /* 156 * wait for the thread to suspend itself 157 */ 158 uq->uq_flags |= UQ_SUSPEND; 159 while (((uq->uq_flags & UQ_SUSPENDED) == 0) && 160 (uq->uq_threadp != NULL)) { 161 uq->uq_flags |= UQ_WAIT; 162 cv_broadcast(&uq->uq_cv); 163 cv_wait(&uq->uq_cv, &uq->uq_mutex); 164 } 165 } 166 mutex_exit(&uq->uq_mutex); 167 } 168 169 /* 170 * allow a thread to continue from a ufs_thread_suspend() 171 * This thread must be the same as the thread that called 172 * ufs_thread_suspend. 173 */ 174 void 175 ufs_thread_continue(struct ufs_q *uq) 176 { 177 mutex_enter(&uq->uq_mutex); 178 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED); 179 cv_broadcast(&uq->uq_cv); 180 mutex_exit(&uq->uq_mutex); 181 } 182 183 /* 184 * some common code for managing a threads execution 185 * uq is locked at entry and return 186 * may sleep 187 * may exit 188 */ 189 /* 190 * Kind of a hack passing in the callb_cpr_t * here. 191 * It should really be part of the ufs_q structure. 192 * I did not put it in there because we are already in beta 193 * and I was concerned that changing ufs_inode.h to include 194 * callb.h might break something. 195 */ 196 int 197 ufs_thread_run(struct ufs_q *uq, callb_cpr_t *cprinfop) 198 { 199 again: 200 ASSERT(uq->uq_ne >= 0); 201 202 if (uq->uq_flags & UQ_SUSPEND) { 203 uq->uq_flags |= UQ_SUSPENDED; 204 } else if (uq->uq_flags & UQ_EXIT) { 205 /* 206 * exiting; empty the queue (may infinite loop) 207 */ 208 if (uq->uq_ne) 209 return (uq->uq_ne); 210 uq->uq_threadp = NULL; 211 if (uq->uq_flags & UQ_WAIT) { 212 cv_broadcast(&uq->uq_cv); 213 } 214 uq->uq_flags &= ~(UQ_EXIT | UQ_WAIT); 215 CALLB_CPR_EXIT(cprinfop); 216 thread_exit(); 217 } else if (uq->uq_ne >= uq->uq_lowat) { 218 /* 219 * process a block of entries until below high water mark 220 */ 221 return (uq->uq_ne - (uq->uq_lowat >> 1)); 222 } else if (uq->uq_flags & UQ_FASTCLIENTS) { 223 /* 224 * Let the fast acting clients through 225 */ 226 return (0); 227 } 228 if (uq->uq_flags & UQ_WAIT) { 229 uq->uq_flags &= ~UQ_WAIT; 230 cv_broadcast(&uq->uq_cv); 231 } 232 CALLB_CPR_SAFE_BEGIN(cprinfop); 233 cv_wait(&uq->uq_cv, &uq->uq_mutex); 234 CALLB_CPR_SAFE_END(cprinfop, &uq->uq_mutex); 235 goto again; 236 } 237 238 /* 239 * DELETE INODE 240 * The following routines implement the protocol for freeing the resources 241 * held by an idle and deleted inode. 242 */ 243 void 244 ufs_delete(struct ufsvfs *ufsvfsp, struct inode *ip, int dolockfs) 245 { 246 ushort_t mode; 247 struct vnode *vp = ITOV(ip); 248 struct ulockfs *ulp; 249 int trans_size; 250 int dorwlock = ((ip->i_mode & IFMT) == IFREG); 251 int issync; 252 int err; 253 struct inode *dp; 254 struct ufs_q *delq = &ufsvfsp->vfs_delete; 255 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 256 257 /* 258 * not on a trans device or not part of a transaction 259 */ 260 ASSERT(!TRANS_ISTRANS(ufsvfsp) || 261 ((curthread->t_flag & T_DONTBLOCK) == 0)); 262 263 /* 264 * Ignore if deletes are not allowed (wlock/hlock) 265 */ 266 if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) { 267 VN_RELE(vp); 268 return; 269 } 270 271 if ((vp->v_count > 1) || (ip->i_mode == 0)) { 272 VN_RELE(vp); 273 return; 274 } 275 /* 276 * If we are called as part of setting a fs lock, then only 277 * do part of the lockfs protocol. In other words, don't hang. 278 */ 279 if (dolockfs) { 280 if (ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_DELETE_MASK)) 281 return; 282 } else { 283 /* 284 * check for recursive VOP call 285 */ 286 if (curthread->t_flag & T_DONTBLOCK) { 287 ulp = NULL; 288 } else { 289 ulp = &ufsvfsp->vfs_ulockfs; 290 curthread->t_flag |= T_DONTBLOCK; 291 } 292 } 293 294 /* 295 * Hold rwlock to synchronize with (nfs) writes 296 */ 297 if (dorwlock) 298 rw_enter(&ip->i_rwlock, RW_WRITER); 299 300 /* 301 * Delete the attribute directory. 302 */ 303 if (ip->i_oeftflag != 0) { 304 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 305 trans_size = (int)TOP_REMOVE_SIZE(ip)); 306 rw_enter(&ip->i_contents, RW_WRITER); 307 err = ufs_iget(ip->i_vfs, ip->i_oeftflag, 308 &dp, CRED()); 309 if (err == 0) { 310 rw_enter(&dp->i_rwlock, RW_WRITER); 311 rw_enter(&dp->i_contents, RW_WRITER); 312 dp->i_flag |= IUPD|ICHG; 313 dp->i_seq++; 314 TRANS_INODE(dp->i_ufsvfs, dp); 315 dp->i_nlink -= 2; 316 ufs_setreclaim(dp); 317 /* 318 * Should get rid of any negative cache entries that 319 * might be lingering, as well as ``.'' and 320 * ``..''. If we don't, the VN_RELE() below 321 * won't actually put dp on the delete queue 322 * and it'll hang out until someone forces it 323 * (lockfs -f, umount, ...). The only reliable 324 * way of doing this at the moment is to call 325 * dnlc_purge_vp(ITOV(dp)), which is unacceptably 326 * slow, so we'll just note the problem in this 327 * comment for now. 328 */ 329 dnlc_remove(ITOV(dp), "."); 330 dnlc_remove(ITOV(dp), ".."); 331 ITIMES_NOLOCK(dp); 332 if (!TRANS_ISTRANS(ufsvfsp)) { 333 ufs_iupdat(dp, I_SYNC); 334 } 335 rw_exit(&dp->i_contents); 336 rw_exit(&dp->i_rwlock); 337 VN_RELE(ITOV(dp)); 338 } 339 /* 340 * Clear out attribute pointer 341 */ 342 ip->i_oeftflag = 0; 343 rw_exit(&ip->i_contents); 344 TRANS_END_CSYNC(ufsvfsp, err, issync, 345 TOP_REMOVE, trans_size); 346 dnlc_remove(ITOV(ip), XATTR_DIR_NAME); 347 } 348 349 if ((ip->i_mode & IFMT) == IFATTRDIR) { 350 ufs_attr_purge(ip); 351 } 352 353 (void) TRANS_ITRUNC(ip, (u_offset_t)0, I_FREE | I_ACCT, CRED()); 354 355 /* 356 * the inode's space has been freed; now free the inode 357 */ 358 if (ulp) { 359 trans_size = TOP_IFREE_SIZE(ip); 360 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IFREE, trans_size); 361 } 362 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 363 rw_enter(&ip->i_contents, RW_WRITER); 364 TRANS_INODE(ufsvfsp, ip); 365 mode = ip->i_mode; 366 ip->i_mode = 0; 367 ip->i_rdev = 0; 368 ip->i_ordev = 0; 369 ip->i_flag |= IMOD; 370 if (ip->i_ufs_acl) { 371 (void) ufs_si_free(ip->i_ufs_acl, vp->v_vfsp, CRED()); 372 ip->i_ufs_acl = NULL; 373 ip->i_shadow = 0; 374 } 375 376 /* 377 * This inode is torn down but still retains it's identity 378 * (inode number). It could get recycled soon so it's best 379 * to clean up the vnode just in case. 380 */ 381 mutex_enter(&vp->v_lock); 382 vn_recycle(vp); 383 mutex_exit(&vp->v_lock); 384 385 /* 386 * free the inode 387 */ 388 ufs_ifree(ip, ip->i_number, mode); 389 /* 390 * release quota resources; can't fail 391 */ 392 (void) chkiq((struct ufsvfs *)vp->v_vfsp->vfs_data, 393 /* change */ -1, ip, (uid_t)ip->i_uid, 0, CRED(), 394 (char **)NULL, (size_t *)NULL); 395 dqrele(ip->i_dquot); 396 ip->i_dquot = NULL; 397 ip->i_flag &= ~(IDEL | IDIRECTIO); 398 ip->i_cflags = 0; 399 if (!TRANS_ISTRANS(ufsvfsp)) { 400 ufs_iupdat(ip, I_SYNC); 401 } else { 402 mutex_enter(&delq->uq_mutex); 403 delq_info->delq_unreclaimed_files--; 404 mutex_exit(&delq->uq_mutex); 405 } 406 rw_exit(&ip->i_contents); 407 rw_exit(&ufsvfsp->vfs_dqrwlock); 408 if (dorwlock) 409 rw_exit(&ip->i_rwlock); 410 VN_RELE(vp); 411 412 /* 413 * End of transaction 414 */ 415 if (ulp) { 416 TRANS_END_ASYNC(ufsvfsp, TOP_IFREE, trans_size); 417 if (dolockfs) 418 ufs_lockfs_end(ulp); 419 else 420 curthread->t_flag &= ~T_DONTBLOCK; 421 } 422 } 423 424 /* 425 * Create the delete thread and init the delq_info for this fs 426 */ 427 void 428 ufs_delete_init(struct ufsvfs *ufsvfsp, int lowat) 429 { 430 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 431 432 ufs_thread_init(&ufsvfsp->vfs_delete, lowat); 433 (void) memset((void *)delq_info, 0, sizeof (*delq_info)); 434 cv_init(&delq_info->delq_fast_cv, NULL, CV_DEFAULT, NULL); 435 } 436 437 /* 438 * thread that frees up deleted inodes 439 */ 440 void 441 ufs_thread_delete(struct vfs *vfsp) 442 { 443 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 444 struct ufs_q *uq = &ufsvfsp->vfs_delete; 445 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 446 struct inode *ip; 447 long ne; 448 callb_cpr_t cprinfo; 449 450 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr, 451 "ufsdelete"); 452 453 mutex_enter(&uq->uq_mutex); 454 again: 455 /* 456 * Sleep until there is work to do. Only do one entry at 457 * a time, to reduce the wait time for checking for a suspend 458 * or fast-client request. The ?: is for pedantic portability. 459 */ 460 ne = ufs_thread_run(uq, &cprinfo) ? 1 : 0; 461 462 /* 463 * process an entry, if there are any 464 */ 465 if (ne && (ip = uq->uq_ihead)) { 466 /* 467 * process first entry on queue. Assumed conditions are: 468 * ip is held (v_count >= 1) 469 * ip is referenced (i_flag & IREF) 470 * ip is free (i_nlink <= 0) 471 */ 472 if ((uq->uq_ihead = ip->i_freef) == ip) 473 uq->uq_ihead = NULL; 474 ip->i_freef->i_freeb = ip->i_freeb; 475 ip->i_freeb->i_freef = ip->i_freef; 476 ip->i_freef = ip; 477 ip->i_freeb = ip; 478 uq->uq_ne--; 479 mutex_exit(&uq->uq_mutex); 480 ufs_delete(ufsvfsp, ip, 1); 481 mutex_enter(&uq->uq_mutex); 482 } 483 484 /* 485 * If there are any fast clients, let all of them through. 486 * Mainly intended for statvfs(), which doesn't need to do 487 * anything except look at the number of bytes/inodes that 488 * are in the queue. 489 */ 490 if (uq->uq_flags & UQ_FASTCLIENTS) { 491 uq->uq_flags &= ~UQ_FASTCLIENTS; 492 /* 493 * Give clients a chance. The lock exit/entry 494 * allows waiting statvfs threads through. 495 */ 496 cv_broadcast(&delq_info->delq_fast_cv); 497 mutex_exit(&uq->uq_mutex); 498 mutex_enter(&uq->uq_mutex); 499 } 500 goto again; 501 } 502 503 /* 504 * drain ne entries off the delete queue. As new queue entries may 505 * be added while we're working, ne is interpreted as follows: 506 * 507 * ne > 0 => remove up to ne entries 508 * ne == 0 => remove all entries currently on the queue 509 * ne == -1 => remove entries until the queue is empty 510 */ 511 void 512 ufs_delete_drain(struct vfs *vfsp, int ne, int dolockfs) 513 { 514 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 515 struct ufs_q *uq; 516 struct inode *ip; 517 int drain_cnt = 0; 518 int done; 519 520 /* 521 * if forcibly unmounted; ignore 522 */ 523 if (ufsvfsp == NULL) 524 return; 525 526 uq = &ufsvfsp->vfs_delete; 527 mutex_enter(&uq->uq_mutex); 528 if (ne == 0) 529 drain_cnt = uq->uq_ne; 530 else if (ne > 0) 531 drain_cnt = ne; 532 533 /* 534 * process up to ne entries 535 */ 536 537 done = 0; 538 while (!done && (ip = uq->uq_ihead)) { 539 if (ne != -1) 540 drain_cnt--; 541 if (ne != -1 && drain_cnt == 0) 542 done = 1; 543 if ((uq->uq_ihead = ip->i_freef) == ip) 544 uq->uq_ihead = NULL; 545 ip->i_freef->i_freeb = ip->i_freeb; 546 ip->i_freeb->i_freef = ip->i_freef; 547 ip->i_freef = ip; 548 ip->i_freeb = ip; 549 uq->uq_ne--; 550 mutex_exit(&uq->uq_mutex); 551 ufs_delete(ufsvfsp, ip, dolockfs); 552 mutex_enter(&uq->uq_mutex); 553 } 554 mutex_exit(&uq->uq_mutex); 555 } 556 557 void 558 ufs_sync_with_thread(struct ufs_q *uq) 559 { 560 mutex_enter(&uq->uq_mutex); 561 uq->uq_flags |= UQ_WAIT; 562 /* 563 * Someone other than the thread we're interested in might 564 * send a signal, so make sure the thread's given an 565 * acknowledgement. 566 */ 567 while ((uq->uq_threadp != NULL) && (uq->uq_flags & UQ_WAIT)) { 568 cv_broadcast(&uq->uq_cv); 569 cv_wait(&uq->uq_cv, &uq->uq_mutex); 570 } 571 mutex_exit(&uq->uq_mutex); 572 } 573 574 /* 575 * Get rid of everything that's currently in the delete queue, 576 * plus whatever the delete thread is working on at the moment. 577 * 578 * This ability is required for providing true POSIX semantics 579 * regarding close(2), unlink(2), etc, even when logging is enabled. 580 * The standard requires that the released space be immediately 581 * observable (statvfs(2)) and allocatable (e.g., write(2)). 582 */ 583 void 584 ufs_delete_drain_wait(struct ufsvfs *ufsvfsp, int dolockfs) 585 { 586 struct ufs_q *uq = &ufsvfsp->vfs_delete; 587 int error; 588 589 (void) ufs_delete_drain(ufsvfsp->vfs_vfs, 0, dolockfs); 590 ufs_sync_with_thread(uq); 591 592 /* 593 * Commit any outstanding transactions to make sure 594 * any canceled freed blocks are available for allocation. 595 */ 596 curthread->t_flag |= T_DONTBLOCK; 597 TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE, error); 598 if (!error) { 599 TRANS_END_SYNC(ufsvfsp, error, TOP_COMMIT_UPDATE, 600 TOP_COMMIT_SIZE); 601 } 602 curthread->t_flag &= ~T_DONTBLOCK; 603 } 604 605 /* 606 * Adjust the resource usage in a struct statvfs based on 607 * what's in the delete queue. 608 * 609 * We do not consider the impact of ACLs or extended attributes 610 * that may be deleted as a side-effect of deleting a file. 611 * Those are metadata, and their sizes aren't reflected in the 612 * sizes returned by stat(), so this is not a problem. 613 */ 614 void 615 ufs_delete_adjust_stats(struct ufsvfs *ufsvfsp, struct statvfs64 *sp) 616 { 617 struct ufs_q *uq = &ufsvfsp->vfs_delete; 618 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 619 620 /* 621 * We'll get signalled when it's our turn. However, if there's 622 * nothing going on, there's no point in waking up the delete 623 * thread and waiting for it to tell us to continue. 624 */ 625 mutex_enter(&uq->uq_mutex); 626 627 if ((uq->uq_flags & UQ_FASTCLIENTS) || (uq->uq_ne != 0)) { 628 uq->uq_flags |= UQ_FASTCLIENTS; 629 cv_broadcast(&uq->uq_cv); 630 cv_wait(&delq_info->delq_fast_cv, &uq->uq_mutex); 631 } 632 633 /* 634 * The blocks accounted for in the delete queue info are 635 * counted in DEV_BSIZE chunks, but ufs_statvfs counts in 636 * filesystem fragments, so a conversion is required here. 637 */ 638 sp->f_bfree += dbtofsb(ufsvfsp->vfs_fs, 639 delq_info->delq_unreclaimed_blocks); 640 sp->f_ffree += delq_info->delq_unreclaimed_files; 641 mutex_exit(&uq->uq_mutex); 642 } 643 644 /* 645 * IDLE INODE 646 * The following routines implement the protocol for maintaining an 647 * LRU list of idle inodes and for moving the idle inodes to the 648 * reuse list when the number of allocated inodes exceeds the user 649 * tunable high-water mark (ufs_ninode). 650 */ 651 652 /* 653 * clean an idle inode and move it to the reuse list 654 */ 655 static void 656 ufs_idle_free(struct inode *ip) 657 { 658 int pages; 659 int hno; 660 kmutex_t *ihm; 661 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 662 struct vnode *vp = ITOV(ip); 663 664 /* 665 * inode is held 666 */ 667 668 /* 669 * remember `pages' for stats below 670 */ 671 pages = (ip->i_mode && vn_has_cached_data(vp) && vp->v_type != VCHR); 672 673 /* 674 * start the dirty pages to disk and then invalidate them 675 * unless the inode is invalid (ISTALE) 676 */ 677 if ((ip->i_flag & ISTALE) == 0) { 678 (void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_FREE); 679 (void) TRANS_SYNCIP(ip, 680 (TRANS_ISERROR(ufsvfsp)) ? 681 B_INVAL | B_FORCE : B_INVAL, 682 I_ASYNC, TOP_SYNCIP_FREE); 683 } 684 685 /* 686 * wait for any current ufs_iget to finish and block future ufs_igets 687 */ 688 ASSERT(ip->i_number != 0); 689 hno = INOHASH(ip->i_number); 690 ihm = &ih_lock[hno]; 691 mutex_enter(ihm); 692 693 /* 694 * It must be guaranteed that v_count >= 2, otherwise 695 * something must be wrong with this vnode already. 696 * That is why we use v_count-- instead of VN_RELE(). 697 * Acquire the vnode lock in case another thread is in 698 * VN_RELE(). 699 */ 700 mutex_enter(&vp->v_lock); 701 702 if (vp->v_count < 2) 703 cmn_err(CE_PANIC, 704 "ufs_idle_free: vnode ref count is less than 2"); 705 706 vp->v_count--; 707 if ((vp->v_type != VCHR && vn_has_cached_data(vp)) || 708 vp->v_count != 1 || 709 ip->i_flag & (IMOD|IMODACC|IACC|ICHG|IUPD|IATTCHG)) { 710 /* 711 * Another thread has referenced this inode while 712 * we are trying to free it. Call VN_RELE() to 713 * release our reference. 714 */ 715 mutex_exit(&vp->v_lock); 716 mutex_exit(ihm); 717 VN_RELE(vp); 718 } else { 719 /* 720 * The inode is currently unreferenced and can not 721 * acquire further references because it has no pages 722 * and the hash is locked. Inodes acquire references 723 * via the hash list or via their pages. 724 */ 725 726 mutex_exit(&vp->v_lock); 727 728 /* 729 * remove it from the cache 730 */ 731 remque(ip); 732 mutex_exit(ihm); 733 /* 734 * Stale inodes have no valid ufsvfs 735 */ 736 if ((ip->i_flag & ISTALE) == 0 && ip->i_dquot) { 737 TRANS_DQRELE(ufsvfsp, ip->i_dquot); 738 ip->i_dquot = NULL; 739 } 740 ufs_si_del(ip); 741 if (pages) { 742 CPU_STATS_ADDQ(CPU, sys, ufsipage, 1); 743 } else { 744 CPU_STATS_ADDQ(CPU, sys, ufsinopage, 1); 745 } 746 ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp)); 747 ufs_free_inode(ip); 748 } 749 } 750 751 /* 752 * this thread processes the global idle queue 753 */ 754 iqhead_t *ufs_junk_iq; 755 iqhead_t *ufs_useful_iq; 756 int ufs_njunk_iq = 0; 757 int ufs_nuseful_iq = 0; 758 int ufs_niqhash; 759 int ufs_iqhashmask; 760 struct ufs_q ufs_idle_q; 761 762 void 763 ufs_thread_idle(void) 764 { 765 callb_cpr_t cprinfo; 766 int i; 767 int ne; 768 769 ufs_niqhash = (ufs_idle_q.uq_lowat >> 1) / IQHASHQLEN; 770 ufs_niqhash = 1 << highbit(ufs_niqhash); /* round up to power of 2 */ 771 ufs_iqhashmask = ufs_niqhash - 1; 772 ufs_junk_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_junk_iq), 773 KM_SLEEP); 774 ufs_useful_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_useful_iq), 775 KM_SLEEP); 776 777 /* Initialize hash queue headers */ 778 for (i = 0; i < ufs_niqhash; i++) { 779 ufs_junk_iq[i].i_freef = (inode_t *)&ufs_junk_iq[i]; 780 ufs_junk_iq[i].i_freeb = (inode_t *)&ufs_junk_iq[i]; 781 ufs_useful_iq[i].i_freef = (inode_t *)&ufs_useful_iq[i]; 782 ufs_useful_iq[i].i_freeb = (inode_t *)&ufs_useful_iq[i]; 783 } 784 785 CALLB_CPR_INIT(&cprinfo, &ufs_idle_q.uq_mutex, callb_generic_cpr, 786 "ufsidle"); 787 again: 788 /* 789 * Whenever the idle thread is awakened, it repeatedly gives 790 * back half of the idle queue until the idle queue falls 791 * below lowat. 792 */ 793 mutex_enter(&ufs_idle_q.uq_mutex); 794 if (ufs_idle_q.uq_ne < ufs_idle_q.uq_lowat) { 795 CALLB_CPR_SAFE_BEGIN(&cprinfo); 796 cv_wait(&ufs_idle_q.uq_cv, &ufs_idle_q.uq_mutex); 797 CALLB_CPR_SAFE_END(&cprinfo, &ufs_idle_q.uq_mutex); 798 } 799 mutex_exit(&ufs_idle_q.uq_mutex); 800 801 /* 802 * Give back 1/2 of the idle queue 803 */ 804 ne = ufs_idle_q.uq_ne >> 1; 805 ins.in_tidles.value.ul += ne; 806 ufs_idle_some(ne); 807 goto again; 808 } 809 810 /* 811 * Reclaim callback for ufs inode cache. 812 * Invoked by the kernel memory allocator when memory gets tight. 813 */ 814 /*ARGSUSED*/ 815 void 816 ufs_inode_cache_reclaim(void *cdrarg) 817 { 818 /* 819 * If we are low on memory and the idle queue is over its 820 * halfway mark, then free 50% of the idle q 821 * 822 * We don't free all of the idle inodes because the inodes 823 * for popular NFS files may have been kicked from the dnlc. 824 * The inodes for these files will end up on the idle queue 825 * after every NFS access. 826 * 827 * If we repeatedly push them from the idle queue then 828 * NFS users may be unhappy as an extra buf cache operation 829 * is incurred for every NFS operation to these files. 830 * 831 * It's not common, but I have seen it happen. 832 * 833 */ 834 if (ufs_idle_q.uq_ne < (ufs_idle_q.uq_lowat >> 1)) 835 return; 836 mutex_enter(&ufs_idle_q.uq_mutex); 837 cv_broadcast(&ufs_idle_q.uq_cv); 838 mutex_exit(&ufs_idle_q.uq_mutex); 839 } 840 841 /* 842 * Free up some idle inodes 843 */ 844 void 845 ufs_idle_some(int ne) 846 { 847 int i; 848 struct inode *ip; 849 struct vnode *vp; 850 static int junk_rotor = 0; 851 static int useful_rotor = 0; 852 853 for (i = 0; i < ne; ++i) { 854 mutex_enter(&ufs_idle_q.uq_mutex); 855 856 if (ufs_njunk_iq) { 857 while (ufs_junk_iq[junk_rotor].i_freef == 858 (inode_t *)&ufs_junk_iq[junk_rotor]) { 859 junk_rotor = IQNEXT(junk_rotor); 860 } 861 ip = ufs_junk_iq[junk_rotor].i_freef; 862 ASSERT(ip->i_flag & IJUNKIQ); 863 } else if (ufs_nuseful_iq) { 864 while (ufs_useful_iq[useful_rotor].i_freef == 865 (inode_t *)&ufs_useful_iq[useful_rotor]) { 866 useful_rotor = IQNEXT(useful_rotor); 867 } 868 ip = ufs_useful_iq[useful_rotor].i_freef; 869 ASSERT(!(ip->i_flag & IJUNKIQ)); 870 } else { 871 mutex_exit(&ufs_idle_q.uq_mutex); 872 return; 873 } 874 875 /* 876 * emulate ufs_iget 877 */ 878 vp = ITOV(ip); 879 VN_HOLD(vp); 880 mutex_exit(&ufs_idle_q.uq_mutex); 881 rw_enter(&ip->i_contents, RW_WRITER); 882 /* 883 * VN_RELE should not be called if 884 * ufs_rmidle returns true, as it will 885 * effectively be done in ufs_idle_free. 886 */ 887 if (ufs_rmidle(ip)) { 888 rw_exit(&ip->i_contents); 889 ufs_idle_free(ip); 890 } else { 891 rw_exit(&ip->i_contents); 892 VN_RELE(vp); 893 } 894 } 895 } 896 897 /* 898 * drain entries for vfsp from the idle queue 899 * vfsp == NULL means drain the entire thing 900 */ 901 void 902 ufs_idle_drain(struct vfs *vfsp) 903 { 904 struct inode *ip, *nip; 905 struct inode *ianchor = NULL; 906 int i; 907 908 mutex_enter(&ufs_idle_q.uq_mutex); 909 if (ufs_njunk_iq) { 910 /* for each hash q */ 911 for (i = 0; i < ufs_niqhash; i++) { 912 /* search down the hash q */ 913 for (ip = ufs_junk_iq[i].i_freef; 914 ip != (inode_t *)&ufs_junk_iq[i]; 915 ip = ip->i_freef) { 916 if (ip->i_vfs == vfsp || vfsp == NULL) { 917 /* found a matching entry */ 918 VN_HOLD(ITOV(ip)); 919 mutex_exit(&ufs_idle_q.uq_mutex); 920 rw_enter(&ip->i_contents, RW_WRITER); 921 /* 922 * See comments in ufs_idle_some() 923 * as we will call ufs_idle_free() 924 * after scanning both queues. 925 */ 926 if (ufs_rmidle(ip)) { 927 rw_exit(&ip->i_contents); 928 ip->i_freef = ianchor; 929 ianchor = ip; 930 } else { 931 rw_exit(&ip->i_contents); 932 VN_RELE(ITOV(ip)); 933 } 934 /* restart this hash q */ 935 ip = (inode_t *)&ufs_junk_iq[i]; 936 mutex_enter(&ufs_idle_q.uq_mutex); 937 } 938 } 939 } 940 } 941 if (ufs_nuseful_iq) { 942 /* for each hash q */ 943 for (i = 0; i < ufs_niqhash; i++) { 944 /* search down the hash q */ 945 for (ip = ufs_useful_iq[i].i_freef; 946 ip != (inode_t *)&ufs_useful_iq[i]; 947 ip = ip->i_freef) { 948 if (ip->i_vfs == vfsp || vfsp == NULL) { 949 /* found a matching entry */ 950 VN_HOLD(ITOV(ip)); 951 mutex_exit(&ufs_idle_q.uq_mutex); 952 rw_enter(&ip->i_contents, RW_WRITER); 953 /* 954 * See comments in ufs_idle_some() 955 * as we will call ufs_idle_free() 956 * after scanning both queues. 957 */ 958 if (ufs_rmidle(ip)) { 959 rw_exit(&ip->i_contents); 960 ip->i_freef = ianchor; 961 ianchor = ip; 962 } else { 963 rw_exit(&ip->i_contents); 964 VN_RELE(ITOV(ip)); 965 } 966 /* restart this hash q */ 967 ip = (inode_t *)&ufs_useful_iq[i]; 968 mutex_enter(&ufs_idle_q.uq_mutex); 969 } 970 } 971 } 972 } 973 974 mutex_exit(&ufs_idle_q.uq_mutex); 975 /* no more matching entries, release those we have found (if any) */ 976 for (ip = ianchor; ip; ip = nip) { 977 nip = ip->i_freef; 978 ip->i_freef = ip; 979 ufs_idle_free(ip); 980 } 981 } 982 983 /* 984 * RECLAIM DELETED INODES 985 * The following thread scans the file system once looking for deleted files 986 */ 987 void 988 ufs_thread_reclaim(struct vfs *vfsp) 989 { 990 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 991 struct ufs_q *uq = &ufsvfsp->vfs_reclaim; 992 struct fs *fs = ufsvfsp->vfs_fs; 993 struct buf *bp = 0; 994 int err = 0; 995 daddr_t bno; 996 ino_t ino; 997 struct dinode *dp; 998 struct inode *ip; 999 callb_cpr_t cprinfo; 1000 1001 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr, 1002 "ufsreclaim"); 1003 1004 /* 1005 * mount decided that we don't need a reclaim thread 1006 */ 1007 if ((fs->fs_reclaim & FS_RECLAIMING) == 0) 1008 err++; 1009 1010 /* 1011 * don't reclaim if readonly 1012 */ 1013 if (fs->fs_ronly) 1014 err++; 1015 1016 for (ino = 0; ino < (fs->fs_ncg * fs->fs_ipg) && !err; ++ino) { 1017 1018 /* 1019 * Check whether we are the target of another 1020 * thread having called ufs_thread_exit() or 1021 * ufs_thread_suspend(). 1022 */ 1023 mutex_enter(&uq->uq_mutex); 1024 again: 1025 if (uq->uq_flags & UQ_EXIT) { 1026 err++; 1027 mutex_exit(&uq->uq_mutex); 1028 break; 1029 } else if (uq->uq_flags & UQ_SUSPEND) { 1030 uq->uq_flags |= UQ_SUSPENDED; 1031 /* 1032 * Release the buf before we cv_wait() 1033 * otherwise we may deadlock with the 1034 * thread that called ufs_thread_suspend(). 1035 */ 1036 if (bp) { 1037 brelse(bp); 1038 bp = 0; 1039 } 1040 if (uq->uq_flags & UQ_WAIT) { 1041 uq->uq_flags &= ~UQ_WAIT; 1042 cv_broadcast(&uq->uq_cv); 1043 } 1044 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1045 cv_wait(&uq->uq_cv, &uq->uq_mutex); 1046 CALLB_CPR_SAFE_END(&cprinfo, &uq->uq_mutex); 1047 goto again; 1048 } 1049 mutex_exit(&uq->uq_mutex); 1050 1051 /* 1052 * if we don't already have the buf; get it 1053 */ 1054 bno = fsbtodb(fs, itod(fs, ino)); 1055 if ((bp == 0) || (bp->b_blkno != bno)) { 1056 if (bp) 1057 brelse(bp); 1058 bp = UFS_BREAD(ufsvfsp, 1059 ufsvfsp->vfs_dev, bno, fs->fs_bsize); 1060 bp->b_flags |= B_AGE; 1061 } 1062 if (bp->b_flags & B_ERROR) { 1063 err++; 1064 continue; 1065 } 1066 /* 1067 * nlink <= 0 and mode != 0 means deleted 1068 */ 1069 dp = (struct dinode *)bp->b_un.b_addr + itoo(fs, ino); 1070 if ((dp->di_nlink <= 0) && (dp->di_mode != 0)) { 1071 /* 1072 * can't hold the buf (deadlock) 1073 */ 1074 brelse(bp); 1075 bp = 0; 1076 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1077 /* 1078 * iget/iput sequence will put inode on ifree 1079 * thread queue if it is idle. This is a nop 1080 * for busy (open, deleted) inodes 1081 */ 1082 if (ufs_iget(vfsp, ino, &ip, CRED())) 1083 err++; 1084 else 1085 VN_RELE(ITOV(ip)); 1086 rw_exit(&ufsvfsp->vfs_dqrwlock); 1087 } 1088 } 1089 1090 if (bp) 1091 brelse(bp); 1092 if (!err) { 1093 /* 1094 * reset the reclaiming-bit 1095 */ 1096 mutex_enter(&ufsvfsp->vfs_lock); 1097 fs->fs_reclaim &= ~FS_RECLAIMING; 1098 mutex_exit(&ufsvfsp->vfs_lock); 1099 TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_RECLAIM); 1100 } 1101 1102 /* 1103 * exit the reclaim thread 1104 */ 1105 mutex_enter(&uq->uq_mutex); 1106 uq->uq_threadp = NULL; 1107 uq->uq_flags &= ~UQ_WAIT; 1108 cv_broadcast(&uq->uq_cv); 1109 CALLB_CPR_EXIT(&cprinfo); 1110 thread_exit(); 1111 } 1112 /* 1113 * HLOCK FILE SYSTEM 1114 * hlock the file system's whose logs have device errors 1115 */ 1116 struct ufs_q ufs_hlock; 1117 /*ARGSUSED*/ 1118 void 1119 ufs_thread_hlock(void *ignore) 1120 { 1121 int retry; 1122 callb_cpr_t cprinfo; 1123 1124 CALLB_CPR_INIT(&cprinfo, &ufs_hlock.uq_mutex, callb_generic_cpr, 1125 "ufshlock"); 1126 1127 for (;;) { 1128 /* 1129 * sleep until there is work to do 1130 */ 1131 mutex_enter(&ufs_hlock.uq_mutex); 1132 (void) ufs_thread_run(&ufs_hlock, &cprinfo); 1133 ufs_hlock.uq_ne = 0; 1134 mutex_exit(&ufs_hlock.uq_mutex); 1135 /* 1136 * hlock the error'ed fs's 1137 * retry after a bit if another app is doing lockfs stuff 1138 */ 1139 do { 1140 retry = ufs_trans_hlock(); 1141 if (retry) { 1142 mutex_enter(&ufs_hlock.uq_mutex); 1143 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1144 (void) cv_timedwait(&ufs_hlock.uq_cv, 1145 &ufs_hlock.uq_mutex, 1146 lbolt + hz); 1147 CALLB_CPR_SAFE_END(&cprinfo, 1148 &ufs_hlock.uq_mutex); 1149 mutex_exit(&ufs_hlock.uq_mutex); 1150 } 1151 } while (retry); 1152 } 1153 } 1154 1155 static void 1156 ufs_attr_purge(struct inode *dp) 1157 { 1158 int err; 1159 int error; 1160 off_t dirsize; /* size of the directory */ 1161 off_t offset; /* offset in the directory */ 1162 int entryoffsetinblk; /* offset of ep in fbp's buffer */ 1163 struct inode *tp; 1164 struct fbuf *fbp; /* pointer to directory block */ 1165 struct direct *ep; /* directory entry */ 1166 int trans_size; 1167 int issync; 1168 struct ufsvfs *ufsvfsp = dp->i_ufsvfs; 1169 1170 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1171 1172 fbp = NULL; 1173 dirsize = roundup(dp->i_size, DIRBLKSIZ); 1174 offset = 0; 1175 entryoffsetinblk = 0; 1176 1177 /* 1178 * Purge directory cache 1179 */ 1180 1181 dnlc_dir_purge(&dp->i_danchor); 1182 1183 while (offset < dirsize) { 1184 /* 1185 * If offset is on a block boundary, 1186 * read the next directory block. 1187 * Release previous if it exists. 1188 */ 1189 if (blkoff(dp->i_fs, offset) == 0) { 1190 if (fbp != NULL) { 1191 fbrelse(fbp, S_OTHER); 1192 } 1193 1194 err = blkatoff(dp, offset, (char **)0, &fbp); 1195 if (err) { 1196 goto out; 1197 } 1198 entryoffsetinblk = 0; 1199 } 1200 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk); 1201 if (ep->d_ino == 0 || (ep->d_name[0] == '.' && 1202 ep->d_name[1] == '\0') || 1203 (ep->d_name[0] == '.' && ep->d_name[1] == '.' && 1204 ep->d_name[2] == '\0')) { 1205 1206 entryoffsetinblk += ep->d_reclen; 1207 1208 } else { 1209 1210 if ((err = ufs_iget(dp->i_vfs, ep->d_ino, 1211 &tp, CRED())) != 0) { 1212 goto out; 1213 } 1214 1215 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 1216 trans_size = (int)TOP_REMOVE_SIZE(tp)); 1217 1218 /* 1219 * Delete inode. 1220 */ 1221 1222 dnlc_remove(ITOV(dp), ep->d_name); 1223 1224 rw_enter(&tp->i_contents, RW_WRITER); 1225 tp->i_flag |= ICHG; 1226 tp->i_seq++; 1227 TRANS_INODE(tp->i_ufsvfs, tp); 1228 tp->i_nlink--; 1229 ufs_setreclaim(tp); 1230 ITIMES_NOLOCK(tp); 1231 rw_exit(&tp->i_contents); 1232 1233 VN_RELE(ITOV(tp)); 1234 entryoffsetinblk += ep->d_reclen; 1235 TRANS_END_CSYNC(ufsvfsp, error, 1236 issync, TOP_REMOVE, trans_size); 1237 1238 } 1239 offset += ep->d_reclen; 1240 } 1241 1242 if (fbp) { 1243 fbrelse(fbp, S_OTHER); 1244 } 1245 1246 out: 1247 rw_exit(&ufsvfsp->vfs_dqrwlock); 1248 } 1249