/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/user.h>
#include <sys/callb.h>
#include <sys/cpuvar.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>

extern pri_t			minclsyspri;
extern int			hash2ints();
extern struct kmem_cache	*inode_cache;	/* cache of free inodes */
extern int			ufs_idle_waiters;
extern struct instats		ins;

static void ufs_attr_purge(struct inode *);

/*
 * initialize a thread's queue struct
 */
void
ufs_thread_init(struct ufs_q *uq, int lowat)
{
	bzero((caddr_t)uq, sizeof (*uq));
	cv_init(&uq->uq_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&uq->uq_mutex, NULL, MUTEX_DEFAULT, NULL);
	uq->uq_lowat = lowat;
	uq->uq_hiwat = 2 * lowat;
	uq->uq_threadp = NULL;
}

/*
 * start a thread for a queue (assumes success)
 */
void
ufs_thread_start(struct ufs_q *uq, void (*func)(), struct vfs *vfsp)
{
	mutex_enter(&uq->uq_mutex);
	if (uq->uq_threadp == NULL) {
		uq->uq_threadp = thread_create(NULL, 0, func, vfsp, 0, &p0,
		    TS_RUN, minclsyspri);
		uq->uq_flags = 0;
	}
	mutex_exit(&uq->uq_mutex);
}

/*
 * wait for the thread to exit
 */
void
ufs_thread_exit(struct ufs_q *uq)
{
	kt_did_t ufs_thread_did = 0;

	mutex_enter(&uq->uq_mutex);
	uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
	if (uq->uq_threadp != NULL) {
		ufs_thread_did = uq->uq_threadp->t_did;
		uq->uq_flags |= (UQ_EXIT|UQ_WAIT);
		cv_broadcast(&uq->uq_cv);
	}
	mutex_exit(&uq->uq_mutex);

	/*
	 * It's safe to call thread_join() with an already-gone
	 * t_did, but we have to obtain it before the kernel
	 * thread structure is freed.  We do so above under the
	 * protection of the uq_mutex when we're sure the thread
	 * still exists and it's safe to dereference it.
	 * We also have to check that ufs_thread_did is != 0
	 * before calling thread_join(), since thread 0 in the
	 * system gets a t_did of 0.
	 */
	if (ufs_thread_did)
		thread_join(ufs_thread_did);
}
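
/*
 * Illustrative sketch (not used here directly): a typical consumer of the
 * primitives above initializes a queue, starts its service thread, and
 * eventually tears it down, along the lines of
 *
 *	ufs_thread_init(&ufsvfsp->vfs_delete, lowat);
 *	ufs_thread_start(&ufsvfsp->vfs_delete, ufs_thread_delete, vfsp);
 *	...
 *	ufs_thread_exit(&ufsvfsp->vfs_delete);
 *
 * The vfs_delete queue and ufs_thread_delete() are defined below; the
 * particular lowat value is left unspecified here, as it is chosen by
 * the mount code rather than by this file.
 */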

/*
 * wait for a thread to suspend itself on the caller's behalf
 * the caller is responsible for continuing the thread
 */
void
ufs_thread_suspend(struct ufs_q *uq)
{
	mutex_enter(&uq->uq_mutex);
	if (uq->uq_threadp != NULL) {
		/*
		 * wait while another thread is suspending this thread.
		 * no need to do a cv_broadcast(), as whoever suspended
		 * the thread must continue at some point.
		 */
		while ((uq->uq_flags & UQ_SUSPEND) &&
		    (uq->uq_threadp != NULL)) {
			uq->uq_flags |= UQ_WAIT;
			cv_wait(&uq->uq_cv, &uq->uq_mutex);
		}

		/*
		 * wait for the thread to suspend itself
		 */
		uq->uq_flags |= UQ_SUSPEND;
		while (((uq->uq_flags & UQ_SUSPENDED) == 0) &&
		    (uq->uq_threadp != NULL)) {
			uq->uq_flags |= UQ_WAIT;
			cv_broadcast(&uq->uq_cv);
			cv_wait(&uq->uq_cv, &uq->uq_mutex);
		}
	}
	mutex_exit(&uq->uq_mutex);
}

/*
 * allow a thread to continue from a ufs_thread_suspend()
 * This thread must be the same as the thread that called
 * ufs_thread_suspend.
 */
void
ufs_thread_continue(struct ufs_q *uq)
{
	mutex_enter(&uq->uq_mutex);
	uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
	cv_broadcast(&uq->uq_cv);
	mutex_exit(&uq->uq_mutex);
}

/*
 * some common code for managing a thread's execution
 * uq is locked at entry and return
 * may sleep
 * may exit
 */
/*
 * Kind of a hack passing in the callb_cpr_t * here.
 * It should really be part of the ufs_q structure.
 * I did not put it in there because we are already in beta
 * and I was concerned that changing ufs_inode.h to include
 * callb.h might break something.
 */
int
ufs_thread_run(struct ufs_q *uq, callb_cpr_t *cprinfop)
{
again:
	ASSERT(uq->uq_ne >= 0);

	if (uq->uq_flags & UQ_SUSPEND) {
		uq->uq_flags |= UQ_SUSPENDED;
	} else if (uq->uq_flags & UQ_EXIT) {
		/*
		 * exiting; empty the queue (may infinite loop)
		 */
		if (uq->uq_ne)
			return (uq->uq_ne);
		uq->uq_threadp = NULL;
		if (uq->uq_flags & UQ_WAIT)
			cv_broadcast(&uq->uq_cv);
		uq->uq_flags &= ~(UQ_EXIT | UQ_WAIT);
		CALLB_CPR_EXIT(cprinfop);
		thread_exit();
	} else if (uq->uq_ne >= uq->uq_lowat) {
		/*
		 * process a block of entries until below high water mark
		 */
		return (uq->uq_ne - (uq->uq_lowat >> 1));
	}
	if (uq->uq_flags & UQ_WAIT) {
		uq->uq_flags &= ~UQ_WAIT;
		cv_broadcast(&uq->uq_cv);
	}
	CALLB_CPR_SAFE_BEGIN(cprinfop);
	cv_wait(&uq->uq_cv, &uq->uq_mutex);
	CALLB_CPR_SAFE_END(cprinfop, &uq->uq_mutex);
	goto again;
}
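
/*
 * Illustrative sketch (not part of the driver logic): a queue service
 * thread built on ufs_thread_run() loops the way the delete thread
 * below does, i.e.
 *
 *	mutex_enter(&uq->uq_mutex);
 * again:
 *	ne = ufs_thread_run(uq, &cprinfo);
 *	while (ne-- && (ip = uq->uq_ihead)) {
 *		... unlink ip, drop uq_mutex, process, retake uq_mutex ...
 *	}
 *	goto again;
 *
 * uq_mutex is held across ufs_thread_run() and is only dropped while an
 * individual entry is being processed.
 */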

/*
 * DELETE INODE
 * The following routines implement the protocol for freeing the resources
 * held by an idle and deleted inode.
 */
void
ufs_delete(struct ufsvfs *ufsvfsp, struct inode *ip, int dolockfs)
{
	ushort_t mode;
	struct vnode *vp = ITOV(ip);
	struct ulockfs *ulp;
	int trans_size;
	int dorwlock = ((ip->i_mode & IFMT) == IFREG);
	int issync;
	int err;
	struct inode *dp;

	/*
	 * not on a trans device or not part of a transaction
	 */
	ASSERT(!TRANS_ISTRANS(ufsvfsp) ||
	    ((curthread->t_flag & T_DONTBLOCK) == 0));

	/*
	 * Ignore if deletes are not allowed (wlock/hlock)
	 */
	if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
		VN_RELE(vp);
		return;
	}

	if ((vp->v_count > 1) || (ip->i_mode == 0)) {
		VN_RELE(vp);
		return;
	}
	/*
	 * If we are called as part of setting a fs lock, then only
	 * do part of the lockfs protocol.  In other words, don't hang.
	 */
	if (dolockfs) {
		if (ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_DELETE_MASK))
			return;
	} else {
		/*
		 * check for recursive VOP call
		 */
		if (curthread->t_flag & T_DONTBLOCK) {
			ulp = NULL;
		} else {
			ulp = &ufsvfsp->vfs_ulockfs;
			curthread->t_flag |= T_DONTBLOCK;
		}
	}

	/*
	 * Hold rwlock to synchronize with (nfs) writes
	 */
	if (dorwlock)
		rw_enter(&ip->i_rwlock, RW_WRITER);

	/*
	 * Delete the attribute directory.
	 */
	if (ip->i_oeftflag != 0) {
		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
		    trans_size = (int)TOP_REMOVE_SIZE(ip));
		rw_enter(&ip->i_contents, RW_WRITER);
		err = ufs_iget(ip->i_vfs, ip->i_oeftflag,
		    &dp, CRED());
		if (err == 0) {
			rw_enter(&dp->i_rwlock, RW_WRITER);
			rw_enter(&dp->i_contents, RW_WRITER);
			dp->i_flag |= IUPD|ICHG;
			dp->i_seq++;
			TRANS_INODE(dp->i_ufsvfs, dp);
			dp->i_nlink -= 2;
			ufs_setreclaim(dp);
			/*
			 * Should get rid of any negative cache entries that
			 * might be lingering, as well as ``.'' and
			 * ``..''.  If we don't, the VN_RELE() below
			 * won't actually put dp on the delete queue
			 * and it'll hang out until someone forces it
			 * (lockfs -f, umount, ...).  The only reliable
			 * way of doing this at the moment is to call
			 * dnlc_purge_vp(ITOV(dp)), which is unacceptably
			 * slow, so we'll just note the problem in this
			 * comment for now.
			 */
			dnlc_remove(ITOV(dp), ".");
			dnlc_remove(ITOV(dp), "..");
			ITIMES_NOLOCK(dp);
			if (!TRANS_ISTRANS(ufsvfsp)) {
				ufs_iupdat(dp, I_SYNC);
			}
			rw_exit(&dp->i_contents);
			rw_exit(&dp->i_rwlock);
			VN_RELE(ITOV(dp));
		}
		/*
		 * Clear out attribute pointer
		 */
		ip->i_oeftflag = 0;
		rw_exit(&ip->i_contents);
		TRANS_END_CSYNC(ufsvfsp, err, issync,
		    TOP_REMOVE, trans_size);
		dnlc_remove(ITOV(ip), XATTR_DIR_NAME);
	}

	if ((ip->i_mode & IFMT) == IFATTRDIR) {
		ufs_attr_purge(ip);
	}

	(void) TRANS_ITRUNC(ip, (u_offset_t)0, I_FREE, CRED());

	/*
	 * the inode's space has been freed; now free the inode
	 */
	if (ulp) {
		trans_size = TOP_IFREE_SIZE(ip);
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
	}
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	TRANS_INODE(ufsvfsp, ip);
	mode = ip->i_mode;
	ip->i_mode = 0;
	ip->i_rdev = 0;
	ip->i_ordev = 0;
	ip->i_flag |= IMOD;
	if (ip->i_ufs_acl) {
		(void) ufs_si_free(ip->i_ufs_acl, vp->v_vfsp, CRED());
		ip->i_ufs_acl = NULL;
		ip->i_shadow = 0;
	}

	/*
	 * This inode is torn down but still retains its identity
	 * (inode number).  It could get recycled soon so it's best
	 * to clean up the vnode just in case.
	 */
	mutex_enter(&vp->v_lock);
	vn_recycle(vp);
	mutex_exit(&vp->v_lock);

	/*
	 * free the inode
	 */
	ufs_ifree(ip, ip->i_number, mode);
	/*
	 * release quota resources; can't fail
	 */
	(void) chkiq((struct ufsvfs *)vp->v_vfsp->vfs_data,
	    /* change */ -1, ip, (uid_t)ip->i_uid, 0, CRED(),
	    (char **)NULL, (size_t *)NULL);
	dqrele(ip->i_dquot);
	ip->i_dquot = NULL;
	ip->i_flag &= ~(IDEL | IDIRECTIO);
	ip->i_cflags = 0;
	if (!TRANS_ISTRANS(ufsvfsp)) {
		ufs_iupdat(ip, I_SYNC);
	}
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (dorwlock)
		rw_exit(&ip->i_rwlock);
	VN_RELE(vp);

	/*
	 * End of transaction
	 */
	if (ulp) {
		TRANS_END_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
		if (dolockfs)
			ufs_lockfs_end(ulp);
		else
			curthread->t_flag &= ~T_DONTBLOCK;
	}
}

/*
 * thread that frees up deleted inodes
 */
void
ufs_thread_delete(struct vfs *vfsp)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	struct ufs_q *uq = &ufsvfsp->vfs_delete;
	struct inode *ip;
	long ne;
	callb_cpr_t cprinfo;

	CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
	    "ufsdelete");

	mutex_enter(&uq->uq_mutex);
again:
	/*
	 * sleep until there is work to do
	 */
	ne = ufs_thread_run(uq, &cprinfo);
	/*
	 * process up to ne entries
	 */
	while (ne-- && (ip = uq->uq_ihead)) {
		/*
		 * process first entry on queue.  Assumed conditions are:
		 *	ip is held (v_count >= 1)
		 *	ip is referenced (i_flag & IREF)
		 *	ip is free (i_nlink <= 0)
		 */
		/*
		 * unlink ip from the circular delete queue and
		 * make its free pointers self-referential
		 */
		if ((uq->uq_ihead = ip->i_freef) == ip)
			uq->uq_ihead = NULL;
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		uq->uq_ne--;
		mutex_exit(&uq->uq_mutex);
		ufs_delete(ufsvfsp, ip, 1);
		mutex_enter(&uq->uq_mutex);
	}
	goto again;
}

/*
 * drain ne entries off the delete queue.  As new queue entries may
 * be added while we're working, ne is interpreted as follows:
 *
 *	ne > 0		=> remove up to ne entries
 *	ne == 0		=> remove all entries currently on the queue
 *	ne == -1	=> remove entries until the queue is empty
 */
void
ufs_delete_drain(struct vfs *vfsp, int ne, int dolockfs)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	struct ufs_q *uq;
	struct inode *ip;
	int drain_cnt = 0;
	int done;

	/*
	 * if forcibly unmounted; ignore
	 */
	if (ufsvfsp == NULL)
		return;

	uq = &ufsvfsp->vfs_delete;
	mutex_enter(&uq->uq_mutex);
	if (ne == 0)
		drain_cnt = uq->uq_ne;
	else if (ne > 0)
		drain_cnt = ne;

	/*
	 * process up to ne entries
	 */

	done = 0;
	while (!done && (ip = uq->uq_ihead)) {
		if (ne != -1)
			drain_cnt--;
		if (ne != -1 && drain_cnt == 0)
			done = 1;
		if ((uq->uq_ihead = ip->i_freef) == ip)
			uq->uq_ihead = NULL;
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		uq->uq_ne--;
		mutex_exit(&uq->uq_mutex);
		ufs_delete(ufsvfsp, ip, dolockfs);
		mutex_enter(&uq->uq_mutex);
	}
	mutex_exit(&uq->uq_mutex);
}

void
ufs_sync_with_thread(struct ufs_q *uq)
{
	mutex_enter(&uq->uq_mutex);
	uq->uq_flags |= UQ_WAIT;
	/*
	 * Someone other than the thread we're interested in might
	 * send a signal, so make sure the thread's given an
	 * acknowledgement.
	 */
	while ((uq->uq_threadp != NULL) && (uq->uq_flags & UQ_WAIT)) {
		cv_broadcast(&uq->uq_cv);
		cv_wait(&uq->uq_cv, &uq->uq_mutex);
	}
	mutex_exit(&uq->uq_mutex);
}

/*
 * Get rid of everything that's currently in the delete queue,
 * plus whatever the delete thread is working on at the moment.
 *
 * This ability is required for providing true POSIX semantics
 * regarding close(2), unlink(2), etc, even when logging is enabled.
 * The standard requires that the released space be immediately
 * observable (statvfs(2)) and allocatable (e.g., write(2)).
 */
void
ufs_delete_drain_wait(struct ufsvfs *ufsvfsp, int dolockfs)
{
	struct ufs_q *uq = &ufsvfsp->vfs_delete;
	int error;

	(void) ufs_delete_drain(ufsvfsp->vfs_vfs, 0, dolockfs);
	ufs_sync_with_thread(uq);

	/*
	 * Commit any outstanding transactions to make sure
	 * any canceled freed blocks are available for allocation.
	 */
	curthread->t_flag |= T_DONTBLOCK;
	TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE, error);
	if (!error) {
		TRANS_END_SYNC(ufsvfsp, error, TOP_COMMIT_UPDATE,
		    TOP_COMMIT_SIZE);
	}
	curthread->t_flag &= ~T_DONTBLOCK;
}

/*
 * Adjust the resource usage in a struct statvfs based on
 * what's in the delete queue.  Assumes that the delete
 * thread has been suspended.
 *
 * We do not consider the impact of ACLs or extended attributes
 * that may be deleted as a side-effect of deleting a file.
 * Those are metadata, and their sizes aren't reflected in the
 * sizes returned by stat(), so this is not a problem.
 */
void
ufs_delete_adjust_stats(struct ufsvfs *ufsvfsp, struct statvfs64 *sp)
{
	struct inode *ip;
	struct fs *fs = ufsvfsp->vfs_fs;
	struct ufs_q *uq = &ufsvfsp->vfs_delete;

	/*
	 * To be self-consistent with the existing contents of
	 * *sp, we have to keep the queue stable during our
	 * traversal.  Mainly, this keeps anyone from doing a
	 * ufs_delete_drain() on top of us.
	 */
	mutex_enter(&uq->uq_mutex);

	ip = uq->uq_ihead;
	if (ip != NULL) {
		do {
			sp->f_bfree += dbtofsb(fs, ip->i_blocks);
			sp->f_ffree += 1;
			ip = ip->i_freef;
		} while (ip != uq->uq_ihead);
	}

	mutex_exit(&uq->uq_mutex);
}

/*
 * IDLE INODE
 * The following routines implement the protocol for maintaining an
 * LRU list of idle inodes and for moving the idle inodes to the
 * reuse list when the number of allocated inodes exceeds the user
 * tunable high-water mark (ufs_ninode).
 */

/*
 * clean an idle inode and move it to the reuse list
 */
static void
ufs_idle_free(struct inode *ip)
{
	int pages;
	int hno;
	kmutex_t *ihm;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct vnode *vp = ITOV(ip);

	/*
	 * inode is held
	 */

	/*
	 * remember `pages' for stats below
	 */
	pages = (ip->i_mode && vn_has_cached_data(vp) && vp->v_type != VCHR);

	/*
	 * start the dirty pages to disk and then invalidate them
	 * unless the inode is invalid (ISTALE)
	 */
	if ((ip->i_flag & ISTALE) == 0) {
		(void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_FREE);
		(void) TRANS_SYNCIP(ip,
		    (TRANS_ISERROR(ufsvfsp)) ?
		    B_INVAL | B_FORCE : B_INVAL,
		    I_ASYNC, TOP_SYNCIP_FREE);
	}

	/*
	 * wait for any current ufs_iget to finish and block future ufs_igets
	 */
	ASSERT(ip->i_number != 0);
	hno = INOHASH(ip->i_number);
	ihm = &ih_lock[hno];
	mutex_enter(ihm);

	/*
	 * It must be guaranteed that v_count >= 2, otherwise
	 * something must be wrong with this vnode already.
	 * That is why we use v_count-- instead of VN_RELE().
	 * Acquire the vnode lock in case another thread is in
	 * VN_RELE().
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count < 2)
		cmn_err(CE_PANIC,
		    "ufs_idle_free: vnode ref count is less than 2");

	vp->v_count--;
	if ((vp->v_type != VCHR && vn_has_cached_data(vp)) ||
	    vp->v_count != 1 ||
	    ip->i_flag & (IMOD|IMODACC|IACC|ICHG|IUPD|IATTCHG)) {
		/*
		 * Another thread has referenced this inode while
		 * we are trying to free it.  Call VN_RELE() to
		 * release our reference.
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(ihm);
		VN_RELE(vp);
	} else {
		/*
		 * The inode is currently unreferenced and can not
		 * acquire further references because it has no pages
		 * and the hash is locked.  Inodes acquire references
		 * via the hash list or via their pages.
		 */

		mutex_exit(&vp->v_lock);

		/*
		 * remove it from the cache
		 */
		remque(ip);
		mutex_exit(ihm);
		/*
		 * Stale inodes have no valid ufsvfs
		 */
		if ((ip->i_flag & ISTALE) == 0 && ip->i_dquot) {
			TRANS_DQRELE(ufsvfsp, ip->i_dquot);
			ip->i_dquot = NULL;
		}
		ufs_si_del(ip);
		if (pages) {
			CPU_STATS_ADDQ(CPU, sys, ufsipage, 1);
		} else {
			CPU_STATS_ADDQ(CPU, sys, ufsinopage, 1);
		}
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
		ufs_free_inode(ip);
	}
}

/*
 * this thread processes the global idle queue
 */
iqhead_t *ufs_junk_iq;
iqhead_t *ufs_useful_iq;
int ufs_njunk_iq = 0;
int ufs_nuseful_iq = 0;
int ufs_niqhash;
int ufs_iqhashmask;
struct ufs_q ufs_idle_q;

void
ufs_thread_idle(void)
{
	callb_cpr_t cprinfo;
	int i;
	int ne;

	ufs_niqhash = (ufs_idle_q.uq_lowat >> 1) / IQHASHQLEN;
	ufs_niqhash = 1 << highbit(ufs_niqhash); /* round up to power of 2 */
	ufs_iqhashmask = ufs_niqhash - 1;
	ufs_junk_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_junk_iq),
	    KM_SLEEP);
	ufs_useful_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_useful_iq),
	    KM_SLEEP);

	/* Initialize hash queue headers */
	for (i = 0; i < ufs_niqhash; i++) {
		ufs_junk_iq[i].i_freef = (inode_t *)&ufs_junk_iq[i];
		ufs_junk_iq[i].i_freeb = (inode_t *)&ufs_junk_iq[i];
		ufs_useful_iq[i].i_freef = (inode_t *)&ufs_useful_iq[i];
		ufs_useful_iq[i].i_freeb = (inode_t *)&ufs_useful_iq[i];
	}

	CALLB_CPR_INIT(&cprinfo, &ufs_idle_q.uq_mutex, callb_generic_cpr,
	    "ufsidle");
again:
	/*
	 * Whenever the idle thread is awakened, it repeatedly gives
	 * back half of the idle queue until the idle queue falls
	 * below lowat.
	 */
	mutex_enter(&ufs_idle_q.uq_mutex);
	if (ufs_idle_q.uq_ne < ufs_idle_q.uq_lowat) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&ufs_idle_q.uq_cv, &ufs_idle_q.uq_mutex);
		CALLB_CPR_SAFE_END(&cprinfo, &ufs_idle_q.uq_mutex);
	}
	mutex_exit(&ufs_idle_q.uq_mutex);

	/*
	 * Give back 1/2 of the idle queue
	 */
	ne = ufs_idle_q.uq_ne >> 1;
	ins.in_tidles.value.ul += ne;
	ufs_idle_some(ne);
	goto again;
}

/*
 * Reclaim callback for ufs inode cache.
 * Invoked by the kernel memory allocator when memory gets tight.
 */
/*ARGSUSED*/
void
ufs_inode_cache_reclaim(void *cdrarg)
{
	/*
	 * If we are low on memory and the idle queue is over its
	 * halfway mark, then free 50% of the idle q
	 *
	 * We don't free all of the idle inodes because the inodes
	 * for popular NFS files may have been kicked from the dnlc.
	 * The inodes for these files will end up on the idle queue
	 * after every NFS access.
	 *
	 * If we repeatedly push them from the idle queue then
	 * NFS users may be unhappy as an extra buf cache operation
	 * is incurred for every NFS operation to these files.
	 *
	 * It's not common, but I have seen it happen.
	 */
	if (ufs_idle_q.uq_ne < (ufs_idle_q.uq_lowat >> 1))
		return;
	mutex_enter(&ufs_idle_q.uq_mutex);
	cv_broadcast(&ufs_idle_q.uq_cv);
	mutex_exit(&ufs_idle_q.uq_mutex);
}
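
/*
 * Illustrative sketch (an assumption, based on the inode_cache extern at
 * the top of this file): the reclaim callback above is registered when
 * the inode cache is created, roughly as
 *
 *	inode_cache = kmem_cache_create("ufs_inode_cache",
 *	    sizeof (struct inode), 0, ufs_inode_cache_constructor,
 *	    ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
 *	    NULL, NULL, 0);
 *
 * The constructor/destructor names are placeholders for illustration;
 * the actual cache setup lives with the inode allocation code, not here.
 */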

/*
 * Free up some idle inodes
 */
void
ufs_idle_some(int ne)
{
	int i;
	struct inode *ip;
	struct vnode *vp;
	static int junk_rotor = 0;
	static int useful_rotor = 0;

	for (i = 0; i < ne; ++i) {
		mutex_enter(&ufs_idle_q.uq_mutex);

		if (ufs_njunk_iq) {
			while (ufs_junk_iq[junk_rotor].i_freef ==
			    (inode_t *)&ufs_junk_iq[junk_rotor]) {
				junk_rotor = IQNEXT(junk_rotor);
			}
			ip = ufs_junk_iq[junk_rotor].i_freef;
			ASSERT(ip->i_flag & IJUNKIQ);
		} else if (ufs_nuseful_iq) {
			while (ufs_useful_iq[useful_rotor].i_freef ==
			    (inode_t *)&ufs_useful_iq[useful_rotor]) {
				useful_rotor = IQNEXT(useful_rotor);
			}
			ip = ufs_useful_iq[useful_rotor].i_freef;
			ASSERT(!(ip->i_flag & IJUNKIQ));
		} else {
			mutex_exit(&ufs_idle_q.uq_mutex);
			return;
		}

		/*
		 * emulate ufs_iget
		 */
		vp = ITOV(ip);
		VN_HOLD(vp);
		mutex_exit(&ufs_idle_q.uq_mutex);
		rw_enter(&ip->i_contents, RW_WRITER);
		/*
		 * VN_RELE should not be called if
		 * ufs_rmidle returns true, as it will
		 * effectively be done in ufs_idle_free.
		 */
		if (ufs_rmidle(ip)) {
			rw_exit(&ip->i_contents);
			ufs_idle_free(ip);
		} else {
			rw_exit(&ip->i_contents);
			VN_RELE(vp);
		}
	}
}

/*
 * drain entries for vfsp from the idle queue
 * vfsp == NULL means drain the entire thing
 */
void
ufs_idle_drain(struct vfs *vfsp)
{
	struct inode *ip, *nip;
	struct inode *ianchor = NULL;
	int i;

	mutex_enter(&ufs_idle_q.uq_mutex);
	if (ufs_njunk_iq) {
		/* for each hash q */
		for (i = 0; i < ufs_niqhash; i++) {
			/* search down the hash q */
			for (ip = ufs_junk_iq[i].i_freef;
			    ip != (inode_t *)&ufs_junk_iq[i];
			    ip = ip->i_freef) {
				if (ip->i_vfs == vfsp || vfsp == NULL) {
					/* found a matching entry */
					VN_HOLD(ITOV(ip));
					mutex_exit(&ufs_idle_q.uq_mutex);
					rw_enter(&ip->i_contents, RW_WRITER);
					/*
					 * See comments in ufs_idle_some()
					 * as we will call ufs_idle_free()
					 * after scanning both queues.
					 */
					if (ufs_rmidle(ip)) {
						rw_exit(&ip->i_contents);
						ip->i_freef = ianchor;
						ianchor = ip;
					} else {
						rw_exit(&ip->i_contents);
						VN_RELE(ITOV(ip));
					}
					/* restart this hash q */
					ip = (inode_t *)&ufs_junk_iq[i];
					mutex_enter(&ufs_idle_q.uq_mutex);
				}
			}
		}
	}
	if (ufs_nuseful_iq) {
		/* for each hash q */
		for (i = 0; i < ufs_niqhash; i++) {
			/* search down the hash q */
			for (ip = ufs_useful_iq[i].i_freef;
			    ip != (inode_t *)&ufs_useful_iq[i];
			    ip = ip->i_freef) {
				if (ip->i_vfs == vfsp || vfsp == NULL) {
					/* found a matching entry */
					VN_HOLD(ITOV(ip));
					mutex_exit(&ufs_idle_q.uq_mutex);
					rw_enter(&ip->i_contents, RW_WRITER);
					/*
					 * See comments in ufs_idle_some()
					 * as we will call ufs_idle_free()
					 * after scanning both queues.
					 */
					if (ufs_rmidle(ip)) {
						rw_exit(&ip->i_contents);
						ip->i_freef = ianchor;
						ianchor = ip;
					} else {
						rw_exit(&ip->i_contents);
						VN_RELE(ITOV(ip));
					}
					/* restart this hash q */
					ip = (inode_t *)&ufs_useful_iq[i];
					mutex_enter(&ufs_idle_q.uq_mutex);
				}
			}
		}
	}

	mutex_exit(&ufs_idle_q.uq_mutex);
	/* no more matching entries, release those we have found (if any) */
	for (ip = ianchor; ip; ip = nip) {
		nip = ip->i_freef;
		ip->i_freef = ip;
		ufs_idle_free(ip);
	}
}

/*
 * RECLAIM DELETED INODES
 * The following thread scans the file system once looking for deleted files
 */
void
ufs_thread_reclaim(struct vfs *vfsp)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	struct ufs_q *uq = &ufsvfsp->vfs_reclaim;
	struct fs *fs = ufsvfsp->vfs_fs;
	struct buf *bp = 0;
	int err = 0;
	daddr_t bno;
	ino_t ino;
	struct dinode *dp;
	struct inode *ip;
	callb_cpr_t cprinfo;

	CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
	    "ufsreclaim");

	/*
	 * mount decided that we don't need a reclaim thread
	 */
	if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
		err++;

	/*
	 * don't reclaim if readonly
	 */
	if (fs->fs_ronly)
		err++;

	for (ino = 0; ino < (fs->fs_ncg * fs->fs_ipg) && !err; ++ino) {

		/*
		 * Check whether we are the target of another
		 * thread having called ufs_thread_exit() or
		 * ufs_thread_suspend().
		 */
		mutex_enter(&uq->uq_mutex);
again:
		if (uq->uq_flags & UQ_EXIT) {
			err++;
			mutex_exit(&uq->uq_mutex);
			break;
		} else if (uq->uq_flags & UQ_SUSPEND) {
			uq->uq_flags |= UQ_SUSPENDED;
			/*
			 * Release the buf before we cv_wait()
			 * otherwise we may deadlock with the
			 * thread that called ufs_thread_suspend().
			 */
			if (bp) {
				brelse(bp);
				bp = 0;
			}
			if (uq->uq_flags & UQ_WAIT) {
				uq->uq_flags &= ~UQ_WAIT;
				cv_broadcast(&uq->uq_cv);
			}
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&uq->uq_cv, &uq->uq_mutex);
			CALLB_CPR_SAFE_END(&cprinfo, &uq->uq_mutex);
			goto again;
		}
		mutex_exit(&uq->uq_mutex);

		/*
		 * if we don't already have the buf; get it
		 */
		bno = fsbtodb(fs, itod(fs, ino));
		if ((bp == 0) || (bp->b_blkno != bno)) {
			if (bp)
				brelse(bp);
			bp = UFS_BREAD(ufsvfsp,
			    ufsvfsp->vfs_dev, bno, fs->fs_bsize);
			bp->b_flags |= B_AGE;
		}
		if (bp->b_flags & B_ERROR) {
			err++;
			continue;
		}
		/*
		 * nlink <= 0 and mode != 0 means deleted
		 */
		dp = (struct dinode *)bp->b_un.b_addr + itoo(fs, ino);
		if ((dp->di_nlink <= 0) && (dp->di_mode != 0)) {
			/*
			 * can't hold the buf (deadlock)
			 */
			brelse(bp);
			bp = 0;
			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
			/*
			 * iget/iput sequence will put inode on ifree
			 * thread queue if it is idle.  This is a nop
			 * for busy (open, deleted) inodes
			 */
			if (ufs_iget(vfsp, ino, &ip, CRED()))
				err++;
			else
				VN_RELE(ITOV(ip));
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		}
	}

	if (bp)
		brelse(bp);
	if (!err) {
		/*
		 * reset the reclaiming-bit
		 */
		mutex_enter(&ufsvfsp->vfs_lock);
		fs->fs_reclaim &= ~FS_RECLAIMING;
		mutex_exit(&ufsvfsp->vfs_lock);
		TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_RECLAIM);
	}

	/*
	 * exit the reclaim thread
	 */
	mutex_enter(&uq->uq_mutex);
	uq->uq_threadp = NULL;
	uq->uq_flags &= ~UQ_WAIT;
	cv_broadcast(&uq->uq_cv);
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}
/*
 * HLOCK FILE SYSTEM
 * hlock the file systems whose logs have device errors
 */
struct ufs_q ufs_hlock;
/*ARGSUSED*/
void
ufs_thread_hlock(void *ignore)
{
	int retry;
	callb_cpr_t cprinfo;

	CALLB_CPR_INIT(&cprinfo, &ufs_hlock.uq_mutex, callb_generic_cpr,
	    "ufshlock");

	for (;;) {
		/*
		 * sleep until there is work to do
		 */
		mutex_enter(&ufs_hlock.uq_mutex);
		(void) ufs_thread_run(&ufs_hlock, &cprinfo);
		ufs_hlock.uq_ne = 0;
		mutex_exit(&ufs_hlock.uq_mutex);
		/*
		 * hlock the error'ed fs's
		 * retry after a bit if another app is doing lockfs stuff
		 */
		do {
			retry = ufs_trans_hlock();
			if (retry) {
				mutex_enter(&ufs_hlock.uq_mutex);
				CALLB_CPR_SAFE_BEGIN(&cprinfo);
				(void) cv_timedwait(&ufs_hlock.uq_cv,
				    &ufs_hlock.uq_mutex,
				    lbolt + hz);
				CALLB_CPR_SAFE_END(&cprinfo,
				    &ufs_hlock.uq_mutex);
				mutex_exit(&ufs_hlock.uq_mutex);
			}
		} while (retry);
	}
}

static void
ufs_attr_purge(struct inode *dp)
{
	int err;
	int error;
	off_t dirsize;			/* size of the directory */
	off_t offset;			/* offset in the directory */
	int entryoffsetinblk;		/* offset of ep in fbp's buffer */
	struct inode *tp;
	struct fbuf *fbp;		/* pointer to directory block */
	struct direct *ep;		/* directory entry */
	int trans_size;
	int issync;
	struct ufsvfs *ufsvfsp = dp->i_ufsvfs;

	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);

	fbp = NULL;
	dirsize = roundup(dp->i_size, DIRBLKSIZ);
	offset = 0;
	entryoffsetinblk = 0;

	/*
	 * Purge directory cache
	 */

	dnlc_dir_purge(&dp->i_danchor);

	while (offset < dirsize) {
		/*
		 * If offset is on a block boundary,
		 * read the next directory block.
		 * Release previous if it exists.
		 */
		if (blkoff(dp->i_fs, offset) == 0) {
			if (fbp != NULL) {
				fbrelse(fbp, S_OTHER);
			}

			err = blkatoff(dp, offset, (char **)0, &fbp);
			if (err) {
				goto out;
			}
			entryoffsetinblk = 0;
		}
		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
		if (ep->d_ino == 0 || (ep->d_name[0] == '.' &&
		    ep->d_name[1] == '\0') ||
		    (ep->d_name[0] == '.' && ep->d_name[1] == '.' &&
		    ep->d_name[2] == '\0')) {

			entryoffsetinblk += ep->d_reclen;

		} else {

			if ((err = ufs_iget(dp->i_vfs, ep->d_ino,
			    &tp, CRED())) != 0) {
				goto out;
			}

			TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
			    trans_size = (int)TOP_REMOVE_SIZE(tp));

			/*
			 * Delete inode.
			 */

			dnlc_remove(ITOV(dp), ep->d_name);

			rw_enter(&tp->i_contents, RW_WRITER);
			tp->i_flag |= ICHG;
			tp->i_seq++;
			TRANS_INODE(tp->i_ufsvfs, tp);
			tp->i_nlink--;
			ufs_setreclaim(tp);
			ITIMES_NOLOCK(tp);
			rw_exit(&tp->i_contents);

			VN_RELE(ITOV(tp));
			entryoffsetinblk += ep->d_reclen;
			TRANS_END_CSYNC(ufsvfsp, error,
			    issync, TOP_REMOVE, trans_size);

		}
		offset += ep->d_reclen;
	}

	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}

out:
	rw_exit(&ufsvfsp->vfs_dqrwlock);
}