/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
32 */ 33 34 #include <sys/types.h> 35 #include <sys/systm.h> 36 #include <sys/errno.h> 37 #include <sys/kmem.h> 38 #include <sys/buf.h> 39 #include <sys/vnode.h> 40 #include <sys/vfs.h> 41 #include <sys/user.h> 42 #include <sys/callb.h> 43 #include <sys/cpuvar.h> 44 #include <sys/fs/ufs_inode.h> 45 #include <sys/fs/ufs_log.h> 46 #include <sys/fs/ufs_trans.h> 47 #include <sys/fs/ufs_acl.h> 48 #include <sys/fs/ufs_bio.h> 49 #include <sys/fs/ufs_fsdir.h> 50 #include <sys/debug.h> 51 #include <sys/cmn_err.h> 52 #include <sys/sysmacros.h> 53 #include <vm/pvn.h> 54 55 extern pri_t minclsyspri; 56 extern int hash2ints(); 57 extern struct kmem_cache *inode_cache; /* cache of free inodes */ 58 extern int ufs_idle_waiters; 59 extern struct instats ins; 60 61 static void ufs_attr_purge(struct inode *); 62 63 /* 64 * initialize a thread's queue struct 65 */ 66 void 67 ufs_thread_init(struct ufs_q *uq, int lowat) 68 { 69 bzero((caddr_t)uq, sizeof (*uq)); 70 cv_init(&uq->uq_cv, NULL, CV_DEFAULT, NULL); 71 mutex_init(&uq->uq_mutex, NULL, MUTEX_DEFAULT, NULL); 72 uq->uq_lowat = lowat; 73 uq->uq_hiwat = 2 * lowat; 74 uq->uq_threadp = NULL; 75 } 76 77 /* 78 * start a thread for a queue (assumes success) 79 */ 80 void 81 ufs_thread_start(struct ufs_q *uq, void (*func)(), struct vfs *vfsp) 82 { 83 mutex_enter(&uq->uq_mutex); 84 if (uq->uq_threadp == NULL) { 85 uq->uq_threadp = thread_create(NULL, 0, func, vfsp, 0, &p0, 86 TS_RUN, minclsyspri); 87 uq->uq_flags = 0; 88 } 89 mutex_exit(&uq->uq_mutex); 90 } 91 92 /* 93 * wait for the thread to exit 94 */ 95 void 96 ufs_thread_exit(struct ufs_q *uq) 97 { 98 kt_did_t ufs_thread_did = 0; 99 100 mutex_enter(&uq->uq_mutex); 101 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED); 102 if (uq->uq_threadp != NULL) { 103 ufs_thread_did = uq->uq_threadp->t_did; 104 uq->uq_flags |= (UQ_EXIT|UQ_WAIT); 105 cv_broadcast(&uq->uq_cv); 106 } 107 mutex_exit(&uq->uq_mutex); 108 109 /* 110 * It's safe to call thread_join() with an already-gone 111 * t_did, but we 
have to obtain it before the kernel 112 * thread structure is freed. We do so above under the 113 * protection of the uq_mutex when we're sure the thread 114 * still exists and it's save to de-reference it. 115 * We also have to check if ufs_thread_did is != 0 116 * before calling thread_join() since thread 0 in the system 117 * gets a t_did of 0. 118 */ 119 if (ufs_thread_did) 120 thread_join(ufs_thread_did); 121 } 122 123 /* 124 * wait for a thread to suspend itself on the caller's behalf 125 * the caller is responsible for continuing the thread 126 */ 127 void 128 ufs_thread_suspend(struct ufs_q *uq) 129 { 130 mutex_enter(&uq->uq_mutex); 131 if (uq->uq_threadp != NULL) { 132 /* 133 * wait while another thread is suspending this thread. 134 * no need to do a cv_broadcast(), as whoever suspended 135 * the thread must continue it at some point. 136 */ 137 while ((uq->uq_flags & UQ_SUSPEND) && 138 (uq->uq_threadp != NULL)) { 139 /* 140 * We can't use cv_signal() because if our 141 * signal doesn't happen to hit the desired 142 * thread but instead some other waiter like 143 * ourselves, we'll wait forever for a 144 * response. Well, at least an indeterminate 145 * amount of time until we just happen to get 146 * lucky from whomever did get signalled doing 147 * a cv_signal() of their own. This is an 148 * unfortunate performance lossage. 149 */ 150 uq->uq_flags |= UQ_WAIT; 151 cv_wait(&uq->uq_cv, &uq->uq_mutex); 152 } 153 154 uq->uq_flags |= (UQ_SUSPEND | UQ_WAIT); 155 156 /* 157 * wait for the thread to suspend itself 158 */ 159 if ((uq->uq_flags & UQ_SUSPENDED) == 0 && 160 (uq->uq_threadp != NULL)) { 161 cv_broadcast(&uq->uq_cv); 162 } 163 164 while (((uq->uq_flags & UQ_SUSPENDED) == 0) && 165 (uq->uq_threadp != NULL)) { 166 cv_wait(&uq->uq_cv, &uq->uq_mutex); 167 } 168 } 169 mutex_exit(&uq->uq_mutex); 170 } 171 172 /* 173 * allow a thread to continue from a ufs_thread_suspend() 174 * This thread must be the same as the thread that called 175 * ufs_thread_suspend. 
176 */ 177 void 178 ufs_thread_continue(struct ufs_q *uq) 179 { 180 mutex_enter(&uq->uq_mutex); 181 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED); 182 cv_broadcast(&uq->uq_cv); 183 mutex_exit(&uq->uq_mutex); 184 } 185 186 /* 187 * some common code for managing a threads execution 188 * uq is locked at entry and return 189 * may sleep 190 * may exit 191 */ 192 /* 193 * Kind of a hack passing in the callb_cpr_t * here. 194 * It should really be part of the ufs_q structure. 195 * I did not put it in there because we are already in beta 196 * and I was concerned that changing ufs_inode.h to include 197 * callb.h might break something. 198 */ 199 int 200 ufs_thread_run(struct ufs_q *uq, callb_cpr_t *cprinfop) 201 { 202 again: 203 ASSERT(uq->uq_ne >= 0); 204 205 if (uq->uq_flags & UQ_SUSPEND) { 206 uq->uq_flags |= UQ_SUSPENDED; 207 } else if (uq->uq_flags & UQ_EXIT) { 208 /* 209 * exiting; empty the queue (may infinite loop) 210 */ 211 if (uq->uq_ne) 212 return (uq->uq_ne); 213 uq->uq_threadp = NULL; 214 if (uq->uq_flags & UQ_WAIT) { 215 cv_broadcast(&uq->uq_cv); 216 } 217 uq->uq_flags &= ~(UQ_EXIT | UQ_WAIT); 218 CALLB_CPR_EXIT(cprinfop); 219 thread_exit(); 220 } else if (uq->uq_ne >= uq->uq_lowat) { 221 /* 222 * process a block of entries until below high water mark 223 */ 224 return (uq->uq_ne - (uq->uq_lowat >> 1)); 225 } 226 if (uq->uq_flags & UQ_WAIT) { 227 uq->uq_flags &= ~UQ_WAIT; 228 cv_broadcast(&uq->uq_cv); 229 } 230 CALLB_CPR_SAFE_BEGIN(cprinfop); 231 cv_wait(&uq->uq_cv, &uq->uq_mutex); 232 CALLB_CPR_SAFE_END(cprinfop, &uq->uq_mutex); 233 goto again; 234 } 235 236 /* 237 * DELETE INODE 238 * The following routines implement the protocol for freeing the resources 239 * held by an idle and deleted inode. 
240 */ 241 void 242 ufs_delete(struct ufsvfs *ufsvfsp, struct inode *ip, int dolockfs) 243 { 244 ushort_t mode; 245 struct vnode *vp = ITOV(ip); 246 struct ulockfs *ulp; 247 int trans_size; 248 int dorwlock = ((ip->i_mode & IFMT) == IFREG); 249 int issync; 250 int err; 251 struct inode *dp; 252 struct ufs_q *delq = &ufsvfsp->vfs_delete; 253 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 254 255 /* 256 * Ignore if deletes are not allowed (wlock/hlock) 257 */ 258 if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) { 259 mutex_enter(&delq->uq_mutex); 260 delq_info->delq_unreclaimed_blocks -= ip->i_blocks; 261 delq_info->delq_unreclaimed_files--; 262 mutex_exit(&delq->uq_mutex); 263 VN_RELE(vp); 264 return; 265 } 266 267 if ((vp->v_count > 1) || (ip->i_mode == 0)) { 268 mutex_enter(&delq->uq_mutex); 269 delq_info->delq_unreclaimed_blocks -= ip->i_blocks; 270 delq_info->delq_unreclaimed_files--; 271 mutex_exit(&delq->uq_mutex); 272 VN_RELE(vp); 273 return; 274 } 275 /* 276 * If we are called as part of setting a fs lock, then only 277 * do part of the lockfs protocol. In other words, don't hang. 278 */ 279 if (dolockfs) { 280 if (ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_DELETE_MASK)) 281 return; 282 } else { 283 /* 284 * check for recursive VOP call 285 */ 286 if (curthread->t_flag & T_DONTBLOCK) { 287 ulp = NULL; 288 } else { 289 ulp = &ufsvfsp->vfs_ulockfs; 290 curthread->t_flag |= T_DONTBLOCK; 291 } 292 } 293 294 /* 295 * Hold rwlock to synchronize with (nfs) writes 296 */ 297 if (dorwlock) 298 rw_enter(&ip->i_rwlock, RW_WRITER); 299 300 /* 301 * Delete the attribute directory. 
302 */ 303 if (ip->i_oeftflag != 0) { 304 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 305 trans_size = (int)TOP_REMOVE_SIZE(ip)); 306 rw_enter(&ip->i_contents, RW_WRITER); 307 err = ufs_iget(ip->i_vfs, ip->i_oeftflag, 308 &dp, CRED()); 309 if (err == 0) { 310 rw_enter(&dp->i_rwlock, RW_WRITER); 311 rw_enter(&dp->i_contents, RW_WRITER); 312 dp->i_flag |= IUPD|ICHG; 313 dp->i_seq++; 314 TRANS_INODE(dp->i_ufsvfs, dp); 315 dp->i_nlink -= 2; 316 ufs_setreclaim(dp); 317 /* 318 * Should get rid of any negative cache entries that 319 * might be lingering, as well as ``.'' and 320 * ``..''. If we don't, the VN_RELE() below 321 * won't actually put dp on the delete queue 322 * and it'll hang out until someone forces it 323 * (lockfs -f, umount, ...). The only reliable 324 * way of doing this at the moment is to call 325 * dnlc_purge_vp(ITOV(dp)), which is unacceptably 326 * slow, so we'll just note the problem in this 327 * comment for now. 328 */ 329 dnlc_remove(ITOV(dp), "."); 330 dnlc_remove(ITOV(dp), ".."); 331 ITIMES_NOLOCK(dp); 332 if (!TRANS_ISTRANS(ufsvfsp)) { 333 ufs_iupdat(dp, I_SYNC); 334 } 335 rw_exit(&dp->i_contents); 336 rw_exit(&dp->i_rwlock); 337 VN_RELE(ITOV(dp)); 338 } 339 /* 340 * Clear out attribute pointer 341 */ 342 ip->i_oeftflag = 0; 343 rw_exit(&ip->i_contents); 344 TRANS_END_CSYNC(ufsvfsp, err, issync, 345 TOP_REMOVE, trans_size); 346 dnlc_remove(ITOV(ip), XATTR_DIR_NAME); 347 } 348 349 if ((ip->i_mode & IFMT) == IFATTRDIR) { 350 ufs_attr_purge(ip); 351 } 352 353 (void) TRANS_ITRUNC(ip, (u_offset_t)0, I_FREE | I_ACCT, CRED()); 354 355 /* 356 * the inode's space has been freed; now free the inode 357 */ 358 if (ulp) { 359 trans_size = TOP_IFREE_SIZE(ip); 360 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IFREE, trans_size); 361 } 362 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 363 rw_enter(&ip->i_contents, RW_WRITER); 364 TRANS_INODE(ufsvfsp, ip); 365 mode = ip->i_mode; 366 ip->i_mode = 0; 367 ip->i_rdev = 0; 368 ip->i_ordev = 0; 369 ip->i_flag |= IMOD; 370 if 
(ip->i_ufs_acl) { 371 (void) ufs_si_free(ip->i_ufs_acl, vp->v_vfsp, CRED()); 372 ip->i_ufs_acl = NULL; 373 ip->i_shadow = 0; 374 } 375 376 /* 377 * This inode is torn down but still retains it's identity 378 * (inode number). It could get recycled soon so it's best 379 * to clean up the vnode just in case. 380 */ 381 mutex_enter(&vp->v_lock); 382 vn_recycle(vp); 383 mutex_exit(&vp->v_lock); 384 385 /* 386 * free the inode 387 */ 388 ufs_ifree(ip, ip->i_number, mode); 389 /* 390 * release quota resources; can't fail 391 */ 392 (void) chkiq((struct ufsvfs *)vp->v_vfsp->vfs_data, 393 /* change */ -1, ip, (uid_t)ip->i_uid, 0, CRED(), 394 (char **)NULL, (size_t *)NULL); 395 dqrele(ip->i_dquot); 396 ip->i_dquot = NULL; 397 ip->i_flag &= ~(IDEL | IDIRECTIO); 398 ip->i_cflags = 0; 399 if (!TRANS_ISTRANS(ufsvfsp)) { 400 ufs_iupdat(ip, I_SYNC); 401 } else { 402 mutex_enter(&delq->uq_mutex); 403 delq_info->delq_unreclaimed_files--; 404 mutex_exit(&delq->uq_mutex); 405 } 406 rw_exit(&ip->i_contents); 407 rw_exit(&ufsvfsp->vfs_dqrwlock); 408 if (dorwlock) 409 rw_exit(&ip->i_rwlock); 410 VN_RELE(vp); 411 412 /* 413 * End of transaction 414 */ 415 if (ulp) { 416 TRANS_END_ASYNC(ufsvfsp, TOP_IFREE, trans_size); 417 if (dolockfs) 418 ufs_lockfs_end(ulp); 419 else 420 curthread->t_flag &= ~T_DONTBLOCK; 421 } 422 } 423 424 /* 425 * Create the delete thread and init the delq_info for this fs 426 */ 427 void 428 ufs_delete_init(struct ufsvfs *ufsvfsp, int lowat) 429 { 430 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 431 432 ufs_thread_init(&ufsvfsp->vfs_delete, lowat); 433 (void) memset((void *)delq_info, 0, sizeof (*delq_info)); 434 } 435 436 /* 437 * thread that frees up deleted inodes 438 */ 439 void 440 ufs_thread_delete(struct vfs *vfsp) 441 { 442 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 443 struct ufs_q *uq = &ufsvfsp->vfs_delete; 444 struct inode *ip; 445 long ne; 446 callb_cpr_t cprinfo; 447 448 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, 
callb_generic_cpr, 449 "ufsdelete"); 450 451 mutex_enter(&uq->uq_mutex); 452 again: 453 /* 454 * Sleep until there is work to do. Only do one entry at 455 * a time, to reduce the wait time for checking for a suspend 456 * request. The ?: is for pedantic portability. 457 */ 458 ne = ufs_thread_run(uq, &cprinfo) ? 1 : 0; 459 460 /* 461 * process an entry, if there are any 462 */ 463 if (ne && (ip = uq->uq_ihead)) { 464 /* 465 * process first entry on queue. Assumed conditions are: 466 * ip is held (v_count >= 1) 467 * ip is referenced (i_flag & IREF) 468 * ip is free (i_nlink <= 0) 469 */ 470 if ((uq->uq_ihead = ip->i_freef) == ip) 471 uq->uq_ihead = NULL; 472 ip->i_freef->i_freeb = ip->i_freeb; 473 ip->i_freeb->i_freef = ip->i_freef; 474 ip->i_freef = ip; 475 ip->i_freeb = ip; 476 uq->uq_ne--; 477 mutex_exit(&uq->uq_mutex); 478 ufs_delete(ufsvfsp, ip, 1); 479 mutex_enter(&uq->uq_mutex); 480 } 481 goto again; 482 } 483 484 /* 485 * drain ne entries off the delete queue. As new queue entries may 486 * be added while we're working, ne is interpreted as follows: 487 * 488 * ne > 0 => remove up to ne entries 489 * ne == 0 => remove all entries currently on the queue 490 * ne == -1 => remove entries until the queue is empty 491 */ 492 void 493 ufs_delete_drain(struct vfs *vfsp, int ne, int dolockfs) 494 { 495 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 496 struct ufs_q *uq; 497 struct inode *ip; 498 int drain_cnt = 0; 499 int done; 500 501 /* 502 * if forcibly unmounted; ignore 503 */ 504 if (ufsvfsp == NULL) 505 return; 506 507 uq = &ufsvfsp->vfs_delete; 508 mutex_enter(&uq->uq_mutex); 509 if (ne == 0) 510 drain_cnt = uq->uq_ne; 511 else if (ne > 0) 512 drain_cnt = ne; 513 514 /* 515 * process up to ne entries 516 */ 517 518 done = 0; 519 while (!done && (ip = uq->uq_ihead)) { 520 if (ne != -1) 521 drain_cnt--; 522 if (ne != -1 && drain_cnt == 0) 523 done = 1; 524 if ((uq->uq_ihead = ip->i_freef) == ip) 525 uq->uq_ihead = NULL; 526 ip->i_freef->i_freeb = 
ip->i_freeb; 527 ip->i_freeb->i_freef = ip->i_freef; 528 ip->i_freef = ip; 529 ip->i_freeb = ip; 530 uq->uq_ne--; 531 mutex_exit(&uq->uq_mutex); 532 ufs_delete(ufsvfsp, ip, dolockfs); 533 mutex_enter(&uq->uq_mutex); 534 } 535 mutex_exit(&uq->uq_mutex); 536 } 537 538 void 539 ufs_sync_with_thread(struct ufs_q *uq) 540 { 541 mutex_enter(&uq->uq_mutex); 542 543 /* 544 * Wake up delete thread to free up space. 545 */ 546 if ((uq->uq_flags & UQ_WAIT) == 0) { 547 uq->uq_flags |= UQ_WAIT; 548 cv_broadcast(&uq->uq_cv); 549 } 550 551 while ((uq->uq_threadp != NULL) && (uq->uq_flags & UQ_WAIT)) { 552 cv_wait(&uq->uq_cv, &uq->uq_mutex); 553 } 554 555 mutex_exit(&uq->uq_mutex); 556 } 557 558 /* 559 * Get rid of everything that's currently in the delete queue, 560 * plus whatever the delete thread is working on at the moment. 561 * 562 * This ability is required for providing true POSIX semantics 563 * regarding close(2), unlink(2), etc, even when logging is enabled. 564 * The standard requires that the released space be immediately 565 * observable (statvfs(2)) and allocatable (e.g., write(2)). 566 */ 567 void 568 ufs_delete_drain_wait(struct ufsvfs *ufsvfsp, int dolockfs) 569 { 570 struct ufs_q *uq = &ufsvfsp->vfs_delete; 571 int error; 572 struct ufs_q *delq = &ufsvfsp->vfs_delete; 573 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 574 575 /* 576 * If there is something on delq or delete thread 577 * working on delq. 578 */ 579 mutex_enter(&delq->uq_mutex); 580 if (delq_info->delq_unreclaimed_files > 0) { 581 mutex_exit(&delq->uq_mutex); 582 (void) ufs_delete_drain(ufsvfsp->vfs_vfs, 0, dolockfs); 583 ufs_sync_with_thread(uq); 584 } else { 585 ASSERT(delq_info->delq_unreclaimed_files == 0); 586 mutex_exit(&delq->uq_mutex); 587 return; 588 } 589 590 /* 591 * Commit any outstanding transactions to make sure 592 * any canceled freed blocks are available for allocation. 
593 */ 594 curthread->t_flag |= T_DONTBLOCK; 595 TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE, error); 596 if (!error) { 597 TRANS_END_SYNC(ufsvfsp, error, TOP_COMMIT_UPDATE, 598 TOP_COMMIT_SIZE); 599 } 600 curthread->t_flag &= ~T_DONTBLOCK; 601 } 602 603 /* 604 * Adjust the resource usage in a struct statvfs based on 605 * what's in the delete queue. 606 * 607 * We do not consider the impact of ACLs or extended attributes 608 * that may be deleted as a side-effect of deleting a file. 609 * Those are metadata, and their sizes aren't reflected in the 610 * sizes returned by stat(), so this is not a problem. 611 */ 612 void 613 ufs_delete_adjust_stats(struct ufsvfs *ufsvfsp, struct statvfs64 *sp) 614 { 615 struct ufs_q *uq = &ufsvfsp->vfs_delete; 616 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 617 618 mutex_enter(&uq->uq_mutex); 619 /* 620 * The blocks accounted for in the delete queue info are 621 * counted in DEV_BSIZE chunks, but ufs_statvfs counts in 622 * filesystem fragments, so a conversion is required here. 623 */ 624 sp->f_bfree += dbtofsb(ufsvfsp->vfs_fs, 625 delq_info->delq_unreclaimed_blocks); 626 sp->f_ffree += delq_info->delq_unreclaimed_files; 627 mutex_exit(&uq->uq_mutex); 628 } 629 630 /* 631 * IDLE INODE 632 * The following routines implement the protocol for maintaining an 633 * LRU list of idle inodes and for moving the idle inodes to the 634 * reuse list when the number of allocated inodes exceeds the user 635 * tunable high-water mark (ufs_ninode). 
636 */ 637 638 /* 639 * clean an idle inode and move it to the reuse list 640 */ 641 static void 642 ufs_idle_free(struct inode *ip) 643 { 644 int pages; 645 int hno; 646 kmutex_t *ihm; 647 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 648 struct vnode *vp = ITOV(ip); 649 int vn_has_data, vn_modified; 650 651 /* 652 * inode is held 653 */ 654 655 /* 656 * remember `pages' for stats below 657 */ 658 pages = (ip->i_mode && vn_has_cached_data(vp) && vp->v_type != VCHR); 659 660 /* 661 * start the dirty pages to disk and then invalidate them 662 * unless the inode is invalid (ISTALE) 663 */ 664 if ((ip->i_flag & ISTALE) == 0) { 665 (void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_FREE); 666 (void) TRANS_SYNCIP(ip, 667 (TRANS_ISERROR(ufsvfsp)) ? B_INVAL | B_FORCE : B_INVAL, 668 I_ASYNC, TOP_SYNCIP_FREE); 669 } 670 671 /* 672 * wait for any current ufs_iget to finish and block future ufs_igets 673 */ 674 ASSERT(ip->i_number != 0); 675 hno = INOHASH(ip->i_number); 676 ihm = &ih_lock[hno]; 677 mutex_enter(ihm); 678 679 /* 680 * It must be guaranteed that v_count >= 2, otherwise 681 * something must be wrong with this vnode already. 682 * That is why we use v_count-- instead of VN_RELE(). 683 * Acquire the vnode lock in case another thread is in 684 * VN_RELE(). 685 */ 686 mutex_enter(&vp->v_lock); 687 688 if (vp->v_count < 2) 689 cmn_err(CE_PANIC, 690 "ufs_idle_free: vnode ref count is less than 2"); 691 692 vp->v_count--; 693 694 vn_has_data = (vp->v_type != VCHR && vn_has_cached_data(vp)); 695 vn_modified = (ip->i_flag & (IMOD|IMODACC|IACC|ICHG|IUPD|IATTCHG)); 696 697 if (vp->v_count != 1 || 698 ((vn_has_data || vn_modified) && 699 ((ip->i_flag & ISTALE) == 0))) { 700 /* 701 * Another thread has referenced this inode while 702 * we are trying to free it. Call VN_RELE() to 703 * release our reference, if v_count > 1 data is 704 * present or one of the modified etc. flags was 705 * set, whereby ISTALE wasn't set. 
706 * If we'd proceed with ISTALE set here, we might 707 * get ourselves into a deadlock situation. 708 */ 709 mutex_exit(&vp->v_lock); 710 mutex_exit(ihm); 711 VN_RELE(vp); 712 } else { 713 /* 714 * The inode is currently unreferenced and can not 715 * acquire further references because it has no pages 716 * and the hash is locked. Inodes acquire references 717 * via the hash list or via their pages. 718 */ 719 720 mutex_exit(&vp->v_lock); 721 722 /* 723 * remove it from the cache 724 */ 725 remque(ip); 726 mutex_exit(ihm); 727 /* 728 * Stale inodes have no valid ufsvfs 729 */ 730 if ((ip->i_flag & ISTALE) == 0 && ip->i_dquot) { 731 TRANS_DQRELE(ufsvfsp, ip->i_dquot); 732 ip->i_dquot = NULL; 733 } 734 if ((ip->i_flag & ISTALE) && 735 vn_has_data) { 736 /* 737 * ISTALE inodes may have data 738 * and this data needs to be 739 * cleaned up. 740 */ 741 (void) pvn_vplist_dirty(vp, (u_offset_t)0, 742 ufs_putapage, B_INVAL | B_TRUNC, 743 (struct cred *)NULL); 744 } 745 ufs_si_del(ip); 746 if (pages) { 747 CPU_STATS_ADDQ(CPU, sys, ufsipage, 1); 748 } else { 749 CPU_STATS_ADDQ(CPU, sys, ufsinopage, 1); 750 } 751 ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp)); 752 753 /* 754 * We had better not have a vnode reference count > 1 755 * at this point, if we do then something is broken as 756 * this inode/vnode acquired a reference underneath of us. 
757 */ 758 ASSERT(vp->v_count == 1); 759 760 ufs_free_inode(ip); 761 } 762 } 763 764 /* 765 * this thread processes the global idle queue 766 */ 767 iqhead_t *ufs_junk_iq; 768 iqhead_t *ufs_useful_iq; 769 int ufs_njunk_iq = 0; 770 int ufs_nuseful_iq = 0; 771 int ufs_niqhash; 772 int ufs_iqhashmask; 773 struct ufs_q ufs_idle_q; 774 775 void 776 ufs_thread_idle(void) 777 { 778 callb_cpr_t cprinfo; 779 int i; 780 int ne; 781 782 ufs_niqhash = (ufs_idle_q.uq_lowat >> 1) / IQHASHQLEN; 783 ufs_niqhash = 1 << highbit(ufs_niqhash); /* round up to power of 2 */ 784 ufs_iqhashmask = ufs_niqhash - 1; 785 ufs_junk_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_junk_iq), 786 KM_SLEEP); 787 ufs_useful_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_useful_iq), 788 KM_SLEEP); 789 790 /* Initialize hash queue headers */ 791 for (i = 0; i < ufs_niqhash; i++) { 792 ufs_junk_iq[i].i_freef = (inode_t *)&ufs_junk_iq[i]; 793 ufs_junk_iq[i].i_freeb = (inode_t *)&ufs_junk_iq[i]; 794 ufs_useful_iq[i].i_freef = (inode_t *)&ufs_useful_iq[i]; 795 ufs_useful_iq[i].i_freeb = (inode_t *)&ufs_useful_iq[i]; 796 } 797 798 CALLB_CPR_INIT(&cprinfo, &ufs_idle_q.uq_mutex, callb_generic_cpr, 799 "ufsidle"); 800 again: 801 /* 802 * Whenever the idle thread is awakened, it repeatedly gives 803 * back half of the idle queue until the idle queue falls 804 * below lowat. 805 */ 806 mutex_enter(&ufs_idle_q.uq_mutex); 807 if (ufs_idle_q.uq_ne < ufs_idle_q.uq_lowat) { 808 CALLB_CPR_SAFE_BEGIN(&cprinfo); 809 cv_wait(&ufs_idle_q.uq_cv, &ufs_idle_q.uq_mutex); 810 CALLB_CPR_SAFE_END(&cprinfo, &ufs_idle_q.uq_mutex); 811 } 812 mutex_exit(&ufs_idle_q.uq_mutex); 813 814 /* 815 * Give back 1/2 of the idle queue 816 */ 817 ne = ufs_idle_q.uq_ne >> 1; 818 ins.in_tidles.value.ul += ne; 819 ufs_idle_some(ne); 820 goto again; 821 } 822 823 /* 824 * Reclaim callback for ufs inode cache. 825 * Invoked by the kernel memory allocator when memory gets tight. 
826 */ 827 /*ARGSUSED*/ 828 void 829 ufs_inode_cache_reclaim(void *cdrarg) 830 { 831 /* 832 * If we are low on memory and the idle queue is over its 833 * halfway mark, then free 50% of the idle q 834 * 835 * We don't free all of the idle inodes because the inodes 836 * for popular NFS files may have been kicked from the dnlc. 837 * The inodes for these files will end up on the idle queue 838 * after every NFS access. 839 * 840 * If we repeatedly push them from the idle queue then 841 * NFS users may be unhappy as an extra buf cache operation 842 * is incurred for every NFS operation to these files. 843 * 844 * It's not common, but I have seen it happen. 845 * 846 */ 847 if (ufs_idle_q.uq_ne < (ufs_idle_q.uq_lowat >> 1)) 848 return; 849 mutex_enter(&ufs_idle_q.uq_mutex); 850 cv_broadcast(&ufs_idle_q.uq_cv); 851 mutex_exit(&ufs_idle_q.uq_mutex); 852 } 853 854 /* 855 * Free up some idle inodes 856 */ 857 void 858 ufs_idle_some(int ne) 859 { 860 int i; 861 struct inode *ip; 862 struct vnode *vp; 863 static int junk_rotor = 0; 864 static int useful_rotor = 0; 865 866 for (i = 0; i < ne; ++i) { 867 mutex_enter(&ufs_idle_q.uq_mutex); 868 869 if (ufs_njunk_iq) { 870 while (ufs_junk_iq[junk_rotor].i_freef == 871 (inode_t *)&ufs_junk_iq[junk_rotor]) { 872 junk_rotor = IQNEXT(junk_rotor); 873 } 874 ip = ufs_junk_iq[junk_rotor].i_freef; 875 ASSERT(ip->i_flag & IJUNKIQ); 876 } else if (ufs_nuseful_iq) { 877 while (ufs_useful_iq[useful_rotor].i_freef == 878 (inode_t *)&ufs_useful_iq[useful_rotor]) { 879 useful_rotor = IQNEXT(useful_rotor); 880 } 881 ip = ufs_useful_iq[useful_rotor].i_freef; 882 ASSERT(!(ip->i_flag & IJUNKIQ)); 883 } else { 884 mutex_exit(&ufs_idle_q.uq_mutex); 885 return; 886 } 887 888 /* 889 * emulate ufs_iget 890 */ 891 vp = ITOV(ip); 892 VN_HOLD(vp); 893 mutex_exit(&ufs_idle_q.uq_mutex); 894 rw_enter(&ip->i_contents, RW_WRITER); 895 /* 896 * VN_RELE should not be called if 897 * ufs_rmidle returns true, as it will 898 * effectively be done in ufs_idle_free. 
899 */ 900 if (ufs_rmidle(ip)) { 901 rw_exit(&ip->i_contents); 902 ufs_idle_free(ip); 903 } else { 904 rw_exit(&ip->i_contents); 905 VN_RELE(vp); 906 } 907 } 908 } 909 910 /* 911 * drain entries for vfsp from the idle queue 912 * vfsp == NULL means drain the entire thing 913 */ 914 void 915 ufs_idle_drain(struct vfs *vfsp) 916 { 917 struct inode *ip, *nip; 918 struct inode *ianchor = NULL; 919 int i; 920 921 mutex_enter(&ufs_idle_q.uq_mutex); 922 if (ufs_njunk_iq) { 923 /* for each hash q */ 924 for (i = 0; i < ufs_niqhash; i++) { 925 /* search down the hash q */ 926 for (ip = ufs_junk_iq[i].i_freef; 927 ip != (inode_t *)&ufs_junk_iq[i]; 928 ip = ip->i_freef) { 929 if (ip->i_vfs == vfsp || vfsp == NULL) { 930 /* found a matching entry */ 931 VN_HOLD(ITOV(ip)); 932 mutex_exit(&ufs_idle_q.uq_mutex); 933 rw_enter(&ip->i_contents, RW_WRITER); 934 /* 935 * See comments in ufs_idle_some() 936 * as we will call ufs_idle_free() 937 * after scanning both queues. 938 */ 939 if (ufs_rmidle(ip)) { 940 rw_exit(&ip->i_contents); 941 ip->i_freef = ianchor; 942 ianchor = ip; 943 } else { 944 rw_exit(&ip->i_contents); 945 VN_RELE(ITOV(ip)); 946 } 947 /* restart this hash q */ 948 ip = (inode_t *)&ufs_junk_iq[i]; 949 mutex_enter(&ufs_idle_q.uq_mutex); 950 } 951 } 952 } 953 } 954 if (ufs_nuseful_iq) { 955 /* for each hash q */ 956 for (i = 0; i < ufs_niqhash; i++) { 957 /* search down the hash q */ 958 for (ip = ufs_useful_iq[i].i_freef; 959 ip != (inode_t *)&ufs_useful_iq[i]; 960 ip = ip->i_freef) { 961 if (ip->i_vfs == vfsp || vfsp == NULL) { 962 /* found a matching entry */ 963 VN_HOLD(ITOV(ip)); 964 mutex_exit(&ufs_idle_q.uq_mutex); 965 rw_enter(&ip->i_contents, RW_WRITER); 966 /* 967 * See comments in ufs_idle_some() 968 * as we will call ufs_idle_free() 969 * after scanning both queues. 
970 */ 971 if (ufs_rmidle(ip)) { 972 rw_exit(&ip->i_contents); 973 ip->i_freef = ianchor; 974 ianchor = ip; 975 } else { 976 rw_exit(&ip->i_contents); 977 VN_RELE(ITOV(ip)); 978 } 979 /* restart this hash q */ 980 ip = (inode_t *)&ufs_useful_iq[i]; 981 mutex_enter(&ufs_idle_q.uq_mutex); 982 } 983 } 984 } 985 } 986 987 mutex_exit(&ufs_idle_q.uq_mutex); 988 /* no more matching entries, release those we have found (if any) */ 989 for (ip = ianchor; ip; ip = nip) { 990 nip = ip->i_freef; 991 ip->i_freef = ip; 992 ufs_idle_free(ip); 993 } 994 } 995 996 /* 997 * RECLAIM DELETED INODES 998 * The following thread scans the file system once looking for deleted files 999 */ 1000 void 1001 ufs_thread_reclaim(struct vfs *vfsp) 1002 { 1003 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 1004 struct ufs_q *uq = &ufsvfsp->vfs_reclaim; 1005 struct fs *fs = ufsvfsp->vfs_fs; 1006 struct buf *bp = 0; 1007 int err = 0; 1008 daddr_t bno; 1009 ino_t ino; 1010 struct dinode *dp; 1011 struct inode *ip; 1012 callb_cpr_t cprinfo; 1013 1014 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr, 1015 "ufsreclaim"); 1016 1017 /* 1018 * mount decided that we don't need a reclaim thread 1019 */ 1020 if ((fs->fs_reclaim & FS_RECLAIMING) == 0) 1021 err++; 1022 1023 /* 1024 * don't reclaim if readonly 1025 */ 1026 if (fs->fs_ronly) 1027 err++; 1028 1029 for (ino = 0; ino < (fs->fs_ncg * fs->fs_ipg) && !err; ++ino) { 1030 1031 /* 1032 * Check whether we are the target of another 1033 * thread having called ufs_thread_exit() or 1034 * ufs_thread_suspend(). 1035 */ 1036 mutex_enter(&uq->uq_mutex); 1037 again: 1038 if (uq->uq_flags & UQ_EXIT) { 1039 err++; 1040 mutex_exit(&uq->uq_mutex); 1041 break; 1042 } else if (uq->uq_flags & UQ_SUSPEND) { 1043 uq->uq_flags |= UQ_SUSPENDED; 1044 /* 1045 * Release the buf before we cv_wait() 1046 * otherwise we may deadlock with the 1047 * thread that called ufs_thread_suspend(). 
1048 */ 1049 if (bp) { 1050 brelse(bp); 1051 bp = 0; 1052 } 1053 if (uq->uq_flags & UQ_WAIT) { 1054 uq->uq_flags &= ~UQ_WAIT; 1055 cv_broadcast(&uq->uq_cv); 1056 } 1057 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1058 cv_wait(&uq->uq_cv, &uq->uq_mutex); 1059 CALLB_CPR_SAFE_END(&cprinfo, &uq->uq_mutex); 1060 goto again; 1061 } 1062 mutex_exit(&uq->uq_mutex); 1063 1064 /* 1065 * if we don't already have the buf; get it 1066 */ 1067 bno = fsbtodb(fs, itod(fs, ino)); 1068 if ((bp == 0) || (bp->b_blkno != bno)) { 1069 if (bp) 1070 brelse(bp); 1071 bp = UFS_BREAD(ufsvfsp, 1072 ufsvfsp->vfs_dev, bno, fs->fs_bsize); 1073 bp->b_flags |= B_AGE; 1074 } 1075 if (bp->b_flags & B_ERROR) { 1076 err++; 1077 continue; 1078 } 1079 /* 1080 * nlink <= 0 and mode != 0 means deleted 1081 */ 1082 dp = (struct dinode *)bp->b_un.b_addr + itoo(fs, ino); 1083 if ((dp->di_nlink <= 0) && (dp->di_mode != 0)) { 1084 /* 1085 * can't hold the buf (deadlock) 1086 */ 1087 brelse(bp); 1088 bp = 0; 1089 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1090 /* 1091 * iget/iput sequence will put inode on ifree 1092 * thread queue if it is idle. 
This is a nop 1093 * for busy (open, deleted) inodes 1094 */ 1095 if (ufs_iget(vfsp, ino, &ip, CRED())) 1096 err++; 1097 else 1098 VN_RELE(ITOV(ip)); 1099 rw_exit(&ufsvfsp->vfs_dqrwlock); 1100 } 1101 } 1102 1103 if (bp) 1104 brelse(bp); 1105 if (!err) { 1106 /* 1107 * reset the reclaiming-bit 1108 */ 1109 mutex_enter(&ufsvfsp->vfs_lock); 1110 fs->fs_reclaim &= ~FS_RECLAIMING; 1111 mutex_exit(&ufsvfsp->vfs_lock); 1112 TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_RECLAIM); 1113 } 1114 1115 /* 1116 * exit the reclaim thread 1117 */ 1118 mutex_enter(&uq->uq_mutex); 1119 uq->uq_threadp = NULL; 1120 uq->uq_flags &= ~UQ_WAIT; 1121 cv_broadcast(&uq->uq_cv); 1122 CALLB_CPR_EXIT(&cprinfo); 1123 thread_exit(); 1124 } 1125 /* 1126 * HLOCK FILE SYSTEM 1127 * hlock the file system's whose logs have device errors 1128 */ 1129 struct ufs_q ufs_hlock; 1130 /*ARGSUSED*/ 1131 void 1132 ufs_thread_hlock(void *ignore) 1133 { 1134 int retry; 1135 callb_cpr_t cprinfo; 1136 1137 CALLB_CPR_INIT(&cprinfo, &ufs_hlock.uq_mutex, callb_generic_cpr, 1138 "ufshlock"); 1139 1140 for (;;) { 1141 /* 1142 * sleep until there is work to do 1143 */ 1144 mutex_enter(&ufs_hlock.uq_mutex); 1145 (void) ufs_thread_run(&ufs_hlock, &cprinfo); 1146 ufs_hlock.uq_ne = 0; 1147 mutex_exit(&ufs_hlock.uq_mutex); 1148 /* 1149 * hlock the error'ed fs's 1150 * retry after a bit if another app is doing lockfs stuff 1151 */ 1152 do { 1153 retry = ufs_trans_hlock(); 1154 if (retry) { 1155 mutex_enter(&ufs_hlock.uq_mutex); 1156 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1157 (void) cv_reltimedwait(&ufs_hlock.uq_cv, 1158 &ufs_hlock.uq_mutex, hz, TR_CLOCK_TICK); 1159 CALLB_CPR_SAFE_END(&cprinfo, 1160 &ufs_hlock.uq_mutex); 1161 mutex_exit(&ufs_hlock.uq_mutex); 1162 } 1163 } while (retry); 1164 } 1165 } 1166 1167 static void 1168 ufs_attr_purge(struct inode *dp) 1169 { 1170 int err; 1171 int error; 1172 off_t dirsize; /* size of the directory */ 1173 off_t offset; /* offset in the directory */ 1174 int entryoffsetinblk; /* offset of ep in fbp's 
buffer */ 1175 struct inode *tp; 1176 struct fbuf *fbp; /* pointer to directory block */ 1177 struct direct *ep; /* directory entry */ 1178 int trans_size; 1179 int issync; 1180 struct ufsvfs *ufsvfsp = dp->i_ufsvfs; 1181 1182 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1183 1184 fbp = NULL; 1185 dirsize = roundup(dp->i_size, DIRBLKSIZ); 1186 offset = 0; 1187 entryoffsetinblk = 0; 1188 1189 /* 1190 * Purge directory cache 1191 */ 1192 1193 dnlc_dir_purge(&dp->i_danchor); 1194 1195 while (offset < dirsize) { 1196 /* 1197 * If offset is on a block boundary, 1198 * read the next directory block. 1199 * Release previous if it exists. 1200 */ 1201 if (blkoff(dp->i_fs, offset) == 0) { 1202 if (fbp != NULL) { 1203 fbrelse(fbp, S_OTHER); 1204 } 1205 1206 err = blkatoff(dp, offset, (char **)0, &fbp); 1207 if (err) { 1208 goto out; 1209 } 1210 entryoffsetinblk = 0; 1211 } 1212 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk); 1213 if (ep->d_ino == 0 || (ep->d_name[0] == '.' && 1214 ep->d_name[1] == '\0') || 1215 (ep->d_name[0] == '.' && ep->d_name[1] == '.' && 1216 ep->d_name[2] == '\0')) { 1217 1218 entryoffsetinblk += ep->d_reclen; 1219 1220 } else { 1221 1222 if ((err = ufs_iget(dp->i_vfs, ep->d_ino, 1223 &tp, CRED())) != 0) { 1224 goto out; 1225 } 1226 1227 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 1228 trans_size = (int)TOP_REMOVE_SIZE(tp)); 1229 1230 /* 1231 * Delete inode. 1232 */ 1233 1234 dnlc_remove(ITOV(dp), ep->d_name); 1235 1236 rw_enter(&tp->i_contents, RW_WRITER); 1237 tp->i_flag |= ICHG; 1238 tp->i_seq++; 1239 TRANS_INODE(tp->i_ufsvfs, tp); 1240 tp->i_nlink--; 1241 ufs_setreclaim(tp); 1242 ITIMES_NOLOCK(tp); 1243 rw_exit(&tp->i_contents); 1244 1245 VN_RELE(ITOV(tp)); 1246 entryoffsetinblk += ep->d_reclen; 1247 TRANS_END_CSYNC(ufsvfsp, error, 1248 issync, TOP_REMOVE, trans_size); 1249 1250 } 1251 offset += ep->d_reclen; 1252 } 1253 1254 if (fbp) { 1255 fbrelse(fbp, S_OTHER); 1256 } 1257 1258 out: 1259 rw_exit(&ufsvfsp->vfs_dqrwlock); 1260 } 1261