/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
32 */ 33 34 #pragma ident "%Z%%M% %I% %E% SMI" 35 36 #include <sys/types.h> 37 #include <sys/systm.h> 38 #include <sys/errno.h> 39 #include <sys/kmem.h> 40 #include <sys/buf.h> 41 #include <sys/vnode.h> 42 #include <sys/vfs.h> 43 #include <sys/user.h> 44 #include <sys/callb.h> 45 #include <sys/cpuvar.h> 46 #include <sys/fs/ufs_inode.h> 47 #include <sys/fs/ufs_log.h> 48 #include <sys/fs/ufs_trans.h> 49 #include <sys/fs/ufs_acl.h> 50 #include <sys/fs/ufs_bio.h> 51 #include <sys/fs/ufs_fsdir.h> 52 #include <sys/debug.h> 53 #include <sys/cmn_err.h> 54 #include <sys/sysmacros.h> 55 56 extern pri_t minclsyspri; 57 extern int hash2ints(); 58 extern struct kmem_cache *inode_cache; /* cache of free inodes */ 59 extern int ufs_idle_waiters; 60 extern struct instats ins; 61 62 static void ufs_attr_purge(struct inode *); 63 64 /* 65 * initialize a thread's queue struct 66 */ 67 void 68 ufs_thread_init(struct ufs_q *uq, int lowat) 69 { 70 bzero((caddr_t)uq, sizeof (*uq)); 71 cv_init(&uq->uq_cv, NULL, CV_DEFAULT, NULL); 72 mutex_init(&uq->uq_mutex, NULL, MUTEX_DEFAULT, NULL); 73 uq->uq_lowat = lowat; 74 uq->uq_hiwat = 2 * lowat; 75 uq->uq_threadp = NULL; 76 } 77 78 /* 79 * start a thread for a queue (assumes success) 80 */ 81 void 82 ufs_thread_start(struct ufs_q *uq, void (*func)(), struct vfs *vfsp) 83 { 84 mutex_enter(&uq->uq_mutex); 85 if (uq->uq_threadp == NULL) { 86 uq->uq_threadp = thread_create(NULL, 0, func, vfsp, 0, &p0, 87 TS_RUN, minclsyspri); 88 uq->uq_flags = 0; 89 } 90 mutex_exit(&uq->uq_mutex); 91 } 92 93 /* 94 * wait for the thread to exit 95 */ 96 void 97 ufs_thread_exit(struct ufs_q *uq) 98 { 99 kt_did_t ufs_thread_did = 0; 100 101 mutex_enter(&uq->uq_mutex); 102 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED); 103 if (uq->uq_threadp != NULL) { 104 ufs_thread_did = uq->uq_threadp->t_did; 105 uq->uq_flags |= (UQ_EXIT|UQ_WAIT); 106 cv_broadcast(&uq->uq_cv); 107 } 108 mutex_exit(&uq->uq_mutex); 109 110 /* 111 * It's safe to call thread_join() with an already-gone 
112 * t_did, but we have to obtain it before the kernel 113 * thread structure is freed. We do so above under the 114 * protection of the uq_mutex when we're sure the thread 115 * still exists and it's save to de-reference it. 116 * We also have to check if ufs_thread_did is != 0 117 * before calling thread_join() since thread 0 in the system 118 * gets a t_did of 0. 119 */ 120 if (ufs_thread_did) 121 thread_join(ufs_thread_did); 122 } 123 124 /* 125 * wait for a thread to suspend itself on the caller's behalf 126 * the caller is responsible for continuing the thread 127 */ 128 void 129 ufs_thread_suspend(struct ufs_q *uq) 130 { 131 mutex_enter(&uq->uq_mutex); 132 if (uq->uq_threadp != NULL) { 133 /* 134 * wait while another thread is suspending this thread. 135 * no need to do a cv_broadcast(), as whoever suspended 136 * the thread must continue it at some point. 137 */ 138 while ((uq->uq_flags & UQ_SUSPEND) && 139 (uq->uq_threadp != NULL)) { 140 /* 141 * We can't use cv_signal() because if our 142 * signal doesn't happen to hit the desired 143 * thread but instead some other waiter like 144 * ourselves, we'll wait forever for a 145 * response. Well, at least an indeterminate 146 * amount of time until we just happen to get 147 * lucky from whomever did get signalled doing 148 * a cv_signal() of their own. This is an 149 * unfortunate performance lossage. 
150 */ 151 uq->uq_flags |= UQ_WAIT; 152 cv_wait(&uq->uq_cv, &uq->uq_mutex); 153 } 154 155 uq->uq_flags |= (UQ_SUSPEND | UQ_WAIT); 156 157 /* 158 * wait for the thread to suspend itself 159 */ 160 if ((uq->uq_flags & UQ_SUSPENDED) == 0 && 161 (uq->uq_threadp != NULL)) { 162 cv_broadcast(&uq->uq_cv); 163 } 164 165 while (((uq->uq_flags & UQ_SUSPENDED) == 0) && 166 (uq->uq_threadp != NULL)) { 167 cv_wait(&uq->uq_cv, &uq->uq_mutex); 168 } 169 } 170 mutex_exit(&uq->uq_mutex); 171 } 172 173 /* 174 * allow a thread to continue from a ufs_thread_suspend() 175 * This thread must be the same as the thread that called 176 * ufs_thread_suspend. 177 */ 178 void 179 ufs_thread_continue(struct ufs_q *uq) 180 { 181 mutex_enter(&uq->uq_mutex); 182 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED); 183 cv_broadcast(&uq->uq_cv); 184 mutex_exit(&uq->uq_mutex); 185 } 186 187 /* 188 * some common code for managing a threads execution 189 * uq is locked at entry and return 190 * may sleep 191 * may exit 192 */ 193 /* 194 * Kind of a hack passing in the callb_cpr_t * here. 195 * It should really be part of the ufs_q structure. 196 * I did not put it in there because we are already in beta 197 * and I was concerned that changing ufs_inode.h to include 198 * callb.h might break something. 
199 */ 200 int 201 ufs_thread_run(struct ufs_q *uq, callb_cpr_t *cprinfop) 202 { 203 again: 204 ASSERT(uq->uq_ne >= 0); 205 206 if (uq->uq_flags & UQ_SUSPEND) { 207 uq->uq_flags |= UQ_SUSPENDED; 208 } else if (uq->uq_flags & UQ_EXIT) { 209 /* 210 * exiting; empty the queue (may infinite loop) 211 */ 212 if (uq->uq_ne) 213 return (uq->uq_ne); 214 uq->uq_threadp = NULL; 215 if (uq->uq_flags & UQ_WAIT) { 216 cv_broadcast(&uq->uq_cv); 217 } 218 uq->uq_flags &= ~(UQ_EXIT | UQ_WAIT); 219 CALLB_CPR_EXIT(cprinfop); 220 thread_exit(); 221 } else if (uq->uq_ne >= uq->uq_lowat) { 222 /* 223 * process a block of entries until below high water mark 224 */ 225 return (uq->uq_ne - (uq->uq_lowat >> 1)); 226 } else if (uq->uq_flags & UQ_FASTCLIENTS) { 227 /* 228 * Let the fast acting clients through 229 */ 230 return (0); 231 } 232 if (uq->uq_flags & UQ_WAIT) { 233 uq->uq_flags &= ~UQ_WAIT; 234 cv_broadcast(&uq->uq_cv); 235 } 236 CALLB_CPR_SAFE_BEGIN(cprinfop); 237 cv_wait(&uq->uq_cv, &uq->uq_mutex); 238 CALLB_CPR_SAFE_END(cprinfop, &uq->uq_mutex); 239 goto again; 240 } 241 242 /* 243 * DELETE INODE 244 * The following routines implement the protocol for freeing the resources 245 * held by an idle and deleted inode. 
246 */ 247 void 248 ufs_delete(struct ufsvfs *ufsvfsp, struct inode *ip, int dolockfs) 249 { 250 ushort_t mode; 251 struct vnode *vp = ITOV(ip); 252 struct ulockfs *ulp; 253 int trans_size; 254 int dorwlock = ((ip->i_mode & IFMT) == IFREG); 255 int issync; 256 int err; 257 struct inode *dp; 258 struct ufs_q *delq = &ufsvfsp->vfs_delete; 259 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 260 261 /* 262 * not on a trans device or not part of a transaction 263 */ 264 ASSERT(!TRANS_ISTRANS(ufsvfsp) || 265 ((curthread->t_flag & T_DONTBLOCK) == 0)); 266 267 /* 268 * Ignore if deletes are not allowed (wlock/hlock) 269 */ 270 if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) { 271 mutex_enter(&delq->uq_mutex); 272 delq_info->delq_unreclaimed_blocks -= ip->i_blocks; 273 delq_info->delq_unreclaimed_files--; 274 mutex_exit(&delq->uq_mutex); 275 VN_RELE(vp); 276 return; 277 } 278 279 if ((vp->v_count > 1) || (ip->i_mode == 0)) { 280 mutex_enter(&delq->uq_mutex); 281 delq_info->delq_unreclaimed_blocks -= ip->i_blocks; 282 delq_info->delq_unreclaimed_files--; 283 mutex_exit(&delq->uq_mutex); 284 VN_RELE(vp); 285 return; 286 } 287 /* 288 * If we are called as part of setting a fs lock, then only 289 * do part of the lockfs protocol. In other words, don't hang. 290 */ 291 if (dolockfs) { 292 if (ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_DELETE_MASK)) 293 return; 294 } else { 295 /* 296 * check for recursive VOP call 297 */ 298 if (curthread->t_flag & T_DONTBLOCK) { 299 ulp = NULL; 300 } else { 301 ulp = &ufsvfsp->vfs_ulockfs; 302 curthread->t_flag |= T_DONTBLOCK; 303 } 304 } 305 306 /* 307 * Hold rwlock to synchronize with (nfs) writes 308 */ 309 if (dorwlock) 310 rw_enter(&ip->i_rwlock, RW_WRITER); 311 312 /* 313 * Delete the attribute directory. 
314 */ 315 if (ip->i_oeftflag != 0) { 316 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 317 trans_size = (int)TOP_REMOVE_SIZE(ip)); 318 rw_enter(&ip->i_contents, RW_WRITER); 319 err = ufs_iget(ip->i_vfs, ip->i_oeftflag, 320 &dp, CRED()); 321 if (err == 0) { 322 rw_enter(&dp->i_rwlock, RW_WRITER); 323 rw_enter(&dp->i_contents, RW_WRITER); 324 dp->i_flag |= IUPD|ICHG; 325 dp->i_seq++; 326 TRANS_INODE(dp->i_ufsvfs, dp); 327 dp->i_nlink -= 2; 328 ufs_setreclaim(dp); 329 /* 330 * Should get rid of any negative cache entries that 331 * might be lingering, as well as ``.'' and 332 * ``..''. If we don't, the VN_RELE() below 333 * won't actually put dp on the delete queue 334 * and it'll hang out until someone forces it 335 * (lockfs -f, umount, ...). The only reliable 336 * way of doing this at the moment is to call 337 * dnlc_purge_vp(ITOV(dp)), which is unacceptably 338 * slow, so we'll just note the problem in this 339 * comment for now. 340 */ 341 dnlc_remove(ITOV(dp), "."); 342 dnlc_remove(ITOV(dp), ".."); 343 ITIMES_NOLOCK(dp); 344 if (!TRANS_ISTRANS(ufsvfsp)) { 345 ufs_iupdat(dp, I_SYNC); 346 } 347 rw_exit(&dp->i_contents); 348 rw_exit(&dp->i_rwlock); 349 VN_RELE(ITOV(dp)); 350 } 351 /* 352 * Clear out attribute pointer 353 */ 354 ip->i_oeftflag = 0; 355 rw_exit(&ip->i_contents); 356 TRANS_END_CSYNC(ufsvfsp, err, issync, 357 TOP_REMOVE, trans_size); 358 dnlc_remove(ITOV(ip), XATTR_DIR_NAME); 359 } 360 361 if ((ip->i_mode & IFMT) == IFATTRDIR) { 362 ufs_attr_purge(ip); 363 } 364 365 (void) TRANS_ITRUNC(ip, (u_offset_t)0, I_FREE | I_ACCT, CRED()); 366 367 /* 368 * the inode's space has been freed; now free the inode 369 */ 370 if (ulp) { 371 trans_size = TOP_IFREE_SIZE(ip); 372 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IFREE, trans_size); 373 } 374 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 375 rw_enter(&ip->i_contents, RW_WRITER); 376 TRANS_INODE(ufsvfsp, ip); 377 mode = ip->i_mode; 378 ip->i_mode = 0; 379 ip->i_rdev = 0; 380 ip->i_ordev = 0; 381 ip->i_flag |= IMOD; 382 if 
(ip->i_ufs_acl) { 383 (void) ufs_si_free(ip->i_ufs_acl, vp->v_vfsp, CRED()); 384 ip->i_ufs_acl = NULL; 385 ip->i_shadow = 0; 386 } 387 388 /* 389 * This inode is torn down but still retains it's identity 390 * (inode number). It could get recycled soon so it's best 391 * to clean up the vnode just in case. 392 */ 393 mutex_enter(&vp->v_lock); 394 vn_recycle(vp); 395 mutex_exit(&vp->v_lock); 396 397 /* 398 * free the inode 399 */ 400 ufs_ifree(ip, ip->i_number, mode); 401 /* 402 * release quota resources; can't fail 403 */ 404 (void) chkiq((struct ufsvfs *)vp->v_vfsp->vfs_data, 405 /* change */ -1, ip, (uid_t)ip->i_uid, 0, CRED(), 406 (char **)NULL, (size_t *)NULL); 407 dqrele(ip->i_dquot); 408 ip->i_dquot = NULL; 409 ip->i_flag &= ~(IDEL | IDIRECTIO); 410 ip->i_cflags = 0; 411 if (!TRANS_ISTRANS(ufsvfsp)) { 412 ufs_iupdat(ip, I_SYNC); 413 } else { 414 mutex_enter(&delq->uq_mutex); 415 delq_info->delq_unreclaimed_files--; 416 mutex_exit(&delq->uq_mutex); 417 } 418 rw_exit(&ip->i_contents); 419 rw_exit(&ufsvfsp->vfs_dqrwlock); 420 if (dorwlock) 421 rw_exit(&ip->i_rwlock); 422 VN_RELE(vp); 423 424 /* 425 * End of transaction 426 */ 427 if (ulp) { 428 TRANS_END_ASYNC(ufsvfsp, TOP_IFREE, trans_size); 429 if (dolockfs) 430 ufs_lockfs_end(ulp); 431 else 432 curthread->t_flag &= ~T_DONTBLOCK; 433 } 434 } 435 436 /* 437 * Create the delete thread and init the delq_info for this fs 438 */ 439 void 440 ufs_delete_init(struct ufsvfs *ufsvfsp, int lowat) 441 { 442 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 443 444 ufs_thread_init(&ufsvfsp->vfs_delete, lowat); 445 (void) memset((void *)delq_info, 0, sizeof (*delq_info)); 446 cv_init(&delq_info->delq_fast_cv, NULL, CV_DEFAULT, NULL); 447 } 448 449 /* 450 * thread that frees up deleted inodes 451 */ 452 void 453 ufs_thread_delete(struct vfs *vfsp) 454 { 455 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 456 struct ufs_q *uq = &ufsvfsp->vfs_delete; 457 struct ufs_delq_info *delq_info = 
&ufsvfsp->vfs_delete_info; 458 struct inode *ip; 459 long ne; 460 callb_cpr_t cprinfo; 461 462 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr, 463 "ufsdelete"); 464 465 mutex_enter(&uq->uq_mutex); 466 again: 467 /* 468 * Sleep until there is work to do. Only do one entry at 469 * a time, to reduce the wait time for checking for a suspend 470 * or fast-client request. The ?: is for pedantic portability. 471 */ 472 ne = ufs_thread_run(uq, &cprinfo) ? 1 : 0; 473 474 /* 475 * process an entry, if there are any 476 */ 477 if (ne && (ip = uq->uq_ihead)) { 478 /* 479 * process first entry on queue. Assumed conditions are: 480 * ip is held (v_count >= 1) 481 * ip is referenced (i_flag & IREF) 482 * ip is free (i_nlink <= 0) 483 */ 484 if ((uq->uq_ihead = ip->i_freef) == ip) 485 uq->uq_ihead = NULL; 486 ip->i_freef->i_freeb = ip->i_freeb; 487 ip->i_freeb->i_freef = ip->i_freef; 488 ip->i_freef = ip; 489 ip->i_freeb = ip; 490 uq->uq_ne--; 491 mutex_exit(&uq->uq_mutex); 492 ufs_delete(ufsvfsp, ip, 1); 493 mutex_enter(&uq->uq_mutex); 494 } 495 496 /* 497 * If there are any fast clients, let all of them through. 498 * Mainly intended for statvfs(), which doesn't need to do 499 * anything except look at the number of bytes/inodes that 500 * are in the queue. 501 */ 502 if (uq->uq_flags & UQ_FASTCLIENTS) { 503 uq->uq_flags &= ~UQ_FASTCLIENTS; 504 /* 505 * Give clients a chance. The lock exit/entry 506 * allows waiting statvfs threads through. 507 */ 508 cv_broadcast(&delq_info->delq_fast_cv); 509 mutex_exit(&uq->uq_mutex); 510 mutex_enter(&uq->uq_mutex); 511 } 512 goto again; 513 } 514 515 /* 516 * drain ne entries off the delete queue. 
As new queue entries may 517 * be added while we're working, ne is interpreted as follows: 518 * 519 * ne > 0 => remove up to ne entries 520 * ne == 0 => remove all entries currently on the queue 521 * ne == -1 => remove entries until the queue is empty 522 */ 523 void 524 ufs_delete_drain(struct vfs *vfsp, int ne, int dolockfs) 525 { 526 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 527 struct ufs_q *uq; 528 struct inode *ip; 529 int drain_cnt = 0; 530 int done; 531 532 /* 533 * if forcibly unmounted; ignore 534 */ 535 if (ufsvfsp == NULL) 536 return; 537 538 uq = &ufsvfsp->vfs_delete; 539 mutex_enter(&uq->uq_mutex); 540 if (ne == 0) 541 drain_cnt = uq->uq_ne; 542 else if (ne > 0) 543 drain_cnt = ne; 544 545 /* 546 * process up to ne entries 547 */ 548 549 done = 0; 550 while (!done && (ip = uq->uq_ihead)) { 551 if (ne != -1) 552 drain_cnt--; 553 if (ne != -1 && drain_cnt == 0) 554 done = 1; 555 if ((uq->uq_ihead = ip->i_freef) == ip) 556 uq->uq_ihead = NULL; 557 ip->i_freef->i_freeb = ip->i_freeb; 558 ip->i_freeb->i_freef = ip->i_freef; 559 ip->i_freef = ip; 560 ip->i_freeb = ip; 561 uq->uq_ne--; 562 mutex_exit(&uq->uq_mutex); 563 ufs_delete(ufsvfsp, ip, dolockfs); 564 mutex_enter(&uq->uq_mutex); 565 } 566 mutex_exit(&uq->uq_mutex); 567 } 568 569 void 570 ufs_sync_with_thread(struct ufs_q *uq) 571 { 572 mutex_enter(&uq->uq_mutex); 573 574 /* 575 * Wake up delete thread to free up space. 576 */ 577 if ((uq->uq_flags & UQ_WAIT) == 0) { 578 uq->uq_flags |= UQ_WAIT; 579 cv_broadcast(&uq->uq_cv); 580 } 581 582 while ((uq->uq_threadp != NULL) && (uq->uq_flags & UQ_WAIT)) { 583 cv_wait(&uq->uq_cv, &uq->uq_mutex); 584 } 585 586 mutex_exit(&uq->uq_mutex); 587 } 588 589 /* 590 * Get rid of everything that's currently in the delete queue, 591 * plus whatever the delete thread is working on at the moment. 592 * 593 * This ability is required for providing true POSIX semantics 594 * regarding close(2), unlink(2), etc, even when logging is enabled. 
595 * The standard requires that the released space be immediately 596 * observable (statvfs(2)) and allocatable (e.g., write(2)). 597 */ 598 void 599 ufs_delete_drain_wait(struct ufsvfs *ufsvfsp, int dolockfs) 600 { 601 struct ufs_q *uq = &ufsvfsp->vfs_delete; 602 int error; 603 struct ufs_q *delq = &ufsvfsp->vfs_delete; 604 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 605 606 /* 607 * If there is something on delq or delete thread 608 * working on delq. 609 */ 610 mutex_enter(&delq->uq_mutex); 611 if (delq_info->delq_unreclaimed_files > 0) { 612 mutex_exit(&delq->uq_mutex); 613 (void) ufs_delete_drain(ufsvfsp->vfs_vfs, 0, dolockfs); 614 ufs_sync_with_thread(uq); 615 } else { 616 ASSERT(delq_info->delq_unreclaimed_files == 0); 617 mutex_exit(&delq->uq_mutex); 618 return; 619 } 620 621 /* 622 * Commit any outstanding transactions to make sure 623 * any canceled freed blocks are available for allocation. 624 */ 625 curthread->t_flag |= T_DONTBLOCK; 626 TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE, error); 627 if (!error) { 628 TRANS_END_SYNC(ufsvfsp, error, TOP_COMMIT_UPDATE, 629 TOP_COMMIT_SIZE); 630 } 631 curthread->t_flag &= ~T_DONTBLOCK; 632 } 633 634 /* 635 * Adjust the resource usage in a struct statvfs based on 636 * what's in the delete queue. 637 * 638 * We do not consider the impact of ACLs or extended attributes 639 * that may be deleted as a side-effect of deleting a file. 640 * Those are metadata, and their sizes aren't reflected in the 641 * sizes returned by stat(), so this is not a problem. 642 */ 643 void 644 ufs_delete_adjust_stats(struct ufsvfs *ufsvfsp, struct statvfs64 *sp) 645 { 646 struct ufs_q *uq = &ufsvfsp->vfs_delete; 647 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 648 649 /* 650 * We'll get signalled when it's our turn. However, if there's 651 * nothing going on, there's no point in waking up the delete 652 * thread and waiting for it to tell us to continue. 
653 */ 654 mutex_enter(&uq->uq_mutex); 655 656 if ((uq->uq_flags & UQ_FASTCLIENTS) || (uq->uq_ne != 0)) { 657 uq->uq_flags |= UQ_FASTCLIENTS; 658 cv_broadcast(&uq->uq_cv); 659 cv_wait(&delq_info->delq_fast_cv, &uq->uq_mutex); 660 } 661 662 /* 663 * The blocks accounted for in the delete queue info are 664 * counted in DEV_BSIZE chunks, but ufs_statvfs counts in 665 * filesystem fragments, so a conversion is required here. 666 */ 667 sp->f_bfree += dbtofsb(ufsvfsp->vfs_fs, 668 delq_info->delq_unreclaimed_blocks); 669 sp->f_ffree += delq_info->delq_unreclaimed_files; 670 mutex_exit(&uq->uq_mutex); 671 } 672 673 /* 674 * IDLE INODE 675 * The following routines implement the protocol for maintaining an 676 * LRU list of idle inodes and for moving the idle inodes to the 677 * reuse list when the number of allocated inodes exceeds the user 678 * tunable high-water mark (ufs_ninode). 679 */ 680 681 /* 682 * clean an idle inode and move it to the reuse list 683 */ 684 static void 685 ufs_idle_free(struct inode *ip) 686 { 687 int pages; 688 int hno; 689 kmutex_t *ihm; 690 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 691 struct vnode *vp = ITOV(ip); 692 693 /* 694 * inode is held 695 */ 696 697 /* 698 * remember `pages' for stats below 699 */ 700 pages = (ip->i_mode && vn_has_cached_data(vp) && vp->v_type != VCHR); 701 702 /* 703 * start the dirty pages to disk and then invalidate them 704 * unless the inode is invalid (ISTALE) 705 */ 706 if ((ip->i_flag & ISTALE) == 0) { 707 (void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_FREE); 708 (void) TRANS_SYNCIP(ip, 709 (TRANS_ISERROR(ufsvfsp)) ? 710 B_INVAL | B_FORCE : B_INVAL, 711 I_ASYNC, TOP_SYNCIP_FREE); 712 } 713 714 /* 715 * wait for any current ufs_iget to finish and block future ufs_igets 716 */ 717 ASSERT(ip->i_number != 0); 718 hno = INOHASH(ip->i_number); 719 ihm = &ih_lock[hno]; 720 mutex_enter(ihm); 721 722 /* 723 * It must be guaranteed that v_count >= 2, otherwise 724 * something must be wrong with this vnode already. 
725 * That is why we use v_count-- instead of VN_RELE(). 726 * Acquire the vnode lock in case another thread is in 727 * VN_RELE(). 728 */ 729 mutex_enter(&vp->v_lock); 730 731 if (vp->v_count < 2) 732 cmn_err(CE_PANIC, 733 "ufs_idle_free: vnode ref count is less than 2"); 734 735 vp->v_count--; 736 if ((vp->v_type != VCHR && vn_has_cached_data(vp)) || 737 vp->v_count != 1 || 738 ip->i_flag & (IMOD|IMODACC|IACC|ICHG|IUPD|IATTCHG)) { 739 /* 740 * Another thread has referenced this inode while 741 * we are trying to free it. Call VN_RELE() to 742 * release our reference. 743 */ 744 mutex_exit(&vp->v_lock); 745 mutex_exit(ihm); 746 VN_RELE(vp); 747 } else { 748 /* 749 * The inode is currently unreferenced and can not 750 * acquire further references because it has no pages 751 * and the hash is locked. Inodes acquire references 752 * via the hash list or via their pages. 753 */ 754 755 mutex_exit(&vp->v_lock); 756 757 /* 758 * remove it from the cache 759 */ 760 remque(ip); 761 mutex_exit(ihm); 762 /* 763 * Stale inodes have no valid ufsvfs 764 */ 765 if ((ip->i_flag & ISTALE) == 0 && ip->i_dquot) { 766 TRANS_DQRELE(ufsvfsp, ip->i_dquot); 767 ip->i_dquot = NULL; 768 } 769 ufs_si_del(ip); 770 if (pages) { 771 CPU_STATS_ADDQ(CPU, sys, ufsipage, 1); 772 } else { 773 CPU_STATS_ADDQ(CPU, sys, ufsinopage, 1); 774 } 775 ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp)); 776 777 /* 778 * We had better not have a vnode reference count > 1 779 * at this point, if we do then something is broken as 780 * this inode/vnode acquired a reference underneath of us. 
781 */ 782 ASSERT(vp->v_count == 1); 783 784 ufs_free_inode(ip); 785 } 786 } 787 788 /* 789 * this thread processes the global idle queue 790 */ 791 iqhead_t *ufs_junk_iq; 792 iqhead_t *ufs_useful_iq; 793 int ufs_njunk_iq = 0; 794 int ufs_nuseful_iq = 0; 795 int ufs_niqhash; 796 int ufs_iqhashmask; 797 struct ufs_q ufs_idle_q; 798 799 void 800 ufs_thread_idle(void) 801 { 802 callb_cpr_t cprinfo; 803 int i; 804 int ne; 805 806 ufs_niqhash = (ufs_idle_q.uq_lowat >> 1) / IQHASHQLEN; 807 ufs_niqhash = 1 << highbit(ufs_niqhash); /* round up to power of 2 */ 808 ufs_iqhashmask = ufs_niqhash - 1; 809 ufs_junk_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_junk_iq), 810 KM_SLEEP); 811 ufs_useful_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_useful_iq), 812 KM_SLEEP); 813 814 /* Initialize hash queue headers */ 815 for (i = 0; i < ufs_niqhash; i++) { 816 ufs_junk_iq[i].i_freef = (inode_t *)&ufs_junk_iq[i]; 817 ufs_junk_iq[i].i_freeb = (inode_t *)&ufs_junk_iq[i]; 818 ufs_useful_iq[i].i_freef = (inode_t *)&ufs_useful_iq[i]; 819 ufs_useful_iq[i].i_freeb = (inode_t *)&ufs_useful_iq[i]; 820 } 821 822 CALLB_CPR_INIT(&cprinfo, &ufs_idle_q.uq_mutex, callb_generic_cpr, 823 "ufsidle"); 824 again: 825 /* 826 * Whenever the idle thread is awakened, it repeatedly gives 827 * back half of the idle queue until the idle queue falls 828 * below lowat. 829 */ 830 mutex_enter(&ufs_idle_q.uq_mutex); 831 if (ufs_idle_q.uq_ne < ufs_idle_q.uq_lowat) { 832 CALLB_CPR_SAFE_BEGIN(&cprinfo); 833 cv_wait(&ufs_idle_q.uq_cv, &ufs_idle_q.uq_mutex); 834 CALLB_CPR_SAFE_END(&cprinfo, &ufs_idle_q.uq_mutex); 835 } 836 mutex_exit(&ufs_idle_q.uq_mutex); 837 838 /* 839 * Give back 1/2 of the idle queue 840 */ 841 ne = ufs_idle_q.uq_ne >> 1; 842 ins.in_tidles.value.ul += ne; 843 ufs_idle_some(ne); 844 goto again; 845 } 846 847 /* 848 * Reclaim callback for ufs inode cache. 849 * Invoked by the kernel memory allocator when memory gets tight. 
850 */ 851 /*ARGSUSED*/ 852 void 853 ufs_inode_cache_reclaim(void *cdrarg) 854 { 855 /* 856 * If we are low on memory and the idle queue is over its 857 * halfway mark, then free 50% of the idle q 858 * 859 * We don't free all of the idle inodes because the inodes 860 * for popular NFS files may have been kicked from the dnlc. 861 * The inodes for these files will end up on the idle queue 862 * after every NFS access. 863 * 864 * If we repeatedly push them from the idle queue then 865 * NFS users may be unhappy as an extra buf cache operation 866 * is incurred for every NFS operation to these files. 867 * 868 * It's not common, but I have seen it happen. 869 * 870 */ 871 if (ufs_idle_q.uq_ne < (ufs_idle_q.uq_lowat >> 1)) 872 return; 873 mutex_enter(&ufs_idle_q.uq_mutex); 874 cv_broadcast(&ufs_idle_q.uq_cv); 875 mutex_exit(&ufs_idle_q.uq_mutex); 876 } 877 878 /* 879 * Free up some idle inodes 880 */ 881 void 882 ufs_idle_some(int ne) 883 { 884 int i; 885 struct inode *ip; 886 struct vnode *vp; 887 static int junk_rotor = 0; 888 static int useful_rotor = 0; 889 890 for (i = 0; i < ne; ++i) { 891 mutex_enter(&ufs_idle_q.uq_mutex); 892 893 if (ufs_njunk_iq) { 894 while (ufs_junk_iq[junk_rotor].i_freef == 895 (inode_t *)&ufs_junk_iq[junk_rotor]) { 896 junk_rotor = IQNEXT(junk_rotor); 897 } 898 ip = ufs_junk_iq[junk_rotor].i_freef; 899 ASSERT(ip->i_flag & IJUNKIQ); 900 } else if (ufs_nuseful_iq) { 901 while (ufs_useful_iq[useful_rotor].i_freef == 902 (inode_t *)&ufs_useful_iq[useful_rotor]) { 903 useful_rotor = IQNEXT(useful_rotor); 904 } 905 ip = ufs_useful_iq[useful_rotor].i_freef; 906 ASSERT(!(ip->i_flag & IJUNKIQ)); 907 } else { 908 mutex_exit(&ufs_idle_q.uq_mutex); 909 return; 910 } 911 912 /* 913 * emulate ufs_iget 914 */ 915 vp = ITOV(ip); 916 VN_HOLD(vp); 917 mutex_exit(&ufs_idle_q.uq_mutex); 918 rw_enter(&ip->i_contents, RW_WRITER); 919 /* 920 * VN_RELE should not be called if 921 * ufs_rmidle returns true, as it will 922 * effectively be done in ufs_idle_free. 
923 */ 924 if (ufs_rmidle(ip)) { 925 rw_exit(&ip->i_contents); 926 ufs_idle_free(ip); 927 } else { 928 rw_exit(&ip->i_contents); 929 VN_RELE(vp); 930 } 931 } 932 } 933 934 /* 935 * drain entries for vfsp from the idle queue 936 * vfsp == NULL means drain the entire thing 937 */ 938 void 939 ufs_idle_drain(struct vfs *vfsp) 940 { 941 struct inode *ip, *nip; 942 struct inode *ianchor = NULL; 943 int i; 944 945 mutex_enter(&ufs_idle_q.uq_mutex); 946 if (ufs_njunk_iq) { 947 /* for each hash q */ 948 for (i = 0; i < ufs_niqhash; i++) { 949 /* search down the hash q */ 950 for (ip = ufs_junk_iq[i].i_freef; 951 ip != (inode_t *)&ufs_junk_iq[i]; 952 ip = ip->i_freef) { 953 if (ip->i_vfs == vfsp || vfsp == NULL) { 954 /* found a matching entry */ 955 VN_HOLD(ITOV(ip)); 956 mutex_exit(&ufs_idle_q.uq_mutex); 957 rw_enter(&ip->i_contents, RW_WRITER); 958 /* 959 * See comments in ufs_idle_some() 960 * as we will call ufs_idle_free() 961 * after scanning both queues. 962 */ 963 if (ufs_rmidle(ip)) { 964 rw_exit(&ip->i_contents); 965 ip->i_freef = ianchor; 966 ianchor = ip; 967 } else { 968 rw_exit(&ip->i_contents); 969 VN_RELE(ITOV(ip)); 970 } 971 /* restart this hash q */ 972 ip = (inode_t *)&ufs_junk_iq[i]; 973 mutex_enter(&ufs_idle_q.uq_mutex); 974 } 975 } 976 } 977 } 978 if (ufs_nuseful_iq) { 979 /* for each hash q */ 980 for (i = 0; i < ufs_niqhash; i++) { 981 /* search down the hash q */ 982 for (ip = ufs_useful_iq[i].i_freef; 983 ip != (inode_t *)&ufs_useful_iq[i]; 984 ip = ip->i_freef) { 985 if (ip->i_vfs == vfsp || vfsp == NULL) { 986 /* found a matching entry */ 987 VN_HOLD(ITOV(ip)); 988 mutex_exit(&ufs_idle_q.uq_mutex); 989 rw_enter(&ip->i_contents, RW_WRITER); 990 /* 991 * See comments in ufs_idle_some() 992 * as we will call ufs_idle_free() 993 * after scanning both queues. 
994 */ 995 if (ufs_rmidle(ip)) { 996 rw_exit(&ip->i_contents); 997 ip->i_freef = ianchor; 998 ianchor = ip; 999 } else { 1000 rw_exit(&ip->i_contents); 1001 VN_RELE(ITOV(ip)); 1002 } 1003 /* restart this hash q */ 1004 ip = (inode_t *)&ufs_useful_iq[i]; 1005 mutex_enter(&ufs_idle_q.uq_mutex); 1006 } 1007 } 1008 } 1009 } 1010 1011 mutex_exit(&ufs_idle_q.uq_mutex); 1012 /* no more matching entries, release those we have found (if any) */ 1013 for (ip = ianchor; ip; ip = nip) { 1014 nip = ip->i_freef; 1015 ip->i_freef = ip; 1016 ufs_idle_free(ip); 1017 } 1018 } 1019 1020 /* 1021 * RECLAIM DELETED INODES 1022 * The following thread scans the file system once looking for deleted files 1023 */ 1024 void 1025 ufs_thread_reclaim(struct vfs *vfsp) 1026 { 1027 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 1028 struct ufs_q *uq = &ufsvfsp->vfs_reclaim; 1029 struct fs *fs = ufsvfsp->vfs_fs; 1030 struct buf *bp = 0; 1031 int err = 0; 1032 daddr_t bno; 1033 ino_t ino; 1034 struct dinode *dp; 1035 struct inode *ip; 1036 callb_cpr_t cprinfo; 1037 1038 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr, 1039 "ufsreclaim"); 1040 1041 /* 1042 * mount decided that we don't need a reclaim thread 1043 */ 1044 if ((fs->fs_reclaim & FS_RECLAIMING) == 0) 1045 err++; 1046 1047 /* 1048 * don't reclaim if readonly 1049 */ 1050 if (fs->fs_ronly) 1051 err++; 1052 1053 for (ino = 0; ino < (fs->fs_ncg * fs->fs_ipg) && !err; ++ino) { 1054 1055 /* 1056 * Check whether we are the target of another 1057 * thread having called ufs_thread_exit() or 1058 * ufs_thread_suspend(). 1059 */ 1060 mutex_enter(&uq->uq_mutex); 1061 again: 1062 if (uq->uq_flags & UQ_EXIT) { 1063 err++; 1064 mutex_exit(&uq->uq_mutex); 1065 break; 1066 } else if (uq->uq_flags & UQ_SUSPEND) { 1067 uq->uq_flags |= UQ_SUSPENDED; 1068 /* 1069 * Release the buf before we cv_wait() 1070 * otherwise we may deadlock with the 1071 * thread that called ufs_thread_suspend(). 
1072 */ 1073 if (bp) { 1074 brelse(bp); 1075 bp = 0; 1076 } 1077 if (uq->uq_flags & UQ_WAIT) { 1078 uq->uq_flags &= ~UQ_WAIT; 1079 cv_broadcast(&uq->uq_cv); 1080 } 1081 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1082 cv_wait(&uq->uq_cv, &uq->uq_mutex); 1083 CALLB_CPR_SAFE_END(&cprinfo, &uq->uq_mutex); 1084 goto again; 1085 } 1086 mutex_exit(&uq->uq_mutex); 1087 1088 /* 1089 * if we don't already have the buf; get it 1090 */ 1091 bno = fsbtodb(fs, itod(fs, ino)); 1092 if ((bp == 0) || (bp->b_blkno != bno)) { 1093 if (bp) 1094 brelse(bp); 1095 bp = UFS_BREAD(ufsvfsp, 1096 ufsvfsp->vfs_dev, bno, fs->fs_bsize); 1097 bp->b_flags |= B_AGE; 1098 } 1099 if (bp->b_flags & B_ERROR) { 1100 err++; 1101 continue; 1102 } 1103 /* 1104 * nlink <= 0 and mode != 0 means deleted 1105 */ 1106 dp = (struct dinode *)bp->b_un.b_addr + itoo(fs, ino); 1107 if ((dp->di_nlink <= 0) && (dp->di_mode != 0)) { 1108 /* 1109 * can't hold the buf (deadlock) 1110 */ 1111 brelse(bp); 1112 bp = 0; 1113 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1114 /* 1115 * iget/iput sequence will put inode on ifree 1116 * thread queue if it is idle. 
This is a nop 1117 * for busy (open, deleted) inodes 1118 */ 1119 if (ufs_iget(vfsp, ino, &ip, CRED())) 1120 err++; 1121 else 1122 VN_RELE(ITOV(ip)); 1123 rw_exit(&ufsvfsp->vfs_dqrwlock); 1124 } 1125 } 1126 1127 if (bp) 1128 brelse(bp); 1129 if (!err) { 1130 /* 1131 * reset the reclaiming-bit 1132 */ 1133 mutex_enter(&ufsvfsp->vfs_lock); 1134 fs->fs_reclaim &= ~FS_RECLAIMING; 1135 mutex_exit(&ufsvfsp->vfs_lock); 1136 TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_RECLAIM); 1137 } 1138 1139 /* 1140 * exit the reclaim thread 1141 */ 1142 mutex_enter(&uq->uq_mutex); 1143 uq->uq_threadp = NULL; 1144 uq->uq_flags &= ~UQ_WAIT; 1145 cv_broadcast(&uq->uq_cv); 1146 CALLB_CPR_EXIT(&cprinfo); 1147 thread_exit(); 1148 } 1149 /* 1150 * HLOCK FILE SYSTEM 1151 * hlock the file system's whose logs have device errors 1152 */ 1153 struct ufs_q ufs_hlock; 1154 /*ARGSUSED*/ 1155 void 1156 ufs_thread_hlock(void *ignore) 1157 { 1158 int retry; 1159 callb_cpr_t cprinfo; 1160 1161 CALLB_CPR_INIT(&cprinfo, &ufs_hlock.uq_mutex, callb_generic_cpr, 1162 "ufshlock"); 1163 1164 for (;;) { 1165 /* 1166 * sleep until there is work to do 1167 */ 1168 mutex_enter(&ufs_hlock.uq_mutex); 1169 (void) ufs_thread_run(&ufs_hlock, &cprinfo); 1170 ufs_hlock.uq_ne = 0; 1171 mutex_exit(&ufs_hlock.uq_mutex); 1172 /* 1173 * hlock the error'ed fs's 1174 * retry after a bit if another app is doing lockfs stuff 1175 */ 1176 do { 1177 retry = ufs_trans_hlock(); 1178 if (retry) { 1179 mutex_enter(&ufs_hlock.uq_mutex); 1180 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1181 (void) cv_timedwait(&ufs_hlock.uq_cv, 1182 &ufs_hlock.uq_mutex, 1183 lbolt + hz); 1184 CALLB_CPR_SAFE_END(&cprinfo, 1185 &ufs_hlock.uq_mutex); 1186 mutex_exit(&ufs_hlock.uq_mutex); 1187 } 1188 } while (retry); 1189 } 1190 } 1191 1192 static void 1193 ufs_attr_purge(struct inode *dp) 1194 { 1195 int err; 1196 int error; 1197 off_t dirsize; /* size of the directory */ 1198 off_t offset; /* offset in the directory */ 1199 int entryoffsetinblk; /* offset of ep in fbp's buffer 
*/ 1200 struct inode *tp; 1201 struct fbuf *fbp; /* pointer to directory block */ 1202 struct direct *ep; /* directory entry */ 1203 int trans_size; 1204 int issync; 1205 struct ufsvfs *ufsvfsp = dp->i_ufsvfs; 1206 1207 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1208 1209 fbp = NULL; 1210 dirsize = roundup(dp->i_size, DIRBLKSIZ); 1211 offset = 0; 1212 entryoffsetinblk = 0; 1213 1214 /* 1215 * Purge directory cache 1216 */ 1217 1218 dnlc_dir_purge(&dp->i_danchor); 1219 1220 while (offset < dirsize) { 1221 /* 1222 * If offset is on a block boundary, 1223 * read the next directory block. 1224 * Release previous if it exists. 1225 */ 1226 if (blkoff(dp->i_fs, offset) == 0) { 1227 if (fbp != NULL) { 1228 fbrelse(fbp, S_OTHER); 1229 } 1230 1231 err = blkatoff(dp, offset, (char **)0, &fbp); 1232 if (err) { 1233 goto out; 1234 } 1235 entryoffsetinblk = 0; 1236 } 1237 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk); 1238 if (ep->d_ino == 0 || (ep->d_name[0] == '.' && 1239 ep->d_name[1] == '\0') || 1240 (ep->d_name[0] == '.' && ep->d_name[1] == '.' && 1241 ep->d_name[2] == '\0')) { 1242 1243 entryoffsetinblk += ep->d_reclen; 1244 1245 } else { 1246 1247 if ((err = ufs_iget(dp->i_vfs, ep->d_ino, 1248 &tp, CRED())) != 0) { 1249 goto out; 1250 } 1251 1252 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE, 1253 trans_size = (int)TOP_REMOVE_SIZE(tp)); 1254 1255 /* 1256 * Delete inode. 1257 */ 1258 1259 dnlc_remove(ITOV(dp), ep->d_name); 1260 1261 rw_enter(&tp->i_contents, RW_WRITER); 1262 tp->i_flag |= ICHG; 1263 tp->i_seq++; 1264 TRANS_INODE(tp->i_ufsvfs, tp); 1265 tp->i_nlink--; 1266 ufs_setreclaim(tp); 1267 ITIMES_NOLOCK(tp); 1268 rw_exit(&tp->i_contents); 1269 1270 VN_RELE(ITOV(tp)); 1271 entryoffsetinblk += ep->d_reclen; 1272 TRANS_END_CSYNC(ufsvfsp, error, 1273 issync, TOP_REMOVE, trans_size); 1274 1275 } 1276 offset += ep->d_reclen; 1277 } 1278 1279 if (fbp) { 1280 fbrelse(fbp, S_OTHER); 1281 } 1282 1283 out: 1284 rw_exit(&ufsvfsp->vfs_dqrwlock); 1285 } 1286