/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/user.h>
#include <sys/callb.h>
#include <sys/cpuvar.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <vm/pvn.h>

extern pri_t minclsyspri;
extern int hash2ints();
extern struct kmem_cache *inode_cache;	/* cache of free inodes */
extern int ufs_idle_waiters;
extern struct instats ins;

static void ufs_attr_purge(struct inode *);

/*
 * initialize a thread's queue struct
 */
void
ufs_thread_init(struct ufs_q *uq, int lowat)
{
        bzero((caddr_t)uq, sizeof (*uq));
        cv_init(&uq->uq_cv, NULL, CV_DEFAULT, NULL);
        mutex_init(&uq->uq_mutex, NULL, MUTEX_DEFAULT, NULL);
        uq->uq_lowat = lowat;
        uq->uq_hiwat = 2 * lowat;
        uq->uq_threadp = NULL;
}

/*
 * start a thread for a queue (assumes success)
 */
void
ufs_thread_start(struct ufs_q *uq, void (*func)(), struct vfs *vfsp)
{
        mutex_enter(&uq->uq_mutex);
        if (uq->uq_threadp == NULL) {
                uq->uq_threadp = thread_create(NULL, 0, func, vfsp, 0, &p0,
                    TS_RUN, minclsyspri);
                uq->uq_flags = 0;
        }
        mutex_exit(&uq->uq_mutex);
}
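
/*
 * A rough sketch of how a ufs_q and its worker are typically tied
 * together over the life of a filesystem; the worker function name
 * (my_worker) is illustrative only:
 *
 *	struct ufs_q *uq = &ufsvfsp->vfs_delete;
 *
 *	ufs_thread_init(uq, lowat);		(uq_lowat = lowat, uq_hiwat = 2 * lowat)
 *	ufs_thread_start(uq, my_worker, vfsp);	(worker loops on ufs_thread_run())
 *	...
 *	ufs_thread_suspend(uq);			(e.g. around a lockfs operation)
 *	ufs_thread_continue(uq);
 *	...
 *	ufs_thread_exit(uq);			(at unmount; joins the worker)
 */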

/*
 * wait for the thread to exit
 */
void
ufs_thread_exit(struct ufs_q *uq)
{
        kt_did_t ufs_thread_did = 0;

        mutex_enter(&uq->uq_mutex);
        uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
        if (uq->uq_threadp != NULL) {
                ufs_thread_did = uq->uq_threadp->t_did;
                uq->uq_flags |= (UQ_EXIT|UQ_WAIT);
                cv_broadcast(&uq->uq_cv);
        }
        mutex_exit(&uq->uq_mutex);

        /*
         * It's safe to call thread_join() with an already-gone
         * t_did, but we have to obtain it before the kernel
         * thread structure is freed.  We do so above under the
         * protection of the uq_mutex when we're sure the thread
         * still exists and it's safe to dereference it.
         * We also have to check that ufs_thread_did is != 0
         * before calling thread_join() since thread 0 in the system
         * gets a t_did of 0.
         */
        if (ufs_thread_did)
                thread_join(ufs_thread_did);
}

/*
 * wait for a thread to suspend itself on the caller's behalf
 * the caller is responsible for continuing the thread
 */
void
ufs_thread_suspend(struct ufs_q *uq)
{
        mutex_enter(&uq->uq_mutex);
        if (uq->uq_threadp != NULL) {
                /*
                 * wait while another thread is suspending this thread.
                 * no need to do a cv_broadcast(), as whoever suspended
                 * the thread must continue it at some point.
                 */
                while ((uq->uq_flags & UQ_SUSPEND) &&
                    (uq->uq_threadp != NULL)) {
                        /*
                         * We can't use cv_signal() because if our
                         * signal doesn't happen to hit the desired
                         * thread but instead some other waiter like
                         * ourselves, we'll wait forever for a
                         * response.  Well, at least an indeterminate
                         * amount of time until we just happen to get
                         * lucky from whomever did get signalled doing
                         * a cv_signal() of their own.  This is an
                         * unfortunate performance lossage.
                         */
                        uq->uq_flags |= UQ_WAIT;
                        cv_wait(&uq->uq_cv, &uq->uq_mutex);
                }

                uq->uq_flags |= (UQ_SUSPEND | UQ_WAIT);

                /*
                 * wait for the thread to suspend itself
                 */
                if ((uq->uq_flags & UQ_SUSPENDED) == 0 &&
                    (uq->uq_threadp != NULL)) {
                        cv_broadcast(&uq->uq_cv);
                }

                while (((uq->uq_flags & UQ_SUSPENDED) == 0) &&
                    (uq->uq_threadp != NULL)) {
                        cv_wait(&uq->uq_cv, &uq->uq_mutex);
                }
        }
        mutex_exit(&uq->uq_mutex);
}

/*
 * allow a thread to continue from a ufs_thread_suspend()
 * This thread must be the same as the thread that called
 * ufs_thread_suspend.
 */
void
ufs_thread_continue(struct ufs_q *uq)
{
        mutex_enter(&uq->uq_mutex);
        uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
        cv_broadcast(&uq->uq_cv);
        mutex_exit(&uq->uq_mutex);
}

/*
 * some common code for managing a thread's execution
 * uq is locked at entry and return
 * may sleep
 * may exit
 */
/*
 * Kind of a hack passing in the callb_cpr_t * here.
 * It should really be part of the ufs_q structure.
 * I did not put it in there because we are already in beta
 * and I was concerned that changing ufs_inode.h to include
 * callb.h might break something.
 */
int
ufs_thread_run(struct ufs_q *uq, callb_cpr_t *cprinfop)
{
again:
        ASSERT(uq->uq_ne >= 0);

        if (uq->uq_flags & UQ_SUSPEND) {
                uq->uq_flags |= UQ_SUSPENDED;
        } else if (uq->uq_flags & UQ_EXIT) {
                /*
                 * exiting; empty the queue (may infinite loop)
                 */
                if (uq->uq_ne)
                        return (uq->uq_ne);
                uq->uq_threadp = NULL;
                if (uq->uq_flags & UQ_WAIT) {
                        cv_broadcast(&uq->uq_cv);
                }
                uq->uq_flags &= ~(UQ_EXIT | UQ_WAIT);
                CALLB_CPR_EXIT(cprinfop);
                thread_exit();
        } else if (uq->uq_ne >= uq->uq_lowat) {
                /*
                 * process a block of entries until below high water mark
                 */
                return (uq->uq_ne - (uq->uq_lowat >> 1));
        }
        if (uq->uq_flags & UQ_WAIT) {
                uq->uq_flags &= ~UQ_WAIT;
                cv_broadcast(&uq->uq_cv);
        }
        CALLB_CPR_SAFE_BEGIN(cprinfop);
        cv_wait(&uq->uq_cv, &uq->uq_mutex);
        CALLB_CPR_SAFE_END(cprinfop, &uq->uq_mutex);
        goto again;
}
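
/*
 * The workers later in this file (ufs_thread_delete(), ufs_thread_hlock())
 * are all built on ufs_thread_run().  A minimal worker, sketched here with
 * illustrative names (my_worker, my_process_entry), looks roughly like:
 *
 *	void
 *	my_worker(struct vfs *vfsp)
 *	{
 *		struct ufs_q *uq = ...;
 *		callb_cpr_t cprinfo;
 *		long ne;
 *
 *		CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
 *		    "myworker");
 *		mutex_enter(&uq->uq_mutex);
 *		for (;;) {
 *			ne = ufs_thread_run(uq, &cprinfo);
 *			while (ne-- > 0) {
 *				(take one entry off the queue, uq_ne--)
 *				mutex_exit(&uq->uq_mutex);
 *				my_process_entry(...);
 *				mutex_enter(&uq->uq_mutex);
 *			}
 *		}
 *	}
 *
 * ufs_thread_run() is called and returns with uq_mutex held.  On an exit
 * request it keeps returning work until the queue is empty, then calls
 * thread_exit() itself, so the worker loop never has to unwind.
 */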

/*
 * DELETE INODE
 * The following routines implement the protocol for freeing the resources
 * held by an idle and deleted inode.
 */
void
ufs_delete(struct ufsvfs *ufsvfsp, struct inode *ip, int dolockfs)
{
        ushort_t mode;
        struct vnode *vp = ITOV(ip);
        struct ulockfs *ulp;
        int trans_size;
        int dorwlock = ((ip->i_mode & IFMT) == IFREG);
        int issync;
        int err;
        struct inode *dp;
        struct ufs_q *delq = &ufsvfsp->vfs_delete;
        struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

        /*
         * Ignore if deletes are not allowed (wlock/hlock)
         */
        if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
                mutex_enter(&delq->uq_mutex);
                delq_info->delq_unreclaimed_blocks -= ip->i_blocks;
                delq_info->delq_unreclaimed_files--;
                mutex_exit(&delq->uq_mutex);
                VN_RELE(vp);
                return;
        }

        if ((vp->v_count > 1) || (ip->i_mode == 0)) {
                mutex_enter(&delq->uq_mutex);
                delq_info->delq_unreclaimed_blocks -= ip->i_blocks;
                delq_info->delq_unreclaimed_files--;
                mutex_exit(&delq->uq_mutex);
                VN_RELE(vp);
                return;
        }
        /*
         * If we are called as part of setting a fs lock, then only
         * do part of the lockfs protocol.  In other words, don't hang.
         */
        if (dolockfs) {
                if (ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_DELETE_MASK))
                        return;
        } else {
                /*
                 * check for recursive VOP call
                 */
                if (curthread->t_flag & T_DONTBLOCK) {
                        ulp = NULL;
                } else {
                        ulp = &ufsvfsp->vfs_ulockfs;
                        curthread->t_flag |= T_DONTBLOCK;
                }
        }

        /*
         * Hold rwlock to synchronize with (nfs) writes
         */
        if (dorwlock)
                rw_enter(&ip->i_rwlock, RW_WRITER);

        /*
         * Delete the attribute directory.
         */
        if (ip->i_oeftflag != 0) {
                TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
                    trans_size = (int)TOP_REMOVE_SIZE(ip));
                rw_enter(&ip->i_contents, RW_WRITER);
                err = ufs_iget(ip->i_vfs, ip->i_oeftflag,
                    &dp, CRED());
                if (err == 0) {
                        rw_enter(&dp->i_rwlock, RW_WRITER);
                        rw_enter(&dp->i_contents, RW_WRITER);
                        dp->i_flag |= IUPD|ICHG;
                        dp->i_seq++;
                        TRANS_INODE(dp->i_ufsvfs, dp);
                        dp->i_nlink -= 2;
                        ufs_setreclaim(dp);
                        /*
                         * Should get rid of any negative cache entries that
                         * might be lingering, as well as ``.'' and
                         * ``..''.  If we don't, the VN_RELE() below
                         * won't actually put dp on the delete queue
                         * and it'll hang out until someone forces it
                         * (lockfs -f, umount, ...).  The only reliable
                         * way of doing this at the moment is to call
                         * dnlc_purge_vp(ITOV(dp)), which is unacceptably
                         * slow, so we'll just note the problem in this
                         * comment for now.
                         */
                        dnlc_remove(ITOV(dp), ".");
                        dnlc_remove(ITOV(dp), "..");
                        ITIMES_NOLOCK(dp);
                        if (!TRANS_ISTRANS(ufsvfsp)) {
                                ufs_iupdat(dp, I_SYNC);
                        }
                        rw_exit(&dp->i_contents);
                        rw_exit(&dp->i_rwlock);
                        VN_RELE(ITOV(dp));
                }
                /*
                 * Clear out attribute pointer
                 */
                ip->i_oeftflag = 0;
                rw_exit(&ip->i_contents);
                TRANS_END_CSYNC(ufsvfsp, err, issync,
                    TOP_REMOVE, trans_size);
                dnlc_remove(ITOV(ip), XATTR_DIR_NAME);
        }

        if ((ip->i_mode & IFMT) == IFATTRDIR) {
                ufs_attr_purge(ip);
        }

        (void) TRANS_ITRUNC(ip, (u_offset_t)0, I_FREE | I_ACCT, CRED());

        /*
         * the inode's space has been freed; now free the inode
         */
        if (ulp) {
                trans_size = TOP_IFREE_SIZE(ip);
                TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
        }
        rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
        rw_enter(&ip->i_contents, RW_WRITER);
        TRANS_INODE(ufsvfsp, ip);
        mode = ip->i_mode;
        ip->i_mode = 0;
        ip->i_rdev = 0;
        ip->i_ordev = 0;
        ip->i_flag |= IMOD;
        if (ip->i_ufs_acl) {
                (void) ufs_si_free(ip->i_ufs_acl, vp->v_vfsp, CRED());
                ip->i_ufs_acl = NULL;
                ip->i_shadow = 0;
        }

        /*
         * This inode is torn down but still retains its identity
         * (inode number).  It could get recycled soon so it's best
         * to clean up the vnode just in case.
         */
        mutex_enter(&vp->v_lock);
        vn_recycle(vp);
        mutex_exit(&vp->v_lock);

        /*
         * free the inode
         */
        ufs_ifree(ip, ip->i_number, mode);
        /*
         * release quota resources; can't fail
         */
        (void) chkiq((struct ufsvfs *)vp->v_vfsp->vfs_data,
            /* change */ -1, ip, (uid_t)ip->i_uid, 0, CRED(),
            (char **)NULL, (size_t *)NULL);
        dqrele(ip->i_dquot);
        ip->i_dquot = NULL;
        ip->i_flag &= ~(IDEL | IDIRECTIO);
        ip->i_cflags = 0;
        if (!TRANS_ISTRANS(ufsvfsp)) {
                ufs_iupdat(ip, I_SYNC);
        } else {
                mutex_enter(&delq->uq_mutex);
                delq_info->delq_unreclaimed_files--;
                mutex_exit(&delq->uq_mutex);
        }
        rw_exit(&ip->i_contents);
        rw_exit(&ufsvfsp->vfs_dqrwlock);
        if (dorwlock)
                rw_exit(&ip->i_rwlock);
        VN_RELE(vp);

        /*
         * End of transaction
         */
        if (ulp) {
                TRANS_END_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
                if (dolockfs)
                        ufs_lockfs_end(ulp);
                else
                        curthread->t_flag &= ~T_DONTBLOCK;
        }
}

/*
 * Create the delete thread and init the delq_info for this fs
 */
void
ufs_delete_init(struct ufsvfs *ufsvfsp, int lowat)
{
        struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

        ufs_thread_init(&ufsvfsp->vfs_delete, lowat);
        (void) memset((void *)delq_info, 0, sizeof (*delq_info));
}
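
/*
 * A mount-time caller is expected to pair ufs_delete_init() with starting
 * the worker below, roughly as follows (the lowat value is whatever the
 * caller chooses):
 *
 *	ufs_delete_init(ufsvfsp, lowat);
 *	ufs_thread_start(&ufsvfsp->vfs_delete, ufs_thread_delete, vfsp);
 */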

/*
 * thread that frees up deleted inodes
 */
void
ufs_thread_delete(struct vfs *vfsp)
{
        struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
        struct ufs_q *uq = &ufsvfsp->vfs_delete;
        struct inode *ip;
        long ne;
        callb_cpr_t cprinfo;

        CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
            "ufsdelete");

        mutex_enter(&uq->uq_mutex);
again:
        /*
         * Sleep until there is work to do.  Only do one entry at
         * a time, to reduce the wait time for checking for a suspend
         * request.  The ?: is for pedantic portability.
         */
        ne = ufs_thread_run(uq, &cprinfo) ? 1 : 0;

        /*
         * process an entry, if there are any
         */
        if (ne && (ip = uq->uq_ihead)) {
                /*
                 * process first entry on queue.  Assumed conditions are:
                 *	ip is held (v_count >= 1)
                 *	ip is referenced (i_flag & IREF)
                 *	ip is free (i_nlink <= 0)
                 */
                if ((uq->uq_ihead = ip->i_freef) == ip)
                        uq->uq_ihead = NULL;
                ip->i_freef->i_freeb = ip->i_freeb;
                ip->i_freeb->i_freef = ip->i_freef;
                ip->i_freef = ip;
                ip->i_freeb = ip;
                uq->uq_ne--;
                mutex_exit(&uq->uq_mutex);
                ufs_delete(ufsvfsp, ip, 1);
                mutex_enter(&uq->uq_mutex);
        }
        goto again;
}

/*
 * drain ne entries off the delete queue.  As new queue entries may
 * be added while we're working, ne is interpreted as follows:
 *
 *	ne > 0   => remove up to ne entries
 *	ne == 0  => remove all entries currently on the queue
 *	ne == -1 => remove entries until the queue is empty
 */
void
ufs_delete_drain(struct vfs *vfsp, int ne, int dolockfs)
{
        struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
        struct ufs_q *uq;
        struct inode *ip;
        int drain_cnt = 0;
        int done;

        /*
         * if forcibly unmounted; ignore
         */
        if (ufsvfsp == NULL)
                return;

        uq = &ufsvfsp->vfs_delete;
        mutex_enter(&uq->uq_mutex);
        if (ne == 0)
                drain_cnt = uq->uq_ne;
        else if (ne > 0)
                drain_cnt = ne;

        /*
         * process up to ne entries
         */

        done = 0;
        while (!done && (ip = uq->uq_ihead)) {
                if (ne != -1)
                        drain_cnt--;
                if (ne != -1 && drain_cnt == 0)
                        done = 1;
                if ((uq->uq_ihead = ip->i_freef) == ip)
                        uq->uq_ihead = NULL;
                ip->i_freef->i_freeb = ip->i_freeb;
                ip->i_freeb->i_freef = ip->i_freef;
                ip->i_freef = ip;
                ip->i_freeb = ip;
                uq->uq_ne--;
                mutex_exit(&uq->uq_mutex);
                ufs_delete(ufsvfsp, ip, dolockfs);
                mutex_enter(&uq->uq_mutex);
        }
        mutex_exit(&uq->uq_mutex);
}
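
/*
 * For example, under the ne convention above:
 *
 *	ufs_delete_drain(vfsp, 0, 1);	drains what is queued right now,
 *					going through the lockfs protocol
 *	ufs_delete_drain(vfsp, -1, 0);	keeps draining until the queue is
 *					empty, without the lockfs protocol
 */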

void
ufs_sync_with_thread(struct ufs_q *uq)
{
        mutex_enter(&uq->uq_mutex);

        /*
         * Wake up delete thread to free up space.
         */
        if ((uq->uq_flags & UQ_WAIT) == 0) {
                uq->uq_flags |= UQ_WAIT;
                cv_broadcast(&uq->uq_cv);
        }

        while ((uq->uq_threadp != NULL) && (uq->uq_flags & UQ_WAIT)) {
                cv_wait(&uq->uq_cv, &uq->uq_mutex);
        }

        mutex_exit(&uq->uq_mutex);
}

/*
 * Get rid of everything that's currently in the delete queue,
 * plus whatever the delete thread is working on at the moment.
 *
 * This ability is required for providing true POSIX semantics
 * regarding close(2), unlink(2), etc., even when logging is enabled.
 * The standard requires that the released space be immediately
 * observable (statvfs(2)) and allocatable (e.g., write(2)).
 */
void
ufs_delete_drain_wait(struct ufsvfs *ufsvfsp, int dolockfs)
{
        struct ufs_q *uq = &ufsvfsp->vfs_delete;
        int error;
        struct ufs_q *delq = &ufsvfsp->vfs_delete;
        struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

        /*
         * Proceed only if there is something on delq or the delete
         * thread is working on delq.
         */
        mutex_enter(&delq->uq_mutex);
        if (delq_info->delq_unreclaimed_files > 0) {
                mutex_exit(&delq->uq_mutex);
                (void) ufs_delete_drain(ufsvfsp->vfs_vfs, 0, dolockfs);
                ufs_sync_with_thread(uq);
        } else {
                ASSERT(delq_info->delq_unreclaimed_files == 0);
                mutex_exit(&delq->uq_mutex);
                return;
        }

        /*
         * Commit any outstanding transactions to make sure
         * any canceled freed blocks are available for allocation.
         */
        curthread->t_flag |= T_DONTBLOCK;
        TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE, error);
        if (!error) {
                TRANS_END_SYNC(ufsvfsp, error, TOP_COMMIT_UPDATE,
                    TOP_COMMIT_SIZE);
        }
        curthread->t_flag &= ~T_DONTBLOCK;
}

/*
 * Adjust the resource usage in a struct statvfs based on
 * what's in the delete queue.
 *
 * We do not consider the impact of ACLs or extended attributes
 * that may be deleted as a side-effect of deleting a file.
 * Those are metadata, and their sizes aren't reflected in the
 * sizes returned by stat(), so this is not a problem.
 */
void
ufs_delete_adjust_stats(struct ufsvfs *ufsvfsp, struct statvfs64 *sp)
{
        struct ufs_q *uq = &ufsvfsp->vfs_delete;
        struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

        mutex_enter(&uq->uq_mutex);
        /*
         * The blocks accounted for in the delete queue info are
         * counted in DEV_BSIZE chunks, but ufs_statvfs counts in
         * filesystem fragments, so a conversion is required here.
         */
        sp->f_bfree += dbtofsb(ufsvfsp->vfs_fs,
            delq_info->delq_unreclaimed_blocks);
        sp->f_ffree += delq_info->delq_unreclaimed_files;
        mutex_exit(&uq->uq_mutex);
}
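
/*
 * A worked example of the conversion above, assuming a filesystem with
 * 1K fragments (fs_fsize == 1024) and 512-byte DEV_BSIZE sectors:
 * 16 unreclaimed DEV_BSIZE blocks are 8192 bytes, so dbtofsb(fs, 16)
 * contributes 8 fragments to f_bfree.
 */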

/*
 * IDLE INODE
 * The following routines implement the protocol for maintaining an
 * LRU list of idle inodes and for moving the idle inodes to the
 * reuse list when the number of allocated inodes exceeds the user
 * tunable high-water mark (ufs_ninode).
 */

/*
 * clean an idle inode and move it to the reuse list
 */
static void
ufs_idle_free(struct inode *ip)
{
        int pages;
        int hno;
        kmutex_t *ihm;
        struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
        struct vnode *vp = ITOV(ip);
        int vn_has_data, vn_modified;

        /*
         * inode is held
         */

        /*
         * remember `pages' for stats below
         */
        pages = (ip->i_mode && vn_has_cached_data(vp) && vp->v_type != VCHR);

        /*
         * start the dirty pages to disk and then invalidate them
         * unless the inode is invalid (ISTALE)
         */
        if ((ip->i_flag & ISTALE) == 0) {
                (void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_FREE);
                (void) TRANS_SYNCIP(ip,
                    (TRANS_ISERROR(ufsvfsp)) ? B_INVAL | B_FORCE : B_INVAL,
                    I_ASYNC, TOP_SYNCIP_FREE);
        }

        /*
         * wait for any current ufs_iget to finish and block future ufs_igets
         */
        ASSERT(ip->i_number != 0);
        hno = INOHASH(ip->i_number);
        ihm = &ih_lock[hno];
        mutex_enter(ihm);

        /*
         * It must be guaranteed that v_count >= 2, otherwise
         * something must be wrong with this vnode already.
         * That is why we use VN_RELE_LOCKED() instead of VN_RELE().
         * Acquire the vnode lock in case another thread is in
         * VN_RELE().
         */
        mutex_enter(&vp->v_lock);

        if (vp->v_count < 2) {
                cmn_err(CE_PANIC,
                    "ufs_idle_free: vnode ref count is less than 2");
        }

        VN_RELE_LOCKED(vp);

        vn_has_data = (vp->v_type != VCHR && vn_has_cached_data(vp));
        vn_modified = (ip->i_flag & (IMOD|IMODACC|IACC|ICHG|IUPD|IATTCHG));

        if (vp->v_count != 1 ||
            ((vn_has_data || vn_modified) &&
            ((ip->i_flag & ISTALE) == 0))) {
                /*
                 * Another thread has referenced this inode while
                 * we are trying to free it.  Call VN_RELE() to
                 * release our reference:  either v_count > 1, or
                 * data is present or one of the modification flags
                 * was set while ISTALE wasn't.  If we proceeded here
                 * with ISTALE set, we might get ourselves into a
                 * deadlock situation.
                 */
                mutex_exit(&vp->v_lock);
                mutex_exit(ihm);
                VN_RELE(vp);
        } else {
                /*
                 * The inode is currently unreferenced and can not
                 * acquire further references because it has no pages
                 * and the hash is locked.  Inodes acquire references
                 * via the hash list or via their pages.
                 */

                mutex_exit(&vp->v_lock);

                /*
                 * remove it from the cache
                 */
                remque(ip);
                mutex_exit(ihm);
                /*
                 * Stale inodes have no valid ufsvfs
                 */
                if ((ip->i_flag & ISTALE) == 0 && ip->i_dquot) {
                        TRANS_DQRELE(ufsvfsp, ip->i_dquot);
                        ip->i_dquot = NULL;
                }
                if ((ip->i_flag & ISTALE) &&
                    vn_has_data) {
                        /*
                         * ISTALE inodes may have data
                         * and this data needs to be
                         * cleaned up.
                         */
                        (void) pvn_vplist_dirty(vp, (u_offset_t)0,
                            ufs_putapage, B_INVAL | B_TRUNC,
                            (struct cred *)NULL);
                }
                ufs_si_del(ip);
                if (pages) {
                        CPU_STATS_ADDQ(CPU, sys, ufsipage, 1);
                } else {
                        CPU_STATS_ADDQ(CPU, sys, ufsinopage, 1);
                }
                ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));

                /*
                 * We had better not have a vnode reference count > 1
                 * at this point, if we do then something is broken as
                 * this inode/vnode acquired a reference underneath of us.
                 */
                ASSERT(vp->v_count == 1);

                ufs_free_inode(ip);
        }
}

/*
 * this thread processes the global idle queue
 */
iqhead_t *ufs_junk_iq;
iqhead_t *ufs_useful_iq;
int ufs_njunk_iq = 0;
int ufs_nuseful_iq = 0;
int ufs_niqhash;
int ufs_iqhashmask;
struct ufs_q ufs_idle_q;

void
ufs_thread_idle(void)
{
        callb_cpr_t cprinfo;
        int i;
        int ne;

        ufs_niqhash = (ufs_idle_q.uq_lowat >> 1) / IQHASHQLEN;
        ufs_niqhash = 1 << highbit(ufs_niqhash); /* round up to power of 2 */
        ufs_iqhashmask = ufs_niqhash - 1;
        ufs_junk_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_junk_iq),
            KM_SLEEP);
        ufs_useful_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_useful_iq),
            KM_SLEEP);

        /* Initialize hash queue headers */
        for (i = 0; i < ufs_niqhash; i++) {
                ufs_junk_iq[i].i_freef = (inode_t *)&ufs_junk_iq[i];
                ufs_junk_iq[i].i_freeb = (inode_t *)&ufs_junk_iq[i];
                ufs_useful_iq[i].i_freef = (inode_t *)&ufs_useful_iq[i];
                ufs_useful_iq[i].i_freeb = (inode_t *)&ufs_useful_iq[i];
        }

        CALLB_CPR_INIT(&cprinfo, &ufs_idle_q.uq_mutex, callb_generic_cpr,
            "ufsidle");
again:
        /*
         * Whenever the idle thread is awakened, it repeatedly gives
         * back half of the idle queue until the idle queue falls
         * below lowat.
         */
        mutex_enter(&ufs_idle_q.uq_mutex);
        if (ufs_idle_q.uq_ne < ufs_idle_q.uq_lowat) {
                CALLB_CPR_SAFE_BEGIN(&cprinfo);
                cv_wait(&ufs_idle_q.uq_cv, &ufs_idle_q.uq_mutex);
                CALLB_CPR_SAFE_END(&cprinfo, &ufs_idle_q.uq_mutex);
        }
        mutex_exit(&ufs_idle_q.uq_mutex);

        /*
         * Give back 1/2 of the idle queue
         */
        ne = ufs_idle_q.uq_ne >> 1;
        ins.in_tidles.value.ul += ne;
        ufs_idle_some(ne);
        goto again;
}

/*
 * Reclaim callback for ufs inode cache.
 * Invoked by the kernel memory allocator when memory gets tight.
 */
/*ARGSUSED*/
void
ufs_inode_cache_reclaim(void *cdrarg)
{
        /*
         * If we are low on memory and the idle queue is over its
         * halfway mark, then free 50% of the idle q
         *
         * We don't free all of the idle inodes because the inodes
         * for popular NFS files may have been kicked from the dnlc.
         * The inodes for these files will end up on the idle queue
         * after every NFS access.
         *
         * If we repeatedly push them from the idle queue then
         * NFS users may be unhappy as an extra buf cache operation
         * is incurred for every NFS operation to these files.
         *
         * It's not common, but I have seen it happen.
         *
         */
        if (ufs_idle_q.uq_ne < (ufs_idle_q.uq_lowat >> 1))
                return;
        mutex_enter(&ufs_idle_q.uq_mutex);
        cv_broadcast(&ufs_idle_q.uq_cv);
        mutex_exit(&ufs_idle_q.uq_mutex);
}
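
/*
 * This callback only runs if the inode cache registers it when the cache
 * is created.  The wiring looks roughly like the sketch below; the
 * constructor and destructor names are illustrative:
 *
 *	inode_cache = kmem_cache_create("ufs_inode_cache",
 *	    sizeof (struct inode), 0, ufs_inode_cache_constructor,
 *	    ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
 *	    NULL, NULL, 0);
 */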

/*
 * Free up some idle inodes
 */
void
ufs_idle_some(int ne)
{
        int i;
        struct inode *ip;
        struct vnode *vp;
        static int junk_rotor = 0;
        static int useful_rotor = 0;

        for (i = 0; i < ne; ++i) {
                mutex_enter(&ufs_idle_q.uq_mutex);

                if (ufs_njunk_iq) {
                        while (ufs_junk_iq[junk_rotor].i_freef ==
                            (inode_t *)&ufs_junk_iq[junk_rotor]) {
                                junk_rotor = IQNEXT(junk_rotor);
                        }
                        ip = ufs_junk_iq[junk_rotor].i_freef;
                        ASSERT(ip->i_flag & IJUNKIQ);
                } else if (ufs_nuseful_iq) {
                        while (ufs_useful_iq[useful_rotor].i_freef ==
                            (inode_t *)&ufs_useful_iq[useful_rotor]) {
                                useful_rotor = IQNEXT(useful_rotor);
                        }
                        ip = ufs_useful_iq[useful_rotor].i_freef;
                        ASSERT(!(ip->i_flag & IJUNKIQ));
                } else {
                        mutex_exit(&ufs_idle_q.uq_mutex);
                        return;
                }

                /*
                 * emulate ufs_iget
                 */
                vp = ITOV(ip);
                VN_HOLD(vp);
                mutex_exit(&ufs_idle_q.uq_mutex);
                rw_enter(&ip->i_contents, RW_WRITER);
                /*
                 * VN_RELE should not be called if
                 * ufs_rmidle returns true, as it will
                 * effectively be done in ufs_idle_free.
                 */
                if (ufs_rmidle(ip)) {
                        rw_exit(&ip->i_contents);
                        ufs_idle_free(ip);
                } else {
                        rw_exit(&ip->i_contents);
                        VN_RELE(vp);
                }
        }
}

/*
 * drain entries for vfsp from the idle queue
 * vfsp == NULL means drain the entire thing
 */
void
ufs_idle_drain(struct vfs *vfsp)
{
        struct inode *ip, *nip;
        struct inode *ianchor = NULL;
        int i;

        mutex_enter(&ufs_idle_q.uq_mutex);
        if (ufs_njunk_iq) {
                /* for each hash q */
                for (i = 0; i < ufs_niqhash; i++) {
                        /* search down the hash q */
                        for (ip = ufs_junk_iq[i].i_freef;
                            ip != (inode_t *)&ufs_junk_iq[i];
                            ip = ip->i_freef) {
                                if (ip->i_vfs == vfsp || vfsp == NULL) {
                                        /* found a matching entry */
                                        VN_HOLD(ITOV(ip));
                                        mutex_exit(&ufs_idle_q.uq_mutex);
                                        rw_enter(&ip->i_contents, RW_WRITER);
                                        /*
                                         * See comments in ufs_idle_some()
                                         * as we will call ufs_idle_free()
                                         * after scanning both queues.
                                         */
                                        if (ufs_rmidle(ip)) {
                                                rw_exit(&ip->i_contents);
                                                ip->i_freef = ianchor;
                                                ianchor = ip;
                                        } else {
                                                rw_exit(&ip->i_contents);
                                                VN_RELE(ITOV(ip));
                                        }
                                        /* restart this hash q */
                                        ip = (inode_t *)&ufs_junk_iq[i];
                                        mutex_enter(&ufs_idle_q.uq_mutex);
                                }
                        }
                }
        }
        if (ufs_nuseful_iq) {
                /* for each hash q */
                for (i = 0; i < ufs_niqhash; i++) {
                        /* search down the hash q */
                        for (ip = ufs_useful_iq[i].i_freef;
                            ip != (inode_t *)&ufs_useful_iq[i];
                            ip = ip->i_freef) {
                                if (ip->i_vfs == vfsp || vfsp == NULL) {
                                        /* found a matching entry */
                                        VN_HOLD(ITOV(ip));
                                        mutex_exit(&ufs_idle_q.uq_mutex);
                                        rw_enter(&ip->i_contents, RW_WRITER);
                                        /*
                                         * See comments in ufs_idle_some()
                                         * as we will call ufs_idle_free()
                                         * after scanning both queues.
                                         */
                                        if (ufs_rmidle(ip)) {
                                                rw_exit(&ip->i_contents);
                                                ip->i_freef = ianchor;
                                                ianchor = ip;
                                        } else {
                                                rw_exit(&ip->i_contents);
                                                VN_RELE(ITOV(ip));
                                        }
                                        /* restart this hash q */
                                        ip = (inode_t *)&ufs_useful_iq[i];
                                        mutex_enter(&ufs_idle_q.uq_mutex);
                                }
                        }
                }
        }

        mutex_exit(&ufs_idle_q.uq_mutex);
        /* no more matching entries, release those we have found (if any) */
        for (ip = ianchor; ip; ip = nip) {
                nip = ip->i_freef;
                ip->i_freef = ip;
                ufs_idle_free(ip);
        }
}

/*
 * RECLAIM DELETED INODES
 * The following thread scans the file system once looking for deleted files
 */
void
ufs_thread_reclaim(struct vfs *vfsp)
{
        struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
        struct ufs_q *uq = &ufsvfsp->vfs_reclaim;
        struct fs *fs = ufsvfsp->vfs_fs;
        struct buf *bp = 0;
        int err = 0;
        daddr_t bno;
        ino_t ino;
        struct dinode *dp;
        struct inode *ip;
        callb_cpr_t cprinfo;

        CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
            "ufsreclaim");

        /*
         * mount decided that we don't need a reclaim thread
         */
        if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
                err++;

        /*
         * don't reclaim if readonly
         */
        if (fs->fs_ronly)
                err++;

        for (ino = 0; ino < (fs->fs_ncg * fs->fs_ipg) && !err; ++ino) {

                /*
                 * Check whether we are the target of another
                 * thread having called ufs_thread_exit() or
                 * ufs_thread_suspend().
                 */
                mutex_enter(&uq->uq_mutex);
again:
                if (uq->uq_flags & UQ_EXIT) {
                        err++;
                        mutex_exit(&uq->uq_mutex);
                        break;
                } else if (uq->uq_flags & UQ_SUSPEND) {
                        uq->uq_flags |= UQ_SUSPENDED;
                        /*
                         * Release the buf before we cv_wait()
                         * otherwise we may deadlock with the
                         * thread that called ufs_thread_suspend().
                         */
                        if (bp) {
                                brelse(bp);
                                bp = 0;
                        }
                        if (uq->uq_flags & UQ_WAIT) {
                                uq->uq_flags &= ~UQ_WAIT;
                                cv_broadcast(&uq->uq_cv);
                        }
                        CALLB_CPR_SAFE_BEGIN(&cprinfo);
                        cv_wait(&uq->uq_cv, &uq->uq_mutex);
                        CALLB_CPR_SAFE_END(&cprinfo, &uq->uq_mutex);
                        goto again;
                }
                mutex_exit(&uq->uq_mutex);

                /*
                 * if we don't already have the buf; get it
                 */
                bno = fsbtodb(fs, itod(fs, ino));
                if ((bp == 0) || (bp->b_blkno != bno)) {
                        if (bp)
                                brelse(bp);
                        bp = UFS_BREAD(ufsvfsp,
                            ufsvfsp->vfs_dev, bno, fs->fs_bsize);
                        bp->b_flags |= B_AGE;
                }
                if (bp->b_flags & B_ERROR) {
                        err++;
                        continue;
                }
                /*
                 * nlink <= 0 and mode != 0 means deleted
                 */
                dp = (struct dinode *)bp->b_un.b_addr + itoo(fs, ino);
                if ((dp->di_nlink <= 0) && (dp->di_mode != 0)) {
                        /*
                         * can't hold the buf (deadlock)
                         */
                        brelse(bp);
                        bp = 0;
                        rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
                        /*
                         * iget/iput sequence will put inode on ifree
                         * thread queue if it is idle.  This is a nop
                         * for busy (open, deleted) inodes
                         */
                        if (ufs_iget(vfsp, ino, &ip, CRED()))
                                err++;
                        else
                                VN_RELE(ITOV(ip));
                        rw_exit(&ufsvfsp->vfs_dqrwlock);
                }
        }

        if (bp)
                brelse(bp);
        if (!err) {
                /*
                 * reset the reclaiming-bit
                 */
                mutex_enter(&ufsvfsp->vfs_lock);
                fs->fs_reclaim &= ~FS_RECLAIMING;
                mutex_exit(&ufsvfsp->vfs_lock);
                TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_RECLAIM);
        }

        /*
         * exit the reclaim thread
         */
        mutex_enter(&uq->uq_mutex);
        uq->uq_threadp = NULL;
        uq->uq_flags &= ~UQ_WAIT;
        cv_broadcast(&uq->uq_cv);
        CALLB_CPR_EXIT(&cprinfo);
        thread_exit();
}
/*
 * HLOCK FILE SYSTEM
 * hlock the file systems whose logs have device errors
 */
struct ufs_q ufs_hlock;
/*ARGSUSED*/
void
ufs_thread_hlock(void *ignore)
{
        int retry;
        callb_cpr_t cprinfo;

        CALLB_CPR_INIT(&cprinfo, &ufs_hlock.uq_mutex, callb_generic_cpr,
            "ufshlock");

        for (;;) {
                /*
                 * sleep until there is work to do
                 */
                mutex_enter(&ufs_hlock.uq_mutex);
                (void) ufs_thread_run(&ufs_hlock, &cprinfo);
                ufs_hlock.uq_ne = 0;
                mutex_exit(&ufs_hlock.uq_mutex);
                /*
                 * hlock the error'ed fs's
                 * retry after a bit if another app is doing lockfs stuff
                 */
                do {
                        retry = ufs_trans_hlock();
                        if (retry) {
                                mutex_enter(&ufs_hlock.uq_mutex);
                                CALLB_CPR_SAFE_BEGIN(&cprinfo);
                                (void) cv_reltimedwait(&ufs_hlock.uq_cv,
                                    &ufs_hlock.uq_mutex, hz, TR_CLOCK_TICK);
                                CALLB_CPR_SAFE_END(&cprinfo,
                                    &ufs_hlock.uq_mutex);
                                mutex_exit(&ufs_hlock.uq_mutex);
                        }
                } while (retry);
        }
}
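
/*
 * Purge the contents of an extended attribute directory that is itself
 * being deleted:  walk each directory block, skip empty slots and the
 * "." and ".." entries, and drop the link count on every other entry so
 * that ufs_setreclaim() can queue those inodes for deletion as well.
 * Called from ufs_delete() when the inode being deleted is an IFATTRDIR.
 */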

static void
ufs_attr_purge(struct inode *dp)
{
        int err;
        int error;
        off_t dirsize;			/* size of the directory */
        off_t offset;			/* offset in the directory */
        int entryoffsetinblk;		/* offset of ep in fbp's buffer */
        struct inode *tp;
        struct fbuf *fbp;		/* pointer to directory block */
        struct direct *ep;		/* directory entry */
        int trans_size;
        int issync;
        struct ufsvfs *ufsvfsp = dp->i_ufsvfs;

        rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);

        fbp = NULL;
        dirsize = roundup(dp->i_size, DIRBLKSIZ);
        offset = 0;
        entryoffsetinblk = 0;

        /*
         * Purge directory cache
         */
        dnlc_dir_purge(&dp->i_danchor);

        while (offset < dirsize) {
                /*
                 * If offset is on a block boundary,
                 * read the next directory block.
                 * Release previous if it exists.
                 */
                if (blkoff(dp->i_fs, offset) == 0) {
                        if (fbp != NULL) {
                                fbrelse(fbp, S_OTHER);
                        }

                        err = blkatoff(dp, offset, (char **)0, &fbp);
                        if (err) {
                                goto out;
                        }
                        entryoffsetinblk = 0;
                }
                ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
                if (ep->d_ino == 0 || (ep->d_name[0] == '.' &&
                    ep->d_name[1] == '\0') ||
                    (ep->d_name[0] == '.' && ep->d_name[1] == '.' &&
                    ep->d_name[2] == '\0')) {

                        entryoffsetinblk += ep->d_reclen;

                } else {

                        if ((err = ufs_iget(dp->i_vfs, ep->d_ino,
                            &tp, CRED())) != 0) {
                                goto out;
                        }

                        TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
                            trans_size = (int)TOP_REMOVE_SIZE(tp));

                        /*
                         * Delete inode.
                         */
                        dnlc_remove(ITOV(dp), ep->d_name);

                        rw_enter(&tp->i_contents, RW_WRITER);
                        tp->i_flag |= ICHG;
                        tp->i_seq++;
                        TRANS_INODE(tp->i_ufsvfs, tp);
                        tp->i_nlink--;
                        ufs_setreclaim(tp);
                        ITIMES_NOLOCK(tp);
                        rw_exit(&tp->i_contents);

                        VN_RELE(ITOV(tp));
                        entryoffsetinblk += ep->d_reclen;
                        TRANS_END_CSYNC(ufsvfsp, error,
                            issync, TOP_REMOVE, trans_size);

                }
                offset += ep->d_reclen;
        }

        if (fbp) {
                fbrelse(fbp, S_OTHER);
        }

out:
        rw_exit(&ufsvfsp->vfs_dqrwlock);
}