/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/dnlc.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/bitmap.h>
#include <sys/var.h>
#include <sys/sysmacros.h>
#include <sys/kstat.h>
#include <sys/atomic.h>
#include <sys/taskq.h>

/*
 * Directory name lookup cache.
 * Based on code originally done by Robert Elz at Melbourne.
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  Each hash chain is ordered by LRU.
 * The cache is indexed by a hash value obtained from (vp, name),
 * where vp refers to the directory containing the name.
 */
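/*
 * Typical usage, sketched for reference.  This is illustrative only;
 * the exact flow is filesystem specific, and real_directory_scan() is
 * a hypothetical stand-in for the filesystem's own scan.  A VOP_LOOKUP
 * implementation first consults the cache, falls back to a real scan
 * on a miss, and then enters the result.  Negative results are cached
 * as the special vnode DNLC_NO_VNODE:
 *
 *	vp = dnlc_lookup(dvp, name);
 *	if (vp == DNLC_NO_VNODE) {
 *		VN_RELE(vp);
 *		return (ENOENT);
 *	}
 *	if (vp == NULL) {
 *		error = real_directory_scan(dvp, name, &vp);
 *		dnlc_enter(dvp, name, error ? DNLC_NO_VNODE : vp);
 *	}
 *
 * The vnode returned by dnlc_lookup() is held, and the caller must
 * VN_RELE() it when finished.
 */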
/*
 * Tunable nc_hashavelen is the average chain length desired for the hash
 * table, from which the size of the nc_hash table is derived at create time.
 */
#define	NC_HASHAVELEN_DEFAULT	4
int nc_hashavelen = NC_HASHAVELEN_DEFAULT;

/*
 * NC_MOVETOFRONT is the move-to-front threshold: if the hash lookup
 * depth exceeds this value, we move the looked-up entry to the front of
 * its hash chain.  The idea is to make sure that the most frequently
 * accessed entries are found most quickly (by keeping them near the
 * front of their hash chains).
 */
#define	NC_MOVETOFRONT	2

/*
 * DNLC_MAX_RELE is used to size an array on the stack when releasing
 * vnodes.  This array is used rather than calling VN_RELE() inline because
 * all dnlc locks must be dropped by that time in order to avoid a
 * possible deadlock.  This deadlock occurs when the dnlc holds the last
 * reference to the vnode, in which case the VOP_INACTIVE vector is
 * called, which can in turn call back into the dnlc.  A global array
 * was used previously, but it had many problems:
 *	1) It had no real upper bound on the array size, as entries can
 *	   be added after a purge has started.
 *	2) The locking scheme caused a hang.
 *	3) It caused serialisation on the global lock.
 *	4) The array was often unnecessarily huge.
 *
 * Note that the current value of 8 allows up to 4 cache entries (two
 * vnode holds each) to be purged from each hash chain before having
 * to cycle around and retry.
 * This ought to be ample given that nc_hashavelen is typically very small.
 */
#define	DNLC_MAX_RELE	8 /* must be even */

/*
 * Hash table of name cache entries for fast lookup, dynamically
 * allocated at startup.
 */
nc_hash_t *nc_hash;

/*
 * Rotors.  Used to select entries on a round-robin basis.
 */
static nc_hash_t *dnlc_purge_fs1_rotor;
static nc_hash_t *dnlc_free_rotor;

/*
 * Number of dnlc entries (uninitialized).
 *
 * The initial value of -1 was chosen as a bit pattern that a system
 * administrator is unlikely to pick, so that an explicit setting can
 * be distinguished from the default.
 */
int ncsize = -1;
uint32_t dnlc_nentries = 0;	/* current number of name cache entries */
static int nc_hashsz;		/* size of hash table */
static int nc_hashmask;		/* size of hash table minus 1 */

/*
 * The dnlc_reduce_cache() taskq is activated when there are ncsize
 * name cache entries, and it reduces the size down to
 * dnlc_nentries_low_water, which is by default one hundredth less
 * than (i.e. 99% of) ncsize.
 */
#define	DNLC_LOW_WATER_DIVISOR_DEFAULT	100
uint_t dnlc_low_water_divisor = DNLC_LOW_WATER_DIVISOR_DEFAULT;
uint_t dnlc_nentries_low_water;
int dnlc_reduce_idle = 1;	/* no locking needed */

/*
 * If dnlc_nentries hits dnlc_max_nentries (twice ncsize)
 * then this means the dnlc_reduce_cache() taskq is failing to
 * keep up.  In this case we refuse to add new entries to the dnlc
 * until the taskq catches up.
 */
uint_t dnlc_max_nentries;		/* twice ncsize */
uint64_t dnlc_max_nentries_cnt = 0;	/* statistic on times we failed */

/*
 * Tunable to define when we should just remove items from
 * the end of the chain.
 */
#define	DNLC_LONG_CHAIN	8
uint_t dnlc_long_chain = DNLC_LONG_CHAIN;

/*
 * ncstats has been deprecated, because its integer counters can easily
 * overflow in the dnlc.  It is maintained (at some expense) for
 * compatibility.  The preferred interface is the kstat-accessible
 * nc_stats below.
 */
struct ncstats ncstats;

struct nc_stats ncs = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "negative_cache_hits",	KSTAT_DATA_UINT64 },
	{ "enters",			KSTAT_DATA_UINT64 },
	{ "double_enters",		KSTAT_DATA_UINT64 },
	{ "purge_total_entries",	KSTAT_DATA_UINT64 },
	{ "purge_all",			KSTAT_DATA_UINT64 },
	{ "purge_vp",			KSTAT_DATA_UINT64 },
	{ "purge_vfs",			KSTAT_DATA_UINT64 },
	{ "purge_fs1",			KSTAT_DATA_UINT64 },
	{ "pick_free",			KSTAT_DATA_UINT64 },
	{ "pick_heuristic",		KSTAT_DATA_UINT64 },
	{ "pick_last",			KSTAT_DATA_UINT64 },

	/* directory caching stats */

	{ "dir_hits",			KSTAT_DATA_UINT64 },
	{ "dir_misses",			KSTAT_DATA_UINT64 },
	{ "dir_cached_current",		KSTAT_DATA_UINT64 },
	{ "dir_entries_cached_current",	KSTAT_DATA_UINT64 },
	{ "dir_cached_total",		KSTAT_DATA_UINT64 },
	{ "dir_start_no_memory",	KSTAT_DATA_UINT64 },
	{ "dir_add_no_memory",		KSTAT_DATA_UINT64 },
	{ "dir_add_abort",		KSTAT_DATA_UINT64 },
	{ "dir_add_max",		KSTAT_DATA_UINT64 },
	{ "dir_remove_entry_fail",	KSTAT_DATA_UINT64 },
	{ "dir_remove_space_fail",	KSTAT_DATA_UINT64 },
	{ "dir_update_fail",		KSTAT_DATA_UINT64 },
	{ "dir_fini_purge",		KSTAT_DATA_UINT64 },
	{ "dir_reclaim_last",		KSTAT_DATA_UINT64 },
	{ "dir_reclaim_any",		KSTAT_DATA_UINT64 },
};

static int doingcache = 1;

vnode_t negative_cache_vnode;

/*
 * Insert entry at the front of the queue
 */
#define	nc_inshash(ncp, hp) \
{ \
	(ncp)->hash_next = (hp)->hash_next; \
	(ncp)->hash_prev = (ncache_t *)(hp); \
	(hp)->hash_next->hash_prev = (ncp); \
	(hp)->hash_next = (ncp); \
}

/*
 * Remove entry from hash queue
 */
#define	nc_rmhash(ncp) \
{ \
	(ncp)->hash_prev->hash_next = (ncp)->hash_next; \
	(ncp)->hash_next->hash_prev = (ncp)->hash_prev; \
	(ncp)->hash_prev = NULL; \
	(ncp)->hash_next = NULL; \
}

/*
 * Free an entry.
 */
#define	dnlc_free(ncp) \
{ \
	kmem_free((ncp), sizeof (ncache_t) + (ncp)->namlen); \
	atomic_add_32(&dnlc_nentries, -1); \
}
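/*
 * A note on the chain representation: each nc_hash_t bucket doubles as
 * the sentinel of its circular, doubly linked chain.  The casts rely on
 * hash_next/hash_prev being the first members of both nc_hash_t and
 * ncache_t (see sys/dnlc.h), so the bucket itself can be linked like
 * any other entry.  An empty bucket points at itself, and a full
 * traversal is written:
 *
 *	for (ncp = hp->hash_next; ncp != (ncache_t *)hp;
 *	    ncp = ncp->hash_next)
 *		...
 *
 * which is the pattern used throughout this file.
 */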
/*
 * Cached directory info.
 * ======================
 */

/*
 * Cached directory free space hash function.
 * Needs the free space handle and the dcp to get the hash table size.
 * Returns the hash index.
 */
#define	DDFHASH(handle, dcp) ((handle >> 2) & (dcp)->dc_fhash_mask)

/*
 * Cached directory name entry hash function.
 * Uses the name and returns in the input arguments the hash and the
 * name length.  (The loop computes hash = hash * 17 + c over the
 * bytes of the name.)
 */
#define	DNLC_DIR_HASH(name, hash, namelen)			\
	{							\
		char Xc, *Xcp;					\
		hash = *name;					\
		for (Xcp = (name + 1); (Xc = *Xcp) != 0; Xcp++)	\
			hash = (hash << 4) + hash + Xc;		\
		ASSERT((Xcp - (name)) <= ((1 << NBBY) - 1));	\
		namelen = Xcp - (name);				\
	}

/*
 * The anchor directory cache pointer can contain 3 types of values:
 *	1) NULL: no directory cache.
 *	2) DC_RET_LOW_MEM: a special pointer indicating an error should
 *	   be returned; a directory cache was found to be too big, or a
 *	   memory shortage occurred.  This value remains in the pointer
 *	   until a dnlc_dir_start(), which then returns a DNOMEM error.
 *	   This is kludgy but efficient, and only visible in this
 *	   source file.
 *	3) A valid cache pointer.
 */
#define	DC_RET_LOW_MEM	(dircache_t *)1
#define	VALID_DIR_CACHE(dcp)	((dircache_t *)(dcp) > DC_RET_LOW_MEM)
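/*
 * Illustrative pattern only: every routine below that dereferences an
 * anchor distinguishes the three states the same way, holding dca_lock
 * across both the test and any use of the cache:
 *
 *	mutex_enter(&dcap->dca_lock);
 *	dcp = (dircache_t *)dcap->dca_dircache;
 *	if (VALID_DIR_CACHE(dcp)) {
 *		... use the cache ...
 *	} else {
 *		... no cache (NULL) or DC_RET_LOW_MEM ...
 *	}
 *	mutex_exit(&dcap->dca_lock);
 */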
/* Tunables */
uint_t dnlc_dir_enable = 1; /* disable caching directories by setting to 0 */
uint_t dnlc_dir_min_size = 40; /* min no of directory entries before caching */
uint_t dnlc_dir_max_size = UINT_MAX; /* ditto maximum */
uint_t dnlc_dir_hash_size_shift = 3; /* 8 entries per hash bucket */
uint_t dnlc_dir_min_reclaim = 350000; /* approx 1MB of dcentrys */
/*
 * dnlc_dir_hash_resize_shift determines when the hash tables get
 * re-adjusted due to growth or shrinkage.  The current value of 2
 * means a table is readjusted when it holds more than 4 times, or
 * fewer than one quarter, of its nominal number of entries.  Note
 * that with dnlc_dir_hash_size_shift set to 3 above, readjustment
 * occurs if the average number of entries per bucket goes above 32
 * or below 2.
 */
uint_t dnlc_dir_hash_resize_shift = 2; /* readjust rate */

static kmem_cache_t *dnlc_dir_space_cache; /* free space entry cache */
static dchead_t dc_head; /* anchor of cached directories */

/* Prototypes */
static ncache_t *dnlc_get(uchar_t namlen);
static ncache_t *dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash);
static void dnlc_reduce_cache(void *unused);
static void dnlc_dir_reclaim(void *unused);
static void dnlc_dir_abort(dircache_t *dcp);
static void dnlc_dir_adjust_fhash(dircache_t *dcp);
static void dnlc_dir_adjust_nhash(dircache_t *dcp);


/*
 * Initialize the name cache (and directory cache support).
 */
void
dnlc_init()
{
	nc_hash_t *hp;
	kstat_t *ksp;
	int i;

	/*
	 * Set up the size of the dnlc (ncsize) and its low water mark.
	 */
	if (ncsize == -1) {
		/* calculate a reasonable size for the low water */
		dnlc_nentries_low_water = 4 * (v.v_proc + maxusers) + 320;
		ncsize = dnlc_nentries_low_water +
		    (dnlc_nentries_low_water / dnlc_low_water_divisor);
	} else {
		/* don't change the user specified ncsize */
		dnlc_nentries_low_water =
		    ncsize - (ncsize / dnlc_low_water_divisor);
	}
	if (ncsize <= 0) {
		doingcache = 0;
		dnlc_dir_enable = 0; /* also disable directory caching */
		ncsize = 0;
		cmn_err(CE_NOTE, "name cache (dnlc) disabled");
		return;
	}
	dnlc_max_nentries = ncsize * 2;

	/*
	 * Initialise the hash table.
	 * Compute hash size rounding to the next power of two.
	 */
	nc_hashsz = ncsize / nc_hashavelen;
	nc_hashsz = 1 << highbit(nc_hashsz);
	nc_hashmask = nc_hashsz - 1;
	nc_hash = kmem_zalloc(nc_hashsz * sizeof (*nc_hash), KM_SLEEP);
	for (i = 0; i < nc_hashsz; i++) {
		hp = (nc_hash_t *)&nc_hash[i];
		mutex_init(&hp->hash_lock, NULL, MUTEX_DEFAULT, NULL);
		hp->hash_next = (ncache_t *)hp;
		hp->hash_prev = (ncache_t *)hp;
	}

	/*
	 * Initialize rotors
	 */
	dnlc_free_rotor = dnlc_purge_fs1_rotor = &nc_hash[0];

	/*
	 * Set up the directory caching to use kmem_cache_alloc
	 * for its free space entries so that we can get a callback
	 * when the system is short on memory, to allow us to free
	 * up some memory.  We don't use the constructor/destructor
	 * functions.
	 */
	dnlc_dir_space_cache = kmem_cache_create("dnlc_space_cache",
	    sizeof (dcfree_t), 0, NULL, NULL, dnlc_dir_reclaim, NULL,
	    NULL, 0);

	/*
	 * Initialise the head of the cached directory structures
	 */
	mutex_init(&dc_head.dch_lock, NULL, MUTEX_DEFAULT, NULL);
	dc_head.dch_next = (dircache_t *)&dc_head;
	dc_head.dch_prev = (dircache_t *)&dc_head;

	/*
	 * Initialise the reference count of the negative cache vnode to 1
	 * so that it never goes away (VOP_INACTIVE isn't called on it).
	 */
	negative_cache_vnode.v_count = 1;

	/*
	 * Initialise kstats - both the old compatibility raw kind and
	 * the more extensive named stats.
	 */
	ksp = kstat_create("unix", 0, "ncstats", "misc", KSTAT_TYPE_RAW,
	    sizeof (struct ncstats), KSTAT_FLAG_VIRTUAL);
	if (ksp) {
		ksp->ks_data = (void *) &ncstats;
		kstat_install(ksp);
	}
	ksp = kstat_create("unix", 0, "dnlcstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (ncs) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (ksp) {
		ksp->ks_data = (void *) &ncs;
		kstat_install(ksp);
	}
}
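/*
 * Tuning note (illustrative): ncsize is a standard /etc/system tunable,
 * so an administrator who wants a larger name cache than the
 * auto-computed default can set, for example:
 *
 *	set ncsize = 0x20000
 *
 * dnlc_init() then derives dnlc_nentries_low_water, dnlc_max_nentries
 * and the hash table size from that value; setting ncsize to 0
 * disables the cache entirely.  The value 0x20000 here is just an
 * example, not a recommendation.
 */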
/*
 * Add a name to the directory name cache.
 */
void
dnlc_enter(vnode_t *dp, char *name, vnode_t *vp)
{
	ncache_t *ncp;
	nc_hash_t *hp;
	uchar_t namlen;
	int hash;

	TRACE_0(TR_FAC_NFS, TR_DNLC_ENTER_START, "dnlc_enter_start:");

	if (!doingcache) {
		TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
		    "dnlc_enter_end:(%S) %d", "not caching", 0);
		return;
	}

	/*
	 * Get a new dnlc entry.  Assume the entry won't be in the cache
	 * and initialize it now.
	 */
	DNLCHASH(name, dp, hash, namlen);
	if ((ncp = dnlc_get(namlen)) == NULL)
		return;
	ncp->dp = dp;
	VN_HOLD(dp);
	ncp->vp = vp;
	VN_HOLD(vp);
	bcopy(name, ncp->name, namlen + 1); /* name and null */
	ncp->hash = hash;
	hp = &nc_hash[hash & nc_hashmask];

	mutex_enter(&hp->hash_lock);
	if (dnlc_search(dp, name, namlen, hash) != NULL) {
		mutex_exit(&hp->hash_lock);
		ncstats.dbl_enters++;
		ncs.ncs_dbl_enters.value.ui64++;
		VN_RELE(dp);
		VN_RELE(vp);
		dnlc_free(ncp);
		TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
		    "dnlc_enter_end:(%S) %d",
		    "dbl enter", ncstats.dbl_enters);
		return;
	}
	/*
	 * Insert into the hash chain.
	 */
	nc_inshash(ncp, hp);
	mutex_exit(&hp->hash_lock);
	ncstats.enters++;
	ncs.ncs_enters.value.ui64++;
	TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
	    "dnlc_enter_end:(%S) %d", "done", ncstats.enters);
}

/*
 * Add a name to the directory name cache.
 *
 * This function is basically identical to dnlc_enter().  The
 * difference is that when the desired dnlc entry is found, the vnode
 * in the ncache is compared with the vnode passed in.
 *
 * If they are not equal then the ncache is updated with the passed-in
 * vnode.  Otherwise it just frees up the newly allocated dnlc entry.
 */
void
dnlc_update(vnode_t *dp, char *name, vnode_t *vp)
{
	ncache_t *ncp;
	ncache_t *tcp;
	vnode_t *tvp;
	nc_hash_t *hp;
	int hash;
	uchar_t namlen;

	TRACE_0(TR_FAC_NFS, TR_DNLC_ENTER_START, "dnlc_update_start:");

	if (!doingcache) {
		TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
		    "dnlc_update_end:(%S) %d", "not caching", 0);
		return;
	}

	/*
	 * Get a new dnlc entry and initialize it now.
	 * If we fail to get a new entry, call dnlc_remove() to purge
	 * any existing dnlc entry, including a negative cache
	 * (DNLC_NO_VNODE) entry.  Failure to clear an existing entry
	 * could result in a false dnlc lookup (negative/stale entry).
	 */
	DNLCHASH(name, dp, hash, namlen);
	if ((ncp = dnlc_get(namlen)) == NULL) {
		dnlc_remove(dp, name);
		return;
	}
	ncp->dp = dp;
	VN_HOLD(dp);
	ncp->vp = vp;
	VN_HOLD(vp);
	bcopy(name, ncp->name, namlen + 1); /* name and null */
	ncp->hash = hash;
	hp = &nc_hash[hash & nc_hashmask];

	mutex_enter(&hp->hash_lock);
	if ((tcp = dnlc_search(dp, name, namlen, hash)) != NULL) {
		if (tcp->vp != vp) {
			tvp = tcp->vp;
			tcp->vp = vp;
			mutex_exit(&hp->hash_lock);
			VN_RELE(tvp);
			ncstats.enters++;
			ncs.ncs_enters.value.ui64++;
			TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
			    "dnlc_update_end:(%S) %d", "done", ncstats.enters);
		} else {
			mutex_exit(&hp->hash_lock);
			VN_RELE(vp);
			ncstats.dbl_enters++;
			ncs.ncs_dbl_enters.value.ui64++;
			TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
			    "dnlc_update_end:(%S) %d",
			    "dbl enter", ncstats.dbl_enters);
		}
		VN_RELE(dp);
		dnlc_free(ncp);
		return;
	}
	/*
	 * Insert the new entry, since it is not in the dnlc yet.
	 */
	nc_inshash(ncp, hp);
	mutex_exit(&hp->hash_lock);
	ncstats.enters++;
	ncs.ncs_enters.value.ui64++;
	TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
	    "dnlc_update_end:(%S) %d", "done", ncstats.enters);
}
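/*
 * Usage note (illustrative): a filesystem typically uses dnlc_enter()
 * after a lookup, where a racing duplicate is harmless, and
 * dnlc_update() after an operation such as create or rename that may
 * legitimately rebind an existing name to a different vnode:
 *
 *	dnlc_update(dvp, name, new_vp);
 *
 * which also releases the hold on any old vnode cached under that
 * name.  dnlc_enter() would instead discard the new binding as a
 * double enter if a (possibly stale) entry already existed.
 */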
/*
 * Look up a name in the directory name cache.
 *
 * Return a doubly-held vnode if found: one hold so that it may
 * remain in the cache for other users, the other hold so that
 * the cache is not re-cycled and the identity of the vnode is
 * lost before the caller can use the vnode.
 */
vnode_t *
dnlc_lookup(vnode_t *dp, char *name)
{
	ncache_t *ncp;
	nc_hash_t *hp;
	vnode_t *vp;
	int hash, depth;
	uchar_t namlen;

	TRACE_2(TR_FAC_NFS, TR_DNLC_LOOKUP_START,
	    "dnlc_lookup_start:dp %x name %s", dp, name);

	if (!doingcache) {
		TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
		    "dnlc_lookup_end:%S %d vp %x name %s",
		    "not_caching", 0, NULL, name);
		return (NULL);
	}

	DNLCHASH(name, dp, hash, namlen);
	depth = 1;
	hp = &nc_hash[hash & nc_hashmask];
	mutex_enter(&hp->hash_lock);

	for (ncp = hp->hash_next; ncp != (ncache_t *)hp;
	    ncp = ncp->hash_next) {
		if (ncp->hash == hash &&	/* fast signature check */
		    ncp->dp == dp &&
		    ncp->namlen == namlen &&
		    bcmp(ncp->name, name, namlen) == 0) {
			/*
			 * Move this entry to the head of its hash chain
			 * if it's not already close.
			 */
			if (depth > NC_MOVETOFRONT) {
				ncache_t *next = ncp->hash_next;
				ncache_t *prev = ncp->hash_prev;

				prev->hash_next = next;
				next->hash_prev = prev;
				ncp->hash_next = next = hp->hash_next;
				ncp->hash_prev = (ncache_t *)hp;
				next->hash_prev = ncp;
				hp->hash_next = ncp;

				ncstats.move_to_front++;
			}

			/*
			 * Put a hold on the vnode now so its identity
			 * can't change before the caller has a chance to
			 * put a hold on it.
			 */
			vp = ncp->vp;
			VN_HOLD(vp);
			mutex_exit(&hp->hash_lock);
			ncstats.hits++;
			ncs.ncs_hits.value.ui64++;
			if (vp == DNLC_NO_VNODE) {
				ncs.ncs_neg_hits.value.ui64++;
			}
			TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
			    "dnlc_lookup_end:%S %d vp %x name %s",
			    "hit", ncstats.hits, vp, name);
			return (vp);
		}
		depth++;
	}

	mutex_exit(&hp->hash_lock);
	ncstats.misses++;
	ncs.ncs_misses.value.ui64++;
	TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
	    "dnlc_lookup_end:%S %d vp %x name %s", "miss", ncstats.misses,
	    NULL, name);
	return (NULL);
}

/*
 * Remove an entry in the directory name cache.
 */
void
dnlc_remove(vnode_t *dp, char *name)
{
	ncache_t *ncp;
	nc_hash_t *hp;
	uchar_t namlen;
	int hash;

	if (!doingcache)
		return;
	DNLCHASH(name, dp, hash, namlen);
	hp = &nc_hash[hash & nc_hashmask];

	mutex_enter(&hp->hash_lock);
	if (ncp = dnlc_search(dp, name, namlen, hash)) {
		/*
		 * Free up the entry.
		 */
		nc_rmhash(ncp);
		mutex_exit(&hp->hash_lock);
		VN_RELE(ncp->vp);
		VN_RELE(ncp->dp);
		dnlc_free(ncp);
		return;
	}
	mutex_exit(&hp->hash_lock);
}

/*
 * Purge the entire cache.
 */
void
dnlc_purge()
{
	nc_hash_t *nch;
	ncache_t *ncp;
	int index;
	int i;
	vnode_t *nc_rele[DNLC_MAX_RELE];

	if (!doingcache)
		return;

	ncstats.purges++;
	ncs.ncs_purge_all.value.ui64++;

	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
		index = 0;
		mutex_enter(&nch->hash_lock);
		ncp = nch->hash_next;
		while (ncp != (ncache_t *)nch) {
			ncache_t *np;

			np = ncp->hash_next;
			nc_rele[index++] = ncp->vp;
			nc_rele[index++] = ncp->dp;

			nc_rmhash(ncp);
			dnlc_free(ncp);
			ncp = np;
			ncs.ncs_purge_total.value.ui64++;
			if (index == DNLC_MAX_RELE)
				break;
		}
		mutex_exit(&nch->hash_lock);

		/* Release holds on all the vnodes now that we have no locks */
		for (i = 0; i < index; i++) {
			VN_RELE(nc_rele[i]);
		}
		if (ncp != (ncache_t *)nch) {
			nch--; /* Do current hash chain again */
		}
	}
}

/*
 * Purge any cache entries referencing a vnode.
 * Exit as soon as the vnode reference count goes to 1, as the caller
 * must hold a reference, and the dnlc can therefore have no more.
 */
void
dnlc_purge_vp(vnode_t *vp)
{
	nc_hash_t *nch;
	ncache_t *ncp;
	int index;
	int i;
	vnode_t *nc_rele[DNLC_MAX_RELE];

	ASSERT(vp->v_count > 0);
	if (vp->v_count == 1) {
		return;
	}

	if (!doingcache)
		return;

	ncstats.purges++;
	ncs.ncs_purge_vp.value.ui64++;

	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
		index = 0;
		mutex_enter(&nch->hash_lock);
		ncp = nch->hash_next;
		while (ncp != (ncache_t *)nch) {
			ncache_t *np;

			np = ncp->hash_next;
			if (ncp->dp == vp || ncp->vp == vp) {
				nc_rele[index++] = ncp->vp;
				nc_rele[index++] = ncp->dp;
				nc_rmhash(ncp);
				dnlc_free(ncp);
				ncs.ncs_purge_total.value.ui64++;
				if (index == DNLC_MAX_RELE) {
					ncp = np;
					break;
				}
			}
			ncp = np;
		}
		mutex_exit(&nch->hash_lock);

		/* Release holds on all the vnodes now that we have no locks */
		if (index) {
			for (i = 0; i < index; i++) {
				VN_RELE(nc_rele[i]);
			}

			if (vp->v_count == 1) {
				return; /* no more dnlc references */
			}
		}

		if (ncp != (ncache_t *)nch) {
			nch--; /* Do current hash chain again */
		}
	}
}

/*
 * Purge cache entries referencing a vfsp.  Caller supplies a count
 * of entries to purge; up to that many will be freed.  A count of
 * zero indicates that all such entries should be purged.  Returns
 * the number of entries that were purged.
 */
int
dnlc_purge_vfsp(vfs_t *vfsp, int count)
{
	nc_hash_t *nch;
	ncache_t *ncp;
	int n = 0;
	int index;
	int i;
	vnode_t *nc_rele[DNLC_MAX_RELE];

	if (!doingcache)
		return (0);

	ncstats.purges++;
	ncs.ncs_purge_vfs.value.ui64++;

	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
		index = 0;
		mutex_enter(&nch->hash_lock);
		ncp = nch->hash_next;
		while (ncp != (ncache_t *)nch) {
			ncache_t *np;

			np = ncp->hash_next;
			ASSERT(ncp->dp != NULL);
			ASSERT(ncp->vp != NULL);
			if ((ncp->dp->v_vfsp == vfsp) ||
			    (ncp->vp->v_vfsp == vfsp)) {
				n++;
				nc_rele[index++] = ncp->vp;
				nc_rele[index++] = ncp->dp;
				nc_rmhash(ncp);
				dnlc_free(ncp);
				ncs.ncs_purge_total.value.ui64++;
				if (index == DNLC_MAX_RELE) {
					ncp = np;
					break;
				}
				if (count != 0 && n >= count) {
					break;
				}
			}
			ncp = np;
		}
		mutex_exit(&nch->hash_lock);
		/* Release holds on all the vnodes now that we have no locks */
		for (i = 0; i < index; i++) {
			VN_RELE(nc_rele[i]);
		}
		if (count != 0 && n >= count) {
			return (n);
		}
		if (ncp != (ncache_t *)nch) {
			nch--; /* Do current hash chain again */
		}
	}
	return (n);
}
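/*
 * Usage note (illustrative, not an exhaustive list): dnlc_remove() is
 * the precise tool for invalidating one name, e.g. on unlink;
 * dnlc_purge_vp() removes every entry that names or contains a given
 * vnode; and dnlc_purge_vfsp() can sweep out everything for a
 * filesystem, e.g. at unmount:
 *
 *	(void) dnlc_purge_vfsp(vfsp, 0);
 */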
/*
 * Purge 1 entry from the dnlc that is part of the filesystem(s)
 * represented by 'vop'.  The purpose of this routine is to allow
 * users of the dnlc to free a vnode that is being held by the dnlc.
 *
 * If we find a vnode that we release which will result in
 * freeing the underlying vnode (count was 1), return 1; return 0
 * if no appropriate vnodes are found.
 *
 * Note that vop is not the 'right' identifier for a filesystem.
 */
int
dnlc_fs_purge1(vnodeops_t *vop)
{
	nc_hash_t *end;
	nc_hash_t *hp;
	ncache_t *ncp;
	vnode_t *vp;

	if (!doingcache)
		return (0);

	ncs.ncs_purge_fs1.value.ui64++;

	/*
	 * Scan the dnlc entries looking for a likely candidate.
	 */
	hp = end = dnlc_purge_fs1_rotor;

	do {
		if (++hp == &nc_hash[nc_hashsz])
			hp = nc_hash;
		dnlc_purge_fs1_rotor = hp;
		if (hp->hash_next == (ncache_t *)hp)
			continue;
		mutex_enter(&hp->hash_lock);
		for (ncp = hp->hash_prev;
		    ncp != (ncache_t *)hp;
		    ncp = ncp->hash_prev) {
			vp = ncp->vp;
			if (!vn_has_cached_data(vp) && (vp->v_count == 1) &&
			    vn_matchops(vp, vop))
				break;
		}
		if (ncp != (ncache_t *)hp) {
			nc_rmhash(ncp);
			mutex_exit(&hp->hash_lock);
			VN_RELE(ncp->dp);
			VN_RELE(vp);
			dnlc_free(ncp);
			ncs.ncs_purge_total.value.ui64++;
			return (1);
		}
		mutex_exit(&hp->hash_lock);
	} while (hp != end);
	return (0);
}

/*
 * Perform a reverse lookup in the DNLC.  This will find the first
 * occurrence of the vnode.  If successful, it will return the vnode
 * of the parent, and the name of the entry in the given buffer.  If
 * it cannot be found, or the buffer is too small, then it will return
 * NULL.  Note that this is a highly inefficient function, since the
 * DNLC is constructed solely for forward lookups.
 */
vnode_t *
dnlc_reverse_lookup(vnode_t *vp, char *buf, size_t buflen)
{
	nc_hash_t *nch;
	ncache_t *ncp;
	vnode_t *pvp;

	if (!doingcache)
		return (NULL);

	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
		mutex_enter(&nch->hash_lock);
		ncp = nch->hash_next;
		while (ncp != (ncache_t *)nch) {
			/*
			 * We ignore '..' entries since they can create
			 * confusion and infinite loops.
			 */
			if (ncp->vp == vp && !(ncp->namlen == 2 &&
			    0 == bcmp(ncp->name, "..", 2)) &&
			    ncp->namlen < buflen) {
				bcopy(ncp->name, buf, ncp->namlen);
				buf[ncp->namlen] = '\0';
				pvp = ncp->dp;
				VN_HOLD(pvp);
				mutex_exit(&nch->hash_lock);
				return (pvp);
			}
			ncp = ncp->hash_next;
		}
		mutex_exit(&nch->hash_lock);
	}

	return (NULL);
}
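/*
 * Illustrative sketch only: dnlc_reverse_lookup() suits best-effort
 * vnode-to-name translation, such as building a pathname for
 * diagnostic output.  The caller owns the hold on the returned parent:
 *
 *	char name[MAXNAMELEN];
 *	vnode_t *pvp;
 *
 *	if ((pvp = dnlc_reverse_lookup(vp, name, sizeof (name))) != NULL) {
 *		... use pvp and name ...
 *		VN_RELE(pvp);
 *	}
 *
 * A NULL return means only that the dnlc has no entry, not that the
 * vnode has no name.
 */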
/*
 * Utility routine to search for a cache entry.  Return the
 * ncache entry if found, NULL otherwise.
 */
static ncache_t *
dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash)
{
	nc_hash_t *hp;
	ncache_t *ncp;

	hp = &nc_hash[hash & nc_hashmask];

	for (ncp = hp->hash_next; ncp != (ncache_t *)hp; ncp = ncp->hash_next) {
		if (ncp->hash == hash &&
		    ncp->dp == dp &&
		    ncp->namlen == namlen &&
		    bcmp(ncp->name, name, namlen) == 0)
			return (ncp);
	}
	return (NULL);
}

#if ((1 << NBBY) - 1) < (MAXNAMELEN - 1)
#error ncache_t name length representation is too small
#endif

/*
 * Get a new name cache entry.
 * If the dnlc_reduce_cache() taskq isn't keeping up with demand, or memory
 * is short, then just return NULL.  If we're over ncsize then kick off a
 * thread to free some in-use entries down to dnlc_nentries_low_water.
 * Caller must initialise all fields except namlen.
 * Component names are defined to be less than MAXNAMELEN,
 * which includes a null.
 */
static ncache_t *
dnlc_get(uchar_t namlen)
{
	ncache_t *ncp;

	if (dnlc_nentries > dnlc_max_nentries) {
		dnlc_max_nentries_cnt++; /* keep a statistic */
		return (NULL);
	}
	ncp = kmem_alloc(sizeof (ncache_t) + namlen, KM_NOSLEEP);
	if (ncp == NULL) {
		return (NULL);
	}
	ncp->namlen = namlen;
	atomic_add_32(&dnlc_nentries, 1);
	if (dnlc_reduce_idle && (dnlc_nentries >= ncsize)) {
		dnlc_reduce_idle = 0;
		(void) taskq_dispatch(system_taskq, dnlc_reduce_cache,
		    NULL, TQ_SLEEP);
	}
	return (ncp);
}

/*
 * Taskq routine to free up name cache entries to reduce the
 * cache size to the low water mark.
 */
/*ARGSUSED*/
static void
dnlc_reduce_cache(void *unused)
{
	nc_hash_t *hp = dnlc_free_rotor;
	vnode_t *vp;
	ncache_t *ncp;
	int cnt;

	do {
		/*
		 * Find the first non-empty hash queue without locking.
		 * Recheck that we really have entries, to avoid an
		 * infinite loop if all the entries get purged.
		 */
		do {
			if (++hp == &nc_hash[nc_hashsz]) {
				hp = nc_hash;
				if (dnlc_nentries < dnlc_nentries_low_water) {
					dnlc_reduce_idle = 1;
					return;
				}
			}
		} while (hp->hash_next == (ncache_t *)hp);

		mutex_enter(&hp->hash_lock);
		for (cnt = 0, ncp = hp->hash_prev; ncp != (ncache_t *)hp;
		    ncp = ncp->hash_prev, cnt++) {
			vp = ncp->vp;
			/*
			 * A name cache entry with a reference count
			 * of one is only referenced by the dnlc.
			 * Also, negative cache entries are purged first.
			 */
			if (!vn_has_cached_data(vp) &&
			    ((vp->v_count == 1) || (vp == DNLC_NO_VNODE))) {
				ncs.ncs_pick_heur.value.ui64++;
				goto found;
			}
			/*
			 * Remove from the end of the chain if the
			 * chain is too long
			 */
			if (cnt > dnlc_long_chain) {
				ncp = hp->hash_prev;
				ncs.ncs_pick_last.value.ui64++;
				vp = ncp->vp;
				goto found;
			}
		}
		/* check for race and continue */
		if (hp->hash_next == (ncache_t *)hp) {
			mutex_exit(&hp->hash_lock);
			continue;
		}

		ncp = hp->hash_prev; /* pick the last one in the hash queue */
		ncs.ncs_pick_last.value.ui64++;
		vp = ncp->vp;
found:
		/*
		 * Remove from hash chain.
		 */
		nc_rmhash(ncp);
		mutex_exit(&hp->hash_lock);
		VN_RELE(vp);
		VN_RELE(ncp->dp);
		dnlc_free(ncp);
	} while (dnlc_nentries > dnlc_nentries_low_water);

	dnlc_free_rotor = hp;
	dnlc_reduce_idle = 1;
}

/*
 * Directory caching routines
 * ==========================
 *
 * See dnlc.h for details of the interfaces below.
 */
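/*
 * Protocol sketch (illustrative only; details are filesystem specific).
 * A filesystem that wants a complete directory cached does roughly:
 *
 *	if (dnlc_dir_start(dcap, approx_entries) == DOK) {
 *		for each entry and free slot found by scanning the dir:
 *			(void) dnlc_dir_add_entry(dcap, name, handle);
 *			(void) dnlc_dir_add_space(dcap, len, handle);
 *		dnlc_dir_complete(dcap);
 *	}
 *
 * Subsequent lookups then go through the cache:
 *
 *	switch (dnlc_dir_lookup(dcap, name, &handle)) {
 *	case DFOUND:	use handle, no directory scan needed
 *	case DNOENT:	the cache is complete, so the name cannot exist
 *	case DNOCACHE:	fall back to a real directory scan
 *	}
 *
 * The 64-bit handle is opaque to the dnlc; a filesystem typically
 * encodes something like a directory offset or block/slot pair in it.
 */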
/*
 * Look up an entry in a complete or partial directory cache.
 */
dcret_t
dnlc_dir_lookup(dcanchor_t *dcap, char *name, uint64_t *handle)
{
	dircache_t *dcp;
	dcentry_t *dep;
	int hash;
	int ret;
	uchar_t namlen;

	/*
	 * We can test without the lock, as we are only a cache.
	 */
	if (!VALID_DIR_CACHE(dcap->dca_dircache)) {
		ncs.ncs_dir_misses.value.ui64++;
		return (DNOCACHE);
	}

	if (!dnlc_dir_enable) {
		return (DNOCACHE);
	}

	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		dcp->dc_actime = lbolt64;
		DNLC_DIR_HASH(name, hash, namlen);
		dep = dcp->dc_namehash[hash & dcp->dc_nhash_mask];
		while (dep != NULL) {
			if ((dep->de_hash == hash) &&
			    (namlen == dep->de_namelen) &&
			    bcmp(dep->de_name, name, namlen) == 0) {
				*handle = dep->de_handle;
				mutex_exit(&dcap->dca_lock);
				ncs.ncs_dir_hits.value.ui64++;
				return (DFOUND);
			}
			dep = dep->de_next;
		}
		if (dcp->dc_complete) {
			ret = DNOENT;
		} else {
			ret = DNOCACHE;
		}
		mutex_exit(&dcap->dca_lock);
		return (ret);
	} else {
		mutex_exit(&dcap->dca_lock);
		ncs.ncs_dir_misses.value.ui64++;
		return (DNOCACHE);
	}
}

/*
 * Start a new directory cache.  An estimate of the number of
 * entries is provided as a quick check to ensure the directory
 * is cacheable.
 */
dcret_t
dnlc_dir_start(dcanchor_t *dcap, uint_t num_entries)
{
	dircache_t *dcp;

	if (!dnlc_dir_enable ||
	    (num_entries < dnlc_dir_min_size)) {
		return (DNOCACHE);
	}

	if (num_entries > dnlc_dir_max_size) {
		return (DTOOBIG);
	}

	mutex_enter(&dc_head.dch_lock);
	mutex_enter(&dcap->dca_lock);

	if (dcap->dca_dircache == DC_RET_LOW_MEM) {
		dcap->dca_dircache = NULL;
		mutex_exit(&dcap->dca_lock);
		mutex_exit(&dc_head.dch_lock);
		return (DNOMEM);
	}

	/*
	 * Check if there's currently a cache.
	 * This probably only occurs on a race.
	 */
	if (dcap->dca_dircache != NULL) {
		mutex_exit(&dcap->dca_lock);
		mutex_exit(&dc_head.dch_lock);
		return (DNOCACHE);
	}

	/*
	 * Allocate the dircache struct and the entry and free space
	 * hash tables.  These tables are initially just one entry but
	 * dynamically resize when entries and free space are added or
	 * removed.
	 */
	if ((dcp = kmem_zalloc(sizeof (dircache_t), KM_NOSLEEP)) == NULL) {
		goto error;
	}
	if ((dcp->dc_namehash = kmem_zalloc(sizeof (dcentry_t *),
	    KM_NOSLEEP)) == NULL) {
		goto error;
	}
	if ((dcp->dc_freehash = kmem_zalloc(sizeof (dcfree_t *),
	    KM_NOSLEEP)) == NULL) {
		goto error;
	}

	dcp->dc_anchor = dcap; /* set back pointer to anchor */
	dcap->dca_dircache = dcp;

	/* add into head of global chain */
	dcp->dc_next = dc_head.dch_next;
	dcp->dc_prev = (dircache_t *)&dc_head;
	dcp->dc_next->dc_prev = dcp;
	dc_head.dch_next = dcp;

	mutex_exit(&dcap->dca_lock);
	mutex_exit(&dc_head.dch_lock);
	ncs.ncs_cur_dirs.value.ui64++;
	ncs.ncs_dirs_cached.value.ui64++;
	return (DOK);
error:
	if (dcp != NULL) {
		if (dcp->dc_namehash) {
			kmem_free(dcp->dc_namehash, sizeof (dcentry_t *));
		}
		kmem_free(dcp, sizeof (dircache_t));
	}
	/*
	 * Must also kmem_free dcp->dc_freehash if more error cases are added
	 */
	mutex_exit(&dcap->dca_lock);
	mutex_exit(&dc_head.dch_lock);
	ncs.ncs_dir_start_nm.value.ui64++;
	return (DNOCACHE);
}

/*
 * Add a directory entry to a partial or complete directory cache.
 */
dcret_t
dnlc_dir_add_entry(dcanchor_t *dcap, char *name, uint64_t handle)
{
	dircache_t *dcp;
	dcentry_t **hp, *dep;
	int hash;
	uint_t capacity;
	uchar_t namlen;

	/*
	 * Allocate the dcentry struct, including the variable
	 * size name.  Note, the null terminator is not copied.
	 *
	 * We do this outside the lock to avoid possible deadlock if
	 * dnlc_dir_reclaim() is called as a result of memory shortage.
	 */
	DNLC_DIR_HASH(name, hash, namlen);
	dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
	if (dep == NULL) {
#ifdef DEBUG
		/*
		 * The kmem allocator generates random failures for
		 * KM_NOSLEEP calls (see KMEM_RANDOM_ALLOCATION_FAILURE),
		 * so try again before we blow away a perfectly good cache.
		 * This is done not to cover an error but purely for
		 * performance when running a debug kernel.
		 * This random error only occurs in debug mode.
		 */
		dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
		if (dep != NULL)
			goto ok;
#endif
		ncs.ncs_dir_add_nm.value.ui64++;
		/*
		 * Free a directory cache.  This may be the one we are
		 * called with.
		 */
		dnlc_dir_reclaim(NULL);
		dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
		if (dep == NULL) {
			/*
			 * Still no memory; better delete this cache.
			 */
			mutex_enter(&dcap->dca_lock);
			dcp = (dircache_t *)dcap->dca_dircache;
			if (VALID_DIR_CACHE(dcp)) {
				dnlc_dir_abort(dcp);
				dcap->dca_dircache = DC_RET_LOW_MEM;
			}
			mutex_exit(&dcap->dca_lock);
			ncs.ncs_dir_addabort.value.ui64++;
			return (DNOCACHE);
		}
		/*
		 * Fall through as if the 1st kmem_alloc had worked.
		 */
	}
#ifdef DEBUG
ok:
#endif
	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		/*
		 * If the total number of entries goes above the max
		 * then free this cache
		 */
		if ((dcp->dc_num_entries + dcp->dc_num_free) >
		    dnlc_dir_max_size) {
			mutex_exit(&dcap->dca_lock);
			dnlc_dir_purge(dcap);
			kmem_free(dep, sizeof (dcentry_t) - 1 + namlen);
			ncs.ncs_dir_add_max.value.ui64++;
			return (DTOOBIG);
		}
		dcp->dc_num_entries++;
		capacity = (dcp->dc_nhash_mask + 1) << dnlc_dir_hash_size_shift;
		if (dcp->dc_num_entries >=
		    (capacity << dnlc_dir_hash_resize_shift)) {
			dnlc_dir_adjust_nhash(dcp);
		}
		hp = &dcp->dc_namehash[hash & dcp->dc_nhash_mask];

		/*
		 * Initialise and chain in new entry
		 */
		dep->de_handle = handle;
		dep->de_hash = hash;
		/*
		 * Note de_namelen is a uchar_t to conserve space
		 * and alignment padding.  The max length of any
		 * pathname component is defined as MAXNAMELEN,
		 * which is 256 (including the terminating null).
		 * So provided this doesn't change, we don't include
		 * the null, we always use bcmp to compare strings, and
		 * we don't start storing full names, then we are ok.
		 * The space saving is worth it.
		 */
		dep->de_namelen = namlen;
		bcopy(name, dep->de_name, namlen);
		dep->de_next = *hp;
		*hp = dep;
		dcp->dc_actime = lbolt64;
		mutex_exit(&dcap->dca_lock);
		ncs.ncs_dir_num_ents.value.ui64++;
		return (DOK);
	} else {
		mutex_exit(&dcap->dca_lock);
		kmem_free(dep, sizeof (dcentry_t) - 1 + namlen);
		return (DNOCACHE);
	}
}
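/*
 * Usage note (illustrative): name entries and free-space entries live
 * in the same cache so a filesystem can allocate directory slots
 * without rescanning the directory.  A create path might look like:
 *
 *	if (dnlc_dir_rem_space_by_len(dcap, reclen, &handle) == DFOUND) {
 *		write the new entry at the slot named by handle, then
 *		(void) dnlc_dir_add_entry(dcap, name, handle);
 *	}
 *
 * with any space left over returned via dnlc_dir_add_space().  The
 * names reclen/handle above are placeholders for whatever the
 * filesystem encodes in the 64-bit handle.
 */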
/*
 * Add free space to a partial or complete directory cache.
 */
dcret_t
dnlc_dir_add_space(dcanchor_t *dcap, uint_t len, uint64_t handle)
{
	dircache_t *dcp;
	dcfree_t *dfp, **hp;
	uint_t capacity;

	/*
	 * We kmem_alloc outside the lock to avoid possible deadlock if
	 * dnlc_dir_reclaim() is called as a result of memory shortage.
	 */
	dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
	if (dfp == NULL) {
#ifdef DEBUG
		/*
		 * The kmem allocator generates random failures for
		 * KM_NOSLEEP calls (see KMEM_RANDOM_ALLOCATION_FAILURE),
		 * so try again before we blow away a perfectly good cache.
		 * This random error only occurs in debug mode.
		 */
		dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
		if (dfp != NULL)
			goto ok;
#endif
		ncs.ncs_dir_add_nm.value.ui64++;
		/*
		 * Free a directory cache.  This may be the one we are
		 * called with.
		 */
		dnlc_dir_reclaim(NULL);
		dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
		if (dfp == NULL) {
			/*
			 * Still no memory; better delete this cache.
			 */
			mutex_enter(&dcap->dca_lock);
			dcp = (dircache_t *)dcap->dca_dircache;
			if (VALID_DIR_CACHE(dcp)) {
				dnlc_dir_abort(dcp);
				dcap->dca_dircache = DC_RET_LOW_MEM;
			}
			mutex_exit(&dcap->dca_lock);
			ncs.ncs_dir_addabort.value.ui64++;
			return (DNOCACHE);
		}
		/*
		 * Fall through as if the 1st kmem_alloc had worked.
		 */
	}

#ifdef DEBUG
ok:
#endif
	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		if ((dcp->dc_num_entries + dcp->dc_num_free) >
		    dnlc_dir_max_size) {
			mutex_exit(&dcap->dca_lock);
			dnlc_dir_purge(dcap);
			kmem_cache_free(dnlc_dir_space_cache, dfp);
			ncs.ncs_dir_add_max.value.ui64++;
			return (DTOOBIG);
		}
		dcp->dc_num_free++;
		capacity = (dcp->dc_fhash_mask + 1) << dnlc_dir_hash_size_shift;
		if (dcp->dc_num_free >=
		    (capacity << dnlc_dir_hash_resize_shift)) {
			dnlc_dir_adjust_fhash(dcp);
		}
		/*
		 * Initialise and chain a new entry
		 */
		dfp->df_handle = handle;
		dfp->df_len = len;
		dcp->dc_actime = lbolt64;
		hp = &(dcp->dc_freehash[DDFHASH(handle, dcp)]);
		dfp->df_next = *hp;
		*hp = dfp;
		mutex_exit(&dcap->dca_lock);
		ncs.ncs_dir_num_ents.value.ui64++;
		return (DOK);
	} else {
		mutex_exit(&dcap->dca_lock);
		kmem_cache_free(dnlc_dir_space_cache, dfp);
		return (DNOCACHE);
	}
}

/*
 * Mark a directory cache as complete.
 */
void
dnlc_dir_complete(dcanchor_t *dcap)
{
	dircache_t *dcp;

	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		dcp->dc_complete = B_TRUE;
	}
	mutex_exit(&dcap->dca_lock);
}

/*
 * Internal routine to delete a partial or full directory cache.
 * No additional locking needed.
 */
static void
dnlc_dir_abort(dircache_t *dcp)
{
	dcentry_t *dep, *nhp;
	dcfree_t *fep, *fhp;
	uint_t nhtsize = dcp->dc_nhash_mask + 1; /* name hash table size */
	uint_t fhtsize = dcp->dc_fhash_mask + 1; /* free hash table size */
	uint_t i;

	/*
	 * Free up the cached name entries and hash table
	 */
	for (i = 0; i < nhtsize; i++) { /* for each hash bucket */
		nhp = dcp->dc_namehash[i];
		while (nhp != NULL) { /* for each chained entry */
			dep = nhp->de_next;
			kmem_free(nhp, sizeof (dcentry_t) - 1 +
			    nhp->de_namelen);
			nhp = dep;
		}
	}
	kmem_free(dcp->dc_namehash, sizeof (dcentry_t *) * nhtsize);

	/*
	 * Free up the free space entries and hash table
	 */
	for (i = 0; i < fhtsize; i++) { /* for each hash bucket */
		fhp = dcp->dc_freehash[i];
		while (fhp != NULL) { /* for each chained entry */
			fep = fhp->df_next;
			kmem_cache_free(dnlc_dir_space_cache, fhp);
			fhp = fep;
		}
	}
	kmem_free(dcp->dc_freehash, sizeof (dcfree_t *) * fhtsize);

	/*
	 * Finally free the directory cache structure itself
	 */
	ncs.ncs_dir_num_ents.value.ui64 -= (dcp->dc_num_entries +
	    dcp->dc_num_free);
	kmem_free(dcp, sizeof (dircache_t));
	ncs.ncs_cur_dirs.value.ui64--;
}

/*
 * Remove a partial or complete directory cache
 */
void
dnlc_dir_purge(dcanchor_t *dcap)
{
	dircache_t *dcp;

	mutex_enter(&dc_head.dch_lock);
	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (!VALID_DIR_CACHE(dcp)) {
		mutex_exit(&dcap->dca_lock);
		mutex_exit(&dc_head.dch_lock);
		return;
	}
	dcap->dca_dircache = NULL;
	/*
	 * Unchain from global list
	 */
	dcp->dc_prev->dc_next = dcp->dc_next;
	dcp->dc_next->dc_prev = dcp->dc_prev;
	mutex_exit(&dcap->dca_lock);
	mutex_exit(&dc_head.dch_lock);
	dnlc_dir_abort(dcp);
}

/*
 * Remove an entry from a complete or partial directory cache.
 * Return the handle if handlep is non-null.
 */
dcret_t
dnlc_dir_rem_entry(dcanchor_t *dcap, char *name, uint64_t *handlep)
{
	dircache_t *dcp;
	dcentry_t **prevpp, *te;
	uint_t capacity;
	int hash;
	int ret;
	uchar_t namlen;

	if (!dnlc_dir_enable) {
		return (DNOCACHE);
	}

	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		dcp->dc_actime = lbolt64;
		if (dcp->dc_nhash_mask > 0) { /* ie not minimum */
			capacity = (dcp->dc_nhash_mask + 1) <<
			    dnlc_dir_hash_size_shift;
			if (dcp->dc_num_entries <=
			    (capacity >> dnlc_dir_hash_resize_shift)) {
				dnlc_dir_adjust_nhash(dcp);
			}
		}
		DNLC_DIR_HASH(name, hash, namlen);
		prevpp = &dcp->dc_namehash[hash & dcp->dc_nhash_mask];
		while (*prevpp != NULL) {
			if (((*prevpp)->de_hash == hash) &&
			    (namlen == (*prevpp)->de_namelen) &&
			    bcmp((*prevpp)->de_name, name, namlen) == 0) {
				if (handlep != NULL) {
					*handlep = (*prevpp)->de_handle;
				}
				te = *prevpp;
				*prevpp = (*prevpp)->de_next;
				kmem_free(te, sizeof (dcentry_t) - 1 +
				    te->de_namelen);

				/*
				 * If the total number of entries
				 * falls below half the minimum number
				 * of entries then free this cache.
				 */
				if (--dcp->dc_num_entries <
				    (dnlc_dir_min_size >> 1)) {
					mutex_exit(&dcap->dca_lock);
					dnlc_dir_purge(dcap);
				} else {
					mutex_exit(&dcap->dca_lock);
				}
				ncs.ncs_dir_num_ents.value.ui64--;
				return (DFOUND);
			}
			prevpp = &((*prevpp)->de_next);
		}
		if (dcp->dc_complete) {
			ncs.ncs_dir_reme_fai.value.ui64++;
			ret = DNOENT;
		} else {
			ret = DNOCACHE;
		}
		mutex_exit(&dcap->dca_lock);
		return (ret);
	} else {
		mutex_exit(&dcap->dca_lock);
		return (DNOCACHE);
	}
}

/*
 * Remove free space of at least the given length from a complete
 * or partial directory cache.
 */
dcret_t
dnlc_dir_rem_space_by_len(dcanchor_t *dcap, uint_t len, uint64_t *handlep)
{
	dircache_t *dcp;
	dcfree_t **prevpp, *tfp;
	uint_t fhtsize; /* free hash table size */
	uint_t i;
	uint_t capacity;
	int ret;

	if (!dnlc_dir_enable) {
		return (DNOCACHE);
	}

	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		dcp->dc_actime = lbolt64;
		if (dcp->dc_fhash_mask > 0) { /* ie not minimum */
			capacity = (dcp->dc_fhash_mask + 1) <<
			    dnlc_dir_hash_size_shift;
			if (dcp->dc_num_free <=
			    (capacity >> dnlc_dir_hash_resize_shift)) {
				dnlc_dir_adjust_fhash(dcp);
			}
		}
		/*
		 * Search for an entry of the appropriate size
		 * on a first-fit basis.
		 */
		fhtsize = dcp->dc_fhash_mask + 1;
		for (i = 0; i < fhtsize; i++) { /* for each hash bucket */
			prevpp = &(dcp->dc_freehash[i]);
			while (*prevpp != NULL) {
				if ((*prevpp)->df_len >= len) {
					*handlep = (*prevpp)->df_handle;
					tfp = *prevpp;
					*prevpp = (*prevpp)->df_next;
					dcp->dc_num_free--;
					mutex_exit(&dcap->dca_lock);
					kmem_cache_free(dnlc_dir_space_cache,
					    tfp);
					ncs.ncs_dir_num_ents.value.ui64--;
					return (DFOUND);
				}
				prevpp = &((*prevpp)->df_next);
			}
		}
		if (dcp->dc_complete) {
			ret = DNOENT;
		} else {
			ret = DNOCACHE;
		}
		mutex_exit(&dcap->dca_lock);
		return (ret);
	} else {
		mutex_exit(&dcap->dca_lock);
		return (DNOCACHE);
	}
}

/*
 * Remove free space with the given handle from a complete or partial
 * directory cache.
 */
dcret_t
dnlc_dir_rem_space_by_handle(dcanchor_t *dcap, uint64_t handle)
{
	dircache_t *dcp;
	dcfree_t **prevpp, *tfp;
	uint_t capacity;
	int ret;

	if (!dnlc_dir_enable) {
		return (DNOCACHE);
	}

	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		dcp->dc_actime = lbolt64;
		if (dcp->dc_fhash_mask > 0) { /* ie not minimum */
			capacity = (dcp->dc_fhash_mask + 1) <<
			    dnlc_dir_hash_size_shift;
			if (dcp->dc_num_free <=
			    (capacity >> dnlc_dir_hash_resize_shift)) {
				dnlc_dir_adjust_fhash(dcp);
			}
		}

		/*
		 * Search for the exact entry.
		 */
		prevpp = &(dcp->dc_freehash[DDFHASH(handle, dcp)]);
		while (*prevpp != NULL) {
			if ((*prevpp)->df_handle == handle) {
				tfp = *prevpp;
				*prevpp = (*prevpp)->df_next;
				dcp->dc_num_free--;
				mutex_exit(&dcap->dca_lock);
				kmem_cache_free(dnlc_dir_space_cache, tfp);
				ncs.ncs_dir_num_ents.value.ui64--;
				return (DFOUND);
			}
			prevpp = &((*prevpp)->df_next);
		}
		if (dcp->dc_complete) {
			ncs.ncs_dir_rems_fai.value.ui64++;
			ret = DNOENT;
		} else {
			ret = DNOCACHE;
		}
		mutex_exit(&dcap->dca_lock);
		return (ret);
	} else {
		mutex_exit(&dcap->dca_lock);
		return (DNOCACHE);
	}
}

/*
 * Update the handle of a directory cache entry.
 */
dcret_t
dnlc_dir_update(dcanchor_t *dcap, char *name, uint64_t handle)
{
	dircache_t *dcp;
	dcentry_t *dep;
	int hash;
	int ret;
	uchar_t namlen;

	if (!dnlc_dir_enable) {
		return (DNOCACHE);
	}

	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		dcp->dc_actime = lbolt64;
		DNLC_DIR_HASH(name, hash, namlen);
		dep = dcp->dc_namehash[hash & dcp->dc_nhash_mask];
		while (dep != NULL) {
			if ((dep->de_hash == hash) &&
			    (namlen == dep->de_namelen) &&
			    bcmp(dep->de_name, name, namlen) == 0) {
				dep->de_handle = handle;
				mutex_exit(&dcap->dca_lock);
				return (DFOUND);
			}
			dep = dep->de_next;
		}
		if (dcp->dc_complete) {
			ncs.ncs_dir_upd_fail.value.ui64++;
			ret = DNOENT;
		} else {
			ret = DNOCACHE;
		}
		mutex_exit(&dcap->dca_lock);
		return (ret);
	} else {
		mutex_exit(&dcap->dca_lock);
		return (DNOCACHE);
	}
}
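/*
 * Tear down a directory cache anchor, purging any cached directory.
 * Typically called when the directory vnode itself is being freed.
 * Any attached cache is unchained and freed, the anchor pointer is
 * cleared, and dca_lock is destroyed, so the anchor must be
 * re-initialised before it can be used again.
 */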
void
dnlc_dir_fini(dcanchor_t *dcap)
{
	dircache_t *dcp;

	mutex_enter(&dc_head.dch_lock);
	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		/*
		 * Unchain from global list
		 */
		ncs.ncs_dir_finipurg.value.ui64++;
		dcp->dc_prev->dc_next = dcp->dc_next;
		dcp->dc_next->dc_prev = dcp->dc_prev;
	} else {
		dcp = NULL;
	}
	dcap->dca_dircache = NULL;
	mutex_exit(&dcap->dca_lock);
	mutex_exit(&dc_head.dch_lock);
	mutex_destroy(&dcap->dca_lock);
	if (dcp) {
		dnlc_dir_abort(dcp);
	}
}

/*
 * Reclaim callback for dnlc directory caching.
 * Invoked by the kernel memory allocator when memory gets tight.
 * This is a pretty serious condition and can easily lead to system
 * hangs if not enough space is returned.
 *
 * Deciding which directory (or directories) to purge is tricky.
 * Purging everything is overkill, but purging just the oldest used
 * was found to lead to hangs.  The largest cached directories use the
 * most memory, but take the most effort to rebuild, whereas the smaller
 * ones have little value and give back little space.  So what to do?
 *
 * The current policy is to continue purging the oldest used directories
 * until at least dnlc_dir_min_reclaim directory entries have been purged.
 */
/*ARGSUSED*/
static void
dnlc_dir_reclaim(void *unused)
{
	dircache_t *dcp, *oldest;
	uint_t dirent_cnt = 0;

	mutex_enter(&dc_head.dch_lock);
	while (dirent_cnt < dnlc_dir_min_reclaim) {
		dcp = dc_head.dch_next;
		oldest = NULL;
		while (dcp != (dircache_t *)&dc_head) {
			if (oldest == NULL) {
				oldest = dcp;
			} else {
				if (dcp->dc_actime < oldest->dc_actime) {
					oldest = dcp;
				}
			}
			dcp = dcp->dc_next;
		}
		if (oldest == NULL) {
			/* nothing to delete */
			mutex_exit(&dc_head.dch_lock);
			return;
		}
		/*
		 * Remove from directory chain and purge.
		 */
		oldest->dc_prev->dc_next = oldest->dc_next;
		oldest->dc_next->dc_prev = oldest->dc_prev;
		mutex_enter(&oldest->dc_anchor->dca_lock);
		/*
		 * If this was the last entry then it must be too large.
		 * Mark it as such by saving a special dircache_t
		 * pointer (DC_RET_LOW_MEM) in the anchor.  The error
		 * DNOMEM will be presented to the caller of
		 * dnlc_dir_start().
		 */
		if (oldest->dc_next == oldest->dc_prev) {
			oldest->dc_anchor->dca_dircache = DC_RET_LOW_MEM;
			ncs.ncs_dir_rec_last.value.ui64++;
		} else {
			oldest->dc_anchor->dca_dircache = NULL;
			ncs.ncs_dir_recl_any.value.ui64++;
		}
		mutex_exit(&oldest->dc_anchor->dca_lock);
		dirent_cnt += oldest->dc_num_entries;
		dnlc_dir_abort(oldest);
	}
	mutex_exit(&dc_head.dch_lock);
}

/*
 * Dynamically grow or shrink the size of the name hash table
 */
static void
dnlc_dir_adjust_nhash(dircache_t *dcp)
{
	dcentry_t **newhash, *dep, **nhp, *tep;
	uint_t newsize;
	uint_t oldsize;
	uint_t newsizemask;
	int i;

	/*
	 * Allocate new hash table
	 */
	newsize = dcp->dc_num_entries >> dnlc_dir_hash_size_shift;
	newhash = kmem_zalloc(sizeof (dcentry_t *) * newsize, KM_NOSLEEP);
	if (newhash == NULL) {
		/*
		 * System is short on memory; just return.
		 * Note, the old hash table is still usable.
		 * This return is unlikely to repeatedly occur, because
		 * either some other directory caches will be reclaimed
		 * due to memory shortage, thus freeing memory, or this
		 * directory cache will be reclaimed.
		 */
		return;
	}
	oldsize = dcp->dc_nhash_mask + 1;
	dcp->dc_nhash_mask = newsizemask = newsize - 1;

	/*
	 * Move entries from the old table to the new
	 */
	for (i = 0; i < oldsize; i++) { /* for each hash bucket */
		dep = dcp->dc_namehash[i];
		while (dep != NULL) { /* for each chained entry */
			tep = dep;
			dep = dep->de_next;
			nhp = &newhash[tep->de_hash & newsizemask];
			tep->de_next = *nhp;
			*nhp = tep;
		}
	}

	/*
	 * Delete the old hash table and set the new one in place.
	 */
	kmem_free(dcp->dc_namehash, sizeof (dcentry_t *) * oldsize);
	dcp->dc_namehash = newhash;
}

/*
 * Dynamically grow or shrink the size of the free space hash table
 */
static void
dnlc_dir_adjust_fhash(dircache_t *dcp)
{
	dcfree_t **newhash, *dfp, **nhp, *tfp;
	uint_t newsize;
	uint_t oldsize;
	int i;

	/*
	 * Allocate new hash table
	 */
	newsize = dcp->dc_num_free >> dnlc_dir_hash_size_shift;
	newhash = kmem_zalloc(sizeof (dcfree_t *) * newsize, KM_NOSLEEP);
	if (newhash == NULL) {
		/*
		 * System is short on memory; just return.
		 * Note, the old hash table is still usable.
		 * This return is unlikely to repeatedly occur, because
		 * either some other directory caches will be reclaimed
		 * due to memory shortage, thus freeing memory, or this
		 * directory cache will be reclaimed.
		 */
		return;
	}
	oldsize = dcp->dc_fhash_mask + 1;
	dcp->dc_fhash_mask = newsize - 1;

	/*
	 * Move entries from the old table to the new
	 */
	for (i = 0; i < oldsize; i++) { /* for each hash bucket */
		dfp = dcp->dc_freehash[i];
		while (dfp != NULL) { /* for each chained entry */
			tfp = dfp;
			dfp = dfp->df_next;
			nhp = &newhash[DDFHASH(tfp->df_handle, dcp)];
			tfp->df_next = *nhp;
			*nhp = tfp;
		}
	}

	/*
	 * Delete the old hash table and set the new one in place.
	 */
	kmem_free(dcp->dc_freehash, sizeof (dcfree_t *) * oldsize);
	dcp->dc_freehash = newhash;
}