1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 #include <sys/types.h> 42 #include <sys/systm.h> 43 #include <sys/param.h> 44 #include <sys/t_lock.h> 45 #include <sys/systm.h> 46 #include <sys/vfs.h> 47 #include <sys/vnode.h> 48 #include <sys/dnlc.h> 49 #include <sys/kmem.h> 50 #include <sys/cmn_err.h> 51 #include <sys/vtrace.h> 52 #include <sys/bitmap.h> 53 #include <sys/var.h> 54 #include <sys/sysmacros.h> 55 #include <sys/kstat.h> 56 #include <sys/atomic.h> 57 #include <sys/taskq.h> 58 59 /* 60 * Directory name lookup cache. 61 * Based on code originally done by Robert Elz at Melbourne. 62 * 63 * Names found by directory scans are retained in a cache 64 * for future reference. Each hash chain is ordered by LRU 65 * Cache is indexed by hash value obtained from (vp, name) 66 * where the vp refers to the directory containing the name. 67 */ 68 69 /* 70 * Tunable nc_hashavelen is the average length desired for this chain, from 71 * which the size of the nc_hash table is derived at create time. 72 */ 73 #define NC_HASHAVELEN_DEFAULT 4 74 int nc_hashavelen = NC_HASHAVELEN_DEFAULT; 75 76 /* 77 * NC_MOVETOFRONT is the move-to-front threshold: if the hash lookup 78 * depth exceeds this value, we move the looked-up entry to the front of 79 * its hash chain. The idea is to make sure that the most frequently 80 * accessed entries are found most quickly (by keeping them near the 81 * front of their hash chains). 82 */ 83 #define NC_MOVETOFRONT 2 84 85 /* 86 * 87 * DNLC_MAX_RELE is used to size an array on the stack when releasing 88 * vnodes. This array is used rather than calling VN_RELE() inline because 89 * all dnlc locks must be dropped by that time in order to avoid a 90 * possible deadlock. This deadlock occurs when the dnlc holds the last 91 * reference to the vnode and so the VOP_INACTIVE vector is called which 92 * can in turn call back into the dnlc. A global array was used but had 93 * many problems: 94 * 1) Actually doesn't have an upper bound on the array size as 95 * entries can be added after starting the purge. 96 * 2) The locking scheme causes a hang. 97 * 3) Caused serialisation on the global lock. 98 * 4) The array was often unnecessarily huge. 99 * 100 * Note the current value 8 allows up to 4 cache entries (to be purged 101 * from each hash chain), before having to cycle around and retry. 102 * This ought to be ample given that nc_hashavelen is typically very small. 103 */ 104 #define DNLC_MAX_RELE 8 /* must be even */ 105 106 /* 107 * Hash table of name cache entries for fast lookup, dynamically 108 * allocated at startup. 109 */ 110 nc_hash_t *nc_hash; 111 112 /* 113 * Rotors. Used to select entries on a round-robin basis. 114 */ 115 static nc_hash_t *dnlc_purge_fs1_rotor; 116 static nc_hash_t *dnlc_free_rotor; 117 118 /* 119 * # of dnlc entries (uninitialized) 120 * 121 * the initial value was chosen as being 122 * a random string of bits, probably not 123 * normally chosen by a systems administrator 124 */ 125 int ncsize = -1; 126 uint32_t dnlc_nentries = 0; /* current number of name cache entries */ 127 static int nc_hashsz; /* size of hash table */ 128 static int nc_hashmask; /* size of hash table minus 1 */ 129 130 /* 131 * The dnlc_reduce_cache() taskq queue is activated when there are 132 * ncsize name cache entries and if no parameter is provided, it reduces 133 * the size down to dnlc_nentries_low_water, which is by default one 134 * hundreth less (or 99%) of ncsize. 135 * 136 * If a parameter is provided to dnlc_reduce_cache(), then we reduce 137 * the size down based on ncsize_onepercent - where ncsize_onepercent 138 * is 1% of ncsize. 139 */ 140 #define DNLC_LOW_WATER_DIVISOR_DEFAULT 100 141 uint_t dnlc_low_water_divisor = DNLC_LOW_WATER_DIVISOR_DEFAULT; 142 uint_t dnlc_nentries_low_water; 143 int dnlc_reduce_idle = 1; /* no locking needed */ 144 uint_t ncsize_onepercent; 145 146 /* 147 * If dnlc_nentries hits dnlc_max_nentries (twice ncsize) 148 * then this means the dnlc_reduce_cache() taskq is failing to 149 * keep up. In this case we refuse to add new entries to the dnlc 150 * until the taskq catches up. 151 */ 152 uint_t dnlc_max_nentries; /* twice ncsize */ 153 uint64_t dnlc_max_nentries_cnt = 0; /* statistic on times we failed */ 154 155 /* 156 * Tunable to define when we should just remove items from 157 * the end of the chain. 158 */ 159 #define DNLC_LONG_CHAIN 8 160 uint_t dnlc_long_chain = DNLC_LONG_CHAIN; 161 162 /* 163 * ncstats has been deprecated, due to the integer size of the counters 164 * which can easily overflow in the dnlc. 165 * It is maintained (at some expense) for compatability. 166 * The preferred interface is the kstat accessible nc_stats below. 167 */ 168 struct ncstats ncstats; 169 170 struct nc_stats ncs = { 171 { "hits", KSTAT_DATA_UINT64 }, 172 { "misses", KSTAT_DATA_UINT64 }, 173 { "negative_cache_hits", KSTAT_DATA_UINT64 }, 174 { "enters", KSTAT_DATA_UINT64 }, 175 { "double_enters", KSTAT_DATA_UINT64 }, 176 { "purge_total_entries", KSTAT_DATA_UINT64 }, 177 { "purge_all", KSTAT_DATA_UINT64 }, 178 { "purge_vp", KSTAT_DATA_UINT64 }, 179 { "purge_vfs", KSTAT_DATA_UINT64 }, 180 { "purge_fs1", KSTAT_DATA_UINT64 }, 181 { "pick_free", KSTAT_DATA_UINT64 }, 182 { "pick_heuristic", KSTAT_DATA_UINT64 }, 183 { "pick_last", KSTAT_DATA_UINT64 }, 184 185 /* directory caching stats */ 186 187 { "dir_hits", KSTAT_DATA_UINT64 }, 188 { "dir_misses", KSTAT_DATA_UINT64 }, 189 { "dir_cached_current", KSTAT_DATA_UINT64 }, 190 { "dir_entries_cached_current", KSTAT_DATA_UINT64 }, 191 { "dir_cached_total", KSTAT_DATA_UINT64 }, 192 { "dir_start_no_memory", KSTAT_DATA_UINT64 }, 193 { "dir_add_no_memory", KSTAT_DATA_UINT64 }, 194 { "dir_add_abort", KSTAT_DATA_UINT64 }, 195 { "dir_add_max", KSTAT_DATA_UINT64 }, 196 { "dir_remove_entry_fail", KSTAT_DATA_UINT64 }, 197 { "dir_remove_space_fail", KSTAT_DATA_UINT64 }, 198 { "dir_update_fail", KSTAT_DATA_UINT64 }, 199 { "dir_fini_purge", KSTAT_DATA_UINT64 }, 200 { "dir_reclaim_last", KSTAT_DATA_UINT64 }, 201 { "dir_reclaim_any", KSTAT_DATA_UINT64 }, 202 }; 203 204 static int doingcache = 1; 205 206 vnode_t negative_cache_vnode; 207 208 /* 209 * Insert entry at the front of the queue 210 */ 211 #define nc_inshash(ncp, hp) \ 212 { \ 213 (ncp)->hash_next = (hp)->hash_next; \ 214 (ncp)->hash_prev = (ncache_t *)(hp); \ 215 (hp)->hash_next->hash_prev = (ncp); \ 216 (hp)->hash_next = (ncp); \ 217 } 218 219 /* 220 * Remove entry from hash queue 221 */ 222 #define nc_rmhash(ncp) \ 223 { \ 224 (ncp)->hash_prev->hash_next = (ncp)->hash_next; \ 225 (ncp)->hash_next->hash_prev = (ncp)->hash_prev; \ 226 (ncp)->hash_prev = NULL; \ 227 (ncp)->hash_next = NULL; \ 228 } 229 230 /* 231 * Free an entry. 232 */ 233 #define dnlc_free(ncp) \ 234 { \ 235 kmem_free((ncp), sizeof (ncache_t) + (ncp)->namlen); \ 236 atomic_add_32(&dnlc_nentries, -1); \ 237 } 238 239 240 /* 241 * Cached directory info. 242 * ====================== 243 */ 244 245 /* 246 * Cached directory free space hash function. 247 * Needs the free space handle and the dcp to get the hash table size 248 * Returns the hash index. 249 */ 250 #define DDFHASH(handle, dcp) ((handle >> 2) & (dcp)->dc_fhash_mask) 251 252 /* 253 * Cached directory name entry hash function. 254 * Uses the name and returns in the input arguments the hash and the name 255 * length. 256 */ 257 #define DNLC_DIR_HASH(name, hash, namelen) \ 258 { \ 259 char Xc, *Xcp; \ 260 hash = *name; \ 261 for (Xcp = (name + 1); (Xc = *Xcp) != 0; Xcp++) \ 262 hash = (hash << 4) + hash + Xc; \ 263 ASSERT((Xcp - (name)) <= ((1 << NBBY) - 1)); \ 264 namelen = Xcp - (name); \ 265 } 266 267 /* special dircache_t pointer to indicate error should be returned */ 268 /* 269 * The anchor directory cache pointer can contain 3 types of values, 270 * 1) NULL: No directory cache 271 * 2) DC_RET_LOW_MEM (-1): There was a directory cache that found to be 272 * too big or a memory shortage occurred. This value remains in the 273 * pointer until a dnlc_dir_start() which returns the a DNOMEM error. 274 * This is kludgy but efficient and only visible in this source file. 275 * 3) A valid cache pointer. 276 */ 277 #define DC_RET_LOW_MEM (dircache_t *)1 278 #define VALID_DIR_CACHE(dcp) ((dircache_t *)(dcp) > DC_RET_LOW_MEM) 279 280 /* Tunables */ 281 uint_t dnlc_dir_enable = 1; /* disable caching directories by setting to 0 */ 282 uint_t dnlc_dir_min_size = 40; /* min no of directory entries before caching */ 283 uint_t dnlc_dir_max_size = UINT_MAX; /* ditto maximum */ 284 uint_t dnlc_dir_hash_size_shift = 3; /* 8 entries per hash bucket */ 285 uint_t dnlc_dir_min_reclaim = 350000; /* approx 1MB of dcentrys */ 286 /* 287 * dnlc_dir_hash_resize_shift determines when the hash tables 288 * get re-adjusted due to growth or shrinkage 289 * - currently 2 indicating that there can be at most 4 290 * times or at least one quarter the number of entries 291 * before hash table readjustment. Note that with 292 * dnlc_dir_hash_size_shift above set at 3 this would 293 * mean readjustment would occur if the average number 294 * of entries went above 32 or below 2 295 */ 296 uint_t dnlc_dir_hash_resize_shift = 2; /* readjust rate */ 297 298 static kmem_cache_t *dnlc_dir_space_cache; /* free space entry cache */ 299 static dchead_t dc_head; /* anchor of cached directories */ 300 301 /* Prototypes */ 302 static ncache_t *dnlc_get(uchar_t namlen); 303 static ncache_t *dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash); 304 static void dnlc_dir_reclaim(void *unused); 305 static void dnlc_dir_abort(dircache_t *dcp); 306 static void dnlc_dir_adjust_fhash(dircache_t *dcp); 307 static void dnlc_dir_adjust_nhash(dircache_t *dcp); 308 309 310 /* 311 * Initialize the directory cache. 312 */ 313 void 314 dnlc_init() 315 { 316 nc_hash_t *hp; 317 kstat_t *ksp; 318 int i; 319 320 /* 321 * Set up the size of the dnlc (ncsize) and its low water mark. 322 */ 323 if (ncsize == -1) { 324 /* calculate a reasonable size for the low water */ 325 dnlc_nentries_low_water = 4 * (v.v_proc + maxusers) + 320; 326 ncsize = dnlc_nentries_low_water + 327 (dnlc_nentries_low_water / dnlc_low_water_divisor); 328 } else { 329 /* don't change the user specified ncsize */ 330 dnlc_nentries_low_water = 331 ncsize - (ncsize / dnlc_low_water_divisor); 332 } 333 if (ncsize <= 0) { 334 doingcache = 0; 335 dnlc_dir_enable = 0; /* also disable directory caching */ 336 ncsize = 0; 337 cmn_err(CE_NOTE, "name cache (dnlc) disabled"); 338 return; 339 } 340 dnlc_max_nentries = ncsize * 2; 341 ncsize_onepercent = ncsize / 100; 342 343 /* 344 * Initialise the hash table. 345 * Compute hash size rounding to the next power of two. 346 */ 347 nc_hashsz = ncsize / nc_hashavelen; 348 nc_hashsz = 1 << highbit(nc_hashsz); 349 nc_hashmask = nc_hashsz - 1; 350 nc_hash = kmem_zalloc(nc_hashsz * sizeof (*nc_hash), KM_SLEEP); 351 for (i = 0; i < nc_hashsz; i++) { 352 hp = (nc_hash_t *)&nc_hash[i]; 353 mutex_init(&hp->hash_lock, NULL, MUTEX_DEFAULT, NULL); 354 hp->hash_next = (ncache_t *)hp; 355 hp->hash_prev = (ncache_t *)hp; 356 } 357 358 /* 359 * Initialize rotors 360 */ 361 dnlc_free_rotor = dnlc_purge_fs1_rotor = &nc_hash[0]; 362 363 /* 364 * Set up the directory caching to use kmem_cache_alloc 365 * for its free space entries so that we can get a callback 366 * when the system is short on memory, to allow us to free 367 * up some memory. we don't use the constructor/deconstructor 368 * functions. 369 */ 370 dnlc_dir_space_cache = kmem_cache_create("dnlc_space_cache", 371 sizeof (dcfree_t), 0, NULL, NULL, dnlc_dir_reclaim, NULL, 372 NULL, 0); 373 374 /* 375 * Initialise the head of the cached directory structures 376 */ 377 mutex_init(&dc_head.dch_lock, NULL, MUTEX_DEFAULT, NULL); 378 dc_head.dch_next = (dircache_t *)&dc_head; 379 dc_head.dch_prev = (dircache_t *)&dc_head; 380 381 /* 382 * Initialise the reference count of the negative cache vnode to 1 383 * so that it never goes away (VOP_INACTIVE isn't called on it). 384 */ 385 negative_cache_vnode.v_count = 1; 386 387 /* 388 * Initialise kstats - both the old compatability raw kind and 389 * the more extensive named stats. 390 */ 391 ksp = kstat_create("unix", 0, "ncstats", "misc", KSTAT_TYPE_RAW, 392 sizeof (struct ncstats), KSTAT_FLAG_VIRTUAL); 393 if (ksp) { 394 ksp->ks_data = (void *) &ncstats; 395 kstat_install(ksp); 396 } 397 ksp = kstat_create("unix", 0, "dnlcstats", "misc", KSTAT_TYPE_NAMED, 398 sizeof (ncs) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 399 if (ksp) { 400 ksp->ks_data = (void *) &ncs; 401 kstat_install(ksp); 402 } 403 } 404 405 /* 406 * Add a name to the directory cache. 407 */ 408 void 409 dnlc_enter(vnode_t *dp, char *name, vnode_t *vp) 410 { 411 ncache_t *ncp; 412 nc_hash_t *hp; 413 uchar_t namlen; 414 int hash; 415 416 TRACE_0(TR_FAC_NFS, TR_DNLC_ENTER_START, "dnlc_enter_start:"); 417 418 if (!doingcache) { 419 TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END, 420 "dnlc_enter_end:(%S) %d", "not caching", 0); 421 return; 422 } 423 424 /* 425 * Get a new dnlc entry. Assume the entry won't be in the cache 426 * and initialize it now 427 */ 428 DNLCHASH(name, dp, hash, namlen); 429 if ((ncp = dnlc_get(namlen)) == NULL) 430 return; 431 ncp->dp = dp; 432 VN_HOLD(dp); 433 ncp->vp = vp; 434 VN_HOLD(vp); 435 bcopy(name, ncp->name, namlen + 1); /* name and null */ 436 ncp->hash = hash; 437 hp = &nc_hash[hash & nc_hashmask]; 438 439 mutex_enter(&hp->hash_lock); 440 if (dnlc_search(dp, name, namlen, hash) != NULL) { 441 mutex_exit(&hp->hash_lock); 442 ncstats.dbl_enters++; 443 ncs.ncs_dbl_enters.value.ui64++; 444 VN_RELE(dp); 445 VN_RELE(vp); 446 dnlc_free(ncp); /* crfree done here */ 447 TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END, 448 "dnlc_enter_end:(%S) %d", 449 "dbl enter", ncstats.dbl_enters); 450 return; 451 } 452 /* 453 * Insert back into the hash chain. 454 */ 455 nc_inshash(ncp, hp); 456 mutex_exit(&hp->hash_lock); 457 ncstats.enters++; 458 ncs.ncs_enters.value.ui64++; 459 TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END, 460 "dnlc_enter_end:(%S) %d", "done", ncstats.enters); 461 } 462 463 /* 464 * Add a name to the directory cache. 465 * 466 * This function is basically identical with 467 * dnlc_enter(). The difference is that when the 468 * desired dnlc entry is found, the vnode in the 469 * ncache is compared with the vnode passed in. 470 * 471 * If they are not equal then the ncache is 472 * updated with the passed in vnode. Otherwise 473 * it just frees up the newly allocated dnlc entry. 474 */ 475 void 476 dnlc_update(vnode_t *dp, char *name, vnode_t *vp) 477 { 478 ncache_t *ncp; 479 ncache_t *tcp; 480 vnode_t *tvp; 481 nc_hash_t *hp; 482 int hash; 483 uchar_t namlen; 484 485 TRACE_0(TR_FAC_NFS, TR_DNLC_ENTER_START, "dnlc_update_start:"); 486 487 if (!doingcache) { 488 TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END, 489 "dnlc_update_end:(%S) %d", "not caching", 0); 490 return; 491 } 492 493 /* 494 * Get a new dnlc entry and initialize it now. 495 * If we fail to get a new entry, call dnlc_remove() to purge 496 * any existing dnlc entry including negative cache (DNLC_NO_VNODE) 497 * entry. 498 * Failure to clear an existing entry could result in false dnlc 499 * lookup (negative/stale entry). 500 */ 501 DNLCHASH(name, dp, hash, namlen); 502 if ((ncp = dnlc_get(namlen)) == NULL) { 503 dnlc_remove(dp, name); 504 return; 505 } 506 ncp->dp = dp; 507 VN_HOLD(dp); 508 ncp->vp = vp; 509 VN_HOLD(vp); 510 bcopy(name, ncp->name, namlen + 1); /* name and null */ 511 ncp->hash = hash; 512 hp = &nc_hash[hash & nc_hashmask]; 513 514 mutex_enter(&hp->hash_lock); 515 if ((tcp = dnlc_search(dp, name, namlen, hash)) != NULL) { 516 if (tcp->vp != vp) { 517 tvp = tcp->vp; 518 tcp->vp = vp; 519 mutex_exit(&hp->hash_lock); 520 VN_RELE(tvp); 521 ncstats.enters++; 522 ncs.ncs_enters.value.ui64++; 523 TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END, 524 "dnlc_update_end:(%S) %d", "done", ncstats.enters); 525 } else { 526 mutex_exit(&hp->hash_lock); 527 VN_RELE(vp); 528 ncstats.dbl_enters++; 529 ncs.ncs_dbl_enters.value.ui64++; 530 TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END, 531 "dnlc_update_end:(%S) %d", 532 "dbl enter", ncstats.dbl_enters); 533 } 534 VN_RELE(dp); 535 dnlc_free(ncp); /* crfree done here */ 536 return; 537 } 538 /* 539 * insert the new entry, since it is not in dnlc yet 540 */ 541 nc_inshash(ncp, hp); 542 mutex_exit(&hp->hash_lock); 543 ncstats.enters++; 544 ncs.ncs_enters.value.ui64++; 545 TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END, 546 "dnlc_update_end:(%S) %d", "done", ncstats.enters); 547 } 548 549 /* 550 * Look up a name in the directory name cache. 551 * 552 * Return a doubly-held vnode if found: one hold so that it may 553 * remain in the cache for other users, the other hold so that 554 * the cache is not re-cycled and the identity of the vnode is 555 * lost before the caller can use the vnode. 556 */ 557 vnode_t * 558 dnlc_lookup(vnode_t *dp, char *name) 559 { 560 ncache_t *ncp; 561 nc_hash_t *hp; 562 vnode_t *vp; 563 int hash, depth; 564 uchar_t namlen; 565 566 TRACE_2(TR_FAC_NFS, TR_DNLC_LOOKUP_START, 567 "dnlc_lookup_start:dp %x name %s", dp, name); 568 569 if (!doingcache) { 570 TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END, 571 "dnlc_lookup_end:%S %d vp %x name %s", 572 "not_caching", 0, NULL, name); 573 return (NULL); 574 } 575 576 DNLCHASH(name, dp, hash, namlen); 577 depth = 1; 578 hp = &nc_hash[hash & nc_hashmask]; 579 mutex_enter(&hp->hash_lock); 580 581 for (ncp = hp->hash_next; ncp != (ncache_t *)hp; 582 ncp = ncp->hash_next) { 583 if (ncp->hash == hash && /* fast signature check */ 584 ncp->dp == dp && 585 ncp->namlen == namlen && 586 bcmp(ncp->name, name, namlen) == 0) { 587 /* 588 * Move this entry to the head of its hash chain 589 * if it's not already close. 590 */ 591 if (depth > NC_MOVETOFRONT) { 592 ncache_t *next = ncp->hash_next; 593 ncache_t *prev = ncp->hash_prev; 594 595 prev->hash_next = next; 596 next->hash_prev = prev; 597 ncp->hash_next = next = hp->hash_next; 598 ncp->hash_prev = (ncache_t *)hp; 599 next->hash_prev = ncp; 600 hp->hash_next = ncp; 601 602 ncstats.move_to_front++; 603 } 604 605 /* 606 * Put a hold on the vnode now so its identity 607 * can't change before the caller has a chance to 608 * put a hold on it. 609 */ 610 vp = ncp->vp; 611 VN_HOLD(vp); 612 mutex_exit(&hp->hash_lock); 613 ncstats.hits++; 614 ncs.ncs_hits.value.ui64++; 615 if (vp == DNLC_NO_VNODE) { 616 ncs.ncs_neg_hits.value.ui64++; 617 } 618 TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END, 619 "dnlc_lookup_end:%S %d vp %x name %s", 620 "hit", ncstats.hits, vp, name); 621 return (vp); 622 } 623 depth++; 624 } 625 626 mutex_exit(&hp->hash_lock); 627 ncstats.misses++; 628 ncs.ncs_misses.value.ui64++; 629 TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END, 630 "dnlc_lookup_end:%S %d vp %x name %s", "miss", ncstats.misses, 631 NULL, name); 632 return (NULL); 633 } 634 635 /* 636 * Remove an entry in the directory name cache. 637 */ 638 void 639 dnlc_remove(vnode_t *dp, char *name) 640 { 641 ncache_t *ncp; 642 nc_hash_t *hp; 643 uchar_t namlen; 644 int hash; 645 646 if (!doingcache) 647 return; 648 DNLCHASH(name, dp, hash, namlen); 649 hp = &nc_hash[hash & nc_hashmask]; 650 651 mutex_enter(&hp->hash_lock); 652 if (ncp = dnlc_search(dp, name, namlen, hash)) { 653 /* 654 * Free up the entry 655 */ 656 nc_rmhash(ncp); 657 mutex_exit(&hp->hash_lock); 658 VN_RELE(ncp->vp); 659 VN_RELE(ncp->dp); 660 dnlc_free(ncp); 661 return; 662 } 663 mutex_exit(&hp->hash_lock); 664 } 665 666 /* 667 * Purge the entire cache. 668 */ 669 void 670 dnlc_purge() 671 { 672 nc_hash_t *nch; 673 ncache_t *ncp; 674 int index; 675 int i; 676 vnode_t *nc_rele[DNLC_MAX_RELE]; 677 678 if (!doingcache) 679 return; 680 681 ncstats.purges++; 682 ncs.ncs_purge_all.value.ui64++; 683 684 for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) { 685 index = 0; 686 mutex_enter(&nch->hash_lock); 687 ncp = nch->hash_next; 688 while (ncp != (ncache_t *)nch) { 689 ncache_t *np; 690 691 np = ncp->hash_next; 692 nc_rele[index++] = ncp->vp; 693 nc_rele[index++] = ncp->dp; 694 695 nc_rmhash(ncp); 696 dnlc_free(ncp); 697 ncp = np; 698 ncs.ncs_purge_total.value.ui64++; 699 if (index == DNLC_MAX_RELE) 700 break; 701 } 702 mutex_exit(&nch->hash_lock); 703 704 /* Release holds on all the vnodes now that we have no locks */ 705 for (i = 0; i < index; i++) { 706 VN_RELE(nc_rele[i]); 707 } 708 if (ncp != (ncache_t *)nch) { 709 nch--; /* Do current hash chain again */ 710 } 711 } 712 } 713 714 /* 715 * Purge any cache entries referencing a vnode. 716 * Exit as soon as the vnode reference count goes to 1, as the caller 717 * must hold a reference, and the dnlc can therefore have no more. 718 */ 719 void 720 dnlc_purge_vp(vnode_t *vp) 721 { 722 nc_hash_t *nch; 723 ncache_t *ncp; 724 int index; 725 vnode_t *nc_rele[DNLC_MAX_RELE]; 726 727 ASSERT(vp->v_count > 0); 728 if (vp->v_count == 1) { 729 return; 730 } 731 732 if (!doingcache) 733 return; 734 735 ncstats.purges++; 736 ncs.ncs_purge_vp.value.ui64++; 737 738 for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) { 739 index = 0; 740 mutex_enter(&nch->hash_lock); 741 ncp = nch->hash_next; 742 while (ncp != (ncache_t *)nch) { 743 ncache_t *np; 744 745 np = ncp->hash_next; 746 if (ncp->dp == vp || ncp->vp == vp) { 747 nc_rele[index++] = ncp->vp; 748 nc_rele[index++] = ncp->dp; 749 nc_rmhash(ncp); 750 dnlc_free(ncp); 751 ncs.ncs_purge_total.value.ui64++; 752 if (index == DNLC_MAX_RELE) { 753 ncp = np; 754 break; 755 } 756 } 757 ncp = np; 758 } 759 mutex_exit(&nch->hash_lock); 760 761 /* Release holds on all the vnodes now that we have no locks */ 762 while (index) { 763 VN_RELE(nc_rele[--index]); 764 } 765 766 if (vp->v_count == 1) { 767 return; /* no more dnlc references */ 768 } 769 770 if (ncp != (ncache_t *)nch) { 771 nch--; /* Do current hash chain again */ 772 } 773 } 774 } 775 776 /* 777 * Purge cache entries referencing a vfsp. Caller supplies a count 778 * of entries to purge; up to that many will be freed. A count of 779 * zero indicates that all such entries should be purged. Returns 780 * the number of entries that were purged. 781 */ 782 int 783 dnlc_purge_vfsp(vfs_t *vfsp, int count) 784 { 785 nc_hash_t *nch; 786 ncache_t *ncp; 787 int n = 0; 788 int index; 789 int i; 790 vnode_t *nc_rele[DNLC_MAX_RELE]; 791 792 if (!doingcache) 793 return (0); 794 795 ncstats.purges++; 796 ncs.ncs_purge_vfs.value.ui64++; 797 798 for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) { 799 index = 0; 800 mutex_enter(&nch->hash_lock); 801 ncp = nch->hash_next; 802 while (ncp != (ncache_t *)nch) { 803 ncache_t *np; 804 805 np = ncp->hash_next; 806 ASSERT(ncp->dp != NULL); 807 ASSERT(ncp->vp != NULL); 808 if ((ncp->dp->v_vfsp == vfsp) || 809 (ncp->vp->v_vfsp == vfsp)) { 810 n++; 811 nc_rele[index++] = ncp->vp; 812 nc_rele[index++] = ncp->dp; 813 nc_rmhash(ncp); 814 dnlc_free(ncp); 815 ncs.ncs_purge_total.value.ui64++; 816 if (index == DNLC_MAX_RELE) { 817 ncp = np; 818 break; 819 } 820 if (count != 0 && n >= count) { 821 break; 822 } 823 } 824 ncp = np; 825 } 826 mutex_exit(&nch->hash_lock); 827 /* Release holds on all the vnodes now that we have no locks */ 828 for (i = 0; i < index; i++) { 829 VN_RELE(nc_rele[i]); 830 } 831 if (count != 0 && n >= count) { 832 return (n); 833 } 834 if (ncp != (ncache_t *)nch) { 835 nch--; /* Do current hash chain again */ 836 } 837 } 838 return (n); 839 } 840 841 /* 842 * Purge 1 entry from the dnlc that is part of the filesystem(s) 843 * represented by 'vop'. The purpose of this routine is to allow 844 * users of the dnlc to free a vnode that is being held by the dnlc. 845 * 846 * If we find a vnode that we release which will result in 847 * freeing the underlying vnode (count was 1), return 1, 0 848 * if no appropriate vnodes found. 849 * 850 * Note, vop is not the 'right' identifier for a filesystem. 851 */ 852 int 853 dnlc_fs_purge1(vnodeops_t *vop) 854 { 855 nc_hash_t *end; 856 nc_hash_t *hp; 857 ncache_t *ncp; 858 vnode_t *vp; 859 860 if (!doingcache) 861 return (0); 862 863 ncs.ncs_purge_fs1.value.ui64++; 864 865 /* 866 * Scan the dnlc entries looking for a likely candidate. 867 */ 868 hp = end = dnlc_purge_fs1_rotor; 869 870 do { 871 if (++hp == &nc_hash[nc_hashsz]) 872 hp = nc_hash; 873 dnlc_purge_fs1_rotor = hp; 874 if (hp->hash_next == (ncache_t *)hp) 875 continue; 876 mutex_enter(&hp->hash_lock); 877 for (ncp = hp->hash_prev; 878 ncp != (ncache_t *)hp; 879 ncp = ncp->hash_prev) { 880 vp = ncp->vp; 881 if (!vn_has_cached_data(vp) && (vp->v_count == 1) && 882 vn_matchops(vp, vop)) 883 break; 884 } 885 if (ncp != (ncache_t *)hp) { 886 nc_rmhash(ncp); 887 mutex_exit(&hp->hash_lock); 888 VN_RELE(ncp->dp); 889 VN_RELE(vp) 890 dnlc_free(ncp); 891 ncs.ncs_purge_total.value.ui64++; 892 return (1); 893 } 894 mutex_exit(&hp->hash_lock); 895 } while (hp != end); 896 return (0); 897 } 898 899 /* 900 * Perform a reverse lookup in the DNLC. This will find the first occurrence of 901 * the vnode. If successful, it will return the vnode of the parent, and the 902 * name of the entry in the given buffer. If it cannot be found, or the buffer 903 * is too small, then it will return NULL. Note that this is a highly 904 * inefficient function, since the DNLC is constructed solely for forward 905 * lookups. 906 */ 907 vnode_t * 908 dnlc_reverse_lookup(vnode_t *vp, char *buf, size_t buflen) 909 { 910 nc_hash_t *nch; 911 ncache_t *ncp; 912 vnode_t *pvp; 913 914 if (!doingcache) 915 return (NULL); 916 917 for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) { 918 mutex_enter(&nch->hash_lock); 919 ncp = nch->hash_next; 920 while (ncp != (ncache_t *)nch) { 921 /* 922 * We ignore '..' entries since it can create 923 * confusion and infinite loops. 924 */ 925 if (ncp->vp == vp && !(ncp->namlen == 2 && 926 0 == bcmp(ncp->name, "..", 2)) && 927 ncp->namlen < buflen) { 928 bcopy(ncp->name, buf, ncp->namlen); 929 buf[ncp->namlen] = '\0'; 930 pvp = ncp->dp; 931 VN_HOLD(pvp); 932 mutex_exit(&nch->hash_lock); 933 return (pvp); 934 } 935 ncp = ncp->hash_next; 936 } 937 mutex_exit(&nch->hash_lock); 938 } 939 940 return (NULL); 941 } 942 /* 943 * Utility routine to search for a cache entry. Return the 944 * ncache entry if found, NULL otherwise. 945 */ 946 static ncache_t * 947 dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash) 948 { 949 nc_hash_t *hp; 950 ncache_t *ncp; 951 952 hp = &nc_hash[hash & nc_hashmask]; 953 954 for (ncp = hp->hash_next; ncp != (ncache_t *)hp; ncp = ncp->hash_next) { 955 if (ncp->hash == hash && 956 ncp->dp == dp && 957 ncp->namlen == namlen && 958 bcmp(ncp->name, name, namlen) == 0) 959 return (ncp); 960 } 961 return (NULL); 962 } 963 964 #if ((1 << NBBY) - 1) < (MAXNAMELEN - 1) 965 #error ncache_t name length representation is too small 966 #endif 967 968 /* 969 * Get a new name cache entry. 970 * If the dnlc_reduce_cache() taskq isn't keeping up with demand, or memory 971 * is short then just return NULL. If we're over ncsize then kick off a 972 * thread to free some in use entries down to dnlc_nentries_low_water. 973 * Caller must initialise all fields except namlen. 974 * Component names are defined to be less than MAXNAMELEN 975 * which includes a null. 976 */ 977 static ncache_t * 978 dnlc_get(uchar_t namlen) 979 { 980 ncache_t *ncp; 981 982 if (dnlc_nentries > dnlc_max_nentries) { 983 dnlc_max_nentries_cnt++; /* keep a statistic */ 984 return (NULL); 985 } 986 ncp = kmem_alloc(sizeof (ncache_t) + namlen, KM_NOSLEEP); 987 if (ncp == NULL) { 988 return (NULL); 989 } 990 ncp->namlen = namlen; 991 atomic_add_32(&dnlc_nentries, 1); 992 if (dnlc_reduce_idle && (dnlc_nentries >= ncsize)) { 993 dnlc_reduce_idle = 0; 994 (void) taskq_dispatch(system_taskq, dnlc_reduce_cache, 995 NULL, TQ_SLEEP); 996 } 997 return (ncp); 998 } 999 1000 /* 1001 * Taskq routine to free up name cache entries to reduce the 1002 * cache size to the low water mark if "reduce_percent" is not provided. 1003 * If "reduce_percent" is provided, reduce cache size by 1004 * (ncsize_onepercent * reduce_percent). 1005 * 1006 * This routine can also be called directly by ZFS's ARC when memory is low. 1007 */ 1008 /*ARGSUSED*/ 1009 void 1010 dnlc_reduce_cache(void *reduce_percent) 1011 { 1012 nc_hash_t *hp = dnlc_free_rotor; 1013 vnode_t *vp; 1014 ncache_t *ncp; 1015 int cnt; 1016 uint_t low_water = dnlc_nentries_low_water; 1017 1018 if (reduce_percent) { 1019 uint_t reduce_cnt; 1020 1021 reduce_cnt = ncsize_onepercent * (uint_t)reduce_percent; 1022 if (reduce_cnt > dnlc_nentries) 1023 low_water = 0; 1024 else 1025 low_water = dnlc_nentries - reduce_cnt; 1026 } 1027 1028 do { 1029 /* 1030 * Find the first non empty hash queue without locking 1031 * Recheck we really have entries to avoid 1032 * an infinite loop if all the entries get purged. 1033 */ 1034 do { 1035 if (++hp == &nc_hash[nc_hashsz]) { 1036 hp = nc_hash; 1037 if (dnlc_nentries <= low_water) { 1038 dnlc_reduce_idle = 1; 1039 return; 1040 } 1041 } 1042 } while (hp->hash_next == (ncache_t *)hp); 1043 1044 mutex_enter(&hp->hash_lock); 1045 for (cnt = 0, ncp = hp->hash_prev; ncp != (ncache_t *)hp; 1046 ncp = ncp->hash_prev, cnt++) { 1047 vp = ncp->vp; 1048 /* 1049 * A name cache entry with a reference count 1050 * of one is only referenced by the dnlc. 1051 * Also negative cache entries are purged first. 1052 */ 1053 if (!vn_has_cached_data(vp) && 1054 ((vp->v_count == 1) || (vp == DNLC_NO_VNODE))) { 1055 ncs.ncs_pick_heur.value.ui64++; 1056 goto found; 1057 } 1058 /* 1059 * Remove from the end of the chain if the 1060 * chain is too long 1061 */ 1062 if (cnt > dnlc_long_chain) { 1063 ncp = hp->hash_prev; 1064 ncs.ncs_pick_last.value.ui64++; 1065 vp = ncp->vp; 1066 goto found; 1067 } 1068 } 1069 /* check for race and continue */ 1070 if (hp->hash_next == (ncache_t *)hp) { 1071 mutex_exit(&hp->hash_lock); 1072 continue; 1073 } 1074 1075 ncp = hp->hash_prev; /* pick the last one in the hash queue */ 1076 ncs.ncs_pick_last.value.ui64++; 1077 vp = ncp->vp; 1078 found: 1079 /* 1080 * Remove from hash chain. 1081 */ 1082 nc_rmhash(ncp); 1083 mutex_exit(&hp->hash_lock); 1084 VN_RELE(vp); 1085 VN_RELE(ncp->dp); 1086 dnlc_free(ncp); 1087 } while (dnlc_nentries > low_water); 1088 1089 dnlc_free_rotor = hp; 1090 dnlc_reduce_idle = 1; 1091 } 1092 1093 /* 1094 * Directory caching routines 1095 * ========================== 1096 * 1097 * See dnlc.h for details of the interfaces below. 1098 */ 1099 1100 /* 1101 * Lookup up an entry in a complete or partial directory cache. 1102 */ 1103 dcret_t 1104 dnlc_dir_lookup(dcanchor_t *dcap, char *name, uint64_t *handle) 1105 { 1106 dircache_t *dcp; 1107 dcentry_t *dep; 1108 int hash; 1109 int ret; 1110 uchar_t namlen; 1111 1112 /* 1113 * can test without lock as we are only a cache 1114 */ 1115 if (!VALID_DIR_CACHE(dcap->dca_dircache)) { 1116 ncs.ncs_dir_misses.value.ui64++; 1117 return (DNOCACHE); 1118 } 1119 1120 if (!dnlc_dir_enable) { 1121 return (DNOCACHE); 1122 } 1123 1124 mutex_enter(&dcap->dca_lock); 1125 dcp = (dircache_t *)dcap->dca_dircache; 1126 if (VALID_DIR_CACHE(dcp)) { 1127 dcp->dc_actime = lbolt64; 1128 DNLC_DIR_HASH(name, hash, namlen); 1129 dep = dcp->dc_namehash[hash & dcp->dc_nhash_mask]; 1130 while (dep != NULL) { 1131 if ((dep->de_hash == hash) && 1132 (namlen == dep->de_namelen) && 1133 bcmp(dep->de_name, name, namlen) == 0) { 1134 *handle = dep->de_handle; 1135 mutex_exit(&dcap->dca_lock); 1136 ncs.ncs_dir_hits.value.ui64++; 1137 return (DFOUND); 1138 } 1139 dep = dep->de_next; 1140 } 1141 if (dcp->dc_complete) { 1142 ret = DNOENT; 1143 } else { 1144 ret = DNOCACHE; 1145 } 1146 mutex_exit(&dcap->dca_lock); 1147 return (ret); 1148 } else { 1149 mutex_exit(&dcap->dca_lock); 1150 ncs.ncs_dir_misses.value.ui64++; 1151 return (DNOCACHE); 1152 } 1153 } 1154 1155 /* 1156 * Start a new directory cache. An estimate of the number of 1157 * entries is provided to as a quick check to ensure the directory 1158 * is cacheable. 1159 */ 1160 dcret_t 1161 dnlc_dir_start(dcanchor_t *dcap, uint_t num_entries) 1162 { 1163 dircache_t *dcp; 1164 1165 if (!dnlc_dir_enable || 1166 (num_entries < dnlc_dir_min_size)) { 1167 return (DNOCACHE); 1168 } 1169 1170 if (num_entries > dnlc_dir_max_size) { 1171 return (DTOOBIG); 1172 } 1173 1174 mutex_enter(&dc_head.dch_lock); 1175 mutex_enter(&dcap->dca_lock); 1176 1177 if (dcap->dca_dircache == DC_RET_LOW_MEM) { 1178 dcap->dca_dircache = NULL; 1179 mutex_exit(&dcap->dca_lock); 1180 mutex_exit(&dc_head.dch_lock); 1181 return (DNOMEM); 1182 } 1183 1184 /* 1185 * Check if there's currently a cache. 1186 * This probably only occurs on a race. 1187 */ 1188 if (dcap->dca_dircache != NULL) { 1189 mutex_exit(&dcap->dca_lock); 1190 mutex_exit(&dc_head.dch_lock); 1191 return (DNOCACHE); 1192 } 1193 1194 /* 1195 * Allocate the dircache struct, entry and free space hash tables. 1196 * These tables are initially just one entry but dynamically resize 1197 * when entries and free space are added or removed. 1198 */ 1199 if ((dcp = kmem_zalloc(sizeof (dircache_t), KM_NOSLEEP)) == NULL) { 1200 goto error; 1201 } 1202 if ((dcp->dc_namehash = kmem_zalloc(sizeof (dcentry_t *), 1203 KM_NOSLEEP)) == NULL) { 1204 goto error; 1205 } 1206 if ((dcp->dc_freehash = kmem_zalloc(sizeof (dcfree_t *), 1207 KM_NOSLEEP)) == NULL) { 1208 goto error; 1209 } 1210 1211 dcp->dc_anchor = dcap; /* set back pointer to anchor */ 1212 dcap->dca_dircache = dcp; 1213 1214 /* add into head of global chain */ 1215 dcp->dc_next = dc_head.dch_next; 1216 dcp->dc_prev = (dircache_t *)&dc_head; 1217 dcp->dc_next->dc_prev = dcp; 1218 dc_head.dch_next = dcp; 1219 1220 mutex_exit(&dcap->dca_lock); 1221 mutex_exit(&dc_head.dch_lock); 1222 ncs.ncs_cur_dirs.value.ui64++; 1223 ncs.ncs_dirs_cached.value.ui64++; 1224 return (DOK); 1225 error: 1226 if (dcp != NULL) { 1227 if (dcp->dc_namehash) { 1228 kmem_free(dcp->dc_namehash, sizeof (dcentry_t *)); 1229 } 1230 kmem_free(dcp, sizeof (dircache_t)); 1231 } 1232 /* 1233 * Must also kmem_free dcp->dc_freehash if more error cases are added 1234 */ 1235 mutex_exit(&dcap->dca_lock); 1236 mutex_exit(&dc_head.dch_lock); 1237 ncs.ncs_dir_start_nm.value.ui64++; 1238 return (DNOCACHE); 1239 } 1240 1241 /* 1242 * Add a directopry entry to a partial or complete directory cache. 1243 */ 1244 dcret_t 1245 dnlc_dir_add_entry(dcanchor_t *dcap, char *name, uint64_t handle) 1246 { 1247 dircache_t *dcp; 1248 dcentry_t **hp, *dep; 1249 int hash; 1250 uint_t capacity; 1251 uchar_t namlen; 1252 1253 /* 1254 * Allocate the dcentry struct, including the variable 1255 * size name. Note, the null terminator is not copied. 1256 * 1257 * We do this outside the lock to avoid possible deadlock if 1258 * dnlc_dir_reclaim() is called as a result of memory shortage. 1259 */ 1260 DNLC_DIR_HASH(name, hash, namlen); 1261 dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP); 1262 if (dep == NULL) { 1263 #ifdef DEBUG 1264 /* 1265 * The kmem allocator generates random failures for 1266 * KM_NOSLEEP calls (see KMEM_RANDOM_ALLOCATION_FAILURE) 1267 * So try again before we blow away a perfectly good cache. 1268 * This is done not to cover an error but purely for 1269 * performance running a debug kernel. 1270 * This random error only occurs in debug mode. 1271 */ 1272 dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP); 1273 if (dep != NULL) 1274 goto ok; 1275 #endif 1276 ncs.ncs_dir_add_nm.value.ui64++; 1277 /* 1278 * Free a directory cache. This may be the one we are 1279 * called with. 1280 */ 1281 dnlc_dir_reclaim(NULL); 1282 dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP); 1283 if (dep == NULL) { 1284 /* 1285 * still no memory, better delete this cache 1286 */ 1287 mutex_enter(&dcap->dca_lock); 1288 dcp = (dircache_t *)dcap->dca_dircache; 1289 if (VALID_DIR_CACHE(dcp)) { 1290 dnlc_dir_abort(dcp); 1291 dcap->dca_dircache = DC_RET_LOW_MEM; 1292 } 1293 mutex_exit(&dcap->dca_lock); 1294 ncs.ncs_dir_addabort.value.ui64++; 1295 return (DNOCACHE); 1296 } 1297 /* 1298 * fall through as if the 1st kmem_alloc had worked 1299 */ 1300 } 1301 #ifdef DEBUG 1302 ok: 1303 #endif 1304 mutex_enter(&dcap->dca_lock); 1305 dcp = (dircache_t *)dcap->dca_dircache; 1306 if (VALID_DIR_CACHE(dcp)) { 1307 /* 1308 * If the total number of entries goes above the max 1309 * then free this cache 1310 */ 1311 if ((dcp->dc_num_entries + dcp->dc_num_free) > 1312 dnlc_dir_max_size) { 1313 mutex_exit(&dcap->dca_lock); 1314 dnlc_dir_purge(dcap); 1315 kmem_free(dep, sizeof (dcentry_t) - 1 + namlen); 1316 ncs.ncs_dir_add_max.value.ui64++; 1317 return (DTOOBIG); 1318 } 1319 dcp->dc_num_entries++; 1320 capacity = (dcp->dc_nhash_mask + 1) << dnlc_dir_hash_size_shift; 1321 if (dcp->dc_num_entries >= 1322 (capacity << dnlc_dir_hash_resize_shift)) { 1323 dnlc_dir_adjust_nhash(dcp); 1324 } 1325 hp = &dcp->dc_namehash[hash & dcp->dc_nhash_mask]; 1326 1327 /* 1328 * Initialise and chain in new entry 1329 */ 1330 dep->de_handle = handle; 1331 dep->de_hash = hash; 1332 /* 1333 * Note de_namelen is a uchar_t to conserve space 1334 * and alignment padding. The max length of any 1335 * pathname component is defined as MAXNAMELEN 1336 * which is 256 (including the terminating null). 1337 * So provided this doesn't change, we don't include the null, 1338 * we always use bcmp to compare strings, and we don't 1339 * start storing full names, then we are ok. 1340 * The space savings is worth it. 1341 */ 1342 dep->de_namelen = namlen; 1343 bcopy(name, dep->de_name, namlen); 1344 dep->de_next = *hp; 1345 *hp = dep; 1346 dcp->dc_actime = lbolt64; 1347 mutex_exit(&dcap->dca_lock); 1348 ncs.ncs_dir_num_ents.value.ui64++; 1349 return (DOK); 1350 } else { 1351 mutex_exit(&dcap->dca_lock); 1352 kmem_free(dep, sizeof (dcentry_t) - 1 + namlen); 1353 return (DNOCACHE); 1354 } 1355 } 1356 1357 /* 1358 * Add free space to a partial or complete directory cache. 1359 */ 1360 dcret_t 1361 dnlc_dir_add_space(dcanchor_t *dcap, uint_t len, uint64_t handle) 1362 { 1363 dircache_t *dcp; 1364 dcfree_t *dfp, **hp; 1365 uint_t capacity; 1366 1367 /* 1368 * We kmem_alloc outside the lock to avoid possible deadlock if 1369 * dnlc_dir_reclaim() is called as a result of memory shortage. 1370 */ 1371 dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP); 1372 if (dfp == NULL) { 1373 #ifdef DEBUG 1374 /* 1375 * The kmem allocator generates random failures for 1376 * KM_NOSLEEP calls (see KMEM_RANDOM_ALLOCATION_FAILURE) 1377 * So try again before we blow away a perfectly good cache. 1378 * This random error only occurs in debug mode 1379 */ 1380 dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP); 1381 if (dfp != NULL) 1382 goto ok; 1383 #endif 1384 ncs.ncs_dir_add_nm.value.ui64++; 1385 /* 1386 * Free a directory cache. This may be the one we are 1387 * called with. 1388 */ 1389 dnlc_dir_reclaim(NULL); 1390 dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP); 1391 if (dfp == NULL) { 1392 /* 1393 * still no memory, better delete this cache 1394 */ 1395 mutex_enter(&dcap->dca_lock); 1396 dcp = (dircache_t *)dcap->dca_dircache; 1397 if (VALID_DIR_CACHE(dcp)) { 1398 dnlc_dir_abort(dcp); 1399 dcap->dca_dircache = DC_RET_LOW_MEM; 1400 } 1401 mutex_exit(&dcap->dca_lock); 1402 ncs.ncs_dir_addabort.value.ui64++; 1403 return (DNOCACHE); 1404 } 1405 /* 1406 * fall through as if the 1st kmem_alloc had worked 1407 */ 1408 } 1409 1410 #ifdef DEBUG 1411 ok: 1412 #endif 1413 mutex_enter(&dcap->dca_lock); 1414 dcp = (dircache_t *)dcap->dca_dircache; 1415 if (VALID_DIR_CACHE(dcp)) { 1416 if ((dcp->dc_num_entries + dcp->dc_num_free) > 1417 dnlc_dir_max_size) { 1418 mutex_exit(&dcap->dca_lock); 1419 dnlc_dir_purge(dcap); 1420 kmem_cache_free(dnlc_dir_space_cache, dfp); 1421 ncs.ncs_dir_add_max.value.ui64++; 1422 return (DTOOBIG); 1423 } 1424 dcp->dc_num_free++; 1425 capacity = (dcp->dc_fhash_mask + 1) << dnlc_dir_hash_size_shift; 1426 if (dcp->dc_num_free >= 1427 (capacity << dnlc_dir_hash_resize_shift)) { 1428 dnlc_dir_adjust_fhash(dcp); 1429 } 1430 /* 1431 * Initialise and chain a new entry 1432 */ 1433 dfp->df_handle = handle; 1434 dfp->df_len = len; 1435 dcp->dc_actime = lbolt64; 1436 hp = &(dcp->dc_freehash[DDFHASH(handle, dcp)]); 1437 dfp->df_next = *hp; 1438 *hp = dfp; 1439 mutex_exit(&dcap->dca_lock); 1440 ncs.ncs_dir_num_ents.value.ui64++; 1441 return (DOK); 1442 } else { 1443 mutex_exit(&dcap->dca_lock); 1444 kmem_cache_free(dnlc_dir_space_cache, dfp); 1445 return (DNOCACHE); 1446 } 1447 } 1448 1449 /* 1450 * Mark a directory cache as complete. 1451 */ 1452 void 1453 dnlc_dir_complete(dcanchor_t *dcap) 1454 { 1455 dircache_t *dcp; 1456 1457 mutex_enter(&dcap->dca_lock); 1458 dcp = (dircache_t *)dcap->dca_dircache; 1459 if (VALID_DIR_CACHE(dcp)) { 1460 dcp->dc_complete = B_TRUE; 1461 } 1462 mutex_exit(&dcap->dca_lock); 1463 } 1464 1465 /* 1466 * Internal routine to delete a partial or full directory cache. 1467 * No additional locking needed. 1468 */ 1469 static void 1470 dnlc_dir_abort(dircache_t *dcp) 1471 { 1472 dcentry_t *dep, *nhp; 1473 dcfree_t *fep, *fhp; 1474 uint_t nhtsize = dcp->dc_nhash_mask + 1; /* name hash table size */ 1475 uint_t fhtsize = dcp->dc_fhash_mask + 1; /* free hash table size */ 1476 uint_t i; 1477 1478 /* 1479 * Free up the cached name entries and hash table 1480 */ 1481 for (i = 0; i < nhtsize; i++) { /* for each hash bucket */ 1482 nhp = dcp->dc_namehash[i]; 1483 while (nhp != NULL) { /* for each chained entry */ 1484 dep = nhp->de_next; 1485 kmem_free(nhp, sizeof (dcentry_t) - 1 + 1486 nhp->de_namelen); 1487 nhp = dep; 1488 } 1489 } 1490 kmem_free(dcp->dc_namehash, sizeof (dcentry_t *) * nhtsize); 1491 1492 /* 1493 * Free up the free space entries and hash table 1494 */ 1495 for (i = 0; i < fhtsize; i++) { /* for each hash bucket */ 1496 fhp = dcp->dc_freehash[i]; 1497 while (fhp != NULL) { /* for each chained entry */ 1498 fep = fhp->df_next; 1499 kmem_cache_free(dnlc_dir_space_cache, fhp); 1500 fhp = fep; 1501 } 1502 } 1503 kmem_free(dcp->dc_freehash, sizeof (dcfree_t *) * fhtsize); 1504 1505 /* 1506 * Finally free the directory cache structure itself 1507 */ 1508 ncs.ncs_dir_num_ents.value.ui64 -= (dcp->dc_num_entries + 1509 dcp->dc_num_free); 1510 kmem_free(dcp, sizeof (dircache_t)); 1511 ncs.ncs_cur_dirs.value.ui64--; 1512 } 1513 1514 /* 1515 * Remove a partial or complete directory cache 1516 */ 1517 void 1518 dnlc_dir_purge(dcanchor_t *dcap) 1519 { 1520 dircache_t *dcp; 1521 1522 mutex_enter(&dc_head.dch_lock); 1523 mutex_enter(&dcap->dca_lock); 1524 dcp = (dircache_t *)dcap->dca_dircache; 1525 if (!VALID_DIR_CACHE(dcp)) { 1526 mutex_exit(&dcap->dca_lock); 1527 mutex_exit(&dc_head.dch_lock); 1528 return; 1529 } 1530 dcap->dca_dircache = NULL; 1531 /* 1532 * Unchain from global list 1533 */ 1534 dcp->dc_prev->dc_next = dcp->dc_next; 1535 dcp->dc_next->dc_prev = dcp->dc_prev; 1536 mutex_exit(&dcap->dca_lock); 1537 mutex_exit(&dc_head.dch_lock); 1538 dnlc_dir_abort(dcp); 1539 } 1540 1541 /* 1542 * Remove an entry from a complete or partial directory cache. 1543 * Return the handle if it's non null. 1544 */ 1545 dcret_t 1546 dnlc_dir_rem_entry(dcanchor_t *dcap, char *name, uint64_t *handlep) 1547 { 1548 dircache_t *dcp; 1549 dcentry_t **prevpp, *te; 1550 uint_t capacity; 1551 int hash; 1552 int ret; 1553 uchar_t namlen; 1554 1555 if (!dnlc_dir_enable) { 1556 return (DNOCACHE); 1557 } 1558 1559 mutex_enter(&dcap->dca_lock); 1560 dcp = (dircache_t *)dcap->dca_dircache; 1561 if (VALID_DIR_CACHE(dcp)) { 1562 dcp->dc_actime = lbolt64; 1563 if (dcp->dc_nhash_mask > 0) { /* ie not minimum */ 1564 capacity = (dcp->dc_nhash_mask + 1) << 1565 dnlc_dir_hash_size_shift; 1566 if (dcp->dc_num_entries <= 1567 (capacity >> dnlc_dir_hash_resize_shift)) { 1568 dnlc_dir_adjust_nhash(dcp); 1569 } 1570 } 1571 DNLC_DIR_HASH(name, hash, namlen); 1572 prevpp = &dcp->dc_namehash[hash & dcp->dc_nhash_mask]; 1573 while (*prevpp != NULL) { 1574 if (((*prevpp)->de_hash == hash) && 1575 (namlen == (*prevpp)->de_namelen) && 1576 bcmp((*prevpp)->de_name, name, namlen) == 0) { 1577 if (handlep != NULL) { 1578 *handlep = (*prevpp)->de_handle; 1579 } 1580 te = *prevpp; 1581 *prevpp = (*prevpp)->de_next; 1582 kmem_free(te, sizeof (dcentry_t) - 1 + 1583 te->de_namelen); 1584 1585 /* 1586 * If the total number of entries 1587 * falls below half the minimum number 1588 * of entries then free this cache. 1589 */ 1590 if (--dcp->dc_num_entries < 1591 (dnlc_dir_min_size >> 1)) { 1592 mutex_exit(&dcap->dca_lock); 1593 dnlc_dir_purge(dcap); 1594 } else { 1595 mutex_exit(&dcap->dca_lock); 1596 } 1597 ncs.ncs_dir_num_ents.value.ui64--; 1598 return (DFOUND); 1599 } 1600 prevpp = &((*prevpp)->de_next); 1601 } 1602 if (dcp->dc_complete) { 1603 ncs.ncs_dir_reme_fai.value.ui64++; 1604 ret = DNOENT; 1605 } else { 1606 ret = DNOCACHE; 1607 } 1608 mutex_exit(&dcap->dca_lock); 1609 return (ret); 1610 } else { 1611 mutex_exit(&dcap->dca_lock); 1612 return (DNOCACHE); 1613 } 1614 } 1615 1616 1617 /* 1618 * Remove free space of at least the given length from a complete 1619 * or partial directory cache. 1620 */ 1621 dcret_t 1622 dnlc_dir_rem_space_by_len(dcanchor_t *dcap, uint_t len, uint64_t *handlep) 1623 { 1624 dircache_t *dcp; 1625 dcfree_t **prevpp, *tfp; 1626 uint_t fhtsize; /* free hash table size */ 1627 uint_t i; 1628 uint_t capacity; 1629 int ret; 1630 1631 if (!dnlc_dir_enable) { 1632 return (DNOCACHE); 1633 } 1634 1635 mutex_enter(&dcap->dca_lock); 1636 dcp = (dircache_t *)dcap->dca_dircache; 1637 if (VALID_DIR_CACHE(dcp)) { 1638 dcp->dc_actime = lbolt64; 1639 if (dcp->dc_fhash_mask > 0) { /* ie not minimum */ 1640 capacity = (dcp->dc_fhash_mask + 1) << 1641 dnlc_dir_hash_size_shift; 1642 if (dcp->dc_num_free <= 1643 (capacity >> dnlc_dir_hash_resize_shift)) { 1644 dnlc_dir_adjust_fhash(dcp); 1645 } 1646 } 1647 /* 1648 * Search for an entry of the appropriate size 1649 * on a first fit basis. 1650 */ 1651 fhtsize = dcp->dc_fhash_mask + 1; 1652 for (i = 0; i < fhtsize; i++) { /* for each hash bucket */ 1653 prevpp = &(dcp->dc_freehash[i]); 1654 while (*prevpp != NULL) { 1655 if ((*prevpp)->df_len >= len) { 1656 *handlep = (*prevpp)->df_handle; 1657 tfp = *prevpp; 1658 *prevpp = (*prevpp)->df_next; 1659 dcp->dc_num_free--; 1660 mutex_exit(&dcap->dca_lock); 1661 kmem_cache_free(dnlc_dir_space_cache, 1662 tfp); 1663 ncs.ncs_dir_num_ents.value.ui64--; 1664 return (DFOUND); 1665 } 1666 prevpp = &((*prevpp)->df_next); 1667 } 1668 } 1669 if (dcp->dc_complete) { 1670 ret = DNOENT; 1671 } else { 1672 ret = DNOCACHE; 1673 } 1674 mutex_exit(&dcap->dca_lock); 1675 return (ret); 1676 } else { 1677 mutex_exit(&dcap->dca_lock); 1678 return (DNOCACHE); 1679 } 1680 } 1681 1682 /* 1683 * Remove free space with the given handle from a complete or partial 1684 * directory cache. 1685 */ 1686 dcret_t 1687 dnlc_dir_rem_space_by_handle(dcanchor_t *dcap, uint64_t handle) 1688 { 1689 dircache_t *dcp; 1690 dcfree_t **prevpp, *tfp; 1691 uint_t capacity; 1692 int ret; 1693 1694 if (!dnlc_dir_enable) { 1695 return (DNOCACHE); 1696 } 1697 1698 mutex_enter(&dcap->dca_lock); 1699 dcp = (dircache_t *)dcap->dca_dircache; 1700 if (VALID_DIR_CACHE(dcp)) { 1701 dcp->dc_actime = lbolt64; 1702 if (dcp->dc_fhash_mask > 0) { /* ie not minimum */ 1703 capacity = (dcp->dc_fhash_mask + 1) << 1704 dnlc_dir_hash_size_shift; 1705 if (dcp->dc_num_free <= 1706 (capacity >> dnlc_dir_hash_resize_shift)) { 1707 dnlc_dir_adjust_fhash(dcp); 1708 } 1709 } 1710 1711 /* 1712 * search for the exact entry 1713 */ 1714 prevpp = &(dcp->dc_freehash[DDFHASH(handle, dcp)]); 1715 while (*prevpp != NULL) { 1716 if ((*prevpp)->df_handle == handle) { 1717 tfp = *prevpp; 1718 *prevpp = (*prevpp)->df_next; 1719 dcp->dc_num_free--; 1720 mutex_exit(&dcap->dca_lock); 1721 kmem_cache_free(dnlc_dir_space_cache, tfp); 1722 ncs.ncs_dir_num_ents.value.ui64--; 1723 return (DFOUND); 1724 } 1725 prevpp = &((*prevpp)->df_next); 1726 } 1727 if (dcp->dc_complete) { 1728 ncs.ncs_dir_rems_fai.value.ui64++; 1729 ret = DNOENT; 1730 } else { 1731 ret = DNOCACHE; 1732 } 1733 mutex_exit(&dcap->dca_lock); 1734 return (ret); 1735 } else { 1736 mutex_exit(&dcap->dca_lock); 1737 return (DNOCACHE); 1738 } 1739 } 1740 1741 /* 1742 * Update the handle of an directory cache entry. 1743 */ 1744 dcret_t 1745 dnlc_dir_update(dcanchor_t *dcap, char *name, uint64_t handle) 1746 { 1747 dircache_t *dcp; 1748 dcentry_t *dep; 1749 int hash; 1750 int ret; 1751 uchar_t namlen; 1752 1753 if (!dnlc_dir_enable) { 1754 return (DNOCACHE); 1755 } 1756 1757 mutex_enter(&dcap->dca_lock); 1758 dcp = (dircache_t *)dcap->dca_dircache; 1759 if (VALID_DIR_CACHE(dcp)) { 1760 dcp->dc_actime = lbolt64; 1761 DNLC_DIR_HASH(name, hash, namlen); 1762 dep = dcp->dc_namehash[hash & dcp->dc_nhash_mask]; 1763 while (dep != NULL) { 1764 if ((dep->de_hash == hash) && 1765 (namlen == dep->de_namelen) && 1766 bcmp(dep->de_name, name, namlen) == 0) { 1767 dep->de_handle = handle; 1768 mutex_exit(&dcap->dca_lock); 1769 return (DFOUND); 1770 } 1771 dep = dep->de_next; 1772 } 1773 if (dcp->dc_complete) { 1774 ncs.ncs_dir_upd_fail.value.ui64++; 1775 ret = DNOENT; 1776 } else { 1777 ret = DNOCACHE; 1778 } 1779 mutex_exit(&dcap->dca_lock); 1780 return (ret); 1781 } else { 1782 mutex_exit(&dcap->dca_lock); 1783 return (DNOCACHE); 1784 } 1785 } 1786 1787 void 1788 dnlc_dir_fini(dcanchor_t *dcap) 1789 { 1790 dircache_t *dcp; 1791 1792 mutex_enter(&dc_head.dch_lock); 1793 mutex_enter(&dcap->dca_lock); 1794 dcp = (dircache_t *)dcap->dca_dircache; 1795 if (VALID_DIR_CACHE(dcp)) { 1796 /* 1797 * Unchain from global list 1798 */ 1799 ncs.ncs_dir_finipurg.value.ui64++; 1800 dcp->dc_prev->dc_next = dcp->dc_next; 1801 dcp->dc_next->dc_prev = dcp->dc_prev; 1802 } else { 1803 dcp = NULL; 1804 } 1805 dcap->dca_dircache = NULL; 1806 mutex_exit(&dcap->dca_lock); 1807 mutex_exit(&dc_head.dch_lock); 1808 mutex_destroy(&dcap->dca_lock); 1809 if (dcp) { 1810 dnlc_dir_abort(dcp); 1811 } 1812 } 1813 1814 /* 1815 * Reclaim callback for dnlc directory caching. 1816 * Invoked by the kernel memory allocator when memory gets tight. 1817 * This is a pretty serious condition and can lead easily lead to system 1818 * hangs if not enough space is returned. 1819 * 1820 * Deciding which directory (or directories) to purge is tricky. 1821 * Purging everything is an overkill, but purging just the oldest used 1822 * was found to lead to hangs. The largest cached directories use the 1823 * most memory, but take the most effort to rebuild, whereas the smaller 1824 * ones have little value and give back little space. So what to do? 1825 * 1826 * The current policy is to continue purging the oldest used directories 1827 * until at least dnlc_dir_min_reclaim directory entries have been purged. 1828 */ 1829 /*ARGSUSED*/ 1830 static void 1831 dnlc_dir_reclaim(void *unused) 1832 { 1833 dircache_t *dcp, *oldest; 1834 uint_t dirent_cnt = 0; 1835 1836 mutex_enter(&dc_head.dch_lock); 1837 while (dirent_cnt < dnlc_dir_min_reclaim) { 1838 dcp = dc_head.dch_next; 1839 oldest = NULL; 1840 while (dcp != (dircache_t *)&dc_head) { 1841 if (oldest == NULL) { 1842 oldest = dcp; 1843 } else { 1844 if (dcp->dc_actime < oldest->dc_actime) { 1845 oldest = dcp; 1846 } 1847 } 1848 dcp = dcp->dc_next; 1849 } 1850 if (oldest == NULL) { 1851 /* nothing to delete */ 1852 mutex_exit(&dc_head.dch_lock); 1853 return; 1854 } 1855 /* 1856 * remove from directory chain and purge 1857 */ 1858 oldest->dc_prev->dc_next = oldest->dc_next; 1859 oldest->dc_next->dc_prev = oldest->dc_prev; 1860 mutex_enter(&oldest->dc_anchor->dca_lock); 1861 /* 1862 * If this was the last entry then it must be too large. 1863 * Mark it as such by saving a special dircache_t 1864 * pointer (DC_RET_LOW_MEM) in the anchor. The error DNOMEM 1865 * will be presented to the caller of dnlc_dir_start() 1866 */ 1867 if (oldest->dc_next == oldest->dc_prev) { 1868 oldest->dc_anchor->dca_dircache = DC_RET_LOW_MEM; 1869 ncs.ncs_dir_rec_last.value.ui64++; 1870 } else { 1871 oldest->dc_anchor->dca_dircache = NULL; 1872 ncs.ncs_dir_recl_any.value.ui64++; 1873 } 1874 mutex_exit(&oldest->dc_anchor->dca_lock); 1875 dirent_cnt += oldest->dc_num_entries; 1876 dnlc_dir_abort(oldest); 1877 } 1878 mutex_exit(&dc_head.dch_lock); 1879 } 1880 1881 /* 1882 * Dynamically grow or shrink the size of the name hash table 1883 */ 1884 static void 1885 dnlc_dir_adjust_nhash(dircache_t *dcp) 1886 { 1887 dcentry_t **newhash, *dep, **nhp, *tep; 1888 uint_t newsize; 1889 uint_t oldsize; 1890 uint_t newsizemask; 1891 int i; 1892 1893 /* 1894 * Allocate new hash table 1895 */ 1896 newsize = dcp->dc_num_entries >> dnlc_dir_hash_size_shift; 1897 newhash = kmem_zalloc(sizeof (dcentry_t *) * newsize, KM_NOSLEEP); 1898 if (newhash == NULL) { 1899 /* 1900 * System is short on memory just return 1901 * Note, the old hash table is still usable. 1902 * This return is unlikely to repeatedy occur, because 1903 * either some other directory caches will be reclaimed 1904 * due to memory shortage, thus freeing memory, or this 1905 * directory cahe will be reclaimed. 1906 */ 1907 return; 1908 } 1909 oldsize = dcp->dc_nhash_mask + 1; 1910 dcp->dc_nhash_mask = newsizemask = newsize - 1; 1911 1912 /* 1913 * Move entries from the old table to the new 1914 */ 1915 for (i = 0; i < oldsize; i++) { /* for each hash bucket */ 1916 dep = dcp->dc_namehash[i]; 1917 while (dep != NULL) { /* for each chained entry */ 1918 tep = dep; 1919 dep = dep->de_next; 1920 nhp = &newhash[tep->de_hash & newsizemask]; 1921 tep->de_next = *nhp; 1922 *nhp = tep; 1923 } 1924 } 1925 1926 /* 1927 * delete old hash table and set new one in place 1928 */ 1929 kmem_free(dcp->dc_namehash, sizeof (dcentry_t *) * oldsize); 1930 dcp->dc_namehash = newhash; 1931 } 1932 1933 /* 1934 * Dynamically grow or shrink the size of the free space hash table 1935 */ 1936 static void 1937 dnlc_dir_adjust_fhash(dircache_t *dcp) 1938 { 1939 dcfree_t **newhash, *dfp, **nhp, *tfp; 1940 uint_t newsize; 1941 uint_t oldsize; 1942 int i; 1943 1944 /* 1945 * Allocate new hash table 1946 */ 1947 newsize = dcp->dc_num_free >> dnlc_dir_hash_size_shift; 1948 newhash = kmem_zalloc(sizeof (dcfree_t *) * newsize, KM_NOSLEEP); 1949 if (newhash == NULL) { 1950 /* 1951 * System is short on memory just return 1952 * Note, the old hash table is still usable. 1953 * This return is unlikely to repeatedy occur, because 1954 * either some other directory caches will be reclaimed 1955 * due to memory shortage, thus freeing memory, or this 1956 * directory cahe will be reclaimed. 1957 */ 1958 return; 1959 } 1960 oldsize = dcp->dc_fhash_mask + 1; 1961 dcp->dc_fhash_mask = newsize - 1; 1962 1963 /* 1964 * Move entries from the old table to the new 1965 */ 1966 for (i = 0; i < oldsize; i++) { /* for each hash bucket */ 1967 dfp = dcp->dc_freehash[i]; 1968 while (dfp != NULL) { /* for each chained entry */ 1969 tfp = dfp; 1970 dfp = dfp->df_next; 1971 nhp = &newhash[DDFHASH(tfp->df_handle, dcp)]; 1972 tfp->df_next = *nhp; 1973 *nhp = tfp; 1974 } 1975 } 1976 1977 /* 1978 * delete old hash table and set new one in place 1979 */ 1980 kmem_free(dcp->dc_freehash, sizeof (dcfree_t *) * oldsize); 1981 dcp->dc_freehash = newhash; 1982 } 1983