xref: /titanic_50/usr/src/uts/common/fs/dnlc.c (revision 1767006bb066ef500b90b432fba79d63d0d09b36)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
26 /*	  All Rights Reserved  	*/
27 
28 /*
29  * University Copyright- Copyright (c) 1982, 1986, 1988
30  * The Regents of the University of California
31  * All Rights Reserved
32  *
33  * University Acknowledgment- Portions of this document are derived from
34  * software developed by the University of California, Berkeley, and its
35  * contributors.
36  */
37 
38 #include <sys/types.h>
39 #include <sys/systm.h>
40 #include <sys/param.h>
41 #include <sys/t_lock.h>
42 #include <sys/systm.h>
43 #include <sys/vfs.h>
44 #include <sys/vnode.h>
45 #include <sys/dnlc.h>
46 #include <sys/kmem.h>
47 #include <sys/cmn_err.h>
48 #include <sys/vtrace.h>
49 #include <sys/bitmap.h>
50 #include <sys/var.h>
51 #include <sys/sysmacros.h>
52 #include <sys/kstat.h>
53 #include <sys/atomic.h>
54 #include <sys/taskq.h>
55 
56 /*
57  * Directory name lookup cache.
58  * Based on code originally done by Robert Elz at Melbourne.
59  *
60  * Names found by directory scans are retained in a cache
61  * for future reference.  Each hash chain is ordered by LRU
62  * Cache is indexed by hash value obtained from (vp, name)
63  * where the vp refers to the directory containing the name.
64  */
65 
66 /*
67  * We want to be able to identify files that are referenced only by the DNLC.
68  * When adding a reference from the DNLC, call VN_HOLD_DNLC instead of VN_HOLD,
69  * since multiple DNLC references should only be counted once in v_count. This
70  * file contains only two(2) calls to VN_HOLD, renamed VN_HOLD_CALLER in the
71  * hope that no one will mistakenly add a VN_HOLD to this file. (Unfortunately
72  * it is not possible to #undef VN_HOLD and retain VN_HOLD_CALLER. Ideally a
73  * Makefile rule would grep uncommented C tokens to check that VN_HOLD is
74  * referenced only once in this file, to define VN_HOLD_CALLER.)
75  */
76 #define	VN_HOLD_CALLER	VN_HOLD
77 #define	VN_HOLD_DNLC(vp)	{	\
78 	mutex_enter(&(vp)->v_lock);	\
79 	if ((vp)->v_count_dnlc == 0)	\
80 		(vp)->v_count++;	\
81 	(vp)->v_count_dnlc++;		\
82 	mutex_exit(&(vp)->v_lock);	\
83 }
84 #define	VN_RELE_DNLC(vp)	{	\
85 	vn_rele_dnlc(vp);		\
86 }
87 
88 /*
89  * Tunable nc_hashavelen is the average length desired for this chain, from
90  * which the size of the nc_hash table is derived at create time.
91  */
92 #define	NC_HASHAVELEN_DEFAULT	4
93 int nc_hashavelen = NC_HASHAVELEN_DEFAULT;
94 
95 /*
96  * NC_MOVETOFRONT is the move-to-front threshold: if the hash lookup
97  * depth exceeds this value, we move the looked-up entry to the front of
98  * its hash chain.  The idea is to make sure that the most frequently
99  * accessed entries are found most quickly (by keeping them near the
100  * front of their hash chains).
101  */
102 #define	NC_MOVETOFRONT	2
103 
104 /*
105  *
106  * DNLC_MAX_RELE is used to size an array on the stack when releasing
107  * vnodes. This array is used rather than calling VN_RELE() inline because
108  * all dnlc locks must be dropped by that time in order to avoid a
109  * possible deadlock. This deadlock occurs when the dnlc holds the last
110  * reference to the vnode and so the VOP_INACTIVE vector is called which
111  * can in turn call back into the dnlc. A global array was used but had
112  * many problems:
113  *	1) Actually doesn't have an upper bound on the array size as
114  *	   entries can be added after starting the purge.
115  *	2) The locking scheme causes a hang.
116  *	3) Caused serialisation on the global lock.
117  *	4) The array was often unnecessarily huge.
118  *
119  * Note the current value 8 allows up to 4 cache entries (to be purged
120  * from each hash chain), before having to cycle around and retry.
121  * This ought to be ample given that nc_hashavelen is typically very small.
122  */
123 #define	DNLC_MAX_RELE	8 /* must be even */
124 
125 /*
126  * Hash table of name cache entries for fast lookup, dynamically
127  * allocated at startup.
128  */
129 nc_hash_t *nc_hash;
130 
131 /*
132  * Rotors. Used to select entries on a round-robin basis.
133  */
134 static nc_hash_t *dnlc_purge_fs1_rotor;
135 static nc_hash_t *dnlc_free_rotor;
136 
137 /*
138  * # of dnlc entries (uninitialized)
139  *
140  * the initial value was chosen as being
141  * a random string of bits, probably not
142  * normally chosen by a systems administrator
143  */
144 int ncsize = -1;
145 volatile uint32_t dnlc_nentries = 0;	/* current num of name cache entries */
146 static int nc_hashsz;			/* size of hash table */
147 static int nc_hashmask;			/* size of hash table minus 1 */
148 
149 /*
150  * The dnlc_reduce_cache() taskq queue is activated when there are
151  * ncsize name cache entries and if no parameter is provided, it reduces
152  * the size down to dnlc_nentries_low_water, which is by default one
153  * hundreth less (or 99%) of ncsize.
154  *
155  * If a parameter is provided to dnlc_reduce_cache(), then we reduce
156  * the size down based on ncsize_onepercent - where ncsize_onepercent
157  * is 1% of ncsize; however, we never let dnlc_reduce_cache() reduce
158  * the size below 3% of ncsize (ncsize_min_percent).
159  */
160 #define	DNLC_LOW_WATER_DIVISOR_DEFAULT 100
161 uint_t dnlc_low_water_divisor = DNLC_LOW_WATER_DIVISOR_DEFAULT;
162 uint_t dnlc_nentries_low_water;
163 int dnlc_reduce_idle = 1; /* no locking needed */
164 uint_t ncsize_onepercent;
165 uint_t ncsize_min_percent;
166 
167 /*
168  * If dnlc_nentries hits dnlc_max_nentries (twice ncsize)
169  * then this means the dnlc_reduce_cache() taskq is failing to
170  * keep up. In this case we refuse to add new entries to the dnlc
171  * until the taskq catches up.
172  */
173 uint_t dnlc_max_nentries; /* twice ncsize */
174 uint64_t dnlc_max_nentries_cnt = 0; /* statistic on times we failed */
175 
176 /*
177  * Tunable to define when we should just remove items from
178  * the end of the chain.
179  */
180 #define	DNLC_LONG_CHAIN 8
181 uint_t dnlc_long_chain = DNLC_LONG_CHAIN;
182 
183 /*
184  * ncstats has been deprecated, due to the integer size of the counters
185  * which can easily overflow in the dnlc.
186  * It is maintained (at some expense) for compatability.
187  * The preferred interface is the kstat accessible nc_stats below.
188  */
189 struct ncstats ncstats;
190 
191 struct nc_stats ncs = {
192 	{ "hits",			KSTAT_DATA_UINT64 },
193 	{ "misses",			KSTAT_DATA_UINT64 },
194 	{ "negative_cache_hits",	KSTAT_DATA_UINT64 },
195 	{ "enters",			KSTAT_DATA_UINT64 },
196 	{ "double_enters",		KSTAT_DATA_UINT64 },
197 	{ "purge_total_entries",	KSTAT_DATA_UINT64 },
198 	{ "purge_all",			KSTAT_DATA_UINT64 },
199 	{ "purge_vp",			KSTAT_DATA_UINT64 },
200 	{ "purge_vfs",			KSTAT_DATA_UINT64 },
201 	{ "purge_fs1",			KSTAT_DATA_UINT64 },
202 	{ "pick_free",			KSTAT_DATA_UINT64 },
203 	{ "pick_heuristic",		KSTAT_DATA_UINT64 },
204 	{ "pick_last",			KSTAT_DATA_UINT64 },
205 
206 	/* directory caching stats */
207 
208 	{ "dir_hits",			KSTAT_DATA_UINT64 },
209 	{ "dir_misses",			KSTAT_DATA_UINT64 },
210 	{ "dir_cached_current",		KSTAT_DATA_UINT64 },
211 	{ "dir_entries_cached_current",	KSTAT_DATA_UINT64 },
212 	{ "dir_cached_total",		KSTAT_DATA_UINT64 },
213 	{ "dir_start_no_memory",	KSTAT_DATA_UINT64 },
214 	{ "dir_add_no_memory",		KSTAT_DATA_UINT64 },
215 	{ "dir_add_abort",		KSTAT_DATA_UINT64 },
216 	{ "dir_add_max",		KSTAT_DATA_UINT64 },
217 	{ "dir_remove_entry_fail",	KSTAT_DATA_UINT64 },
218 	{ "dir_remove_space_fail",	KSTAT_DATA_UINT64 },
219 	{ "dir_update_fail",		KSTAT_DATA_UINT64 },
220 	{ "dir_fini_purge",		KSTAT_DATA_UINT64 },
221 	{ "dir_reclaim_last",		KSTAT_DATA_UINT64 },
222 	{ "dir_reclaim_any",		KSTAT_DATA_UINT64 },
223 };
224 
225 static int doingcache = 1;
226 
227 vnode_t negative_cache_vnode;
228 
229 /*
230  * Insert entry at the front of the queue
231  */
232 #define	nc_inshash(ncp, hp) \
233 { \
234 	(ncp)->hash_next = (hp)->hash_next; \
235 	(ncp)->hash_prev = (ncache_t *)(hp); \
236 	(hp)->hash_next->hash_prev = (ncp); \
237 	(hp)->hash_next = (ncp); \
238 }
239 
240 /*
241  * Remove entry from hash queue
242  */
243 #define	nc_rmhash(ncp) \
244 { \
245 	(ncp)->hash_prev->hash_next = (ncp)->hash_next; \
246 	(ncp)->hash_next->hash_prev = (ncp)->hash_prev; \
247 	(ncp)->hash_prev = NULL; \
248 	(ncp)->hash_next = NULL; \
249 }
250 
251 /*
252  * Free an entry.
253  */
254 #define	dnlc_free(ncp) \
255 { \
256 	kmem_free((ncp), sizeof (ncache_t) + (ncp)->namlen); \
257 	atomic_dec_32(&dnlc_nentries); \
258 }
259 
260 
261 /*
262  * Cached directory info.
263  * ======================
264  */
265 
266 /*
267  * Cached directory free space hash function.
268  * Needs the free space handle and the dcp to get the hash table size
269  * Returns the hash index.
270  */
271 #define	DDFHASH(handle, dcp) ((handle >> 2) & (dcp)->dc_fhash_mask)
272 
273 /*
274  * Cached directory name entry hash function.
275  * Uses the name and returns in the input arguments the hash and the name
276  * length.
277  */
278 #define	DNLC_DIR_HASH(name, hash, namelen)			\
279 	{							\
280 		char Xc;					\
281 		const char *Xcp;				\
282 		hash = *name;					\
283 		for (Xcp = (name + 1); (Xc = *Xcp) != 0; Xcp++)	\
284 			hash = (hash << 4) + hash + Xc;		\
285 		ASSERT((Xcp - (name)) <= ((1 << NBBY) - 1));	\
286 		namelen = Xcp - (name);				\
287 	}
288 
289 /* special dircache_t pointer to indicate error should be returned */
290 /*
291  * The anchor directory cache pointer can contain 3 types of values,
292  * 1) NULL: No directory cache
293  * 2) DC_RET_LOW_MEM (-1): There was a directory cache that found to be
294  *    too big or a memory shortage occurred. This value remains in the
295  *    pointer until a dnlc_dir_start() which returns the a DNOMEM error.
296  *    This is kludgy but efficient and only visible in this source file.
297  * 3) A valid cache pointer.
298  */
299 #define	DC_RET_LOW_MEM (dircache_t *)1
300 #define	VALID_DIR_CACHE(dcp) ((dircache_t *)(dcp) > DC_RET_LOW_MEM)
301 
302 /* Tunables */
303 uint_t dnlc_dir_enable = 1; /* disable caching directories by setting to 0 */
304 uint_t dnlc_dir_min_size = 40; /* min no of directory entries before caching */
305 uint_t dnlc_dir_max_size = UINT_MAX; /* ditto maximum */
306 uint_t dnlc_dir_hash_size_shift = 3; /* 8 entries per hash bucket */
307 uint_t dnlc_dir_min_reclaim =  350000; /* approx 1MB of dcentrys */
308 /*
309  * dnlc_dir_hash_resize_shift determines when the hash tables
310  * get re-adjusted due to growth or shrinkage
311  * - currently 2 indicating that there can be at most 4
312  * times or at least one quarter the number of entries
313  * before hash table readjustment. Note that with
314  * dnlc_dir_hash_size_shift above set at 3 this would
315  * mean readjustment would occur if the average number
316  * of entries went above 32 or below 2
317  */
318 uint_t dnlc_dir_hash_resize_shift = 2; /* readjust rate */
319 
320 static kmem_cache_t *dnlc_dir_space_cache; /* free space entry cache */
321 static dchead_t dc_head; /* anchor of cached directories */
322 
323 /* Prototypes */
324 static ncache_t *dnlc_get(uchar_t namlen);
325 static ncache_t *dnlc_search(vnode_t *dp, const char *name, uchar_t namlen,
326     int hash);
327 static void dnlc_dir_reclaim(void *unused);
328 static void dnlc_dir_abort(dircache_t *dcp);
329 static void dnlc_dir_adjust_fhash(dircache_t *dcp);
330 static void dnlc_dir_adjust_nhash(dircache_t *dcp);
331 static void do_dnlc_reduce_cache(void *);
332 
333 
334 /*
335  * Initialize the directory cache.
336  */
337 void
338 dnlc_init()
339 {
340 	nc_hash_t *hp;
341 	kstat_t *ksp;
342 	int i;
343 
344 	/*
345 	 * Set up the size of the dnlc (ncsize) and its low water mark.
346 	 */
347 	if (ncsize == -1) {
348 		/* calculate a reasonable size for the low water */
349 		dnlc_nentries_low_water = 4 * (v.v_proc + maxusers) + 320;
350 		ncsize = dnlc_nentries_low_water +
351 		    (dnlc_nentries_low_water / dnlc_low_water_divisor);
352 	} else {
353 		/* don't change the user specified ncsize */
354 		dnlc_nentries_low_water =
355 		    ncsize - (ncsize / dnlc_low_water_divisor);
356 	}
357 	if (ncsize <= 0) {
358 		doingcache = 0;
359 		dnlc_dir_enable = 0; /* also disable directory caching */
360 		ncsize = 0;
361 		cmn_err(CE_NOTE, "name cache (dnlc) disabled");
362 		return;
363 	}
364 	dnlc_max_nentries = ncsize * 2;
365 	ncsize_onepercent = ncsize / 100;
366 	ncsize_min_percent = ncsize_onepercent * 3;
367 
368 	/*
369 	 * Initialise the hash table.
370 	 * Compute hash size rounding to the next power of two.
371 	 */
372 	nc_hashsz = ncsize / nc_hashavelen;
373 	nc_hashsz = 1 << highbit(nc_hashsz);
374 	nc_hashmask = nc_hashsz - 1;
375 	nc_hash = kmem_zalloc(nc_hashsz * sizeof (*nc_hash), KM_SLEEP);
376 	for (i = 0; i < nc_hashsz; i++) {
377 		hp = (nc_hash_t *)&nc_hash[i];
378 		mutex_init(&hp->hash_lock, NULL, MUTEX_DEFAULT, NULL);
379 		hp->hash_next = (ncache_t *)hp;
380 		hp->hash_prev = (ncache_t *)hp;
381 	}
382 
383 	/*
384 	 * Initialize rotors
385 	 */
386 	dnlc_free_rotor = dnlc_purge_fs1_rotor = &nc_hash[0];
387 
388 	/*
389 	 * Set up the directory caching to use kmem_cache_alloc
390 	 * for its free space entries so that we can get a callback
391 	 * when the system is short on memory, to allow us to free
392 	 * up some memory. we don't use the constructor/deconstructor
393 	 * functions.
394 	 */
395 	dnlc_dir_space_cache = kmem_cache_create("dnlc_space_cache",
396 	    sizeof (dcfree_t), 0, NULL, NULL, dnlc_dir_reclaim, NULL,
397 	    NULL, 0);
398 
399 	/*
400 	 * Initialise the head of the cached directory structures
401 	 */
402 	mutex_init(&dc_head.dch_lock, NULL, MUTEX_DEFAULT, NULL);
403 	dc_head.dch_next = (dircache_t *)&dc_head;
404 	dc_head.dch_prev = (dircache_t *)&dc_head;
405 
406 	/*
407 	 * Initialise the reference count of the negative cache vnode to 1
408 	 * so that it never goes away (VOP_INACTIVE isn't called on it).
409 	 */
410 	negative_cache_vnode.v_count = 1;
411 	negative_cache_vnode.v_count_dnlc = 0;
412 
413 	/*
414 	 * Initialise kstats - both the old compatability raw kind and
415 	 * the more extensive named stats.
416 	 */
417 	ksp = kstat_create("unix", 0, "ncstats", "misc", KSTAT_TYPE_RAW,
418 	    sizeof (struct ncstats), KSTAT_FLAG_VIRTUAL);
419 	if (ksp) {
420 		ksp->ks_data = (void *) &ncstats;
421 		kstat_install(ksp);
422 	}
423 	ksp = kstat_create("unix", 0, "dnlcstats", "misc", KSTAT_TYPE_NAMED,
424 	    sizeof (ncs) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
425 	if (ksp) {
426 		ksp->ks_data = (void *) &ncs;
427 		kstat_install(ksp);
428 	}
429 }
430 
431 /*
432  * Add a name to the directory cache.
433  */
434 void
435 dnlc_enter(vnode_t *dp, const char *name, vnode_t *vp)
436 {
437 	ncache_t *ncp;
438 	nc_hash_t *hp;
439 	uchar_t namlen;
440 	int hash;
441 
442 	TRACE_0(TR_FAC_NFS, TR_DNLC_ENTER_START, "dnlc_enter_start:");
443 
444 	if (!doingcache) {
445 		TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
446 		    "dnlc_enter_end:(%S) %d", "not caching", 0);
447 		return;
448 	}
449 
450 	/*
451 	 * Get a new dnlc entry. Assume the entry won't be in the cache
452 	 * and initialize it now
453 	 */
454 	DNLCHASH(name, dp, hash, namlen);
455 	if ((ncp = dnlc_get(namlen)) == NULL)
456 		return;
457 	ncp->dp = dp;
458 	VN_HOLD_DNLC(dp);
459 	ncp->vp = vp;
460 	VN_HOLD_DNLC(vp);
461 	bcopy(name, ncp->name, namlen + 1); /* name and null */
462 	ncp->hash = hash;
463 	hp = &nc_hash[hash & nc_hashmask];
464 
465 	mutex_enter(&hp->hash_lock);
466 	if (dnlc_search(dp, name, namlen, hash) != NULL) {
467 		mutex_exit(&hp->hash_lock);
468 		ncstats.dbl_enters++;
469 		ncs.ncs_dbl_enters.value.ui64++;
470 		VN_RELE_DNLC(dp);
471 		VN_RELE_DNLC(vp);
472 		dnlc_free(ncp);		/* crfree done here */
473 		TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
474 		    "dnlc_enter_end:(%S) %d", "dbl enter", ncstats.dbl_enters);
475 		return;
476 	}
477 	/*
478 	 * Insert back into the hash chain.
479 	 */
480 	nc_inshash(ncp, hp);
481 	mutex_exit(&hp->hash_lock);
482 	ncstats.enters++;
483 	ncs.ncs_enters.value.ui64++;
484 	TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
485 	    "dnlc_enter_end:(%S) %d", "done", ncstats.enters);
486 }
487 
488 /*
489  * Add a name to the directory cache.
490  *
491  * This function is basically identical with
492  * dnlc_enter().  The difference is that when the
493  * desired dnlc entry is found, the vnode in the
494  * ncache is compared with the vnode passed in.
495  *
496  * If they are not equal then the ncache is
497  * updated with the passed in vnode.  Otherwise
498  * it just frees up the newly allocated dnlc entry.
499  */
500 void
501 dnlc_update(vnode_t *dp, const char *name, vnode_t *vp)
502 {
503 	ncache_t *ncp;
504 	ncache_t *tcp;
505 	vnode_t *tvp;
506 	nc_hash_t *hp;
507 	int hash;
508 	uchar_t namlen;
509 
510 	TRACE_0(TR_FAC_NFS, TR_DNLC_ENTER_START, "dnlc_update_start:");
511 
512 	if (!doingcache) {
513 		TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
514 		    "dnlc_update_end:(%S) %d", "not caching", 0);
515 		return;
516 	}
517 
518 	/*
519 	 * Get a new dnlc entry and initialize it now.
520 	 * If we fail to get a new entry, call dnlc_remove() to purge
521 	 * any existing dnlc entry including negative cache (DNLC_NO_VNODE)
522 	 * entry.
523 	 * Failure to clear an existing entry could result in false dnlc
524 	 * lookup (negative/stale entry).
525 	 */
526 	DNLCHASH(name, dp, hash, namlen);
527 	if ((ncp = dnlc_get(namlen)) == NULL) {
528 		dnlc_remove(dp, name);
529 		return;
530 	}
531 	ncp->dp = dp;
532 	VN_HOLD_DNLC(dp);
533 	ncp->vp = vp;
534 	VN_HOLD_DNLC(vp);
535 	bcopy(name, ncp->name, namlen + 1); /* name and null */
536 	ncp->hash = hash;
537 	hp = &nc_hash[hash & nc_hashmask];
538 
539 	mutex_enter(&hp->hash_lock);
540 	if ((tcp = dnlc_search(dp, name, namlen, hash)) != NULL) {
541 		if (tcp->vp != vp) {
542 			tvp = tcp->vp;
543 			tcp->vp = vp;
544 			mutex_exit(&hp->hash_lock);
545 			VN_RELE_DNLC(tvp);
546 			ncstats.enters++;
547 			ncs.ncs_enters.value.ui64++;
548 			TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
549 			    "dnlc_update_end:(%S) %d", "done", ncstats.enters);
550 		} else {
551 			mutex_exit(&hp->hash_lock);
552 			VN_RELE_DNLC(vp);
553 			ncstats.dbl_enters++;
554 			ncs.ncs_dbl_enters.value.ui64++;
555 			TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
556 			    "dnlc_update_end:(%S) %d",
557 			    "dbl enter", ncstats.dbl_enters);
558 		}
559 		VN_RELE_DNLC(dp);
560 		dnlc_free(ncp);		/* crfree done here */
561 		return;
562 	}
563 	/*
564 	 * insert the new entry, since it is not in dnlc yet
565 	 */
566 	nc_inshash(ncp, hp);
567 	mutex_exit(&hp->hash_lock);
568 	ncstats.enters++;
569 	ncs.ncs_enters.value.ui64++;
570 	TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
571 	    "dnlc_update_end:(%S) %d", "done", ncstats.enters);
572 }
573 
574 /*
575  * Look up a name in the directory name cache.
576  *
577  * Return a doubly-held vnode if found: one hold so that it may
578  * remain in the cache for other users, the other hold so that
579  * the cache is not re-cycled and the identity of the vnode is
580  * lost before the caller can use the vnode.
581  */
582 vnode_t *
583 dnlc_lookup(vnode_t *dp, const char *name)
584 {
585 	ncache_t *ncp;
586 	nc_hash_t *hp;
587 	vnode_t *vp;
588 	int hash, depth;
589 	uchar_t namlen;
590 
591 	TRACE_2(TR_FAC_NFS, TR_DNLC_LOOKUP_START,
592 	    "dnlc_lookup_start:dp %x name %s", dp, name);
593 
594 	if (!doingcache) {
595 		TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
596 		    "dnlc_lookup_end:%S %d vp %x name %s",
597 		    "not_caching", 0, NULL, name);
598 		return (NULL);
599 	}
600 
601 	DNLCHASH(name, dp, hash, namlen);
602 	depth = 1;
603 	hp = &nc_hash[hash & nc_hashmask];
604 	mutex_enter(&hp->hash_lock);
605 
606 	for (ncp = hp->hash_next; ncp != (ncache_t *)hp;
607 	    ncp = ncp->hash_next) {
608 		if (ncp->hash == hash &&	/* fast signature check */
609 		    ncp->dp == dp &&
610 		    ncp->namlen == namlen &&
611 		    bcmp(ncp->name, name, namlen) == 0) {
612 			/*
613 			 * Move this entry to the head of its hash chain
614 			 * if it's not already close.
615 			 */
616 			if (depth > NC_MOVETOFRONT) {
617 				ncache_t *next = ncp->hash_next;
618 				ncache_t *prev = ncp->hash_prev;
619 
620 				prev->hash_next = next;
621 				next->hash_prev = prev;
622 				ncp->hash_next = next = hp->hash_next;
623 				ncp->hash_prev = (ncache_t *)hp;
624 				next->hash_prev = ncp;
625 				hp->hash_next = ncp;
626 
627 				ncstats.move_to_front++;
628 			}
629 
630 			/*
631 			 * Put a hold on the vnode now so its identity
632 			 * can't change before the caller has a chance to
633 			 * put a hold on it.
634 			 */
635 			vp = ncp->vp;
636 			VN_HOLD_CALLER(vp); /* VN_HOLD 1 of 2 in this file */
637 			mutex_exit(&hp->hash_lock);
638 			ncstats.hits++;
639 			ncs.ncs_hits.value.ui64++;
640 			if (vp == DNLC_NO_VNODE) {
641 				ncs.ncs_neg_hits.value.ui64++;
642 			}
643 			TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
644 			    "dnlc_lookup_end:%S %d vp %x name %s", "hit",
645 			    ncstats.hits, vp, name);
646 			return (vp);
647 		}
648 		depth++;
649 	}
650 
651 	mutex_exit(&hp->hash_lock);
652 	ncstats.misses++;
653 	ncs.ncs_misses.value.ui64++;
654 	TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
655 	    "dnlc_lookup_end:%S %d vp %x name %s", "miss", ncstats.misses,
656 	    NULL, name);
657 	return (NULL);
658 }
659 
660 /*
661  * Remove an entry in the directory name cache.
662  */
663 void
664 dnlc_remove(vnode_t *dp, const char *name)
665 {
666 	ncache_t *ncp;
667 	nc_hash_t *hp;
668 	uchar_t namlen;
669 	int hash;
670 
671 	if (!doingcache)
672 		return;
673 	DNLCHASH(name, dp, hash, namlen);
674 	hp = &nc_hash[hash & nc_hashmask];
675 
676 	mutex_enter(&hp->hash_lock);
677 	if (ncp = dnlc_search(dp, name, namlen, hash)) {
678 		/*
679 		 * Free up the entry
680 		 */
681 		nc_rmhash(ncp);
682 		mutex_exit(&hp->hash_lock);
683 		VN_RELE_DNLC(ncp->vp);
684 		VN_RELE_DNLC(ncp->dp);
685 		dnlc_free(ncp);
686 		return;
687 	}
688 	mutex_exit(&hp->hash_lock);
689 }
690 
691 /*
692  * Purge the entire cache.
693  */
694 void
695 dnlc_purge()
696 {
697 	nc_hash_t *nch;
698 	ncache_t *ncp;
699 	int index;
700 	int i;
701 	vnode_t *nc_rele[DNLC_MAX_RELE];
702 
703 	if (!doingcache)
704 		return;
705 
706 	ncstats.purges++;
707 	ncs.ncs_purge_all.value.ui64++;
708 
709 	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
710 		index = 0;
711 		mutex_enter(&nch->hash_lock);
712 		ncp = nch->hash_next;
713 		while (ncp != (ncache_t *)nch) {
714 			ncache_t *np;
715 
716 			np = ncp->hash_next;
717 			nc_rele[index++] = ncp->vp;
718 			nc_rele[index++] = ncp->dp;
719 
720 			nc_rmhash(ncp);
721 			dnlc_free(ncp);
722 			ncp = np;
723 			ncs.ncs_purge_total.value.ui64++;
724 			if (index == DNLC_MAX_RELE)
725 				break;
726 		}
727 		mutex_exit(&nch->hash_lock);
728 
729 		/* Release holds on all the vnodes now that we have no locks */
730 		for (i = 0; i < index; i++) {
731 			VN_RELE_DNLC(nc_rele[i]);
732 		}
733 		if (ncp != (ncache_t *)nch) {
734 			nch--; /* Do current hash chain again */
735 		}
736 	}
737 }
738 
739 /*
740  * Purge any cache entries referencing a vnode. Exit as soon as the dnlc
741  * reference count goes to zero (the caller still holds a reference).
742  */
743 void
744 dnlc_purge_vp(vnode_t *vp)
745 {
746 	nc_hash_t *nch;
747 	ncache_t *ncp;
748 	int index;
749 	vnode_t *nc_rele[DNLC_MAX_RELE];
750 
751 	ASSERT(vp->v_count > 0);
752 	if (vp->v_count_dnlc == 0) {
753 		return;
754 	}
755 
756 	if (!doingcache)
757 		return;
758 
759 	ncstats.purges++;
760 	ncs.ncs_purge_vp.value.ui64++;
761 
762 	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
763 		index = 0;
764 		mutex_enter(&nch->hash_lock);
765 		ncp = nch->hash_next;
766 		while (ncp != (ncache_t *)nch) {
767 			ncache_t *np;
768 
769 			np = ncp->hash_next;
770 			if (ncp->dp == vp || ncp->vp == vp) {
771 				nc_rele[index++] = ncp->vp;
772 				nc_rele[index++] = ncp->dp;
773 				nc_rmhash(ncp);
774 				dnlc_free(ncp);
775 				ncs.ncs_purge_total.value.ui64++;
776 				if (index == DNLC_MAX_RELE) {
777 					ncp = np;
778 					break;
779 				}
780 			}
781 			ncp = np;
782 		}
783 		mutex_exit(&nch->hash_lock);
784 
785 		/* Release holds on all the vnodes now that we have no locks */
786 		while (index) {
787 			VN_RELE_DNLC(nc_rele[--index]);
788 		}
789 
790 		if (vp->v_count_dnlc == 0) {
791 			return;
792 		}
793 
794 		if (ncp != (ncache_t *)nch) {
795 			nch--; /* Do current hash chain again */
796 		}
797 	}
798 }
799 
800 /*
801  * Purge cache entries referencing a vfsp.  Caller supplies a count
802  * of entries to purge; up to that many will be freed.  A count of
803  * zero indicates that all such entries should be purged.  Returns
804  * the number of entries that were purged.
805  */
806 int
807 dnlc_purge_vfsp(vfs_t *vfsp, int count)
808 {
809 	nc_hash_t *nch;
810 	ncache_t *ncp;
811 	int n = 0;
812 	int index;
813 	int i;
814 	vnode_t *nc_rele[DNLC_MAX_RELE];
815 
816 	if (!doingcache)
817 		return (0);
818 
819 	ncstats.purges++;
820 	ncs.ncs_purge_vfs.value.ui64++;
821 
822 	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
823 		index = 0;
824 		mutex_enter(&nch->hash_lock);
825 		ncp = nch->hash_next;
826 		while (ncp != (ncache_t *)nch) {
827 			ncache_t *np;
828 
829 			np = ncp->hash_next;
830 			ASSERT(ncp->dp != NULL);
831 			ASSERT(ncp->vp != NULL);
832 			if ((ncp->dp->v_vfsp == vfsp) ||
833 			    (ncp->vp->v_vfsp == vfsp)) {
834 				n++;
835 				nc_rele[index++] = ncp->vp;
836 				nc_rele[index++] = ncp->dp;
837 				nc_rmhash(ncp);
838 				dnlc_free(ncp);
839 				ncs.ncs_purge_total.value.ui64++;
840 				if (index == DNLC_MAX_RELE) {
841 					ncp = np;
842 					break;
843 				}
844 				if (count != 0 && n >= count) {
845 					break;
846 				}
847 			}
848 			ncp = np;
849 		}
850 		mutex_exit(&nch->hash_lock);
851 		/* Release holds on all the vnodes now that we have no locks */
852 		for (i = 0; i < index; i++) {
853 			VN_RELE_DNLC(nc_rele[i]);
854 		}
855 		if (count != 0 && n >= count) {
856 			return (n);
857 		}
858 		if (ncp != (ncache_t *)nch) {
859 			nch--; /* Do current hash chain again */
860 		}
861 	}
862 	return (n);
863 }
864 
865 /*
866  * Purge 1 entry from the dnlc that is part of the filesystem(s)
867  * represented by 'vop'. The purpose of this routine is to allow
868  * users of the dnlc to free a vnode that is being held by the dnlc.
869  *
870  * If we find a vnode that we release which will result in
871  * freeing the underlying vnode (count was 1), return 1, 0
872  * if no appropriate vnodes found.
873  *
874  * Note, vop is not the 'right' identifier for a filesystem.
875  */
876 int
877 dnlc_fs_purge1(vnodeops_t *vop)
878 {
879 	nc_hash_t *end;
880 	nc_hash_t *hp;
881 	ncache_t *ncp;
882 	vnode_t *vp;
883 
884 	if (!doingcache)
885 		return (0);
886 
887 	ncs.ncs_purge_fs1.value.ui64++;
888 
889 	/*
890 	 * Scan the dnlc entries looking for a likely candidate.
891 	 */
892 	hp = end = dnlc_purge_fs1_rotor;
893 
894 	do {
895 		if (++hp == &nc_hash[nc_hashsz])
896 			hp = nc_hash;
897 		dnlc_purge_fs1_rotor = hp;
898 		if (hp->hash_next == (ncache_t *)hp)
899 			continue;
900 		mutex_enter(&hp->hash_lock);
901 		for (ncp = hp->hash_prev;
902 		    ncp != (ncache_t *)hp;
903 		    ncp = ncp->hash_prev) {
904 			vp = ncp->vp;
905 			if (!vn_has_cached_data(vp) && (vp->v_count == 1) &&
906 			    vn_matchops(vp, vop))
907 				break;
908 		}
909 		if (ncp != (ncache_t *)hp) {
910 			nc_rmhash(ncp);
911 			mutex_exit(&hp->hash_lock);
912 			VN_RELE_DNLC(ncp->dp);
913 			VN_RELE_DNLC(vp)
914 			dnlc_free(ncp);
915 			ncs.ncs_purge_total.value.ui64++;
916 			return (1);
917 		}
918 		mutex_exit(&hp->hash_lock);
919 	} while (hp != end);
920 	return (0);
921 }
922 
923 /*
924  * Perform a reverse lookup in the DNLC.  This will find the first occurrence of
925  * the vnode.  If successful, it will return the vnode of the parent, and the
926  * name of the entry in the given buffer.  If it cannot be found, or the buffer
927  * is too small, then it will return NULL.  Note that this is a highly
928  * inefficient function, since the DNLC is constructed solely for forward
929  * lookups.
930  */
931 vnode_t *
932 dnlc_reverse_lookup(vnode_t *vp, char *buf, size_t buflen)
933 {
934 	nc_hash_t *nch;
935 	ncache_t *ncp;
936 	vnode_t *pvp;
937 
938 	if (!doingcache)
939 		return (NULL);
940 
941 	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
942 		mutex_enter(&nch->hash_lock);
943 		ncp = nch->hash_next;
944 		while (ncp != (ncache_t *)nch) {
945 			/*
946 			 * We ignore '..' entries since it can create
947 			 * confusion and infinite loops.
948 			 */
949 			if (ncp->vp == vp && !(ncp->namlen == 2 &&
950 			    0 == bcmp(ncp->name, "..", 2)) &&
951 			    ncp->namlen < buflen) {
952 				bcopy(ncp->name, buf, ncp->namlen);
953 				buf[ncp->namlen] = '\0';
954 				pvp = ncp->dp;
955 				/* VN_HOLD 2 of 2 in this file */
956 				VN_HOLD_CALLER(pvp);
957 				mutex_exit(&nch->hash_lock);
958 				return (pvp);
959 			}
960 			ncp = ncp->hash_next;
961 		}
962 		mutex_exit(&nch->hash_lock);
963 	}
964 
965 	return (NULL);
966 }
967 /*
968  * Utility routine to search for a cache entry. Return the
969  * ncache entry if found, NULL otherwise.
970  */
971 static ncache_t *
972 dnlc_search(vnode_t *dp, const char *name, uchar_t namlen, int hash)
973 {
974 	nc_hash_t *hp;
975 	ncache_t *ncp;
976 
977 	hp = &nc_hash[hash & nc_hashmask];
978 
979 	for (ncp = hp->hash_next; ncp != (ncache_t *)hp; ncp = ncp->hash_next) {
980 		if (ncp->hash == hash &&
981 		    ncp->dp == dp &&
982 		    ncp->namlen == namlen &&
983 		    bcmp(ncp->name, name, namlen) == 0)
984 			return (ncp);
985 	}
986 	return (NULL);
987 }
988 
989 #if ((1 << NBBY) - 1) < (MAXNAMELEN - 1)
990 #error ncache_t name length representation is too small
991 #endif
992 
993 void
994 dnlc_reduce_cache(void *reduce_percent)
995 {
996 	if (dnlc_reduce_idle && (dnlc_nentries >= ncsize || reduce_percent)) {
997 		dnlc_reduce_idle = 0;
998 		if ((taskq_dispatch(system_taskq, do_dnlc_reduce_cache,
999 		    reduce_percent, TQ_NOSLEEP)) == NULL)
1000 			dnlc_reduce_idle = 1;
1001 	}
1002 }
1003 
1004 /*
1005  * Get a new name cache entry.
1006  * If the dnlc_reduce_cache() taskq isn't keeping up with demand, or memory
1007  * is short then just return NULL. If we're over ncsize then kick off a
1008  * thread to free some in use entries down to dnlc_nentries_low_water.
1009  * Caller must initialise all fields except namlen.
1010  * Component names are defined to be less than MAXNAMELEN
1011  * which includes a null.
1012  */
1013 static ncache_t *
1014 dnlc_get(uchar_t namlen)
1015 {
1016 	ncache_t *ncp;
1017 
1018 	if (dnlc_nentries > dnlc_max_nentries) {
1019 		dnlc_max_nentries_cnt++; /* keep a statistic */
1020 		return (NULL);
1021 	}
1022 	ncp = kmem_alloc(sizeof (ncache_t) + namlen, KM_NOSLEEP);
1023 	if (ncp == NULL) {
1024 		return (NULL);
1025 	}
1026 	ncp->namlen = namlen;
1027 	atomic_inc_32(&dnlc_nentries);
1028 	dnlc_reduce_cache(NULL);
1029 	return (ncp);
1030 }
1031 
1032 /*
1033  * Taskq routine to free up name cache entries to reduce the
1034  * cache size to the low water mark if "reduce_percent" is not provided.
1035  * If "reduce_percent" is provided, reduce cache size by
1036  * (ncsize_onepercent * reduce_percent).
1037  */
1038 /*ARGSUSED*/
1039 static void
1040 do_dnlc_reduce_cache(void *reduce_percent)
1041 {
1042 	nc_hash_t *hp = dnlc_free_rotor, *start_hp = hp;
1043 	vnode_t *vp;
1044 	ncache_t *ncp;
1045 	int cnt;
1046 	uint_t low_water = dnlc_nentries_low_water;
1047 
1048 	if (reduce_percent) {
1049 		uint_t reduce_cnt;
1050 
1051 		/*
1052 		 * Never try to reduce the current number
1053 		 * of cache entries below 3% of ncsize.
1054 		 */
1055 		if (dnlc_nentries <= ncsize_min_percent) {
1056 			dnlc_reduce_idle = 1;
1057 			return;
1058 		}
1059 		reduce_cnt = ncsize_onepercent *
1060 		    (uint_t)(uintptr_t)reduce_percent;
1061 
1062 		if (reduce_cnt > dnlc_nentries ||
1063 		    dnlc_nentries - reduce_cnt < ncsize_min_percent)
1064 			low_water = ncsize_min_percent;
1065 		else
1066 			low_water = dnlc_nentries - reduce_cnt;
1067 	}
1068 
1069 	do {
1070 		/*
1071 		 * Find the first non empty hash queue without locking.
1072 		 * Only look at each hash queue once to avoid an infinite loop.
1073 		 */
1074 		do {
1075 			if (++hp == &nc_hash[nc_hashsz])
1076 				hp = nc_hash;
1077 		} while (hp->hash_next == (ncache_t *)hp && hp != start_hp);
1078 
1079 		/* return if all hash queues are empty. */
1080 		if (hp->hash_next == (ncache_t *)hp) {
1081 			dnlc_reduce_idle = 1;
1082 			return;
1083 		}
1084 
1085 		mutex_enter(&hp->hash_lock);
1086 		for (cnt = 0, ncp = hp->hash_prev; ncp != (ncache_t *)hp;
1087 		    ncp = ncp->hash_prev, cnt++) {
1088 			vp = ncp->vp;
1089 			/*
1090 			 * A name cache entry with a reference count
1091 			 * of one is only referenced by the dnlc.
1092 			 * Also negative cache entries are purged first.
1093 			 */
1094 			if (!vn_has_cached_data(vp) &&
1095 			    ((vp->v_count == 1) || (vp == DNLC_NO_VNODE))) {
1096 				ncs.ncs_pick_heur.value.ui64++;
1097 				goto found;
1098 			}
1099 			/*
1100 			 * Remove from the end of the chain if the
1101 			 * chain is too long
1102 			 */
1103 			if (cnt > dnlc_long_chain) {
1104 				ncp = hp->hash_prev;
1105 				ncs.ncs_pick_last.value.ui64++;
1106 				vp = ncp->vp;
1107 				goto found;
1108 			}
1109 		}
1110 		/* check for race and continue */
1111 		if (hp->hash_next == (ncache_t *)hp) {
1112 			mutex_exit(&hp->hash_lock);
1113 			continue;
1114 		}
1115 
1116 		ncp = hp->hash_prev; /* pick the last one in the hash queue */
1117 		ncs.ncs_pick_last.value.ui64++;
1118 		vp = ncp->vp;
1119 found:
1120 		/*
1121 		 * Remove from hash chain.
1122 		 */
1123 		nc_rmhash(ncp);
1124 		mutex_exit(&hp->hash_lock);
1125 		VN_RELE_DNLC(vp);
1126 		VN_RELE_DNLC(ncp->dp);
1127 		dnlc_free(ncp);
1128 	} while (dnlc_nentries > low_water);
1129 
1130 	dnlc_free_rotor = hp;
1131 	dnlc_reduce_idle = 1;
1132 }
1133 
1134 /*
1135  * Directory caching routines
1136  * ==========================
1137  *
1138  * See dnlc.h for details of the interfaces below.
1139  */
1140 
1141 /*
1142  * Lookup up an entry in a complete or partial directory cache.
1143  */
1144 dcret_t
1145 dnlc_dir_lookup(dcanchor_t *dcap, const char *name, uint64_t *handle)
1146 {
1147 	dircache_t *dcp;
1148 	dcentry_t *dep;
1149 	int hash;
1150 	int ret;
1151 	uchar_t namlen;
1152 
1153 	/*
1154 	 * can test without lock as we are only a cache
1155 	 */
1156 	if (!VALID_DIR_CACHE(dcap->dca_dircache)) {
1157 		ncs.ncs_dir_misses.value.ui64++;
1158 		return (DNOCACHE);
1159 	}
1160 
1161 	if (!dnlc_dir_enable) {
1162 		return (DNOCACHE);
1163 	}
1164 
1165 	mutex_enter(&dcap->dca_lock);
1166 	dcp = (dircache_t *)dcap->dca_dircache;
1167 	if (VALID_DIR_CACHE(dcp)) {
1168 		dcp->dc_actime = ddi_get_lbolt64();
1169 		DNLC_DIR_HASH(name, hash, namlen);
1170 		dep = dcp->dc_namehash[hash & dcp->dc_nhash_mask];
1171 		while (dep != NULL) {
1172 			if ((dep->de_hash == hash) &&
1173 			    (namlen == dep->de_namelen) &&
1174 			    bcmp(dep->de_name, name, namlen) == 0) {
1175 				*handle = dep->de_handle;
1176 				mutex_exit(&dcap->dca_lock);
1177 				ncs.ncs_dir_hits.value.ui64++;
1178 				return (DFOUND);
1179 			}
1180 			dep = dep->de_next;
1181 		}
1182 		if (dcp->dc_complete) {
1183 			ret = DNOENT;
1184 		} else {
1185 			ret = DNOCACHE;
1186 		}
1187 		mutex_exit(&dcap->dca_lock);
1188 		return (ret);
1189 	} else {
1190 		mutex_exit(&dcap->dca_lock);
1191 		ncs.ncs_dir_misses.value.ui64++;
1192 		return (DNOCACHE);
1193 	}
1194 }
1195 
1196 /*
1197  * Start a new directory cache. An estimate of the number of
1198  * entries is provided to as a quick check to ensure the directory
1199  * is cacheable.
1200  */
1201 dcret_t
1202 dnlc_dir_start(dcanchor_t *dcap, uint_t num_entries)
1203 {
1204 	dircache_t *dcp;
1205 
1206 	if (!dnlc_dir_enable ||
1207 	    (num_entries < dnlc_dir_min_size)) {
1208 		return (DNOCACHE);
1209 	}
1210 
1211 	if (num_entries > dnlc_dir_max_size) {
1212 		return (DTOOBIG);
1213 	}
1214 
1215 	mutex_enter(&dc_head.dch_lock);
1216 	mutex_enter(&dcap->dca_lock);
1217 
1218 	if (dcap->dca_dircache == DC_RET_LOW_MEM) {
1219 		dcap->dca_dircache = NULL;
1220 		mutex_exit(&dcap->dca_lock);
1221 		mutex_exit(&dc_head.dch_lock);
1222 		return (DNOMEM);
1223 	}
1224 
1225 	/*
1226 	 * Check if there's currently a cache.
1227 	 * This probably only occurs on a race.
1228 	 */
1229 	if (dcap->dca_dircache != NULL) {
1230 		mutex_exit(&dcap->dca_lock);
1231 		mutex_exit(&dc_head.dch_lock);
1232 		return (DNOCACHE);
1233 	}
1234 
1235 	/*
1236 	 * Allocate the dircache struct, entry and free space hash tables.
1237 	 * These tables are initially just one entry but dynamically resize
1238 	 * when entries and free space are added or removed.
1239 	 */
1240 	if ((dcp = kmem_zalloc(sizeof (dircache_t), KM_NOSLEEP)) == NULL) {
1241 		goto error;
1242 	}
1243 	if ((dcp->dc_namehash = kmem_zalloc(sizeof (dcentry_t *),
1244 	    KM_NOSLEEP)) == NULL) {
1245 		goto error;
1246 	}
1247 	if ((dcp->dc_freehash = kmem_zalloc(sizeof (dcfree_t *),
1248 	    KM_NOSLEEP)) == NULL) {
1249 		goto error;
1250 	}
1251 
1252 	dcp->dc_anchor = dcap; /* set back pointer to anchor */
1253 	dcap->dca_dircache = dcp;
1254 
1255 	/* add into head of global chain */
1256 	dcp->dc_next = dc_head.dch_next;
1257 	dcp->dc_prev = (dircache_t *)&dc_head;
1258 	dcp->dc_next->dc_prev = dcp;
1259 	dc_head.dch_next = dcp;
1260 
1261 	mutex_exit(&dcap->dca_lock);
1262 	mutex_exit(&dc_head.dch_lock);
1263 	ncs.ncs_cur_dirs.value.ui64++;
1264 	ncs.ncs_dirs_cached.value.ui64++;
1265 	return (DOK);
1266 error:
1267 	if (dcp != NULL) {
1268 		if (dcp->dc_namehash) {
1269 			kmem_free(dcp->dc_namehash, sizeof (dcentry_t *));
1270 		}
1271 		kmem_free(dcp, sizeof (dircache_t));
1272 	}
1273 	/*
1274 	 * Must also kmem_free dcp->dc_freehash if more error cases are added
1275 	 */
1276 	mutex_exit(&dcap->dca_lock);
1277 	mutex_exit(&dc_head.dch_lock);
1278 	ncs.ncs_dir_start_nm.value.ui64++;
1279 	return (DNOCACHE);
1280 }
1281 
1282 /*
1283  * Add a directopry entry to a partial or complete directory cache.
1284  */
1285 dcret_t
1286 dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle)
1287 {
1288 	dircache_t *dcp;
1289 	dcentry_t **hp, *dep;
1290 	int hash;
1291 	uint_t capacity;
1292 	uchar_t namlen;
1293 
1294 	/*
1295 	 * Allocate the dcentry struct, including the variable
1296 	 * size name. Note, the null terminator is not copied.
1297 	 *
1298 	 * We do this outside the lock to avoid possible deadlock if
1299 	 * dnlc_dir_reclaim() is called as a result of memory shortage.
1300 	 */
1301 	DNLC_DIR_HASH(name, hash, namlen);
1302 	dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
1303 	if (dep == NULL) {
1304 #ifdef DEBUG
1305 		/*
1306 		 * The kmem allocator generates random failures for
1307 		 * KM_NOSLEEP calls (see KMEM_RANDOM_ALLOCATION_FAILURE)
1308 		 * So try again before we blow away a perfectly good cache.
1309 		 * This is done not to cover an error but purely for
1310 		 * performance running a debug kernel.
1311 		 * This random error only occurs in debug mode.
1312 		 */
1313 		dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
1314 		if (dep != NULL)
1315 			goto ok;
1316 #endif
1317 		ncs.ncs_dir_add_nm.value.ui64++;
1318 		/*
1319 		 * Free a directory cache. This may be the one we are
1320 		 * called with.
1321 		 */
1322 		dnlc_dir_reclaim(NULL);
1323 		dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
1324 		if (dep == NULL) {
1325 			/*
1326 			 * still no memory, better delete this cache
1327 			 */
1328 			mutex_enter(&dcap->dca_lock);
1329 			dcp = (dircache_t *)dcap->dca_dircache;
1330 			if (VALID_DIR_CACHE(dcp)) {
1331 				dnlc_dir_abort(dcp);
1332 				dcap->dca_dircache = DC_RET_LOW_MEM;
1333 			}
1334 			mutex_exit(&dcap->dca_lock);
1335 			ncs.ncs_dir_addabort.value.ui64++;
1336 			return (DNOCACHE);
1337 		}
1338 		/*
1339 		 * fall through as if the 1st kmem_alloc had worked
1340 		 */
1341 	}
1342 #ifdef DEBUG
1343 ok:
1344 #endif
1345 	mutex_enter(&dcap->dca_lock);
1346 	dcp = (dircache_t *)dcap->dca_dircache;
1347 	if (VALID_DIR_CACHE(dcp)) {
1348 		/*
1349 		 * If the total number of entries goes above the max
1350 		 * then free this cache
1351 		 */
1352 		if ((dcp->dc_num_entries + dcp->dc_num_free) >
1353 		    dnlc_dir_max_size) {
1354 			mutex_exit(&dcap->dca_lock);
1355 			dnlc_dir_purge(dcap);
1356 			kmem_free(dep, sizeof (dcentry_t) - 1 + namlen);
1357 			ncs.ncs_dir_add_max.value.ui64++;
1358 			return (DTOOBIG);
1359 		}
1360 		dcp->dc_num_entries++;
1361 		capacity = (dcp->dc_nhash_mask + 1) << dnlc_dir_hash_size_shift;
1362 		if (dcp->dc_num_entries >=
1363 		    (capacity << dnlc_dir_hash_resize_shift)) {
1364 			dnlc_dir_adjust_nhash(dcp);
1365 		}
1366 		hp = &dcp->dc_namehash[hash & dcp->dc_nhash_mask];
1367 
1368 		/*
1369 		 * Initialise and chain in new entry
1370 		 */
1371 		dep->de_handle = handle;
1372 		dep->de_hash = hash;
1373 		/*
1374 		 * Note de_namelen is a uchar_t to conserve space
1375 		 * and alignment padding. The max length of any
1376 		 * pathname component is defined as MAXNAMELEN
1377 		 * which is 256 (including the terminating null).
1378 		 * So provided this doesn't change, we don't include the null,
1379 		 * we always use bcmp to compare strings, and we don't
1380 		 * start storing full names, then we are ok.
1381 		 * The space savings is worth it.
1382 		 */
1383 		dep->de_namelen = namlen;
1384 		bcopy(name, dep->de_name, namlen);
1385 		dep->de_next = *hp;
1386 		*hp = dep;
1387 		dcp->dc_actime = ddi_get_lbolt64();
1388 		mutex_exit(&dcap->dca_lock);
1389 		ncs.ncs_dir_num_ents.value.ui64++;
1390 		return (DOK);
1391 	} else {
1392 		mutex_exit(&dcap->dca_lock);
1393 		kmem_free(dep, sizeof (dcentry_t) - 1 + namlen);
1394 		return (DNOCACHE);
1395 	}
1396 }
1397 
1398 /*
1399  * Add free space to a partial or complete directory cache.
1400  */
1401 dcret_t
1402 dnlc_dir_add_space(dcanchor_t *dcap, uint_t len, uint64_t handle)
1403 {
1404 	dircache_t *dcp;
1405 	dcfree_t *dfp, **hp;
1406 	uint_t capacity;
1407 
1408 	/*
1409 	 * We kmem_alloc outside the lock to avoid possible deadlock if
1410 	 * dnlc_dir_reclaim() is called as a result of memory shortage.
1411 	 */
1412 	dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
1413 	if (dfp == NULL) {
1414 #ifdef DEBUG
1415 		/*
1416 		 * The kmem allocator generates random failures for
1417 		 * KM_NOSLEEP calls (see KMEM_RANDOM_ALLOCATION_FAILURE)
1418 		 * So try again before we blow away a perfectly good cache.
1419 		 * This random error only occurs in debug mode
1420 		 */
1421 		dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
1422 		if (dfp != NULL)
1423 			goto ok;
1424 #endif
1425 		ncs.ncs_dir_add_nm.value.ui64++;
1426 		/*
1427 		 * Free a directory cache. This may be the one we are
1428 		 * called with.
1429 		 */
1430 		dnlc_dir_reclaim(NULL);
1431 		dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
1432 		if (dfp == NULL) {
1433 			/*
1434 			 * still no memory, better delete this cache
1435 			 */
1436 			mutex_enter(&dcap->dca_lock);
1437 			dcp = (dircache_t *)dcap->dca_dircache;
1438 			if (VALID_DIR_CACHE(dcp)) {
1439 				dnlc_dir_abort(dcp);
1440 				dcap->dca_dircache = DC_RET_LOW_MEM;
1441 			}
1442 			mutex_exit(&dcap->dca_lock);
1443 			ncs.ncs_dir_addabort.value.ui64++;
1444 			return (DNOCACHE);
1445 		}
1446 		/*
1447 		 * fall through as if the 1st kmem_alloc had worked
1448 		 */
1449 	}
1450 
1451 #ifdef DEBUG
1452 ok:
1453 #endif
1454 	mutex_enter(&dcap->dca_lock);
1455 	dcp = (dircache_t *)dcap->dca_dircache;
1456 	if (VALID_DIR_CACHE(dcp)) {
1457 		if ((dcp->dc_num_entries + dcp->dc_num_free) >
1458 		    dnlc_dir_max_size) {
1459 			mutex_exit(&dcap->dca_lock);
1460 			dnlc_dir_purge(dcap);
1461 			kmem_cache_free(dnlc_dir_space_cache, dfp);
1462 			ncs.ncs_dir_add_max.value.ui64++;
1463 			return (DTOOBIG);
1464 		}
1465 		dcp->dc_num_free++;
1466 		capacity = (dcp->dc_fhash_mask + 1) << dnlc_dir_hash_size_shift;
1467 		if (dcp->dc_num_free >=
1468 		    (capacity << dnlc_dir_hash_resize_shift)) {
1469 			dnlc_dir_adjust_fhash(dcp);
1470 		}
1471 		/*
1472 		 * Initialise and chain a new entry
1473 		 */
1474 		dfp->df_handle = handle;
1475 		dfp->df_len = len;
1476 		dcp->dc_actime = ddi_get_lbolt64();
1477 		hp = &(dcp->dc_freehash[DDFHASH(handle, dcp)]);
1478 		dfp->df_next = *hp;
1479 		*hp = dfp;
1480 		mutex_exit(&dcap->dca_lock);
1481 		ncs.ncs_dir_num_ents.value.ui64++;
1482 		return (DOK);
1483 	} else {
1484 		mutex_exit(&dcap->dca_lock);
1485 		kmem_cache_free(dnlc_dir_space_cache, dfp);
1486 		return (DNOCACHE);
1487 	}
1488 }
1489 
1490 /*
1491  * Mark a directory cache as complete.
1492  */
1493 void
1494 dnlc_dir_complete(dcanchor_t *dcap)
1495 {
1496 	dircache_t *dcp;
1497 
1498 	mutex_enter(&dcap->dca_lock);
1499 	dcp = (dircache_t *)dcap->dca_dircache;
1500 	if (VALID_DIR_CACHE(dcp)) {
1501 		dcp->dc_complete = B_TRUE;
1502 	}
1503 	mutex_exit(&dcap->dca_lock);
1504 }
1505 
1506 /*
1507  * Internal routine to delete a partial or full directory cache.
1508  * No additional locking needed.
1509  */
1510 static void
1511 dnlc_dir_abort(dircache_t *dcp)
1512 {
1513 	dcentry_t *dep, *nhp;
1514 	dcfree_t *fep, *fhp;
1515 	uint_t nhtsize = dcp->dc_nhash_mask + 1; /* name hash table size */
1516 	uint_t fhtsize = dcp->dc_fhash_mask + 1; /* free hash table size */
1517 	uint_t i;
1518 
1519 	/*
1520 	 * Free up the cached name entries and hash table
1521 	 */
1522 	for (i = 0; i < nhtsize; i++) { /* for each hash bucket */
1523 		nhp = dcp->dc_namehash[i];
1524 		while (nhp != NULL) { /* for each chained entry */
1525 			dep = nhp->de_next;
1526 			kmem_free(nhp, sizeof (dcentry_t) - 1 +
1527 			    nhp->de_namelen);
1528 			nhp = dep;
1529 		}
1530 	}
1531 	kmem_free(dcp->dc_namehash, sizeof (dcentry_t *) * nhtsize);
1532 
1533 	/*
1534 	 * Free up the free space entries and hash table
1535 	 */
1536 	for (i = 0; i < fhtsize; i++) { /* for each hash bucket */
1537 		fhp = dcp->dc_freehash[i];
1538 		while (fhp != NULL) { /* for each chained entry */
1539 			fep = fhp->df_next;
1540 			kmem_cache_free(dnlc_dir_space_cache, fhp);
1541 			fhp = fep;
1542 		}
1543 	}
1544 	kmem_free(dcp->dc_freehash, sizeof (dcfree_t *) * fhtsize);
1545 
1546 	/*
1547 	 * Finally free the directory cache structure itself
1548 	 */
1549 	ncs.ncs_dir_num_ents.value.ui64 -= (dcp->dc_num_entries +
1550 	    dcp->dc_num_free);
1551 	kmem_free(dcp, sizeof (dircache_t));
1552 	ncs.ncs_cur_dirs.value.ui64--;
1553 }
1554 
1555 /*
1556  * Remove a partial or complete directory cache
1557  */
1558 void
1559 dnlc_dir_purge(dcanchor_t *dcap)
1560 {
1561 	dircache_t *dcp;
1562 
1563 	mutex_enter(&dc_head.dch_lock);
1564 	mutex_enter(&dcap->dca_lock);
1565 	dcp = (dircache_t *)dcap->dca_dircache;
1566 	if (!VALID_DIR_CACHE(dcp)) {
1567 		mutex_exit(&dcap->dca_lock);
1568 		mutex_exit(&dc_head.dch_lock);
1569 		return;
1570 	}
1571 	dcap->dca_dircache = NULL;
1572 	/*
1573 	 * Unchain from global list
1574 	 */
1575 	dcp->dc_prev->dc_next = dcp->dc_next;
1576 	dcp->dc_next->dc_prev = dcp->dc_prev;
1577 	mutex_exit(&dcap->dca_lock);
1578 	mutex_exit(&dc_head.dch_lock);
1579 	dnlc_dir_abort(dcp);
1580 }
1581 
1582 /*
1583  * Remove an entry from a complete or partial directory cache.
1584  * Return the handle if it's non null.
1585  */
1586 dcret_t
1587 dnlc_dir_rem_entry(dcanchor_t *dcap, const char *name, uint64_t *handlep)
1588 {
1589 	dircache_t *dcp;
1590 	dcentry_t **prevpp, *te;
1591 	uint_t capacity;
1592 	int hash;
1593 	int ret;
1594 	uchar_t namlen;
1595 
1596 	if (!dnlc_dir_enable) {
1597 		return (DNOCACHE);
1598 	}
1599 
1600 	mutex_enter(&dcap->dca_lock);
1601 	dcp = (dircache_t *)dcap->dca_dircache;
1602 	if (VALID_DIR_CACHE(dcp)) {
1603 		dcp->dc_actime = ddi_get_lbolt64();
1604 		if (dcp->dc_nhash_mask > 0) { /* ie not minimum */
1605 			capacity = (dcp->dc_nhash_mask + 1) <<
1606 			    dnlc_dir_hash_size_shift;
1607 			if (dcp->dc_num_entries <=
1608 			    (capacity >> dnlc_dir_hash_resize_shift)) {
1609 				dnlc_dir_adjust_nhash(dcp);
1610 			}
1611 		}
1612 		DNLC_DIR_HASH(name, hash, namlen);
1613 		prevpp = &dcp->dc_namehash[hash & dcp->dc_nhash_mask];
1614 		while (*prevpp != NULL) {
1615 			if (((*prevpp)->de_hash == hash) &&
1616 			    (namlen == (*prevpp)->de_namelen) &&
1617 			    bcmp((*prevpp)->de_name, name, namlen) == 0) {
1618 				if (handlep != NULL) {
1619 					*handlep = (*prevpp)->de_handle;
1620 				}
1621 				te = *prevpp;
1622 				*prevpp = (*prevpp)->de_next;
1623 				kmem_free(te, sizeof (dcentry_t) - 1 +
1624 				    te->de_namelen);
1625 
1626 				/*
1627 				 * If the total number of entries
1628 				 * falls below half the minimum number
1629 				 * of entries then free this cache.
1630 				 */
1631 				if (--dcp->dc_num_entries <
1632 				    (dnlc_dir_min_size >> 1)) {
1633 					mutex_exit(&dcap->dca_lock);
1634 					dnlc_dir_purge(dcap);
1635 				} else {
1636 					mutex_exit(&dcap->dca_lock);
1637 				}
1638 				ncs.ncs_dir_num_ents.value.ui64--;
1639 				return (DFOUND);
1640 			}
1641 			prevpp = &((*prevpp)->de_next);
1642 		}
1643 		if (dcp->dc_complete) {
1644 			ncs.ncs_dir_reme_fai.value.ui64++;
1645 			ret = DNOENT;
1646 		} else {
1647 			ret = DNOCACHE;
1648 		}
1649 		mutex_exit(&dcap->dca_lock);
1650 		return (ret);
1651 	} else {
1652 		mutex_exit(&dcap->dca_lock);
1653 		return (DNOCACHE);
1654 	}
1655 }
1656 
1657 
1658 /*
1659  * Remove free space of at least the given length from a complete
1660  * or partial directory cache.
1661  */
1662 dcret_t
1663 dnlc_dir_rem_space_by_len(dcanchor_t *dcap, uint_t len, uint64_t *handlep)
1664 {
1665 	dircache_t *dcp;
1666 	dcfree_t **prevpp, *tfp;
1667 	uint_t fhtsize; /* free hash table size */
1668 	uint_t i;
1669 	uint_t capacity;
1670 	int ret;
1671 
1672 	if (!dnlc_dir_enable) {
1673 		return (DNOCACHE);
1674 	}
1675 
1676 	mutex_enter(&dcap->dca_lock);
1677 	dcp = (dircache_t *)dcap->dca_dircache;
1678 	if (VALID_DIR_CACHE(dcp)) {
1679 		dcp->dc_actime = ddi_get_lbolt64();
1680 		if (dcp->dc_fhash_mask > 0) { /* ie not minimum */
1681 			capacity = (dcp->dc_fhash_mask + 1) <<
1682 			    dnlc_dir_hash_size_shift;
1683 			if (dcp->dc_num_free <=
1684 			    (capacity >> dnlc_dir_hash_resize_shift)) {
1685 				dnlc_dir_adjust_fhash(dcp);
1686 			}
1687 		}
1688 		/*
1689 		 * Search for an entry of the appropriate size
1690 		 * on a first fit basis.
1691 		 */
1692 		fhtsize = dcp->dc_fhash_mask + 1;
1693 		for (i = 0; i < fhtsize; i++) { /* for each hash bucket */
1694 			prevpp = &(dcp->dc_freehash[i]);
1695 			while (*prevpp != NULL) {
1696 				if ((*prevpp)->df_len >= len) {
1697 					*handlep = (*prevpp)->df_handle;
1698 					tfp = *prevpp;
1699 					*prevpp = (*prevpp)->df_next;
1700 					dcp->dc_num_free--;
1701 					mutex_exit(&dcap->dca_lock);
1702 					kmem_cache_free(dnlc_dir_space_cache,
1703 					    tfp);
1704 					ncs.ncs_dir_num_ents.value.ui64--;
1705 					return (DFOUND);
1706 				}
1707 				prevpp = &((*prevpp)->df_next);
1708 			}
1709 		}
1710 		if (dcp->dc_complete) {
1711 			ret = DNOENT;
1712 		} else {
1713 			ret = DNOCACHE;
1714 		}
1715 		mutex_exit(&dcap->dca_lock);
1716 		return (ret);
1717 	} else {
1718 		mutex_exit(&dcap->dca_lock);
1719 		return (DNOCACHE);
1720 	}
1721 }
1722 
1723 /*
1724  * Remove free space with the given handle from a complete or partial
1725  * directory cache.
1726  */
1727 dcret_t
1728 dnlc_dir_rem_space_by_handle(dcanchor_t *dcap, uint64_t handle)
1729 {
1730 	dircache_t *dcp;
1731 	dcfree_t **prevpp, *tfp;
1732 	uint_t capacity;
1733 	int ret;
1734 
1735 	if (!dnlc_dir_enable) {
1736 		return (DNOCACHE);
1737 	}
1738 
1739 	mutex_enter(&dcap->dca_lock);
1740 	dcp = (dircache_t *)dcap->dca_dircache;
1741 	if (VALID_DIR_CACHE(dcp)) {
1742 		dcp->dc_actime = ddi_get_lbolt64();
1743 		if (dcp->dc_fhash_mask > 0) { /* ie not minimum */
1744 			capacity = (dcp->dc_fhash_mask + 1) <<
1745 			    dnlc_dir_hash_size_shift;
1746 			if (dcp->dc_num_free <=
1747 			    (capacity >> dnlc_dir_hash_resize_shift)) {
1748 				dnlc_dir_adjust_fhash(dcp);
1749 			}
1750 		}
1751 
1752 		/*
1753 		 * search for the exact entry
1754 		 */
1755 		prevpp = &(dcp->dc_freehash[DDFHASH(handle, dcp)]);
1756 		while (*prevpp != NULL) {
1757 			if ((*prevpp)->df_handle == handle) {
1758 				tfp = *prevpp;
1759 				*prevpp = (*prevpp)->df_next;
1760 				dcp->dc_num_free--;
1761 				mutex_exit(&dcap->dca_lock);
1762 				kmem_cache_free(dnlc_dir_space_cache, tfp);
1763 				ncs.ncs_dir_num_ents.value.ui64--;
1764 				return (DFOUND);
1765 			}
1766 			prevpp = &((*prevpp)->df_next);
1767 		}
1768 		if (dcp->dc_complete) {
1769 			ncs.ncs_dir_rems_fai.value.ui64++;
1770 			ret = DNOENT;
1771 		} else {
1772 			ret = DNOCACHE;
1773 		}
1774 		mutex_exit(&dcap->dca_lock);
1775 		return (ret);
1776 	} else {
1777 		mutex_exit(&dcap->dca_lock);
1778 		return (DNOCACHE);
1779 	}
1780 }
1781 
1782 /*
1783  * Update the handle of an directory cache entry.
1784  */
1785 dcret_t
1786 dnlc_dir_update(dcanchor_t *dcap, const char *name, uint64_t handle)
1787 {
1788 	dircache_t *dcp;
1789 	dcentry_t *dep;
1790 	int hash;
1791 	int ret;
1792 	uchar_t namlen;
1793 
1794 	if (!dnlc_dir_enable) {
1795 		return (DNOCACHE);
1796 	}
1797 
1798 	mutex_enter(&dcap->dca_lock);
1799 	dcp = (dircache_t *)dcap->dca_dircache;
1800 	if (VALID_DIR_CACHE(dcp)) {
1801 		dcp->dc_actime = ddi_get_lbolt64();
1802 		DNLC_DIR_HASH(name, hash, namlen);
1803 		dep = dcp->dc_namehash[hash & dcp->dc_nhash_mask];
1804 		while (dep != NULL) {
1805 			if ((dep->de_hash == hash) &&
1806 			    (namlen == dep->de_namelen) &&
1807 			    bcmp(dep->de_name, name, namlen) == 0) {
1808 				dep->de_handle = handle;
1809 				mutex_exit(&dcap->dca_lock);
1810 				return (DFOUND);
1811 			}
1812 			dep = dep->de_next;
1813 		}
1814 		if (dcp->dc_complete) {
1815 			ncs.ncs_dir_upd_fail.value.ui64++;
1816 			ret = DNOENT;
1817 		} else {
1818 			ret = DNOCACHE;
1819 		}
1820 		mutex_exit(&dcap->dca_lock);
1821 		return (ret);
1822 	} else {
1823 		mutex_exit(&dcap->dca_lock);
1824 		return (DNOCACHE);
1825 	}
1826 }
1827 
1828 void
1829 dnlc_dir_fini(dcanchor_t *dcap)
1830 {
1831 	dircache_t *dcp;
1832 
1833 	mutex_enter(&dc_head.dch_lock);
1834 	mutex_enter(&dcap->dca_lock);
1835 	dcp = (dircache_t *)dcap->dca_dircache;
1836 	if (VALID_DIR_CACHE(dcp)) {
1837 		/*
1838 		 * Unchain from global list
1839 		 */
1840 		ncs.ncs_dir_finipurg.value.ui64++;
1841 		dcp->dc_prev->dc_next = dcp->dc_next;
1842 		dcp->dc_next->dc_prev = dcp->dc_prev;
1843 	} else {
1844 		dcp = NULL;
1845 	}
1846 	dcap->dca_dircache = NULL;
1847 	mutex_exit(&dcap->dca_lock);
1848 	mutex_exit(&dc_head.dch_lock);
1849 	mutex_destroy(&dcap->dca_lock);
1850 	if (dcp) {
1851 		dnlc_dir_abort(dcp);
1852 	}
1853 }
1854 
1855 /*
1856  * Reclaim callback for dnlc directory caching.
1857  * Invoked by the kernel memory allocator when memory gets tight.
1858  * This is a pretty serious condition and can lead easily lead to system
1859  * hangs if not enough space is returned.
1860  *
1861  * Deciding which directory (or directories) to purge is tricky.
1862  * Purging everything is an overkill, but purging just the oldest used
1863  * was found to lead to hangs. The largest cached directories use the
1864  * most memory, but take the most effort to rebuild, whereas the smaller
1865  * ones have little value and give back little space. So what to do?
1866  *
1867  * The current policy is to continue purging the oldest used directories
1868  * until at least dnlc_dir_min_reclaim directory entries have been purged.
1869  */
1870 /*ARGSUSED*/
1871 static void
1872 dnlc_dir_reclaim(void *unused)
1873 {
1874 	dircache_t *dcp, *oldest;
1875 	uint_t dirent_cnt = 0;
1876 
1877 	mutex_enter(&dc_head.dch_lock);
1878 	while (dirent_cnt < dnlc_dir_min_reclaim) {
1879 		dcp = dc_head.dch_next;
1880 		oldest = NULL;
1881 		while (dcp != (dircache_t *)&dc_head) {
1882 			if (oldest == NULL) {
1883 				oldest = dcp;
1884 			} else {
1885 				if (dcp->dc_actime < oldest->dc_actime) {
1886 					oldest = dcp;
1887 				}
1888 			}
1889 			dcp = dcp->dc_next;
1890 		}
1891 		if (oldest == NULL) {
1892 			/* nothing to delete */
1893 			mutex_exit(&dc_head.dch_lock);
1894 			return;
1895 		}
1896 		/*
1897 		 * remove from directory chain and purge
1898 		 */
1899 		oldest->dc_prev->dc_next = oldest->dc_next;
1900 		oldest->dc_next->dc_prev = oldest->dc_prev;
1901 		mutex_enter(&oldest->dc_anchor->dca_lock);
1902 		/*
1903 		 * If this was the last entry then it must be too large.
1904 		 * Mark it as such by saving a special dircache_t
1905 		 * pointer (DC_RET_LOW_MEM) in the anchor. The error DNOMEM
1906 		 * will be presented to the caller of dnlc_dir_start()
1907 		 */
1908 		if (oldest->dc_next == oldest->dc_prev) {
1909 			oldest->dc_anchor->dca_dircache = DC_RET_LOW_MEM;
1910 			ncs.ncs_dir_rec_last.value.ui64++;
1911 		} else {
1912 			oldest->dc_anchor->dca_dircache = NULL;
1913 			ncs.ncs_dir_recl_any.value.ui64++;
1914 		}
1915 		mutex_exit(&oldest->dc_anchor->dca_lock);
1916 		dirent_cnt += oldest->dc_num_entries;
1917 		dnlc_dir_abort(oldest);
1918 	}
1919 	mutex_exit(&dc_head.dch_lock);
1920 }
1921 
1922 /*
1923  * Dynamically grow or shrink the size of the name hash table
1924  */
1925 static void
1926 dnlc_dir_adjust_nhash(dircache_t *dcp)
1927 {
1928 	dcentry_t **newhash, *dep, **nhp, *tep;
1929 	uint_t newsize;
1930 	uint_t oldsize;
1931 	uint_t newsizemask;
1932 	int i;
1933 
1934 	/*
1935 	 * Allocate new hash table
1936 	 */
1937 	newsize = dcp->dc_num_entries >> dnlc_dir_hash_size_shift;
1938 	newhash = kmem_zalloc(sizeof (dcentry_t *) * newsize, KM_NOSLEEP);
1939 	if (newhash == NULL) {
1940 		/*
1941 		 * System is short on memory just return
1942 		 * Note, the old hash table is still usable.
1943 		 * This return is unlikely to repeatedy occur, because
1944 		 * either some other directory caches will be reclaimed
1945 		 * due to memory shortage, thus freeing memory, or this
1946 		 * directory cahe will be reclaimed.
1947 		 */
1948 		return;
1949 	}
1950 	oldsize = dcp->dc_nhash_mask + 1;
1951 	dcp->dc_nhash_mask = newsizemask = newsize - 1;
1952 
1953 	/*
1954 	 * Move entries from the old table to the new
1955 	 */
1956 	for (i = 0; i < oldsize; i++) { /* for each hash bucket */
1957 		dep = dcp->dc_namehash[i];
1958 		while (dep != NULL) { /* for each chained entry */
1959 			tep = dep;
1960 			dep = dep->de_next;
1961 			nhp = &newhash[tep->de_hash & newsizemask];
1962 			tep->de_next = *nhp;
1963 			*nhp = tep;
1964 		}
1965 	}
1966 
1967 	/*
1968 	 * delete old hash table and set new one in place
1969 	 */
1970 	kmem_free(dcp->dc_namehash, sizeof (dcentry_t *) * oldsize);
1971 	dcp->dc_namehash = newhash;
1972 }
1973 
1974 /*
1975  * Dynamically grow or shrink the size of the free space hash table
1976  */
1977 static void
1978 dnlc_dir_adjust_fhash(dircache_t *dcp)
1979 {
1980 	dcfree_t **newhash, *dfp, **nhp, *tfp;
1981 	uint_t newsize;
1982 	uint_t oldsize;
1983 	int i;
1984 
1985 	/*
1986 	 * Allocate new hash table
1987 	 */
1988 	newsize = dcp->dc_num_free >> dnlc_dir_hash_size_shift;
1989 	newhash = kmem_zalloc(sizeof (dcfree_t *) * newsize, KM_NOSLEEP);
1990 	if (newhash == NULL) {
1991 		/*
1992 		 * System is short on memory just return
1993 		 * Note, the old hash table is still usable.
1994 		 * This return is unlikely to repeatedy occur, because
1995 		 * either some other directory caches will be reclaimed
1996 		 * due to memory shortage, thus freeing memory, or this
1997 		 * directory cahe will be reclaimed.
1998 		 */
1999 		return;
2000 	}
2001 	oldsize = dcp->dc_fhash_mask + 1;
2002 	dcp->dc_fhash_mask = newsize - 1;
2003 
2004 	/*
2005 	 * Move entries from the old table to the new
2006 	 */
2007 	for (i = 0; i < oldsize; i++) { /* for each hash bucket */
2008 		dfp = dcp->dc_freehash[i];
2009 		while (dfp != NULL) { /* for each chained entry */
2010 			tfp = dfp;
2011 			dfp = dfp->df_next;
2012 			nhp = &newhash[DDFHASH(tfp->df_handle, dcp)];
2013 			tfp->df_next = *nhp;
2014 			*nhp = tfp;
2015 		}
2016 	}
2017 
2018 	/*
2019 	 * delete old hash table and set new one in place
2020 	 */
2021 	kmem_free(dcp->dc_freehash, sizeof (dcfree_t *) * oldsize);
2022 	dcp->dc_freehash = newhash;
2023 }
2024