xref: /illumos-gate/usr/src/uts/common/fs/dev/sdev_ncache.c (revision f875b4ebb1dd9fdbeb043557cab38ab3bf7f6e01)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * negative cache handling for the /dev fs
30  */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/t_lock.h>
35 #include <sys/systm.h>
36 #include <sys/sysmacros.h>
37 #include <sys/user.h>
38 #include <sys/time.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/file.h>
42 #include <sys/fcntl.h>
43 #include <sys/flock.h>
44 #include <sys/kmem.h>
45 #include <sys/uio.h>
46 #include <sys/errno.h>
47 #include <sys/stat.h>
48 #include <sys/cred.h>
49 #include <sys/cmn_err.h>
50 #include <sys/debug.h>
51 #include <sys/mode.h>
52 #include <sys/policy.h>
53 #include <fs/fs_subr.h>
54 #include <sys/mount.h>
55 #include <sys/fs/snode.h>
56 #include <sys/fs/dv_node.h>
57 #include <sys/fs/sdev_node.h>
58 #include <sys/sunndi.h>
59 #include <sys/sunmdi.h>
60 #include <sys/ddi.h>
61 #include <sys/modctl.h>
62 #include <sys/devcache.h>
63 
64 
65 /*
66  * ncache is a negative cache of failed lookups.  An entry
67  * is added after an attempt to configure a device by that
68  * name failed.  An accumulation of these entries over time
 * gives us a set of device names for which implicit reconfiguration
70  * does not need to be attempted.  If a name is created matching
71  * an entry in ncache, that entry is removed, with the
72  * persistent store updated.
73  *
74  * Implicit reconfig is initiated for any name during lookup that
75  * can't be resolved from the backing store and that isn't
76  * present in the negative cache.  This functionality is
77  * enabled during system startup once communication with devfsadm
78  * can be achieved.  Since readdir is more general, implicit
79  * reconfig initiated by reading a directory isn't enabled until
80  * the system is more fully booted, at the time of the multi-user
81  * milestone, corresponding to init state 2.
82  *
83  * A maximum is imposed on the number of entries in the cache
84  * to limit some script going wild and as a defense against attack.
85  * The default limit is 64 and can be adjusted via sdev_nc_max_entries.
86  *
 * Each entry also has an expiration count.  When a name in the cache
 * is looked up, its count is reset to the default.  Subsequent boots
 * decrement the count if the name isn't referenced.  This permits a
 * once-only entry to eventually be removed over time.
91  *
92  * sdev_reconfig_delay implements a "debounce" of the timing beyond
93  * system available indication, providing what the filesystem considers
94  * to be the system-is-fully-booted state.  This is provided to adjust
95  * the timing if some application startup is performing a readdir
96  * in /dev that initiates a troublesome implicit reconfig on every boot.
97  *
98  * sdev_nc_disable_reset can be used to disable clearing the negative cache
99  * on reconfig boot.  The default is to clear the cache on reconfig boot.
100  * sdev_nc_disable can be used to disable the negative cache itself.
101  *
102  * sdev_reconfig_disable can be used to disable implicit reconfig.
103  * The default is that implicit reconfig is enabled.
104  */
105 
/*
 * Compile-time defaults for the tunables below.
 */
#define	SDEV_NC_EXPIRECNT	4
#define	SDEV_NC_MAX_ENTRIES	64
#define	SDEV_RECONFIG_DELAY	6	/* seconds */

/*
 * Tunables.  Each is a plain global int so that it can be patched.
 */

/* expire count given to new entries and restored when an entry is hit */
int	sdev_nc_expirecnt = SDEV_NC_EXPIRECNT;
/* upper bound on the number of entries held in the negative cache */
int	sdev_nc_max_entries = SDEV_NC_MAX_ENTRIES;
/* debounce, in seconds, between "system available" and "boot complete" */
int	sdev_reconfig_delay = SDEV_RECONFIG_DELAY;
/* not referenced in this file */
int	sdev_reconfig_verbose = 0;
/* non-zero disables implicit reconfig (not referenced in this file) */
int	sdev_reconfig_disable = 0;
/* non-zero disables the negative cache itself */
int	sdev_nc_disable = 0;
/* non-zero keeps stored entries across a reconfigure boot */
int	sdev_nc_disable_reset = 0;
/* non-zero enables cmn_err() reporting of cache activity */
int	sdev_nc_verbose = 0;
/* non-zero skips reading the backing store at setup */
int	sdev_cache_read_disable = 0;
/* non-zero suppresses all updates of the backing store */
int	sdev_cache_write_disable = 0;
122 
/* globals */

/*
 * Boot progress as tracked by this file; advanced by
 * sdev_devstate_change() and sdev_state_boot_complete().
 */
int	sdev_boot_state = SDEV_BOOT_STATE_INITIAL;
/* set to 1 by sdev_devstate_change() when a reconfigure boot is seen */
int	sdev_reconfig_boot = 0;
/* the in-core negative cache, allocated by sdev_ncache_init() */
sdev_nc_list_t *sdev_ncache;
/* handle for the backing store, obtained in sdev_ncache_setup() */
static nvf_handle_t sdevfd_handle;
128 
/* static prototypes */

/* backing store update */
static void sdev_ncache_write_complete(nvf_handle_t);
static void sdev_ncache_write(void);
static void sdev_ncache_process_store(void);

/* in-core negative cache list management */
static sdev_nc_list_t *sdev_nc_newlist(void);
static void sdev_nc_free_unlinked_node(sdev_nc_node_t *);
static sdev_nc_node_t *sdev_nc_findpath(sdev_nc_list_t *, char *);
static void sdev_nc_insertnode(sdev_nc_list_t *, sdev_nc_node_t *);
static void sdev_nc_free_bootonly(void);

/* callbacks registered through sdev_cache_ops, and their helper */
static int sdev_ncache_unpack_nvlist(nvf_handle_t, nvlist_t *, char *);
static int sdev_ncache_pack_list(nvf_handle_t, nvlist_t **);
static void sdev_ncache_list_free(nvf_handle_t);
static void sdev_nvp_free(nvp_devname_t *);
142 
/*
 * Registration for /etc/devices/devname_cache
 *
 * Passed to nvf_register_file() by sdev_ncache_setup().  The
 * initializers are positional, so their order has to follow the
 * member order of nvf_ops_t, which is declared outside this file.
 */
static nvf_ops_t sdev_cache_ops = {
	"/etc/devices/devname_cache",		/* path to cache */
	sdev_ncache_unpack_nvlist,		/* read: unpack nvlist */
	sdev_ncache_pack_list,			/* write: pack list */
	sdev_ncache_list_free,			/* free data list */
	sdev_ncache_write_complete		/* write complete callback */
};
153 
154 /*
155  * called once at filesystem initialization
156  */
157 void
158 sdev_ncache_init(void)
159 {
160 	sdev_ncache = sdev_nc_newlist();
161 }
162 
163 /*
164  * called at mount of the global instance
165  * currently the global instance is never unmounted
166  */
167 void
168 sdev_ncache_setup(void)
169 {
170 	sdevfd_handle = nvf_register_file(&sdev_cache_ops);
171 	ASSERT(sdevfd_handle);
172 
173 	list_create(nvf_list(sdevfd_handle), sizeof (nvp_devname_t),
174 	    offsetof(nvp_devname_t, nvp_link));
175 
176 	rw_enter(nvf_lock(sdevfd_handle), RW_WRITER);
177 	if (!sdev_cache_read_disable) {
178 		(void) nvf_read_file(sdevfd_handle);
179 	}
180 	sdev_ncache_process_store();
181 	rw_exit(nvf_lock(sdevfd_handle));
182 
183 	sdev_devstate_change();
184 }
185 
186 static void
187 sdev_nvp_free(nvp_devname_t *dp)
188 {
189 	int	i;
190 	char	**p;
191 
192 	if (dp->nvp_npaths > 0) {
193 		p = dp->nvp_paths;
194 		for (i = 0; i < dp->nvp_npaths; i++, p++) {
195 			kmem_free(*p, strlen(*p)+1);
196 		}
197 		kmem_free(dp->nvp_paths,
198 			dp->nvp_npaths * sizeof (char *));
199 		kmem_free(dp->nvp_expirecnts,
200 			dp->nvp_npaths * sizeof (int));
201 	}
202 
203 	kmem_free(dp, sizeof (nvp_devname_t));
204 }
205 
206 static void
207 sdev_ncache_list_free(nvf_handle_t fd)
208 {
209 	list_t		*listp;
210 	nvp_devname_t	*dp;
211 
212 	ASSERT(fd == sdevfd_handle);
213 	ASSERT(RW_WRITE_HELD(nvf_lock(fd)));
214 
215 	listp = nvf_list(fd);
216 	if ((dp = list_head(listp)) != NULL) {
217 		list_remove(listp, dp);
218 		sdev_nvp_free(dp);
219 	}
220 }
221 
222 /*
223  * Unpack a device path/nvlist pair to internal data list format.
224  * Used to decode the nvlist format into the internal representation
225  * when reading /etc/devices/devname_cache.
226  * Note that the expiration counts are optional, for compatibility
227  * with earlier instances of the cache.  If not present, the
228  * expire counts are initialized to defaults.
229  */
230 static int
231 sdev_ncache_unpack_nvlist(nvf_handle_t fd, nvlist_t *nvl, char *name)
232 {
233 	nvp_devname_t *np;
234 	char	**strs;
235 	int	*cnts;
236 	uint_t	nstrs, ncnts;
237 	int	rval, i;
238 
239 	ASSERT(fd == sdevfd_handle);
240 	ASSERT(RW_WRITE_HELD(nvf_lock(fd)));
241 
242 	/* name of the sublist must match what we created */
243 	if (strcmp(name, DP_DEVNAME_ID) != 0) {
244 		return (-1);
245 	}
246 
247 	np = kmem_zalloc(sizeof (nvp_devname_t), KM_SLEEP);
248 
249 	rval = nvlist_lookup_string_array(nvl,
250 	    DP_DEVNAME_NCACHE_ID, &strs, &nstrs);
251 	if (rval) {
252 		kmem_free(np, sizeof (nvp_devname_t));
253 		return (-1);
254 	}
255 
256 	np->nvp_npaths = nstrs;
257 	np->nvp_paths = kmem_zalloc(nstrs * sizeof (char *), KM_SLEEP);
258 	for (i = 0; i < nstrs; i++) {
259 		np->nvp_paths[i] = i_ddi_strdup(strs[i], KM_SLEEP);
260 	}
261 	np->nvp_expirecnts = kmem_zalloc(nstrs * sizeof (int), KM_SLEEP);
262 	for (i = 0; i < nstrs; i++) {
263 		np->nvp_expirecnts[i] = sdev_nc_expirecnt;
264 	}
265 
266 	rval = nvlist_lookup_int32_array(nvl,
267 	    DP_DEVNAME_NC_EXPIRECNT_ID, &cnts, &ncnts);
268 	if (rval == 0) {
269 		ASSERT(ncnts == nstrs);
270 		ncnts = min(ncnts, nstrs);
271 		for (i = 0; i < nstrs; i++) {
272 			np->nvp_expirecnts[i] = cnts[i];
273 		}
274 	}
275 
276 	list_insert_tail(nvf_list(sdevfd_handle), np);
277 
278 	return (0);
279 }
280 
281 /*
282  * Pack internal format cache data to a single nvlist.
283  * Used when writing the nvlist file.
284  * Note this is called indirectly by the nvpflush daemon.
285  */
286 static int
287 sdev_ncache_pack_list(nvf_handle_t fd, nvlist_t **ret_nvl)
288 {
289 	nvlist_t	*nvl, *sub_nvl;
290 	nvp_devname_t	*np;
291 	int		rval;
292 	list_t		*listp;
293 
294 	ASSERT(fd == sdevfd_handle);
295 	ASSERT(RW_WRITE_HELD(nvf_lock(fd)));
296 
297 	rval = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
298 	if (rval != 0) {
299 		nvf_error("%s: nvlist alloc error %d\n",
300 			nvf_cache_name(fd), rval);
301 		return (DDI_FAILURE);
302 	}
303 
304 	listp = nvf_list(sdevfd_handle);
305 	if ((np = list_head(listp)) != NULL) {
306 		ASSERT(list_next(listp, np) == NULL);
307 
308 		rval = nvlist_alloc(&sub_nvl, NV_UNIQUE_NAME, KM_SLEEP);
309 		if (rval != 0) {
310 			nvf_error("%s: nvlist alloc error %d\n",
311 				nvf_cache_name(fd), rval);
312 			sub_nvl = NULL;
313 			goto err;
314 		}
315 
316 		rval = nvlist_add_string_array(sub_nvl,
317 		    DP_DEVNAME_NCACHE_ID, np->nvp_paths, np->nvp_npaths);
318 		if (rval != 0) {
319 			nvf_error("%s: nvlist add error %d (sdev)\n",
320 			    nvf_cache_name(fd), rval);
321 			goto err;
322 		}
323 
324 		rval = nvlist_add_int32_array(sub_nvl,
325 		    DP_DEVNAME_NC_EXPIRECNT_ID,
326 		    np->nvp_expirecnts, np->nvp_npaths);
327 		if (rval != 0) {
328 			nvf_error("%s: nvlist add error %d (sdev)\n",
329 			    nvf_cache_name(fd), rval);
330 			goto err;
331 		}
332 
333 		rval = nvlist_add_nvlist(nvl, DP_DEVNAME_ID, sub_nvl);
334 		if (rval != 0) {
335 			nvf_error("%s: nvlist add error %d (sublist)\n",
336 			    nvf_cache_name(fd), rval);
337 			goto err;
338 		}
339 		nvlist_free(sub_nvl);
340 	}
341 
342 	*ret_nvl = nvl;
343 	return (DDI_SUCCESS);
344 
345 err:
346 	if (sub_nvl)
347 		nvlist_free(sub_nvl);
348 	nvlist_free(nvl);
349 	*ret_nvl = NULL;
350 	return (DDI_FAILURE);
351 }
352 
353 /*
354  * Run through the data read from the backing cache store
355  * to establish the initial state of the neg. cache.
356  */
357 static void
358 sdev_ncache_process_store(void)
359 {
360 	sdev_nc_list_t	*ncl = sdev_ncache;
361 	nvp_devname_t	*np;
362 	sdev_nc_node_t	*lp;
363 	char		*path;
364 	int		i, n;
365 	list_t		*listp;
366 
367 	if (sdev_nc_disable)
368 		return;
369 
370 	ASSERT(RW_WRITE_HELD(nvf_lock(sdevfd_handle)));
371 
372 	listp = nvf_list(sdevfd_handle);
373 	for (np = list_head(listp); np; np = list_next(listp, np)) {
374 		for (i = 0; i < np->nvp_npaths; i++) {
375 			sdcmn_err5(("    %s %d\n",
376 			    np->nvp_paths[i], np->nvp_expirecnts[i]));
377 			if (ncl->ncl_nentries < sdev_nc_max_entries) {
378 				path = np->nvp_paths[i];
379 				n = strlen(path) + 1;
380 				lp = kmem_alloc(sizeof (sdev_nc_node_t),
381 				    KM_SLEEP);
382 				lp->ncn_name = kmem_alloc(n, KM_SLEEP);
383 				bcopy(path, lp->ncn_name, n);
384 				lp->ncn_flags = NCN_SRC_STORE;
385 				lp->ncn_expirecnt = np->nvp_expirecnts[i];
386 				sdev_nc_insertnode(ncl, lp);
387 			} else if (sdev_nc_verbose) {
388 				cmn_err(CE_CONT,
389 				    "?%s: truncating from ncache (max %d)\n",
390 				    np->nvp_paths[i], sdev_nc_max_entries);
391 			}
392 		}
393 	}
394 }
395 
396 /*
397  * called by nvpflush daemon to inform us that an update of
398  * the cache file has been completed.
399  */
400 static void
401 sdev_ncache_write_complete(nvf_handle_t fd)
402 {
403 	sdev_nc_list_t	*ncl = sdev_ncache;
404 
405 	ASSERT(fd == sdevfd_handle);
406 
407 	mutex_enter(&ncl->ncl_mutex);
408 
409 	ASSERT(ncl->ncl_flags & NCL_LIST_WRITING);
410 
411 	if (ncl->ncl_flags & NCL_LIST_DIRTY) {
412 		sdcmn_err5(("ncache write complete but dirty again\n"));
413 		ncl->ncl_flags &= ~NCL_LIST_DIRTY;
414 		mutex_exit(&ncl->ncl_mutex);
415 		sdev_ncache_write();
416 	} else {
417 		sdcmn_err5(("ncache write complete\n"));
418 		ncl->ncl_flags &= ~NCL_LIST_WRITING;
419 		mutex_exit(&ncl->ncl_mutex);
420 		rw_enter(nvf_lock(fd), RW_WRITER);
421 		sdev_ncache_list_free(fd);
422 		rw_exit(nvf_lock(fd));
423 	}
424 }
425 
426 /*
427  * Prepare to perform an update of the neg. cache backing store.
428  */
429 static void
430 sdev_ncache_write(void)
431 {
432 	sdev_nc_list_t	*ncl = sdev_ncache;
433 	nvp_devname_t	*np;
434 	sdev_nc_node_t	*lp;
435 	int		n, i;
436 
437 	if (sdev_cache_write_disable) {
438 		mutex_enter(&ncl->ncl_mutex);
439 		ncl->ncl_flags &= ~NCL_LIST_WRITING;
440 		mutex_exit(&ncl->ncl_mutex);
441 		return;
442 	}
443 
444 	/* proper lock ordering here is essential */
445 	rw_enter(nvf_lock(sdevfd_handle), RW_WRITER);
446 	sdev_ncache_list_free(sdevfd_handle);
447 
448 	rw_enter(&ncl->ncl_lock, RW_READER);
449 	n = ncl->ncl_nentries;
450 	ASSERT(n <= sdev_nc_max_entries);
451 
452 	np = kmem_zalloc(sizeof (nvp_devname_t), KM_SLEEP);
453 	np->nvp_npaths = n;
454 	np->nvp_paths = kmem_zalloc(n * sizeof (char *), KM_SLEEP);
455 	np->nvp_expirecnts = kmem_zalloc(n * sizeof (int), KM_SLEEP);
456 
457 	i = 0;
458 	for (lp = list_head(&ncl->ncl_list); lp;
459 	    lp = list_next(&ncl->ncl_list, lp)) {
460 		np->nvp_paths[i] = i_ddi_strdup(lp->ncn_name, KM_SLEEP);
461 		np->nvp_expirecnts[i] = lp->ncn_expirecnt;
462 		sdcmn_err5(("    %s %d\n",
463 		    np->nvp_paths[i], np->nvp_expirecnts[i]));
464 		i++;
465 	}
466 
467 	rw_exit(&ncl->ncl_lock);
468 
469 	nvf_mark_dirty(sdevfd_handle);
470 	list_insert_tail(nvf_list(sdevfd_handle), np);
471 	rw_exit(nvf_lock(sdevfd_handle));
472 
473 	nvf_wake_daemon();
474 }
475 
476 static void
477 sdev_nc_flush_updates(void)
478 {
479 	sdev_nc_list_t *ncl = sdev_ncache;
480 
481 	if (sdev_nc_disable || sdev_cache_write_disable)
482 		return;
483 
484 	mutex_enter(&ncl->ncl_mutex);
485 	if (((ncl->ncl_flags &
486 	    (NCL_LIST_DIRTY | NCL_LIST_WENABLE | NCL_LIST_WRITING)) ==
487 	    (NCL_LIST_DIRTY | NCL_LIST_WENABLE))) {
488 		ncl->ncl_flags &= ~NCL_LIST_DIRTY;
489 		ncl->ncl_flags |= NCL_LIST_WRITING;
490 		mutex_exit(&ncl->ncl_mutex);
491 		sdev_ncache_write();
492 	} else {
493 		mutex_exit(&ncl->ncl_mutex);
494 	}
495 }
496 
497 static void
498 sdev_nc_flush_boot_update(void)
499 {
500 	sdev_nc_list_t *ncl = sdev_ncache;
501 
502 	if (sdev_nc_disable || sdev_cache_write_disable ||
503 	    (sdev_boot_state == SDEV_BOOT_STATE_INITIAL)) {
504 		return;
505 	}
506 	mutex_enter(&ncl->ncl_mutex);
507 	if (ncl->ncl_flags & NCL_LIST_WENABLE) {
508 		mutex_exit(&ncl->ncl_mutex);
509 		sdev_nc_flush_updates();
510 	} else {
511 		mutex_exit(&ncl->ncl_mutex);
512 	}
513 
514 }
515 
516 static void
517 sdev_state_boot_complete()
518 {
519 	sdev_nc_list_t	*ncl = sdev_ncache;
520 	sdev_nc_node_t	*lp, *next;
521 
522 	/*
523 	 * Once boot is complete, decrement the expire count of each entry
524 	 * in the cache not touched by a reference.  Remove any that
525 	 * goes to zero.  This effectively removes random entries over
526 	 * time.
527 	 */
528 	rw_enter(&ncl->ncl_lock, RW_WRITER);
529 	mutex_enter(&ncl->ncl_mutex);
530 
531 	for (lp = list_head(&ncl->ncl_list); lp; lp = next) {
532 		next = list_next(&ncl->ncl_list, lp);
533 		if (sdev_nc_expirecnt > 0 && lp->ncn_expirecnt > 0) {
534 			if (lp->ncn_flags & NCN_ACTIVE) {
535 				if (lp->ncn_expirecnt != sdev_nc_expirecnt) {
536 					lp->ncn_expirecnt = sdev_nc_expirecnt;
537 					ncl->ncl_flags |= NCL_LIST_DIRTY;
538 				}
539 			} else {
540 				if (--lp->ncn_expirecnt == 0) {
541 					list_remove(&ncl->ncl_list, lp);
542 					sdev_nc_free_unlinked_node(lp);
543 					ncl->ncl_nentries--;
544 				}
545 				ncl->ncl_flags |= NCL_LIST_DIRTY;
546 			}
547 		}
548 	}
549 
550 	mutex_exit(&ncl->ncl_mutex);
551 	rw_exit(&ncl->ncl_lock);
552 
553 	sdev_nc_flush_boot_update();
554 	sdev_boot_state = SDEV_BOOT_STATE_COMPLETE;
555 }
556 
557 /*
558  * Upon transition to the login state on a reconfigure boot,
559  * a debounce timer is set up so that we cache all the nonsense
560  * lookups we're hit with by the windowing system startup.
561  */
562 
/*
 * timeout(9F) handler armed by sdev_state_sysavail(); the argument is
 * not used.  Its only job is to complete the boot state transition.
 */
/*ARGSUSED*/
static void
sdev_state_timeout(void *arg)
{
	sdev_state_boot_complete();
}
569 
570 static void
571 sdev_state_sysavail()
572 {
573 	sdev_nc_list_t *ncl = sdev_ncache;
574 	clock_t	nticks;
575 	int nsecs;
576 
577 	mutex_enter(&ncl->ncl_mutex);
578 	ncl->ncl_flags |= NCL_LIST_WENABLE;
579 	mutex_exit(&ncl->ncl_mutex);
580 
581 	nsecs = sdev_reconfig_delay;
582 	if (nsecs == 0) {
583 		sdev_state_boot_complete();
584 	} else {
585 		nticks = drv_usectohz(1000000 * nsecs);
586 		sdcmn_err5(("timeout initiated %ld\n", nticks));
587 		(void) timeout(sdev_state_timeout, NULL, nticks);
588 		sdev_nc_flush_boot_update();
589 	}
590 }
591 
592 /*
593  * Called to inform the filesystem of progress during boot,
594  * either a notice of reconfiguration boot or an indication of
595  * system boot complete.  At system boot complete, set up a
596  * timer at the expiration of which no further failed lookups
597  * will be added to the negative cache.
598  *
599  * The dev filesystem infers from reconfig boot that implicit
600  * reconfig need not be invoked at all as all available devices
601  * will have already been named.
602  *
603  * The dev filesystem infers from "system available" that devfsadmd
604  * can now be run and hence implicit reconfiguration may be initiated.
605  * During early stages of system startup, implicit reconfig is
606  * not done to avoid impacting boot performance.
607  */
608 void
609 sdev_devstate_change(void)
610 {
611 	int new_state;
612 
613 	/*
614 	 * Track system state and manage interesting transitions
615 	 */
616 	new_state = SDEV_BOOT_STATE_INITIAL;
617 	if (i_ddi_reconfig())
618 		new_state = SDEV_BOOT_STATE_RECONFIG;
619 	if (i_ddi_sysavail())
620 		new_state = SDEV_BOOT_STATE_SYSAVAIL;
621 
622 	if (sdev_boot_state < new_state) {
623 		switch (new_state) {
624 		case SDEV_BOOT_STATE_RECONFIG:
625 			sdcmn_err5(("state change: reconfigure boot\n"));
626 			sdev_boot_state = new_state;
627 			sdev_reconfig_boot = 1;
628 			if (!sdev_nc_disable_reset)
629 				sdev_nc_free_bootonly();
630 			break;
631 		case SDEV_BOOT_STATE_SYSAVAIL:
632 			sdcmn_err5(("system available\n"));
633 			sdev_boot_state = new_state;
634 			sdev_state_sysavail();
635 			break;
636 		}
637 	}
638 }
639 
640 /*
641  * Lookup: filter out entries in the negative cache
642  * Return 1 if the lookup should not cause a reconfig.
643  */
644 int
645 sdev_lookup_filter(sdev_node_t *dv, char *nm)
646 {
647 	int n;
648 	sdev_nc_list_t *ncl = sdev_ncache;
649 	sdev_nc_node_t *lp;
650 	char *path;
651 	int rval = 0;
652 	int changed = 0;
653 
654 	ASSERT(i_ddi_io_initialized());
655 	ASSERT(SDEVTOV(dv)->v_type == VDIR);
656 
657 	if (sdev_nc_disable)
658 		return (0);
659 
660 	n = strlen(dv->sdev_path) + strlen(nm) + 2;
661 	path = kmem_alloc(n, KM_SLEEP);
662 	(void) sprintf(path, "%s/%s", dv->sdev_path, nm);
663 
664 	rw_enter(&ncl->ncl_lock, RW_READER);
665 	if ((lp = sdev_nc_findpath(ncl, path)) != NULL) {
666 		sdcmn_err5(("%s/%s: lookup by %s cached, no reconfig\n",
667 		    dv->sdev_name, nm, curproc->p_user.u_comm));
668 		if (sdev_nc_verbose) {
669 			cmn_err(CE_CONT,
670 			    "?%s/%s: lookup by %s cached, no reconfig\n",
671 			    dv->sdev_name, nm, curproc->p_user.u_comm);
672 		}
673 		mutex_enter(&ncl->ncl_mutex);
674 		lp->ncn_flags |= NCN_ACTIVE;
675 		if (sdev_nc_expirecnt > 0 && lp->ncn_expirecnt > 0 &&
676 		    lp->ncn_expirecnt < sdev_nc_expirecnt) {
677 			lp->ncn_expirecnt = sdev_nc_expirecnt;
678 			ncl->ncl_flags |= NCL_LIST_DIRTY;
679 			changed = 1;
680 		}
681 		mutex_exit(&ncl->ncl_mutex);
682 		rval = 1;
683 	}
684 	rw_exit(&ncl->ncl_lock);
685 	kmem_free(path, n);
686 	if (changed)
687 		sdev_nc_flush_boot_update();
688 	return (rval);
689 }
690 
691 void
692 sdev_lookup_failed(sdev_node_t *dv, char *nm, int failed_flags)
693 {
694 	if (sdev_nc_disable)
695 		return;
696 
697 	/*
698 	 * If we're still in the initial boot stage, always update
699 	 * the cache - we may not have received notice of the
700 	 * reconfig boot state yet.  On a reconfigure boot, entries
701 	 * from the backing store are not re-persisted on update,
702 	 * but new entries are marked as needing an update.
703 	 * Never cache dynamic or non-global nodes.
704 	 */
705 	if (SDEV_IS_GLOBAL(dv) && !SDEV_IS_DYNAMIC(dv) &&
706 	    !SDEV_IS_NO_NCACHE(dv) &&
707 	    ((failed_flags & SLF_NO_NCACHE) == 0) &&
708 	    ((sdev_reconfig_boot &&
709 		(sdev_boot_state != SDEV_BOOT_STATE_COMPLETE)) ||
710 	    (!sdev_reconfig_boot && ((failed_flags & SLF_REBUILT))))) {
711 			sdev_nc_addname(sdev_ncache,
712 			    dv, nm, NCN_SRC_CURRENT|NCN_ACTIVE);
713 	}
714 }
715 
716 static sdev_nc_list_t *
717 sdev_nc_newlist(void)
718 {
719 	sdev_nc_list_t	*ncl;
720 
721 	ncl = kmem_zalloc(sizeof (sdev_nc_list_t), KM_SLEEP);
722 
723 	rw_init(&ncl->ncl_lock, NULL, RW_DEFAULT, NULL);
724 	mutex_init(&ncl->ncl_mutex, NULL, MUTEX_DEFAULT, NULL);
725 	list_create(&ncl->ncl_list, sizeof (sdev_nc_node_t),
726 	    offsetof(sdev_nc_node_t, ncn_link));
727 
728 	return (ncl);
729 }
730 
731 static void
732 sdev_nc_free_unlinked_node(sdev_nc_node_t *lp)
733 {
734 	kmem_free(lp->ncn_name, strlen(lp->ncn_name) + 1);
735 	kmem_free(lp, sizeof (sdev_nc_node_t));
736 }
737 
738 static sdev_nc_node_t *
739 sdev_nc_findpath(sdev_nc_list_t *ncl, char *path)
740 {
741 	sdev_nc_node_t *lp;
742 
743 	ASSERT(RW_LOCK_HELD(&ncl->ncl_lock));
744 
745 	for (lp = list_head(&ncl->ncl_list); lp;
746 	    lp = list_next(&ncl->ncl_list, lp)) {
747 		if (strcmp(path, lp->ncn_name) == 0)
748 			return (lp);
749 	}
750 
751 	return (NULL);
752 }
753 
754 static void
755 sdev_nc_insertnode(sdev_nc_list_t *ncl, sdev_nc_node_t *new)
756 {
757 	sdev_nc_node_t *lp;
758 
759 	rw_enter(&ncl->ncl_lock, RW_WRITER);
760 
761 	lp = sdev_nc_findpath(ncl, new->ncn_name);
762 	if (lp == NULL) {
763 		if (ncl->ncl_nentries == sdev_nc_max_entries) {
764 			sdcmn_err5((
765 			    "%s by %s: not adding to ncache (max %d)\n",
766 			    new->ncn_name, curproc->p_user.u_comm,
767 			    ncl->ncl_nentries));
768 			if (sdev_nc_verbose) {
769 				cmn_err(CE_CONT, "?%s by %s: "
770 				    "not adding to ncache (max %d)\n",
771 				    new->ncn_name, curproc->p_user.u_comm,
772 				    ncl->ncl_nentries);
773 			}
774 			rw_exit(&ncl->ncl_lock);
775 			sdev_nc_free_unlinked_node(new);
776 		} else {
777 
778 			list_insert_tail(&ncl->ncl_list, new);
779 			ncl->ncl_nentries++;
780 
781 			/* don't mark list dirty for nodes from store */
782 			mutex_enter(&ncl->ncl_mutex);
783 			if ((new->ncn_flags & NCN_SRC_STORE) == 0) {
784 				sdcmn_err5(("%s by %s: add to ncache\n",
785 				    new->ncn_name, curproc->p_user.u_comm));
786 				if (sdev_nc_verbose) {
787 					cmn_err(CE_CONT,
788 					    "?%s by %s: add to ncache\n",
789 					    new->ncn_name,
790 					    curproc->p_user.u_comm);
791 				}
792 				ncl->ncl_flags |= NCL_LIST_DIRTY;
793 			}
794 			mutex_exit(&ncl->ncl_mutex);
795 			rw_exit(&ncl->ncl_lock);
796 			lp = new;
797 			sdev_nc_flush_boot_update();
798 		}
799 	} else {
800 		mutex_enter(&ncl->ncl_mutex);
801 		lp->ncn_flags |= new->ncn_flags;
802 		mutex_exit(&ncl->ncl_mutex);
803 		rw_exit(&ncl->ncl_lock);
804 		sdev_nc_free_unlinked_node(new);
805 	}
806 }
807 
808 void
809 sdev_nc_addname(sdev_nc_list_t *ncl, sdev_node_t *dv, char *nm, int flags)
810 {
811 	int n;
812 	sdev_nc_node_t *lp;
813 
814 	ASSERT(SDEVTOV(dv)->v_type == VDIR);
815 
816 	lp = kmem_zalloc(sizeof (sdev_nc_node_t), KM_SLEEP);
817 
818 	n = strlen(dv->sdev_path) + strlen(nm) + 2;
819 	lp->ncn_name = kmem_alloc(n, KM_SLEEP);
820 	(void) sprintf(lp->ncn_name, "%s/%s",
821 		dv->sdev_path, nm);
822 	lp->ncn_flags = flags;
823 	lp->ncn_expirecnt = sdev_nc_expirecnt;
824 	sdev_nc_insertnode(ncl, lp);
825 }
826 
827 void
828 sdev_nc_node_exists(sdev_node_t *dv)
829 {
830 	/* dynamic and non-global nodes are never cached */
831 	if (SDEV_IS_GLOBAL(dv) && !SDEV_IS_DYNAMIC(dv) &&
832 	    !SDEV_IS_NO_NCACHE(dv)) {
833 		sdev_nc_path_exists(sdev_ncache, dv->sdev_path);
834 	}
835 }
836 
837 void
838 sdev_nc_path_exists(sdev_nc_list_t *ncl, char *path)
839 {
840 	sdev_nc_node_t *lp;
841 
842 	if (sdev_nc_disable)
843 		return;
844 
845 	rw_enter(&ncl->ncl_lock, RW_READER);
846 	if ((lp = sdev_nc_findpath(ncl, path)) == NULL) {
847 		rw_exit(&ncl->ncl_lock);
848 		return;
849 	}
850 	if (rw_tryupgrade(&ncl->ncl_lock) == 0) {
851 		rw_exit(&ncl->ncl_lock);
852 		rw_enter(&ncl->ncl_lock, RW_WRITER);
853 		lp = sdev_nc_findpath(ncl, path);
854 	}
855 	if (lp) {
856 		list_remove(&ncl->ncl_list, lp);
857 		ncl->ncl_nentries--;
858 		mutex_enter(&ncl->ncl_mutex);
859 		ncl->ncl_flags |= NCL_LIST_DIRTY;
860 		if (ncl->ncl_flags & NCL_LIST_WENABLE) {
861 			mutex_exit(&ncl->ncl_mutex);
862 			rw_exit(&ncl->ncl_lock);
863 			sdev_nc_flush_updates();
864 		} else {
865 			mutex_exit(&ncl->ncl_mutex);
866 			rw_exit(&ncl->ncl_lock);
867 		}
868 		sdev_nc_free_unlinked_node(lp);
869 		sdcmn_err5(("%s by %s: removed from ncache\n",
870 		    path, curproc->p_user.u_comm));
871 		if (sdev_nc_verbose) {
872 			cmn_err(CE_CONT, "?%s by %s: removed from ncache\n",
873 			    path, curproc->p_user.u_comm);
874 		}
875 	} else
876 		rw_exit(&ncl->ncl_lock);
877 }
878 
879 static void
880 sdev_nc_free_bootonly(void)
881 {
882 	sdev_nc_list_t	*ncl = sdev_ncache;
883 	sdev_nc_node_t *lp;
884 	sdev_nc_node_t *next;
885 
886 	ASSERT(sdev_reconfig_boot);
887 
888 	rw_enter(&ncl->ncl_lock, RW_WRITER);
889 
890 	for (lp = list_head(&ncl->ncl_list); lp; lp = next) {
891 		next = list_next(&ncl->ncl_list, lp);
892 		if ((lp->ncn_flags & NCN_SRC_CURRENT) == 0) {
893 			sdcmn_err5(("freeing %s\n", lp->ncn_name));
894 			mutex_enter(&ncl->ncl_mutex);
895 			ncl->ncl_flags |= NCL_LIST_DIRTY;
896 			mutex_exit(&ncl->ncl_mutex);
897 			list_remove(&ncl->ncl_list, lp);
898 			sdev_nc_free_unlinked_node(lp);
899 			ncl->ncl_nentries--;
900 		}
901 	}
902 
903 	rw_exit(&ncl->ncl_lock);
904 }
905