xref: /titanic_52/usr/src/uts/common/fs/dev/sdev_ncache.c (revision 6185db853e024a486ff8837e6784dd290d866112)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * negative cache handling for the /dev fs
30  */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/t_lock.h>
35 #include <sys/systm.h>
36 #include <sys/sysmacros.h>
37 #include <sys/user.h>
38 #include <sys/time.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/file.h>
42 #include <sys/fcntl.h>
43 #include <sys/flock.h>
44 #include <sys/kmem.h>
45 #include <sys/uio.h>
46 #include <sys/errno.h>
47 #include <sys/stat.h>
48 #include <sys/cred.h>
49 #include <sys/cmn_err.h>
50 #include <sys/debug.h>
51 #include <sys/mode.h>
52 #include <sys/policy.h>
53 #include <fs/fs_subr.h>
54 #include <sys/mount.h>
55 #include <sys/fs/snode.h>
56 #include <sys/fs/dv_node.h>
57 #include <sys/fs/sdev_node.h>
58 #include <sys/sunndi.h>
59 #include <sys/sunmdi.h>
60 #include <sys/ddi.h>
61 #include <sys/modctl.h>
62 #include <sys/devcache.h>
63 
64 
65 /*
66  * ncache is a negative cache of failed lookups.  An entry
67  * is added after an attempt to configure a device by that
68  * name failed.  An accumulation of these entries over time
69  * gives us a set of device names for which implicit reconfiguration
70  * does not need to be attempted.  If a name is created matching
71  * an entry in ncache, that entry is removed, with the
72  * persistent store updated.
73  *
74  * Implicit reconfig is initiated for any name during lookup that
75  * can't be resolved from the backing store and that isn't
76  * present in the negative cache.  This functionality is
77  * enabled during system startup once communication with devfsadm
78  * can be achieved.  Since readdir is more general, implicit
79  * reconfig initiated by reading a directory isn't enabled until
80  * the system is more fully booted, at the time of the multi-user
81  * milestone, corresponding to init state 2.
82  *
83  * A maximum is imposed on the number of entries in the cache
84  * to limit some script going wild and as a defense against attack.
85  * The default limit is 64 and can be adjusted via sdev_nc_max_entries.
86  *
87  * Each entry also has an expiration count.  When a name in the cache
88  * is looked up, its count is reset to the default.  Subsequent boots
89  * will decrement the count if a name isn't referenced.  This permits
90  * a once-only entry to eventually be removed over time.
91  *
92  * sdev_reconfig_delay implements a "debounce" of the timing beyond
93  * system available indication, providing what the filesystem considers
94  * to be the system-is-fully-booted state.  This is provided to adjust
95  * the timing if some application startup is performing a readdir
96  * in /dev that initiates a troublesome implicit reconfig on every boot.
97  *
98  * sdev_nc_disable_reset can be used to disable clearing the negative cache
99  * on reconfig boot.  The default is to clear the cache on reconfig boot.
100  * sdev_nc_disable can be used to disable the negative cache itself.
101  *
102  * sdev_reconfig_disable can be used to disable implicit reconfig.
103  * The default is that implicit reconfig is enabled.
104  */
105 
/* compile-time defaults for the tunables below */
#define	SDEV_NC_EXPIRECNT	4	/* default expiration count */
#define	SDEV_NC_MAX_ENTRIES	64	/* cap on negative cache entries */
#define	SDEV_RECONFIG_DELAY	6	/* seconds */

/*
 * tunables
 *
 * sdev_nc_expirecnt		expiration count given to new/referenced entries
 * sdev_nc_max_entries		maximum number of entries held in the cache
 * sdev_reconfig_delay		debounce delay (seconds) after system available
 * sdev_reconfig_verbose	not referenced in this file
 * sdev_reconfig_disable	disables implicit reconfig; not referenced here
 * sdev_nc_disable		non-zero disables the negative cache itself
 * sdev_nc_disable_reset	non-zero keeps the cache across a reconfig boot
 * sdev_nc_verbose		non-zero logs cache activity via cmn_err()
 * sdev_cache_read_disable	non-zero skips reading the backing store
 * sdev_cache_write_disable	non-zero skips writing the backing store
 */
int	sdev_nc_expirecnt = SDEV_NC_EXPIRECNT;
int	sdev_nc_max_entries = SDEV_NC_MAX_ENTRIES;
int	sdev_reconfig_delay = SDEV_RECONFIG_DELAY;
int	sdev_reconfig_verbose = 0;
int	sdev_reconfig_disable = 0;
int	sdev_nc_disable = 0;
int	sdev_nc_disable_reset = 0;
int	sdev_nc_verbose = 0;
int	sdev_cache_read_disable = 0;
int	sdev_cache_write_disable = 0;
122 
123 /* globals */
124 int	sdev_boot_state = SDEV_BOOT_STATE_INITIAL;
125 int	sdev_reconfig_boot = 0;
126 sdev_nc_list_t *sdev_ncache;
127 static timeout_id_t sdev_timeout_id = 0;
128 static nvf_handle_t sdevfd_handle;
129 
130 /* static prototypes */
131 static void sdev_ncache_write_complete(nvf_handle_t);
132 static void sdev_ncache_write(void);
133 static void sdev_ncache_process_store(void);
134 static sdev_nc_list_t *sdev_nc_newlist(void);
135 static void sdev_nc_free_unlinked_node(sdev_nc_node_t *);
136 static void sdev_nc_free_all_nodes(sdev_nc_list_t *);
137 static void sdev_nc_freelist(sdev_nc_list_t *);
138 static sdev_nc_node_t *sdev_nc_findpath(sdev_nc_list_t *, char *);
139 static void sdev_nc_insertnode(sdev_nc_list_t *, sdev_nc_node_t *);
140 static void sdev_nc_free_bootonly(void);
141 static int sdev_ncache_unpack_nvlist(nvf_handle_t, nvlist_t *, char *);
142 static int sdev_ncache_pack_list(nvf_handle_t, nvlist_t **);
143 static void sdev_ncache_list_free(nvf_handle_t);
144 static void sdev_nvp_free(nvp_devname_t *);
145 
146 /*
147  * Registration for /etc/devices/devname_cache
148  */
149 static nvf_ops_t sdev_cache_ops = {
150 	"/etc/devices/devname_cache",		/* path to cache */
151 	sdev_ncache_unpack_nvlist,		/* read: unpack nvlist */
152 	sdev_ncache_pack_list,			/* write: pack list */
153 	sdev_ncache_list_free,			/* free data list */
154 	sdev_ncache_write_complete		/* write complete callback */
155 };
156 
157 /*
158  * called once at filesystem initialization
159  */
160 void
161 sdev_ncache_init(void)
162 {
163 	sdev_ncache = sdev_nc_newlist();
164 }
165 
166 /*
167  * called at mount of the global instance
168  * currently the global instance is never unmounted
169  */
170 void
171 sdev_ncache_setup(void)
172 {
173 	sdevfd_handle = nvf_register_file(&sdev_cache_ops);
174 	ASSERT(sdevfd_handle);
175 
176 	list_create(nvf_list(sdevfd_handle), sizeof (nvp_devname_t),
177 	    offsetof(nvp_devname_t, nvp_link));
178 
179 	rw_enter(nvf_lock(sdevfd_handle), RW_WRITER);
180 	if (!sdev_cache_read_disable) {
181 		(void) nvf_read_file(sdevfd_handle);
182 	}
183 	sdev_ncache_process_store();
184 	rw_exit(nvf_lock(sdevfd_handle));
185 
186 	sdev_devstate_change();
187 }
188 
189 static void
190 sdev_nvp_free(nvp_devname_t *dp)
191 {
192 	int	i;
193 	char	**p;
194 
195 	if (dp->nvp_npaths > 0) {
196 		p = dp->nvp_paths;
197 		for (i = 0; i < dp->nvp_npaths; i++, p++) {
198 			kmem_free(*p, strlen(*p)+1);
199 		}
200 		kmem_free(dp->nvp_paths,
201 			dp->nvp_npaths * sizeof (char *));
202 		kmem_free(dp->nvp_expirecnts,
203 			dp->nvp_npaths * sizeof (int));
204 	}
205 
206 	kmem_free(dp, sizeof (nvp_devname_t));
207 }
208 
209 static void
210 sdev_ncache_list_free(nvf_handle_t fd)
211 {
212 	list_t		*listp;
213 	nvp_devname_t	*dp;
214 
215 	ASSERT(fd == sdevfd_handle);
216 	ASSERT(RW_WRITE_HELD(nvf_lock(fd)));
217 
218 	listp = nvf_list(fd);
219 	if ((dp = list_head(listp)) != NULL) {
220 		list_remove(listp, dp);
221 		sdev_nvp_free(dp);
222 	}
223 }
224 
225 /*
226  * Unpack a device path/nvlist pair to internal data list format.
227  * Used to decode the nvlist format into the internal representation
228  * when reading /etc/devices/devname_cache.
229  * Note that the expiration counts are optional, for compatibility
230  * with earlier instances of the cache.  If not present, the
231  * expire counts are initialized to defaults.
232  */
233 static int
234 sdev_ncache_unpack_nvlist(nvf_handle_t fd, nvlist_t *nvl, char *name)
235 {
236 	nvp_devname_t *np;
237 	char	**strs;
238 	int	*cnts;
239 	uint_t	nstrs, ncnts;
240 	int	rval, i;
241 
242 	ASSERT(fd == sdevfd_handle);
243 	ASSERT(RW_WRITE_HELD(nvf_lock(fd)));
244 
245 	/* name of the sublist must match what we created */
246 	if (strcmp(name, DP_DEVNAME_ID) != 0) {
247 		return (-1);
248 	}
249 
250 	np = kmem_zalloc(sizeof (nvp_devname_t), KM_SLEEP);
251 
252 	rval = nvlist_lookup_string_array(nvl,
253 	    DP_DEVNAME_NCACHE_ID, &strs, &nstrs);
254 	if (rval) {
255 		kmem_free(np, sizeof (nvp_devname_t));
256 		return (-1);
257 	}
258 
259 	np->nvp_npaths = nstrs;
260 	np->nvp_paths = kmem_zalloc(nstrs * sizeof (char *), KM_SLEEP);
261 	for (i = 0; i < nstrs; i++) {
262 		np->nvp_paths[i] = i_ddi_strdup(strs[i], KM_SLEEP);
263 	}
264 	np->nvp_expirecnts = kmem_zalloc(nstrs * sizeof (int), KM_SLEEP);
265 	for (i = 0; i < nstrs; i++) {
266 		np->nvp_expirecnts[i] = sdev_nc_expirecnt;
267 	}
268 
269 	rval = nvlist_lookup_int32_array(nvl,
270 	    DP_DEVNAME_NC_EXPIRECNT_ID, &cnts, &ncnts);
271 	if (rval == 0) {
272 		ASSERT(ncnts == nstrs);
273 		ncnts = min(ncnts, nstrs);
274 		for (i = 0; i < nstrs; i++) {
275 			np->nvp_expirecnts[i] = cnts[i];
276 		}
277 	}
278 
279 	list_insert_tail(nvf_list(sdevfd_handle), np);
280 
281 	return (0);
282 }
283 
284 /*
285  * Pack internal format cache data to a single nvlist.
286  * Used when writing the nvlist file.
287  * Note this is called indirectly by the nvpflush daemon.
288  */
289 static int
290 sdev_ncache_pack_list(nvf_handle_t fd, nvlist_t **ret_nvl)
291 {
292 	nvlist_t	*nvl, *sub_nvl;
293 	nvp_devname_t	*np;
294 	int		rval;
295 	list_t		*listp;
296 
297 	ASSERT(fd == sdevfd_handle);
298 	ASSERT(RW_WRITE_HELD(nvf_lock(fd)));
299 
300 	rval = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
301 	if (rval != 0) {
302 		nvf_error("%s: nvlist alloc error %d\n",
303 			nvf_cache_name(fd), rval);
304 		return (DDI_FAILURE);
305 	}
306 
307 	listp = nvf_list(sdevfd_handle);
308 	if ((np = list_head(listp)) != NULL) {
309 		ASSERT(list_next(listp, np) == NULL);
310 
311 		rval = nvlist_alloc(&sub_nvl, NV_UNIQUE_NAME, KM_SLEEP);
312 		if (rval != 0) {
313 			nvf_error("%s: nvlist alloc error %d\n",
314 				nvf_cache_name(fd), rval);
315 			sub_nvl = NULL;
316 			goto err;
317 		}
318 
319 		rval = nvlist_add_string_array(sub_nvl,
320 		    DP_DEVNAME_NCACHE_ID, np->nvp_paths, np->nvp_npaths);
321 		if (rval != 0) {
322 			nvf_error("%s: nvlist add error %d (sdev)\n",
323 			    nvf_cache_name(fd), rval);
324 			goto err;
325 		}
326 
327 		rval = nvlist_add_int32_array(sub_nvl,
328 		    DP_DEVNAME_NC_EXPIRECNT_ID,
329 		    np->nvp_expirecnts, np->nvp_npaths);
330 		if (rval != 0) {
331 			nvf_error("%s: nvlist add error %d (sdev)\n",
332 			    nvf_cache_name(fd), rval);
333 			goto err;
334 		}
335 
336 		rval = nvlist_add_nvlist(nvl, DP_DEVNAME_ID, sub_nvl);
337 		if (rval != 0) {
338 			nvf_error("%s: nvlist add error %d (sublist)\n",
339 			    nvf_cache_name(fd), rval);
340 			goto err;
341 		}
342 		nvlist_free(sub_nvl);
343 	}
344 
345 	*ret_nvl = nvl;
346 	return (DDI_SUCCESS);
347 
348 err:
349 	if (sub_nvl)
350 		nvlist_free(sub_nvl);
351 	nvlist_free(nvl);
352 	*ret_nvl = NULL;
353 	return (DDI_FAILURE);
354 }
355 
356 /*
357  * Run through the data read from the backing cache store
358  * to establish the initial state of the neg. cache.
359  */
360 static void
361 sdev_ncache_process_store(void)
362 {
363 	sdev_nc_list_t	*ncl = sdev_ncache;
364 	nvp_devname_t	*np;
365 	sdev_nc_node_t	*lp;
366 	char		*path;
367 	int		i, n;
368 	list_t		*listp;
369 
370 	if (sdev_nc_disable)
371 		return;
372 
373 	ASSERT(RW_WRITE_HELD(nvf_lock(sdevfd_handle)));
374 
375 	listp = nvf_list(sdevfd_handle);
376 	for (np = list_head(listp); np; np = list_next(listp, np)) {
377 		for (i = 0; i < np->nvp_npaths; i++) {
378 			sdcmn_err5(("    %s %d\n",
379 			    np->nvp_paths[i], np->nvp_expirecnts[i]));
380 			if (ncl->ncl_nentries < sdev_nc_max_entries) {
381 				path = np->nvp_paths[i];
382 				n = strlen(path) + 1;
383 				lp = kmem_alloc(sizeof (sdev_nc_node_t),
384 				    KM_SLEEP);
385 				lp->ncn_name = kmem_alloc(n, KM_SLEEP);
386 				bcopy(path, lp->ncn_name, n);
387 				lp->ncn_flags = NCN_SRC_STORE;
388 				lp->ncn_expirecnt = np->nvp_expirecnts[i];
389 				sdev_nc_insertnode(ncl, lp);
390 			} else if (sdev_nc_verbose) {
391 				cmn_err(CE_CONT,
392 				    "?%s: truncating from ncache (max %d)\n",
393 				    np->nvp_paths[i], sdev_nc_max_entries);
394 			}
395 		}
396 	}
397 }
398 
399 /*
400  * called by nvpflush daemon to inform us that an update of
401  * the cache file has been completed.
402  */
403 static void
404 sdev_ncache_write_complete(nvf_handle_t fd)
405 {
406 	sdev_nc_list_t	*ncl = sdev_ncache;
407 
408 	ASSERT(fd == sdevfd_handle);
409 
410 	mutex_enter(&ncl->ncl_mutex);
411 
412 	ASSERT(ncl->ncl_flags & NCL_LIST_WRITING);
413 
414 	if (ncl->ncl_flags & NCL_LIST_DIRTY) {
415 		sdcmn_err5(("ncache write complete but dirty again\n"));
416 		ncl->ncl_flags &= ~NCL_LIST_DIRTY;
417 		mutex_exit(&ncl->ncl_mutex);
418 		sdev_ncache_write();
419 	} else {
420 		sdcmn_err5(("ncache write complete\n"));
421 		ncl->ncl_flags &= ~NCL_LIST_WRITING;
422 		mutex_exit(&ncl->ncl_mutex);
423 		rw_enter(nvf_lock(fd), RW_WRITER);
424 		sdev_ncache_list_free(fd);
425 		rw_exit(nvf_lock(fd));
426 	}
427 }
428 
429 /*
430  * Prepare to perform an update of the neg. cache backing store.
431  */
432 static void
433 sdev_ncache_write(void)
434 {
435 	sdev_nc_list_t	*ncl = sdev_ncache;
436 	nvp_devname_t	*np;
437 	sdev_nc_node_t	*lp;
438 	int		n, i;
439 
440 	if (sdev_cache_write_disable) {
441 		mutex_enter(&ncl->ncl_mutex);
442 		ncl->ncl_flags &= ~NCL_LIST_WRITING;
443 		mutex_exit(&ncl->ncl_mutex);
444 		return;
445 	}
446 
447 	/* proper lock ordering here is essential */
448 	rw_enter(nvf_lock(sdevfd_handle), RW_WRITER);
449 	sdev_ncache_list_free(sdevfd_handle);
450 
451 	rw_enter(&ncl->ncl_lock, RW_READER);
452 	n = ncl->ncl_nentries;
453 	ASSERT(n <= sdev_nc_max_entries);
454 
455 	np = kmem_zalloc(sizeof (nvp_devname_t), KM_SLEEP);
456 	np->nvp_npaths = n;
457 	np->nvp_paths = kmem_zalloc(n * sizeof (char *), KM_SLEEP);
458 	np->nvp_expirecnts = kmem_zalloc(n * sizeof (int), KM_SLEEP);
459 
460 	i = 0;
461 	for (lp = list_head(&ncl->ncl_list); lp;
462 	    lp = list_next(&ncl->ncl_list, lp)) {
463 		np->nvp_paths[i] = i_ddi_strdup(lp->ncn_name, KM_SLEEP);
464 		np->nvp_expirecnts[i] = lp->ncn_expirecnt;
465 		sdcmn_err5(("    %s %d\n",
466 		    np->nvp_paths[i], np->nvp_expirecnts[i]));
467 		i++;
468 	}
469 
470 	rw_exit(&ncl->ncl_lock);
471 
472 	nvf_mark_dirty(sdevfd_handle);
473 	list_insert_tail(nvf_list(sdevfd_handle), np);
474 	rw_exit(nvf_lock(sdevfd_handle));
475 
476 	nvf_wake_daemon();
477 }
478 
479 static void
480 sdev_nc_flush_updates(void)
481 {
482 	sdev_nc_list_t *ncl = sdev_ncache;
483 
484 	if (sdev_nc_disable || sdev_cache_write_disable)
485 		return;
486 
487 	mutex_enter(&ncl->ncl_mutex);
488 	if (((ncl->ncl_flags &
489 	    (NCL_LIST_DIRTY | NCL_LIST_WENABLE | NCL_LIST_WRITING)) ==
490 	    (NCL_LIST_DIRTY | NCL_LIST_WENABLE))) {
491 		ncl->ncl_flags &= ~NCL_LIST_DIRTY;
492 		ncl->ncl_flags |= NCL_LIST_WRITING;
493 		mutex_exit(&ncl->ncl_mutex);
494 		sdev_ncache_write();
495 	} else {
496 		mutex_exit(&ncl->ncl_mutex);
497 	}
498 }
499 
500 static void
501 sdev_nc_flush_boot_update(void)
502 {
503 	sdev_nc_list_t *ncl = sdev_ncache;
504 
505 	if (sdev_nc_disable || sdev_cache_write_disable ||
506 	    (sdev_boot_state == SDEV_BOOT_STATE_INITIAL)) {
507 		return;
508 	}
509 	mutex_enter(&ncl->ncl_mutex);
510 	if (ncl->ncl_flags & NCL_LIST_WENABLE) {
511 		mutex_exit(&ncl->ncl_mutex);
512 		sdev_nc_flush_updates();
513 	} else {
514 		mutex_exit(&ncl->ncl_mutex);
515 	}
516 
517 }
518 
519 static void
520 sdev_state_boot_complete()
521 {
522 	sdev_nc_list_t	*ncl = sdev_ncache;
523 	sdev_nc_node_t	*lp, *next;
524 
525 	/*
526 	 * Once boot is complete, decrement the expire count of each entry
527 	 * in the cache not touched by a reference.  Remove any that
528 	 * goes to zero.  This effectively removes random entries over
529 	 * time.
530 	 */
531 	rw_enter(&ncl->ncl_lock, RW_WRITER);
532 	mutex_enter(&ncl->ncl_mutex);
533 
534 	for (lp = list_head(&ncl->ncl_list); lp; lp = next) {
535 		next = list_next(&ncl->ncl_list, lp);
536 		if (sdev_nc_expirecnt > 0 && lp->ncn_expirecnt > 0) {
537 			if (lp->ncn_flags & NCN_ACTIVE) {
538 				if (lp->ncn_expirecnt != sdev_nc_expirecnt) {
539 					lp->ncn_expirecnt = sdev_nc_expirecnt;
540 					ncl->ncl_flags |= NCL_LIST_DIRTY;
541 				}
542 			} else {
543 				if (--lp->ncn_expirecnt == 0) {
544 					list_remove(&ncl->ncl_list, lp);
545 					sdev_nc_free_unlinked_node(lp);
546 					ncl->ncl_nentries--;
547 				}
548 				ncl->ncl_flags |= NCL_LIST_DIRTY;
549 			}
550 		}
551 	}
552 
553 	mutex_exit(&ncl->ncl_mutex);
554 	rw_exit(&ncl->ncl_lock);
555 
556 	sdev_nc_flush_boot_update();
557 	sdev_boot_state = SDEV_BOOT_STATE_COMPLETE;
558 }
559 
560 /*
561  * Upon transition to the login state on a reconfigure boot,
562  * a debounce timer is set up so that we cache all the nonsense
563  * lookups we're hit with by the windowing system startup.
564  */
565 
566 /*ARGSUSED*/
567 static void
568 sdev_state_timeout(void *arg)
569 {
570 	sdev_timeout_id = 0;
571 	sdev_state_boot_complete();
572 }
573 
574 static void
575 sdev_state_sysavail()
576 {
577 	sdev_nc_list_t *ncl = sdev_ncache;
578 	clock_t	nticks;
579 	int nsecs;
580 
581 	mutex_enter(&ncl->ncl_mutex);
582 	ncl->ncl_flags |= NCL_LIST_WENABLE;
583 	mutex_exit(&ncl->ncl_mutex);
584 
585 	nsecs = sdev_reconfig_delay;
586 	if (nsecs == 0) {
587 		sdev_state_boot_complete();
588 	} else {
589 		nticks = drv_usectohz(1000000 * nsecs);
590 		sdcmn_err5(("timeout initiated %ld\n", nticks));
591 		sdev_timeout_id = timeout(sdev_state_timeout, NULL, nticks);
592 		sdev_nc_flush_boot_update();
593 	}
594 }
595 
596 /*
597  * Called to inform the filesystem of progress during boot,
598  * either a notice of reconfiguration boot or an indication of
599  * system boot complete.  At system boot complete, set up a
600  * timer at the expiration of which no further failed lookups
601  * will be added to the negative cache.
602  *
603  * The dev filesystem infers from reconfig boot that implicit
604  * reconfig need not be invoked at all as all available devices
605  * will have already been named.
606  *
607  * The dev filesystem infers from "system available" that devfsadmd
608  * can now be run and hence implicit reconfiguration may be initiated.
609  * During early stages of system startup, implicit reconfig is
610  * not done to avoid impacting boot performance.
611  */
612 void
613 sdev_devstate_change(void)
614 {
615 	int new_state;
616 
617 	/*
618 	 * Track system state and manage interesting transitions
619 	 */
620 	new_state = SDEV_BOOT_STATE_INITIAL;
621 	if (i_ddi_reconfig())
622 		new_state = SDEV_BOOT_STATE_RECONFIG;
623 	if (i_ddi_sysavail())
624 		new_state = SDEV_BOOT_STATE_SYSAVAIL;
625 
626 	if (sdev_boot_state < new_state) {
627 		switch (new_state) {
628 		case SDEV_BOOT_STATE_RECONFIG:
629 			sdcmn_err5(("state change: reconfigure boot\n"));
630 			sdev_boot_state = new_state;
631 			sdev_reconfig_boot = 1;
632 			if (!sdev_nc_disable_reset)
633 				sdev_nc_free_bootonly();
634 			break;
635 		case SDEV_BOOT_STATE_SYSAVAIL:
636 			sdcmn_err5(("system available\n"));
637 			sdev_boot_state = new_state;
638 			sdev_state_sysavail();
639 			break;
640 		}
641 	}
642 }
643 
644 /*
645  * Lookup: filter out entries in the negative cache
646  * Return 1 if the lookup should not cause a reconfig.
647  */
648 int
649 sdev_lookup_filter(sdev_node_t *dv, char *nm)
650 {
651 	int n;
652 	sdev_nc_list_t *ncl = sdev_ncache;
653 	sdev_nc_node_t *lp;
654 	char *path;
655 	int rval = 0;
656 	int changed = 0;
657 
658 	ASSERT(i_ddi_io_initialized());
659 	ASSERT(SDEVTOV(dv)->v_type == VDIR);
660 
661 	if (sdev_nc_disable)
662 		return (0);
663 
664 	n = strlen(dv->sdev_path) + strlen(nm) + 2;
665 	path = kmem_alloc(n, KM_SLEEP);
666 	(void) sprintf(path, "%s/%s", dv->sdev_path, nm);
667 
668 	rw_enter(&ncl->ncl_lock, RW_READER);
669 	if ((lp = sdev_nc_findpath(ncl, path)) != NULL) {
670 		sdcmn_err5(("%s/%s: lookup by %s cached, no reconfig\n",
671 		    dv->sdev_name, nm, curproc->p_user.u_comm));
672 		if (sdev_nc_verbose) {
673 			cmn_err(CE_CONT,
674 			    "?%s/%s: lookup by %s cached, no reconfig\n",
675 			    dv->sdev_name, nm, curproc->p_user.u_comm);
676 		}
677 		mutex_enter(&ncl->ncl_mutex);
678 		lp->ncn_flags |= NCN_ACTIVE;
679 		if (sdev_nc_expirecnt > 0 && lp->ncn_expirecnt > 0 &&
680 		    lp->ncn_expirecnt < sdev_nc_expirecnt) {
681 			lp->ncn_expirecnt = sdev_nc_expirecnt;
682 			ncl->ncl_flags |= NCL_LIST_DIRTY;
683 			changed = 1;
684 		}
685 		mutex_exit(&ncl->ncl_mutex);
686 		rval = 1;
687 	}
688 	rw_exit(&ncl->ncl_lock);
689 	kmem_free(path, n);
690 	if (changed)
691 		sdev_nc_flush_boot_update();
692 	return (rval);
693 }
694 
695 void
696 sdev_lookup_failed(sdev_node_t *dv, char *nm, int failed_flags)
697 {
698 	if (sdev_nc_disable)
699 		return;
700 
701 	/*
702 	 * If we're still in the initial boot stage, always update
703 	 * the cache - we may not have received notice of the
704 	 * reconfig boot state yet.  On a reconfigure boot, entries
705 	 * from the backing store are not re-persisted on update,
706 	 * but new entries are marked as needing an update.
707 	 * Never cache dynamic or non-global nodes.
708 	 */
709 	if (SDEV_IS_GLOBAL(dv) && !SDEV_IS_DYNAMIC(dv) &&
710 	    !SDEV_IS_NO_NCACHE(dv) &&
711 	    ((failed_flags & SLF_NO_NCACHE) == 0) &&
712 	    ((sdev_reconfig_boot &&
713 		(sdev_boot_state != SDEV_BOOT_STATE_COMPLETE)) ||
714 	    (!sdev_reconfig_boot && ((failed_flags & SLF_REBUILT))))) {
715 			sdev_nc_addname(sdev_ncache,
716 			    dv, nm, NCN_SRC_CURRENT|NCN_ACTIVE);
717 	}
718 }
719 
720 static sdev_nc_list_t *
721 sdev_nc_newlist(void)
722 {
723 	sdev_nc_list_t	*ncl;
724 
725 	ncl = kmem_zalloc(sizeof (sdev_nc_list_t), KM_SLEEP);
726 
727 	rw_init(&ncl->ncl_lock, NULL, RW_DEFAULT, NULL);
728 	mutex_init(&ncl->ncl_mutex, NULL, MUTEX_DEFAULT, NULL);
729 	list_create(&ncl->ncl_list, sizeof (sdev_nc_node_t),
730 	    offsetof(sdev_nc_node_t, ncn_link));
731 
732 	return (ncl);
733 }
734 
735 static void
736 sdev_nc_free_unlinked_node(sdev_nc_node_t *lp)
737 {
738 	kmem_free(lp->ncn_name, strlen(lp->ncn_name) + 1);
739 	kmem_free(lp, sizeof (sdev_nc_node_t));
740 }
741 
742 static void
743 sdev_nc_free_all_nodes(sdev_nc_list_t *ncl)
744 {
745 	sdev_nc_node_t *lp;
746 
747 	while ((lp = list_head(&ncl->ncl_list)) != NULL) {
748 		list_remove(&ncl->ncl_list, lp);
749 		sdev_nc_free_unlinked_node(lp);
750 		ncl->ncl_nentries--;
751 	}
752 	ASSERT(ncl->ncl_nentries == 0);
753 }
754 
755 static void
756 sdev_nc_freelist(sdev_nc_list_t *ncl)
757 {
758 	if (!list_is_empty(&ncl->ncl_list))
759 		sdev_nc_free_all_nodes(ncl);
760 	ASSERT(list_is_empty(&ncl->ncl_list));
761 	ASSERT(ncl->ncl_nentries == 0);
762 
763 	mutex_destroy(&ncl->ncl_mutex);
764 	rw_destroy(&ncl->ncl_lock);
765 	list_destroy(&ncl->ncl_list);
766 	kmem_free(ncl, sizeof (sdev_nc_list_t));
767 }
768 
769 static sdev_nc_node_t *
770 sdev_nc_findpath(sdev_nc_list_t *ncl, char *path)
771 {
772 	sdev_nc_node_t *lp;
773 
774 	ASSERT(RW_LOCK_HELD(&ncl->ncl_lock));
775 
776 	for (lp = list_head(&ncl->ncl_list); lp;
777 	    lp = list_next(&ncl->ncl_list, lp)) {
778 		if (strcmp(path, lp->ncn_name) == 0)
779 			return (lp);
780 	}
781 
782 	return (NULL);
783 }
784 
785 static void
786 sdev_nc_insertnode(sdev_nc_list_t *ncl, sdev_nc_node_t *new)
787 {
788 	sdev_nc_node_t *lp;
789 
790 	rw_enter(&ncl->ncl_lock, RW_WRITER);
791 
792 	lp = sdev_nc_findpath(ncl, new->ncn_name);
793 	if (lp == NULL) {
794 		if (ncl->ncl_nentries == sdev_nc_max_entries) {
795 			sdcmn_err5((
796 			    "%s by %s: not adding to ncache (max %d)\n",
797 			    new->ncn_name, curproc->p_user.u_comm,
798 			    ncl->ncl_nentries));
799 			if (sdev_nc_verbose) {
800 				cmn_err(CE_CONT, "?%s by %s: "
801 				    "not adding to ncache (max %d)\n",
802 				    new->ncn_name, curproc->p_user.u_comm,
803 				    ncl->ncl_nentries);
804 			}
805 			rw_exit(&ncl->ncl_lock);
806 			sdev_nc_free_unlinked_node(new);
807 		} else {
808 
809 			list_insert_tail(&ncl->ncl_list, new);
810 			ncl->ncl_nentries++;
811 
812 			/* don't mark list dirty for nodes from store */
813 			mutex_enter(&ncl->ncl_mutex);
814 			if ((new->ncn_flags & NCN_SRC_STORE) == 0) {
815 				sdcmn_err5(("%s by %s: add to ncache\n",
816 				    new->ncn_name, curproc->p_user.u_comm));
817 				if (sdev_nc_verbose) {
818 					cmn_err(CE_CONT,
819 					    "?%s by %s: add to ncache\n",
820 					    new->ncn_name,
821 					    curproc->p_user.u_comm);
822 				}
823 				ncl->ncl_flags |= NCL_LIST_DIRTY;
824 			}
825 			mutex_exit(&ncl->ncl_mutex);
826 			rw_exit(&ncl->ncl_lock);
827 			lp = new;
828 			sdev_nc_flush_boot_update();
829 		}
830 	} else {
831 		mutex_enter(&ncl->ncl_mutex);
832 		lp->ncn_flags |= new->ncn_flags;
833 		mutex_exit(&ncl->ncl_mutex);
834 		rw_exit(&ncl->ncl_lock);
835 		sdev_nc_free_unlinked_node(new);
836 	}
837 }
838 
839 void
840 sdev_nc_addname(sdev_nc_list_t *ncl, sdev_node_t *dv, char *nm, int flags)
841 {
842 	int n;
843 	sdev_nc_node_t *lp;
844 
845 	ASSERT(SDEVTOV(dv)->v_type == VDIR);
846 
847 	lp = kmem_zalloc(sizeof (sdev_nc_node_t), KM_SLEEP);
848 
849 	n = strlen(dv->sdev_path) + strlen(nm) + 2;
850 	lp->ncn_name = kmem_alloc(n, KM_SLEEP);
851 	(void) sprintf(lp->ncn_name, "%s/%s",
852 		dv->sdev_path, nm);
853 	lp->ncn_flags = flags;
854 	lp->ncn_expirecnt = sdev_nc_expirecnt;
855 	sdev_nc_insertnode(ncl, lp);
856 }
857 
858 void
859 sdev_nc_node_exists(sdev_node_t *dv)
860 {
861 	/* dynamic and non-global nodes are never cached */
862 	if (SDEV_IS_GLOBAL(dv) && !SDEV_IS_DYNAMIC(dv) &&
863 	    !SDEV_IS_NO_NCACHE(dv)) {
864 		sdev_nc_path_exists(sdev_ncache, dv->sdev_path);
865 	}
866 }
867 
868 void
869 sdev_nc_path_exists(sdev_nc_list_t *ncl, char *path)
870 {
871 	sdev_nc_node_t *lp;
872 
873 	if (sdev_nc_disable)
874 		return;
875 
876 	rw_enter(&ncl->ncl_lock, RW_READER);
877 	if ((lp = sdev_nc_findpath(ncl, path)) == NULL) {
878 		rw_exit(&ncl->ncl_lock);
879 		return;
880 	}
881 	if (rw_tryupgrade(&ncl->ncl_lock) == 0) {
882 		rw_exit(&ncl->ncl_lock);
883 		rw_enter(&ncl->ncl_lock, RW_WRITER);
884 		lp = sdev_nc_findpath(ncl, path);
885 	}
886 	if (lp) {
887 		list_remove(&ncl->ncl_list, lp);
888 		ncl->ncl_nentries--;
889 		mutex_enter(&ncl->ncl_mutex);
890 		ncl->ncl_flags |= NCL_LIST_DIRTY;
891 		if (ncl->ncl_flags & NCL_LIST_WENABLE) {
892 			mutex_exit(&ncl->ncl_mutex);
893 			rw_exit(&ncl->ncl_lock);
894 			sdev_nc_flush_updates();
895 		} else {
896 			mutex_exit(&ncl->ncl_mutex);
897 			rw_exit(&ncl->ncl_lock);
898 		}
899 		sdev_nc_free_unlinked_node(lp);
900 		sdcmn_err5(("%s by %s: removed from ncache\n",
901 		    path, curproc->p_user.u_comm));
902 		if (sdev_nc_verbose) {
903 			cmn_err(CE_CONT, "?%s by %s: removed from ncache\n",
904 			    path, curproc->p_user.u_comm);
905 		}
906 	} else
907 		rw_exit(&ncl->ncl_lock);
908 }
909 
910 static void
911 sdev_nc_free_bootonly(void)
912 {
913 	sdev_nc_list_t	*ncl = sdev_ncache;
914 	sdev_nc_node_t *lp;
915 	sdev_nc_node_t *next;
916 
917 	ASSERT(sdev_reconfig_boot);
918 
919 	rw_enter(&ncl->ncl_lock, RW_WRITER);
920 
921 	for (lp = list_head(&ncl->ncl_list); lp; lp = next) {
922 		next = list_next(&ncl->ncl_list, lp);
923 		if ((lp->ncn_flags & NCN_SRC_CURRENT) == 0) {
924 			sdcmn_err5(("freeing %s\n", lp->ncn_name));
925 			mutex_enter(&ncl->ncl_mutex);
926 			ncl->ncl_flags |= NCL_LIST_DIRTY;
927 			mutex_exit(&ncl->ncl_mutex);
928 			list_remove(&ncl->ncl_list, lp);
929 			sdev_nc_free_unlinked_node(lp);
930 			ncl->ncl_nentries--;
931 		}
932 	}
933 
934 	rw_exit(&ncl->ncl_lock);
935 }
936