xref: /titanic_52/usr/src/uts/common/fs/dev/sdev_ncache.c (revision 9b241b4ed1cf882400b069ff9853cdd310d469bf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * negative cache handling for the /dev fs
28  */
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/t_lock.h>
33 #include <sys/systm.h>
34 #include <sys/sysmacros.h>
35 #include <sys/user.h>
36 #include <sys/time.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/file.h>
40 #include <sys/fcntl.h>
41 #include <sys/flock.h>
42 #include <sys/kmem.h>
43 #include <sys/uio.h>
44 #include <sys/errno.h>
45 #include <sys/stat.h>
46 #include <sys/cred.h>
47 #include <sys/cmn_err.h>
48 #include <sys/debug.h>
49 #include <sys/mode.h>
50 #include <sys/policy.h>
51 #include <fs/fs_subr.h>
52 #include <sys/mount.h>
53 #include <sys/fs/snode.h>
54 #include <sys/fs/dv_node.h>
55 #include <sys/fs/sdev_impl.h>
56 #include <sys/sunndi.h>
57 #include <sys/sunmdi.h>
58 #include <sys/ddi.h>
59 #include <sys/modctl.h>
60 #include <sys/devcache.h>
61 
62 
63 /*
64  * ncache is a negative cache of failed lookups.  An entry
65  * is added after an attempt to configure a device by that
66  * name failed.  An accumulation of these entries over time
 * gives us a set of device names for which implicit reconfiguration
68  * does not need to be attempted.  If a name is created matching
69  * an entry in ncache, that entry is removed, with the
70  * persistent store updated.
71  *
72  * Implicit reconfig is initiated for any name during lookup that
73  * can't be resolved from the backing store and that isn't
74  * present in the negative cache.  This functionality is
75  * enabled during system startup once communication with devfsadm
76  * can be achieved.  Since readdir is more general, implicit
77  * reconfig initiated by reading a directory isn't enabled until
78  * the system is more fully booted, at the time of the multi-user
79  * milestone, corresponding to init state 2.
80  *
81  * A maximum is imposed on the number of entries in the cache
82  * to limit some script going wild and as a defense against attack.
83  * The default limit is 64 and can be adjusted via sdev_nc_max_entries.
84  *
 * Each entry also has an expiration count.  When a name in the cache
 * is looked up, its count is reset to the default.  Subsequent boots
 * will decrement the count if a name isn't referenced.  This permits
 * a once-only entry to eventually be removed over time.
89  *
90  * sdev_reconfig_delay implements a "debounce" of the timing beyond
91  * system available indication, providing what the filesystem considers
92  * to be the system-is-fully-booted state.  This is provided to adjust
93  * the timing if some application startup is performing a readdir
94  * in /dev that initiates a troublesome implicit reconfig on every boot.
95  *
96  * sdev_nc_disable_reset can be used to disable clearing the negative cache
97  * on reconfig boot.  The default is to clear the cache on reconfig boot.
98  * sdev_nc_disable can be used to disable the negative cache itself.
99  *
100  * sdev_reconfig_disable can be used to disable implicit reconfig.
101  * The default is that implicit reconfig is enabled.
102  */
103 
/*
 * tunables and defaults
 *
 * Compile-time defaults used to initialize the run-time tunables below.
 */
#define	SDEV_NC_EXPIRECNT	4
#define	SDEV_NC_MAX_ENTRIES	64
#define	SEV_RECONFIG_DELAY	6	/* seconds */

/* tunables */
/* expire count assigned to a new or re-referenced ncache entry */
int	sdev_nc_expirecnt = SDEV_NC_EXPIRECNT;
/* limit on the number of entries kept in the ncache */
int	sdev_nc_max_entries = SDEV_NC_MAX_ENTRIES;
/* seconds between "system available" and boot being deemed complete */
int	sdev_reconfig_delay = SEV_RECONFIG_DELAY;
/* not referenced by the code visible in this file */
int	sdev_reconfig_verbose = 0;
/* documented above as disabling implicit reconfig; not read in this file */
int	sdev_reconfig_disable = 0;
/* non-zero disables the negative cache altogether */
int	sdev_nc_disable = 0;
/* non-zero keeps store-sourced entries on a reconfig boot */
int	sdev_nc_disable_reset = 0;
/* non-zero logs ncache activity via cmn_err */
int	sdev_nc_verbose = 0;
/* non-zero skips reading the backing store in sdev_ncache_setup */
int	sdev_cache_read_disable = 0;
/* non-zero suppresses updates of the backing store */
int	sdev_cache_write_disable = 0;

/* globals */
/* boot progress as tracked by sdev_devstate_change */
int	sdev_boot_state = SDEV_BOOT_STATE_INITIAL;
/* consulted by sdev_lookup_failed; never set by the code in this file */
int	sdev_reconfig_boot = 0;
/* the in-memory negative cache, created by sdev_ncache_init */
sdev_nc_list_t *sdev_ncache;
/* handle for the backing store, set by sdev_ncache_setup */
static nvf_handle_t sdevfd_handle;
126 
/* static prototypes */

/* backing store write path */
static void sdev_ncache_write_complete(nvf_handle_t);
static void sdev_ncache_write(void);
/* seeds the in-memory cache from data read from the backing store */
static void sdev_ncache_process_store(void);
/* in-memory list management */
static sdev_nc_list_t *sdev_nc_newlist(void);
static void sdev_nc_free_unlinked_node(sdev_nc_node_t *);
static sdev_nc_node_t *sdev_nc_findpath(sdev_nc_list_t *, char *);
static void sdev_nc_insertnode(sdev_nc_list_t *, sdev_nc_node_t *);
static void sdev_nc_free_bootonly(void);
/* callbacks registered through sdev_cache_ops, and their helper */
static int sdev_ncache_unpack_nvlist(nvf_handle_t, nvlist_t *, char *);
static int sdev_ncache_pack_list(nvf_handle_t, nvlist_t **);
static void sdev_ncache_list_free(nvf_handle_t);
static void sdev_nvp_free(nvp_devname_t *);
140 
/*
 * Registration for /etc/devices/devname_cache
 *
 * Callback table handed to nvf_register_file() by sdev_ncache_setup();
 * each entry's role is noted alongside it.
 */
static nvf_ops_t sdev_cache_ops = {
	"/etc/devices/devname_cache",		/* path to cache */
	sdev_ncache_unpack_nvlist,		/* read: unpack nvlist */
	sdev_ncache_pack_list,			/* write: pack list */
	sdev_ncache_list_free,			/* free data list */
	sdev_ncache_write_complete		/* write complete callback */
};
151 
/*
 * called once at filesystem initialization
 *
 * Creates the empty in-memory negative cache list and publishes it
 * through the global sdev_ncache.
 */
void
sdev_ncache_init(void)
{
	sdev_ncache = sdev_nc_newlist();
}
160 
161 /*
162  * called at mount of the global instance
163  * currently the global instance is never unmounted
164  */
165 void
166 sdev_ncache_setup(void)
167 {
168 	sdevfd_handle = nvf_register_file(&sdev_cache_ops);
169 	ASSERT(sdevfd_handle);
170 
171 	list_create(nvf_list(sdevfd_handle), sizeof (nvp_devname_t),
172 	    offsetof(nvp_devname_t, nvp_link));
173 
174 	rw_enter(nvf_lock(sdevfd_handle), RW_WRITER);
175 	if (!sdev_cache_read_disable) {
176 		(void) nvf_read_file(sdevfd_handle);
177 	}
178 	sdev_ncache_process_store();
179 	rw_exit(nvf_lock(sdevfd_handle));
180 
181 	sdev_devstate_change();
182 }
183 
184 static void
185 sdev_nvp_free(nvp_devname_t *dp)
186 {
187 	int	i;
188 	char	**p;
189 
190 	if (dp->nvp_npaths > 0) {
191 		p = dp->nvp_paths;
192 		for (i = 0; i < dp->nvp_npaths; i++, p++) {
193 			kmem_free(*p, strlen(*p)+1);
194 		}
195 		kmem_free(dp->nvp_paths,
196 		    dp->nvp_npaths * sizeof (char *));
197 		kmem_free(dp->nvp_expirecnts,
198 		    dp->nvp_npaths * sizeof (int));
199 	}
200 
201 	kmem_free(dp, sizeof (nvp_devname_t));
202 }
203 
204 static void
205 sdev_ncache_list_free(nvf_handle_t fd)
206 {
207 	list_t		*listp;
208 	nvp_devname_t	*dp;
209 
210 	ASSERT(fd == sdevfd_handle);
211 	ASSERT(RW_WRITE_HELD(nvf_lock(fd)));
212 
213 	listp = nvf_list(fd);
214 	if ((dp = list_head(listp)) != NULL) {
215 		list_remove(listp, dp);
216 		sdev_nvp_free(dp);
217 	}
218 }
219 
220 /*
221  * Unpack a device path/nvlist pair to internal data list format.
222  * Used to decode the nvlist format into the internal representation
223  * when reading /etc/devices/devname_cache.
224  * Note that the expiration counts are optional, for compatibility
225  * with earlier instances of the cache.  If not present, the
226  * expire counts are initialized to defaults.
227  */
228 static int
229 sdev_ncache_unpack_nvlist(nvf_handle_t fd, nvlist_t *nvl, char *name)
230 {
231 	nvp_devname_t *np;
232 	char	**strs;
233 	int	*cnts;
234 	uint_t	nstrs, ncnts;
235 	int	rval, i;
236 
237 	ASSERT(fd == sdevfd_handle);
238 	ASSERT(RW_WRITE_HELD(nvf_lock(fd)));
239 
240 	/* name of the sublist must match what we created */
241 	if (strcmp(name, DP_DEVNAME_ID) != 0) {
242 		return (-1);
243 	}
244 
245 	np = kmem_zalloc(sizeof (nvp_devname_t), KM_SLEEP);
246 
247 	rval = nvlist_lookup_string_array(nvl,
248 	    DP_DEVNAME_NCACHE_ID, &strs, &nstrs);
249 	if (rval) {
250 		kmem_free(np, sizeof (nvp_devname_t));
251 		return (-1);
252 	}
253 
254 	np->nvp_npaths = nstrs;
255 	np->nvp_paths = kmem_zalloc(nstrs * sizeof (char *), KM_SLEEP);
256 	for (i = 0; i < nstrs; i++) {
257 		np->nvp_paths[i] = i_ddi_strdup(strs[i], KM_SLEEP);
258 	}
259 	np->nvp_expirecnts = kmem_zalloc(nstrs * sizeof (int), KM_SLEEP);
260 	for (i = 0; i < nstrs; i++) {
261 		np->nvp_expirecnts[i] = sdev_nc_expirecnt;
262 	}
263 
264 	rval = nvlist_lookup_int32_array(nvl,
265 	    DP_DEVNAME_NC_EXPIRECNT_ID, &cnts, &ncnts);
266 	if (rval == 0) {
267 		ASSERT(ncnts == nstrs);
268 		ncnts = min(ncnts, nstrs);
269 		for (i = 0; i < nstrs; i++) {
270 			np->nvp_expirecnts[i] = cnts[i];
271 		}
272 	}
273 
274 	list_insert_tail(nvf_list(sdevfd_handle), np);
275 
276 	return (0);
277 }
278 
279 /*
280  * Pack internal format cache data to a single nvlist.
281  * Used when writing the nvlist file.
282  * Note this is called indirectly by the nvpflush daemon.
283  */
284 static int
285 sdev_ncache_pack_list(nvf_handle_t fd, nvlist_t **ret_nvl)
286 {
287 	nvlist_t	*nvl, *sub_nvl;
288 	nvp_devname_t	*np;
289 	int		rval;
290 	list_t		*listp;
291 
292 	ASSERT(fd == sdevfd_handle);
293 	ASSERT(RW_WRITE_HELD(nvf_lock(fd)));
294 
295 	rval = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
296 	if (rval != 0) {
297 		nvf_error("%s: nvlist alloc error %d\n",
298 		    nvf_cache_name(fd), rval);
299 		return (DDI_FAILURE);
300 	}
301 
302 	listp = nvf_list(sdevfd_handle);
303 	if ((np = list_head(listp)) != NULL) {
304 		ASSERT(list_next(listp, np) == NULL);
305 
306 		rval = nvlist_alloc(&sub_nvl, NV_UNIQUE_NAME, KM_SLEEP);
307 		if (rval != 0) {
308 			nvf_error("%s: nvlist alloc error %d\n",
309 			    nvf_cache_name(fd), rval);
310 			sub_nvl = NULL;
311 			goto err;
312 		}
313 
314 		rval = nvlist_add_string_array(sub_nvl,
315 		    DP_DEVNAME_NCACHE_ID, np->nvp_paths, np->nvp_npaths);
316 		if (rval != 0) {
317 			nvf_error("%s: nvlist add error %d (sdev)\n",
318 			    nvf_cache_name(fd), rval);
319 			goto err;
320 		}
321 
322 		rval = nvlist_add_int32_array(sub_nvl,
323 		    DP_DEVNAME_NC_EXPIRECNT_ID,
324 		    np->nvp_expirecnts, np->nvp_npaths);
325 		if (rval != 0) {
326 			nvf_error("%s: nvlist add error %d (sdev)\n",
327 			    nvf_cache_name(fd), rval);
328 			goto err;
329 		}
330 
331 		rval = nvlist_add_nvlist(nvl, DP_DEVNAME_ID, sub_nvl);
332 		if (rval != 0) {
333 			nvf_error("%s: nvlist add error %d (sublist)\n",
334 			    nvf_cache_name(fd), rval);
335 			goto err;
336 		}
337 		nvlist_free(sub_nvl);
338 	}
339 
340 	*ret_nvl = nvl;
341 	return (DDI_SUCCESS);
342 
343 err:
344 	if (sub_nvl)
345 		nvlist_free(sub_nvl);
346 	nvlist_free(nvl);
347 	*ret_nvl = NULL;
348 	return (DDI_FAILURE);
349 }
350 
351 /*
352  * Run through the data read from the backing cache store
353  * to establish the initial state of the neg. cache.
354  */
355 static void
356 sdev_ncache_process_store(void)
357 {
358 	sdev_nc_list_t	*ncl = sdev_ncache;
359 	nvp_devname_t	*np;
360 	sdev_nc_node_t	*lp;
361 	char		*path;
362 	int		i, n;
363 	list_t		*listp;
364 
365 	if (sdev_nc_disable)
366 		return;
367 
368 	ASSERT(RW_WRITE_HELD(nvf_lock(sdevfd_handle)));
369 
370 	listp = nvf_list(sdevfd_handle);
371 	for (np = list_head(listp); np; np = list_next(listp, np)) {
372 		for (i = 0; i < np->nvp_npaths; i++) {
373 			sdcmn_err5(("    %s %d\n",
374 			    np->nvp_paths[i], np->nvp_expirecnts[i]));
375 			if (ncl->ncl_nentries < sdev_nc_max_entries) {
376 				path = np->nvp_paths[i];
377 				n = strlen(path) + 1;
378 				lp = kmem_alloc(sizeof (sdev_nc_node_t),
379 				    KM_SLEEP);
380 				lp->ncn_name = kmem_alloc(n, KM_SLEEP);
381 				bcopy(path, lp->ncn_name, n);
382 				lp->ncn_flags = NCN_SRC_STORE;
383 				lp->ncn_expirecnt = np->nvp_expirecnts[i];
384 				sdev_nc_insertnode(ncl, lp);
385 			} else if (sdev_nc_verbose) {
386 				cmn_err(CE_CONT,
387 				    "?%s: truncating from ncache (max %d)\n",
388 				    np->nvp_paths[i], sdev_nc_max_entries);
389 			}
390 		}
391 	}
392 }
393 
/*
 * called by nvpflush daemon to inform us that an update of
 * the cache file has been completed.
 *
 * If the cache was dirtied again while the write was in progress, the
 * dirty flag is consumed and another write is started immediately,
 * leaving NCL_LIST_WRITING set.  Otherwise NCL_LIST_WRITING is cleared
 * and the data list staged for the write is freed.
 */
static void
sdev_ncache_write_complete(nvf_handle_t fd)
{
	sdev_nc_list_t	*ncl = sdev_ncache;

	ASSERT(fd == sdevfd_handle);

	mutex_enter(&ncl->ncl_mutex);

	ASSERT(ncl->ncl_flags & NCL_LIST_WRITING);

	if (ncl->ncl_flags & NCL_LIST_DIRTY) {
		sdcmn_err5(("ncache write complete but dirty again\n"));
		ncl->ncl_flags &= ~NCL_LIST_DIRTY;
		/* drop the mutex before sdev_ncache_write() takes its locks */
		mutex_exit(&ncl->ncl_mutex);
		sdev_ncache_write();
	} else {
		sdcmn_err5(("ncache write complete\n"));
		ncl->ncl_flags &= ~NCL_LIST_WRITING;
		mutex_exit(&ncl->ncl_mutex);
		/* the staged copy is no longer needed once written */
		rw_enter(nvf_lock(fd), RW_WRITER);
		sdev_ncache_list_free(fd);
		rw_exit(nvf_lock(fd));
	}
}
423 
/*
 * Prepare to perform an update of the neg. cache backing store.
 *
 * Builds a snapshot of the in-memory cache (names and expire counts)
 * as a single nvp_devname_t record, replaces the backing store's data
 * list with it, marks the store dirty and wakes the flush daemon.
 * When sdev_cache_write_disable is set, nothing is written and
 * NCL_LIST_WRITING is simply cleared.
 */
static void
sdev_ncache_write(void)
{
	sdev_nc_list_t	*ncl = sdev_ncache;
	nvp_devname_t	*np;
	sdev_nc_node_t	*lp;
	int		n, i;

	if (sdev_cache_write_disable) {
		mutex_enter(&ncl->ncl_mutex);
		ncl->ncl_flags &= ~NCL_LIST_WRITING;
		mutex_exit(&ncl->ncl_mutex);
		return;
	}

	/* proper lock ordering here is essential */
	/* store lock (writer) is taken before the cache lock (reader) */
	rw_enter(nvf_lock(sdevfd_handle), RW_WRITER);
	sdev_ncache_list_free(sdevfd_handle);

	rw_enter(&ncl->ncl_lock, RW_READER);
	n = ncl->ncl_nentries;
	ASSERT(n <= sdev_nc_max_entries);

	/* arrays are sized from the entry count sampled under the lock */
	np = kmem_zalloc(sizeof (nvp_devname_t), KM_SLEEP);
	np->nvp_npaths = n;
	np->nvp_paths = kmem_zalloc(n * sizeof (char *), KM_SLEEP);
	np->nvp_expirecnts = kmem_zalloc(n * sizeof (int), KM_SLEEP);

	i = 0;
	for (lp = list_head(&ncl->ncl_list); lp;
	    lp = list_next(&ncl->ncl_list, lp)) {
		np->nvp_paths[i] = i_ddi_strdup(lp->ncn_name, KM_SLEEP);
		np->nvp_expirecnts[i] = lp->ncn_expirecnt;
		sdcmn_err5(("    %s %d\n",
		    np->nvp_paths[i], np->nvp_expirecnts[i]));
		i++;
	}

	rw_exit(&ncl->ncl_lock);

	nvf_mark_dirty(sdevfd_handle);
	list_insert_tail(nvf_list(sdevfd_handle), np);
	rw_exit(nvf_lock(sdevfd_handle));

	nvf_wake_daemon();
}
473 
474 static void
475 sdev_nc_flush_updates(void)
476 {
477 	sdev_nc_list_t *ncl = sdev_ncache;
478 
479 	if (sdev_nc_disable || sdev_cache_write_disable)
480 		return;
481 
482 	mutex_enter(&ncl->ncl_mutex);
483 	if (((ncl->ncl_flags &
484 	    (NCL_LIST_DIRTY | NCL_LIST_WENABLE | NCL_LIST_WRITING)) ==
485 	    (NCL_LIST_DIRTY | NCL_LIST_WENABLE))) {
486 		ncl->ncl_flags &= ~NCL_LIST_DIRTY;
487 		ncl->ncl_flags |= NCL_LIST_WRITING;
488 		mutex_exit(&ncl->ncl_mutex);
489 		sdev_ncache_write();
490 	} else {
491 		mutex_exit(&ncl->ncl_mutex);
492 	}
493 }
494 
495 static void
496 sdev_nc_flush_boot_update(void)
497 {
498 	sdev_nc_list_t *ncl = sdev_ncache;
499 
500 	if (sdev_nc_disable || sdev_cache_write_disable ||
501 	    (sdev_boot_state == SDEV_BOOT_STATE_INITIAL)) {
502 		return;
503 	}
504 	mutex_enter(&ncl->ncl_mutex);
505 	if (ncl->ncl_flags & NCL_LIST_WENABLE) {
506 		mutex_exit(&ncl->ncl_mutex);
507 		sdev_nc_flush_updates();
508 	} else {
509 		mutex_exit(&ncl->ncl_mutex);
510 	}
511 
512 }
513 
514 static void
515 sdev_state_boot_complete()
516 {
517 	sdev_nc_list_t	*ncl = sdev_ncache;
518 	sdev_nc_node_t	*lp, *next;
519 
520 	/*
521 	 * Once boot is complete, decrement the expire count of each entry
522 	 * in the cache not touched by a reference.  Remove any that
523 	 * goes to zero.  This effectively removes random entries over
524 	 * time.
525 	 */
526 	rw_enter(&ncl->ncl_lock, RW_WRITER);
527 	mutex_enter(&ncl->ncl_mutex);
528 
529 	for (lp = list_head(&ncl->ncl_list); lp; lp = next) {
530 		next = list_next(&ncl->ncl_list, lp);
531 		if (sdev_nc_expirecnt > 0 && lp->ncn_expirecnt > 0) {
532 			if (lp->ncn_flags & NCN_ACTIVE) {
533 				if (lp->ncn_expirecnt != sdev_nc_expirecnt) {
534 					lp->ncn_expirecnt = sdev_nc_expirecnt;
535 					ncl->ncl_flags |= NCL_LIST_DIRTY;
536 				}
537 			} else {
538 				if (--lp->ncn_expirecnt == 0) {
539 					list_remove(&ncl->ncl_list, lp);
540 					sdev_nc_free_unlinked_node(lp);
541 					ncl->ncl_nentries--;
542 				}
543 				ncl->ncl_flags |= NCL_LIST_DIRTY;
544 			}
545 		}
546 	}
547 
548 	mutex_exit(&ncl->ncl_mutex);
549 	rw_exit(&ncl->ncl_lock);
550 
551 	sdev_nc_flush_boot_update();
552 	sdev_boot_state = SDEV_BOOT_STATE_COMPLETE;
553 }
554 
555 /*
556  * Upon transition to the login state on a reconfigure boot,
557  * a debounce timer is set up so that we cache all the nonsense
558  * lookups we're hit with by the windowing system startup.
559  */
560 
/*
 * timeout(9F) handler armed by sdev_state_sysavail(); the argument is
 * unused.  Expiry of the debounce timer marks boot as complete.
 */
/*ARGSUSED*/
static void
sdev_state_timeout(void *arg)
{
	sdev_state_boot_complete();
}
567 
568 static void
569 sdev_state_sysavail()
570 {
571 	sdev_nc_list_t *ncl = sdev_ncache;
572 	clock_t	nticks;
573 	int nsecs;
574 
575 	mutex_enter(&ncl->ncl_mutex);
576 	ncl->ncl_flags |= NCL_LIST_WENABLE;
577 	mutex_exit(&ncl->ncl_mutex);
578 
579 	nsecs = sdev_reconfig_delay;
580 	if (nsecs == 0) {
581 		sdev_state_boot_complete();
582 	} else {
583 		nticks = drv_usectohz(1000000 * nsecs);
584 		sdcmn_err5(("timeout initiated %ld\n", nticks));
585 		(void) timeout(sdev_state_timeout, NULL, nticks);
586 		sdev_nc_flush_boot_update();
587 	}
588 }
589 
590 /*
591  * Called to inform the filesystem of progress during boot,
592  * either a notice of reconfiguration boot or an indication of
593  * system boot complete.  At system boot complete, set up a
594  * timer at the expiration of which no further failed lookups
595  * will be added to the negative cache.
596  *
597  * The dev filesystem infers from reconfig boot that implicit
598  * reconfig need not be invoked at all as all available devices
599  * will have already been named.
600  *
601  * The dev filesystem infers from "system available" that devfsadmd
602  * can now be run and hence implicit reconfiguration may be initiated.
603  * During early stages of system startup, implicit reconfig is
604  * not done to avoid impacting boot performance.
605  */
606 void
607 sdev_devstate_change(void)
608 {
609 	int new_state;
610 
611 	/*
612 	 * Track system state and manage interesting transitions
613 	 */
614 	new_state = SDEV_BOOT_STATE_INITIAL;
615 	if (i_ddi_reconfig())
616 		new_state = SDEV_BOOT_STATE_RECONFIG;
617 	if (i_ddi_sysavail())
618 		new_state = SDEV_BOOT_STATE_SYSAVAIL;
619 
620 	if (sdev_boot_state < new_state) {
621 		switch (new_state) {
622 		case SDEV_BOOT_STATE_RECONFIG:
623 			sdcmn_err5(("state change: reconfigure boot\n"));
624 			sdev_boot_state = new_state;
625 			/*
626 			 * The /dev filesystem fills a hot-plug .vs.
627 			 * public-namespace gap by invoking 'devfsadm' once
628 			 * as a result of the first /dev lookup failure
629 			 * (or getdents/readdir). Originally, it was thought
630 			 * that a reconfig reboot did not have a hot-plug gap,
631 			 * but this is not true - the gap is just smaller:
632 			 * it exists from the the time the smf invocation of
633 			 * devfsadm completes its forced devinfo snapshot,
634 			 * to the time when the smf devfsadmd daemon invocation
635 			 * is set up and listening for hotplug sysevents.
636 			 * Since there is still a gap with reconfig reboot,
637 			 * we no longer set 'sdev_reconfig_boot'.
638 			 */
639 			if (!sdev_nc_disable_reset)
640 				sdev_nc_free_bootonly();
641 			break;
642 		case SDEV_BOOT_STATE_SYSAVAIL:
643 			sdcmn_err5(("system available\n"));
644 			sdev_boot_state = new_state;
645 			sdev_state_sysavail();
646 			break;
647 		}
648 	}
649 }
650 
651 /*
652  * Lookup: filter out entries in the negative cache
653  * Return 1 if the lookup should not cause a reconfig.
654  */
655 int
656 sdev_lookup_filter(sdev_node_t *dv, char *nm)
657 {
658 	int n;
659 	sdev_nc_list_t *ncl = sdev_ncache;
660 	sdev_nc_node_t *lp;
661 	char *path;
662 	int rval = 0;
663 	int changed = 0;
664 
665 	ASSERT(i_ddi_io_initialized());
666 	ASSERT(SDEVTOV(dv)->v_type == VDIR);
667 
668 	if (sdev_nc_disable)
669 		return (0);
670 
671 	n = strlen(dv->sdev_path) + strlen(nm) + 2;
672 	path = kmem_alloc(n, KM_SLEEP);
673 	(void) sprintf(path, "%s/%s", dv->sdev_path, nm);
674 
675 	rw_enter(&ncl->ncl_lock, RW_READER);
676 	if ((lp = sdev_nc_findpath(ncl, path)) != NULL) {
677 		sdcmn_err5(("%s/%s: lookup by %s cached, no reconfig\n",
678 		    dv->sdev_name, nm, curproc->p_user.u_comm));
679 		if (sdev_nc_verbose) {
680 			cmn_err(CE_CONT,
681 			    "?%s/%s: lookup by %s cached, no reconfig\n",
682 			    dv->sdev_name, nm, curproc->p_user.u_comm);
683 		}
684 		mutex_enter(&ncl->ncl_mutex);
685 		lp->ncn_flags |= NCN_ACTIVE;
686 		if (sdev_nc_expirecnt > 0 && lp->ncn_expirecnt > 0 &&
687 		    lp->ncn_expirecnt < sdev_nc_expirecnt) {
688 			lp->ncn_expirecnt = sdev_nc_expirecnt;
689 			ncl->ncl_flags |= NCL_LIST_DIRTY;
690 			changed = 1;
691 		}
692 		mutex_exit(&ncl->ncl_mutex);
693 		rval = 1;
694 	}
695 	rw_exit(&ncl->ncl_lock);
696 	kmem_free(path, n);
697 	if (changed)
698 		sdev_nc_flush_boot_update();
699 	return (rval);
700 }
701 
702 void
703 sdev_lookup_failed(sdev_node_t *dv, char *nm, int failed_flags)
704 {
705 	if (sdev_nc_disable)
706 		return;
707 
708 	/*
709 	 * If we're still in the initial boot stage, always update
710 	 * the cache - we may not have received notice of the
711 	 * reconfig boot state yet.  On a reconfigure boot, entries
712 	 * from the backing store are not re-persisted on update,
713 	 * but new entries are marked as needing an update.
714 	 * Never cache dynamic or non-global nodes.
715 	 */
716 	if (SDEV_IS_GLOBAL(dv) && !SDEV_IS_DYNAMIC(dv) &&
717 	    !SDEV_IS_NO_NCACHE(dv) &&
718 	    ((failed_flags & SLF_NO_NCACHE) == 0) &&
719 	    ((sdev_reconfig_boot &&
720 	    (sdev_boot_state != SDEV_BOOT_STATE_COMPLETE)) ||
721 	    (!sdev_reconfig_boot && ((failed_flags & SLF_REBUILT))))) {
722 			sdev_nc_addname(sdev_ncache,
723 			    dv, nm, NCN_SRC_CURRENT|NCN_ACTIVE);
724 	}
725 }
726 
727 static sdev_nc_list_t *
728 sdev_nc_newlist(void)
729 {
730 	sdev_nc_list_t	*ncl;
731 
732 	ncl = kmem_zalloc(sizeof (sdev_nc_list_t), KM_SLEEP);
733 
734 	rw_init(&ncl->ncl_lock, NULL, RW_DEFAULT, NULL);
735 	mutex_init(&ncl->ncl_mutex, NULL, MUTEX_DEFAULT, NULL);
736 	list_create(&ncl->ncl_list, sizeof (sdev_nc_node_t),
737 	    offsetof(sdev_nc_node_t, ncn_link));
738 
739 	return (ncl);
740 }
741 
742 static void
743 sdev_nc_free_unlinked_node(sdev_nc_node_t *lp)
744 {
745 	kmem_free(lp->ncn_name, strlen(lp->ncn_name) + 1);
746 	kmem_free(lp, sizeof (sdev_nc_node_t));
747 }
748 
749 static sdev_nc_node_t *
750 sdev_nc_findpath(sdev_nc_list_t *ncl, char *path)
751 {
752 	sdev_nc_node_t *lp;
753 
754 	ASSERT(RW_LOCK_HELD(&ncl->ncl_lock));
755 
756 	for (lp = list_head(&ncl->ncl_list); lp;
757 	    lp = list_next(&ncl->ncl_list, lp)) {
758 		if (strcmp(path, lp->ncn_name) == 0)
759 			return (lp);
760 	}
761 
762 	return (NULL);
763 }
764 
765 static void
766 sdev_nc_insertnode(sdev_nc_list_t *ncl, sdev_nc_node_t *new)
767 {
768 	sdev_nc_node_t *lp;
769 
770 	rw_enter(&ncl->ncl_lock, RW_WRITER);
771 
772 	lp = sdev_nc_findpath(ncl, new->ncn_name);
773 	if (lp == NULL) {
774 		if (ncl->ncl_nentries == sdev_nc_max_entries) {
775 			sdcmn_err5((
776 			    "%s by %s: not adding to ncache (max %d)\n",
777 			    new->ncn_name, curproc->p_user.u_comm,
778 			    ncl->ncl_nentries));
779 			if (sdev_nc_verbose) {
780 				cmn_err(CE_CONT, "?%s by %s: "
781 				    "not adding to ncache (max %d)\n",
782 				    new->ncn_name, curproc->p_user.u_comm,
783 				    ncl->ncl_nentries);
784 			}
785 			rw_exit(&ncl->ncl_lock);
786 			sdev_nc_free_unlinked_node(new);
787 		} else {
788 
789 			list_insert_tail(&ncl->ncl_list, new);
790 			ncl->ncl_nentries++;
791 
792 			/* don't mark list dirty for nodes from store */
793 			mutex_enter(&ncl->ncl_mutex);
794 			if ((new->ncn_flags & NCN_SRC_STORE) == 0) {
795 				sdcmn_err5(("%s by %s: add to ncache\n",
796 				    new->ncn_name, curproc->p_user.u_comm));
797 				if (sdev_nc_verbose) {
798 					cmn_err(CE_CONT,
799 					    "?%s by %s: add to ncache\n",
800 					    new->ncn_name,
801 					    curproc->p_user.u_comm);
802 				}
803 				ncl->ncl_flags |= NCL_LIST_DIRTY;
804 			}
805 			mutex_exit(&ncl->ncl_mutex);
806 			rw_exit(&ncl->ncl_lock);
807 			lp = new;
808 			sdev_nc_flush_boot_update();
809 		}
810 	} else {
811 		mutex_enter(&ncl->ncl_mutex);
812 		lp->ncn_flags |= new->ncn_flags;
813 		mutex_exit(&ncl->ncl_mutex);
814 		rw_exit(&ncl->ncl_lock);
815 		sdev_nc_free_unlinked_node(new);
816 	}
817 }
818 
819 void
820 sdev_nc_addname(sdev_nc_list_t *ncl, sdev_node_t *dv, char *nm, int flags)
821 {
822 	int n;
823 	sdev_nc_node_t *lp;
824 
825 	ASSERT(SDEVTOV(dv)->v_type == VDIR);
826 
827 	lp = kmem_zalloc(sizeof (sdev_nc_node_t), KM_SLEEP);
828 
829 	n = strlen(dv->sdev_path) + strlen(nm) + 2;
830 	lp->ncn_name = kmem_alloc(n, KM_SLEEP);
831 	(void) sprintf(lp->ncn_name, "%s/%s",
832 	    dv->sdev_path, nm);
833 	lp->ncn_flags = flags;
834 	lp->ncn_expirecnt = sdev_nc_expirecnt;
835 	sdev_nc_insertnode(ncl, lp);
836 }
837 
838 void
839 sdev_nc_node_exists(sdev_node_t *dv)
840 {
841 	/* dynamic and non-global nodes are never cached */
842 	if (SDEV_IS_GLOBAL(dv) && !SDEV_IS_DYNAMIC(dv) &&
843 	    !SDEV_IS_NO_NCACHE(dv)) {
844 		sdev_nc_path_exists(sdev_ncache, dv->sdev_path);
845 	}
846 }
847 
848 void
849 sdev_nc_path_exists(sdev_nc_list_t *ncl, char *path)
850 {
851 	sdev_nc_node_t *lp;
852 
853 	if (sdev_nc_disable)
854 		return;
855 
856 	rw_enter(&ncl->ncl_lock, RW_READER);
857 	if ((lp = sdev_nc_findpath(ncl, path)) == NULL) {
858 		rw_exit(&ncl->ncl_lock);
859 		return;
860 	}
861 	if (rw_tryupgrade(&ncl->ncl_lock) == 0) {
862 		rw_exit(&ncl->ncl_lock);
863 		rw_enter(&ncl->ncl_lock, RW_WRITER);
864 		lp = sdev_nc_findpath(ncl, path);
865 	}
866 	if (lp) {
867 		list_remove(&ncl->ncl_list, lp);
868 		ncl->ncl_nentries--;
869 		mutex_enter(&ncl->ncl_mutex);
870 		ncl->ncl_flags |= NCL_LIST_DIRTY;
871 		if (ncl->ncl_flags & NCL_LIST_WENABLE) {
872 			mutex_exit(&ncl->ncl_mutex);
873 			rw_exit(&ncl->ncl_lock);
874 			sdev_nc_flush_updates();
875 		} else {
876 			mutex_exit(&ncl->ncl_mutex);
877 			rw_exit(&ncl->ncl_lock);
878 		}
879 		sdev_nc_free_unlinked_node(lp);
880 		sdcmn_err5(("%s by %s: removed from ncache\n",
881 		    path, curproc->p_user.u_comm));
882 		if (sdev_nc_verbose) {
883 			cmn_err(CE_CONT, "?%s by %s: removed from ncache\n",
884 			    path, curproc->p_user.u_comm);
885 		}
886 	} else
887 		rw_exit(&ncl->ncl_lock);
888 }
889 
890 static void
891 sdev_nc_free_bootonly(void)
892 {
893 	sdev_nc_list_t	*ncl = sdev_ncache;
894 	sdev_nc_node_t *lp;
895 	sdev_nc_node_t *next;
896 
897 	rw_enter(&ncl->ncl_lock, RW_WRITER);
898 
899 	for (lp = list_head(&ncl->ncl_list); lp; lp = next) {
900 		next = list_next(&ncl->ncl_list, lp);
901 		if ((lp->ncn_flags & NCN_SRC_CURRENT) == 0) {
902 			sdcmn_err5(("freeing %s\n", lp->ncn_name));
903 			mutex_enter(&ncl->ncl_mutex);
904 			ncl->ncl_flags |= NCL_LIST_DIRTY;
905 			mutex_exit(&ncl->ncl_mutex);
906 			list_remove(&ncl->ncl_list, lp);
907 			sdev_nc_free_unlinked_node(lp);
908 			ncl->ncl_nentries--;
909 		}
910 	}
911 
912 	rw_exit(&ncl->ncl_lock);
913 }
914