xref: /titanic_51/usr/src/uts/common/os/sunmdi.c (revision fbbfbc6ee66f60ad88ebd18c6c030797335354f4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more
28  * detailed discussion of the overall mpxio architecture.
29  *
30  * Default locking order:
31  *
32  * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_phci_mutex))
33  * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_client_mutex))
34  * _NOTE(LOCK_ORDER(mdi_vhci::vh_phci_mutex mdi_phci::ph_mutex))
35  * _NOTE(LOCK_ORDER(mdi_vhci::vh_client_mutex mdi_client::ct_mutex))
36  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
37  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
38  * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
39  */
40 
41 #include <sys/note.h>
42 #include <sys/types.h>
43 #include <sys/varargs.h>
44 #include <sys/param.h>
45 #include <sys/errno.h>
46 #include <sys/uio.h>
47 #include <sys/buf.h>
48 #include <sys/modctl.h>
49 #include <sys/open.h>
50 #include <sys/kmem.h>
51 #include <sys/poll.h>
52 #include <sys/conf.h>
53 #include <sys/bootconf.h>
54 #include <sys/cmn_err.h>
55 #include <sys/stat.h>
56 #include <sys/ddi.h>
57 #include <sys/sunddi.h>
58 #include <sys/ddipropdefs.h>
59 #include <sys/sunndi.h>
60 #include <sys/ndi_impldefs.h>
61 #include <sys/promif.h>
62 #include <sys/sunmdi.h>
63 #include <sys/mdi_impldefs.h>
64 #include <sys/taskq.h>
65 #include <sys/epm.h>
66 #include <sys/sunpm.h>
67 #include <sys/modhash.h>
68 #include <sys/disp.h>
69 #include <sys/autoconf.h>
70 #include <sys/sysmacros.h>
71 
#ifdef	DEBUG
#include <sys/debug.h>
/*
 * Debug logging controls (DEBUG kernels only).  A message is passed to
 * i_mdi_log() when mdi_debug is greater than or equal to the level given
 * to MDI_DEBUG().
 */
int	mdi_debug = 1;
int	mdi_debug_logonly = 0;
/*
 * MDI_DEBUG(level, (MDI_WARN | MDI_NOTE | MDI_CONT, dip, fmt, ...));
 *
 * 'pargs' is a complete, parenthesized i_mdi_log() argument list.  The
 * expansion is wrapped in do { } while (0) so that it always forms exactly
 * one statement: a bare 'if' here would silently capture the 'else' of a
 * caller written as "if (x) MDI_DEBUG(...); else ...".
 */
#define	MDI_DEBUG(dbglevel, pargs)					\
	do {								\
		if (mdi_debug >= (dbglevel))				\
			i_mdi_log pargs;				\
	_NOTE(CONSTCOND) } while (0)
/* Each of these supplies the first two i_mdi_log() arguments. */
#define	MDI_WARN	CE_WARN, __func__
#define	MDI_NOTE	CE_NOTE, __func__
#define	MDI_CONT	CE_CONT, __func__
static void i_mdi_log(int, const char *, dev_info_t *, const char *, ...);
#else	/* !DEBUG */
/* Non-DEBUG kernels compile every MDI_DEBUG() call away. */
#define	MDI_DEBUG(dbglevel, pargs)
#endif	/* DEBUG */
int	mdi_debug_consoleonly = 0;
85 
86 extern pri_t	minclsyspri;
87 extern int	modrootloaded;
88 
89 /*
90  * Global mutex:
91  * Protects vHCI list and structure members.
92  */
93 kmutex_t	mdi_mutex;
94 
95 /*
96  * Registered vHCI class driver lists
97  */
98 int		mdi_vhci_count;
99 mdi_vhci_t	*mdi_vhci_head;
100 mdi_vhci_t	*mdi_vhci_tail;
101 
102 /*
103  * Client Hash Table size
104  */
105 static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;
106 
107 /*
108  * taskq interface definitions
109  */
110 #define	MDI_TASKQ_N_THREADS	8
111 #define	MDI_TASKQ_PRI		minclsyspri
112 #define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
113 #define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)
114 
115 taskq_t				*mdi_taskq;
116 static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;
117 
118 #define	TICKS_PER_SECOND	(drv_usectohz(1000000))
119 
/*
 * Number of seconds the vhci cached data must remain unchanged ("quiet")
 * before it is flushed to the disk.
 */
static int mdi_vhcache_flush_delay = 10;

/* seconds the vhcache flush daemon may sit idle before it exits */
static int mdi_vhcache_flush_daemon_idle_time = 60;

/*
 * When a bus_config_one fails, MDI falls back to discovering all paths.
 * That fallback is tuned with the following variables:
 *
 * mdi_path_discovery_boot
 *	How many times path discovery is attempted during early boot.
 *	There is probably never a reason to raise this above one.
 *
 * mdi_path_discovery_postboot
 *	How many times path discovery is attempted after early boot.
 *	Keep it at two or more so that iscsi paths, which may show up
 *	very late during booting, can still be discovered.
 *
 * mdi_path_discovery_interval
 *	Minimum number of seconds between two successive discoveries of
 *	all paths.  A value of -1 disables discovery of all paths.
 */
static int mdi_path_discovery_boot = 1;
static int mdi_path_discovery_postboot = 2;
static int mdi_path_discovery_interval = 10;

/*
 * seconds the asynchronous configuration thread may sit idle before
 * it exits.
 */
static int mdi_async_config_idle_time = 600;

static int mdi_bus_config_cache_hash_size = 256;

/* non-zero turns off multithreaded configuration for certain operations */
static int mdi_mtc_off = 0;
160 
161 /*
162  * The "path" to a pathinfo node is identical to the /devices path to a
163  * devinfo node had the device been enumerated under a pHCI instead of
164  * a vHCI.  This pathinfo "path" is associated with a 'path_instance'.
165  * This association persists across create/delete of the pathinfo nodes,
166  * but not across reboot.
167  */
168 static uint_t		mdi_pathmap_instance = 1;	/* 0 -> any path */
169 static int		mdi_pathmap_hash_size = 256;
170 static kmutex_t		mdi_pathmap_mutex;
171 static mod_hash_t	*mdi_pathmap_bypath;		/* "path"->instance */
172 static mod_hash_t	*mdi_pathmap_byinstance;	/* instance->"path" */
173 static mod_hash_t	*mdi_pathmap_sbyinstance;	/* inst->shortpath */
174 
/*
 * Name of the MDI component property and the values it can take
 */
const char		*mdi_component_prop = "mpxio-component";
const char		*mdi_component_prop_vhci = "vhci";
const char		*mdi_component_prop_phci = "phci";
const char		*mdi_component_prop_client = "client";

/*
 * Name of the property holding an MDI client's global unique identifier
 */
const char		*mdi_client_guid_prop = "client-guid";

/*
 * Name of the MDI client load balancing property and the values it
 * can take
 */
const char		*mdi_load_balance = "load-balance";
const char		*mdi_load_balance_none = "none";
const char		*mdi_load_balance_rr = "round-robin";
const char		*mdi_load_balance_lba = "logical-block";
195 
196 /*
197  * Obsolete vHCI class definition; to be removed after Leadville update
198  */
199 const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;
200 
/*
 * cmn_err() format used by mdi_vhci_register() when a second vHCI driver
 * tries to register for a class that already has one; the single %s is
 * the class name.  The text is never modified, hence const.
 */
static const char vhci_greeting[] =
	"\tThere already exists one vHCI driver for class %s\n"
	"\tOnly one vHCI driver for each class is allowed\n";
204 
205 /*
206  * Static function prototypes
207  */
208 static int		i_mdi_phci_offline(dev_info_t *, uint_t);
209 static int		i_mdi_client_offline(dev_info_t *, uint_t);
210 static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
211 static void		i_mdi_phci_post_detach(dev_info_t *,
212 			    ddi_detach_cmd_t, int);
213 static int		i_mdi_client_pre_detach(dev_info_t *,
214 			    ddi_detach_cmd_t);
215 static void		i_mdi_client_post_detach(dev_info_t *,
216 			    ddi_detach_cmd_t, int);
217 static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
218 static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
219 static int 		i_mdi_lba_lb(mdi_client_t *ct,
220 			    mdi_pathinfo_t **ret_pip, struct buf *buf);
221 static void		i_mdi_pm_hold_client(mdi_client_t *, int);
222 static void		i_mdi_pm_rele_client(mdi_client_t *, int);
223 static void		i_mdi_pm_reset_client(mdi_client_t *);
224 static int		i_mdi_power_all_phci(mdi_client_t *);
225 static void		i_mdi_log_sysevent(dev_info_t *, char *, char *);
226 
227 
228 /*
229  * Internal mdi_pathinfo node functions
230  */
231 static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);
232 
233 static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
234 static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
235 static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
236 static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
237 static void		i_mdi_phci_unlock(mdi_phci_t *);
238 static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
239 static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
240 static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
241 static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
242 			    mdi_client_t *);
243 static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
244 static void		i_mdi_client_remove_path(mdi_client_t *,
245 			    mdi_pathinfo_t *);
246 
247 static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
248 			    mdi_pathinfo_state_t, int);
249 static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
250 static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
251 			    char **, int);
252 static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
253 static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
254 static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
255 static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
256 static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
257 static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
258 static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
259 static void		i_mdi_client_update_state(mdi_client_t *);
260 static int		i_mdi_client_compute_state(mdi_client_t *,
261 			    mdi_phci_t *);
262 static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
263 static void		i_mdi_client_unlock(mdi_client_t *);
264 static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
265 static mdi_client_t	*i_devi_get_client(dev_info_t *);
266 /*
267  * NOTE: this will be removed once the NWS files are changed to use the new
268  * mdi_{enable,disable}_path interfaces
269  */
270 static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *,
271 				int, int);
272 static mdi_pathinfo_t 	*i_mdi_enable_disable_path(mdi_pathinfo_t *pip,
273 				mdi_vhci_t *vh, int flags, int op);
274 /*
275  * Failover related function prototypes
276  */
277 static int		i_mdi_failover(void *);
278 
279 /*
280  * misc internal functions
281  */
282 static int		i_mdi_get_hash_key(char *);
283 static int		i_map_nvlist_error_to_mdi(int);
284 static void		i_mdi_report_path_state(mdi_client_t *,
285 			    mdi_pathinfo_t *);
286 
287 static void		setup_vhci_cache(mdi_vhci_t *);
288 static int		destroy_vhci_cache(mdi_vhci_t *);
289 static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
290 static boolean_t	stop_vhcache_flush_thread(void *, int);
291 static void		free_string_array(char **, int);
292 static void		free_vhcache_phci(mdi_vhcache_phci_t *);
293 static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
294 static void		free_vhcache_client(mdi_vhcache_client_t *);
295 static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
296 static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
297 static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
298 static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
299 static void		vhcache_pi_add(mdi_vhci_config_t *,
300 			    struct mdi_pathinfo *);
301 static void		vhcache_pi_remove(mdi_vhci_config_t *,
302 			    struct mdi_pathinfo *);
303 static void		free_phclient_path_list(mdi_phys_path_t *);
304 static void		sort_vhcache_paths(mdi_vhcache_client_t *);
305 static int		flush_vhcache(mdi_vhci_config_t *, int);
306 static void		vhcache_dirty(mdi_vhci_config_t *);
307 static void		free_async_client_config(mdi_async_client_config_t *);
308 static void		single_threaded_vhconfig_enter(mdi_vhci_config_t *);
309 static void		single_threaded_vhconfig_exit(mdi_vhci_config_t *);
310 static nvlist_t		*read_on_disk_vhci_cache(char *);
311 extern int		fread_nvlist(char *, nvlist_t **);
312 extern int		fwrite_nvlist(char *, nvlist_t *);
313 
314 /* called once when first vhci registers with mdi */
315 static void
316 i_mdi_init()
317 {
318 	static int initialized = 0;
319 
320 	if (initialized)
321 		return;
322 	initialized = 1;
323 
324 	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);
325 
326 	/* Create our taskq resources */
327 	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
328 	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
329 	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
330 	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */
331 
332 	/* Allocate ['path_instance' <-> "path"] maps */
333 	mutex_init(&mdi_pathmap_mutex, NULL, MUTEX_DRIVER, NULL);
334 	mdi_pathmap_bypath = mod_hash_create_strhash(
335 	    "mdi_pathmap_bypath", mdi_pathmap_hash_size,
336 	    mod_hash_null_valdtor);
337 	mdi_pathmap_byinstance = mod_hash_create_idhash(
338 	    "mdi_pathmap_byinstance", mdi_pathmap_hash_size,
339 	    mod_hash_null_valdtor);
340 	mdi_pathmap_sbyinstance = mod_hash_create_idhash(
341 	    "mdi_pathmap_sbyinstance", mdi_pathmap_hash_size,
342 	    mod_hash_null_valdtor);
343 }
344 
345 /*
346  * mdi_get_component_type():
347  *		Return mpxio component type
348  * Return Values:
349  *		MDI_COMPONENT_NONE
350  *		MDI_COMPONENT_VHCI
351  *		MDI_COMPONENT_PHCI
352  *		MDI_COMPONENT_CLIENT
353  * XXX This doesn't work under multi-level MPxIO and should be
354  *	removed when clients migrate mdi_component_is_*() interfaces.
355  */
356 int
357 mdi_get_component_type(dev_info_t *dip)
358 {
359 	return (DEVI(dip)->devi_mdi_component);
360 }
361 
362 /*
363  * mdi_vhci_register():
364  *		Register a vHCI module with the mpxio framework
365  *		mdi_vhci_register() is called by vHCI drivers to register the
366  *		'class_driver' vHCI driver and its MDI entrypoints with the
367  *		mpxio framework.  The vHCI driver must call this interface as
368  *		part of its attach(9e) handler.
369  *		Competing threads may try to attach mdi_vhci_register() as
370  *		the vHCI drivers are loaded and attached as a result of pHCI
371  *		driver instance registration (mdi_phci_register()) with the
372  *		framework.
373  * Return Values:
374  *		MDI_SUCCESS
375  *		MDI_FAILURE
376  */
377 /*ARGSUSED*/
378 int
379 mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
380     int flags)
381 {
382 	mdi_vhci_t		*vh = NULL;
383 
384 	/* Registrant can't be older */
385 	ASSERT(vops->vo_revision <= MDI_VHCI_OPS_REV);
386 
387 #ifdef DEBUG
388 	/*
389 	 * IB nexus driver is loaded only when IB hardware is present.
390 	 * In order to be able to do this there is a need to drive the loading
391 	 * and attaching of the IB nexus driver (especially when an IB hardware
392 	 * is dynamically plugged in) when an IB HCA driver (PHCI)
393 	 * is being attached. Unfortunately this gets into the limitations
394 	 * of devfs as there seems to be no clean way to drive configuration
395 	 * of a subtree from another subtree of a devfs. Hence, do not ASSERT
396 	 * for IB.
397 	 */
398 	if (strcmp(class, MDI_HCI_CLASS_IB) != 0)
399 		ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
400 #endif
401 
402 	i_mdi_init();
403 
404 	mutex_enter(&mdi_mutex);
405 	/*
406 	 * Scan for already registered vhci
407 	 */
408 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
409 		if (strcmp(vh->vh_class, class) == 0) {
410 			/*
411 			 * vHCI has already been created.  Check for valid
412 			 * vHCI ops registration.  We only support one vHCI
413 			 * module per class
414 			 */
415 			if (vh->vh_ops != NULL) {
416 				mutex_exit(&mdi_mutex);
417 				cmn_err(CE_NOTE, vhci_greeting, class);
418 				return (MDI_FAILURE);
419 			}
420 			break;
421 		}
422 	}
423 
424 	/*
425 	 * if not yet created, create the vHCI component
426 	 */
427 	if (vh == NULL) {
428 		struct client_hash	*hash = NULL;
429 		char			*load_balance;
430 
431 		/*
432 		 * Allocate and initialize the mdi extensions
433 		 */
434 		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
435 		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
436 		    KM_SLEEP);
437 		vh->vh_client_table = hash;
438 		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
439 		(void) strcpy(vh->vh_class, class);
440 		vh->vh_lb = LOAD_BALANCE_RR;
441 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
442 		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
443 			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
444 				vh->vh_lb = LOAD_BALANCE_NONE;
445 			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
446 				    == 0) {
447 				vh->vh_lb = LOAD_BALANCE_LBA;
448 			}
449 			ddi_prop_free(load_balance);
450 		}
451 
452 		mutex_init(&vh->vh_phci_mutex, NULL, MUTEX_DEFAULT, NULL);
453 		mutex_init(&vh->vh_client_mutex, NULL, MUTEX_DEFAULT, NULL);
454 
455 		/*
456 		 * Store the vHCI ops vectors
457 		 */
458 		vh->vh_dip = vdip;
459 		vh->vh_ops = vops;
460 
461 		setup_vhci_cache(vh);
462 
463 		if (mdi_vhci_head == NULL) {
464 			mdi_vhci_head = vh;
465 		}
466 		if (mdi_vhci_tail) {
467 			mdi_vhci_tail->vh_next = vh;
468 		}
469 		mdi_vhci_tail = vh;
470 		mdi_vhci_count++;
471 	}
472 
473 	/*
474 	 * Claim the devfs node as a vhci component
475 	 */
476 	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;
477 
478 	/*
479 	 * Initialize our back reference from dev_info node
480 	 */
481 	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
482 	mutex_exit(&mdi_mutex);
483 	return (MDI_SUCCESS);
484 }
485 
486 /*
487  * mdi_vhci_unregister():
488  *		Unregister a vHCI module from mpxio framework
489  *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
490  * 		of a vhci to unregister it from the framework.
491  * Return Values:
492  *		MDI_SUCCESS
493  *		MDI_FAILURE
494  */
495 /*ARGSUSED*/
496 int
497 mdi_vhci_unregister(dev_info_t *vdip, int flags)
498 {
499 	mdi_vhci_t	*found, *vh, *prev = NULL;
500 
501 	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
502 
503 	/*
504 	 * Check for invalid VHCI
505 	 */
506 	if ((vh = i_devi_get_vhci(vdip)) == NULL)
507 		return (MDI_FAILURE);
508 
509 	/*
510 	 * Scan the list of registered vHCIs for a match
511 	 */
512 	mutex_enter(&mdi_mutex);
513 	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
514 		if (found == vh)
515 			break;
516 		prev = found;
517 	}
518 
519 	if (found == NULL) {
520 		mutex_exit(&mdi_mutex);
521 		return (MDI_FAILURE);
522 	}
523 
524 	/*
525 	 * Check the vHCI, pHCI and client count. All the pHCIs and clients
526 	 * should have been unregistered, before a vHCI can be
527 	 * unregistered.
528 	 */
529 	MDI_VHCI_PHCI_LOCK(vh);
530 	if (vh->vh_refcnt || vh->vh_phci_count || vh->vh_client_count) {
531 		MDI_VHCI_PHCI_UNLOCK(vh);
532 		mutex_exit(&mdi_mutex);
533 		return (MDI_FAILURE);
534 	}
535 	MDI_VHCI_PHCI_UNLOCK(vh);
536 
537 	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
538 		mutex_exit(&mdi_mutex);
539 		return (MDI_FAILURE);
540 	}
541 
542 	/*
543 	 * Remove the vHCI from the global list
544 	 */
545 	if (vh == mdi_vhci_head) {
546 		mdi_vhci_head = vh->vh_next;
547 	} else {
548 		prev->vh_next = vh->vh_next;
549 	}
550 	if (vh == mdi_vhci_tail) {
551 		mdi_vhci_tail = prev;
552 	}
553 	mdi_vhci_count--;
554 	mutex_exit(&mdi_mutex);
555 
556 	vh->vh_ops = NULL;
557 	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
558 	DEVI(vdip)->devi_mdi_xhci = NULL;
559 	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
560 	kmem_free(vh->vh_client_table,
561 	    mdi_client_table_size * sizeof (struct client_hash));
562 	mutex_destroy(&vh->vh_phci_mutex);
563 	mutex_destroy(&vh->vh_client_mutex);
564 
565 	kmem_free(vh, sizeof (mdi_vhci_t));
566 	return (MDI_SUCCESS);
567 }
568 
569 /*
570  * i_mdi_vhci_class2vhci():
571  *		Look for a matching vHCI module given a vHCI class name
572  * Return Values:
573  *		Handle to a vHCI component
574  *		NULL
575  */
576 static mdi_vhci_t *
577 i_mdi_vhci_class2vhci(char *class)
578 {
579 	mdi_vhci_t	*vh = NULL;
580 
581 	ASSERT(!MUTEX_HELD(&mdi_mutex));
582 
583 	mutex_enter(&mdi_mutex);
584 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
585 		if (strcmp(vh->vh_class, class) == 0) {
586 			break;
587 		}
588 	}
589 	mutex_exit(&mdi_mutex);
590 	return (vh);
591 }
592 
593 /*
594  * i_devi_get_vhci():
595  *		Utility function to get the handle to a vHCI component
596  * Return Values:
597  *		Handle to a vHCI component
598  *		NULL
599  */
600 mdi_vhci_t *
601 i_devi_get_vhci(dev_info_t *vdip)
602 {
603 	mdi_vhci_t	*vh = NULL;
604 	if (MDI_VHCI(vdip)) {
605 		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
606 	}
607 	return (vh);
608 }
609 
610 /*
611  * mdi_phci_register():
612  *		Register a pHCI module with mpxio framework
613  *		mdi_phci_register() is called by pHCI drivers to register with
614  *		the mpxio framework and a specific 'class_driver' vHCI.  The
615  *		pHCI driver must call this interface as part of its attach(9e)
616  *		handler.
617  * Return Values:
618  *		MDI_SUCCESS
619  *		MDI_FAILURE
620  */
621 /*ARGSUSED*/
622 int
623 mdi_phci_register(char *class, dev_info_t *pdip, int flags)
624 {
625 	mdi_phci_t		*ph;
626 	mdi_vhci_t		*vh;
627 	char			*data;
628 
629 	/*
630 	 * Some subsystems, like fcp, perform pHCI registration from a
631 	 * different thread than the one doing the pHCI attach(9E) - the
632 	 * driver attach code is waiting for this other thread to complete.
633 	 * This means we can only ASSERT DEVI_BUSY_CHANGING of parent
634 	 * (indicating that some thread has done an ndi_devi_enter of parent)
635 	 * not DEVI_BUSY_OWNED (which would indicate that we did the enter).
636 	 */
637 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
638 
639 	/*
640 	 * Check for mpxio-disable property. Enable mpxio if the property is
641 	 * missing or not set to "yes".
642 	 * If the property is set to "yes" then emit a brief message.
643 	 */
644 	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
645 	    &data) == DDI_SUCCESS)) {
646 		if (strcmp(data, "yes") == 0) {
647 			MDI_DEBUG(1, (MDI_CONT, pdip,
648 			    "?multipath capabilities disabled via %s.conf.",
649 			    ddi_driver_name(pdip)));
650 			ddi_prop_free(data);
651 			return (MDI_FAILURE);
652 		}
653 		ddi_prop_free(data);
654 	}
655 
656 	/*
657 	 * Search for a matching vHCI
658 	 */
659 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
660 	if (vh == NULL) {
661 		return (MDI_FAILURE);
662 	}
663 
664 	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
665 	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
666 	ph->ph_dip = pdip;
667 	ph->ph_vhci = vh;
668 	ph->ph_next = NULL;
669 	ph->ph_unstable = 0;
670 	ph->ph_vprivate = 0;
671 	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);
672 
673 	MDI_PHCI_LOCK(ph);
674 	MDI_PHCI_SET_POWER_UP(ph);
675 	MDI_PHCI_UNLOCK(ph);
676 	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
677 	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;
678 
679 	vhcache_phci_add(vh->vh_config, ph);
680 
681 	MDI_VHCI_PHCI_LOCK(vh);
682 	if (vh->vh_phci_head == NULL) {
683 		vh->vh_phci_head = ph;
684 	}
685 	if (vh->vh_phci_tail) {
686 		vh->vh_phci_tail->ph_next = ph;
687 	}
688 	vh->vh_phci_tail = ph;
689 	vh->vh_phci_count++;
690 	MDI_VHCI_PHCI_UNLOCK(vh);
691 
692 	i_mdi_log_sysevent(pdip, class, ESC_DDI_INITIATOR_REGISTER);
693 	return (MDI_SUCCESS);
694 }
695 
696 /*
697  * mdi_phci_unregister():
698  *		Unregister a pHCI module from mpxio framework
699  *		mdi_phci_unregister() is called by the pHCI drivers from their
700  *		detach(9E) handler to unregister their instances from the
701  *		framework.
702  * Return Values:
703  *		MDI_SUCCESS
704  *		MDI_FAILURE
705  */
706 /*ARGSUSED*/
707 int
708 mdi_phci_unregister(dev_info_t *pdip, int flags)
709 {
710 	mdi_vhci_t		*vh;
711 	mdi_phci_t		*ph;
712 	mdi_phci_t		*tmp;
713 	mdi_phci_t		*prev = NULL;
714 	mdi_pathinfo_t		*pip;
715 
716 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
717 
718 	ph = i_devi_get_phci(pdip);
719 	if (ph == NULL) {
720 		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid pHCI"));
721 		return (MDI_FAILURE);
722 	}
723 
724 	vh = ph->ph_vhci;
725 	ASSERT(vh != NULL);
726 	if (vh == NULL) {
727 		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid vHCI"));
728 		return (MDI_FAILURE);
729 	}
730 
731 	MDI_VHCI_PHCI_LOCK(vh);
732 	tmp = vh->vh_phci_head;
733 	while (tmp) {
734 		if (tmp == ph) {
735 			break;
736 		}
737 		prev = tmp;
738 		tmp = tmp->ph_next;
739 	}
740 
741 	if (ph == vh->vh_phci_head) {
742 		vh->vh_phci_head = ph->ph_next;
743 	} else {
744 		prev->ph_next = ph->ph_next;
745 	}
746 
747 	if (ph == vh->vh_phci_tail) {
748 		vh->vh_phci_tail = prev;
749 	}
750 
751 	vh->vh_phci_count--;
752 	MDI_VHCI_PHCI_UNLOCK(vh);
753 
754 	/* Walk remaining pathinfo nodes and disassociate them from pHCI */
755 	MDI_PHCI_LOCK(ph);
756 	for (pip = (mdi_pathinfo_t *)ph->ph_path_head; pip;
757 	    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link)
758 		MDI_PI(pip)->pi_phci = NULL;
759 	MDI_PHCI_UNLOCK(ph);
760 
761 	i_mdi_log_sysevent(pdip, ph->ph_vhci->vh_class,
762 	    ESC_DDI_INITIATOR_UNREGISTER);
763 	vhcache_phci_remove(vh->vh_config, ph);
764 	cv_destroy(&ph->ph_unstable_cv);
765 	mutex_destroy(&ph->ph_mutex);
766 	kmem_free(ph, sizeof (mdi_phci_t));
767 	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
768 	DEVI(pdip)->devi_mdi_xhci = NULL;
769 	return (MDI_SUCCESS);
770 }
771 
772 /*
773  * i_devi_get_phci():
774  * 		Utility function to return the phci extensions.
775  */
776 static mdi_phci_t *
777 i_devi_get_phci(dev_info_t *pdip)
778 {
779 	mdi_phci_t	*ph = NULL;
780 
781 	if (MDI_PHCI(pdip)) {
782 		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
783 	}
784 	return (ph);
785 }
786 
787 /*
788  * Single thread mdi entry into devinfo node for modifying its children.
789  * If necessary we perform an ndi_devi_enter of the vHCI before doing
790  * an ndi_devi_enter of 'dip'.  We maintain circular in two parts: one
791  * for the vHCI and one for the pHCI.
792  */
793 void
794 mdi_devi_enter(dev_info_t *phci_dip, int *circular)
795 {
796 	dev_info_t	*vdip;
797 	int		vcircular, pcircular;
798 
799 	/* Verify calling context */
800 	ASSERT(MDI_PHCI(phci_dip));
801 	vdip = mdi_devi_get_vdip(phci_dip);
802 	ASSERT(vdip);			/* A pHCI always has a vHCI */
803 
804 	/*
805 	 * If pHCI is detaching then the framework has already entered the
806 	 * vHCI on a threads that went down the code path leading to
807 	 * detach_node().  This framework enter of the vHCI during pHCI
808 	 * detach is done to avoid deadlock with vHCI power management
809 	 * operations which enter the vHCI and the enter down the path
810 	 * to the pHCI. If pHCI is detaching then we piggyback this calls
811 	 * enter of the vHCI on frameworks vHCI enter that has already
812 	 * occurred - this is OK because we know that the framework thread
813 	 * doing detach is waiting for our completion.
814 	 *
815 	 * We should DEVI_IS_DETACHING under an enter of the parent to avoid
816 	 * race with detach - but we can't do that because the framework has
817 	 * already entered the parent, so we have some complexity instead.
818 	 */
819 	for (;;) {
820 		if (ndi_devi_tryenter(vdip, &vcircular)) {
821 			ASSERT(vcircular != -1);
822 			if (DEVI_IS_DETACHING(phci_dip)) {
823 				ndi_devi_exit(vdip, vcircular);
824 				vcircular = -1;
825 			}
826 			break;
827 		} else if (DEVI_IS_DETACHING(phci_dip)) {
828 			vcircular = -1;
829 			break;
830 		} else if (servicing_interrupt()) {
831 			/*
832 			 * Don't delay an interrupt (and ensure adaptive
833 			 * mutex inversion support).
834 			 */
835 			ndi_devi_enter(vdip, &vcircular);
836 			break;
837 		} else {
838 			delay_random(2);
839 		}
840 	}
841 
842 	ndi_devi_enter(phci_dip, &pcircular);
843 	*circular = (vcircular << 16) | (pcircular & 0xFFFF);
844 }
845 
846 /*
847  * Attempt to mdi_devi_enter.
848  */
849 int
850 mdi_devi_tryenter(dev_info_t *phci_dip, int *circular)
851 {
852 	dev_info_t	*vdip;
853 	int		vcircular, pcircular;
854 
855 	/* Verify calling context */
856 	ASSERT(MDI_PHCI(phci_dip));
857 	vdip = mdi_devi_get_vdip(phci_dip);
858 	ASSERT(vdip);			/* A pHCI always has a vHCI */
859 
860 	if (ndi_devi_tryenter(vdip, &vcircular)) {
861 		if (ndi_devi_tryenter(phci_dip, &pcircular)) {
862 			*circular = (vcircular << 16) | (pcircular & 0xFFFF);
863 			return (1);	/* locked */
864 		}
865 		ndi_devi_exit(vdip, vcircular);
866 	}
867 	return (0);			/* busy */
868 }
869 
870 /*
871  * Release mdi_devi_enter or successful mdi_devi_tryenter.
872  */
873 void
874 mdi_devi_exit(dev_info_t *phci_dip, int circular)
875 {
876 	dev_info_t	*vdip;
877 	int		vcircular, pcircular;
878 
879 	/* Verify calling context */
880 	ASSERT(MDI_PHCI(phci_dip));
881 	vdip = mdi_devi_get_vdip(phci_dip);
882 	ASSERT(vdip);			/* A pHCI always has a vHCI */
883 
884 	/* extract two circular recursion values from single int */
885 	pcircular = (short)(circular & 0xFFFF);
886 	vcircular = (short)((circular >> 16) & 0xFFFF);
887 
888 	ndi_devi_exit(phci_dip, pcircular);
889 	if (vcircular != -1)
890 		ndi_devi_exit(vdip, vcircular);
891 }
892 
893 /*
894  * The functions mdi_devi_exit_phci() and mdi_devi_enter_phci() are used
895  * around a pHCI drivers calls to mdi_pi_online/offline, after holding
896  * the pathinfo node via mdi_hold_path/mdi_rele_path, to avoid deadlock
897  * with vHCI power management code during path online/offline.  Each
898  * mdi_devi_exit_phci must have a matching mdi_devi_enter_phci, and both must
899  * occur within the scope of an active mdi_devi_enter that establishes the
900  * circular value.
901  */
902 void
903 mdi_devi_exit_phci(dev_info_t *phci_dip, int circular)
904 {
905 	int		pcircular;
906 
907 	/* Verify calling context */
908 	ASSERT(MDI_PHCI(phci_dip));
909 
910 	/* Keep hold on pHCI until we reenter in mdi_devi_enter_phci */
911 	ndi_hold_devi(phci_dip);
912 
913 	pcircular = (short)(circular & 0xFFFF);
914 	ndi_devi_exit(phci_dip, pcircular);
915 }
916 
917 void
918 mdi_devi_enter_phci(dev_info_t *phci_dip, int *circular)
919 {
920 	int		pcircular;
921 
922 	/* Verify calling context */
923 	ASSERT(MDI_PHCI(phci_dip));
924 
925 	ndi_devi_enter(phci_dip, &pcircular);
926 
927 	/* Drop hold from mdi_devi_exit_phci. */
928 	ndi_rele_devi(phci_dip);
929 
930 	/* verify matching mdi_devi_exit_phci/mdi_devi_enter_phci use */
931 	ASSERT(pcircular == ((short)(*circular & 0xFFFF)));
932 }
933 
934 /*
935  * mdi_devi_get_vdip():
936  *		given a pHCI dip return vHCI dip
937  */
938 dev_info_t *
939 mdi_devi_get_vdip(dev_info_t *pdip)
940 {
941 	mdi_phci_t	*ph;
942 
943 	ph = i_devi_get_phci(pdip);
944 	if (ph && ph->ph_vhci)
945 		return (ph->ph_vhci->vh_dip);
946 	return (NULL);
947 }
948 
949 /*
950  * mdi_devi_pdip_entered():
951  *		Return 1 if we are vHCI and have done an ndi_devi_enter
952  *		of a pHCI
953  */
954 int
955 mdi_devi_pdip_entered(dev_info_t *vdip)
956 {
957 	mdi_vhci_t	*vh;
958 	mdi_phci_t	*ph;
959 
960 	vh = i_devi_get_vhci(vdip);
961 	if (vh == NULL)
962 		return (0);
963 
964 	MDI_VHCI_PHCI_LOCK(vh);
965 	ph = vh->vh_phci_head;
966 	while (ph) {
967 		if (ph->ph_dip && DEVI_BUSY_OWNED(ph->ph_dip)) {
968 			MDI_VHCI_PHCI_UNLOCK(vh);
969 			return (1);
970 		}
971 		ph = ph->ph_next;
972 	}
973 	MDI_VHCI_PHCI_UNLOCK(vh);
974 	return (0);
975 }
976 
977 /*
978  * mdi_phci_path2devinfo():
979  * 		Utility function to search for a valid phci device given
980  *		the devfs pathname.
981  */
982 dev_info_t *
983 mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
984 {
985 	char		*temp_pathname;
986 	mdi_vhci_t	*vh;
987 	mdi_phci_t	*ph;
988 	dev_info_t 	*pdip = NULL;
989 
990 	vh = i_devi_get_vhci(vdip);
991 	ASSERT(vh != NULL);
992 
993 	if (vh == NULL) {
994 		/*
995 		 * Invalid vHCI component, return failure
996 		 */
997 		return (NULL);
998 	}
999 
1000 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1001 	MDI_VHCI_PHCI_LOCK(vh);
1002 	ph = vh->vh_phci_head;
1003 	while (ph != NULL) {
1004 		pdip = ph->ph_dip;
1005 		ASSERT(pdip != NULL);
1006 		*temp_pathname = '\0';
1007 		(void) ddi_pathname(pdip, temp_pathname);
1008 		if (strcmp(temp_pathname, pathname) == 0) {
1009 			break;
1010 		}
1011 		ph = ph->ph_next;
1012 	}
1013 	if (ph == NULL) {
1014 		pdip = NULL;
1015 	}
1016 	MDI_VHCI_PHCI_UNLOCK(vh);
1017 	kmem_free(temp_pathname, MAXPATHLEN);
1018 	return (pdip);
1019 }
1020 
1021 /*
1022  * mdi_phci_get_path_count():
1023  * 		get number of path information nodes associated with a given
1024  *		pHCI device.
1025  */
1026 int
1027 mdi_phci_get_path_count(dev_info_t *pdip)
1028 {
1029 	mdi_phci_t	*ph;
1030 	int		count = 0;
1031 
1032 	ph = i_devi_get_phci(pdip);
1033 	if (ph != NULL) {
1034 		count = ph->ph_path_count;
1035 	}
1036 	return (count);
1037 }
1038 
1039 /*
1040  * i_mdi_phci_lock():
1041  *		Lock a pHCI device
1042  * Return Values:
1043  *		None
1044  * Note:
1045  *		The default locking order is:
1046  *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
1047  *		But there are number of situations where locks need to be
1048  *		grabbed in reverse order.  This routine implements try and lock
1049  *		mechanism depending on the requested parameter option.
1050  */
1051 static void
1052 i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
1053 {
1054 	if (pip) {
1055 		/* Reverse locking is requested. */
1056 		while (MDI_PHCI_TRYLOCK(ph) == 0) {
1057 			if (servicing_interrupt()) {
1058 				MDI_PI_HOLD(pip);
1059 				MDI_PI_UNLOCK(pip);
1060 				MDI_PHCI_LOCK(ph);
1061 				MDI_PI_LOCK(pip);
1062 				MDI_PI_RELE(pip);
1063 				break;
1064 			} else {
1065 				/*
1066 				 * tryenter failed. Try to grab again
1067 				 * after a small delay
1068 				 */
1069 				MDI_PI_HOLD(pip);
1070 				MDI_PI_UNLOCK(pip);
1071 				delay_random(2);
1072 				MDI_PI_LOCK(pip);
1073 				MDI_PI_RELE(pip);
1074 			}
1075 		}
1076 	} else {
1077 		MDI_PHCI_LOCK(ph);
1078 	}
1079 }
1080 
1081 /*
1082  * i_mdi_phci_unlock():
1083  *		Unlock the pHCI component
1084  */
1085 static void
1086 i_mdi_phci_unlock(mdi_phci_t *ph)
1087 {
1088 	MDI_PHCI_UNLOCK(ph);
1089 }
1090 
1091 /*
1092  * i_mdi_devinfo_create():
1093  *		create client device's devinfo node
1094  * Return Values:
1095  *		dev_info
1096  *		NULL
1097  * Notes:
1098  */
1099 static dev_info_t *
1100 i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
1101 	char **compatible, int ncompatible)
1102 {
1103 	dev_info_t *cdip = NULL;
1104 
1105 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1106 
1107 	/* Verify for duplicate entry */
1108 	cdip = i_mdi_devinfo_find(vh, name, guid);
1109 	ASSERT(cdip == NULL);
1110 	if (cdip) {
1111 		cmn_err(CE_WARN,
1112 		    "i_mdi_devinfo_create: client %s@%s already exists",
1113 			name ? name : "", guid ? guid : "");
1114 	}
1115 
1116 	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
1117 	if (cdip == NULL)
1118 		goto fail;
1119 
1120 	/*
1121 	 * Create component type and Global unique identifier
1122 	 * properties
1123 	 */
1124 	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
1125 	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
1126 		goto fail;
1127 	}
1128 
1129 	/* Decorate the node with compatible property */
1130 	if (compatible &&
1131 	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
1132 	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
1133 		goto fail;
1134 	}
1135 
1136 	return (cdip);
1137 
1138 fail:
1139 	if (cdip) {
1140 		(void) ndi_prop_remove_all(cdip);
1141 		(void) ndi_devi_free(cdip);
1142 	}
1143 	return (NULL);
1144 }
1145 
1146 /*
1147  * i_mdi_devinfo_find():
1148  *		Find a matching devinfo node for given client node name
1149  *		and its guid.
1150  * Return Values:
1151  *		Handle to a dev_info node or NULL
1152  */
1153 static dev_info_t *
1154 i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
1155 {
1156 	char			*data;
1157 	dev_info_t 		*cdip = NULL;
1158 	dev_info_t 		*ndip = NULL;
1159 	int			circular;
1160 
1161 	ndi_devi_enter(vh->vh_dip, &circular);
1162 	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
1163 	while ((cdip = ndip) != NULL) {
1164 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1165 
1166 		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
1167 			continue;
1168 		}
1169 
1170 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
1171 		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
1172 		    &data) != DDI_PROP_SUCCESS) {
1173 			continue;
1174 		}
1175 
1176 		if (strcmp(data, guid) != 0) {
1177 			ddi_prop_free(data);
1178 			continue;
1179 		}
1180 		ddi_prop_free(data);
1181 		break;
1182 	}
1183 	ndi_devi_exit(vh->vh_dip, circular);
1184 	return (cdip);
1185 }
1186 
1187 /*
1188  * i_mdi_devinfo_remove():
1189  *		Remove a client device node
1190  */
1191 static int
1192 i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
1193 {
1194 	int	rv = MDI_SUCCESS;
1195 
1196 	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
1197 	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
1198 		rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN | NDI_DEVI_REMOVE);
1199 		if (rv != NDI_SUCCESS) {
1200 			MDI_DEBUG(1, (MDI_NOTE, cdip,
1201 			    "!failed: cdip %p", (void *)cdip));
1202 		}
1203 		/*
1204 		 * Convert to MDI error code
1205 		 */
1206 		switch (rv) {
1207 		case NDI_SUCCESS:
1208 			rv = MDI_SUCCESS;
1209 			break;
1210 		case NDI_BUSY:
1211 			rv = MDI_BUSY;
1212 			break;
1213 		default:
1214 			rv = MDI_FAILURE;
1215 			break;
1216 		}
1217 	}
1218 	return (rv);
1219 }
1220 
1221 /*
1222  * i_devi_get_client()
1223  *		Utility function to get mpxio component extensions
1224  */
1225 static mdi_client_t *
1226 i_devi_get_client(dev_info_t *cdip)
1227 {
1228 	mdi_client_t	*ct = NULL;
1229 
1230 	if (MDI_CLIENT(cdip)) {
1231 		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
1232 	}
1233 	return (ct);
1234 }
1235 
1236 /*
1237  * i_mdi_is_child_present():
1238  *		Search for the presence of client device dev_info node
1239  */
1240 static int
1241 i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
1242 {
1243 	int		rv = MDI_FAILURE;
1244 	struct dev_info	*dip;
1245 	int		circular;
1246 
1247 	ndi_devi_enter(vdip, &circular);
1248 	dip = DEVI(vdip)->devi_child;
1249 	while (dip) {
1250 		if (dip == DEVI(cdip)) {
1251 			rv = MDI_SUCCESS;
1252 			break;
1253 		}
1254 		dip = dip->devi_sibling;
1255 	}
1256 	ndi_devi_exit(vdip, circular);
1257 	return (rv);
1258 }
1259 
1260 
1261 /*
1262  * i_mdi_client_lock():
1263  *		Grab client component lock
1264  * Return Values:
1265  *		None
1266  * Note:
1267  *		The default locking order is:
1268  *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
1269  *		But there are number of situations where locks need to be
1270  *		grabbed in reverse order.  This routine implements try and lock
1271  *		mechanism depending on the requested parameter option.
1272  */
1273 static void
1274 i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
1275 {
1276 	if (pip) {
1277 		/*
1278 		 * Reverse locking is requested.
1279 		 */
1280 		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
1281 			if (servicing_interrupt()) {
1282 				MDI_PI_HOLD(pip);
1283 				MDI_PI_UNLOCK(pip);
1284 				MDI_CLIENT_LOCK(ct);
1285 				MDI_PI_LOCK(pip);
1286 				MDI_PI_RELE(pip);
1287 				break;
1288 			} else {
1289 				/*
1290 				 * tryenter failed. Try to grab again
1291 				 * after a small delay
1292 				 */
1293 				MDI_PI_HOLD(pip);
1294 				MDI_PI_UNLOCK(pip);
1295 				delay_random(2);
1296 				MDI_PI_LOCK(pip);
1297 				MDI_PI_RELE(pip);
1298 			}
1299 		}
1300 	} else {
1301 		MDI_CLIENT_LOCK(ct);
1302 	}
1303 }
1304 
1305 /*
1306  * i_mdi_client_unlock():
1307  *		Unlock a client component
1308  */
1309 static void
1310 i_mdi_client_unlock(mdi_client_t *ct)
1311 {
1312 	MDI_CLIENT_UNLOCK(ct);
1313 }
1314 
1315 /*
1316  * i_mdi_client_alloc():
1317  * 		Allocate and initialize a client structure.  Caller should
1318  *		hold the vhci client lock.
1319  * Return Values:
1320  *		Handle to a client component
1321  */
1322 /*ARGSUSED*/
1323 static mdi_client_t *
1324 i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
1325 {
1326 	mdi_client_t	*ct;
1327 
1328 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1329 
1330 	/*
1331 	 * Allocate and initialize a component structure.
1332 	 */
1333 	ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
1334 	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1335 	ct->ct_hnext = NULL;
1336 	ct->ct_hprev = NULL;
1337 	ct->ct_dip = NULL;
1338 	ct->ct_vhci = vh;
1339 	ct->ct_drvname = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1340 	(void) strcpy(ct->ct_drvname, name);
1341 	ct->ct_guid = kmem_alloc(strlen(lguid) + 1, KM_SLEEP);
1342 	(void) strcpy(ct->ct_guid, lguid);
1343 	ct->ct_cprivate = NULL;
1344 	ct->ct_vprivate = NULL;
1345 	ct->ct_flags = 0;
1346 	ct->ct_state = MDI_CLIENT_STATE_FAILED;
1347 	MDI_CLIENT_LOCK(ct);
1348 	MDI_CLIENT_SET_OFFLINE(ct);
1349 	MDI_CLIENT_SET_DETACH(ct);
1350 	MDI_CLIENT_SET_POWER_UP(ct);
1351 	MDI_CLIENT_UNLOCK(ct);
1352 	ct->ct_failover_flags = 0;
1353 	ct->ct_failover_status = 0;
1354 	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
1355 	ct->ct_unstable = 0;
1356 	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
1357 	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
1358 	ct->ct_lb = vh->vh_lb;
1359 	ct->ct_lb_args =  kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
1360 	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
1361 	ct->ct_path_count = 0;
1362 	ct->ct_path_head = NULL;
1363 	ct->ct_path_tail = NULL;
1364 	ct->ct_path_last = NULL;
1365 
1366 	/*
1367 	 * Add this client component to our client hash queue
1368 	 */
1369 	i_mdi_client_enlist_table(vh, ct);
1370 	return (ct);
1371 }
1372 
1373 /*
1374  * i_mdi_client_enlist_table():
1375  *		Attach the client device to the client hash table. Caller
1376  *		should hold the vhci client lock.
1377  */
1378 static void
1379 i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1380 {
1381 	int 			index;
1382 	struct client_hash	*head;
1383 
1384 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1385 
1386 	index = i_mdi_get_hash_key(ct->ct_guid);
1387 	head = &vh->vh_client_table[index];
1388 	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
1389 	head->ct_hash_head = ct;
1390 	head->ct_hash_count++;
1391 	vh->vh_client_count++;
1392 }
1393 
1394 /*
1395  * i_mdi_client_delist_table():
1396  *		Attach the client device to the client hash table.
1397  *		Caller should hold the vhci client lock.
1398  */
1399 static void
1400 i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1401 {
1402 	int			index;
1403 	char			*guid;
1404 	struct client_hash 	*head;
1405 	mdi_client_t		*next;
1406 	mdi_client_t		*last;
1407 
1408 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1409 
1410 	guid = ct->ct_guid;
1411 	index = i_mdi_get_hash_key(guid);
1412 	head = &vh->vh_client_table[index];
1413 
1414 	last = NULL;
1415 	next = (mdi_client_t *)head->ct_hash_head;
1416 	while (next != NULL) {
1417 		if (next == ct) {
1418 			break;
1419 		}
1420 		last = next;
1421 		next = next->ct_hnext;
1422 	}
1423 
1424 	if (next) {
1425 		head->ct_hash_count--;
1426 		if (last == NULL) {
1427 			head->ct_hash_head = ct->ct_hnext;
1428 		} else {
1429 			last->ct_hnext = ct->ct_hnext;
1430 		}
1431 		ct->ct_hnext = NULL;
1432 		vh->vh_client_count--;
1433 	}
1434 }
1435 
1436 
1437 /*
1438  * i_mdi_client_free():
1439  *		Free a client component
1440  */
1441 static int
1442 i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
1443 {
1444 	int		rv = MDI_SUCCESS;
1445 	int		flags = ct->ct_flags;
1446 	dev_info_t	*cdip;
1447 	dev_info_t	*vdip;
1448 
1449 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1450 
1451 	vdip = vh->vh_dip;
1452 	cdip = ct->ct_dip;
1453 
1454 	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
1455 	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
1456 	DEVI(cdip)->devi_mdi_client = NULL;
1457 
1458 	/*
1459 	 * Clear out back ref. to dev_info_t node
1460 	 */
1461 	ct->ct_dip = NULL;
1462 
1463 	/*
1464 	 * Remove this client from our hash queue
1465 	 */
1466 	i_mdi_client_delist_table(vh, ct);
1467 
1468 	/*
1469 	 * Uninitialize and free the component
1470 	 */
1471 	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
1472 	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
1473 	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
1474 	cv_destroy(&ct->ct_failover_cv);
1475 	cv_destroy(&ct->ct_unstable_cv);
1476 	cv_destroy(&ct->ct_powerchange_cv);
1477 	mutex_destroy(&ct->ct_mutex);
1478 	kmem_free(ct, sizeof (*ct));
1479 
1480 	if (cdip != NULL) {
1481 		MDI_VHCI_CLIENT_UNLOCK(vh);
1482 		(void) i_mdi_devinfo_remove(vdip, cdip, flags);
1483 		MDI_VHCI_CLIENT_LOCK(vh);
1484 	}
1485 	return (rv);
1486 }
1487 
1488 /*
1489  * i_mdi_client_find():
1490  * 		Find the client structure corresponding to a given guid
1491  *		Caller should hold the vhci client lock.
1492  */
1493 static mdi_client_t *
1494 i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
1495 {
1496 	int			index;
1497 	struct client_hash	*head;
1498 	mdi_client_t		*ct;
1499 
1500 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1501 
1502 	index = i_mdi_get_hash_key(guid);
1503 	head = &vh->vh_client_table[index];
1504 
1505 	ct = head->ct_hash_head;
1506 	while (ct != NULL) {
1507 		if (strcmp(ct->ct_guid, guid) == 0 &&
1508 		    (cname == NULL || strcmp(ct->ct_drvname, cname) == 0)) {
1509 			break;
1510 		}
1511 		ct = ct->ct_hnext;
1512 	}
1513 	return (ct);
1514 }
1515 
1516 /*
1517  * i_mdi_client_update_state():
1518  *		Compute and update client device state
1519  * Notes:
1520  *		A client device can be in any of three possible states:
1521  *
1522  *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
1523  *		one online/standby paths. Can tolerate failures.
1524  *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
1525  *		no alternate paths available as standby. A failure on the online
1526  *		would result in loss of access to device data.
1527  *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
1528  *		no paths available to access the device.
1529  */
1530 static void
1531 i_mdi_client_update_state(mdi_client_t *ct)
1532 {
1533 	int state;
1534 
1535 	ASSERT(MDI_CLIENT_LOCKED(ct));
1536 	state = i_mdi_client_compute_state(ct, NULL);
1537 	MDI_CLIENT_SET_STATE(ct, state);
1538 }
1539 
1540 /*
1541  * i_mdi_client_compute_state():
1542  *		Compute client device state
1543  *
1544  *		mdi_phci_t *	Pointer to pHCI structure which should
1545  *				while computing the new value.  Used by
1546  *				i_mdi_phci_offline() to find the new
1547  *				client state after DR of a pHCI.
1548  */
1549 static int
1550 i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
1551 {
1552 	int		state;
1553 	int		online_count = 0;
1554 	int		standby_count = 0;
1555 	mdi_pathinfo_t	*pip, *next;
1556 
1557 	ASSERT(MDI_CLIENT_LOCKED(ct));
1558 	pip = ct->ct_path_head;
1559 	while (pip != NULL) {
1560 		MDI_PI_LOCK(pip);
1561 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1562 		if (MDI_PI(pip)->pi_phci == ph) {
1563 			MDI_PI_UNLOCK(pip);
1564 			pip = next;
1565 			continue;
1566 		}
1567 
1568 		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1569 				== MDI_PATHINFO_STATE_ONLINE)
1570 			online_count++;
1571 		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1572 				== MDI_PATHINFO_STATE_STANDBY)
1573 			standby_count++;
1574 		MDI_PI_UNLOCK(pip);
1575 		pip = next;
1576 	}
1577 
1578 	if (online_count == 0) {
1579 		if (standby_count == 0) {
1580 			state = MDI_CLIENT_STATE_FAILED;
1581 			MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
1582 			    "client state failed: ct = %p", (void *)ct));
1583 		} else if (standby_count == 1) {
1584 			state = MDI_CLIENT_STATE_DEGRADED;
1585 		} else {
1586 			state = MDI_CLIENT_STATE_OPTIMAL;
1587 		}
1588 	} else if (online_count == 1) {
1589 		if (standby_count == 0) {
1590 			state = MDI_CLIENT_STATE_DEGRADED;
1591 		} else {
1592 			state = MDI_CLIENT_STATE_OPTIMAL;
1593 		}
1594 	} else {
1595 		state = MDI_CLIENT_STATE_OPTIMAL;
1596 	}
1597 	return (state);
1598 }
1599 
1600 /*
1601  * i_mdi_client2devinfo():
1602  *		Utility function
1603  */
1604 dev_info_t *
1605 i_mdi_client2devinfo(mdi_client_t *ct)
1606 {
1607 	return (ct->ct_dip);
1608 }
1609 
1610 /*
1611  * mdi_client_path2_devinfo():
1612  * 		Given the parent devinfo and child devfs pathname, search for
1613  *		a valid devfs node handle.
1614  */
1615 dev_info_t *
1616 mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
1617 {
1618 	dev_info_t 	*cdip = NULL;
1619 	dev_info_t 	*ndip = NULL;
1620 	char		*temp_pathname;
1621 	int		circular;
1622 
1623 	/*
1624 	 * Allocate temp buffer
1625 	 */
1626 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1627 
1628 	/*
1629 	 * Lock parent against changes
1630 	 */
1631 	ndi_devi_enter(vdip, &circular);
1632 	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
1633 	while ((cdip = ndip) != NULL) {
1634 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1635 
1636 		*temp_pathname = '\0';
1637 		(void) ddi_pathname(cdip, temp_pathname);
1638 		if (strcmp(temp_pathname, pathname) == 0) {
1639 			break;
1640 		}
1641 	}
1642 	/*
1643 	 * Release devinfo lock
1644 	 */
1645 	ndi_devi_exit(vdip, circular);
1646 
1647 	/*
1648 	 * Free the temp buffer
1649 	 */
1650 	kmem_free(temp_pathname, MAXPATHLEN);
1651 	return (cdip);
1652 }
1653 
1654 /*
1655  * mdi_client_get_path_count():
1656  * 		Utility function to get number of path information nodes
1657  *		associated with a given client device.
1658  */
1659 int
1660 mdi_client_get_path_count(dev_info_t *cdip)
1661 {
1662 	mdi_client_t	*ct;
1663 	int		count = 0;
1664 
1665 	ct = i_devi_get_client(cdip);
1666 	if (ct != NULL) {
1667 		count = ct->ct_path_count;
1668 	}
1669 	return (count);
1670 }
1671 
1672 
1673 /*
1674  * i_mdi_get_hash_key():
1675  * 		Create a hash using strings as keys
1676  *
1677  */
1678 static int
1679 i_mdi_get_hash_key(char *str)
1680 {
1681 	uint32_t	g, hash = 0;
1682 	char		*p;
1683 
1684 	for (p = str; *p != '\0'; p++) {
1685 		g = *p;
1686 		hash += g;
1687 	}
1688 	return (hash % (CLIENT_HASH_TABLE_SIZE - 1));
1689 }
1690 
1691 /*
1692  * mdi_get_lb_policy():
1693  * 		Get current load balancing policy for a given client device
1694  */
1695 client_lb_t
1696 mdi_get_lb_policy(dev_info_t *cdip)
1697 {
1698 	client_lb_t	lb = LOAD_BALANCE_NONE;
1699 	mdi_client_t	*ct;
1700 
1701 	ct = i_devi_get_client(cdip);
1702 	if (ct != NULL) {
1703 		lb = ct->ct_lb;
1704 	}
1705 	return (lb);
1706 }
1707 
1708 /*
1709  * mdi_set_lb_region_size():
1710  * 		Set current region size for the load-balance
1711  */
1712 int
1713 mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
1714 {
1715 	mdi_client_t	*ct;
1716 	int		rv = MDI_FAILURE;
1717 
1718 	ct = i_devi_get_client(cdip);
1719 	if (ct != NULL && ct->ct_lb_args != NULL) {
1720 		ct->ct_lb_args->region_size = region_size;
1721 		rv = MDI_SUCCESS;
1722 	}
1723 	return (rv);
1724 }
1725 
1726 /*
1727  * mdi_Set_lb_policy():
1728  * 		Set current load balancing policy for a given client device
1729  */
1730 int
1731 mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
1732 {
1733 	mdi_client_t	*ct;
1734 	int		rv = MDI_FAILURE;
1735 
1736 	ct = i_devi_get_client(cdip);
1737 	if (ct != NULL) {
1738 		ct->ct_lb = lb;
1739 		rv = MDI_SUCCESS;
1740 	}
1741 	return (rv);
1742 }
1743 
1744 /*
1745  * mdi_failover():
1746  *		failover function called by the vHCI drivers to initiate
1747  *		a failover operation.  This is typically due to non-availability
1748  *		of online paths to route I/O requests.  Failover can be
1749  *		triggered through user application also.
1750  *
1751  *		The vHCI driver calls mdi_failover() to initiate a failover
1752  *		operation. mdi_failover() calls back into the vHCI driver's
1753  *		vo_failover() entry point to perform the actual failover
1754  *		operation.  The reason for requiring the vHCI driver to
1755  *		initiate failover by calling mdi_failover(), instead of directly
1756  *		executing vo_failover() itself, is to ensure that the mdi
1757  *		framework can keep track of the client state properly.
1758  *		Additionally, mdi_failover() provides as a convenience the
1759  *		option of performing the failover operation synchronously or
1760  *		asynchronously
1761  *
1762  *		Upon successful completion of the failover operation, the
1763  *		paths that were previously ONLINE will be in the STANDBY state,
1764  *		and the newly activated paths will be in the ONLINE state.
1765  *
1766  *		The flags modifier determines whether the activation is done
1767  *		synchronously: MDI_FAILOVER_SYNC
1768  * Return Values:
1769  *		MDI_SUCCESS
1770  *		MDI_FAILURE
1771  *		MDI_BUSY
1772  */
1773 /*ARGSUSED*/
1774 int
1775 mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
1776 {
1777 	int			rv;
1778 	mdi_client_t		*ct;
1779 
1780 	ct = i_devi_get_client(cdip);
1781 	ASSERT(ct != NULL);
1782 	if (ct == NULL) {
1783 		/* cdip is not a valid client device. Nothing more to do. */
1784 		return (MDI_FAILURE);
1785 	}
1786 
1787 	MDI_CLIENT_LOCK(ct);
1788 
1789 	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
1790 		/* A path to the client is being freed */
1791 		MDI_CLIENT_UNLOCK(ct);
1792 		return (MDI_BUSY);
1793 	}
1794 
1795 
1796 	if (MDI_CLIENT_IS_FAILED(ct)) {
1797 		/*
1798 		 * Client is in failed state. Nothing more to do.
1799 		 */
1800 		MDI_CLIENT_UNLOCK(ct);
1801 		return (MDI_FAILURE);
1802 	}
1803 
1804 	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1805 		/*
1806 		 * Failover is already in progress; return BUSY
1807 		 */
1808 		MDI_CLIENT_UNLOCK(ct);
1809 		return (MDI_BUSY);
1810 	}
1811 	/*
1812 	 * Make sure that mdi_pathinfo node state changes are processed.
1813 	 * We do not allow failovers to progress while client path state
1814 	 * changes are in progress
1815 	 */
1816 	if (ct->ct_unstable) {
1817 		if (flags == MDI_FAILOVER_ASYNC) {
1818 			MDI_CLIENT_UNLOCK(ct);
1819 			return (MDI_BUSY);
1820 		} else {
1821 			while (ct->ct_unstable)
1822 				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
1823 		}
1824 	}
1825 
1826 	/*
1827 	 * Client device is in stable state. Before proceeding, perform sanity
1828 	 * checks again.
1829 	 */
1830 	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
1831 	    (!i_ddi_devi_attached(ct->ct_dip))) {
1832 		/*
1833 		 * Client is in failed state. Nothing more to do.
1834 		 */
1835 		MDI_CLIENT_UNLOCK(ct);
1836 		return (MDI_FAILURE);
1837 	}
1838 
1839 	/*
1840 	 * Set the client state as failover in progress.
1841 	 */
1842 	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
1843 	ct->ct_failover_flags = flags;
1844 	MDI_CLIENT_UNLOCK(ct);
1845 
1846 	if (flags == MDI_FAILOVER_ASYNC) {
1847 		/*
1848 		 * Submit the initiate failover request via CPR safe
1849 		 * taskq threads.
1850 		 */
1851 		(void) taskq_dispatch(mdi_taskq, (task_func_t *)i_mdi_failover,
1852 		    ct, KM_SLEEP);
1853 		return (MDI_ACCEPT);
1854 	} else {
1855 		/*
1856 		 * Synchronous failover mode.  Typically invoked from the user
1857 		 * land.
1858 		 */
1859 		rv = i_mdi_failover(ct);
1860 	}
1861 	return (rv);
1862 }
1863 
1864 /*
1865  * i_mdi_failover():
1866  *		internal failover function. Invokes vHCI drivers failover
1867  *		callback function and process the failover status
1868  * Return Values:
1869  *		None
1870  *
1871  * Note: A client device in failover state can not be detached or freed.
1872  */
1873 static int
1874 i_mdi_failover(void *arg)
1875 {
1876 	int		rv = MDI_SUCCESS;
1877 	mdi_client_t	*ct = (mdi_client_t *)arg;
1878 	mdi_vhci_t	*vh = ct->ct_vhci;
1879 
1880 	ASSERT(!MDI_CLIENT_LOCKED(ct));
1881 
1882 	if (vh->vh_ops->vo_failover != NULL) {
1883 		/*
1884 		 * Call vHCI drivers callback routine
1885 		 */
1886 		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
1887 		    ct->ct_failover_flags);
1888 	}
1889 
1890 	MDI_CLIENT_LOCK(ct);
1891 	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);
1892 
1893 	/*
1894 	 * Save the failover return status
1895 	 */
1896 	ct->ct_failover_status = rv;
1897 
1898 	/*
1899 	 * As a result of failover, client status would have been changed.
1900 	 * Update the client state and wake up anyone waiting on this client
1901 	 * device.
1902 	 */
1903 	i_mdi_client_update_state(ct);
1904 
1905 	cv_broadcast(&ct->ct_failover_cv);
1906 	MDI_CLIENT_UNLOCK(ct);
1907 	return (rv);
1908 }
1909 
1910 /*
1911  * Load balancing is logical block.
1912  * IOs within the range described by region_size
1913  * would go on the same path. This would improve the
1914  * performance by cache-hit on some of the RAID devices.
1915  * Search only for online paths(At some point we
1916  * may want to balance across target ports).
1917  * If no paths are found then default to round-robin.
1918  */
1919 static int
1920 i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
1921 {
1922 	int		path_index = -1;
1923 	int		online_path_count = 0;
1924 	int		online_nonpref_path_count = 0;
1925 	int 		region_size = ct->ct_lb_args->region_size;
1926 	mdi_pathinfo_t	*pip;
1927 	mdi_pathinfo_t	*next;
1928 	int		preferred, path_cnt;
1929 
1930 	pip = ct->ct_path_head;
1931 	while (pip) {
1932 		MDI_PI_LOCK(pip);
1933 		if (MDI_PI(pip)->pi_state ==
1934 		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
1935 			online_path_count++;
1936 		} else if (MDI_PI(pip)->pi_state ==
1937 		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
1938 			online_nonpref_path_count++;
1939 		}
1940 		next = (mdi_pathinfo_t *)
1941 		    MDI_PI(pip)->pi_client_link;
1942 		MDI_PI_UNLOCK(pip);
1943 		pip = next;
1944 	}
1945 	/* if found any online/preferred then use this type */
1946 	if (online_path_count > 0) {
1947 		path_cnt = online_path_count;
1948 		preferred = 1;
1949 	} else if (online_nonpref_path_count > 0) {
1950 		path_cnt = online_nonpref_path_count;
1951 		preferred = 0;
1952 	} else {
1953 		path_cnt = 0;
1954 	}
1955 	if (path_cnt) {
1956 		path_index = (bp->b_blkno >> region_size) % path_cnt;
1957 		pip = ct->ct_path_head;
1958 		while (pip && path_index != -1) {
1959 			MDI_PI_LOCK(pip);
1960 			if (path_index == 0 &&
1961 			    (MDI_PI(pip)->pi_state ==
1962 			    MDI_PATHINFO_STATE_ONLINE) &&
1963 				MDI_PI(pip)->pi_preferred == preferred) {
1964 				MDI_PI_HOLD(pip);
1965 				MDI_PI_UNLOCK(pip);
1966 				*ret_pip = pip;
1967 				return (MDI_SUCCESS);
1968 			}
1969 			path_index --;
1970 			next = (mdi_pathinfo_t *)
1971 			    MDI_PI(pip)->pi_client_link;
1972 			MDI_PI_UNLOCK(pip);
1973 			pip = next;
1974 		}
1975 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
1976 		    "lba %llx: path %s %p",
1977 		    bp->b_lblkno, mdi_pi_spathname(pip), (void *)pip));
1978 	}
1979 	return (MDI_FAILURE);
1980 }
1981 
1982 /*
1983  * mdi_select_path():
1984  *		select a path to access a client device.
1985  *
1986  *		mdi_select_path() function is called by the vHCI drivers to
1987  *		select a path to route the I/O request to.  The caller passes
1988  *		the block I/O data transfer structure ("buf") as one of the
1989  *		parameters.  The mpxio framework uses the buf structure
1990  *		contents to maintain per path statistics (total I/O size /
1991  *		count pending).  If more than one online paths are available to
1992  *		select, the framework automatically selects a suitable path
1993  *		for routing I/O request. If a failover operation is active for
1994  *		this client device the call shall be failed with MDI_BUSY error
1995  *		code.
1996  *
1997  *		By default this function returns a suitable path in online
1998  *		state based on the current load balancing policy.  Currently
1999  *		we support LOAD_BALANCE_NONE (Previously selected online path
2000  *		will continue to be used till the path is usable) and
2001  *		LOAD_BALANCE_RR (Online paths will be selected in a round
2002  *		robin fashion), LOAD_BALANCE_LB(Online paths will be selected
2003  *		based on the logical block).  The load balancing policy is
2004  *		set through vHCI drivers configuration file (driver.conf).
2005  *
2006  *		vHCI drivers may override this default behavior by specifying
2007  *		appropriate flags.  The meaning of the "arg" argument depends
2008  *		on the flags specified. If MDI_SELECT_PATH_INSTANCE is set
2009  *		then the argument is the "path instance" of the path to select.
2010  *		If MDI_SELECT_PATH_INSTANCE is not set then the argument is
2011  *		"start_pip". A non NULL "start_pip" is the starting point to
2012  *		walk and find the next appropriate path.  The following values
2013  *		are currently defined: MDI_SELECT_ONLINE_PATH (to select an
2014  *		ONLINE path) and/or MDI_SELECT_STANDBY_PATH (to select an
2015  *		STANDBY path).
2016  *
2017  *		The non-standard behavior is used by the scsi_vhci driver,
2018  *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
2019  *		attach of client devices (to avoid an unnecessary failover
2020  *		when the STANDBY path comes up first), during failover
2021  *		(to activate a STANDBY path as ONLINE).
2022  *
2023  *		The selected path is returned in a mdi_hold_path() state
2024  *		(pi_ref_cnt). Caller should release the hold by calling
2025  *		mdi_rele_path().
2026  *
2027  * Return Values:
2028  *		MDI_SUCCESS	- Completed successfully
2029  *		MDI_BUSY 	- Client device is busy failing over
2030  *		MDI_NOPATH	- Client device is online, but no valid path are
2031  *				  available to access this client device
2032  *		MDI_FAILURE	- Invalid client device or state
2033  *		MDI_DEVI_ONLINING
2034  *				- Client device (struct dev_info state) is in
2035  *				  onlining state.
2036  */
2037 
/*
 * Implementation notes for mdi_select_path():
 *
 *	- flags == 0 selects "standard behavior" (sb).  In that mode the
 *	  client state checks below are applied, the client's configured
 *	  load-balancing policy (ct_lb) drives the search, and ct_path_last
 *	  is updated when the NONE/RR walks pick a path.
 *	- flags != 0 skips the client state checks and always uses the
 *	  LOAD_BALANCE_RR walk.  'arg' is then interpreted either as a
 *	  path_instance (MDI_SELECT_PATH_INSTANCE) or as the pathinfo node
 *	  after which the walk starts.
 *	- Whenever MDI_SUCCESS is returned by the code in this function,
 *	  *ret_pip has had MDI_PI_HOLD() applied while its pi_mutex was held.
 *	- The client lock is held from MDI_CLIENT_LOCK() until each return.
 */
2038 /*ARGSUSED*/
2039 int
2040 mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
2041     void *arg, mdi_pathinfo_t **ret_pip)
2042 {
2043 	mdi_client_t	*ct;
2044 	mdi_pathinfo_t	*pip;
2045 	mdi_pathinfo_t	*next;	/* node to examine on the next iteration */
2046 	mdi_pathinfo_t	*head;	/* cached ct->ct_path_head */
2047 	mdi_pathinfo_t	*start;	/* node at which the current walk began */
2048 	client_lb_t	lbp;	/* load balancing policy */
2049 	int		sb = 1;	/* standard behavior */
2050 	int		preferred = 1;	/* preferred path */
2051 	int		cond, cont = 1;	/* match result; loop-continue flag */
2052 	int		retry = 0;	/* set when a transient path was seen */
2053 	mdi_pathinfo_t	*start_pip;	/* request starting pathinfo */
2054 	int		path_instance;	/* request specific path instance */
2055 
2056 	/* determine type of arg based on flags */
2057 	if (flags & MDI_SELECT_PATH_INSTANCE) {
2058 		path_instance = (int)(intptr_t)arg;
2059 		start_pip = NULL;
2060 	} else {
2061 		path_instance = 0;
2062 		start_pip = (mdi_pathinfo_t *)arg;
2063 	}
2064 
2065 	if (flags != 0) {
2066 		/*
2067 		 * disable default behavior
		 * (any flag bit at all, not just specific ones, clears sb)
2068 		 */
2069 		sb = 0;
2070 	}
2071 
2072 	*ret_pip = NULL;
2073 	ct = i_devi_get_client(cdip);
2074 	if (ct == NULL) {
2075 		/* mdi extensions are NULL, Nothing more to do */
2076 		return (MDI_FAILURE);
2077 	}
2078 
2079 	MDI_CLIENT_LOCK(ct);
2080 
	/* The client state checks apply to standard-behavior requests only. */
2081 	if (sb) {
2082 		if (MDI_CLIENT_IS_FAILED(ct)) {
2083 			/*
2084 			 * Client is not ready to accept any I/O requests.
2085 			 * Fail this request.
2086 			 */
2087 			MDI_DEBUG(2, (MDI_NOTE, cdip,
2088 			    "client state offline ct = %p", (void *)ct));
2089 			MDI_CLIENT_UNLOCK(ct);
2090 			return (MDI_FAILURE);
2091 		}
2092 
2093 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
2094 			/*
2095 			 * Check for Failover is in progress. If so tell the
2096 			 * caller that this device is busy.
2097 			 */
2098 			MDI_DEBUG(2, (MDI_NOTE, cdip,
2099 			    "client failover in progress ct = %p",
2100 			    (void *)ct));
2101 			MDI_CLIENT_UNLOCK(ct);
2102 			return (MDI_BUSY);
2103 		}
2104 
2105 		/*
2106 		 * Check to see whether the client device is attached.
2107 		 * If not so, let the vHCI driver manually select a path
2108 		 * (standby) and let the probe/attach process to continue.
2109 		 */
2110 		if (MDI_CLIENT_IS_DETACHED(ct) || !i_ddi_devi_attached(cdip)) {
2111 			MDI_DEBUG(4, (MDI_NOTE, cdip,
2112 			    "devi is onlining ct = %p", (void *)ct));
2113 			MDI_CLIENT_UNLOCK(ct);
2114 			return (MDI_DEVI_ONLINING);
2115 		}
2116 	}
2117 
2118 	/*
2119 	 * Cache in the client list head.  If head of the list is NULL
2120 	 * return MDI_NOPATH
2121 	 */
2122 	head = ct->ct_path_head;
2123 	if (head == NULL) {
2124 		MDI_CLIENT_UNLOCK(ct);
2125 		return (MDI_NOPATH);
2126 	}
2127 
2128 	/* Caller is specifying a specific pathinfo path by path_instance */
2129 	if (path_instance) {
2130 		/* search for pathinfo with correct path_instance */
2131 		for (pip = head;
2132 		    pip && (mdi_pi_get_path_instance(pip) != path_instance);
2133 		    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link)
2134 			;
2135 
2136 		/* If path can't be selected then MDI_FAILURE is returned. */
2137 		if (pip == NULL) {
2138 			MDI_CLIENT_UNLOCK(ct);
2139 			return (MDI_FAILURE);
2140 		}
2141 
2142 		/*
2143 		 * Verify state of path. When asked to select a specific
2144 		 * path_instance, we select the requested path in any
2145 		 * state (ONLINE, OFFLINE, STANDBY, FAULT) other than INIT.
2146 		 * We don't however select paths where the pHCI has detached.
2147 		 * NOTE: last pathinfo node of an opened client device may
2148 		 * exist in an OFFLINE state after the pHCI associated with
2149 		 * that path has detached (but pi_phci will be NULL if that
2150 		 * has occurred).
2151 		 */
2152 		MDI_PI_LOCK(pip);
2153 		if ((MDI_PI(pip)->pi_state == MDI_PATHINFO_STATE_INIT) ||
2154 		    (MDI_PI(pip)->pi_phci == NULL)) {
2155 			MDI_PI_UNLOCK(pip);
2156 			MDI_CLIENT_UNLOCK(ct);
2157 			return (MDI_FAILURE);
2158 		}
2159 
2160 		/*
2161 		 * Return the path in hold state. Caller should release the
2162 		 * lock by calling mdi_rele_path()
2163 		 */
2164 		MDI_PI_HOLD(pip);
2165 		MDI_PI_UNLOCK(pip);
2166 		*ret_pip = pip;
2167 		MDI_CLIENT_UNLOCK(ct);
2168 		return (MDI_SUCCESS);
2169 	}
2170 
2171 	/*
2172 	 * for non default behavior, bypass current
2173 	 * load balancing policy and always use LOAD_BALANCE_RR
2174 	 * except that the start point will be adjusted based
2175 	 * on the provided start_pip
2176 	 */
2177 	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;
2178 
	/*
	 * NOTE(review): the switch has no default case; a policy value other
	 * than the three handled below falls straight through to the
	 * MDI_BUSY/MDI_NOPATH return at the end of the function.
	 */
2179 	switch (lbp) {
2180 	case LOAD_BALANCE_NONE:
2181 		/*
2182 		 * Load balancing is None  or Alternate path mode
2183 		 * Start looking for a online mdi_pathinfo node starting from
2184 		 * last known selected path
2185 		 */
2186 		preferred = 1;
2187 		pip = (mdi_pathinfo_t *)ct->ct_path_last;
2188 		if (pip == NULL) {
2189 			pip = head;
2190 		}
2191 		start = pip;
2192 		do {
2193 			MDI_PI_LOCK(pip);
2194 			/*
2195 			 * No need to explicitly check if the path is disabled.
2196 			 * Since we are checking for state == ONLINE and the
2197 			 * same variable is used for DISABLE/ENABLE information.
2198 			 */
2199 			if ((MDI_PI(pip)->pi_state  ==
2200 				MDI_PATHINFO_STATE_ONLINE) &&
2201 				preferred == MDI_PI(pip)->pi_preferred) {
2202 				/*
2203 				 * Return the path in hold state. Caller should
2204 				 * release the lock by calling mdi_rele_path()
2205 				 */
2206 				MDI_PI_HOLD(pip);
2207 				MDI_PI_UNLOCK(pip);
2208 				ct->ct_path_last = pip;
2209 				*ret_pip = pip;
2210 				MDI_CLIENT_UNLOCK(ct);
2211 				return (MDI_SUCCESS);
2212 			}
2213 
2214 			/*
2215 			 * Path is busy.
			 * Remember it so that MDI_BUSY rather than MDI_NOPATH
			 * is returned if the walk finds nothing.
2216 			 */
2217 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2218 			    MDI_PI_IS_TRANSIENT(pip))
2219 				retry = 1;
2220 			/*
2221 			 * Keep looking for a next available online path
			 * (the list is walked circularly: the node after the
			 * tail is the head)
2222 			 */
2223 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2224 			if (next == NULL) {
2225 				next = head;
2226 			}
2227 			MDI_PI_UNLOCK(pip);
2228 			pip = next;
			/*
			 * Back at the starting node: the first lap accepted
			 * only preferred paths, so make one more lap for
			 * non-preferred paths and stop after that one.
			 */
2229 			if (start == pip && preferred) {
2230 				preferred = 0;
2231 			} else if (start == pip && !preferred) {
2232 				cont = 0;
2233 			}
2234 		} while (cont);
2235 		break;
2236 
2237 	case LOAD_BALANCE_LBA:
2238 		/*
2239 		 * Make sure we are looking
2240 		 * for an online path. Otherwise, if it is for a STANDBY
2241 		 * path request, it will go through and fetch an ONLINE
2242 		 * path which is not desirable.
		 *
		 * If the conditions are not met, or i_mdi_lba_lb() does not
		 * return MDI_SUCCESS, the round-robin code below is used.
2243 		 */
2244 		if ((ct->ct_lb_args != NULL) &&
2245 			    (ct->ct_lb_args->region_size) && bp &&
2246 				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
2247 			if (i_mdi_lba_lb(ct, ret_pip, bp)
2248 				    == MDI_SUCCESS) {
2249 				MDI_CLIENT_UNLOCK(ct);
2250 				return (MDI_SUCCESS);
2251 			}
2252 		}
2253 		/* FALLTHROUGH */
2254 	case LOAD_BALANCE_RR:
2255 		/*
2256 		 * Load balancing is Round Robin. Start looking for a online
2257 		 * mdi_pathinfo node starting from last known selected path
2258 		 * as the start point.  If override flags are specified,
2259 		 * process accordingly.
2260 		 * If the search is already in effect(start_pip not null),
2261 		 * then lets just use the same path preference to continue the
2262 		 * traversal.
2263 		 */
2264 
2265 		if (start_pip != NULL) {
2266 			preferred = MDI_PI(start_pip)->pi_preferred;
2267 		} else {
2268 			preferred = 1;
2269 		}
2270 
		/*
		 * Standard behavior resumes after the last selected path;
		 * otherwise the walk resumes after the caller's start_pip.
		 */
2271 		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
2272 		if (start == NULL) {
2273 			pip = head;
2274 		} else {
2275 			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
2276 			if (pip == NULL) {
2277 				if ( flags & MDI_SELECT_NO_PREFERRED) {
2278 					/*
2279 					 * Return since we hit the end of list
2280 					 */
2281 					MDI_CLIENT_UNLOCK(ct);
2282 					return (MDI_NOPATH);
2283 				}
2284 
2285 				if (!sb) {
2286 					if (preferred == 0) {
2287 						/*
2288 						 * Looks like we have completed
2289 						 * the traversal as preferred
2290 						 * value is 0. Time to bail out.
2291 						 */
2292 						*ret_pip = NULL;
2293 						MDI_CLIENT_UNLOCK(ct);
2294 						return (MDI_NOPATH);
2295 					} else {
2296 						/*
2297 						 * Looks like we reached the
2298 						 * end of the list. Lets enable
2299 						 * traversal of non preferred
2300 						 * paths.
2301 						 */
2302 						preferred = 0;
2303 					}
2304 				}
2305 				pip = head;
2306 			}
2307 		}
2308 		start = pip;
2309 		do {
2310 			MDI_PI_LOCK(pip);
			/*
			 * Decide whether this node satisfies the request.
			 * In the non-standard case flags is compared for
			 * exact equality, so any flag combination that is
			 * not listed below never matches (cond = 0).
			 */
2311 			if (sb) {
2312 				cond = ((MDI_PI(pip)->pi_state ==
2313 				    MDI_PATHINFO_STATE_ONLINE &&
2314 					MDI_PI(pip)->pi_preferred ==
2315 						preferred) ? 1 : 0);
2316 			} else {
2317 				if (flags == MDI_SELECT_ONLINE_PATH) {
2318 					cond = ((MDI_PI(pip)->pi_state ==
2319 					    MDI_PATHINFO_STATE_ONLINE &&
2320 						MDI_PI(pip)->pi_preferred ==
2321 						preferred) ? 1 : 0);
2322 				} else if (flags == MDI_SELECT_STANDBY_PATH) {
2323 					cond = ((MDI_PI(pip)->pi_state ==
2324 					    MDI_PATHINFO_STATE_STANDBY &&
2325 						MDI_PI(pip)->pi_preferred ==
2326 						preferred) ? 1 : 0);
2327 				} else if (flags == (MDI_SELECT_ONLINE_PATH |
2328 				    MDI_SELECT_STANDBY_PATH)) {
2329 					cond = (((MDI_PI(pip)->pi_state ==
2330 					    MDI_PATHINFO_STATE_ONLINE ||
2331 					    (MDI_PI(pip)->pi_state ==
2332 					    MDI_PATHINFO_STATE_STANDBY)) &&
2333 						MDI_PI(pip)->pi_preferred ==
2334 						preferred) ? 1 : 0);
2335 				} else if (flags ==
2336 					(MDI_SELECT_STANDBY_PATH |
2337 					MDI_SELECT_ONLINE_PATH |
2338 					MDI_SELECT_USER_DISABLE_PATH)) {
					/*
					 * ONLINE or STANDBY, with or without
					 * the USER_DISABLE bit set in pi_state.
					 */
2339 					cond = (((MDI_PI(pip)->pi_state ==
2340 					    MDI_PATHINFO_STATE_ONLINE ||
2341 					    (MDI_PI(pip)->pi_state ==
2342 					    MDI_PATHINFO_STATE_STANDBY) ||
2343 						(MDI_PI(pip)->pi_state ==
2344 					    (MDI_PATHINFO_STATE_ONLINE|
2345 					    MDI_PATHINFO_STATE_USER_DISABLE)) ||
2346 						(MDI_PI(pip)->pi_state ==
2347 					    (MDI_PATHINFO_STATE_STANDBY |
2348 					    MDI_PATHINFO_STATE_USER_DISABLE)))&&
2349 						MDI_PI(pip)->pi_preferred ==
2350 						preferred) ? 1 : 0);
2351 				} else if (flags ==
2352 				    (MDI_SELECT_STANDBY_PATH |
2353 				    MDI_SELECT_ONLINE_PATH |
2354 				    MDI_SELECT_NO_PREFERRED)) {
					/* pi_preferred is ignored in this mode */
2355 					cond = (((MDI_PI(pip)->pi_state ==
2356 					    MDI_PATHINFO_STATE_ONLINE) ||
2357 					    (MDI_PI(pip)->pi_state ==
2358 					    MDI_PATHINFO_STATE_STANDBY))
2359 					    ? 1 : 0);
2360 				} else {
2361 					cond = 0;
2362 				}
2363 			}
2364 			/*
2365 			 * No need to explicitly check if the path is disabled.
2366 			 * Since we are checking for state == ONLINE and the
2367 			 * same variable is used for DISABLE/ENABLE information.
2368 			 */
2369 			if (cond) {
2370 				/*
2371 				 * Return the path in hold state. Caller should
2372 				 * release the lock by calling mdi_rele_path()
2373 				 */
2374 				MDI_PI_HOLD(pip);
2375 				MDI_PI_UNLOCK(pip);
				/* only standard behavior records the pick */
2376 				if (sb)
2377 					ct->ct_path_last = pip;
2378 				*ret_pip = pip;
2379 				MDI_CLIENT_UNLOCK(ct);
2380 				return (MDI_SUCCESS);
2381 			}
2382 			/*
2383 			 * Path is busy.
2384 			 */
2385 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2386 			    MDI_PI_IS_TRANSIENT(pip))
2387 				retry = 1;
2388 
2389 			/*
2390 			 * Keep looking for a next available online path
			 *
			 * The do_again label is reached with pip locked,
			 * either by falling through from the checks above or
			 * from the goto further down, which re-locks pip
			 * before jumping.  On the goto path the match and
			 * transient checks above are not repeated for pip.
2391 			 */
2392 do_again:
2393 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2394 			if (next == NULL) {
2395 				if ( flags & MDI_SELECT_NO_PREFERRED) {
2396 					/*
2397 					 * Bail out since we hit the end of list
2398 					 */
2399 					MDI_PI_UNLOCK(pip);
2400 					break;
2401 				}
2402 
2403 				if (!sb) {
2404 					if (preferred == 1) {
2405 						/*
2406 						 * Looks like we reached the
2407 						 * end of the list. Lets enable
2408 						 * traversal of non preferred
2409 						 * paths.
2410 						 */
2411 						preferred = 0;
2412 						next = head;
2413 					} else {
2414 						/*
2415 						 * We have done both the passes
2416 						 * Preferred as well as for
2417 						 * Non-preferred. Bail out now.
2418 						 */
2419 						cont = 0;
2420 					}
2421 				} else {
2422 					/*
2423 					 * Standard behavior case.
2424 					 */
2425 					next = head;
2426 				}
2427 			}
2428 			MDI_PI_UNLOCK(pip);
			/* next is still NULL when cont was cleared above */
2429 			if (cont == 0) {
2430 				break;
2431 			}
2432 			pip = next;
2433 
2434 			if (!sb) {
2435 				/*
2436 				 * We need to handle the selection of
2437 				 * non-preferred path in the following
2438 				 * case:
2439 				 *
2440 				 * +------+   +------+   +------+   +-----+
2441 				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
2442 				 * +------+   +------+   +------+   +-----+
2443 				 *
2444 				 * If we start the search with B, we need to
2445 				 * skip beyond B to pick C which is non -
2446 				 * preferred in the second pass. The following
2447 				 * test, if true, will allow us to skip over
2448 				 * the 'start'(B in the example) to select
2449 				 * other non preferred elements.
2450 				 */
2451 				if ((start_pip != NULL) && (start_pip == pip) &&
2452 				    (MDI_PI(start_pip)->pi_preferred
2453 				    != preferred)) {
2454 					/*
2455 					 * try again after going past the start
2456 					 * pip
2457 					 */
2458 					MDI_PI_LOCK(pip);
2459 					goto do_again;
2460 				}
2461 			} else {
2462 				/*
2463 				 * Standard behavior case
2464 				 */
2465 				if (start == pip && preferred) {
2466 					/* look for nonpreferred paths */
2467 					preferred = 0;
2468 				} else if (start == pip && !preferred) {
2469 					/*
2470 					 * Exit condition
2471 					 */
2472 					cont = 0;
2473 				}
2474 			}
2475 		} while (cont);
2476 		break;
2477 	}
2478 
	/*
	 * No path was selected.  retry is set when at least one examined
	 * node was transient or driver-disable-transient; in that case
	 * MDI_BUSY is reported instead of MDI_NOPATH.
	 */
2479 	MDI_CLIENT_UNLOCK(ct);
2480 	if (retry == 1) {
2481 		return (MDI_BUSY);
2482 	} else {
2483 		return (MDI_NOPATH);
2484 	}
2485 }
2486 
2487 /*
2488  * For a client, return the next available path to any phci
2489  *
2490  * Note:
2491  *		Caller should hold the branch's devinfo node to get a consistent
2492  *		snap shot of the mdi_pathinfo nodes.
2493  *
2494  *		Please note that even the list is stable the mdi_pathinfo
2495  *		node state and properties are volatile.  The caller should lock
2496  *		and unlock the nodes by calling mdi_pi_lock() and
2497  *		mdi_pi_unlock() functions to get a stable properties.
2498  *
2499  *		If there is a need to use the nodes beyond the hold of the
2500  *		devinfo node period (For ex. I/O), then mdi_pathinfo node
2501  *		need to be held against unexpected removal by calling
2502  *		mdi_hold_path() and should be released by calling
2503  *		mdi_rele_path() on completion.
2504  */
2505 mdi_pathinfo_t *
2506 mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
2507 {
2508 	mdi_client_t *ct;
2509 
2510 	if (!MDI_CLIENT(ct_dip))
2511 		return (NULL);
2512 
2513 	/*
2514 	 * Walk through client link
2515 	 */
2516 	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
2517 	ASSERT(ct != NULL);
2518 
2519 	if (pip == NULL)
2520 		return ((mdi_pathinfo_t *)ct->ct_path_head);
2521 
2522 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link);
2523 }
2524 
2525 /*
2526  * For a phci, return the next available path to any client
2527  * Note: ditto mdi_get_next_phci_path()
2528  */
2529 mdi_pathinfo_t *
2530 mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
2531 {
2532 	mdi_phci_t *ph;
2533 
2534 	if (!MDI_PHCI(ph_dip))
2535 		return (NULL);
2536 
2537 	/*
2538 	 * Walk through pHCI link
2539 	 */
2540 	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
2541 	ASSERT(ph != NULL);
2542 
2543 	if (pip == NULL)
2544 		return ((mdi_pathinfo_t *)ph->ph_path_head);
2545 
2546 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
2547 }
2548 
2549 /*
2550  * mdi_hold_path():
2551  *		Hold the mdi_pathinfo node against unwanted unexpected free.
2552  * Return Values:
2553  *		None
2554  */
2555 void
2556 mdi_hold_path(mdi_pathinfo_t *pip)
2557 {
2558 	if (pip) {
2559 		MDI_PI_LOCK(pip);
2560 		MDI_PI_HOLD(pip);
2561 		MDI_PI_UNLOCK(pip);
2562 	}
2563 }
2564 
2565 
2566 /*
2567  * mdi_rele_path():
2568  *		Release the mdi_pathinfo node which was selected
2569  *		through mdi_select_path() mechanism or manually held by
2570  *		calling mdi_hold_path().
2571  * Return Values:
2572  *		None
2573  */
2574 void
2575 mdi_rele_path(mdi_pathinfo_t *pip)
2576 {
2577 	if (pip) {
2578 		MDI_PI_LOCK(pip);
2579 		MDI_PI_RELE(pip);
2580 		if (MDI_PI(pip)->pi_ref_cnt == 0) {
2581 			cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
2582 		}
2583 		MDI_PI_UNLOCK(pip);
2584 	}
2585 }
2586 
2587 /*
2588  * mdi_pi_lock():
2589  * 		Lock the mdi_pathinfo node.
2590  * Note:
2591  *		The caller should release the lock by calling mdi_pi_unlock()
2592  */
2593 void
2594 mdi_pi_lock(mdi_pathinfo_t *pip)
2595 {
2596 	ASSERT(pip != NULL);
2597 	if (pip) {
2598 		MDI_PI_LOCK(pip);
2599 	}
2600 }
2601 
2602 
2603 /*
2604  * mdi_pi_unlock():
2605  * 		Unlock the mdi_pathinfo node.
2606  * Note:
2607  *		The mdi_pathinfo node should have been locked with mdi_pi_lock()
2608  */
2609 void
2610 mdi_pi_unlock(mdi_pathinfo_t *pip)
2611 {
2612 	ASSERT(pip != NULL);
2613 	if (pip) {
2614 		MDI_PI_UNLOCK(pip);
2615 	}
2616 }
2617 
2618 /*
2619  * mdi_pi_find():
2620  *		Search the list of mdi_pathinfo nodes attached to the
2621  *		pHCI/Client device node whose path address matches "paddr".
2622  *		Returns a pointer to the mdi_pathinfo node if a matching node is
2623  *		found.
2624  * Return Values:
2625  *		mdi_pathinfo node handle
2626  *		NULL
2627  * Notes:
2628  *		Caller need not hold any locks to call this function.
2629  */
2630 mdi_pathinfo_t *
2631 mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
2632 {
2633 	mdi_phci_t		*ph;
2634 	mdi_vhci_t		*vh;
2635 	mdi_client_t		*ct;
2636 	mdi_pathinfo_t		*pip = NULL;
2637 
2638 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2639 	    "caddr@%s paddr@%s", caddr ? caddr : "", paddr ? paddr : ""));
2640 	if ((pdip == NULL) || (paddr == NULL)) {
2641 		return (NULL);
2642 	}
2643 	ph = i_devi_get_phci(pdip);
2644 	if (ph == NULL) {
2645 		/*
2646 		 * Invalid pHCI device, Nothing more to do.
2647 		 */
2648 		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid phci"));
2649 		return (NULL);
2650 	}
2651 
2652 	vh = ph->ph_vhci;
2653 	if (vh == NULL) {
2654 		/*
2655 		 * Invalid vHCI device, Nothing more to do.
2656 		 */
2657 		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid vhci"));
2658 		return (NULL);
2659 	}
2660 
2661 	/*
2662 	 * Look for pathinfo node identified by paddr.
2663 	 */
2664 	if (caddr == NULL) {
2665 		/*
2666 		 * Find a mdi_pathinfo node under pHCI list for a matching
2667 		 * unit address.
2668 		 */
2669 		MDI_PHCI_LOCK(ph);
2670 		if (MDI_PHCI_IS_OFFLINE(ph)) {
2671 			MDI_DEBUG(2, (MDI_WARN, pdip,
2672 			    "offline phci %p", (void *)ph));
2673 			MDI_PHCI_UNLOCK(ph);
2674 			return (NULL);
2675 		}
2676 		pip = (mdi_pathinfo_t *)ph->ph_path_head;
2677 
2678 		while (pip != NULL) {
2679 			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2680 				break;
2681 			}
2682 			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
2683 		}
2684 		MDI_PHCI_UNLOCK(ph);
2685 		MDI_DEBUG(2, (MDI_NOTE, pdip,
2686 		    "found %s %p", mdi_pi_spathname(pip), (void *)pip));
2687 		return (pip);
2688 	}
2689 
2690 	/*
2691 	 * XXX - Is the rest of the code in this function really necessary?
2692 	 * The consumers of mdi_pi_find() can search for the desired pathinfo
2693 	 * node by calling mdi_pi_find(pdip, NULL, paddr). Irrespective of
2694 	 * whether the search is based on the pathinfo nodes attached to
2695 	 * the pHCI or the client node, the result will be the same.
2696 	 */
2697 
2698 	/*
2699 	 * Find the client device corresponding to 'caddr'
2700 	 */
2701 	MDI_VHCI_CLIENT_LOCK(vh);
2702 
2703 	/*
2704 	 * XXX - Passing NULL to the following function works as long as the
2705 	 * the client addresses (caddr) are unique per vhci basis.
2706 	 */
2707 	ct = i_mdi_client_find(vh, NULL, caddr);
2708 	if (ct == NULL) {
2709 		/*
2710 		 * Client not found, Obviously mdi_pathinfo node has not been
2711 		 * created yet.
2712 		 */
2713 		MDI_VHCI_CLIENT_UNLOCK(vh);
2714 		MDI_DEBUG(2, (MDI_NOTE, pdip,
2715 		    "client not found for caddr @%s", caddr ? caddr : ""));
2716 		return (NULL);
2717 	}
2718 
2719 	/*
2720 	 * Hold the client lock and look for a mdi_pathinfo node with matching
2721 	 * pHCI and paddr
2722 	 */
2723 	MDI_CLIENT_LOCK(ct);
2724 
2725 	/*
2726 	 * Release the global mutex as it is no more needed. Note: We always
2727 	 * respect the locking order while acquiring.
2728 	 */
2729 	MDI_VHCI_CLIENT_UNLOCK(vh);
2730 
2731 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2732 	while (pip != NULL) {
2733 		/*
2734 		 * Compare the unit address
2735 		 */
2736 		if ((MDI_PI(pip)->pi_phci == ph) &&
2737 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2738 			break;
2739 		}
2740 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2741 	}
2742 	MDI_CLIENT_UNLOCK(ct);
2743 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2744 	    "found: %s %p", mdi_pi_spathname(pip), (void *)pip));
2745 	return (pip);
2746 }
2747 
2748 /*
2749  * mdi_pi_alloc():
2750  *		Allocate and initialize a new instance of a mdi_pathinfo node.
2751  *		The mdi_pathinfo node returned by this function identifies a
2752  *		unique device path is capable of having properties attached
2753  *		and passed to mdi_pi_online() to fully attach and online the
2754  *		path and client device node.
2755  *		The mdi_pathinfo node returned by this function must be
2756  *		destroyed using mdi_pi_free() if the path is no longer
2757  *		operational or if the caller fails to attach a client device
2758  *		node when calling mdi_pi_online(). The framework will not free
2759  *		the resources allocated.
2760  *		This function can be called from both interrupt and kernel
2761  *		contexts.  DDI_NOSLEEP flag should be used while calling
2762  *		from interrupt contexts.
2763  * Return Values:
2764  *		MDI_SUCCESS
2765  *		MDI_FAILURE
2766  *		MDI_NOMEM
2767  */
2768 /*ARGSUSED*/
2769 int
2770 mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2771     char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
2772 {
2773 	mdi_vhci_t	*vh;
2774 	mdi_phci_t	*ph;
2775 	mdi_client_t	*ct;
2776 	mdi_pathinfo_t	*pip = NULL;
2777 	dev_info_t	*cdip;
2778 	int		rv = MDI_NOMEM;
2779 	int		path_allocated = 0;
2780 
2781 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2782 	    "cname %s: caddr@%s paddr@%s",
2783 	    cname ? cname : "", caddr ? caddr : "", paddr ? paddr : ""));
2784 
2785 	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
2786 	    ret_pip == NULL) {
2787 		/* Nothing more to do */
2788 		return (MDI_FAILURE);
2789 	}
2790 
2791 	*ret_pip = NULL;
2792 
2793 	/* No allocations on detaching pHCI */
2794 	if (DEVI_IS_DETACHING(pdip)) {
2795 		/* Invalid pHCI device, return failure */
2796 		MDI_DEBUG(1, (MDI_WARN, pdip,
2797 		    "!detaching pHCI=%p", (void *)pdip));
2798 		return (MDI_FAILURE);
2799 	}
2800 
2801 	ph = i_devi_get_phci(pdip);
2802 	ASSERT(ph != NULL);
2803 	if (ph == NULL) {
2804 		/* Invalid pHCI device, return failure */
2805 		MDI_DEBUG(1, (MDI_WARN, pdip,
2806 		    "!invalid pHCI=%p", (void *)pdip));
2807 		return (MDI_FAILURE);
2808 	}
2809 
2810 	MDI_PHCI_LOCK(ph);
2811 	vh = ph->ph_vhci;
2812 	if (vh == NULL) {
2813 		/* Invalid vHCI device, return failure */
2814 		MDI_DEBUG(1, (MDI_WARN, pdip,
2815 		    "!invalid vHCI=%p", (void *)pdip));
2816 		MDI_PHCI_UNLOCK(ph);
2817 		return (MDI_FAILURE);
2818 	}
2819 
2820 	if (MDI_PHCI_IS_READY(ph) == 0) {
2821 		/*
2822 		 * Do not allow new node creation when pHCI is in
2823 		 * offline/suspended states
2824 		 */
2825 		MDI_DEBUG(1, (MDI_WARN, pdip,
2826 		    "pHCI=%p is not ready", (void *)ph));
2827 		MDI_PHCI_UNLOCK(ph);
2828 		return (MDI_BUSY);
2829 	}
2830 	MDI_PHCI_UNSTABLE(ph);
2831 	MDI_PHCI_UNLOCK(ph);
2832 
2833 	/* look for a matching client, create one if not found */
2834 	MDI_VHCI_CLIENT_LOCK(vh);
2835 	ct = i_mdi_client_find(vh, cname, caddr);
2836 	if (ct == NULL) {
2837 		ct = i_mdi_client_alloc(vh, cname, caddr);
2838 		ASSERT(ct != NULL);
2839 	}
2840 
2841 	if (ct->ct_dip == NULL) {
2842 		/*
2843 		 * Allocate a devinfo node
2844 		 */
2845 		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
2846 		    compatible, ncompatible);
2847 		if (ct->ct_dip == NULL) {
2848 			(void) i_mdi_client_free(vh, ct);
2849 			goto fail;
2850 		}
2851 	}
2852 	cdip = ct->ct_dip;
2853 
2854 	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
2855 	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;
2856 
2857 	MDI_CLIENT_LOCK(ct);
2858 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2859 	while (pip != NULL) {
2860 		/*
2861 		 * Compare the unit address
2862 		 */
2863 		if ((MDI_PI(pip)->pi_phci == ph) &&
2864 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2865 			break;
2866 		}
2867 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2868 	}
2869 	MDI_CLIENT_UNLOCK(ct);
2870 
2871 	if (pip == NULL) {
2872 		/*
2873 		 * This is a new path for this client device.  Allocate and
2874 		 * initialize a new pathinfo node
2875 		 */
2876 		pip = i_mdi_pi_alloc(ph, paddr, ct);
2877 		ASSERT(pip != NULL);
2878 		path_allocated = 1;
2879 	}
2880 	rv = MDI_SUCCESS;
2881 
2882 fail:
2883 	/*
2884 	 * Release the global mutex.
2885 	 */
2886 	MDI_VHCI_CLIENT_UNLOCK(vh);
2887 
2888 	/*
2889 	 * Mark the pHCI as stable
2890 	 */
2891 	MDI_PHCI_LOCK(ph);
2892 	MDI_PHCI_STABLE(ph);
2893 	MDI_PHCI_UNLOCK(ph);
2894 	*ret_pip = pip;
2895 
2896 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2897 	    "alloc %s %p", mdi_pi_spathname(pip), (void *)pip));
2898 
2899 	if (path_allocated)
2900 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
2901 
2902 	return (rv);
2903 }
2904 
2905 /*ARGSUSED*/
2906 int
2907 mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2908     int flags, mdi_pathinfo_t **ret_pip)
2909 {
2910 	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
2911 	    flags, ret_pip));
2912 }
2913 
2914 /*
2915  * i_mdi_pi_alloc():
2916  *		Allocate a mdi_pathinfo node and add to the pHCI path list
2917  * Return Values:
2918  *		mdi_pathinfo
2919  */
2920 /*ARGSUSED*/
2921 static mdi_pathinfo_t *
2922 i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
2923 {
2924 	mdi_pathinfo_t	*pip;
2925 	int		ct_circular;
2926 	int		ph_circular;
2927 	static char	path[MAXPATHLEN];	/* mdi_pathmap_mutex protects */
2928 	char		*path_persistent;
2929 	int		path_instance;
2930 	mod_hash_val_t	hv;
2931 
2932 	ASSERT(MDI_VHCI_CLIENT_LOCKED(ph->ph_vhci));
2933 
2934 	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
2935 	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
2936 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
2937 	    MDI_PATHINFO_STATE_TRANSIENT;
2938 
2939 	if (MDI_PHCI_IS_USER_DISABLED(ph))
2940 		MDI_PI_SET_USER_DISABLE(pip);
2941 
2942 	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
2943 		MDI_PI_SET_DRV_DISABLE_TRANS(pip);
2944 
2945 	if (MDI_PHCI_IS_DRV_DISABLED(ph))
2946 		MDI_PI_SET_DRV_DISABLE(pip);
2947 
2948 	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
2949 	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
2950 	MDI_PI(pip)->pi_client = ct;
2951 	MDI_PI(pip)->pi_phci = ph;
2952 	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
2953 	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);
2954 
2955         /*
2956 	 * We form the "path" to the pathinfo node, and see if we have
2957 	 * already allocated a 'path_instance' for that "path".  If so,
2958 	 * we use the already allocated 'path_instance'.  If not, we
2959 	 * allocate a new 'path_instance' and associate it with a copy of
2960 	 * the "path" string (which is never freed). The association
2961 	 * between a 'path_instance' this "path" string persists until
2962 	 * reboot.
2963 	 */
2964         mutex_enter(&mdi_pathmap_mutex);
2965 	(void) ddi_pathname(ph->ph_dip, path);
2966 	(void) sprintf(path + strlen(path), "/%s@%s",
2967 	    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2968         if (mod_hash_find(mdi_pathmap_bypath, (mod_hash_key_t)path, &hv) == 0) {
2969                 path_instance = (uint_t)(intptr_t)hv;
2970         } else {
2971 		/* allocate a new 'path_instance' and persistent "path" */
2972 		path_instance = mdi_pathmap_instance++;
2973 		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2974                 (void) mod_hash_insert(mdi_pathmap_bypath,
2975                     (mod_hash_key_t)path_persistent,
2976                     (mod_hash_val_t)(intptr_t)path_instance);
2977 		(void) mod_hash_insert(mdi_pathmap_byinstance,
2978 		    (mod_hash_key_t)(intptr_t)path_instance,
2979 		    (mod_hash_val_t)path_persistent);
2980 
2981 		/* create shortpath name */
2982 		(void) snprintf(path, sizeof(path), "%s%d/%s@%s",
2983 		    ddi_driver_name(ph->ph_dip), ddi_get_instance(ph->ph_dip),
2984 		    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2985 		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2986 		(void) mod_hash_insert(mdi_pathmap_sbyinstance,
2987 		    (mod_hash_key_t)(intptr_t)path_instance,
2988 		    (mod_hash_val_t)path_persistent);
2989         }
2990         mutex_exit(&mdi_pathmap_mutex);
2991 	MDI_PI(pip)->pi_path_instance = path_instance;
2992 
2993 	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
2994 	ASSERT(MDI_PI(pip)->pi_prop != NULL);
2995 	MDI_PI(pip)->pi_pprivate = NULL;
2996 	MDI_PI(pip)->pi_cprivate = NULL;
2997 	MDI_PI(pip)->pi_vprivate = NULL;
2998 	MDI_PI(pip)->pi_client_link = NULL;
2999 	MDI_PI(pip)->pi_phci_link = NULL;
3000 	MDI_PI(pip)->pi_ref_cnt = 0;
3001 	MDI_PI(pip)->pi_kstats = NULL;
3002 	MDI_PI(pip)->pi_preferred = 1;
3003 	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);
3004 
3005 	/*
3006 	 * Lock both dev_info nodes against changes in parallel.
3007 	 *
3008 	 * The ndi_devi_enter(Client), is atypical since the client is a leaf.
3009 	 * This atypical operation is done to synchronize pathinfo nodes
3010 	 * during devinfo snapshot (see di_register_pip) by 'pretending' that
3011 	 * the pathinfo nodes are children of the Client.
3012 	 */
3013 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3014 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3015 
3016 	i_mdi_phci_add_path(ph, pip);
3017 	i_mdi_client_add_path(ct, pip);
3018 
3019 	ndi_devi_exit(ph->ph_dip, ph_circular);
3020 	ndi_devi_exit(ct->ct_dip, ct_circular);
3021 
3022 	return (pip);
3023 }
3024 
3025 /*
3026  * mdi_pi_pathname_by_instance():
3027  *	Lookup of "path" by 'path_instance'. Return "path".
3028  *	NOTE: returned "path" remains valid forever (until reboot).
3029  */
3030 char *
3031 mdi_pi_pathname_by_instance(int path_instance)
3032 {
3033 	char		*path;
3034 	mod_hash_val_t	hv;
3035 
3036 	/* mdi_pathmap lookup of "path" by 'path_instance' */
3037 	mutex_enter(&mdi_pathmap_mutex);
3038 	if (mod_hash_find(mdi_pathmap_byinstance,
3039 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
3040 		path = (char *)hv;
3041 	else
3042 		path = NULL;
3043 	mutex_exit(&mdi_pathmap_mutex);
3044 	return (path);
3045 }
3046 
3047 /*
3048  * mdi_pi_spathname_by_instance():
3049  *	Lookup of "shortpath" by 'path_instance'. Return "shortpath".
3050  *	NOTE: returned "shortpath" remains valid forever (until reboot).
3051  */
3052 char *
3053 mdi_pi_spathname_by_instance(int path_instance)
3054 {
3055 	char		*path;
3056 	mod_hash_val_t	hv;
3057 
3058 	/* mdi_pathmap lookup of "path" by 'path_instance' */
3059 	mutex_enter(&mdi_pathmap_mutex);
3060 	if (mod_hash_find(mdi_pathmap_sbyinstance,
3061 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
3062 		path = (char *)hv;
3063 	else
3064 		path = NULL;
3065 	mutex_exit(&mdi_pathmap_mutex);
3066 	return (path);
3067 }
3068 
3069 
3070 /*
3071  * i_mdi_phci_add_path():
3072  * 		Add a mdi_pathinfo node to pHCI list.
3073  * Notes:
3074  *		Caller should per-pHCI mutex
3075  */
3076 static void
3077 i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3078 {
3079 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3080 
3081 	MDI_PHCI_LOCK(ph);
3082 	if (ph->ph_path_head == NULL) {
3083 		ph->ph_path_head = pip;
3084 	} else {
3085 		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
3086 	}
3087 	ph->ph_path_tail = pip;
3088 	ph->ph_path_count++;
3089 	MDI_PHCI_UNLOCK(ph);
3090 }
3091 
3092 /*
3093  * i_mdi_client_add_path():
3094  *		Add mdi_pathinfo node to client list
3095  */
3096 static void
3097 i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3098 {
3099 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3100 
3101 	MDI_CLIENT_LOCK(ct);
3102 	if (ct->ct_path_head == NULL) {
3103 		ct->ct_path_head = pip;
3104 	} else {
3105 		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
3106 	}
3107 	ct->ct_path_tail = pip;
3108 	ct->ct_path_count++;
3109 	MDI_CLIENT_UNLOCK(ct);
3110 }
3111 
3112 /*
3113  * mdi_pi_free():
3114  *		Free the mdi_pathinfo node and also client device node if this
3115  *		is the last path to the device
3116  * Return Values:
3117  *		MDI_SUCCESS
3118  *		MDI_FAILURE
3119  *		MDI_BUSY
3120  */
3121 /*ARGSUSED*/
3122 int
3123 mdi_pi_free(mdi_pathinfo_t *pip, int flags)
3124 {
3125 	int		rv = MDI_FAILURE;
3126 	mdi_vhci_t	*vh;
3127 	mdi_phci_t	*ph;
3128 	mdi_client_t	*ct;
3129 	int		(*f)();
3130 	int		client_held = 0;
3131 
3132 	MDI_PI_LOCK(pip);
3133 	ph = MDI_PI(pip)->pi_phci;
3134 	ASSERT(ph != NULL);
3135 	if (ph == NULL) {
3136 		/*
3137 		 * Invalid pHCI device, return failure
3138 		 */
3139 		MDI_DEBUG(1, (MDI_WARN, NULL,
3140 		    "!invalid pHCI: pip %s %p",
3141 		    mdi_pi_spathname(pip), (void *)pip));
3142 		MDI_PI_UNLOCK(pip);
3143 		return (MDI_FAILURE);
3144 	}
3145 
3146 	vh = ph->ph_vhci;
3147 	ASSERT(vh != NULL);
3148 	if (vh == NULL) {
3149 		/* Invalid pHCI device, return failure */
3150 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3151 		    "!invalid vHCI: pip %s %p",
3152 		    mdi_pi_spathname(pip), (void *)pip));
3153 		MDI_PI_UNLOCK(pip);
3154 		return (MDI_FAILURE);
3155 	}
3156 
3157 	ct = MDI_PI(pip)->pi_client;
3158 	ASSERT(ct != NULL);
3159 	if (ct == NULL) {
3160 		/*
3161 		 * Invalid Client device, return failure
3162 		 */
3163 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3164 		    "!invalid client: pip %s %p",
3165 		    mdi_pi_spathname(pip), (void *)pip));
3166 		MDI_PI_UNLOCK(pip);
3167 		return (MDI_FAILURE);
3168 	}
3169 
3170 	/*
3171 	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
3172 	 * if the node state is either offline or init and the reference count
3173 	 * is zero.
3174 	 */
3175 	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
3176 	    MDI_PI_IS_INITING(pip))) {
3177 		/*
3178 		 * Node is busy
3179 		 */
3180 		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3181 		    "!busy: pip %s %p", mdi_pi_spathname(pip), (void *)pip));
3182 		MDI_PI_UNLOCK(pip);
3183 		return (MDI_BUSY);
3184 	}
3185 
3186 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3187 		/*
3188 		 * Give a chance for pending I/Os to complete.
3189 		 */
3190 		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3191 		    "!%d cmds still pending on path: %s %p",
3192 		    MDI_PI(pip)->pi_ref_cnt,
3193 		    mdi_pi_spathname(pip), (void *)pip));
3194 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
3195 		    &MDI_PI(pip)->pi_mutex,
3196 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
3197 			/*
3198 			 * The timeout time reached without ref_cnt being zero
3199 			 * being signaled.
3200 			 */
3201 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3202 			    "!Timeout reached on path %s %p without the cond",
3203 			    mdi_pi_spathname(pip), (void *)pip));
3204 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3205 			    "!%d cmds still pending on path %s %p",
3206 			    MDI_PI(pip)->pi_ref_cnt,
3207 			    mdi_pi_spathname(pip), (void *)pip));
3208 			MDI_PI_UNLOCK(pip);
3209 			return (MDI_BUSY);
3210 		}
3211 	}
3212 	if (MDI_PI(pip)->pi_pm_held) {
3213 		client_held = 1;
3214 	}
3215 	MDI_PI_UNLOCK(pip);
3216 
3217 	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));
3218 
3219 	MDI_CLIENT_LOCK(ct);
3220 
3221 	/* Prevent further failovers till MDI_VHCI_CLIENT_LOCK is held */
3222 	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);
3223 
3224 	/*
3225 	 * Wait till failover is complete before removing this node.
3226 	 */
3227 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3228 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3229 
3230 	MDI_CLIENT_UNLOCK(ct);
3231 	MDI_VHCI_CLIENT_LOCK(vh);
3232 	MDI_CLIENT_LOCK(ct);
3233 	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);
3234 
3235 	if (!MDI_PI_IS_INITING(pip)) {
3236 		f = vh->vh_ops->vo_pi_uninit;
3237 		if (f != NULL) {
3238 			rv = (*f)(vh->vh_dip, pip, 0);
3239 		}
3240 	}
3241 	/*
3242 	 * If vo_pi_uninit() completed successfully.
3243 	 */
3244 	if (rv == MDI_SUCCESS) {
3245 		if (client_held) {
3246 			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3247 			    "i_mdi_pm_rele_client\n"));
3248 			i_mdi_pm_rele_client(ct, 1);
3249 		}
3250 		i_mdi_pi_free(ph, pip, ct);
3251 		if (ct->ct_path_count == 0) {
3252 			/*
3253 			 * Client lost its last path.
3254 			 * Clean up the client device
3255 			 */
3256 			MDI_CLIENT_UNLOCK(ct);
3257 			(void) i_mdi_client_free(ct->ct_vhci, ct);
3258 			MDI_VHCI_CLIENT_UNLOCK(vh);
3259 			return (rv);
3260 		}
3261 	}
3262 	MDI_CLIENT_UNLOCK(ct);
3263 	MDI_VHCI_CLIENT_UNLOCK(vh);
3264 
3265 	if (rv == MDI_FAILURE)
3266 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
3267 
3268 	return (rv);
3269 }
3270 
3271 /*
3272  * i_mdi_pi_free():
3273  *		Free the mdi_pathinfo node
3274  */
3275 static void
3276 i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
3277 {
3278 	int	ct_circular;
3279 	int	ph_circular;
3280 
3281 	ASSERT(MDI_CLIENT_LOCKED(ct));
3282 
3283 	/*
3284 	 * remove any per-path kstats
3285 	 */
3286 	i_mdi_pi_kstat_destroy(pip);
3287 
3288 	/* See comments in i_mdi_pi_alloc() */
3289 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3290 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3291 
3292 	i_mdi_client_remove_path(ct, pip);
3293 	i_mdi_phci_remove_path(ph, pip);
3294 
3295 	ndi_devi_exit(ph->ph_dip, ph_circular);
3296 	ndi_devi_exit(ct->ct_dip, ct_circular);
3297 
3298 	mutex_destroy(&MDI_PI(pip)->pi_mutex);
3299 	cv_destroy(&MDI_PI(pip)->pi_state_cv);
3300 	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
3301 	if (MDI_PI(pip)->pi_addr) {
3302 		kmem_free(MDI_PI(pip)->pi_addr,
3303 		    strlen(MDI_PI(pip)->pi_addr) + 1);
3304 		MDI_PI(pip)->pi_addr = NULL;
3305 	}
3306 
3307 	if (MDI_PI(pip)->pi_prop) {
3308 		(void) nvlist_free(MDI_PI(pip)->pi_prop);
3309 		MDI_PI(pip)->pi_prop = NULL;
3310 	}
3311 	kmem_free(pip, sizeof (struct mdi_pathinfo));
3312 }
3313 
3314 
3315 /*
3316  * i_mdi_phci_remove_path():
3317  * 		Remove a mdi_pathinfo node from pHCI list.
3318  * Notes:
3319  *		Caller should hold per-pHCI mutex
3320  */
3321 static void
3322 i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3323 {
3324 	mdi_pathinfo_t	*prev = NULL;
3325 	mdi_pathinfo_t	*path = NULL;
3326 
3327 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3328 
3329 	MDI_PHCI_LOCK(ph);
3330 	path = ph->ph_path_head;
3331 	while (path != NULL) {
3332 		if (path == pip) {
3333 			break;
3334 		}
3335 		prev = path;
3336 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3337 	}
3338 
3339 	if (path) {
3340 		ph->ph_path_count--;
3341 		if (prev) {
3342 			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
3343 		} else {
3344 			ph->ph_path_head =
3345 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3346 		}
3347 		if (ph->ph_path_tail == path) {
3348 			ph->ph_path_tail = prev;
3349 		}
3350 	}
3351 
3352 	/*
3353 	 * Clear the pHCI link
3354 	 */
3355 	MDI_PI(pip)->pi_phci_link = NULL;
3356 	MDI_PI(pip)->pi_phci = NULL;
3357 	MDI_PHCI_UNLOCK(ph);
3358 }
3359 
3360 /*
3361  * i_mdi_client_remove_path():
3362  * 		Remove a mdi_pathinfo node from client path list.
3363  */
3364 static void
3365 i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3366 {
3367 	mdi_pathinfo_t	*prev = NULL;
3368 	mdi_pathinfo_t	*path;
3369 
3370 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3371 
3372 	ASSERT(MDI_CLIENT_LOCKED(ct));
3373 	path = ct->ct_path_head;
3374 	while (path != NULL) {
3375 		if (path == pip) {
3376 			break;
3377 		}
3378 		prev = path;
3379 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3380 	}
3381 
3382 	if (path) {
3383 		ct->ct_path_count--;
3384 		if (prev) {
3385 			MDI_PI(prev)->pi_client_link =
3386 			    MDI_PI(path)->pi_client_link;
3387 		} else {
3388 			ct->ct_path_head =
3389 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3390 		}
3391 		if (ct->ct_path_tail == path) {
3392 			ct->ct_path_tail = prev;
3393 		}
3394 		if (ct->ct_path_last == path) {
3395 			ct->ct_path_last = ct->ct_path_head;
3396 		}
3397 	}
3398 	MDI_PI(pip)->pi_client_link = NULL;
3399 	MDI_PI(pip)->pi_client = NULL;
3400 }
3401 
/*
 * i_mdi_pi_state_change():
 *		Common worker that moves a mdi_pathinfo node to the requested
 *		state (online, standby, fault or offline).  It is the backend
 *		of mdi_pi_online(), mdi_pi_standby(), mdi_pi_fault() and
 *		mdi_pi_offline().
 *
 *		Outline:
 *		  - validate the pHCI, vHCI and client back pointers;
 *		  - if the path is still initing, call the vHCI vo_pi_init()
 *		    entry point first;
 *		  - refuse the transition (MDI_BUSY) if the pHCI is not ready,
 *		    otherwise mark the pHCI unstable for the duration;
 *		  - wait out any transient path state and any client failover
 *		    in progress;
 *		  - mark the path with the matching transient state and call
 *		    the vHCI vo_pi_state_change() entry point;
 *		  - on success clear the transient state, otherwise restore
 *		    the old state;
 *		  - if the client is stable, update the client state and
 *		    online/offline the client devinfo node as required.
 *
 *		'flag' is passed through to vo_pi_state_change(); the
 *		NDI_USER_REQ bit additionally selects the user-initiated
 *		offline handling below.
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		MDI_BUSY
 *		(a non-success return from vo_pi_state_change() is passed
 *		back unchanged)
 */
/*ARGSUSED*/
static int
i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
{
	int		rv = MDI_SUCCESS;
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	int		(*f)();
	dev_info_t	*cdip;

	MDI_PI_LOCK(pip);

	ph = MDI_PI(pip)->pi_phci;
	ASSERT(ph);
	if (ph == NULL) {
		/*
		 * Invalid pHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, NULL,
		    "!invalid phci: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;
	ASSERT(vh);
	if (vh == NULL) {
		/*
		 * Invalid vHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
		    "!invalid vhci: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	ct = MDI_PI(pip)->pi_client;
	ASSERT(ct != NULL);
	if (ct == NULL) {
		/*
		 * Invalid client device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
		    "!invalid client: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	/*
	 * If this path has not been initialized yet, Callback vHCI driver's
	 * pathinfo node initialize entry point.  The pathinfo mutex is
	 * dropped around the callback and retaken only if it succeeds.
	 */

	if (MDI_PI_IS_INITING(pip)) {
		MDI_PI_UNLOCK(pip);
		f = vh->vh_ops->vo_pi_init;
		if (f != NULL) {
			rv = (*f)(vh->vh_dip, pip, 0);
			if (rv != MDI_SUCCESS) {
				MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
				    "!vo_pi_init failed: vHCI %p, pip %s %p",
				    (void *)vh, mdi_pi_spathname(pip),
				    (void *)pip));
				return (MDI_FAILURE);
			}
		}
		MDI_PI_LOCK(pip);
		MDI_PI_CLEAR_TRANSIENT(pip);
	}

	/*
	 * Do not allow state transition when pHCI is in offline/suspended
	 * states
	 */
	i_mdi_phci_lock(ph, pip);
	if (MDI_PHCI_IS_READY(ph) == 0) {
		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
		    "!pHCI not ready, pHCI=%p", (void *)ph));
		MDI_PI_UNLOCK(pip);
		i_mdi_phci_unlock(ph);
		return (MDI_BUSY);
	}
	/* Undone by MDI_PHCI_STABLE() at state_change_exit */
	MDI_PHCI_UNSTABLE(ph);
	i_mdi_phci_unlock(ph);

	/*
	 * Check if mdi_pathinfo state is in transient state.
	 * If yes, offlining is in progress and wait till transient state is
	 * cleared.
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		while (MDI_PI_IS_TRANSIENT(pip)) {
			cv_wait(&MDI_PI(pip)->pi_state_cv,
			    &MDI_PI(pip)->pi_mutex);
		}
	}

	/*
	 * Grab the client lock in reverse order sequence and release the
	 * mdi_pathinfo mutex.
	 */
	i_mdi_client_lock(ct, pip);
	MDI_PI_UNLOCK(pip);

	/*
	 * Wait till failover state is cleared
	 */
	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);

	/*
	 * Mark the mdi_pathinfo node state as transient
	 */
	MDI_PI_LOCK(pip);
	switch (state) {
	case MDI_PATHINFO_STATE_ONLINE:
		MDI_PI_SET_ONLINING(pip);
		break;

	case MDI_PATHINFO_STATE_STANDBY:
		MDI_PI_SET_STANDBYING(pip);
		break;

	case MDI_PATHINFO_STATE_FAULT:
		/*
		 * Mark the pathinfo state as FAULTED
		 */
		MDI_PI_SET_FAULTING(pip);
		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
		break;

	case MDI_PATHINFO_STATE_OFFLINE:
		/*
		 * ndi_devi_offline() cannot hold pip or ct locks.
		 */
		MDI_PI_UNLOCK(pip);

		/*
		 * If this is a user initiated path online->offline operation
		 * who's success would transition a client from DEGRADED to
		 * FAILED then only proceed if we can offline the client first.
		 */
		cdip = ct->ct_dip;
		if ((flag & NDI_USER_REQ) &&
		    MDI_PI_IS_ONLINE(pip) &&
		    (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED)) {
			i_mdi_client_unlock(ct);
			rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN);
			if (rv != NDI_SUCCESS) {
				/*
				 * Convert to MDI error code
				 */
				switch (rv) {
				case NDI_BUSY:
					rv = MDI_BUSY;
					break;
				default:
					rv = MDI_FAILURE;
					break;
				}
				/*
				 * Neither the pathinfo nor the client lock is
				 * held at this point; only the pHCI needs to
				 * be marked stable again.
				 */
				goto state_change_exit;
			} else {
				i_mdi_client_lock(ct, NULL);
			}
		}
		/*
		 * Mark the mdi_pathinfo node state as transient
		 */
		MDI_PI_LOCK(pip);
		MDI_PI_SET_OFFLINING(pip);
		break;
	}
	MDI_PI_UNLOCK(pip);
	MDI_CLIENT_UNSTABLE(ct);
	i_mdi_client_unlock(ct);

	/*
	 * Notify the vHCI driver of the state change with no MDI locks held.
	 * If the vHCI has no callback registered, rv keeps its prior value.
	 */
	f = vh->vh_ops->vo_pi_state_change;
	if (f != NULL)
		rv = (*f)(vh->vh_dip, pip, state, 0, flag);

	MDI_CLIENT_LOCK(ct);
	MDI_PI_LOCK(pip);
	if (rv == MDI_NOT_SUPPORTED) {
		MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
	}
	if (rv != MDI_SUCCESS) {
		MDI_DEBUG(2, (MDI_WARN, ct->ct_dip,
		    "vo_pi_state_change failed: rv %x", rv));
	}
	/*
	 * Commit the new state on success; otherwise fall back to the state
	 * the path had before the transition was started.
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		if (rv == MDI_SUCCESS) {
			MDI_PI_CLEAR_TRANSIENT(pip);
		} else {
			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
		}
	}

	/*
	 * Wake anyone waiting for this mdi_pathinfo node
	 */
	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
	MDI_PI_UNLOCK(pip);

	/*
	 * Mark the client device as stable
	 */
	MDI_CLIENT_STABLE(ct);
	if (rv == MDI_SUCCESS) {
		if (ct->ct_unstable == 0) {
			cdip = ct->ct_dip;

			/*
			 * Onlining the mdi_pathinfo node will impact the
			 * client state Update the client and dev_info node
			 * state accordingly
			 */
			rv = NDI_SUCCESS;
			i_mdi_client_update_state(ct);
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				if (cdip && !i_ddi_devi_attached(cdip) &&
				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
				    (state == MDI_PATHINFO_STATE_STANDBY))) {

					/*
					 * Must do ndi_devi_online() through
					 * hotplug thread for deferred
					 * attach mechanism to work
					 */
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_online(cdip, 0);
					MDI_CLIENT_LOCK(ct);
					if ((rv != NDI_SUCCESS) &&
					    (MDI_CLIENT_STATE(ct) ==
					    MDI_CLIENT_STATE_DEGRADED)) {
						/*
						 * ndi_devi_online failed.
						 * Reset client flags to
						 * offline.
						 */
						MDI_DEBUG(1, (MDI_WARN, cdip,
						    "!ndi_devi_online failed "
						    "error %x", rv));
						MDI_CLIENT_SET_OFFLINE(ct);
					}
					if (rv != NDI_SUCCESS) {
						/* Reset the path state */
						MDI_PI_LOCK(pip);
						MDI_PI(pip)->pi_state =
						    MDI_PI_OLD_STATE(pip);
						MDI_PI_UNLOCK(pip);
					}
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				/*
				 * This is the last path case for
				 * non-user initiated events.
				 */
				if (((flag & NDI_USER_REQ) == 0) &&
				    cdip && (i_ddi_node_state(cdip) >=
				    DS_INITIALIZED)) {
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_offline(cdip,
					    NDI_DEVFS_CLEAN);
					MDI_CLIENT_LOCK(ct);

					if (rv != NDI_SUCCESS) {
						/*
						 * ndi_devi_offline failed.
						 * Reset client flags to
						 * online as the path could not
						 * be offlined.
						 */
						MDI_DEBUG(1, (MDI_WARN, cdip,
						    "!ndi_devi_offline failed: "
						    "error %x", rv));
						MDI_CLIENT_SET_ONLINE(ct);
					}
				}
				break;
			}
			/*
			 * Convert to MDI error code
			 */
			switch (rv) {
			case NDI_SUCCESS:
				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
				i_mdi_report_path_state(ct, pip);
				rv = MDI_SUCCESS;
				break;
			case NDI_BUSY:
				rv = MDI_BUSY;
				break;
			default:
				rv = MDI_FAILURE;
				break;
			}
		}
	}
	MDI_CLIENT_UNLOCK(ct);

state_change_exit:
	/*
	 * Mark the pHCI as stable again.
	 */
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_STABLE(ph);
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}
3727 
3728 /*
3729  * mdi_pi_online():
3730  *		Place the path_info node in the online state.  The path is
3731  *		now available to be selected by mdi_select_path() for
3732  *		transporting I/O requests to client devices.
3733  * Return Values:
3734  *		MDI_SUCCESS
3735  *		MDI_FAILURE
3736  */
3737 int
3738 mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3739 {
3740 	mdi_client_t	*ct = MDI_PI(pip)->pi_client;
3741 	int		client_held = 0;
3742 	int		rv;
3743 
3744 	ASSERT(ct != NULL);
3745 	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
3746 	if (rv != MDI_SUCCESS)
3747 		return (rv);
3748 
3749 	MDI_PI_LOCK(pip);
3750 	if (MDI_PI(pip)->pi_pm_held == 0) {
3751 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3752 		    "i_mdi_pm_hold_pip %p", (void *)pip));
3753 		i_mdi_pm_hold_pip(pip);
3754 		client_held = 1;
3755 	}
3756 	MDI_PI_UNLOCK(pip);
3757 
3758 	if (client_held) {
3759 		MDI_CLIENT_LOCK(ct);
3760 		if (ct->ct_power_cnt == 0) {
3761 			rv = i_mdi_power_all_phci(ct);
3762 		}
3763 
3764 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3765 		    "i_mdi_pm_hold_client %p", (void *)ct));
3766 		i_mdi_pm_hold_client(ct, 1);
3767 		MDI_CLIENT_UNLOCK(ct);
3768 	}
3769 
3770 	return (rv);
3771 }
3772 
3773 /*
3774  * mdi_pi_standby():
3775  *		Place the mdi_pathinfo node in standby state
3776  *
3777  * Return Values:
3778  *		MDI_SUCCESS
3779  *		MDI_FAILURE
3780  */
3781 int
3782 mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
3783 {
3784 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
3785 }
3786 
3787 /*
3788  * mdi_pi_fault():
3789  *		Place the mdi_pathinfo node in fault'ed state
3790  * Return Values:
3791  *		MDI_SUCCESS
3792  *		MDI_FAILURE
3793  */
3794 int
3795 mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
3796 {
3797 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
3798 }
3799 
3800 /*
3801  * mdi_pi_offline():
3802  *		Offline a mdi_pathinfo node.
3803  * Return Values:
3804  *		MDI_SUCCESS
3805  *		MDI_FAILURE
3806  */
3807 int
3808 mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3809 {
3810 	int	ret, client_held = 0;
3811 	mdi_client_t	*ct;
3812 
3813 	/*
3814 	 * Original code overloaded NDI_DEVI_REMOVE to this interface, and
3815 	 * used it to mean "user initiated operation" (i.e. devctl). Callers
3816 	 * should now just use NDI_USER_REQ.
3817 	 */
3818 	if (flags & NDI_DEVI_REMOVE) {
3819 		flags &= ~NDI_DEVI_REMOVE;
3820 		flags |= NDI_USER_REQ;
3821 	}
3822 
3823 	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
3824 
3825 	if (ret == MDI_SUCCESS) {
3826 		MDI_PI_LOCK(pip);
3827 		if (MDI_PI(pip)->pi_pm_held) {
3828 			client_held = 1;
3829 		}
3830 		MDI_PI_UNLOCK(pip);
3831 
3832 		if (client_held) {
3833 			ct = MDI_PI(pip)->pi_client;
3834 			MDI_CLIENT_LOCK(ct);
3835 			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3836 			    "i_mdi_pm_rele_client\n"));
3837 			i_mdi_pm_rele_client(ct, 1);
3838 			MDI_CLIENT_UNLOCK(ct);
3839 		}
3840 	}
3841 
3842 	return (ret);
3843 }
3844 
3845 /*
3846  * i_mdi_pi_offline():
3847  *		Offline a mdi_pathinfo node and call the vHCI driver's callback
3848  */
3849 static int
3850 i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3851 {
3852 	dev_info_t	*vdip = NULL;
3853 	mdi_vhci_t	*vh = NULL;
3854 	mdi_client_t	*ct = NULL;
3855 	int		(*f)();
3856 	int		rv;
3857 
3858 	MDI_PI_LOCK(pip);
3859 	ct = MDI_PI(pip)->pi_client;
3860 	ASSERT(ct != NULL);
3861 
3862 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3863 		/*
3864 		 * Give a chance for pending I/Os to complete.
3865 		 */
3866 		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3867 		    "!%d cmds still pending on path %s %p",
3868 		    MDI_PI(pip)->pi_ref_cnt, mdi_pi_spathname(pip),
3869 		    (void *)pip));
3870 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
3871 		    &MDI_PI(pip)->pi_mutex,
3872 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
3873 			/*
3874 			 * The timeout time reached without ref_cnt being zero
3875 			 * being signaled.
3876 			 */
3877 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3878 			    "!Timeout reached on path %s %p without the cond",
3879 			    mdi_pi_spathname(pip), (void *)pip));
3880 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3881 			    "!%d cmds still pending on path %s %p",
3882 			    MDI_PI(pip)->pi_ref_cnt,
3883 			    mdi_pi_spathname(pip), (void *)pip));
3884 		}
3885 	}
3886 	vh = ct->ct_vhci;
3887 	vdip = vh->vh_dip;
3888 
3889 	/*
3890 	 * Notify vHCI that has registered this event
3891 	 */
3892 	ASSERT(vh->vh_ops);
3893 	f = vh->vh_ops->vo_pi_state_change;
3894 
3895 	if (f != NULL) {
3896 		MDI_PI_UNLOCK(pip);
3897 		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
3898 		    flags)) != MDI_SUCCESS) {
3899 			MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3900 			    "!vo_path_offline failed: vdip %s%d %p: path %s %p",
3901 			    ddi_driver_name(vdip), ddi_get_instance(vdip),
3902 			    (void *)vdip, mdi_pi_spathname(pip), (void *)pip));
3903 		}
3904 		MDI_PI_LOCK(pip);
3905 	}
3906 
3907 	/*
3908 	 * Set the mdi_pathinfo node state and clear the transient condition
3909 	 */
3910 	MDI_PI_SET_OFFLINE(pip);
3911 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3912 	MDI_PI_UNLOCK(pip);
3913 
3914 	MDI_CLIENT_LOCK(ct);
3915 	if (rv == MDI_SUCCESS) {
3916 		if (ct->ct_unstable == 0) {
3917 			dev_info_t	*cdip = ct->ct_dip;
3918 
3919 			/*
3920 			 * Onlining the mdi_pathinfo node will impact the
3921 			 * client state Update the client and dev_info node
3922 			 * state accordingly
3923 			 */
3924 			i_mdi_client_update_state(ct);
3925 			rv = NDI_SUCCESS;
3926 			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
3927 				if (cdip &&
3928 				    (i_ddi_node_state(cdip) >=
3929 				    DS_INITIALIZED)) {
3930 					MDI_CLIENT_UNLOCK(ct);
3931 					rv = ndi_devi_offline(cdip,
3932 					    NDI_DEVFS_CLEAN);
3933 					MDI_CLIENT_LOCK(ct);
3934 					if (rv != NDI_SUCCESS) {
3935 						/*
3936 						 * ndi_devi_offline failed.
3937 						 * Reset client flags to
3938 						 * online.
3939 						 */
3940 						MDI_DEBUG(4, (MDI_WARN, cdip,
3941 						    "ndi_devi_offline failed: "
3942 						    "error %x", rv));
3943 						MDI_CLIENT_SET_ONLINE(ct);
3944 					}
3945 				}
3946 			}
3947 			/*
3948 			 * Convert to MDI error code
3949 			 */
3950 			switch (rv) {
3951 			case NDI_SUCCESS:
3952 				rv = MDI_SUCCESS;
3953 				break;
3954 			case NDI_BUSY:
3955 				rv = MDI_BUSY;
3956 				break;
3957 			default:
3958 				rv = MDI_FAILURE;
3959 				break;
3960 			}
3961 		}
3962 		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3963 		i_mdi_report_path_state(ct, pip);
3964 	}
3965 
3966 	MDI_CLIENT_UNLOCK(ct);
3967 
3968 	/*
3969 	 * Change in the mdi_pathinfo node state will impact the client state
3970 	 */
3971 	MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
3972 	    "ct = %p pip = %p", (void *)ct, (void *)pip));
3973 	return (rv);
3974 }
3975 
3976 /*
3977  * mdi_pi_get_node_name():
3978  *              Get the name associated with a mdi_pathinfo node.
3979  *              Since pathinfo nodes are not directly named, we
3980  *              return the node_name of the client.
3981  *
3982  * Return Values:
3983  *              char *
3984  */
3985 char *
3986 mdi_pi_get_node_name(mdi_pathinfo_t *pip)
3987 {
3988 	mdi_client_t    *ct;
3989 
3990 	if (pip == NULL)
3991 		return (NULL);
3992 	ct = MDI_PI(pip)->pi_client;
3993 	if ((ct == NULL) || (ct->ct_dip == NULL))
3994 		return (NULL);
3995 	return (ddi_node_name(ct->ct_dip));
3996 }
3997 
3998 /*
3999  * mdi_pi_get_addr():
4000  *		Get the unit address associated with a mdi_pathinfo node
4001  *
4002  * Return Values:
4003  *		char *
4004  */
4005 char *
4006 mdi_pi_get_addr(mdi_pathinfo_t *pip)
4007 {
4008 	if (pip == NULL)
4009 		return (NULL);
4010 
4011 	return (MDI_PI(pip)->pi_addr);
4012 }
4013 
4014 /*
4015  * mdi_pi_get_path_instance():
4016  *		Get the 'path_instance' of a mdi_pathinfo node
4017  *
4018  * Return Values:
4019  *		path_instance
4020  */
4021 int
4022 mdi_pi_get_path_instance(mdi_pathinfo_t *pip)
4023 {
4024 	if (pip == NULL)
4025 		return (0);
4026 
4027 	return (MDI_PI(pip)->pi_path_instance);
4028 }
4029 
4030 /*
4031  * mdi_pi_pathname():
4032  *		Return pointer to path to pathinfo node.
4033  */
4034 char *
4035 mdi_pi_pathname(mdi_pathinfo_t *pip)
4036 {
4037 	if (pip == NULL)
4038 		return (NULL);
4039 	return (mdi_pi_pathname_by_instance(mdi_pi_get_path_instance(pip)));
4040 }
4041 
4042 /*
4043  * mdi_pi_spathname():
4044  *		Return pointer to shortpath to pathinfo node. Used for debug
4045  *		messages, so return "" instead of NULL when unknown.
4046  */
4047 char *
4048 mdi_pi_spathname(mdi_pathinfo_t *pip)
4049 {
4050 	char	*spath = "";
4051 
4052 	if (pip) {
4053 		spath = mdi_pi_spathname_by_instance(
4054 		    mdi_pi_get_path_instance(pip));
4055 		if (spath == NULL)
4056 			spath = "";
4057 	}
4058 	return (spath);
4059 }
4060 
4061 char *
4062 mdi_pi_pathname_obp(mdi_pathinfo_t *pip, char *path)
4063 {
4064 	char *obp_path = NULL;
4065 	if ((pip == NULL) || (path == NULL))
4066 		return (NULL);
4067 
4068 	if (mdi_prop_lookup_string(pip, "obp-path", &obp_path) == MDI_SUCCESS) {
4069 		(void) strcpy(path, obp_path);
4070 		(void) mdi_prop_free(obp_path);
4071 	} else {
4072 		path = NULL;
4073 	}
4074 	return (path);
4075 }
4076 
4077 int
4078 mdi_pi_pathname_obp_set(mdi_pathinfo_t *pip, char *component)
4079 {
4080 	dev_info_t *pdip;
4081 	char *obp_path = NULL;
4082 	int rc = MDI_FAILURE;
4083 
4084 	if (pip == NULL)
4085 		return (MDI_FAILURE);
4086 
4087 	pdip = mdi_pi_get_phci(pip);
4088 	if (pdip == NULL)
4089 		return (MDI_FAILURE);
4090 
4091 	obp_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
4092 
4093 	if (ddi_pathname_obp(pdip, obp_path) == NULL) {
4094 		(void) ddi_pathname(pdip, obp_path);
4095 	}
4096 
4097 	if (component) {
4098 		(void) strncat(obp_path, "/", MAXPATHLEN);
4099 		(void) strncat(obp_path, component, MAXPATHLEN);
4100 	}
4101 	rc = mdi_prop_update_string(pip, "obp-path", obp_path);
4102 
4103 	if (obp_path)
4104 		kmem_free(obp_path, MAXPATHLEN);
4105 	return (rc);
4106 }
4107 
4108 /*
4109  * mdi_pi_get_client():
4110  *		Get the client devinfo associated with a mdi_pathinfo node
4111  *
4112  * Return Values:
4113  *		Handle to client device dev_info node
4114  */
4115 dev_info_t *
4116 mdi_pi_get_client(mdi_pathinfo_t *pip)
4117 {
4118 	dev_info_t	*dip = NULL;
4119 	if (pip) {
4120 		dip = MDI_PI(pip)->pi_client->ct_dip;
4121 	}
4122 	return (dip);
4123 }
4124 
4125 /*
4126  * mdi_pi_get_phci():
4127  *		Get the pHCI devinfo associated with the mdi_pathinfo node
4128  * Return Values:
4129  *		Handle to dev_info node
4130  */
4131 dev_info_t *
4132 mdi_pi_get_phci(mdi_pathinfo_t *pip)
4133 {
4134 	dev_info_t	*dip = NULL;
4135 	mdi_phci_t	*ph;
4136 
4137 	if (pip) {
4138 		ph = MDI_PI(pip)->pi_phci;
4139 		if (ph)
4140 			dip = ph->ph_dip;
4141 	}
4142 	return (dip);
4143 }
4144 
4145 /*
4146  * mdi_pi_get_client_private():
4147  *		Get the client private information associated with the
4148  *		mdi_pathinfo node
4149  */
4150 void *
4151 mdi_pi_get_client_private(mdi_pathinfo_t *pip)
4152 {
4153 	void *cprivate = NULL;
4154 	if (pip) {
4155 		cprivate = MDI_PI(pip)->pi_cprivate;
4156 	}
4157 	return (cprivate);
4158 }
4159 
4160 /*
4161  * mdi_pi_set_client_private():
4162  *		Set the client private information in the mdi_pathinfo node
4163  */
4164 void
4165 mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
4166 {
4167 	if (pip) {
4168 		MDI_PI(pip)->pi_cprivate = priv;
4169 	}
4170 }
4171 
4172 /*
4173  * mdi_pi_get_phci_private():
4174  *		Get the pHCI private information associated with the
4175  *		mdi_pathinfo node
4176  */
4177 caddr_t
4178 mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
4179 {
4180 	caddr_t	pprivate = NULL;
4181 
4182 	if (pip) {
4183 		pprivate = MDI_PI(pip)->pi_pprivate;
4184 	}
4185 	return (pprivate);
4186 }
4187 
4188 /*
4189  * mdi_pi_set_phci_private():
4190  *		Set the pHCI private information in the mdi_pathinfo node
4191  */
4192 void
4193 mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
4194 {
4195 	if (pip) {
4196 		MDI_PI(pip)->pi_pprivate = priv;
4197 	}
4198 }
4199 
4200 /*
4201  * mdi_pi_get_state():
4202  *		Get the mdi_pathinfo node state. Transient states are internal
4203  *		and not provided to the users
4204  */
4205 mdi_pathinfo_state_t
4206 mdi_pi_get_state(mdi_pathinfo_t *pip)
4207 {
4208 	mdi_pathinfo_state_t    state = MDI_PATHINFO_STATE_INIT;
4209 
4210 	if (pip) {
4211 		if (MDI_PI_IS_TRANSIENT(pip)) {
4212 			/*
4213 			 * mdi_pathinfo is in state transition.  Return the
4214 			 * last good state.
4215 			 */
4216 			state = MDI_PI_OLD_STATE(pip);
4217 		} else {
4218 			state = MDI_PI_STATE(pip);
4219 		}
4220 	}
4221 	return (state);
4222 }
4223 
4224 /*
4225  * mdi_pi_get_flags():
4226  *		Get the mdi_pathinfo node flags.
4227  */
4228 uint_t
4229 mdi_pi_get_flags(mdi_pathinfo_t *pip)
4230 {
4231 	return (pip ? MDI_PI(pip)->pi_flags : 0);
4232 }
4233 
4234 /*
4235  * Note that the following function needs to be the new interface for
4236  * mdi_pi_get_state when mpxio gets integrated to ON.
4237  */
4238 int
4239 mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
4240 		uint32_t *ext_state)
4241 {
4242 	*state = MDI_PATHINFO_STATE_INIT;
4243 
4244 	if (pip) {
4245 		if (MDI_PI_IS_TRANSIENT(pip)) {
4246 			/*
4247 			 * mdi_pathinfo is in state transition.  Return the
4248 			 * last good state.
4249 			 */
4250 			*state = MDI_PI_OLD_STATE(pip);
4251 			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
4252 		} else {
4253 			*state = MDI_PI_STATE(pip);
4254 			*ext_state = MDI_PI_EXT_STATE(pip);
4255 		}
4256 	}
4257 	return (MDI_SUCCESS);
4258 }
4259 
4260 /*
4261  * mdi_pi_get_preferred:
4262  *	Get the preferred path flag
4263  */
4264 int
4265 mdi_pi_get_preferred(mdi_pathinfo_t *pip)
4266 {
4267 	if (pip) {
4268 		return (MDI_PI(pip)->pi_preferred);
4269 	}
4270 	return (0);
4271 }
4272 
4273 /*
4274  * mdi_pi_set_preferred:
4275  *	Set the preferred path flag
4276  */
4277 void
4278 mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
4279 {
4280 	if (pip) {
4281 		MDI_PI(pip)->pi_preferred = preferred;
4282 	}
4283 }
4284 
4285 /*
4286  * mdi_pi_set_state():
4287  *		Set the mdi_pathinfo node state
4288  */
4289 void
4290 mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
4291 {
4292 	uint32_t	ext_state;
4293 
4294 	if (pip) {
4295 		ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
4296 		MDI_PI(pip)->pi_state = state;
4297 		MDI_PI(pip)->pi_state |= ext_state;
4298 
4299 		/* Path has changed state, invalidate DINFOCACHE snap shot. */
4300 		i_ddi_di_cache_invalidate();
4301 	}
4302 }
4303 
4304 /*
4305  * Property functions:
4306  */
4307 int
4308 i_map_nvlist_error_to_mdi(int val)
4309 {
4310 	int rv;
4311 
4312 	switch (val) {
4313 	case 0:
4314 		rv = DDI_PROP_SUCCESS;
4315 		break;
4316 	case EINVAL:
4317 	case ENOTSUP:
4318 		rv = DDI_PROP_INVAL_ARG;
4319 		break;
4320 	case ENOMEM:
4321 		rv = DDI_PROP_NO_MEMORY;
4322 		break;
4323 	default:
4324 		rv = DDI_PROP_NOT_FOUND;
4325 		break;
4326 	}
4327 	return (rv);
4328 }
4329 
4330 /*
4331  * mdi_pi_get_next_prop():
4332  * 		Property walk function.  The caller should hold mdi_pi_lock()
4333  *		and release by calling mdi_pi_unlock() at the end of walk to
4334  *		get a consistent value.
4335  */
4336 nvpair_t *
4337 mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
4338 {
4339 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4340 		return (NULL);
4341 	}
4342 	ASSERT(MDI_PI_LOCKED(pip));
4343 	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
4344 }
4345 
4346 /*
4347  * mdi_prop_remove():
4348  * 		Remove the named property from the named list.
4349  */
4350 int
4351 mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
4352 {
4353 	if (pip == NULL) {
4354 		return (DDI_PROP_NOT_FOUND);
4355 	}
4356 	ASSERT(!MDI_PI_LOCKED(pip));
4357 	MDI_PI_LOCK(pip);
4358 	if (MDI_PI(pip)->pi_prop == NULL) {
4359 		MDI_PI_UNLOCK(pip);
4360 		return (DDI_PROP_NOT_FOUND);
4361 	}
4362 	if (name) {
4363 		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
4364 	} else {
4365 		char		nvp_name[MAXNAMELEN];
4366 		nvpair_t	*nvp;
4367 		nvp = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
4368 		while (nvp) {
4369 			nvpair_t	*next;
4370 			next = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, nvp);
4371 			(void) snprintf(nvp_name, sizeof(nvp_name), "%s",
4372 			    nvpair_name(nvp));
4373 			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
4374 			    nvp_name);
4375 			nvp = next;
4376 		}
4377 	}
4378 	MDI_PI_UNLOCK(pip);
4379 	return (DDI_PROP_SUCCESS);
4380 }
4381 
4382 /*
4383  * mdi_prop_size():
4384  * 		Get buffer size needed to pack the property data.
4385  * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
4386  *		buffer size.
4387  */
4388 int
4389 mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
4390 {
4391 	int	rv;
4392 	size_t	bufsize;
4393 
4394 	*buflenp = 0;
4395 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4396 		return (DDI_PROP_NOT_FOUND);
4397 	}
4398 	ASSERT(MDI_PI_LOCKED(pip));
4399 	rv = nvlist_size(MDI_PI(pip)->pi_prop,
4400 	    &bufsize, NV_ENCODE_NATIVE);
4401 	*buflenp = bufsize;
4402 	return (i_map_nvlist_error_to_mdi(rv));
4403 }
4404 
4405 /*
4406  * mdi_prop_pack():
4407  * 		pack the property list.  The caller should hold the
4408  *		mdi_pathinfo_t node to get a consistent data
4409  */
4410 int
4411 mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
4412 {
4413 	int	rv;
4414 	size_t	bufsize;
4415 
4416 	if ((pip == NULL) || MDI_PI(pip)->pi_prop == NULL) {
4417 		return (DDI_PROP_NOT_FOUND);
4418 	}
4419 
4420 	ASSERT(MDI_PI_LOCKED(pip));
4421 
4422 	bufsize = buflen;
4423 	rv = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, (size_t *)&bufsize,
4424 	    NV_ENCODE_NATIVE, KM_SLEEP);
4425 
4426 	return (i_map_nvlist_error_to_mdi(rv));
4427 }
4428 
4429 /*
4430  * mdi_prop_update_byte():
4431  *		Create/Update a byte property
4432  */
4433 int
4434 mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
4435 {
4436 	int rv;
4437 
4438 	if (pip == NULL) {
4439 		return (DDI_PROP_INVAL_ARG);
4440 	}
4441 	ASSERT(!MDI_PI_LOCKED(pip));
4442 	MDI_PI_LOCK(pip);
4443 	if (MDI_PI(pip)->pi_prop == NULL) {
4444 		MDI_PI_UNLOCK(pip);
4445 		return (DDI_PROP_NOT_FOUND);
4446 	}
4447 	rv = nvlist_add_byte(MDI_PI(pip)->pi_prop, name, data);
4448 	MDI_PI_UNLOCK(pip);
4449 	return (i_map_nvlist_error_to_mdi(rv));
4450 }
4451 
4452 /*
4453  * mdi_prop_update_byte_array():
4454  *		Create/Update a byte array property
4455  */
4456 int
4457 mdi_prop_update_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t *data,
4458     uint_t nelements)
4459 {
4460 	int rv;
4461 
4462 	if (pip == NULL) {
4463 		return (DDI_PROP_INVAL_ARG);
4464 	}
4465 	ASSERT(!MDI_PI_LOCKED(pip));
4466 	MDI_PI_LOCK(pip);
4467 	if (MDI_PI(pip)->pi_prop == NULL) {
4468 		MDI_PI_UNLOCK(pip);
4469 		return (DDI_PROP_NOT_FOUND);
4470 	}
4471 	rv = nvlist_add_byte_array(MDI_PI(pip)->pi_prop, name, data, nelements);
4472 	MDI_PI_UNLOCK(pip);
4473 	return (i_map_nvlist_error_to_mdi(rv));
4474 }
4475 
4476 /*
4477  * mdi_prop_update_int():
4478  *		Create/Update a 32 bit integer property
4479  */
4480 int
4481 mdi_prop_update_int(mdi_pathinfo_t *pip, char *name, int data)
4482 {
4483 	int rv;
4484 
4485 	if (pip == NULL) {
4486 		return (DDI_PROP_INVAL_ARG);
4487 	}
4488 	ASSERT(!MDI_PI_LOCKED(pip));
4489 	MDI_PI_LOCK(pip);
4490 	if (MDI_PI(pip)->pi_prop == NULL) {
4491 		MDI_PI_UNLOCK(pip);
4492 		return (DDI_PROP_NOT_FOUND);
4493 	}
4494 	rv = nvlist_add_int32(MDI_PI(pip)->pi_prop, name, (int32_t)data);
4495 	MDI_PI_UNLOCK(pip);
4496 	return (i_map_nvlist_error_to_mdi(rv));
4497 }
4498 
4499 /*
4500  * mdi_prop_update_int64():
4501  *		Create/Update a 64 bit integer property
4502  */
4503 int
4504 mdi_prop_update_int64(mdi_pathinfo_t *pip, char *name, int64_t data)
4505 {
4506 	int rv;
4507 
4508 	if (pip == NULL) {
4509 		return (DDI_PROP_INVAL_ARG);
4510 	}
4511 	ASSERT(!MDI_PI_LOCKED(pip));
4512 	MDI_PI_LOCK(pip);
4513 	if (MDI_PI(pip)->pi_prop == NULL) {
4514 		MDI_PI_UNLOCK(pip);
4515 		return (DDI_PROP_NOT_FOUND);
4516 	}
4517 	rv = nvlist_add_int64(MDI_PI(pip)->pi_prop, name, data);
4518 	MDI_PI_UNLOCK(pip);
4519 	return (i_map_nvlist_error_to_mdi(rv));
4520 }
4521 
4522 /*
4523  * mdi_prop_update_int_array():
4524  *		Create/Update a int array property
4525  */
4526 int
4527 mdi_prop_update_int_array(mdi_pathinfo_t *pip, char *name, int *data,
4528 	    uint_t nelements)
4529 {
4530 	int rv;
4531 
4532 	if (pip == NULL) {
4533 		return (DDI_PROP_INVAL_ARG);
4534 	}
4535 	ASSERT(!MDI_PI_LOCKED(pip));
4536 	MDI_PI_LOCK(pip);
4537 	if (MDI_PI(pip)->pi_prop == NULL) {
4538 		MDI_PI_UNLOCK(pip);
4539 		return (DDI_PROP_NOT_FOUND);
4540 	}
4541 	rv = nvlist_add_int32_array(MDI_PI(pip)->pi_prop, name, (int32_t *)data,
4542 	    nelements);
4543 	MDI_PI_UNLOCK(pip);
4544 	return (i_map_nvlist_error_to_mdi(rv));
4545 }
4546 
4547 /*
4548  * mdi_prop_update_string():
4549  *		Create/Update a string property
4550  */
4551 int
4552 mdi_prop_update_string(mdi_pathinfo_t *pip, char *name, char *data)
4553 {
4554 	int rv;
4555 
4556 	if (pip == NULL) {
4557 		return (DDI_PROP_INVAL_ARG);
4558 	}
4559 	ASSERT(!MDI_PI_LOCKED(pip));
4560 	MDI_PI_LOCK(pip);
4561 	if (MDI_PI(pip)->pi_prop == NULL) {
4562 		MDI_PI_UNLOCK(pip);
4563 		return (DDI_PROP_NOT_FOUND);
4564 	}
4565 	rv = nvlist_add_string(MDI_PI(pip)->pi_prop, name, data);
4566 	MDI_PI_UNLOCK(pip);
4567 	return (i_map_nvlist_error_to_mdi(rv));
4568 }
4569 
4570 /*
4571  * mdi_prop_update_string_array():
4572  *		Create/Update a string array property
4573  */
4574 int
4575 mdi_prop_update_string_array(mdi_pathinfo_t *pip, char *name, char **data,
4576     uint_t nelements)
4577 {
4578 	int rv;
4579 
4580 	if (pip == NULL) {
4581 		return (DDI_PROP_INVAL_ARG);
4582 	}
4583 	ASSERT(!MDI_PI_LOCKED(pip));
4584 	MDI_PI_LOCK(pip);
4585 	if (MDI_PI(pip)->pi_prop == NULL) {
4586 		MDI_PI_UNLOCK(pip);
4587 		return (DDI_PROP_NOT_FOUND);
4588 	}
4589 	rv = nvlist_add_string_array(MDI_PI(pip)->pi_prop, name, data,
4590 	    nelements);
4591 	MDI_PI_UNLOCK(pip);
4592 	return (i_map_nvlist_error_to_mdi(rv));
4593 }
4594 
4595 /*
4596  * mdi_prop_lookup_byte():
4597  * 		Look for byte property identified by name.  The data returned
4598  *		is the actual property and valid as long as mdi_pathinfo_t node
4599  *		is alive.
4600  */
4601 int
4602 mdi_prop_lookup_byte(mdi_pathinfo_t *pip, char *name, uchar_t *data)
4603 {
4604 	int rv;
4605 
4606 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4607 		return (DDI_PROP_NOT_FOUND);
4608 	}
4609 	rv = nvlist_lookup_byte(MDI_PI(pip)->pi_prop, name, data);
4610 	return (i_map_nvlist_error_to_mdi(rv));
4611 }
4612 
4613 
4614 /*
4615  * mdi_prop_lookup_byte_array():
4616  * 		Look for byte array property identified by name.  The data
4617  *		returned is the actual property and valid as long as
4618  *		mdi_pathinfo_t node is alive.
4619  */
4620 int
4621 mdi_prop_lookup_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t **data,
4622     uint_t *nelements)
4623 {
4624 	int rv;
4625 
4626 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4627 		return (DDI_PROP_NOT_FOUND);
4628 	}
4629 	rv = nvlist_lookup_byte_array(MDI_PI(pip)->pi_prop, name, data,
4630 	    nelements);
4631 	return (i_map_nvlist_error_to_mdi(rv));
4632 }
4633 
4634 /*
4635  * mdi_prop_lookup_int():
4636  * 		Look for int property identified by name.  The data returned
4637  *		is the actual property and valid as long as mdi_pathinfo_t
4638  *		node is alive.
4639  */
4640 int
4641 mdi_prop_lookup_int(mdi_pathinfo_t *pip, char *name, int *data)
4642 {
4643 	int rv;
4644 
4645 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4646 		return (DDI_PROP_NOT_FOUND);
4647 	}
4648 	rv = nvlist_lookup_int32(MDI_PI(pip)->pi_prop, name, (int32_t *)data);
4649 	return (i_map_nvlist_error_to_mdi(rv));
4650 }
4651 
4652 /*
4653  * mdi_prop_lookup_int64():
4654  * 		Look for int64 property identified by name.  The data returned
4655  *		is the actual property and valid as long as mdi_pathinfo_t node
4656  *		is alive.
4657  */
4658 int
4659 mdi_prop_lookup_int64(mdi_pathinfo_t *pip, char *name, int64_t *data)
4660 {
4661 	int rv;
4662 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4663 		return (DDI_PROP_NOT_FOUND);
4664 	}
4665 	rv = nvlist_lookup_int64(MDI_PI(pip)->pi_prop, name, data);
4666 	return (i_map_nvlist_error_to_mdi(rv));
4667 }
4668 
4669 /*
4670  * mdi_prop_lookup_int_array():
4671  * 		Look for int array property identified by name.  The data
4672  *		returned is the actual property and valid as long as
4673  *		mdi_pathinfo_t node is alive.
4674  */
4675 int
4676 mdi_prop_lookup_int_array(mdi_pathinfo_t *pip, char *name, int **data,
4677     uint_t *nelements)
4678 {
4679 	int rv;
4680 
4681 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4682 		return (DDI_PROP_NOT_FOUND);
4683 	}
4684 	rv = nvlist_lookup_int32_array(MDI_PI(pip)->pi_prop, name,
4685 	    (int32_t **)data, nelements);
4686 	return (i_map_nvlist_error_to_mdi(rv));
4687 }
4688 
4689 /*
4690  * mdi_prop_lookup_string():
4691  * 		Look for string property identified by name.  The data
4692  *		returned is the actual property and valid as long as
4693  *		mdi_pathinfo_t node is alive.
4694  */
4695 int
4696 mdi_prop_lookup_string(mdi_pathinfo_t *pip, char *name, char **data)
4697 {
4698 	int rv;
4699 
4700 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4701 		return (DDI_PROP_NOT_FOUND);
4702 	}
4703 	rv = nvlist_lookup_string(MDI_PI(pip)->pi_prop, name, data);
4704 	return (i_map_nvlist_error_to_mdi(rv));
4705 }
4706 
4707 /*
4708  * mdi_prop_lookup_string_array():
4709  * 		Look for string array property identified by name.  The data
4710  *		returned is the actual property and valid as long as
4711  *		mdi_pathinfo_t node is alive.
4712  */
4713 int
4714 mdi_prop_lookup_string_array(mdi_pathinfo_t *pip, char *name, char ***data,
4715     uint_t *nelements)
4716 {
4717 	int rv;
4718 
4719 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4720 		return (DDI_PROP_NOT_FOUND);
4721 	}
4722 	rv = nvlist_lookup_string_array(MDI_PI(pip)->pi_prop, name, data,
4723 	    nelements);
4724 	return (i_map_nvlist_error_to_mdi(rv));
4725 }
4726 
4727 /*
4728  * mdi_prop_free():
4729  * 		Symmetrical function to ddi_prop_free(). nvlist_lookup_xx()
4730  *		functions return the pointer to actual property data and not a
4731  *		copy of it.  So the data returned is valid as long as
4732  *		mdi_pathinfo_t node is valid.
4733  */
4734 /*ARGSUSED*/
4735 int
4736 mdi_prop_free(void *data)
4737 {
4738 	return (DDI_PROP_SUCCESS);
4739 }
4740 
4741 /*ARGSUSED*/
4742 static void
4743 i_mdi_report_path_state(mdi_client_t *ct, mdi_pathinfo_t *pip)
4744 {
4745 	char		*ct_path;
4746 	char		*ct_status;
4747 	char		*status;
4748 	dev_info_t	*cdip = ct->ct_dip;
4749 	char		lb_buf[64];
4750 	int		report_lb_c = 0, report_lb_p = 0;
4751 
4752 	ASSERT(MDI_CLIENT_LOCKED(ct));
4753 	if ((cdip == NULL) || (ddi_get_instance(cdip) == -1) ||
4754 	    (MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) == 0)) {
4755 		return;
4756 	}
4757 	if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL) {
4758 		ct_status = "optimal";
4759 		report_lb_c = 1;
4760 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
4761 		ct_status = "degraded";
4762 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
4763 		ct_status = "failed";
4764 	} else {
4765 		ct_status = "unknown";
4766 	}
4767 
4768 	lb_buf[0] = 0;		/* not interested in load balancing config */
4769 
4770 	if (MDI_PI_FLAGS_IS_DEVICE_REMOVED(pip)) {
4771 		status = "removed";
4772 	} else if (MDI_PI_IS_OFFLINE(pip)) {
4773 		status = "offline";
4774 	} else if (MDI_PI_IS_ONLINE(pip)) {
4775 		status = "online";
4776 		report_lb_p = 1;
4777 	} else if (MDI_PI_IS_STANDBY(pip)) {
4778 		status = "standby";
4779 	} else if (MDI_PI_IS_FAULT(pip)) {
4780 		status = "faulted";
4781 	} else {
4782 		status = "unknown";
4783 	}
4784 
4785 	if (cdip) {
4786 		ct_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4787 
4788 		/*
4789 		 * NOTE: Keeping "multipath status: %s" and
4790 		 * "Load balancing: %s" format unchanged in case someone
4791 		 * scrubs /var/adm/messages looking for these messages.
4792 		 */
4793 		if (report_lb_c && report_lb_p) {
4794 			if (ct->ct_lb == LOAD_BALANCE_LBA) {
4795 				(void) snprintf(lb_buf, sizeof (lb_buf),
4796 				    "%s, region-size: %d", mdi_load_balance_lba,
4797 				    ct->ct_lb_args->region_size);
4798 			} else if (ct->ct_lb == LOAD_BALANCE_NONE) {
4799 				(void) snprintf(lb_buf, sizeof (lb_buf),
4800 				    "%s", mdi_load_balance_none);
4801 			} else {
4802 				(void) snprintf(lb_buf, sizeof (lb_buf), "%s",
4803 				    mdi_load_balance_rr);
4804 			}
4805 
4806 			cmn_err(mdi_debug_consoleonly ? CE_NOTE : CE_CONT,
4807 			    "?%s (%s%d) multipath status: %s: "
4808 			    "path %d %s is %s: Load balancing: %s\n",
4809 			    ddi_pathname(cdip, ct_path), ddi_driver_name(cdip),
4810 			    ddi_get_instance(cdip), ct_status,
4811 			    mdi_pi_get_path_instance(pip),
4812 			    mdi_pi_spathname(pip), status, lb_buf);
4813 		} else {
4814 			cmn_err(mdi_debug_consoleonly ? CE_NOTE : CE_CONT,
4815 			    "?%s (%s%d) multipath status: %s: "
4816 			    "path %d %s is %s\n",
4817 			    ddi_pathname(cdip, ct_path), ddi_driver_name(cdip),
4818 			    ddi_get_instance(cdip), ct_status,
4819 			    mdi_pi_get_path_instance(pip),
4820 			    mdi_pi_spathname(pip), status);
4821 		}
4822 
4823 		kmem_free(ct_path, MAXPATHLEN);
4824 		MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct);
4825 	}
4826 }
4827 
4828 #ifdef	DEBUG
4829 /*
4830  * i_mdi_log():
4831  *		Utility function for error message management
4832  *
4833  *		NOTE: Implementation takes care of trailing \n for cmn_err,
4834  *		MDI_DEBUG should not terminate fmt strings with \n.
4835  *
4836  *		NOTE: If the level is >= 2, and there is no leading !?^
4837  *		then a leading ! is implied (but can be overriden via
4838  *		mdi_debug_consoleonly). If you are using kmdb on the console,
4839  *		consider setting mdi_debug_consoleonly to 1 as an aid.
4840  */
4841 /*PRINTFLIKE4*/
4842 static void
4843 i_mdi_log(int level, const char *func, dev_info_t *dip, const char *fmt, ...)
4844 {
4845 	char		name[MAXNAMELEN];
4846 	char		buf[512];
4847 	char		*bp;
4848 	va_list		ap;
4849 	int		log_only = 0;
4850 	int		boot_only = 0;
4851 	int		console_only = 0;
4852 
4853 	if (dip) {
4854 		(void) snprintf(name, sizeof(name), "%s%d: ",
4855 		    ddi_driver_name(dip), ddi_get_instance(dip));
4856 	} else {
4857 		name[0] = 0;
4858 	}
4859 
4860 	va_start(ap, fmt);
4861 	(void) vsnprintf(buf, sizeof(buf), fmt, ap);
4862 	va_end(ap);
4863 
4864 	switch (buf[0]) {
4865 	case '!':
4866 		bp = &buf[1];
4867 		log_only = 1;
4868 		break;
4869 	case '?':
4870 		bp = &buf[1];
4871 		boot_only = 1;
4872 		break;
4873 	case '^':
4874 		bp = &buf[1];
4875 		console_only = 1;
4876 		break;
4877 	default:
4878 		if (level >= 2)
4879 			log_only = 1;		/* ! implied */
4880 		bp = buf;
4881 		break;
4882 	}
4883 	if (mdi_debug_logonly) {
4884 		log_only = 1;
4885 		boot_only = 0;
4886 		console_only = 0;
4887 	}
4888 	if (mdi_debug_consoleonly) {
4889 		log_only = 0;
4890 		boot_only = 0;
4891 		console_only = 1;
4892 		level = CE_NOTE;
4893 		goto console;
4894 	}
4895 
4896 	switch (level) {
4897 	case CE_NOTE:
4898 		level = CE_CONT;
4899 		/* FALLTHROUGH */
4900 	case CE_CONT:
4901 		if (boot_only) {
4902 			cmn_err(level, "?mdi: %s%s: %s\n", name, func, bp);
4903 		} else if (console_only) {
4904 			cmn_err(level, "^mdi: %s%s: %s\n", name, func, bp);
4905 		} else if (log_only) {
4906 			cmn_err(level, "!mdi: %s%s: %s\n", name, func, bp);
4907 		} else {
4908 			cmn_err(level, "mdi: %s%s: %s\n", name, func, bp);
4909 		}
4910 		break;
4911 
4912 	case CE_WARN:
4913 	case CE_PANIC:
4914 	console:
4915 		if (boot_only) {
4916 			cmn_err(level, "?mdi: %s%s: %s", name, func, bp);
4917 		} else if (console_only) {
4918 			cmn_err(level, "^mdi: %s%s: %s", name, func, bp);
4919 		} else if (log_only) {
4920 			cmn_err(level, "!mdi: %s%s: %s", name, func, bp);
4921 		} else {
4922 			cmn_err(level, "mdi: %s%s: %s", name, func, bp);
4923 		}
4924 		break;
4925 	default:
4926 		cmn_err(level, "mdi: %s%s", name, bp);
4927 		break;
4928 	}
4929 }
4930 #endif	/* DEBUG */
4931 
4932 void
4933 i_mdi_client_online(dev_info_t *ct_dip)
4934 {
4935 	mdi_client_t	*ct;
4936 
4937 	/*
4938 	 * Client online notification. Mark client state as online
4939 	 * restore our binding with dev_info node
4940 	 */
4941 	ct = i_devi_get_client(ct_dip);
4942 	ASSERT(ct != NULL);
4943 	MDI_CLIENT_LOCK(ct);
4944 	MDI_CLIENT_SET_ONLINE(ct);
4945 	/* catch for any memory leaks */
4946 	ASSERT((ct->ct_dip == NULL) || (ct->ct_dip == ct_dip));
4947 	ct->ct_dip = ct_dip;
4948 
4949 	if (ct->ct_power_cnt == 0)
4950 		(void) i_mdi_power_all_phci(ct);
4951 
4952 	MDI_DEBUG(4, (MDI_NOTE, ct_dip,
4953 	    "i_mdi_pm_hold_client %p", (void *)ct));
4954 	i_mdi_pm_hold_client(ct, 1);
4955 
4956 	MDI_CLIENT_UNLOCK(ct);
4957 }
4958 
4959 void
4960 i_mdi_phci_online(dev_info_t *ph_dip)
4961 {
4962 	mdi_phci_t	*ph;
4963 
4964 	/* pHCI online notification. Mark state accordingly */
4965 	ph = i_devi_get_phci(ph_dip);
4966 	ASSERT(ph != NULL);
4967 	MDI_PHCI_LOCK(ph);
4968 	MDI_PHCI_SET_ONLINE(ph);
4969 	MDI_PHCI_UNLOCK(ph);
4970 }
4971 
4972 /*
4973  * mdi_devi_online():
4974  * 		Online notification from NDI framework on pHCI/client
4975  *		device online.
4976  * Return Values:
4977  *		NDI_SUCCESS
4978  *		MDI_FAILURE
4979  */
4980 /*ARGSUSED*/
4981 int
4982 mdi_devi_online(dev_info_t *dip, uint_t flags)
4983 {
4984 	if (MDI_PHCI(dip)) {
4985 		i_mdi_phci_online(dip);
4986 	}
4987 
4988 	if (MDI_CLIENT(dip)) {
4989 		i_mdi_client_online(dip);
4990 	}
4991 	return (NDI_SUCCESS);
4992 }
4993 
4994 /*
4995  * mdi_devi_offline():
4996  * 		Offline notification from NDI framework on pHCI/Client device
4997  *		offline.
4998  *
4999  * Return Values:
5000  *		NDI_SUCCESS
5001  *		NDI_FAILURE
5002  */
5003 /*ARGSUSED*/
5004 int
5005 mdi_devi_offline(dev_info_t *dip, uint_t flags)
5006 {
5007 	int		rv = NDI_SUCCESS;
5008 
5009 	if (MDI_CLIENT(dip)) {
5010 		rv = i_mdi_client_offline(dip, flags);
5011 		if (rv != NDI_SUCCESS)
5012 			return (rv);
5013 	}
5014 
5015 	if (MDI_PHCI(dip)) {
5016 		rv = i_mdi_phci_offline(dip, flags);
5017 
5018 		if ((rv != NDI_SUCCESS) && MDI_CLIENT(dip)) {
5019 			/* set client back online */
5020 			i_mdi_client_online(dip);
5021 		}
5022 	}
5023 
5024 	return (rv);
5025 }
5026 
5027 /*ARGSUSED*/
5028 static int
5029 i_mdi_phci_offline(dev_info_t *dip, uint_t flags)
5030 {
5031 	int		rv = NDI_SUCCESS;
5032 	mdi_phci_t	*ph;
5033 	mdi_client_t	*ct;
5034 	mdi_pathinfo_t	*pip;
5035 	mdi_pathinfo_t	*next;
5036 	mdi_pathinfo_t	*failed_pip = NULL;
5037 	dev_info_t	*cdip;
5038 
5039 	/*
5040 	 * pHCI component offline notification
5041 	 * Make sure that this pHCI instance is free to be offlined.
5042 	 * If it is OK to proceed, Offline and remove all the child
5043 	 * mdi_pathinfo nodes.  This process automatically offlines
5044 	 * corresponding client devices, for which this pHCI provides
5045 	 * critical services.
5046 	 */
5047 	ph = i_devi_get_phci(dip);
5048 	MDI_DEBUG(2, (MDI_NOTE, dip,
5049 	    "called %p %p", (void *)dip, (void *)ph));
5050 	if (ph == NULL) {
5051 		return (rv);
5052 	}
5053 
5054 	MDI_PHCI_LOCK(ph);
5055 
5056 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5057 		MDI_DEBUG(1, (MDI_WARN, dip,
5058 		    "!pHCI already offlined: %p", (void *)dip));
5059 		MDI_PHCI_UNLOCK(ph);
5060 		return (NDI_SUCCESS);
5061 	}
5062 
5063 	/*
5064 	 * Check to see if the pHCI can be offlined
5065 	 */
5066 	if (ph->ph_unstable) {
5067 		MDI_DEBUG(1, (MDI_WARN, dip,
5068 		    "!One or more target devices are in transient state. "
5069 		    "This device can not be removed at this moment. "
5070 		    "Please try again later."));
5071 		MDI_PHCI_UNLOCK(ph);
5072 		return (NDI_BUSY);
5073 	}
5074 
5075 	pip = ph->ph_path_head;
5076 	while (pip != NULL) {
5077 		MDI_PI_LOCK(pip);
5078 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5079 
5080 		/*
5081 		 * The mdi_pathinfo state is OK. Check the client state.
5082 		 * If failover in progress fail the pHCI from offlining
5083 		 */
5084 		ct = MDI_PI(pip)->pi_client;
5085 		i_mdi_client_lock(ct, pip);
5086 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5087 		    (ct->ct_unstable)) {
5088 			/*
5089 			 * Failover is in progress, Fail the DR
5090 			 */
5091 			MDI_DEBUG(1, (MDI_WARN, dip,
5092 			    "!pHCI device is busy. "
5093 			    "This device can not be removed at this moment. "
5094 			    "Please try again later."));
5095 			MDI_PI_UNLOCK(pip);
5096 			i_mdi_client_unlock(ct);
5097 			MDI_PHCI_UNLOCK(ph);
5098 			return (NDI_BUSY);
5099 		}
5100 		MDI_PI_UNLOCK(pip);
5101 
5102 		/*
5103 		 * Check to see of we are removing the last path of this
5104 		 * client device...
5105 		 */
5106 		cdip = ct->ct_dip;
5107 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5108 		    (i_mdi_client_compute_state(ct, ph) ==
5109 		    MDI_CLIENT_STATE_FAILED)) {
5110 			i_mdi_client_unlock(ct);
5111 			MDI_PHCI_UNLOCK(ph);
5112 			if (ndi_devi_offline(cdip,
5113 			    NDI_DEVFS_CLEAN) != NDI_SUCCESS) {
5114 				/*
5115 				 * ndi_devi_offline() failed.
5116 				 * This pHCI provides the critical path
5117 				 * to one or more client devices.
5118 				 * Return busy.
5119 				 */
5120 				MDI_PHCI_LOCK(ph);
5121 				MDI_DEBUG(1, (MDI_WARN, dip,
5122 				    "!pHCI device is busy. "
5123 				    "This device can not be removed at this "
5124 				    "moment. Please try again later."));
5125 				failed_pip = pip;
5126 				break;
5127 			} else {
5128 				MDI_PHCI_LOCK(ph);
5129 				pip = next;
5130 			}
5131 		} else {
5132 			i_mdi_client_unlock(ct);
5133 			pip = next;
5134 		}
5135 	}
5136 
5137 	if (failed_pip) {
5138 		pip = ph->ph_path_head;
5139 		while (pip != failed_pip) {
5140 			MDI_PI_LOCK(pip);
5141 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5142 			ct = MDI_PI(pip)->pi_client;
5143 			i_mdi_client_lock(ct, pip);
5144 			cdip = ct->ct_dip;
5145 			switch (MDI_CLIENT_STATE(ct)) {
5146 			case MDI_CLIENT_STATE_OPTIMAL:
5147 			case MDI_CLIENT_STATE_DEGRADED:
5148 				if (cdip) {
5149 					MDI_PI_UNLOCK(pip);
5150 					i_mdi_client_unlock(ct);
5151 					MDI_PHCI_UNLOCK(ph);
5152 					(void) ndi_devi_online(cdip, 0);
5153 					MDI_PHCI_LOCK(ph);
5154 					pip = next;
5155 					continue;
5156 				}
5157 				break;
5158 
5159 			case MDI_CLIENT_STATE_FAILED:
5160 				if (cdip) {
5161 					MDI_PI_UNLOCK(pip);
5162 					i_mdi_client_unlock(ct);
5163 					MDI_PHCI_UNLOCK(ph);
5164 					(void) ndi_devi_offline(cdip,
5165 						NDI_DEVFS_CLEAN);
5166 					MDI_PHCI_LOCK(ph);
5167 					pip = next;
5168 					continue;
5169 				}
5170 				break;
5171 			}
5172 			MDI_PI_UNLOCK(pip);
5173 			i_mdi_client_unlock(ct);
5174 			pip = next;
5175 		}
5176 		MDI_PHCI_UNLOCK(ph);
5177 		return (NDI_BUSY);
5178 	}
5179 
5180 	/*
5181 	 * Mark the pHCI as offline
5182 	 */
5183 	MDI_PHCI_SET_OFFLINE(ph);
5184 
5185 	/*
5186 	 * Mark the child mdi_pathinfo nodes as transient
5187 	 */
5188 	pip = ph->ph_path_head;
5189 	while (pip != NULL) {
5190 		MDI_PI_LOCK(pip);
5191 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5192 		MDI_PI_SET_OFFLINING(pip);
5193 		MDI_PI_UNLOCK(pip);
5194 		pip = next;
5195 	}
5196 	MDI_PHCI_UNLOCK(ph);
5197 	/*
5198 	 * Give a chance for any pending commands to execute
5199 	 */
5200 	delay_random(5);
5201 	MDI_PHCI_LOCK(ph);
5202 	pip = ph->ph_path_head;
5203 	while (pip != NULL) {
5204 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5205 		(void) i_mdi_pi_offline(pip, flags);
5206 		MDI_PI_LOCK(pip);
5207 		ct = MDI_PI(pip)->pi_client;
5208 		if (!MDI_PI_IS_OFFLINE(pip)) {
5209 			MDI_DEBUG(1, (MDI_WARN, dip,
5210 			    "!pHCI device is busy. "
5211 			    "This device can not be removed at this moment. "
5212 			    "Please try again later."));
5213 			MDI_PI_UNLOCK(pip);
5214 			MDI_PHCI_SET_ONLINE(ph);
5215 			MDI_PHCI_UNLOCK(ph);
5216 			return (NDI_BUSY);
5217 		}
5218 		MDI_PI_UNLOCK(pip);
5219 		pip = next;
5220 	}
5221 	MDI_PHCI_UNLOCK(ph);
5222 
5223 	return (rv);
5224 }
5225 
5226 void
5227 mdi_phci_mark_retiring(dev_info_t *dip, char **cons_array)
5228 {
5229 	mdi_phci_t	*ph;
5230 	mdi_client_t	*ct;
5231 	mdi_pathinfo_t	*pip;
5232 	mdi_pathinfo_t	*next;
5233 	dev_info_t	*cdip;
5234 
5235 	if (!MDI_PHCI(dip))
5236 		return;
5237 
5238 	ph = i_devi_get_phci(dip);
5239 	if (ph == NULL) {
5240 		return;
5241 	}
5242 
5243 	MDI_PHCI_LOCK(ph);
5244 
5245 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5246 		/* has no last path */
5247 		MDI_PHCI_UNLOCK(ph);
5248 		return;
5249 	}
5250 
5251 	pip = ph->ph_path_head;
5252 	while (pip != NULL) {
5253 		MDI_PI_LOCK(pip);
5254 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5255 
5256 		ct = MDI_PI(pip)->pi_client;
5257 		i_mdi_client_lock(ct, pip);
5258 		MDI_PI_UNLOCK(pip);
5259 
5260 		cdip = ct->ct_dip;
5261 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5262 		    (i_mdi_client_compute_state(ct, ph) ==
5263 		    MDI_CLIENT_STATE_FAILED)) {
5264 			/* Last path. Mark client dip as retiring */
5265 			i_mdi_client_unlock(ct);
5266 			MDI_PHCI_UNLOCK(ph);
5267 			(void) e_ddi_mark_retiring(cdip, cons_array);
5268 			MDI_PHCI_LOCK(ph);
5269 			pip = next;
5270 		} else {
5271 			i_mdi_client_unlock(ct);
5272 			pip = next;
5273 		}
5274 	}
5275 
5276 	MDI_PHCI_UNLOCK(ph);
5277 
5278 	return;
5279 }
5280 
5281 void
5282 mdi_phci_retire_notify(dev_info_t *dip, int *constraint)
5283 {
5284 	mdi_phci_t	*ph;
5285 	mdi_client_t	*ct;
5286 	mdi_pathinfo_t	*pip;
5287 	mdi_pathinfo_t	*next;
5288 	dev_info_t	*cdip;
5289 
5290 	if (!MDI_PHCI(dip))
5291 		return;
5292 
5293 	ph = i_devi_get_phci(dip);
5294 	if (ph == NULL)
5295 		return;
5296 
5297 	MDI_PHCI_LOCK(ph);
5298 
5299 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5300 		MDI_PHCI_UNLOCK(ph);
5301 		/* not last path */
5302 		return;
5303 	}
5304 
5305 	if (ph->ph_unstable) {
5306 		MDI_PHCI_UNLOCK(ph);
5307 		/* can't check for constraints */
5308 		*constraint = 0;
5309 		return;
5310 	}
5311 
5312 	pip = ph->ph_path_head;
5313 	while (pip != NULL) {
5314 		MDI_PI_LOCK(pip);
5315 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5316 
5317 		/*
5318 		 * The mdi_pathinfo state is OK. Check the client state.
5319 		 * If failover in progress fail the pHCI from offlining
5320 		 */
5321 		ct = MDI_PI(pip)->pi_client;
5322 		i_mdi_client_lock(ct, pip);
5323 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5324 		    (ct->ct_unstable)) {
5325 			/*
5326 			 * Failover is in progress, can't check for constraints
5327 			 */
5328 			MDI_PI_UNLOCK(pip);
5329 			i_mdi_client_unlock(ct);
5330 			MDI_PHCI_UNLOCK(ph);
5331 			*constraint = 0;
5332 			return;
5333 		}
5334 		MDI_PI_UNLOCK(pip);
5335 
5336 		/*
5337 		 * Check to see of we are retiring the last path of this
5338 		 * client device...
5339 		 */
5340 		cdip = ct->ct_dip;
5341 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5342 		    (i_mdi_client_compute_state(ct, ph) ==
5343 		    MDI_CLIENT_STATE_FAILED)) {
5344 			i_mdi_client_unlock(ct);
5345 			MDI_PHCI_UNLOCK(ph);
5346 			(void) e_ddi_retire_notify(cdip, constraint);
5347 			MDI_PHCI_LOCK(ph);
5348 			pip = next;
5349 		} else {
5350 			i_mdi_client_unlock(ct);
5351 			pip = next;
5352 		}
5353 	}
5354 
5355 	MDI_PHCI_UNLOCK(ph);
5356 
5357 	return;
5358 }
5359 
5360 /*
5361  * offline the path(s) hanging off the pHCI. If the
5362  * last path to any client, check that constraints
5363  * have been applied.
5364  */
5365 void
5366 mdi_phci_retire_finalize(dev_info_t *dip, int phci_only)
5367 {
5368 	mdi_phci_t	*ph;
5369 	mdi_client_t	*ct;
5370 	mdi_pathinfo_t	*pip;
5371 	mdi_pathinfo_t	*next;
5372 	dev_info_t	*cdip;
5373 	int		unstable = 0;
5374 	int		constraint;
5375 
5376 	if (!MDI_PHCI(dip))
5377 		return;
5378 
5379 	ph = i_devi_get_phci(dip);
5380 	if (ph == NULL) {
5381 		/* no last path and no pips */
5382 		return;
5383 	}
5384 
5385 	MDI_PHCI_LOCK(ph);
5386 
5387 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5388 		MDI_PHCI_UNLOCK(ph);
5389 		/* no last path and no pips */
5390 		return;
5391 	}
5392 
5393 	/*
5394 	 * Check to see if the pHCI can be offlined
5395 	 */
5396 	if (ph->ph_unstable) {
5397 		unstable = 1;
5398 	}
5399 
5400 	pip = ph->ph_path_head;
5401 	while (pip != NULL) {
5402 		MDI_PI_LOCK(pip);
5403 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5404 
5405 		/*
5406 		 * if failover in progress fail the pHCI from offlining
5407 		 */
5408 		ct = MDI_PI(pip)->pi_client;
5409 		i_mdi_client_lock(ct, pip);
5410 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5411 		    (ct->ct_unstable)) {
5412 			unstable = 1;
5413 		}
5414 		MDI_PI_UNLOCK(pip);
5415 
5416 		/*
5417 		 * Check to see of we are removing the last path of this
5418 		 * client device...
5419 		 */
5420 		cdip = ct->ct_dip;
5421 		if (!phci_only && cdip &&
5422 		    (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5423 		    (i_mdi_client_compute_state(ct, ph) ==
5424 		    MDI_CLIENT_STATE_FAILED)) {
5425 			i_mdi_client_unlock(ct);
5426 			MDI_PHCI_UNLOCK(ph);
5427 			/*
5428 			 * We don't retire clients we just retire the
5429 			 * path to a client. If it is the last path
5430 			 * to a client, constraints are checked and
5431 			 * if we pass the last path is offlined. MPXIO will
5432 			 * then fail all I/Os to the client. Since we don't
5433 			 * want to retire the client on a path error
5434 			 * set constraint = 0 so that the client dip
5435 			 * is not retired.
5436 			 */
5437 			constraint = 0;
5438 			(void) e_ddi_retire_finalize(cdip, &constraint);
5439 			MDI_PHCI_LOCK(ph);
5440 			pip = next;
5441 		} else {
5442 			i_mdi_client_unlock(ct);
5443 			pip = next;
5444 		}
5445 	}
5446 
5447 	/*
5448 	 * Cannot offline pip(s)
5449 	 */
5450 	if (unstable) {
5451 		cmn_err(CE_WARN, "%s%d: mdi_phci_retire_finalize: "
5452 		    "pHCI in transient state, cannot retire",
5453 		    ddi_driver_name(dip), ddi_get_instance(dip));
5454 		MDI_PHCI_UNLOCK(ph);
5455 		return;
5456 	}
5457 
5458 	/*
5459 	 * Mark the pHCI as offline
5460 	 */
5461 	MDI_PHCI_SET_OFFLINE(ph);
5462 
5463 	/*
5464 	 * Mark the child mdi_pathinfo nodes as transient
5465 	 */
5466 	pip = ph->ph_path_head;
5467 	while (pip != NULL) {
5468 		MDI_PI_LOCK(pip);
5469 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5470 		MDI_PI_SET_OFFLINING(pip);
5471 		MDI_PI_UNLOCK(pip);
5472 		pip = next;
5473 	}
5474 	MDI_PHCI_UNLOCK(ph);
5475 	/*
5476 	 * Give a chance for any pending commands to execute
5477 	 */
5478 	delay_random(5);
5479 	MDI_PHCI_LOCK(ph);
5480 	pip = ph->ph_path_head;
5481 	while (pip != NULL) {
5482 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5483 		(void) i_mdi_pi_offline(pip, 0);
5484 		MDI_PI_LOCK(pip);
5485 		ct = MDI_PI(pip)->pi_client;
5486 		if (!MDI_PI_IS_OFFLINE(pip)) {
5487 			cmn_err(CE_WARN, "mdi_phci_retire_finalize: "
5488 			    "path %d %s busy, cannot offline",
5489 			    mdi_pi_get_path_instance(pip),
5490 			    mdi_pi_spathname(pip));
5491 			MDI_PI_UNLOCK(pip);
5492 			MDI_PHCI_SET_ONLINE(ph);
5493 			MDI_PHCI_UNLOCK(ph);
5494 			return;
5495 		}
5496 		MDI_PI_UNLOCK(pip);
5497 		pip = next;
5498 	}
5499 	MDI_PHCI_UNLOCK(ph);
5500 
5501 	return;
5502 }
5503 
5504 void
5505 mdi_phci_unretire(dev_info_t *dip)
5506 {
5507 	ASSERT(MDI_PHCI(dip));
5508 
5509 	/*
5510 	 * Online the phci
5511 	 */
5512 	i_mdi_phci_online(dip);
5513 }
5514 
5515 /*ARGSUSED*/
5516 static int
5517 i_mdi_client_offline(dev_info_t *dip, uint_t flags)
5518 {
5519 	int		rv = NDI_SUCCESS;
5520 	mdi_client_t	*ct;
5521 
5522 	/*
5523 	 * Client component to go offline.  Make sure that we are
5524 	 * not in failing over state and update client state
5525 	 * accordingly
5526 	 */
5527 	ct = i_devi_get_client(dip);
5528 	MDI_DEBUG(2, (MDI_NOTE, dip,
5529 	    "called %p %p", (void *)dip, (void *)ct));
5530 	if (ct != NULL) {
5531 		MDI_CLIENT_LOCK(ct);
5532 		if (ct->ct_unstable) {
5533 			/*
5534 			 * One or more paths are in transient state,
5535 			 * Dont allow offline of a client device
5536 			 */
5537 			MDI_DEBUG(1, (MDI_WARN, dip,
5538 			    "!One or more paths to "
5539 			    "this device are in transient state. "
5540 			    "This device can not be removed at this moment. "
5541 			    "Please try again later."));
5542 			MDI_CLIENT_UNLOCK(ct);
5543 			return (NDI_BUSY);
5544 		}
5545 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
5546 			/*
5547 			 * Failover is in progress, Dont allow DR of
5548 			 * a client device
5549 			 */
5550 			MDI_DEBUG(1, (MDI_WARN, dip,
5551 			    "!Client device is Busy. "
5552 			    "This device can not be removed at this moment. "
5553 			    "Please try again later."));
5554 			MDI_CLIENT_UNLOCK(ct);
5555 			return (NDI_BUSY);
5556 		}
5557 		MDI_CLIENT_SET_OFFLINE(ct);
5558 
5559 		/*
5560 		 * Unbind our relationship with the dev_info node
5561 		 */
5562 		if (flags & NDI_DEVI_REMOVE) {
5563 			ct->ct_dip = NULL;
5564 		}
5565 		MDI_CLIENT_UNLOCK(ct);
5566 	}
5567 	return (rv);
5568 }
5569 
5570 /*
5571  * mdi_pre_attach():
5572  *		Pre attach() notification handler
5573  */
5574 /*ARGSUSED*/
5575 int
5576 mdi_pre_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5577 {
5578 	/* don't support old DDI_PM_RESUME */
5579 	if ((DEVI(dip)->devi_mdi_component != MDI_COMPONENT_NONE) &&
5580 	    (cmd == DDI_PM_RESUME))
5581 		return (DDI_FAILURE);
5582 
5583 	return (DDI_SUCCESS);
5584 }
5585 
5586 /*
5587  * mdi_post_attach():
5588  *		Post attach() notification handler
5589  */
5590 /*ARGSUSED*/
5591 void
5592 mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
5593 {
5594 	mdi_phci_t	*ph;
5595 	mdi_client_t	*ct;
5596 	mdi_vhci_t	*vh;
5597 
5598 	if (MDI_PHCI(dip)) {
5599 		ph = i_devi_get_phci(dip);
5600 		ASSERT(ph != NULL);
5601 
5602 		MDI_PHCI_LOCK(ph);
5603 		switch (cmd) {
5604 		case DDI_ATTACH:
5605 			MDI_DEBUG(2, (MDI_NOTE, dip,
5606 			    "phci post_attach called %p", (void *)ph));
5607 			if (error == DDI_SUCCESS) {
5608 				MDI_PHCI_SET_ATTACH(ph);
5609 			} else {
5610 				MDI_DEBUG(1, (MDI_NOTE, dip,
5611 				    "!pHCI post_attach failed: error %d",
5612 				    error));
5613 				MDI_PHCI_SET_DETACH(ph);
5614 			}
5615 			break;
5616 
5617 		case DDI_RESUME:
5618 			MDI_DEBUG(2, (MDI_NOTE, dip,
5619 			    "pHCI post_resume: called %p", (void *)ph));
5620 			if (error == DDI_SUCCESS) {
5621 				MDI_PHCI_SET_RESUME(ph);
5622 			} else {
5623 				MDI_DEBUG(1, (MDI_NOTE, dip,
5624 				    "!pHCI post_resume failed: error %d",
5625 				    error));
5626 				MDI_PHCI_SET_SUSPEND(ph);
5627 			}
5628 			break;
5629 		}
5630 		MDI_PHCI_UNLOCK(ph);
5631 	}
5632 
5633 	if (MDI_CLIENT(dip)) {
5634 		ct = i_devi_get_client(dip);
5635 		ASSERT(ct != NULL);
5636 
5637 		MDI_CLIENT_LOCK(ct);
5638 		switch (cmd) {
5639 		case DDI_ATTACH:
5640 			MDI_DEBUG(2, (MDI_NOTE, dip,
5641 			    "client post_attach called %p", (void *)ct));
5642 			if (error != DDI_SUCCESS) {
5643 				MDI_DEBUG(1, (MDI_NOTE, dip,
5644 				    "!client post_attach failed: error %d",
5645 				    error));
5646 				MDI_CLIENT_SET_DETACH(ct);
5647 				MDI_DEBUG(4, (MDI_WARN, dip,
5648 				    "i_mdi_pm_reset_client"));
5649 				i_mdi_pm_reset_client(ct);
5650 				break;
5651 			}
5652 
5653 			/*
5654 			 * Client device has successfully attached, inform
5655 			 * the vhci.
5656 			 */
5657 			vh = ct->ct_vhci;
5658 			if (vh->vh_ops->vo_client_attached)
5659 				(*vh->vh_ops->vo_client_attached)(dip);
5660 
5661 			MDI_CLIENT_SET_ATTACH(ct);
5662 			break;
5663 
5664 		case DDI_RESUME:
5665 			MDI_DEBUG(2, (MDI_NOTE, dip,
5666 			    "client post_attach: called %p", (void *)ct));
5667 			if (error == DDI_SUCCESS) {
5668 				MDI_CLIENT_SET_RESUME(ct);
5669 			} else {
5670 				MDI_DEBUG(1, (MDI_NOTE, dip,
5671 				    "!client post_resume failed: error %d",
5672 				    error));
5673 				MDI_CLIENT_SET_SUSPEND(ct);
5674 			}
5675 			break;
5676 		}
5677 		MDI_CLIENT_UNLOCK(ct);
5678 	}
5679 }
5680 
5681 /*
5682  * mdi_pre_detach():
5683  *		Pre detach notification handler
5684  */
5685 /*ARGSUSED*/
5686 int
5687 mdi_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5688 {
5689 	int rv = DDI_SUCCESS;
5690 
5691 	if (MDI_CLIENT(dip)) {
5692 		(void) i_mdi_client_pre_detach(dip, cmd);
5693 	}
5694 
5695 	if (MDI_PHCI(dip)) {
5696 		rv = i_mdi_phci_pre_detach(dip, cmd);
5697 	}
5698 
5699 	return (rv);
5700 }
5701 
/*
 * i_mdi_phci_pre_detach():
 *		pHCI half of the pre-detach notification.
 *
 *		DDI_DETACH:  fails unless the pHCI has already been marked
 *			     offline; otherwise marks the pHCI detached.
 *		DDI_SUSPEND: suspends (devi_detach(DDI_SUSPEND)) every client
 *			     reachable through this pHCI's path list that is
 *			     neither detached nor already suspended.  If one
 *			     suspend fails, the clients earlier in the list
 *			     that are now suspended are resumed again and the
 *			     failure is returned.  On success the pHCI is
 *			     marked suspended.
 *		other:	     DDI_FAILURE.
 *
 *		The pHCI lock is held for the whole operation.
 *
 * Return Values:
 *		DDI_SUCCESS	also returned when dip has no pHCI
 *		DDI_FAILURE / result of the failing devi_detach()
 */
/*ARGSUSED*/
static int
i_mdi_phci_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int		rv = DDI_SUCCESS;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*failed_pip = NULL;	/* path whose client failed */
	mdi_pathinfo_t	*next;

	ph = i_devi_get_phci(dip);
	if (ph == NULL) {
		return (rv);
	}

	MDI_PHCI_LOCK(ph);
	switch (cmd) {
	case DDI_DETACH:
		MDI_DEBUG(2, (MDI_NOTE, dip,
		    "pHCI pre_detach: called %p", (void *)ph));
		if (!MDI_PHCI_IS_OFFLINE(ph)) {
			/*
			 * The pHCI has not been marked offline, i.e.
			 * mdi_pathinfo nodes are still attached to
			 * this pHCI. Fail the detach for this pHCI.
			 */
			MDI_DEBUG(2, (MDI_WARN, dip,
			    "pHCI pre_detach: paths are still attached %p",
			    (void *)ph));
			rv = DDI_FAILURE;
			break;
		}
		MDI_PHCI_SET_DETACH(ph);
		break;

	case DDI_SUSPEND:
		/*
		 * pHCI is getting suspended.  Since mpxio client
		 * devices may not be suspended at this point, to avoid
		 * a potential stack overflow, it is important to suspend
		 * client devices before pHCI can be suspended.
		 */

		MDI_DEBUG(2, (MDI_NOTE, dip,
		    "pHCI pre_suspend: called %p", (void *)ph));
		/*
		 * Suspend all the client devices accessible through this pHCI
		 */
		pip = ph->ph_path_head;
		while (pip != NULL && rv == DDI_SUCCESS) {
			dev_info_t *cdip;
			/*
			 * Capture the next link, the client and its
			 * dev_info node while the path lock is held.
			 */
			MDI_PI_LOCK(pip);
			next =
			    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
			ct = MDI_PI(pip)->pi_client;
			i_mdi_client_lock(ct, pip);
			cdip = ct->ct_dip;
			MDI_PI_UNLOCK(pip);
			if ((MDI_CLIENT_IS_DETACHED(ct) == 0) &&
			    MDI_CLIENT_IS_SUSPENDED(ct) == 0) {
				/* client lock is dropped before suspending */
				i_mdi_client_unlock(ct);
				if ((rv = devi_detach(cdip, DDI_SUSPEND)) !=
				    DDI_SUCCESS) {
					/*
					 * Suspend of one of the client
					 * device has failed.
					 */
					MDI_DEBUG(1, (MDI_WARN, dip,
					    "!suspend of device (%s%d) failed.",
					    ddi_driver_name(cdip),
					    ddi_get_instance(cdip)));
					/* remember where to stop the undo */
					failed_pip = pip;
					break;
				}
			} else {
				i_mdi_client_unlock(ct);
			}
			pip = next;
		}

		if (rv == DDI_SUCCESS) {
			/*
			 * Suspend of client devices is complete. Proceed
			 * with pHCI suspend.
			 */
			MDI_PHCI_SET_SUSPEND(ph);
		} else {
			/*
			 * Undo: walk the path list from the head up to (but
			 * not including) the path whose client failed to
			 * suspend, and resume every client that is currently
			 * marked suspended.
			 */
			pip = ph->ph_path_head;
			while (pip != failed_pip) {
				dev_info_t *cdip;
				MDI_PI_LOCK(pip);
				next =
				    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
				ct = MDI_PI(pip)->pi_client;
				i_mdi_client_lock(ct, pip);
				cdip = ct->ct_dip;
				MDI_PI_UNLOCK(pip);
				if (MDI_CLIENT_IS_SUSPENDED(ct)) {
					i_mdi_client_unlock(ct);
					/* best effort: result is ignored */
					(void) devi_attach(cdip, DDI_RESUME);
				} else {
					i_mdi_client_unlock(ct);
				}
				pip = next;
			}
		}
		break;

	default:
		/* any other detach command is not supported here */
		rv = DDI_FAILURE;
		break;
	}
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}
5821 
5822 /*ARGSUSED*/
5823 static int
5824 i_mdi_client_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5825 {
5826 	int		rv = DDI_SUCCESS;
5827 	mdi_client_t	*ct;
5828 
5829 	ct = i_devi_get_client(dip);
5830 	if (ct == NULL) {
5831 		return (rv);
5832 	}
5833 
5834 	MDI_CLIENT_LOCK(ct);
5835 	switch (cmd) {
5836 	case DDI_DETACH:
5837 		MDI_DEBUG(2, (MDI_NOTE, dip,
5838 		    "client pre_detach: called %p",
5839 		     (void *)ct));
5840 		MDI_CLIENT_SET_DETACH(ct);
5841 		break;
5842 
5843 	case DDI_SUSPEND:
5844 		MDI_DEBUG(2, (MDI_NOTE, dip,
5845 		    "client pre_suspend: called %p",
5846 		    (void *)ct));
5847 		MDI_CLIENT_SET_SUSPEND(ct);
5848 		break;
5849 
5850 	default:
5851 		rv = DDI_FAILURE;
5852 		break;
5853 	}
5854 	MDI_CLIENT_UNLOCK(ct);
5855 	return (rv);
5856 }
5857 
5858 /*
5859  * mdi_post_detach():
5860  *		Post detach notification handler
5861  */
5862 /*ARGSUSED*/
5863 void
5864 mdi_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5865 {
5866 	/*
5867 	 * Detach/Suspend of mpxio component failed. Update our state
5868 	 * too
5869 	 */
5870 	if (MDI_PHCI(dip))
5871 		i_mdi_phci_post_detach(dip, cmd, error);
5872 
5873 	if (MDI_CLIENT(dip))
5874 		i_mdi_client_post_detach(dip, cmd, error);
5875 }
5876 
5877 /*ARGSUSED*/
5878 static void
5879 i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5880 {
5881 	mdi_phci_t	*ph;
5882 
5883 	/*
5884 	 * Detach/Suspend of phci component failed. Update our state
5885 	 * too
5886 	 */
5887 	ph = i_devi_get_phci(dip);
5888 	if (ph == NULL) {
5889 		return;
5890 	}
5891 
5892 	MDI_PHCI_LOCK(ph);
5893 	/*
5894 	 * Detach of pHCI failed. Restore back converse
5895 	 * state
5896 	 */
5897 	switch (cmd) {
5898 	case DDI_DETACH:
5899 		MDI_DEBUG(2, (MDI_NOTE, dip,
5900 		    "pHCI post_detach: called %p",
5901 		    (void *)ph));
5902 		if (error != DDI_SUCCESS)
5903 			MDI_PHCI_SET_ATTACH(ph);
5904 		break;
5905 
5906 	case DDI_SUSPEND:
5907 		MDI_DEBUG(2, (MDI_NOTE, dip,
5908 		    "pHCI post_suspend: called %p",
5909 		    (void *)ph));
5910 		if (error != DDI_SUCCESS)
5911 			MDI_PHCI_SET_RESUME(ph);
5912 		break;
5913 	}
5914 	MDI_PHCI_UNLOCK(ph);
5915 }
5916 
5917 /*ARGSUSED*/
5918 static void
5919 i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5920 {
5921 	mdi_client_t	*ct;
5922 
5923 	ct = i_devi_get_client(dip);
5924 	if (ct == NULL) {
5925 		return;
5926 	}
5927 	MDI_CLIENT_LOCK(ct);
5928 	/*
5929 	 * Detach of Client failed. Restore back converse
5930 	 * state
5931 	 */
5932 	switch (cmd) {
5933 	case DDI_DETACH:
5934 		MDI_DEBUG(2, (MDI_NOTE, dip,
5935 		    "client post_detach: called %p", (void *)ct));
5936 		if (DEVI_IS_ATTACHING(ct->ct_dip)) {
5937 			MDI_DEBUG(4, (MDI_NOTE, dip,
5938 			    "i_mdi_pm_rele_client\n"));
5939 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
5940 		} else {
5941 			MDI_DEBUG(4, (MDI_NOTE, dip,
5942 			    "i_mdi_pm_reset_client\n"));
5943 			i_mdi_pm_reset_client(ct);
5944 		}
5945 		if (error != DDI_SUCCESS)
5946 			MDI_CLIENT_SET_ATTACH(ct);
5947 		break;
5948 
5949 	case DDI_SUSPEND:
5950 		MDI_DEBUG(2, (MDI_NOTE, dip,
5951 		    "called %p", (void *)ct));
5952 		if (error != DDI_SUCCESS)
5953 			MDI_CLIENT_SET_RESUME(ct);
5954 		break;
5955 	}
5956 	MDI_CLIENT_UNLOCK(ct);
5957 }
5958 
5959 int
5960 mdi_pi_kstat_exists(mdi_pathinfo_t *pip)
5961 {
5962 	return (MDI_PI(pip)->pi_kstats ? 1 : 0);
5963 }
5964 
5965 /*
5966  * create and install per-path (client - pHCI) statistics
5967  * I/O stats supported: nread, nwritten, reads, and writes
5968  * Error stats - hard errors, soft errors, & transport errors
5969  */
5970 int
5971 mdi_pi_kstat_create(mdi_pathinfo_t *pip, char *ksname)
5972 {
5973 	kstat_t			*kiosp, *kerrsp;
5974 	struct pi_errs		*nsp;
5975 	struct mdi_pi_kstats	*mdi_statp;
5976 
5977 	if (MDI_PI(pip)->pi_kstats != NULL)
5978 		return (MDI_SUCCESS);
5979 
5980 	if ((kiosp = kstat_create("mdi", 0, ksname, "iopath",
5981 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
5982 		return (MDI_FAILURE);
5983 	}
5984 
5985 	(void) strcat(ksname, ",err");
5986 	kerrsp = kstat_create("mdi", 0, ksname, "iopath_errors",
5987 	    KSTAT_TYPE_NAMED,
5988 	    sizeof (struct pi_errs) / sizeof (kstat_named_t), 0);
5989 	if (kerrsp == NULL) {
5990 		kstat_delete(kiosp);
5991 		return (MDI_FAILURE);
5992 	}
5993 
5994 	nsp = (struct pi_errs *)kerrsp->ks_data;
5995 	kstat_named_init(&nsp->pi_softerrs, "Soft Errors", KSTAT_DATA_UINT32);
5996 	kstat_named_init(&nsp->pi_harderrs, "Hard Errors", KSTAT_DATA_UINT32);
5997 	kstat_named_init(&nsp->pi_transerrs, "Transport Errors",
5998 	    KSTAT_DATA_UINT32);
5999 	kstat_named_init(&nsp->pi_icnt_busy, "Interconnect Busy",
6000 	    KSTAT_DATA_UINT32);
6001 	kstat_named_init(&nsp->pi_icnt_errors, "Interconnect Errors",
6002 	    KSTAT_DATA_UINT32);
6003 	kstat_named_init(&nsp->pi_phci_rsrc, "pHCI No Resources",
6004 	    KSTAT_DATA_UINT32);
6005 	kstat_named_init(&nsp->pi_phci_localerr, "pHCI Local Errors",
6006 	    KSTAT_DATA_UINT32);
6007 	kstat_named_init(&nsp->pi_phci_invstate, "pHCI Invalid State",
6008 	    KSTAT_DATA_UINT32);
6009 	kstat_named_init(&nsp->pi_failedfrom, "Failed From",
6010 	    KSTAT_DATA_UINT32);
6011 	kstat_named_init(&nsp->pi_failedto, "Failed To", KSTAT_DATA_UINT32);
6012 
6013 	mdi_statp = kmem_alloc(sizeof (*mdi_statp), KM_SLEEP);
6014 	mdi_statp->pi_kstat_ref = 1;
6015 	mdi_statp->pi_kstat_iostats = kiosp;
6016 	mdi_statp->pi_kstat_errstats = kerrsp;
6017 	kstat_install(kiosp);
6018 	kstat_install(kerrsp);
6019 	MDI_PI(pip)->pi_kstats = mdi_statp;
6020 	return (MDI_SUCCESS);
6021 }
6022 
6023 /*
6024  * destroy per-path properties
6025  */
6026 static void
6027 i_mdi_pi_kstat_destroy(mdi_pathinfo_t *pip)
6028 {
6029 
6030 	struct mdi_pi_kstats *mdi_statp;
6031 
6032 	if (MDI_PI(pip)->pi_kstats == NULL)
6033 		return;
6034 	if ((mdi_statp = MDI_PI(pip)->pi_kstats) == NULL)
6035 		return;
6036 
6037 	MDI_PI(pip)->pi_kstats = NULL;
6038 
6039 	/*
6040 	 * the kstat may be shared between multiple pathinfo nodes
6041 	 * decrement this pathinfo's usage, removing the kstats
6042 	 * themselves when the last pathinfo reference is removed.
6043 	 */
6044 	ASSERT(mdi_statp->pi_kstat_ref > 0);
6045 	if (--mdi_statp->pi_kstat_ref != 0)
6046 		return;
6047 
6048 	kstat_delete(mdi_statp->pi_kstat_iostats);
6049 	kstat_delete(mdi_statp->pi_kstat_errstats);
6050 	kmem_free(mdi_statp, sizeof (*mdi_statp));
6051 }
6052 
6053 /*
6054  * update I/O paths KSTATS
6055  */
6056 void
6057 mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp)
6058 {
6059 	kstat_t *iostatp;
6060 	size_t xfer_cnt;
6061 
6062 	ASSERT(pip != NULL);
6063 
6064 	/*
6065 	 * I/O can be driven across a path prior to having path
6066 	 * statistics available, i.e. probe(9e).
6067 	 */
6068 	if (bp != NULL && MDI_PI(pip)->pi_kstats != NULL) {
6069 		iostatp = MDI_PI(pip)->pi_kstats->pi_kstat_iostats;
6070 		xfer_cnt = bp->b_bcount - bp->b_resid;
6071 		if (bp->b_flags & B_READ) {
6072 			KSTAT_IO_PTR(iostatp)->reads++;
6073 			KSTAT_IO_PTR(iostatp)->nread += xfer_cnt;
6074 		} else {
6075 			KSTAT_IO_PTR(iostatp)->writes++;
6076 			KSTAT_IO_PTR(iostatp)->nwritten += xfer_cnt;
6077 		}
6078 	}
6079 }
6080 
6081 /*
6082  * Enable the path(specific client/target/initiator)
6083  * Enabling a path means that MPxIO may select the enabled path for routing
6084  * future I/O requests, subject to other path state constraints.
6085  */
6086 int
6087 mdi_pi_enable_path(mdi_pathinfo_t *pip, int flags)
6088 {
6089 	mdi_phci_t	*ph;
6090 
6091 	ph = MDI_PI(pip)->pi_phci;
6092 	if (ph == NULL) {
6093 		MDI_DEBUG(1, (MDI_NOTE, mdi_pi_get_phci(pip),
6094 		    "!failed: path %s %p: NULL ph",
6095 		    mdi_pi_spathname(pip), (void *)pip));
6096 		return (MDI_FAILURE);
6097 	}
6098 
6099 	(void) i_mdi_enable_disable_path(pip, ph->ph_vhci, flags,
6100 		MDI_ENABLE_OP);
6101 	MDI_DEBUG(5, (MDI_NOTE, ph->ph_dip,
6102 	    "!returning success pip = %p. ph = %p",
6103 	    (void *)pip, (void *)ph));
6104 	return (MDI_SUCCESS);
6105 
6106 }
6107 
6108 /*
6109  * Disable the path (specific client/target/initiator)
6110  * Disabling a path means that MPxIO will not select the disabled path for
6111  * routing any new I/O requests.
6112  */
6113 int
6114 mdi_pi_disable_path(mdi_pathinfo_t *pip, int flags)
6115 {
6116 	mdi_phci_t	*ph;
6117 
6118 	ph = MDI_PI(pip)->pi_phci;
6119 	if (ph == NULL) {
6120 		MDI_DEBUG(1, (MDI_NOTE, mdi_pi_get_phci(pip),
6121 		    "!failed: path %s %p: NULL ph",
6122 		    mdi_pi_spathname(pip), (void *)pip));
6123 		return (MDI_FAILURE);
6124 	}
6125 
6126 	(void) i_mdi_enable_disable_path(pip,
6127 	    ph->ph_vhci, flags, MDI_DISABLE_OP);
6128 	MDI_DEBUG(5, (MDI_NOTE, ph->ph_dip,
6129 	    "!returning success pip = %p. ph = %p",
6130 	    (void *)pip, (void *)ph));
6131 	return (MDI_SUCCESS);
6132 }
6133 
/*
 * mdi_pi_disable():
 *		Disable the path to a particular pHCI (identified by pdip)
 *		for a particular client (identified by cdip).
 *		Disabling a path means that MPxIO will not select the
 *		disabled path for routing any new I/O requests.
 *
 *		Thin wrapper: forwards to i_mdi_pi_enable_disable() with
 *		MDI_DISABLE_OP and returns its result.
 *
 * NOTE: this will be removed once the NWS files are changed to use the new
 * mdi_{enable,disable}_path interfaces
 */
int
mdi_pi_disable(dev_info_t *cdip, dev_info_t *pdip, int flags)
{
	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_DISABLE_OP));
}
6147 
/*
 * mdi_pi_enable():
 *		Enable the path to a particular pHCI (identified by pdip)
 *		for a particular client (identified by cdip).
 *		Enabling a path means that MPxIO may select the enabled path
 *		for routing future I/O requests, subject to other path state
 *		constraints.
 *
 *		Thin wrapper: forwards to i_mdi_pi_enable_disable() with
 *		MDI_ENABLE_OP and returns its result.
 *
 * NOTE: this will be removed once the NWS files are changed to use the new
 * mdi_{enable,disable}_path interfaces
 */

int
mdi_pi_enable(dev_info_t *cdip, dev_info_t *pdip, int flags)
{
	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_ENABLE_OP));
}
6162 
6163 /*
6164  * Common routine for doing enable/disable.
6165  */
6166 static mdi_pathinfo_t *
6167 i_mdi_enable_disable_path(mdi_pathinfo_t *pip, mdi_vhci_t *vh, int flags,
6168 		int op)
6169 {
6170 	int		sync_flag = 0;
6171 	int		rv;
6172 	mdi_pathinfo_t 	*next;
6173 	int		(*f)() = NULL;
6174 
6175 	/*
6176 	 * Check to make sure the path is not already in the
6177 	 * requested state. If it is just return the next path
6178 	 * as we have nothing to do here.
6179 	 */
6180 	if ((MDI_PI_IS_DISABLE(pip) && op == MDI_DISABLE_OP) ||
6181 	    (!MDI_PI_IS_DISABLE(pip) && op == MDI_ENABLE_OP)) {
6182 		MDI_PI_LOCK(pip);
6183 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
6184 		MDI_PI_UNLOCK(pip);
6185 		return (next);
6186 	}
6187 
6188 	f = vh->vh_ops->vo_pi_state_change;
6189 
6190 	sync_flag = (flags << 8) & 0xf00;
6191 
6192 	/*
6193 	 * Do a callback into the mdi consumer to let it
6194 	 * know that path is about to get enabled/disabled.
6195 	 */
6196 	if (f != NULL) {
6197 		rv = (*f)(vh->vh_dip, pip, 0,
6198 			MDI_PI_EXT_STATE(pip),
6199 			MDI_EXT_STATE_CHANGE | sync_flag |
6200 			op | MDI_BEFORE_STATE_CHANGE);
6201 		if (rv != MDI_SUCCESS) {
6202 			MDI_DEBUG(2, (MDI_WARN, vh->vh_dip,
6203 			    "vo_pi_state_change: failed rv = %x", rv));
6204 		}
6205 	}
6206 	MDI_PI_LOCK(pip);
6207 	next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
6208 
6209 	switch (flags) {
6210 		case USER_DISABLE:
6211 			if (op == MDI_DISABLE_OP) {
6212 				MDI_PI_SET_USER_DISABLE(pip);
6213 			} else {
6214 				MDI_PI_SET_USER_ENABLE(pip);
6215 			}
6216 			break;
6217 		case DRIVER_DISABLE:
6218 			if (op == MDI_DISABLE_OP) {
6219 				MDI_PI_SET_DRV_DISABLE(pip);
6220 			} else {
6221 				MDI_PI_SET_DRV_ENABLE(pip);
6222 			}
6223 			break;
6224 		case DRIVER_DISABLE_TRANSIENT:
6225 			if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS) {
6226 				MDI_PI_SET_DRV_DISABLE_TRANS(pip);
6227 			} else {
6228 				MDI_PI_SET_DRV_ENABLE_TRANS(pip);
6229 			}
6230 			break;
6231 	}
6232 	MDI_PI_UNLOCK(pip);
6233 	/*
6234 	 * Do a callback into the mdi consumer to let it
6235 	 * know that path is now enabled/disabled.
6236 	 */
6237 	if (f != NULL) {
6238 		rv = (*f)(vh->vh_dip, pip, 0,
6239 			MDI_PI_EXT_STATE(pip),
6240 			MDI_EXT_STATE_CHANGE | sync_flag |
6241 			op | MDI_AFTER_STATE_CHANGE);
6242 		if (rv != MDI_SUCCESS) {
6243 			MDI_DEBUG(2, (MDI_WARN, vh->vh_dip,
6244 			    "vo_pi_state_change failed: rv = %x", rv));
6245 		}
6246 	}
6247 	return (next);
6248 }
6249 
6250 /*
6251  * Common routine for doing enable/disable.
6252  * NOTE: this will be removed once the NWS files are changed to use the new
6253  * mdi_{enable,disable}_path has been putback
6254  */
6255 int
6256 i_mdi_pi_enable_disable(dev_info_t *cdip, dev_info_t *pdip, int flags, int op)
6257 {
6258 
6259 	mdi_phci_t	*ph;
6260 	mdi_vhci_t	*vh = NULL;
6261 	mdi_client_t	*ct;
6262 	mdi_pathinfo_t	*next, *pip;
6263 	int		found_it;
6264 
6265 	ph = i_devi_get_phci(pdip);
6266 	MDI_DEBUG(5, (MDI_NOTE, cdip ? cdip : pdip,
6267 	    "!op = %d pdip = %p cdip = %p", op, (void *)pdip,
6268 	    (void *)cdip));
6269 	if (ph == NULL) {
6270 		MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6271 		    "!failed: operation %d: NULL ph", op));
6272 		return (MDI_FAILURE);
6273 	}
6274 
6275 	if ((op != MDI_ENABLE_OP) && (op != MDI_DISABLE_OP)) {
6276 		MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6277 		    "!failed: invalid operation %d", op));
6278 		return (MDI_FAILURE);
6279 	}
6280 
6281 	vh = ph->ph_vhci;
6282 
6283 	if (cdip == NULL) {
6284 		/*
6285 		 * Need to mark the Phci as enabled/disabled.
6286 		 */
6287 		MDI_DEBUG(4, (MDI_NOTE, cdip ? cdip : pdip,
6288 		    "op %d for the phci", op));
6289 		MDI_PHCI_LOCK(ph);
6290 		switch (flags) {
6291 			case USER_DISABLE:
6292 				if (op == MDI_DISABLE_OP) {
6293 					MDI_PHCI_SET_USER_DISABLE(ph);
6294 				} else {
6295 					MDI_PHCI_SET_USER_ENABLE(ph);
6296 				}
6297 				break;
6298 			case DRIVER_DISABLE:
6299 				if (op == MDI_DISABLE_OP) {
6300 					MDI_PHCI_SET_DRV_DISABLE(ph);
6301 				} else {
6302 					MDI_PHCI_SET_DRV_ENABLE(ph);
6303 				}
6304 				break;
6305 			case DRIVER_DISABLE_TRANSIENT:
6306 				if (op == MDI_DISABLE_OP) {
6307 					MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph);
6308 				} else {
6309 					MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph);
6310 				}
6311 				break;
6312 			default:
6313 				MDI_PHCI_UNLOCK(ph);
6314 				MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6315 				    "!invalid flag argument= %d", flags));
6316 		}
6317 
6318 		/*
6319 		 * Phci has been disabled. Now try to enable/disable
6320 		 * path info's to each client.
6321 		 */
6322 		pip = ph->ph_path_head;
6323 		while (pip != NULL) {
6324 			pip = i_mdi_enable_disable_path(pip, vh, flags, op);
6325 		}
6326 		MDI_PHCI_UNLOCK(ph);
6327 	} else {
6328 
6329 		/*
6330 		 * Disable a specific client.
6331 		 */
6332 		ct = i_devi_get_client(cdip);
6333 		if (ct == NULL) {
6334 			MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6335 			    "!failed: operation = %d: NULL ct", op));
6336 			return (MDI_FAILURE);
6337 		}
6338 
6339 		MDI_CLIENT_LOCK(ct);
6340 		pip = ct->ct_path_head;
6341 		found_it = 0;
6342 		while (pip != NULL) {
6343 			MDI_PI_LOCK(pip);
6344 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6345 			if (MDI_PI(pip)->pi_phci == ph) {
6346 				MDI_PI_UNLOCK(pip);
6347 				found_it = 1;
6348 				break;
6349 			}
6350 			MDI_PI_UNLOCK(pip);
6351 			pip = next;
6352 		}
6353 
6354 
6355 		MDI_CLIENT_UNLOCK(ct);
6356 		if (found_it == 0) {
6357 			MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6358 			    "!failed. Could not find corresponding pip\n"));
6359 			return (MDI_FAILURE);
6360 		}
6361 
6362 		(void) i_mdi_enable_disable_path(pip, vh, flags, op);
6363 	}
6364 
6365 	MDI_DEBUG(5, (MDI_NOTE, cdip ? cdip : pdip,
6366 	    "!op %d returning success pdip = %p cdip = %p",
6367 	    op, (void *)pdip, (void *)cdip));
6368 	return (MDI_SUCCESS);
6369 }
6370 
/*
 * i_mdi_pm_hold_pip():
 *		Ensure phci powered up.
 *
 *		Places a power hold (pm_hold_power()) on the pHCI of pip
 *		unless this path already holds one (pi_pm_held).  Must be
 *		called with the path lock held; the lock is dropped around
 *		the pm_hold_power() call and reacquired before returning.
 */
static void
i_mdi_pm_hold_pip(mdi_pathinfo_t *pip)
{
	dev_info_t	*ph_dip;

	ASSERT(pip != NULL);
	ASSERT(MDI_PI_LOCKED(pip));

	/* this path already accounts for a hold on its pHCI */
	if (MDI_PI(pip)->pi_pm_held) {
		return;
	}

	ph_dip = mdi_pi_get_phci(pip);
	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
	    "%s %p", mdi_pi_spathname(pip), (void *)pip));
	if (ph_dip == NULL) {
		return;
	}

	/* path lock is not held across the call into the PM framework */
	MDI_PI_UNLOCK(pip);
	MDI_DEBUG(4, (MDI_NOTE, ph_dip, "kidsupcnt was %d",
	    DEVI(ph_dip)->devi_pm_kidsupcnt));
	pm_hold_power(ph_dip);
	MDI_DEBUG(4, (MDI_NOTE, ph_dip, "kidsupcnt is %d",
	    DEVI(ph_dip)->devi_pm_kidsupcnt));
	MDI_PI_LOCK(pip);

	/* If PM_GET_PM_INFO is NULL the pm_hold_power above was a noop */
	if (DEVI(ph_dip)->devi_pm_info)
		MDI_PI(pip)->pi_pm_held = 1;
}
6405 
/*
 * i_mdi_pm_rele_pip():
 *		Allow phci powered down.
 *
 *		Releases the power hold (pm_rele_power()) that this path
 *		placed on its pHCI, if any, and clears pi_pm_held.  Must be
 *		called with the path lock held; the lock is dropped around
 *		the pm_rele_power() call and reacquired before returning.
 */
static void
i_mdi_pm_rele_pip(mdi_pathinfo_t *pip)
{
	dev_info_t	*ph_dip = NULL;

	ASSERT(pip != NULL);
	ASSERT(MDI_PI_LOCKED(pip));

	/* nothing to release if this path never took a hold */
	if (MDI_PI(pip)->pi_pm_held == 0) {
		return;
	}

	ph_dip = mdi_pi_get_phci(pip);
	ASSERT(ph_dip != NULL);

	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
	    "%s %p", mdi_pi_spathname(pip), (void *)pip));

	/* path lock is not held across the call into the PM framework */
	MDI_PI_UNLOCK(pip);
	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
	    "kidsupcnt was %d", DEVI(ph_dip)->devi_pm_kidsupcnt));
	pm_rele_power(ph_dip);
	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
	    "kidsupcnt is %d", DEVI(ph_dip)->devi_pm_kidsupcnt));
	MDI_PI_LOCK(pip);

	MDI_PI(pip)->pi_pm_held = 0;
}
6437 
6438 static void
6439 i_mdi_pm_hold_client(mdi_client_t *ct, int incr)
6440 {
6441 	ASSERT(MDI_CLIENT_LOCKED(ct));
6442 
6443 	ct->ct_power_cnt += incr;
6444 	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6445 	    "%p ct_power_cnt = %d incr = %d",
6446 	    (void *)ct, ct->ct_power_cnt, incr));
6447 	ASSERT(ct->ct_power_cnt >= 0);
6448 }
6449 
6450 static void
6451 i_mdi_rele_all_phci(mdi_client_t *ct)
6452 {
6453 	mdi_pathinfo_t  *pip;
6454 
6455 	ASSERT(MDI_CLIENT_LOCKED(ct));
6456 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
6457 	while (pip != NULL) {
6458 		mdi_hold_path(pip);
6459 		MDI_PI_LOCK(pip);
6460 		i_mdi_pm_rele_pip(pip);
6461 		MDI_PI_UNLOCK(pip);
6462 		mdi_rele_path(pip);
6463 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6464 	}
6465 }
6466 
6467 static void
6468 i_mdi_pm_rele_client(mdi_client_t *ct, int decr)
6469 {
6470 	ASSERT(MDI_CLIENT_LOCKED(ct));
6471 
6472 	if (i_ddi_devi_attached(ct->ct_dip)) {
6473 		ct->ct_power_cnt -= decr;
6474 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6475 		    "%p ct_power_cnt = %d decr = %d",
6476 		    (void *)ct, ct->ct_power_cnt, decr));
6477 	}
6478 
6479 	ASSERT(ct->ct_power_cnt >= 0);
6480 	if (ct->ct_power_cnt == 0) {
6481 		i_mdi_rele_all_phci(ct);
6482 		return;
6483 	}
6484 }
6485 
6486 static void
6487 i_mdi_pm_reset_client(mdi_client_t *ct)
6488 {
6489 	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6490 	    "%p ct_power_cnt = %d", (void *)ct, ct->ct_power_cnt));
6491 	ASSERT(MDI_CLIENT_LOCKED(ct));
6492 	ct->ct_power_cnt = 0;
6493 	i_mdi_rele_all_phci(ct);
6494 	ct->ct_powercnt_config = 0;
6495 	ct->ct_powercnt_unconfig = 0;
6496 	ct->ct_powercnt_reset = 1;
6497 }
6498 
6499 static int
6500 i_mdi_power_one_phci(mdi_pathinfo_t *pip)
6501 {
6502 	int		ret;
6503 	dev_info_t	*ph_dip;
6504 
6505 	MDI_PI_LOCK(pip);
6506 	i_mdi_pm_hold_pip(pip);
6507 
6508 	ph_dip = mdi_pi_get_phci(pip);
6509 	MDI_PI_UNLOCK(pip);
6510 
6511 	/* bring all components of phci to full power */
6512 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6513 	    "pm_powerup for %s%d %p", ddi_driver_name(ph_dip),
6514 	    ddi_get_instance(ph_dip), (void *)pip));
6515 
6516 	ret = pm_powerup(ph_dip);
6517 
6518 	if (ret == DDI_FAILURE) {
6519 		MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6520 		    "pm_powerup FAILED for %s%d %p",
6521 		    ddi_driver_name(ph_dip), ddi_get_instance(ph_dip),
6522 		    (void *)pip));
6523 
6524 		MDI_PI_LOCK(pip);
6525 		i_mdi_pm_rele_pip(pip);
6526 		MDI_PI_UNLOCK(pip);
6527 		return (MDI_FAILURE);
6528 	}
6529 
6530 	return (MDI_SUCCESS);
6531 }
6532 
6533 static int
6534 i_mdi_power_all_phci(mdi_client_t *ct)
6535 {
6536 	mdi_pathinfo_t  *pip;
6537 	int		succeeded = 0;
6538 
6539 	ASSERT(MDI_CLIENT_LOCKED(ct));
6540 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
6541 	while (pip != NULL) {
6542 		/*
6543 		 * Don't power if MDI_PATHINFO_STATE_FAULT
6544 		 * or MDI_PATHINFO_STATE_OFFLINE.
6545 		 */
6546 		if (MDI_PI_IS_INIT(pip) ||
6547 		    MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip)) {
6548 			mdi_hold_path(pip);
6549 			MDI_CLIENT_UNLOCK(ct);
6550 			if (i_mdi_power_one_phci(pip) == MDI_SUCCESS)
6551 				succeeded = 1;
6552 
6553 			ASSERT(ct == MDI_PI(pip)->pi_client);
6554 			MDI_CLIENT_LOCK(ct);
6555 			mdi_rele_path(pip);
6556 		}
6557 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6558 	}
6559 
6560 	return (succeeded ? MDI_SUCCESS : MDI_FAILURE);
6561 }
6562 
/*
 * mdi_bus_power():
 *		1. Place the phci(s) into powered up state so that
 *		   client can do power management
 *		2. Ensure phci powered up as client power managing
 *
 *		Only BUS_POWER_PRE_NOTIFICATION, BUS_POWER_POST_NOTIFICATION
 *		and BUS_POWER_HAS_CHANGED are handled here.
 *		BUS_POWER_NOINVOL is rejected, and every other op is passed
 *		straight to pm_busop_bus_power().
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		For ops that are passed through, the return value of
 *		pm_busop_bus_power().
 */
int
mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
    void *arg, void *result)
{
	int			ret = MDI_SUCCESS;
	pm_bp_child_pwrchg_t	*bpc;
	mdi_client_t		*ct;
	dev_info_t		*cdip;
	pm_bp_has_changed_t	*bphc;

	/*
	 * BUS_POWER_NOINVOL not supported
	 */
	if (op == BUS_POWER_NOINVOL)
		return (MDI_FAILURE);

	/*
	 * ignore other OPs.
	 * return quickly to save cpu cycles on the ct processing
	 */
	switch (op) {
	case BUS_POWER_PRE_NOTIFICATION:
	case BUS_POWER_POST_NOTIFICATION:
		bpc = (pm_bp_child_pwrchg_t *)arg;
		cdip = bpc->bpc_dip;
		break;
	case BUS_POWER_HAS_CHANGED:
		bphc = (pm_bp_has_changed_t *)arg;
		cdip = bphc->bphc_dip;
		break;
	default:
		return (pm_busop_bus_power(parent, impl_arg, op, arg, result));
	}

	ASSERT(MDI_CLIENT(cdip));

	ct = i_devi_get_client(cdip);
	if (ct == NULL)
		return (MDI_FAILURE);

	/*
	 * wait till the mdi_pathinfo node state change are processed
	 */
	MDI_CLIENT_LOCK(ct);
	switch (op) {
	case BUS_POWER_PRE_NOTIFICATION:
		MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
		    "BUS_POWER_PRE_NOTIFICATION:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d",
		    ddi_node_name(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp));

		/* serialize power level change per client */
		while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
			cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

		/* cleared again in the POST_NOTIFICATION case below */
		MDI_CLIENT_SET_POWER_TRANSITION(ct);

		/* no holds on the client yet: power up its pHCIs first */
		if (ct->ct_power_cnt == 0) {
			ret = i_mdi_power_all_phci(ct);
		}

		/*
		 * if new_level > 0:
		 *	- hold phci(s)
		 *	- power up phci(s) if not already
		 * ignore power down
		 */
		if (bpc->bpc_nlevel > 0) {
			if (!DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
				    "i_mdi_pm_hold_client\n"));
				i_mdi_pm_hold_client(ct, ct->ct_path_count);
			}
		}
		break;
	case BUS_POWER_POST_NOTIFICATION:
		MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
		    "BUS_POWER_POST_NOTIFICATION:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d result=%d",
		    ddi_node_name(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp,
		    *(int *)result));

		/* record the new power state only if the change succeeded */
		if (*(int *)result == DDI_SUCCESS) {
			if (bpc->bpc_nlevel > 0) {
				MDI_CLIENT_SET_POWER_UP(ct);
			} else {
				MDI_CLIENT_SET_POWER_DOWN(ct);
			}
		}

		/* release the hold we did in pre-notification */
		if (bpc->bpc_nlevel > 0 && (*(int *)result != DDI_SUCCESS) &&
		    !DEVI_IS_ATTACHING(ct->ct_dip)) {
			MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
			    "i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		}

		/* successful power down */
		if (bpc->bpc_nlevel == 0 && (*(int *)result == DDI_SUCCESS)) {
			/* another thread might started attaching */
			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
				    "i_mdi_pm_rele_client\n"));
				i_mdi_pm_rele_client(ct, ct->ct_path_count);
			/* detaching has been taken care in pm_post_unconfig */
			} else if (!DEVI_IS_DETACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
				    "i_mdi_pm_reset_client\n"));
				i_mdi_pm_reset_client(ct);
			}
		}

		/* let the next waiter in PRE_NOTIFICATION proceed */
		MDI_CLIENT_CLEAR_POWER_TRANSITION(ct);
		cv_broadcast(&ct->ct_powerchange_cv);

		break;

	/* need to do more */
	case BUS_POWER_HAS_CHANGED:
		MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
		    "BUS_POWER_HAS_CHANGED:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d",
		    ddi_node_name(bphc->bphc_dip), PM_ADDR(bphc->bphc_dip),
		    bphc->bphc_olevel, bphc->bphc_nlevel, bphc->bphc_comp));

		/* level was raised: power up pHCIs if needed and take holds */
		if (bphc->bphc_nlevel > 0 &&
		    bphc->bphc_nlevel > bphc->bphc_olevel) {
			if (ct->ct_power_cnt == 0) {
				ret = i_mdi_power_all_phci(ct);
			}
			MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
			    "i_mdi_pm_hold_client\n"));
			i_mdi_pm_hold_client(ct, ct->ct_path_count);
		}

		/* dropped to level 0 from a level other than -1: release */
		if (bphc->bphc_nlevel == 0 && bphc->bphc_olevel != -1) {
			MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
			    "i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		}
		break;
	}

	MDI_CLIENT_UNLOCK(ct);
	return (ret);
}
6720 
6721 static int
6722 i_mdi_pm_pre_config_one(dev_info_t *child)
6723 {
6724 	int		ret = MDI_SUCCESS;
6725 	mdi_client_t	*ct;
6726 
6727 	ct = i_devi_get_client(child);
6728 	if (ct == NULL)
6729 		return (MDI_FAILURE);
6730 
6731 	MDI_CLIENT_LOCK(ct);
6732 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6733 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6734 
6735 	if (!MDI_CLIENT_IS_FAILED(ct)) {
6736 		MDI_CLIENT_UNLOCK(ct);
6737 		MDI_DEBUG(4, (MDI_NOTE, child, "already configured\n"));
6738 		return (MDI_SUCCESS);
6739 	}
6740 
6741 	if (ct->ct_powercnt_config) {
6742 		MDI_CLIENT_UNLOCK(ct);
6743 		MDI_DEBUG(4, (MDI_NOTE, child, "already held\n"));
6744 		return (MDI_SUCCESS);
6745 	}
6746 
6747 	if (ct->ct_power_cnt == 0) {
6748 		ret = i_mdi_power_all_phci(ct);
6749 	}
6750 	MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_hold_client\n"));
6751 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6752 	ct->ct_powercnt_config = 1;
6753 	ct->ct_powercnt_reset = 0;
6754 	MDI_CLIENT_UNLOCK(ct);
6755 	return (ret);
6756 }
6757 
6758 static int
6759 i_mdi_pm_pre_config(dev_info_t *vdip, dev_info_t *child)
6760 {
6761 	int			ret = MDI_SUCCESS;
6762 	dev_info_t		*cdip;
6763 	int			circ;
6764 
6765 	ASSERT(MDI_VHCI(vdip));
6766 
6767 	/* ndi_devi_config_one */
6768 	if (child) {
6769 		ASSERT(DEVI_BUSY_OWNED(vdip));
6770 		return (i_mdi_pm_pre_config_one(child));
6771 	}
6772 
6773 	/* devi_config_common */
6774 	ndi_devi_enter(vdip, &circ);
6775 	cdip = ddi_get_child(vdip);
6776 	while (cdip) {
6777 		dev_info_t *next = ddi_get_next_sibling(cdip);
6778 
6779 		ret = i_mdi_pm_pre_config_one(cdip);
6780 		if (ret != MDI_SUCCESS)
6781 			break;
6782 		cdip = next;
6783 	}
6784 	ndi_devi_exit(vdip, circ);
6785 	return (ret);
6786 }
6787 
6788 static int
6789 i_mdi_pm_pre_unconfig_one(dev_info_t *child, int *held, int flags)
6790 {
6791 	int		ret = MDI_SUCCESS;
6792 	mdi_client_t	*ct;
6793 
6794 	ct = i_devi_get_client(child);
6795 	if (ct == NULL)
6796 		return (MDI_FAILURE);
6797 
6798 	MDI_CLIENT_LOCK(ct);
6799 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6800 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6801 
6802 	if (!i_ddi_devi_attached(ct->ct_dip)) {
6803 		MDI_DEBUG(4, (MDI_NOTE, child, "node detached already\n"));
6804 		MDI_CLIENT_UNLOCK(ct);
6805 		return (MDI_SUCCESS);
6806 	}
6807 
6808 	if (MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6809 	    (flags & NDI_AUTODETACH)) {
6810 		MDI_DEBUG(4, (MDI_NOTE, child, "auto-modunload\n"));
6811 		MDI_CLIENT_UNLOCK(ct);
6812 		return (MDI_FAILURE);
6813 	}
6814 
6815 	if (ct->ct_powercnt_unconfig) {
6816 		MDI_DEBUG(4, (MDI_NOTE, child, "ct_powercnt_held\n"));
6817 		MDI_CLIENT_UNLOCK(ct);
6818 		*held = 1;
6819 		return (MDI_SUCCESS);
6820 	}
6821 
6822 	if (ct->ct_power_cnt == 0) {
6823 		ret = i_mdi_power_all_phci(ct);
6824 	}
6825 	MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_hold_client\n"));
6826 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6827 	ct->ct_powercnt_unconfig = 1;
6828 	ct->ct_powercnt_reset = 0;
6829 	MDI_CLIENT_UNLOCK(ct);
6830 	if (ret == MDI_SUCCESS)
6831 		*held = 1;
6832 	return (ret);
6833 }
6834 
6835 static int
6836 i_mdi_pm_pre_unconfig(dev_info_t *vdip, dev_info_t *child, int *held,
6837     int flags)
6838 {
6839 	int			ret = MDI_SUCCESS;
6840 	dev_info_t		*cdip;
6841 	int			circ;
6842 
6843 	ASSERT(MDI_VHCI(vdip));
6844 	*held = 0;
6845 
6846 	/* ndi_devi_unconfig_one */
6847 	if (child) {
6848 		ASSERT(DEVI_BUSY_OWNED(vdip));
6849 		return (i_mdi_pm_pre_unconfig_one(child, held, flags));
6850 	}
6851 
6852 	/* devi_unconfig_common */
6853 	ndi_devi_enter(vdip, &circ);
6854 	cdip = ddi_get_child(vdip);
6855 	while (cdip) {
6856 		dev_info_t *next = ddi_get_next_sibling(cdip);
6857 
6858 		ret = i_mdi_pm_pre_unconfig_one(cdip, held, flags);
6859 		cdip = next;
6860 	}
6861 	ndi_devi_exit(vdip, circ);
6862 
6863 	if (*held)
6864 		ret = MDI_SUCCESS;
6865 
6866 	return (ret);
6867 }
6868 
6869 static void
6870 i_mdi_pm_post_config_one(dev_info_t *child)
6871 {
6872 	mdi_client_t	*ct;
6873 
6874 	ct = i_devi_get_client(child);
6875 	if (ct == NULL)
6876 		return;
6877 
6878 	MDI_CLIENT_LOCK(ct);
6879 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6880 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6881 
6882 	if (ct->ct_powercnt_reset || !ct->ct_powercnt_config) {
6883 		MDI_DEBUG(4, (MDI_NOTE, child, "not configured\n"));
6884 		MDI_CLIENT_UNLOCK(ct);
6885 		return;
6886 	}
6887 
6888 	/* client has not been updated */
6889 	if (MDI_CLIENT_IS_FAILED(ct)) {
6890 		MDI_DEBUG(4, (MDI_NOTE, child, "client failed\n"));
6891 		MDI_CLIENT_UNLOCK(ct);
6892 		return;
6893 	}
6894 
6895 	/* another thread might have powered it down or detached it */
6896 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6897 	    !DEVI_IS_ATTACHING(ct->ct_dip)) ||
6898 	    (!i_ddi_devi_attached(ct->ct_dip) &&
6899 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6900 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_reset_client\n"));
6901 		i_mdi_pm_reset_client(ct);
6902 	} else {
6903 		mdi_pathinfo_t  *pip, *next;
6904 		int	valid_path_count = 0;
6905 
6906 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_rele_client\n"));
6907 		pip = ct->ct_path_head;
6908 		while (pip != NULL) {
6909 			MDI_PI_LOCK(pip);
6910 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6911 			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
6912 				valid_path_count ++;
6913 			MDI_PI_UNLOCK(pip);
6914 			pip = next;
6915 		}
6916 		i_mdi_pm_rele_client(ct, valid_path_count);
6917 	}
6918 	ct->ct_powercnt_config = 0;
6919 	MDI_CLIENT_UNLOCK(ct);
6920 }
6921 
6922 static void
6923 i_mdi_pm_post_config(dev_info_t *vdip, dev_info_t *child)
6924 {
6925 	int		circ;
6926 	dev_info_t	*cdip;
6927 
6928 	ASSERT(MDI_VHCI(vdip));
6929 
6930 	/* ndi_devi_config_one */
6931 	if (child) {
6932 		ASSERT(DEVI_BUSY_OWNED(vdip));
6933 		i_mdi_pm_post_config_one(child);
6934 		return;
6935 	}
6936 
6937 	/* devi_config_common */
6938 	ndi_devi_enter(vdip, &circ);
6939 	cdip = ddi_get_child(vdip);
6940 	while (cdip) {
6941 		dev_info_t *next = ddi_get_next_sibling(cdip);
6942 
6943 		i_mdi_pm_post_config_one(cdip);
6944 		cdip = next;
6945 	}
6946 	ndi_devi_exit(vdip, circ);
6947 }
6948 
6949 static void
6950 i_mdi_pm_post_unconfig_one(dev_info_t *child)
6951 {
6952 	mdi_client_t	*ct;
6953 
6954 	ct = i_devi_get_client(child);
6955 	if (ct == NULL)
6956 		return;
6957 
6958 	MDI_CLIENT_LOCK(ct);
6959 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6960 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6961 
6962 	if (!ct->ct_powercnt_unconfig || ct->ct_powercnt_reset) {
6963 		MDI_DEBUG(4, (MDI_NOTE, child, "not held\n"));
6964 		MDI_CLIENT_UNLOCK(ct);
6965 		return;
6966 	}
6967 
6968 	/* failure detaching or another thread just attached it */
6969 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6970 	    i_ddi_devi_attached(ct->ct_dip)) ||
6971 	    (!i_ddi_devi_attached(ct->ct_dip) &&
6972 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6973 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_reset_client\n"));
6974 		i_mdi_pm_reset_client(ct);
6975 	} else {
6976 		mdi_pathinfo_t  *pip, *next;
6977 		int	valid_path_count = 0;
6978 
6979 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_rele_client\n"));
6980 		pip = ct->ct_path_head;
6981 		while (pip != NULL) {
6982 			MDI_PI_LOCK(pip);
6983 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6984 			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
6985 				valid_path_count ++;
6986 			MDI_PI_UNLOCK(pip);
6987 			pip = next;
6988 		}
6989 		i_mdi_pm_rele_client(ct, valid_path_count);
6990 		ct->ct_powercnt_unconfig = 0;
6991 	}
6992 
6993 	MDI_CLIENT_UNLOCK(ct);
6994 }
6995 
6996 static void
6997 i_mdi_pm_post_unconfig(dev_info_t *vdip, dev_info_t *child, int held)
6998 {
6999 	int			circ;
7000 	dev_info_t		*cdip;
7001 
7002 	ASSERT(MDI_VHCI(vdip));
7003 
7004 	if (!held) {
7005 		MDI_DEBUG(4, (MDI_NOTE, vdip, "held = %d", held));
7006 		return;
7007 	}
7008 
7009 	if (child) {
7010 		ASSERT(DEVI_BUSY_OWNED(vdip));
7011 		i_mdi_pm_post_unconfig_one(child);
7012 		return;
7013 	}
7014 
7015 	ndi_devi_enter(vdip, &circ);
7016 	cdip = ddi_get_child(vdip);
7017 	while (cdip) {
7018 		dev_info_t *next = ddi_get_next_sibling(cdip);
7019 
7020 		i_mdi_pm_post_unconfig_one(cdip);
7021 		cdip = next;
7022 	}
7023 	ndi_devi_exit(vdip, circ);
7024 }
7025 
7026 int
7027 mdi_power(dev_info_t *vdip, mdi_pm_op_t op, void *args, char *devnm, int flags)
7028 {
7029 	int			circ, ret = MDI_SUCCESS;
7030 	dev_info_t		*client_dip = NULL;
7031 	mdi_client_t		*ct;
7032 
7033 	/*
7034 	 * Handling ndi_devi_config_one and ndi_devi_unconfig_one.
7035 	 * Power up pHCI for the named client device.
7036 	 * Note: Before the client is enumerated under vhci by phci,
7037 	 * client_dip can be NULL. Then proceed to power up all the
7038 	 * pHCIs.
7039 	 */
7040 	if (devnm != NULL) {
7041 		ndi_devi_enter(vdip, &circ);
7042 		client_dip = ndi_devi_findchild(vdip, devnm);
7043 	}
7044 
7045 	MDI_DEBUG(4, (MDI_NOTE, vdip,
7046 	    "op = %d %s %p", op, devnm ? devnm : "", (void *)client_dip));
7047 
7048 	switch (op) {
7049 	case MDI_PM_PRE_CONFIG:
7050 		ret = i_mdi_pm_pre_config(vdip, client_dip);
7051 		break;
7052 
7053 	case MDI_PM_PRE_UNCONFIG:
7054 		ret = i_mdi_pm_pre_unconfig(vdip, client_dip, (int *)args,
7055 		    flags);
7056 		break;
7057 
7058 	case MDI_PM_POST_CONFIG:
7059 		i_mdi_pm_post_config(vdip, client_dip);
7060 		break;
7061 
7062 	case MDI_PM_POST_UNCONFIG:
7063 		i_mdi_pm_post_unconfig(vdip, client_dip, *(int *)args);
7064 		break;
7065 
7066 	case MDI_PM_HOLD_POWER:
7067 	case MDI_PM_RELE_POWER:
7068 		ASSERT(args);
7069 
7070 		client_dip = (dev_info_t *)args;
7071 		ASSERT(MDI_CLIENT(client_dip));
7072 
7073 		ct = i_devi_get_client(client_dip);
7074 		MDI_CLIENT_LOCK(ct);
7075 
7076 		if (op == MDI_PM_HOLD_POWER) {
7077 			if (ct->ct_power_cnt == 0) {
7078 				(void) i_mdi_power_all_phci(ct);
7079 				MDI_DEBUG(4, (MDI_NOTE, client_dip,
7080 				    "i_mdi_pm_hold_client\n"));
7081 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
7082 			}
7083 		} else {
7084 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
7085 				MDI_DEBUG(4, (MDI_NOTE, client_dip,
7086 				    "i_mdi_pm_rele_client\n"));
7087 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
7088 			} else {
7089 				MDI_DEBUG(4, (MDI_NOTE, client_dip,
7090 				    "i_mdi_pm_reset_client\n"));
7091 				i_mdi_pm_reset_client(ct);
7092 			}
7093 		}
7094 
7095 		MDI_CLIENT_UNLOCK(ct);
7096 		break;
7097 
7098 	default:
7099 		break;
7100 	}
7101 
7102 	if (devnm)
7103 		ndi_devi_exit(vdip, circ);
7104 
7105 	return (ret);
7106 }
7107 
7108 int
7109 mdi_component_is_vhci(dev_info_t *dip, const char **mdi_class)
7110 {
7111 	mdi_vhci_t *vhci;
7112 
7113 	if (!MDI_VHCI(dip))
7114 		return (MDI_FAILURE);
7115 
7116 	if (mdi_class) {
7117 		vhci = DEVI(dip)->devi_mdi_xhci;
7118 		ASSERT(vhci);
7119 		*mdi_class = vhci->vh_class;
7120 	}
7121 
7122 	return (MDI_SUCCESS);
7123 }
7124 
7125 int
7126 mdi_component_is_phci(dev_info_t *dip, const char **mdi_class)
7127 {
7128 	mdi_phci_t *phci;
7129 
7130 	if (!MDI_PHCI(dip))
7131 		return (MDI_FAILURE);
7132 
7133 	if (mdi_class) {
7134 		phci = DEVI(dip)->devi_mdi_xhci;
7135 		ASSERT(phci);
7136 		*mdi_class = phci->ph_vhci->vh_class;
7137 	}
7138 
7139 	return (MDI_SUCCESS);
7140 }
7141 
7142 int
7143 mdi_component_is_client(dev_info_t *dip, const char **mdi_class)
7144 {
7145 	mdi_client_t *client;
7146 
7147 	if (!MDI_CLIENT(dip))
7148 		return (MDI_FAILURE);
7149 
7150 	if (mdi_class) {
7151 		client = DEVI(dip)->devi_mdi_client;
7152 		ASSERT(client);
7153 		*mdi_class = client->ct_vhci->vh_class;
7154 	}
7155 
7156 	return (MDI_SUCCESS);
7157 }
7158 
7159 void *
7160 mdi_client_get_vhci_private(dev_info_t *dip)
7161 {
7162 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
7163 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
7164 		mdi_client_t	*ct;
7165 		ct = i_devi_get_client(dip);
7166 		return (ct->ct_vprivate);
7167 	}
7168 	return (NULL);
7169 }
7170 
7171 void
7172 mdi_client_set_vhci_private(dev_info_t *dip, void *data)
7173 {
7174 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
7175 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
7176 		mdi_client_t	*ct;
7177 		ct = i_devi_get_client(dip);
7178 		ct->ct_vprivate = data;
7179 	}
7180 }
7181 /*
7182  * mdi_pi_get_vhci_private():
7183  *		Get the vhci private information associated with the
7184  *		mdi_pathinfo node
7185  */
7186 void *
7187 mdi_pi_get_vhci_private(mdi_pathinfo_t *pip)
7188 {
7189 	caddr_t	vprivate = NULL;
7190 	if (pip) {
7191 		vprivate = MDI_PI(pip)->pi_vprivate;
7192 	}
7193 	return (vprivate);
7194 }
7195 
7196 /*
7197  * mdi_pi_set_vhci_private():
7198  *		Set the vhci private information in the mdi_pathinfo node
7199  */
7200 void
7201 mdi_pi_set_vhci_private(mdi_pathinfo_t *pip, void *priv)
7202 {
7203 	if (pip) {
7204 		MDI_PI(pip)->pi_vprivate = priv;
7205 	}
7206 }
7207 
7208 /*
7209  * mdi_phci_get_vhci_private():
7210  *		Get the vhci private information associated with the
7211  *		mdi_phci node
7212  */
7213 void *
7214 mdi_phci_get_vhci_private(dev_info_t *dip)
7215 {
7216 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
7217 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
7218 		mdi_phci_t	*ph;
7219 		ph = i_devi_get_phci(dip);
7220 		return (ph->ph_vprivate);
7221 	}
7222 	return (NULL);
7223 }
7224 
7225 /*
7226  * mdi_phci_set_vhci_private():
7227  *		Set the vhci private information in the mdi_phci node
7228  */
7229 void
7230 mdi_phci_set_vhci_private(dev_info_t *dip, void *priv)
7231 {
7232 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
7233 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
7234 		mdi_phci_t	*ph;
7235 		ph = i_devi_get_phci(dip);
7236 		ph->ph_vprivate = priv;
7237 	}
7238 }
7239 
7240 int
7241 mdi_pi_ishidden(mdi_pathinfo_t *pip)
7242 {
7243 	return (MDI_PI_FLAGS_IS_HIDDEN(pip));
7244 }
7245 
7246 int
7247 mdi_pi_device_isremoved(mdi_pathinfo_t *pip)
7248 {
7249 	return (MDI_PI_FLAGS_IS_DEVICE_REMOVED(pip));
7250 }
7251 
7252 /*
7253  * When processing hotplug, if mdi_pi_offline-mdi_pi_free fails then this
7254  * interface is used to represent device removal.
7255  */
7256 int
7257 mdi_pi_device_remove(mdi_pathinfo_t *pip)
7258 {
7259 	MDI_PI_LOCK(pip);
7260 	if (mdi_pi_device_isremoved(pip)) {
7261 		MDI_PI_UNLOCK(pip);
7262 		return (0);
7263 	}
7264 	MDI_PI_FLAGS_SET_DEVICE_REMOVED(pip);
7265 	MDI_PI_FLAGS_SET_HIDDEN(pip);
7266 	MDI_PI_UNLOCK(pip);
7267 
7268 	i_ddi_di_cache_invalidate();
7269 
7270 	return (1);
7271 }
7272 
7273 /*
7274  * When processing hotplug, if a path marked mdi_pi_device_isremoved()
7275  * is now accessible then this interfaces is used to represent device insertion.
7276  */
7277 int
7278 mdi_pi_device_insert(mdi_pathinfo_t *pip)
7279 {
7280 	MDI_PI_LOCK(pip);
7281 	if (!mdi_pi_device_isremoved(pip)) {
7282 		MDI_PI_UNLOCK(pip);
7283 		return (0);
7284 	}
7285 	MDI_PI_FLAGS_CLR_DEVICE_REMOVED(pip);
7286 	MDI_PI_FLAGS_CLR_HIDDEN(pip);
7287 	MDI_PI_UNLOCK(pip);
7288 
7289 	i_ddi_di_cache_invalidate();
7290 
7291 	return (1);
7292 }
7293 
7294 /*
7295  * List of vhci class names:
7296  * A vhci class name must be in this list only if the corresponding vhci
7297  * driver intends to use the mdi provided bus config implementation
7298  * (i.e., mdi_vhci_bus_config()).
7299  */
7300 static char *vhci_class_list[] = { MDI_HCI_CLASS_SCSI, MDI_HCI_CLASS_IB };
7301 #define	N_VHCI_CLASSES	(sizeof (vhci_class_list) / sizeof (char *))
7302 
7303 /*
7304  * During boot time, the on-disk vhci cache for every vhci class is read
7305  * in the form of an nvlist and stored here.
7306  */
7307 static nvlist_t *vhcache_nvl[N_VHCI_CLASSES];
7308 
7309 /* nvpair names in vhci cache nvlist */
7310 #define	MDI_VHCI_CACHE_VERSION	1
7311 #define	MDI_NVPNAME_VERSION	"version"
7312 #define	MDI_NVPNAME_PHCIS	"phcis"
7313 #define	MDI_NVPNAME_CTADDRMAP	"clientaddrmap"
7314 
7315 /*
7316  * Given vhci class name, return its on-disk vhci cache filename.
7317  * Memory for the returned filename which includes the full path is allocated
7318  * by this function.
7319  */
7320 static char *
7321 vhclass2vhcache_filename(char *vhclass)
7322 {
7323 	char *filename;
7324 	int len;
7325 	static char *fmt = "/etc/devices/mdi_%s_cache";
7326 
7327 	/*
7328 	 * fmt contains the on-disk vhci cache file name format;
7329 	 * for scsi_vhci the filename is "/etc/devices/mdi_scsi_vhci_cache".
7330 	 */
7331 
7332 	/* the -1 below is to account for "%s" in the format string */
7333 	len = strlen(fmt) + strlen(vhclass) - 1;
7334 	filename = kmem_alloc(len, KM_SLEEP);
7335 	(void) snprintf(filename, len, fmt, vhclass);
7336 	ASSERT(len == (strlen(filename) + 1));
7337 	return (filename);
7338 }
7339 
7340 /*
7341  * initialize the vhci cache related data structures and read the on-disk
7342  * vhci cached data into memory.
7343  */
7344 static void
7345 setup_vhci_cache(mdi_vhci_t *vh)
7346 {
7347 	mdi_vhci_config_t *vhc;
7348 	mdi_vhci_cache_t *vhcache;
7349 	int i;
7350 	nvlist_t *nvl = NULL;
7351 
7352 	vhc = kmem_zalloc(sizeof (mdi_vhci_config_t), KM_SLEEP);
7353 	vh->vh_config = vhc;
7354 	vhcache = &vhc->vhc_vhcache;
7355 
7356 	vhc->vhc_vhcache_filename = vhclass2vhcache_filename(vh->vh_class);
7357 
7358 	mutex_init(&vhc->vhc_lock, NULL, MUTEX_DEFAULT, NULL);
7359 	cv_init(&vhc->vhc_cv, NULL, CV_DRIVER, NULL);
7360 
7361 	rw_init(&vhcache->vhcache_lock, NULL, RW_DRIVER, NULL);
7362 
7363 	/*
7364 	 * Create string hash; same as mod_hash_create_strhash() except that
7365 	 * we use NULL key destructor.
7366 	 */
7367 	vhcache->vhcache_client_hash = mod_hash_create_extended(vh->vh_class,
7368 	    mdi_bus_config_cache_hash_size,
7369 	    mod_hash_null_keydtor, mod_hash_null_valdtor,
7370 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
7371 
7372 	/*
7373 	 * The on-disk vhci cache is read during booting prior to the
7374 	 * lights-out period by mdi_read_devices_files().
7375 	 */
7376 	for (i = 0; i < N_VHCI_CLASSES; i++) {
7377 		if (strcmp(vhci_class_list[i], vh->vh_class) == 0) {
7378 			nvl = vhcache_nvl[i];
7379 			vhcache_nvl[i] = NULL;
7380 			break;
7381 		}
7382 	}
7383 
7384 	/*
7385 	 * this is to cover the case of some one manually causing unloading
7386 	 * (or detaching) and reloading (or attaching) of a vhci driver.
7387 	 */
7388 	if (nvl == NULL && modrootloaded)
7389 		nvl = read_on_disk_vhci_cache(vh->vh_class);
7390 
7391 	if (nvl != NULL) {
7392 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7393 		if (mainnvl_to_vhcache(vhcache, nvl) == MDI_SUCCESS)
7394 			vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
7395 		else  {
7396 			cmn_err(CE_WARN,
7397 			    "%s: data file corrupted, will recreate",
7398 			    vhc->vhc_vhcache_filename);
7399 		}
7400 		rw_exit(&vhcache->vhcache_lock);
7401 		nvlist_free(nvl);
7402 	}
7403 
7404 	vhc->vhc_cbid = callb_add(stop_vhcache_flush_thread, vhc,
7405 	    CB_CL_UADMIN_PRE_VFS, "mdi_vhcache_flush");
7406 
7407 	vhc->vhc_path_discovery_boot = mdi_path_discovery_boot;
7408 	vhc->vhc_path_discovery_postboot = mdi_path_discovery_postboot;
7409 }
7410 
7411 /*
7412  * free all vhci cache related resources
7413  */
7414 static int
7415 destroy_vhci_cache(mdi_vhci_t *vh)
7416 {
7417 	mdi_vhci_config_t *vhc = vh->vh_config;
7418 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7419 	mdi_vhcache_phci_t *cphci, *cphci_next;
7420 	mdi_vhcache_client_t *cct, *cct_next;
7421 	mdi_vhcache_pathinfo_t *cpi, *cpi_next;
7422 
7423 	if (stop_vhcache_async_threads(vhc) != MDI_SUCCESS)
7424 		return (MDI_FAILURE);
7425 
7426 	kmem_free(vhc->vhc_vhcache_filename,
7427 	    strlen(vhc->vhc_vhcache_filename) + 1);
7428 
7429 	mod_hash_destroy_strhash(vhcache->vhcache_client_hash);
7430 
7431 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7432 	    cphci = cphci_next) {
7433 		cphci_next = cphci->cphci_next;
7434 		free_vhcache_phci(cphci);
7435 	}
7436 
7437 	for (cct = vhcache->vhcache_client_head; cct != NULL; cct = cct_next) {
7438 		cct_next = cct->cct_next;
7439 		for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi_next) {
7440 			cpi_next = cpi->cpi_next;
7441 			free_vhcache_pathinfo(cpi);
7442 		}
7443 		free_vhcache_client(cct);
7444 	}
7445 
7446 	rw_destroy(&vhcache->vhcache_lock);
7447 
7448 	mutex_destroy(&vhc->vhc_lock);
7449 	cv_destroy(&vhc->vhc_cv);
7450 	kmem_free(vhc, sizeof (mdi_vhci_config_t));
7451 	return (MDI_SUCCESS);
7452 }
7453 
7454 /*
7455  * Stop all vhci cache related async threads and free their resources.
7456  */
7457 static int
7458 stop_vhcache_async_threads(mdi_vhci_config_t *vhc)
7459 {
7460 	mdi_async_client_config_t *acc, *acc_next;
7461 
7462 	mutex_enter(&vhc->vhc_lock);
7463 	vhc->vhc_flags |= MDI_VHC_EXIT;
7464 	ASSERT(vhc->vhc_acc_thrcount >= 0);
7465 	cv_broadcast(&vhc->vhc_cv);
7466 
7467 	while ((vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) ||
7468 	    vhc->vhc_acc_thrcount != 0) {
7469 		mutex_exit(&vhc->vhc_lock);
7470 		delay_random(5);
7471 		mutex_enter(&vhc->vhc_lock);
7472 	}
7473 
7474 	vhc->vhc_flags &= ~MDI_VHC_EXIT;
7475 
7476 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc_next) {
7477 		acc_next = acc->acc_next;
7478 		free_async_client_config(acc);
7479 	}
7480 	vhc->vhc_acc_list_head = NULL;
7481 	vhc->vhc_acc_list_tail = NULL;
7482 	vhc->vhc_acc_count = 0;
7483 
7484 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
7485 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
7486 		mutex_exit(&vhc->vhc_lock);
7487 		if (flush_vhcache(vhc, 0) != MDI_SUCCESS) {
7488 			vhcache_dirty(vhc);
7489 			return (MDI_FAILURE);
7490 		}
7491 	} else
7492 		mutex_exit(&vhc->vhc_lock);
7493 
7494 	if (callb_delete(vhc->vhc_cbid) != 0)
7495 		return (MDI_FAILURE);
7496 
7497 	return (MDI_SUCCESS);
7498 }
7499 
7500 /*
7501  * Stop vhci cache flush thread
7502  */
7503 /* ARGSUSED */
7504 static boolean_t
7505 stop_vhcache_flush_thread(void *arg, int code)
7506 {
7507 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
7508 
7509 	mutex_enter(&vhc->vhc_lock);
7510 	vhc->vhc_flags |= MDI_VHC_EXIT;
7511 	cv_broadcast(&vhc->vhc_cv);
7512 
7513 	while (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
7514 		mutex_exit(&vhc->vhc_lock);
7515 		delay_random(5);
7516 		mutex_enter(&vhc->vhc_lock);
7517 	}
7518 
7519 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
7520 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
7521 		mutex_exit(&vhc->vhc_lock);
7522 		(void) flush_vhcache(vhc, 1);
7523 	} else
7524 		mutex_exit(&vhc->vhc_lock);
7525 
7526 	return (B_TRUE);
7527 }
7528 
7529 /*
7530  * Enqueue the vhcache phci (cphci) at the tail of the list
7531  */
7532 static void
7533 enqueue_vhcache_phci(mdi_vhci_cache_t *vhcache, mdi_vhcache_phci_t *cphci)
7534 {
7535 	cphci->cphci_next = NULL;
7536 	if (vhcache->vhcache_phci_head == NULL)
7537 		vhcache->vhcache_phci_head = cphci;
7538 	else
7539 		vhcache->vhcache_phci_tail->cphci_next = cphci;
7540 	vhcache->vhcache_phci_tail = cphci;
7541 }
7542 
7543 /*
7544  * Enqueue the vhcache pathinfo (cpi) at the tail of the list
7545  */
7546 static void
7547 enqueue_tail_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7548     mdi_vhcache_pathinfo_t *cpi)
7549 {
7550 	cpi->cpi_next = NULL;
7551 	if (cct->cct_cpi_head == NULL)
7552 		cct->cct_cpi_head = cpi;
7553 	else
7554 		cct->cct_cpi_tail->cpi_next = cpi;
7555 	cct->cct_cpi_tail = cpi;
7556 }
7557 
7558 /*
7559  * Enqueue the vhcache pathinfo (cpi) at the correct location in the
7560  * ordered list. All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
7561  * flag set come at the beginning of the list. All cpis which have this
7562  * flag set come at the end of the list.
7563  */
7564 static void
7565 enqueue_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7566     mdi_vhcache_pathinfo_t *newcpi)
7567 {
7568 	mdi_vhcache_pathinfo_t *cpi, *prev_cpi;
7569 
7570 	if (cct->cct_cpi_head == NULL ||
7571 	    (newcpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))
7572 		enqueue_tail_vhcache_pathinfo(cct, newcpi);
7573 	else {
7574 		for (cpi = cct->cct_cpi_head, prev_cpi = NULL; cpi != NULL &&
7575 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST);
7576 		    prev_cpi = cpi, cpi = cpi->cpi_next)
7577 			;
7578 
7579 		if (prev_cpi == NULL)
7580 			cct->cct_cpi_head = newcpi;
7581 		else
7582 			prev_cpi->cpi_next = newcpi;
7583 
7584 		newcpi->cpi_next = cpi;
7585 
7586 		if (cpi == NULL)
7587 			cct->cct_cpi_tail = newcpi;
7588 	}
7589 }
7590 
7591 /*
7592  * Enqueue the vhcache client (cct) at the tail of the list
7593  */
7594 static void
7595 enqueue_vhcache_client(mdi_vhci_cache_t *vhcache,
7596     mdi_vhcache_client_t *cct)
7597 {
7598 	cct->cct_next = NULL;
7599 	if (vhcache->vhcache_client_head == NULL)
7600 		vhcache->vhcache_client_head = cct;
7601 	else
7602 		vhcache->vhcache_client_tail->cct_next = cct;
7603 	vhcache->vhcache_client_tail = cct;
7604 }
7605 
7606 static void
7607 free_string_array(char **str, int nelem)
7608 {
7609 	int i;
7610 
7611 	if (str) {
7612 		for (i = 0; i < nelem; i++) {
7613 			if (str[i])
7614 				kmem_free(str[i], strlen(str[i]) + 1);
7615 		}
7616 		kmem_free(str, sizeof (char *) * nelem);
7617 	}
7618 }
7619 
7620 static void
7621 free_vhcache_phci(mdi_vhcache_phci_t *cphci)
7622 {
7623 	kmem_free(cphci->cphci_path, strlen(cphci->cphci_path) + 1);
7624 	kmem_free(cphci, sizeof (*cphci));
7625 }
7626 
7627 static void
7628 free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *cpi)
7629 {
7630 	kmem_free(cpi->cpi_addr, strlen(cpi->cpi_addr) + 1);
7631 	kmem_free(cpi, sizeof (*cpi));
7632 }
7633 
7634 static void
7635 free_vhcache_client(mdi_vhcache_client_t *cct)
7636 {
7637 	kmem_free(cct->cct_name_addr, strlen(cct->cct_name_addr) + 1);
7638 	kmem_free(cct, sizeof (*cct));
7639 }
7640 
7641 static char *
7642 vhcache_mknameaddr(char *ct_name, char *ct_addr, int *ret_len)
7643 {
7644 	char *name_addr;
7645 	int len;
7646 
7647 	len = strlen(ct_name) + strlen(ct_addr) + 2;
7648 	name_addr = kmem_alloc(len, KM_SLEEP);
7649 	(void) snprintf(name_addr, len, "%s@%s", ct_name, ct_addr);
7650 
7651 	if (ret_len)
7652 		*ret_len = len;
7653 	return (name_addr);
7654 }
7655 
7656 /*
7657  * Copy the contents of paddrnvl to vhci cache.
7658  * paddrnvl nvlist contains path information for a vhci client.
7659  * See the comment in mainnvl_to_vhcache() for the format of this nvlist.
7660  */
7661 static void
7662 paddrnvl_to_vhcache(nvlist_t *nvl, mdi_vhcache_phci_t *cphci_list[],
7663     mdi_vhcache_client_t *cct)
7664 {
7665 	nvpair_t *nvp = NULL;
7666 	mdi_vhcache_pathinfo_t *cpi;
7667 	uint_t nelem;
7668 	uint32_t *val;
7669 
7670 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7671 		ASSERT(nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY);
7672 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
7673 		cpi->cpi_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7674 		(void) nvpair_value_uint32_array(nvp, &val, &nelem);
7675 		ASSERT(nelem == 2);
7676 		cpi->cpi_cphci = cphci_list[val[0]];
7677 		cpi->cpi_flags = val[1];
7678 		enqueue_tail_vhcache_pathinfo(cct, cpi);
7679 	}
7680 }
7681 
7682 /*
7683  * Copy the contents of caddrmapnvl to vhci cache.
7684  * caddrmapnvl nvlist contains vhci client address to phci client address
7685  * mappings. See the comment in mainnvl_to_vhcache() for the format of
7686  * this nvlist.
7687  */
7688 static void
7689 caddrmapnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl,
7690     mdi_vhcache_phci_t *cphci_list[])
7691 {
7692 	nvpair_t *nvp = NULL;
7693 	nvlist_t *paddrnvl;
7694 	mdi_vhcache_client_t *cct;
7695 
7696 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7697 		ASSERT(nvpair_type(nvp) == DATA_TYPE_NVLIST);
7698 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
7699 		cct->cct_name_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7700 		(void) nvpair_value_nvlist(nvp, &paddrnvl);
7701 		paddrnvl_to_vhcache(paddrnvl, cphci_list, cct);
7702 		/* the client must contain at least one path */
7703 		ASSERT(cct->cct_cpi_head != NULL);
7704 
7705 		enqueue_vhcache_client(vhcache, cct);
7706 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
7707 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
7708 	}
7709 }
7710 
7711 /*
7712  * Copy the contents of the main nvlist to vhci cache.
7713  *
7714  * VHCI busconfig cached data is stored in the form of a nvlist on the disk.
7715  * The nvlist contains the mappings between the vhci client addresses and
7716  * their corresponding phci client addresses.
7717  *
7718  * The structure of the nvlist is as follows:
7719  *
7720  * Main nvlist:
7721  *	NAME		TYPE		DATA
7722  *	version		int32		version number
7723  *	phcis		string array	array of phci paths
7724  *	clientaddrmap	nvlist_t	c2paddrs_nvl (see below)
7725  *
7726  * structure of c2paddrs_nvl:
7727  *	NAME		TYPE		DATA
7728  *	caddr1		nvlist_t	paddrs_nvl1
7729  *	caddr2		nvlist_t	paddrs_nvl2
7730  *	...
7731  * where caddr1, caddr2, ... are vhci client name and addresses in the
7732  * form of "<clientname>@<clientaddress>".
7733  * (for example: "ssd@2000002037cd9f72");
7734  * paddrs_nvl1, paddrs_nvl2, .. are nvlists that contain path information.
7735  *
7736  * structure of paddrs_nvl:
7737  *	NAME		TYPE		DATA
7738  *	pi_addr1	uint32_array	(phci-id, cpi_flags)
7739  *	pi_addr2	uint32_array	(phci-id, cpi_flags)
7740  *	...
7741  * where pi_addr1, pi_addr2, ... are bus specific addresses of pathinfo nodes
7742  * (so called pi_addrs, for example: "w2100002037cd9f72,0");
7743  * phci-ids are integers that identify pHCIs to which the
7744  * the bus specific address belongs to. These integers are used as an index
7745  * into to the phcis string array in the main nvlist to get the pHCI path.
7746  */
7747 static int
7748 mainnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl)
7749 {
7750 	char **phcis, **phci_namep;
7751 	uint_t nphcis;
7752 	mdi_vhcache_phci_t *cphci, **cphci_list;
7753 	nvlist_t *caddrmapnvl;
7754 	int32_t ver;
7755 	int i;
7756 	size_t cphci_list_size;
7757 
7758 	ASSERT(RW_WRITE_HELD(&vhcache->vhcache_lock));
7759 
7760 	if (nvlist_lookup_int32(nvl, MDI_NVPNAME_VERSION, &ver) != 0 ||
7761 	    ver != MDI_VHCI_CACHE_VERSION)
7762 		return (MDI_FAILURE);
7763 
7764 	if (nvlist_lookup_string_array(nvl, MDI_NVPNAME_PHCIS, &phcis,
7765 	    &nphcis) != 0)
7766 		return (MDI_SUCCESS);
7767 
7768 	ASSERT(nphcis > 0);
7769 
7770 	cphci_list_size = sizeof (mdi_vhcache_phci_t *) * nphcis;
7771 	cphci_list = kmem_alloc(cphci_list_size, KM_SLEEP);
7772 	for (i = 0, phci_namep = phcis; i < nphcis; i++, phci_namep++) {
7773 		cphci = kmem_zalloc(sizeof (mdi_vhcache_phci_t), KM_SLEEP);
7774 		cphci->cphci_path = i_ddi_strdup(*phci_namep, KM_SLEEP);
7775 		enqueue_vhcache_phci(vhcache, cphci);
7776 		cphci_list[i] = cphci;
7777 	}
7778 
7779 	ASSERT(vhcache->vhcache_phci_head != NULL);
7780 
7781 	if (nvlist_lookup_nvlist(nvl, MDI_NVPNAME_CTADDRMAP, &caddrmapnvl) == 0)
7782 		caddrmapnvl_to_vhcache(vhcache, caddrmapnvl, cphci_list);
7783 
7784 	kmem_free(cphci_list, cphci_list_size);
7785 	return (MDI_SUCCESS);
7786 }
7787 
7788 /*
7789  * Build paddrnvl for the specified client using the information in the
7790  * vhci cache and add it to the caddrmapnnvl.
7791  * Returns 0 on success, errno on failure.
7792  */
7793 static int
7794 vhcache_to_paddrnvl(mdi_vhci_cache_t *vhcache, mdi_vhcache_client_t *cct,
7795     nvlist_t *caddrmapnvl)
7796 {
7797 	mdi_vhcache_pathinfo_t *cpi;
7798 	nvlist_t *nvl;
7799 	int err;
7800 	uint32_t val[2];
7801 
7802 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7803 
7804 	if ((err = nvlist_alloc(&nvl, 0, KM_SLEEP)) != 0)
7805 		return (err);
7806 
7807 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7808 		val[0] = cpi->cpi_cphci->cphci_id;
7809 		val[1] = cpi->cpi_flags;
7810 		if ((err = nvlist_add_uint32_array(nvl, cpi->cpi_addr, val, 2))
7811 		    != 0)
7812 			goto out;
7813 	}
7814 
7815 	err = nvlist_add_nvlist(caddrmapnvl, cct->cct_name_addr, nvl);
7816 out:
7817 	nvlist_free(nvl);
7818 	return (err);
7819 }
7820 
7821 /*
7822  * Build caddrmapnvl using the information in the vhci cache
7823  * and add it to the mainnvl.
7824  * Returns 0 on success, errno on failure.
7825  */
7826 static int
7827 vhcache_to_caddrmapnvl(mdi_vhci_cache_t *vhcache, nvlist_t *mainnvl)
7828 {
7829 	mdi_vhcache_client_t *cct;
7830 	nvlist_t *nvl;
7831 	int err;
7832 
7833 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7834 
7835 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0)
7836 		return (err);
7837 
7838 	for (cct = vhcache->vhcache_client_head; cct != NULL;
7839 	    cct = cct->cct_next) {
7840 		if ((err = vhcache_to_paddrnvl(vhcache, cct, nvl)) != 0)
7841 			goto out;
7842 	}
7843 
7844 	err = nvlist_add_nvlist(mainnvl, MDI_NVPNAME_CTADDRMAP, nvl);
7845 out:
7846 	nvlist_free(nvl);
7847 	return (err);
7848 }
7849 
7850 /*
7851  * Build nvlist using the information in the vhci cache.
7852  * See the comment in mainnvl_to_vhcache() for the format of the nvlist.
7853  * Returns nvl on success, NULL on failure.
7854  */
7855 static nvlist_t *
7856 vhcache_to_mainnvl(mdi_vhci_cache_t *vhcache)
7857 {
7858 	mdi_vhcache_phci_t *cphci;
7859 	uint_t phci_count;
7860 	char **phcis;
7861 	nvlist_t *nvl;
7862 	int err, i;
7863 
7864 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0) {
7865 		nvl = NULL;
7866 		goto out;
7867 	}
7868 
7869 	if ((err = nvlist_add_int32(nvl, MDI_NVPNAME_VERSION,
7870 	    MDI_VHCI_CACHE_VERSION)) != 0)
7871 		goto out;
7872 
7873 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7874 	if (vhcache->vhcache_phci_head == NULL) {
7875 		rw_exit(&vhcache->vhcache_lock);
7876 		return (nvl);
7877 	}
7878 
7879 	phci_count = 0;
7880 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7881 	    cphci = cphci->cphci_next)
7882 		cphci->cphci_id = phci_count++;
7883 
7884 	/* build phci pathname list */
7885 	phcis = kmem_alloc(sizeof (char *) * phci_count, KM_SLEEP);
7886 	for (cphci = vhcache->vhcache_phci_head, i = 0; cphci != NULL;
7887 	    cphci = cphci->cphci_next, i++)
7888 		phcis[i] = i_ddi_strdup(cphci->cphci_path, KM_SLEEP);
7889 
7890 	err = nvlist_add_string_array(nvl, MDI_NVPNAME_PHCIS, phcis,
7891 	    phci_count);
7892 	free_string_array(phcis, phci_count);
7893 
7894 	if (err == 0 &&
7895 	    (err = vhcache_to_caddrmapnvl(vhcache, nvl)) == 0) {
7896 		rw_exit(&vhcache->vhcache_lock);
7897 		return (nvl);
7898 	}
7899 
7900 	rw_exit(&vhcache->vhcache_lock);
7901 out:
7902 	if (nvl)
7903 		nvlist_free(nvl);
7904 	return (NULL);
7905 }
7906 
7907 /*
7908  * Lookup vhcache phci structure for the specified phci path.
7909  */
7910 static mdi_vhcache_phci_t *
7911 lookup_vhcache_phci_by_name(mdi_vhci_cache_t *vhcache, char *phci_path)
7912 {
7913 	mdi_vhcache_phci_t *cphci;
7914 
7915 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7916 
7917 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7918 	    cphci = cphci->cphci_next) {
7919 		if (strcmp(cphci->cphci_path, phci_path) == 0)
7920 			return (cphci);
7921 	}
7922 
7923 	return (NULL);
7924 }
7925 
7926 /*
7927  * Lookup vhcache phci structure for the specified phci.
7928  */
7929 static mdi_vhcache_phci_t *
7930 lookup_vhcache_phci_by_addr(mdi_vhci_cache_t *vhcache, mdi_phci_t *ph)
7931 {
7932 	mdi_vhcache_phci_t *cphci;
7933 
7934 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7935 
7936 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7937 	    cphci = cphci->cphci_next) {
7938 		if (cphci->cphci_phci == ph)
7939 			return (cphci);
7940 	}
7941 
7942 	return (NULL);
7943 }
7944 
7945 /*
7946  * Add the specified phci to the vhci cache if not already present.
7947  */
7948 static void
7949 vhcache_phci_add(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7950 {
7951 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7952 	mdi_vhcache_phci_t *cphci;
7953 	char *pathname;
7954 	int cache_updated;
7955 
7956 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7957 
7958 	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
7959 	(void) ddi_pathname(ph->ph_dip, pathname);
7960 	if ((cphci = lookup_vhcache_phci_by_name(vhcache, pathname))
7961 	    != NULL) {
7962 		cphci->cphci_phci = ph;
7963 		cache_updated = 0;
7964 	} else {
7965 		cphci = kmem_zalloc(sizeof (*cphci), KM_SLEEP);
7966 		cphci->cphci_path = i_ddi_strdup(pathname, KM_SLEEP);
7967 		cphci->cphci_phci = ph;
7968 		enqueue_vhcache_phci(vhcache, cphci);
7969 		cache_updated = 1;
7970 	}
7971 
7972 	rw_exit(&vhcache->vhcache_lock);
7973 
7974 	/*
7975 	 * Since a new phci has been added, reset
7976 	 * vhc_path_discovery_cutoff_time to allow for discovery of paths
7977 	 * during next vhcache_discover_paths().
7978 	 */
7979 	mutex_enter(&vhc->vhc_lock);
7980 	vhc->vhc_path_discovery_cutoff_time = 0;
7981 	mutex_exit(&vhc->vhc_lock);
7982 
7983 	kmem_free(pathname, MAXPATHLEN);
7984 	if (cache_updated)
7985 		vhcache_dirty(vhc);
7986 }
7987 
7988 /*
7989  * Remove the reference to the specified phci from the vhci cache.
7990  */
7991 static void
7992 vhcache_phci_remove(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7993 {
7994 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7995 	mdi_vhcache_phci_t *cphci;
7996 
7997 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7998 	if ((cphci = lookup_vhcache_phci_by_addr(vhcache, ph)) != NULL) {
7999 		/* do not remove the actual mdi_vhcache_phci structure */
8000 		cphci->cphci_phci = NULL;
8001 	}
8002 	rw_exit(&vhcache->vhcache_lock);
8003 }
8004 
8005 static void
8006 init_vhcache_lookup_token(mdi_vhcache_lookup_token_t *dst,
8007     mdi_vhcache_lookup_token_t *src)
8008 {
8009 	if (src == NULL) {
8010 		dst->lt_cct = NULL;
8011 		dst->lt_cct_lookup_time = 0;
8012 	} else {
8013 		dst->lt_cct = src->lt_cct;
8014 		dst->lt_cct_lookup_time = src->lt_cct_lookup_time;
8015 	}
8016 }
8017 
8018 /*
8019  * Look up vhcache client for the specified client.
8020  */
8021 static mdi_vhcache_client_t *
8022 lookup_vhcache_client(mdi_vhci_cache_t *vhcache, char *ct_name, char *ct_addr,
8023     mdi_vhcache_lookup_token_t *token)
8024 {
8025 	mod_hash_val_t hv;
8026 	char *name_addr;
8027 	int len;
8028 
8029 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
8030 
8031 	/*
8032 	 * If no vhcache clean occurred since the last lookup, we can
8033 	 * simply return the cct from the last lookup operation.
8034 	 * It works because ccts are never freed except during the vhcache
8035 	 * cleanup operation.
8036 	 */
8037 	if (token != NULL &&
8038 	    vhcache->vhcache_clean_time < token->lt_cct_lookup_time)
8039 		return (token->lt_cct);
8040 
8041 	name_addr = vhcache_mknameaddr(ct_name, ct_addr, &len);
8042 	if (mod_hash_find(vhcache->vhcache_client_hash,
8043 	    (mod_hash_key_t)name_addr, &hv) == 0) {
8044 		if (token) {
8045 			token->lt_cct = (mdi_vhcache_client_t *)hv;
8046 			token->lt_cct_lookup_time = lbolt64;
8047 		}
8048 	} else {
8049 		if (token) {
8050 			token->lt_cct = NULL;
8051 			token->lt_cct_lookup_time = 0;
8052 		}
8053 		hv = NULL;
8054 	}
8055 	kmem_free(name_addr, len);
8056 	return ((mdi_vhcache_client_t *)hv);
8057 }
8058 
8059 /*
8060  * Add the specified path to the vhci cache if not already present.
8061  * Also add the vhcache client for the client corresponding to this path
8062  * if it doesn't already exist.
8063  */
8064 static void
8065 vhcache_pi_add(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
8066 {
8067 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8068 	mdi_vhcache_client_t *cct;
8069 	mdi_vhcache_pathinfo_t *cpi;
8070 	mdi_phci_t *ph = pip->pi_phci;
8071 	mdi_client_t *ct = pip->pi_client;
8072 	int cache_updated = 0;
8073 
8074 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8075 
8076 	/* if vhcache client for this pip doesn't already exist, add it */
8077 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
8078 	    NULL)) == NULL) {
8079 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
8080 		cct->cct_name_addr = vhcache_mknameaddr(ct->ct_drvname,
8081 		    ct->ct_guid, NULL);
8082 		enqueue_vhcache_client(vhcache, cct);
8083 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
8084 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
8085 		cache_updated = 1;
8086 	}
8087 
8088 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8089 		if (cpi->cpi_cphci->cphci_phci == ph &&
8090 		    strcmp(cpi->cpi_addr, pip->pi_addr) == 0) {
8091 			cpi->cpi_pip = pip;
8092 			if (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST) {
8093 				cpi->cpi_flags &=
8094 				    ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8095 				sort_vhcache_paths(cct);
8096 				cache_updated = 1;
8097 			}
8098 			break;
8099 		}
8100 	}
8101 
8102 	if (cpi == NULL) {
8103 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
8104 		cpi->cpi_addr = i_ddi_strdup(pip->pi_addr, KM_SLEEP);
8105 		cpi->cpi_cphci = lookup_vhcache_phci_by_addr(vhcache, ph);
8106 		ASSERT(cpi->cpi_cphci != NULL);
8107 		cpi->cpi_pip = pip;
8108 		enqueue_vhcache_pathinfo(cct, cpi);
8109 		cache_updated = 1;
8110 	}
8111 
8112 	rw_exit(&vhcache->vhcache_lock);
8113 
8114 	if (cache_updated)
8115 		vhcache_dirty(vhc);
8116 }
8117 
8118 /*
8119  * Remove the reference to the specified path from the vhci cache.
8120  */
8121 static void
8122 vhcache_pi_remove(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
8123 {
8124 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8125 	mdi_client_t *ct = pip->pi_client;
8126 	mdi_vhcache_client_t *cct;
8127 	mdi_vhcache_pathinfo_t *cpi;
8128 
8129 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8130 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
8131 	    NULL)) != NULL) {
8132 		for (cpi = cct->cct_cpi_head; cpi != NULL;
8133 		    cpi = cpi->cpi_next) {
8134 			if (cpi->cpi_pip == pip) {
8135 				cpi->cpi_pip = NULL;
8136 				break;
8137 			}
8138 		}
8139 	}
8140 	rw_exit(&vhcache->vhcache_lock);
8141 }
8142 
8143 /*
8144  * Flush the vhci cache to disk.
8145  * Returns MDI_SUCCESS on success, MDI_FAILURE on failure.
8146  */
8147 static int
8148 flush_vhcache(mdi_vhci_config_t *vhc, int force_flag)
8149 {
8150 	nvlist_t *nvl;
8151 	int err;
8152 	int rv;
8153 
8154 	/*
8155 	 * It is possible that the system may shutdown before
8156 	 * i_ddi_io_initialized (during stmsboot for example). To allow for
8157 	 * flushing the cache in this case do not check for
8158 	 * i_ddi_io_initialized when force flag is set.
8159 	 */
8160 	if (force_flag == 0 && !i_ddi_io_initialized())
8161 		return (MDI_FAILURE);
8162 
8163 	if ((nvl = vhcache_to_mainnvl(&vhc->vhc_vhcache)) != NULL) {
8164 		err = fwrite_nvlist(vhc->vhc_vhcache_filename, nvl);
8165 		nvlist_free(nvl);
8166 	} else
8167 		err = EFAULT;
8168 
8169 	rv = MDI_SUCCESS;
8170 	mutex_enter(&vhc->vhc_lock);
8171 	if (err != 0) {
8172 		if (err == EROFS) {
8173 			vhc->vhc_flags |= MDI_VHC_READONLY_FS;
8174 			vhc->vhc_flags &= ~(MDI_VHC_VHCACHE_FLUSH_ERROR |
8175 			    MDI_VHC_VHCACHE_DIRTY);
8176 		} else {
8177 			if (!(vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR)) {
8178 				cmn_err(CE_CONT, "%s: update failed\n",
8179 				    vhc->vhc_vhcache_filename);
8180 				vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_ERROR;
8181 			}
8182 			rv = MDI_FAILURE;
8183 		}
8184 	} else if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR) {
8185 		cmn_err(CE_CONT,
8186 		    "%s: update now ok\n", vhc->vhc_vhcache_filename);
8187 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_ERROR;
8188 	}
8189 	mutex_exit(&vhc->vhc_lock);
8190 
8191 	return (rv);
8192 }
8193 
8194 /*
8195  * Call flush_vhcache() to flush the vhci cache at the scheduled time.
8196  * Exits itself if left idle for the idle timeout period.
8197  */
8198 static void
8199 vhcache_flush_thread(void *arg)
8200 {
8201 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
8202 	clock_t idle_time, quit_at_ticks;
8203 	callb_cpr_t cprinfo;
8204 
8205 	/* number of seconds to sleep idle before exiting */
8206 	idle_time = mdi_vhcache_flush_daemon_idle_time * TICKS_PER_SECOND;
8207 
8208 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
8209 	    "mdi_vhcache_flush");
8210 	mutex_enter(&vhc->vhc_lock);
8211 	for (; ; ) {
8212 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8213 		    (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY)) {
8214 			if (ddi_get_lbolt() < vhc->vhc_flush_at_ticks) {
8215 				CALLB_CPR_SAFE_BEGIN(&cprinfo);
8216 				(void) cv_timedwait(&vhc->vhc_cv,
8217 				    &vhc->vhc_lock, vhc->vhc_flush_at_ticks);
8218 				CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8219 			} else {
8220 				vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
8221 				mutex_exit(&vhc->vhc_lock);
8222 
8223 				if (flush_vhcache(vhc, 0) != MDI_SUCCESS)
8224 					vhcache_dirty(vhc);
8225 
8226 				mutex_enter(&vhc->vhc_lock);
8227 			}
8228 		}
8229 
8230 		quit_at_ticks = ddi_get_lbolt() + idle_time;
8231 
8232 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8233 		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) &&
8234 		    ddi_get_lbolt() < quit_at_ticks) {
8235 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
8236 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
8237 			    quit_at_ticks);
8238 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8239 		}
8240 
8241 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
8242 		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY))
8243 			goto out;
8244 	}
8245 
8246 out:
8247 	vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_THREAD;
8248 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
8249 	CALLB_CPR_EXIT(&cprinfo);
8250 }
8251 
8252 /*
8253  * Make vhci cache dirty and schedule flushing by vhcache flush thread.
8254  */
8255 static void
8256 vhcache_dirty(mdi_vhci_config_t *vhc)
8257 {
8258 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8259 	int create_thread;
8260 
8261 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8262 	/* do not flush cache until the cache is fully built */
8263 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
8264 		rw_exit(&vhcache->vhcache_lock);
8265 		return;
8266 	}
8267 	rw_exit(&vhcache->vhcache_lock);
8268 
8269 	mutex_enter(&vhc->vhc_lock);
8270 	if (vhc->vhc_flags & MDI_VHC_READONLY_FS) {
8271 		mutex_exit(&vhc->vhc_lock);
8272 		return;
8273 	}
8274 
8275 	vhc->vhc_flags |= MDI_VHC_VHCACHE_DIRTY;
8276 	vhc->vhc_flush_at_ticks = ddi_get_lbolt() +
8277 	    mdi_vhcache_flush_delay * TICKS_PER_SECOND;
8278 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
8279 		cv_broadcast(&vhc->vhc_cv);
8280 		create_thread = 0;
8281 	} else {
8282 		vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_THREAD;
8283 		create_thread = 1;
8284 	}
8285 	mutex_exit(&vhc->vhc_lock);
8286 
8287 	if (create_thread)
8288 		(void) thread_create(NULL, 0, vhcache_flush_thread, vhc,
8289 		    0, &p0, TS_RUN, minclsyspri);
8290 }
8291 
8292 /*
8293  * phci bus config structure - one for each phci bus config operation that
8294  * we initiate on behalf of a vhci.
8295  */
8296 typedef struct mdi_phci_bus_config_s {
8297 	char *phbc_phci_path;	/* private copy of the phci path */
8298 	struct mdi_vhci_bus_config_s *phbc_vhbusconfig;	/* vhci bus config */
8299 	struct mdi_phci_bus_config_s *phbc_next;	/* next in work list */
8300 } mdi_phci_bus_config_t;
8301 
8302 /* vhci bus config structure - one for each vhci bus config operation */
8303 typedef struct mdi_vhci_bus_config_s {
8304 	ddi_bus_config_op_t vhbc_op;	/* bus config op */
8305 	major_t vhbc_op_major;		/* bus config op major */
8306 	uint_t vhbc_op_flags;		/* bus config op flags */
8307 	kmutex_t vhbc_lock;		/* held while vhbc_thr_count changes */
8308 	kcondvar_t vhbc_cv;		/* signalled when vhbc_thr_count is 0 */
8309 	int vhbc_thr_count;		/* phci bus configs not yet finished */
8310 } mdi_vhci_bus_config_t;
8311 
8312 /*
8313  * bus config the specified phci
8314  */
8315 static void
8316 bus_config_phci(void *arg)
8317 {
8318 	mdi_phci_bus_config_t *phbc = (mdi_phci_bus_config_t *)arg;
8319 	mdi_vhci_bus_config_t *vhbc = phbc->phbc_vhbusconfig;
8320 	dev_info_t *ph_dip;
8321 
8322 	/*
8323 	 * first configure all path components upto phci and then configure
8324 	 * the phci children.
8325 	 */
8326 	if ((ph_dip = e_ddi_hold_devi_by_path(phbc->phbc_phci_path, 0))
8327 	    != NULL) {
8328 		if (vhbc->vhbc_op == BUS_CONFIG_DRIVER ||
8329 		    vhbc->vhbc_op == BUS_UNCONFIG_DRIVER) {
8330 			(void) ndi_devi_config_driver(ph_dip,
8331 			    vhbc->vhbc_op_flags,
8332 			    vhbc->vhbc_op_major);
8333 		} else
8334 			(void) ndi_devi_config(ph_dip,
8335 			    vhbc->vhbc_op_flags);
8336 
8337 		/* release the hold that e_ddi_hold_devi_by_path() placed */
8338 		ndi_rele_devi(ph_dip);
8339 	}
8340 
8341 	kmem_free(phbc->phbc_phci_path, strlen(phbc->phbc_phci_path) + 1);
8342 	kmem_free(phbc, sizeof (*phbc));
8343 
8344 	mutex_enter(&vhbc->vhbc_lock);
8345 	vhbc->vhbc_thr_count--;
8346 	if (vhbc->vhbc_thr_count == 0)
8347 		cv_broadcast(&vhbc->vhbc_cv);
8348 	mutex_exit(&vhbc->vhbc_lock);
8349 }
8350 
8351 /*
8352  * Bus config all phcis associated with the vhci in parallel.
8353  * op must be BUS_CONFIG_DRIVER or BUS_CONFIG_ALL.
8354  */
8355 static void
8356 bus_config_all_phcis(mdi_vhci_cache_t *vhcache, uint_t flags,
8357     ddi_bus_config_op_t op, major_t maj)
8358 {
8359 	mdi_phci_bus_config_t *phbc_head = NULL, *phbc, *phbc_next;
8360 	mdi_vhci_bus_config_t *vhbc;
8361 	mdi_vhcache_phci_t *cphci;
8362 
8363 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8364 	if (vhcache->vhcache_phci_head == NULL) {
8365 		rw_exit(&vhcache->vhcache_lock);
8366 		return;
8367 	}
8368 
8369 	vhbc = kmem_zalloc(sizeof (*vhbc), KM_SLEEP);
8370 
8371 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
8372 	    cphci = cphci->cphci_next) {
8373 		/* skip phcis that haven't attached before root is available */
8374 		if (!modrootloaded && (cphci->cphci_phci == NULL))
8375 			continue;
8376 		phbc = kmem_zalloc(sizeof (*phbc), KM_SLEEP);
8377 		phbc->phbc_phci_path = i_ddi_strdup(cphci->cphci_path,
8378 		    KM_SLEEP);
8379 		phbc->phbc_vhbusconfig = vhbc;
8380 		phbc->phbc_next = phbc_head;
8381 		phbc_head = phbc;
8382 		vhbc->vhbc_thr_count++;
8383 	}
8384 	rw_exit(&vhcache->vhcache_lock);
8385 
8386 	vhbc->vhbc_op = op;
8387 	vhbc->vhbc_op_major = maj;
8388 	vhbc->vhbc_op_flags = NDI_NO_EVENT |
8389 	    (flags & (NDI_CONFIG_REPROBE | NDI_DRV_CONF_REPROBE));
8390 	mutex_init(&vhbc->vhbc_lock, NULL, MUTEX_DEFAULT, NULL);
8391 	cv_init(&vhbc->vhbc_cv, NULL, CV_DRIVER, NULL);
8392 
8393 	/* now create threads to initiate bus config on all phcis in parallel */
8394 	for (phbc = phbc_head; phbc != NULL; phbc = phbc_next) {
8395 		phbc_next = phbc->phbc_next;
8396 		if (mdi_mtc_off)
8397 			bus_config_phci((void *)phbc);
8398 		else
8399 			(void) thread_create(NULL, 0, bus_config_phci, phbc,
8400 			    0, &p0, TS_RUN, minclsyspri);
8401 	}
8402 
8403 	mutex_enter(&vhbc->vhbc_lock);
8404 	/* wait until all threads exit */
8405 	while (vhbc->vhbc_thr_count > 0)
8406 		cv_wait(&vhbc->vhbc_cv, &vhbc->vhbc_lock);
8407 	mutex_exit(&vhbc->vhbc_lock);
8408 
8409 	mutex_destroy(&vhbc->vhbc_lock);
8410 	cv_destroy(&vhbc->vhbc_cv);
8411 	kmem_free(vhbc, sizeof (*vhbc));
8412 }
8413 
8414 /*
8415  * Single threaded version of bus_config_all_phcis()
8416  */
8417 static void
8418 st_bus_config_all_phcis(mdi_vhci_config_t *vhc, uint_t flags,
8419     ddi_bus_config_op_t op, major_t maj)
8420 {
8421 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8422 
8423 	single_threaded_vhconfig_enter(vhc);
8424 	bus_config_all_phcis(vhcache, flags, op, maj);
8425 	single_threaded_vhconfig_exit(vhc);
8426 }
8427 
8428 /*
8429  * Perform BUS_CONFIG_ONE on the specified child of the phci.
8430  * The path includes the child component in addition to the phci path.
8431  */
8432 static int
8433 bus_config_one_phci_child(char *path)
8434 {
8435 	dev_info_t *ph_dip, *child;
8436 	char *devnm;
8437 	int rv = MDI_FAILURE;
8438 
8439 	/* extract the child component of the phci */
8440 	devnm = strrchr(path, '/');
8441 	*devnm++ = '\0';
8442 
8443 	/*
8444 	 * first configure all path components upto phci and then
8445 	 * configure the phci child.
8446 	 */
8447 	if ((ph_dip = e_ddi_hold_devi_by_path(path, 0)) != NULL) {
8448 		if (ndi_devi_config_one(ph_dip, devnm, &child, NDI_NO_EVENT) ==
8449 		    NDI_SUCCESS) {
8450 			/*
8451 			 * release the hold that ndi_devi_config_one() placed
8452 			 */
8453 			ndi_rele_devi(child);
8454 			rv = MDI_SUCCESS;
8455 		}
8456 
8457 		/* release the hold that e_ddi_hold_devi_by_path() placed */
8458 		ndi_rele_devi(ph_dip);
8459 	}
8460 
8461 	devnm--;
8462 	*devnm = '/';
8463 	return (rv);
8464 }
8465 
8466 /*
8467  * Build a list of phci client paths for the specified vhci client.
8468  * The list includes only those phci client paths which aren't configured yet.
8469  */
8470 static mdi_phys_path_t *
8471 build_phclient_path_list(mdi_vhcache_client_t *cct, char *ct_name)
8472 {
8473 	mdi_vhcache_pathinfo_t *cpi;
8474 	mdi_phys_path_t *pp_head = NULL, *pp_tail = NULL, *pp;
8475 	int config_path, len;
8476 
8477 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8478 		/*
8479 		 * include only those paths that aren't configured.
8480 		 */
8481 		config_path = 0;
8482 		if (cpi->cpi_pip == NULL)
8483 			config_path = 1;
8484 		else {
8485 			MDI_PI_LOCK(cpi->cpi_pip);
8486 			if (MDI_PI_IS_INIT(cpi->cpi_pip))
8487 				config_path = 1;
8488 			MDI_PI_UNLOCK(cpi->cpi_pip);
8489 		}
8490 
8491 		if (config_path) {
8492 			pp = kmem_alloc(sizeof (*pp), KM_SLEEP);
8493 			len = strlen(cpi->cpi_cphci->cphci_path) +
8494 			    strlen(ct_name) + strlen(cpi->cpi_addr) + 3;
8495 			pp->phys_path = kmem_alloc(len, KM_SLEEP);
8496 			(void) snprintf(pp->phys_path, len, "%s/%s@%s",
8497 			    cpi->cpi_cphci->cphci_path, ct_name,
8498 			    cpi->cpi_addr);
8499 			pp->phys_path_next = NULL;
8500 
8501 			if (pp_head == NULL)
8502 				pp_head = pp;
8503 			else
8504 				pp_tail->phys_path_next = pp;
8505 			pp_tail = pp;
8506 		}
8507 	}
8508 
8509 	return (pp_head);
8510 }
8511 
8512 /*
8513  * Free the memory allocated for phci client path list.
8514  */
8515 static void
8516 free_phclient_path_list(mdi_phys_path_t *pp_head)
8517 {
8518 	mdi_phys_path_t *pp, *pp_next;
8519 
8520 	for (pp = pp_head; pp != NULL; pp = pp_next) {
8521 		pp_next = pp->phys_path_next;
8522 		kmem_free(pp->phys_path, strlen(pp->phys_path) + 1);
8523 		kmem_free(pp, sizeof (*pp));
8524 	}
8525 }
8526 
8527 /*
8528  * Allocated async client structure and initialize with the specified values.
8529  */
8530 static mdi_async_client_config_t *
8531 alloc_async_client_config(char *ct_name, char *ct_addr,
8532     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8533 {
8534 	mdi_async_client_config_t *acc;
8535 
8536 	acc = kmem_alloc(sizeof (*acc), KM_SLEEP);
8537 	acc->acc_ct_name = i_ddi_strdup(ct_name, KM_SLEEP);
8538 	acc->acc_ct_addr = i_ddi_strdup(ct_addr, KM_SLEEP);
8539 	acc->acc_phclient_path_list_head = pp_head;
8540 	init_vhcache_lookup_token(&acc->acc_token, tok);
8541 	acc->acc_next = NULL;
8542 	return (acc);
8543 }
8544 
8545 /*
8546  * Free the memory allocated for the async client structure and their members.
8547  */
8548 static void
8549 free_async_client_config(mdi_async_client_config_t *acc)
8550 {
8551 	if (acc->acc_phclient_path_list_head)
8552 		free_phclient_path_list(acc->acc_phclient_path_list_head);
8553 	kmem_free(acc->acc_ct_name, strlen(acc->acc_ct_name) + 1);
8554 	kmem_free(acc->acc_ct_addr, strlen(acc->acc_ct_addr) + 1);
8555 	kmem_free(acc, sizeof (*acc));
8556 }
8557 
8558 /*
8559  * Sort vhcache pathinfos (cpis) of the specified client.
8560  * All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
8561  * flag set come at the beginning of the list. All cpis which have this
8562  * flag set come at the end of the list.
8563  */
8564 static void
8565 sort_vhcache_paths(mdi_vhcache_client_t *cct)
8566 {
8567 	mdi_vhcache_pathinfo_t *cpi, *cpi_next, *cpi_head;
8568 
8569 	cpi_head = cct->cct_cpi_head;
8570 	cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
8571 	for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
8572 		cpi_next = cpi->cpi_next;
8573 		enqueue_vhcache_pathinfo(cct, cpi);
8574 	}
8575 }
8576 
8577 /*
8578  * Verify whether MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag setting is correct for
8579  * every vhcache pathinfo of the specified client. If not adjust the flag
8580  * setting appropriately.
8581  *
8582  * Note that MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag is persisted in the
8583  * on-disk vhci cache. So every time this flag is updated the cache must be
8584  * flushed.
8585  */
8586 static void
8587 adjust_sort_vhcache_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8588     mdi_vhcache_lookup_token_t *tok)
8589 {
8590 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8591 	mdi_vhcache_client_t *cct;
8592 	mdi_vhcache_pathinfo_t *cpi;
8593 
8594 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8595 	if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, tok))
8596 	    == NULL) {
8597 		rw_exit(&vhcache->vhcache_lock);
8598 		return;
8599 	}
8600 
8601 	/*
8602 	 * to avoid unnecessary on-disk cache updates, first check if an
8603 	 * update is really needed. If no update is needed simply return.
8604 	 */
8605 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8606 		if ((cpi->cpi_pip != NULL &&
8607 		    (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST)) ||
8608 		    (cpi->cpi_pip == NULL &&
8609 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))) {
8610 			break;
8611 		}
8612 	}
8613 	if (cpi == NULL) {
8614 		rw_exit(&vhcache->vhcache_lock);
8615 		return;
8616 	}
8617 
8618 	if (rw_tryupgrade(&vhcache->vhcache_lock) == 0) {
8619 		rw_exit(&vhcache->vhcache_lock);
8620 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8621 		if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr,
8622 		    tok)) == NULL) {
8623 			rw_exit(&vhcache->vhcache_lock);
8624 			return;
8625 		}
8626 	}
8627 
8628 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8629 		if (cpi->cpi_pip != NULL)
8630 			cpi->cpi_flags &= ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8631 		else
8632 			cpi->cpi_flags |= MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8633 	}
8634 	sort_vhcache_paths(cct);
8635 
8636 	rw_exit(&vhcache->vhcache_lock);
8637 	vhcache_dirty(vhc);
8638 }
8639 
8640 /*
8641  * Configure all specified paths of the client.
8642  */
8643 static void
8644 config_client_paths_sync(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8645     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8646 {
8647 	mdi_phys_path_t *pp;
8648 
8649 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next)
8650 		(void) bus_config_one_phci_child(pp->phys_path);
8651 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, tok);
8652 }
8653 
8654 /*
8655  * Dequeue elements from vhci async client config list and bus configure
8656  * their corresponding phci clients.
8657  */
8658 static void
8659 config_client_paths_thread(void *arg)
8660 {
8661 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
8662 	mdi_async_client_config_t *acc;
8663 	clock_t quit_at_ticks;
8664 	clock_t idle_time = mdi_async_config_idle_time * TICKS_PER_SECOND;
8665 	callb_cpr_t cprinfo;
8666 
8667 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
8668 	    "mdi_config_client_paths");
8669 
8670 	for (; ; ) {
8671 		quit_at_ticks = ddi_get_lbolt() + idle_time;
8672 
8673 		mutex_enter(&vhc->vhc_lock);
8674 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8675 		    vhc->vhc_acc_list_head == NULL &&
8676 		    ddi_get_lbolt() < quit_at_ticks) {
8677 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
8678 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
8679 			    quit_at_ticks);
8680 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8681 		}
8682 
8683 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
8684 		    vhc->vhc_acc_list_head == NULL)
8685 			goto out;
8686 
8687 		acc = vhc->vhc_acc_list_head;
8688 		vhc->vhc_acc_list_head = acc->acc_next;
8689 		if (vhc->vhc_acc_list_head == NULL)
8690 			vhc->vhc_acc_list_tail = NULL;
8691 		vhc->vhc_acc_count--;
8692 		mutex_exit(&vhc->vhc_lock);
8693 
8694 		config_client_paths_sync(vhc, acc->acc_ct_name,
8695 		    acc->acc_ct_addr, acc->acc_phclient_path_list_head,
8696 		    &acc->acc_token);
8697 
8698 		free_async_client_config(acc);
8699 	}
8700 
8701 out:
8702 	vhc->vhc_acc_thrcount--;
8703 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
8704 	CALLB_CPR_EXIT(&cprinfo);
8705 }
8706 
8707 /*
8708  * Arrange for all the phci client paths (pp_head) for the specified client
8709  * to be bus configured asynchronously by a thread.
8710  */
8711 static void
8712 config_client_paths_async(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8713     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8714 {
8715 	mdi_async_client_config_t *acc, *newacc;
8716 	int create_thread;
8717 
8718 	if (pp_head == NULL)
8719 		return;
8720 
8721 	if (mdi_mtc_off) {
8722 		config_client_paths_sync(vhc, ct_name, ct_addr, pp_head, tok);
8723 		free_phclient_path_list(pp_head);
8724 		return;
8725 	}
8726 
8727 	newacc = alloc_async_client_config(ct_name, ct_addr, pp_head, tok);
8728 	ASSERT(newacc);
8729 
8730 	mutex_enter(&vhc->vhc_lock);
8731 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc->acc_next) {
8732 		if (strcmp(ct_name, acc->acc_ct_name) == 0 &&
8733 		    strcmp(ct_addr, acc->acc_ct_addr) == 0) {
8734 			free_async_client_config(newacc);
8735 			mutex_exit(&vhc->vhc_lock);
8736 			return;
8737 		}
8738 	}
8739 
8740 	if (vhc->vhc_acc_list_head == NULL)
8741 		vhc->vhc_acc_list_head = newacc;
8742 	else
8743 		vhc->vhc_acc_list_tail->acc_next = newacc;
8744 	vhc->vhc_acc_list_tail = newacc;
8745 	vhc->vhc_acc_count++;
8746 	if (vhc->vhc_acc_count <= vhc->vhc_acc_thrcount) {
8747 		cv_broadcast(&vhc->vhc_cv);
8748 		create_thread = 0;
8749 	} else {
8750 		vhc->vhc_acc_thrcount++;
8751 		create_thread = 1;
8752 	}
8753 	mutex_exit(&vhc->vhc_lock);
8754 
8755 	if (create_thread)
8756 		(void) thread_create(NULL, 0, config_client_paths_thread, vhc,
8757 		    0, &p0, TS_RUN, minclsyspri);
8758 }
8759 
8760 /*
8761  * Return number of online paths for the specified client.
8762  */
8763 static int
8764 nonline_paths(mdi_vhcache_client_t *cct)
8765 {
8766 	mdi_vhcache_pathinfo_t *cpi;
8767 	int online_count = 0;
8768 
8769 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8770 		if (cpi->cpi_pip != NULL) {
8771 			MDI_PI_LOCK(cpi->cpi_pip);
8772 			if (cpi->cpi_pip->pi_state == MDI_PATHINFO_STATE_ONLINE)
8773 				online_count++;
8774 			MDI_PI_UNLOCK(cpi->cpi_pip);
8775 		}
8776 	}
8777 
8778 	return (online_count);
8779 }
8780 
8781 /*
8782  * Bus configure all paths for the specified vhci client.
8783  * If at least one path for the client is already online, the remaining paths
8784  * will be configured asynchronously. Otherwise, it synchronously configures
8785  * the paths until at least one path is online and then rest of the paths
8786  * will be configured asynchronously.
8787  */
8788 static void
8789 config_client_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr)
8790 {
8791 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8792 	mdi_phys_path_t *pp_head, *pp;
8793 	mdi_vhcache_client_t *cct;
8794 	mdi_vhcache_lookup_token_t tok;
8795 
8796 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
8797 
8798 	init_vhcache_lookup_token(&tok, NULL);
8799 
8800 	if (ct_name == NULL || ct_addr == NULL ||
8801 	    (cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, &tok))
8802 	    == NULL ||
8803 	    (pp_head = build_phclient_path_list(cct, ct_name)) == NULL) {
8804 		rw_exit(&vhcache->vhcache_lock);
8805 		return;
8806 	}
8807 
8808 	/* if at least one path is online, configure the rest asynchronously */
8809 	if (nonline_paths(cct) > 0) {
8810 		rw_exit(&vhcache->vhcache_lock);
8811 		config_client_paths_async(vhc, ct_name, ct_addr, pp_head, &tok);
8812 		return;
8813 	}
8814 
8815 	rw_exit(&vhcache->vhcache_lock);
8816 
8817 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next) {
8818 		if (bus_config_one_phci_child(pp->phys_path) == MDI_SUCCESS) {
8819 			rw_enter(&vhcache->vhcache_lock, RW_READER);
8820 
8821 			if ((cct = lookup_vhcache_client(vhcache, ct_name,
8822 			    ct_addr, &tok)) == NULL) {
8823 				rw_exit(&vhcache->vhcache_lock);
8824 				goto out;
8825 			}
8826 
8827 			if (nonline_paths(cct) > 0 &&
8828 			    pp->phys_path_next != NULL) {
8829 				rw_exit(&vhcache->vhcache_lock);
8830 				config_client_paths_async(vhc, ct_name, ct_addr,
8831 				    pp->phys_path_next, &tok);
8832 				pp->phys_path_next = NULL;
8833 				goto out;
8834 			}
8835 
8836 			rw_exit(&vhcache->vhcache_lock);
8837 		}
8838 	}
8839 
8840 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, &tok);
8841 out:
8842 	free_phclient_path_list(pp_head);
8843 }
8844 
8845 static void
8846 single_threaded_vhconfig_enter(mdi_vhci_config_t *vhc)
8847 {
8848 	mutex_enter(&vhc->vhc_lock);
8849 	while (vhc->vhc_flags & MDI_VHC_SINGLE_THREADED)
8850 		cv_wait(&vhc->vhc_cv, &vhc->vhc_lock);
8851 	vhc->vhc_flags |= MDI_VHC_SINGLE_THREADED;
8852 	mutex_exit(&vhc->vhc_lock);
8853 }
8854 
8855 static void
8856 single_threaded_vhconfig_exit(mdi_vhci_config_t *vhc)
8857 {
8858 	mutex_enter(&vhc->vhc_lock);
8859 	vhc->vhc_flags &= ~MDI_VHC_SINGLE_THREADED;
8860 	cv_broadcast(&vhc->vhc_cv);
8861 	mutex_exit(&vhc->vhc_lock);
8862 }
8863 
/*
 * One entry of a built-in phci driver table.
 */
typedef struct mdi_phci_driver_info {
	/* name of the phci driver */
	char	*phdriver_name;

	/* non-zero if the phci driver supports the root device */
	int	phdriver_root_support;
} mdi_phci_driver_info_t;
8870 
8871 /*
8872  * vhci class and root support capability of a phci driver can be
8873  * specified using ddi-vhci-class and ddi-no-root-support properties in the
8874  * phci driver.conf file. The built-in tables below contain this information
8875  * for those phci drivers whose driver.conf files don't yet contain this info.
8876  *
8877  * All phci drivers expect iscsi have root device support.
8878  */
8879 static mdi_phci_driver_info_t scsi_phci_driver_list[] = {
8880 	{ "fp", 1 },
8881 	{ "iscsi", 0 },
8882 	{ "ibsrp", 1 }
8883 	};
8884 
8885 static mdi_phci_driver_info_t ib_phci_driver_list[] = { "tavor", 1 };
8886 
8887 static void *
8888 mdi_realloc(void *old_ptr, size_t old_size, size_t new_size)
8889 {
8890 	void *new_ptr;
8891 
8892 	new_ptr = kmem_zalloc(new_size, KM_SLEEP);
8893 	if (old_ptr) {
8894 		bcopy(old_ptr, new_ptr, MIN(old_size, new_size));
8895 		kmem_free(old_ptr, old_size);
8896 	}
8897 	return (new_ptr);
8898 }
8899 
8900 static void
8901 add_to_phci_list(char ***driver_list, int **root_support_list,
8902     int *cur_elements, int *max_elements, char *driver_name, int root_support)
8903 {
8904 	ASSERT(*cur_elements <= *max_elements);
8905 	if (*cur_elements == *max_elements) {
8906 		*max_elements += 10;
8907 		*driver_list = mdi_realloc(*driver_list,
8908 		    sizeof (char *) * (*cur_elements),
8909 		    sizeof (char *) * (*max_elements));
8910 		*root_support_list = mdi_realloc(*root_support_list,
8911 		    sizeof (int) * (*cur_elements),
8912 		    sizeof (int) * (*max_elements));
8913 	}
8914 	(*driver_list)[*cur_elements] = i_ddi_strdup(driver_name, KM_SLEEP);
8915 	(*root_support_list)[*cur_elements] = root_support;
8916 	(*cur_elements)++;
8917 }
8918 
8919 static void
8920 get_phci_driver_list(char *vhci_class, char ***driver_list,
8921     int **root_support_list, int *cur_elements, int *max_elements)
8922 {
8923 	mdi_phci_driver_info_t	*st_driver_list, *p;
8924 	int		st_ndrivers, root_support, i, j, driver_conf_count;
8925 	major_t		m;
8926 	struct devnames	*dnp;
8927 	ddi_prop_t	*propp;
8928 
8929 	*driver_list = NULL;
8930 	*root_support_list = NULL;
8931 	*cur_elements = 0;
8932 	*max_elements = 0;
8933 
8934 	/* add the phci drivers derived from the phci driver.conf files */
8935 	for (m = 0; m < devcnt; m++) {
8936 		dnp = &devnamesp[m];
8937 
8938 		if (dnp->dn_flags & DN_PHCI_DRIVER) {
8939 			LOCK_DEV_OPS(&dnp->dn_lock);
8940 			if (dnp->dn_global_prop_ptr != NULL &&
8941 			    (propp = i_ddi_prop_search(DDI_DEV_T_ANY,
8942 			    DDI_VHCI_CLASS, DDI_PROP_TYPE_STRING,
8943 			    &dnp->dn_global_prop_ptr->prop_list)) != NULL &&
8944 			    strcmp(propp->prop_val, vhci_class) == 0) {
8945 
8946 				root_support = (i_ddi_prop_search(DDI_DEV_T_ANY,
8947 				    DDI_NO_ROOT_SUPPORT, DDI_PROP_TYPE_INT,
8948 				    &dnp->dn_global_prop_ptr->prop_list)
8949 				    == NULL) ? 1 : 0;
8950 
8951 				add_to_phci_list(driver_list, root_support_list,
8952 				    cur_elements, max_elements, dnp->dn_name,
8953 				    root_support);
8954 
8955 				UNLOCK_DEV_OPS(&dnp->dn_lock);
8956 			} else
8957 				UNLOCK_DEV_OPS(&dnp->dn_lock);
8958 		}
8959 	}
8960 
8961 	driver_conf_count = *cur_elements;
8962 
8963 	/* add the phci drivers specified in the built-in tables */
8964 	if (strcmp(vhci_class, MDI_HCI_CLASS_SCSI) == 0) {
8965 		st_driver_list = scsi_phci_driver_list;
8966 		st_ndrivers = sizeof (scsi_phci_driver_list) /
8967 		    sizeof (mdi_phci_driver_info_t);
8968 	} else if (strcmp(vhci_class, MDI_HCI_CLASS_IB) == 0) {
8969 		st_driver_list = ib_phci_driver_list;
8970 		st_ndrivers = sizeof (ib_phci_driver_list) /
8971 		    sizeof (mdi_phci_driver_info_t);
8972 	} else {
8973 		st_driver_list = NULL;
8974 		st_ndrivers = 0;
8975 	}
8976 
8977 	for (i = 0, p = st_driver_list; i < st_ndrivers; i++, p++) {
8978 		/* add this phci driver if not already added before */
8979 		for (j = 0; j < driver_conf_count; j++) {
8980 			if (strcmp((*driver_list)[j], p->phdriver_name) == 0)
8981 				break;
8982 		}
8983 		if (j == driver_conf_count) {
8984 			add_to_phci_list(driver_list, root_support_list,
8985 			    cur_elements, max_elements, p->phdriver_name,
8986 			    p->phdriver_root_support);
8987 		}
8988 	}
8989 }
8990 
8991 /*
8992  * Attach the phci driver instances associated with the specified vhci class.
8993  * If root is mounted attach all phci driver instances.
8994  * If root is not mounted, attach the instances of only those phci
8995  * drivers that have the root support.
8996  */
8997 static void
8998 attach_phci_drivers(char *vhci_class)
8999 {
9000 	char	**driver_list, **p;
9001 	int	*root_support_list;
9002 	int	cur_elements, max_elements, i;
9003 	major_t	m;
9004 
9005 	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
9006 	    &cur_elements, &max_elements);
9007 
9008 	for (i = 0; i < cur_elements; i++) {
9009 		if (modrootloaded || root_support_list[i]) {
9010 			m = ddi_name_to_major(driver_list[i]);
9011 			if (m != DDI_MAJOR_T_NONE &&
9012 			    ddi_hold_installed_driver(m))
9013 				ddi_rele_driver(m);
9014 		}
9015 	}
9016 
9017 	if (driver_list) {
9018 		for (i = 0, p = driver_list; i < cur_elements; i++, p++)
9019 			kmem_free(*p, strlen(*p) + 1);
9020 		kmem_free(driver_list, sizeof (char *) * max_elements);
9021 		kmem_free(root_support_list, sizeof (int) * max_elements);
9022 	}
9023 }
9024 
9025 /*
9026  * Build vhci cache:
9027  *
9028  * Attach phci driver instances and then drive BUS_CONFIG_ALL on
9029  * the phci driver instances. During this process the cache gets built.
9030  *
9031  * Cache is built fully if the root is mounted.
9032  * If the root is not mounted, phci drivers that do not have root support
9033  * are not attached. As a result the cache is built partially. The entries
9034  * in the cache reflect only those phci drivers that have root support.
9035  */
9036 static int
9037 build_vhci_cache(mdi_vhci_t *vh)
9038 {
9039 	mdi_vhci_config_t *vhc = vh->vh_config;
9040 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9041 
9042 	single_threaded_vhconfig_enter(vhc);
9043 
9044 	rw_enter(&vhcache->vhcache_lock, RW_READER);
9045 	if (vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE) {
9046 		rw_exit(&vhcache->vhcache_lock);
9047 		single_threaded_vhconfig_exit(vhc);
9048 		return (0);
9049 	}
9050 	rw_exit(&vhcache->vhcache_lock);
9051 
9052 	attach_phci_drivers(vh->vh_class);
9053 	bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE | NDI_NO_EVENT,
9054 	    BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);
9055 
9056 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
9057 	vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
9058 	rw_exit(&vhcache->vhcache_lock);
9059 
9060 	single_threaded_vhconfig_exit(vhc);
9061 	vhcache_dirty(vhc);
9062 	return (1);
9063 }
9064 
9065 /*
9066  * Determine if discovery of paths is needed.
9067  */
9068 static int
9069 vhcache_do_discovery(mdi_vhci_config_t *vhc)
9070 {
9071 	int rv = 1;
9072 
9073 	mutex_enter(&vhc->vhc_lock);
9074 	if (i_ddi_io_initialized() == 0) {
9075 		if (vhc->vhc_path_discovery_boot > 0) {
9076 			vhc->vhc_path_discovery_boot--;
9077 			goto out;
9078 		}
9079 	} else {
9080 		if (vhc->vhc_path_discovery_postboot > 0) {
9081 			vhc->vhc_path_discovery_postboot--;
9082 			goto out;
9083 		}
9084 	}
9085 
9086 	/*
9087 	 * Do full path discovery at most once per mdi_path_discovery_interval.
9088 	 * This is to avoid a series of full path discoveries when opening
9089 	 * stale /dev/[r]dsk links.
9090 	 */
9091 	if (mdi_path_discovery_interval != -1 &&
9092 	    lbolt64 >= vhc->vhc_path_discovery_cutoff_time)
9093 		goto out;
9094 
9095 	rv = 0;
9096 out:
9097 	mutex_exit(&vhc->vhc_lock);
9098 	return (rv);
9099 }
9100 
9101 /*
9102  * Discover all paths:
9103  *
9104  * Attach phci driver instances and then drive BUS_CONFIG_ALL on all the phci
9105  * driver instances. During this process all paths will be discovered.
9106  */
9107 static int
9108 vhcache_discover_paths(mdi_vhci_t *vh)
9109 {
9110 	mdi_vhci_config_t *vhc = vh->vh_config;
9111 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9112 	int rv = 0;
9113 
9114 	single_threaded_vhconfig_enter(vhc);
9115 
9116 	if (vhcache_do_discovery(vhc)) {
9117 		attach_phci_drivers(vh->vh_class);
9118 		bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE |
9119 		    NDI_NO_EVENT, BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);
9120 
9121 		mutex_enter(&vhc->vhc_lock);
9122 		vhc->vhc_path_discovery_cutoff_time = lbolt64 +
9123 		    mdi_path_discovery_interval * TICKS_PER_SECOND;
9124 		mutex_exit(&vhc->vhc_lock);
9125 		rv = 1;
9126 	}
9127 
9128 	single_threaded_vhconfig_exit(vhc);
9129 	return (rv);
9130 }
9131 
9132 /*
9133  * Generic vhci bus config implementation:
9134  *
9135  * Parameters
9136  *	vdip	vhci dip
9137  *	flags	bus config flags
9138  *	op	bus config operation
9139  *	The remaining parameters are bus config operation specific
9140  *
9141  * for BUS_CONFIG_ONE
9142  *	arg	pointer to name@addr
9143  *	child	upon successful return from this function, *child will be
9144  *		set to the configured and held devinfo child node of vdip.
9145  *	ct_addr	pointer to client address (i.e. GUID)
9146  *
9147  * for BUS_CONFIG_DRIVER
9148  *	arg	major number of the driver
9149  *	child and ct_addr parameters are ignored
9150  *
9151  * for BUS_CONFIG_ALL
9152  *	arg, child, and ct_addr parameters are ignored
9153  *
9154  * Note that for the rest of the bus config operations, this function simply
9155  * calls the framework provided default bus config routine.
9156  */
9157 int
9158 mdi_vhci_bus_config(dev_info_t *vdip, uint_t flags, ddi_bus_config_op_t op,
9159     void *arg, dev_info_t **child, char *ct_addr)
9160 {
9161 	mdi_vhci_t *vh = i_devi_get_vhci(vdip);
9162 	mdi_vhci_config_t *vhc = vh->vh_config;
9163 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9164 	int rv = 0;
9165 	int params_valid = 0;
9166 	char *cp;
9167 
9168 	/*
9169 	 * To bus config vhcis we relay operation, possibly using another
9170 	 * thread, to phcis. The phci driver then interacts with MDI to cause
9171 	 * vhci child nodes to be enumerated under the vhci node.  Adding a
9172 	 * vhci child requires an ndi_devi_enter of the vhci. Since another
9173 	 * thread may be adding the child, to avoid deadlock we can't wait
9174 	 * for the relayed operations to complete if we have already entered
9175 	 * the vhci node.
9176 	 */
9177 	if (DEVI_BUSY_OWNED(vdip)) {
9178 		MDI_DEBUG(2, (MDI_NOTE, vdip,
9179 		    "vhci dip is busy owned %p", (void *)vdip));
9180 		goto default_bus_config;
9181 	}
9182 
9183 	rw_enter(&vhcache->vhcache_lock, RW_READER);
9184 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
9185 		rw_exit(&vhcache->vhcache_lock);
9186 		rv = build_vhci_cache(vh);
9187 		rw_enter(&vhcache->vhcache_lock, RW_READER);
9188 	}
9189 
9190 	switch (op) {
9191 	case BUS_CONFIG_ONE:
9192 		if (arg != NULL && ct_addr != NULL) {
9193 			/* extract node name */
9194 			cp = (char *)arg;
9195 			while (*cp != '\0' && *cp != '@')
9196 				cp++;
9197 			if (*cp == '@') {
9198 				params_valid = 1;
9199 				*cp = '\0';
9200 				config_client_paths(vhc, (char *)arg, ct_addr);
9201 				/* config_client_paths() releases cache_lock */
9202 				*cp = '@';
9203 				break;
9204 			}
9205 		}
9206 
9207 		rw_exit(&vhcache->vhcache_lock);
9208 		break;
9209 
9210 	case BUS_CONFIG_DRIVER:
9211 		rw_exit(&vhcache->vhcache_lock);
9212 		if (rv == 0)
9213 			st_bus_config_all_phcis(vhc, flags, op,
9214 			    (major_t)(uintptr_t)arg);
9215 		break;
9216 
9217 	case BUS_CONFIG_ALL:
9218 		rw_exit(&vhcache->vhcache_lock);
9219 		if (rv == 0)
9220 			st_bus_config_all_phcis(vhc, flags, op, -1);
9221 		break;
9222 
9223 	default:
9224 		rw_exit(&vhcache->vhcache_lock);
9225 		break;
9226 	}
9227 
9228 
9229 default_bus_config:
9230 	/*
9231 	 * All requested child nodes are enumerated under the vhci.
9232 	 * Now configure them.
9233 	 */
9234 	if (ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
9235 	    NDI_SUCCESS) {
9236 		return (MDI_SUCCESS);
9237 	} else if (op == BUS_CONFIG_ONE && rv == 0 && params_valid) {
9238 		/* discover all paths and try configuring again */
9239 		if (vhcache_discover_paths(vh) &&
9240 		    ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
9241 		    NDI_SUCCESS)
9242 			return (MDI_SUCCESS);
9243 	}
9244 
9245 	return (MDI_FAILURE);
9246 }
9247 
9248 /*
9249  * Read the on-disk vhci cache into an nvlist for the specified vhci class.
9250  */
9251 static nvlist_t *
9252 read_on_disk_vhci_cache(char *vhci_class)
9253 {
9254 	nvlist_t *nvl;
9255 	int err;
9256 	char *filename;
9257 
9258 	filename = vhclass2vhcache_filename(vhci_class);
9259 
9260 	if ((err = fread_nvlist(filename, &nvl)) == 0) {
9261 		kmem_free(filename, strlen(filename) + 1);
9262 		return (nvl);
9263 	} else if (err == EIO)
9264 		cmn_err(CE_WARN, "%s: I/O error, will recreate", filename);
9265 	else if (err == EINVAL)
9266 		cmn_err(CE_WARN,
9267 		    "%s: data file corrupted, will recreate", filename);
9268 
9269 	kmem_free(filename, strlen(filename) + 1);
9270 	return (NULL);
9271 }
9272 
9273 /*
9274  * Read on-disk vhci cache into nvlists for all vhci classes.
9275  * Called during booting by i_ddi_read_devices_files().
9276  */
9277 void
9278 mdi_read_devices_files(void)
9279 {
9280 	int i;
9281 
9282 	for (i = 0; i < N_VHCI_CLASSES; i++)
9283 		vhcache_nvl[i] = read_on_disk_vhci_cache(vhci_class_list[i]);
9284 }
9285 
9286 /*
9287  * Remove all stale entries from vhci cache.
9288  */
9289 static void
9290 clean_vhcache(mdi_vhci_config_t *vhc)
9291 {
9292 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9293 	mdi_vhcache_phci_t *cphci, *cphci_head, *cphci_next;
9294 	mdi_vhcache_client_t *cct, *cct_head, *cct_next;
9295 	mdi_vhcache_pathinfo_t *cpi, *cpi_head, *cpi_next;
9296 
9297 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
9298 
9299 	cct_head = vhcache->vhcache_client_head;
9300 	vhcache->vhcache_client_head = vhcache->vhcache_client_tail = NULL;
9301 	for (cct = cct_head; cct != NULL; cct = cct_next) {
9302 		cct_next = cct->cct_next;
9303 
9304 		cpi_head = cct->cct_cpi_head;
9305 		cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
9306 		for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
9307 			cpi_next = cpi->cpi_next;
9308 			if (cpi->cpi_pip != NULL) {
9309 				ASSERT(cpi->cpi_cphci->cphci_phci != NULL);
9310 				enqueue_tail_vhcache_pathinfo(cct, cpi);
9311 			} else
9312 				free_vhcache_pathinfo(cpi);
9313 		}
9314 
9315 		if (cct->cct_cpi_head != NULL)
9316 			enqueue_vhcache_client(vhcache, cct);
9317 		else {
9318 			(void) mod_hash_destroy(vhcache->vhcache_client_hash,
9319 			    (mod_hash_key_t)cct->cct_name_addr);
9320 			free_vhcache_client(cct);
9321 		}
9322 	}
9323 
9324 	cphci_head = vhcache->vhcache_phci_head;
9325 	vhcache->vhcache_phci_head = vhcache->vhcache_phci_tail = NULL;
9326 	for (cphci = cphci_head; cphci != NULL; cphci = cphci_next) {
9327 		cphci_next = cphci->cphci_next;
9328 		if (cphci->cphci_phci != NULL)
9329 			enqueue_vhcache_phci(vhcache, cphci);
9330 		else
9331 			free_vhcache_phci(cphci);
9332 	}
9333 
9334 	vhcache->vhcache_clean_time = lbolt64;
9335 	rw_exit(&vhcache->vhcache_lock);
9336 	vhcache_dirty(vhc);
9337 }
9338 
9339 /*
9340  * Remove all stale entries from vhci cache.
9341  * Called by i_ddi_clean_devices_files() during the execution of devfsadm -C
9342  */
9343 void
9344 mdi_clean_vhcache(void)
9345 {
9346 	mdi_vhci_t *vh;
9347 
9348 	mutex_enter(&mdi_mutex);
9349 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9350 		vh->vh_refcnt++;
9351 		mutex_exit(&mdi_mutex);
9352 		clean_vhcache(vh->vh_config);
9353 		mutex_enter(&mdi_mutex);
9354 		vh->vh_refcnt--;
9355 	}
9356 	mutex_exit(&mdi_mutex);
9357 }
9358 
9359 /*
9360  * mdi_vhci_walk_clients():
9361  *		Walker routine to traverse client dev_info nodes
9362  * ddi_walk_devs(ddi_get_child(vdip), f, arg) returns the entire tree
9363  * below the client, including nexus devices, which we dont want.
9364  * So we just traverse the immediate siblings, starting from 1st client.
9365  */
9366 void
9367 mdi_vhci_walk_clients(dev_info_t *vdip,
9368     int (*f)(dev_info_t *, void *), void *arg)
9369 {
9370 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9371 	dev_info_t	*cdip;
9372 	mdi_client_t	*ct;
9373 
9374 	MDI_VHCI_CLIENT_LOCK(vh);
9375 	cdip = ddi_get_child(vdip);
9376 	while (cdip) {
9377 		ct = i_devi_get_client(cdip);
9378 		MDI_CLIENT_LOCK(ct);
9379 
9380 		if (((*f)(cdip, arg)) == DDI_WALK_CONTINUE)
9381 			cdip = ddi_get_next_sibling(cdip);
9382 		else
9383 			cdip = NULL;
9384 
9385 		MDI_CLIENT_UNLOCK(ct);
9386 	}
9387 	MDI_VHCI_CLIENT_UNLOCK(vh);
9388 }
9389 
9390 /*
9391  * mdi_vhci_walk_phcis():
9392  *		Walker routine to traverse phci dev_info nodes
9393  */
9394 void
9395 mdi_vhci_walk_phcis(dev_info_t *vdip,
9396     int (*f)(dev_info_t *, void *), void *arg)
9397 {
9398 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9399 	mdi_phci_t	*ph, *next;
9400 
9401 	MDI_VHCI_PHCI_LOCK(vh);
9402 	ph = vh->vh_phci_head;
9403 	while (ph) {
9404 		MDI_PHCI_LOCK(ph);
9405 
9406 		if (((*f)(ph->ph_dip, arg)) == DDI_WALK_CONTINUE)
9407 			next = ph->ph_next;
9408 		else
9409 			next = NULL;
9410 
9411 		MDI_PHCI_UNLOCK(ph);
9412 		ph = next;
9413 	}
9414 	MDI_VHCI_PHCI_UNLOCK(vh);
9415 }
9416 
9417 
9418 /*
9419  * mdi_walk_vhcis():
9420  *		Walker routine to traverse vhci dev_info nodes
9421  */
9422 void
9423 mdi_walk_vhcis(int (*f)(dev_info_t *, void *), void *arg)
9424 {
9425 	mdi_vhci_t	*vh = NULL;
9426 
9427 	mutex_enter(&mdi_mutex);
9428 	/*
9429 	 * Scan for already registered vhci
9430 	 */
9431 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9432 		vh->vh_refcnt++;
9433 		mutex_exit(&mdi_mutex);
9434 		if (((*f)(vh->vh_dip, arg)) != DDI_WALK_CONTINUE) {
9435 			mutex_enter(&mdi_mutex);
9436 			vh->vh_refcnt--;
9437 			break;
9438 		} else {
9439 			mutex_enter(&mdi_mutex);
9440 			vh->vh_refcnt--;
9441 		}
9442 	}
9443 
9444 	mutex_exit(&mdi_mutex);
9445 }
9446 
9447 /*
9448  * i_mdi_log_sysevent():
9449  *		Logs events for pickup by syseventd
9450  */
9451 static void
9452 i_mdi_log_sysevent(dev_info_t *dip, char *ph_vh_class, char *subclass)
9453 {
9454 	char		*path_name;
9455 	nvlist_t	*attr_list;
9456 
9457 	if (nvlist_alloc(&attr_list, NV_UNIQUE_NAME_TYPE,
9458 	    KM_SLEEP) != DDI_SUCCESS) {
9459 		goto alloc_failed;
9460 	}
9461 
9462 	path_name = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
9463 	(void) ddi_pathname(dip, path_name);
9464 
9465 	if (nvlist_add_string(attr_list, DDI_DRIVER_NAME,
9466 	    ddi_driver_name(dip)) != DDI_SUCCESS) {
9467 		goto error;
9468 	}
9469 
9470 	if (nvlist_add_int32(attr_list, DDI_DRIVER_MAJOR,
9471 	    (int32_t)ddi_driver_major(dip)) != DDI_SUCCESS) {
9472 		goto error;
9473 	}
9474 
9475 	if (nvlist_add_int32(attr_list, DDI_INSTANCE,
9476 	    (int32_t)ddi_get_instance(dip)) != DDI_SUCCESS) {
9477 		goto error;
9478 	}
9479 
9480 	if (nvlist_add_string(attr_list, DDI_PATHNAME,
9481 	    path_name) != DDI_SUCCESS) {
9482 		goto error;
9483 	}
9484 
9485 	if (nvlist_add_string(attr_list, DDI_CLASS,
9486 	    ph_vh_class) != DDI_SUCCESS) {
9487 		goto error;
9488 	}
9489 
9490 	(void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW, EC_DDI, subclass,
9491 	    attr_list, NULL, DDI_SLEEP);
9492 
9493 error:
9494 	kmem_free(path_name, MAXPATHLEN);
9495 	nvlist_free(attr_list);
9496 	return;
9497 
9498 alloc_failed:
9499 	MDI_DEBUG(1, (MDI_WARN, dip, "!unable to send sysevent"));
9500 }
9501 
9502 char **
9503 mdi_get_phci_driver_list(char *vhci_class, int	*ndrivers)
9504 {
9505 	char	**driver_list, **ret_driver_list = NULL;
9506 	int	*root_support_list;
9507 	int	cur_elements, max_elements;
9508 
9509 	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
9510 	    &cur_elements, &max_elements);
9511 
9512 
9513 	if (driver_list) {
9514 		kmem_free(root_support_list, sizeof (int) * max_elements);
9515 		ret_driver_list = mdi_realloc(driver_list, sizeof (char *)
9516 		    * max_elements, sizeof (char *) * cur_elements);
9517 	}
9518 	*ndrivers = cur_elements;
9519 
9520 	return (ret_driver_list);
9521 
9522 }
9523 
9524 void
9525 mdi_free_phci_driver_list(char **driver_list, int ndrivers)
9526 {
9527 	char	**p;
9528 	int	i;
9529 
9530 	if (driver_list) {
9531 		for (i = 0, p = driver_list; i < ndrivers; i++, p++)
9532 			kmem_free(*p, strlen(*p) + 1);
9533 		kmem_free(driver_list, sizeof (char *) * ndrivers);
9534 	}
9535 }
9536 
9537 /*
9538  * mdi_is_dev_supported():
9539  *		function called by pHCI bus config operation to determine if a
9540  *		device should be represented as a child of the vHCI or the
9541  *		pHCI.  This decision is made by the vHCI, using cinfo idenity
9542  *		information passed by the pHCI - specifics of the cinfo
9543  *		representation are by agreement between the pHCI and vHCI.
9544  * Return Values:
9545  *		MDI_SUCCESS
9546  *		MDI_FAILURE
9547  */
9548 int
9549 mdi_is_dev_supported(char *class, dev_info_t *pdip, void *cinfo)
9550 {
9551 	mdi_vhci_t	*vh;
9552 
9553 	ASSERT(class && pdip);
9554 
9555 	/*
9556 	 * For dev_supported, mdi_phci_register() must have established pdip as
9557 	 * a pHCI.
9558 	 *
9559 	 * NOTE: mdi_phci_register() does "mpxio-disable" processing, and
9560 	 * MDI_PHCI(pdip) will return false if mpxio is disabled.
9561 	 */
9562 	if (!MDI_PHCI(pdip))
9563 		return (MDI_FAILURE);
9564 
9565 	/* Return MDI_FAILURE if vHCI does not support asking the question. */
9566 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
9567 	if ((vh == NULL) || (vh->vh_ops->vo_is_dev_supported == NULL)) {
9568 		return (MDI_FAILURE);
9569 	}
9570 
9571 	/* Return vHCI answer */
9572 	return (vh->vh_ops->vo_is_dev_supported(vh->vh_dip, pdip, cinfo));
9573 }
9574 
9575 int
9576 mdi_dc_return_dev_state(mdi_pathinfo_t *pip, struct devctl_iocdata *dcp)
9577 {
9578 	uint_t devstate = 0;
9579 	dev_info_t *cdip;
9580 
9581 	if ((pip == NULL) || (dcp == NULL))
9582 		return (MDI_FAILURE);
9583 
9584 	cdip = mdi_pi_get_client(pip);
9585 
9586 	switch (mdi_pi_get_state(pip)) {
9587 	case MDI_PATHINFO_STATE_INIT:
9588 		devstate = DEVICE_DOWN;
9589 		break;
9590 	case MDI_PATHINFO_STATE_ONLINE:
9591 		devstate = DEVICE_ONLINE;
9592 		if ((cdip) && (devi_stillreferenced(cdip) == DEVI_REFERENCED))
9593 			devstate |= DEVICE_BUSY;
9594 		break;
9595 	case MDI_PATHINFO_STATE_STANDBY:
9596 		devstate = DEVICE_ONLINE;
9597 		break;
9598 	case MDI_PATHINFO_STATE_FAULT:
9599 		devstate = DEVICE_DOWN;
9600 		break;
9601 	case MDI_PATHINFO_STATE_OFFLINE:
9602 		devstate = DEVICE_OFFLINE;
9603 		break;
9604 	default:
9605 		ASSERT(MDI_PI(pip)->pi_state);
9606 	}
9607 
9608 	if (copyout(&devstate, dcp->cpyout_buf, sizeof (uint_t)) != 0)
9609 		return (MDI_FAILURE);
9610 
9611 	return (MDI_SUCCESS);
9612 }
9613