xref: /titanic_44/usr/src/uts/common/os/sunmdi.c (revision 4f764f916501bcb9d3233dc547db1928fc2f22ac)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
/*
 * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more
 * detailed discussion of the overall mpxio architecture.
 *
 * Default locking order:
 *
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_phci_mutex))
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_client_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_phci_mutex mdi_phci::ph_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_client_mutex mdi_client::ct_mutex))
 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
 * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
 */
39 
40 #include <sys/note.h>
41 #include <sys/types.h>
42 #include <sys/varargs.h>
43 #include <sys/param.h>
44 #include <sys/errno.h>
45 #include <sys/uio.h>
46 #include <sys/buf.h>
47 #include <sys/modctl.h>
48 #include <sys/open.h>
49 #include <sys/kmem.h>
50 #include <sys/poll.h>
51 #include <sys/conf.h>
52 #include <sys/bootconf.h>
53 #include <sys/cmn_err.h>
54 #include <sys/stat.h>
55 #include <sys/ddi.h>
56 #include <sys/sunddi.h>
57 #include <sys/ddipropdefs.h>
58 #include <sys/sunndi.h>
59 #include <sys/ndi_impldefs.h>
60 #include <sys/promif.h>
61 #include <sys/sunmdi.h>
62 #include <sys/mdi_impldefs.h>
63 #include <sys/taskq.h>
64 #include <sys/epm.h>
65 #include <sys/sunpm.h>
66 #include <sys/modhash.h>
67 #include <sys/disp.h>
68 #include <sys/autoconf.h>
69 #include <sys/sysmacros.h>
70 
#ifdef	DEBUG
#include <sys/debug.h>
int	mdi_debug = 1;
int	mdi_debug_logonly = 0;
/*
 * MDI_DEBUG(level, (MDI_xxx, dip, fmt, ...)):
 *	Call i_mdi_log() with the parenthesized argument list when mdi_debug
 *	is at least 'level'.  The body is wrapped in do { } while (0) so that
 *	a use such as
 *		if (cond) MDI_DEBUG(...); else ...
 *	parses the same way in DEBUG and non-DEBUG builds; a bare 'if' in the
 *	expansion would capture the caller's 'else'.
 *
 * MDI_WARN/MDI_NOTE/MDI_CONT supply the first two i_mdi_log() arguments:
 * the cmn_err level and the name of the calling function.
 */
#define	MDI_DEBUG(dbglevel, pargs)				\
	do {							\
		if (mdi_debug >= (dbglevel))			\
			i_mdi_log pargs;			\
		_NOTE(CONSTCOND)				\
	} while (0)
#define	MDI_WARN	CE_WARN, __func__
#define	MDI_NOTE	CE_NOTE, __func__
#define	MDI_CONT	CE_CONT, __func__
static void i_mdi_log(int, const char *, dev_info_t *, const char *, ...);
#else	/* !DEBUG */
/* Debug logging compiles away entirely in non-DEBUG builds. */
#define	MDI_DEBUG(dbglevel, pargs)
#endif	/* DEBUG */
int	mdi_debug_consoleonly = 0;
int	mdi_delay = 3;
85 
86 extern pri_t	minclsyspri;
87 extern int	modrootloaded;
88 
89 /*
90  * Global mutex:
91  * Protects vHCI list and structure members.
92  */
93 kmutex_t	mdi_mutex;
94 
95 /*
96  * Registered vHCI class driver lists
97  */
98 int		mdi_vhci_count;
99 mdi_vhci_t	*mdi_vhci_head;
100 mdi_vhci_t	*mdi_vhci_tail;
101 
102 /*
103  * Client Hash Table size
104  */
105 static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;
106 
107 /*
108  * taskq interface definitions
109  */
110 #define	MDI_TASKQ_N_THREADS	8
111 #define	MDI_TASKQ_PRI		minclsyspri
112 #define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
113 #define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)
114 
115 taskq_t				*mdi_taskq;
116 static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;
117 
118 #define	TICKS_PER_SECOND	(drv_usectohz(1000000))
119 
120 /*
121  * The data should be "quiet" for this interval (in seconds) before the
122  * vhci cached data is flushed to the disk.
123  */
124 static int mdi_vhcache_flush_delay = 10;
125 
126 /* number of seconds the vhcache flush daemon will sleep idle before exiting */
127 static int mdi_vhcache_flush_daemon_idle_time = 60;
128 
129 /*
130  * MDI falls back to discovery of all paths when a bus_config_one fails.
131  * The following parameters can be used to tune this operation.
132  *
133  * mdi_path_discovery_boot
134  *	Number of times path discovery will be attempted during early boot.
135  *	Probably there is no reason to ever set this value to greater than one.
136  *
137  * mdi_path_discovery_postboot
138  *	Number of times path discovery will be attempted after early boot.
139  *	Set it to a minimum of two to allow for discovery of iscsi paths which
140  *	may happen very late during booting.
141  *
142  * mdi_path_discovery_interval
143  *	Minimum number of seconds MDI will wait between successive discovery
144  *	of all paths. Set it to -1 to disable discovery of all paths.
145  */
146 static int mdi_path_discovery_boot = 1;
147 static int mdi_path_discovery_postboot = 2;
148 static int mdi_path_discovery_interval = 10;
149 
150 /*
151  * number of seconds the asynchronous configuration thread will sleep idle
152  * before exiting.
153  */
154 static int mdi_async_config_idle_time = 600;
155 
156 static int mdi_bus_config_cache_hash_size = 256;
157 
158 /* turns off multithreaded configuration for certain operations */
159 static int mdi_mtc_off = 0;
160 
161 /*
162  * The "path" to a pathinfo node is identical to the /devices path to a
163  * devinfo node had the device been enumerated under a pHCI instead of
164  * a vHCI.  This pathinfo "path" is associated with a 'path_instance'.
165  * This association persists across create/delete of the pathinfo nodes,
166  * but not across reboot.
167  */
168 static uint_t		mdi_pathmap_instance = 1;	/* 0 -> any path */
169 static int		mdi_pathmap_hash_size = 256;
170 static kmutex_t		mdi_pathmap_mutex;
171 static mod_hash_t	*mdi_pathmap_bypath;		/* "path"->instance */
172 static mod_hash_t	*mdi_pathmap_byinstance;	/* instance->"path" */
173 static mod_hash_t	*mdi_pathmap_sbyinstance;	/* inst->shortpath */
174 
175 /*
176  * MDI component property name/value string definitions
177  */
178 const char 		*mdi_component_prop = "mpxio-component";
179 const char		*mdi_component_prop_vhci = "vhci";
180 const char		*mdi_component_prop_phci = "phci";
181 const char		*mdi_component_prop_client = "client";
182 
183 /*
184  * MDI client global unique identifier property name
185  */
186 const char		*mdi_client_guid_prop = "client-guid";
187 
188 /*
189  * MDI client load balancing property name/value string definitions
190  */
191 const char		*mdi_load_balance = "load-balance";
192 const char		*mdi_load_balance_none = "none";
193 const char		*mdi_load_balance_rr = "round-robin";
194 const char		*mdi_load_balance_lba = "logical-block";
195 
196 /*
197  * Obsolete vHCI class definition; to be removed after Leadville update
198  */
199 const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;
200 
201 static char vhci_greeting[] =
202 	"\tThere already exists one vHCI driver for class %s\n"
203 	"\tOnly one vHCI driver for each class is allowed\n";
204 
205 /*
206  * Static function prototypes
207  */
208 static int		i_mdi_phci_offline(dev_info_t *, uint_t);
209 static int		i_mdi_client_offline(dev_info_t *, uint_t);
210 static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
211 static void		i_mdi_phci_post_detach(dev_info_t *,
212 			    ddi_detach_cmd_t, int);
213 static int		i_mdi_client_pre_detach(dev_info_t *,
214 			    ddi_detach_cmd_t);
215 static void		i_mdi_client_post_detach(dev_info_t *,
216 			    ddi_detach_cmd_t, int);
217 static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
218 static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
219 static int 		i_mdi_lba_lb(mdi_client_t *ct,
220 			    mdi_pathinfo_t **ret_pip, struct buf *buf);
221 static void		i_mdi_pm_hold_client(mdi_client_t *, int);
222 static void		i_mdi_pm_rele_client(mdi_client_t *, int);
223 static void		i_mdi_pm_reset_client(mdi_client_t *);
224 static int		i_mdi_power_all_phci(mdi_client_t *);
225 static void		i_mdi_log_sysevent(dev_info_t *, char *, char *);
226 
227 
228 /*
229  * Internal mdi_pathinfo node functions
230  */
231 static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);
232 
233 static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
234 static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
235 static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
236 static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
237 static void		i_mdi_phci_unlock(mdi_phci_t *);
238 static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
239 static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
240 static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
241 static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
242 			    mdi_client_t *);
243 static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
244 static void		i_mdi_client_remove_path(mdi_client_t *,
245 			    mdi_pathinfo_t *);
246 
247 static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
248 			    mdi_pathinfo_state_t, int);
249 static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
250 static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
251 			    char **, int);
252 static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
253 static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
254 static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
255 static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
256 static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
257 static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
258 static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
259 static void		i_mdi_client_update_state(mdi_client_t *);
260 static int		i_mdi_client_compute_state(mdi_client_t *,
261 			    mdi_phci_t *);
262 static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
263 static void		i_mdi_client_unlock(mdi_client_t *);
264 static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
265 static mdi_client_t	*i_devi_get_client(dev_info_t *);
266 /*
267  * NOTE: this will be removed once the NWS files are changed to use the new
268  * mdi_{enable,disable}_path interfaces
269  */
270 static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *,
271 				int, int);
272 static mdi_pathinfo_t 	*i_mdi_enable_disable_path(mdi_pathinfo_t *pip,
273 				mdi_vhci_t *vh, int flags, int op);
274 /*
275  * Failover related function prototypes
276  */
277 static int		i_mdi_failover(void *);
278 
279 /*
280  * misc internal functions
281  */
282 static int		i_mdi_get_hash_key(char *);
283 static int		i_map_nvlist_error_to_mdi(int);
284 static void		i_mdi_report_path_state(mdi_client_t *,
285 			    mdi_pathinfo_t *);
286 
287 static void		setup_vhci_cache(mdi_vhci_t *);
288 static int		destroy_vhci_cache(mdi_vhci_t *);
289 static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
290 static boolean_t	stop_vhcache_flush_thread(void *, int);
291 static void		free_string_array(char **, int);
292 static void		free_vhcache_phci(mdi_vhcache_phci_t *);
293 static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
294 static void		free_vhcache_client(mdi_vhcache_client_t *);
295 static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
296 static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
297 static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
298 static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
299 static void		vhcache_pi_add(mdi_vhci_config_t *,
300 			    struct mdi_pathinfo *);
301 static void		vhcache_pi_remove(mdi_vhci_config_t *,
302 			    struct mdi_pathinfo *);
303 static void		free_phclient_path_list(mdi_phys_path_t *);
304 static void		sort_vhcache_paths(mdi_vhcache_client_t *);
305 static int		flush_vhcache(mdi_vhci_config_t *, int);
306 static void		vhcache_dirty(mdi_vhci_config_t *);
307 static void		free_async_client_config(mdi_async_client_config_t *);
308 static void		single_threaded_vhconfig_enter(mdi_vhci_config_t *);
309 static void		single_threaded_vhconfig_exit(mdi_vhci_config_t *);
310 static nvlist_t		*read_on_disk_vhci_cache(char *);
311 extern int		fread_nvlist(char *, nvlist_t **);
312 extern int		fwrite_nvlist(char *, nvlist_t *);
313 
314 /* called once when first vhci registers with mdi */
315 static void
316 i_mdi_init()
317 {
318 	static int initialized = 0;
319 
320 	if (initialized)
321 		return;
322 	initialized = 1;
323 
324 	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);
325 
326 	/* Create our taskq resources */
327 	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
328 	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
329 	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
330 	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */
331 
332 	/* Allocate ['path_instance' <-> "path"] maps */
333 	mutex_init(&mdi_pathmap_mutex, NULL, MUTEX_DRIVER, NULL);
334 	mdi_pathmap_bypath = mod_hash_create_strhash(
335 	    "mdi_pathmap_bypath", mdi_pathmap_hash_size,
336 	    mod_hash_null_valdtor);
337 	mdi_pathmap_byinstance = mod_hash_create_idhash(
338 	    "mdi_pathmap_byinstance", mdi_pathmap_hash_size,
339 	    mod_hash_null_valdtor);
340 	mdi_pathmap_sbyinstance = mod_hash_create_idhash(
341 	    "mdi_pathmap_sbyinstance", mdi_pathmap_hash_size,
342 	    mod_hash_null_valdtor);
343 }
344 
345 /*
346  * mdi_get_component_type():
347  *		Return mpxio component type
348  * Return Values:
349  *		MDI_COMPONENT_NONE
350  *		MDI_COMPONENT_VHCI
351  *		MDI_COMPONENT_PHCI
352  *		MDI_COMPONENT_CLIENT
353  * XXX This doesn't work under multi-level MPxIO and should be
354  *	removed when clients migrate mdi_component_is_*() interfaces.
355  */
356 int
357 mdi_get_component_type(dev_info_t *dip)
358 {
359 	return (DEVI(dip)->devi_mdi_component);
360 }
361 
362 /*
363  * mdi_vhci_register():
364  *		Register a vHCI module with the mpxio framework
365  *		mdi_vhci_register() is called by vHCI drivers to register the
366  *		'class_driver' vHCI driver and its MDI entrypoints with the
367  *		mpxio framework.  The vHCI driver must call this interface as
368  *		part of its attach(9e) handler.
369  *		Competing threads may try to attach mdi_vhci_register() as
370  *		the vHCI drivers are loaded and attached as a result of pHCI
371  *		driver instance registration (mdi_phci_register()) with the
372  *		framework.
373  * Return Values:
374  *		MDI_SUCCESS
375  *		MDI_FAILURE
376  */
377 /*ARGSUSED*/
378 int
379 mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
380     int flags)
381 {
382 	mdi_vhci_t		*vh = NULL;
383 
384 	/* Registrant can't be older */
385 	ASSERT(vops->vo_revision <= MDI_VHCI_OPS_REV);
386 
387 #ifdef DEBUG
388 	/*
389 	 * IB nexus driver is loaded only when IB hardware is present.
390 	 * In order to be able to do this there is a need to drive the loading
391 	 * and attaching of the IB nexus driver (especially when an IB hardware
392 	 * is dynamically plugged in) when an IB HCA driver (PHCI)
393 	 * is being attached. Unfortunately this gets into the limitations
394 	 * of devfs as there seems to be no clean way to drive configuration
395 	 * of a subtree from another subtree of a devfs. Hence, do not ASSERT
396 	 * for IB.
397 	 */
398 	if (strcmp(class, MDI_HCI_CLASS_IB) != 0)
399 		ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
400 #endif
401 
402 	i_mdi_init();
403 
404 	mutex_enter(&mdi_mutex);
405 	/*
406 	 * Scan for already registered vhci
407 	 */
408 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
409 		if (strcmp(vh->vh_class, class) == 0) {
410 			/*
411 			 * vHCI has already been created.  Check for valid
412 			 * vHCI ops registration.  We only support one vHCI
413 			 * module per class
414 			 */
415 			if (vh->vh_ops != NULL) {
416 				mutex_exit(&mdi_mutex);
417 				cmn_err(CE_NOTE, vhci_greeting, class);
418 				return (MDI_FAILURE);
419 			}
420 			break;
421 		}
422 	}
423 
424 	/*
425 	 * if not yet created, create the vHCI component
426 	 */
427 	if (vh == NULL) {
428 		struct client_hash	*hash = NULL;
429 		char			*load_balance;
430 
431 		/*
432 		 * Allocate and initialize the mdi extensions
433 		 */
434 		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
435 		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
436 		    KM_SLEEP);
437 		vh->vh_client_table = hash;
438 		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
439 		(void) strcpy(vh->vh_class, class);
440 		vh->vh_lb = LOAD_BALANCE_RR;
441 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
442 		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
443 			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
444 				vh->vh_lb = LOAD_BALANCE_NONE;
445 			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
446 				    == 0) {
447 				vh->vh_lb = LOAD_BALANCE_LBA;
448 			}
449 			ddi_prop_free(load_balance);
450 		}
451 
452 		mutex_init(&vh->vh_phci_mutex, NULL, MUTEX_DEFAULT, NULL);
453 		mutex_init(&vh->vh_client_mutex, NULL, MUTEX_DEFAULT, NULL);
454 
455 		/*
456 		 * Store the vHCI ops vectors
457 		 */
458 		vh->vh_dip = vdip;
459 		vh->vh_ops = vops;
460 
461 		setup_vhci_cache(vh);
462 
463 		if (mdi_vhci_head == NULL) {
464 			mdi_vhci_head = vh;
465 		}
466 		if (mdi_vhci_tail) {
467 			mdi_vhci_tail->vh_next = vh;
468 		}
469 		mdi_vhci_tail = vh;
470 		mdi_vhci_count++;
471 	}
472 
473 	/*
474 	 * Claim the devfs node as a vhci component
475 	 */
476 	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;
477 
478 	/*
479 	 * Initialize our back reference from dev_info node
480 	 */
481 	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
482 	mutex_exit(&mdi_mutex);
483 	return (MDI_SUCCESS);
484 }
485 
486 /*
487  * mdi_vhci_unregister():
488  *		Unregister a vHCI module from mpxio framework
489  *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
490  * 		of a vhci to unregister it from the framework.
491  * Return Values:
492  *		MDI_SUCCESS
493  *		MDI_FAILURE
494  */
495 /*ARGSUSED*/
496 int
497 mdi_vhci_unregister(dev_info_t *vdip, int flags)
498 {
499 	mdi_vhci_t	*found, *vh, *prev = NULL;
500 
501 	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
502 
503 	/*
504 	 * Check for invalid VHCI
505 	 */
506 	if ((vh = i_devi_get_vhci(vdip)) == NULL)
507 		return (MDI_FAILURE);
508 
509 	/*
510 	 * Scan the list of registered vHCIs for a match
511 	 */
512 	mutex_enter(&mdi_mutex);
513 	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
514 		if (found == vh)
515 			break;
516 		prev = found;
517 	}
518 
519 	if (found == NULL) {
520 		mutex_exit(&mdi_mutex);
521 		return (MDI_FAILURE);
522 	}
523 
524 	/*
525 	 * Check the vHCI, pHCI and client count. All the pHCIs and clients
526 	 * should have been unregistered, before a vHCI can be
527 	 * unregistered.
528 	 */
529 	MDI_VHCI_PHCI_LOCK(vh);
530 	if (vh->vh_refcnt || vh->vh_phci_count || vh->vh_client_count) {
531 		MDI_VHCI_PHCI_UNLOCK(vh);
532 		mutex_exit(&mdi_mutex);
533 		return (MDI_FAILURE);
534 	}
535 	MDI_VHCI_PHCI_UNLOCK(vh);
536 
537 	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
538 		mutex_exit(&mdi_mutex);
539 		return (MDI_FAILURE);
540 	}
541 
542 	/*
543 	 * Remove the vHCI from the global list
544 	 */
545 	if (vh == mdi_vhci_head) {
546 		mdi_vhci_head = vh->vh_next;
547 	} else {
548 		prev->vh_next = vh->vh_next;
549 	}
550 	if (vh == mdi_vhci_tail) {
551 		mdi_vhci_tail = prev;
552 	}
553 	mdi_vhci_count--;
554 	mutex_exit(&mdi_mutex);
555 
556 	vh->vh_ops = NULL;
557 	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
558 	DEVI(vdip)->devi_mdi_xhci = NULL;
559 	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
560 	kmem_free(vh->vh_client_table,
561 	    mdi_client_table_size * sizeof (struct client_hash));
562 	mutex_destroy(&vh->vh_phci_mutex);
563 	mutex_destroy(&vh->vh_client_mutex);
564 
565 	kmem_free(vh, sizeof (mdi_vhci_t));
566 	return (MDI_SUCCESS);
567 }
568 
569 /*
570  * i_mdi_vhci_class2vhci():
571  *		Look for a matching vHCI module given a vHCI class name
572  * Return Values:
573  *		Handle to a vHCI component
574  *		NULL
575  */
576 static mdi_vhci_t *
577 i_mdi_vhci_class2vhci(char *class)
578 {
579 	mdi_vhci_t	*vh = NULL;
580 
581 	ASSERT(!MUTEX_HELD(&mdi_mutex));
582 
583 	mutex_enter(&mdi_mutex);
584 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
585 		if (strcmp(vh->vh_class, class) == 0) {
586 			break;
587 		}
588 	}
589 	mutex_exit(&mdi_mutex);
590 	return (vh);
591 }
592 
593 /*
594  * i_devi_get_vhci():
595  *		Utility function to get the handle to a vHCI component
596  * Return Values:
597  *		Handle to a vHCI component
598  *		NULL
599  */
600 mdi_vhci_t *
601 i_devi_get_vhci(dev_info_t *vdip)
602 {
603 	mdi_vhci_t	*vh = NULL;
604 	if (MDI_VHCI(vdip)) {
605 		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
606 	}
607 	return (vh);
608 }
609 
610 /*
611  * mdi_phci_register():
612  *		Register a pHCI module with mpxio framework
613  *		mdi_phci_register() is called by pHCI drivers to register with
614  *		the mpxio framework and a specific 'class_driver' vHCI.  The
615  *		pHCI driver must call this interface as part of its attach(9e)
616  *		handler.
617  * Return Values:
618  *		MDI_SUCCESS
619  *		MDI_FAILURE
620  */
621 /*ARGSUSED*/
622 int
623 mdi_phci_register(char *class, dev_info_t *pdip, int flags)
624 {
625 	mdi_phci_t		*ph;
626 	mdi_vhci_t		*vh;
627 	char			*data;
628 
629 	/*
630 	 * Some subsystems, like fcp, perform pHCI registration from a
631 	 * different thread than the one doing the pHCI attach(9E) - the
632 	 * driver attach code is waiting for this other thread to complete.
633 	 * This means we can only ASSERT DEVI_BUSY_CHANGING of parent
634 	 * (indicating that some thread has done an ndi_devi_enter of parent)
635 	 * not DEVI_BUSY_OWNED (which would indicate that we did the enter).
636 	 */
637 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
638 
639 	/*
640 	 * Check for mpxio-disable property. Enable mpxio if the property is
641 	 * missing or not set to "yes".
642 	 * If the property is set to "yes" then emit a brief message.
643 	 */
644 	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
645 	    &data) == DDI_SUCCESS)) {
646 		if (strcmp(data, "yes") == 0) {
647 			MDI_DEBUG(1, (MDI_CONT, pdip,
648 			    "?multipath capabilities disabled via %s.conf.",
649 			    ddi_driver_name(pdip)));
650 			ddi_prop_free(data);
651 			return (MDI_FAILURE);
652 		}
653 		ddi_prop_free(data);
654 	}
655 
656 	/*
657 	 * Search for a matching vHCI
658 	 */
659 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
660 	if (vh == NULL) {
661 		return (MDI_FAILURE);
662 	}
663 
664 	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
665 	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
666 	ph->ph_dip = pdip;
667 	ph->ph_vhci = vh;
668 	ph->ph_next = NULL;
669 	ph->ph_unstable = 0;
670 	ph->ph_vprivate = 0;
671 	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);
672 
673 	MDI_PHCI_LOCK(ph);
674 	MDI_PHCI_SET_POWER_UP(ph);
675 	MDI_PHCI_UNLOCK(ph);
676 	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
677 	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;
678 
679 	vhcache_phci_add(vh->vh_config, ph);
680 
681 	MDI_VHCI_PHCI_LOCK(vh);
682 	if (vh->vh_phci_head == NULL) {
683 		vh->vh_phci_head = ph;
684 	}
685 	if (vh->vh_phci_tail) {
686 		vh->vh_phci_tail->ph_next = ph;
687 	}
688 	vh->vh_phci_tail = ph;
689 	vh->vh_phci_count++;
690 	MDI_VHCI_PHCI_UNLOCK(vh);
691 
692 	i_mdi_log_sysevent(pdip, class, ESC_DDI_INITIATOR_REGISTER);
693 	return (MDI_SUCCESS);
694 }
695 
696 /*
697  * mdi_phci_unregister():
698  *		Unregister a pHCI module from mpxio framework
699  *		mdi_phci_unregister() is called by the pHCI drivers from their
700  *		detach(9E) handler to unregister their instances from the
701  *		framework.
702  * Return Values:
703  *		MDI_SUCCESS
704  *		MDI_FAILURE
705  */
706 /*ARGSUSED*/
707 int
708 mdi_phci_unregister(dev_info_t *pdip, int flags)
709 {
710 	mdi_vhci_t		*vh;
711 	mdi_phci_t		*ph;
712 	mdi_phci_t		*tmp;
713 	mdi_phci_t		*prev = NULL;
714 	mdi_pathinfo_t		*pip;
715 
716 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
717 
718 	ph = i_devi_get_phci(pdip);
719 	if (ph == NULL) {
720 		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid pHCI"));
721 		return (MDI_FAILURE);
722 	}
723 
724 	vh = ph->ph_vhci;
725 	ASSERT(vh != NULL);
726 	if (vh == NULL) {
727 		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid vHCI"));
728 		return (MDI_FAILURE);
729 	}
730 
731 	MDI_VHCI_PHCI_LOCK(vh);
732 	tmp = vh->vh_phci_head;
733 	while (tmp) {
734 		if (tmp == ph) {
735 			break;
736 		}
737 		prev = tmp;
738 		tmp = tmp->ph_next;
739 	}
740 
741 	if (ph == vh->vh_phci_head) {
742 		vh->vh_phci_head = ph->ph_next;
743 	} else {
744 		prev->ph_next = ph->ph_next;
745 	}
746 
747 	if (ph == vh->vh_phci_tail) {
748 		vh->vh_phci_tail = prev;
749 	}
750 
751 	vh->vh_phci_count--;
752 	MDI_VHCI_PHCI_UNLOCK(vh);
753 
754 	/* Walk remaining pathinfo nodes and disassociate them from pHCI */
755 	MDI_PHCI_LOCK(ph);
756 	for (pip = (mdi_pathinfo_t *)ph->ph_path_head; pip;
757 	    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link)
758 		MDI_PI(pip)->pi_phci = NULL;
759 	MDI_PHCI_UNLOCK(ph);
760 
761 	i_mdi_log_sysevent(pdip, ph->ph_vhci->vh_class,
762 	    ESC_DDI_INITIATOR_UNREGISTER);
763 	vhcache_phci_remove(vh->vh_config, ph);
764 	cv_destroy(&ph->ph_unstable_cv);
765 	mutex_destroy(&ph->ph_mutex);
766 	kmem_free(ph, sizeof (mdi_phci_t));
767 	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
768 	DEVI(pdip)->devi_mdi_xhci = NULL;
769 	return (MDI_SUCCESS);
770 }
771 
772 /*
773  * i_devi_get_phci():
774  * 		Utility function to return the phci extensions.
775  */
776 static mdi_phci_t *
777 i_devi_get_phci(dev_info_t *pdip)
778 {
779 	mdi_phci_t	*ph = NULL;
780 
781 	if (MDI_PHCI(pdip)) {
782 		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
783 	}
784 	return (ph);
785 }
786 
787 /*
788  * Single thread mdi entry into devinfo node for modifying its children.
789  * If necessary we perform an ndi_devi_enter of the vHCI before doing
790  * an ndi_devi_enter of 'dip'.  We maintain circular in two parts: one
791  * for the vHCI and one for the pHCI.
792  */
793 void
794 mdi_devi_enter(dev_info_t *phci_dip, int *circular)
795 {
796 	dev_info_t	*vdip;
797 	int		vcircular, pcircular;
798 
799 	/* Verify calling context */
800 	ASSERT(MDI_PHCI(phci_dip));
801 	vdip = mdi_devi_get_vdip(phci_dip);
802 	ASSERT(vdip);			/* A pHCI always has a vHCI */
803 
804 	/*
805 	 * If pHCI is detaching then the framework has already entered the
806 	 * vHCI on a threads that went down the code path leading to
807 	 * detach_node().  This framework enter of the vHCI during pHCI
808 	 * detach is done to avoid deadlock with vHCI power management
809 	 * operations which enter the vHCI and the enter down the path
810 	 * to the pHCI. If pHCI is detaching then we piggyback this calls
811 	 * enter of the vHCI on frameworks vHCI enter that has already
812 	 * occurred - this is OK because we know that the framework thread
813 	 * doing detach is waiting for our completion.
814 	 *
815 	 * We should DEVI_IS_DETACHING under an enter of the parent to avoid
816 	 * race with detach - but we can't do that because the framework has
817 	 * already entered the parent, so we have some complexity instead.
818 	 */
819 	for (;;) {
820 		if (ndi_devi_tryenter(vdip, &vcircular)) {
821 			ASSERT(vcircular != -1);
822 			if (DEVI_IS_DETACHING(phci_dip)) {
823 				ndi_devi_exit(vdip, vcircular);
824 				vcircular = -1;
825 			}
826 			break;
827 		} else if (DEVI_IS_DETACHING(phci_dip)) {
828 			vcircular = -1;
829 			break;
830 		} else if (servicing_interrupt()) {
831 			/*
832 			 * Don't delay an interrupt (and ensure adaptive
833 			 * mutex inversion support).
834 			 */
835 			ndi_devi_enter(vdip, &vcircular);
836 			break;
837 		} else {
838 			delay_random(mdi_delay);
839 		}
840 	}
841 
842 	ndi_devi_enter(phci_dip, &pcircular);
843 	*circular = (vcircular << 16) | (pcircular & 0xFFFF);
844 }
845 
846 /*
847  * Attempt to mdi_devi_enter.
848  */
849 int
850 mdi_devi_tryenter(dev_info_t *phci_dip, int *circular)
851 {
852 	dev_info_t	*vdip;
853 	int		vcircular, pcircular;
854 
855 	/* Verify calling context */
856 	ASSERT(MDI_PHCI(phci_dip));
857 	vdip = mdi_devi_get_vdip(phci_dip);
858 	ASSERT(vdip);			/* A pHCI always has a vHCI */
859 
860 	if (ndi_devi_tryenter(vdip, &vcircular)) {
861 		if (ndi_devi_tryenter(phci_dip, &pcircular)) {
862 			*circular = (vcircular << 16) | (pcircular & 0xFFFF);
863 			return (1);	/* locked */
864 		}
865 		ndi_devi_exit(vdip, vcircular);
866 	}
867 	return (0);			/* busy */
868 }
869 
870 /*
871  * Release mdi_devi_enter or successful mdi_devi_tryenter.
872  */
873 void
874 mdi_devi_exit(dev_info_t *phci_dip, int circular)
875 {
876 	dev_info_t	*vdip;
877 	int		vcircular, pcircular;
878 
879 	/* Verify calling context */
880 	ASSERT(MDI_PHCI(phci_dip));
881 	vdip = mdi_devi_get_vdip(phci_dip);
882 	ASSERT(vdip);			/* A pHCI always has a vHCI */
883 
884 	/* extract two circular recursion values from single int */
885 	pcircular = (short)(circular & 0xFFFF);
886 	vcircular = (short)((circular >> 16) & 0xFFFF);
887 
888 	ndi_devi_exit(phci_dip, pcircular);
889 	if (vcircular != -1)
890 		ndi_devi_exit(vdip, vcircular);
891 }
892 
893 /*
894  * The functions mdi_devi_exit_phci() and mdi_devi_enter_phci() are used
895  * around a pHCI drivers calls to mdi_pi_online/offline, after holding
896  * the pathinfo node via mdi_hold_path/mdi_rele_path, to avoid deadlock
897  * with vHCI power management code during path online/offline.  Each
898  * mdi_devi_exit_phci must have a matching mdi_devi_enter_phci, and both must
899  * occur within the scope of an active mdi_devi_enter that establishes the
900  * circular value.
901  */
902 void
903 mdi_devi_exit_phci(dev_info_t *phci_dip, int circular)
904 {
905 	int		pcircular;
906 
907 	/* Verify calling context */
908 	ASSERT(MDI_PHCI(phci_dip));
909 
910 	/* Keep hold on pHCI until we reenter in mdi_devi_enter_phci */
911 	ndi_hold_devi(phci_dip);
912 
913 	pcircular = (short)(circular & 0xFFFF);
914 	ndi_devi_exit(phci_dip, pcircular);
915 }
916 
917 void
918 mdi_devi_enter_phci(dev_info_t *phci_dip, int *circular)
919 {
920 	int		pcircular;
921 
922 	/* Verify calling context */
923 	ASSERT(MDI_PHCI(phci_dip));
924 
925 	ndi_devi_enter(phci_dip, &pcircular);
926 
927 	/* Drop hold from mdi_devi_exit_phci. */
928 	ndi_rele_devi(phci_dip);
929 
930 	/* verify matching mdi_devi_exit_phci/mdi_devi_enter_phci use */
931 	ASSERT(pcircular == ((short)(*circular & 0xFFFF)));
932 }
933 
934 /*
935  * mdi_devi_get_vdip():
936  *		given a pHCI dip return vHCI dip
937  */
938 dev_info_t *
939 mdi_devi_get_vdip(dev_info_t *pdip)
940 {
941 	mdi_phci_t	*ph;
942 
943 	ph = i_devi_get_phci(pdip);
944 	if (ph && ph->ph_vhci)
945 		return (ph->ph_vhci->vh_dip);
946 	return (NULL);
947 }
948 
949 /*
950  * mdi_devi_pdip_entered():
951  *		Return 1 if we are vHCI and have done an ndi_devi_enter
952  *		of a pHCI
953  */
954 int
955 mdi_devi_pdip_entered(dev_info_t *vdip)
956 {
957 	mdi_vhci_t	*vh;
958 	mdi_phci_t	*ph;
959 
960 	vh = i_devi_get_vhci(vdip);
961 	if (vh == NULL)
962 		return (0);
963 
964 	MDI_VHCI_PHCI_LOCK(vh);
965 	ph = vh->vh_phci_head;
966 	while (ph) {
967 		if (ph->ph_dip && DEVI_BUSY_OWNED(ph->ph_dip)) {
968 			MDI_VHCI_PHCI_UNLOCK(vh);
969 			return (1);
970 		}
971 		ph = ph->ph_next;
972 	}
973 	MDI_VHCI_PHCI_UNLOCK(vh);
974 	return (0);
975 }
976 
977 /*
978  * mdi_phci_path2devinfo():
979  * 		Utility function to search for a valid phci device given
980  *		the devfs pathname.
981  */
982 dev_info_t *
983 mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
984 {
985 	char		*temp_pathname;
986 	mdi_vhci_t	*vh;
987 	mdi_phci_t	*ph;
988 	dev_info_t 	*pdip = NULL;
989 
990 	vh = i_devi_get_vhci(vdip);
991 	ASSERT(vh != NULL);
992 
993 	if (vh == NULL) {
994 		/*
995 		 * Invalid vHCI component, return failure
996 		 */
997 		return (NULL);
998 	}
999 
1000 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1001 	MDI_VHCI_PHCI_LOCK(vh);
1002 	ph = vh->vh_phci_head;
1003 	while (ph != NULL) {
1004 		pdip = ph->ph_dip;
1005 		ASSERT(pdip != NULL);
1006 		*temp_pathname = '\0';
1007 		(void) ddi_pathname(pdip, temp_pathname);
1008 		if (strcmp(temp_pathname, pathname) == 0) {
1009 			break;
1010 		}
1011 		ph = ph->ph_next;
1012 	}
1013 	if (ph == NULL) {
1014 		pdip = NULL;
1015 	}
1016 	MDI_VHCI_PHCI_UNLOCK(vh);
1017 	kmem_free(temp_pathname, MAXPATHLEN);
1018 	return (pdip);
1019 }
1020 
1021 /*
1022  * mdi_phci_get_path_count():
1023  * 		get number of path information nodes associated with a given
1024  *		pHCI device.
1025  */
1026 int
1027 mdi_phci_get_path_count(dev_info_t *pdip)
1028 {
1029 	mdi_phci_t	*ph;
1030 	int		count = 0;
1031 
1032 	ph = i_devi_get_phci(pdip);
1033 	if (ph != NULL) {
1034 		count = ph->ph_path_count;
1035 	}
1036 	return (count);
1037 }
1038 
1039 /*
1040  * i_mdi_phci_lock():
1041  *		Lock a pHCI device
1042  * Return Values:
1043  *		None
1044  * Note:
1045  *		The default locking order is:
1046  *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
1047  *		But there are number of situations where locks need to be
1048  *		grabbed in reverse order.  This routine implements try and lock
1049  *		mechanism depending on the requested parameter option.
1050  */
1051 static void
1052 i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
1053 {
1054 	if (pip) {
1055 		/* Reverse locking is requested. */
1056 		while (MDI_PHCI_TRYLOCK(ph) == 0) {
1057 			if (servicing_interrupt()) {
1058 				MDI_PI_HOLD(pip);
1059 				MDI_PI_UNLOCK(pip);
1060 				MDI_PHCI_LOCK(ph);
1061 				MDI_PI_LOCK(pip);
1062 				MDI_PI_RELE(pip);
1063 				break;
1064 			} else {
1065 				/*
1066 				 * tryenter failed. Try to grab again
1067 				 * after a small delay
1068 				 */
1069 				MDI_PI_HOLD(pip);
1070 				MDI_PI_UNLOCK(pip);
1071 				delay_random(mdi_delay);
1072 				MDI_PI_LOCK(pip);
1073 				MDI_PI_RELE(pip);
1074 			}
1075 		}
1076 	} else {
1077 		MDI_PHCI_LOCK(ph);
1078 	}
1079 }
1080 
1081 /*
1082  * i_mdi_phci_unlock():
1083  *		Unlock the pHCI component
1084  */
1085 static void
1086 i_mdi_phci_unlock(mdi_phci_t *ph)
1087 {
1088 	MDI_PHCI_UNLOCK(ph);
1089 }
1090 
1091 /*
1092  * i_mdi_devinfo_create():
1093  *		create client device's devinfo node
1094  * Return Values:
1095  *		dev_info
1096  *		NULL
1097  * Notes:
1098  */
1099 static dev_info_t *
1100 i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
1101 	char **compatible, int ncompatible)
1102 {
1103 	dev_info_t *cdip = NULL;
1104 
1105 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1106 
1107 	/* Verify for duplicate entry */
1108 	cdip = i_mdi_devinfo_find(vh, name, guid);
1109 	ASSERT(cdip == NULL);
1110 	if (cdip) {
1111 		cmn_err(CE_WARN,
1112 		    "i_mdi_devinfo_create: client %s@%s already exists",
1113 			name ? name : "", guid ? guid : "");
1114 	}
1115 
1116 	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
1117 	if (cdip == NULL)
1118 		goto fail;
1119 
1120 	/*
1121 	 * Create component type and Global unique identifier
1122 	 * properties
1123 	 */
1124 	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
1125 	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
1126 		goto fail;
1127 	}
1128 
1129 	/* Decorate the node with compatible property */
1130 	if (compatible &&
1131 	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
1132 	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
1133 		goto fail;
1134 	}
1135 
1136 	return (cdip);
1137 
1138 fail:
1139 	if (cdip) {
1140 		(void) ndi_prop_remove_all(cdip);
1141 		(void) ndi_devi_free(cdip);
1142 	}
1143 	return (NULL);
1144 }
1145 
1146 /*
1147  * i_mdi_devinfo_find():
1148  *		Find a matching devinfo node for given client node name
1149  *		and its guid.
1150  * Return Values:
1151  *		Handle to a dev_info node or NULL
1152  */
1153 static dev_info_t *
1154 i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
1155 {
1156 	char			*data;
1157 	dev_info_t 		*cdip = NULL;
1158 	dev_info_t 		*ndip = NULL;
1159 	int			circular;
1160 
1161 	ndi_devi_enter(vh->vh_dip, &circular);
1162 	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
1163 	while ((cdip = ndip) != NULL) {
1164 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1165 
1166 		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
1167 			continue;
1168 		}
1169 
1170 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
1171 		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
1172 		    &data) != DDI_PROP_SUCCESS) {
1173 			continue;
1174 		}
1175 
1176 		if (strcmp(data, guid) != 0) {
1177 			ddi_prop_free(data);
1178 			continue;
1179 		}
1180 		ddi_prop_free(data);
1181 		break;
1182 	}
1183 	ndi_devi_exit(vh->vh_dip, circular);
1184 	return (cdip);
1185 }
1186 
1187 /*
1188  * i_mdi_devinfo_remove():
1189  *		Remove a client device node
1190  */
1191 static int
1192 i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
1193 {
1194 	int	rv = MDI_SUCCESS;
1195 
1196 	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
1197 	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
1198 		rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN | NDI_DEVI_REMOVE);
1199 		if (rv != NDI_SUCCESS) {
1200 			MDI_DEBUG(1, (MDI_NOTE, cdip,
1201 			    "!failed: cdip %p", (void *)cdip));
1202 		}
1203 		/*
1204 		 * Convert to MDI error code
1205 		 */
1206 		switch (rv) {
1207 		case NDI_SUCCESS:
1208 			rv = MDI_SUCCESS;
1209 			break;
1210 		case NDI_BUSY:
1211 			rv = MDI_BUSY;
1212 			break;
1213 		default:
1214 			rv = MDI_FAILURE;
1215 			break;
1216 		}
1217 	}
1218 	return (rv);
1219 }
1220 
1221 /*
1222  * i_devi_get_client()
1223  *		Utility function to get mpxio component extensions
1224  */
1225 static mdi_client_t *
1226 i_devi_get_client(dev_info_t *cdip)
1227 {
1228 	mdi_client_t	*ct = NULL;
1229 
1230 	if (MDI_CLIENT(cdip)) {
1231 		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
1232 	}
1233 	return (ct);
1234 }
1235 
1236 /*
1237  * i_mdi_is_child_present():
1238  *		Search for the presence of client device dev_info node
1239  */
1240 static int
1241 i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
1242 {
1243 	int		rv = MDI_FAILURE;
1244 	struct dev_info	*dip;
1245 	int		circular;
1246 
1247 	ndi_devi_enter(vdip, &circular);
1248 	dip = DEVI(vdip)->devi_child;
1249 	while (dip) {
1250 		if (dip == DEVI(cdip)) {
1251 			rv = MDI_SUCCESS;
1252 			break;
1253 		}
1254 		dip = dip->devi_sibling;
1255 	}
1256 	ndi_devi_exit(vdip, circular);
1257 	return (rv);
1258 }
1259 
1260 
1261 /*
1262  * i_mdi_client_lock():
1263  *		Grab client component lock
1264  * Return Values:
1265  *		None
1266  * Note:
1267  *		The default locking order is:
1268  *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
1269  *		But there are number of situations where locks need to be
1270  *		grabbed in reverse order.  This routine implements try and lock
1271  *		mechanism depending on the requested parameter option.
1272  */
1273 static void
1274 i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
1275 {
1276 	if (pip) {
1277 		/*
1278 		 * Reverse locking is requested.
1279 		 */
1280 		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
1281 			if (servicing_interrupt()) {
1282 				MDI_PI_HOLD(pip);
1283 				MDI_PI_UNLOCK(pip);
1284 				MDI_CLIENT_LOCK(ct);
1285 				MDI_PI_LOCK(pip);
1286 				MDI_PI_RELE(pip);
1287 				break;
1288 			} else {
1289 				/*
1290 				 * tryenter failed. Try to grab again
1291 				 * after a small delay
1292 				 */
1293 				MDI_PI_HOLD(pip);
1294 				MDI_PI_UNLOCK(pip);
1295 				delay_random(mdi_delay);
1296 				MDI_PI_LOCK(pip);
1297 				MDI_PI_RELE(pip);
1298 			}
1299 		}
1300 	} else {
1301 		MDI_CLIENT_LOCK(ct);
1302 	}
1303 }
1304 
1305 /*
1306  * i_mdi_client_unlock():
1307  *		Unlock a client component
1308  */
1309 static void
1310 i_mdi_client_unlock(mdi_client_t *ct)
1311 {
1312 	MDI_CLIENT_UNLOCK(ct);
1313 }
1314 
1315 /*
1316  * i_mdi_client_alloc():
1317  * 		Allocate and initialize a client structure.  Caller should
1318  *		hold the vhci client lock.
1319  * Return Values:
1320  *		Handle to a client component
1321  */
1322 /*ARGSUSED*/
1323 static mdi_client_t *
1324 i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
1325 {
1326 	mdi_client_t	*ct;
1327 
1328 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1329 
1330 	/*
1331 	 * Allocate and initialize a component structure.
1332 	 */
1333 	ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
1334 	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1335 	ct->ct_hnext = NULL;
1336 	ct->ct_hprev = NULL;
1337 	ct->ct_dip = NULL;
1338 	ct->ct_vhci = vh;
1339 	ct->ct_drvname = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1340 	(void) strcpy(ct->ct_drvname, name);
1341 	ct->ct_guid = kmem_alloc(strlen(lguid) + 1, KM_SLEEP);
1342 	(void) strcpy(ct->ct_guid, lguid);
1343 	ct->ct_cprivate = NULL;
1344 	ct->ct_vprivate = NULL;
1345 	ct->ct_flags = 0;
1346 	ct->ct_state = MDI_CLIENT_STATE_FAILED;
1347 	MDI_CLIENT_LOCK(ct);
1348 	MDI_CLIENT_SET_OFFLINE(ct);
1349 	MDI_CLIENT_SET_DETACH(ct);
1350 	MDI_CLIENT_SET_POWER_UP(ct);
1351 	MDI_CLIENT_UNLOCK(ct);
1352 	ct->ct_failover_flags = 0;
1353 	ct->ct_failover_status = 0;
1354 	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
1355 	ct->ct_unstable = 0;
1356 	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
1357 	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
1358 	ct->ct_lb = vh->vh_lb;
1359 	ct->ct_lb_args =  kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
1360 	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
1361 	ct->ct_path_count = 0;
1362 	ct->ct_path_head = NULL;
1363 	ct->ct_path_tail = NULL;
1364 	ct->ct_path_last = NULL;
1365 
1366 	/*
1367 	 * Add this client component to our client hash queue
1368 	 */
1369 	i_mdi_client_enlist_table(vh, ct);
1370 	return (ct);
1371 }
1372 
1373 /*
1374  * i_mdi_client_enlist_table():
1375  *		Attach the client device to the client hash table. Caller
1376  *		should hold the vhci client lock.
1377  */
1378 static void
1379 i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1380 {
1381 	int 			index;
1382 	struct client_hash	*head;
1383 
1384 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1385 
1386 	index = i_mdi_get_hash_key(ct->ct_guid);
1387 	head = &vh->vh_client_table[index];
1388 	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
1389 	head->ct_hash_head = ct;
1390 	head->ct_hash_count++;
1391 	vh->vh_client_count++;
1392 }
1393 
1394 /*
1395  * i_mdi_client_delist_table():
1396  *		Attach the client device to the client hash table.
1397  *		Caller should hold the vhci client lock.
1398  */
1399 static void
1400 i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1401 {
1402 	int			index;
1403 	char			*guid;
1404 	struct client_hash 	*head;
1405 	mdi_client_t		*next;
1406 	mdi_client_t		*last;
1407 
1408 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1409 
1410 	guid = ct->ct_guid;
1411 	index = i_mdi_get_hash_key(guid);
1412 	head = &vh->vh_client_table[index];
1413 
1414 	last = NULL;
1415 	next = (mdi_client_t *)head->ct_hash_head;
1416 	while (next != NULL) {
1417 		if (next == ct) {
1418 			break;
1419 		}
1420 		last = next;
1421 		next = next->ct_hnext;
1422 	}
1423 
1424 	if (next) {
1425 		head->ct_hash_count--;
1426 		if (last == NULL) {
1427 			head->ct_hash_head = ct->ct_hnext;
1428 		} else {
1429 			last->ct_hnext = ct->ct_hnext;
1430 		}
1431 		ct->ct_hnext = NULL;
1432 		vh->vh_client_count--;
1433 	}
1434 }
1435 
1436 
1437 /*
1438  * i_mdi_client_free():
1439  *		Free a client component
1440  */
1441 static int
1442 i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
1443 {
1444 	int		rv = MDI_SUCCESS;
1445 	int		flags = ct->ct_flags;
1446 	dev_info_t	*cdip;
1447 	dev_info_t	*vdip;
1448 
1449 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1450 
1451 	vdip = vh->vh_dip;
1452 	cdip = ct->ct_dip;
1453 
1454 	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
1455 	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
1456 	DEVI(cdip)->devi_mdi_client = NULL;
1457 
1458 	/*
1459 	 * Clear out back ref. to dev_info_t node
1460 	 */
1461 	ct->ct_dip = NULL;
1462 
1463 	/*
1464 	 * Remove this client from our hash queue
1465 	 */
1466 	i_mdi_client_delist_table(vh, ct);
1467 
1468 	/*
1469 	 * Uninitialize and free the component
1470 	 */
1471 	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
1472 	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
1473 	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
1474 	cv_destroy(&ct->ct_failover_cv);
1475 	cv_destroy(&ct->ct_unstable_cv);
1476 	cv_destroy(&ct->ct_powerchange_cv);
1477 	mutex_destroy(&ct->ct_mutex);
1478 	kmem_free(ct, sizeof (*ct));
1479 
1480 	if (cdip != NULL) {
1481 		MDI_VHCI_CLIENT_UNLOCK(vh);
1482 		(void) i_mdi_devinfo_remove(vdip, cdip, flags);
1483 		MDI_VHCI_CLIENT_LOCK(vh);
1484 	}
1485 	return (rv);
1486 }
1487 
1488 /*
1489  * i_mdi_client_find():
1490  * 		Find the client structure corresponding to a given guid
1491  *		Caller should hold the vhci client lock.
1492  */
1493 static mdi_client_t *
1494 i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
1495 {
1496 	int			index;
1497 	struct client_hash	*head;
1498 	mdi_client_t		*ct;
1499 
1500 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1501 
1502 	index = i_mdi_get_hash_key(guid);
1503 	head = &vh->vh_client_table[index];
1504 
1505 	ct = head->ct_hash_head;
1506 	while (ct != NULL) {
1507 		if (strcmp(ct->ct_guid, guid) == 0 &&
1508 		    (cname == NULL || strcmp(ct->ct_drvname, cname) == 0)) {
1509 			break;
1510 		}
1511 		ct = ct->ct_hnext;
1512 	}
1513 	return (ct);
1514 }
1515 
1516 /*
1517  * i_mdi_client_update_state():
1518  *		Compute and update client device state
1519  * Notes:
1520  *		A client device can be in any of three possible states:
1521  *
1522  *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
1523  *		one online/standby paths. Can tolerate failures.
1524  *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
1525  *		no alternate paths available as standby. A failure on the online
1526  *		would result in loss of access to device data.
1527  *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
1528  *		no paths available to access the device.
1529  */
1530 static void
1531 i_mdi_client_update_state(mdi_client_t *ct)
1532 {
1533 	int state;
1534 
1535 	ASSERT(MDI_CLIENT_LOCKED(ct));
1536 	state = i_mdi_client_compute_state(ct, NULL);
1537 	MDI_CLIENT_SET_STATE(ct, state);
1538 }
1539 
1540 /*
1541  * i_mdi_client_compute_state():
1542  *		Compute client device state
1543  *
1544  *		mdi_phci_t *	Pointer to pHCI structure which should
1545  *				while computing the new value.  Used by
1546  *				i_mdi_phci_offline() to find the new
1547  *				client state after DR of a pHCI.
1548  */
1549 static int
1550 i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
1551 {
1552 	int		state;
1553 	int		online_count = 0;
1554 	int		standby_count = 0;
1555 	mdi_pathinfo_t	*pip, *next;
1556 
1557 	ASSERT(MDI_CLIENT_LOCKED(ct));
1558 	pip = ct->ct_path_head;
1559 	while (pip != NULL) {
1560 		MDI_PI_LOCK(pip);
1561 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1562 		if (MDI_PI(pip)->pi_phci == ph) {
1563 			MDI_PI_UNLOCK(pip);
1564 			pip = next;
1565 			continue;
1566 		}
1567 
1568 		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1569 				== MDI_PATHINFO_STATE_ONLINE)
1570 			online_count++;
1571 		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1572 				== MDI_PATHINFO_STATE_STANDBY)
1573 			standby_count++;
1574 		MDI_PI_UNLOCK(pip);
1575 		pip = next;
1576 	}
1577 
1578 	if (online_count == 0) {
1579 		if (standby_count == 0) {
1580 			state = MDI_CLIENT_STATE_FAILED;
1581 			MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
1582 			    "client state failed: ct = %p", (void *)ct));
1583 		} else if (standby_count == 1) {
1584 			state = MDI_CLIENT_STATE_DEGRADED;
1585 		} else {
1586 			state = MDI_CLIENT_STATE_OPTIMAL;
1587 		}
1588 	} else if (online_count == 1) {
1589 		if (standby_count == 0) {
1590 			state = MDI_CLIENT_STATE_DEGRADED;
1591 		} else {
1592 			state = MDI_CLIENT_STATE_OPTIMAL;
1593 		}
1594 	} else {
1595 		state = MDI_CLIENT_STATE_OPTIMAL;
1596 	}
1597 	return (state);
1598 }
1599 
1600 /*
1601  * i_mdi_client2devinfo():
1602  *		Utility function
1603  */
1604 dev_info_t *
1605 i_mdi_client2devinfo(mdi_client_t *ct)
1606 {
1607 	return (ct->ct_dip);
1608 }
1609 
1610 /*
1611  * mdi_client_path2_devinfo():
1612  * 		Given the parent devinfo and child devfs pathname, search for
1613  *		a valid devfs node handle.
1614  */
1615 dev_info_t *
1616 mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
1617 {
1618 	dev_info_t 	*cdip = NULL;
1619 	dev_info_t 	*ndip = NULL;
1620 	char		*temp_pathname;
1621 	int		circular;
1622 
1623 	/*
1624 	 * Allocate temp buffer
1625 	 */
1626 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1627 
1628 	/*
1629 	 * Lock parent against changes
1630 	 */
1631 	ndi_devi_enter(vdip, &circular);
1632 	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
1633 	while ((cdip = ndip) != NULL) {
1634 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1635 
1636 		*temp_pathname = '\0';
1637 		(void) ddi_pathname(cdip, temp_pathname);
1638 		if (strcmp(temp_pathname, pathname) == 0) {
1639 			break;
1640 		}
1641 	}
1642 	/*
1643 	 * Release devinfo lock
1644 	 */
1645 	ndi_devi_exit(vdip, circular);
1646 
1647 	/*
1648 	 * Free the temp buffer
1649 	 */
1650 	kmem_free(temp_pathname, MAXPATHLEN);
1651 	return (cdip);
1652 }
1653 
1654 /*
1655  * mdi_client_get_path_count():
1656  * 		Utility function to get number of path information nodes
1657  *		associated with a given client device.
1658  */
1659 int
1660 mdi_client_get_path_count(dev_info_t *cdip)
1661 {
1662 	mdi_client_t	*ct;
1663 	int		count = 0;
1664 
1665 	ct = i_devi_get_client(cdip);
1666 	if (ct != NULL) {
1667 		count = ct->ct_path_count;
1668 	}
1669 	return (count);
1670 }
1671 
1672 
1673 /*
1674  * i_mdi_get_hash_key():
1675  * 		Create a hash using strings as keys
1676  *
1677  */
1678 static int
1679 i_mdi_get_hash_key(char *str)
1680 {
1681 	uint32_t	g, hash = 0;
1682 	char		*p;
1683 
1684 	for (p = str; *p != '\0'; p++) {
1685 		g = *p;
1686 		hash += g;
1687 	}
1688 	return (hash % (CLIENT_HASH_TABLE_SIZE - 1));
1689 }
1690 
1691 /*
1692  * mdi_get_lb_policy():
1693  * 		Get current load balancing policy for a given client device
1694  */
1695 client_lb_t
1696 mdi_get_lb_policy(dev_info_t *cdip)
1697 {
1698 	client_lb_t	lb = LOAD_BALANCE_NONE;
1699 	mdi_client_t	*ct;
1700 
1701 	ct = i_devi_get_client(cdip);
1702 	if (ct != NULL) {
1703 		lb = ct->ct_lb;
1704 	}
1705 	return (lb);
1706 }
1707 
1708 /*
1709  * mdi_set_lb_region_size():
1710  * 		Set current region size for the load-balance
1711  */
1712 int
1713 mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
1714 {
1715 	mdi_client_t	*ct;
1716 	int		rv = MDI_FAILURE;
1717 
1718 	ct = i_devi_get_client(cdip);
1719 	if (ct != NULL && ct->ct_lb_args != NULL) {
1720 		ct->ct_lb_args->region_size = region_size;
1721 		rv = MDI_SUCCESS;
1722 	}
1723 	return (rv);
1724 }
1725 
1726 /*
1727  * mdi_Set_lb_policy():
1728  * 		Set current load balancing policy for a given client device
1729  */
1730 int
1731 mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
1732 {
1733 	mdi_client_t	*ct;
1734 	int		rv = MDI_FAILURE;
1735 
1736 	ct = i_devi_get_client(cdip);
1737 	if (ct != NULL) {
1738 		ct->ct_lb = lb;
1739 		rv = MDI_SUCCESS;
1740 	}
1741 	return (rv);
1742 }
1743 
1744 /*
1745  * mdi_failover():
1746  *		failover function called by the vHCI drivers to initiate
1747  *		a failover operation.  This is typically due to non-availability
1748  *		of online paths to route I/O requests.  Failover can be
1749  *		triggered through user application also.
1750  *
1751  *		The vHCI driver calls mdi_failover() to initiate a failover
1752  *		operation. mdi_failover() calls back into the vHCI driver's
1753  *		vo_failover() entry point to perform the actual failover
1754  *		operation.  The reason for requiring the vHCI driver to
1755  *		initiate failover by calling mdi_failover(), instead of directly
1756  *		executing vo_failover() itself, is to ensure that the mdi
1757  *		framework can keep track of the client state properly.
1758  *		Additionally, mdi_failover() provides as a convenience the
1759  *		option of performing the failover operation synchronously or
1760  *		asynchronously
1761  *
1762  *		Upon successful completion of the failover operation, the
1763  *		paths that were previously ONLINE will be in the STANDBY state,
1764  *		and the newly activated paths will be in the ONLINE state.
1765  *
1766  *		The flags modifier determines whether the activation is done
1767  *		synchronously: MDI_FAILOVER_SYNC
1768  * Return Values:
1769  *		MDI_SUCCESS
1770  *		MDI_FAILURE
1771  *		MDI_BUSY
1772  */
1773 /*ARGSUSED*/
1774 int
1775 mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
1776 {
1777 	int			rv;
1778 	mdi_client_t		*ct;
1779 
1780 	ct = i_devi_get_client(cdip);
1781 	ASSERT(ct != NULL);
1782 	if (ct == NULL) {
1783 		/* cdip is not a valid client device. Nothing more to do. */
1784 		return (MDI_FAILURE);
1785 	}
1786 
1787 	MDI_CLIENT_LOCK(ct);
1788 
1789 	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
1790 		/* A path to the client is being freed */
1791 		MDI_CLIENT_UNLOCK(ct);
1792 		return (MDI_BUSY);
1793 	}
1794 
1795 
1796 	if (MDI_CLIENT_IS_FAILED(ct)) {
1797 		/*
1798 		 * Client is in failed state. Nothing more to do.
1799 		 */
1800 		MDI_CLIENT_UNLOCK(ct);
1801 		return (MDI_FAILURE);
1802 	}
1803 
1804 	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1805 		/*
1806 		 * Failover is already in progress; return BUSY
1807 		 */
1808 		MDI_CLIENT_UNLOCK(ct);
1809 		return (MDI_BUSY);
1810 	}
1811 	/*
1812 	 * Make sure that mdi_pathinfo node state changes are processed.
1813 	 * We do not allow failovers to progress while client path state
1814 	 * changes are in progress
1815 	 */
1816 	if (ct->ct_unstable) {
1817 		if (flags == MDI_FAILOVER_ASYNC) {
1818 			MDI_CLIENT_UNLOCK(ct);
1819 			return (MDI_BUSY);
1820 		} else {
1821 			while (ct->ct_unstable)
1822 				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
1823 		}
1824 	}
1825 
1826 	/*
1827 	 * Client device is in stable state. Before proceeding, perform sanity
1828 	 * checks again.
1829 	 */
1830 	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
1831 	    (!i_ddi_devi_attached(ct->ct_dip))) {
1832 		/*
1833 		 * Client is in failed state. Nothing more to do.
1834 		 */
1835 		MDI_CLIENT_UNLOCK(ct);
1836 		return (MDI_FAILURE);
1837 	}
1838 
1839 	/*
1840 	 * Set the client state as failover in progress.
1841 	 */
1842 	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
1843 	ct->ct_failover_flags = flags;
1844 	MDI_CLIENT_UNLOCK(ct);
1845 
1846 	if (flags == MDI_FAILOVER_ASYNC) {
1847 		/*
1848 		 * Submit the initiate failover request via CPR safe
1849 		 * taskq threads.
1850 		 */
1851 		(void) taskq_dispatch(mdi_taskq, (task_func_t *)i_mdi_failover,
1852 		    ct, KM_SLEEP);
1853 		return (MDI_ACCEPT);
1854 	} else {
1855 		/*
1856 		 * Synchronous failover mode.  Typically invoked from the user
1857 		 * land.
1858 		 */
1859 		rv = i_mdi_failover(ct);
1860 	}
1861 	return (rv);
1862 }
1863 
1864 /*
1865  * i_mdi_failover():
1866  *		internal failover function. Invokes vHCI drivers failover
1867  *		callback function and process the failover status
1868  * Return Values:
1869  *		None
1870  *
1871  * Note: A client device in failover state can not be detached or freed.
1872  */
1873 static int
1874 i_mdi_failover(void *arg)
1875 {
1876 	int		rv = MDI_SUCCESS;
1877 	mdi_client_t	*ct = (mdi_client_t *)arg;
1878 	mdi_vhci_t	*vh = ct->ct_vhci;
1879 
1880 	ASSERT(!MDI_CLIENT_LOCKED(ct));
1881 
1882 	if (vh->vh_ops->vo_failover != NULL) {
1883 		/*
1884 		 * Call vHCI drivers callback routine
1885 		 */
1886 		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
1887 		    ct->ct_failover_flags);
1888 	}
1889 
1890 	MDI_CLIENT_LOCK(ct);
1891 	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);
1892 
1893 	/*
1894 	 * Save the failover return status
1895 	 */
1896 	ct->ct_failover_status = rv;
1897 
1898 	/*
1899 	 * As a result of failover, client status would have been changed.
1900 	 * Update the client state and wake up anyone waiting on this client
1901 	 * device.
1902 	 */
1903 	i_mdi_client_update_state(ct);
1904 
1905 	cv_broadcast(&ct->ct_failover_cv);
1906 	MDI_CLIENT_UNLOCK(ct);
1907 	return (rv);
1908 }
1909 
1910 /*
1911  * Load balancing is logical block.
1912  * IOs within the range described by region_size
1913  * would go on the same path. This would improve the
1914  * performance by cache-hit on some of the RAID devices.
1915  * Search only for online paths(At some point we
1916  * may want to balance across target ports).
1917  * If no paths are found then default to round-robin.
1918  */
1919 static int
1920 i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
1921 {
1922 	int		path_index = -1;
1923 	int		online_path_count = 0;
1924 	int		online_nonpref_path_count = 0;
1925 	int 		region_size = ct->ct_lb_args->region_size;
1926 	mdi_pathinfo_t	*pip;
1927 	mdi_pathinfo_t	*next;
1928 	int		preferred, path_cnt;
1929 
1930 	pip = ct->ct_path_head;
1931 	while (pip) {
1932 		MDI_PI_LOCK(pip);
1933 		if (MDI_PI(pip)->pi_state ==
1934 		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
1935 			online_path_count++;
1936 		} else if (MDI_PI(pip)->pi_state ==
1937 		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
1938 			online_nonpref_path_count++;
1939 		}
1940 		next = (mdi_pathinfo_t *)
1941 		    MDI_PI(pip)->pi_client_link;
1942 		MDI_PI_UNLOCK(pip);
1943 		pip = next;
1944 	}
1945 	/* if found any online/preferred then use this type */
1946 	if (online_path_count > 0) {
1947 		path_cnt = online_path_count;
1948 		preferred = 1;
1949 	} else if (online_nonpref_path_count > 0) {
1950 		path_cnt = online_nonpref_path_count;
1951 		preferred = 0;
1952 	} else {
1953 		path_cnt = 0;
1954 	}
1955 	if (path_cnt) {
1956 		path_index = (bp->b_blkno >> region_size) % path_cnt;
1957 		pip = ct->ct_path_head;
1958 		while (pip && path_index != -1) {
1959 			MDI_PI_LOCK(pip);
1960 			if (path_index == 0 &&
1961 			    (MDI_PI(pip)->pi_state ==
1962 			    MDI_PATHINFO_STATE_ONLINE) &&
1963 				MDI_PI(pip)->pi_preferred == preferred) {
1964 				MDI_PI_HOLD(pip);
1965 				MDI_PI_UNLOCK(pip);
1966 				*ret_pip = pip;
1967 				return (MDI_SUCCESS);
1968 			}
1969 			path_index --;
1970 			next = (mdi_pathinfo_t *)
1971 			    MDI_PI(pip)->pi_client_link;
1972 			MDI_PI_UNLOCK(pip);
1973 			pip = next;
1974 		}
1975 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
1976 		    "lba %llx: path %s %p",
1977 		    bp->b_lblkno, mdi_pi_spathname(pip), (void *)pip));
1978 	}
1979 	return (MDI_FAILURE);
1980 }
1981 
1982 /*
1983  * mdi_select_path():
1984  *		select a path to access a client device.
1985  *
1986  *		mdi_select_path() function is called by the vHCI drivers to
1987  *		select a path to route the I/O request to.  The caller passes
1988  *		the block I/O data transfer structure ("buf") as one of the
1989  *		parameters.  The mpxio framework uses the buf structure
1990  *		contents to maintain per path statistics (total I/O size /
1991  *		count pending).  If more than one online paths are available to
1992  *		select, the framework automatically selects a suitable path
1993  *		for routing I/O request. If a failover operation is active for
1994  *		this client device the call shall be failed with MDI_BUSY error
1995  *		code.
1996  *
1997  *		By default this function returns a suitable path in online
1998  *		state based on the current load balancing policy.  Currently
1999  *		we support LOAD_BALANCE_NONE (Previously selected online path
2000  *		will continue to be used till the path is usable) and
2001  *		LOAD_BALANCE_RR (Online paths will be selected in a round
2002  *		robin fashion), LOAD_BALANCE_LB(Online paths will be selected
2003  *		based on the logical block).  The load balancing
2004  *		through vHCI drivers configuration file (driver.conf).
2005  *
2006  *		vHCI drivers may override this default behavior by specifying
2007  *		appropriate flags.  The meaning of the thrid argument depends
2008  *		on the flags specified. If MDI_SELECT_PATH_INSTANCE is set
2009  *		then the argument is the "path instance" of the path to select.
2010  *		If MDI_SELECT_PATH_INSTANCE is not set then the argument is
2011  *		"start_pip". A non NULL "start_pip" is the starting point to
2012  *		walk and find the next appropriate path.  The following values
2013  *		are currently defined: MDI_SELECT_ONLINE_PATH (to select an
2014  *		ONLINE path) and/or MDI_SELECT_STANDBY_PATH (to select an
2015  *		STANDBY path).
2016  *
2017  *		The non-standard behavior is used by the scsi_vhci driver,
2018  *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
2019  *		attach of client devices (to avoid an unnecessary failover
2020  *		when the STANDBY path comes up first), during failover
2021  *		(to activate a STANDBY path as ONLINE).
2022  *
2023  *		The selected path is returned in a a mdi_hold_path() state
2024  *		(pi_ref_cnt). Caller should release the hold by calling
2025  *		mdi_rele_path().
2026  *
2027  * Return Values:
2028  *		MDI_SUCCESS	- Completed successfully
2029  *		MDI_BUSY 	- Client device is busy failing over
2030  *		MDI_NOPATH	- Client device is online, but no valid path are
2031  *				  available to access this client device
2032  *		MDI_FAILURE	- Invalid client device or state
2033  *		MDI_DEVI_ONLINING
2034  *				- Client device (struct dev_info state) is in
2035  *				  onlining state.
2036  */
2037 
2038 /*ARGSUSED*/
2039 int
2040 mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
2041     void *arg, mdi_pathinfo_t **ret_pip)
2042 {
2043 	mdi_client_t	*ct;
2044 	mdi_pathinfo_t	*pip;
2045 	mdi_pathinfo_t	*next;
2046 	mdi_pathinfo_t	*head;
2047 	mdi_pathinfo_t	*start;
2048 	client_lb_t	lbp;	/* load balancing policy */
2049 	int		sb = 1;	/* standard behavior */
2050 	int		preferred = 1;	/* preferred path */
2051 	int		cond, cont = 1;
2052 	int		retry = 0;
2053 	mdi_pathinfo_t	*start_pip;	/* request starting pathinfo */
2054 	int		path_instance;	/* request specific path instance */
2055 
2056 	/* determine type of arg based on flags */
2057 	if (flags & MDI_SELECT_PATH_INSTANCE) {
2058 		path_instance = (int)(intptr_t)arg;
2059 		start_pip = NULL;
2060 	} else {
2061 		path_instance = 0;
2062 		start_pip = (mdi_pathinfo_t *)arg;
2063 	}
2064 
2065 	if (flags != 0) {
2066 		/*
2067 		 * disable default behavior
2068 		 */
2069 		sb = 0;
2070 	}
2071 
2072 	*ret_pip = NULL;
2073 	ct = i_devi_get_client(cdip);
2074 	if (ct == NULL) {
2075 		/* mdi extensions are NULL, Nothing more to do */
2076 		return (MDI_FAILURE);
2077 	}
2078 
2079 	MDI_CLIENT_LOCK(ct);
2080 
2081 	if (sb) {
2082 		if (MDI_CLIENT_IS_FAILED(ct)) {
2083 			/*
2084 			 * Client is not ready to accept any I/O requests.
2085 			 * Fail this request.
2086 			 */
2087 			MDI_DEBUG(2, (MDI_NOTE, cdip,
2088 			    "client state offline ct = %p", (void *)ct));
2089 			MDI_CLIENT_UNLOCK(ct);
2090 			return (MDI_FAILURE);
2091 		}
2092 
2093 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
2094 			/*
2095 			 * Check for Failover is in progress. If so tell the
2096 			 * caller that this device is busy.
2097 			 */
2098 			MDI_DEBUG(2, (MDI_NOTE, cdip,
2099 			    "client failover in progress ct = %p",
2100 			    (void *)ct));
2101 			MDI_CLIENT_UNLOCK(ct);
2102 			return (MDI_BUSY);
2103 		}
2104 
2105 		/*
2106 		 * Check to see whether the client device is attached.
2107 		 * If not so, let the vHCI driver manually select a path
2108 		 * (standby) and let the probe/attach process to continue.
2109 		 */
2110 		if (MDI_CLIENT_IS_DETACHED(ct) || !i_ddi_devi_attached(cdip)) {
2111 			MDI_DEBUG(4, (MDI_NOTE, cdip,
2112 			    "devi is onlining ct = %p", (void *)ct));
2113 			MDI_CLIENT_UNLOCK(ct);
2114 			return (MDI_DEVI_ONLINING);
2115 		}
2116 	}
2117 
2118 	/*
2119 	 * Cache in the client list head.  If head of the list is NULL
2120 	 * return MDI_NOPATH
2121 	 */
2122 	head = ct->ct_path_head;
2123 	if (head == NULL) {
2124 		MDI_CLIENT_UNLOCK(ct);
2125 		return (MDI_NOPATH);
2126 	}
2127 
2128 	/* Caller is specifying a specific pathinfo path by path_instance */
2129 	if (path_instance) {
2130 		/* search for pathinfo with correct path_instance */
2131 		for (pip = head;
2132 		    pip && (mdi_pi_get_path_instance(pip) != path_instance);
2133 		    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link)
2134 			;
2135 
2136 		/* If path can't be selected then MDI_NOPATH is returned. */
2137 		if (pip == NULL) {
2138 			MDI_CLIENT_UNLOCK(ct);
2139 			return (MDI_NOPATH);
2140 		}
2141 
2142 		/*
2143 		 * Verify state of path. When asked to select a specific
2144 		 * path_instance, we select the requested path in any
2145 		 * state (ONLINE, OFFLINE, STANDBY, FAULT) other than INIT.
2146 		 * We don't however select paths where the pHCI has detached.
2147 		 * NOTE: last pathinfo node of an opened client device may
2148 		 * exist in an OFFLINE state after the pHCI associated with
2149 		 * that path has detached (but pi_phci will be NULL if that
2150 		 * has occurred).
2151 		 */
2152 		MDI_PI_LOCK(pip);
2153 		if ((MDI_PI(pip)->pi_state == MDI_PATHINFO_STATE_INIT) ||
2154 		    (MDI_PI(pip)->pi_phci == NULL)) {
2155 			MDI_PI_UNLOCK(pip);
2156 			MDI_CLIENT_UNLOCK(ct);
2157 			return (MDI_FAILURE);
2158 		}
2159 
2160 		/* Return MDI_BUSY if we have a transient condition */
2161 		if (MDI_PI_IS_TRANSIENT(pip)) {
2162 			MDI_PI_UNLOCK(pip);
2163 			MDI_CLIENT_UNLOCK(ct);
2164 			return (MDI_BUSY);
2165 		}
2166 
2167 		/*
2168 		 * Return the path in hold state. Caller should release the
2169 		 * lock by calling mdi_rele_path()
2170 		 */
2171 		MDI_PI_HOLD(pip);
2172 		MDI_PI_UNLOCK(pip);
2173 		*ret_pip = pip;
2174 		MDI_CLIENT_UNLOCK(ct);
2175 		return (MDI_SUCCESS);
2176 	}
2177 
2178 	/*
2179 	 * for non default behavior, bypass current
2180 	 * load balancing policy and always use LOAD_BALANCE_RR
2181 	 * except that the start point will be adjusted based
2182 	 * on the provided start_pip
2183 	 */
2184 	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;
2185 
2186 	switch (lbp) {
2187 	case LOAD_BALANCE_NONE:
2188 		/*
2189 		 * Load balancing is None  or Alternate path mode
2190 		 * Start looking for a online mdi_pathinfo node starting from
2191 		 * last known selected path
2192 		 */
2193 		preferred = 1;
2194 		pip = (mdi_pathinfo_t *)ct->ct_path_last;
2195 		if (pip == NULL) {
2196 			pip = head;
2197 		}
2198 		start = pip;
2199 		do {
2200 			MDI_PI_LOCK(pip);
2201 			/*
2202 			 * No need to explicitly check if the path is disabled.
2203 			 * Since we are checking for state == ONLINE and the
2204 			 * same variable is used for DISABLE/ENABLE information.
2205 			 */
2206 			if ((MDI_PI(pip)->pi_state  ==
2207 				MDI_PATHINFO_STATE_ONLINE) &&
2208 				preferred == MDI_PI(pip)->pi_preferred) {
2209 				/*
2210 				 * Return the path in hold state. Caller should
2211 				 * release the lock by calling mdi_rele_path()
2212 				 */
2213 				MDI_PI_HOLD(pip);
2214 				MDI_PI_UNLOCK(pip);
2215 				ct->ct_path_last = pip;
2216 				*ret_pip = pip;
2217 				MDI_CLIENT_UNLOCK(ct);
2218 				return (MDI_SUCCESS);
2219 			}
2220 
2221 			/*
2222 			 * Path is busy.
2223 			 */
2224 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2225 			    MDI_PI_IS_TRANSIENT(pip))
2226 				retry = 1;
2227 			/*
2228 			 * Keep looking for a next available online path
2229 			 */
2230 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2231 			if (next == NULL) {
2232 				next = head;
2233 			}
2234 			MDI_PI_UNLOCK(pip);
2235 			pip = next;
2236 			if (start == pip && preferred) {
2237 				preferred = 0;
2238 			} else if (start == pip && !preferred) {
2239 				cont = 0;
2240 			}
2241 		} while (cont);
2242 		break;
2243 
2244 	case LOAD_BALANCE_LBA:
2245 		/*
2246 		 * Make sure we are looking
2247 		 * for an online path. Otherwise, if it is for a STANDBY
2248 		 * path request, it will go through and fetch an ONLINE
2249 		 * path which is not desirable.
2250 		 */
2251 		if ((ct->ct_lb_args != NULL) &&
2252 			    (ct->ct_lb_args->region_size) && bp &&
2253 				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
2254 			if (i_mdi_lba_lb(ct, ret_pip, bp)
2255 				    == MDI_SUCCESS) {
2256 				MDI_CLIENT_UNLOCK(ct);
2257 				return (MDI_SUCCESS);
2258 			}
2259 		}
2260 		/* FALLTHROUGH */
2261 	case LOAD_BALANCE_RR:
2262 		/*
2263 		 * Load balancing is Round Robin. Start looking for a online
2264 		 * mdi_pathinfo node starting from last known selected path
2265 		 * as the start point.  If override flags are specified,
2266 		 * process accordingly.
2267 		 * If the search is already in effect(start_pip not null),
2268 		 * then lets just use the same path preference to continue the
2269 		 * traversal.
2270 		 */
2271 
2272 		if (start_pip != NULL) {
2273 			preferred = MDI_PI(start_pip)->pi_preferred;
2274 		} else {
2275 			preferred = 1;
2276 		}
2277 
2278 		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
2279 		if (start == NULL) {
2280 			pip = head;
2281 		} else {
2282 			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
2283 			if (pip == NULL) {
2284 				if ( flags & MDI_SELECT_NO_PREFERRED) {
2285 					/*
2286 					 * Return since we hit the end of list
2287 					 */
2288 					MDI_CLIENT_UNLOCK(ct);
2289 					return (MDI_NOPATH);
2290 				}
2291 
2292 				if (!sb) {
2293 					if (preferred == 0) {
2294 						/*
2295 						 * Looks like we have completed
2296 						 * the traversal as preferred
2297 						 * value is 0. Time to bail out.
2298 						 */
2299 						*ret_pip = NULL;
2300 						MDI_CLIENT_UNLOCK(ct);
2301 						return (MDI_NOPATH);
2302 					} else {
2303 						/*
2304 						 * Looks like we reached the
2305 						 * end of the list. Lets enable
2306 						 * traversal of non preferred
2307 						 * paths.
2308 						 */
2309 						preferred = 0;
2310 					}
2311 				}
2312 				pip = head;
2313 			}
2314 		}
2315 		start = pip;
2316 		do {
2317 			MDI_PI_LOCK(pip);
2318 			if (sb) {
2319 				cond = ((MDI_PI(pip)->pi_state ==
2320 				    MDI_PATHINFO_STATE_ONLINE &&
2321 					MDI_PI(pip)->pi_preferred ==
2322 						preferred) ? 1 : 0);
2323 			} else {
2324 				if (flags == MDI_SELECT_ONLINE_PATH) {
2325 					cond = ((MDI_PI(pip)->pi_state ==
2326 					    MDI_PATHINFO_STATE_ONLINE &&
2327 						MDI_PI(pip)->pi_preferred ==
2328 						preferred) ? 1 : 0);
2329 				} else if (flags == MDI_SELECT_STANDBY_PATH) {
2330 					cond = ((MDI_PI(pip)->pi_state ==
2331 					    MDI_PATHINFO_STATE_STANDBY &&
2332 						MDI_PI(pip)->pi_preferred ==
2333 						preferred) ? 1 : 0);
2334 				} else if (flags == (MDI_SELECT_ONLINE_PATH |
2335 				    MDI_SELECT_STANDBY_PATH)) {
2336 					cond = (((MDI_PI(pip)->pi_state ==
2337 					    MDI_PATHINFO_STATE_ONLINE ||
2338 					    (MDI_PI(pip)->pi_state ==
2339 					    MDI_PATHINFO_STATE_STANDBY)) &&
2340 						MDI_PI(pip)->pi_preferred ==
2341 						preferred) ? 1 : 0);
2342 				} else if (flags ==
2343 					(MDI_SELECT_STANDBY_PATH |
2344 					MDI_SELECT_ONLINE_PATH |
2345 					MDI_SELECT_USER_DISABLE_PATH)) {
2346 					cond = (((MDI_PI(pip)->pi_state ==
2347 					    MDI_PATHINFO_STATE_ONLINE ||
2348 					    (MDI_PI(pip)->pi_state ==
2349 					    MDI_PATHINFO_STATE_STANDBY) ||
2350 						(MDI_PI(pip)->pi_state ==
2351 					    (MDI_PATHINFO_STATE_ONLINE|
2352 					    MDI_PATHINFO_STATE_USER_DISABLE)) ||
2353 						(MDI_PI(pip)->pi_state ==
2354 					    (MDI_PATHINFO_STATE_STANDBY |
2355 					    MDI_PATHINFO_STATE_USER_DISABLE)))&&
2356 						MDI_PI(pip)->pi_preferred ==
2357 						preferred) ? 1 : 0);
2358 				} else if (flags ==
2359 				    (MDI_SELECT_STANDBY_PATH |
2360 				    MDI_SELECT_ONLINE_PATH |
2361 				    MDI_SELECT_NO_PREFERRED)) {
2362 					cond = (((MDI_PI(pip)->pi_state ==
2363 					    MDI_PATHINFO_STATE_ONLINE) ||
2364 					    (MDI_PI(pip)->pi_state ==
2365 					    MDI_PATHINFO_STATE_STANDBY))
2366 					    ? 1 : 0);
2367 				} else {
2368 					cond = 0;
2369 				}
2370 			}
2371 			/*
2372 			 * No need to explicitly check if the path is disabled.
2373 			 * Since we are checking for state == ONLINE and the
2374 			 * same variable is used for DISABLE/ENABLE information.
2375 			 */
2376 			if (cond) {
2377 				/*
2378 				 * Return the path in hold state. Caller should
2379 				 * release the lock by calling mdi_rele_path()
2380 				 */
2381 				MDI_PI_HOLD(pip);
2382 				MDI_PI_UNLOCK(pip);
2383 				if (sb)
2384 					ct->ct_path_last = pip;
2385 				*ret_pip = pip;
2386 				MDI_CLIENT_UNLOCK(ct);
2387 				return (MDI_SUCCESS);
2388 			}
2389 			/*
2390 			 * Path is busy.
2391 			 */
2392 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2393 			    MDI_PI_IS_TRANSIENT(pip))
2394 				retry = 1;
2395 
2396 			/*
2397 			 * Keep looking for a next available online path
2398 			 */
2399 do_again:
2400 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2401 			if (next == NULL) {
2402 				if ( flags & MDI_SELECT_NO_PREFERRED) {
2403 					/*
2404 					 * Bail out since we hit the end of list
2405 					 */
2406 					MDI_PI_UNLOCK(pip);
2407 					break;
2408 				}
2409 
2410 				if (!sb) {
2411 					if (preferred == 1) {
2412 						/*
2413 						 * Looks like we reached the
2414 						 * end of the list. Lets enable
2415 						 * traversal of non preferred
2416 						 * paths.
2417 						 */
2418 						preferred = 0;
2419 						next = head;
2420 					} else {
2421 						/*
2422 						 * We have done both the passes
2423 						 * Preferred as well as for
2424 						 * Non-preferred. Bail out now.
2425 						 */
2426 						cont = 0;
2427 					}
2428 				} else {
2429 					/*
2430 					 * Standard behavior case.
2431 					 */
2432 					next = head;
2433 				}
2434 			}
2435 			MDI_PI_UNLOCK(pip);
2436 			if (cont == 0) {
2437 				break;
2438 			}
2439 			pip = next;
2440 
2441 			if (!sb) {
2442 				/*
2443 				 * We need to handle the selection of
2444 				 * non-preferred path in the following
2445 				 * case:
2446 				 *
2447 				 * +------+   +------+   +------+   +-----+
2448 				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
2449 				 * +------+   +------+   +------+   +-----+
2450 				 *
2451 				 * If we start the search with B, we need to
2452 				 * skip beyond B to pick C which is non -
2453 				 * preferred in the second pass. The following
2454 				 * test, if true, will allow us to skip over
2455 				 * the 'start'(B in the example) to select
2456 				 * other non preferred elements.
2457 				 */
2458 				if ((start_pip != NULL) && (start_pip == pip) &&
2459 				    (MDI_PI(start_pip)->pi_preferred
2460 				    != preferred)) {
2461 					/*
2462 					 * try again after going past the start
2463 					 * pip
2464 					 */
2465 					MDI_PI_LOCK(pip);
2466 					goto do_again;
2467 				}
2468 			} else {
2469 				/*
2470 				 * Standard behavior case
2471 				 */
2472 				if (start == pip && preferred) {
2473 					/* look for nonpreferred paths */
2474 					preferred = 0;
2475 				} else if (start == pip && !preferred) {
2476 					/*
2477 					 * Exit condition
2478 					 */
2479 					cont = 0;
2480 				}
2481 			}
2482 		} while (cont);
2483 		break;
2484 	}
2485 
2486 	MDI_CLIENT_UNLOCK(ct);
2487 	if (retry == 1) {
2488 		return (MDI_BUSY);
2489 	} else {
2490 		return (MDI_NOPATH);
2491 	}
2492 }
2493 
2494 /*
2495  * For a client, return the next available path to any phci
2496  *
2497  * Note:
2498  *		Caller should hold the branch's devinfo node to get a consistent
2499  *		snap shot of the mdi_pathinfo nodes.
2500  *
2501  *		Please note that even the list is stable the mdi_pathinfo
2502  *		node state and properties are volatile.  The caller should lock
2503  *		and unlock the nodes by calling mdi_pi_lock() and
2504  *		mdi_pi_unlock() functions to get a stable properties.
2505  *
2506  *		If there is a need to use the nodes beyond the hold of the
2507  *		devinfo node period (For ex. I/O), then mdi_pathinfo node
2508  *		need to be held against unexpected removal by calling
2509  *		mdi_hold_path() and should be released by calling
2510  *		mdi_rele_path() on completion.
2511  */
2512 mdi_pathinfo_t *
2513 mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
2514 {
2515 	mdi_client_t *ct;
2516 
2517 	if (!MDI_CLIENT(ct_dip))
2518 		return (NULL);
2519 
2520 	/*
2521 	 * Walk through client link
2522 	 */
2523 	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
2524 	ASSERT(ct != NULL);
2525 
2526 	if (pip == NULL)
2527 		return ((mdi_pathinfo_t *)ct->ct_path_head);
2528 
2529 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link);
2530 }
2531 
2532 /*
2533  * For a phci, return the next available path to any client
2534  * Note: ditto mdi_get_next_phci_path()
2535  */
2536 mdi_pathinfo_t *
2537 mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
2538 {
2539 	mdi_phci_t *ph;
2540 
2541 	if (!MDI_PHCI(ph_dip))
2542 		return (NULL);
2543 
2544 	/*
2545 	 * Walk through pHCI link
2546 	 */
2547 	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
2548 	ASSERT(ph != NULL);
2549 
2550 	if (pip == NULL)
2551 		return ((mdi_pathinfo_t *)ph->ph_path_head);
2552 
2553 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
2554 }
2555 
2556 /*
2557  * mdi_hold_path():
2558  *		Hold the mdi_pathinfo node against unwanted unexpected free.
2559  * Return Values:
2560  *		None
2561  */
2562 void
2563 mdi_hold_path(mdi_pathinfo_t *pip)
2564 {
2565 	if (pip) {
2566 		MDI_PI_LOCK(pip);
2567 		MDI_PI_HOLD(pip);
2568 		MDI_PI_UNLOCK(pip);
2569 	}
2570 }
2571 
2572 
2573 /*
2574  * mdi_rele_path():
2575  *		Release the mdi_pathinfo node which was selected
2576  *		through mdi_select_path() mechanism or manually held by
2577  *		calling mdi_hold_path().
2578  * Return Values:
2579  *		None
2580  */
2581 void
2582 mdi_rele_path(mdi_pathinfo_t *pip)
2583 {
2584 	if (pip) {
2585 		MDI_PI_LOCK(pip);
2586 		MDI_PI_RELE(pip);
2587 		if (MDI_PI(pip)->pi_ref_cnt == 0) {
2588 			cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
2589 		}
2590 		MDI_PI_UNLOCK(pip);
2591 	}
2592 }
2593 
2594 /*
2595  * mdi_pi_lock():
2596  * 		Lock the mdi_pathinfo node.
2597  * Note:
2598  *		The caller should release the lock by calling mdi_pi_unlock()
2599  */
2600 void
2601 mdi_pi_lock(mdi_pathinfo_t *pip)
2602 {
2603 	ASSERT(pip != NULL);
2604 	if (pip) {
2605 		MDI_PI_LOCK(pip);
2606 	}
2607 }
2608 
2609 
2610 /*
2611  * mdi_pi_unlock():
2612  * 		Unlock the mdi_pathinfo node.
2613  * Note:
2614  *		The mdi_pathinfo node should have been locked with mdi_pi_lock()
2615  */
2616 void
2617 mdi_pi_unlock(mdi_pathinfo_t *pip)
2618 {
2619 	ASSERT(pip != NULL);
2620 	if (pip) {
2621 		MDI_PI_UNLOCK(pip);
2622 	}
2623 }
2624 
2625 /*
2626  * mdi_pi_find():
2627  *		Search the list of mdi_pathinfo nodes attached to the
2628  *		pHCI/Client device node whose path address matches "paddr".
2629  *		Returns a pointer to the mdi_pathinfo node if a matching node is
2630  *		found.
2631  * Return Values:
2632  *		mdi_pathinfo node handle
2633  *		NULL
2634  * Notes:
2635  *		Caller need not hold any locks to call this function.
2636  */
2637 mdi_pathinfo_t *
2638 mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
2639 {
2640 	mdi_phci_t		*ph;
2641 	mdi_vhci_t		*vh;
2642 	mdi_client_t		*ct;
2643 	mdi_pathinfo_t		*pip = NULL;
2644 
2645 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2646 	    "caddr@%s paddr@%s", caddr ? caddr : "", paddr ? paddr : ""));
2647 	if ((pdip == NULL) || (paddr == NULL)) {
2648 		return (NULL);
2649 	}
2650 	ph = i_devi_get_phci(pdip);
2651 	if (ph == NULL) {
2652 		/*
2653 		 * Invalid pHCI device, Nothing more to do.
2654 		 */
2655 		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid phci"));
2656 		return (NULL);
2657 	}
2658 
2659 	vh = ph->ph_vhci;
2660 	if (vh == NULL) {
2661 		/*
2662 		 * Invalid vHCI device, Nothing more to do.
2663 		 */
2664 		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid vhci"));
2665 		return (NULL);
2666 	}
2667 
2668 	/*
2669 	 * Look for pathinfo node identified by paddr.
2670 	 */
2671 	if (caddr == NULL) {
2672 		/*
2673 		 * Find a mdi_pathinfo node under pHCI list for a matching
2674 		 * unit address.
2675 		 */
2676 		MDI_PHCI_LOCK(ph);
2677 		if (MDI_PHCI_IS_OFFLINE(ph)) {
2678 			MDI_DEBUG(2, (MDI_WARN, pdip,
2679 			    "offline phci %p", (void *)ph));
2680 			MDI_PHCI_UNLOCK(ph);
2681 			return (NULL);
2682 		}
2683 		pip = (mdi_pathinfo_t *)ph->ph_path_head;
2684 
2685 		while (pip != NULL) {
2686 			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2687 				break;
2688 			}
2689 			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
2690 		}
2691 		MDI_PHCI_UNLOCK(ph);
2692 		MDI_DEBUG(2, (MDI_NOTE, pdip,
2693 		    "found %s %p", mdi_pi_spathname(pip), (void *)pip));
2694 		return (pip);
2695 	}
2696 
2697 	/*
2698 	 * XXX - Is the rest of the code in this function really necessary?
2699 	 * The consumers of mdi_pi_find() can search for the desired pathinfo
2700 	 * node by calling mdi_pi_find(pdip, NULL, paddr). Irrespective of
2701 	 * whether the search is based on the pathinfo nodes attached to
2702 	 * the pHCI or the client node, the result will be the same.
2703 	 */
2704 
2705 	/*
2706 	 * Find the client device corresponding to 'caddr'
2707 	 */
2708 	MDI_VHCI_CLIENT_LOCK(vh);
2709 
2710 	/*
2711 	 * XXX - Passing NULL to the following function works as long as the
2712 	 * the client addresses (caddr) are unique per vhci basis.
2713 	 */
2714 	ct = i_mdi_client_find(vh, NULL, caddr);
2715 	if (ct == NULL) {
2716 		/*
2717 		 * Client not found, Obviously mdi_pathinfo node has not been
2718 		 * created yet.
2719 		 */
2720 		MDI_VHCI_CLIENT_UNLOCK(vh);
2721 		MDI_DEBUG(2, (MDI_NOTE, pdip,
2722 		    "client not found for caddr @%s", caddr ? caddr : ""));
2723 		return (NULL);
2724 	}
2725 
2726 	/*
2727 	 * Hold the client lock and look for a mdi_pathinfo node with matching
2728 	 * pHCI and paddr
2729 	 */
2730 	MDI_CLIENT_LOCK(ct);
2731 
2732 	/*
2733 	 * Release the global mutex as it is no more needed. Note: We always
2734 	 * respect the locking order while acquiring.
2735 	 */
2736 	MDI_VHCI_CLIENT_UNLOCK(vh);
2737 
2738 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2739 	while (pip != NULL) {
2740 		/*
2741 		 * Compare the unit address
2742 		 */
2743 		if ((MDI_PI(pip)->pi_phci == ph) &&
2744 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2745 			break;
2746 		}
2747 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2748 	}
2749 	MDI_CLIENT_UNLOCK(ct);
2750 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2751 	    "found: %s %p", mdi_pi_spathname(pip), (void *)pip));
2752 	return (pip);
2753 }
2754 
2755 /*
2756  * mdi_pi_alloc():
2757  *		Allocate and initialize a new instance of a mdi_pathinfo node.
2758  *		The mdi_pathinfo node returned by this function identifies a
2759  *		unique device path is capable of having properties attached
2760  *		and passed to mdi_pi_online() to fully attach and online the
2761  *		path and client device node.
2762  *		The mdi_pathinfo node returned by this function must be
2763  *		destroyed using mdi_pi_free() if the path is no longer
2764  *		operational or if the caller fails to attach a client device
2765  *		node when calling mdi_pi_online(). The framework will not free
2766  *		the resources allocated.
2767  *		This function can be called from both interrupt and kernel
2768  *		contexts.  DDI_NOSLEEP flag should be used while calling
2769  *		from interrupt contexts.
2770  * Return Values:
2771  *		MDI_SUCCESS
2772  *		MDI_FAILURE
2773  *		MDI_NOMEM
2774  */
2775 /*ARGSUSED*/
2776 int
2777 mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2778     char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
2779 {
2780 	mdi_vhci_t	*vh;
2781 	mdi_phci_t	*ph;
2782 	mdi_client_t	*ct;
2783 	mdi_pathinfo_t	*pip = NULL;
2784 	dev_info_t	*cdip;
2785 	int		rv = MDI_NOMEM;
2786 	int		path_allocated = 0;
2787 
2788 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2789 	    "cname %s: caddr@%s paddr@%s",
2790 	    cname ? cname : "", caddr ? caddr : "", paddr ? paddr : ""));
2791 
2792 	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
2793 	    ret_pip == NULL) {
2794 		/* Nothing more to do */
2795 		return (MDI_FAILURE);
2796 	}
2797 
2798 	*ret_pip = NULL;
2799 
2800 	/* No allocations on detaching pHCI */
2801 	if (DEVI_IS_DETACHING(pdip)) {
2802 		/* Invalid pHCI device, return failure */
2803 		MDI_DEBUG(1, (MDI_WARN, pdip,
2804 		    "!detaching pHCI=%p", (void *)pdip));
2805 		return (MDI_FAILURE);
2806 	}
2807 
2808 	ph = i_devi_get_phci(pdip);
2809 	ASSERT(ph != NULL);
2810 	if (ph == NULL) {
2811 		/* Invalid pHCI device, return failure */
2812 		MDI_DEBUG(1, (MDI_WARN, pdip,
2813 		    "!invalid pHCI=%p", (void *)pdip));
2814 		return (MDI_FAILURE);
2815 	}
2816 
2817 	MDI_PHCI_LOCK(ph);
2818 	vh = ph->ph_vhci;
2819 	if (vh == NULL) {
2820 		/* Invalid vHCI device, return failure */
2821 		MDI_DEBUG(1, (MDI_WARN, pdip,
2822 		    "!invalid vHCI=%p", (void *)pdip));
2823 		MDI_PHCI_UNLOCK(ph);
2824 		return (MDI_FAILURE);
2825 	}
2826 
2827 	if (MDI_PHCI_IS_READY(ph) == 0) {
2828 		/*
2829 		 * Do not allow new node creation when pHCI is in
2830 		 * offline/suspended states
2831 		 */
2832 		MDI_DEBUG(1, (MDI_WARN, pdip,
2833 		    "pHCI=%p is not ready", (void *)ph));
2834 		MDI_PHCI_UNLOCK(ph);
2835 		return (MDI_BUSY);
2836 	}
2837 	MDI_PHCI_UNSTABLE(ph);
2838 	MDI_PHCI_UNLOCK(ph);
2839 
2840 	/* look for a matching client, create one if not found */
2841 	MDI_VHCI_CLIENT_LOCK(vh);
2842 	ct = i_mdi_client_find(vh, cname, caddr);
2843 	if (ct == NULL) {
2844 		ct = i_mdi_client_alloc(vh, cname, caddr);
2845 		ASSERT(ct != NULL);
2846 	}
2847 
2848 	if (ct->ct_dip == NULL) {
2849 		/*
2850 		 * Allocate a devinfo node
2851 		 */
2852 		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
2853 		    compatible, ncompatible);
2854 		if (ct->ct_dip == NULL) {
2855 			(void) i_mdi_client_free(vh, ct);
2856 			goto fail;
2857 		}
2858 	}
2859 	cdip = ct->ct_dip;
2860 
2861 	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
2862 	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;
2863 
2864 	MDI_CLIENT_LOCK(ct);
2865 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2866 	while (pip != NULL) {
2867 		/*
2868 		 * Compare the unit address
2869 		 */
2870 		if ((MDI_PI(pip)->pi_phci == ph) &&
2871 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2872 			break;
2873 		}
2874 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2875 	}
2876 	MDI_CLIENT_UNLOCK(ct);
2877 
2878 	if (pip == NULL) {
2879 		/*
2880 		 * This is a new path for this client device.  Allocate and
2881 		 * initialize a new pathinfo node
2882 		 */
2883 		pip = i_mdi_pi_alloc(ph, paddr, ct);
2884 		ASSERT(pip != NULL);
2885 		path_allocated = 1;
2886 	}
2887 	rv = MDI_SUCCESS;
2888 
2889 fail:
2890 	/*
2891 	 * Release the global mutex.
2892 	 */
2893 	MDI_VHCI_CLIENT_UNLOCK(vh);
2894 
2895 	/*
2896 	 * Mark the pHCI as stable
2897 	 */
2898 	MDI_PHCI_LOCK(ph);
2899 	MDI_PHCI_STABLE(ph);
2900 	MDI_PHCI_UNLOCK(ph);
2901 	*ret_pip = pip;
2902 
2903 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2904 	    "alloc %s %p", mdi_pi_spathname(pip), (void *)pip));
2905 
2906 	if (path_allocated)
2907 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
2908 
2909 	return (rv);
2910 }
2911 
2912 /*ARGSUSED*/
2913 int
2914 mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2915     int flags, mdi_pathinfo_t **ret_pip)
2916 {
2917 	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
2918 	    flags, ret_pip));
2919 }
2920 
2921 /*
2922  * i_mdi_pi_alloc():
2923  *		Allocate a mdi_pathinfo node and add to the pHCI path list
2924  * Return Values:
2925  *		mdi_pathinfo
2926  */
2927 /*ARGSUSED*/
2928 static mdi_pathinfo_t *
2929 i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
2930 {
2931 	mdi_pathinfo_t	*pip;
2932 	int		ct_circular;
2933 	int		ph_circular;
2934 	static char	path[MAXPATHLEN];	/* mdi_pathmap_mutex protects */
2935 	char		*path_persistent;
2936 	int		path_instance;
2937 	mod_hash_val_t	hv;
2938 
2939 	ASSERT(MDI_VHCI_CLIENT_LOCKED(ph->ph_vhci));
2940 
2941 	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
2942 	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
2943 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
2944 	    MDI_PATHINFO_STATE_TRANSIENT;
2945 
2946 	if (MDI_PHCI_IS_USER_DISABLED(ph))
2947 		MDI_PI_SET_USER_DISABLE(pip);
2948 
2949 	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
2950 		MDI_PI_SET_DRV_DISABLE_TRANS(pip);
2951 
2952 	if (MDI_PHCI_IS_DRV_DISABLED(ph))
2953 		MDI_PI_SET_DRV_DISABLE(pip);
2954 
2955 	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
2956 	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
2957 	MDI_PI(pip)->pi_client = ct;
2958 	MDI_PI(pip)->pi_phci = ph;
2959 	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
2960 	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);
2961 
2962         /*
2963 	 * We form the "path" to the pathinfo node, and see if we have
2964 	 * already allocated a 'path_instance' for that "path".  If so,
2965 	 * we use the already allocated 'path_instance'.  If not, we
2966 	 * allocate a new 'path_instance' and associate it with a copy of
2967 	 * the "path" string (which is never freed). The association
2968 	 * between a 'path_instance' this "path" string persists until
2969 	 * reboot.
2970 	 */
2971         mutex_enter(&mdi_pathmap_mutex);
2972 	(void) ddi_pathname(ph->ph_dip, path);
2973 	(void) sprintf(path + strlen(path), "/%s@%s",
2974 	    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2975         if (mod_hash_find(mdi_pathmap_bypath, (mod_hash_key_t)path, &hv) == 0) {
2976                 path_instance = (uint_t)(intptr_t)hv;
2977         } else {
2978 		/* allocate a new 'path_instance' and persistent "path" */
2979 		path_instance = mdi_pathmap_instance++;
2980 		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2981                 (void) mod_hash_insert(mdi_pathmap_bypath,
2982                     (mod_hash_key_t)path_persistent,
2983                     (mod_hash_val_t)(intptr_t)path_instance);
2984 		(void) mod_hash_insert(mdi_pathmap_byinstance,
2985 		    (mod_hash_key_t)(intptr_t)path_instance,
2986 		    (mod_hash_val_t)path_persistent);
2987 
2988 		/* create shortpath name */
2989 		(void) snprintf(path, sizeof(path), "%s%d/%s@%s",
2990 		    ddi_driver_name(ph->ph_dip), ddi_get_instance(ph->ph_dip),
2991 		    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2992 		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2993 		(void) mod_hash_insert(mdi_pathmap_sbyinstance,
2994 		    (mod_hash_key_t)(intptr_t)path_instance,
2995 		    (mod_hash_val_t)path_persistent);
2996         }
2997         mutex_exit(&mdi_pathmap_mutex);
2998 	MDI_PI(pip)->pi_path_instance = path_instance;
2999 
3000 	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
3001 	ASSERT(MDI_PI(pip)->pi_prop != NULL);
3002 	MDI_PI(pip)->pi_pprivate = NULL;
3003 	MDI_PI(pip)->pi_cprivate = NULL;
3004 	MDI_PI(pip)->pi_vprivate = NULL;
3005 	MDI_PI(pip)->pi_client_link = NULL;
3006 	MDI_PI(pip)->pi_phci_link = NULL;
3007 	MDI_PI(pip)->pi_ref_cnt = 0;
3008 	MDI_PI(pip)->pi_kstats = NULL;
3009 	MDI_PI(pip)->pi_preferred = 1;
3010 	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);
3011 
3012 	/*
3013 	 * Lock both dev_info nodes against changes in parallel.
3014 	 *
3015 	 * The ndi_devi_enter(Client), is atypical since the client is a leaf.
3016 	 * This atypical operation is done to synchronize pathinfo nodes
3017 	 * during devinfo snapshot (see di_register_pip) by 'pretending' that
3018 	 * the pathinfo nodes are children of the Client.
3019 	 */
3020 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3021 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3022 
3023 	i_mdi_phci_add_path(ph, pip);
3024 	i_mdi_client_add_path(ct, pip);
3025 
3026 	ndi_devi_exit(ph->ph_dip, ph_circular);
3027 	ndi_devi_exit(ct->ct_dip, ct_circular);
3028 
3029 	return (pip);
3030 }
3031 
3032 /*
3033  * mdi_pi_pathname_by_instance():
3034  *	Lookup of "path" by 'path_instance'. Return "path".
3035  *	NOTE: returned "path" remains valid forever (until reboot).
3036  */
3037 char *
3038 mdi_pi_pathname_by_instance(int path_instance)
3039 {
3040 	char		*path;
3041 	mod_hash_val_t	hv;
3042 
3043 	/* mdi_pathmap lookup of "path" by 'path_instance' */
3044 	mutex_enter(&mdi_pathmap_mutex);
3045 	if (mod_hash_find(mdi_pathmap_byinstance,
3046 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
3047 		path = (char *)hv;
3048 	else
3049 		path = NULL;
3050 	mutex_exit(&mdi_pathmap_mutex);
3051 	return (path);
3052 }
3053 
3054 /*
3055  * mdi_pi_spathname_by_instance():
3056  *	Lookup of "shortpath" by 'path_instance'. Return "shortpath".
3057  *	NOTE: returned "shortpath" remains valid forever (until reboot).
3058  */
3059 char *
3060 mdi_pi_spathname_by_instance(int path_instance)
3061 {
3062 	char		*path;
3063 	mod_hash_val_t	hv;
3064 
3065 	/* mdi_pathmap lookup of "path" by 'path_instance' */
3066 	mutex_enter(&mdi_pathmap_mutex);
3067 	if (mod_hash_find(mdi_pathmap_sbyinstance,
3068 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
3069 		path = (char *)hv;
3070 	else
3071 		path = NULL;
3072 	mutex_exit(&mdi_pathmap_mutex);
3073 	return (path);
3074 }
3075 
3076 
3077 /*
3078  * i_mdi_phci_add_path():
3079  * 		Add a mdi_pathinfo node to pHCI list.
3080  * Notes:
3081  *		Caller should per-pHCI mutex
3082  */
3083 static void
3084 i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3085 {
3086 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3087 
3088 	MDI_PHCI_LOCK(ph);
3089 	if (ph->ph_path_head == NULL) {
3090 		ph->ph_path_head = pip;
3091 	} else {
3092 		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
3093 	}
3094 	ph->ph_path_tail = pip;
3095 	ph->ph_path_count++;
3096 	MDI_PHCI_UNLOCK(ph);
3097 }
3098 
3099 /*
3100  * i_mdi_client_add_path():
3101  *		Add mdi_pathinfo node to client list
3102  */
3103 static void
3104 i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3105 {
3106 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3107 
3108 	MDI_CLIENT_LOCK(ct);
3109 	if (ct->ct_path_head == NULL) {
3110 		ct->ct_path_head = pip;
3111 	} else {
3112 		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
3113 	}
3114 	ct->ct_path_tail = pip;
3115 	ct->ct_path_count++;
3116 	MDI_CLIENT_UNLOCK(ct);
3117 }
3118 
3119 /*
3120  * mdi_pi_free():
3121  *		Free the mdi_pathinfo node and also client device node if this
3122  *		is the last path to the device
3123  * Return Values:
3124  *		MDI_SUCCESS
3125  *		MDI_FAILURE
3126  *		MDI_BUSY
3127  */
3128 /*ARGSUSED*/
3129 int
3130 mdi_pi_free(mdi_pathinfo_t *pip, int flags)
3131 {
3132 	int		rv;
3133 	mdi_vhci_t	*vh;
3134 	mdi_phci_t	*ph;
3135 	mdi_client_t	*ct;
3136 	int		(*f)();
3137 	int		client_held = 0;
3138 
3139 	MDI_PI_LOCK(pip);
3140 	ph = MDI_PI(pip)->pi_phci;
3141 	ASSERT(ph != NULL);
3142 	if (ph == NULL) {
3143 		/*
3144 		 * Invalid pHCI device, return failure
3145 		 */
3146 		MDI_DEBUG(1, (MDI_WARN, NULL,
3147 		    "!invalid pHCI: pip %s %p",
3148 		    mdi_pi_spathname(pip), (void *)pip));
3149 		MDI_PI_UNLOCK(pip);
3150 		return (MDI_FAILURE);
3151 	}
3152 
3153 	vh = ph->ph_vhci;
3154 	ASSERT(vh != NULL);
3155 	if (vh == NULL) {
3156 		/* Invalid pHCI device, return failure */
3157 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3158 		    "!invalid vHCI: pip %s %p",
3159 		    mdi_pi_spathname(pip), (void *)pip));
3160 		MDI_PI_UNLOCK(pip);
3161 		return (MDI_FAILURE);
3162 	}
3163 
3164 	ct = MDI_PI(pip)->pi_client;
3165 	ASSERT(ct != NULL);
3166 	if (ct == NULL) {
3167 		/*
3168 		 * Invalid Client device, return failure
3169 		 */
3170 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3171 		    "!invalid client: pip %s %p",
3172 		    mdi_pi_spathname(pip), (void *)pip));
3173 		MDI_PI_UNLOCK(pip);
3174 		return (MDI_FAILURE);
3175 	}
3176 
3177 	/*
3178 	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
3179 	 * if the node state is either offline or init and the reference count
3180 	 * is zero.
3181 	 */
3182 	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
3183 	    MDI_PI_IS_INITING(pip))) {
3184 		/*
3185 		 * Node is busy
3186 		 */
3187 		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3188 		    "!busy: pip %s %p", mdi_pi_spathname(pip), (void *)pip));
3189 		MDI_PI_UNLOCK(pip);
3190 		return (MDI_BUSY);
3191 	}
3192 
3193 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3194 		/*
3195 		 * Give a chance for pending I/Os to complete.
3196 		 */
3197 		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3198 		    "!%d cmds still pending on path: %s %p",
3199 		    MDI_PI(pip)->pi_ref_cnt,
3200 		    mdi_pi_spathname(pip), (void *)pip));
3201 		if (cv_reltimedwait(&MDI_PI(pip)->pi_ref_cv,
3202 		    &MDI_PI(pip)->pi_mutex, drv_usectohz(60 * 1000000),
3203 		    TR_CLOCK_TICK) == -1) {
3204 			/*
3205 			 * The timeout time reached without ref_cnt being zero
3206 			 * being signaled.
3207 			 */
3208 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3209 			    "!Timeout reached on path %s %p without the cond",
3210 			    mdi_pi_spathname(pip), (void *)pip));
3211 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3212 			    "!%d cmds still pending on path %s %p",
3213 			    MDI_PI(pip)->pi_ref_cnt,
3214 			    mdi_pi_spathname(pip), (void *)pip));
3215 			MDI_PI_UNLOCK(pip);
3216 			return (MDI_BUSY);
3217 		}
3218 	}
3219 	if (MDI_PI(pip)->pi_pm_held) {
3220 		client_held = 1;
3221 	}
3222 	MDI_PI_UNLOCK(pip);
3223 
3224 	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));
3225 
3226 	MDI_CLIENT_LOCK(ct);
3227 
3228 	/* Prevent further failovers till MDI_VHCI_CLIENT_LOCK is held */
3229 	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);
3230 
3231 	/*
3232 	 * Wait till failover is complete before removing this node.
3233 	 */
3234 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3235 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3236 
3237 	MDI_CLIENT_UNLOCK(ct);
3238 	MDI_VHCI_CLIENT_LOCK(vh);
3239 	MDI_CLIENT_LOCK(ct);
3240 	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);
3241 
3242 	if (!MDI_PI_IS_INITING(pip)) {
3243 		f = vh->vh_ops->vo_pi_uninit;
3244 		if (f != NULL) {
3245 			rv = (*f)(vh->vh_dip, pip, 0);
3246 		}
3247 	} else
3248 		rv = MDI_SUCCESS;
3249 
3250 	/*
3251 	 * If vo_pi_uninit() completed successfully.
3252 	 */
3253 	if (rv == MDI_SUCCESS) {
3254 		if (client_held) {
3255 			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3256 			    "i_mdi_pm_rele_client\n"));
3257 			i_mdi_pm_rele_client(ct, 1);
3258 		}
3259 		i_mdi_pi_free(ph, pip, ct);
3260 		if (ct->ct_path_count == 0) {
3261 			/*
3262 			 * Client lost its last path.
3263 			 * Clean up the client device
3264 			 */
3265 			MDI_CLIENT_UNLOCK(ct);
3266 			(void) i_mdi_client_free(ct->ct_vhci, ct);
3267 			MDI_VHCI_CLIENT_UNLOCK(vh);
3268 			return (rv);
3269 		}
3270 	}
3271 	MDI_CLIENT_UNLOCK(ct);
3272 	MDI_VHCI_CLIENT_UNLOCK(vh);
3273 
3274 	if (rv == MDI_FAILURE)
3275 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
3276 
3277 	return (rv);
3278 }
3279 
3280 /*
3281  * i_mdi_pi_free():
3282  *		Free the mdi_pathinfo node
3283  */
3284 static void
3285 i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
3286 {
3287 	int	ct_circular;
3288 	int	ph_circular;
3289 
3290 	ASSERT(MDI_CLIENT_LOCKED(ct));
3291 
3292 	/*
3293 	 * remove any per-path kstats
3294 	 */
3295 	i_mdi_pi_kstat_destroy(pip);
3296 
3297 	/* See comments in i_mdi_pi_alloc() */
3298 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3299 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3300 
3301 	i_mdi_client_remove_path(ct, pip);
3302 	i_mdi_phci_remove_path(ph, pip);
3303 
3304 	ndi_devi_exit(ph->ph_dip, ph_circular);
3305 	ndi_devi_exit(ct->ct_dip, ct_circular);
3306 
3307 	mutex_destroy(&MDI_PI(pip)->pi_mutex);
3308 	cv_destroy(&MDI_PI(pip)->pi_state_cv);
3309 	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
3310 	if (MDI_PI(pip)->pi_addr) {
3311 		kmem_free(MDI_PI(pip)->pi_addr,
3312 		    strlen(MDI_PI(pip)->pi_addr) + 1);
3313 		MDI_PI(pip)->pi_addr = NULL;
3314 	}
3315 
3316 	if (MDI_PI(pip)->pi_prop) {
3317 		(void) nvlist_free(MDI_PI(pip)->pi_prop);
3318 		MDI_PI(pip)->pi_prop = NULL;
3319 	}
3320 	kmem_free(pip, sizeof (struct mdi_pathinfo));
3321 }
3322 
3323 
3324 /*
3325  * i_mdi_phci_remove_path():
3326  * 		Remove a mdi_pathinfo node from pHCI list.
3327  * Notes:
3328  *		Caller should hold per-pHCI mutex
3329  */
3330 static void
3331 i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3332 {
3333 	mdi_pathinfo_t	*prev = NULL;
3334 	mdi_pathinfo_t	*path = NULL;
3335 
3336 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3337 
3338 	MDI_PHCI_LOCK(ph);
3339 	path = ph->ph_path_head;
3340 	while (path != NULL) {
3341 		if (path == pip) {
3342 			break;
3343 		}
3344 		prev = path;
3345 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3346 	}
3347 
3348 	if (path) {
3349 		ph->ph_path_count--;
3350 		if (prev) {
3351 			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
3352 		} else {
3353 			ph->ph_path_head =
3354 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3355 		}
3356 		if (ph->ph_path_tail == path) {
3357 			ph->ph_path_tail = prev;
3358 		}
3359 	}
3360 
3361 	/*
3362 	 * Clear the pHCI link
3363 	 */
3364 	MDI_PI(pip)->pi_phci_link = NULL;
3365 	MDI_PI(pip)->pi_phci = NULL;
3366 	MDI_PHCI_UNLOCK(ph);
3367 }
3368 
3369 /*
3370  * i_mdi_client_remove_path():
3371  * 		Remove a mdi_pathinfo node from client path list.
3372  */
3373 static void
3374 i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3375 {
3376 	mdi_pathinfo_t	*prev = NULL;
3377 	mdi_pathinfo_t	*path;
3378 
3379 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3380 
3381 	ASSERT(MDI_CLIENT_LOCKED(ct));
3382 	path = ct->ct_path_head;
3383 	while (path != NULL) {
3384 		if (path == pip) {
3385 			break;
3386 		}
3387 		prev = path;
3388 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3389 	}
3390 
3391 	if (path) {
3392 		ct->ct_path_count--;
3393 		if (prev) {
3394 			MDI_PI(prev)->pi_client_link =
3395 			    MDI_PI(path)->pi_client_link;
3396 		} else {
3397 			ct->ct_path_head =
3398 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3399 		}
3400 		if (ct->ct_path_tail == path) {
3401 			ct->ct_path_tail = prev;
3402 		}
3403 		if (ct->ct_path_last == path) {
3404 			ct->ct_path_last = ct->ct_path_head;
3405 		}
3406 	}
3407 	MDI_PI(pip)->pi_client_link = NULL;
3408 	MDI_PI(pip)->pi_client = NULL;
3409 }
3410 
3411 /*
3412  * i_mdi_pi_state_change():
3413  *		online a mdi_pathinfo node
3414  *
3415  * Return Values:
3416  *		MDI_SUCCESS
3417  *		MDI_FAILURE
3418  */
3419 /*ARGSUSED*/
3420 static int
3421 i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
3422 {
3423 	int		rv = MDI_SUCCESS;
3424 	mdi_vhci_t	*vh;
3425 	mdi_phci_t	*ph;
3426 	mdi_client_t	*ct;
3427 	int		(*f)();
3428 	dev_info_t	*cdip;
3429 
3430 	MDI_PI_LOCK(pip);
3431 
3432 	ph = MDI_PI(pip)->pi_phci;
3433 	ASSERT(ph);
3434 	if (ph == NULL) {
3435 		/*
3436 		 * Invalid pHCI device, fail the request
3437 		 */
3438 		MDI_PI_UNLOCK(pip);
3439 		MDI_DEBUG(1, (MDI_WARN, NULL,
3440 		    "!invalid phci: pip %s %p",
3441 		    mdi_pi_spathname(pip), (void *)pip));
3442 		return (MDI_FAILURE);
3443 	}
3444 
3445 	vh = ph->ph_vhci;
3446 	ASSERT(vh);
3447 	if (vh == NULL) {
3448 		/*
3449 		 * Invalid vHCI device, fail the request
3450 		 */
3451 		MDI_PI_UNLOCK(pip);
3452 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3453 		    "!invalid vhci: pip %s %p",
3454 		    mdi_pi_spathname(pip), (void *)pip));
3455 		return (MDI_FAILURE);
3456 	}
3457 
3458 	ct = MDI_PI(pip)->pi_client;
3459 	ASSERT(ct != NULL);
3460 	if (ct == NULL) {
3461 		/*
3462 		 * Invalid client device, fail the request
3463 		 */
3464 		MDI_PI_UNLOCK(pip);
3465 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3466 		    "!invalid client: pip %s %p",
3467 		    mdi_pi_spathname(pip), (void *)pip));
3468 		return (MDI_FAILURE);
3469 	}
3470 
3471 	/*
3472 	 * If this path has not been initialized yet, Callback vHCI driver's
3473 	 * pathinfo node initialize entry point
3474 	 */
3475 
3476 	if (MDI_PI_IS_INITING(pip)) {
3477 		MDI_PI_UNLOCK(pip);
3478 		f = vh->vh_ops->vo_pi_init;
3479 		if (f != NULL) {
3480 			rv = (*f)(vh->vh_dip, pip, 0);
3481 			if (rv != MDI_SUCCESS) {
3482 				MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3483 				    "!vo_pi_init failed: vHCI %p, pip %s %p",
3484 				    (void *)vh, mdi_pi_spathname(pip),
3485 				    (void *)pip));
3486 				return (MDI_FAILURE);
3487 			}
3488 		}
3489 		MDI_PI_LOCK(pip);
3490 		MDI_PI_CLEAR_TRANSIENT(pip);
3491 	}
3492 
3493 	/*
3494 	 * Do not allow state transition when pHCI is in offline/suspended
3495 	 * states
3496 	 */
3497 	i_mdi_phci_lock(ph, pip);
3498 	if (MDI_PHCI_IS_READY(ph) == 0) {
3499 		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3500 		    "!pHCI not ready, pHCI=%p", (void *)ph));
3501 		MDI_PI_UNLOCK(pip);
3502 		i_mdi_phci_unlock(ph);
3503 		return (MDI_BUSY);
3504 	}
3505 	MDI_PHCI_UNSTABLE(ph);
3506 	i_mdi_phci_unlock(ph);
3507 
3508 	/*
3509 	 * Check if mdi_pathinfo state is in transient state.
3510 	 * If yes, offlining is in progress and wait till transient state is
3511 	 * cleared.
3512 	 */
3513 	if (MDI_PI_IS_TRANSIENT(pip)) {
3514 		while (MDI_PI_IS_TRANSIENT(pip)) {
3515 			cv_wait(&MDI_PI(pip)->pi_state_cv,
3516 			    &MDI_PI(pip)->pi_mutex);
3517 		}
3518 	}
3519 
3520 	/*
3521 	 * Grab the client lock in reverse order sequence and release the
3522 	 * mdi_pathinfo mutex.
3523 	 */
3524 	i_mdi_client_lock(ct, pip);
3525 	MDI_PI_UNLOCK(pip);
3526 
3527 	/*
3528 	 * Wait till failover state is cleared
3529 	 */
3530 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3531 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3532 
3533 	/*
3534 	 * Mark the mdi_pathinfo node state as transient
3535 	 */
3536 	MDI_PI_LOCK(pip);
3537 	switch (state) {
3538 	case MDI_PATHINFO_STATE_ONLINE:
3539 		MDI_PI_SET_ONLINING(pip);
3540 		break;
3541 
3542 	case MDI_PATHINFO_STATE_STANDBY:
3543 		MDI_PI_SET_STANDBYING(pip);
3544 		break;
3545 
3546 	case MDI_PATHINFO_STATE_FAULT:
3547 		/*
3548 		 * Mark the pathinfo state as FAULTED
3549 		 */
3550 		MDI_PI_SET_FAULTING(pip);
3551 		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
3552 		break;
3553 
3554 	case MDI_PATHINFO_STATE_OFFLINE:
3555 		/*
3556 		 * ndi_devi_offline() cannot hold pip or ct locks.
3557 		 */
3558 		MDI_PI_UNLOCK(pip);
3559 
3560 		/*
3561 		 * If this is a user initiated path online->offline operation
3562 		 * who's success would transition a client from DEGRADED to
3563 		 * FAILED then only proceed if we can offline the client first.
3564 		 */
3565 		cdip = ct->ct_dip;
3566 		if ((flag & NDI_USER_REQ) &&
3567 		    MDI_PI_IS_ONLINE(pip) &&
3568 		    (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED)) {
3569 			i_mdi_client_unlock(ct);
3570 			rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN);
3571 			if (rv != NDI_SUCCESS) {
3572 				/*
3573 				 * Convert to MDI error code
3574 				 */
3575 				switch (rv) {
3576 				case NDI_BUSY:
3577 					rv = MDI_BUSY;
3578 					break;
3579 				default:
3580 					rv = MDI_FAILURE;
3581 					break;
3582 				}
3583 				goto state_change_exit;
3584 			} else {
3585 				i_mdi_client_lock(ct, NULL);
3586 			}
3587 		}
3588 		/*
3589 		 * Mark the mdi_pathinfo node state as transient
3590 		 */
3591 		MDI_PI_LOCK(pip);
3592 		MDI_PI_SET_OFFLINING(pip);
3593 		break;
3594 	}
3595 	MDI_PI_UNLOCK(pip);
3596 	MDI_CLIENT_UNSTABLE(ct);
3597 	i_mdi_client_unlock(ct);
3598 
3599 	f = vh->vh_ops->vo_pi_state_change;
3600 	if (f != NULL)
3601 		rv = (*f)(vh->vh_dip, pip, state, 0, flag);
3602 
3603 	MDI_CLIENT_LOCK(ct);
3604 	MDI_PI_LOCK(pip);
3605 	if (rv == MDI_NOT_SUPPORTED) {
3606 		MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
3607 	}
3608 	if (rv != MDI_SUCCESS) {
3609 		MDI_DEBUG(2, (MDI_WARN, ct->ct_dip,
3610 		    "vo_pi_state_change failed: rv %x", rv));
3611 	}
3612 	if (MDI_PI_IS_TRANSIENT(pip)) {
3613 		if (rv == MDI_SUCCESS) {
3614 			MDI_PI_CLEAR_TRANSIENT(pip);
3615 		} else {
3616 			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
3617 		}
3618 	}
3619 
3620 	/*
3621 	 * Wake anyone waiting for this mdi_pathinfo node
3622 	 */
3623 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3624 	MDI_PI_UNLOCK(pip);
3625 
3626 	/*
3627 	 * Mark the client device as stable
3628 	 */
3629 	MDI_CLIENT_STABLE(ct);
3630 	if (rv == MDI_SUCCESS) {
3631 		if (ct->ct_unstable == 0) {
3632 			cdip = ct->ct_dip;
3633 
3634 			/*
3635 			 * Onlining the mdi_pathinfo node will impact the
3636 			 * client state Update the client and dev_info node
3637 			 * state accordingly
3638 			 */
3639 			rv = NDI_SUCCESS;
3640 			i_mdi_client_update_state(ct);
3641 			switch (MDI_CLIENT_STATE(ct)) {
3642 			case MDI_CLIENT_STATE_OPTIMAL:
3643 			case MDI_CLIENT_STATE_DEGRADED:
3644 				if (cdip && !i_ddi_devi_attached(cdip) &&
3645 				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
3646 				    (state == MDI_PATHINFO_STATE_STANDBY))) {
3647 
3648 					/*
3649 					 * Must do ndi_devi_online() through
3650 					 * hotplug thread for deferred
3651 					 * attach mechanism to work
3652 					 */
3653 					MDI_CLIENT_UNLOCK(ct);
3654 					rv = ndi_devi_online(cdip, 0);
3655 					MDI_CLIENT_LOCK(ct);
3656 					if ((rv != NDI_SUCCESS) &&
3657 					    (MDI_CLIENT_STATE(ct) ==
3658 					    MDI_CLIENT_STATE_DEGRADED)) {
3659 						/*
3660 						 * ndi_devi_online failed.
3661 						 * Reset client flags to
3662 						 * offline.
3663 						 */
3664 						MDI_DEBUG(1, (MDI_WARN, cdip,
3665 						    "!ndi_devi_online failed "
3666 						    "error %x", rv));
3667 						MDI_CLIENT_SET_OFFLINE(ct);
3668 					}
3669 					if (rv != NDI_SUCCESS) {
3670 						/* Reset the path state */
3671 						MDI_PI_LOCK(pip);
3672 						MDI_PI(pip)->pi_state =
3673 						    MDI_PI_OLD_STATE(pip);
3674 						MDI_PI_UNLOCK(pip);
3675 					}
3676 				}
3677 				break;
3678 
3679 			case MDI_CLIENT_STATE_FAILED:
3680 				/*
3681 				 * This is the last path case for
3682 				 * non-user initiated events.
3683 				 */
3684 				if (((flag & NDI_USER_REQ) == 0) &&
3685 				    cdip && (i_ddi_node_state(cdip) >=
3686 				    DS_INITIALIZED)) {
3687 					MDI_CLIENT_UNLOCK(ct);
3688 					rv = ndi_devi_offline(cdip,
3689 					    NDI_DEVFS_CLEAN);
3690 					MDI_CLIENT_LOCK(ct);
3691 
3692 					if (rv != NDI_SUCCESS) {
3693 						/*
3694 						 * ndi_devi_offline failed.
3695 						 * Reset client flags to
3696 						 * online as the path could not
3697 						 * be offlined.
3698 						 */
3699 						MDI_DEBUG(1, (MDI_WARN, cdip,
3700 						    "!ndi_devi_offline failed: "
3701 						    "error %x", rv));
3702 						MDI_CLIENT_SET_ONLINE(ct);
3703 					}
3704 				}
3705 				break;
3706 			}
3707 			/*
3708 			 * Convert to MDI error code
3709 			 */
3710 			switch (rv) {
3711 			case NDI_SUCCESS:
3712 				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3713 				i_mdi_report_path_state(ct, pip);
3714 				rv = MDI_SUCCESS;
3715 				break;
3716 			case NDI_BUSY:
3717 				rv = MDI_BUSY;
3718 				break;
3719 			default:
3720 				rv = MDI_FAILURE;
3721 				break;
3722 			}
3723 		}
3724 	}
3725 	MDI_CLIENT_UNLOCK(ct);
3726 
3727 state_change_exit:
3728 	/*
3729 	 * Mark the pHCI as stable again.
3730 	 */
3731 	MDI_PHCI_LOCK(ph);
3732 	MDI_PHCI_STABLE(ph);
3733 	MDI_PHCI_UNLOCK(ph);
3734 	return (rv);
3735 }
3736 
3737 /*
3738  * mdi_pi_online():
3739  *		Place the path_info node in the online state.  The path is
3740  *		now available to be selected by mdi_select_path() for
3741  *		transporting I/O requests to client devices.
3742  * Return Values:
3743  *		MDI_SUCCESS
3744  *		MDI_FAILURE
3745  */
3746 int
3747 mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3748 {
3749 	mdi_client_t	*ct = MDI_PI(pip)->pi_client;
3750 	int		client_held = 0;
3751 	int		rv;
3752 
3753 	ASSERT(ct != NULL);
3754 	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
3755 	if (rv != MDI_SUCCESS)
3756 		return (rv);
3757 
3758 	MDI_PI_LOCK(pip);
3759 	if (MDI_PI(pip)->pi_pm_held == 0) {
3760 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3761 		    "i_mdi_pm_hold_pip %p", (void *)pip));
3762 		i_mdi_pm_hold_pip(pip);
3763 		client_held = 1;
3764 	}
3765 	MDI_PI_UNLOCK(pip);
3766 
3767 	if (client_held) {
3768 		MDI_CLIENT_LOCK(ct);
3769 		if (ct->ct_power_cnt == 0) {
3770 			rv = i_mdi_power_all_phci(ct);
3771 		}
3772 
3773 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3774 		    "i_mdi_pm_hold_client %p", (void *)ct));
3775 		i_mdi_pm_hold_client(ct, 1);
3776 		MDI_CLIENT_UNLOCK(ct);
3777 	}
3778 
3779 	return (rv);
3780 }
3781 
3782 /*
3783  * mdi_pi_standby():
3784  *		Place the mdi_pathinfo node in standby state
3785  *
3786  * Return Values:
3787  *		MDI_SUCCESS
3788  *		MDI_FAILURE
3789  */
3790 int
3791 mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
3792 {
3793 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
3794 }
3795 
3796 /*
3797  * mdi_pi_fault():
3798  *		Place the mdi_pathinfo node in fault'ed state
3799  * Return Values:
3800  *		MDI_SUCCESS
3801  *		MDI_FAILURE
3802  */
3803 int
3804 mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
3805 {
3806 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
3807 }
3808 
3809 /*
3810  * mdi_pi_offline():
3811  *		Offline a mdi_pathinfo node.
3812  * Return Values:
3813  *		MDI_SUCCESS
3814  *		MDI_FAILURE
3815  */
3816 int
3817 mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3818 {
3819 	int	ret, client_held = 0;
3820 	mdi_client_t	*ct;
3821 
3822 	/*
3823 	 * Original code overloaded NDI_DEVI_REMOVE to this interface, and
3824 	 * used it to mean "user initiated operation" (i.e. devctl). Callers
3825 	 * should now just use NDI_USER_REQ.
3826 	 */
3827 	if (flags & NDI_DEVI_REMOVE) {
3828 		flags &= ~NDI_DEVI_REMOVE;
3829 		flags |= NDI_USER_REQ;
3830 	}
3831 
3832 	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
3833 
3834 	if (ret == MDI_SUCCESS) {
3835 		MDI_PI_LOCK(pip);
3836 		if (MDI_PI(pip)->pi_pm_held) {
3837 			client_held = 1;
3838 		}
3839 		MDI_PI_UNLOCK(pip);
3840 
3841 		if (client_held) {
3842 			ct = MDI_PI(pip)->pi_client;
3843 			MDI_CLIENT_LOCK(ct);
3844 			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3845 			    "i_mdi_pm_rele_client\n"));
3846 			i_mdi_pm_rele_client(ct, 1);
3847 			MDI_CLIENT_UNLOCK(ct);
3848 		}
3849 	}
3850 
3851 	return (ret);
3852 }
3853 
3854 /*
3855  * i_mdi_pi_offline():
3856  *		Offline a mdi_pathinfo node and call the vHCI driver's callback
3857  */
3858 static int
3859 i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3860 {
3861 	dev_info_t	*vdip = NULL;
3862 	mdi_vhci_t	*vh = NULL;
3863 	mdi_client_t	*ct = NULL;
3864 	int		(*f)();
3865 	int		rv;
3866 
3867 	MDI_PI_LOCK(pip);
3868 	ct = MDI_PI(pip)->pi_client;
3869 	ASSERT(ct != NULL);
3870 
3871 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3872 		/*
3873 		 * Give a chance for pending I/Os to complete.
3874 		 */
3875 		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3876 		    "!%d cmds still pending on path %s %p",
3877 		    MDI_PI(pip)->pi_ref_cnt, mdi_pi_spathname(pip),
3878 		    (void *)pip));
3879 		if (cv_reltimedwait(&MDI_PI(pip)->pi_ref_cv,
3880 		    &MDI_PI(pip)->pi_mutex, drv_usectohz(60 * 1000000),
3881 		    TR_CLOCK_TICK) == -1) {
3882 			/*
3883 			 * The timeout time reached without ref_cnt being zero
3884 			 * being signaled.
3885 			 */
3886 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3887 			    "!Timeout reached on path %s %p without the cond",
3888 			    mdi_pi_spathname(pip), (void *)pip));
3889 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3890 			    "!%d cmds still pending on path %s %p",
3891 			    MDI_PI(pip)->pi_ref_cnt,
3892 			    mdi_pi_spathname(pip), (void *)pip));
3893 		}
3894 	}
3895 	vh = ct->ct_vhci;
3896 	vdip = vh->vh_dip;
3897 
3898 	/*
3899 	 * Notify vHCI that has registered this event
3900 	 */
3901 	ASSERT(vh->vh_ops);
3902 	f = vh->vh_ops->vo_pi_state_change;
3903 
3904 	if (f != NULL) {
3905 		MDI_PI_UNLOCK(pip);
3906 		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
3907 		    flags)) != MDI_SUCCESS) {
3908 			MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3909 			    "!vo_path_offline failed: vdip %s%d %p: path %s %p",
3910 			    ddi_driver_name(vdip), ddi_get_instance(vdip),
3911 			    (void *)vdip, mdi_pi_spathname(pip), (void *)pip));
3912 		}
3913 		MDI_PI_LOCK(pip);
3914 	}
3915 
3916 	/*
3917 	 * Set the mdi_pathinfo node state and clear the transient condition
3918 	 */
3919 	MDI_PI_SET_OFFLINE(pip);
3920 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3921 	MDI_PI_UNLOCK(pip);
3922 
3923 	MDI_CLIENT_LOCK(ct);
3924 	if (rv == MDI_SUCCESS) {
3925 		if (ct->ct_unstable == 0) {
3926 			dev_info_t	*cdip = ct->ct_dip;
3927 
3928 			/*
3929 			 * Onlining the mdi_pathinfo node will impact the
3930 			 * client state Update the client and dev_info node
3931 			 * state accordingly
3932 			 */
3933 			i_mdi_client_update_state(ct);
3934 			rv = NDI_SUCCESS;
3935 			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
3936 				if (cdip &&
3937 				    (i_ddi_node_state(cdip) >=
3938 				    DS_INITIALIZED)) {
3939 					MDI_CLIENT_UNLOCK(ct);
3940 					rv = ndi_devi_offline(cdip,
3941 					    NDI_DEVFS_CLEAN);
3942 					MDI_CLIENT_LOCK(ct);
3943 					if (rv != NDI_SUCCESS) {
3944 						/*
3945 						 * ndi_devi_offline failed.
3946 						 * Reset client flags to
3947 						 * online.
3948 						 */
3949 						MDI_DEBUG(4, (MDI_WARN, cdip,
3950 						    "ndi_devi_offline failed: "
3951 						    "error %x", rv));
3952 						MDI_CLIENT_SET_ONLINE(ct);
3953 					}
3954 				}
3955 			}
3956 			/*
3957 			 * Convert to MDI error code
3958 			 */
3959 			switch (rv) {
3960 			case NDI_SUCCESS:
3961 				rv = MDI_SUCCESS;
3962 				break;
3963 			case NDI_BUSY:
3964 				rv = MDI_BUSY;
3965 				break;
3966 			default:
3967 				rv = MDI_FAILURE;
3968 				break;
3969 			}
3970 		}
3971 		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3972 		i_mdi_report_path_state(ct, pip);
3973 	}
3974 
3975 	MDI_CLIENT_UNLOCK(ct);
3976 
3977 	/*
3978 	 * Change in the mdi_pathinfo node state will impact the client state
3979 	 */
3980 	MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
3981 	    "ct = %p pip = %p", (void *)ct, (void *)pip));
3982 	return (rv);
3983 }
3984 
3985 /*
3986  * mdi_pi_get_node_name():
3987  *              Get the name associated with a mdi_pathinfo node.
3988  *              Since pathinfo nodes are not directly named, we
3989  *              return the node_name of the client.
3990  *
3991  * Return Values:
3992  *              char *
3993  */
3994 char *
3995 mdi_pi_get_node_name(mdi_pathinfo_t *pip)
3996 {
3997 	mdi_client_t    *ct;
3998 
3999 	if (pip == NULL)
4000 		return (NULL);
4001 	ct = MDI_PI(pip)->pi_client;
4002 	if ((ct == NULL) || (ct->ct_dip == NULL))
4003 		return (NULL);
4004 	return (ddi_node_name(ct->ct_dip));
4005 }
4006 
4007 /*
4008  * mdi_pi_get_addr():
4009  *		Get the unit address associated with a mdi_pathinfo node
4010  *
4011  * Return Values:
4012  *		char *
4013  */
4014 char *
4015 mdi_pi_get_addr(mdi_pathinfo_t *pip)
4016 {
4017 	if (pip == NULL)
4018 		return (NULL);
4019 
4020 	return (MDI_PI(pip)->pi_addr);
4021 }
4022 
4023 /*
4024  * mdi_pi_get_path_instance():
4025  *		Get the 'path_instance' of a mdi_pathinfo node
4026  *
4027  * Return Values:
4028  *		path_instance
4029  */
4030 int
4031 mdi_pi_get_path_instance(mdi_pathinfo_t *pip)
4032 {
4033 	if (pip == NULL)
4034 		return (0);
4035 
4036 	return (MDI_PI(pip)->pi_path_instance);
4037 }
4038 
4039 /*
4040  * mdi_pi_pathname():
4041  *		Return pointer to path to pathinfo node.
4042  */
4043 char *
4044 mdi_pi_pathname(mdi_pathinfo_t *pip)
4045 {
4046 	if (pip == NULL)
4047 		return (NULL);
4048 	return (mdi_pi_pathname_by_instance(mdi_pi_get_path_instance(pip)));
4049 }
4050 
4051 /*
4052  * mdi_pi_spathname():
4053  *		Return pointer to shortpath to pathinfo node. Used for debug
4054  *		messages, so return "" instead of NULL when unknown.
4055  */
4056 char *
4057 mdi_pi_spathname(mdi_pathinfo_t *pip)
4058 {
4059 	char	*spath = "";
4060 
4061 	if (pip) {
4062 		spath = mdi_pi_spathname_by_instance(
4063 		    mdi_pi_get_path_instance(pip));
4064 		if (spath == NULL)
4065 			spath = "";
4066 	}
4067 	return (spath);
4068 }
4069 
4070 char *
4071 mdi_pi_pathname_obp(mdi_pathinfo_t *pip, char *path)
4072 {
4073 	char *obp_path = NULL;
4074 	if ((pip == NULL) || (path == NULL))
4075 		return (NULL);
4076 
4077 	if (mdi_prop_lookup_string(pip, "obp-path", &obp_path) == MDI_SUCCESS) {
4078 		(void) strcpy(path, obp_path);
4079 		(void) mdi_prop_free(obp_path);
4080 	} else {
4081 		path = NULL;
4082 	}
4083 	return (path);
4084 }
4085 
4086 int
4087 mdi_pi_pathname_obp_set(mdi_pathinfo_t *pip, char *component)
4088 {
4089 	dev_info_t *pdip;
4090 	char *obp_path = NULL;
4091 	int rc = MDI_FAILURE;
4092 
4093 	if (pip == NULL)
4094 		return (MDI_FAILURE);
4095 
4096 	pdip = mdi_pi_get_phci(pip);
4097 	if (pdip == NULL)
4098 		return (MDI_FAILURE);
4099 
4100 	obp_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
4101 
4102 	if (ddi_pathname_obp(pdip, obp_path) == NULL) {
4103 		(void) ddi_pathname(pdip, obp_path);
4104 	}
4105 
4106 	if (component) {
4107 		(void) strncat(obp_path, "/", MAXPATHLEN);
4108 		(void) strncat(obp_path, component, MAXPATHLEN);
4109 	}
4110 	rc = mdi_prop_update_string(pip, "obp-path", obp_path);
4111 
4112 	if (obp_path)
4113 		kmem_free(obp_path, MAXPATHLEN);
4114 	return (rc);
4115 }
4116 
4117 /*
4118  * mdi_pi_get_client():
4119  *		Get the client devinfo associated with a mdi_pathinfo node
4120  *
4121  * Return Values:
4122  *		Handle to client device dev_info node
4123  */
4124 dev_info_t *
4125 mdi_pi_get_client(mdi_pathinfo_t *pip)
4126 {
4127 	dev_info_t	*dip = NULL;
4128 	if (pip) {
4129 		dip = MDI_PI(pip)->pi_client->ct_dip;
4130 	}
4131 	return (dip);
4132 }
4133 
4134 /*
4135  * mdi_pi_get_phci():
4136  *		Get the pHCI devinfo associated with the mdi_pathinfo node
4137  * Return Values:
4138  *		Handle to dev_info node
4139  */
4140 dev_info_t *
4141 mdi_pi_get_phci(mdi_pathinfo_t *pip)
4142 {
4143 	dev_info_t	*dip = NULL;
4144 	mdi_phci_t	*ph;
4145 
4146 	if (pip) {
4147 		ph = MDI_PI(pip)->pi_phci;
4148 		if (ph)
4149 			dip = ph->ph_dip;
4150 	}
4151 	return (dip);
4152 }
4153 
4154 /*
4155  * mdi_pi_get_client_private():
4156  *		Get the client private information associated with the
4157  *		mdi_pathinfo node
4158  */
4159 void *
4160 mdi_pi_get_client_private(mdi_pathinfo_t *pip)
4161 {
4162 	void *cprivate = NULL;
4163 	if (pip) {
4164 		cprivate = MDI_PI(pip)->pi_cprivate;
4165 	}
4166 	return (cprivate);
4167 }
4168 
4169 /*
4170  * mdi_pi_set_client_private():
4171  *		Set the client private information in the mdi_pathinfo node
4172  */
4173 void
4174 mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
4175 {
4176 	if (pip) {
4177 		MDI_PI(pip)->pi_cprivate = priv;
4178 	}
4179 }
4180 
4181 /*
4182  * mdi_pi_get_phci_private():
4183  *		Get the pHCI private information associated with the
4184  *		mdi_pathinfo node
4185  */
4186 caddr_t
4187 mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
4188 {
4189 	caddr_t	pprivate = NULL;
4190 
4191 	if (pip) {
4192 		pprivate = MDI_PI(pip)->pi_pprivate;
4193 	}
4194 	return (pprivate);
4195 }
4196 
4197 /*
4198  * mdi_pi_set_phci_private():
4199  *		Set the pHCI private information in the mdi_pathinfo node
4200  */
4201 void
4202 mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
4203 {
4204 	if (pip) {
4205 		MDI_PI(pip)->pi_pprivate = priv;
4206 	}
4207 }
4208 
4209 /*
4210  * mdi_pi_get_state():
4211  *		Get the mdi_pathinfo node state. Transient states are internal
4212  *		and not provided to the users
4213  */
4214 mdi_pathinfo_state_t
4215 mdi_pi_get_state(mdi_pathinfo_t *pip)
4216 {
4217 	mdi_pathinfo_state_t    state = MDI_PATHINFO_STATE_INIT;
4218 
4219 	if (pip) {
4220 		if (MDI_PI_IS_TRANSIENT(pip)) {
4221 			/*
4222 			 * mdi_pathinfo is in state transition.  Return the
4223 			 * last good state.
4224 			 */
4225 			state = MDI_PI_OLD_STATE(pip);
4226 		} else {
4227 			state = MDI_PI_STATE(pip);
4228 		}
4229 	}
4230 	return (state);
4231 }
4232 
4233 /*
4234  * mdi_pi_get_flags():
4235  *		Get the mdi_pathinfo node flags.
4236  */
4237 uint_t
4238 mdi_pi_get_flags(mdi_pathinfo_t *pip)
4239 {
4240 	return (pip ? MDI_PI(pip)->pi_flags : 0);
4241 }
4242 
4243 /*
4244  * Note that the following function needs to be the new interface for
4245  * mdi_pi_get_state when mpxio gets integrated to ON.
4246  */
4247 int
4248 mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
4249 		uint32_t *ext_state)
4250 {
4251 	*state = MDI_PATHINFO_STATE_INIT;
4252 
4253 	if (pip) {
4254 		if (MDI_PI_IS_TRANSIENT(pip)) {
4255 			/*
4256 			 * mdi_pathinfo is in state transition.  Return the
4257 			 * last good state.
4258 			 */
4259 			*state = MDI_PI_OLD_STATE(pip);
4260 			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
4261 		} else {
4262 			*state = MDI_PI_STATE(pip);
4263 			*ext_state = MDI_PI_EXT_STATE(pip);
4264 		}
4265 	}
4266 	return (MDI_SUCCESS);
4267 }
4268 
4269 /*
4270  * mdi_pi_get_preferred:
4271  *	Get the preferred path flag
4272  */
4273 int
4274 mdi_pi_get_preferred(mdi_pathinfo_t *pip)
4275 {
4276 	if (pip) {
4277 		return (MDI_PI(pip)->pi_preferred);
4278 	}
4279 	return (0);
4280 }
4281 
4282 /*
4283  * mdi_pi_set_preferred:
4284  *	Set the preferred path flag
4285  */
4286 void
4287 mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
4288 {
4289 	if (pip) {
4290 		MDI_PI(pip)->pi_preferred = preferred;
4291 	}
4292 }
4293 
4294 /*
4295  * mdi_pi_set_state():
4296  *		Set the mdi_pathinfo node state
4297  */
4298 void
4299 mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
4300 {
4301 	uint32_t	ext_state;
4302 
4303 	if (pip) {
4304 		ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
4305 		MDI_PI(pip)->pi_state = state;
4306 		MDI_PI(pip)->pi_state |= ext_state;
4307 
4308 		/* Path has changed state, invalidate DINFOCACHE snap shot. */
4309 		i_ddi_di_cache_invalidate();
4310 	}
4311 }
4312 
4313 /*
4314  * Property functions:
4315  */
4316 int
4317 i_map_nvlist_error_to_mdi(int val)
4318 {
4319 	int rv;
4320 
4321 	switch (val) {
4322 	case 0:
4323 		rv = DDI_PROP_SUCCESS;
4324 		break;
4325 	case EINVAL:
4326 	case ENOTSUP:
4327 		rv = DDI_PROP_INVAL_ARG;
4328 		break;
4329 	case ENOMEM:
4330 		rv = DDI_PROP_NO_MEMORY;
4331 		break;
4332 	default:
4333 		rv = DDI_PROP_NOT_FOUND;
4334 		break;
4335 	}
4336 	return (rv);
4337 }
4338 
4339 /*
4340  * mdi_pi_get_next_prop():
4341  * 		Property walk function.  The caller should hold mdi_pi_lock()
4342  *		and release by calling mdi_pi_unlock() at the end of walk to
4343  *		get a consistent value.
4344  */
4345 nvpair_t *
4346 mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
4347 {
4348 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4349 		return (NULL);
4350 	}
4351 	ASSERT(MDI_PI_LOCKED(pip));
4352 	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
4353 }
4354 
4355 /*
4356  * mdi_prop_remove():
4357  * 		Remove the named property from the named list.
4358  */
4359 int
4360 mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
4361 {
4362 	if (pip == NULL) {
4363 		return (DDI_PROP_NOT_FOUND);
4364 	}
4365 	ASSERT(!MDI_PI_LOCKED(pip));
4366 	MDI_PI_LOCK(pip);
4367 	if (MDI_PI(pip)->pi_prop == NULL) {
4368 		MDI_PI_UNLOCK(pip);
4369 		return (DDI_PROP_NOT_FOUND);
4370 	}
4371 	if (name) {
4372 		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
4373 	} else {
4374 		char		nvp_name[MAXNAMELEN];
4375 		nvpair_t	*nvp;
4376 		nvp = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
4377 		while (nvp) {
4378 			nvpair_t	*next;
4379 			next = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, nvp);
4380 			(void) snprintf(nvp_name, sizeof(nvp_name), "%s",
4381 			    nvpair_name(nvp));
4382 			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
4383 			    nvp_name);
4384 			nvp = next;
4385 		}
4386 	}
4387 	MDI_PI_UNLOCK(pip);
4388 	return (DDI_PROP_SUCCESS);
4389 }
4390 
4391 /*
4392  * mdi_prop_size():
4393  * 		Get buffer size needed to pack the property data.
4394  * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
4395  *		buffer size.
4396  */
4397 int
4398 mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
4399 {
4400 	int	rv;
4401 	size_t	bufsize;
4402 
4403 	*buflenp = 0;
4404 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4405 		return (DDI_PROP_NOT_FOUND);
4406 	}
4407 	ASSERT(MDI_PI_LOCKED(pip));
4408 	rv = nvlist_size(MDI_PI(pip)->pi_prop,
4409 	    &bufsize, NV_ENCODE_NATIVE);
4410 	*buflenp = bufsize;
4411 	return (i_map_nvlist_error_to_mdi(rv));
4412 }
4413 
4414 /*
4415  * mdi_prop_pack():
4416  * 		pack the property list.  The caller should hold the
4417  *		mdi_pathinfo_t node to get a consistent data
4418  */
4419 int
4420 mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
4421 {
4422 	int	rv;
4423 	size_t	bufsize;
4424 
4425 	if ((pip == NULL) || MDI_PI(pip)->pi_prop == NULL) {
4426 		return (DDI_PROP_NOT_FOUND);
4427 	}
4428 
4429 	ASSERT(MDI_PI_LOCKED(pip));
4430 
4431 	bufsize = buflen;
4432 	rv = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, (size_t *)&bufsize,
4433 	    NV_ENCODE_NATIVE, KM_SLEEP);
4434 
4435 	return (i_map_nvlist_error_to_mdi(rv));
4436 }
4437 
4438 /*
4439  * mdi_prop_update_byte():
4440  *		Create/Update a byte property
4441  */
4442 int
4443 mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
4444 {
4445 	int rv;
4446 
4447 	if (pip == NULL) {
4448 		return (DDI_PROP_INVAL_ARG);
4449 	}
4450 	ASSERT(!MDI_PI_LOCKED(pip));
4451 	MDI_PI_LOCK(pip);
4452 	if (MDI_PI(pip)->pi_prop == NULL) {
4453 		MDI_PI_UNLOCK(pip);
4454 		return (DDI_PROP_NOT_FOUND);
4455 	}
4456 	rv = nvlist_add_byte(MDI_PI(pip)->pi_prop, name, data);
4457 	MDI_PI_UNLOCK(pip);
4458 	return (i_map_nvlist_error_to_mdi(rv));
4459 }
4460 
4461 /*
4462  * mdi_prop_update_byte_array():
4463  *		Create/Update a byte array property
4464  */
4465 int
4466 mdi_prop_update_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t *data,
4467     uint_t nelements)
4468 {
4469 	int rv;
4470 
4471 	if (pip == NULL) {
4472 		return (DDI_PROP_INVAL_ARG);
4473 	}
4474 	ASSERT(!MDI_PI_LOCKED(pip));
4475 	MDI_PI_LOCK(pip);
4476 	if (MDI_PI(pip)->pi_prop == NULL) {
4477 		MDI_PI_UNLOCK(pip);
4478 		return (DDI_PROP_NOT_FOUND);
4479 	}
4480 	rv = nvlist_add_byte_array(MDI_PI(pip)->pi_prop, name, data, nelements);
4481 	MDI_PI_UNLOCK(pip);
4482 	return (i_map_nvlist_error_to_mdi(rv));
4483 }
4484 
4485 /*
4486  * mdi_prop_update_int():
4487  *		Create/Update a 32 bit integer property
4488  */
4489 int
4490 mdi_prop_update_int(mdi_pathinfo_t *pip, char *name, int data)
4491 {
4492 	int rv;
4493 
4494 	if (pip == NULL) {
4495 		return (DDI_PROP_INVAL_ARG);
4496 	}
4497 	ASSERT(!MDI_PI_LOCKED(pip));
4498 	MDI_PI_LOCK(pip);
4499 	if (MDI_PI(pip)->pi_prop == NULL) {
4500 		MDI_PI_UNLOCK(pip);
4501 		return (DDI_PROP_NOT_FOUND);
4502 	}
4503 	rv = nvlist_add_int32(MDI_PI(pip)->pi_prop, name, (int32_t)data);
4504 	MDI_PI_UNLOCK(pip);
4505 	return (i_map_nvlist_error_to_mdi(rv));
4506 }
4507 
4508 /*
4509  * mdi_prop_update_int64():
4510  *		Create/Update a 64 bit integer property
4511  */
4512 int
4513 mdi_prop_update_int64(mdi_pathinfo_t *pip, char *name, int64_t data)
4514 {
4515 	int rv;
4516 
4517 	if (pip == NULL) {
4518 		return (DDI_PROP_INVAL_ARG);
4519 	}
4520 	ASSERT(!MDI_PI_LOCKED(pip));
4521 	MDI_PI_LOCK(pip);
4522 	if (MDI_PI(pip)->pi_prop == NULL) {
4523 		MDI_PI_UNLOCK(pip);
4524 		return (DDI_PROP_NOT_FOUND);
4525 	}
4526 	rv = nvlist_add_int64(MDI_PI(pip)->pi_prop, name, data);
4527 	MDI_PI_UNLOCK(pip);
4528 	return (i_map_nvlist_error_to_mdi(rv));
4529 }
4530 
4531 /*
4532  * mdi_prop_update_int_array():
4533  *		Create/Update a int array property
4534  */
4535 int
4536 mdi_prop_update_int_array(mdi_pathinfo_t *pip, char *name, int *data,
4537 	    uint_t nelements)
4538 {
4539 	int rv;
4540 
4541 	if (pip == NULL) {
4542 		return (DDI_PROP_INVAL_ARG);
4543 	}
4544 	ASSERT(!MDI_PI_LOCKED(pip));
4545 	MDI_PI_LOCK(pip);
4546 	if (MDI_PI(pip)->pi_prop == NULL) {
4547 		MDI_PI_UNLOCK(pip);
4548 		return (DDI_PROP_NOT_FOUND);
4549 	}
4550 	rv = nvlist_add_int32_array(MDI_PI(pip)->pi_prop, name, (int32_t *)data,
4551 	    nelements);
4552 	MDI_PI_UNLOCK(pip);
4553 	return (i_map_nvlist_error_to_mdi(rv));
4554 }
4555 
4556 /*
4557  * mdi_prop_update_string():
4558  *		Create/Update a string property
4559  */
4560 int
4561 mdi_prop_update_string(mdi_pathinfo_t *pip, char *name, char *data)
4562 {
4563 	int rv;
4564 
4565 	if (pip == NULL) {
4566 		return (DDI_PROP_INVAL_ARG);
4567 	}
4568 	ASSERT(!MDI_PI_LOCKED(pip));
4569 	MDI_PI_LOCK(pip);
4570 	if (MDI_PI(pip)->pi_prop == NULL) {
4571 		MDI_PI_UNLOCK(pip);
4572 		return (DDI_PROP_NOT_FOUND);
4573 	}
4574 	rv = nvlist_add_string(MDI_PI(pip)->pi_prop, name, data);
4575 	MDI_PI_UNLOCK(pip);
4576 	return (i_map_nvlist_error_to_mdi(rv));
4577 }
4578 
4579 /*
4580  * mdi_prop_update_string_array():
4581  *		Create/Update a string array property
4582  */
4583 int
4584 mdi_prop_update_string_array(mdi_pathinfo_t *pip, char *name, char **data,
4585     uint_t nelements)
4586 {
4587 	int rv;
4588 
4589 	if (pip == NULL) {
4590 		return (DDI_PROP_INVAL_ARG);
4591 	}
4592 	ASSERT(!MDI_PI_LOCKED(pip));
4593 	MDI_PI_LOCK(pip);
4594 	if (MDI_PI(pip)->pi_prop == NULL) {
4595 		MDI_PI_UNLOCK(pip);
4596 		return (DDI_PROP_NOT_FOUND);
4597 	}
4598 	rv = nvlist_add_string_array(MDI_PI(pip)->pi_prop, name, data,
4599 	    nelements);
4600 	MDI_PI_UNLOCK(pip);
4601 	return (i_map_nvlist_error_to_mdi(rv));
4602 }
4603 
4604 /*
4605  * mdi_prop_lookup_byte():
4606  * 		Look for byte property identified by name.  The data returned
4607  *		is the actual property and valid as long as mdi_pathinfo_t node
4608  *		is alive.
4609  */
4610 int
4611 mdi_prop_lookup_byte(mdi_pathinfo_t *pip, char *name, uchar_t *data)
4612 {
4613 	int rv;
4614 
4615 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4616 		return (DDI_PROP_NOT_FOUND);
4617 	}
4618 	rv = nvlist_lookup_byte(MDI_PI(pip)->pi_prop, name, data);
4619 	return (i_map_nvlist_error_to_mdi(rv));
4620 }
4621 
4622 
4623 /*
4624  * mdi_prop_lookup_byte_array():
4625  * 		Look for byte array property identified by name.  The data
4626  *		returned is the actual property and valid as long as
4627  *		mdi_pathinfo_t node is alive.
4628  */
4629 int
4630 mdi_prop_lookup_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t **data,
4631     uint_t *nelements)
4632 {
4633 	int rv;
4634 
4635 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4636 		return (DDI_PROP_NOT_FOUND);
4637 	}
4638 	rv = nvlist_lookup_byte_array(MDI_PI(pip)->pi_prop, name, data,
4639 	    nelements);
4640 	return (i_map_nvlist_error_to_mdi(rv));
4641 }
4642 
4643 /*
4644  * mdi_prop_lookup_int():
4645  * 		Look for int property identified by name.  The data returned
4646  *		is the actual property and valid as long as mdi_pathinfo_t
4647  *		node is alive.
4648  */
4649 int
4650 mdi_prop_lookup_int(mdi_pathinfo_t *pip, char *name, int *data)
4651 {
4652 	int rv;
4653 
4654 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4655 		return (DDI_PROP_NOT_FOUND);
4656 	}
4657 	rv = nvlist_lookup_int32(MDI_PI(pip)->pi_prop, name, (int32_t *)data);
4658 	return (i_map_nvlist_error_to_mdi(rv));
4659 }
4660 
4661 /*
4662  * mdi_prop_lookup_int64():
4663  * 		Look for int64 property identified by name.  The data returned
4664  *		is the actual property and valid as long as mdi_pathinfo_t node
4665  *		is alive.
4666  */
4667 int
4668 mdi_prop_lookup_int64(mdi_pathinfo_t *pip, char *name, int64_t *data)
4669 {
4670 	int rv;
4671 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4672 		return (DDI_PROP_NOT_FOUND);
4673 	}
4674 	rv = nvlist_lookup_int64(MDI_PI(pip)->pi_prop, name, data);
4675 	return (i_map_nvlist_error_to_mdi(rv));
4676 }
4677 
4678 /*
4679  * mdi_prop_lookup_int_array():
4680  * 		Look for int array property identified by name.  The data
4681  *		returned is the actual property and valid as long as
4682  *		mdi_pathinfo_t node is alive.
4683  */
4684 int
4685 mdi_prop_lookup_int_array(mdi_pathinfo_t *pip, char *name, int **data,
4686     uint_t *nelements)
4687 {
4688 	int rv;
4689 
4690 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4691 		return (DDI_PROP_NOT_FOUND);
4692 	}
4693 	rv = nvlist_lookup_int32_array(MDI_PI(pip)->pi_prop, name,
4694 	    (int32_t **)data, nelements);
4695 	return (i_map_nvlist_error_to_mdi(rv));
4696 }
4697 
4698 /*
4699  * mdi_prop_lookup_string():
4700  * 		Look for string property identified by name.  The data
4701  *		returned is the actual property and valid as long as
4702  *		mdi_pathinfo_t node is alive.
4703  */
4704 int
4705 mdi_prop_lookup_string(mdi_pathinfo_t *pip, char *name, char **data)
4706 {
4707 	int rv;
4708 
4709 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4710 		return (DDI_PROP_NOT_FOUND);
4711 	}
4712 	rv = nvlist_lookup_string(MDI_PI(pip)->pi_prop, name, data);
4713 	return (i_map_nvlist_error_to_mdi(rv));
4714 }
4715 
4716 /*
4717  * mdi_prop_lookup_string_array():
4718  * 		Look for string array property identified by name.  The data
4719  *		returned is the actual property and valid as long as
4720  *		mdi_pathinfo_t node is alive.
4721  */
4722 int
4723 mdi_prop_lookup_string_array(mdi_pathinfo_t *pip, char *name, char ***data,
4724     uint_t *nelements)
4725 {
4726 	int rv;
4727 
4728 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4729 		return (DDI_PROP_NOT_FOUND);
4730 	}
4731 	rv = nvlist_lookup_string_array(MDI_PI(pip)->pi_prop, name, data,
4732 	    nelements);
4733 	return (i_map_nvlist_error_to_mdi(rv));
4734 }
4735 
4736 /*
4737  * mdi_prop_free():
4738  * 		Symmetrical function to ddi_prop_free(). nvlist_lookup_xx()
4739  *		functions return the pointer to actual property data and not a
4740  *		copy of it.  So the data returned is valid as long as
4741  *		mdi_pathinfo_t node is valid.
4742  */
4743 /*ARGSUSED*/
4744 int
4745 mdi_prop_free(void *data)
4746 {
4747 	return (DDI_PROP_SUCCESS);
4748 }
4749 
4750 /*ARGSUSED*/
4751 static void
4752 i_mdi_report_path_state(mdi_client_t *ct, mdi_pathinfo_t *pip)
4753 {
4754 	char		*ct_path;
4755 	char		*ct_status;
4756 	char		*status;
4757 	dev_info_t	*cdip = ct->ct_dip;
4758 	char		lb_buf[64];
4759 	int		report_lb_c = 0, report_lb_p = 0;
4760 
4761 	ASSERT(MDI_CLIENT_LOCKED(ct));
4762 	if ((cdip == NULL) || (ddi_get_instance(cdip) == -1) ||
4763 	    (MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) == 0)) {
4764 		return;
4765 	}
4766 	if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL) {
4767 		ct_status = "optimal";
4768 		report_lb_c = 1;
4769 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
4770 		ct_status = "degraded";
4771 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
4772 		ct_status = "failed";
4773 	} else {
4774 		ct_status = "unknown";
4775 	}
4776 
4777 	lb_buf[0] = 0;		/* not interested in load balancing config */
4778 
4779 	if (MDI_PI_FLAGS_IS_DEVICE_REMOVED(pip)) {
4780 		status = "removed";
4781 	} else if (MDI_PI_IS_OFFLINE(pip)) {
4782 		status = "offline";
4783 	} else if (MDI_PI_IS_ONLINE(pip)) {
4784 		status = "online";
4785 		report_lb_p = 1;
4786 	} else if (MDI_PI_IS_STANDBY(pip)) {
4787 		status = "standby";
4788 	} else if (MDI_PI_IS_FAULT(pip)) {
4789 		status = "faulted";
4790 	} else {
4791 		status = "unknown";
4792 	}
4793 
4794 	if (cdip) {
4795 		ct_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4796 
4797 		/*
4798 		 * NOTE: Keeping "multipath status: %s" and
4799 		 * "Load balancing: %s" format unchanged in case someone
4800 		 * scrubs /var/adm/messages looking for these messages.
4801 		 */
4802 		if (report_lb_c && report_lb_p) {
4803 			if (ct->ct_lb == LOAD_BALANCE_LBA) {
4804 				(void) snprintf(lb_buf, sizeof (lb_buf),
4805 				    "%s, region-size: %d", mdi_load_balance_lba,
4806 				    ct->ct_lb_args->region_size);
4807 			} else if (ct->ct_lb == LOAD_BALANCE_NONE) {
4808 				(void) snprintf(lb_buf, sizeof (lb_buf),
4809 				    "%s", mdi_load_balance_none);
4810 			} else {
4811 				(void) snprintf(lb_buf, sizeof (lb_buf), "%s",
4812 				    mdi_load_balance_rr);
4813 			}
4814 
4815 			cmn_err(mdi_debug_consoleonly ? CE_NOTE : CE_CONT,
4816 			    "?%s (%s%d) multipath status: %s: "
4817 			    "path %d %s is %s: Load balancing: %s\n",
4818 			    ddi_pathname(cdip, ct_path), ddi_driver_name(cdip),
4819 			    ddi_get_instance(cdip), ct_status,
4820 			    mdi_pi_get_path_instance(pip),
4821 			    mdi_pi_spathname(pip), status, lb_buf);
4822 		} else {
4823 			cmn_err(mdi_debug_consoleonly ? CE_NOTE : CE_CONT,
4824 			    "?%s (%s%d) multipath status: %s: "
4825 			    "path %d %s is %s\n",
4826 			    ddi_pathname(cdip, ct_path), ddi_driver_name(cdip),
4827 			    ddi_get_instance(cdip), ct_status,
4828 			    mdi_pi_get_path_instance(pip),
4829 			    mdi_pi_spathname(pip), status);
4830 		}
4831 
4832 		kmem_free(ct_path, MAXPATHLEN);
4833 		MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct);
4834 	}
4835 }
4836 
#ifdef	DEBUG
/*
 * i_mdi_log():
 *		Utility function for error message management.  Formats the
 *		caller's message, prefixes it with "mdi: <driver><inst>: <func>"
 *		and hands it to cmn_err().
 *
 *		NOTE: Implementation takes care of trailing \n for cmn_err,
 *		MDI_DEBUG should not terminate fmt strings with \n.
 *
 *		NOTE: If the level is >= 2, and there is no leading !?^
 *		then a leading ! is implied (but can be overridden via
 *		mdi_debug_consoleonly). If you are using kmdb on the console,
 *		consider setting mdi_debug_consoleonly to 1 as an aid.
 *
 *		The destination is tracked as a single routing character:
 *		'!' (log only), '?' (boot only), '^' (console only) or
 *		'\0' (no prefix).  mdi_debug_logonly forces '!';
 *		mdi_debug_consoleonly forces a CE_NOTE console-only message
 *		and takes precedence over everything else.
 */
/*PRINTFLIKE4*/
static void
i_mdi_log(int level, const char *func, dev_info_t *dip, const char *fmt, ...)
{
	char		name[MAXNAMELEN];
	char		buf[512];
	char		*bp = buf;
	char		route = '\0';
	va_list		ap;

	/* "<driver><instance>: " when a dip is supplied, empty otherwise. */
	name[0] = 0;
	if (dip) {
		(void) snprintf(name, sizeof (name), "%s%d: ",
		    ddi_driver_name(dip), ddi_get_instance(dip));
	}

	va_start(ap, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
	va_end(ap);

	/* Peel off an explicit routing character, or imply '!'. */
	if ((buf[0] == '!') || (buf[0] == '?') || (buf[0] == '^')) {
		route = buf[0];
		bp = &buf[1];
	} else if (level >= 2) {
		route = '!';		/* ! implied */
	}

	if (mdi_debug_logonly)
		route = '!';

	if (mdi_debug_consoleonly) {
		/* Console only, always as a CE_NOTE, no trailing newline. */
		cmn_err(CE_NOTE, "^mdi: %s%s: %s", name, func, bp);
		return;
	}

	switch (level) {
	case CE_NOTE:
	case CE_CONT:
		/* CE_NOTE is demoted to CE_CONT; we supply the newline. */
		switch (route) {
		case '?':
			cmn_err(CE_CONT, "?mdi: %s%s: %s\n", name, func, bp);
			break;
		case '^':
			cmn_err(CE_CONT, "^mdi: %s%s: %s\n", name, func, bp);
			break;
		case '!':
			cmn_err(CE_CONT, "!mdi: %s%s: %s\n", name, func, bp);
			break;
		default:
			cmn_err(CE_CONT, "mdi: %s%s: %s\n", name, func, bp);
			break;
		}
		break;

	case CE_WARN:
	case CE_PANIC:
		switch (route) {
		case '?':
			cmn_err(level, "?mdi: %s%s: %s", name, func, bp);
			break;
		case '^':
			cmn_err(level, "^mdi: %s%s: %s", name, func, bp);
			break;
		case '!':
			cmn_err(level, "!mdi: %s%s: %s", name, func, bp);
			break;
		default:
			cmn_err(level, "mdi: %s%s: %s", name, func, bp);
			break;
		}
		break;

	default:
		/* Unrecognised level: no routing prefix, no function name. */
		cmn_err(level, "mdi: %s%s", name, bp);
		break;
	}
}
#endif	/* DEBUG */
4940 
4941 void
4942 i_mdi_client_online(dev_info_t *ct_dip)
4943 {
4944 	mdi_client_t	*ct;
4945 
4946 	/*
4947 	 * Client online notification. Mark client state as online
4948 	 * restore our binding with dev_info node
4949 	 */
4950 	ct = i_devi_get_client(ct_dip);
4951 	ASSERT(ct != NULL);
4952 	MDI_CLIENT_LOCK(ct);
4953 	MDI_CLIENT_SET_ONLINE(ct);
4954 	/* catch for any memory leaks */
4955 	ASSERT((ct->ct_dip == NULL) || (ct->ct_dip == ct_dip));
4956 	ct->ct_dip = ct_dip;
4957 
4958 	if (ct->ct_power_cnt == 0)
4959 		(void) i_mdi_power_all_phci(ct);
4960 
4961 	MDI_DEBUG(4, (MDI_NOTE, ct_dip,
4962 	    "i_mdi_pm_hold_client %p", (void *)ct));
4963 	i_mdi_pm_hold_client(ct, 1);
4964 
4965 	MDI_CLIENT_UNLOCK(ct);
4966 }
4967 
4968 void
4969 i_mdi_phci_online(dev_info_t *ph_dip)
4970 {
4971 	mdi_phci_t	*ph;
4972 
4973 	/* pHCI online notification. Mark state accordingly */
4974 	ph = i_devi_get_phci(ph_dip);
4975 	ASSERT(ph != NULL);
4976 	MDI_PHCI_LOCK(ph);
4977 	MDI_PHCI_SET_ONLINE(ph);
4978 	MDI_PHCI_UNLOCK(ph);
4979 }
4980 
4981 /*
4982  * mdi_devi_online():
4983  * 		Online notification from NDI framework on pHCI/client
4984  *		device online.
4985  * Return Values:
4986  *		NDI_SUCCESS
4987  *		MDI_FAILURE
4988  */
4989 /*ARGSUSED*/
4990 int
4991 mdi_devi_online(dev_info_t *dip, uint_t flags)
4992 {
4993 	if (MDI_PHCI(dip)) {
4994 		i_mdi_phci_online(dip);
4995 	}
4996 
4997 	if (MDI_CLIENT(dip)) {
4998 		i_mdi_client_online(dip);
4999 	}
5000 	return (NDI_SUCCESS);
5001 }
5002 
5003 /*
5004  * mdi_devi_offline():
5005  * 		Offline notification from NDI framework on pHCI/Client device
5006  *		offline.
5007  *
5008  * Return Values:
5009  *		NDI_SUCCESS
5010  *		NDI_FAILURE
5011  */
5012 /*ARGSUSED*/
5013 int
5014 mdi_devi_offline(dev_info_t *dip, uint_t flags)
5015 {
5016 	int		rv = NDI_SUCCESS;
5017 
5018 	if (MDI_CLIENT(dip)) {
5019 		rv = i_mdi_client_offline(dip, flags);
5020 		if (rv != NDI_SUCCESS)
5021 			return (rv);
5022 	}
5023 
5024 	if (MDI_PHCI(dip)) {
5025 		rv = i_mdi_phci_offline(dip, flags);
5026 
5027 		if ((rv != NDI_SUCCESS) && MDI_CLIENT(dip)) {
5028 			/* set client back online */
5029 			i_mdi_client_online(dip);
5030 		}
5031 	}
5032 
5033 	return (rv);
5034 }
5035 
5036 /*ARGSUSED*/
5037 static int
5038 i_mdi_phci_offline(dev_info_t *dip, uint_t flags)
5039 {
5040 	int		rv = NDI_SUCCESS;
5041 	mdi_phci_t	*ph;
5042 	mdi_client_t	*ct;
5043 	mdi_pathinfo_t	*pip;
5044 	mdi_pathinfo_t	*next;
5045 	mdi_pathinfo_t	*failed_pip = NULL;
5046 	dev_info_t	*cdip;
5047 
5048 	/*
5049 	 * pHCI component offline notification
5050 	 * Make sure that this pHCI instance is free to be offlined.
5051 	 * If it is OK to proceed, Offline and remove all the child
5052 	 * mdi_pathinfo nodes.  This process automatically offlines
5053 	 * corresponding client devices, for which this pHCI provides
5054 	 * critical services.
5055 	 */
5056 	ph = i_devi_get_phci(dip);
5057 	MDI_DEBUG(2, (MDI_NOTE, dip,
5058 	    "called %p %p", (void *)dip, (void *)ph));
5059 	if (ph == NULL) {
5060 		return (rv);
5061 	}
5062 
5063 	MDI_PHCI_LOCK(ph);
5064 
5065 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5066 		MDI_DEBUG(1, (MDI_WARN, dip,
5067 		    "!pHCI already offlined: %p", (void *)dip));
5068 		MDI_PHCI_UNLOCK(ph);
5069 		return (NDI_SUCCESS);
5070 	}
5071 
5072 	/*
5073 	 * Check to see if the pHCI can be offlined
5074 	 */
5075 	if (ph->ph_unstable) {
5076 		MDI_DEBUG(1, (MDI_WARN, dip,
5077 		    "!One or more target devices are in transient state. "
5078 		    "This device can not be removed at this moment. "
5079 		    "Please try again later."));
5080 		MDI_PHCI_UNLOCK(ph);
5081 		return (NDI_BUSY);
5082 	}
5083 
5084 	pip = ph->ph_path_head;
5085 	while (pip != NULL) {
5086 		MDI_PI_LOCK(pip);
5087 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5088 
5089 		/*
5090 		 * The mdi_pathinfo state is OK. Check the client state.
5091 		 * If failover in progress fail the pHCI from offlining
5092 		 */
5093 		ct = MDI_PI(pip)->pi_client;
5094 		i_mdi_client_lock(ct, pip);
5095 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5096 		    (ct->ct_unstable)) {
5097 			/*
5098 			 * Failover is in progress, Fail the DR
5099 			 */
5100 			MDI_DEBUG(1, (MDI_WARN, dip,
5101 			    "!pHCI device is busy. "
5102 			    "This device can not be removed at this moment. "
5103 			    "Please try again later."));
5104 			MDI_PI_UNLOCK(pip);
5105 			i_mdi_client_unlock(ct);
5106 			MDI_PHCI_UNLOCK(ph);
5107 			return (NDI_BUSY);
5108 		}
5109 		MDI_PI_UNLOCK(pip);
5110 
5111 		/*
5112 		 * Check to see of we are removing the last path of this
5113 		 * client device...
5114 		 */
5115 		cdip = ct->ct_dip;
5116 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5117 		    (i_mdi_client_compute_state(ct, ph) ==
5118 		    MDI_CLIENT_STATE_FAILED)) {
5119 			i_mdi_client_unlock(ct);
5120 			MDI_PHCI_UNLOCK(ph);
5121 			if (ndi_devi_offline(cdip,
5122 			    NDI_DEVFS_CLEAN) != NDI_SUCCESS) {
5123 				/*
5124 				 * ndi_devi_offline() failed.
5125 				 * This pHCI provides the critical path
5126 				 * to one or more client devices.
5127 				 * Return busy.
5128 				 */
5129 				MDI_PHCI_LOCK(ph);
5130 				MDI_DEBUG(1, (MDI_WARN, dip,
5131 				    "!pHCI device is busy. "
5132 				    "This device can not be removed at this "
5133 				    "moment. Please try again later."));
5134 				failed_pip = pip;
5135 				break;
5136 			} else {
5137 				MDI_PHCI_LOCK(ph);
5138 				pip = next;
5139 			}
5140 		} else {
5141 			i_mdi_client_unlock(ct);
5142 			pip = next;
5143 		}
5144 	}
5145 
5146 	if (failed_pip) {
5147 		pip = ph->ph_path_head;
5148 		while (pip != failed_pip) {
5149 			MDI_PI_LOCK(pip);
5150 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5151 			ct = MDI_PI(pip)->pi_client;
5152 			i_mdi_client_lock(ct, pip);
5153 			cdip = ct->ct_dip;
5154 			switch (MDI_CLIENT_STATE(ct)) {
5155 			case MDI_CLIENT_STATE_OPTIMAL:
5156 			case MDI_CLIENT_STATE_DEGRADED:
5157 				if (cdip) {
5158 					MDI_PI_UNLOCK(pip);
5159 					i_mdi_client_unlock(ct);
5160 					MDI_PHCI_UNLOCK(ph);
5161 					(void) ndi_devi_online(cdip, 0);
5162 					MDI_PHCI_LOCK(ph);
5163 					pip = next;
5164 					continue;
5165 				}
5166 				break;
5167 
5168 			case MDI_CLIENT_STATE_FAILED:
5169 				if (cdip) {
5170 					MDI_PI_UNLOCK(pip);
5171 					i_mdi_client_unlock(ct);
5172 					MDI_PHCI_UNLOCK(ph);
5173 					(void) ndi_devi_offline(cdip,
5174 						NDI_DEVFS_CLEAN);
5175 					MDI_PHCI_LOCK(ph);
5176 					pip = next;
5177 					continue;
5178 				}
5179 				break;
5180 			}
5181 			MDI_PI_UNLOCK(pip);
5182 			i_mdi_client_unlock(ct);
5183 			pip = next;
5184 		}
5185 		MDI_PHCI_UNLOCK(ph);
5186 		return (NDI_BUSY);
5187 	}
5188 
5189 	/*
5190 	 * Mark the pHCI as offline
5191 	 */
5192 	MDI_PHCI_SET_OFFLINE(ph);
5193 
5194 	/*
5195 	 * Mark the child mdi_pathinfo nodes as transient
5196 	 */
5197 	pip = ph->ph_path_head;
5198 	while (pip != NULL) {
5199 		MDI_PI_LOCK(pip);
5200 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5201 		MDI_PI_SET_OFFLINING(pip);
5202 		MDI_PI_UNLOCK(pip);
5203 		pip = next;
5204 	}
5205 	MDI_PHCI_UNLOCK(ph);
5206 	/*
5207 	 * Give a chance for any pending commands to execute
5208 	 */
5209 	delay_random(mdi_delay);
5210 	MDI_PHCI_LOCK(ph);
5211 	pip = ph->ph_path_head;
5212 	while (pip != NULL) {
5213 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5214 		(void) i_mdi_pi_offline(pip, flags);
5215 		MDI_PI_LOCK(pip);
5216 		ct = MDI_PI(pip)->pi_client;
5217 		if (!MDI_PI_IS_OFFLINE(pip)) {
5218 			MDI_DEBUG(1, (MDI_WARN, dip,
5219 			    "!pHCI device is busy. "
5220 			    "This device can not be removed at this moment. "
5221 			    "Please try again later."));
5222 			MDI_PI_UNLOCK(pip);
5223 			MDI_PHCI_SET_ONLINE(ph);
5224 			MDI_PHCI_UNLOCK(ph);
5225 			return (NDI_BUSY);
5226 		}
5227 		MDI_PI_UNLOCK(pip);
5228 		pip = next;
5229 	}
5230 	MDI_PHCI_UNLOCK(ph);
5231 
5232 	return (rv);
5233 }
5234 
5235 void
5236 mdi_phci_mark_retiring(dev_info_t *dip, char **cons_array)
5237 {
5238 	mdi_phci_t	*ph;
5239 	mdi_client_t	*ct;
5240 	mdi_pathinfo_t	*pip;
5241 	mdi_pathinfo_t	*next;
5242 	dev_info_t	*cdip;
5243 
5244 	if (!MDI_PHCI(dip))
5245 		return;
5246 
5247 	ph = i_devi_get_phci(dip);
5248 	if (ph == NULL) {
5249 		return;
5250 	}
5251 
5252 	MDI_PHCI_LOCK(ph);
5253 
5254 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5255 		/* has no last path */
5256 		MDI_PHCI_UNLOCK(ph);
5257 		return;
5258 	}
5259 
5260 	pip = ph->ph_path_head;
5261 	while (pip != NULL) {
5262 		MDI_PI_LOCK(pip);
5263 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5264 
5265 		ct = MDI_PI(pip)->pi_client;
5266 		i_mdi_client_lock(ct, pip);
5267 		MDI_PI_UNLOCK(pip);
5268 
5269 		cdip = ct->ct_dip;
5270 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5271 		    (i_mdi_client_compute_state(ct, ph) ==
5272 		    MDI_CLIENT_STATE_FAILED)) {
5273 			/* Last path. Mark client dip as retiring */
5274 			i_mdi_client_unlock(ct);
5275 			MDI_PHCI_UNLOCK(ph);
5276 			(void) e_ddi_mark_retiring(cdip, cons_array);
5277 			MDI_PHCI_LOCK(ph);
5278 			pip = next;
5279 		} else {
5280 			i_mdi_client_unlock(ct);
5281 			pip = next;
5282 		}
5283 	}
5284 
5285 	MDI_PHCI_UNLOCK(ph);
5286 
5287 	return;
5288 }
5289 
5290 void
5291 mdi_phci_retire_notify(dev_info_t *dip, int *constraint)
5292 {
5293 	mdi_phci_t	*ph;
5294 	mdi_client_t	*ct;
5295 	mdi_pathinfo_t	*pip;
5296 	mdi_pathinfo_t	*next;
5297 	dev_info_t	*cdip;
5298 
5299 	if (!MDI_PHCI(dip))
5300 		return;
5301 
5302 	ph = i_devi_get_phci(dip);
5303 	if (ph == NULL)
5304 		return;
5305 
5306 	MDI_PHCI_LOCK(ph);
5307 
5308 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5309 		MDI_PHCI_UNLOCK(ph);
5310 		/* not last path */
5311 		return;
5312 	}
5313 
5314 	if (ph->ph_unstable) {
5315 		MDI_PHCI_UNLOCK(ph);
5316 		/* can't check for constraints */
5317 		*constraint = 0;
5318 		return;
5319 	}
5320 
5321 	pip = ph->ph_path_head;
5322 	while (pip != NULL) {
5323 		MDI_PI_LOCK(pip);
5324 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5325 
5326 		/*
5327 		 * The mdi_pathinfo state is OK. Check the client state.
5328 		 * If failover in progress fail the pHCI from offlining
5329 		 */
5330 		ct = MDI_PI(pip)->pi_client;
5331 		i_mdi_client_lock(ct, pip);
5332 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5333 		    (ct->ct_unstable)) {
5334 			/*
5335 			 * Failover is in progress, can't check for constraints
5336 			 */
5337 			MDI_PI_UNLOCK(pip);
5338 			i_mdi_client_unlock(ct);
5339 			MDI_PHCI_UNLOCK(ph);
5340 			*constraint = 0;
5341 			return;
5342 		}
5343 		MDI_PI_UNLOCK(pip);
5344 
5345 		/*
5346 		 * Check to see of we are retiring the last path of this
5347 		 * client device...
5348 		 */
5349 		cdip = ct->ct_dip;
5350 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5351 		    (i_mdi_client_compute_state(ct, ph) ==
5352 		    MDI_CLIENT_STATE_FAILED)) {
5353 			i_mdi_client_unlock(ct);
5354 			MDI_PHCI_UNLOCK(ph);
5355 			(void) e_ddi_retire_notify(cdip, constraint);
5356 			MDI_PHCI_LOCK(ph);
5357 			pip = next;
5358 		} else {
5359 			i_mdi_client_unlock(ct);
5360 			pip = next;
5361 		}
5362 	}
5363 
5364 	MDI_PHCI_UNLOCK(ph);
5365 
5366 	return;
5367 }
5368 
5369 /*
5370  * offline the path(s) hanging off the pHCI. If the
5371  * last path to any client, check that constraints
5372  * have been applied.
5373  *
5374  * If constraint is 0, we aren't going to retire the
5375  * pHCI. However we still need to go through the paths
5376  * calling e_ddi_retire_finalize() to clear their
5377  * contract barriers.
5378  */
5379 void
5380 mdi_phci_retire_finalize(dev_info_t *dip, int phci_only, void *constraint)
5381 {
5382 	mdi_phci_t	*ph;
5383 	mdi_client_t	*ct;
5384 	mdi_pathinfo_t	*pip;
5385 	mdi_pathinfo_t	*next;
5386 	dev_info_t	*cdip;
5387 	int		unstable = 0;
5388 	int		tmp_constraint;
5389 
5390 	if (!MDI_PHCI(dip))
5391 		return;
5392 
5393 	ph = i_devi_get_phci(dip);
5394 	if (ph == NULL) {
5395 		/* no last path and no pips */
5396 		return;
5397 	}
5398 
5399 	MDI_PHCI_LOCK(ph);
5400 
5401 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5402 		MDI_PHCI_UNLOCK(ph);
5403 		/* no last path and no pips */
5404 		return;
5405 	}
5406 
5407 	/*
5408 	 * Check to see if the pHCI can be offlined
5409 	 */
5410 	if (ph->ph_unstable) {
5411 		unstable = 1;
5412 	}
5413 
5414 	pip = ph->ph_path_head;
5415 	while (pip != NULL) {
5416 		MDI_PI_LOCK(pip);
5417 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5418 
5419 		/*
5420 		 * if failover in progress fail the pHCI from offlining
5421 		 */
5422 		ct = MDI_PI(pip)->pi_client;
5423 		i_mdi_client_lock(ct, pip);
5424 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5425 		    (ct->ct_unstable)) {
5426 			unstable = 1;
5427 		}
5428 		MDI_PI_UNLOCK(pip);
5429 
5430 		/*
5431 		 * Check to see of we are removing the last path of this
5432 		 * client device...
5433 		 */
5434 		cdip = ct->ct_dip;
5435 		if (!phci_only && cdip &&
5436 		    (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5437 		    (i_mdi_client_compute_state(ct, ph) ==
5438 		    MDI_CLIENT_STATE_FAILED)) {
5439 			i_mdi_client_unlock(ct);
5440 			MDI_PHCI_UNLOCK(ph);
5441 			/*
5442 			 * This is the last path to this client.
5443 			 *
5444 			 * Constraint will only be set to 1 if this client can
5445 			 * be retired (as already determined by
5446 			 * mdi_phci_retire_notify). However we don't actually
5447 			 * need to retire the client (we just retire the last
5448 			 * path - MPXIO will then fail all I/Os to the client).
5449 			 * But we still need to call e_ddi_retire_finalize so
5450 			 * the contract barriers can be cleared. Therefore we
5451 			 * temporarily set constraint = 0 so that the client
5452 			 * dip is not retired.
5453 			 */
5454 			tmp_constraint = 0;
5455 			(void) e_ddi_retire_finalize(cdip, &tmp_constraint);
5456 			MDI_PHCI_LOCK(ph);
5457 			pip = next;
5458 		} else {
5459 			i_mdi_client_unlock(ct);
5460 			pip = next;
5461 		}
5462 	}
5463 
5464 	if (!phci_only && *((int *)constraint) == 0) {
5465 		MDI_PHCI_UNLOCK(ph);
5466 		return;
5467 	}
5468 
5469 	/*
5470 	 * Cannot offline pip(s)
5471 	 */
5472 	if (unstable) {
5473 		cmn_err(CE_WARN, "%s%d: mdi_phci_retire_finalize: "
5474 		    "pHCI in transient state, cannot retire",
5475 		    ddi_driver_name(dip), ddi_get_instance(dip));
5476 		MDI_PHCI_UNLOCK(ph);
5477 		return;
5478 	}
5479 
5480 	/*
5481 	 * Mark the pHCI as offline
5482 	 */
5483 	MDI_PHCI_SET_OFFLINE(ph);
5484 
5485 	/*
5486 	 * Mark the child mdi_pathinfo nodes as transient
5487 	 */
5488 	pip = ph->ph_path_head;
5489 	while (pip != NULL) {
5490 		MDI_PI_LOCK(pip);
5491 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5492 		MDI_PI_SET_OFFLINING(pip);
5493 		MDI_PI_UNLOCK(pip);
5494 		pip = next;
5495 	}
5496 	MDI_PHCI_UNLOCK(ph);
5497 	/*
5498 	 * Give a chance for any pending commands to execute
5499 	 */
5500 	delay_random(mdi_delay);
5501 	MDI_PHCI_LOCK(ph);
5502 	pip = ph->ph_path_head;
5503 	while (pip != NULL) {
5504 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5505 		(void) i_mdi_pi_offline(pip, 0);
5506 		MDI_PI_LOCK(pip);
5507 		ct = MDI_PI(pip)->pi_client;
5508 		if (!MDI_PI_IS_OFFLINE(pip)) {
5509 			cmn_err(CE_WARN, "mdi_phci_retire_finalize: "
5510 			    "path %d %s busy, cannot offline",
5511 			    mdi_pi_get_path_instance(pip),
5512 			    mdi_pi_spathname(pip));
5513 			MDI_PI_UNLOCK(pip);
5514 			MDI_PHCI_SET_ONLINE(ph);
5515 			MDI_PHCI_UNLOCK(ph);
5516 			return;
5517 		}
5518 		MDI_PI_UNLOCK(pip);
5519 		pip = next;
5520 	}
5521 	MDI_PHCI_UNLOCK(ph);
5522 
5523 	return;
5524 }
5525 
5526 void
5527 mdi_phci_unretire(dev_info_t *dip)
5528 {
5529 	ASSERT(MDI_PHCI(dip));
5530 
5531 	/*
5532 	 * Online the phci
5533 	 */
5534 	i_mdi_phci_online(dip);
5535 }
5536 
5537 /*ARGSUSED*/
5538 static int
5539 i_mdi_client_offline(dev_info_t *dip, uint_t flags)
5540 {
5541 	int		rv = NDI_SUCCESS;
5542 	mdi_client_t	*ct;
5543 
5544 	/*
5545 	 * Client component to go offline.  Make sure that we are
5546 	 * not in failing over state and update client state
5547 	 * accordingly
5548 	 */
5549 	ct = i_devi_get_client(dip);
5550 	MDI_DEBUG(2, (MDI_NOTE, dip,
5551 	    "called %p %p", (void *)dip, (void *)ct));
5552 	if (ct != NULL) {
5553 		MDI_CLIENT_LOCK(ct);
5554 		if (ct->ct_unstable) {
5555 			/*
5556 			 * One or more paths are in transient state,
5557 			 * Dont allow offline of a client device
5558 			 */
5559 			MDI_DEBUG(1, (MDI_WARN, dip,
5560 			    "!One or more paths to "
5561 			    "this device are in transient state. "
5562 			    "This device can not be removed at this moment. "
5563 			    "Please try again later."));
5564 			MDI_CLIENT_UNLOCK(ct);
5565 			return (NDI_BUSY);
5566 		}
5567 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
5568 			/*
5569 			 * Failover is in progress, Dont allow DR of
5570 			 * a client device
5571 			 */
5572 			MDI_DEBUG(1, (MDI_WARN, dip,
5573 			    "!Client device is Busy. "
5574 			    "This device can not be removed at this moment. "
5575 			    "Please try again later."));
5576 			MDI_CLIENT_UNLOCK(ct);
5577 			return (NDI_BUSY);
5578 		}
5579 		MDI_CLIENT_SET_OFFLINE(ct);
5580 
5581 		/*
5582 		 * Unbind our relationship with the dev_info node
5583 		 */
5584 		if (flags & NDI_DEVI_REMOVE) {
5585 			ct->ct_dip = NULL;
5586 		}
5587 		MDI_CLIENT_UNLOCK(ct);
5588 	}
5589 	return (rv);
5590 }
5591 
5592 /*
5593  * mdi_pre_attach():
5594  *		Pre attach() notification handler
5595  */
5596 /*ARGSUSED*/
5597 int
5598 mdi_pre_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5599 {
5600 	/* don't support old DDI_PM_RESUME */
5601 	if ((DEVI(dip)->devi_mdi_component != MDI_COMPONENT_NONE) &&
5602 	    (cmd == DDI_PM_RESUME))
5603 		return (DDI_FAILURE);
5604 
5605 	return (DDI_SUCCESS);
5606 }
5607 
5608 /*
5609  * mdi_post_attach():
5610  *		Post attach() notification handler
5611  */
5612 /*ARGSUSED*/
5613 void
5614 mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
5615 {
5616 	mdi_phci_t	*ph;
5617 	mdi_client_t	*ct;
5618 	mdi_vhci_t	*vh;
5619 
5620 	if (MDI_PHCI(dip)) {
5621 		ph = i_devi_get_phci(dip);
5622 		ASSERT(ph != NULL);
5623 
5624 		MDI_PHCI_LOCK(ph);
5625 		switch (cmd) {
5626 		case DDI_ATTACH:
5627 			MDI_DEBUG(2, (MDI_NOTE, dip,
5628 			    "phci post_attach called %p", (void *)ph));
5629 			if (error == DDI_SUCCESS) {
5630 				MDI_PHCI_SET_ATTACH(ph);
5631 			} else {
5632 				MDI_DEBUG(1, (MDI_NOTE, dip,
5633 				    "!pHCI post_attach failed: error %d",
5634 				    error));
5635 				MDI_PHCI_SET_DETACH(ph);
5636 			}
5637 			break;
5638 
5639 		case DDI_RESUME:
5640 			MDI_DEBUG(2, (MDI_NOTE, dip,
5641 			    "pHCI post_resume: called %p", (void *)ph));
5642 			if (error == DDI_SUCCESS) {
5643 				MDI_PHCI_SET_RESUME(ph);
5644 			} else {
5645 				MDI_DEBUG(1, (MDI_NOTE, dip,
5646 				    "!pHCI post_resume failed: error %d",
5647 				    error));
5648 				MDI_PHCI_SET_SUSPEND(ph);
5649 			}
5650 			break;
5651 		}
5652 		MDI_PHCI_UNLOCK(ph);
5653 	}
5654 
5655 	if (MDI_CLIENT(dip)) {
5656 		ct = i_devi_get_client(dip);
5657 		ASSERT(ct != NULL);
5658 
5659 		MDI_CLIENT_LOCK(ct);
5660 		switch (cmd) {
5661 		case DDI_ATTACH:
5662 			MDI_DEBUG(2, (MDI_NOTE, dip,
5663 			    "client post_attach called %p", (void *)ct));
5664 			if (error != DDI_SUCCESS) {
5665 				MDI_DEBUG(1, (MDI_NOTE, dip,
5666 				    "!client post_attach failed: error %d",
5667 				    error));
5668 				MDI_CLIENT_SET_DETACH(ct);
5669 				MDI_DEBUG(4, (MDI_WARN, dip,
5670 				    "i_mdi_pm_reset_client"));
5671 				i_mdi_pm_reset_client(ct);
5672 				break;
5673 			}
5674 
5675 			/*
5676 			 * Client device has successfully attached, inform
5677 			 * the vhci.
5678 			 */
5679 			vh = ct->ct_vhci;
5680 			if (vh->vh_ops->vo_client_attached)
5681 				(*vh->vh_ops->vo_client_attached)(dip);
5682 
5683 			MDI_CLIENT_SET_ATTACH(ct);
5684 			break;
5685 
5686 		case DDI_RESUME:
5687 			MDI_DEBUG(2, (MDI_NOTE, dip,
5688 			    "client post_attach: called %p", (void *)ct));
5689 			if (error == DDI_SUCCESS) {
5690 				MDI_CLIENT_SET_RESUME(ct);
5691 			} else {
5692 				MDI_DEBUG(1, (MDI_NOTE, dip,
5693 				    "!client post_resume failed: error %d",
5694 				    error));
5695 				MDI_CLIENT_SET_SUSPEND(ct);
5696 			}
5697 			break;
5698 		}
5699 		MDI_CLIENT_UNLOCK(ct);
5700 	}
5701 }
5702 
5703 /*
5704  * mdi_pre_detach():
5705  *		Pre detach notification handler
5706  */
5707 /*ARGSUSED*/
5708 int
5709 mdi_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5710 {
5711 	int rv = DDI_SUCCESS;
5712 
5713 	if (MDI_CLIENT(dip)) {
5714 		(void) i_mdi_client_pre_detach(dip, cmd);
5715 	}
5716 
5717 	if (MDI_PHCI(dip)) {
5718 		rv = i_mdi_phci_pre_detach(dip, cmd);
5719 	}
5720 
5721 	return (rv);
5722 }
5723 
5724 /*ARGSUSED*/
5725 static int
5726 i_mdi_phci_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5727 {
5728 	int		rv = DDI_SUCCESS;
5729 	mdi_phci_t	*ph;
5730 	mdi_client_t	*ct;
5731 	mdi_pathinfo_t	*pip;
5732 	mdi_pathinfo_t	*failed_pip = NULL;
5733 	mdi_pathinfo_t	*next;
5734 
5735 	ph = i_devi_get_phci(dip);
5736 	if (ph == NULL) {
5737 		return (rv);
5738 	}
5739 
5740 	MDI_PHCI_LOCK(ph);
5741 	switch (cmd) {
5742 	case DDI_DETACH:
5743 		MDI_DEBUG(2, (MDI_NOTE, dip,
5744 		    "pHCI pre_detach: called %p", (void *)ph));
5745 		if (!MDI_PHCI_IS_OFFLINE(ph)) {
5746 			/*
5747 			 * mdi_pathinfo nodes are still attached to
5748 			 * this pHCI. Fail the detach for this pHCI.
5749 			 */
5750 			MDI_DEBUG(2, (MDI_WARN, dip,
5751 			    "pHCI pre_detach: paths are still attached %p",
5752 			    (void *)ph));
5753 			rv = DDI_FAILURE;
5754 			break;
5755 		}
5756 		MDI_PHCI_SET_DETACH(ph);
5757 		break;
5758 
5759 	case DDI_SUSPEND:
5760 		/*
5761 		 * pHCI is getting suspended.  Since mpxio client
5762 		 * devices may not be suspended at this point, to avoid
5763 		 * a potential stack overflow, it is important to suspend
5764 		 * client devices before pHCI can be suspended.
5765 		 */
5766 
5767 		MDI_DEBUG(2, (MDI_NOTE, dip,
5768 		    "pHCI pre_suspend: called %p", (void *)ph));
5769 		/*
5770 		 * Suspend all the client devices accessible through this pHCI
5771 		 */
5772 		pip = ph->ph_path_head;
5773 		while (pip != NULL && rv == DDI_SUCCESS) {
5774 			dev_info_t *cdip;
5775 			MDI_PI_LOCK(pip);
5776 			next =
5777 			    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5778 			ct = MDI_PI(pip)->pi_client;
5779 			i_mdi_client_lock(ct, pip);
5780 			cdip = ct->ct_dip;
5781 			MDI_PI_UNLOCK(pip);
5782 			if ((MDI_CLIENT_IS_DETACHED(ct) == 0) &&
5783 			    MDI_CLIENT_IS_SUSPENDED(ct) == 0) {
5784 				i_mdi_client_unlock(ct);
5785 				if ((rv = devi_detach(cdip, DDI_SUSPEND)) !=
5786 				    DDI_SUCCESS) {
5787 					/*
5788 					 * Suspend of one of the client
5789 					 * device has failed.
5790 					 */
5791 					MDI_DEBUG(1, (MDI_WARN, dip,
5792 					    "!suspend of device (%s%d) failed.",
5793 					    ddi_driver_name(cdip),
5794 					    ddi_get_instance(cdip)));
5795 					failed_pip = pip;
5796 					break;
5797 				}
5798 			} else {
5799 				i_mdi_client_unlock(ct);
5800 			}
5801 			pip = next;
5802 		}
5803 
5804 		if (rv == DDI_SUCCESS) {
5805 			/*
5806 			 * Suspend of client devices is complete. Proceed
5807 			 * with pHCI suspend.
5808 			 */
5809 			MDI_PHCI_SET_SUSPEND(ph);
5810 		} else {
5811 			/*
5812 			 * Revert back all the suspended client device states
5813 			 * to converse.
5814 			 */
5815 			pip = ph->ph_path_head;
5816 			while (pip != failed_pip) {
5817 				dev_info_t *cdip;
5818 				MDI_PI_LOCK(pip);
5819 				next =
5820 				    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5821 				ct = MDI_PI(pip)->pi_client;
5822 				i_mdi_client_lock(ct, pip);
5823 				cdip = ct->ct_dip;
5824 				MDI_PI_UNLOCK(pip);
5825 				if (MDI_CLIENT_IS_SUSPENDED(ct)) {
5826 					i_mdi_client_unlock(ct);
5827 					(void) devi_attach(cdip, DDI_RESUME);
5828 				} else {
5829 					i_mdi_client_unlock(ct);
5830 				}
5831 				pip = next;
5832 			}
5833 		}
5834 		break;
5835 
5836 	default:
5837 		rv = DDI_FAILURE;
5838 		break;
5839 	}
5840 	MDI_PHCI_UNLOCK(ph);
5841 	return (rv);
5842 }
5843 
5844 /*ARGSUSED*/
5845 static int
5846 i_mdi_client_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5847 {
5848 	int		rv = DDI_SUCCESS;
5849 	mdi_client_t	*ct;
5850 
5851 	ct = i_devi_get_client(dip);
5852 	if (ct == NULL) {
5853 		return (rv);
5854 	}
5855 
5856 	MDI_CLIENT_LOCK(ct);
5857 	switch (cmd) {
5858 	case DDI_DETACH:
5859 		MDI_DEBUG(2, (MDI_NOTE, dip,
5860 		    "client pre_detach: called %p",
5861 		     (void *)ct));
5862 		MDI_CLIENT_SET_DETACH(ct);
5863 		break;
5864 
5865 	case DDI_SUSPEND:
5866 		MDI_DEBUG(2, (MDI_NOTE, dip,
5867 		    "client pre_suspend: called %p",
5868 		    (void *)ct));
5869 		MDI_CLIENT_SET_SUSPEND(ct);
5870 		break;
5871 
5872 	default:
5873 		rv = DDI_FAILURE;
5874 		break;
5875 	}
5876 	MDI_CLIENT_UNLOCK(ct);
5877 	return (rv);
5878 }
5879 
5880 /*
5881  * mdi_post_detach():
5882  *		Post detach notification handler
5883  */
5884 /*ARGSUSED*/
5885 void
5886 mdi_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5887 {
5888 	/*
5889 	 * Detach/Suspend of mpxio component failed. Update our state
5890 	 * too
5891 	 */
5892 	if (MDI_PHCI(dip))
5893 		i_mdi_phci_post_detach(dip, cmd, error);
5894 
5895 	if (MDI_CLIENT(dip))
5896 		i_mdi_client_post_detach(dip, cmd, error);
5897 }
5898 
5899 /*ARGSUSED*/
5900 static void
5901 i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5902 {
5903 	mdi_phci_t	*ph;
5904 
5905 	/*
5906 	 * Detach/Suspend of phci component failed. Update our state
5907 	 * too
5908 	 */
5909 	ph = i_devi_get_phci(dip);
5910 	if (ph == NULL) {
5911 		return;
5912 	}
5913 
5914 	MDI_PHCI_LOCK(ph);
5915 	/*
5916 	 * Detach of pHCI failed. Restore back converse
5917 	 * state
5918 	 */
5919 	switch (cmd) {
5920 	case DDI_DETACH:
5921 		MDI_DEBUG(2, (MDI_NOTE, dip,
5922 		    "pHCI post_detach: called %p",
5923 		    (void *)ph));
5924 		if (error != DDI_SUCCESS)
5925 			MDI_PHCI_SET_ATTACH(ph);
5926 		break;
5927 
5928 	case DDI_SUSPEND:
5929 		MDI_DEBUG(2, (MDI_NOTE, dip,
5930 		    "pHCI post_suspend: called %p",
5931 		    (void *)ph));
5932 		if (error != DDI_SUCCESS)
5933 			MDI_PHCI_SET_RESUME(ph);
5934 		break;
5935 	}
5936 	MDI_PHCI_UNLOCK(ph);
5937 }
5938 
5939 /*ARGSUSED*/
5940 static void
5941 i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5942 {
5943 	mdi_client_t	*ct;
5944 
5945 	ct = i_devi_get_client(dip);
5946 	if (ct == NULL) {
5947 		return;
5948 	}
5949 	MDI_CLIENT_LOCK(ct);
5950 	/*
5951 	 * Detach of Client failed. Restore back converse
5952 	 * state
5953 	 */
5954 	switch (cmd) {
5955 	case DDI_DETACH:
5956 		MDI_DEBUG(2, (MDI_NOTE, dip,
5957 		    "client post_detach: called %p", (void *)ct));
5958 		if (DEVI_IS_ATTACHING(ct->ct_dip)) {
5959 			MDI_DEBUG(4, (MDI_NOTE, dip,
5960 			    "i_mdi_pm_rele_client\n"));
5961 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
5962 		} else {
5963 			MDI_DEBUG(4, (MDI_NOTE, dip,
5964 			    "i_mdi_pm_reset_client\n"));
5965 			i_mdi_pm_reset_client(ct);
5966 		}
5967 		if (error != DDI_SUCCESS)
5968 			MDI_CLIENT_SET_ATTACH(ct);
5969 		break;
5970 
5971 	case DDI_SUSPEND:
5972 		MDI_DEBUG(2, (MDI_NOTE, dip,
5973 		    "called %p", (void *)ct));
5974 		if (error != DDI_SUCCESS)
5975 			MDI_CLIENT_SET_RESUME(ct);
5976 		break;
5977 	}
5978 	MDI_CLIENT_UNLOCK(ct);
5979 }
5980 
5981 int
5982 mdi_pi_kstat_exists(mdi_pathinfo_t *pip)
5983 {
5984 	return (MDI_PI(pip)->pi_kstats ? 1 : 0);
5985 }
5986 
5987 /*
5988  * create and install per-path (client - pHCI) statistics
5989  * I/O stats supported: nread, nwritten, reads, and writes
5990  * Error stats - hard errors, soft errors, & transport errors
5991  */
5992 int
5993 mdi_pi_kstat_create(mdi_pathinfo_t *pip, char *ksname)
5994 {
5995 	kstat_t			*kiosp, *kerrsp;
5996 	struct pi_errs		*nsp;
5997 	struct mdi_pi_kstats	*mdi_statp;
5998 
5999 	if (MDI_PI(pip)->pi_kstats != NULL)
6000 		return (MDI_SUCCESS);
6001 
6002 	if ((kiosp = kstat_create("mdi", 0, ksname, "iopath",
6003 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
6004 		return (MDI_FAILURE);
6005 	}
6006 
6007 	(void) strcat(ksname, ",err");
6008 	kerrsp = kstat_create("mdi", 0, ksname, "iopath_errors",
6009 	    KSTAT_TYPE_NAMED,
6010 	    sizeof (struct pi_errs) / sizeof (kstat_named_t), 0);
6011 	if (kerrsp == NULL) {
6012 		kstat_delete(kiosp);
6013 		return (MDI_FAILURE);
6014 	}
6015 
6016 	nsp = (struct pi_errs *)kerrsp->ks_data;
6017 	kstat_named_init(&nsp->pi_softerrs, "Soft Errors", KSTAT_DATA_UINT32);
6018 	kstat_named_init(&nsp->pi_harderrs, "Hard Errors", KSTAT_DATA_UINT32);
6019 	kstat_named_init(&nsp->pi_transerrs, "Transport Errors",
6020 	    KSTAT_DATA_UINT32);
6021 	kstat_named_init(&nsp->pi_icnt_busy, "Interconnect Busy",
6022 	    KSTAT_DATA_UINT32);
6023 	kstat_named_init(&nsp->pi_icnt_errors, "Interconnect Errors",
6024 	    KSTAT_DATA_UINT32);
6025 	kstat_named_init(&nsp->pi_phci_rsrc, "pHCI No Resources",
6026 	    KSTAT_DATA_UINT32);
6027 	kstat_named_init(&nsp->pi_phci_localerr, "pHCI Local Errors",
6028 	    KSTAT_DATA_UINT32);
6029 	kstat_named_init(&nsp->pi_phci_invstate, "pHCI Invalid State",
6030 	    KSTAT_DATA_UINT32);
6031 	kstat_named_init(&nsp->pi_failedfrom, "Failed From",
6032 	    KSTAT_DATA_UINT32);
6033 	kstat_named_init(&nsp->pi_failedto, "Failed To", KSTAT_DATA_UINT32);
6034 
6035 	mdi_statp = kmem_alloc(sizeof (*mdi_statp), KM_SLEEP);
6036 	mdi_statp->pi_kstat_ref = 1;
6037 	mdi_statp->pi_kstat_iostats = kiosp;
6038 	mdi_statp->pi_kstat_errstats = kerrsp;
6039 	kstat_install(kiosp);
6040 	kstat_install(kerrsp);
6041 	MDI_PI(pip)->pi_kstats = mdi_statp;
6042 	return (MDI_SUCCESS);
6043 }
6044 
6045 /*
6046  * destroy per-path properties
6047  */
6048 static void
6049 i_mdi_pi_kstat_destroy(mdi_pathinfo_t *pip)
6050 {
6051 
6052 	struct mdi_pi_kstats *mdi_statp;
6053 
6054 	if (MDI_PI(pip)->pi_kstats == NULL)
6055 		return;
6056 	if ((mdi_statp = MDI_PI(pip)->pi_kstats) == NULL)
6057 		return;
6058 
6059 	MDI_PI(pip)->pi_kstats = NULL;
6060 
6061 	/*
6062 	 * the kstat may be shared between multiple pathinfo nodes
6063 	 * decrement this pathinfo's usage, removing the kstats
6064 	 * themselves when the last pathinfo reference is removed.
6065 	 */
6066 	ASSERT(mdi_statp->pi_kstat_ref > 0);
6067 	if (--mdi_statp->pi_kstat_ref != 0)
6068 		return;
6069 
6070 	kstat_delete(mdi_statp->pi_kstat_iostats);
6071 	kstat_delete(mdi_statp->pi_kstat_errstats);
6072 	kmem_free(mdi_statp, sizeof (*mdi_statp));
6073 }
6074 
6075 /*
6076  * update I/O paths KSTATS
6077  */
6078 void
6079 mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp)
6080 {
6081 	kstat_t *iostatp;
6082 	size_t xfer_cnt;
6083 
6084 	ASSERT(pip != NULL);
6085 
6086 	/*
6087 	 * I/O can be driven across a path prior to having path
6088 	 * statistics available, i.e. probe(9e).
6089 	 */
6090 	if (bp != NULL && MDI_PI(pip)->pi_kstats != NULL) {
6091 		iostatp = MDI_PI(pip)->pi_kstats->pi_kstat_iostats;
6092 		xfer_cnt = bp->b_bcount - bp->b_resid;
6093 		if (bp->b_flags & B_READ) {
6094 			KSTAT_IO_PTR(iostatp)->reads++;
6095 			KSTAT_IO_PTR(iostatp)->nread += xfer_cnt;
6096 		} else {
6097 			KSTAT_IO_PTR(iostatp)->writes++;
6098 			KSTAT_IO_PTR(iostatp)->nwritten += xfer_cnt;
6099 		}
6100 	}
6101 }
6102 
6103 /*
6104  * Enable the path(specific client/target/initiator)
6105  * Enabling a path means that MPxIO may select the enabled path for routing
6106  * future I/O requests, subject to other path state constraints.
6107  */
6108 int
6109 mdi_pi_enable_path(mdi_pathinfo_t *pip, int flags)
6110 {
6111 	mdi_phci_t	*ph;
6112 
6113 	ph = MDI_PI(pip)->pi_phci;
6114 	if (ph == NULL) {
6115 		MDI_DEBUG(1, (MDI_NOTE, mdi_pi_get_phci(pip),
6116 		    "!failed: path %s %p: NULL ph",
6117 		    mdi_pi_spathname(pip), (void *)pip));
6118 		return (MDI_FAILURE);
6119 	}
6120 
6121 	(void) i_mdi_enable_disable_path(pip, ph->ph_vhci, flags,
6122 		MDI_ENABLE_OP);
6123 	MDI_DEBUG(5, (MDI_NOTE, ph->ph_dip,
6124 	    "!returning success pip = %p. ph = %p",
6125 	    (void *)pip, (void *)ph));
6126 	return (MDI_SUCCESS);
6127 
6128 }
6129 
6130 /*
6131  * Disable the path (specific client/target/initiator)
6132  * Disabling a path means that MPxIO will not select the disabled path for
6133  * routing any new I/O requests.
6134  */
6135 int
6136 mdi_pi_disable_path(mdi_pathinfo_t *pip, int flags)
6137 {
6138 	mdi_phci_t	*ph;
6139 
6140 	ph = MDI_PI(pip)->pi_phci;
6141 	if (ph == NULL) {
6142 		MDI_DEBUG(1, (MDI_NOTE, mdi_pi_get_phci(pip),
6143 		    "!failed: path %s %p: NULL ph",
6144 		    mdi_pi_spathname(pip), (void *)pip));
6145 		return (MDI_FAILURE);
6146 	}
6147 
6148 	(void) i_mdi_enable_disable_path(pip,
6149 	    ph->ph_vhci, flags, MDI_DISABLE_OP);
6150 	MDI_DEBUG(5, (MDI_NOTE, ph->ph_dip,
6151 	    "!returning success pip = %p. ph = %p",
6152 	    (void *)pip, (void *)ph));
6153 	return (MDI_SUCCESS);
6154 }
6155 
6156 /*
6157  * disable the path to a particular pHCI (pHCI specified in the phci_path
6158  * argument) for a particular client (specified in the client_path argument).
6159  * Disabling a path means that MPxIO will not select the disabled path for
6160  * routing any new I/O requests.
6161  * NOTE: this will be removed once the NWS files are changed to use the new
6162  * mdi_{enable,disable}_path interfaces
6163  */
6164 int
6165 mdi_pi_disable(dev_info_t *cdip, dev_info_t *pdip, int flags)
6166 {
6167 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_DISABLE_OP));
6168 }
6169 
6170 /*
6171  * Enable the path to a particular pHCI (pHCI specified in the phci_path
6172  * argument) for a particular client (specified in the client_path argument).
6173  * Enabling a path means that MPxIO may select the enabled path for routing
6174  * future I/O requests, subject to other path state constraints.
6175  * NOTE: this will be removed once the NWS files are changed to use the new
6176  * mdi_{enable,disable}_path interfaces
6177  */
6178 
6179 int
6180 mdi_pi_enable(dev_info_t *cdip, dev_info_t *pdip, int flags)
6181 {
6182 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_ENABLE_OP));
6183 }
6184 
6185 /*
6186  * Common routine for doing enable/disable.
6187  */
6188 static mdi_pathinfo_t *
6189 i_mdi_enable_disable_path(mdi_pathinfo_t *pip, mdi_vhci_t *vh, int flags,
6190 		int op)
6191 {
6192 	int		sync_flag = 0;
6193 	int		rv;
6194 	mdi_pathinfo_t 	*next;
6195 	int		(*f)() = NULL;
6196 
6197 	/*
6198 	 * Check to make sure the path is not already in the
6199 	 * requested state. If it is just return the next path
6200 	 * as we have nothing to do here.
6201 	 */
6202 	if ((MDI_PI_IS_DISABLE(pip) && op == MDI_DISABLE_OP) ||
6203 	    (!MDI_PI_IS_DISABLE(pip) && op == MDI_ENABLE_OP)) {
6204 		MDI_PI_LOCK(pip);
6205 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
6206 		MDI_PI_UNLOCK(pip);
6207 		return (next);
6208 	}
6209 
6210 	f = vh->vh_ops->vo_pi_state_change;
6211 
6212 	sync_flag = (flags << 8) & 0xf00;
6213 
6214 	/*
6215 	 * Do a callback into the mdi consumer to let it
6216 	 * know that path is about to get enabled/disabled.
6217 	 */
6218 	if (f != NULL) {
6219 		rv = (*f)(vh->vh_dip, pip, 0,
6220 			MDI_PI_EXT_STATE(pip),
6221 			MDI_EXT_STATE_CHANGE | sync_flag |
6222 			op | MDI_BEFORE_STATE_CHANGE);
6223 		if (rv != MDI_SUCCESS) {
6224 			MDI_DEBUG(2, (MDI_WARN, vh->vh_dip,
6225 			    "vo_pi_state_change: failed rv = %x", rv));
6226 		}
6227 	}
6228 	MDI_PI_LOCK(pip);
6229 	next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
6230 
6231 	switch (flags) {
6232 		case USER_DISABLE:
6233 			if (op == MDI_DISABLE_OP) {
6234 				MDI_PI_SET_USER_DISABLE(pip);
6235 			} else {
6236 				MDI_PI_SET_USER_ENABLE(pip);
6237 			}
6238 			break;
6239 		case DRIVER_DISABLE:
6240 			if (op == MDI_DISABLE_OP) {
6241 				MDI_PI_SET_DRV_DISABLE(pip);
6242 			} else {
6243 				MDI_PI_SET_DRV_ENABLE(pip);
6244 			}
6245 			break;
6246 		case DRIVER_DISABLE_TRANSIENT:
6247 			if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS) {
6248 				MDI_PI_SET_DRV_DISABLE_TRANS(pip);
6249 			} else {
6250 				MDI_PI_SET_DRV_ENABLE_TRANS(pip);
6251 			}
6252 			break;
6253 	}
6254 	MDI_PI_UNLOCK(pip);
6255 	/*
6256 	 * Do a callback into the mdi consumer to let it
6257 	 * know that path is now enabled/disabled.
6258 	 */
6259 	if (f != NULL) {
6260 		rv = (*f)(vh->vh_dip, pip, 0,
6261 			MDI_PI_EXT_STATE(pip),
6262 			MDI_EXT_STATE_CHANGE | sync_flag |
6263 			op | MDI_AFTER_STATE_CHANGE);
6264 		if (rv != MDI_SUCCESS) {
6265 			MDI_DEBUG(2, (MDI_WARN, vh->vh_dip,
6266 			    "vo_pi_state_change failed: rv = %x", rv));
6267 		}
6268 	}
6269 	return (next);
6270 }
6271 
6272 /*
6273  * Common routine for doing enable/disable.
6274  * NOTE: this will be removed once the NWS files are changed to use the new
6275  * mdi_{enable,disable}_path has been putback
6276  */
6277 int
6278 i_mdi_pi_enable_disable(dev_info_t *cdip, dev_info_t *pdip, int flags, int op)
6279 {
6280 
6281 	mdi_phci_t	*ph;
6282 	mdi_vhci_t	*vh = NULL;
6283 	mdi_client_t	*ct;
6284 	mdi_pathinfo_t	*next, *pip;
6285 	int		found_it;
6286 
6287 	ph = i_devi_get_phci(pdip);
6288 	MDI_DEBUG(5, (MDI_NOTE, cdip ? cdip : pdip,
6289 	    "!op = %d pdip = %p cdip = %p", op, (void *)pdip,
6290 	    (void *)cdip));
6291 	if (ph == NULL) {
6292 		MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6293 		    "!failed: operation %d: NULL ph", op));
6294 		return (MDI_FAILURE);
6295 	}
6296 
6297 	if ((op != MDI_ENABLE_OP) && (op != MDI_DISABLE_OP)) {
6298 		MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6299 		    "!failed: invalid operation %d", op));
6300 		return (MDI_FAILURE);
6301 	}
6302 
6303 	vh = ph->ph_vhci;
6304 
6305 	if (cdip == NULL) {
6306 		/*
6307 		 * Need to mark the Phci as enabled/disabled.
6308 		 */
6309 		MDI_DEBUG(4, (MDI_NOTE, cdip ? cdip : pdip,
6310 		    "op %d for the phci", op));
6311 		MDI_PHCI_LOCK(ph);
6312 		switch (flags) {
6313 			case USER_DISABLE:
6314 				if (op == MDI_DISABLE_OP) {
6315 					MDI_PHCI_SET_USER_DISABLE(ph);
6316 				} else {
6317 					MDI_PHCI_SET_USER_ENABLE(ph);
6318 				}
6319 				break;
6320 			case DRIVER_DISABLE:
6321 				if (op == MDI_DISABLE_OP) {
6322 					MDI_PHCI_SET_DRV_DISABLE(ph);
6323 				} else {
6324 					MDI_PHCI_SET_DRV_ENABLE(ph);
6325 				}
6326 				break;
6327 			case DRIVER_DISABLE_TRANSIENT:
6328 				if (op == MDI_DISABLE_OP) {
6329 					MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph);
6330 				} else {
6331 					MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph);
6332 				}
6333 				break;
6334 			default:
6335 				MDI_PHCI_UNLOCK(ph);
6336 				MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6337 				    "!invalid flag argument= %d", flags));
6338 		}
6339 
6340 		/*
6341 		 * Phci has been disabled. Now try to enable/disable
6342 		 * path info's to each client.
6343 		 */
6344 		pip = ph->ph_path_head;
6345 		while (pip != NULL) {
6346 			pip = i_mdi_enable_disable_path(pip, vh, flags, op);
6347 		}
6348 		MDI_PHCI_UNLOCK(ph);
6349 	} else {
6350 
6351 		/*
6352 		 * Disable a specific client.
6353 		 */
6354 		ct = i_devi_get_client(cdip);
6355 		if (ct == NULL) {
6356 			MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6357 			    "!failed: operation = %d: NULL ct", op));
6358 			return (MDI_FAILURE);
6359 		}
6360 
6361 		MDI_CLIENT_LOCK(ct);
6362 		pip = ct->ct_path_head;
6363 		found_it = 0;
6364 		while (pip != NULL) {
6365 			MDI_PI_LOCK(pip);
6366 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6367 			if (MDI_PI(pip)->pi_phci == ph) {
6368 				MDI_PI_UNLOCK(pip);
6369 				found_it = 1;
6370 				break;
6371 			}
6372 			MDI_PI_UNLOCK(pip);
6373 			pip = next;
6374 		}
6375 
6376 
6377 		MDI_CLIENT_UNLOCK(ct);
6378 		if (found_it == 0) {
6379 			MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6380 			    "!failed. Could not find corresponding pip\n"));
6381 			return (MDI_FAILURE);
6382 		}
6383 
6384 		(void) i_mdi_enable_disable_path(pip, vh, flags, op);
6385 	}
6386 
6387 	MDI_DEBUG(5, (MDI_NOTE, cdip ? cdip : pdip,
6388 	    "!op %d returning success pdip = %p cdip = %p",
6389 	    op, (void *)pdip, (void *)cdip));
6390 	return (MDI_SUCCESS);
6391 }
6392 
6393 /*
6394  * Ensure phci powered up
6395  */
6396 static void
6397 i_mdi_pm_hold_pip(mdi_pathinfo_t *pip)
6398 {
6399 	dev_info_t	*ph_dip;
6400 
6401 	ASSERT(pip != NULL);
6402 	ASSERT(MDI_PI_LOCKED(pip));
6403 
6404 	if (MDI_PI(pip)->pi_pm_held) {
6405 		return;
6406 	}
6407 
6408 	ph_dip = mdi_pi_get_phci(pip);
6409 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6410 	    "%s %p", mdi_pi_spathname(pip), (void *)pip));
6411 	if (ph_dip == NULL) {
6412 		return;
6413 	}
6414 
6415 	MDI_PI_UNLOCK(pip);
6416 	MDI_DEBUG(4, (MDI_NOTE, ph_dip, "kidsupcnt was %d",
6417 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6418 	pm_hold_power(ph_dip);
6419 	MDI_DEBUG(4, (MDI_NOTE, ph_dip, "kidsupcnt is %d",
6420 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6421 	MDI_PI_LOCK(pip);
6422 
6423 	/* If PM_GET_PM_INFO is NULL the pm_hold_power above was a noop */
6424 	if (DEVI(ph_dip)->devi_pm_info)
6425 		MDI_PI(pip)->pi_pm_held = 1;
6426 }
6427 
6428 /*
6429  * Allow phci powered down
6430  */
6431 static void
6432 i_mdi_pm_rele_pip(mdi_pathinfo_t *pip)
6433 {
6434 	dev_info_t	*ph_dip = NULL;
6435 
6436 	ASSERT(pip != NULL);
6437 	ASSERT(MDI_PI_LOCKED(pip));
6438 
6439 	if (MDI_PI(pip)->pi_pm_held == 0) {
6440 		return;
6441 	}
6442 
6443 	ph_dip = mdi_pi_get_phci(pip);
6444 	ASSERT(ph_dip != NULL);
6445 
6446 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6447 	    "%s %p", mdi_pi_spathname(pip), (void *)pip));
6448 
6449 	MDI_PI_UNLOCK(pip);
6450 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6451 	    "kidsupcnt was %d", DEVI(ph_dip)->devi_pm_kidsupcnt));
6452 	pm_rele_power(ph_dip);
6453 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6454 	    "kidsupcnt is %d", DEVI(ph_dip)->devi_pm_kidsupcnt));
6455 	MDI_PI_LOCK(pip);
6456 
6457 	MDI_PI(pip)->pi_pm_held = 0;
6458 }
6459 
6460 static void
6461 i_mdi_pm_hold_client(mdi_client_t *ct, int incr)
6462 {
6463 	ASSERT(MDI_CLIENT_LOCKED(ct));
6464 
6465 	ct->ct_power_cnt += incr;
6466 	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6467 	    "%p ct_power_cnt = %d incr = %d",
6468 	    (void *)ct, ct->ct_power_cnt, incr));
6469 	ASSERT(ct->ct_power_cnt >= 0);
6470 }
6471 
6472 static void
6473 i_mdi_rele_all_phci(mdi_client_t *ct)
6474 {
6475 	mdi_pathinfo_t  *pip;
6476 
6477 	ASSERT(MDI_CLIENT_LOCKED(ct));
6478 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
6479 	while (pip != NULL) {
6480 		mdi_hold_path(pip);
6481 		MDI_PI_LOCK(pip);
6482 		i_mdi_pm_rele_pip(pip);
6483 		MDI_PI_UNLOCK(pip);
6484 		mdi_rele_path(pip);
6485 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6486 	}
6487 }
6488 
6489 static void
6490 i_mdi_pm_rele_client(mdi_client_t *ct, int decr)
6491 {
6492 	ASSERT(MDI_CLIENT_LOCKED(ct));
6493 
6494 	if (i_ddi_devi_attached(ct->ct_dip)) {
6495 		ct->ct_power_cnt -= decr;
6496 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6497 		    "%p ct_power_cnt = %d decr = %d",
6498 		    (void *)ct, ct->ct_power_cnt, decr));
6499 	}
6500 
6501 	ASSERT(ct->ct_power_cnt >= 0);
6502 	if (ct->ct_power_cnt == 0) {
6503 		i_mdi_rele_all_phci(ct);
6504 		return;
6505 	}
6506 }
6507 
6508 static void
6509 i_mdi_pm_reset_client(mdi_client_t *ct)
6510 {
6511 	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6512 	    "%p ct_power_cnt = %d", (void *)ct, ct->ct_power_cnt));
6513 	ASSERT(MDI_CLIENT_LOCKED(ct));
6514 	ct->ct_power_cnt = 0;
6515 	i_mdi_rele_all_phci(ct);
6516 	ct->ct_powercnt_config = 0;
6517 	ct->ct_powercnt_unconfig = 0;
6518 	ct->ct_powercnt_reset = 1;
6519 }
6520 
6521 static int
6522 i_mdi_power_one_phci(mdi_pathinfo_t *pip)
6523 {
6524 	int		ret;
6525 	dev_info_t	*ph_dip;
6526 
6527 	MDI_PI_LOCK(pip);
6528 	i_mdi_pm_hold_pip(pip);
6529 
6530 	ph_dip = mdi_pi_get_phci(pip);
6531 	MDI_PI_UNLOCK(pip);
6532 
6533 	/* bring all components of phci to full power */
6534 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6535 	    "pm_powerup for %s%d %p", ddi_driver_name(ph_dip),
6536 	    ddi_get_instance(ph_dip), (void *)pip));
6537 
6538 	ret = pm_powerup(ph_dip);
6539 
6540 	if (ret == DDI_FAILURE) {
6541 		MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6542 		    "pm_powerup FAILED for %s%d %p",
6543 		    ddi_driver_name(ph_dip), ddi_get_instance(ph_dip),
6544 		    (void *)pip));
6545 
6546 		MDI_PI_LOCK(pip);
6547 		i_mdi_pm_rele_pip(pip);
6548 		MDI_PI_UNLOCK(pip);
6549 		return (MDI_FAILURE);
6550 	}
6551 
6552 	return (MDI_SUCCESS);
6553 }
6554 
6555 static int
6556 i_mdi_power_all_phci(mdi_client_t *ct)
6557 {
6558 	mdi_pathinfo_t  *pip;
6559 	int		succeeded = 0;
6560 
6561 	ASSERT(MDI_CLIENT_LOCKED(ct));
6562 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
6563 	while (pip != NULL) {
6564 		/*
6565 		 * Don't power if MDI_PATHINFO_STATE_FAULT
6566 		 * or MDI_PATHINFO_STATE_OFFLINE.
6567 		 */
6568 		if (MDI_PI_IS_INIT(pip) ||
6569 		    MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip)) {
6570 			mdi_hold_path(pip);
6571 			MDI_CLIENT_UNLOCK(ct);
6572 			if (i_mdi_power_one_phci(pip) == MDI_SUCCESS)
6573 				succeeded = 1;
6574 
6575 			ASSERT(ct == MDI_PI(pip)->pi_client);
6576 			MDI_CLIENT_LOCK(ct);
6577 			mdi_rele_path(pip);
6578 		}
6579 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6580 	}
6581 
6582 	return (succeeded ? MDI_SUCCESS : MDI_FAILURE);
6583 }
6584 
6585 /*
6586  * mdi_bus_power():
6587  *		1. Place the phci(s) into powered up state so that
6588  *		   client can do power management
6589  *		2. Ensure phci powered up as client power managing
6590  * Return Values:
6591  *		MDI_SUCCESS
6592  *		MDI_FAILURE
6593  */
6594 int
6595 mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
6596     void *arg, void *result)
6597 {
6598 	int			ret = MDI_SUCCESS;
6599 	pm_bp_child_pwrchg_t	*bpc;
6600 	mdi_client_t		*ct;
6601 	dev_info_t		*cdip;
6602 	pm_bp_has_changed_t	*bphc;
6603 
6604 	/*
6605 	 * BUS_POWER_NOINVOL not supported
6606 	 */
6607 	if (op == BUS_POWER_NOINVOL)
6608 		return (MDI_FAILURE);
6609 
6610 	/*
6611 	 * ignore other OPs.
6612 	 * return quickly to save cou cycles on the ct processing
6613 	 */
6614 	switch (op) {
6615 	case BUS_POWER_PRE_NOTIFICATION:
6616 	case BUS_POWER_POST_NOTIFICATION:
6617 		bpc = (pm_bp_child_pwrchg_t *)arg;
6618 		cdip = bpc->bpc_dip;
6619 		break;
6620 	case BUS_POWER_HAS_CHANGED:
6621 		bphc = (pm_bp_has_changed_t *)arg;
6622 		cdip = bphc->bphc_dip;
6623 		break;
6624 	default:
6625 		return (pm_busop_bus_power(parent, impl_arg, op, arg, result));
6626 	}
6627 
6628 	ASSERT(MDI_CLIENT(cdip));
6629 
6630 	ct = i_devi_get_client(cdip);
6631 	if (ct == NULL)
6632 		return (MDI_FAILURE);
6633 
6634 	/*
6635 	 * wait till the mdi_pathinfo node state change are processed
6636 	 */
6637 	MDI_CLIENT_LOCK(ct);
6638 	switch (op) {
6639 	case BUS_POWER_PRE_NOTIFICATION:
6640 		MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6641 		    "BUS_POWER_PRE_NOTIFICATION:"
6642 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d",
6643 		    ddi_node_name(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
6644 		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp));
6645 
6646 		/* serialize power level change per client */
6647 		while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6648 			cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6649 
6650 		MDI_CLIENT_SET_POWER_TRANSITION(ct);
6651 
6652 		if (ct->ct_power_cnt == 0) {
6653 			ret = i_mdi_power_all_phci(ct);
6654 		}
6655 
6656 		/*
6657 		 * if new_level > 0:
6658 		 *	- hold phci(s)
6659 		 *	- power up phci(s) if not already
6660 		 * ignore power down
6661 		 */
6662 		if (bpc->bpc_nlevel > 0) {
6663 			if (!DEVI_IS_ATTACHING(ct->ct_dip)) {
6664 				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6665 				    "i_mdi_pm_hold_client\n"));
6666 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
6667 			}
6668 		}
6669 		break;
6670 	case BUS_POWER_POST_NOTIFICATION:
6671 		MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6672 		    "BUS_POWER_POST_NOTIFICATION:"
6673 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d result=%d",
6674 		    ddi_node_name(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
6675 		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp,
6676 		    *(int *)result));
6677 
6678 		if (*(int *)result == DDI_SUCCESS) {
6679 			if (bpc->bpc_nlevel > 0) {
6680 				MDI_CLIENT_SET_POWER_UP(ct);
6681 			} else {
6682 				MDI_CLIENT_SET_POWER_DOWN(ct);
6683 			}
6684 		}
6685 
6686 		/* release the hold we did in pre-notification */
6687 		if (bpc->bpc_nlevel > 0 && (*(int *)result != DDI_SUCCESS) &&
6688 		    !DEVI_IS_ATTACHING(ct->ct_dip)) {
6689 			MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6690 			    "i_mdi_pm_rele_client\n"));
6691 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6692 		}
6693 
6694 		if (bpc->bpc_nlevel == 0 && (*(int *)result == DDI_SUCCESS)) {
6695 			/* another thread might started attaching */
6696 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
6697 				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6698 				    "i_mdi_pm_rele_client\n"));
6699 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
6700 			/* detaching has been taken care in pm_post_unconfig */
6701 			} else if (!DEVI_IS_DETACHING(ct->ct_dip)) {
6702 				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6703 				    "i_mdi_pm_reset_client\n"));
6704 				i_mdi_pm_reset_client(ct);
6705 			}
6706 		}
6707 
6708 		MDI_CLIENT_CLEAR_POWER_TRANSITION(ct);
6709 		cv_broadcast(&ct->ct_powerchange_cv);
6710 
6711 		break;
6712 
6713 	/* need to do more */
6714 	case BUS_POWER_HAS_CHANGED:
6715 		MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
6716 		    "BUS_POWER_HAS_CHANGED:"
6717 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d",
6718 		    ddi_node_name(bphc->bphc_dip), PM_ADDR(bphc->bphc_dip),
6719 		    bphc->bphc_olevel, bphc->bphc_nlevel, bphc->bphc_comp));
6720 
6721 		if (bphc->bphc_nlevel > 0 &&
6722 		    bphc->bphc_nlevel > bphc->bphc_olevel) {
6723 			if (ct->ct_power_cnt == 0) {
6724 				ret = i_mdi_power_all_phci(ct);
6725 			}
6726 			MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
6727 			    "i_mdi_pm_hold_client\n"));
6728 			i_mdi_pm_hold_client(ct, ct->ct_path_count);
6729 		}
6730 
6731 		if (bphc->bphc_nlevel == 0 && bphc->bphc_olevel != -1) {
6732 			MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
6733 			    "i_mdi_pm_rele_client\n"));
6734 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6735 		}
6736 		break;
6737 	}
6738 
6739 	MDI_CLIENT_UNLOCK(ct);
6740 	return (ret);
6741 }
6742 
6743 static int
6744 i_mdi_pm_pre_config_one(dev_info_t *child)
6745 {
6746 	int		ret = MDI_SUCCESS;
6747 	mdi_client_t	*ct;
6748 
6749 	ct = i_devi_get_client(child);
6750 	if (ct == NULL)
6751 		return (MDI_FAILURE);
6752 
6753 	MDI_CLIENT_LOCK(ct);
6754 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6755 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6756 
6757 	if (!MDI_CLIENT_IS_FAILED(ct)) {
6758 		MDI_CLIENT_UNLOCK(ct);
6759 		MDI_DEBUG(4, (MDI_NOTE, child, "already configured\n"));
6760 		return (MDI_SUCCESS);
6761 	}
6762 
6763 	if (ct->ct_powercnt_config) {
6764 		MDI_CLIENT_UNLOCK(ct);
6765 		MDI_DEBUG(4, (MDI_NOTE, child, "already held\n"));
6766 		return (MDI_SUCCESS);
6767 	}
6768 
6769 	if (ct->ct_power_cnt == 0) {
6770 		ret = i_mdi_power_all_phci(ct);
6771 	}
6772 	MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_hold_client\n"));
6773 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6774 	ct->ct_powercnt_config = 1;
6775 	ct->ct_powercnt_reset = 0;
6776 	MDI_CLIENT_UNLOCK(ct);
6777 	return (ret);
6778 }
6779 
6780 static int
6781 i_mdi_pm_pre_config(dev_info_t *vdip, dev_info_t *child)
6782 {
6783 	int			ret = MDI_SUCCESS;
6784 	dev_info_t		*cdip;
6785 	int			circ;
6786 
6787 	ASSERT(MDI_VHCI(vdip));
6788 
6789 	/* ndi_devi_config_one */
6790 	if (child) {
6791 		ASSERT(DEVI_BUSY_OWNED(vdip));
6792 		return (i_mdi_pm_pre_config_one(child));
6793 	}
6794 
6795 	/* devi_config_common */
6796 	ndi_devi_enter(vdip, &circ);
6797 	cdip = ddi_get_child(vdip);
6798 	while (cdip) {
6799 		dev_info_t *next = ddi_get_next_sibling(cdip);
6800 
6801 		ret = i_mdi_pm_pre_config_one(cdip);
6802 		if (ret != MDI_SUCCESS)
6803 			break;
6804 		cdip = next;
6805 	}
6806 	ndi_devi_exit(vdip, circ);
6807 	return (ret);
6808 }
6809 
6810 static int
6811 i_mdi_pm_pre_unconfig_one(dev_info_t *child, int *held, int flags)
6812 {
6813 	int		ret = MDI_SUCCESS;
6814 	mdi_client_t	*ct;
6815 
6816 	ct = i_devi_get_client(child);
6817 	if (ct == NULL)
6818 		return (MDI_FAILURE);
6819 
6820 	MDI_CLIENT_LOCK(ct);
6821 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6822 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6823 
6824 	if (!i_ddi_devi_attached(ct->ct_dip)) {
6825 		MDI_DEBUG(4, (MDI_NOTE, child, "node detached already\n"));
6826 		MDI_CLIENT_UNLOCK(ct);
6827 		return (MDI_SUCCESS);
6828 	}
6829 
6830 	if (MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6831 	    (flags & NDI_AUTODETACH)) {
6832 		MDI_DEBUG(4, (MDI_NOTE, child, "auto-modunload\n"));
6833 		MDI_CLIENT_UNLOCK(ct);
6834 		return (MDI_FAILURE);
6835 	}
6836 
6837 	if (ct->ct_powercnt_unconfig) {
6838 		MDI_DEBUG(4, (MDI_NOTE, child, "ct_powercnt_held\n"));
6839 		MDI_CLIENT_UNLOCK(ct);
6840 		*held = 1;
6841 		return (MDI_SUCCESS);
6842 	}
6843 
6844 	if (ct->ct_power_cnt == 0) {
6845 		ret = i_mdi_power_all_phci(ct);
6846 	}
6847 	MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_hold_client\n"));
6848 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6849 	ct->ct_powercnt_unconfig = 1;
6850 	ct->ct_powercnt_reset = 0;
6851 	MDI_CLIENT_UNLOCK(ct);
6852 	if (ret == MDI_SUCCESS)
6853 		*held = 1;
6854 	return (ret);
6855 }
6856 
6857 static int
6858 i_mdi_pm_pre_unconfig(dev_info_t *vdip, dev_info_t *child, int *held,
6859     int flags)
6860 {
6861 	int			ret = MDI_SUCCESS;
6862 	dev_info_t		*cdip;
6863 	int			circ;
6864 
6865 	ASSERT(MDI_VHCI(vdip));
6866 	*held = 0;
6867 
6868 	/* ndi_devi_unconfig_one */
6869 	if (child) {
6870 		ASSERT(DEVI_BUSY_OWNED(vdip));
6871 		return (i_mdi_pm_pre_unconfig_one(child, held, flags));
6872 	}
6873 
6874 	/* devi_unconfig_common */
6875 	ndi_devi_enter(vdip, &circ);
6876 	cdip = ddi_get_child(vdip);
6877 	while (cdip) {
6878 		dev_info_t *next = ddi_get_next_sibling(cdip);
6879 
6880 		ret = i_mdi_pm_pre_unconfig_one(cdip, held, flags);
6881 		cdip = next;
6882 	}
6883 	ndi_devi_exit(vdip, circ);
6884 
6885 	if (*held)
6886 		ret = MDI_SUCCESS;
6887 
6888 	return (ret);
6889 }
6890 
6891 static void
6892 i_mdi_pm_post_config_one(dev_info_t *child)
6893 {
6894 	mdi_client_t	*ct;
6895 
6896 	ct = i_devi_get_client(child);
6897 	if (ct == NULL)
6898 		return;
6899 
6900 	MDI_CLIENT_LOCK(ct);
6901 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6902 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6903 
6904 	if (ct->ct_powercnt_reset || !ct->ct_powercnt_config) {
6905 		MDI_DEBUG(4, (MDI_NOTE, child, "not configured\n"));
6906 		MDI_CLIENT_UNLOCK(ct);
6907 		return;
6908 	}
6909 
6910 	/* client has not been updated */
6911 	if (MDI_CLIENT_IS_FAILED(ct)) {
6912 		MDI_DEBUG(4, (MDI_NOTE, child, "client failed\n"));
6913 		MDI_CLIENT_UNLOCK(ct);
6914 		return;
6915 	}
6916 
6917 	/* another thread might have powered it down or detached it */
6918 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6919 	    !DEVI_IS_ATTACHING(ct->ct_dip)) ||
6920 	    (!i_ddi_devi_attached(ct->ct_dip) &&
6921 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6922 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_reset_client\n"));
6923 		i_mdi_pm_reset_client(ct);
6924 	} else {
6925 		mdi_pathinfo_t  *pip, *next;
6926 		int	valid_path_count = 0;
6927 
6928 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_rele_client\n"));
6929 		pip = ct->ct_path_head;
6930 		while (pip != NULL) {
6931 			MDI_PI_LOCK(pip);
6932 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6933 			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
6934 				valid_path_count ++;
6935 			MDI_PI_UNLOCK(pip);
6936 			pip = next;
6937 		}
6938 		i_mdi_pm_rele_client(ct, valid_path_count);
6939 	}
6940 	ct->ct_powercnt_config = 0;
6941 	MDI_CLIENT_UNLOCK(ct);
6942 }
6943 
6944 static void
6945 i_mdi_pm_post_config(dev_info_t *vdip, dev_info_t *child)
6946 {
6947 	int		circ;
6948 	dev_info_t	*cdip;
6949 
6950 	ASSERT(MDI_VHCI(vdip));
6951 
6952 	/* ndi_devi_config_one */
6953 	if (child) {
6954 		ASSERT(DEVI_BUSY_OWNED(vdip));
6955 		i_mdi_pm_post_config_one(child);
6956 		return;
6957 	}
6958 
6959 	/* devi_config_common */
6960 	ndi_devi_enter(vdip, &circ);
6961 	cdip = ddi_get_child(vdip);
6962 	while (cdip) {
6963 		dev_info_t *next = ddi_get_next_sibling(cdip);
6964 
6965 		i_mdi_pm_post_config_one(cdip);
6966 		cdip = next;
6967 	}
6968 	ndi_devi_exit(vdip, circ);
6969 }
6970 
6971 static void
6972 i_mdi_pm_post_unconfig_one(dev_info_t *child)
6973 {
6974 	mdi_client_t	*ct;
6975 
6976 	ct = i_devi_get_client(child);
6977 	if (ct == NULL)
6978 		return;
6979 
6980 	MDI_CLIENT_LOCK(ct);
6981 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6982 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6983 
6984 	if (!ct->ct_powercnt_unconfig || ct->ct_powercnt_reset) {
6985 		MDI_DEBUG(4, (MDI_NOTE, child, "not held\n"));
6986 		MDI_CLIENT_UNLOCK(ct);
6987 		return;
6988 	}
6989 
6990 	/* failure detaching or another thread just attached it */
6991 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6992 	    i_ddi_devi_attached(ct->ct_dip)) ||
6993 	    (!i_ddi_devi_attached(ct->ct_dip) &&
6994 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6995 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_reset_client\n"));
6996 		i_mdi_pm_reset_client(ct);
6997 	} else {
6998 		mdi_pathinfo_t  *pip, *next;
6999 		int	valid_path_count = 0;
7000 
7001 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_rele_client\n"));
7002 		pip = ct->ct_path_head;
7003 		while (pip != NULL) {
7004 			MDI_PI_LOCK(pip);
7005 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
7006 			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
7007 				valid_path_count ++;
7008 			MDI_PI_UNLOCK(pip);
7009 			pip = next;
7010 		}
7011 		i_mdi_pm_rele_client(ct, valid_path_count);
7012 		ct->ct_powercnt_unconfig = 0;
7013 	}
7014 
7015 	MDI_CLIENT_UNLOCK(ct);
7016 }
7017 
7018 static void
7019 i_mdi_pm_post_unconfig(dev_info_t *vdip, dev_info_t *child, int held)
7020 {
7021 	int			circ;
7022 	dev_info_t		*cdip;
7023 
7024 	ASSERT(MDI_VHCI(vdip));
7025 
7026 	if (!held) {
7027 		MDI_DEBUG(4, (MDI_NOTE, vdip, "held = %d", held));
7028 		return;
7029 	}
7030 
7031 	if (child) {
7032 		ASSERT(DEVI_BUSY_OWNED(vdip));
7033 		i_mdi_pm_post_unconfig_one(child);
7034 		return;
7035 	}
7036 
7037 	ndi_devi_enter(vdip, &circ);
7038 	cdip = ddi_get_child(vdip);
7039 	while (cdip) {
7040 		dev_info_t *next = ddi_get_next_sibling(cdip);
7041 
7042 		i_mdi_pm_post_unconfig_one(cdip);
7043 		cdip = next;
7044 	}
7045 	ndi_devi_exit(vdip, circ);
7046 }
7047 
7048 int
7049 mdi_power(dev_info_t *vdip, mdi_pm_op_t op, void *args, char *devnm, int flags)
7050 {
7051 	int			circ, ret = MDI_SUCCESS;
7052 	dev_info_t		*client_dip = NULL;
7053 	mdi_client_t		*ct;
7054 
7055 	/*
7056 	 * Handling ndi_devi_config_one and ndi_devi_unconfig_one.
7057 	 * Power up pHCI for the named client device.
7058 	 * Note: Before the client is enumerated under vhci by phci,
7059 	 * client_dip can be NULL. Then proceed to power up all the
7060 	 * pHCIs.
7061 	 */
7062 	if (devnm != NULL) {
7063 		ndi_devi_enter(vdip, &circ);
7064 		client_dip = ndi_devi_findchild(vdip, devnm);
7065 	}
7066 
7067 	MDI_DEBUG(4, (MDI_NOTE, vdip,
7068 	    "op = %d %s %p", op, devnm ? devnm : "", (void *)client_dip));
7069 
7070 	switch (op) {
7071 	case MDI_PM_PRE_CONFIG:
7072 		ret = i_mdi_pm_pre_config(vdip, client_dip);
7073 		break;
7074 
7075 	case MDI_PM_PRE_UNCONFIG:
7076 		ret = i_mdi_pm_pre_unconfig(vdip, client_dip, (int *)args,
7077 		    flags);
7078 		break;
7079 
7080 	case MDI_PM_POST_CONFIG:
7081 		i_mdi_pm_post_config(vdip, client_dip);
7082 		break;
7083 
7084 	case MDI_PM_POST_UNCONFIG:
7085 		i_mdi_pm_post_unconfig(vdip, client_dip, *(int *)args);
7086 		break;
7087 
7088 	case MDI_PM_HOLD_POWER:
7089 	case MDI_PM_RELE_POWER:
7090 		ASSERT(args);
7091 
7092 		client_dip = (dev_info_t *)args;
7093 		ASSERT(MDI_CLIENT(client_dip));
7094 
7095 		ct = i_devi_get_client(client_dip);
7096 		MDI_CLIENT_LOCK(ct);
7097 
7098 		if (op == MDI_PM_HOLD_POWER) {
7099 			if (ct->ct_power_cnt == 0) {
7100 				(void) i_mdi_power_all_phci(ct);
7101 				MDI_DEBUG(4, (MDI_NOTE, client_dip,
7102 				    "i_mdi_pm_hold_client\n"));
7103 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
7104 			}
7105 		} else {
7106 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
7107 				MDI_DEBUG(4, (MDI_NOTE, client_dip,
7108 				    "i_mdi_pm_rele_client\n"));
7109 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
7110 			} else {
7111 				MDI_DEBUG(4, (MDI_NOTE, client_dip,
7112 				    "i_mdi_pm_reset_client\n"));
7113 				i_mdi_pm_reset_client(ct);
7114 			}
7115 		}
7116 
7117 		MDI_CLIENT_UNLOCK(ct);
7118 		break;
7119 
7120 	default:
7121 		break;
7122 	}
7123 
7124 	if (devnm)
7125 		ndi_devi_exit(vdip, circ);
7126 
7127 	return (ret);
7128 }
7129 
7130 int
7131 mdi_component_is_vhci(dev_info_t *dip, const char **mdi_class)
7132 {
7133 	mdi_vhci_t *vhci;
7134 
7135 	if (!MDI_VHCI(dip))
7136 		return (MDI_FAILURE);
7137 
7138 	if (mdi_class) {
7139 		vhci = DEVI(dip)->devi_mdi_xhci;
7140 		ASSERT(vhci);
7141 		*mdi_class = vhci->vh_class;
7142 	}
7143 
7144 	return (MDI_SUCCESS);
7145 }
7146 
7147 int
7148 mdi_component_is_phci(dev_info_t *dip, const char **mdi_class)
7149 {
7150 	mdi_phci_t *phci;
7151 
7152 	if (!MDI_PHCI(dip))
7153 		return (MDI_FAILURE);
7154 
7155 	if (mdi_class) {
7156 		phci = DEVI(dip)->devi_mdi_xhci;
7157 		ASSERT(phci);
7158 		*mdi_class = phci->ph_vhci->vh_class;
7159 	}
7160 
7161 	return (MDI_SUCCESS);
7162 }
7163 
7164 int
7165 mdi_component_is_client(dev_info_t *dip, const char **mdi_class)
7166 {
7167 	mdi_client_t *client;
7168 
7169 	if (!MDI_CLIENT(dip))
7170 		return (MDI_FAILURE);
7171 
7172 	if (mdi_class) {
7173 		client = DEVI(dip)->devi_mdi_client;
7174 		ASSERT(client);
7175 		*mdi_class = client->ct_vhci->vh_class;
7176 	}
7177 
7178 	return (MDI_SUCCESS);
7179 }
7180 
7181 void *
7182 mdi_client_get_vhci_private(dev_info_t *dip)
7183 {
7184 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
7185 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
7186 		mdi_client_t	*ct;
7187 		ct = i_devi_get_client(dip);
7188 		return (ct->ct_vprivate);
7189 	}
7190 	return (NULL);
7191 }
7192 
7193 void
7194 mdi_client_set_vhci_private(dev_info_t *dip, void *data)
7195 {
7196 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
7197 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
7198 		mdi_client_t	*ct;
7199 		ct = i_devi_get_client(dip);
7200 		ct->ct_vprivate = data;
7201 	}
7202 }
7203 /*
7204  * mdi_pi_get_vhci_private():
7205  *		Get the vhci private information associated with the
7206  *		mdi_pathinfo node
7207  */
7208 void *
7209 mdi_pi_get_vhci_private(mdi_pathinfo_t *pip)
7210 {
7211 	caddr_t	vprivate = NULL;
7212 	if (pip) {
7213 		vprivate = MDI_PI(pip)->pi_vprivate;
7214 	}
7215 	return (vprivate);
7216 }
7217 
7218 /*
7219  * mdi_pi_set_vhci_private():
7220  *		Set the vhci private information in the mdi_pathinfo node
7221  */
7222 void
7223 mdi_pi_set_vhci_private(mdi_pathinfo_t *pip, void *priv)
7224 {
7225 	if (pip) {
7226 		MDI_PI(pip)->pi_vprivate = priv;
7227 	}
7228 }
7229 
7230 /*
7231  * mdi_phci_get_vhci_private():
7232  *		Get the vhci private information associated with the
7233  *		mdi_phci node
7234  */
7235 void *
7236 mdi_phci_get_vhci_private(dev_info_t *dip)
7237 {
7238 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
7239 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
7240 		mdi_phci_t	*ph;
7241 		ph = i_devi_get_phci(dip);
7242 		return (ph->ph_vprivate);
7243 	}
7244 	return (NULL);
7245 }
7246 
7247 /*
7248  * mdi_phci_set_vhci_private():
7249  *		Set the vhci private information in the mdi_phci node
7250  */
7251 void
7252 mdi_phci_set_vhci_private(dev_info_t *dip, void *priv)
7253 {
7254 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
7255 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
7256 		mdi_phci_t	*ph;
7257 		ph = i_devi_get_phci(dip);
7258 		ph->ph_vprivate = priv;
7259 	}
7260 }
7261 
7262 int
7263 mdi_pi_ishidden(mdi_pathinfo_t *pip)
7264 {
7265 	return (MDI_PI_FLAGS_IS_HIDDEN(pip));
7266 }
7267 
7268 int
7269 mdi_pi_device_isremoved(mdi_pathinfo_t *pip)
7270 {
7271 	return (MDI_PI_FLAGS_IS_DEVICE_REMOVED(pip));
7272 }
7273 
7274 /*
7275  * When processing hotplug, if mdi_pi_offline-mdi_pi_free fails then this
7276  * interface is used to represent device removal.
7277  */
7278 int
7279 mdi_pi_device_remove(mdi_pathinfo_t *pip)
7280 {
7281 	MDI_PI_LOCK(pip);
7282 	if (mdi_pi_device_isremoved(pip)) {
7283 		MDI_PI_UNLOCK(pip);
7284 		return (0);
7285 	}
7286 	MDI_PI_FLAGS_SET_DEVICE_REMOVED(pip);
7287 	MDI_PI_FLAGS_SET_HIDDEN(pip);
7288 	MDI_PI_UNLOCK(pip);
7289 
7290 	i_ddi_di_cache_invalidate();
7291 
7292 	return (1);
7293 }
7294 
7295 /*
7296  * When processing hotplug, if a path marked mdi_pi_device_isremoved()
7297  * is now accessible then this interfaces is used to represent device insertion.
7298  */
7299 int
7300 mdi_pi_device_insert(mdi_pathinfo_t *pip)
7301 {
7302 	MDI_PI_LOCK(pip);
7303 	if (!mdi_pi_device_isremoved(pip)) {
7304 		MDI_PI_UNLOCK(pip);
7305 		return (0);
7306 	}
7307 	MDI_PI_FLAGS_CLR_DEVICE_REMOVED(pip);
7308 	MDI_PI_FLAGS_CLR_HIDDEN(pip);
7309 	MDI_PI_UNLOCK(pip);
7310 
7311 	i_ddi_di_cache_invalidate();
7312 
7313 	return (1);
7314 }
7315 
7316 /*
7317  * List of vhci class names:
7318  * A vhci class name must be in this list only if the corresponding vhci
7319  * driver intends to use the mdi provided bus config implementation
7320  * (i.e., mdi_vhci_bus_config()).
7321  */
7322 static char *vhci_class_list[] = { MDI_HCI_CLASS_SCSI, MDI_HCI_CLASS_IB };
7323 #define	N_VHCI_CLASSES	(sizeof (vhci_class_list) / sizeof (char *))
7324 
7325 /*
7326  * During boot time, the on-disk vhci cache for every vhci class is read
7327  * in the form of an nvlist and stored here.
7328  */
7329 static nvlist_t *vhcache_nvl[N_VHCI_CLASSES];
7330 
7331 /* nvpair names in vhci cache nvlist */
7332 #define	MDI_VHCI_CACHE_VERSION	1
7333 #define	MDI_NVPNAME_VERSION	"version"
7334 #define	MDI_NVPNAME_PHCIS	"phcis"
7335 #define	MDI_NVPNAME_CTADDRMAP	"clientaddrmap"
7336 
7337 /*
7338  * Given vhci class name, return its on-disk vhci cache filename.
7339  * Memory for the returned filename which includes the full path is allocated
7340  * by this function.
7341  */
7342 static char *
7343 vhclass2vhcache_filename(char *vhclass)
7344 {
7345 	char *filename;
7346 	int len;
7347 	static char *fmt = "/etc/devices/mdi_%s_cache";
7348 
7349 	/*
7350 	 * fmt contains the on-disk vhci cache file name format;
7351 	 * for scsi_vhci the filename is "/etc/devices/mdi_scsi_vhci_cache".
7352 	 */
7353 
7354 	/* the -1 below is to account for "%s" in the format string */
7355 	len = strlen(fmt) + strlen(vhclass) - 1;
7356 	filename = kmem_alloc(len, KM_SLEEP);
7357 	(void) snprintf(filename, len, fmt, vhclass);
7358 	ASSERT(len == (strlen(filename) + 1));
7359 	return (filename);
7360 }
7361 
7362 /*
7363  * initialize the vhci cache related data structures and read the on-disk
7364  * vhci cached data into memory.
7365  */
7366 static void
7367 setup_vhci_cache(mdi_vhci_t *vh)
7368 {
7369 	mdi_vhci_config_t *vhc;
7370 	mdi_vhci_cache_t *vhcache;
7371 	int i;
7372 	nvlist_t *nvl = NULL;
7373 
7374 	vhc = kmem_zalloc(sizeof (mdi_vhci_config_t), KM_SLEEP);
7375 	vh->vh_config = vhc;
7376 	vhcache = &vhc->vhc_vhcache;
7377 
7378 	vhc->vhc_vhcache_filename = vhclass2vhcache_filename(vh->vh_class);
7379 
7380 	mutex_init(&vhc->vhc_lock, NULL, MUTEX_DEFAULT, NULL);
7381 	cv_init(&vhc->vhc_cv, NULL, CV_DRIVER, NULL);
7382 
7383 	rw_init(&vhcache->vhcache_lock, NULL, RW_DRIVER, NULL);
7384 
7385 	/*
7386 	 * Create string hash; same as mod_hash_create_strhash() except that
7387 	 * we use NULL key destructor.
7388 	 */
7389 	vhcache->vhcache_client_hash = mod_hash_create_extended(vh->vh_class,
7390 	    mdi_bus_config_cache_hash_size,
7391 	    mod_hash_null_keydtor, mod_hash_null_valdtor,
7392 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
7393 
7394 	/*
7395 	 * The on-disk vhci cache is read during booting prior to the
7396 	 * lights-out period by mdi_read_devices_files().
7397 	 */
7398 	for (i = 0; i < N_VHCI_CLASSES; i++) {
7399 		if (strcmp(vhci_class_list[i], vh->vh_class) == 0) {
7400 			nvl = vhcache_nvl[i];
7401 			vhcache_nvl[i] = NULL;
7402 			break;
7403 		}
7404 	}
7405 
7406 	/*
7407 	 * this is to cover the case of some one manually causing unloading
7408 	 * (or detaching) and reloading (or attaching) of a vhci driver.
7409 	 */
7410 	if (nvl == NULL && modrootloaded)
7411 		nvl = read_on_disk_vhci_cache(vh->vh_class);
7412 
7413 	if (nvl != NULL) {
7414 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7415 		if (mainnvl_to_vhcache(vhcache, nvl) == MDI_SUCCESS)
7416 			vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
7417 		else  {
7418 			cmn_err(CE_WARN,
7419 			    "%s: data file corrupted, will recreate",
7420 			    vhc->vhc_vhcache_filename);
7421 		}
7422 		rw_exit(&vhcache->vhcache_lock);
7423 		nvlist_free(nvl);
7424 	}
7425 
7426 	vhc->vhc_cbid = callb_add(stop_vhcache_flush_thread, vhc,
7427 	    CB_CL_UADMIN_PRE_VFS, "mdi_vhcache_flush");
7428 
7429 	vhc->vhc_path_discovery_boot = mdi_path_discovery_boot;
7430 	vhc->vhc_path_discovery_postboot = mdi_path_discovery_postboot;
7431 }
7432 
7433 /*
7434  * free all vhci cache related resources
7435  */
7436 static int
7437 destroy_vhci_cache(mdi_vhci_t *vh)
7438 {
7439 	mdi_vhci_config_t *vhc = vh->vh_config;
7440 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7441 	mdi_vhcache_phci_t *cphci, *cphci_next;
7442 	mdi_vhcache_client_t *cct, *cct_next;
7443 	mdi_vhcache_pathinfo_t *cpi, *cpi_next;
7444 
7445 	if (stop_vhcache_async_threads(vhc) != MDI_SUCCESS)
7446 		return (MDI_FAILURE);
7447 
7448 	kmem_free(vhc->vhc_vhcache_filename,
7449 	    strlen(vhc->vhc_vhcache_filename) + 1);
7450 
7451 	mod_hash_destroy_strhash(vhcache->vhcache_client_hash);
7452 
7453 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7454 	    cphci = cphci_next) {
7455 		cphci_next = cphci->cphci_next;
7456 		free_vhcache_phci(cphci);
7457 	}
7458 
7459 	for (cct = vhcache->vhcache_client_head; cct != NULL; cct = cct_next) {
7460 		cct_next = cct->cct_next;
7461 		for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi_next) {
7462 			cpi_next = cpi->cpi_next;
7463 			free_vhcache_pathinfo(cpi);
7464 		}
7465 		free_vhcache_client(cct);
7466 	}
7467 
7468 	rw_destroy(&vhcache->vhcache_lock);
7469 
7470 	mutex_destroy(&vhc->vhc_lock);
7471 	cv_destroy(&vhc->vhc_cv);
7472 	kmem_free(vhc, sizeof (mdi_vhci_config_t));
7473 	return (MDI_SUCCESS);
7474 }
7475 
7476 /*
7477  * Stop all vhci cache related async threads and free their resources.
7478  */
7479 static int
7480 stop_vhcache_async_threads(mdi_vhci_config_t *vhc)
7481 {
7482 	mdi_async_client_config_t *acc, *acc_next;
7483 
7484 	mutex_enter(&vhc->vhc_lock);
7485 	vhc->vhc_flags |= MDI_VHC_EXIT;
7486 	ASSERT(vhc->vhc_acc_thrcount >= 0);
7487 	cv_broadcast(&vhc->vhc_cv);
7488 
7489 	while ((vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) ||
7490 	    vhc->vhc_acc_thrcount != 0) {
7491 		mutex_exit(&vhc->vhc_lock);
7492 		delay_random(mdi_delay);
7493 		mutex_enter(&vhc->vhc_lock);
7494 	}
7495 
7496 	vhc->vhc_flags &= ~MDI_VHC_EXIT;
7497 
7498 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc_next) {
7499 		acc_next = acc->acc_next;
7500 		free_async_client_config(acc);
7501 	}
7502 	vhc->vhc_acc_list_head = NULL;
7503 	vhc->vhc_acc_list_tail = NULL;
7504 	vhc->vhc_acc_count = 0;
7505 
7506 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
7507 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
7508 		mutex_exit(&vhc->vhc_lock);
7509 		if (flush_vhcache(vhc, 0) != MDI_SUCCESS) {
7510 			vhcache_dirty(vhc);
7511 			return (MDI_FAILURE);
7512 		}
7513 	} else
7514 		mutex_exit(&vhc->vhc_lock);
7515 
7516 	if (callb_delete(vhc->vhc_cbid) != 0)
7517 		return (MDI_FAILURE);
7518 
7519 	return (MDI_SUCCESS);
7520 }
7521 
7522 /*
7523  * Stop vhci cache flush thread
7524  */
7525 /* ARGSUSED */
7526 static boolean_t
7527 stop_vhcache_flush_thread(void *arg, int code)
7528 {
7529 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
7530 
7531 	mutex_enter(&vhc->vhc_lock);
7532 	vhc->vhc_flags |= MDI_VHC_EXIT;
7533 	cv_broadcast(&vhc->vhc_cv);
7534 
7535 	while (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
7536 		mutex_exit(&vhc->vhc_lock);
7537 		delay_random(mdi_delay);
7538 		mutex_enter(&vhc->vhc_lock);
7539 	}
7540 
7541 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
7542 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
7543 		mutex_exit(&vhc->vhc_lock);
7544 		(void) flush_vhcache(vhc, 1);
7545 	} else
7546 		mutex_exit(&vhc->vhc_lock);
7547 
7548 	return (B_TRUE);
7549 }
7550 
7551 /*
7552  * Enqueue the vhcache phci (cphci) at the tail of the list
7553  */
7554 static void
7555 enqueue_vhcache_phci(mdi_vhci_cache_t *vhcache, mdi_vhcache_phci_t *cphci)
7556 {
7557 	cphci->cphci_next = NULL;
7558 	if (vhcache->vhcache_phci_head == NULL)
7559 		vhcache->vhcache_phci_head = cphci;
7560 	else
7561 		vhcache->vhcache_phci_tail->cphci_next = cphci;
7562 	vhcache->vhcache_phci_tail = cphci;
7563 }
7564 
7565 /*
7566  * Enqueue the vhcache pathinfo (cpi) at the tail of the list
7567  */
7568 static void
7569 enqueue_tail_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7570     mdi_vhcache_pathinfo_t *cpi)
7571 {
7572 	cpi->cpi_next = NULL;
7573 	if (cct->cct_cpi_head == NULL)
7574 		cct->cct_cpi_head = cpi;
7575 	else
7576 		cct->cct_cpi_tail->cpi_next = cpi;
7577 	cct->cct_cpi_tail = cpi;
7578 }
7579 
7580 /*
7581  * Enqueue the vhcache pathinfo (cpi) at the correct location in the
7582  * ordered list. All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
7583  * flag set come at the beginning of the list. All cpis which have this
7584  * flag set come at the end of the list.
7585  */
7586 static void
7587 enqueue_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7588     mdi_vhcache_pathinfo_t *newcpi)
7589 {
7590 	mdi_vhcache_pathinfo_t *cpi, *prev_cpi;
7591 
7592 	if (cct->cct_cpi_head == NULL ||
7593 	    (newcpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))
7594 		enqueue_tail_vhcache_pathinfo(cct, newcpi);
7595 	else {
7596 		for (cpi = cct->cct_cpi_head, prev_cpi = NULL; cpi != NULL &&
7597 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST);
7598 		    prev_cpi = cpi, cpi = cpi->cpi_next)
7599 			;
7600 
7601 		if (prev_cpi == NULL)
7602 			cct->cct_cpi_head = newcpi;
7603 		else
7604 			prev_cpi->cpi_next = newcpi;
7605 
7606 		newcpi->cpi_next = cpi;
7607 
7608 		if (cpi == NULL)
7609 			cct->cct_cpi_tail = newcpi;
7610 	}
7611 }
7612 
7613 /*
7614  * Enqueue the vhcache client (cct) at the tail of the list
7615  */
7616 static void
7617 enqueue_vhcache_client(mdi_vhci_cache_t *vhcache,
7618     mdi_vhcache_client_t *cct)
7619 {
7620 	cct->cct_next = NULL;
7621 	if (vhcache->vhcache_client_head == NULL)
7622 		vhcache->vhcache_client_head = cct;
7623 	else
7624 		vhcache->vhcache_client_tail->cct_next = cct;
7625 	vhcache->vhcache_client_tail = cct;
7626 }
7627 
/*
 * free_string_array():
 *		Free an array of nelem kmem-allocated strings along with the
 *		array itself. NULL elements are skipped and a NULL array is
 *		ignored. Each string is freed as strlen() + 1 bytes.
 */
static void
free_string_array(char **str, int nelem)
{
	int	idx;

	if (str == NULL)
		return;

	for (idx = 0; idx < nelem; idx++) {
		char	*elem = str[idx];

		if (elem != NULL)
			kmem_free(elem, strlen(elem) + 1);
	}

	kmem_free(str, sizeof (char *) * nelem);
}
7641 
7642 static void
7643 free_vhcache_phci(mdi_vhcache_phci_t *cphci)
7644 {
7645 	kmem_free(cphci->cphci_path, strlen(cphci->cphci_path) + 1);
7646 	kmem_free(cphci, sizeof (*cphci));
7647 }
7648 
7649 static void
7650 free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *cpi)
7651 {
7652 	kmem_free(cpi->cpi_addr, strlen(cpi->cpi_addr) + 1);
7653 	kmem_free(cpi, sizeof (*cpi));
7654 }
7655 
7656 static void
7657 free_vhcache_client(mdi_vhcache_client_t *cct)
7658 {
7659 	kmem_free(cct->cct_name_addr, strlen(cct->cct_name_addr) + 1);
7660 	kmem_free(cct, sizeof (*cct));
7661 }
7662 
7663 static char *
7664 vhcache_mknameaddr(char *ct_name, char *ct_addr, int *ret_len)
7665 {
7666 	char *name_addr;
7667 	int len;
7668 
7669 	len = strlen(ct_name) + strlen(ct_addr) + 2;
7670 	name_addr = kmem_alloc(len, KM_SLEEP);
7671 	(void) snprintf(name_addr, len, "%s@%s", ct_name, ct_addr);
7672 
7673 	if (ret_len)
7674 		*ret_len = len;
7675 	return (name_addr);
7676 }
7677 
7678 /*
7679  * Copy the contents of paddrnvl to vhci cache.
7680  * paddrnvl nvlist contains path information for a vhci client.
7681  * See the comment in mainnvl_to_vhcache() for the format of this nvlist.
7682  */
7683 static void
7684 paddrnvl_to_vhcache(nvlist_t *nvl, mdi_vhcache_phci_t *cphci_list[],
7685     mdi_vhcache_client_t *cct)
7686 {
7687 	nvpair_t *nvp = NULL;
7688 	mdi_vhcache_pathinfo_t *cpi;
7689 	uint_t nelem;
7690 	uint32_t *val;
7691 
7692 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7693 		ASSERT(nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY);
7694 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
7695 		cpi->cpi_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7696 		(void) nvpair_value_uint32_array(nvp, &val, &nelem);
7697 		ASSERT(nelem == 2);
7698 		cpi->cpi_cphci = cphci_list[val[0]];
7699 		cpi->cpi_flags = val[1];
7700 		enqueue_tail_vhcache_pathinfo(cct, cpi);
7701 	}
7702 }
7703 
7704 /*
7705  * Copy the contents of caddrmapnvl to vhci cache.
7706  * caddrmapnvl nvlist contains vhci client address to phci client address
7707  * mappings. See the comment in mainnvl_to_vhcache() for the format of
7708  * this nvlist.
7709  */
7710 static void
7711 caddrmapnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl,
7712     mdi_vhcache_phci_t *cphci_list[])
7713 {
7714 	nvpair_t *nvp = NULL;
7715 	nvlist_t *paddrnvl;
7716 	mdi_vhcache_client_t *cct;
7717 
7718 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7719 		ASSERT(nvpair_type(nvp) == DATA_TYPE_NVLIST);
7720 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
7721 		cct->cct_name_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7722 		(void) nvpair_value_nvlist(nvp, &paddrnvl);
7723 		paddrnvl_to_vhcache(paddrnvl, cphci_list, cct);
7724 		/* the client must contain at least one path */
7725 		ASSERT(cct->cct_cpi_head != NULL);
7726 
7727 		enqueue_vhcache_client(vhcache, cct);
7728 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
7729 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
7730 	}
7731 }
7732 
7733 /*
7734  * Copy the contents of the main nvlist to vhci cache.
7735  *
7736  * VHCI busconfig cached data is stored in the form of a nvlist on the disk.
7737  * The nvlist contains the mappings between the vhci client addresses and
7738  * their corresponding phci client addresses.
7739  *
7740  * The structure of the nvlist is as follows:
7741  *
7742  * Main nvlist:
7743  *	NAME		TYPE		DATA
7744  *	version		int32		version number
7745  *	phcis		string array	array of phci paths
7746  *	clientaddrmap	nvlist_t	c2paddrs_nvl (see below)
7747  *
7748  * structure of c2paddrs_nvl:
7749  *	NAME		TYPE		DATA
7750  *	caddr1		nvlist_t	paddrs_nvl1
7751  *	caddr2		nvlist_t	paddrs_nvl2
7752  *	...
7753  * where caddr1, caddr2, ... are vhci client name and addresses in the
7754  * form of "<clientname>@<clientaddress>".
7755  * (for example: "ssd@2000002037cd9f72");
7756  * paddrs_nvl1, paddrs_nvl2, .. are nvlists that contain path information.
7757  *
7758  * structure of paddrs_nvl:
7759  *	NAME		TYPE		DATA
7760  *	pi_addr1	uint32_array	(phci-id, cpi_flags)
7761  *	pi_addr2	uint32_array	(phci-id, cpi_flags)
7762  *	...
7763  * where pi_addr1, pi_addr2, ... are bus specific addresses of pathinfo nodes
7764  * (so called pi_addrs, for example: "w2100002037cd9f72,0");
7765  * phci-ids are integers that identify pHCIs to which the
7766  * the bus specific address belongs to. These integers are used as an index
7767  * into to the phcis string array in the main nvlist to get the pHCI path.
7768  */
7769 static int
7770 mainnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl)
7771 {
7772 	char **phcis, **phci_namep;
7773 	uint_t nphcis;
7774 	mdi_vhcache_phci_t *cphci, **cphci_list;
7775 	nvlist_t *caddrmapnvl;
7776 	int32_t ver;
7777 	int i;
7778 	size_t cphci_list_size;
7779 
7780 	ASSERT(RW_WRITE_HELD(&vhcache->vhcache_lock));
7781 
7782 	if (nvlist_lookup_int32(nvl, MDI_NVPNAME_VERSION, &ver) != 0 ||
7783 	    ver != MDI_VHCI_CACHE_VERSION)
7784 		return (MDI_FAILURE);
7785 
7786 	if (nvlist_lookup_string_array(nvl, MDI_NVPNAME_PHCIS, &phcis,
7787 	    &nphcis) != 0)
7788 		return (MDI_SUCCESS);
7789 
7790 	ASSERT(nphcis > 0);
7791 
7792 	cphci_list_size = sizeof (mdi_vhcache_phci_t *) * nphcis;
7793 	cphci_list = kmem_alloc(cphci_list_size, KM_SLEEP);
7794 	for (i = 0, phci_namep = phcis; i < nphcis; i++, phci_namep++) {
7795 		cphci = kmem_zalloc(sizeof (mdi_vhcache_phci_t), KM_SLEEP);
7796 		cphci->cphci_path = i_ddi_strdup(*phci_namep, KM_SLEEP);
7797 		enqueue_vhcache_phci(vhcache, cphci);
7798 		cphci_list[i] = cphci;
7799 	}
7800 
7801 	ASSERT(vhcache->vhcache_phci_head != NULL);
7802 
7803 	if (nvlist_lookup_nvlist(nvl, MDI_NVPNAME_CTADDRMAP, &caddrmapnvl) == 0)
7804 		caddrmapnvl_to_vhcache(vhcache, caddrmapnvl, cphci_list);
7805 
7806 	kmem_free(cphci_list, cphci_list_size);
7807 	return (MDI_SUCCESS);
7808 }
7809 
7810 /*
7811  * Build paddrnvl for the specified client using the information in the
7812  * vhci cache and add it to the caddrmapnnvl.
7813  * Returns 0 on success, errno on failure.
7814  */
7815 static int
7816 vhcache_to_paddrnvl(mdi_vhci_cache_t *vhcache, mdi_vhcache_client_t *cct,
7817     nvlist_t *caddrmapnvl)
7818 {
7819 	mdi_vhcache_pathinfo_t *cpi;
7820 	nvlist_t *nvl;
7821 	int err;
7822 	uint32_t val[2];
7823 
7824 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7825 
7826 	if ((err = nvlist_alloc(&nvl, 0, KM_SLEEP)) != 0)
7827 		return (err);
7828 
7829 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7830 		val[0] = cpi->cpi_cphci->cphci_id;
7831 		val[1] = cpi->cpi_flags;
7832 		if ((err = nvlist_add_uint32_array(nvl, cpi->cpi_addr, val, 2))
7833 		    != 0)
7834 			goto out;
7835 	}
7836 
7837 	err = nvlist_add_nvlist(caddrmapnvl, cct->cct_name_addr, nvl);
7838 out:
7839 	nvlist_free(nvl);
7840 	return (err);
7841 }
7842 
7843 /*
7844  * Build caddrmapnvl using the information in the vhci cache
7845  * and add it to the mainnvl.
7846  * Returns 0 on success, errno on failure.
7847  */
7848 static int
7849 vhcache_to_caddrmapnvl(mdi_vhci_cache_t *vhcache, nvlist_t *mainnvl)
7850 {
7851 	mdi_vhcache_client_t *cct;
7852 	nvlist_t *nvl;
7853 	int err;
7854 
7855 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7856 
7857 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0)
7858 		return (err);
7859 
7860 	for (cct = vhcache->vhcache_client_head; cct != NULL;
7861 	    cct = cct->cct_next) {
7862 		if ((err = vhcache_to_paddrnvl(vhcache, cct, nvl)) != 0)
7863 			goto out;
7864 	}
7865 
7866 	err = nvlist_add_nvlist(mainnvl, MDI_NVPNAME_CTADDRMAP, nvl);
7867 out:
7868 	nvlist_free(nvl);
7869 	return (err);
7870 }
7871 
7872 /*
7873  * Build nvlist using the information in the vhci cache.
7874  * See the comment in mainnvl_to_vhcache() for the format of the nvlist.
7875  * Returns nvl on success, NULL on failure.
7876  */
7877 static nvlist_t *
7878 vhcache_to_mainnvl(mdi_vhci_cache_t *vhcache)
7879 {
7880 	mdi_vhcache_phci_t *cphci;
7881 	uint_t phci_count;
7882 	char **phcis;
7883 	nvlist_t *nvl;
7884 	int err, i;
7885 
7886 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0) {
7887 		nvl = NULL;
7888 		goto out;
7889 	}
7890 
7891 	if ((err = nvlist_add_int32(nvl, MDI_NVPNAME_VERSION,
7892 	    MDI_VHCI_CACHE_VERSION)) != 0)
7893 		goto out;
7894 
7895 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7896 	if (vhcache->vhcache_phci_head == NULL) {
7897 		rw_exit(&vhcache->vhcache_lock);
7898 		return (nvl);
7899 	}
7900 
7901 	phci_count = 0;
7902 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7903 	    cphci = cphci->cphci_next)
7904 		cphci->cphci_id = phci_count++;
7905 
7906 	/* build phci pathname list */
7907 	phcis = kmem_alloc(sizeof (char *) * phci_count, KM_SLEEP);
7908 	for (cphci = vhcache->vhcache_phci_head, i = 0; cphci != NULL;
7909 	    cphci = cphci->cphci_next, i++)
7910 		phcis[i] = i_ddi_strdup(cphci->cphci_path, KM_SLEEP);
7911 
7912 	err = nvlist_add_string_array(nvl, MDI_NVPNAME_PHCIS, phcis,
7913 	    phci_count);
7914 	free_string_array(phcis, phci_count);
7915 
7916 	if (err == 0 &&
7917 	    (err = vhcache_to_caddrmapnvl(vhcache, nvl)) == 0) {
7918 		rw_exit(&vhcache->vhcache_lock);
7919 		return (nvl);
7920 	}
7921 
7922 	rw_exit(&vhcache->vhcache_lock);
7923 out:
7924 	if (nvl)
7925 		nvlist_free(nvl);
7926 	return (NULL);
7927 }
7928 
7929 /*
7930  * Lookup vhcache phci structure for the specified phci path.
7931  */
7932 static mdi_vhcache_phci_t *
7933 lookup_vhcache_phci_by_name(mdi_vhci_cache_t *vhcache, char *phci_path)
7934 {
7935 	mdi_vhcache_phci_t *cphci;
7936 
7937 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7938 
7939 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7940 	    cphci = cphci->cphci_next) {
7941 		if (strcmp(cphci->cphci_path, phci_path) == 0)
7942 			return (cphci);
7943 	}
7944 
7945 	return (NULL);
7946 }
7947 
7948 /*
7949  * Lookup vhcache phci structure for the specified phci.
7950  */
7951 static mdi_vhcache_phci_t *
7952 lookup_vhcache_phci_by_addr(mdi_vhci_cache_t *vhcache, mdi_phci_t *ph)
7953 {
7954 	mdi_vhcache_phci_t *cphci;
7955 
7956 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7957 
7958 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7959 	    cphci = cphci->cphci_next) {
7960 		if (cphci->cphci_phci == ph)
7961 			return (cphci);
7962 	}
7963 
7964 	return (NULL);
7965 }
7966 
7967 /*
7968  * Add the specified phci to the vhci cache if not already present.
7969  */
7970 static void
7971 vhcache_phci_add(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7972 {
7973 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7974 	mdi_vhcache_phci_t *cphci;
7975 	char *pathname;
7976 	int cache_updated;
7977 
7978 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7979 
7980 	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
7981 	(void) ddi_pathname(ph->ph_dip, pathname);
7982 	if ((cphci = lookup_vhcache_phci_by_name(vhcache, pathname))
7983 	    != NULL) {
7984 		cphci->cphci_phci = ph;
7985 		cache_updated = 0;
7986 	} else {
7987 		cphci = kmem_zalloc(sizeof (*cphci), KM_SLEEP);
7988 		cphci->cphci_path = i_ddi_strdup(pathname, KM_SLEEP);
7989 		cphci->cphci_phci = ph;
7990 		enqueue_vhcache_phci(vhcache, cphci);
7991 		cache_updated = 1;
7992 	}
7993 
7994 	rw_exit(&vhcache->vhcache_lock);
7995 
7996 	/*
7997 	 * Since a new phci has been added, reset
7998 	 * vhc_path_discovery_cutoff_time to allow for discovery of paths
7999 	 * during next vhcache_discover_paths().
8000 	 */
8001 	mutex_enter(&vhc->vhc_lock);
8002 	vhc->vhc_path_discovery_cutoff_time = 0;
8003 	mutex_exit(&vhc->vhc_lock);
8004 
8005 	kmem_free(pathname, MAXPATHLEN);
8006 	if (cache_updated)
8007 		vhcache_dirty(vhc);
8008 }
8009 
8010 /*
8011  * Remove the reference to the specified phci from the vhci cache.
8012  */
8013 static void
8014 vhcache_phci_remove(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
8015 {
8016 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8017 	mdi_vhcache_phci_t *cphci;
8018 
8019 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8020 	if ((cphci = lookup_vhcache_phci_by_addr(vhcache, ph)) != NULL) {
8021 		/* do not remove the actual mdi_vhcache_phci structure */
8022 		cphci->cphci_phci = NULL;
8023 	}
8024 	rw_exit(&vhcache->vhcache_lock);
8025 }
8026 
8027 static void
8028 init_vhcache_lookup_token(mdi_vhcache_lookup_token_t *dst,
8029     mdi_vhcache_lookup_token_t *src)
8030 {
8031 	if (src == NULL) {
8032 		dst->lt_cct = NULL;
8033 		dst->lt_cct_lookup_time = 0;
8034 	} else {
8035 		dst->lt_cct = src->lt_cct;
8036 		dst->lt_cct_lookup_time = src->lt_cct_lookup_time;
8037 	}
8038 }
8039 
8040 /*
8041  * Look up vhcache client for the specified client.
8042  */
8043 static mdi_vhcache_client_t *
8044 lookup_vhcache_client(mdi_vhci_cache_t *vhcache, char *ct_name, char *ct_addr,
8045     mdi_vhcache_lookup_token_t *token)
8046 {
8047 	mod_hash_val_t hv;
8048 	char *name_addr;
8049 	int len;
8050 
8051 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
8052 
8053 	/*
8054 	 * If no vhcache clean occurred since the last lookup, we can
8055 	 * simply return the cct from the last lookup operation.
8056 	 * It works because ccts are never freed except during the vhcache
8057 	 * cleanup operation.
8058 	 */
8059 	if (token != NULL &&
8060 	    vhcache->vhcache_clean_time < token->lt_cct_lookup_time)
8061 		return (token->lt_cct);
8062 
8063 	name_addr = vhcache_mknameaddr(ct_name, ct_addr, &len);
8064 	if (mod_hash_find(vhcache->vhcache_client_hash,
8065 	    (mod_hash_key_t)name_addr, &hv) == 0) {
8066 		if (token) {
8067 			token->lt_cct = (mdi_vhcache_client_t *)hv;
8068 			token->lt_cct_lookup_time = ddi_get_lbolt64();
8069 		}
8070 	} else {
8071 		if (token) {
8072 			token->lt_cct = NULL;
8073 			token->lt_cct_lookup_time = 0;
8074 		}
8075 		hv = NULL;
8076 	}
8077 	kmem_free(name_addr, len);
8078 	return ((mdi_vhcache_client_t *)hv);
8079 }
8080 
8081 /*
8082  * Add the specified path to the vhci cache if not already present.
8083  * Also add the vhcache client for the client corresponding to this path
8084  * if it doesn't already exist.
8085  */
8086 static void
8087 vhcache_pi_add(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
8088 {
8089 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8090 	mdi_vhcache_client_t *cct;
8091 	mdi_vhcache_pathinfo_t *cpi;
8092 	mdi_phci_t *ph = pip->pi_phci;
8093 	mdi_client_t *ct = pip->pi_client;
8094 	int cache_updated = 0;
8095 
8096 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8097 
8098 	/* if vhcache client for this pip doesn't already exist, add it */
8099 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
8100 	    NULL)) == NULL) {
8101 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
8102 		cct->cct_name_addr = vhcache_mknameaddr(ct->ct_drvname,
8103 		    ct->ct_guid, NULL);
8104 		enqueue_vhcache_client(vhcache, cct);
8105 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
8106 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
8107 		cache_updated = 1;
8108 	}
8109 
8110 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8111 		if (cpi->cpi_cphci->cphci_phci == ph &&
8112 		    strcmp(cpi->cpi_addr, pip->pi_addr) == 0) {
8113 			cpi->cpi_pip = pip;
8114 			if (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST) {
8115 				cpi->cpi_flags &=
8116 				    ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8117 				sort_vhcache_paths(cct);
8118 				cache_updated = 1;
8119 			}
8120 			break;
8121 		}
8122 	}
8123 
8124 	if (cpi == NULL) {
8125 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
8126 		cpi->cpi_addr = i_ddi_strdup(pip->pi_addr, KM_SLEEP);
8127 		cpi->cpi_cphci = lookup_vhcache_phci_by_addr(vhcache, ph);
8128 		ASSERT(cpi->cpi_cphci != NULL);
8129 		cpi->cpi_pip = pip;
8130 		enqueue_vhcache_pathinfo(cct, cpi);
8131 		cache_updated = 1;
8132 	}
8133 
8134 	rw_exit(&vhcache->vhcache_lock);
8135 
8136 	if (cache_updated)
8137 		vhcache_dirty(vhc);
8138 }
8139 
8140 /*
8141  * Remove the reference to the specified path from the vhci cache.
8142  */
8143 static void
8144 vhcache_pi_remove(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
8145 {
8146 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8147 	mdi_client_t *ct = pip->pi_client;
8148 	mdi_vhcache_client_t *cct;
8149 	mdi_vhcache_pathinfo_t *cpi;
8150 
8151 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8152 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
8153 	    NULL)) != NULL) {
8154 		for (cpi = cct->cct_cpi_head; cpi != NULL;
8155 		    cpi = cpi->cpi_next) {
8156 			if (cpi->cpi_pip == pip) {
8157 				cpi->cpi_pip = NULL;
8158 				break;
8159 			}
8160 		}
8161 	}
8162 	rw_exit(&vhcache->vhcache_lock);
8163 }
8164 
8165 /*
8166  * Flush the vhci cache to disk.
8167  * Returns MDI_SUCCESS on success, MDI_FAILURE on failure.
8168  */
8169 static int
8170 flush_vhcache(mdi_vhci_config_t *vhc, int force_flag)
8171 {
8172 	nvlist_t *nvl;
8173 	int err;
8174 	int rv;
8175 
8176 	/*
8177 	 * It is possible that the system may shutdown before
8178 	 * i_ddi_io_initialized (during stmsboot for example). To allow for
8179 	 * flushing the cache in this case do not check for
8180 	 * i_ddi_io_initialized when force flag is set.
8181 	 */
8182 	if (force_flag == 0 && !i_ddi_io_initialized())
8183 		return (MDI_FAILURE);
8184 
8185 	if ((nvl = vhcache_to_mainnvl(&vhc->vhc_vhcache)) != NULL) {
8186 		err = fwrite_nvlist(vhc->vhc_vhcache_filename, nvl);
8187 		nvlist_free(nvl);
8188 	} else
8189 		err = EFAULT;
8190 
8191 	rv = MDI_SUCCESS;
8192 	mutex_enter(&vhc->vhc_lock);
8193 	if (err != 0) {
8194 		if (err == EROFS) {
8195 			vhc->vhc_flags |= MDI_VHC_READONLY_FS;
8196 			vhc->vhc_flags &= ~(MDI_VHC_VHCACHE_FLUSH_ERROR |
8197 			    MDI_VHC_VHCACHE_DIRTY);
8198 		} else {
8199 			if (!(vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR)) {
8200 				cmn_err(CE_CONT, "%s: update failed\n",
8201 				    vhc->vhc_vhcache_filename);
8202 				vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_ERROR;
8203 			}
8204 			rv = MDI_FAILURE;
8205 		}
8206 	} else if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR) {
8207 		cmn_err(CE_CONT,
8208 		    "%s: update now ok\n", vhc->vhc_vhcache_filename);
8209 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_ERROR;
8210 	}
8211 	mutex_exit(&vhc->vhc_lock);
8212 
8213 	return (rv);
8214 }
8215 
8216 /*
8217  * Call flush_vhcache() to flush the vhci cache at the scheduled time.
8218  * Exits itself if left idle for the idle timeout period.
8219  */
8220 static void
8221 vhcache_flush_thread(void *arg)
8222 {
8223 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
8224 	clock_t idle_time, quit_at_ticks;
8225 	callb_cpr_t cprinfo;
8226 
8227 	/* number of seconds to sleep idle before exiting */
8228 	idle_time = mdi_vhcache_flush_daemon_idle_time * TICKS_PER_SECOND;
8229 
8230 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
8231 	    "mdi_vhcache_flush");
8232 	mutex_enter(&vhc->vhc_lock);
8233 	for (; ; ) {
8234 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8235 		    (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY)) {
8236 			if (ddi_get_lbolt() < vhc->vhc_flush_at_ticks) {
8237 				CALLB_CPR_SAFE_BEGIN(&cprinfo);
8238 				(void) cv_timedwait(&vhc->vhc_cv,
8239 				    &vhc->vhc_lock, vhc->vhc_flush_at_ticks);
8240 				CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8241 			} else {
8242 				vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
8243 				mutex_exit(&vhc->vhc_lock);
8244 
8245 				if (flush_vhcache(vhc, 0) != MDI_SUCCESS)
8246 					vhcache_dirty(vhc);
8247 
8248 				mutex_enter(&vhc->vhc_lock);
8249 			}
8250 		}
8251 
8252 		quit_at_ticks = ddi_get_lbolt() + idle_time;
8253 
8254 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8255 		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) &&
8256 		    ddi_get_lbolt() < quit_at_ticks) {
8257 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
8258 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
8259 			    quit_at_ticks);
8260 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8261 		}
8262 
8263 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
8264 		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY))
8265 			goto out;
8266 	}
8267 
8268 out:
8269 	vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_THREAD;
8270 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
8271 	CALLB_CPR_EXIT(&cprinfo);
8272 }
8273 
8274 /*
8275  * Make vhci cache dirty and schedule flushing by vhcache flush thread.
8276  */
8277 static void
8278 vhcache_dirty(mdi_vhci_config_t *vhc)
8279 {
8280 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8281 	int create_thread;
8282 
8283 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8284 	/* do not flush cache until the cache is fully built */
8285 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
8286 		rw_exit(&vhcache->vhcache_lock);
8287 		return;
8288 	}
8289 	rw_exit(&vhcache->vhcache_lock);
8290 
8291 	mutex_enter(&vhc->vhc_lock);
8292 	if (vhc->vhc_flags & MDI_VHC_READONLY_FS) {
8293 		mutex_exit(&vhc->vhc_lock);
8294 		return;
8295 	}
8296 
8297 	vhc->vhc_flags |= MDI_VHC_VHCACHE_DIRTY;
8298 	vhc->vhc_flush_at_ticks = ddi_get_lbolt() +
8299 	    mdi_vhcache_flush_delay * TICKS_PER_SECOND;
8300 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
8301 		cv_broadcast(&vhc->vhc_cv);
8302 		create_thread = 0;
8303 	} else {
8304 		vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_THREAD;
8305 		create_thread = 1;
8306 	}
8307 	mutex_exit(&vhc->vhc_lock);
8308 
8309 	if (create_thread)
8310 		(void) thread_create(NULL, 0, vhcache_flush_thread, vhc,
8311 		    0, &p0, TS_RUN, minclsyspri);
8312 }
8313 
8314 /*
8315  * phci bus config structure - one for for each phci bus config operation that
8316  * we initiate on behalf of a vhci.
8317  */
8318 typedef struct mdi_phci_bus_config_s {
8319 	char *phbc_phci_path;
8320 	struct mdi_vhci_bus_config_s *phbc_vhbusconfig;	/* vhci bus config */
8321 	struct mdi_phci_bus_config_s *phbc_next;
8322 } mdi_phci_bus_config_t;
8323 
8324 /* vhci bus config structure - one for each vhci bus config operation */
8325 typedef struct mdi_vhci_bus_config_s {
8326 	ddi_bus_config_op_t vhbc_op;	/* bus config op */
8327 	major_t vhbc_op_major;		/* bus config op major */
8328 	uint_t vhbc_op_flags;		/* bus config op flags */
8329 	kmutex_t vhbc_lock;
8330 	kcondvar_t vhbc_cv;
8331 	int vhbc_thr_count;
8332 } mdi_vhci_bus_config_t;
8333 
8334 /*
8335  * bus config the specified phci
8336  */
8337 static void
8338 bus_config_phci(void *arg)
8339 {
8340 	mdi_phci_bus_config_t *phbc = (mdi_phci_bus_config_t *)arg;
8341 	mdi_vhci_bus_config_t *vhbc = phbc->phbc_vhbusconfig;
8342 	dev_info_t *ph_dip;
8343 
8344 	/*
8345 	 * first configure all path components upto phci and then configure
8346 	 * the phci children.
8347 	 */
8348 	if ((ph_dip = e_ddi_hold_devi_by_path(phbc->phbc_phci_path, 0))
8349 	    != NULL) {
8350 		if (vhbc->vhbc_op == BUS_CONFIG_DRIVER ||
8351 		    vhbc->vhbc_op == BUS_UNCONFIG_DRIVER) {
8352 			(void) ndi_devi_config_driver(ph_dip,
8353 			    vhbc->vhbc_op_flags,
8354 			    vhbc->vhbc_op_major);
8355 		} else
8356 			(void) ndi_devi_config(ph_dip,
8357 			    vhbc->vhbc_op_flags);
8358 
8359 		/* release the hold that e_ddi_hold_devi_by_path() placed */
8360 		ndi_rele_devi(ph_dip);
8361 	}
8362 
8363 	kmem_free(phbc->phbc_phci_path, strlen(phbc->phbc_phci_path) + 1);
8364 	kmem_free(phbc, sizeof (*phbc));
8365 
8366 	mutex_enter(&vhbc->vhbc_lock);
8367 	vhbc->vhbc_thr_count--;
8368 	if (vhbc->vhbc_thr_count == 0)
8369 		cv_broadcast(&vhbc->vhbc_cv);
8370 	mutex_exit(&vhbc->vhbc_lock);
8371 }
8372 
8373 /*
8374  * Bus config all phcis associated with the vhci in parallel.
8375  * op must be BUS_CONFIG_DRIVER or BUS_CONFIG_ALL.
8376  */
8377 static void
8378 bus_config_all_phcis(mdi_vhci_cache_t *vhcache, uint_t flags,
8379     ddi_bus_config_op_t op, major_t maj)
8380 {
8381 	mdi_phci_bus_config_t *phbc_head = NULL, *phbc, *phbc_next;
8382 	mdi_vhci_bus_config_t *vhbc;
8383 	mdi_vhcache_phci_t *cphci;
8384 
8385 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8386 	if (vhcache->vhcache_phci_head == NULL) {
8387 		rw_exit(&vhcache->vhcache_lock);
8388 		return;
8389 	}
8390 
8391 	vhbc = kmem_zalloc(sizeof (*vhbc), KM_SLEEP);
8392 
8393 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
8394 	    cphci = cphci->cphci_next) {
8395 		/* skip phcis that haven't attached before root is available */
8396 		if (!modrootloaded && (cphci->cphci_phci == NULL))
8397 			continue;
8398 		phbc = kmem_zalloc(sizeof (*phbc), KM_SLEEP);
8399 		phbc->phbc_phci_path = i_ddi_strdup(cphci->cphci_path,
8400 		    KM_SLEEP);
8401 		phbc->phbc_vhbusconfig = vhbc;
8402 		phbc->phbc_next = phbc_head;
8403 		phbc_head = phbc;
8404 		vhbc->vhbc_thr_count++;
8405 	}
8406 	rw_exit(&vhcache->vhcache_lock);
8407 
8408 	vhbc->vhbc_op = op;
8409 	vhbc->vhbc_op_major = maj;
8410 	vhbc->vhbc_op_flags = NDI_NO_EVENT |
8411 	    (flags & (NDI_CONFIG_REPROBE | NDI_DRV_CONF_REPROBE));
8412 	mutex_init(&vhbc->vhbc_lock, NULL, MUTEX_DEFAULT, NULL);
8413 	cv_init(&vhbc->vhbc_cv, NULL, CV_DRIVER, NULL);
8414 
8415 	/* now create threads to initiate bus config on all phcis in parallel */
8416 	for (phbc = phbc_head; phbc != NULL; phbc = phbc_next) {
8417 		phbc_next = phbc->phbc_next;
8418 		if (mdi_mtc_off)
8419 			bus_config_phci((void *)phbc);
8420 		else
8421 			(void) thread_create(NULL, 0, bus_config_phci, phbc,
8422 			    0, &p0, TS_RUN, minclsyspri);
8423 	}
8424 
8425 	mutex_enter(&vhbc->vhbc_lock);
8426 	/* wait until all threads exit */
8427 	while (vhbc->vhbc_thr_count > 0)
8428 		cv_wait(&vhbc->vhbc_cv, &vhbc->vhbc_lock);
8429 	mutex_exit(&vhbc->vhbc_lock);
8430 
8431 	mutex_destroy(&vhbc->vhbc_lock);
8432 	cv_destroy(&vhbc->vhbc_cv);
8433 	kmem_free(vhbc, sizeof (*vhbc));
8434 }
8435 
8436 /*
8437  * Single threaded version of bus_config_all_phcis()
8438  */
8439 static void
8440 st_bus_config_all_phcis(mdi_vhci_config_t *vhc, uint_t flags,
8441     ddi_bus_config_op_t op, major_t maj)
8442 {
8443 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8444 
8445 	single_threaded_vhconfig_enter(vhc);
8446 	bus_config_all_phcis(vhcache, flags, op, maj);
8447 	single_threaded_vhconfig_exit(vhc);
8448 }
8449 
8450 /*
8451  * Perform BUS_CONFIG_ONE on the specified child of the phci.
8452  * The path includes the child component in addition to the phci path.
8453  */
8454 static int
8455 bus_config_one_phci_child(char *path)
8456 {
8457 	dev_info_t *ph_dip, *child;
8458 	char *devnm;
8459 	int rv = MDI_FAILURE;
8460 
8461 	/* extract the child component of the phci */
8462 	devnm = strrchr(path, '/');
8463 	*devnm++ = '\0';
8464 
8465 	/*
8466 	 * first configure all path components upto phci and then
8467 	 * configure the phci child.
8468 	 */
8469 	if ((ph_dip = e_ddi_hold_devi_by_path(path, 0)) != NULL) {
8470 		if (ndi_devi_config_one(ph_dip, devnm, &child, NDI_NO_EVENT) ==
8471 		    NDI_SUCCESS) {
8472 			/*
8473 			 * release the hold that ndi_devi_config_one() placed
8474 			 */
8475 			ndi_rele_devi(child);
8476 			rv = MDI_SUCCESS;
8477 		}
8478 
8479 		/* release the hold that e_ddi_hold_devi_by_path() placed */
8480 		ndi_rele_devi(ph_dip);
8481 	}
8482 
8483 	devnm--;
8484 	*devnm = '/';
8485 	return (rv);
8486 }
8487 
8488 /*
8489  * Build a list of phci client paths for the specified vhci client.
8490  * The list includes only those phci client paths which aren't configured yet.
8491  */
8492 static mdi_phys_path_t *
8493 build_phclient_path_list(mdi_vhcache_client_t *cct, char *ct_name)
8494 {
8495 	mdi_vhcache_pathinfo_t *cpi;
8496 	mdi_phys_path_t *pp_head = NULL, *pp_tail = NULL, *pp;
8497 	int config_path, len;
8498 
8499 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8500 		/*
8501 		 * include only those paths that aren't configured.
8502 		 */
8503 		config_path = 0;
8504 		if (cpi->cpi_pip == NULL)
8505 			config_path = 1;
8506 		else {
8507 			MDI_PI_LOCK(cpi->cpi_pip);
8508 			if (MDI_PI_IS_INIT(cpi->cpi_pip))
8509 				config_path = 1;
8510 			MDI_PI_UNLOCK(cpi->cpi_pip);
8511 		}
8512 
8513 		if (config_path) {
8514 			pp = kmem_alloc(sizeof (*pp), KM_SLEEP);
8515 			len = strlen(cpi->cpi_cphci->cphci_path) +
8516 			    strlen(ct_name) + strlen(cpi->cpi_addr) + 3;
8517 			pp->phys_path = kmem_alloc(len, KM_SLEEP);
8518 			(void) snprintf(pp->phys_path, len, "%s/%s@%s",
8519 			    cpi->cpi_cphci->cphci_path, ct_name,
8520 			    cpi->cpi_addr);
8521 			pp->phys_path_next = NULL;
8522 
8523 			if (pp_head == NULL)
8524 				pp_head = pp;
8525 			else
8526 				pp_tail->phys_path_next = pp;
8527 			pp_tail = pp;
8528 		}
8529 	}
8530 
8531 	return (pp_head);
8532 }
8533 
8534 /*
8535  * Free the memory allocated for phci client path list.
8536  */
8537 static void
8538 free_phclient_path_list(mdi_phys_path_t *pp_head)
8539 {
8540 	mdi_phys_path_t *pp, *pp_next;
8541 
8542 	for (pp = pp_head; pp != NULL; pp = pp_next) {
8543 		pp_next = pp->phys_path_next;
8544 		kmem_free(pp->phys_path, strlen(pp->phys_path) + 1);
8545 		kmem_free(pp, sizeof (*pp));
8546 	}
8547 }
8548 
8549 /*
8550  * Allocated async client structure and initialize with the specified values.
8551  */
8552 static mdi_async_client_config_t *
8553 alloc_async_client_config(char *ct_name, char *ct_addr,
8554     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8555 {
8556 	mdi_async_client_config_t *acc;
8557 
8558 	acc = kmem_alloc(sizeof (*acc), KM_SLEEP);
8559 	acc->acc_ct_name = i_ddi_strdup(ct_name, KM_SLEEP);
8560 	acc->acc_ct_addr = i_ddi_strdup(ct_addr, KM_SLEEP);
8561 	acc->acc_phclient_path_list_head = pp_head;
8562 	init_vhcache_lookup_token(&acc->acc_token, tok);
8563 	acc->acc_next = NULL;
8564 	return (acc);
8565 }
8566 
8567 /*
8568  * Free the memory allocated for the async client structure and their members.
8569  */
8570 static void
8571 free_async_client_config(mdi_async_client_config_t *acc)
8572 {
8573 	if (acc->acc_phclient_path_list_head)
8574 		free_phclient_path_list(acc->acc_phclient_path_list_head);
8575 	kmem_free(acc->acc_ct_name, strlen(acc->acc_ct_name) + 1);
8576 	kmem_free(acc->acc_ct_addr, strlen(acc->acc_ct_addr) + 1);
8577 	kmem_free(acc, sizeof (*acc));
8578 }
8579 
8580 /*
8581  * Sort vhcache pathinfos (cpis) of the specified client.
8582  * All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
8583  * flag set come at the beginning of the list. All cpis which have this
8584  * flag set come at the end of the list.
8585  */
8586 static void
8587 sort_vhcache_paths(mdi_vhcache_client_t *cct)
8588 {
8589 	mdi_vhcache_pathinfo_t *cpi, *cpi_next, *cpi_head;
8590 
8591 	cpi_head = cct->cct_cpi_head;
8592 	cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
8593 	for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
8594 		cpi_next = cpi->cpi_next;
8595 		enqueue_vhcache_pathinfo(cct, cpi);
8596 	}
8597 }
8598 
8599 /*
8600  * Verify whether MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag setting is correct for
8601  * every vhcache pathinfo of the specified client. If not adjust the flag
8602  * setting appropriately.
8603  *
8604  * Note that MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag is persisted in the
8605  * on-disk vhci cache. So every time this flag is updated the cache must be
8606  * flushed.
8607  */
8608 static void
8609 adjust_sort_vhcache_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8610     mdi_vhcache_lookup_token_t *tok)
8611 {
8612 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8613 	mdi_vhcache_client_t *cct;
8614 	mdi_vhcache_pathinfo_t *cpi;
8615 
8616 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8617 	if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, tok))
8618 	    == NULL) {
8619 		rw_exit(&vhcache->vhcache_lock);
8620 		return;
8621 	}
8622 
8623 	/*
8624 	 * to avoid unnecessary on-disk cache updates, first check if an
8625 	 * update is really needed. If no update is needed simply return.
8626 	 */
8627 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8628 		if ((cpi->cpi_pip != NULL &&
8629 		    (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST)) ||
8630 		    (cpi->cpi_pip == NULL &&
8631 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))) {
8632 			break;
8633 		}
8634 	}
8635 	if (cpi == NULL) {
8636 		rw_exit(&vhcache->vhcache_lock);
8637 		return;
8638 	}
8639 
8640 	if (rw_tryupgrade(&vhcache->vhcache_lock) == 0) {
8641 		rw_exit(&vhcache->vhcache_lock);
8642 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8643 		if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr,
8644 		    tok)) == NULL) {
8645 			rw_exit(&vhcache->vhcache_lock);
8646 			return;
8647 		}
8648 	}
8649 
8650 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8651 		if (cpi->cpi_pip != NULL)
8652 			cpi->cpi_flags &= ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8653 		else
8654 			cpi->cpi_flags |= MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8655 	}
8656 	sort_vhcache_paths(cct);
8657 
8658 	rw_exit(&vhcache->vhcache_lock);
8659 	vhcache_dirty(vhc);
8660 }
8661 
8662 /*
8663  * Configure all specified paths of the client.
8664  */
8665 static void
8666 config_client_paths_sync(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8667     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8668 {
8669 	mdi_phys_path_t *pp;
8670 
8671 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next)
8672 		(void) bus_config_one_phci_child(pp->phys_path);
8673 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, tok);
8674 }
8675 
8676 /*
8677  * Dequeue elements from vhci async client config list and bus configure
8678  * their corresponding phci clients.
8679  */
8680 static void
8681 config_client_paths_thread(void *arg)
8682 {
8683 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
8684 	mdi_async_client_config_t *acc;
8685 	clock_t quit_at_ticks;
8686 	clock_t idle_time = mdi_async_config_idle_time * TICKS_PER_SECOND;
8687 	callb_cpr_t cprinfo;
8688 
8689 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
8690 	    "mdi_config_client_paths");
8691 
8692 	for (; ; ) {
8693 		quit_at_ticks = ddi_get_lbolt() + idle_time;
8694 
8695 		mutex_enter(&vhc->vhc_lock);
8696 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8697 		    vhc->vhc_acc_list_head == NULL &&
8698 		    ddi_get_lbolt() < quit_at_ticks) {
8699 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
8700 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
8701 			    quit_at_ticks);
8702 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8703 		}
8704 
8705 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
8706 		    vhc->vhc_acc_list_head == NULL)
8707 			goto out;
8708 
8709 		acc = vhc->vhc_acc_list_head;
8710 		vhc->vhc_acc_list_head = acc->acc_next;
8711 		if (vhc->vhc_acc_list_head == NULL)
8712 			vhc->vhc_acc_list_tail = NULL;
8713 		vhc->vhc_acc_count--;
8714 		mutex_exit(&vhc->vhc_lock);
8715 
8716 		config_client_paths_sync(vhc, acc->acc_ct_name,
8717 		    acc->acc_ct_addr, acc->acc_phclient_path_list_head,
8718 		    &acc->acc_token);
8719 
8720 		free_async_client_config(acc);
8721 	}
8722 
8723 out:
8724 	vhc->vhc_acc_thrcount--;
8725 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
8726 	CALLB_CPR_EXIT(&cprinfo);
8727 }
8728 
8729 /*
8730  * Arrange for all the phci client paths (pp_head) for the specified client
8731  * to be bus configured asynchronously by a thread.
8732  */
8733 static void
8734 config_client_paths_async(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8735     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8736 {
8737 	mdi_async_client_config_t *acc, *newacc;
8738 	int create_thread;
8739 
8740 	if (pp_head == NULL)
8741 		return;
8742 
8743 	if (mdi_mtc_off) {
8744 		config_client_paths_sync(vhc, ct_name, ct_addr, pp_head, tok);
8745 		free_phclient_path_list(pp_head);
8746 		return;
8747 	}
8748 
8749 	newacc = alloc_async_client_config(ct_name, ct_addr, pp_head, tok);
8750 	ASSERT(newacc);
8751 
8752 	mutex_enter(&vhc->vhc_lock);
8753 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc->acc_next) {
8754 		if (strcmp(ct_name, acc->acc_ct_name) == 0 &&
8755 		    strcmp(ct_addr, acc->acc_ct_addr) == 0) {
8756 			free_async_client_config(newacc);
8757 			mutex_exit(&vhc->vhc_lock);
8758 			return;
8759 		}
8760 	}
8761 
8762 	if (vhc->vhc_acc_list_head == NULL)
8763 		vhc->vhc_acc_list_head = newacc;
8764 	else
8765 		vhc->vhc_acc_list_tail->acc_next = newacc;
8766 	vhc->vhc_acc_list_tail = newacc;
8767 	vhc->vhc_acc_count++;
8768 	if (vhc->vhc_acc_count <= vhc->vhc_acc_thrcount) {
8769 		cv_broadcast(&vhc->vhc_cv);
8770 		create_thread = 0;
8771 	} else {
8772 		vhc->vhc_acc_thrcount++;
8773 		create_thread = 1;
8774 	}
8775 	mutex_exit(&vhc->vhc_lock);
8776 
8777 	if (create_thread)
8778 		(void) thread_create(NULL, 0, config_client_paths_thread, vhc,
8779 		    0, &p0, TS_RUN, minclsyspri);
8780 }
8781 
8782 /*
8783  * Return number of online paths for the specified client.
8784  */
8785 static int
8786 nonline_paths(mdi_vhcache_client_t *cct)
8787 {
8788 	mdi_vhcache_pathinfo_t *cpi;
8789 	int online_count = 0;
8790 
8791 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8792 		if (cpi->cpi_pip != NULL) {
8793 			MDI_PI_LOCK(cpi->cpi_pip);
8794 			if (cpi->cpi_pip->pi_state == MDI_PATHINFO_STATE_ONLINE)
8795 				online_count++;
8796 			MDI_PI_UNLOCK(cpi->cpi_pip);
8797 		}
8798 	}
8799 
8800 	return (online_count);
8801 }
8802 
8803 /*
8804  * Bus configure all paths for the specified vhci client.
8805  * If at least one path for the client is already online, the remaining paths
8806  * will be configured asynchronously. Otherwise, it synchronously configures
8807  * the paths until at least one path is online and then rest of the paths
8808  * will be configured asynchronously.
8809  */
8810 static void
8811 config_client_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr)
8812 {
8813 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8814 	mdi_phys_path_t *pp_head, *pp;
8815 	mdi_vhcache_client_t *cct;
8816 	mdi_vhcache_lookup_token_t tok;
8817 
8818 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
8819 
8820 	init_vhcache_lookup_token(&tok, NULL);
8821 
8822 	if (ct_name == NULL || ct_addr == NULL ||
8823 	    (cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, &tok))
8824 	    == NULL ||
8825 	    (pp_head = build_phclient_path_list(cct, ct_name)) == NULL) {
8826 		rw_exit(&vhcache->vhcache_lock);
8827 		return;
8828 	}
8829 
8830 	/* if at least one path is online, configure the rest asynchronously */
8831 	if (nonline_paths(cct) > 0) {
8832 		rw_exit(&vhcache->vhcache_lock);
8833 		config_client_paths_async(vhc, ct_name, ct_addr, pp_head, &tok);
8834 		return;
8835 	}
8836 
8837 	rw_exit(&vhcache->vhcache_lock);
8838 
8839 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next) {
8840 		if (bus_config_one_phci_child(pp->phys_path) == MDI_SUCCESS) {
8841 			rw_enter(&vhcache->vhcache_lock, RW_READER);
8842 
8843 			if ((cct = lookup_vhcache_client(vhcache, ct_name,
8844 			    ct_addr, &tok)) == NULL) {
8845 				rw_exit(&vhcache->vhcache_lock);
8846 				goto out;
8847 			}
8848 
8849 			if (nonline_paths(cct) > 0 &&
8850 			    pp->phys_path_next != NULL) {
8851 				rw_exit(&vhcache->vhcache_lock);
8852 				config_client_paths_async(vhc, ct_name, ct_addr,
8853 				    pp->phys_path_next, &tok);
8854 				pp->phys_path_next = NULL;
8855 				goto out;
8856 			}
8857 
8858 			rw_exit(&vhcache->vhcache_lock);
8859 		}
8860 	}
8861 
8862 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, &tok);
8863 out:
8864 	free_phclient_path_list(pp_head);
8865 }
8866 
8867 static void
8868 single_threaded_vhconfig_enter(mdi_vhci_config_t *vhc)
8869 {
8870 	mutex_enter(&vhc->vhc_lock);
8871 	while (vhc->vhc_flags & MDI_VHC_SINGLE_THREADED)
8872 		cv_wait(&vhc->vhc_cv, &vhc->vhc_lock);
8873 	vhc->vhc_flags |= MDI_VHC_SINGLE_THREADED;
8874 	mutex_exit(&vhc->vhc_lock);
8875 }
8876 
8877 static void
8878 single_threaded_vhconfig_exit(mdi_vhci_config_t *vhc)
8879 {
8880 	mutex_enter(&vhc->vhc_lock);
8881 	vhc->vhc_flags &= ~MDI_VHC_SINGLE_THREADED;
8882 	cv_broadcast(&vhc->vhc_cv);
8883 	mutex_exit(&vhc->vhc_lock);
8884 }
8885 
/*
 * Description of one phci driver: its name and whether it supports the
 * root device.
 */
typedef struct mdi_phci_driver_info {
	/* name of the phci driver */
	char	*phdriver_name;

	/* set to non zero if the phci driver supports root device */
	int	phdriver_root_support;
} mdi_phci_driver_info_t;
8892 
8893 /*
8894  * vhci class and root support capability of a phci driver can be
8895  * specified using ddi-vhci-class and ddi-no-root-support properties in the
8896  * phci driver.conf file. The built-in tables below contain this information
8897  * for those phci drivers whose driver.conf files don't yet contain this info.
8898  *
8899  * All phci drivers expect iscsi have root device support.
8900  */
8901 static mdi_phci_driver_info_t scsi_phci_driver_list[] = {
8902 	{ "fp", 1 },
8903 	{ "iscsi", 0 },
8904 	{ "ibsrp", 1 }
8905 	};
8906 
8907 static mdi_phci_driver_info_t ib_phci_driver_list[] = { "tavor", 1 };
8908 
8909 static void *
8910 mdi_realloc(void *old_ptr, size_t old_size, size_t new_size)
8911 {
8912 	void *new_ptr;
8913 
8914 	new_ptr = kmem_zalloc(new_size, KM_SLEEP);
8915 	if (old_ptr) {
8916 		bcopy(old_ptr, new_ptr, MIN(old_size, new_size));
8917 		kmem_free(old_ptr, old_size);
8918 	}
8919 	return (new_ptr);
8920 }
8921 
8922 static void
8923 add_to_phci_list(char ***driver_list, int **root_support_list,
8924     int *cur_elements, int *max_elements, char *driver_name, int root_support)
8925 {
8926 	ASSERT(*cur_elements <= *max_elements);
8927 	if (*cur_elements == *max_elements) {
8928 		*max_elements += 10;
8929 		*driver_list = mdi_realloc(*driver_list,
8930 		    sizeof (char *) * (*cur_elements),
8931 		    sizeof (char *) * (*max_elements));
8932 		*root_support_list = mdi_realloc(*root_support_list,
8933 		    sizeof (int) * (*cur_elements),
8934 		    sizeof (int) * (*max_elements));
8935 	}
8936 	(*driver_list)[*cur_elements] = i_ddi_strdup(driver_name, KM_SLEEP);
8937 	(*root_support_list)[*cur_elements] = root_support;
8938 	(*cur_elements)++;
8939 }
8940 
8941 static void
8942 get_phci_driver_list(char *vhci_class, char ***driver_list,
8943     int **root_support_list, int *cur_elements, int *max_elements)
8944 {
8945 	mdi_phci_driver_info_t	*st_driver_list, *p;
8946 	int		st_ndrivers, root_support, i, j, driver_conf_count;
8947 	major_t		m;
8948 	struct devnames	*dnp;
8949 	ddi_prop_t	*propp;
8950 
8951 	*driver_list = NULL;
8952 	*root_support_list = NULL;
8953 	*cur_elements = 0;
8954 	*max_elements = 0;
8955 
8956 	/* add the phci drivers derived from the phci driver.conf files */
8957 	for (m = 0; m < devcnt; m++) {
8958 		dnp = &devnamesp[m];
8959 
8960 		if (dnp->dn_flags & DN_PHCI_DRIVER) {
8961 			LOCK_DEV_OPS(&dnp->dn_lock);
8962 			if (dnp->dn_global_prop_ptr != NULL &&
8963 			    (propp = i_ddi_prop_search(DDI_DEV_T_ANY,
8964 			    DDI_VHCI_CLASS, DDI_PROP_TYPE_STRING,
8965 			    &dnp->dn_global_prop_ptr->prop_list)) != NULL &&
8966 			    strcmp(propp->prop_val, vhci_class) == 0) {
8967 
8968 				root_support = (i_ddi_prop_search(DDI_DEV_T_ANY,
8969 				    DDI_NO_ROOT_SUPPORT, DDI_PROP_TYPE_INT,
8970 				    &dnp->dn_global_prop_ptr->prop_list)
8971 				    == NULL) ? 1 : 0;
8972 
8973 				add_to_phci_list(driver_list, root_support_list,
8974 				    cur_elements, max_elements, dnp->dn_name,
8975 				    root_support);
8976 
8977 				UNLOCK_DEV_OPS(&dnp->dn_lock);
8978 			} else
8979 				UNLOCK_DEV_OPS(&dnp->dn_lock);
8980 		}
8981 	}
8982 
8983 	driver_conf_count = *cur_elements;
8984 
8985 	/* add the phci drivers specified in the built-in tables */
8986 	if (strcmp(vhci_class, MDI_HCI_CLASS_SCSI) == 0) {
8987 		st_driver_list = scsi_phci_driver_list;
8988 		st_ndrivers = sizeof (scsi_phci_driver_list) /
8989 		    sizeof (mdi_phci_driver_info_t);
8990 	} else if (strcmp(vhci_class, MDI_HCI_CLASS_IB) == 0) {
8991 		st_driver_list = ib_phci_driver_list;
8992 		st_ndrivers = sizeof (ib_phci_driver_list) /
8993 		    sizeof (mdi_phci_driver_info_t);
8994 	} else {
8995 		st_driver_list = NULL;
8996 		st_ndrivers = 0;
8997 	}
8998 
8999 	for (i = 0, p = st_driver_list; i < st_ndrivers; i++, p++) {
9000 		/* add this phci driver if not already added before */
9001 		for (j = 0; j < driver_conf_count; j++) {
9002 			if (strcmp((*driver_list)[j], p->phdriver_name) == 0)
9003 				break;
9004 		}
9005 		if (j == driver_conf_count) {
9006 			add_to_phci_list(driver_list, root_support_list,
9007 			    cur_elements, max_elements, p->phdriver_name,
9008 			    p->phdriver_root_support);
9009 		}
9010 	}
9011 }
9012 
9013 /*
9014  * Attach the phci driver instances associated with the specified vhci class.
9015  * If root is mounted attach all phci driver instances.
9016  * If root is not mounted, attach the instances of only those phci
9017  * drivers that have the root support.
9018  */
9019 static void
9020 attach_phci_drivers(char *vhci_class)
9021 {
9022 	char	**driver_list, **p;
9023 	int	*root_support_list;
9024 	int	cur_elements, max_elements, i;
9025 	major_t	m;
9026 
9027 	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
9028 	    &cur_elements, &max_elements);
9029 
9030 	for (i = 0; i < cur_elements; i++) {
9031 		if (modrootloaded || root_support_list[i]) {
9032 			m = ddi_name_to_major(driver_list[i]);
9033 			if (m != DDI_MAJOR_T_NONE &&
9034 			    ddi_hold_installed_driver(m))
9035 				ddi_rele_driver(m);
9036 		}
9037 	}
9038 
9039 	if (driver_list) {
9040 		for (i = 0, p = driver_list; i < cur_elements; i++, p++)
9041 			kmem_free(*p, strlen(*p) + 1);
9042 		kmem_free(driver_list, sizeof (char *) * max_elements);
9043 		kmem_free(root_support_list, sizeof (int) * max_elements);
9044 	}
9045 }
9046 
9047 /*
9048  * Build vhci cache:
9049  *
9050  * Attach phci driver instances and then drive BUS_CONFIG_ALL on
9051  * the phci driver instances. During this process the cache gets built.
9052  *
9053  * Cache is built fully if the root is mounted.
9054  * If the root is not mounted, phci drivers that do not have root support
9055  * are not attached. As a result the cache is built partially. The entries
9056  * in the cache reflect only those phci drivers that have root support.
9057  */
9058 static int
9059 build_vhci_cache(mdi_vhci_t *vh)
9060 {
9061 	mdi_vhci_config_t *vhc = vh->vh_config;
9062 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9063 
9064 	single_threaded_vhconfig_enter(vhc);
9065 
9066 	rw_enter(&vhcache->vhcache_lock, RW_READER);
9067 	if (vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE) {
9068 		rw_exit(&vhcache->vhcache_lock);
9069 		single_threaded_vhconfig_exit(vhc);
9070 		return (0);
9071 	}
9072 	rw_exit(&vhcache->vhcache_lock);
9073 
9074 	attach_phci_drivers(vh->vh_class);
9075 	bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE | NDI_NO_EVENT,
9076 	    BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);
9077 
9078 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
9079 	vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
9080 	rw_exit(&vhcache->vhcache_lock);
9081 
9082 	single_threaded_vhconfig_exit(vhc);
9083 	vhcache_dirty(vhc);
9084 	return (1);
9085 }
9086 
9087 /*
9088  * Determine if discovery of paths is needed.
9089  */
9090 static int
9091 vhcache_do_discovery(mdi_vhci_config_t *vhc)
9092 {
9093 	int rv = 1;
9094 
9095 	mutex_enter(&vhc->vhc_lock);
9096 	if (i_ddi_io_initialized() == 0) {
9097 		if (vhc->vhc_path_discovery_boot > 0) {
9098 			vhc->vhc_path_discovery_boot--;
9099 			goto out;
9100 		}
9101 	} else {
9102 		if (vhc->vhc_path_discovery_postboot > 0) {
9103 			vhc->vhc_path_discovery_postboot--;
9104 			goto out;
9105 		}
9106 	}
9107 
9108 	/*
9109 	 * Do full path discovery at most once per mdi_path_discovery_interval.
9110 	 * This is to avoid a series of full path discoveries when opening
9111 	 * stale /dev/[r]dsk links.
9112 	 */
9113 	if (mdi_path_discovery_interval != -1 &&
9114 	    ddi_get_lbolt64() >= vhc->vhc_path_discovery_cutoff_time)
9115 		goto out;
9116 
9117 	rv = 0;
9118 out:
9119 	mutex_exit(&vhc->vhc_lock);
9120 	return (rv);
9121 }
9122 
9123 /*
9124  * Discover all paths:
9125  *
9126  * Attach phci driver instances and then drive BUS_CONFIG_ALL on all the phci
9127  * driver instances. During this process all paths will be discovered.
9128  */
9129 static int
9130 vhcache_discover_paths(mdi_vhci_t *vh)
9131 {
9132 	mdi_vhci_config_t *vhc = vh->vh_config;
9133 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9134 	int rv = 0;
9135 
9136 	single_threaded_vhconfig_enter(vhc);
9137 
9138 	if (vhcache_do_discovery(vhc)) {
9139 		attach_phci_drivers(vh->vh_class);
9140 		bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE |
9141 		    NDI_NO_EVENT, BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);
9142 
9143 		mutex_enter(&vhc->vhc_lock);
9144 		vhc->vhc_path_discovery_cutoff_time = ddi_get_lbolt64() +
9145 		    mdi_path_discovery_interval * TICKS_PER_SECOND;
9146 		mutex_exit(&vhc->vhc_lock);
9147 		rv = 1;
9148 	}
9149 
9150 	single_threaded_vhconfig_exit(vhc);
9151 	return (rv);
9152 }
9153 
9154 /*
9155  * Generic vhci bus config implementation:
9156  *
9157  * Parameters
9158  *	vdip	vhci dip
9159  *	flags	bus config flags
9160  *	op	bus config operation
9161  *	The remaining parameters are bus config operation specific
9162  *
9163  * for BUS_CONFIG_ONE
9164  *	arg	pointer to name@addr
9165  *	child	upon successful return from this function, *child will be
9166  *		set to the configured and held devinfo child node of vdip.
9167  *	ct_addr	pointer to client address (i.e. GUID)
9168  *
9169  * for BUS_CONFIG_DRIVER
9170  *	arg	major number of the driver
9171  *	child and ct_addr parameters are ignored
9172  *
9173  * for BUS_CONFIG_ALL
9174  *	arg, child, and ct_addr parameters are ignored
9175  *
9176  * Note that for the rest of the bus config operations, this function simply
9177  * calls the framework provided default bus config routine.
9178  */
9179 int
9180 mdi_vhci_bus_config(dev_info_t *vdip, uint_t flags, ddi_bus_config_op_t op,
9181     void *arg, dev_info_t **child, char *ct_addr)
9182 {
9183 	mdi_vhci_t *vh = i_devi_get_vhci(vdip);
9184 	mdi_vhci_config_t *vhc = vh->vh_config;
9185 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9186 	int rv = 0;
9187 	int params_valid = 0;
9188 	char *cp;
9189 
9190 	/*
9191 	 * To bus config vhcis we relay operation, possibly using another
9192 	 * thread, to phcis. The phci driver then interacts with MDI to cause
9193 	 * vhci child nodes to be enumerated under the vhci node.  Adding a
9194 	 * vhci child requires an ndi_devi_enter of the vhci. Since another
9195 	 * thread may be adding the child, to avoid deadlock we can't wait
9196 	 * for the relayed operations to complete if we have already entered
9197 	 * the vhci node.
9198 	 */
9199 	if (DEVI_BUSY_OWNED(vdip)) {
9200 		MDI_DEBUG(2, (MDI_NOTE, vdip,
9201 		    "vhci dip is busy owned %p", (void *)vdip));
9202 		goto default_bus_config;
9203 	}
9204 
9205 	rw_enter(&vhcache->vhcache_lock, RW_READER);
9206 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
9207 		rw_exit(&vhcache->vhcache_lock);
9208 		rv = build_vhci_cache(vh);
9209 		rw_enter(&vhcache->vhcache_lock, RW_READER);
9210 	}
9211 
9212 	switch (op) {
9213 	case BUS_CONFIG_ONE:
9214 		if (arg != NULL && ct_addr != NULL) {
9215 			/* extract node name */
9216 			cp = (char *)arg;
9217 			while (*cp != '\0' && *cp != '@')
9218 				cp++;
9219 			if (*cp == '@') {
9220 				params_valid = 1;
9221 				*cp = '\0';
9222 				config_client_paths(vhc, (char *)arg, ct_addr);
9223 				/* config_client_paths() releases cache_lock */
9224 				*cp = '@';
9225 				break;
9226 			}
9227 		}
9228 
9229 		rw_exit(&vhcache->vhcache_lock);
9230 		break;
9231 
9232 	case BUS_CONFIG_DRIVER:
9233 		rw_exit(&vhcache->vhcache_lock);
9234 		if (rv == 0)
9235 			st_bus_config_all_phcis(vhc, flags, op,
9236 			    (major_t)(uintptr_t)arg);
9237 		break;
9238 
9239 	case BUS_CONFIG_ALL:
9240 		rw_exit(&vhcache->vhcache_lock);
9241 		if (rv == 0)
9242 			st_bus_config_all_phcis(vhc, flags, op, -1);
9243 		break;
9244 
9245 	default:
9246 		rw_exit(&vhcache->vhcache_lock);
9247 		break;
9248 	}
9249 
9250 
9251 default_bus_config:
9252 	/*
9253 	 * All requested child nodes are enumerated under the vhci.
9254 	 * Now configure them.
9255 	 */
9256 	if (ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
9257 	    NDI_SUCCESS) {
9258 		return (MDI_SUCCESS);
9259 	} else if (op == BUS_CONFIG_ONE && rv == 0 && params_valid) {
9260 		/* discover all paths and try configuring again */
9261 		if (vhcache_discover_paths(vh) &&
9262 		    ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
9263 		    NDI_SUCCESS)
9264 			return (MDI_SUCCESS);
9265 	}
9266 
9267 	return (MDI_FAILURE);
9268 }
9269 
9270 /*
9271  * Read the on-disk vhci cache into an nvlist for the specified vhci class.
9272  */
9273 static nvlist_t *
9274 read_on_disk_vhci_cache(char *vhci_class)
9275 {
9276 	nvlist_t *nvl;
9277 	int err;
9278 	char *filename;
9279 
9280 	filename = vhclass2vhcache_filename(vhci_class);
9281 
9282 	if ((err = fread_nvlist(filename, &nvl)) == 0) {
9283 		kmem_free(filename, strlen(filename) + 1);
9284 		return (nvl);
9285 	} else if (err == EIO)
9286 		cmn_err(CE_WARN, "%s: I/O error, will recreate", filename);
9287 	else if (err == EINVAL)
9288 		cmn_err(CE_WARN,
9289 		    "%s: data file corrupted, will recreate", filename);
9290 
9291 	kmem_free(filename, strlen(filename) + 1);
9292 	return (NULL);
9293 }
9294 
9295 /*
9296  * Read on-disk vhci cache into nvlists for all vhci classes.
9297  * Called during booting by i_ddi_read_devices_files().
9298  */
9299 void
9300 mdi_read_devices_files(void)
9301 {
9302 	int i;
9303 
9304 	for (i = 0; i < N_VHCI_CLASSES; i++)
9305 		vhcache_nvl[i] = read_on_disk_vhci_cache(vhci_class_list[i]);
9306 }
9307 
9308 /*
9309  * Remove all stale entries from vhci cache.
9310  */
9311 static void
9312 clean_vhcache(mdi_vhci_config_t *vhc)
9313 {
9314 	mdi_vhci_cache_t	*vhcache = &vhc->vhc_vhcache;
9315 	mdi_vhcache_phci_t	*phci, *nxt_phci;
9316 	mdi_vhcache_client_t	*client, *nxt_client;
9317 	mdi_vhcache_pathinfo_t	*path, *nxt_path;
9318 
9319 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
9320 
9321 	client = vhcache->vhcache_client_head;
9322 	vhcache->vhcache_client_head = vhcache->vhcache_client_tail = NULL;
9323 	for ( ; client != NULL; client = nxt_client) {
9324 		nxt_client = client->cct_next;
9325 
9326 		path = client->cct_cpi_head;
9327 		client->cct_cpi_head = client->cct_cpi_tail = NULL;
9328 		for ( ; path != NULL; path = nxt_path) {
9329 			nxt_path = path->cpi_next;
9330 			if ((path->cpi_cphci->cphci_phci != NULL) &&
9331 			    (path->cpi_pip != NULL)) {
9332 				enqueue_tail_vhcache_pathinfo(client, path);
9333 			} else if (path->cpi_pip != NULL) {
9334 				/* Not valid to have a path without a phci. */
9335 				free_vhcache_pathinfo(path);
9336 			}
9337 		}
9338 
9339 		if (client->cct_cpi_head != NULL)
9340 			enqueue_vhcache_client(vhcache, client);
9341 		else {
9342 			(void) mod_hash_destroy(vhcache->vhcache_client_hash,
9343 			    (mod_hash_key_t)client->cct_name_addr);
9344 			free_vhcache_client(client);
9345 		}
9346 	}
9347 
9348 	phci = vhcache->vhcache_phci_head;
9349 	vhcache->vhcache_phci_head = vhcache->vhcache_phci_tail = NULL;
9350 	for ( ; phci != NULL; phci = nxt_phci) {
9351 
9352 		nxt_phci = phci->cphci_next;
9353 		if (phci->cphci_phci != NULL)
9354 			enqueue_vhcache_phci(vhcache, phci);
9355 		else
9356 			free_vhcache_phci(phci);
9357 	}
9358 
9359 	vhcache->vhcache_clean_time = ddi_get_lbolt64();
9360 	rw_exit(&vhcache->vhcache_lock);
9361 	vhcache_dirty(vhc);
9362 }
9363 
9364 /*
9365  * Remove all stale entries from vhci cache.
9366  * Called by i_ddi_clean_devices_files() during the execution of devfsadm -C
9367  */
9368 void
9369 mdi_clean_vhcache(void)
9370 {
9371 	mdi_vhci_t *vh;
9372 
9373 	mutex_enter(&mdi_mutex);
9374 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9375 		vh->vh_refcnt++;
9376 		mutex_exit(&mdi_mutex);
9377 		clean_vhcache(vh->vh_config);
9378 		mutex_enter(&mdi_mutex);
9379 		vh->vh_refcnt--;
9380 	}
9381 	mutex_exit(&mdi_mutex);
9382 }
9383 
9384 /*
9385  * mdi_vhci_walk_clients():
9386  *		Walker routine to traverse client dev_info nodes
9387  * ddi_walk_devs(ddi_get_child(vdip), f, arg) returns the entire tree
9388  * below the client, including nexus devices, which we dont want.
9389  * So we just traverse the immediate siblings, starting from 1st client.
9390  */
9391 void
9392 mdi_vhci_walk_clients(dev_info_t *vdip,
9393     int (*f)(dev_info_t *, void *), void *arg)
9394 {
9395 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9396 	dev_info_t	*cdip;
9397 	mdi_client_t	*ct;
9398 
9399 	MDI_VHCI_CLIENT_LOCK(vh);
9400 	cdip = ddi_get_child(vdip);
9401 	while (cdip) {
9402 		ct = i_devi_get_client(cdip);
9403 		MDI_CLIENT_LOCK(ct);
9404 
9405 		if (((*f)(cdip, arg)) == DDI_WALK_CONTINUE)
9406 			cdip = ddi_get_next_sibling(cdip);
9407 		else
9408 			cdip = NULL;
9409 
9410 		MDI_CLIENT_UNLOCK(ct);
9411 	}
9412 	MDI_VHCI_CLIENT_UNLOCK(vh);
9413 }
9414 
9415 /*
9416  * mdi_vhci_walk_phcis():
9417  *		Walker routine to traverse phci dev_info nodes
9418  */
9419 void
9420 mdi_vhci_walk_phcis(dev_info_t *vdip,
9421     int (*f)(dev_info_t *, void *), void *arg)
9422 {
9423 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9424 	mdi_phci_t	*ph, *next;
9425 
9426 	MDI_VHCI_PHCI_LOCK(vh);
9427 	ph = vh->vh_phci_head;
9428 	while (ph) {
9429 		MDI_PHCI_LOCK(ph);
9430 
9431 		if (((*f)(ph->ph_dip, arg)) == DDI_WALK_CONTINUE)
9432 			next = ph->ph_next;
9433 		else
9434 			next = NULL;
9435 
9436 		MDI_PHCI_UNLOCK(ph);
9437 		ph = next;
9438 	}
9439 	MDI_VHCI_PHCI_UNLOCK(vh);
9440 }
9441 
9442 
9443 /*
9444  * mdi_walk_vhcis():
9445  *		Walker routine to traverse vhci dev_info nodes
9446  */
9447 void
9448 mdi_walk_vhcis(int (*f)(dev_info_t *, void *), void *arg)
9449 {
9450 	mdi_vhci_t	*vh = NULL;
9451 
9452 	mutex_enter(&mdi_mutex);
9453 	/*
9454 	 * Scan for already registered vhci
9455 	 */
9456 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9457 		vh->vh_refcnt++;
9458 		mutex_exit(&mdi_mutex);
9459 		if (((*f)(vh->vh_dip, arg)) != DDI_WALK_CONTINUE) {
9460 			mutex_enter(&mdi_mutex);
9461 			vh->vh_refcnt--;
9462 			break;
9463 		} else {
9464 			mutex_enter(&mdi_mutex);
9465 			vh->vh_refcnt--;
9466 		}
9467 	}
9468 
9469 	mutex_exit(&mdi_mutex);
9470 }
9471 
9472 /*
9473  * i_mdi_log_sysevent():
9474  *		Logs events for pickup by syseventd
9475  */
9476 static void
9477 i_mdi_log_sysevent(dev_info_t *dip, char *ph_vh_class, char *subclass)
9478 {
9479 	char		*path_name;
9480 	nvlist_t	*attr_list;
9481 
9482 	if (nvlist_alloc(&attr_list, NV_UNIQUE_NAME_TYPE,
9483 	    KM_SLEEP) != DDI_SUCCESS) {
9484 		goto alloc_failed;
9485 	}
9486 
9487 	path_name = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
9488 	(void) ddi_pathname(dip, path_name);
9489 
9490 	if (nvlist_add_string(attr_list, DDI_DRIVER_NAME,
9491 	    ddi_driver_name(dip)) != DDI_SUCCESS) {
9492 		goto error;
9493 	}
9494 
9495 	if (nvlist_add_int32(attr_list, DDI_DRIVER_MAJOR,
9496 	    (int32_t)ddi_driver_major(dip)) != DDI_SUCCESS) {
9497 		goto error;
9498 	}
9499 
9500 	if (nvlist_add_int32(attr_list, DDI_INSTANCE,
9501 	    (int32_t)ddi_get_instance(dip)) != DDI_SUCCESS) {
9502 		goto error;
9503 	}
9504 
9505 	if (nvlist_add_string(attr_list, DDI_PATHNAME,
9506 	    path_name) != DDI_SUCCESS) {
9507 		goto error;
9508 	}
9509 
9510 	if (nvlist_add_string(attr_list, DDI_CLASS,
9511 	    ph_vh_class) != DDI_SUCCESS) {
9512 		goto error;
9513 	}
9514 
9515 	(void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW, EC_DDI, subclass,
9516 	    attr_list, NULL, DDI_SLEEP);
9517 
9518 error:
9519 	kmem_free(path_name, MAXPATHLEN);
9520 	nvlist_free(attr_list);
9521 	return;
9522 
9523 alloc_failed:
9524 	MDI_DEBUG(1, (MDI_WARN, dip, "!unable to send sysevent"));
9525 }
9526 
9527 char **
9528 mdi_get_phci_driver_list(char *vhci_class, int	*ndrivers)
9529 {
9530 	char	**driver_list, **ret_driver_list = NULL;
9531 	int	*root_support_list;
9532 	int	cur_elements, max_elements;
9533 
9534 	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
9535 	    &cur_elements, &max_elements);
9536 
9537 
9538 	if (driver_list) {
9539 		kmem_free(root_support_list, sizeof (int) * max_elements);
9540 		ret_driver_list = mdi_realloc(driver_list, sizeof (char *)
9541 		    * max_elements, sizeof (char *) * cur_elements);
9542 	}
9543 	*ndrivers = cur_elements;
9544 
9545 	return (ret_driver_list);
9546 
9547 }
9548 
/*
 * Free a driver list of ndrivers entries: every driver name string and
 * then the array itself.  A NULL driver_list is ignored.
 */
void
mdi_free_phci_driver_list(char **driver_list, int ndrivers)
{
	int	i;

	if (driver_list == NULL)
		return;

	for (i = 0; i < ndrivers; i++)
		kmem_free(driver_list[i], strlen(driver_list[i]) + 1);
	kmem_free(driver_list, sizeof (char *) * ndrivers);
}
9561 
9562 /*
9563  * mdi_is_dev_supported():
9564  *		function called by pHCI bus config operation to determine if a
9565  *		device should be represented as a child of the vHCI or the
9566  *		pHCI.  This decision is made by the vHCI, using cinfo idenity
9567  *		information passed by the pHCI - specifics of the cinfo
9568  *		representation are by agreement between the pHCI and vHCI.
9569  * Return Values:
9570  *		MDI_SUCCESS
9571  *		MDI_FAILURE
9572  */
9573 int
9574 mdi_is_dev_supported(char *class, dev_info_t *pdip, void *cinfo)
9575 {
9576 	mdi_vhci_t	*vh;
9577 
9578 	ASSERT(class && pdip);
9579 
9580 	/*
9581 	 * For dev_supported, mdi_phci_register() must have established pdip as
9582 	 * a pHCI.
9583 	 *
9584 	 * NOTE: mdi_phci_register() does "mpxio-disable" processing, and
9585 	 * MDI_PHCI(pdip) will return false if mpxio is disabled.
9586 	 */
9587 	if (!MDI_PHCI(pdip))
9588 		return (MDI_FAILURE);
9589 
9590 	/* Return MDI_FAILURE if vHCI does not support asking the question. */
9591 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
9592 	if ((vh == NULL) || (vh->vh_ops->vo_is_dev_supported == NULL)) {
9593 		return (MDI_FAILURE);
9594 	}
9595 
9596 	/* Return vHCI answer */
9597 	return (vh->vh_ops->vo_is_dev_supported(vh->vh_dip, pdip, cinfo));
9598 }
9599 
9600 int
9601 mdi_dc_return_dev_state(mdi_pathinfo_t *pip, struct devctl_iocdata *dcp)
9602 {
9603 	uint_t devstate = 0;
9604 	dev_info_t *cdip;
9605 
9606 	if ((pip == NULL) || (dcp == NULL))
9607 		return (MDI_FAILURE);
9608 
9609 	cdip = mdi_pi_get_client(pip);
9610 
9611 	switch (mdi_pi_get_state(pip)) {
9612 	case MDI_PATHINFO_STATE_INIT:
9613 		devstate = DEVICE_DOWN;
9614 		break;
9615 	case MDI_PATHINFO_STATE_ONLINE:
9616 		devstate = DEVICE_ONLINE;
9617 		if ((cdip) && (devi_stillreferenced(cdip) == DEVI_REFERENCED))
9618 			devstate |= DEVICE_BUSY;
9619 		break;
9620 	case MDI_PATHINFO_STATE_STANDBY:
9621 		devstate = DEVICE_ONLINE;
9622 		break;
9623 	case MDI_PATHINFO_STATE_FAULT:
9624 		devstate = DEVICE_DOWN;
9625 		break;
9626 	case MDI_PATHINFO_STATE_OFFLINE:
9627 		devstate = DEVICE_OFFLINE;
9628 		break;
9629 	default:
9630 		ASSERT(MDI_PI(pip)->pi_state);
9631 	}
9632 
9633 	if (copyout(&devstate, dcp->cpyout_buf, sizeof (uint_t)) != 0)
9634 		return (MDI_FAILURE);
9635 
9636 	return (MDI_SUCCESS);
9637 }
9638