xref: /illumos-gate/usr/src/uts/common/os/sunmdi.c (revision fb2a9bae0030340ad72b9c26ba1ffee2ee3cafec)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
/*
 * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more
 * detailed discussion of the overall mpxio architecture.
 *
 * Default locking order:
 *
 * _NOTE(LOCK_ORDER(mdi_mutex, mdi_vhci::vh_phci_mutex))
 * _NOTE(LOCK_ORDER(mdi_mutex, mdi_vhci::vh_client_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_phci_mutex, mdi_phci::ph_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_client_mutex, mdi_client::ct_mutex))
 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
 * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
 */
40 
41 #include <sys/note.h>
42 #include <sys/types.h>
43 #include <sys/varargs.h>
44 #include <sys/param.h>
45 #include <sys/errno.h>
46 #include <sys/uio.h>
47 #include <sys/buf.h>
48 #include <sys/modctl.h>
49 #include <sys/open.h>
50 #include <sys/kmem.h>
51 #include <sys/poll.h>
52 #include <sys/conf.h>
53 #include <sys/bootconf.h>
54 #include <sys/cmn_err.h>
55 #include <sys/stat.h>
56 #include <sys/ddi.h>
57 #include <sys/sunddi.h>
58 #include <sys/ddipropdefs.h>
59 #include <sys/sunndi.h>
60 #include <sys/ndi_impldefs.h>
61 #include <sys/promif.h>
62 #include <sys/sunmdi.h>
63 #include <sys/mdi_impldefs.h>
64 #include <sys/taskq.h>
65 #include <sys/epm.h>
66 #include <sys/sunpm.h>
67 #include <sys/modhash.h>
68 #include <sys/disp.h>
69 #include <sys/autoconf.h>
70 #include <sys/sysmacros.h>
71 
#ifdef	DEBUG
#include <sys/debug.h>
int	mdi_debug = 1;
int	mdi_debug_logonly = 0;
/*
 * MDI_DEBUG(dbglevel, pargs):
 *	Call i_mdi_log with the parenthesized argument list 'pargs' when
 *	mdi_debug is at least 'dbglevel'.  The body is wrapped in
 *	do { } while (0) so that the macro behaves as a single statement and
 *	cannot capture the 'else' of an enclosing unbraced if/else.
 *	In non-DEBUG builds the macro expands to nothing.
 *
 * MDI_WARN, MDI_NOTE and MDI_CONT supply the first two i_mdi_log arguments:
 * the cmn_err level and the name of the calling function.
 */
#define	MDI_DEBUG(dbglevel, pargs)				\
	do {							\
		if (mdi_debug >= (dbglevel))			\
			i_mdi_log pargs;			\
	_NOTE(CONSTCOND) } while (0)
#define	MDI_WARN	CE_WARN, __func__
#define	MDI_NOTE	CE_NOTE, __func__
#define	MDI_CONT	CE_CONT, __func__
static void i_mdi_log(int, const char *, dev_info_t *, const char *, ...);
#else	/* !DEBUG */
#define	MDI_DEBUG(dbglevel, pargs)
#endif	/* DEBUG */
84 int	mdi_debug_consoleonly = 0;
85 int	mdi_delay = 3;
86 
87 extern pri_t	minclsyspri;
88 extern int	modrootloaded;
89 
90 /*
91  * Global mutex:
92  * Protects vHCI list and structure members.
93  */
94 kmutex_t	mdi_mutex;
95 
96 /*
97  * Registered vHCI class driver lists
98  */
99 int		mdi_vhci_count;
100 mdi_vhci_t	*mdi_vhci_head;
101 mdi_vhci_t	*mdi_vhci_tail;
102 
103 /*
104  * Client Hash Table size
105  */
106 static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;
107 
108 /*
109  * taskq interface definitions
110  */
111 #define	MDI_TASKQ_N_THREADS	8
112 #define	MDI_TASKQ_PRI		minclsyspri
113 #define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
114 #define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)
115 
116 taskq_t				*mdi_taskq;
117 static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;
118 
119 #define	TICKS_PER_SECOND	(drv_usectohz(1000000))
120 
121 /*
122  * The data should be "quiet" for this interval (in seconds) before the
123  * vhci cached data is flushed to the disk.
124  */
125 static int mdi_vhcache_flush_delay = 10;
126 
127 /* number of seconds the vhcache flush daemon will sleep idle before exiting */
128 static int mdi_vhcache_flush_daemon_idle_time = 60;
129 
130 /*
131  * MDI falls back to discovery of all paths when a bus_config_one fails.
132  * The following parameters can be used to tune this operation.
133  *
134  * mdi_path_discovery_boot
135  *	Number of times path discovery will be attempted during early boot.
136  *	Probably there is no reason to ever set this value to greater than one.
137  *
138  * mdi_path_discovery_postboot
139  *	Number of times path discovery will be attempted after early boot.
140  *	Set it to a minimum of two to allow for discovery of iscsi paths which
141  *	may happen very late during booting.
142  *
143  * mdi_path_discovery_interval
144  *	Minimum number of seconds MDI will wait between successive discovery
145  *	of all paths. Set it to -1 to disable discovery of all paths.
146  */
147 static int mdi_path_discovery_boot = 1;
148 static int mdi_path_discovery_postboot = 2;
149 static int mdi_path_discovery_interval = 10;
150 
151 /*
152  * number of seconds the asynchronous configuration thread will sleep idle
153  * before exiting.
154  */
155 static int mdi_async_config_idle_time = 600;
156 
157 static int mdi_bus_config_cache_hash_size = 256;
158 
159 /* turns off multithreaded configuration for certain operations */
160 static int mdi_mtc_off = 0;
161 
162 /*
163  * The "path" to a pathinfo node is identical to the /devices path to a
164  * devinfo node had the device been enumerated under a pHCI instead of
165  * a vHCI.  This pathinfo "path" is associated with a 'path_instance'.
166  * This association persists across create/delete of the pathinfo nodes,
167  * but not across reboot.
168  */
169 static uint_t		mdi_pathmap_instance = 1;	/* 0 -> any path */
170 static int		mdi_pathmap_hash_size = 256;
171 static kmutex_t		mdi_pathmap_mutex;
172 static mod_hash_t	*mdi_pathmap_bypath;		/* "path"->instance */
173 static mod_hash_t	*mdi_pathmap_byinstance;	/* instance->"path" */
174 static mod_hash_t	*mdi_pathmap_sbyinstance;	/* inst->shortpath */
175 
176 /*
177  * MDI component property name/value string definitions
178  */
179 const char 		*mdi_component_prop = "mpxio-component";
180 const char		*mdi_component_prop_vhci = "vhci";
181 const char		*mdi_component_prop_phci = "phci";
182 const char		*mdi_component_prop_client = "client";
183 
184 /*
185  * MDI client global unique identifier property name
186  */
187 const char		*mdi_client_guid_prop = "client-guid";
188 
189 /*
190  * MDI client load balancing property name/value string definitions
191  */
192 const char		*mdi_load_balance = "load-balance";
193 const char		*mdi_load_balance_none = "none";
194 const char		*mdi_load_balance_rr = "round-robin";
195 const char		*mdi_load_balance_lba = "logical-block";
196 
197 /*
198  * Obsolete vHCI class definition; to be removed after Leadville update
199  */
200 const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;
201 
202 static char vhci_greeting[] =
203 	"\tThere already exists one vHCI driver for class %s\n"
204 	"\tOnly one vHCI driver for each class is allowed\n";
205 
206 /*
207  * Static function prototypes
208  */
209 static int		i_mdi_phci_offline(dev_info_t *, uint_t);
210 static int		i_mdi_client_offline(dev_info_t *, uint_t);
211 static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
212 static void		i_mdi_phci_post_detach(dev_info_t *,
213 			    ddi_detach_cmd_t, int);
214 static int		i_mdi_client_pre_detach(dev_info_t *,
215 			    ddi_detach_cmd_t);
216 static void		i_mdi_client_post_detach(dev_info_t *,
217 			    ddi_detach_cmd_t, int);
218 static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
219 static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
220 static int 		i_mdi_lba_lb(mdi_client_t *ct,
221 			    mdi_pathinfo_t **ret_pip, struct buf *buf);
222 static void		i_mdi_pm_hold_client(mdi_client_t *, int);
223 static void		i_mdi_pm_rele_client(mdi_client_t *, int);
224 static void		i_mdi_pm_reset_client(mdi_client_t *);
225 static int		i_mdi_power_all_phci(mdi_client_t *);
226 static void		i_mdi_log_sysevent(dev_info_t *, char *, char *);
227 
228 
229 /*
230  * Internal mdi_pathinfo node functions
231  */
232 static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);
233 
234 static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
235 static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
236 static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
237 static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
238 static void		i_mdi_phci_unlock(mdi_phci_t *);
239 static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
240 static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
241 static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
242 static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
243 			    mdi_client_t *);
244 static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
245 static void		i_mdi_client_remove_path(mdi_client_t *,
246 			    mdi_pathinfo_t *);
247 
248 static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
249 			    mdi_pathinfo_state_t, int);
250 static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
251 static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
252 			    char **, int);
253 static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
254 static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
255 static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
256 static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
257 static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
258 static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
259 static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
260 static void		i_mdi_client_update_state(mdi_client_t *);
261 static int		i_mdi_client_compute_state(mdi_client_t *,
262 			    mdi_phci_t *);
263 static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
264 static void		i_mdi_client_unlock(mdi_client_t *);
265 static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
266 static mdi_client_t	*i_devi_get_client(dev_info_t *);
267 /*
268  * NOTE: this will be removed once the NWS files are changed to use the new
269  * mdi_{enable,disable}_path interfaces
270  */
271 static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *,
272 				int, int);
273 static mdi_pathinfo_t 	*i_mdi_enable_disable_path(mdi_pathinfo_t *pip,
274 				mdi_vhci_t *vh, int flags, int op);
275 /*
276  * Failover related function prototypes
277  */
278 static int		i_mdi_failover(void *);
279 
280 /*
281  * misc internal functions
282  */
283 static int		i_mdi_get_hash_key(char *);
284 static int		i_map_nvlist_error_to_mdi(int);
285 static void		i_mdi_report_path_state(mdi_client_t *,
286 			    mdi_pathinfo_t *);
287 
288 static void		setup_vhci_cache(mdi_vhci_t *);
289 static int		destroy_vhci_cache(mdi_vhci_t *);
290 static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
291 static boolean_t	stop_vhcache_flush_thread(void *, int);
292 static void		free_string_array(char **, int);
293 static void		free_vhcache_phci(mdi_vhcache_phci_t *);
294 static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
295 static void		free_vhcache_client(mdi_vhcache_client_t *);
296 static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
297 static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
298 static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
299 static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
300 static void		vhcache_pi_add(mdi_vhci_config_t *,
301 			    struct mdi_pathinfo *);
302 static void		vhcache_pi_remove(mdi_vhci_config_t *,
303 			    struct mdi_pathinfo *);
304 static void		free_phclient_path_list(mdi_phys_path_t *);
305 static void		sort_vhcache_paths(mdi_vhcache_client_t *);
306 static int		flush_vhcache(mdi_vhci_config_t *, int);
307 static void		vhcache_dirty(mdi_vhci_config_t *);
308 static void		free_async_client_config(mdi_async_client_config_t *);
309 static void		single_threaded_vhconfig_enter(mdi_vhci_config_t *);
310 static void		single_threaded_vhconfig_exit(mdi_vhci_config_t *);
311 static nvlist_t		*read_on_disk_vhci_cache(char *);
312 extern int		fread_nvlist(char *, nvlist_t **);
313 extern int		fwrite_nvlist(char *, nvlist_t *);
314 
315 /* called once when first vhci registers with mdi */
316 static void
317 i_mdi_init()
318 {
319 	static int initialized = 0;
320 
321 	if (initialized)
322 		return;
323 	initialized = 1;
324 
325 	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);
326 
327 	/* Create our taskq resources */
328 	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
329 	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
330 	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
331 	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */
332 
333 	/* Allocate ['path_instance' <-> "path"] maps */
334 	mutex_init(&mdi_pathmap_mutex, NULL, MUTEX_DRIVER, NULL);
335 	mdi_pathmap_bypath = mod_hash_create_strhash(
336 	    "mdi_pathmap_bypath", mdi_pathmap_hash_size,
337 	    mod_hash_null_valdtor);
338 	mdi_pathmap_byinstance = mod_hash_create_idhash(
339 	    "mdi_pathmap_byinstance", mdi_pathmap_hash_size,
340 	    mod_hash_null_valdtor);
341 	mdi_pathmap_sbyinstance = mod_hash_create_idhash(
342 	    "mdi_pathmap_sbyinstance", mdi_pathmap_hash_size,
343 	    mod_hash_null_valdtor);
344 }
345 
346 /*
347  * mdi_get_component_type():
348  *		Return mpxio component type
349  * Return Values:
350  *		MDI_COMPONENT_NONE
351  *		MDI_COMPONENT_VHCI
352  *		MDI_COMPONENT_PHCI
353  *		MDI_COMPONENT_CLIENT
354  * XXX This doesn't work under multi-level MPxIO and should be
355  *	removed when clients migrate mdi_component_is_*() interfaces.
356  */
357 int
358 mdi_get_component_type(dev_info_t *dip)
359 {
360 	return (DEVI(dip)->devi_mdi_component);
361 }
362 
363 /*
364  * mdi_vhci_register():
365  *		Register a vHCI module with the mpxio framework
366  *		mdi_vhci_register() is called by vHCI drivers to register the
367  *		'class_driver' vHCI driver and its MDI entrypoints with the
368  *		mpxio framework.  The vHCI driver must call this interface as
369  *		part of its attach(9e) handler.
370  *		Competing threads may try to attach mdi_vhci_register() as
371  *		the vHCI drivers are loaded and attached as a result of pHCI
372  *		driver instance registration (mdi_phci_register()) with the
373  *		framework.
374  * Return Values:
375  *		MDI_SUCCESS
376  *		MDI_FAILURE
377  */
378 /*ARGSUSED*/
379 int
380 mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
381     int flags)
382 {
383 	mdi_vhci_t		*vh = NULL;
384 
385 	/* Registrant can't be older */
386 	ASSERT(vops->vo_revision <= MDI_VHCI_OPS_REV);
387 
388 #ifdef DEBUG
389 	/*
390 	 * IB nexus driver is loaded only when IB hardware is present.
391 	 * In order to be able to do this there is a need to drive the loading
392 	 * and attaching of the IB nexus driver (especially when an IB hardware
393 	 * is dynamically plugged in) when an IB HCA driver (PHCI)
394 	 * is being attached. Unfortunately this gets into the limitations
395 	 * of devfs as there seems to be no clean way to drive configuration
396 	 * of a subtree from another subtree of a devfs. Hence, do not ASSERT
397 	 * for IB.
398 	 */
399 	if (strcmp(class, MDI_HCI_CLASS_IB) != 0)
400 		ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
401 #endif
402 
403 	i_mdi_init();
404 
405 	mutex_enter(&mdi_mutex);
406 	/*
407 	 * Scan for already registered vhci
408 	 */
409 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
410 		if (strcmp(vh->vh_class, class) == 0) {
411 			/*
412 			 * vHCI has already been created.  Check for valid
413 			 * vHCI ops registration.  We only support one vHCI
414 			 * module per class
415 			 */
416 			if (vh->vh_ops != NULL) {
417 				mutex_exit(&mdi_mutex);
418 				cmn_err(CE_NOTE, vhci_greeting, class);
419 				return (MDI_FAILURE);
420 			}
421 			break;
422 		}
423 	}
424 
425 	/*
426 	 * if not yet created, create the vHCI component
427 	 */
428 	if (vh == NULL) {
429 		struct client_hash	*hash = NULL;
430 		char			*load_balance;
431 
432 		/*
433 		 * Allocate and initialize the mdi extensions
434 		 */
435 		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
436 		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
437 		    KM_SLEEP);
438 		vh->vh_client_table = hash;
439 		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
440 		(void) strcpy(vh->vh_class, class);
441 		vh->vh_lb = LOAD_BALANCE_RR;
442 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
443 		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
444 			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
445 				vh->vh_lb = LOAD_BALANCE_NONE;
446 			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
447 				    == 0) {
448 				vh->vh_lb = LOAD_BALANCE_LBA;
449 			}
450 			ddi_prop_free(load_balance);
451 		}
452 
453 		mutex_init(&vh->vh_phci_mutex, NULL, MUTEX_DEFAULT, NULL);
454 		mutex_init(&vh->vh_client_mutex, NULL, MUTEX_DEFAULT, NULL);
455 
456 		/*
457 		 * Store the vHCI ops vectors
458 		 */
459 		vh->vh_dip = vdip;
460 		vh->vh_ops = vops;
461 
462 		setup_vhci_cache(vh);
463 
464 		if (mdi_vhci_head == NULL) {
465 			mdi_vhci_head = vh;
466 		}
467 		if (mdi_vhci_tail) {
468 			mdi_vhci_tail->vh_next = vh;
469 		}
470 		mdi_vhci_tail = vh;
471 		mdi_vhci_count++;
472 	}
473 
474 	/*
475 	 * Claim the devfs node as a vhci component
476 	 */
477 	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;
478 
479 	/*
480 	 * Initialize our back reference from dev_info node
481 	 */
482 	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
483 	mutex_exit(&mdi_mutex);
484 	return (MDI_SUCCESS);
485 }
486 
487 /*
488  * mdi_vhci_unregister():
489  *		Unregister a vHCI module from mpxio framework
490  *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
491  * 		of a vhci to unregister it from the framework.
492  * Return Values:
493  *		MDI_SUCCESS
494  *		MDI_FAILURE
495  */
496 /*ARGSUSED*/
497 int
498 mdi_vhci_unregister(dev_info_t *vdip, int flags)
499 {
500 	mdi_vhci_t	*found, *vh, *prev = NULL;
501 
502 	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
503 
504 	/*
505 	 * Check for invalid VHCI
506 	 */
507 	if ((vh = i_devi_get_vhci(vdip)) == NULL)
508 		return (MDI_FAILURE);
509 
510 	/*
511 	 * Scan the list of registered vHCIs for a match
512 	 */
513 	mutex_enter(&mdi_mutex);
514 	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
515 		if (found == vh)
516 			break;
517 		prev = found;
518 	}
519 
520 	if (found == NULL) {
521 		mutex_exit(&mdi_mutex);
522 		return (MDI_FAILURE);
523 	}
524 
525 	/*
526 	 * Check the vHCI, pHCI and client count. All the pHCIs and clients
527 	 * should have been unregistered, before a vHCI can be
528 	 * unregistered.
529 	 */
530 	MDI_VHCI_PHCI_LOCK(vh);
531 	if (vh->vh_refcnt || vh->vh_phci_count || vh->vh_client_count) {
532 		MDI_VHCI_PHCI_UNLOCK(vh);
533 		mutex_exit(&mdi_mutex);
534 		return (MDI_FAILURE);
535 	}
536 	MDI_VHCI_PHCI_UNLOCK(vh);
537 
538 	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
539 		mutex_exit(&mdi_mutex);
540 		return (MDI_FAILURE);
541 	}
542 
543 	/*
544 	 * Remove the vHCI from the global list
545 	 */
546 	if (vh == mdi_vhci_head) {
547 		mdi_vhci_head = vh->vh_next;
548 	} else {
549 		prev->vh_next = vh->vh_next;
550 	}
551 	if (vh == mdi_vhci_tail) {
552 		mdi_vhci_tail = prev;
553 	}
554 	mdi_vhci_count--;
555 	mutex_exit(&mdi_mutex);
556 
557 	vh->vh_ops = NULL;
558 	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
559 	DEVI(vdip)->devi_mdi_xhci = NULL;
560 	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
561 	kmem_free(vh->vh_client_table,
562 	    mdi_client_table_size * sizeof (struct client_hash));
563 	mutex_destroy(&vh->vh_phci_mutex);
564 	mutex_destroy(&vh->vh_client_mutex);
565 
566 	kmem_free(vh, sizeof (mdi_vhci_t));
567 	return (MDI_SUCCESS);
568 }
569 
570 /*
571  * i_mdi_vhci_class2vhci():
572  *		Look for a matching vHCI module given a vHCI class name
573  * Return Values:
574  *		Handle to a vHCI component
575  *		NULL
576  */
577 static mdi_vhci_t *
578 i_mdi_vhci_class2vhci(char *class)
579 {
580 	mdi_vhci_t	*vh = NULL;
581 
582 	ASSERT(!MUTEX_HELD(&mdi_mutex));
583 
584 	mutex_enter(&mdi_mutex);
585 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
586 		if (strcmp(vh->vh_class, class) == 0) {
587 			break;
588 		}
589 	}
590 	mutex_exit(&mdi_mutex);
591 	return (vh);
592 }
593 
594 /*
595  * i_devi_get_vhci():
596  *		Utility function to get the handle to a vHCI component
597  * Return Values:
598  *		Handle to a vHCI component
599  *		NULL
600  */
601 mdi_vhci_t *
602 i_devi_get_vhci(dev_info_t *vdip)
603 {
604 	mdi_vhci_t	*vh = NULL;
605 	if (MDI_VHCI(vdip)) {
606 		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
607 	}
608 	return (vh);
609 }
610 
611 /*
612  * mdi_phci_register():
613  *		Register a pHCI module with mpxio framework
614  *		mdi_phci_register() is called by pHCI drivers to register with
615  *		the mpxio framework and a specific 'class_driver' vHCI.  The
616  *		pHCI driver must call this interface as part of its attach(9e)
617  *		handler.
618  * Return Values:
619  *		MDI_SUCCESS
620  *		MDI_FAILURE
621  */
622 /*ARGSUSED*/
623 int
624 mdi_phci_register(char *class, dev_info_t *pdip, int flags)
625 {
626 	mdi_phci_t		*ph;
627 	mdi_vhci_t		*vh;
628 	char			*data;
629 
630 	/*
631 	 * Some subsystems, like fcp, perform pHCI registration from a
632 	 * different thread than the one doing the pHCI attach(9E) - the
633 	 * driver attach code is waiting for this other thread to complete.
634 	 * This means we can only ASSERT DEVI_BUSY_CHANGING of parent
635 	 * (indicating that some thread has done an ndi_devi_enter of parent)
636 	 * not DEVI_BUSY_OWNED (which would indicate that we did the enter).
637 	 */
638 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
639 
640 	/*
641 	 * Check for mpxio-disable property. Enable mpxio if the property is
642 	 * missing or not set to "yes".
643 	 * If the property is set to "yes" then emit a brief message.
644 	 */
645 	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
646 	    &data) == DDI_SUCCESS)) {
647 		if (strcmp(data, "yes") == 0) {
648 			MDI_DEBUG(1, (MDI_CONT, pdip,
649 			    "?multipath capabilities disabled via %s.conf.",
650 			    ddi_driver_name(pdip)));
651 			ddi_prop_free(data);
652 			return (MDI_FAILURE);
653 		}
654 		ddi_prop_free(data);
655 	}
656 
657 	/*
658 	 * Search for a matching vHCI
659 	 */
660 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
661 	if (vh == NULL) {
662 		return (MDI_FAILURE);
663 	}
664 
665 	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
666 	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
667 	ph->ph_dip = pdip;
668 	ph->ph_vhci = vh;
669 	ph->ph_next = NULL;
670 	ph->ph_unstable = 0;
671 	ph->ph_vprivate = 0;
672 	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);
673 
674 	MDI_PHCI_LOCK(ph);
675 	MDI_PHCI_SET_POWER_UP(ph);
676 	MDI_PHCI_UNLOCK(ph);
677 	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
678 	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;
679 
680 	vhcache_phci_add(vh->vh_config, ph);
681 
682 	MDI_VHCI_PHCI_LOCK(vh);
683 	if (vh->vh_phci_head == NULL) {
684 		vh->vh_phci_head = ph;
685 	}
686 	if (vh->vh_phci_tail) {
687 		vh->vh_phci_tail->ph_next = ph;
688 	}
689 	vh->vh_phci_tail = ph;
690 	vh->vh_phci_count++;
691 	MDI_VHCI_PHCI_UNLOCK(vh);
692 
693 	i_mdi_log_sysevent(pdip, class, ESC_DDI_INITIATOR_REGISTER);
694 	return (MDI_SUCCESS);
695 }
696 
697 /*
698  * mdi_phci_unregister():
699  *		Unregister a pHCI module from mpxio framework
700  *		mdi_phci_unregister() is called by the pHCI drivers from their
701  *		detach(9E) handler to unregister their instances from the
702  *		framework.
703  * Return Values:
704  *		MDI_SUCCESS
705  *		MDI_FAILURE
706  */
707 /*ARGSUSED*/
708 int
709 mdi_phci_unregister(dev_info_t *pdip, int flags)
710 {
711 	mdi_vhci_t		*vh;
712 	mdi_phci_t		*ph;
713 	mdi_phci_t		*tmp;
714 	mdi_phci_t		*prev = NULL;
715 	mdi_pathinfo_t		*pip;
716 
717 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
718 
719 	ph = i_devi_get_phci(pdip);
720 	if (ph == NULL) {
721 		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid pHCI"));
722 		return (MDI_FAILURE);
723 	}
724 
725 	vh = ph->ph_vhci;
726 	ASSERT(vh != NULL);
727 	if (vh == NULL) {
728 		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid vHCI"));
729 		return (MDI_FAILURE);
730 	}
731 
732 	MDI_VHCI_PHCI_LOCK(vh);
733 	tmp = vh->vh_phci_head;
734 	while (tmp) {
735 		if (tmp == ph) {
736 			break;
737 		}
738 		prev = tmp;
739 		tmp = tmp->ph_next;
740 	}
741 
742 	if (ph == vh->vh_phci_head) {
743 		vh->vh_phci_head = ph->ph_next;
744 	} else {
745 		prev->ph_next = ph->ph_next;
746 	}
747 
748 	if (ph == vh->vh_phci_tail) {
749 		vh->vh_phci_tail = prev;
750 	}
751 
752 	vh->vh_phci_count--;
753 	MDI_VHCI_PHCI_UNLOCK(vh);
754 
755 	/* Walk remaining pathinfo nodes and disassociate them from pHCI */
756 	MDI_PHCI_LOCK(ph);
757 	for (pip = (mdi_pathinfo_t *)ph->ph_path_head; pip;
758 	    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link)
759 		MDI_PI(pip)->pi_phci = NULL;
760 	MDI_PHCI_UNLOCK(ph);
761 
762 	i_mdi_log_sysevent(pdip, ph->ph_vhci->vh_class,
763 	    ESC_DDI_INITIATOR_UNREGISTER);
764 	vhcache_phci_remove(vh->vh_config, ph);
765 	cv_destroy(&ph->ph_unstable_cv);
766 	mutex_destroy(&ph->ph_mutex);
767 	kmem_free(ph, sizeof (mdi_phci_t));
768 	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
769 	DEVI(pdip)->devi_mdi_xhci = NULL;
770 	return (MDI_SUCCESS);
771 }
772 
773 /*
774  * i_devi_get_phci():
775  * 		Utility function to return the phci extensions.
776  */
777 static mdi_phci_t *
778 i_devi_get_phci(dev_info_t *pdip)
779 {
780 	mdi_phci_t	*ph = NULL;
781 
782 	if (MDI_PHCI(pdip)) {
783 		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
784 	}
785 	return (ph);
786 }
787 
788 /*
789  * Single thread mdi entry into devinfo node for modifying its children.
790  * If necessary we perform an ndi_devi_enter of the vHCI before doing
791  * an ndi_devi_enter of 'dip'.  We maintain circular in two parts: one
792  * for the vHCI and one for the pHCI.
793  */
794 void
795 mdi_devi_enter(dev_info_t *phci_dip, int *circular)
796 {
797 	dev_info_t	*vdip;
798 	int		vcircular, pcircular;
799 
800 	/* Verify calling context */
801 	ASSERT(MDI_PHCI(phci_dip));
802 	vdip = mdi_devi_get_vdip(phci_dip);
803 	ASSERT(vdip);			/* A pHCI always has a vHCI */
804 
805 	/*
806 	 * If pHCI is detaching then the framework has already entered the
807 	 * vHCI on a threads that went down the code path leading to
808 	 * detach_node().  This framework enter of the vHCI during pHCI
809 	 * detach is done to avoid deadlock with vHCI power management
810 	 * operations which enter the vHCI and the enter down the path
811 	 * to the pHCI. If pHCI is detaching then we piggyback this calls
812 	 * enter of the vHCI on frameworks vHCI enter that has already
813 	 * occurred - this is OK because we know that the framework thread
814 	 * doing detach is waiting for our completion.
815 	 *
816 	 * We should DEVI_IS_DETACHING under an enter of the parent to avoid
817 	 * race with detach - but we can't do that because the framework has
818 	 * already entered the parent, so we have some complexity instead.
819 	 */
820 	for (;;) {
821 		if (ndi_devi_tryenter(vdip, &vcircular)) {
822 			ASSERT(vcircular != -1);
823 			if (DEVI_IS_DETACHING(phci_dip)) {
824 				ndi_devi_exit(vdip, vcircular);
825 				vcircular = -1;
826 			}
827 			break;
828 		} else if (DEVI_IS_DETACHING(phci_dip)) {
829 			vcircular = -1;
830 			break;
831 		} else if (servicing_interrupt()) {
832 			/*
833 			 * Don't delay an interrupt (and ensure adaptive
834 			 * mutex inversion support).
835 			 */
836 			ndi_devi_enter(vdip, &vcircular);
837 			break;
838 		} else {
839 			delay_random(mdi_delay);
840 		}
841 	}
842 
843 	ndi_devi_enter(phci_dip, &pcircular);
844 	*circular = (vcircular << 16) | (pcircular & 0xFFFF);
845 }
846 
847 /*
848  * Attempt to mdi_devi_enter.
849  */
850 int
851 mdi_devi_tryenter(dev_info_t *phci_dip, int *circular)
852 {
853 	dev_info_t	*vdip;
854 	int		vcircular, pcircular;
855 
856 	/* Verify calling context */
857 	ASSERT(MDI_PHCI(phci_dip));
858 	vdip = mdi_devi_get_vdip(phci_dip);
859 	ASSERT(vdip);			/* A pHCI always has a vHCI */
860 
861 	if (ndi_devi_tryenter(vdip, &vcircular)) {
862 		if (ndi_devi_tryenter(phci_dip, &pcircular)) {
863 			*circular = (vcircular << 16) | (pcircular & 0xFFFF);
864 			return (1);	/* locked */
865 		}
866 		ndi_devi_exit(vdip, vcircular);
867 	}
868 	return (0);			/* busy */
869 }
870 
871 /*
872  * Release mdi_devi_enter or successful mdi_devi_tryenter.
873  */
874 void
875 mdi_devi_exit(dev_info_t *phci_dip, int circular)
876 {
877 	dev_info_t	*vdip;
878 	int		vcircular, pcircular;
879 
880 	/* Verify calling context */
881 	ASSERT(MDI_PHCI(phci_dip));
882 	vdip = mdi_devi_get_vdip(phci_dip);
883 	ASSERT(vdip);			/* A pHCI always has a vHCI */
884 
885 	/* extract two circular recursion values from single int */
886 	pcircular = (short)(circular & 0xFFFF);
887 	vcircular = (short)((circular >> 16) & 0xFFFF);
888 
889 	ndi_devi_exit(phci_dip, pcircular);
890 	if (vcircular != -1)
891 		ndi_devi_exit(vdip, vcircular);
892 }
893 
894 /*
895  * The functions mdi_devi_exit_phci() and mdi_devi_enter_phci() are used
896  * around a pHCI drivers calls to mdi_pi_online/offline, after holding
897  * the pathinfo node via mdi_hold_path/mdi_rele_path, to avoid deadlock
898  * with vHCI power management code during path online/offline.  Each
899  * mdi_devi_exit_phci must have a matching mdi_devi_enter_phci, and both must
900  * occur within the scope of an active mdi_devi_enter that establishes the
901  * circular value.
902  */
903 void
904 mdi_devi_exit_phci(dev_info_t *phci_dip, int circular)
905 {
906 	int		pcircular;
907 
908 	/* Verify calling context */
909 	ASSERT(MDI_PHCI(phci_dip));
910 
911 	/* Keep hold on pHCI until we reenter in mdi_devi_enter_phci */
912 	ndi_hold_devi(phci_dip);
913 
914 	pcircular = (short)(circular & 0xFFFF);
915 	ndi_devi_exit(phci_dip, pcircular);
916 }
917 
918 void
919 mdi_devi_enter_phci(dev_info_t *phci_dip, int *circular)
920 {
921 	int		pcircular;
922 
923 	/* Verify calling context */
924 	ASSERT(MDI_PHCI(phci_dip));
925 
926 	ndi_devi_enter(phci_dip, &pcircular);
927 
928 	/* Drop hold from mdi_devi_exit_phci. */
929 	ndi_rele_devi(phci_dip);
930 
931 	/* verify matching mdi_devi_exit_phci/mdi_devi_enter_phci use */
932 	ASSERT(pcircular == ((short)(*circular & 0xFFFF)));
933 }
934 
935 /*
936  * mdi_devi_get_vdip():
937  *		given a pHCI dip return vHCI dip
938  */
939 dev_info_t *
940 mdi_devi_get_vdip(dev_info_t *pdip)
941 {
942 	mdi_phci_t	*ph;
943 
944 	ph = i_devi_get_phci(pdip);
945 	if (ph && ph->ph_vhci)
946 		return (ph->ph_vhci->vh_dip);
947 	return (NULL);
948 }
949 
950 /*
951  * mdi_devi_pdip_entered():
952  *		Return 1 if we are vHCI and have done an ndi_devi_enter
953  *		of a pHCI
954  */
955 int
956 mdi_devi_pdip_entered(dev_info_t *vdip)
957 {
958 	mdi_vhci_t	*vh;
959 	mdi_phci_t	*ph;
960 
961 	vh = i_devi_get_vhci(vdip);
962 	if (vh == NULL)
963 		return (0);
964 
965 	MDI_VHCI_PHCI_LOCK(vh);
966 	ph = vh->vh_phci_head;
967 	while (ph) {
968 		if (ph->ph_dip && DEVI_BUSY_OWNED(ph->ph_dip)) {
969 			MDI_VHCI_PHCI_UNLOCK(vh);
970 			return (1);
971 		}
972 		ph = ph->ph_next;
973 	}
974 	MDI_VHCI_PHCI_UNLOCK(vh);
975 	return (0);
976 }
977 
978 /*
979  * mdi_phci_path2devinfo():
980  * 		Utility function to search for a valid phci device given
981  *		the devfs pathname.
982  */
983 dev_info_t *
984 mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
985 {
986 	char		*temp_pathname;
987 	mdi_vhci_t	*vh;
988 	mdi_phci_t	*ph;
989 	dev_info_t 	*pdip = NULL;
990 
991 	vh = i_devi_get_vhci(vdip);
992 	ASSERT(vh != NULL);
993 
994 	if (vh == NULL) {
995 		/*
996 		 * Invalid vHCI component, return failure
997 		 */
998 		return (NULL);
999 	}
1000 
1001 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1002 	MDI_VHCI_PHCI_LOCK(vh);
1003 	ph = vh->vh_phci_head;
1004 	while (ph != NULL) {
1005 		pdip = ph->ph_dip;
1006 		ASSERT(pdip != NULL);
1007 		*temp_pathname = '\0';
1008 		(void) ddi_pathname(pdip, temp_pathname);
1009 		if (strcmp(temp_pathname, pathname) == 0) {
1010 			break;
1011 		}
1012 		ph = ph->ph_next;
1013 	}
1014 	if (ph == NULL) {
1015 		pdip = NULL;
1016 	}
1017 	MDI_VHCI_PHCI_UNLOCK(vh);
1018 	kmem_free(temp_pathname, MAXPATHLEN);
1019 	return (pdip);
1020 }
1021 
1022 /*
1023  * mdi_phci_get_path_count():
1024  * 		get number of path information nodes associated with a given
1025  *		pHCI device.
1026  */
1027 int
1028 mdi_phci_get_path_count(dev_info_t *pdip)
1029 {
1030 	mdi_phci_t	*ph;
1031 	int		count = 0;
1032 
1033 	ph = i_devi_get_phci(pdip);
1034 	if (ph != NULL) {
1035 		count = ph->ph_path_count;
1036 	}
1037 	return (count);
1038 }
1039 
1040 /*
1041  * i_mdi_phci_lock():
1042  *		Lock a pHCI device
1043  * Return Values:
1044  *		None
1045  * Note:
1046  *		The default locking order is:
1047  *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
1048  *		But there are number of situations where locks need to be
1049  *		grabbed in reverse order.  This routine implements try and lock
1050  *		mechanism depending on the requested parameter option.
1051  */
1052 static void
1053 i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
1054 {
1055 	if (pip) {
1056 		/* Reverse locking is requested. */
1057 		while (MDI_PHCI_TRYLOCK(ph) == 0) {
1058 			if (servicing_interrupt()) {
1059 				MDI_PI_HOLD(pip);
1060 				MDI_PI_UNLOCK(pip);
1061 				MDI_PHCI_LOCK(ph);
1062 				MDI_PI_LOCK(pip);
1063 				MDI_PI_RELE(pip);
1064 				break;
1065 			} else {
1066 				/*
1067 				 * tryenter failed. Try to grab again
1068 				 * after a small delay
1069 				 */
1070 				MDI_PI_HOLD(pip);
1071 				MDI_PI_UNLOCK(pip);
1072 				delay_random(mdi_delay);
1073 				MDI_PI_LOCK(pip);
1074 				MDI_PI_RELE(pip);
1075 			}
1076 		}
1077 	} else {
1078 		MDI_PHCI_LOCK(ph);
1079 	}
1080 }
1081 
1082 /*
1083  * i_mdi_phci_unlock():
1084  *		Unlock the pHCI component
1085  */
1086 static void
1087 i_mdi_phci_unlock(mdi_phci_t *ph)
1088 {
1089 	MDI_PHCI_UNLOCK(ph);
1090 }
1091 
1092 /*
1093  * i_mdi_devinfo_create():
1094  *		create client device's devinfo node
1095  * Return Values:
1096  *		dev_info
1097  *		NULL
1098  * Notes:
1099  */
1100 static dev_info_t *
1101 i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
1102 	char **compatible, int ncompatible)
1103 {
1104 	dev_info_t *cdip = NULL;
1105 
1106 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1107 
1108 	/* Verify for duplicate entry */
1109 	cdip = i_mdi_devinfo_find(vh, name, guid);
1110 	ASSERT(cdip == NULL);
1111 	if (cdip) {
1112 		cmn_err(CE_WARN,
1113 		    "i_mdi_devinfo_create: client %s@%s already exists",
1114 			name ? name : "", guid ? guid : "");
1115 	}
1116 
1117 	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
1118 	if (cdip == NULL)
1119 		goto fail;
1120 
1121 	/*
1122 	 * Create component type and Global unique identifier
1123 	 * properties
1124 	 */
1125 	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
1126 	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
1127 		goto fail;
1128 	}
1129 
1130 	/* Decorate the node with compatible property */
1131 	if (compatible &&
1132 	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
1133 	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
1134 		goto fail;
1135 	}
1136 
1137 	return (cdip);
1138 
1139 fail:
1140 	if (cdip) {
1141 		(void) ndi_prop_remove_all(cdip);
1142 		(void) ndi_devi_free(cdip);
1143 	}
1144 	return (NULL);
1145 }
1146 
1147 /*
1148  * i_mdi_devinfo_find():
1149  *		Find a matching devinfo node for given client node name
1150  *		and its guid.
1151  * Return Values:
1152  *		Handle to a dev_info node or NULL
1153  */
1154 static dev_info_t *
1155 i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
1156 {
1157 	char			*data;
1158 	dev_info_t 		*cdip = NULL;
1159 	dev_info_t 		*ndip = NULL;
1160 	int			circular;
1161 
1162 	ndi_devi_enter(vh->vh_dip, &circular);
1163 	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
1164 	while ((cdip = ndip) != NULL) {
1165 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1166 
1167 		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
1168 			continue;
1169 		}
1170 
1171 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
1172 		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
1173 		    &data) != DDI_PROP_SUCCESS) {
1174 			continue;
1175 		}
1176 
1177 		if (strcmp(data, guid) != 0) {
1178 			ddi_prop_free(data);
1179 			continue;
1180 		}
1181 		ddi_prop_free(data);
1182 		break;
1183 	}
1184 	ndi_devi_exit(vh->vh_dip, circular);
1185 	return (cdip);
1186 }
1187 
1188 /*
1189  * i_mdi_devinfo_remove():
1190  *		Remove a client device node
1191  */
1192 static int
1193 i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
1194 {
1195 	int	rv = MDI_SUCCESS;
1196 
1197 	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
1198 	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
1199 		rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN | NDI_DEVI_REMOVE);
1200 		if (rv != NDI_SUCCESS) {
1201 			MDI_DEBUG(1, (MDI_NOTE, cdip,
1202 			    "!failed: cdip %p", (void *)cdip));
1203 		}
1204 		/*
1205 		 * Convert to MDI error code
1206 		 */
1207 		switch (rv) {
1208 		case NDI_SUCCESS:
1209 			rv = MDI_SUCCESS;
1210 			break;
1211 		case NDI_BUSY:
1212 			rv = MDI_BUSY;
1213 			break;
1214 		default:
1215 			rv = MDI_FAILURE;
1216 			break;
1217 		}
1218 	}
1219 	return (rv);
1220 }
1221 
1222 /*
1223  * i_devi_get_client()
1224  *		Utility function to get mpxio component extensions
1225  */
1226 static mdi_client_t *
1227 i_devi_get_client(dev_info_t *cdip)
1228 {
1229 	mdi_client_t	*ct = NULL;
1230 
1231 	if (MDI_CLIENT(cdip)) {
1232 		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
1233 	}
1234 	return (ct);
1235 }
1236 
1237 /*
1238  * i_mdi_is_child_present():
1239  *		Search for the presence of client device dev_info node
1240  */
1241 static int
1242 i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
1243 {
1244 	int		rv = MDI_FAILURE;
1245 	struct dev_info	*dip;
1246 	int		circular;
1247 
1248 	ndi_devi_enter(vdip, &circular);
1249 	dip = DEVI(vdip)->devi_child;
1250 	while (dip) {
1251 		if (dip == DEVI(cdip)) {
1252 			rv = MDI_SUCCESS;
1253 			break;
1254 		}
1255 		dip = dip->devi_sibling;
1256 	}
1257 	ndi_devi_exit(vdip, circular);
1258 	return (rv);
1259 }
1260 
1261 
1262 /*
1263  * i_mdi_client_lock():
1264  *		Grab client component lock
1265  * Return Values:
1266  *		None
1267  * Note:
1268  *		The default locking order is:
1269  *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
1270  *		But there are number of situations where locks need to be
1271  *		grabbed in reverse order.  This routine implements try and lock
1272  *		mechanism depending on the requested parameter option.
1273  */
1274 static void
1275 i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
1276 {
1277 	if (pip) {
1278 		/*
1279 		 * Reverse locking is requested.
1280 		 */
1281 		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
1282 			if (servicing_interrupt()) {
1283 				MDI_PI_HOLD(pip);
1284 				MDI_PI_UNLOCK(pip);
1285 				MDI_CLIENT_LOCK(ct);
1286 				MDI_PI_LOCK(pip);
1287 				MDI_PI_RELE(pip);
1288 				break;
1289 			} else {
1290 				/*
1291 				 * tryenter failed. Try to grab again
1292 				 * after a small delay
1293 				 */
1294 				MDI_PI_HOLD(pip);
1295 				MDI_PI_UNLOCK(pip);
1296 				delay_random(mdi_delay);
1297 				MDI_PI_LOCK(pip);
1298 				MDI_PI_RELE(pip);
1299 			}
1300 		}
1301 	} else {
1302 		MDI_CLIENT_LOCK(ct);
1303 	}
1304 }
1305 
1306 /*
1307  * i_mdi_client_unlock():
1308  *		Unlock a client component
1309  */
1310 static void
1311 i_mdi_client_unlock(mdi_client_t *ct)
1312 {
1313 	MDI_CLIENT_UNLOCK(ct);
1314 }
1315 
1316 /*
1317  * i_mdi_client_alloc():
1318  * 		Allocate and initialize a client structure.  Caller should
1319  *		hold the vhci client lock.
1320  * Return Values:
1321  *		Handle to a client component
1322  */
1323 /*ARGSUSED*/
1324 static mdi_client_t *
1325 i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
1326 {
1327 	mdi_client_t	*ct;
1328 
1329 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1330 
1331 	/*
1332 	 * Allocate and initialize a component structure.
1333 	 */
1334 	ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
1335 	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1336 	ct->ct_hnext = NULL;
1337 	ct->ct_hprev = NULL;
1338 	ct->ct_dip = NULL;
1339 	ct->ct_vhci = vh;
1340 	ct->ct_drvname = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1341 	(void) strcpy(ct->ct_drvname, name);
1342 	ct->ct_guid = kmem_alloc(strlen(lguid) + 1, KM_SLEEP);
1343 	(void) strcpy(ct->ct_guid, lguid);
1344 	ct->ct_cprivate = NULL;
1345 	ct->ct_vprivate = NULL;
1346 	ct->ct_flags = 0;
1347 	ct->ct_state = MDI_CLIENT_STATE_FAILED;
1348 	MDI_CLIENT_LOCK(ct);
1349 	MDI_CLIENT_SET_OFFLINE(ct);
1350 	MDI_CLIENT_SET_DETACH(ct);
1351 	MDI_CLIENT_SET_POWER_UP(ct);
1352 	MDI_CLIENT_UNLOCK(ct);
1353 	ct->ct_failover_flags = 0;
1354 	ct->ct_failover_status = 0;
1355 	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
1356 	ct->ct_unstable = 0;
1357 	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
1358 	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
1359 	ct->ct_lb = vh->vh_lb;
1360 	ct->ct_lb_args =  kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
1361 	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
1362 	ct->ct_path_count = 0;
1363 	ct->ct_path_head = NULL;
1364 	ct->ct_path_tail = NULL;
1365 	ct->ct_path_last = NULL;
1366 
1367 	/*
1368 	 * Add this client component to our client hash queue
1369 	 */
1370 	i_mdi_client_enlist_table(vh, ct);
1371 	return (ct);
1372 }
1373 
1374 /*
1375  * i_mdi_client_enlist_table():
1376  *		Attach the client device to the client hash table. Caller
1377  *		should hold the vhci client lock.
1378  */
1379 static void
1380 i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1381 {
1382 	int 			index;
1383 	struct client_hash	*head;
1384 
1385 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1386 
1387 	index = i_mdi_get_hash_key(ct->ct_guid);
1388 	head = &vh->vh_client_table[index];
1389 	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
1390 	head->ct_hash_head = ct;
1391 	head->ct_hash_count++;
1392 	vh->vh_client_count++;
1393 }
1394 
1395 /*
1396  * i_mdi_client_delist_table():
1397  *		Attach the client device to the client hash table.
1398  *		Caller should hold the vhci client lock.
1399  */
1400 static void
1401 i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1402 {
1403 	int			index;
1404 	char			*guid;
1405 	struct client_hash 	*head;
1406 	mdi_client_t		*next;
1407 	mdi_client_t		*last;
1408 
1409 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1410 
1411 	guid = ct->ct_guid;
1412 	index = i_mdi_get_hash_key(guid);
1413 	head = &vh->vh_client_table[index];
1414 
1415 	last = NULL;
1416 	next = (mdi_client_t *)head->ct_hash_head;
1417 	while (next != NULL) {
1418 		if (next == ct) {
1419 			break;
1420 		}
1421 		last = next;
1422 		next = next->ct_hnext;
1423 	}
1424 
1425 	if (next) {
1426 		head->ct_hash_count--;
1427 		if (last == NULL) {
1428 			head->ct_hash_head = ct->ct_hnext;
1429 		} else {
1430 			last->ct_hnext = ct->ct_hnext;
1431 		}
1432 		ct->ct_hnext = NULL;
1433 		vh->vh_client_count--;
1434 	}
1435 }
1436 
1437 
1438 /*
1439  * i_mdi_client_free():
1440  *		Free a client component
1441  */
1442 static int
1443 i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
1444 {
1445 	int		rv = MDI_SUCCESS;
1446 	int		flags = ct->ct_flags;
1447 	dev_info_t	*cdip;
1448 	dev_info_t	*vdip;
1449 
1450 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1451 
1452 	vdip = vh->vh_dip;
1453 	cdip = ct->ct_dip;
1454 
1455 	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
1456 	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
1457 	DEVI(cdip)->devi_mdi_client = NULL;
1458 
1459 	/*
1460 	 * Clear out back ref. to dev_info_t node
1461 	 */
1462 	ct->ct_dip = NULL;
1463 
1464 	/*
1465 	 * Remove this client from our hash queue
1466 	 */
1467 	i_mdi_client_delist_table(vh, ct);
1468 
1469 	/*
1470 	 * Uninitialize and free the component
1471 	 */
1472 	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
1473 	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
1474 	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
1475 	cv_destroy(&ct->ct_failover_cv);
1476 	cv_destroy(&ct->ct_unstable_cv);
1477 	cv_destroy(&ct->ct_powerchange_cv);
1478 	mutex_destroy(&ct->ct_mutex);
1479 	kmem_free(ct, sizeof (*ct));
1480 
1481 	if (cdip != NULL) {
1482 		MDI_VHCI_CLIENT_UNLOCK(vh);
1483 		(void) i_mdi_devinfo_remove(vdip, cdip, flags);
1484 		MDI_VHCI_CLIENT_LOCK(vh);
1485 	}
1486 	return (rv);
1487 }
1488 
1489 /*
1490  * i_mdi_client_find():
1491  * 		Find the client structure corresponding to a given guid
1492  *		Caller should hold the vhci client lock.
1493  */
1494 static mdi_client_t *
1495 i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
1496 {
1497 	int			index;
1498 	struct client_hash	*head;
1499 	mdi_client_t		*ct;
1500 
1501 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1502 
1503 	index = i_mdi_get_hash_key(guid);
1504 	head = &vh->vh_client_table[index];
1505 
1506 	ct = head->ct_hash_head;
1507 	while (ct != NULL) {
1508 		if (strcmp(ct->ct_guid, guid) == 0 &&
1509 		    (cname == NULL || strcmp(ct->ct_drvname, cname) == 0)) {
1510 			break;
1511 		}
1512 		ct = ct->ct_hnext;
1513 	}
1514 	return (ct);
1515 }
1516 
1517 /*
1518  * i_mdi_client_update_state():
1519  *		Compute and update client device state
1520  * Notes:
1521  *		A client device can be in any of three possible states:
1522  *
1523  *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
1524  *		one online/standby paths. Can tolerate failures.
1525  *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
1526  *		no alternate paths available as standby. A failure on the online
1527  *		would result in loss of access to device data.
1528  *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
1529  *		no paths available to access the device.
1530  */
1531 static void
1532 i_mdi_client_update_state(mdi_client_t *ct)
1533 {
1534 	int state;
1535 
1536 	ASSERT(MDI_CLIENT_LOCKED(ct));
1537 	state = i_mdi_client_compute_state(ct, NULL);
1538 	MDI_CLIENT_SET_STATE(ct, state);
1539 }
1540 
1541 /*
1542  * i_mdi_client_compute_state():
1543  *		Compute client device state
1544  *
1545  *		mdi_phci_t *	Pointer to pHCI structure which should
1546  *				while computing the new value.  Used by
1547  *				i_mdi_phci_offline() to find the new
1548  *				client state after DR of a pHCI.
1549  */
1550 static int
1551 i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
1552 {
1553 	int		state;
1554 	int		online_count = 0;
1555 	int		standby_count = 0;
1556 	mdi_pathinfo_t	*pip, *next;
1557 
1558 	ASSERT(MDI_CLIENT_LOCKED(ct));
1559 	pip = ct->ct_path_head;
1560 	while (pip != NULL) {
1561 		MDI_PI_LOCK(pip);
1562 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1563 		if (MDI_PI(pip)->pi_phci == ph) {
1564 			MDI_PI_UNLOCK(pip);
1565 			pip = next;
1566 			continue;
1567 		}
1568 
1569 		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1570 				== MDI_PATHINFO_STATE_ONLINE)
1571 			online_count++;
1572 		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1573 				== MDI_PATHINFO_STATE_STANDBY)
1574 			standby_count++;
1575 		MDI_PI_UNLOCK(pip);
1576 		pip = next;
1577 	}
1578 
1579 	if (online_count == 0) {
1580 		if (standby_count == 0) {
1581 			state = MDI_CLIENT_STATE_FAILED;
1582 			MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
1583 			    "client state failed: ct = %p", (void *)ct));
1584 		} else if (standby_count == 1) {
1585 			state = MDI_CLIENT_STATE_DEGRADED;
1586 		} else {
1587 			state = MDI_CLIENT_STATE_OPTIMAL;
1588 		}
1589 	} else if (online_count == 1) {
1590 		if (standby_count == 0) {
1591 			state = MDI_CLIENT_STATE_DEGRADED;
1592 		} else {
1593 			state = MDI_CLIENT_STATE_OPTIMAL;
1594 		}
1595 	} else {
1596 		state = MDI_CLIENT_STATE_OPTIMAL;
1597 	}
1598 	return (state);
1599 }
1600 
1601 /*
1602  * i_mdi_client2devinfo():
1603  *		Utility function
1604  */
1605 dev_info_t *
1606 i_mdi_client2devinfo(mdi_client_t *ct)
1607 {
1608 	return (ct->ct_dip);
1609 }
1610 
1611 /*
1612  * mdi_client_path2_devinfo():
1613  * 		Given the parent devinfo and child devfs pathname, search for
1614  *		a valid devfs node handle.
1615  */
1616 dev_info_t *
1617 mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
1618 {
1619 	dev_info_t 	*cdip = NULL;
1620 	dev_info_t 	*ndip = NULL;
1621 	char		*temp_pathname;
1622 	int		circular;
1623 
1624 	/*
1625 	 * Allocate temp buffer
1626 	 */
1627 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1628 
1629 	/*
1630 	 * Lock parent against changes
1631 	 */
1632 	ndi_devi_enter(vdip, &circular);
1633 	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
1634 	while ((cdip = ndip) != NULL) {
1635 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1636 
1637 		*temp_pathname = '\0';
1638 		(void) ddi_pathname(cdip, temp_pathname);
1639 		if (strcmp(temp_pathname, pathname) == 0) {
1640 			break;
1641 		}
1642 	}
1643 	/*
1644 	 * Release devinfo lock
1645 	 */
1646 	ndi_devi_exit(vdip, circular);
1647 
1648 	/*
1649 	 * Free the temp buffer
1650 	 */
1651 	kmem_free(temp_pathname, MAXPATHLEN);
1652 	return (cdip);
1653 }
1654 
1655 /*
1656  * mdi_client_get_path_count():
1657  * 		Utility function to get number of path information nodes
1658  *		associated with a given client device.
1659  */
1660 int
1661 mdi_client_get_path_count(dev_info_t *cdip)
1662 {
1663 	mdi_client_t	*ct;
1664 	int		count = 0;
1665 
1666 	ct = i_devi_get_client(cdip);
1667 	if (ct != NULL) {
1668 		count = ct->ct_path_count;
1669 	}
1670 	return (count);
1671 }
1672 
1673 
1674 /*
1675  * i_mdi_get_hash_key():
1676  * 		Create a hash using strings as keys
1677  *
1678  */
1679 static int
1680 i_mdi_get_hash_key(char *str)
1681 {
1682 	uint32_t	g, hash = 0;
1683 	char		*p;
1684 
1685 	for (p = str; *p != '\0'; p++) {
1686 		g = *p;
1687 		hash += g;
1688 	}
1689 	return (hash % (CLIENT_HASH_TABLE_SIZE - 1));
1690 }
1691 
1692 /*
1693  * mdi_get_lb_policy():
1694  * 		Get current load balancing policy for a given client device
1695  */
1696 client_lb_t
1697 mdi_get_lb_policy(dev_info_t *cdip)
1698 {
1699 	client_lb_t	lb = LOAD_BALANCE_NONE;
1700 	mdi_client_t	*ct;
1701 
1702 	ct = i_devi_get_client(cdip);
1703 	if (ct != NULL) {
1704 		lb = ct->ct_lb;
1705 	}
1706 	return (lb);
1707 }
1708 
1709 /*
1710  * mdi_set_lb_region_size():
1711  * 		Set current region size for the load-balance
1712  */
1713 int
1714 mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
1715 {
1716 	mdi_client_t	*ct;
1717 	int		rv = MDI_FAILURE;
1718 
1719 	ct = i_devi_get_client(cdip);
1720 	if (ct != NULL && ct->ct_lb_args != NULL) {
1721 		ct->ct_lb_args->region_size = region_size;
1722 		rv = MDI_SUCCESS;
1723 	}
1724 	return (rv);
1725 }
1726 
1727 /*
1728  * mdi_Set_lb_policy():
1729  * 		Set current load balancing policy for a given client device
1730  */
1731 int
1732 mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
1733 {
1734 	mdi_client_t	*ct;
1735 	int		rv = MDI_FAILURE;
1736 
1737 	ct = i_devi_get_client(cdip);
1738 	if (ct != NULL) {
1739 		ct->ct_lb = lb;
1740 		rv = MDI_SUCCESS;
1741 	}
1742 	return (rv);
1743 }
1744 
1745 /*
1746  * mdi_failover():
1747  *		failover function called by the vHCI drivers to initiate
1748  *		a failover operation.  This is typically due to non-availability
1749  *		of online paths to route I/O requests.  Failover can be
1750  *		triggered through user application also.
1751  *
1752  *		The vHCI driver calls mdi_failover() to initiate a failover
1753  *		operation. mdi_failover() calls back into the vHCI driver's
1754  *		vo_failover() entry point to perform the actual failover
1755  *		operation.  The reason for requiring the vHCI driver to
1756  *		initiate failover by calling mdi_failover(), instead of directly
1757  *		executing vo_failover() itself, is to ensure that the mdi
1758  *		framework can keep track of the client state properly.
1759  *		Additionally, mdi_failover() provides as a convenience the
1760  *		option of performing the failover operation synchronously or
1761  *		asynchronously
1762  *
1763  *		Upon successful completion of the failover operation, the
1764  *		paths that were previously ONLINE will be in the STANDBY state,
1765  *		and the newly activated paths will be in the ONLINE state.
1766  *
1767  *		The flags modifier determines whether the activation is done
1768  *		synchronously: MDI_FAILOVER_SYNC
1769  * Return Values:
1770  *		MDI_SUCCESS
1771  *		MDI_FAILURE
1772  *		MDI_BUSY
1773  */
1774 /*ARGSUSED*/
1775 int
1776 mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
1777 {
1778 	int			rv;
1779 	mdi_client_t		*ct;
1780 
1781 	ct = i_devi_get_client(cdip);
1782 	ASSERT(ct != NULL);
1783 	if (ct == NULL) {
1784 		/* cdip is not a valid client device. Nothing more to do. */
1785 		return (MDI_FAILURE);
1786 	}
1787 
1788 	MDI_CLIENT_LOCK(ct);
1789 
1790 	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
1791 		/* A path to the client is being freed */
1792 		MDI_CLIENT_UNLOCK(ct);
1793 		return (MDI_BUSY);
1794 	}
1795 
1796 
1797 	if (MDI_CLIENT_IS_FAILED(ct)) {
1798 		/*
1799 		 * Client is in failed state. Nothing more to do.
1800 		 */
1801 		MDI_CLIENT_UNLOCK(ct);
1802 		return (MDI_FAILURE);
1803 	}
1804 
1805 	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1806 		/*
1807 		 * Failover is already in progress; return BUSY
1808 		 */
1809 		MDI_CLIENT_UNLOCK(ct);
1810 		return (MDI_BUSY);
1811 	}
1812 	/*
1813 	 * Make sure that mdi_pathinfo node state changes are processed.
1814 	 * We do not allow failovers to progress while client path state
1815 	 * changes are in progress
1816 	 */
1817 	if (ct->ct_unstable) {
1818 		if (flags == MDI_FAILOVER_ASYNC) {
1819 			MDI_CLIENT_UNLOCK(ct);
1820 			return (MDI_BUSY);
1821 		} else {
1822 			while (ct->ct_unstable)
1823 				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
1824 		}
1825 	}
1826 
1827 	/*
1828 	 * Client device is in stable state. Before proceeding, perform sanity
1829 	 * checks again.
1830 	 */
1831 	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
1832 	    (!i_ddi_devi_attached(ct->ct_dip))) {
1833 		/*
1834 		 * Client is in failed state. Nothing more to do.
1835 		 */
1836 		MDI_CLIENT_UNLOCK(ct);
1837 		return (MDI_FAILURE);
1838 	}
1839 
1840 	/*
1841 	 * Set the client state as failover in progress.
1842 	 */
1843 	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
1844 	ct->ct_failover_flags = flags;
1845 	MDI_CLIENT_UNLOCK(ct);
1846 
1847 	if (flags == MDI_FAILOVER_ASYNC) {
1848 		/*
1849 		 * Submit the initiate failover request via CPR safe
1850 		 * taskq threads.
1851 		 */
1852 		(void) taskq_dispatch(mdi_taskq, (task_func_t *)i_mdi_failover,
1853 		    ct, KM_SLEEP);
1854 		return (MDI_ACCEPT);
1855 	} else {
1856 		/*
1857 		 * Synchronous failover mode.  Typically invoked from the user
1858 		 * land.
1859 		 */
1860 		rv = i_mdi_failover(ct);
1861 	}
1862 	return (rv);
1863 }
1864 
1865 /*
1866  * i_mdi_failover():
1867  *		internal failover function. Invokes vHCI drivers failover
1868  *		callback function and process the failover status
1869  * Return Values:
1870  *		None
1871  *
1872  * Note: A client device in failover state can not be detached or freed.
1873  */
1874 static int
1875 i_mdi_failover(void *arg)
1876 {
1877 	int		rv = MDI_SUCCESS;
1878 	mdi_client_t	*ct = (mdi_client_t *)arg;
1879 	mdi_vhci_t	*vh = ct->ct_vhci;
1880 
1881 	ASSERT(!MDI_CLIENT_LOCKED(ct));
1882 
1883 	if (vh->vh_ops->vo_failover != NULL) {
1884 		/*
1885 		 * Call vHCI drivers callback routine
1886 		 */
1887 		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
1888 		    ct->ct_failover_flags);
1889 	}
1890 
1891 	MDI_CLIENT_LOCK(ct);
1892 	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);
1893 
1894 	/*
1895 	 * Save the failover return status
1896 	 */
1897 	ct->ct_failover_status = rv;
1898 
1899 	/*
1900 	 * As a result of failover, client status would have been changed.
1901 	 * Update the client state and wake up anyone waiting on this client
1902 	 * device.
1903 	 */
1904 	i_mdi_client_update_state(ct);
1905 
1906 	cv_broadcast(&ct->ct_failover_cv);
1907 	MDI_CLIENT_UNLOCK(ct);
1908 	return (rv);
1909 }
1910 
1911 /*
1912  * Load balancing is logical block.
1913  * IOs within the range described by region_size
1914  * would go on the same path. This would improve the
1915  * performance by cache-hit on some of the RAID devices.
1916  * Search only for online paths(At some point we
1917  * may want to balance across target ports).
1918  * If no paths are found then default to round-robin.
1919  */
1920 static int
1921 i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
1922 {
1923 	int		path_index = -1;
1924 	int		online_path_count = 0;
1925 	int		online_nonpref_path_count = 0;
1926 	int 		region_size = ct->ct_lb_args->region_size;
1927 	mdi_pathinfo_t	*pip;
1928 	mdi_pathinfo_t	*next;
1929 	int		preferred, path_cnt;
1930 
1931 	pip = ct->ct_path_head;
1932 	while (pip) {
1933 		MDI_PI_LOCK(pip);
1934 		if (MDI_PI(pip)->pi_state ==
1935 		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
1936 			online_path_count++;
1937 		} else if (MDI_PI(pip)->pi_state ==
1938 		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
1939 			online_nonpref_path_count++;
1940 		}
1941 		next = (mdi_pathinfo_t *)
1942 		    MDI_PI(pip)->pi_client_link;
1943 		MDI_PI_UNLOCK(pip);
1944 		pip = next;
1945 	}
1946 	/* if found any online/preferred then use this type */
1947 	if (online_path_count > 0) {
1948 		path_cnt = online_path_count;
1949 		preferred = 1;
1950 	} else if (online_nonpref_path_count > 0) {
1951 		path_cnt = online_nonpref_path_count;
1952 		preferred = 0;
1953 	} else {
1954 		path_cnt = 0;
1955 	}
1956 	if (path_cnt) {
1957 		path_index = (bp->b_blkno >> region_size) % path_cnt;
1958 		pip = ct->ct_path_head;
1959 		while (pip && path_index != -1) {
1960 			MDI_PI_LOCK(pip);
1961 			if (path_index == 0 &&
1962 			    (MDI_PI(pip)->pi_state ==
1963 			    MDI_PATHINFO_STATE_ONLINE) &&
1964 				MDI_PI(pip)->pi_preferred == preferred) {
1965 				MDI_PI_HOLD(pip);
1966 				MDI_PI_UNLOCK(pip);
1967 				*ret_pip = pip;
1968 				return (MDI_SUCCESS);
1969 			}
1970 			path_index --;
1971 			next = (mdi_pathinfo_t *)
1972 			    MDI_PI(pip)->pi_client_link;
1973 			MDI_PI_UNLOCK(pip);
1974 			pip = next;
1975 		}
1976 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
1977 		    "lba %llx: path %s %p",
1978 		    bp->b_lblkno, mdi_pi_spathname(pip), (void *)pip));
1979 	}
1980 	return (MDI_FAILURE);
1981 }
1982 
1983 /*
1984  * mdi_select_path():
1985  *		select a path to access a client device.
1986  *
1987  *		mdi_select_path() function is called by the vHCI drivers to
1988  *		select a path to route the I/O request to.  The caller passes
1989  *		the block I/O data transfer structure ("buf") as one of the
1990  *		parameters.  The mpxio framework uses the buf structure
1991  *		contents to maintain per path statistics (total I/O size /
1992  *		count pending).  If more than one online paths are available to
1993  *		select, the framework automatically selects a suitable path
1994  *		for routing I/O request. If a failover operation is active for
1995  *		this client device the call shall be failed with MDI_BUSY error
1996  *		code.
1997  *
1998  *		By default this function returns a suitable path in online
1999  *		state based on the current load balancing policy.  Currently
2000  *		we support LOAD_BALANCE_NONE (Previously selected online path
2001  *		will continue to be used till the path is usable) and
2002  *		LOAD_BALANCE_RR (Online paths will be selected in a round
2003  *		robin fashion), LOAD_BALANCE_LB(Online paths will be selected
2004  *		based on the logical block).  The load balancing policy is
2005  *		set through the vHCI driver's configuration file (driver.conf).
2006  *
2007  *		vHCI drivers may override this default behavior by specifying
2008  *		appropriate flags.  The meaning of the third argument depends
2009  *		on the flags specified. If MDI_SELECT_PATH_INSTANCE is set
2010  *		then the argument is the "path instance" of the path to select.
2011  *		If MDI_SELECT_PATH_INSTANCE is not set then the argument is
2012  *		"start_pip". A non NULL "start_pip" is the starting point to
2013  *		walk and find the next appropriate path.  The following values
2014  *		are currently defined: MDI_SELECT_ONLINE_PATH (to select an
2015  *		ONLINE path) and/or MDI_SELECT_STANDBY_PATH (to select an
2016  *		STANDBY path).
2017  *
2018  *		The non-standard behavior is used by the scsi_vhci driver,
2019  *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
2020  *		attach of client devices (to avoid an unnecessary failover
2021  *		when the STANDBY path comes up first), during failover
2022  *		(to activate a STANDBY path as ONLINE).
2023  *
2024  *		The selected path is returned in a mdi_hold_path() state
2025  *		(pi_ref_cnt). Caller should release the hold by calling
2026  *		mdi_rele_path().
2027  *
2028  * Return Values:
2029  *		MDI_SUCCESS	- Completed successfully
2030  *		MDI_BUSY 	- Client device is busy failing over
2031  *		MDI_NOPATH	- Client device is online, but no valid path are
2032  *				  available to access this client device
2033  *		MDI_FAILURE	- Invalid client device or state
2034  *		MDI_DEVI_ONLINING
2035  *				- Client device (struct dev_info state) is in
2036  *				  onlining state.
2037  */
2038 
/*ARGSUSED*/
int
mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
    void *arg, mdi_pathinfo_t **ret_pip)
{
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	mdi_pathinfo_t	*head;
	mdi_pathinfo_t	*start;
	client_lb_t	lbp;	/* load balancing policy */
	int		sb = 1;	/* standard behavior */
	int		preferred = 1;	/* preferred path */
	/*
	 * cond: nonzero when the path being examined satisfies the selection
	 *	 criteria and should be returned.
	 * cont: cleared to terminate a list walk.
	 */
	int		cond, cont = 1;
	/*
	 * retry: set when a path that was skipped during a walk was found to
	 *	  be in a transient state; turns a final MDI_NOPATH into
	 *	  MDI_BUSY.
	 */
	int		retry = 0;
	mdi_pathinfo_t	*start_pip;	/* request starting pathinfo */
	int		path_instance;	/* request specific path instance */

	/* determine type of arg based on flags */
	if (flags & MDI_SELECT_PATH_INSTANCE) {
		path_instance = (int)(intptr_t)arg;
		start_pip = NULL;
	} else {
		path_instance = 0;
		start_pip = (mdi_pathinfo_t *)arg;
	}

	/*
	 * Any flag at all (including MDI_SELECT_PATH_INSTANCE) switches off
	 * standard behavior: the client state checks below are skipped and
	 * the client's own load balancing policy is not consulted.
	 */
	if (flags != 0) {
		/*
		 * disable default behavior
		 */
		sb = 0;
	}

	*ret_pip = NULL;
	ct = i_devi_get_client(cdip);
	if (ct == NULL) {
		/* mdi extensions are NULL, Nothing more to do */
		return (MDI_FAILURE);
	}

	/*
	 * The client lock is held from here until every return below; each
	 * exit path drops it explicitly.
	 */
	MDI_CLIENT_LOCK(ct);

	if (sb) {
		if (MDI_CLIENT_IS_FAILED(ct)) {
			/*
			 * Client is not ready to accept any I/O requests.
			 * Fail this request.
			 */
			MDI_DEBUG(2, (MDI_NOTE, cdip,
			    "client state offline ct = %p", (void *)ct));
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_FAILURE);
		}

		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
			/*
			 * Check for Failover is in progress. If so tell the
			 * caller that this device is busy.
			 */
			MDI_DEBUG(2, (MDI_NOTE, cdip,
			    "client failover in progress ct = %p",
			    (void *)ct));
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_BUSY);
		}

		/*
		 * Check to see whether the client device is attached.
		 * If not so, let the vHCI driver manually select a path
		 * (standby) and let the probe/attach process to continue.
		 */
		if (MDI_CLIENT_IS_DETACHED(ct) || !i_ddi_devi_attached(cdip)) {
			MDI_DEBUG(4, (MDI_NOTE, cdip,
			    "devi is onlining ct = %p", (void *)ct));
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_DEVI_ONLINING);
		}
	}

	/*
	 * Cache in the client list head.  If head of the list is NULL
	 * return MDI_NOPATH
	 */
	head = ct->ct_path_head;
	if (head == NULL) {
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_NOPATH);
	}

	/*
	 * Caller is specifying a specific pathinfo path by path_instance.
	 * A path_instance of 0 is treated as "no specific instance" and
	 * falls through to the policy based selection below.
	 */
	if (path_instance) {
		/* search for pathinfo with correct path_instance */
		for (pip = head;
		    pip && (mdi_pi_get_path_instance(pip) != path_instance);
		    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link)
			;

		/* If path can't be selected then MDI_NOPATH is returned. */
		if (pip == NULL) {
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_NOPATH);
		}

		/*
		 * Verify state of path. When asked to select a specific
		 * path_instance, we select the requested path in any
		 * state (ONLINE, OFFLINE, STANDBY, FAULT) other than INIT.
		 * We don't however select paths where the pHCI has detached.
		 * NOTE: last pathinfo node of an opened client device may
		 * exist in an OFFLINE state after the pHCI associated with
		 * that path has detached (but pi_phci will be NULL if that
		 * has occurred).
		 */
		MDI_PI_LOCK(pip);
		if ((MDI_PI(pip)->pi_state == MDI_PATHINFO_STATE_INIT) ||
		    (MDI_PI(pip)->pi_phci == NULL)) {
			MDI_PI_UNLOCK(pip);
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_FAILURE);
		}

		/* Return MDI_BUSY if we have a transient condition */
		if (MDI_PI_IS_TRANSIENT(pip)) {
			MDI_PI_UNLOCK(pip);
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_BUSY);
		}

		/*
		 * Return the path in hold state. Caller should release the
		 * lock by calling mdi_rele_path()
		 */
		MDI_PI_HOLD(pip);
		MDI_PI_UNLOCK(pip);
		*ret_pip = pip;
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_SUCCESS);
	}

	/*
	 * for non default behavior, bypass current
	 * load balancing policy and always use LOAD_BALANCE_RR
	 * except that the start point will be adjusted based
	 * on the provided start_pip
	 */
	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;

	switch (lbp) {
	case LOAD_BALANCE_NONE:
		/*
		 * Load balancing is None  or Alternate path mode
		 * Start looking for a online mdi_pathinfo node starting from
		 * last known selected path
		 */
		preferred = 1;
		pip = (mdi_pathinfo_t *)ct->ct_path_last;
		if (pip == NULL) {
			pip = head;
		}
		start = pip;
		/*
		 * Circular walk beginning at 'start'.  The first lap accepts
		 * only ONLINE paths with pi_preferred == 1.  When the walk
		 * wraps back to 'start', 'preferred' drops to 0 for a second
		 * lap over the non-preferred paths; wrapping a second time
		 * clears 'cont' and ends the search.
		 */
		do {
			MDI_PI_LOCK(pip);
			/*
			 * No need to explicitly check if the path is disabled.
			 * Since we are checking for state == ONLINE and the
			 * same variable is used for DISABLE/ENABLE information.
			 */
			if ((MDI_PI(pip)->pi_state  ==
				MDI_PATHINFO_STATE_ONLINE) &&
				preferred == MDI_PI(pip)->pi_preferred) {
				/*
				 * Return the path in hold state. Caller should
				 * release the lock by calling mdi_rele_path()
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				ct->ct_path_last = pip;
				*ret_pip = pip;
				MDI_CLIENT_UNLOCK(ct);
				return (MDI_SUCCESS);
			}

			/*
			 * Path is busy.
			 */
			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
			    MDI_PI_IS_TRANSIENT(pip))
				retry = 1;
			/*
			 * Keep looking for a next available online path
			 */
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (next == NULL) {
				next = head;
			}
			MDI_PI_UNLOCK(pip);
			pip = next;
			if (start == pip && preferred) {
				preferred = 0;
			} else if (start == pip && !preferred) {
				cont = 0;
			}
		} while (cont);
		break;

	case LOAD_BALANCE_LBA:
		/*
		 * Make sure we are looking
		 * for an online path. Otherwise, if it is for a STANDBY
		 * path request, it will go through and fetch an ONLINE
		 * path which is not desirable.
		 *
		 * i_mdi_lba_lb() is handed ret_pip and is expected to have
		 * filled it in when it reports MDI_SUCCESS.  If it is not
		 * tried, or does not succeed, selection falls through to the
		 * round robin case.
		 */
		if ((ct->ct_lb_args != NULL) &&
			    (ct->ct_lb_args->region_size) && bp &&
				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
			if (i_mdi_lba_lb(ct, ret_pip, bp)
				    == MDI_SUCCESS) {
				MDI_CLIENT_UNLOCK(ct);
				return (MDI_SUCCESS);
			}
		}
		/* FALLTHROUGH */
	case LOAD_BALANCE_RR:
		/*
		 * Load balancing is Round Robin. Start looking for a online
		 * mdi_pathinfo node starting from last known selected path
		 * as the start point.  If override flags are specified,
		 * process accordingly.
		 * If the search is already in effect(start_pip not null),
		 * then lets just use the same path preference to continue the
		 * traversal.
		 */

		if (start_pip != NULL) {
			preferred = MDI_PI(start_pip)->pi_preferred;
		} else {
			preferred = 1;
		}

		/*
		 * The first candidate is the node after the reference point:
		 * ct_path_last for standard behavior, the caller's start_pip
		 * otherwise.  With no reference point, begin at the head.
		 */
		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
		if (start == NULL) {
			pip = head;
		} else {
			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
			if (pip == NULL) {
				if ( flags & MDI_SELECT_NO_PREFERRED) {
					/*
					 * Return since we hit the end of list
					 */
					MDI_CLIENT_UNLOCK(ct);
					return (MDI_NOPATH);
				}

				if (!sb) {
					if (preferred == 0) {
						/*
						 * Looks like we have completed
						 * the traversal as preferred
						 * value is 0. Time to bail out.
						 */
						*ret_pip = NULL;
						MDI_CLIENT_UNLOCK(ct);
						return (MDI_NOPATH);
					} else {
						/*
						 * Looks like we reached the
						 * end of the list. Lets enable
						 * traversal of non preferred
						 * paths.
						 */
						preferred = 0;
					}
				}
				pip = head;
			}
		}
		start = pip;
		do {
			MDI_PI_LOCK(pip);
			/*
			 * Decide whether this path qualifies.  For non
			 * standard behavior 'flags' is compared for exact
			 * equality against the supported combinations; any
			 * other combination selects nothing.
			 */
			if (sb) {
				cond = ((MDI_PI(pip)->pi_state ==
				    MDI_PATHINFO_STATE_ONLINE &&
					MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
			} else {
				if (flags == MDI_SELECT_ONLINE_PATH) {
					cond = ((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_ONLINE &&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else if (flags == MDI_SELECT_STANDBY_PATH) {
					cond = ((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_STANDBY &&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else if (flags == (MDI_SELECT_ONLINE_PATH |
				    MDI_SELECT_STANDBY_PATH)) {
					cond = (((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_ONLINE ||
					    (MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_STANDBY)) &&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else if (flags ==
					(MDI_SELECT_STANDBY_PATH |
					MDI_SELECT_ONLINE_PATH |
					MDI_SELECT_USER_DISABLE_PATH)) {
					cond = (((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_ONLINE ||
					    (MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_STANDBY) ||
						(MDI_PI(pip)->pi_state ==
					    (MDI_PATHINFO_STATE_ONLINE|
					    MDI_PATHINFO_STATE_USER_DISABLE)) ||
						(MDI_PI(pip)->pi_state ==
					    (MDI_PATHINFO_STATE_STANDBY |
					    MDI_PATHINFO_STATE_USER_DISABLE)))&&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else if (flags ==
				    (MDI_SELECT_STANDBY_PATH |
				    MDI_SELECT_ONLINE_PATH |
				    MDI_SELECT_NO_PREFERRED)) {
					/* pi_preferred is not consulted here */
					cond = (((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_ONLINE) ||
					    (MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_STANDBY))
					    ? 1 : 0);
				} else {
					cond = 0;
				}
			}
			/*
			 * No need to explicitly check if the path is disabled.
			 * Since we are checking for state == ONLINE and the
			 * same variable is used for DISABLE/ENABLE information.
			 */
			if (cond) {
				/*
				 * Return the path in hold state. Caller should
				 * release the lock by calling mdi_rele_path()
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				/*
				 * Only standard behavior advances the round
				 * robin cursor; flag driven requests leave
				 * ct_path_last untouched.
				 */
				if (sb)
					ct->ct_path_last = pip;
				*ret_pip = pip;
				MDI_CLIENT_UNLOCK(ct);
				return (MDI_SUCCESS);
			}
			/*
			 * Path is busy.
			 */
			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
			    MDI_PI_IS_TRANSIENT(pip))
				retry = 1;

			/*
			 * Keep looking for a next available online path
			 *
			 * NOTE: pip is locked whenever control reaches
			 * do_again, both when falling into it from above and
			 * when jumping back to it from the skip-over-start_pip
			 * test further down.
			 */
do_again:
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (next == NULL) {
				if ( flags & MDI_SELECT_NO_PREFERRED) {
					/*
					 * Bail out since we hit the end of list
					 */
					MDI_PI_UNLOCK(pip);
					break;
				}

				if (!sb) {
					if (preferred == 1) {
						/*
						 * Looks like we reached the
						 * end of the list. Lets enable
						 * traversal of non preferred
						 * paths.
						 */
						preferred = 0;
						next = head;
					} else {
						/*
						 * We have done both the passes
						 * Preferred as well as for
						 * Non-preferred. Bail out now.
						 */
						cont = 0;
					}
				} else {
					/*
					 * Standard behavior case.
					 */
					next = head;
				}
			}
			MDI_PI_UNLOCK(pip);
			if (cont == 0) {
				break;
			}
			pip = next;

			if (!sb) {
				/*
				 * We need to handle the selection of
				 * non-preferred path in the following
				 * case:
				 *
				 * +------+   +------+   +------+   +-----+
				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
				 * +------+   +------+   +------+   +-----+
				 *
				 * If we start the search with B, we need to
				 * skip beyond B to pick C which is non -
				 * preferred in the second pass. The following
				 * test, if true, will allow us to skip over
				 * the 'start'(B in the example) to select
				 * other non preferred elements.
				 */
				if ((start_pip != NULL) && (start_pip == pip) &&
				    (MDI_PI(start_pip)->pi_preferred
				    != preferred)) {
					/*
					 * try again after going past the start
					 * pip
					 */
					MDI_PI_LOCK(pip);
					goto do_again;
				}
			} else {
				/*
				 * Standard behavior case
				 */
				if (start == pip && preferred) {
					/* look for nonpreferred paths */
					preferred = 0;
				} else if (start == pip && !preferred) {
					/*
					 * Exit condition
					 */
					cont = 0;
				}
			}
		} while (cont);
		break;
	}

	/*
	 * Nothing was selected.  If a skipped path was seen in a transient
	 * state report MDI_BUSY (presumably so the caller can retry later);
	 * otherwise there is simply no usable path.
	 */
	MDI_CLIENT_UNLOCK(ct);
	if (retry == 1) {
		return (MDI_BUSY);
	} else {
		return (MDI_NOPATH);
	}
}
2494 
2495 /*
2496  * For a client, return the next available path to any phci
2497  *
2498  * Note:
2499  *		Caller should hold the branch's devinfo node to get a consistent
2500  *		snap shot of the mdi_pathinfo nodes.
2501  *
2502  *		Please note that even the list is stable the mdi_pathinfo
2503  *		node state and properties are volatile.  The caller should lock
2504  *		and unlock the nodes by calling mdi_pi_lock() and
2505  *		mdi_pi_unlock() functions to get a stable properties.
2506  *
2507  *		If there is a need to use the nodes beyond the hold of the
2508  *		devinfo node period (For ex. I/O), then mdi_pathinfo node
2509  *		need to be held against unexpected removal by calling
2510  *		mdi_hold_path() and should be released by calling
2511  *		mdi_rele_path() on completion.
2512  */
2513 mdi_pathinfo_t *
2514 mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
2515 {
2516 	mdi_client_t *ct;
2517 
2518 	if (!MDI_CLIENT(ct_dip))
2519 		return (NULL);
2520 
2521 	/*
2522 	 * Walk through client link
2523 	 */
2524 	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
2525 	ASSERT(ct != NULL);
2526 
2527 	if (pip == NULL)
2528 		return ((mdi_pathinfo_t *)ct->ct_path_head);
2529 
2530 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link);
2531 }
2532 
2533 /*
2534  * For a phci, return the next available path to any client
2535  * Note: ditto mdi_get_next_phci_path()
2536  */
2537 mdi_pathinfo_t *
2538 mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
2539 {
2540 	mdi_phci_t *ph;
2541 
2542 	if (!MDI_PHCI(ph_dip))
2543 		return (NULL);
2544 
2545 	/*
2546 	 * Walk through pHCI link
2547 	 */
2548 	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
2549 	ASSERT(ph != NULL);
2550 
2551 	if (pip == NULL)
2552 		return ((mdi_pathinfo_t *)ph->ph_path_head);
2553 
2554 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
2555 }
2556 
2557 /*
2558  * mdi_hold_path():
2559  *		Hold the mdi_pathinfo node against unwanted unexpected free.
2560  * Return Values:
2561  *		None
2562  */
2563 void
2564 mdi_hold_path(mdi_pathinfo_t *pip)
2565 {
2566 	if (pip) {
2567 		MDI_PI_LOCK(pip);
2568 		MDI_PI_HOLD(pip);
2569 		MDI_PI_UNLOCK(pip);
2570 	}
2571 }
2572 
2573 
2574 /*
2575  * mdi_rele_path():
2576  *		Release the mdi_pathinfo node which was selected
2577  *		through mdi_select_path() mechanism or manually held by
2578  *		calling mdi_hold_path().
2579  * Return Values:
2580  *		None
2581  */
2582 void
2583 mdi_rele_path(mdi_pathinfo_t *pip)
2584 {
2585 	if (pip) {
2586 		MDI_PI_LOCK(pip);
2587 		MDI_PI_RELE(pip);
2588 		if (MDI_PI(pip)->pi_ref_cnt == 0) {
2589 			cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
2590 		}
2591 		MDI_PI_UNLOCK(pip);
2592 	}
2593 }
2594 
2595 /*
2596  * mdi_pi_lock():
2597  * 		Lock the mdi_pathinfo node.
2598  * Note:
2599  *		The caller should release the lock by calling mdi_pi_unlock()
2600  */
2601 void
2602 mdi_pi_lock(mdi_pathinfo_t *pip)
2603 {
2604 	ASSERT(pip != NULL);
2605 	if (pip) {
2606 		MDI_PI_LOCK(pip);
2607 	}
2608 }
2609 
2610 
2611 /*
2612  * mdi_pi_unlock():
2613  * 		Unlock the mdi_pathinfo node.
2614  * Note:
2615  *		The mdi_pathinfo node should have been locked with mdi_pi_lock()
2616  */
2617 void
2618 mdi_pi_unlock(mdi_pathinfo_t *pip)
2619 {
2620 	ASSERT(pip != NULL);
2621 	if (pip) {
2622 		MDI_PI_UNLOCK(pip);
2623 	}
2624 }
2625 
2626 /*
2627  * mdi_pi_find():
2628  *		Search the list of mdi_pathinfo nodes attached to the
2629  *		pHCI/Client device node whose path address matches "paddr".
2630  *		Returns a pointer to the mdi_pathinfo node if a matching node is
2631  *		found.
2632  * Return Values:
2633  *		mdi_pathinfo node handle
2634  *		NULL
2635  * Notes:
2636  *		Caller need not hold any locks to call this function.
2637  */
2638 mdi_pathinfo_t *
2639 mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
2640 {
2641 	mdi_phci_t		*ph;
2642 	mdi_vhci_t		*vh;
2643 	mdi_client_t		*ct;
2644 	mdi_pathinfo_t		*pip = NULL;
2645 
2646 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2647 	    "caddr@%s paddr@%s", caddr ? caddr : "", paddr ? paddr : ""));
2648 	if ((pdip == NULL) || (paddr == NULL)) {
2649 		return (NULL);
2650 	}
2651 	ph = i_devi_get_phci(pdip);
2652 	if (ph == NULL) {
2653 		/*
2654 		 * Invalid pHCI device, Nothing more to do.
2655 		 */
2656 		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid phci"));
2657 		return (NULL);
2658 	}
2659 
2660 	vh = ph->ph_vhci;
2661 	if (vh == NULL) {
2662 		/*
2663 		 * Invalid vHCI device, Nothing more to do.
2664 		 */
2665 		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid vhci"));
2666 		return (NULL);
2667 	}
2668 
2669 	/*
2670 	 * Look for pathinfo node identified by paddr.
2671 	 */
2672 	if (caddr == NULL) {
2673 		/*
2674 		 * Find a mdi_pathinfo node under pHCI list for a matching
2675 		 * unit address.
2676 		 */
2677 		MDI_PHCI_LOCK(ph);
2678 		if (MDI_PHCI_IS_OFFLINE(ph)) {
2679 			MDI_DEBUG(2, (MDI_WARN, pdip,
2680 			    "offline phci %p", (void *)ph));
2681 			MDI_PHCI_UNLOCK(ph);
2682 			return (NULL);
2683 		}
2684 		pip = (mdi_pathinfo_t *)ph->ph_path_head;
2685 
2686 		while (pip != NULL) {
2687 			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2688 				break;
2689 			}
2690 			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
2691 		}
2692 		MDI_PHCI_UNLOCK(ph);
2693 		MDI_DEBUG(2, (MDI_NOTE, pdip,
2694 		    "found %s %p", mdi_pi_spathname(pip), (void *)pip));
2695 		return (pip);
2696 	}
2697 
2698 	/*
2699 	 * XXX - Is the rest of the code in this function really necessary?
2700 	 * The consumers of mdi_pi_find() can search for the desired pathinfo
2701 	 * node by calling mdi_pi_find(pdip, NULL, paddr). Irrespective of
2702 	 * whether the search is based on the pathinfo nodes attached to
2703 	 * the pHCI or the client node, the result will be the same.
2704 	 */
2705 
2706 	/*
2707 	 * Find the client device corresponding to 'caddr'
2708 	 */
2709 	MDI_VHCI_CLIENT_LOCK(vh);
2710 
2711 	/*
2712 	 * XXX - Passing NULL to the following function works as long as the
2713 	 * the client addresses (caddr) are unique per vhci basis.
2714 	 */
2715 	ct = i_mdi_client_find(vh, NULL, caddr);
2716 	if (ct == NULL) {
2717 		/*
2718 		 * Client not found, Obviously mdi_pathinfo node has not been
2719 		 * created yet.
2720 		 */
2721 		MDI_VHCI_CLIENT_UNLOCK(vh);
2722 		MDI_DEBUG(2, (MDI_NOTE, pdip,
2723 		    "client not found for caddr @%s", caddr ? caddr : ""));
2724 		return (NULL);
2725 	}
2726 
2727 	/*
2728 	 * Hold the client lock and look for a mdi_pathinfo node with matching
2729 	 * pHCI and paddr
2730 	 */
2731 	MDI_CLIENT_LOCK(ct);
2732 
2733 	/*
2734 	 * Release the global mutex as it is no more needed. Note: We always
2735 	 * respect the locking order while acquiring.
2736 	 */
2737 	MDI_VHCI_CLIENT_UNLOCK(vh);
2738 
2739 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2740 	while (pip != NULL) {
2741 		/*
2742 		 * Compare the unit address
2743 		 */
2744 		if ((MDI_PI(pip)->pi_phci == ph) &&
2745 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2746 			break;
2747 		}
2748 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2749 	}
2750 	MDI_CLIENT_UNLOCK(ct);
2751 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2752 	    "found: %s %p", mdi_pi_spathname(pip), (void *)pip));
2753 	return (pip);
2754 }
2755 
2756 /*
2757  * mdi_pi_alloc():
2758  *		Allocate and initialize a new instance of a mdi_pathinfo node.
2759  *		The mdi_pathinfo node returned by this function identifies a
2760  *		unique device path is capable of having properties attached
2761  *		and passed to mdi_pi_online() to fully attach and online the
2762  *		path and client device node.
2763  *		The mdi_pathinfo node returned by this function must be
2764  *		destroyed using mdi_pi_free() if the path is no longer
2765  *		operational or if the caller fails to attach a client device
2766  *		node when calling mdi_pi_online(). The framework will not free
2767  *		the resources allocated.
2768  *		This function can be called from both interrupt and kernel
2769  *		contexts.  DDI_NOSLEEP flag should be used while calling
2770  *		from interrupt contexts.
2771  * Return Values:
2772  *		MDI_SUCCESS
2773  *		MDI_FAILURE
2774  *		MDI_NOMEM
2775  */
2776 /*ARGSUSED*/
2777 int
2778 mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2779     char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
2780 {
2781 	mdi_vhci_t	*vh;
2782 	mdi_phci_t	*ph;
2783 	mdi_client_t	*ct;
2784 	mdi_pathinfo_t	*pip = NULL;
2785 	dev_info_t	*cdip;
2786 	int		rv = MDI_NOMEM;
2787 	int		path_allocated = 0;
2788 
2789 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2790 	    "cname %s: caddr@%s paddr@%s",
2791 	    cname ? cname : "", caddr ? caddr : "", paddr ? paddr : ""));
2792 
2793 	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
2794 	    ret_pip == NULL) {
2795 		/* Nothing more to do */
2796 		return (MDI_FAILURE);
2797 	}
2798 
2799 	*ret_pip = NULL;
2800 
2801 	/* No allocations on detaching pHCI */
2802 	if (DEVI_IS_DETACHING(pdip)) {
2803 		/* Invalid pHCI device, return failure */
2804 		MDI_DEBUG(1, (MDI_WARN, pdip,
2805 		    "!detaching pHCI=%p", (void *)pdip));
2806 		return (MDI_FAILURE);
2807 	}
2808 
2809 	ph = i_devi_get_phci(pdip);
2810 	ASSERT(ph != NULL);
2811 	if (ph == NULL) {
2812 		/* Invalid pHCI device, return failure */
2813 		MDI_DEBUG(1, (MDI_WARN, pdip,
2814 		    "!invalid pHCI=%p", (void *)pdip));
2815 		return (MDI_FAILURE);
2816 	}
2817 
2818 	MDI_PHCI_LOCK(ph);
2819 	vh = ph->ph_vhci;
2820 	if (vh == NULL) {
2821 		/* Invalid vHCI device, return failure */
2822 		MDI_DEBUG(1, (MDI_WARN, pdip,
2823 		    "!invalid vHCI=%p", (void *)pdip));
2824 		MDI_PHCI_UNLOCK(ph);
2825 		return (MDI_FAILURE);
2826 	}
2827 
2828 	if (MDI_PHCI_IS_READY(ph) == 0) {
2829 		/*
2830 		 * Do not allow new node creation when pHCI is in
2831 		 * offline/suspended states
2832 		 */
2833 		MDI_DEBUG(1, (MDI_WARN, pdip,
2834 		    "pHCI=%p is not ready", (void *)ph));
2835 		MDI_PHCI_UNLOCK(ph);
2836 		return (MDI_BUSY);
2837 	}
2838 	MDI_PHCI_UNSTABLE(ph);
2839 	MDI_PHCI_UNLOCK(ph);
2840 
2841 	/* look for a matching client, create one if not found */
2842 	MDI_VHCI_CLIENT_LOCK(vh);
2843 	ct = i_mdi_client_find(vh, cname, caddr);
2844 	if (ct == NULL) {
2845 		ct = i_mdi_client_alloc(vh, cname, caddr);
2846 		ASSERT(ct != NULL);
2847 	}
2848 
2849 	if (ct->ct_dip == NULL) {
2850 		/*
2851 		 * Allocate a devinfo node
2852 		 */
2853 		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
2854 		    compatible, ncompatible);
2855 		if (ct->ct_dip == NULL) {
2856 			(void) i_mdi_client_free(vh, ct);
2857 			goto fail;
2858 		}
2859 	}
2860 	cdip = ct->ct_dip;
2861 
2862 	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
2863 	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;
2864 
2865 	MDI_CLIENT_LOCK(ct);
2866 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2867 	while (pip != NULL) {
2868 		/*
2869 		 * Compare the unit address
2870 		 */
2871 		if ((MDI_PI(pip)->pi_phci == ph) &&
2872 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2873 			break;
2874 		}
2875 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2876 	}
2877 	MDI_CLIENT_UNLOCK(ct);
2878 
2879 	if (pip == NULL) {
2880 		/*
2881 		 * This is a new path for this client device.  Allocate and
2882 		 * initialize a new pathinfo node
2883 		 */
2884 		pip = i_mdi_pi_alloc(ph, paddr, ct);
2885 		ASSERT(pip != NULL);
2886 		path_allocated = 1;
2887 	}
2888 	rv = MDI_SUCCESS;
2889 
2890 fail:
2891 	/*
2892 	 * Release the global mutex.
2893 	 */
2894 	MDI_VHCI_CLIENT_UNLOCK(vh);
2895 
2896 	/*
2897 	 * Mark the pHCI as stable
2898 	 */
2899 	MDI_PHCI_LOCK(ph);
2900 	MDI_PHCI_STABLE(ph);
2901 	MDI_PHCI_UNLOCK(ph);
2902 	*ret_pip = pip;
2903 
2904 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2905 	    "alloc %s %p", mdi_pi_spathname(pip), (void *)pip));
2906 
2907 	if (path_allocated)
2908 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
2909 
2910 	return (rv);
2911 }
2912 
2913 /*ARGSUSED*/
2914 int
2915 mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2916     int flags, mdi_pathinfo_t **ret_pip)
2917 {
2918 	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
2919 	    flags, ret_pip));
2920 }
2921 
2922 /*
2923  * i_mdi_pi_alloc():
2924  *		Allocate a mdi_pathinfo node and add to the pHCI path list
2925  * Return Values:
2926  *		mdi_pathinfo
2927  */
2928 /*ARGSUSED*/
2929 static mdi_pathinfo_t *
2930 i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
2931 {
2932 	mdi_pathinfo_t	*pip;
2933 	int		ct_circular;
2934 	int		ph_circular;
2935 	static char	path[MAXPATHLEN];	/* mdi_pathmap_mutex protects */
2936 	char		*path_persistent;
2937 	int		path_instance;
2938 	mod_hash_val_t	hv;
2939 
2940 	ASSERT(MDI_VHCI_CLIENT_LOCKED(ph->ph_vhci));
2941 
2942 	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
2943 	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
2944 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
2945 	    MDI_PATHINFO_STATE_TRANSIENT;
2946 
2947 	if (MDI_PHCI_IS_USER_DISABLED(ph))
2948 		MDI_PI_SET_USER_DISABLE(pip);
2949 
2950 	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
2951 		MDI_PI_SET_DRV_DISABLE_TRANS(pip);
2952 
2953 	if (MDI_PHCI_IS_DRV_DISABLED(ph))
2954 		MDI_PI_SET_DRV_DISABLE(pip);
2955 
2956 	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
2957 	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
2958 	MDI_PI(pip)->pi_client = ct;
2959 	MDI_PI(pip)->pi_phci = ph;
2960 	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
2961 	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);
2962 
2963         /*
2964 	 * We form the "path" to the pathinfo node, and see if we have
2965 	 * already allocated a 'path_instance' for that "path".  If so,
2966 	 * we use the already allocated 'path_instance'.  If not, we
2967 	 * allocate a new 'path_instance' and associate it with a copy of
2968 	 * the "path" string (which is never freed). The association
2969 	 * between a 'path_instance' this "path" string persists until
2970 	 * reboot.
2971 	 */
2972         mutex_enter(&mdi_pathmap_mutex);
2973 	(void) ddi_pathname(ph->ph_dip, path);
2974 	(void) sprintf(path + strlen(path), "/%s@%s",
2975 	    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2976         if (mod_hash_find(mdi_pathmap_bypath, (mod_hash_key_t)path, &hv) == 0) {
2977                 path_instance = (uint_t)(intptr_t)hv;
2978         } else {
2979 		/* allocate a new 'path_instance' and persistent "path" */
2980 		path_instance = mdi_pathmap_instance++;
2981 		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2982                 (void) mod_hash_insert(mdi_pathmap_bypath,
2983                     (mod_hash_key_t)path_persistent,
2984                     (mod_hash_val_t)(intptr_t)path_instance);
2985 		(void) mod_hash_insert(mdi_pathmap_byinstance,
2986 		    (mod_hash_key_t)(intptr_t)path_instance,
2987 		    (mod_hash_val_t)path_persistent);
2988 
2989 		/* create shortpath name */
2990 		(void) snprintf(path, sizeof(path), "%s%d/%s@%s",
2991 		    ddi_driver_name(ph->ph_dip), ddi_get_instance(ph->ph_dip),
2992 		    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2993 		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2994 		(void) mod_hash_insert(mdi_pathmap_sbyinstance,
2995 		    (mod_hash_key_t)(intptr_t)path_instance,
2996 		    (mod_hash_val_t)path_persistent);
2997         }
2998         mutex_exit(&mdi_pathmap_mutex);
2999 	MDI_PI(pip)->pi_path_instance = path_instance;
3000 
3001 	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
3002 	ASSERT(MDI_PI(pip)->pi_prop != NULL);
3003 	MDI_PI(pip)->pi_pprivate = NULL;
3004 	MDI_PI(pip)->pi_cprivate = NULL;
3005 	MDI_PI(pip)->pi_vprivate = NULL;
3006 	MDI_PI(pip)->pi_client_link = NULL;
3007 	MDI_PI(pip)->pi_phci_link = NULL;
3008 	MDI_PI(pip)->pi_ref_cnt = 0;
3009 	MDI_PI(pip)->pi_kstats = NULL;
3010 	MDI_PI(pip)->pi_preferred = 1;
3011 	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);
3012 
3013 	/*
3014 	 * Lock both dev_info nodes against changes in parallel.
3015 	 *
3016 	 * The ndi_devi_enter(Client), is atypical since the client is a leaf.
3017 	 * This atypical operation is done to synchronize pathinfo nodes
3018 	 * during devinfo snapshot (see di_register_pip) by 'pretending' that
3019 	 * the pathinfo nodes are children of the Client.
3020 	 */
3021 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3022 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3023 
3024 	i_mdi_phci_add_path(ph, pip);
3025 	i_mdi_client_add_path(ct, pip);
3026 
3027 	ndi_devi_exit(ph->ph_dip, ph_circular);
3028 	ndi_devi_exit(ct->ct_dip, ct_circular);
3029 
3030 	return (pip);
3031 }
3032 
3033 /*
3034  * mdi_pi_pathname_by_instance():
3035  *	Lookup of "path" by 'path_instance'. Return "path".
3036  *	NOTE: returned "path" remains valid forever (until reboot).
3037  */
3038 char *
3039 mdi_pi_pathname_by_instance(int path_instance)
3040 {
3041 	char		*path;
3042 	mod_hash_val_t	hv;
3043 
3044 	/* mdi_pathmap lookup of "path" by 'path_instance' */
3045 	mutex_enter(&mdi_pathmap_mutex);
3046 	if (mod_hash_find(mdi_pathmap_byinstance,
3047 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
3048 		path = (char *)hv;
3049 	else
3050 		path = NULL;
3051 	mutex_exit(&mdi_pathmap_mutex);
3052 	return (path);
3053 }
3054 
3055 /*
3056  * mdi_pi_spathname_by_instance():
3057  *	Lookup of "shortpath" by 'path_instance'. Return "shortpath".
3058  *	NOTE: returned "shortpath" remains valid forever (until reboot).
3059  */
3060 char *
3061 mdi_pi_spathname_by_instance(int path_instance)
3062 {
3063 	char		*path;
3064 	mod_hash_val_t	hv;
3065 
3066 	/* mdi_pathmap lookup of "path" by 'path_instance' */
3067 	mutex_enter(&mdi_pathmap_mutex);
3068 	if (mod_hash_find(mdi_pathmap_sbyinstance,
3069 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
3070 		path = (char *)hv;
3071 	else
3072 		path = NULL;
3073 	mutex_exit(&mdi_pathmap_mutex);
3074 	return (path);
3075 }
3076 
3077 
3078 /*
3079  * i_mdi_phci_add_path():
3080  * 		Add a mdi_pathinfo node to pHCI list.
3081  * Notes:
3082  *		Caller should per-pHCI mutex
3083  */
3084 static void
3085 i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3086 {
3087 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3088 
3089 	MDI_PHCI_LOCK(ph);
3090 	if (ph->ph_path_head == NULL) {
3091 		ph->ph_path_head = pip;
3092 	} else {
3093 		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
3094 	}
3095 	ph->ph_path_tail = pip;
3096 	ph->ph_path_count++;
3097 	MDI_PHCI_UNLOCK(ph);
3098 }
3099 
3100 /*
3101  * i_mdi_client_add_path():
3102  *		Add mdi_pathinfo node to client list
3103  */
3104 static void
3105 i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3106 {
3107 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3108 
3109 	MDI_CLIENT_LOCK(ct);
3110 	if (ct->ct_path_head == NULL) {
3111 		ct->ct_path_head = pip;
3112 	} else {
3113 		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
3114 	}
3115 	ct->ct_path_tail = pip;
3116 	ct->ct_path_count++;
3117 	MDI_CLIENT_UNLOCK(ct);
3118 }
3119 
3120 /*
3121  * mdi_pi_free():
3122  *		Free the mdi_pathinfo node and also client device node if this
3123  *		is the last path to the device
3124  * Return Values:
3125  *		MDI_SUCCESS
3126  *		MDI_FAILURE
3127  *		MDI_BUSY
3128  */
3129 /*ARGSUSED*/
3130 int
3131 mdi_pi_free(mdi_pathinfo_t *pip, int flags)
3132 {
3133 	int		rv;
3134 	mdi_vhci_t	*vh;
3135 	mdi_phci_t	*ph;
3136 	mdi_client_t	*ct;
3137 	int		(*f)();
3138 	int		client_held = 0;
3139 
3140 	MDI_PI_LOCK(pip);
3141 	ph = MDI_PI(pip)->pi_phci;
3142 	ASSERT(ph != NULL);
3143 	if (ph == NULL) {
3144 		/*
3145 		 * Invalid pHCI device, return failure
3146 		 */
3147 		MDI_DEBUG(1, (MDI_WARN, NULL,
3148 		    "!invalid pHCI: pip %s %p",
3149 		    mdi_pi_spathname(pip), (void *)pip));
3150 		MDI_PI_UNLOCK(pip);
3151 		return (MDI_FAILURE);
3152 	}
3153 
3154 	vh = ph->ph_vhci;
3155 	ASSERT(vh != NULL);
3156 	if (vh == NULL) {
3157 		/* Invalid pHCI device, return failure */
3158 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3159 		    "!invalid vHCI: pip %s %p",
3160 		    mdi_pi_spathname(pip), (void *)pip));
3161 		MDI_PI_UNLOCK(pip);
3162 		return (MDI_FAILURE);
3163 	}
3164 
3165 	ct = MDI_PI(pip)->pi_client;
3166 	ASSERT(ct != NULL);
3167 	if (ct == NULL) {
3168 		/*
3169 		 * Invalid Client device, return failure
3170 		 */
3171 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3172 		    "!invalid client: pip %s %p",
3173 		    mdi_pi_spathname(pip), (void *)pip));
3174 		MDI_PI_UNLOCK(pip);
3175 		return (MDI_FAILURE);
3176 	}
3177 
3178 	/*
3179 	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
3180 	 * if the node state is either offline or init and the reference count
3181 	 * is zero.
3182 	 */
3183 	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
3184 	    MDI_PI_IS_INITING(pip))) {
3185 		/*
3186 		 * Node is busy
3187 		 */
3188 		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3189 		    "!busy: pip %s %p", mdi_pi_spathname(pip), (void *)pip));
3190 		MDI_PI_UNLOCK(pip);
3191 		return (MDI_BUSY);
3192 	}
3193 
3194 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3195 		/*
3196 		 * Give a chance for pending I/Os to complete.
3197 		 */
3198 		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3199 		    "!%d cmds still pending on path: %s %p",
3200 		    MDI_PI(pip)->pi_ref_cnt,
3201 		    mdi_pi_spathname(pip), (void *)pip));
3202 		if (cv_reltimedwait(&MDI_PI(pip)->pi_ref_cv,
3203 		    &MDI_PI(pip)->pi_mutex, drv_usectohz(60 * 1000000),
3204 		    TR_CLOCK_TICK) == -1) {
3205 			/*
3206 			 * The timeout time reached without ref_cnt being zero
3207 			 * being signaled.
3208 			 */
3209 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3210 			    "!Timeout reached on path %s %p without the cond",
3211 			    mdi_pi_spathname(pip), (void *)pip));
3212 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3213 			    "!%d cmds still pending on path %s %p",
3214 			    MDI_PI(pip)->pi_ref_cnt,
3215 			    mdi_pi_spathname(pip), (void *)pip));
3216 			MDI_PI_UNLOCK(pip);
3217 			return (MDI_BUSY);
3218 		}
3219 	}
3220 	if (MDI_PI(pip)->pi_pm_held) {
3221 		client_held = 1;
3222 	}
3223 	MDI_PI_UNLOCK(pip);
3224 
3225 	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));
3226 
3227 	MDI_CLIENT_LOCK(ct);
3228 
3229 	/* Prevent further failovers till MDI_VHCI_CLIENT_LOCK is held */
3230 	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);
3231 
3232 	/*
3233 	 * Wait till failover is complete before removing this node.
3234 	 */
3235 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3236 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3237 
3238 	MDI_CLIENT_UNLOCK(ct);
3239 	MDI_VHCI_CLIENT_LOCK(vh);
3240 	MDI_CLIENT_LOCK(ct);
3241 	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);
3242 
3243 	if (!MDI_PI_IS_INITING(pip)) {
3244 		f = vh->vh_ops->vo_pi_uninit;
3245 		if (f != NULL) {
3246 			rv = (*f)(vh->vh_dip, pip, 0);
3247 		}
3248 	} else
3249 		rv = MDI_SUCCESS;
3250 
3251 	/*
3252 	 * If vo_pi_uninit() completed successfully.
3253 	 */
3254 	if (rv == MDI_SUCCESS) {
3255 		if (client_held) {
3256 			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3257 			    "i_mdi_pm_rele_client\n"));
3258 			i_mdi_pm_rele_client(ct, 1);
3259 		}
3260 		i_mdi_pi_free(ph, pip, ct);
3261 		if (ct->ct_path_count == 0) {
3262 			/*
3263 			 * Client lost its last path.
3264 			 * Clean up the client device
3265 			 */
3266 			MDI_CLIENT_UNLOCK(ct);
3267 			(void) i_mdi_client_free(ct->ct_vhci, ct);
3268 			MDI_VHCI_CLIENT_UNLOCK(vh);
3269 			return (rv);
3270 		}
3271 	}
3272 	MDI_CLIENT_UNLOCK(ct);
3273 	MDI_VHCI_CLIENT_UNLOCK(vh);
3274 
3275 	if (rv == MDI_FAILURE)
3276 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
3277 
3278 	return (rv);
3279 }
3280 
3281 /*
3282  * i_mdi_pi_free():
3283  *		Free the mdi_pathinfo node
3284  */
3285 static void
3286 i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
3287 {
3288 	int	ct_circular;
3289 	int	ph_circular;
3290 
3291 	ASSERT(MDI_CLIENT_LOCKED(ct));
3292 
3293 	/*
3294 	 * remove any per-path kstats
3295 	 */
3296 	i_mdi_pi_kstat_destroy(pip);
3297 
3298 	/* See comments in i_mdi_pi_alloc() */
3299 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3300 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3301 
3302 	i_mdi_client_remove_path(ct, pip);
3303 	i_mdi_phci_remove_path(ph, pip);
3304 
3305 	ndi_devi_exit(ph->ph_dip, ph_circular);
3306 	ndi_devi_exit(ct->ct_dip, ct_circular);
3307 
3308 	mutex_destroy(&MDI_PI(pip)->pi_mutex);
3309 	cv_destroy(&MDI_PI(pip)->pi_state_cv);
3310 	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
3311 	if (MDI_PI(pip)->pi_addr) {
3312 		kmem_free(MDI_PI(pip)->pi_addr,
3313 		    strlen(MDI_PI(pip)->pi_addr) + 1);
3314 		MDI_PI(pip)->pi_addr = NULL;
3315 	}
3316 
3317 	if (MDI_PI(pip)->pi_prop) {
3318 		(void) nvlist_free(MDI_PI(pip)->pi_prop);
3319 		MDI_PI(pip)->pi_prop = NULL;
3320 	}
3321 	kmem_free(pip, sizeof (struct mdi_pathinfo));
3322 }
3323 
3324 
3325 /*
3326  * i_mdi_phci_remove_path():
3327  * 		Remove a mdi_pathinfo node from pHCI list.
3328  * Notes:
3329  *		Caller should hold per-pHCI mutex
3330  */
3331 static void
3332 i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3333 {
3334 	mdi_pathinfo_t	*prev = NULL;
3335 	mdi_pathinfo_t	*path = NULL;
3336 
3337 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3338 
3339 	MDI_PHCI_LOCK(ph);
3340 	path = ph->ph_path_head;
3341 	while (path != NULL) {
3342 		if (path == pip) {
3343 			break;
3344 		}
3345 		prev = path;
3346 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3347 	}
3348 
3349 	if (path) {
3350 		ph->ph_path_count--;
3351 		if (prev) {
3352 			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
3353 		} else {
3354 			ph->ph_path_head =
3355 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3356 		}
3357 		if (ph->ph_path_tail == path) {
3358 			ph->ph_path_tail = prev;
3359 		}
3360 	}
3361 
3362 	/*
3363 	 * Clear the pHCI link
3364 	 */
3365 	MDI_PI(pip)->pi_phci_link = NULL;
3366 	MDI_PI(pip)->pi_phci = NULL;
3367 	MDI_PHCI_UNLOCK(ph);
3368 }
3369 
3370 /*
3371  * i_mdi_client_remove_path():
3372  * 		Remove a mdi_pathinfo node from client path list.
3373  */
3374 static void
3375 i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3376 {
3377 	mdi_pathinfo_t	*prev = NULL;
3378 	mdi_pathinfo_t	*path;
3379 
3380 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3381 
3382 	ASSERT(MDI_CLIENT_LOCKED(ct));
3383 	path = ct->ct_path_head;
3384 	while (path != NULL) {
3385 		if (path == pip) {
3386 			break;
3387 		}
3388 		prev = path;
3389 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3390 	}
3391 
3392 	if (path) {
3393 		ct->ct_path_count--;
3394 		if (prev) {
3395 			MDI_PI(prev)->pi_client_link =
3396 			    MDI_PI(path)->pi_client_link;
3397 		} else {
3398 			ct->ct_path_head =
3399 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3400 		}
3401 		if (ct->ct_path_tail == path) {
3402 			ct->ct_path_tail = prev;
3403 		}
3404 		if (ct->ct_path_last == path) {
3405 			ct->ct_path_last = ct->ct_path_head;
3406 		}
3407 	}
3408 	MDI_PI(pip)->pi_client_link = NULL;
3409 	MDI_PI(pip)->pi_client = NULL;
3410 }
3411 
/*
 * i_mdi_pi_state_change():
 *		Common worker that moves a mdi_pathinfo node to the requested
 *		state (ONLINE, STANDBY, FAULT or OFFLINE).  It backs
 *		mdi_pi_online(), mdi_pi_standby(), mdi_pi_fault() and
 *		mdi_pi_offline().
 *
 *		Outline:
 *		  - a node that is still INITING first gets the vHCI's
 *		    vo_pi_init() callback;
 *		  - the request is refused with MDI_BUSY unless the pHCI is
 *		    ready, and the pHCI is marked unstable for the duration;
 *		  - any in-progress transition or client failover is waited
 *		    for;
 *		  - the node is marked with the transient state matching the
 *		    request and the vHCI's vo_pi_state_change() callback (if
 *		    registered) is called with the pathinfo and client locks
 *		    dropped;
 *		  - on success the transient condition is cleared, otherwise
 *		    the old state is restored;
 *		  - once the client has no other unstable paths its state is
 *		    re-evaluated and the client devinfo node may be onlined
 *		    or offlined accordingly.
 *
 *		'flag' is passed through to the vHCI callback; NDI_USER_REQ
 *		in it marks a user initiated request.
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		MDI_BUSY
 *		(or any other non-success value returned by the vHCI's
 *		vo_pi_state_change())
 */
/*ARGSUSED*/
static int
i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
{
	int		rv = MDI_SUCCESS;
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	int		(*f)();
	dev_info_t	*cdip;

	MDI_PI_LOCK(pip);

	ph = MDI_PI(pip)->pi_phci;
	ASSERT(ph);
	if (ph == NULL) {
		/*
		 * Invalid pHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, NULL,
		    "!invalid phci: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;
	ASSERT(vh);
	if (vh == NULL) {
		/*
		 * Invalid vHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
		    "!invalid vhci: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	ct = MDI_PI(pip)->pi_client;
	ASSERT(ct != NULL);
	if (ct == NULL) {
		/*
		 * Invalid client device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
		    "!invalid client: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	/*
	 * If this path has not been initialized yet, Callback vHCI driver's
	 * pathinfo node initialize entry point
	 */

	if (MDI_PI_IS_INITING(pip)) {
		/* The callback is made without the pathinfo lock held */
		MDI_PI_UNLOCK(pip);
		f = vh->vh_ops->vo_pi_init;
		if (f != NULL) {
			rv = (*f)(vh->vh_dip, pip, 0);
			if (rv != MDI_SUCCESS) {
				MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
				    "!vo_pi_init failed: vHCI %p, pip %s %p",
				    (void *)vh, mdi_pi_spathname(pip),
				    (void *)pip));
				return (MDI_FAILURE);
			}
		}
		MDI_PI_LOCK(pip);
		MDI_PI_CLEAR_TRANSIENT(pip);
	}

	/*
	 * Do not allow state transition when pHCI is in offline/suspended
	 * states
	 */
	i_mdi_phci_lock(ph, pip);
	if (MDI_PHCI_IS_READY(ph) == 0) {
		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
		    "!pHCI not ready, pHCI=%p", (void *)ph));
		MDI_PI_UNLOCK(pip);
		i_mdi_phci_unlock(ph);
		return (MDI_BUSY);
	}
	/* Undone by MDI_PHCI_STABLE() at state_change_exit */
	MDI_PHCI_UNSTABLE(ph);
	i_mdi_phci_unlock(ph);

	/*
	 * Check if mdi_pathinfo state is in transient state.
	 * If yes, offlining is in progress and wait till transient state is
	 * cleared.
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		while (MDI_PI_IS_TRANSIENT(pip)) {
			cv_wait(&MDI_PI(pip)->pi_state_cv,
			    &MDI_PI(pip)->pi_mutex);
		}
	}

	/*
	 * Grab the client lock in reverse order sequence and release the
	 * mdi_pathinfo mutex.
	 */
	i_mdi_client_lock(ct, pip);
	MDI_PI_UNLOCK(pip);

	/*
	 * Wait till failover state is cleared
	 */
	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);

	/*
	 * Mark the mdi_pathinfo node state as transient
	 */
	MDI_PI_LOCK(pip);
	switch (state) {
	case MDI_PATHINFO_STATE_ONLINE:
		MDI_PI_SET_ONLINING(pip);
		break;

	case MDI_PATHINFO_STATE_STANDBY:
		MDI_PI_SET_STANDBYING(pip);
		break;

	case MDI_PATHINFO_STATE_FAULT:
		/*
		 * Mark the pathinfo state as FAULTED
		 */
		MDI_PI_SET_FAULTING(pip);
		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
		break;

	case MDI_PATHINFO_STATE_OFFLINE:
		/*
		 * ndi_devi_offline() cannot hold pip or ct locks.
		 */
		MDI_PI_UNLOCK(pip);

		/*
		 * If this is a user initiated path online->offline operation
		 * who's success would transition a client from DEGRADED to
		 * FAILED then only proceed if we can offline the client first.
		 *
		 * NOTE(review): MDI_PI_IS_ONLINE() is evaluated here after the
		 * pathinfo lock was dropped just above.
		 */
		cdip = ct->ct_dip;
		if ((flag & NDI_USER_REQ) &&
		    MDI_PI_IS_ONLINE(pip) &&
		    (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED)) {
			i_mdi_client_unlock(ct);
			rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN);
			if (rv != NDI_SUCCESS) {
				/*
				 * Convert to MDI error code
				 */
				switch (rv) {
				case NDI_BUSY:
					rv = MDI_BUSY;
					break;
				default:
					rv = MDI_FAILURE;
					break;
				}
				/*
				 * Neither the pathinfo nor the client lock is
				 * held at this point; only the pHCI needs to
				 * be marked stable again.
				 */
				goto state_change_exit;
			} else {
				i_mdi_client_lock(ct, NULL);
			}
		}
		/*
		 * Mark the mdi_pathinfo node state as transient
		 */
		MDI_PI_LOCK(pip);
		MDI_PI_SET_OFFLINING(pip);
		break;
	}
	MDI_PI_UNLOCK(pip);
	MDI_CLIENT_UNSTABLE(ct);
	i_mdi_client_unlock(ct);

	/*
	 * Notify the vHCI, if it registered for state changes.  Without a
	 * callback rv keeps its current (success) value.
	 */
	f = vh->vh_ops->vo_pi_state_change;
	if (f != NULL)
		rv = (*f)(vh->vh_dip, pip, state, 0, flag);

	MDI_CLIENT_LOCK(ct);
	MDI_PI_LOCK(pip);
	if (rv == MDI_NOT_SUPPORTED) {
		MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
	}
	if (rv != MDI_SUCCESS) {
		MDI_DEBUG(2, (MDI_WARN, ct->ct_dip,
		    "vo_pi_state_change failed: rv %x", rv));
	}
	/* Commit the transition on success, roll it back otherwise */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		if (rv == MDI_SUCCESS) {
			MDI_PI_CLEAR_TRANSIENT(pip);
		} else {
			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
		}
	}

	/*
	 * Wake anyone waiting for this mdi_pathinfo node
	 */
	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
	MDI_PI_UNLOCK(pip);

	/*
	 * Mark the client device as stable
	 */
	MDI_CLIENT_STABLE(ct);
	if (rv == MDI_SUCCESS) {
		if (ct->ct_unstable == 0) {
			cdip = ct->ct_dip;

			/*
			 * Changing the mdi_pathinfo node state will impact
			 * the client state.  Update the client and dev_info
			 * node state accordingly.  From here on rv carries an
			 * NDI status until it is converted back below.
			 */
			rv = NDI_SUCCESS;
			i_mdi_client_update_state(ct);
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				if (cdip && !i_ddi_devi_attached(cdip) &&
				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
				    (state == MDI_PATHINFO_STATE_STANDBY))) {

					/*
					 * Must do ndi_devi_online() through
					 * hotplug thread for deferred
					 * attach mechanism to work
					 */
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_online(cdip, 0);
					MDI_CLIENT_LOCK(ct);
					if ((rv != NDI_SUCCESS) &&
					    (MDI_CLIENT_STATE(ct) ==
					    MDI_CLIENT_STATE_DEGRADED)) {
						/*
						 * ndi_devi_online failed.
						 * Reset client flags to
						 * offline.
						 */
						MDI_DEBUG(1, (MDI_WARN, cdip,
						    "!ndi_devi_online failed "
						    "error %x", rv));
						MDI_CLIENT_SET_OFFLINE(ct);
					}
					if (rv != NDI_SUCCESS) {
						/* Reset the path state */
						MDI_PI_LOCK(pip);
						MDI_PI(pip)->pi_state =
						    MDI_PI_OLD_STATE(pip);
						MDI_PI_UNLOCK(pip);
					}
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				/*
				 * This is the last path case for
				 * non-user initiated events.
				 */
				if (((flag & NDI_USER_REQ) == 0) &&
				    cdip && (i_ddi_node_state(cdip) >=
				    DS_INITIALIZED)) {
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_offline(cdip,
					    NDI_DEVFS_CLEAN);
					MDI_CLIENT_LOCK(ct);

					if (rv != NDI_SUCCESS) {
						/*
						 * ndi_devi_offline failed.
						 * Reset client flags to
						 * online as the path could not
						 * be offlined.
						 */
						MDI_DEBUG(1, (MDI_WARN, cdip,
						    "!ndi_devi_offline failed: "
						    "error %x", rv));
						MDI_CLIENT_SET_ONLINE(ct);
					}
				}
				break;
			}
			/*
			 * Convert to MDI error code
			 */
			switch (rv) {
			case NDI_SUCCESS:
				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
				i_mdi_report_path_state(ct, pip);
				rv = MDI_SUCCESS;
				break;
			case NDI_BUSY:
				rv = MDI_BUSY;
				break;
			default:
				rv = MDI_FAILURE;
				break;
			}
		}
	}
	MDI_CLIENT_UNLOCK(ct);

state_change_exit:
	/*
	 * Mark the pHCI as stable again.
	 */
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_STABLE(ph);
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}
3737 
3738 /*
3739  * mdi_pi_online():
3740  *		Place the path_info node in the online state.  The path is
3741  *		now available to be selected by mdi_select_path() for
3742  *		transporting I/O requests to client devices.
3743  * Return Values:
3744  *		MDI_SUCCESS
3745  *		MDI_FAILURE
3746  */
3747 int
3748 mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3749 {
3750 	mdi_client_t	*ct = MDI_PI(pip)->pi_client;
3751 	int		client_held = 0;
3752 	int		rv;
3753 
3754 	ASSERT(ct != NULL);
3755 	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
3756 	if (rv != MDI_SUCCESS)
3757 		return (rv);
3758 
3759 	MDI_PI_LOCK(pip);
3760 	if (MDI_PI(pip)->pi_pm_held == 0) {
3761 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3762 		    "i_mdi_pm_hold_pip %p", (void *)pip));
3763 		i_mdi_pm_hold_pip(pip);
3764 		client_held = 1;
3765 	}
3766 	MDI_PI_UNLOCK(pip);
3767 
3768 	if (client_held) {
3769 		MDI_CLIENT_LOCK(ct);
3770 		if (ct->ct_power_cnt == 0) {
3771 			rv = i_mdi_power_all_phci(ct);
3772 		}
3773 
3774 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3775 		    "i_mdi_pm_hold_client %p", (void *)ct));
3776 		i_mdi_pm_hold_client(ct, 1);
3777 		MDI_CLIENT_UNLOCK(ct);
3778 	}
3779 
3780 	return (rv);
3781 }
3782 
3783 /*
3784  * mdi_pi_standby():
3785  *		Place the mdi_pathinfo node in standby state
3786  *
3787  * Return Values:
3788  *		MDI_SUCCESS
3789  *		MDI_FAILURE
3790  */
3791 int
3792 mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
3793 {
3794 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
3795 }
3796 
3797 /*
3798  * mdi_pi_fault():
3799  *		Place the mdi_pathinfo node in fault'ed state
3800  * Return Values:
3801  *		MDI_SUCCESS
3802  *		MDI_FAILURE
3803  */
3804 int
3805 mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
3806 {
3807 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
3808 }
3809 
3810 /*
3811  * mdi_pi_offline():
3812  *		Offline a mdi_pathinfo node.
3813  * Return Values:
3814  *		MDI_SUCCESS
3815  *		MDI_FAILURE
3816  */
3817 int
3818 mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3819 {
3820 	int	ret, client_held = 0;
3821 	mdi_client_t	*ct;
3822 
3823 	/*
3824 	 * Original code overloaded NDI_DEVI_REMOVE to this interface, and
3825 	 * used it to mean "user initiated operation" (i.e. devctl). Callers
3826 	 * should now just use NDI_USER_REQ.
3827 	 */
3828 	if (flags & NDI_DEVI_REMOVE) {
3829 		flags &= ~NDI_DEVI_REMOVE;
3830 		flags |= NDI_USER_REQ;
3831 	}
3832 
3833 	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
3834 
3835 	if (ret == MDI_SUCCESS) {
3836 		MDI_PI_LOCK(pip);
3837 		if (MDI_PI(pip)->pi_pm_held) {
3838 			client_held = 1;
3839 		}
3840 		MDI_PI_UNLOCK(pip);
3841 
3842 		if (client_held) {
3843 			ct = MDI_PI(pip)->pi_client;
3844 			MDI_CLIENT_LOCK(ct);
3845 			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3846 			    "i_mdi_pm_rele_client\n"));
3847 			i_mdi_pm_rele_client(ct, 1);
3848 			MDI_CLIENT_UNLOCK(ct);
3849 		}
3850 	}
3851 
3852 	return (ret);
3853 }
3854 
3855 /*
3856  * i_mdi_pi_offline():
3857  *		Offline a mdi_pathinfo node and call the vHCI driver's callback
3858  */
3859 static int
3860 i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3861 {
3862 	dev_info_t	*vdip = NULL;
3863 	mdi_vhci_t	*vh = NULL;
3864 	mdi_client_t	*ct = NULL;
3865 	int		(*f)();
3866 	int		rv;
3867 
3868 	MDI_PI_LOCK(pip);
3869 	ct = MDI_PI(pip)->pi_client;
3870 	ASSERT(ct != NULL);
3871 
3872 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3873 		/*
3874 		 * Give a chance for pending I/Os to complete.
3875 		 */
3876 		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3877 		    "!%d cmds still pending on path %s %p",
3878 		    MDI_PI(pip)->pi_ref_cnt, mdi_pi_spathname(pip),
3879 		    (void *)pip));
3880 		if (cv_reltimedwait(&MDI_PI(pip)->pi_ref_cv,
3881 		    &MDI_PI(pip)->pi_mutex, drv_usectohz(60 * 1000000),
3882 		    TR_CLOCK_TICK) == -1) {
3883 			/*
3884 			 * The timeout time reached without ref_cnt being zero
3885 			 * being signaled.
3886 			 */
3887 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3888 			    "!Timeout reached on path %s %p without the cond",
3889 			    mdi_pi_spathname(pip), (void *)pip));
3890 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3891 			    "!%d cmds still pending on path %s %p",
3892 			    MDI_PI(pip)->pi_ref_cnt,
3893 			    mdi_pi_spathname(pip), (void *)pip));
3894 		}
3895 	}
3896 	vh = ct->ct_vhci;
3897 	vdip = vh->vh_dip;
3898 
3899 	/*
3900 	 * Notify vHCI that has registered this event
3901 	 */
3902 	ASSERT(vh->vh_ops);
3903 	f = vh->vh_ops->vo_pi_state_change;
3904 
3905 	if (f != NULL) {
3906 		MDI_PI_UNLOCK(pip);
3907 		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
3908 		    flags)) != MDI_SUCCESS) {
3909 			MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3910 			    "!vo_path_offline failed: vdip %s%d %p: path %s %p",
3911 			    ddi_driver_name(vdip), ddi_get_instance(vdip),
3912 			    (void *)vdip, mdi_pi_spathname(pip), (void *)pip));
3913 		}
3914 		MDI_PI_LOCK(pip);
3915 	}
3916 
3917 	/*
3918 	 * Set the mdi_pathinfo node state and clear the transient condition
3919 	 */
3920 	MDI_PI_SET_OFFLINE(pip);
3921 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3922 	MDI_PI_UNLOCK(pip);
3923 
3924 	MDI_CLIENT_LOCK(ct);
3925 	if (rv == MDI_SUCCESS) {
3926 		if (ct->ct_unstable == 0) {
3927 			dev_info_t	*cdip = ct->ct_dip;
3928 
3929 			/*
3930 			 * Onlining the mdi_pathinfo node will impact the
3931 			 * client state Update the client and dev_info node
3932 			 * state accordingly
3933 			 */
3934 			i_mdi_client_update_state(ct);
3935 			rv = NDI_SUCCESS;
3936 			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
3937 				if (cdip &&
3938 				    (i_ddi_node_state(cdip) >=
3939 				    DS_INITIALIZED)) {
3940 					MDI_CLIENT_UNLOCK(ct);
3941 					rv = ndi_devi_offline(cdip,
3942 					    NDI_DEVFS_CLEAN);
3943 					MDI_CLIENT_LOCK(ct);
3944 					if (rv != NDI_SUCCESS) {
3945 						/*
3946 						 * ndi_devi_offline failed.
3947 						 * Reset client flags to
3948 						 * online.
3949 						 */
3950 						MDI_DEBUG(4, (MDI_WARN, cdip,
3951 						    "ndi_devi_offline failed: "
3952 						    "error %x", rv));
3953 						MDI_CLIENT_SET_ONLINE(ct);
3954 					}
3955 				}
3956 			}
3957 			/*
3958 			 * Convert to MDI error code
3959 			 */
3960 			switch (rv) {
3961 			case NDI_SUCCESS:
3962 				rv = MDI_SUCCESS;
3963 				break;
3964 			case NDI_BUSY:
3965 				rv = MDI_BUSY;
3966 				break;
3967 			default:
3968 				rv = MDI_FAILURE;
3969 				break;
3970 			}
3971 		}
3972 		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3973 		i_mdi_report_path_state(ct, pip);
3974 	}
3975 
3976 	MDI_CLIENT_UNLOCK(ct);
3977 
3978 	/*
3979 	 * Change in the mdi_pathinfo node state will impact the client state
3980 	 */
3981 	MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
3982 	    "ct = %p pip = %p", (void *)ct, (void *)pip));
3983 	return (rv);
3984 }
3985 
3986 /*
3987  * mdi_pi_get_node_name():
3988  *              Get the name associated with a mdi_pathinfo node.
3989  *              Since pathinfo nodes are not directly named, we
3990  *              return the node_name of the client.
3991  *
3992  * Return Values:
3993  *              char *
3994  */
3995 char *
3996 mdi_pi_get_node_name(mdi_pathinfo_t *pip)
3997 {
3998 	mdi_client_t    *ct;
3999 
4000 	if (pip == NULL)
4001 		return (NULL);
4002 	ct = MDI_PI(pip)->pi_client;
4003 	if ((ct == NULL) || (ct->ct_dip == NULL))
4004 		return (NULL);
4005 	return (ddi_node_name(ct->ct_dip));
4006 }
4007 
4008 /*
4009  * mdi_pi_get_addr():
4010  *		Get the unit address associated with a mdi_pathinfo node
4011  *
4012  * Return Values:
4013  *		char *
4014  */
4015 char *
4016 mdi_pi_get_addr(mdi_pathinfo_t *pip)
4017 {
4018 	if (pip == NULL)
4019 		return (NULL);
4020 
4021 	return (MDI_PI(pip)->pi_addr);
4022 }
4023 
4024 /*
4025  * mdi_pi_get_path_instance():
4026  *		Get the 'path_instance' of a mdi_pathinfo node
4027  *
4028  * Return Values:
4029  *		path_instance
4030  */
4031 int
4032 mdi_pi_get_path_instance(mdi_pathinfo_t *pip)
4033 {
4034 	if (pip == NULL)
4035 		return (0);
4036 
4037 	return (MDI_PI(pip)->pi_path_instance);
4038 }
4039 
4040 /*
4041  * mdi_pi_pathname():
4042  *		Return pointer to path to pathinfo node.
4043  */
4044 char *
4045 mdi_pi_pathname(mdi_pathinfo_t *pip)
4046 {
4047 	if (pip == NULL)
4048 		return (NULL);
4049 	return (mdi_pi_pathname_by_instance(mdi_pi_get_path_instance(pip)));
4050 }
4051 
4052 /*
4053  * mdi_pi_spathname():
4054  *		Return pointer to shortpath to pathinfo node. Used for debug
4055  *		messages, so return "" instead of NULL when unknown.
4056  */
4057 char *
4058 mdi_pi_spathname(mdi_pathinfo_t *pip)
4059 {
4060 	char	*spath = "";
4061 
4062 	if (pip) {
4063 		spath = mdi_pi_spathname_by_instance(
4064 		    mdi_pi_get_path_instance(pip));
4065 		if (spath == NULL)
4066 			spath = "";
4067 	}
4068 	return (spath);
4069 }
4070 
4071 char *
4072 mdi_pi_pathname_obp(mdi_pathinfo_t *pip, char *path)
4073 {
4074 	char *obp_path = NULL;
4075 	if ((pip == NULL) || (path == NULL))
4076 		return (NULL);
4077 
4078 	if (mdi_prop_lookup_string(pip, "obp-path", &obp_path) == MDI_SUCCESS) {
4079 		(void) strcpy(path, obp_path);
4080 		(void) mdi_prop_free(obp_path);
4081 	} else {
4082 		path = NULL;
4083 	}
4084 	return (path);
4085 }
4086 
4087 int
4088 mdi_pi_pathname_obp_set(mdi_pathinfo_t *pip, char *component)
4089 {
4090 	dev_info_t *pdip;
4091 	char *obp_path = NULL;
4092 	int rc = MDI_FAILURE;
4093 
4094 	if (pip == NULL)
4095 		return (MDI_FAILURE);
4096 
4097 	pdip = mdi_pi_get_phci(pip);
4098 	if (pdip == NULL)
4099 		return (MDI_FAILURE);
4100 
4101 	obp_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
4102 
4103 	if (ddi_pathname_obp(pdip, obp_path) == NULL) {
4104 		(void) ddi_pathname(pdip, obp_path);
4105 	}
4106 
4107 	if (component) {
4108 		(void) strncat(obp_path, "/", MAXPATHLEN);
4109 		(void) strncat(obp_path, component, MAXPATHLEN);
4110 	}
4111 	rc = mdi_prop_update_string(pip, "obp-path", obp_path);
4112 
4113 	if (obp_path)
4114 		kmem_free(obp_path, MAXPATHLEN);
4115 	return (rc);
4116 }
4117 
4118 /*
4119  * mdi_pi_get_client():
4120  *		Get the client devinfo associated with a mdi_pathinfo node
4121  *
4122  * Return Values:
4123  *		Handle to client device dev_info node
4124  */
4125 dev_info_t *
4126 mdi_pi_get_client(mdi_pathinfo_t *pip)
4127 {
4128 	dev_info_t	*dip = NULL;
4129 	if (pip) {
4130 		dip = MDI_PI(pip)->pi_client->ct_dip;
4131 	}
4132 	return (dip);
4133 }
4134 
4135 /*
4136  * mdi_pi_get_phci():
4137  *		Get the pHCI devinfo associated with the mdi_pathinfo node
4138  * Return Values:
4139  *		Handle to dev_info node
4140  */
4141 dev_info_t *
4142 mdi_pi_get_phci(mdi_pathinfo_t *pip)
4143 {
4144 	dev_info_t	*dip = NULL;
4145 	mdi_phci_t	*ph;
4146 
4147 	if (pip) {
4148 		ph = MDI_PI(pip)->pi_phci;
4149 		if (ph)
4150 			dip = ph->ph_dip;
4151 	}
4152 	return (dip);
4153 }
4154 
4155 /*
4156  * mdi_pi_get_client_private():
4157  *		Get the client private information associated with the
4158  *		mdi_pathinfo node
4159  */
4160 void *
4161 mdi_pi_get_client_private(mdi_pathinfo_t *pip)
4162 {
4163 	void *cprivate = NULL;
4164 	if (pip) {
4165 		cprivate = MDI_PI(pip)->pi_cprivate;
4166 	}
4167 	return (cprivate);
4168 }
4169 
4170 /*
4171  * mdi_pi_set_client_private():
4172  *		Set the client private information in the mdi_pathinfo node
4173  */
4174 void
4175 mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
4176 {
4177 	if (pip) {
4178 		MDI_PI(pip)->pi_cprivate = priv;
4179 	}
4180 }
4181 
4182 /*
4183  * mdi_pi_get_phci_private():
4184  *		Get the pHCI private information associated with the
4185  *		mdi_pathinfo node
4186  */
4187 caddr_t
4188 mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
4189 {
4190 	caddr_t	pprivate = NULL;
4191 
4192 	if (pip) {
4193 		pprivate = MDI_PI(pip)->pi_pprivate;
4194 	}
4195 	return (pprivate);
4196 }
4197 
4198 /*
4199  * mdi_pi_set_phci_private():
4200  *		Set the pHCI private information in the mdi_pathinfo node
4201  */
4202 void
4203 mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
4204 {
4205 	if (pip) {
4206 		MDI_PI(pip)->pi_pprivate = priv;
4207 	}
4208 }
4209 
4210 /*
4211  * mdi_pi_get_state():
4212  *		Get the mdi_pathinfo node state. Transient states are internal
4213  *		and not provided to the users
4214  */
4215 mdi_pathinfo_state_t
4216 mdi_pi_get_state(mdi_pathinfo_t *pip)
4217 {
4218 	mdi_pathinfo_state_t    state = MDI_PATHINFO_STATE_INIT;
4219 
4220 	if (pip) {
4221 		if (MDI_PI_IS_TRANSIENT(pip)) {
4222 			/*
4223 			 * mdi_pathinfo is in state transition.  Return the
4224 			 * last good state.
4225 			 */
4226 			state = MDI_PI_OLD_STATE(pip);
4227 		} else {
4228 			state = MDI_PI_STATE(pip);
4229 		}
4230 	}
4231 	return (state);
4232 }
4233 
4234 /*
4235  * mdi_pi_get_flags():
4236  *		Get the mdi_pathinfo node flags.
4237  */
4238 uint_t
4239 mdi_pi_get_flags(mdi_pathinfo_t *pip)
4240 {
4241 	return (pip ? MDI_PI(pip)->pi_flags : 0);
4242 }
4243 
4244 /*
4245  * Note that the following function needs to be the new interface for
4246  * mdi_pi_get_state when mpxio gets integrated to ON.
4247  */
4248 int
4249 mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
4250 		uint32_t *ext_state)
4251 {
4252 	*state = MDI_PATHINFO_STATE_INIT;
4253 
4254 	if (pip) {
4255 		if (MDI_PI_IS_TRANSIENT(pip)) {
4256 			/*
4257 			 * mdi_pathinfo is in state transition.  Return the
4258 			 * last good state.
4259 			 */
4260 			*state = MDI_PI_OLD_STATE(pip);
4261 			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
4262 		} else {
4263 			*state = MDI_PI_STATE(pip);
4264 			*ext_state = MDI_PI_EXT_STATE(pip);
4265 		}
4266 	}
4267 	return (MDI_SUCCESS);
4268 }
4269 
4270 /*
4271  * mdi_pi_get_preferred:
4272  *	Get the preferred path flag
4273  */
4274 int
4275 mdi_pi_get_preferred(mdi_pathinfo_t *pip)
4276 {
4277 	if (pip) {
4278 		return (MDI_PI(pip)->pi_preferred);
4279 	}
4280 	return (0);
4281 }
4282 
4283 /*
4284  * mdi_pi_set_preferred:
4285  *	Set the preferred path flag
4286  */
4287 void
4288 mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
4289 {
4290 	if (pip) {
4291 		MDI_PI(pip)->pi_preferred = preferred;
4292 	}
4293 }
4294 
4295 /*
4296  * mdi_pi_set_state():
4297  *		Set the mdi_pathinfo node state
4298  */
4299 void
4300 mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
4301 {
4302 	uint32_t	ext_state;
4303 
4304 	if (pip) {
4305 		ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
4306 		MDI_PI(pip)->pi_state = state;
4307 		MDI_PI(pip)->pi_state |= ext_state;
4308 
4309 		/* Path has changed state, invalidate DINFOCACHE snap shot. */
4310 		i_ddi_di_cache_invalidate();
4311 	}
4312 }
4313 
4314 /*
4315  * Property functions:
4316  */
4317 int
4318 i_map_nvlist_error_to_mdi(int val)
4319 {
4320 	int rv;
4321 
4322 	switch (val) {
4323 	case 0:
4324 		rv = DDI_PROP_SUCCESS;
4325 		break;
4326 	case EINVAL:
4327 	case ENOTSUP:
4328 		rv = DDI_PROP_INVAL_ARG;
4329 		break;
4330 	case ENOMEM:
4331 		rv = DDI_PROP_NO_MEMORY;
4332 		break;
4333 	default:
4334 		rv = DDI_PROP_NOT_FOUND;
4335 		break;
4336 	}
4337 	return (rv);
4338 }
4339 
4340 /*
4341  * mdi_pi_get_next_prop():
4342  * 		Property walk function.  The caller should hold mdi_pi_lock()
4343  *		and release by calling mdi_pi_unlock() at the end of walk to
4344  *		get a consistent value.
4345  */
4346 nvpair_t *
4347 mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
4348 {
4349 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4350 		return (NULL);
4351 	}
4352 	ASSERT(MDI_PI_LOCKED(pip));
4353 	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
4354 }
4355 
4356 /*
4357  * mdi_prop_remove():
4358  * 		Remove the named property from the named list.
4359  */
4360 int
4361 mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
4362 {
4363 	if (pip == NULL) {
4364 		return (DDI_PROP_NOT_FOUND);
4365 	}
4366 	ASSERT(!MDI_PI_LOCKED(pip));
4367 	MDI_PI_LOCK(pip);
4368 	if (MDI_PI(pip)->pi_prop == NULL) {
4369 		MDI_PI_UNLOCK(pip);
4370 		return (DDI_PROP_NOT_FOUND);
4371 	}
4372 	if (name) {
4373 		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
4374 	} else {
4375 		char		nvp_name[MAXNAMELEN];
4376 		nvpair_t	*nvp;
4377 		nvp = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
4378 		while (nvp) {
4379 			nvpair_t	*next;
4380 			next = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, nvp);
4381 			(void) snprintf(nvp_name, sizeof(nvp_name), "%s",
4382 			    nvpair_name(nvp));
4383 			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
4384 			    nvp_name);
4385 			nvp = next;
4386 		}
4387 	}
4388 	MDI_PI_UNLOCK(pip);
4389 	return (DDI_PROP_SUCCESS);
4390 }
4391 
4392 /*
4393  * mdi_prop_size():
4394  * 		Get buffer size needed to pack the property data.
4395  * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
4396  *		buffer size.
4397  */
4398 int
4399 mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
4400 {
4401 	int	rv;
4402 	size_t	bufsize;
4403 
4404 	*buflenp = 0;
4405 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4406 		return (DDI_PROP_NOT_FOUND);
4407 	}
4408 	ASSERT(MDI_PI_LOCKED(pip));
4409 	rv = nvlist_size(MDI_PI(pip)->pi_prop,
4410 	    &bufsize, NV_ENCODE_NATIVE);
4411 	*buflenp = bufsize;
4412 	return (i_map_nvlist_error_to_mdi(rv));
4413 }
4414 
4415 /*
4416  * mdi_prop_pack():
4417  * 		pack the property list.  The caller should hold the
4418  *		mdi_pathinfo_t node to get a consistent data
4419  */
4420 int
4421 mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
4422 {
4423 	int	rv;
4424 	size_t	bufsize;
4425 
4426 	if ((pip == NULL) || MDI_PI(pip)->pi_prop == NULL) {
4427 		return (DDI_PROP_NOT_FOUND);
4428 	}
4429 
4430 	ASSERT(MDI_PI_LOCKED(pip));
4431 
4432 	bufsize = buflen;
4433 	rv = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, (size_t *)&bufsize,
4434 	    NV_ENCODE_NATIVE, KM_SLEEP);
4435 
4436 	return (i_map_nvlist_error_to_mdi(rv));
4437 }
4438 
4439 /*
4440  * mdi_prop_update_byte():
4441  *		Create/Update a byte property
4442  */
4443 int
4444 mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
4445 {
4446 	int rv;
4447 
4448 	if (pip == NULL) {
4449 		return (DDI_PROP_INVAL_ARG);
4450 	}
4451 	ASSERT(!MDI_PI_LOCKED(pip));
4452 	MDI_PI_LOCK(pip);
4453 	if (MDI_PI(pip)->pi_prop == NULL) {
4454 		MDI_PI_UNLOCK(pip);
4455 		return (DDI_PROP_NOT_FOUND);
4456 	}
4457 	rv = nvlist_add_byte(MDI_PI(pip)->pi_prop, name, data);
4458 	MDI_PI_UNLOCK(pip);
4459 	return (i_map_nvlist_error_to_mdi(rv));
4460 }
4461 
4462 /*
4463  * mdi_prop_update_byte_array():
4464  *		Create/Update a byte array property
4465  */
4466 int
4467 mdi_prop_update_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t *data,
4468     uint_t nelements)
4469 {
4470 	int rv;
4471 
4472 	if (pip == NULL) {
4473 		return (DDI_PROP_INVAL_ARG);
4474 	}
4475 	ASSERT(!MDI_PI_LOCKED(pip));
4476 	MDI_PI_LOCK(pip);
4477 	if (MDI_PI(pip)->pi_prop == NULL) {
4478 		MDI_PI_UNLOCK(pip);
4479 		return (DDI_PROP_NOT_FOUND);
4480 	}
4481 	rv = nvlist_add_byte_array(MDI_PI(pip)->pi_prop, name, data, nelements);
4482 	MDI_PI_UNLOCK(pip);
4483 	return (i_map_nvlist_error_to_mdi(rv));
4484 }
4485 
4486 /*
4487  * mdi_prop_update_int():
4488  *		Create/Update a 32 bit integer property
4489  */
4490 int
4491 mdi_prop_update_int(mdi_pathinfo_t *pip, char *name, int data)
4492 {
4493 	int rv;
4494 
4495 	if (pip == NULL) {
4496 		return (DDI_PROP_INVAL_ARG);
4497 	}
4498 	ASSERT(!MDI_PI_LOCKED(pip));
4499 	MDI_PI_LOCK(pip);
4500 	if (MDI_PI(pip)->pi_prop == NULL) {
4501 		MDI_PI_UNLOCK(pip);
4502 		return (DDI_PROP_NOT_FOUND);
4503 	}
4504 	rv = nvlist_add_int32(MDI_PI(pip)->pi_prop, name, (int32_t)data);
4505 	MDI_PI_UNLOCK(pip);
4506 	return (i_map_nvlist_error_to_mdi(rv));
4507 }
4508 
4509 /*
4510  * mdi_prop_update_int64():
4511  *		Create/Update a 64 bit integer property
4512  */
4513 int
4514 mdi_prop_update_int64(mdi_pathinfo_t *pip, char *name, int64_t data)
4515 {
4516 	int rv;
4517 
4518 	if (pip == NULL) {
4519 		return (DDI_PROP_INVAL_ARG);
4520 	}
4521 	ASSERT(!MDI_PI_LOCKED(pip));
4522 	MDI_PI_LOCK(pip);
4523 	if (MDI_PI(pip)->pi_prop == NULL) {
4524 		MDI_PI_UNLOCK(pip);
4525 		return (DDI_PROP_NOT_FOUND);
4526 	}
4527 	rv = nvlist_add_int64(MDI_PI(pip)->pi_prop, name, data);
4528 	MDI_PI_UNLOCK(pip);
4529 	return (i_map_nvlist_error_to_mdi(rv));
4530 }
4531 
4532 /*
4533  * mdi_prop_update_int_array():
4534  *		Create/Update a int array property
4535  */
4536 int
4537 mdi_prop_update_int_array(mdi_pathinfo_t *pip, char *name, int *data,
4538 	    uint_t nelements)
4539 {
4540 	int rv;
4541 
4542 	if (pip == NULL) {
4543 		return (DDI_PROP_INVAL_ARG);
4544 	}
4545 	ASSERT(!MDI_PI_LOCKED(pip));
4546 	MDI_PI_LOCK(pip);
4547 	if (MDI_PI(pip)->pi_prop == NULL) {
4548 		MDI_PI_UNLOCK(pip);
4549 		return (DDI_PROP_NOT_FOUND);
4550 	}
4551 	rv = nvlist_add_int32_array(MDI_PI(pip)->pi_prop, name, (int32_t *)data,
4552 	    nelements);
4553 	MDI_PI_UNLOCK(pip);
4554 	return (i_map_nvlist_error_to_mdi(rv));
4555 }
4556 
4557 /*
4558  * mdi_prop_update_string():
4559  *		Create/Update a string property
4560  */
4561 int
4562 mdi_prop_update_string(mdi_pathinfo_t *pip, char *name, char *data)
4563 {
4564 	int rv;
4565 
4566 	if (pip == NULL) {
4567 		return (DDI_PROP_INVAL_ARG);
4568 	}
4569 	ASSERT(!MDI_PI_LOCKED(pip));
4570 	MDI_PI_LOCK(pip);
4571 	if (MDI_PI(pip)->pi_prop == NULL) {
4572 		MDI_PI_UNLOCK(pip);
4573 		return (DDI_PROP_NOT_FOUND);
4574 	}
4575 	rv = nvlist_add_string(MDI_PI(pip)->pi_prop, name, data);
4576 	MDI_PI_UNLOCK(pip);
4577 	return (i_map_nvlist_error_to_mdi(rv));
4578 }
4579 
4580 /*
4581  * mdi_prop_update_string_array():
4582  *		Create/Update a string array property
4583  */
4584 int
4585 mdi_prop_update_string_array(mdi_pathinfo_t *pip, char *name, char **data,
4586     uint_t nelements)
4587 {
4588 	int rv;
4589 
4590 	if (pip == NULL) {
4591 		return (DDI_PROP_INVAL_ARG);
4592 	}
4593 	ASSERT(!MDI_PI_LOCKED(pip));
4594 	MDI_PI_LOCK(pip);
4595 	if (MDI_PI(pip)->pi_prop == NULL) {
4596 		MDI_PI_UNLOCK(pip);
4597 		return (DDI_PROP_NOT_FOUND);
4598 	}
4599 	rv = nvlist_add_string_array(MDI_PI(pip)->pi_prop, name, data,
4600 	    nelements);
4601 	MDI_PI_UNLOCK(pip);
4602 	return (i_map_nvlist_error_to_mdi(rv));
4603 }
4604 
4605 /*
4606  * mdi_prop_lookup_byte():
4607  * 		Look for byte property identified by name.  The data returned
4608  *		is the actual property and valid as long as mdi_pathinfo_t node
4609  *		is alive.
4610  */
4611 int
4612 mdi_prop_lookup_byte(mdi_pathinfo_t *pip, char *name, uchar_t *data)
4613 {
4614 	int rv;
4615 
4616 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4617 		return (DDI_PROP_NOT_FOUND);
4618 	}
4619 	rv = nvlist_lookup_byte(MDI_PI(pip)->pi_prop, name, data);
4620 	return (i_map_nvlist_error_to_mdi(rv));
4621 }
4622 
4623 
4624 /*
4625  * mdi_prop_lookup_byte_array():
4626  * 		Look for byte array property identified by name.  The data
4627  *		returned is the actual property and valid as long as
4628  *		mdi_pathinfo_t node is alive.
4629  */
4630 int
4631 mdi_prop_lookup_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t **data,
4632     uint_t *nelements)
4633 {
4634 	int rv;
4635 
4636 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4637 		return (DDI_PROP_NOT_FOUND);
4638 	}
4639 	rv = nvlist_lookup_byte_array(MDI_PI(pip)->pi_prop, name, data,
4640 	    nelements);
4641 	return (i_map_nvlist_error_to_mdi(rv));
4642 }
4643 
4644 /*
4645  * mdi_prop_lookup_int():
4646  * 		Look for int property identified by name.  The data returned
4647  *		is the actual property and valid as long as mdi_pathinfo_t
4648  *		node is alive.
4649  */
4650 int
4651 mdi_prop_lookup_int(mdi_pathinfo_t *pip, char *name, int *data)
4652 {
4653 	int rv;
4654 
4655 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4656 		return (DDI_PROP_NOT_FOUND);
4657 	}
4658 	rv = nvlist_lookup_int32(MDI_PI(pip)->pi_prop, name, (int32_t *)data);
4659 	return (i_map_nvlist_error_to_mdi(rv));
4660 }
4661 
4662 /*
4663  * mdi_prop_lookup_int64():
4664  * 		Look for int64 property identified by name.  The data returned
4665  *		is the actual property and valid as long as mdi_pathinfo_t node
4666  *		is alive.
4667  */
4668 int
4669 mdi_prop_lookup_int64(mdi_pathinfo_t *pip, char *name, int64_t *data)
4670 {
4671 	int rv;
4672 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4673 		return (DDI_PROP_NOT_FOUND);
4674 	}
4675 	rv = nvlist_lookup_int64(MDI_PI(pip)->pi_prop, name, data);
4676 	return (i_map_nvlist_error_to_mdi(rv));
4677 }
4678 
4679 /*
4680  * mdi_prop_lookup_int_array():
4681  * 		Look for int array property identified by name.  The data
4682  *		returned is the actual property and valid as long as
4683  *		mdi_pathinfo_t node is alive.
4684  */
4685 int
4686 mdi_prop_lookup_int_array(mdi_pathinfo_t *pip, char *name, int **data,
4687     uint_t *nelements)
4688 {
4689 	int rv;
4690 
4691 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4692 		return (DDI_PROP_NOT_FOUND);
4693 	}
4694 	rv = nvlist_lookup_int32_array(MDI_PI(pip)->pi_prop, name,
4695 	    (int32_t **)data, nelements);
4696 	return (i_map_nvlist_error_to_mdi(rv));
4697 }
4698 
4699 /*
4700  * mdi_prop_lookup_string():
4701  * 		Look for string property identified by name.  The data
4702  *		returned is the actual property and valid as long as
4703  *		mdi_pathinfo_t node is alive.
4704  */
4705 int
4706 mdi_prop_lookup_string(mdi_pathinfo_t *pip, char *name, char **data)
4707 {
4708 	int rv;
4709 
4710 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4711 		return (DDI_PROP_NOT_FOUND);
4712 	}
4713 	rv = nvlist_lookup_string(MDI_PI(pip)->pi_prop, name, data);
4714 	return (i_map_nvlist_error_to_mdi(rv));
4715 }
4716 
4717 /*
4718  * mdi_prop_lookup_string_array():
4719  * 		Look for string array property identified by name.  The data
4720  *		returned is the actual property and valid as long as
4721  *		mdi_pathinfo_t node is alive.
4722  */
4723 int
4724 mdi_prop_lookup_string_array(mdi_pathinfo_t *pip, char *name, char ***data,
4725     uint_t *nelements)
4726 {
4727 	int rv;
4728 
4729 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4730 		return (DDI_PROP_NOT_FOUND);
4731 	}
4732 	rv = nvlist_lookup_string_array(MDI_PI(pip)->pi_prop, name, data,
4733 	    nelements);
4734 	return (i_map_nvlist_error_to_mdi(rv));
4735 }
4736 
4737 /*
4738  * mdi_prop_free():
4739  * 		Symmetrical function to ddi_prop_free(). nvlist_lookup_xx()
4740  *		functions return the pointer to actual property data and not a
4741  *		copy of it.  So the data returned is valid as long as
4742  *		mdi_pathinfo_t node is valid.
4743  */
4744 /*ARGSUSED*/
4745 int
4746 mdi_prop_free(void *data)
4747 {
4748 	return (DDI_PROP_SUCCESS);
4749 }
4750 
4751 /*ARGSUSED*/
4752 static void
4753 i_mdi_report_path_state(mdi_client_t *ct, mdi_pathinfo_t *pip)
4754 {
4755 	char		*ct_path;
4756 	char		*ct_status;
4757 	char		*status;
4758 	dev_info_t	*cdip = ct->ct_dip;
4759 	char		lb_buf[64];
4760 	int		report_lb_c = 0, report_lb_p = 0;
4761 
4762 	ASSERT(MDI_CLIENT_LOCKED(ct));
4763 	if ((cdip == NULL) || (ddi_get_instance(cdip) == -1) ||
4764 	    (MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) == 0)) {
4765 		return;
4766 	}
4767 	if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL) {
4768 		ct_status = "optimal";
4769 		report_lb_c = 1;
4770 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
4771 		ct_status = "degraded";
4772 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
4773 		ct_status = "failed";
4774 	} else {
4775 		ct_status = "unknown";
4776 	}
4777 
4778 	lb_buf[0] = 0;		/* not interested in load balancing config */
4779 
4780 	if (MDI_PI_FLAGS_IS_DEVICE_REMOVED(pip)) {
4781 		status = "removed";
4782 	} else if (MDI_PI_IS_OFFLINE(pip)) {
4783 		status = "offline";
4784 	} else if (MDI_PI_IS_ONLINE(pip)) {
4785 		status = "online";
4786 		report_lb_p = 1;
4787 	} else if (MDI_PI_IS_STANDBY(pip)) {
4788 		status = "standby";
4789 	} else if (MDI_PI_IS_FAULT(pip)) {
4790 		status = "faulted";
4791 	} else {
4792 		status = "unknown";
4793 	}
4794 
4795 	if (cdip) {
4796 		ct_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4797 
4798 		/*
4799 		 * NOTE: Keeping "multipath status: %s" and
4800 		 * "Load balancing: %s" format unchanged in case someone
4801 		 * scrubs /var/adm/messages looking for these messages.
4802 		 */
4803 		if (report_lb_c && report_lb_p) {
4804 			if (ct->ct_lb == LOAD_BALANCE_LBA) {
4805 				(void) snprintf(lb_buf, sizeof (lb_buf),
4806 				    "%s, region-size: %d", mdi_load_balance_lba,
4807 				    ct->ct_lb_args->region_size);
4808 			} else if (ct->ct_lb == LOAD_BALANCE_NONE) {
4809 				(void) snprintf(lb_buf, sizeof (lb_buf),
4810 				    "%s", mdi_load_balance_none);
4811 			} else {
4812 				(void) snprintf(lb_buf, sizeof (lb_buf), "%s",
4813 				    mdi_load_balance_rr);
4814 			}
4815 
4816 			cmn_err(mdi_debug_consoleonly ? CE_NOTE : CE_CONT,
4817 			    "?%s (%s%d) multipath status: %s: "
4818 			    "path %d %s is %s: Load balancing: %s\n",
4819 			    ddi_pathname(cdip, ct_path), ddi_driver_name(cdip),
4820 			    ddi_get_instance(cdip), ct_status,
4821 			    mdi_pi_get_path_instance(pip),
4822 			    mdi_pi_spathname(pip), status, lb_buf);
4823 		} else {
4824 			cmn_err(mdi_debug_consoleonly ? CE_NOTE : CE_CONT,
4825 			    "?%s (%s%d) multipath status: %s: "
4826 			    "path %d %s is %s\n",
4827 			    ddi_pathname(cdip, ct_path), ddi_driver_name(cdip),
4828 			    ddi_get_instance(cdip), ct_status,
4829 			    mdi_pi_get_path_instance(pip),
4830 			    mdi_pi_spathname(pip), status);
4831 		}
4832 
4833 		kmem_free(ct_path, MAXPATHLEN);
4834 		MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct);
4835 	}
4836 }
4837 
#ifdef	DEBUG
/*
 * i_mdi_log():
 *		Utility function for error message management.  The message
 *		is formatted into a local buffer and handed to cmn_err() with
 *		an "mdi: <driver><instance>: <func>: " lead-in.
 *
 *		NOTE: Implementation takes care of trailing \n for cmn_err,
 *		MDI_DEBUG should not terminate fmt strings with \n.
 *
 *		NOTE: A leading '!', '?' or '^' in the formatted message is
 *		stripped and re-applied in front of the "mdi:" lead-in.  If
 *		the level is >= 2 and there is no leading !?^ then a leading
 *		! is implied.  mdi_debug_logonly forces '!'; and
 *		mdi_debug_consoleonly overrides everything, emitting the
 *		message at CE_NOTE with '^'.  If you are using kmdb on the
 *		console, consider setting mdi_debug_consoleonly to 1 as an
 *		aid.
 */
/*PRINTFLIKE4*/
static void
i_mdi_log(int level, const char *func, dev_info_t *dip, const char *fmt, ...)
{
	char		name[MAXNAMELEN];
	char		buf[512];
	char		*bp = buf;
	char		prefix = 0;	/* '!', '?', '^', or 0 for none */
	va_list		ap;

	name[0] = 0;
	if (dip) {
		(void) snprintf(name, sizeof (name), "%s%d: ",
		    ddi_driver_name(dip), ddi_get_instance(dip));
	}

	va_start(ap, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
	va_end(ap);

	/* Peel an explicit destination marker off the formatted text. */
	if ((buf[0] == '!') || (buf[0] == '?') || (buf[0] == '^')) {
		prefix = buf[0];
		bp = &buf[1];
	} else if (level >= 2) {
		prefix = '!';		/* ! implied */
	}

	if (mdi_debug_logonly)
		prefix = '!';

	if (mdi_debug_consoleonly) {
		/* Console only wins over every other setting. */
		cmn_err(CE_NOTE, "^mdi: %s%s: %s", name, func, bp);
		return;
	}

	/* CE_NOTE messages are emitted as CE_CONT. */
	if (level == CE_NOTE)
		level = CE_CONT;

	/*
	 * The marker has to be the first character of the format string
	 * itself, hence one literal format per marker.
	 */
	switch (level) {
	case CE_CONT:
		switch (prefix) {
		case '?':
			cmn_err(level, "?mdi: %s%s: %s\n", name, func, bp);
			break;
		case '^':
			cmn_err(level, "^mdi: %s%s: %s\n", name, func, bp);
			break;
		case '!':
			cmn_err(level, "!mdi: %s%s: %s\n", name, func, bp);
			break;
		default:
			cmn_err(level, "mdi: %s%s: %s\n", name, func, bp);
			break;
		}
		break;

	case CE_WARN:
	case CE_PANIC:
		switch (prefix) {
		case '?':
			cmn_err(level, "?mdi: %s%s: %s", name, func, bp);
			break;
		case '^':
			cmn_err(level, "^mdi: %s%s: %s", name, func, bp);
			break;
		case '!':
			cmn_err(level, "!mdi: %s%s: %s", name, func, bp);
			break;
		default:
			cmn_err(level, "mdi: %s%s: %s", name, func, bp);
			break;
		}
		break;

	default:
		cmn_err(level, "mdi: %s%s", name, bp);
		break;
	}
}
#endif	/* DEBUG */
4941 
4942 void
4943 i_mdi_client_online(dev_info_t *ct_dip)
4944 {
4945 	mdi_client_t	*ct;
4946 
4947 	/*
4948 	 * Client online notification. Mark client state as online
4949 	 * restore our binding with dev_info node
4950 	 */
4951 	ct = i_devi_get_client(ct_dip);
4952 	ASSERT(ct != NULL);
4953 	MDI_CLIENT_LOCK(ct);
4954 	MDI_CLIENT_SET_ONLINE(ct);
4955 	/* catch for any memory leaks */
4956 	ASSERT((ct->ct_dip == NULL) || (ct->ct_dip == ct_dip));
4957 	ct->ct_dip = ct_dip;
4958 
4959 	if (ct->ct_power_cnt == 0)
4960 		(void) i_mdi_power_all_phci(ct);
4961 
4962 	MDI_DEBUG(4, (MDI_NOTE, ct_dip,
4963 	    "i_mdi_pm_hold_client %p", (void *)ct));
4964 	i_mdi_pm_hold_client(ct, 1);
4965 
4966 	MDI_CLIENT_UNLOCK(ct);
4967 }
4968 
4969 void
4970 i_mdi_phci_online(dev_info_t *ph_dip)
4971 {
4972 	mdi_phci_t	*ph;
4973 
4974 	/* pHCI online notification. Mark state accordingly */
4975 	ph = i_devi_get_phci(ph_dip);
4976 	ASSERT(ph != NULL);
4977 	MDI_PHCI_LOCK(ph);
4978 	MDI_PHCI_SET_ONLINE(ph);
4979 	MDI_PHCI_UNLOCK(ph);
4980 }
4981 
4982 /*
4983  * mdi_devi_online():
4984  * 		Online notification from NDI framework on pHCI/client
4985  *		device online.
4986  * Return Values:
4987  *		NDI_SUCCESS
4988  *		MDI_FAILURE
4989  */
4990 /*ARGSUSED*/
4991 int
4992 mdi_devi_online(dev_info_t *dip, uint_t flags)
4993 {
4994 	if (MDI_PHCI(dip)) {
4995 		i_mdi_phci_online(dip);
4996 	}
4997 
4998 	if (MDI_CLIENT(dip)) {
4999 		i_mdi_client_online(dip);
5000 	}
5001 	return (NDI_SUCCESS);
5002 }
5003 
5004 /*
5005  * mdi_devi_offline():
5006  * 		Offline notification from NDI framework on pHCI/Client device
5007  *		offline.
5008  *
5009  * Return Values:
5010  *		NDI_SUCCESS
5011  *		NDI_FAILURE
5012  */
5013 /*ARGSUSED*/
5014 int
5015 mdi_devi_offline(dev_info_t *dip, uint_t flags)
5016 {
5017 	int		rv = NDI_SUCCESS;
5018 
5019 	if (MDI_CLIENT(dip)) {
5020 		rv = i_mdi_client_offline(dip, flags);
5021 		if (rv != NDI_SUCCESS)
5022 			return (rv);
5023 	}
5024 
5025 	if (MDI_PHCI(dip)) {
5026 		rv = i_mdi_phci_offline(dip, flags);
5027 
5028 		if ((rv != NDI_SUCCESS) && MDI_CLIENT(dip)) {
5029 			/* set client back online */
5030 			i_mdi_client_online(dip);
5031 		}
5032 	}
5033 
5034 	return (rv);
5035 }
5036 
5037 /*ARGSUSED*/
5038 static int
5039 i_mdi_phci_offline(dev_info_t *dip, uint_t flags)
5040 {
5041 	int		rv = NDI_SUCCESS;
5042 	mdi_phci_t	*ph;
5043 	mdi_client_t	*ct;
5044 	mdi_pathinfo_t	*pip;
5045 	mdi_pathinfo_t	*next;
5046 	mdi_pathinfo_t	*failed_pip = NULL;
5047 	dev_info_t	*cdip;
5048 
5049 	/*
5050 	 * pHCI component offline notification
5051 	 * Make sure that this pHCI instance is free to be offlined.
5052 	 * If it is OK to proceed, Offline and remove all the child
5053 	 * mdi_pathinfo nodes.  This process automatically offlines
5054 	 * corresponding client devices, for which this pHCI provides
5055 	 * critical services.
5056 	 */
5057 	ph = i_devi_get_phci(dip);
5058 	MDI_DEBUG(2, (MDI_NOTE, dip,
5059 	    "called %p %p", (void *)dip, (void *)ph));
5060 	if (ph == NULL) {
5061 		return (rv);
5062 	}
5063 
5064 	MDI_PHCI_LOCK(ph);
5065 
5066 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5067 		MDI_DEBUG(1, (MDI_WARN, dip,
5068 		    "!pHCI already offlined: %p", (void *)dip));
5069 		MDI_PHCI_UNLOCK(ph);
5070 		return (NDI_SUCCESS);
5071 	}
5072 
5073 	/*
5074 	 * Check to see if the pHCI can be offlined
5075 	 */
5076 	if (ph->ph_unstable) {
5077 		MDI_DEBUG(1, (MDI_WARN, dip,
5078 		    "!One or more target devices are in transient state. "
5079 		    "This device can not be removed at this moment. "
5080 		    "Please try again later."));
5081 		MDI_PHCI_UNLOCK(ph);
5082 		return (NDI_BUSY);
5083 	}
5084 
5085 	pip = ph->ph_path_head;
5086 	while (pip != NULL) {
5087 		MDI_PI_LOCK(pip);
5088 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5089 
5090 		/*
5091 		 * The mdi_pathinfo state is OK. Check the client state.
5092 		 * If failover in progress fail the pHCI from offlining
5093 		 */
5094 		ct = MDI_PI(pip)->pi_client;
5095 		i_mdi_client_lock(ct, pip);
5096 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5097 		    (ct->ct_unstable)) {
5098 			/*
5099 			 * Failover is in progress, Fail the DR
5100 			 */
5101 			MDI_DEBUG(1, (MDI_WARN, dip,
5102 			    "!pHCI device is busy. "
5103 			    "This device can not be removed at this moment. "
5104 			    "Please try again later."));
5105 			MDI_PI_UNLOCK(pip);
5106 			i_mdi_client_unlock(ct);
5107 			MDI_PHCI_UNLOCK(ph);
5108 			return (NDI_BUSY);
5109 		}
5110 		MDI_PI_UNLOCK(pip);
5111 
5112 		/*
5113 		 * Check to see of we are removing the last path of this
5114 		 * client device...
5115 		 */
5116 		cdip = ct->ct_dip;
5117 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5118 		    (i_mdi_client_compute_state(ct, ph) ==
5119 		    MDI_CLIENT_STATE_FAILED)) {
5120 			i_mdi_client_unlock(ct);
5121 			MDI_PHCI_UNLOCK(ph);
5122 			if (ndi_devi_offline(cdip,
5123 			    NDI_DEVFS_CLEAN) != NDI_SUCCESS) {
5124 				/*
5125 				 * ndi_devi_offline() failed.
5126 				 * This pHCI provides the critical path
5127 				 * to one or more client devices.
5128 				 * Return busy.
5129 				 */
5130 				MDI_PHCI_LOCK(ph);
5131 				MDI_DEBUG(1, (MDI_WARN, dip,
5132 				    "!pHCI device is busy. "
5133 				    "This device can not be removed at this "
5134 				    "moment. Please try again later."));
5135 				failed_pip = pip;
5136 				break;
5137 			} else {
5138 				MDI_PHCI_LOCK(ph);
5139 				pip = next;
5140 			}
5141 		} else {
5142 			i_mdi_client_unlock(ct);
5143 			pip = next;
5144 		}
5145 	}
5146 
5147 	if (failed_pip) {
5148 		pip = ph->ph_path_head;
5149 		while (pip != failed_pip) {
5150 			MDI_PI_LOCK(pip);
5151 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5152 			ct = MDI_PI(pip)->pi_client;
5153 			i_mdi_client_lock(ct, pip);
5154 			cdip = ct->ct_dip;
5155 			switch (MDI_CLIENT_STATE(ct)) {
5156 			case MDI_CLIENT_STATE_OPTIMAL:
5157 			case MDI_CLIENT_STATE_DEGRADED:
5158 				if (cdip) {
5159 					MDI_PI_UNLOCK(pip);
5160 					i_mdi_client_unlock(ct);
5161 					MDI_PHCI_UNLOCK(ph);
5162 					(void) ndi_devi_online(cdip, 0);
5163 					MDI_PHCI_LOCK(ph);
5164 					pip = next;
5165 					continue;
5166 				}
5167 				break;
5168 
5169 			case MDI_CLIENT_STATE_FAILED:
5170 				if (cdip) {
5171 					MDI_PI_UNLOCK(pip);
5172 					i_mdi_client_unlock(ct);
5173 					MDI_PHCI_UNLOCK(ph);
5174 					(void) ndi_devi_offline(cdip,
5175 						NDI_DEVFS_CLEAN);
5176 					MDI_PHCI_LOCK(ph);
5177 					pip = next;
5178 					continue;
5179 				}
5180 				break;
5181 			}
5182 			MDI_PI_UNLOCK(pip);
5183 			i_mdi_client_unlock(ct);
5184 			pip = next;
5185 		}
5186 		MDI_PHCI_UNLOCK(ph);
5187 		return (NDI_BUSY);
5188 	}
5189 
5190 	/*
5191 	 * Mark the pHCI as offline
5192 	 */
5193 	MDI_PHCI_SET_OFFLINE(ph);
5194 
5195 	/*
5196 	 * Mark the child mdi_pathinfo nodes as transient
5197 	 */
5198 	pip = ph->ph_path_head;
5199 	while (pip != NULL) {
5200 		MDI_PI_LOCK(pip);
5201 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5202 		MDI_PI_SET_OFFLINING(pip);
5203 		MDI_PI_UNLOCK(pip);
5204 		pip = next;
5205 	}
5206 	MDI_PHCI_UNLOCK(ph);
5207 	/*
5208 	 * Give a chance for any pending commands to execute
5209 	 */
5210 	delay_random(mdi_delay);
5211 	MDI_PHCI_LOCK(ph);
5212 	pip = ph->ph_path_head;
5213 	while (pip != NULL) {
5214 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5215 		(void) i_mdi_pi_offline(pip, flags);
5216 		MDI_PI_LOCK(pip);
5217 		ct = MDI_PI(pip)->pi_client;
5218 		if (!MDI_PI_IS_OFFLINE(pip)) {
5219 			MDI_DEBUG(1, (MDI_WARN, dip,
5220 			    "!pHCI device is busy. "
5221 			    "This device can not be removed at this moment. "
5222 			    "Please try again later."));
5223 			MDI_PI_UNLOCK(pip);
5224 			MDI_PHCI_SET_ONLINE(ph);
5225 			MDI_PHCI_UNLOCK(ph);
5226 			return (NDI_BUSY);
5227 		}
5228 		MDI_PI_UNLOCK(pip);
5229 		pip = next;
5230 	}
5231 	MDI_PHCI_UNLOCK(ph);
5232 
5233 	return (rv);
5234 }
5235 
5236 void
5237 mdi_phci_mark_retiring(dev_info_t *dip, char **cons_array)
5238 {
5239 	mdi_phci_t	*ph;
5240 	mdi_client_t	*ct;
5241 	mdi_pathinfo_t	*pip;
5242 	mdi_pathinfo_t	*next;
5243 	dev_info_t	*cdip;
5244 
5245 	if (!MDI_PHCI(dip))
5246 		return;
5247 
5248 	ph = i_devi_get_phci(dip);
5249 	if (ph == NULL) {
5250 		return;
5251 	}
5252 
5253 	MDI_PHCI_LOCK(ph);
5254 
5255 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5256 		/* has no last path */
5257 		MDI_PHCI_UNLOCK(ph);
5258 		return;
5259 	}
5260 
5261 	pip = ph->ph_path_head;
5262 	while (pip != NULL) {
5263 		MDI_PI_LOCK(pip);
5264 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5265 
5266 		ct = MDI_PI(pip)->pi_client;
5267 		i_mdi_client_lock(ct, pip);
5268 		MDI_PI_UNLOCK(pip);
5269 
5270 		cdip = ct->ct_dip;
5271 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5272 		    (i_mdi_client_compute_state(ct, ph) ==
5273 		    MDI_CLIENT_STATE_FAILED)) {
5274 			/* Last path. Mark client dip as retiring */
5275 			i_mdi_client_unlock(ct);
5276 			MDI_PHCI_UNLOCK(ph);
5277 			(void) e_ddi_mark_retiring(cdip, cons_array);
5278 			MDI_PHCI_LOCK(ph);
5279 			pip = next;
5280 		} else {
5281 			i_mdi_client_unlock(ct);
5282 			pip = next;
5283 		}
5284 	}
5285 
5286 	MDI_PHCI_UNLOCK(ph);
5287 
5288 	return;
5289 }
5290 
5291 void
5292 mdi_phci_retire_notify(dev_info_t *dip, int *constraint)
5293 {
5294 	mdi_phci_t	*ph;
5295 	mdi_client_t	*ct;
5296 	mdi_pathinfo_t	*pip;
5297 	mdi_pathinfo_t	*next;
5298 	dev_info_t	*cdip;
5299 
5300 	if (!MDI_PHCI(dip))
5301 		return;
5302 
5303 	ph = i_devi_get_phci(dip);
5304 	if (ph == NULL)
5305 		return;
5306 
5307 	MDI_PHCI_LOCK(ph);
5308 
5309 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5310 		MDI_PHCI_UNLOCK(ph);
5311 		/* not last path */
5312 		return;
5313 	}
5314 
5315 	if (ph->ph_unstable) {
5316 		MDI_PHCI_UNLOCK(ph);
5317 		/* can't check for constraints */
5318 		*constraint = 0;
5319 		return;
5320 	}
5321 
5322 	pip = ph->ph_path_head;
5323 	while (pip != NULL) {
5324 		MDI_PI_LOCK(pip);
5325 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5326 
5327 		/*
5328 		 * The mdi_pathinfo state is OK. Check the client state.
5329 		 * If failover in progress fail the pHCI from offlining
5330 		 */
5331 		ct = MDI_PI(pip)->pi_client;
5332 		i_mdi_client_lock(ct, pip);
5333 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5334 		    (ct->ct_unstable)) {
5335 			/*
5336 			 * Failover is in progress, can't check for constraints
5337 			 */
5338 			MDI_PI_UNLOCK(pip);
5339 			i_mdi_client_unlock(ct);
5340 			MDI_PHCI_UNLOCK(ph);
5341 			*constraint = 0;
5342 			return;
5343 		}
5344 		MDI_PI_UNLOCK(pip);
5345 
5346 		/*
5347 		 * Check to see of we are retiring the last path of this
5348 		 * client device...
5349 		 */
5350 		cdip = ct->ct_dip;
5351 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5352 		    (i_mdi_client_compute_state(ct, ph) ==
5353 		    MDI_CLIENT_STATE_FAILED)) {
5354 			i_mdi_client_unlock(ct);
5355 			MDI_PHCI_UNLOCK(ph);
5356 			(void) e_ddi_retire_notify(cdip, constraint);
5357 			MDI_PHCI_LOCK(ph);
5358 			pip = next;
5359 		} else {
5360 			i_mdi_client_unlock(ct);
5361 			pip = next;
5362 		}
5363 	}
5364 
5365 	MDI_PHCI_UNLOCK(ph);
5366 
5367 	return;
5368 }
5369 
5370 /*
5371  * offline the path(s) hanging off the pHCI. If the
5372  * last path to any client, check that constraints
5373  * have been applied.
5374  */
5375 void
5376 mdi_phci_retire_finalize(dev_info_t *dip, int phci_only)
5377 {
5378 	mdi_phci_t	*ph;
5379 	mdi_client_t	*ct;
5380 	mdi_pathinfo_t	*pip;
5381 	mdi_pathinfo_t	*next;
5382 	dev_info_t	*cdip;
5383 	int		unstable = 0;
5384 	int		constraint;
5385 
5386 	if (!MDI_PHCI(dip))
5387 		return;
5388 
5389 	ph = i_devi_get_phci(dip);
5390 	if (ph == NULL) {
5391 		/* no last path and no pips */
5392 		return;
5393 	}
5394 
5395 	MDI_PHCI_LOCK(ph);
5396 
5397 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5398 		MDI_PHCI_UNLOCK(ph);
5399 		/* no last path and no pips */
5400 		return;
5401 	}
5402 
5403 	/*
5404 	 * Check to see if the pHCI can be offlined
5405 	 */
5406 	if (ph->ph_unstable) {
5407 		unstable = 1;
5408 	}
5409 
5410 	pip = ph->ph_path_head;
5411 	while (pip != NULL) {
5412 		MDI_PI_LOCK(pip);
5413 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5414 
5415 		/*
5416 		 * if failover in progress fail the pHCI from offlining
5417 		 */
5418 		ct = MDI_PI(pip)->pi_client;
5419 		i_mdi_client_lock(ct, pip);
5420 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5421 		    (ct->ct_unstable)) {
5422 			unstable = 1;
5423 		}
5424 		MDI_PI_UNLOCK(pip);
5425 
5426 		/*
5427 		 * Check to see of we are removing the last path of this
5428 		 * client device...
5429 		 */
5430 		cdip = ct->ct_dip;
5431 		if (!phci_only && cdip &&
5432 		    (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5433 		    (i_mdi_client_compute_state(ct, ph) ==
5434 		    MDI_CLIENT_STATE_FAILED)) {
5435 			i_mdi_client_unlock(ct);
5436 			MDI_PHCI_UNLOCK(ph);
5437 			/*
5438 			 * We don't retire clients we just retire the
5439 			 * path to a client. If it is the last path
5440 			 * to a client, constraints are checked and
5441 			 * if we pass the last path is offlined. MPXIO will
5442 			 * then fail all I/Os to the client. Since we don't
5443 			 * want to retire the client on a path error
5444 			 * set constraint = 0 so that the client dip
5445 			 * is not retired.
5446 			 */
5447 			constraint = 0;
5448 			(void) e_ddi_retire_finalize(cdip, &constraint);
5449 			MDI_PHCI_LOCK(ph);
5450 			pip = next;
5451 		} else {
5452 			i_mdi_client_unlock(ct);
5453 			pip = next;
5454 		}
5455 	}
5456 
5457 	/*
5458 	 * Cannot offline pip(s)
5459 	 */
5460 	if (unstable) {
5461 		cmn_err(CE_WARN, "%s%d: mdi_phci_retire_finalize: "
5462 		    "pHCI in transient state, cannot retire",
5463 		    ddi_driver_name(dip), ddi_get_instance(dip));
5464 		MDI_PHCI_UNLOCK(ph);
5465 		return;
5466 	}
5467 
5468 	/*
5469 	 * Mark the pHCI as offline
5470 	 */
5471 	MDI_PHCI_SET_OFFLINE(ph);
5472 
5473 	/*
5474 	 * Mark the child mdi_pathinfo nodes as transient
5475 	 */
5476 	pip = ph->ph_path_head;
5477 	while (pip != NULL) {
5478 		MDI_PI_LOCK(pip);
5479 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5480 		MDI_PI_SET_OFFLINING(pip);
5481 		MDI_PI_UNLOCK(pip);
5482 		pip = next;
5483 	}
5484 	MDI_PHCI_UNLOCK(ph);
5485 	/*
5486 	 * Give a chance for any pending commands to execute
5487 	 */
5488 	delay_random(mdi_delay);
5489 	MDI_PHCI_LOCK(ph);
5490 	pip = ph->ph_path_head;
5491 	while (pip != NULL) {
5492 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5493 		(void) i_mdi_pi_offline(pip, 0);
5494 		MDI_PI_LOCK(pip);
5495 		ct = MDI_PI(pip)->pi_client;
5496 		if (!MDI_PI_IS_OFFLINE(pip)) {
5497 			cmn_err(CE_WARN, "mdi_phci_retire_finalize: "
5498 			    "path %d %s busy, cannot offline",
5499 			    mdi_pi_get_path_instance(pip),
5500 			    mdi_pi_spathname(pip));
5501 			MDI_PI_UNLOCK(pip);
5502 			MDI_PHCI_SET_ONLINE(ph);
5503 			MDI_PHCI_UNLOCK(ph);
5504 			return;
5505 		}
5506 		MDI_PI_UNLOCK(pip);
5507 		pip = next;
5508 	}
5509 	MDI_PHCI_UNLOCK(ph);
5510 
5511 	return;
5512 }
5513 
5514 void
5515 mdi_phci_unretire(dev_info_t *dip)
5516 {
5517 	ASSERT(MDI_PHCI(dip));
5518 
5519 	/*
5520 	 * Online the phci
5521 	 */
5522 	i_mdi_phci_online(dip);
5523 }
5524 
5525 /*ARGSUSED*/
5526 static int
5527 i_mdi_client_offline(dev_info_t *dip, uint_t flags)
5528 {
5529 	int		rv = NDI_SUCCESS;
5530 	mdi_client_t	*ct;
5531 
5532 	/*
5533 	 * Client component to go offline.  Make sure that we are
5534 	 * not in failing over state and update client state
5535 	 * accordingly
5536 	 */
5537 	ct = i_devi_get_client(dip);
5538 	MDI_DEBUG(2, (MDI_NOTE, dip,
5539 	    "called %p %p", (void *)dip, (void *)ct));
5540 	if (ct != NULL) {
5541 		MDI_CLIENT_LOCK(ct);
5542 		if (ct->ct_unstable) {
5543 			/*
5544 			 * One or more paths are in transient state,
5545 			 * Dont allow offline of a client device
5546 			 */
5547 			MDI_DEBUG(1, (MDI_WARN, dip,
5548 			    "!One or more paths to "
5549 			    "this device are in transient state. "
5550 			    "This device can not be removed at this moment. "
5551 			    "Please try again later."));
5552 			MDI_CLIENT_UNLOCK(ct);
5553 			return (NDI_BUSY);
5554 		}
5555 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
5556 			/*
5557 			 * Failover is in progress, Dont allow DR of
5558 			 * a client device
5559 			 */
5560 			MDI_DEBUG(1, (MDI_WARN, dip,
5561 			    "!Client device is Busy. "
5562 			    "This device can not be removed at this moment. "
5563 			    "Please try again later."));
5564 			MDI_CLIENT_UNLOCK(ct);
5565 			return (NDI_BUSY);
5566 		}
5567 		MDI_CLIENT_SET_OFFLINE(ct);
5568 
5569 		/*
5570 		 * Unbind our relationship with the dev_info node
5571 		 */
5572 		if (flags & NDI_DEVI_REMOVE) {
5573 			ct->ct_dip = NULL;
5574 		}
5575 		MDI_CLIENT_UNLOCK(ct);
5576 	}
5577 	return (rv);
5578 }
5579 
5580 /*
5581  * mdi_pre_attach():
5582  *		Pre attach() notification handler
5583  */
5584 /*ARGSUSED*/
5585 int
5586 mdi_pre_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5587 {
5588 	/* don't support old DDI_PM_RESUME */
5589 	if ((DEVI(dip)->devi_mdi_component != MDI_COMPONENT_NONE) &&
5590 	    (cmd == DDI_PM_RESUME))
5591 		return (DDI_FAILURE);
5592 
5593 	return (DDI_SUCCESS);
5594 }
5595 
5596 /*
5597  * mdi_post_attach():
5598  *		Post attach() notification handler
5599  */
5600 /*ARGSUSED*/
5601 void
5602 mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
5603 {
5604 	mdi_phci_t	*ph;
5605 	mdi_client_t	*ct;
5606 	mdi_vhci_t	*vh;
5607 
5608 	if (MDI_PHCI(dip)) {
5609 		ph = i_devi_get_phci(dip);
5610 		ASSERT(ph != NULL);
5611 
5612 		MDI_PHCI_LOCK(ph);
5613 		switch (cmd) {
5614 		case DDI_ATTACH:
5615 			MDI_DEBUG(2, (MDI_NOTE, dip,
5616 			    "phci post_attach called %p", (void *)ph));
5617 			if (error == DDI_SUCCESS) {
5618 				MDI_PHCI_SET_ATTACH(ph);
5619 			} else {
5620 				MDI_DEBUG(1, (MDI_NOTE, dip,
5621 				    "!pHCI post_attach failed: error %d",
5622 				    error));
5623 				MDI_PHCI_SET_DETACH(ph);
5624 			}
5625 			break;
5626 
5627 		case DDI_RESUME:
5628 			MDI_DEBUG(2, (MDI_NOTE, dip,
5629 			    "pHCI post_resume: called %p", (void *)ph));
5630 			if (error == DDI_SUCCESS) {
5631 				MDI_PHCI_SET_RESUME(ph);
5632 			} else {
5633 				MDI_DEBUG(1, (MDI_NOTE, dip,
5634 				    "!pHCI post_resume failed: error %d",
5635 				    error));
5636 				MDI_PHCI_SET_SUSPEND(ph);
5637 			}
5638 			break;
5639 		}
5640 		MDI_PHCI_UNLOCK(ph);
5641 	}
5642 
5643 	if (MDI_CLIENT(dip)) {
5644 		ct = i_devi_get_client(dip);
5645 		ASSERT(ct != NULL);
5646 
5647 		MDI_CLIENT_LOCK(ct);
5648 		switch (cmd) {
5649 		case DDI_ATTACH:
5650 			MDI_DEBUG(2, (MDI_NOTE, dip,
5651 			    "client post_attach called %p", (void *)ct));
5652 			if (error != DDI_SUCCESS) {
5653 				MDI_DEBUG(1, (MDI_NOTE, dip,
5654 				    "!client post_attach failed: error %d",
5655 				    error));
5656 				MDI_CLIENT_SET_DETACH(ct);
5657 				MDI_DEBUG(4, (MDI_WARN, dip,
5658 				    "i_mdi_pm_reset_client"));
5659 				i_mdi_pm_reset_client(ct);
5660 				break;
5661 			}
5662 
5663 			/*
5664 			 * Client device has successfully attached, inform
5665 			 * the vhci.
5666 			 */
5667 			vh = ct->ct_vhci;
5668 			if (vh->vh_ops->vo_client_attached)
5669 				(*vh->vh_ops->vo_client_attached)(dip);
5670 
5671 			MDI_CLIENT_SET_ATTACH(ct);
5672 			break;
5673 
5674 		case DDI_RESUME:
5675 			MDI_DEBUG(2, (MDI_NOTE, dip,
5676 			    "client post_attach: called %p", (void *)ct));
5677 			if (error == DDI_SUCCESS) {
5678 				MDI_CLIENT_SET_RESUME(ct);
5679 			} else {
5680 				MDI_DEBUG(1, (MDI_NOTE, dip,
5681 				    "!client post_resume failed: error %d",
5682 				    error));
5683 				MDI_CLIENT_SET_SUSPEND(ct);
5684 			}
5685 			break;
5686 		}
5687 		MDI_CLIENT_UNLOCK(ct);
5688 	}
5689 }
5690 
5691 /*
5692  * mdi_pre_detach():
5693  *		Pre detach notification handler
5694  */
5695 /*ARGSUSED*/
5696 int
5697 mdi_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5698 {
5699 	int rv = DDI_SUCCESS;
5700 
5701 	if (MDI_CLIENT(dip)) {
5702 		(void) i_mdi_client_pre_detach(dip, cmd);
5703 	}
5704 
5705 	if (MDI_PHCI(dip)) {
5706 		rv = i_mdi_phci_pre_detach(dip, cmd);
5707 	}
5708 
5709 	return (rv);
5710 }
5711 
5712 /*ARGSUSED*/
5713 static int
5714 i_mdi_phci_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5715 {
5716 	int		rv = DDI_SUCCESS;
5717 	mdi_phci_t	*ph;
5718 	mdi_client_t	*ct;
5719 	mdi_pathinfo_t	*pip;
5720 	mdi_pathinfo_t	*failed_pip = NULL;
5721 	mdi_pathinfo_t	*next;
5722 
5723 	ph = i_devi_get_phci(dip);
5724 	if (ph == NULL) {
5725 		return (rv);
5726 	}
5727 
5728 	MDI_PHCI_LOCK(ph);
5729 	switch (cmd) {
5730 	case DDI_DETACH:
5731 		MDI_DEBUG(2, (MDI_NOTE, dip,
5732 		    "pHCI pre_detach: called %p", (void *)ph));
5733 		if (!MDI_PHCI_IS_OFFLINE(ph)) {
5734 			/*
5735 			 * mdi_pathinfo nodes are still attached to
5736 			 * this pHCI. Fail the detach for this pHCI.
5737 			 */
5738 			MDI_DEBUG(2, (MDI_WARN, dip,
5739 			    "pHCI pre_detach: paths are still attached %p",
5740 			    (void *)ph));
5741 			rv = DDI_FAILURE;
5742 			break;
5743 		}
5744 		MDI_PHCI_SET_DETACH(ph);
5745 		break;
5746 
5747 	case DDI_SUSPEND:
5748 		/*
5749 		 * pHCI is getting suspended.  Since mpxio client
5750 		 * devices may not be suspended at this point, to avoid
5751 		 * a potential stack overflow, it is important to suspend
5752 		 * client devices before pHCI can be suspended.
5753 		 */
5754 
5755 		MDI_DEBUG(2, (MDI_NOTE, dip,
5756 		    "pHCI pre_suspend: called %p", (void *)ph));
5757 		/*
5758 		 * Suspend all the client devices accessible through this pHCI
5759 		 */
5760 		pip = ph->ph_path_head;
5761 		while (pip != NULL && rv == DDI_SUCCESS) {
5762 			dev_info_t *cdip;
5763 			MDI_PI_LOCK(pip);
5764 			next =
5765 			    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5766 			ct = MDI_PI(pip)->pi_client;
5767 			i_mdi_client_lock(ct, pip);
5768 			cdip = ct->ct_dip;
5769 			MDI_PI_UNLOCK(pip);
5770 			if ((MDI_CLIENT_IS_DETACHED(ct) == 0) &&
5771 			    MDI_CLIENT_IS_SUSPENDED(ct) == 0) {
5772 				i_mdi_client_unlock(ct);
5773 				if ((rv = devi_detach(cdip, DDI_SUSPEND)) !=
5774 				    DDI_SUCCESS) {
5775 					/*
5776 					 * Suspend of one of the client
5777 					 * device has failed.
5778 					 */
5779 					MDI_DEBUG(1, (MDI_WARN, dip,
5780 					    "!suspend of device (%s%d) failed.",
5781 					    ddi_driver_name(cdip),
5782 					    ddi_get_instance(cdip)));
5783 					failed_pip = pip;
5784 					break;
5785 				}
5786 			} else {
5787 				i_mdi_client_unlock(ct);
5788 			}
5789 			pip = next;
5790 		}
5791 
5792 		if (rv == DDI_SUCCESS) {
5793 			/*
5794 			 * Suspend of client devices is complete. Proceed
5795 			 * with pHCI suspend.
5796 			 */
5797 			MDI_PHCI_SET_SUSPEND(ph);
5798 		} else {
5799 			/*
5800 			 * Revert back all the suspended client device states
5801 			 * to converse.
5802 			 */
5803 			pip = ph->ph_path_head;
5804 			while (pip != failed_pip) {
5805 				dev_info_t *cdip;
5806 				MDI_PI_LOCK(pip);
5807 				next =
5808 				    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5809 				ct = MDI_PI(pip)->pi_client;
5810 				i_mdi_client_lock(ct, pip);
5811 				cdip = ct->ct_dip;
5812 				MDI_PI_UNLOCK(pip);
5813 				if (MDI_CLIENT_IS_SUSPENDED(ct)) {
5814 					i_mdi_client_unlock(ct);
5815 					(void) devi_attach(cdip, DDI_RESUME);
5816 				} else {
5817 					i_mdi_client_unlock(ct);
5818 				}
5819 				pip = next;
5820 			}
5821 		}
5822 		break;
5823 
5824 	default:
5825 		rv = DDI_FAILURE;
5826 		break;
5827 	}
5828 	MDI_PHCI_UNLOCK(ph);
5829 	return (rv);
5830 }
5831 
5832 /*ARGSUSED*/
5833 static int
5834 i_mdi_client_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5835 {
5836 	int		rv = DDI_SUCCESS;
5837 	mdi_client_t	*ct;
5838 
5839 	ct = i_devi_get_client(dip);
5840 	if (ct == NULL) {
5841 		return (rv);
5842 	}
5843 
5844 	MDI_CLIENT_LOCK(ct);
5845 	switch (cmd) {
5846 	case DDI_DETACH:
5847 		MDI_DEBUG(2, (MDI_NOTE, dip,
5848 		    "client pre_detach: called %p",
5849 		     (void *)ct));
5850 		MDI_CLIENT_SET_DETACH(ct);
5851 		break;
5852 
5853 	case DDI_SUSPEND:
5854 		MDI_DEBUG(2, (MDI_NOTE, dip,
5855 		    "client pre_suspend: called %p",
5856 		    (void *)ct));
5857 		MDI_CLIENT_SET_SUSPEND(ct);
5858 		break;
5859 
5860 	default:
5861 		rv = DDI_FAILURE;
5862 		break;
5863 	}
5864 	MDI_CLIENT_UNLOCK(ct);
5865 	return (rv);
5866 }
5867 
5868 /*
5869  * mdi_post_detach():
5870  *		Post detach notification handler
5871  */
5872 /*ARGSUSED*/
5873 void
5874 mdi_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5875 {
5876 	/*
5877 	 * Detach/Suspend of mpxio component failed. Update our state
5878 	 * too
5879 	 */
5880 	if (MDI_PHCI(dip))
5881 		i_mdi_phci_post_detach(dip, cmd, error);
5882 
5883 	if (MDI_CLIENT(dip))
5884 		i_mdi_client_post_detach(dip, cmd, error);
5885 }
5886 
5887 /*ARGSUSED*/
5888 static void
5889 i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5890 {
5891 	mdi_phci_t	*ph;
5892 
5893 	/*
5894 	 * Detach/Suspend of phci component failed. Update our state
5895 	 * too
5896 	 */
5897 	ph = i_devi_get_phci(dip);
5898 	if (ph == NULL) {
5899 		return;
5900 	}
5901 
5902 	MDI_PHCI_LOCK(ph);
5903 	/*
5904 	 * Detach of pHCI failed. Restore back converse
5905 	 * state
5906 	 */
5907 	switch (cmd) {
5908 	case DDI_DETACH:
5909 		MDI_DEBUG(2, (MDI_NOTE, dip,
5910 		    "pHCI post_detach: called %p",
5911 		    (void *)ph));
5912 		if (error != DDI_SUCCESS)
5913 			MDI_PHCI_SET_ATTACH(ph);
5914 		break;
5915 
5916 	case DDI_SUSPEND:
5917 		MDI_DEBUG(2, (MDI_NOTE, dip,
5918 		    "pHCI post_suspend: called %p",
5919 		    (void *)ph));
5920 		if (error != DDI_SUCCESS)
5921 			MDI_PHCI_SET_RESUME(ph);
5922 		break;
5923 	}
5924 	MDI_PHCI_UNLOCK(ph);
5925 }
5926 
5927 /*ARGSUSED*/
5928 static void
5929 i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5930 {
5931 	mdi_client_t	*ct;
5932 
5933 	ct = i_devi_get_client(dip);
5934 	if (ct == NULL) {
5935 		return;
5936 	}
5937 	MDI_CLIENT_LOCK(ct);
5938 	/*
5939 	 * Detach of Client failed. Restore back converse
5940 	 * state
5941 	 */
5942 	switch (cmd) {
5943 	case DDI_DETACH:
5944 		MDI_DEBUG(2, (MDI_NOTE, dip,
5945 		    "client post_detach: called %p", (void *)ct));
5946 		if (DEVI_IS_ATTACHING(ct->ct_dip)) {
5947 			MDI_DEBUG(4, (MDI_NOTE, dip,
5948 			    "i_mdi_pm_rele_client\n"));
5949 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
5950 		} else {
5951 			MDI_DEBUG(4, (MDI_NOTE, dip,
5952 			    "i_mdi_pm_reset_client\n"));
5953 			i_mdi_pm_reset_client(ct);
5954 		}
5955 		if (error != DDI_SUCCESS)
5956 			MDI_CLIENT_SET_ATTACH(ct);
5957 		break;
5958 
5959 	case DDI_SUSPEND:
5960 		MDI_DEBUG(2, (MDI_NOTE, dip,
5961 		    "called %p", (void *)ct));
5962 		if (error != DDI_SUCCESS)
5963 			MDI_CLIENT_SET_RESUME(ct);
5964 		break;
5965 	}
5966 	MDI_CLIENT_UNLOCK(ct);
5967 }
5968 
5969 int
5970 mdi_pi_kstat_exists(mdi_pathinfo_t *pip)
5971 {
5972 	return (MDI_PI(pip)->pi_kstats ? 1 : 0);
5973 }
5974 
5975 /*
5976  * create and install per-path (client - pHCI) statistics
5977  * I/O stats supported: nread, nwritten, reads, and writes
5978  * Error stats - hard errors, soft errors, & transport errors
5979  */
5980 int
5981 mdi_pi_kstat_create(mdi_pathinfo_t *pip, char *ksname)
5982 {
5983 	kstat_t			*kiosp, *kerrsp;
5984 	struct pi_errs		*nsp;
5985 	struct mdi_pi_kstats	*mdi_statp;
5986 
5987 	if (MDI_PI(pip)->pi_kstats != NULL)
5988 		return (MDI_SUCCESS);
5989 
5990 	if ((kiosp = kstat_create("mdi", 0, ksname, "iopath",
5991 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
5992 		return (MDI_FAILURE);
5993 	}
5994 
5995 	(void) strcat(ksname, ",err");
5996 	kerrsp = kstat_create("mdi", 0, ksname, "iopath_errors",
5997 	    KSTAT_TYPE_NAMED,
5998 	    sizeof (struct pi_errs) / sizeof (kstat_named_t), 0);
5999 	if (kerrsp == NULL) {
6000 		kstat_delete(kiosp);
6001 		return (MDI_FAILURE);
6002 	}
6003 
6004 	nsp = (struct pi_errs *)kerrsp->ks_data;
6005 	kstat_named_init(&nsp->pi_softerrs, "Soft Errors", KSTAT_DATA_UINT32);
6006 	kstat_named_init(&nsp->pi_harderrs, "Hard Errors", KSTAT_DATA_UINT32);
6007 	kstat_named_init(&nsp->pi_transerrs, "Transport Errors",
6008 	    KSTAT_DATA_UINT32);
6009 	kstat_named_init(&nsp->pi_icnt_busy, "Interconnect Busy",
6010 	    KSTAT_DATA_UINT32);
6011 	kstat_named_init(&nsp->pi_icnt_errors, "Interconnect Errors",
6012 	    KSTAT_DATA_UINT32);
6013 	kstat_named_init(&nsp->pi_phci_rsrc, "pHCI No Resources",
6014 	    KSTAT_DATA_UINT32);
6015 	kstat_named_init(&nsp->pi_phci_localerr, "pHCI Local Errors",
6016 	    KSTAT_DATA_UINT32);
6017 	kstat_named_init(&nsp->pi_phci_invstate, "pHCI Invalid State",
6018 	    KSTAT_DATA_UINT32);
6019 	kstat_named_init(&nsp->pi_failedfrom, "Failed From",
6020 	    KSTAT_DATA_UINT32);
6021 	kstat_named_init(&nsp->pi_failedto, "Failed To", KSTAT_DATA_UINT32);
6022 
6023 	mdi_statp = kmem_alloc(sizeof (*mdi_statp), KM_SLEEP);
6024 	mdi_statp->pi_kstat_ref = 1;
6025 	mdi_statp->pi_kstat_iostats = kiosp;
6026 	mdi_statp->pi_kstat_errstats = kerrsp;
6027 	kstat_install(kiosp);
6028 	kstat_install(kerrsp);
6029 	MDI_PI(pip)->pi_kstats = mdi_statp;
6030 	return (MDI_SUCCESS);
6031 }
6032 
6033 /*
6034  * destroy per-path properties
6035  */
6036 static void
6037 i_mdi_pi_kstat_destroy(mdi_pathinfo_t *pip)
6038 {
6039 
6040 	struct mdi_pi_kstats *mdi_statp;
6041 
6042 	if (MDI_PI(pip)->pi_kstats == NULL)
6043 		return;
6044 	if ((mdi_statp = MDI_PI(pip)->pi_kstats) == NULL)
6045 		return;
6046 
6047 	MDI_PI(pip)->pi_kstats = NULL;
6048 
6049 	/*
6050 	 * the kstat may be shared between multiple pathinfo nodes
6051 	 * decrement this pathinfo's usage, removing the kstats
6052 	 * themselves when the last pathinfo reference is removed.
6053 	 */
6054 	ASSERT(mdi_statp->pi_kstat_ref > 0);
6055 	if (--mdi_statp->pi_kstat_ref != 0)
6056 		return;
6057 
6058 	kstat_delete(mdi_statp->pi_kstat_iostats);
6059 	kstat_delete(mdi_statp->pi_kstat_errstats);
6060 	kmem_free(mdi_statp, sizeof (*mdi_statp));
6061 }
6062 
6063 /*
6064  * update I/O paths KSTATS
6065  */
6066 void
6067 mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp)
6068 {
6069 	kstat_t *iostatp;
6070 	size_t xfer_cnt;
6071 
6072 	ASSERT(pip != NULL);
6073 
6074 	/*
6075 	 * I/O can be driven across a path prior to having path
6076 	 * statistics available, i.e. probe(9e).
6077 	 */
6078 	if (bp != NULL && MDI_PI(pip)->pi_kstats != NULL) {
6079 		iostatp = MDI_PI(pip)->pi_kstats->pi_kstat_iostats;
6080 		xfer_cnt = bp->b_bcount - bp->b_resid;
6081 		if (bp->b_flags & B_READ) {
6082 			KSTAT_IO_PTR(iostatp)->reads++;
6083 			KSTAT_IO_PTR(iostatp)->nread += xfer_cnt;
6084 		} else {
6085 			KSTAT_IO_PTR(iostatp)->writes++;
6086 			KSTAT_IO_PTR(iostatp)->nwritten += xfer_cnt;
6087 		}
6088 	}
6089 }
6090 
6091 /*
6092  * Enable the path(specific client/target/initiator)
6093  * Enabling a path means that MPxIO may select the enabled path for routing
6094  * future I/O requests, subject to other path state constraints.
6095  */
6096 int
6097 mdi_pi_enable_path(mdi_pathinfo_t *pip, int flags)
6098 {
6099 	mdi_phci_t	*ph;
6100 
6101 	ph = MDI_PI(pip)->pi_phci;
6102 	if (ph == NULL) {
6103 		MDI_DEBUG(1, (MDI_NOTE, mdi_pi_get_phci(pip),
6104 		    "!failed: path %s %p: NULL ph",
6105 		    mdi_pi_spathname(pip), (void *)pip));
6106 		return (MDI_FAILURE);
6107 	}
6108 
6109 	(void) i_mdi_enable_disable_path(pip, ph->ph_vhci, flags,
6110 		MDI_ENABLE_OP);
6111 	MDI_DEBUG(5, (MDI_NOTE, ph->ph_dip,
6112 	    "!returning success pip = %p. ph = %p",
6113 	    (void *)pip, (void *)ph));
6114 	return (MDI_SUCCESS);
6115 
6116 }
6117 
6118 /*
6119  * Disable the path (specific client/target/initiator)
6120  * Disabling a path means that MPxIO will not select the disabled path for
6121  * routing any new I/O requests.
6122  */
6123 int
6124 mdi_pi_disable_path(mdi_pathinfo_t *pip, int flags)
6125 {
6126 	mdi_phci_t	*ph;
6127 
6128 	ph = MDI_PI(pip)->pi_phci;
6129 	if (ph == NULL) {
6130 		MDI_DEBUG(1, (MDI_NOTE, mdi_pi_get_phci(pip),
6131 		    "!failed: path %s %p: NULL ph",
6132 		    mdi_pi_spathname(pip), (void *)pip));
6133 		return (MDI_FAILURE);
6134 	}
6135 
6136 	(void) i_mdi_enable_disable_path(pip,
6137 	    ph->ph_vhci, flags, MDI_DISABLE_OP);
6138 	MDI_DEBUG(5, (MDI_NOTE, ph->ph_dip,
6139 	    "!returning success pip = %p. ph = %p",
6140 	    (void *)pip, (void *)ph));
6141 	return (MDI_SUCCESS);
6142 }
6143 
6144 /*
6145  * disable the path to a particular pHCI (pHCI specified in the phci_path
6146  * argument) for a particular client (specified in the client_path argument).
6147  * Disabling a path means that MPxIO will not select the disabled path for
6148  * routing any new I/O requests.
6149  * NOTE: this will be removed once the NWS files are changed to use the new
6150  * mdi_{enable,disable}_path interfaces
6151  */
6152 int
6153 mdi_pi_disable(dev_info_t *cdip, dev_info_t *pdip, int flags)
6154 {
6155 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_DISABLE_OP));
6156 }
6157 
6158 /*
6159  * Enable the path to a particular pHCI (pHCI specified in the phci_path
6160  * argument) for a particular client (specified in the client_path argument).
6161  * Enabling a path means that MPxIO may select the enabled path for routing
6162  * future I/O requests, subject to other path state constraints.
6163  * NOTE: this will be removed once the NWS files are changed to use the new
6164  * mdi_{enable,disable}_path interfaces
6165  */
6166 
6167 int
6168 mdi_pi_enable(dev_info_t *cdip, dev_info_t *pdip, int flags)
6169 {
6170 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_ENABLE_OP));
6171 }
6172 
6173 /*
6174  * Common routine for doing enable/disable.
6175  */
6176 static mdi_pathinfo_t *
6177 i_mdi_enable_disable_path(mdi_pathinfo_t *pip, mdi_vhci_t *vh, int flags,
6178 		int op)
6179 {
6180 	int		sync_flag = 0;
6181 	int		rv;
6182 	mdi_pathinfo_t 	*next;
6183 	int		(*f)() = NULL;
6184 
6185 	/*
6186 	 * Check to make sure the path is not already in the
6187 	 * requested state. If it is just return the next path
6188 	 * as we have nothing to do here.
6189 	 */
6190 	if ((MDI_PI_IS_DISABLE(pip) && op == MDI_DISABLE_OP) ||
6191 	    (!MDI_PI_IS_DISABLE(pip) && op == MDI_ENABLE_OP)) {
6192 		MDI_PI_LOCK(pip);
6193 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
6194 		MDI_PI_UNLOCK(pip);
6195 		return (next);
6196 	}
6197 
6198 	f = vh->vh_ops->vo_pi_state_change;
6199 
6200 	sync_flag = (flags << 8) & 0xf00;
6201 
6202 	/*
6203 	 * Do a callback into the mdi consumer to let it
6204 	 * know that path is about to get enabled/disabled.
6205 	 */
6206 	if (f != NULL) {
6207 		rv = (*f)(vh->vh_dip, pip, 0,
6208 			MDI_PI_EXT_STATE(pip),
6209 			MDI_EXT_STATE_CHANGE | sync_flag |
6210 			op | MDI_BEFORE_STATE_CHANGE);
6211 		if (rv != MDI_SUCCESS) {
6212 			MDI_DEBUG(2, (MDI_WARN, vh->vh_dip,
6213 			    "vo_pi_state_change: failed rv = %x", rv));
6214 		}
6215 	}
6216 	MDI_PI_LOCK(pip);
6217 	next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
6218 
6219 	switch (flags) {
6220 		case USER_DISABLE:
6221 			if (op == MDI_DISABLE_OP) {
6222 				MDI_PI_SET_USER_DISABLE(pip);
6223 			} else {
6224 				MDI_PI_SET_USER_ENABLE(pip);
6225 			}
6226 			break;
6227 		case DRIVER_DISABLE:
6228 			if (op == MDI_DISABLE_OP) {
6229 				MDI_PI_SET_DRV_DISABLE(pip);
6230 			} else {
6231 				MDI_PI_SET_DRV_ENABLE(pip);
6232 			}
6233 			break;
6234 		case DRIVER_DISABLE_TRANSIENT:
6235 			if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS) {
6236 				MDI_PI_SET_DRV_DISABLE_TRANS(pip);
6237 			} else {
6238 				MDI_PI_SET_DRV_ENABLE_TRANS(pip);
6239 			}
6240 			break;
6241 	}
6242 	MDI_PI_UNLOCK(pip);
6243 	/*
6244 	 * Do a callback into the mdi consumer to let it
6245 	 * know that path is now enabled/disabled.
6246 	 */
6247 	if (f != NULL) {
6248 		rv = (*f)(vh->vh_dip, pip, 0,
6249 			MDI_PI_EXT_STATE(pip),
6250 			MDI_EXT_STATE_CHANGE | sync_flag |
6251 			op | MDI_AFTER_STATE_CHANGE);
6252 		if (rv != MDI_SUCCESS) {
6253 			MDI_DEBUG(2, (MDI_WARN, vh->vh_dip,
6254 			    "vo_pi_state_change failed: rv = %x", rv));
6255 		}
6256 	}
6257 	return (next);
6258 }
6259 
6260 /*
6261  * Common routine for doing enable/disable.
6262  * NOTE: this will be removed once the NWS files are changed to use the new
6263  * mdi_{enable,disable}_path has been putback
6264  */
6265 int
6266 i_mdi_pi_enable_disable(dev_info_t *cdip, dev_info_t *pdip, int flags, int op)
6267 {
6268 
6269 	mdi_phci_t	*ph;
6270 	mdi_vhci_t	*vh = NULL;
6271 	mdi_client_t	*ct;
6272 	mdi_pathinfo_t	*next, *pip;
6273 	int		found_it;
6274 
6275 	ph = i_devi_get_phci(pdip);
6276 	MDI_DEBUG(5, (MDI_NOTE, cdip ? cdip : pdip,
6277 	    "!op = %d pdip = %p cdip = %p", op, (void *)pdip,
6278 	    (void *)cdip));
6279 	if (ph == NULL) {
6280 		MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6281 		    "!failed: operation %d: NULL ph", op));
6282 		return (MDI_FAILURE);
6283 	}
6284 
6285 	if ((op != MDI_ENABLE_OP) && (op != MDI_DISABLE_OP)) {
6286 		MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6287 		    "!failed: invalid operation %d", op));
6288 		return (MDI_FAILURE);
6289 	}
6290 
6291 	vh = ph->ph_vhci;
6292 
6293 	if (cdip == NULL) {
6294 		/*
6295 		 * Need to mark the Phci as enabled/disabled.
6296 		 */
6297 		MDI_DEBUG(4, (MDI_NOTE, cdip ? cdip : pdip,
6298 		    "op %d for the phci", op));
6299 		MDI_PHCI_LOCK(ph);
6300 		switch (flags) {
6301 			case USER_DISABLE:
6302 				if (op == MDI_DISABLE_OP) {
6303 					MDI_PHCI_SET_USER_DISABLE(ph);
6304 				} else {
6305 					MDI_PHCI_SET_USER_ENABLE(ph);
6306 				}
6307 				break;
6308 			case DRIVER_DISABLE:
6309 				if (op == MDI_DISABLE_OP) {
6310 					MDI_PHCI_SET_DRV_DISABLE(ph);
6311 				} else {
6312 					MDI_PHCI_SET_DRV_ENABLE(ph);
6313 				}
6314 				break;
6315 			case DRIVER_DISABLE_TRANSIENT:
6316 				if (op == MDI_DISABLE_OP) {
6317 					MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph);
6318 				} else {
6319 					MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph);
6320 				}
6321 				break;
6322 			default:
6323 				MDI_PHCI_UNLOCK(ph);
6324 				MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6325 				    "!invalid flag argument= %d", flags));
6326 		}
6327 
6328 		/*
6329 		 * Phci has been disabled. Now try to enable/disable
6330 		 * path info's to each client.
6331 		 */
6332 		pip = ph->ph_path_head;
6333 		while (pip != NULL) {
6334 			pip = i_mdi_enable_disable_path(pip, vh, flags, op);
6335 		}
6336 		MDI_PHCI_UNLOCK(ph);
6337 	} else {
6338 
6339 		/*
6340 		 * Disable a specific client.
6341 		 */
6342 		ct = i_devi_get_client(cdip);
6343 		if (ct == NULL) {
6344 			MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6345 			    "!failed: operation = %d: NULL ct", op));
6346 			return (MDI_FAILURE);
6347 		}
6348 
6349 		MDI_CLIENT_LOCK(ct);
6350 		pip = ct->ct_path_head;
6351 		found_it = 0;
6352 		while (pip != NULL) {
6353 			MDI_PI_LOCK(pip);
6354 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6355 			if (MDI_PI(pip)->pi_phci == ph) {
6356 				MDI_PI_UNLOCK(pip);
6357 				found_it = 1;
6358 				break;
6359 			}
6360 			MDI_PI_UNLOCK(pip);
6361 			pip = next;
6362 		}
6363 
6364 
6365 		MDI_CLIENT_UNLOCK(ct);
6366 		if (found_it == 0) {
6367 			MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6368 			    "!failed. Could not find corresponding pip\n"));
6369 			return (MDI_FAILURE);
6370 		}
6371 
6372 		(void) i_mdi_enable_disable_path(pip, vh, flags, op);
6373 	}
6374 
6375 	MDI_DEBUG(5, (MDI_NOTE, cdip ? cdip : pdip,
6376 	    "!op %d returning success pdip = %p cdip = %p",
6377 	    op, (void *)pdip, (void *)cdip));
6378 	return (MDI_SUCCESS);
6379 }
6380 
6381 /*
6382  * Ensure phci powered up
6383  */
6384 static void
6385 i_mdi_pm_hold_pip(mdi_pathinfo_t *pip)
6386 {
6387 	dev_info_t	*ph_dip;
6388 
6389 	ASSERT(pip != NULL);
6390 	ASSERT(MDI_PI_LOCKED(pip));
6391 
6392 	if (MDI_PI(pip)->pi_pm_held) {
6393 		return;
6394 	}
6395 
6396 	ph_dip = mdi_pi_get_phci(pip);
6397 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6398 	    "%s %p", mdi_pi_spathname(pip), (void *)pip));
6399 	if (ph_dip == NULL) {
6400 		return;
6401 	}
6402 
6403 	MDI_PI_UNLOCK(pip);
6404 	MDI_DEBUG(4, (MDI_NOTE, ph_dip, "kidsupcnt was %d",
6405 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6406 	pm_hold_power(ph_dip);
6407 	MDI_DEBUG(4, (MDI_NOTE, ph_dip, "kidsupcnt is %d",
6408 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6409 	MDI_PI_LOCK(pip);
6410 
6411 	/* If PM_GET_PM_INFO is NULL the pm_hold_power above was a noop */
6412 	if (DEVI(ph_dip)->devi_pm_info)
6413 		MDI_PI(pip)->pi_pm_held = 1;
6414 }
6415 
6416 /*
6417  * Allow phci powered down
6418  */
6419 static void
6420 i_mdi_pm_rele_pip(mdi_pathinfo_t *pip)
6421 {
6422 	dev_info_t	*ph_dip = NULL;
6423 
6424 	ASSERT(pip != NULL);
6425 	ASSERT(MDI_PI_LOCKED(pip));
6426 
6427 	if (MDI_PI(pip)->pi_pm_held == 0) {
6428 		return;
6429 	}
6430 
6431 	ph_dip = mdi_pi_get_phci(pip);
6432 	ASSERT(ph_dip != NULL);
6433 
6434 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6435 	    "%s %p", mdi_pi_spathname(pip), (void *)pip));
6436 
6437 	MDI_PI_UNLOCK(pip);
6438 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6439 	    "kidsupcnt was %d", DEVI(ph_dip)->devi_pm_kidsupcnt));
6440 	pm_rele_power(ph_dip);
6441 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6442 	    "kidsupcnt is %d", DEVI(ph_dip)->devi_pm_kidsupcnt));
6443 	MDI_PI_LOCK(pip);
6444 
6445 	MDI_PI(pip)->pi_pm_held = 0;
6446 }
6447 
6448 static void
6449 i_mdi_pm_hold_client(mdi_client_t *ct, int incr)
6450 {
6451 	ASSERT(MDI_CLIENT_LOCKED(ct));
6452 
6453 	ct->ct_power_cnt += incr;
6454 	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6455 	    "%p ct_power_cnt = %d incr = %d",
6456 	    (void *)ct, ct->ct_power_cnt, incr));
6457 	ASSERT(ct->ct_power_cnt >= 0);
6458 }
6459 
6460 static void
6461 i_mdi_rele_all_phci(mdi_client_t *ct)
6462 {
6463 	mdi_pathinfo_t  *pip;
6464 
6465 	ASSERT(MDI_CLIENT_LOCKED(ct));
6466 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
6467 	while (pip != NULL) {
6468 		mdi_hold_path(pip);
6469 		MDI_PI_LOCK(pip);
6470 		i_mdi_pm_rele_pip(pip);
6471 		MDI_PI_UNLOCK(pip);
6472 		mdi_rele_path(pip);
6473 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6474 	}
6475 }
6476 
6477 static void
6478 i_mdi_pm_rele_client(mdi_client_t *ct, int decr)
6479 {
6480 	ASSERT(MDI_CLIENT_LOCKED(ct));
6481 
6482 	if (i_ddi_devi_attached(ct->ct_dip)) {
6483 		ct->ct_power_cnt -= decr;
6484 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6485 		    "%p ct_power_cnt = %d decr = %d",
6486 		    (void *)ct, ct->ct_power_cnt, decr));
6487 	}
6488 
6489 	ASSERT(ct->ct_power_cnt >= 0);
6490 	if (ct->ct_power_cnt == 0) {
6491 		i_mdi_rele_all_phci(ct);
6492 		return;
6493 	}
6494 }
6495 
6496 static void
6497 i_mdi_pm_reset_client(mdi_client_t *ct)
6498 {
6499 	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6500 	    "%p ct_power_cnt = %d", (void *)ct, ct->ct_power_cnt));
6501 	ASSERT(MDI_CLIENT_LOCKED(ct));
6502 	ct->ct_power_cnt = 0;
6503 	i_mdi_rele_all_phci(ct);
6504 	ct->ct_powercnt_config = 0;
6505 	ct->ct_powercnt_unconfig = 0;
6506 	ct->ct_powercnt_reset = 1;
6507 }
6508 
6509 static int
6510 i_mdi_power_one_phci(mdi_pathinfo_t *pip)
6511 {
6512 	int		ret;
6513 	dev_info_t	*ph_dip;
6514 
6515 	MDI_PI_LOCK(pip);
6516 	i_mdi_pm_hold_pip(pip);
6517 
6518 	ph_dip = mdi_pi_get_phci(pip);
6519 	MDI_PI_UNLOCK(pip);
6520 
6521 	/* bring all components of phci to full power */
6522 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6523 	    "pm_powerup for %s%d %p", ddi_driver_name(ph_dip),
6524 	    ddi_get_instance(ph_dip), (void *)pip));
6525 
6526 	ret = pm_powerup(ph_dip);
6527 
6528 	if (ret == DDI_FAILURE) {
6529 		MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6530 		    "pm_powerup FAILED for %s%d %p",
6531 		    ddi_driver_name(ph_dip), ddi_get_instance(ph_dip),
6532 		    (void *)pip));
6533 
6534 		MDI_PI_LOCK(pip);
6535 		i_mdi_pm_rele_pip(pip);
6536 		MDI_PI_UNLOCK(pip);
6537 		return (MDI_FAILURE);
6538 	}
6539 
6540 	return (MDI_SUCCESS);
6541 }
6542 
6543 static int
6544 i_mdi_power_all_phci(mdi_client_t *ct)
6545 {
6546 	mdi_pathinfo_t  *pip;
6547 	int		succeeded = 0;
6548 
6549 	ASSERT(MDI_CLIENT_LOCKED(ct));
6550 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
6551 	while (pip != NULL) {
6552 		/*
6553 		 * Don't power if MDI_PATHINFO_STATE_FAULT
6554 		 * or MDI_PATHINFO_STATE_OFFLINE.
6555 		 */
6556 		if (MDI_PI_IS_INIT(pip) ||
6557 		    MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip)) {
6558 			mdi_hold_path(pip);
6559 			MDI_CLIENT_UNLOCK(ct);
6560 			if (i_mdi_power_one_phci(pip) == MDI_SUCCESS)
6561 				succeeded = 1;
6562 
6563 			ASSERT(ct == MDI_PI(pip)->pi_client);
6564 			MDI_CLIENT_LOCK(ct);
6565 			mdi_rele_path(pip);
6566 		}
6567 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6568 	}
6569 
6570 	return (succeeded ? MDI_SUCCESS : MDI_FAILURE);
6571 }
6572 
6573 /*
6574  * mdi_bus_power():
6575  *		1. Place the phci(s) into powered up state so that
6576  *		   client can do power management
6577  *		2. Ensure phci powered up as client power managing
6578  * Return Values:
6579  *		MDI_SUCCESS
6580  *		MDI_FAILURE
6581  */
6582 int
6583 mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
6584     void *arg, void *result)
6585 {
6586 	int			ret = MDI_SUCCESS;
6587 	pm_bp_child_pwrchg_t	*bpc;
6588 	mdi_client_t		*ct;
6589 	dev_info_t		*cdip;
6590 	pm_bp_has_changed_t	*bphc;
6591 
6592 	/*
6593 	 * BUS_POWER_NOINVOL not supported
6594 	 */
6595 	if (op == BUS_POWER_NOINVOL)
6596 		return (MDI_FAILURE);
6597 
6598 	/*
6599 	 * ignore other OPs.
6600 	 * return quickly to save cou cycles on the ct processing
	 * return quickly to save CPU cycles on the ct processing
6602 	switch (op) {
6603 	case BUS_POWER_PRE_NOTIFICATION:
6604 	case BUS_POWER_POST_NOTIFICATION:
6605 		bpc = (pm_bp_child_pwrchg_t *)arg;
6606 		cdip = bpc->bpc_dip;
6607 		break;
6608 	case BUS_POWER_HAS_CHANGED:
6609 		bphc = (pm_bp_has_changed_t *)arg;
6610 		cdip = bphc->bphc_dip;
6611 		break;
6612 	default:
6613 		return (pm_busop_bus_power(parent, impl_arg, op, arg, result));
6614 	}
6615 
6616 	ASSERT(MDI_CLIENT(cdip));
6617 
6618 	ct = i_devi_get_client(cdip);
6619 	if (ct == NULL)
6620 		return (MDI_FAILURE);
6621 
6622 	/*
6623 	 * wait till the mdi_pathinfo node state change are processed
6624 	 */
6625 	MDI_CLIENT_LOCK(ct);
6626 	switch (op) {
6627 	case BUS_POWER_PRE_NOTIFICATION:
6628 		MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6629 		    "BUS_POWER_PRE_NOTIFICATION:"
6630 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d",
6631 		    ddi_node_name(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
6632 		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp));
6633 
6634 		/* serialize power level change per client */
6635 		while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6636 			cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6637 
6638 		MDI_CLIENT_SET_POWER_TRANSITION(ct);
6639 
6640 		if (ct->ct_power_cnt == 0) {
6641 			ret = i_mdi_power_all_phci(ct);
6642 		}
6643 
6644 		/*
6645 		 * if new_level > 0:
6646 		 *	- hold phci(s)
6647 		 *	- power up phci(s) if not already
6648 		 * ignore power down
6649 		 */
6650 		if (bpc->bpc_nlevel > 0) {
6651 			if (!DEVI_IS_ATTACHING(ct->ct_dip)) {
6652 				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6653 				    "i_mdi_pm_hold_client\n"));
6654 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
6655 			}
6656 		}
6657 		break;
6658 	case BUS_POWER_POST_NOTIFICATION:
6659 		MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6660 		    "BUS_POWER_POST_NOTIFICATION:"
6661 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d result=%d",
6662 		    ddi_node_name(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
6663 		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp,
6664 		    *(int *)result));
6665 
6666 		if (*(int *)result == DDI_SUCCESS) {
6667 			if (bpc->bpc_nlevel > 0) {
6668 				MDI_CLIENT_SET_POWER_UP(ct);
6669 			} else {
6670 				MDI_CLIENT_SET_POWER_DOWN(ct);
6671 			}
6672 		}
6673 
6674 		/* release the hold we did in pre-notification */
6675 		if (bpc->bpc_nlevel > 0 && (*(int *)result != DDI_SUCCESS) &&
6676 		    !DEVI_IS_ATTACHING(ct->ct_dip)) {
6677 			MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6678 			    "i_mdi_pm_rele_client\n"));
6679 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6680 		}
6681 
6682 		if (bpc->bpc_nlevel == 0 && (*(int *)result == DDI_SUCCESS)) {
6683 			/* another thread might started attaching */
6684 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
6685 				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6686 				    "i_mdi_pm_rele_client\n"));
6687 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
6688 			/* detaching has been taken care in pm_post_unconfig */
6689 			} else if (!DEVI_IS_DETACHING(ct->ct_dip)) {
6690 				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6691 				    "i_mdi_pm_reset_client\n"));
6692 				i_mdi_pm_reset_client(ct);
6693 			}
6694 		}
6695 
6696 		MDI_CLIENT_CLEAR_POWER_TRANSITION(ct);
6697 		cv_broadcast(&ct->ct_powerchange_cv);
6698 
6699 		break;
6700 
6701 	/* need to do more */
6702 	case BUS_POWER_HAS_CHANGED:
6703 		MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
6704 		    "BUS_POWER_HAS_CHANGED:"
6705 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d",
6706 		    ddi_node_name(bphc->bphc_dip), PM_ADDR(bphc->bphc_dip),
6707 		    bphc->bphc_olevel, bphc->bphc_nlevel, bphc->bphc_comp));
6708 
6709 		if (bphc->bphc_nlevel > 0 &&
6710 		    bphc->bphc_nlevel > bphc->bphc_olevel) {
6711 			if (ct->ct_power_cnt == 0) {
6712 				ret = i_mdi_power_all_phci(ct);
6713 			}
6714 			MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
6715 			    "i_mdi_pm_hold_client\n"));
6716 			i_mdi_pm_hold_client(ct, ct->ct_path_count);
6717 		}
6718 
6719 		if (bphc->bphc_nlevel == 0 && bphc->bphc_olevel != -1) {
6720 			MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
6721 			    "i_mdi_pm_rele_client\n"));
6722 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6723 		}
6724 		break;
6725 	}
6726 
6727 	MDI_CLIENT_UNLOCK(ct);
6728 	return (ret);
6729 }
6730 
6731 static int
6732 i_mdi_pm_pre_config_one(dev_info_t *child)
6733 {
6734 	int		ret = MDI_SUCCESS;
6735 	mdi_client_t	*ct;
6736 
6737 	ct = i_devi_get_client(child);
6738 	if (ct == NULL)
6739 		return (MDI_FAILURE);
6740 
6741 	MDI_CLIENT_LOCK(ct);
6742 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6743 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6744 
6745 	if (!MDI_CLIENT_IS_FAILED(ct)) {
6746 		MDI_CLIENT_UNLOCK(ct);
6747 		MDI_DEBUG(4, (MDI_NOTE, child, "already configured\n"));
6748 		return (MDI_SUCCESS);
6749 	}
6750 
6751 	if (ct->ct_powercnt_config) {
6752 		MDI_CLIENT_UNLOCK(ct);
6753 		MDI_DEBUG(4, (MDI_NOTE, child, "already held\n"));
6754 		return (MDI_SUCCESS);
6755 	}
6756 
6757 	if (ct->ct_power_cnt == 0) {
6758 		ret = i_mdi_power_all_phci(ct);
6759 	}
6760 	MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_hold_client\n"));
6761 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6762 	ct->ct_powercnt_config = 1;
6763 	ct->ct_powercnt_reset = 0;
6764 	MDI_CLIENT_UNLOCK(ct);
6765 	return (ret);
6766 }
6767 
6768 static int
6769 i_mdi_pm_pre_config(dev_info_t *vdip, dev_info_t *child)
6770 {
6771 	int			ret = MDI_SUCCESS;
6772 	dev_info_t		*cdip;
6773 	int			circ;
6774 
6775 	ASSERT(MDI_VHCI(vdip));
6776 
6777 	/* ndi_devi_config_one */
6778 	if (child) {
6779 		ASSERT(DEVI_BUSY_OWNED(vdip));
6780 		return (i_mdi_pm_pre_config_one(child));
6781 	}
6782 
6783 	/* devi_config_common */
6784 	ndi_devi_enter(vdip, &circ);
6785 	cdip = ddi_get_child(vdip);
6786 	while (cdip) {
6787 		dev_info_t *next = ddi_get_next_sibling(cdip);
6788 
6789 		ret = i_mdi_pm_pre_config_one(cdip);
6790 		if (ret != MDI_SUCCESS)
6791 			break;
6792 		cdip = next;
6793 	}
6794 	ndi_devi_exit(vdip, circ);
6795 	return (ret);
6796 }
6797 
6798 static int
6799 i_mdi_pm_pre_unconfig_one(dev_info_t *child, int *held, int flags)
6800 {
6801 	int		ret = MDI_SUCCESS;
6802 	mdi_client_t	*ct;
6803 
6804 	ct = i_devi_get_client(child);
6805 	if (ct == NULL)
6806 		return (MDI_FAILURE);
6807 
6808 	MDI_CLIENT_LOCK(ct);
6809 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6810 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6811 
6812 	if (!i_ddi_devi_attached(ct->ct_dip)) {
6813 		MDI_DEBUG(4, (MDI_NOTE, child, "node detached already\n"));
6814 		MDI_CLIENT_UNLOCK(ct);
6815 		return (MDI_SUCCESS);
6816 	}
6817 
6818 	if (MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6819 	    (flags & NDI_AUTODETACH)) {
6820 		MDI_DEBUG(4, (MDI_NOTE, child, "auto-modunload\n"));
6821 		MDI_CLIENT_UNLOCK(ct);
6822 		return (MDI_FAILURE);
6823 	}
6824 
6825 	if (ct->ct_powercnt_unconfig) {
6826 		MDI_DEBUG(4, (MDI_NOTE, child, "ct_powercnt_held\n"));
6827 		MDI_CLIENT_UNLOCK(ct);
6828 		*held = 1;
6829 		return (MDI_SUCCESS);
6830 	}
6831 
6832 	if (ct->ct_power_cnt == 0) {
6833 		ret = i_mdi_power_all_phci(ct);
6834 	}
6835 	MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_hold_client\n"));
6836 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6837 	ct->ct_powercnt_unconfig = 1;
6838 	ct->ct_powercnt_reset = 0;
6839 	MDI_CLIENT_UNLOCK(ct);
6840 	if (ret == MDI_SUCCESS)
6841 		*held = 1;
6842 	return (ret);
6843 }
6844 
6845 static int
6846 i_mdi_pm_pre_unconfig(dev_info_t *vdip, dev_info_t *child, int *held,
6847     int flags)
6848 {
6849 	int			ret = MDI_SUCCESS;
6850 	dev_info_t		*cdip;
6851 	int			circ;
6852 
6853 	ASSERT(MDI_VHCI(vdip));
6854 	*held = 0;
6855 
6856 	/* ndi_devi_unconfig_one */
6857 	if (child) {
6858 		ASSERT(DEVI_BUSY_OWNED(vdip));
6859 		return (i_mdi_pm_pre_unconfig_one(child, held, flags));
6860 	}
6861 
6862 	/* devi_unconfig_common */
6863 	ndi_devi_enter(vdip, &circ);
6864 	cdip = ddi_get_child(vdip);
6865 	while (cdip) {
6866 		dev_info_t *next = ddi_get_next_sibling(cdip);
6867 
6868 		ret = i_mdi_pm_pre_unconfig_one(cdip, held, flags);
6869 		cdip = next;
6870 	}
6871 	ndi_devi_exit(vdip, circ);
6872 
6873 	if (*held)
6874 		ret = MDI_SUCCESS;
6875 
6876 	return (ret);
6877 }
6878 
6879 static void
6880 i_mdi_pm_post_config_one(dev_info_t *child)
6881 {
6882 	mdi_client_t	*ct;
6883 
6884 	ct = i_devi_get_client(child);
6885 	if (ct == NULL)
6886 		return;
6887 
6888 	MDI_CLIENT_LOCK(ct);
6889 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6890 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6891 
6892 	if (ct->ct_powercnt_reset || !ct->ct_powercnt_config) {
6893 		MDI_DEBUG(4, (MDI_NOTE, child, "not configured\n"));
6894 		MDI_CLIENT_UNLOCK(ct);
6895 		return;
6896 	}
6897 
6898 	/* client has not been updated */
6899 	if (MDI_CLIENT_IS_FAILED(ct)) {
6900 		MDI_DEBUG(4, (MDI_NOTE, child, "client failed\n"));
6901 		MDI_CLIENT_UNLOCK(ct);
6902 		return;
6903 	}
6904 
6905 	/* another thread might have powered it down or detached it */
6906 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6907 	    !DEVI_IS_ATTACHING(ct->ct_dip)) ||
6908 	    (!i_ddi_devi_attached(ct->ct_dip) &&
6909 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6910 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_reset_client\n"));
6911 		i_mdi_pm_reset_client(ct);
6912 	} else {
6913 		mdi_pathinfo_t  *pip, *next;
6914 		int	valid_path_count = 0;
6915 
6916 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_rele_client\n"));
6917 		pip = ct->ct_path_head;
6918 		while (pip != NULL) {
6919 			MDI_PI_LOCK(pip);
6920 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6921 			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
6922 				valid_path_count ++;
6923 			MDI_PI_UNLOCK(pip);
6924 			pip = next;
6925 		}
6926 		i_mdi_pm_rele_client(ct, valid_path_count);
6927 	}
6928 	ct->ct_powercnt_config = 0;
6929 	MDI_CLIENT_UNLOCK(ct);
6930 }
6931 
6932 static void
6933 i_mdi_pm_post_config(dev_info_t *vdip, dev_info_t *child)
6934 {
6935 	int		circ;
6936 	dev_info_t	*cdip;
6937 
6938 	ASSERT(MDI_VHCI(vdip));
6939 
6940 	/* ndi_devi_config_one */
6941 	if (child) {
6942 		ASSERT(DEVI_BUSY_OWNED(vdip));
6943 		i_mdi_pm_post_config_one(child);
6944 		return;
6945 	}
6946 
6947 	/* devi_config_common */
6948 	ndi_devi_enter(vdip, &circ);
6949 	cdip = ddi_get_child(vdip);
6950 	while (cdip) {
6951 		dev_info_t *next = ddi_get_next_sibling(cdip);
6952 
6953 		i_mdi_pm_post_config_one(cdip);
6954 		cdip = next;
6955 	}
6956 	ndi_devi_exit(vdip, circ);
6957 }
6958 
6959 static void
6960 i_mdi_pm_post_unconfig_one(dev_info_t *child)
6961 {
6962 	mdi_client_t	*ct;
6963 
6964 	ct = i_devi_get_client(child);
6965 	if (ct == NULL)
6966 		return;
6967 
6968 	MDI_CLIENT_LOCK(ct);
6969 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6970 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6971 
6972 	if (!ct->ct_powercnt_unconfig || ct->ct_powercnt_reset) {
6973 		MDI_DEBUG(4, (MDI_NOTE, child, "not held\n"));
6974 		MDI_CLIENT_UNLOCK(ct);
6975 		return;
6976 	}
6977 
6978 	/* failure detaching or another thread just attached it */
6979 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6980 	    i_ddi_devi_attached(ct->ct_dip)) ||
6981 	    (!i_ddi_devi_attached(ct->ct_dip) &&
6982 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6983 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_reset_client\n"));
6984 		i_mdi_pm_reset_client(ct);
6985 	} else {
6986 		mdi_pathinfo_t  *pip, *next;
6987 		int	valid_path_count = 0;
6988 
6989 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_rele_client\n"));
6990 		pip = ct->ct_path_head;
6991 		while (pip != NULL) {
6992 			MDI_PI_LOCK(pip);
6993 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6994 			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
6995 				valid_path_count ++;
6996 			MDI_PI_UNLOCK(pip);
6997 			pip = next;
6998 		}
6999 		i_mdi_pm_rele_client(ct, valid_path_count);
7000 		ct->ct_powercnt_unconfig = 0;
7001 	}
7002 
7003 	MDI_CLIENT_UNLOCK(ct);
7004 }
7005 
7006 static void
7007 i_mdi_pm_post_unconfig(dev_info_t *vdip, dev_info_t *child, int held)
7008 {
7009 	int			circ;
7010 	dev_info_t		*cdip;
7011 
7012 	ASSERT(MDI_VHCI(vdip));
7013 
7014 	if (!held) {
7015 		MDI_DEBUG(4, (MDI_NOTE, vdip, "held = %d", held));
7016 		return;
7017 	}
7018 
7019 	if (child) {
7020 		ASSERT(DEVI_BUSY_OWNED(vdip));
7021 		i_mdi_pm_post_unconfig_one(child);
7022 		return;
7023 	}
7024 
7025 	ndi_devi_enter(vdip, &circ);
7026 	cdip = ddi_get_child(vdip);
7027 	while (cdip) {
7028 		dev_info_t *next = ddi_get_next_sibling(cdip);
7029 
7030 		i_mdi_pm_post_unconfig_one(cdip);
7031 		cdip = next;
7032 	}
7033 	ndi_devi_exit(vdip, circ);
7034 }
7035 
7036 int
7037 mdi_power(dev_info_t *vdip, mdi_pm_op_t op, void *args, char *devnm, int flags)
7038 {
7039 	int			circ, ret = MDI_SUCCESS;
7040 	dev_info_t		*client_dip = NULL;
7041 	mdi_client_t		*ct;
7042 
7043 	/*
7044 	 * Handling ndi_devi_config_one and ndi_devi_unconfig_one.
7045 	 * Power up pHCI for the named client device.
7046 	 * Note: Before the client is enumerated under vhci by phci,
7047 	 * client_dip can be NULL. Then proceed to power up all the
7048 	 * pHCIs.
7049 	 */
7050 	if (devnm != NULL) {
7051 		ndi_devi_enter(vdip, &circ);
7052 		client_dip = ndi_devi_findchild(vdip, devnm);
7053 	}
7054 
7055 	MDI_DEBUG(4, (MDI_NOTE, vdip,
7056 	    "op = %d %s %p", op, devnm ? devnm : "", (void *)client_dip));
7057 
7058 	switch (op) {
7059 	case MDI_PM_PRE_CONFIG:
7060 		ret = i_mdi_pm_pre_config(vdip, client_dip);
7061 		break;
7062 
7063 	case MDI_PM_PRE_UNCONFIG:
7064 		ret = i_mdi_pm_pre_unconfig(vdip, client_dip, (int *)args,
7065 		    flags);
7066 		break;
7067 
7068 	case MDI_PM_POST_CONFIG:
7069 		i_mdi_pm_post_config(vdip, client_dip);
7070 		break;
7071 
7072 	case MDI_PM_POST_UNCONFIG:
7073 		i_mdi_pm_post_unconfig(vdip, client_dip, *(int *)args);
7074 		break;
7075 
7076 	case MDI_PM_HOLD_POWER:
7077 	case MDI_PM_RELE_POWER:
7078 		ASSERT(args);
7079 
7080 		client_dip = (dev_info_t *)args;
7081 		ASSERT(MDI_CLIENT(client_dip));
7082 
7083 		ct = i_devi_get_client(client_dip);
7084 		MDI_CLIENT_LOCK(ct);
7085 
7086 		if (op == MDI_PM_HOLD_POWER) {
7087 			if (ct->ct_power_cnt == 0) {
7088 				(void) i_mdi_power_all_phci(ct);
7089 				MDI_DEBUG(4, (MDI_NOTE, client_dip,
7090 				    "i_mdi_pm_hold_client\n"));
7091 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
7092 			}
7093 		} else {
7094 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
7095 				MDI_DEBUG(4, (MDI_NOTE, client_dip,
7096 				    "i_mdi_pm_rele_client\n"));
7097 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
7098 			} else {
7099 				MDI_DEBUG(4, (MDI_NOTE, client_dip,
7100 				    "i_mdi_pm_reset_client\n"));
7101 				i_mdi_pm_reset_client(ct);
7102 			}
7103 		}
7104 
7105 		MDI_CLIENT_UNLOCK(ct);
7106 		break;
7107 
7108 	default:
7109 		break;
7110 	}
7111 
7112 	if (devnm)
7113 		ndi_devi_exit(vdip, circ);
7114 
7115 	return (ret);
7116 }
7117 
7118 int
7119 mdi_component_is_vhci(dev_info_t *dip, const char **mdi_class)
7120 {
7121 	mdi_vhci_t *vhci;
7122 
7123 	if (!MDI_VHCI(dip))
7124 		return (MDI_FAILURE);
7125 
7126 	if (mdi_class) {
7127 		vhci = DEVI(dip)->devi_mdi_xhci;
7128 		ASSERT(vhci);
7129 		*mdi_class = vhci->vh_class;
7130 	}
7131 
7132 	return (MDI_SUCCESS);
7133 }
7134 
7135 int
7136 mdi_component_is_phci(dev_info_t *dip, const char **mdi_class)
7137 {
7138 	mdi_phci_t *phci;
7139 
7140 	if (!MDI_PHCI(dip))
7141 		return (MDI_FAILURE);
7142 
7143 	if (mdi_class) {
7144 		phci = DEVI(dip)->devi_mdi_xhci;
7145 		ASSERT(phci);
7146 		*mdi_class = phci->ph_vhci->vh_class;
7147 	}
7148 
7149 	return (MDI_SUCCESS);
7150 }
7151 
7152 int
7153 mdi_component_is_client(dev_info_t *dip, const char **mdi_class)
7154 {
7155 	mdi_client_t *client;
7156 
7157 	if (!MDI_CLIENT(dip))
7158 		return (MDI_FAILURE);
7159 
7160 	if (mdi_class) {
7161 		client = DEVI(dip)->devi_mdi_client;
7162 		ASSERT(client);
7163 		*mdi_class = client->ct_vhci->vh_class;
7164 	}
7165 
7166 	return (MDI_SUCCESS);
7167 }
7168 
7169 void *
7170 mdi_client_get_vhci_private(dev_info_t *dip)
7171 {
7172 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
7173 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
7174 		mdi_client_t	*ct;
7175 		ct = i_devi_get_client(dip);
7176 		return (ct->ct_vprivate);
7177 	}
7178 	return (NULL);
7179 }
7180 
7181 void
7182 mdi_client_set_vhci_private(dev_info_t *dip, void *data)
7183 {
7184 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
7185 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
7186 		mdi_client_t	*ct;
7187 		ct = i_devi_get_client(dip);
7188 		ct->ct_vprivate = data;
7189 	}
7190 }
7191 /*
7192  * mdi_pi_get_vhci_private():
7193  *		Get the vhci private information associated with the
7194  *		mdi_pathinfo node
7195  */
7196 void *
7197 mdi_pi_get_vhci_private(mdi_pathinfo_t *pip)
7198 {
7199 	caddr_t	vprivate = NULL;
7200 	if (pip) {
7201 		vprivate = MDI_PI(pip)->pi_vprivate;
7202 	}
7203 	return (vprivate);
7204 }
7205 
7206 /*
7207  * mdi_pi_set_vhci_private():
7208  *		Set the vhci private information in the mdi_pathinfo node
7209  */
7210 void
7211 mdi_pi_set_vhci_private(mdi_pathinfo_t *pip, void *priv)
7212 {
7213 	if (pip) {
7214 		MDI_PI(pip)->pi_vprivate = priv;
7215 	}
7216 }
7217 
7218 /*
7219  * mdi_phci_get_vhci_private():
7220  *		Get the vhci private information associated with the
7221  *		mdi_phci node
7222  */
7223 void *
7224 mdi_phci_get_vhci_private(dev_info_t *dip)
7225 {
7226 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
7227 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
7228 		mdi_phci_t	*ph;
7229 		ph = i_devi_get_phci(dip);
7230 		return (ph->ph_vprivate);
7231 	}
7232 	return (NULL);
7233 }
7234 
7235 /*
7236  * mdi_phci_set_vhci_private():
7237  *		Set the vhci private information in the mdi_phci node
7238  */
7239 void
7240 mdi_phci_set_vhci_private(dev_info_t *dip, void *priv)
7241 {
7242 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
7243 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
7244 		mdi_phci_t	*ph;
7245 		ph = i_devi_get_phci(dip);
7246 		ph->ph_vprivate = priv;
7247 	}
7248 }
7249 
7250 int
7251 mdi_pi_ishidden(mdi_pathinfo_t *pip)
7252 {
7253 	return (MDI_PI_FLAGS_IS_HIDDEN(pip));
7254 }
7255 
7256 int
7257 mdi_pi_device_isremoved(mdi_pathinfo_t *pip)
7258 {
7259 	return (MDI_PI_FLAGS_IS_DEVICE_REMOVED(pip));
7260 }
7261 
7262 /*
7263  * When processing hotplug, if mdi_pi_offline-mdi_pi_free fails then this
7264  * interface is used to represent device removal.
7265  */
7266 int
7267 mdi_pi_device_remove(mdi_pathinfo_t *pip)
7268 {
7269 	MDI_PI_LOCK(pip);
7270 	if (mdi_pi_device_isremoved(pip)) {
7271 		MDI_PI_UNLOCK(pip);
7272 		return (0);
7273 	}
7274 	MDI_PI_FLAGS_SET_DEVICE_REMOVED(pip);
7275 	MDI_PI_FLAGS_SET_HIDDEN(pip);
7276 	MDI_PI_UNLOCK(pip);
7277 
7278 	i_ddi_di_cache_invalidate();
7279 
7280 	return (1);
7281 }
7282 
7283 /*
7284  * When processing hotplug, if a path marked mdi_pi_device_isremoved()
7285  * is now accessible then this interfaces is used to represent device insertion.
7286  */
7287 int
7288 mdi_pi_device_insert(mdi_pathinfo_t *pip)
7289 {
7290 	MDI_PI_LOCK(pip);
7291 	if (!mdi_pi_device_isremoved(pip)) {
7292 		MDI_PI_UNLOCK(pip);
7293 		return (0);
7294 	}
7295 	MDI_PI_FLAGS_CLR_DEVICE_REMOVED(pip);
7296 	MDI_PI_FLAGS_CLR_HIDDEN(pip);
7297 	MDI_PI_UNLOCK(pip);
7298 
7299 	i_ddi_di_cache_invalidate();
7300 
7301 	return (1);
7302 }
7303 
/*
 * List of vhci class names:
 * A vhci class name must be in this list only if the corresponding vhci
 * driver intends to use the mdi provided bus config implementation
 * (i.e., mdi_vhci_bus_config()).
 */
static char *vhci_class_list[] = { MDI_HCI_CLASS_SCSI, MDI_HCI_CLASS_IB };
/* number of entries in vhci_class_list[] */
#define	N_VHCI_CLASSES	(sizeof (vhci_class_list) / sizeof (char *))

/*
 * During boot time, the on-disk vhci cache for every vhci class is read
 * in the form of an nvlist and stored here.
 * The array is indexed in parallel with vhci_class_list[];
 * setup_vhci_cache() takes the entry for its class and sets the slot to NULL.
 */
static nvlist_t *vhcache_nvl[N_VHCI_CLASSES];

/* nvpair names in vhci cache nvlist */
/* the only cache version accepted by mainnvl_to_vhcache() */
#define	MDI_VHCI_CACHE_VERSION	1
#define	MDI_NVPNAME_VERSION	"version"
#define	MDI_NVPNAME_PHCIS	"phcis"
#define	MDI_NVPNAME_CTADDRMAP	"clientaddrmap"
7324 
7325 /*
7326  * Given vhci class name, return its on-disk vhci cache filename.
7327  * Memory for the returned filename which includes the full path is allocated
7328  * by this function.
7329  */
7330 static char *
7331 vhclass2vhcache_filename(char *vhclass)
7332 {
7333 	char *filename;
7334 	int len;
7335 	static char *fmt = "/etc/devices/mdi_%s_cache";
7336 
7337 	/*
7338 	 * fmt contains the on-disk vhci cache file name format;
7339 	 * for scsi_vhci the filename is "/etc/devices/mdi_scsi_vhci_cache".
7340 	 */
7341 
7342 	/* the -1 below is to account for "%s" in the format string */
7343 	len = strlen(fmt) + strlen(vhclass) - 1;
7344 	filename = kmem_alloc(len, KM_SLEEP);
7345 	(void) snprintf(filename, len, fmt, vhclass);
7346 	ASSERT(len == (strlen(filename) + 1));
7347 	return (filename);
7348 }
7349 
7350 /*
7351  * initialize the vhci cache related data structures and read the on-disk
7352  * vhci cached data into memory.
7353  */
7354 static void
7355 setup_vhci_cache(mdi_vhci_t *vh)
7356 {
7357 	mdi_vhci_config_t *vhc;
7358 	mdi_vhci_cache_t *vhcache;
7359 	int i;
7360 	nvlist_t *nvl = NULL;
7361 
7362 	vhc = kmem_zalloc(sizeof (mdi_vhci_config_t), KM_SLEEP);
7363 	vh->vh_config = vhc;
7364 	vhcache = &vhc->vhc_vhcache;
7365 
7366 	vhc->vhc_vhcache_filename = vhclass2vhcache_filename(vh->vh_class);
7367 
7368 	mutex_init(&vhc->vhc_lock, NULL, MUTEX_DEFAULT, NULL);
7369 	cv_init(&vhc->vhc_cv, NULL, CV_DRIVER, NULL);
7370 
7371 	rw_init(&vhcache->vhcache_lock, NULL, RW_DRIVER, NULL);
7372 
7373 	/*
7374 	 * Create string hash; same as mod_hash_create_strhash() except that
7375 	 * we use NULL key destructor.
7376 	 */
7377 	vhcache->vhcache_client_hash = mod_hash_create_extended(vh->vh_class,
7378 	    mdi_bus_config_cache_hash_size,
7379 	    mod_hash_null_keydtor, mod_hash_null_valdtor,
7380 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
7381 
7382 	/*
7383 	 * The on-disk vhci cache is read during booting prior to the
7384 	 * lights-out period by mdi_read_devices_files().
7385 	 */
7386 	for (i = 0; i < N_VHCI_CLASSES; i++) {
7387 		if (strcmp(vhci_class_list[i], vh->vh_class) == 0) {
7388 			nvl = vhcache_nvl[i];
7389 			vhcache_nvl[i] = NULL;
7390 			break;
7391 		}
7392 	}
7393 
7394 	/*
7395 	 * this is to cover the case of some one manually causing unloading
7396 	 * (or detaching) and reloading (or attaching) of a vhci driver.
7397 	 */
7398 	if (nvl == NULL && modrootloaded)
7399 		nvl = read_on_disk_vhci_cache(vh->vh_class);
7400 
7401 	if (nvl != NULL) {
7402 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7403 		if (mainnvl_to_vhcache(vhcache, nvl) == MDI_SUCCESS)
7404 			vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
7405 		else  {
7406 			cmn_err(CE_WARN,
7407 			    "%s: data file corrupted, will recreate",
7408 			    vhc->vhc_vhcache_filename);
7409 		}
7410 		rw_exit(&vhcache->vhcache_lock);
7411 		nvlist_free(nvl);
7412 	}
7413 
7414 	vhc->vhc_cbid = callb_add(stop_vhcache_flush_thread, vhc,
7415 	    CB_CL_UADMIN_PRE_VFS, "mdi_vhcache_flush");
7416 
7417 	vhc->vhc_path_discovery_boot = mdi_path_discovery_boot;
7418 	vhc->vhc_path_discovery_postboot = mdi_path_discovery_postboot;
7419 }
7420 
7421 /*
7422  * free all vhci cache related resources
7423  */
7424 static int
7425 destroy_vhci_cache(mdi_vhci_t *vh)
7426 {
7427 	mdi_vhci_config_t *vhc = vh->vh_config;
7428 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7429 	mdi_vhcache_phci_t *cphci, *cphci_next;
7430 	mdi_vhcache_client_t *cct, *cct_next;
7431 	mdi_vhcache_pathinfo_t *cpi, *cpi_next;
7432 
7433 	if (stop_vhcache_async_threads(vhc) != MDI_SUCCESS)
7434 		return (MDI_FAILURE);
7435 
7436 	kmem_free(vhc->vhc_vhcache_filename,
7437 	    strlen(vhc->vhc_vhcache_filename) + 1);
7438 
7439 	mod_hash_destroy_strhash(vhcache->vhcache_client_hash);
7440 
7441 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7442 	    cphci = cphci_next) {
7443 		cphci_next = cphci->cphci_next;
7444 		free_vhcache_phci(cphci);
7445 	}
7446 
7447 	for (cct = vhcache->vhcache_client_head; cct != NULL; cct = cct_next) {
7448 		cct_next = cct->cct_next;
7449 		for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi_next) {
7450 			cpi_next = cpi->cpi_next;
7451 			free_vhcache_pathinfo(cpi);
7452 		}
7453 		free_vhcache_client(cct);
7454 	}
7455 
7456 	rw_destroy(&vhcache->vhcache_lock);
7457 
7458 	mutex_destroy(&vhc->vhc_lock);
7459 	cv_destroy(&vhc->vhc_cv);
7460 	kmem_free(vhc, sizeof (mdi_vhci_config_t));
7461 	return (MDI_SUCCESS);
7462 }
7463 
7464 /*
7465  * Stop all vhci cache related async threads and free their resources.
7466  */
7467 static int
7468 stop_vhcache_async_threads(mdi_vhci_config_t *vhc)
7469 {
7470 	mdi_async_client_config_t *acc, *acc_next;
7471 
7472 	mutex_enter(&vhc->vhc_lock);
7473 	vhc->vhc_flags |= MDI_VHC_EXIT;
7474 	ASSERT(vhc->vhc_acc_thrcount >= 0);
7475 	cv_broadcast(&vhc->vhc_cv);
7476 
7477 	while ((vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) ||
7478 	    vhc->vhc_acc_thrcount != 0) {
7479 		mutex_exit(&vhc->vhc_lock);
7480 		delay_random(mdi_delay);
7481 		mutex_enter(&vhc->vhc_lock);
7482 	}
7483 
7484 	vhc->vhc_flags &= ~MDI_VHC_EXIT;
7485 
7486 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc_next) {
7487 		acc_next = acc->acc_next;
7488 		free_async_client_config(acc);
7489 	}
7490 	vhc->vhc_acc_list_head = NULL;
7491 	vhc->vhc_acc_list_tail = NULL;
7492 	vhc->vhc_acc_count = 0;
7493 
7494 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
7495 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
7496 		mutex_exit(&vhc->vhc_lock);
7497 		if (flush_vhcache(vhc, 0) != MDI_SUCCESS) {
7498 			vhcache_dirty(vhc);
7499 			return (MDI_FAILURE);
7500 		}
7501 	} else
7502 		mutex_exit(&vhc->vhc_lock);
7503 
7504 	if (callb_delete(vhc->vhc_cbid) != 0)
7505 		return (MDI_FAILURE);
7506 
7507 	return (MDI_SUCCESS);
7508 }
7509 
7510 /*
7511  * Stop vhci cache flush thread
7512  */
7513 /* ARGSUSED */
7514 static boolean_t
7515 stop_vhcache_flush_thread(void *arg, int code)
7516 {
7517 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
7518 
7519 	mutex_enter(&vhc->vhc_lock);
7520 	vhc->vhc_flags |= MDI_VHC_EXIT;
7521 	cv_broadcast(&vhc->vhc_cv);
7522 
7523 	while (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
7524 		mutex_exit(&vhc->vhc_lock);
7525 		delay_random(mdi_delay);
7526 		mutex_enter(&vhc->vhc_lock);
7527 	}
7528 
7529 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
7530 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
7531 		mutex_exit(&vhc->vhc_lock);
7532 		(void) flush_vhcache(vhc, 1);
7533 	} else
7534 		mutex_exit(&vhc->vhc_lock);
7535 
7536 	return (B_TRUE);
7537 }
7538 
7539 /*
7540  * Enqueue the vhcache phci (cphci) at the tail of the list
7541  */
7542 static void
7543 enqueue_vhcache_phci(mdi_vhci_cache_t *vhcache, mdi_vhcache_phci_t *cphci)
7544 {
7545 	cphci->cphci_next = NULL;
7546 	if (vhcache->vhcache_phci_head == NULL)
7547 		vhcache->vhcache_phci_head = cphci;
7548 	else
7549 		vhcache->vhcache_phci_tail->cphci_next = cphci;
7550 	vhcache->vhcache_phci_tail = cphci;
7551 }
7552 
7553 /*
7554  * Enqueue the vhcache pathinfo (cpi) at the tail of the list
7555  */
7556 static void
7557 enqueue_tail_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7558     mdi_vhcache_pathinfo_t *cpi)
7559 {
7560 	cpi->cpi_next = NULL;
7561 	if (cct->cct_cpi_head == NULL)
7562 		cct->cct_cpi_head = cpi;
7563 	else
7564 		cct->cct_cpi_tail->cpi_next = cpi;
7565 	cct->cct_cpi_tail = cpi;
7566 }
7567 
7568 /*
7569  * Enqueue the vhcache pathinfo (cpi) at the correct location in the
7570  * ordered list. All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
7571  * flag set come at the beginning of the list. All cpis which have this
7572  * flag set come at the end of the list.
7573  */
7574 static void
7575 enqueue_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7576     mdi_vhcache_pathinfo_t *newcpi)
7577 {
7578 	mdi_vhcache_pathinfo_t *cpi, *prev_cpi;
7579 
7580 	if (cct->cct_cpi_head == NULL ||
7581 	    (newcpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))
7582 		enqueue_tail_vhcache_pathinfo(cct, newcpi);
7583 	else {
7584 		for (cpi = cct->cct_cpi_head, prev_cpi = NULL; cpi != NULL &&
7585 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST);
7586 		    prev_cpi = cpi, cpi = cpi->cpi_next)
7587 			;
7588 
7589 		if (prev_cpi == NULL)
7590 			cct->cct_cpi_head = newcpi;
7591 		else
7592 			prev_cpi->cpi_next = newcpi;
7593 
7594 		newcpi->cpi_next = cpi;
7595 
7596 		if (cpi == NULL)
7597 			cct->cct_cpi_tail = newcpi;
7598 	}
7599 }
7600 
7601 /*
7602  * Enqueue the vhcache client (cct) at the tail of the list
7603  */
7604 static void
7605 enqueue_vhcache_client(mdi_vhci_cache_t *vhcache,
7606     mdi_vhcache_client_t *cct)
7607 {
7608 	cct->cct_next = NULL;
7609 	if (vhcache->vhcache_client_head == NULL)
7610 		vhcache->vhcache_client_head = cct;
7611 	else
7612 		vhcache->vhcache_client_tail->cct_next = cct;
7613 	vhcache->vhcache_client_tail = cct;
7614 }
7615 
7616 static void
7617 free_string_array(char **str, int nelem)
7618 {
7619 	int i;
7620 
7621 	if (str) {
7622 		for (i = 0; i < nelem; i++) {
7623 			if (str[i])
7624 				kmem_free(str[i], strlen(str[i]) + 1);
7625 		}
7626 		kmem_free(str, sizeof (char *) * nelem);
7627 	}
7628 }
7629 
7630 static void
7631 free_vhcache_phci(mdi_vhcache_phci_t *cphci)
7632 {
7633 	kmem_free(cphci->cphci_path, strlen(cphci->cphci_path) + 1);
7634 	kmem_free(cphci, sizeof (*cphci));
7635 }
7636 
7637 static void
7638 free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *cpi)
7639 {
7640 	kmem_free(cpi->cpi_addr, strlen(cpi->cpi_addr) + 1);
7641 	kmem_free(cpi, sizeof (*cpi));
7642 }
7643 
7644 static void
7645 free_vhcache_client(mdi_vhcache_client_t *cct)
7646 {
7647 	kmem_free(cct->cct_name_addr, strlen(cct->cct_name_addr) + 1);
7648 	kmem_free(cct, sizeof (*cct));
7649 }
7650 
7651 static char *
7652 vhcache_mknameaddr(char *ct_name, char *ct_addr, int *ret_len)
7653 {
7654 	char *name_addr;
7655 	int len;
7656 
7657 	len = strlen(ct_name) + strlen(ct_addr) + 2;
7658 	name_addr = kmem_alloc(len, KM_SLEEP);
7659 	(void) snprintf(name_addr, len, "%s@%s", ct_name, ct_addr);
7660 
7661 	if (ret_len)
7662 		*ret_len = len;
7663 	return (name_addr);
7664 }
7665 
7666 /*
7667  * Copy the contents of paddrnvl to vhci cache.
7668  * paddrnvl nvlist contains path information for a vhci client.
7669  * See the comment in mainnvl_to_vhcache() for the format of this nvlist.
7670  */
7671 static void
7672 paddrnvl_to_vhcache(nvlist_t *nvl, mdi_vhcache_phci_t *cphci_list[],
7673     mdi_vhcache_client_t *cct)
7674 {
7675 	nvpair_t *nvp = NULL;
7676 	mdi_vhcache_pathinfo_t *cpi;
7677 	uint_t nelem;
7678 	uint32_t *val;
7679 
7680 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7681 		ASSERT(nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY);
7682 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
7683 		cpi->cpi_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7684 		(void) nvpair_value_uint32_array(nvp, &val, &nelem);
7685 		ASSERT(nelem == 2);
7686 		cpi->cpi_cphci = cphci_list[val[0]];
7687 		cpi->cpi_flags = val[1];
7688 		enqueue_tail_vhcache_pathinfo(cct, cpi);
7689 	}
7690 }
7691 
7692 /*
7693  * Copy the contents of caddrmapnvl to vhci cache.
7694  * caddrmapnvl nvlist contains vhci client address to phci client address
7695  * mappings. See the comment in mainnvl_to_vhcache() for the format of
7696  * this nvlist.
7697  */
7698 static void
7699 caddrmapnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl,
7700     mdi_vhcache_phci_t *cphci_list[])
7701 {
7702 	nvpair_t *nvp = NULL;
7703 	nvlist_t *paddrnvl;
7704 	mdi_vhcache_client_t *cct;
7705 
7706 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7707 		ASSERT(nvpair_type(nvp) == DATA_TYPE_NVLIST);
7708 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
7709 		cct->cct_name_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7710 		(void) nvpair_value_nvlist(nvp, &paddrnvl);
7711 		paddrnvl_to_vhcache(paddrnvl, cphci_list, cct);
7712 		/* the client must contain at least one path */
7713 		ASSERT(cct->cct_cpi_head != NULL);
7714 
7715 		enqueue_vhcache_client(vhcache, cct);
7716 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
7717 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
7718 	}
7719 }
7720 
7721 /*
7722  * Copy the contents of the main nvlist to vhci cache.
7723  *
7724  * VHCI busconfig cached data is stored in the form of a nvlist on the disk.
7725  * The nvlist contains the mappings between the vhci client addresses and
7726  * their corresponding phci client addresses.
7727  *
7728  * The structure of the nvlist is as follows:
7729  *
7730  * Main nvlist:
7731  *	NAME		TYPE		DATA
7732  *	version		int32		version number
7733  *	phcis		string array	array of phci paths
7734  *	clientaddrmap	nvlist_t	c2paddrs_nvl (see below)
7735  *
7736  * structure of c2paddrs_nvl:
7737  *	NAME		TYPE		DATA
7738  *	caddr1		nvlist_t	paddrs_nvl1
7739  *	caddr2		nvlist_t	paddrs_nvl2
7740  *	...
7741  * where caddr1, caddr2, ... are vhci client name and addresses in the
7742  * form of "<clientname>@<clientaddress>".
7743  * (for example: "ssd@2000002037cd9f72");
7744  * paddrs_nvl1, paddrs_nvl2, .. are nvlists that contain path information.
7745  *
7746  * structure of paddrs_nvl:
7747  *	NAME		TYPE		DATA
7748  *	pi_addr1	uint32_array	(phci-id, cpi_flags)
7749  *	pi_addr2	uint32_array	(phci-id, cpi_flags)
7750  *	...
7751  * where pi_addr1, pi_addr2, ... are bus specific addresses of pathinfo nodes
7752  * (so called pi_addrs, for example: "w2100002037cd9f72,0");
7753  * phci-ids are integers that identify pHCIs to which the
7754  * the bus specific address belongs to. These integers are used as an index
7755  * into to the phcis string array in the main nvlist to get the pHCI path.
7756  */
7757 static int
7758 mainnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl)
7759 {
7760 	char **phcis, **phci_namep;
7761 	uint_t nphcis;
7762 	mdi_vhcache_phci_t *cphci, **cphci_list;
7763 	nvlist_t *caddrmapnvl;
7764 	int32_t ver;
7765 	int i;
7766 	size_t cphci_list_size;
7767 
7768 	ASSERT(RW_WRITE_HELD(&vhcache->vhcache_lock));
7769 
7770 	if (nvlist_lookup_int32(nvl, MDI_NVPNAME_VERSION, &ver) != 0 ||
7771 	    ver != MDI_VHCI_CACHE_VERSION)
7772 		return (MDI_FAILURE);
7773 
7774 	if (nvlist_lookup_string_array(nvl, MDI_NVPNAME_PHCIS, &phcis,
7775 	    &nphcis) != 0)
7776 		return (MDI_SUCCESS);
7777 
7778 	ASSERT(nphcis > 0);
7779 
7780 	cphci_list_size = sizeof (mdi_vhcache_phci_t *) * nphcis;
7781 	cphci_list = kmem_alloc(cphci_list_size, KM_SLEEP);
7782 	for (i = 0, phci_namep = phcis; i < nphcis; i++, phci_namep++) {
7783 		cphci = kmem_zalloc(sizeof (mdi_vhcache_phci_t), KM_SLEEP);
7784 		cphci->cphci_path = i_ddi_strdup(*phci_namep, KM_SLEEP);
7785 		enqueue_vhcache_phci(vhcache, cphci);
7786 		cphci_list[i] = cphci;
7787 	}
7788 
7789 	ASSERT(vhcache->vhcache_phci_head != NULL);
7790 
7791 	if (nvlist_lookup_nvlist(nvl, MDI_NVPNAME_CTADDRMAP, &caddrmapnvl) == 0)
7792 		caddrmapnvl_to_vhcache(vhcache, caddrmapnvl, cphci_list);
7793 
7794 	kmem_free(cphci_list, cphci_list_size);
7795 	return (MDI_SUCCESS);
7796 }
7797 
7798 /*
7799  * Build paddrnvl for the specified client using the information in the
7800  * vhci cache and add it to the caddrmapnnvl.
7801  * Returns 0 on success, errno on failure.
7802  */
7803 static int
7804 vhcache_to_paddrnvl(mdi_vhci_cache_t *vhcache, mdi_vhcache_client_t *cct,
7805     nvlist_t *caddrmapnvl)
7806 {
7807 	mdi_vhcache_pathinfo_t *cpi;
7808 	nvlist_t *nvl;
7809 	int err;
7810 	uint32_t val[2];
7811 
7812 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7813 
7814 	if ((err = nvlist_alloc(&nvl, 0, KM_SLEEP)) != 0)
7815 		return (err);
7816 
7817 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7818 		val[0] = cpi->cpi_cphci->cphci_id;
7819 		val[1] = cpi->cpi_flags;
7820 		if ((err = nvlist_add_uint32_array(nvl, cpi->cpi_addr, val, 2))
7821 		    != 0)
7822 			goto out;
7823 	}
7824 
7825 	err = nvlist_add_nvlist(caddrmapnvl, cct->cct_name_addr, nvl);
7826 out:
7827 	nvlist_free(nvl);
7828 	return (err);
7829 }
7830 
7831 /*
7832  * Build caddrmapnvl using the information in the vhci cache
7833  * and add it to the mainnvl.
7834  * Returns 0 on success, errno on failure.
7835  */
7836 static int
7837 vhcache_to_caddrmapnvl(mdi_vhci_cache_t *vhcache, nvlist_t *mainnvl)
7838 {
7839 	mdi_vhcache_client_t *cct;
7840 	nvlist_t *nvl;
7841 	int err;
7842 
7843 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7844 
7845 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0)
7846 		return (err);
7847 
7848 	for (cct = vhcache->vhcache_client_head; cct != NULL;
7849 	    cct = cct->cct_next) {
7850 		if ((err = vhcache_to_paddrnvl(vhcache, cct, nvl)) != 0)
7851 			goto out;
7852 	}
7853 
7854 	err = nvlist_add_nvlist(mainnvl, MDI_NVPNAME_CTADDRMAP, nvl);
7855 out:
7856 	nvlist_free(nvl);
7857 	return (err);
7858 }
7859 
7860 /*
7861  * Build nvlist using the information in the vhci cache.
7862  * See the comment in mainnvl_to_vhcache() for the format of the nvlist.
7863  * Returns nvl on success, NULL on failure.
7864  */
7865 static nvlist_t *
7866 vhcache_to_mainnvl(mdi_vhci_cache_t *vhcache)
7867 {
7868 	mdi_vhcache_phci_t *cphci;
7869 	uint_t phci_count;
7870 	char **phcis;
7871 	nvlist_t *nvl;
7872 	int err, i;
7873 
7874 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0) {
7875 		nvl = NULL;
7876 		goto out;
7877 	}
7878 
7879 	if ((err = nvlist_add_int32(nvl, MDI_NVPNAME_VERSION,
7880 	    MDI_VHCI_CACHE_VERSION)) != 0)
7881 		goto out;
7882 
7883 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7884 	if (vhcache->vhcache_phci_head == NULL) {
7885 		rw_exit(&vhcache->vhcache_lock);
7886 		return (nvl);
7887 	}
7888 
7889 	phci_count = 0;
7890 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7891 	    cphci = cphci->cphci_next)
7892 		cphci->cphci_id = phci_count++;
7893 
7894 	/* build phci pathname list */
7895 	phcis = kmem_alloc(sizeof (char *) * phci_count, KM_SLEEP);
7896 	for (cphci = vhcache->vhcache_phci_head, i = 0; cphci != NULL;
7897 	    cphci = cphci->cphci_next, i++)
7898 		phcis[i] = i_ddi_strdup(cphci->cphci_path, KM_SLEEP);
7899 
7900 	err = nvlist_add_string_array(nvl, MDI_NVPNAME_PHCIS, phcis,
7901 	    phci_count);
7902 	free_string_array(phcis, phci_count);
7903 
7904 	if (err == 0 &&
7905 	    (err = vhcache_to_caddrmapnvl(vhcache, nvl)) == 0) {
7906 		rw_exit(&vhcache->vhcache_lock);
7907 		return (nvl);
7908 	}
7909 
7910 	rw_exit(&vhcache->vhcache_lock);
7911 out:
7912 	if (nvl)
7913 		nvlist_free(nvl);
7914 	return (NULL);
7915 }
7916 
7917 /*
7918  * Lookup vhcache phci structure for the specified phci path.
7919  */
7920 static mdi_vhcache_phci_t *
7921 lookup_vhcache_phci_by_name(mdi_vhci_cache_t *vhcache, char *phci_path)
7922 {
7923 	mdi_vhcache_phci_t *cphci;
7924 
7925 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7926 
7927 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7928 	    cphci = cphci->cphci_next) {
7929 		if (strcmp(cphci->cphci_path, phci_path) == 0)
7930 			return (cphci);
7931 	}
7932 
7933 	return (NULL);
7934 }
7935 
7936 /*
7937  * Lookup vhcache phci structure for the specified phci.
7938  */
7939 static mdi_vhcache_phci_t *
7940 lookup_vhcache_phci_by_addr(mdi_vhci_cache_t *vhcache, mdi_phci_t *ph)
7941 {
7942 	mdi_vhcache_phci_t *cphci;
7943 
7944 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7945 
7946 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7947 	    cphci = cphci->cphci_next) {
7948 		if (cphci->cphci_phci == ph)
7949 			return (cphci);
7950 	}
7951 
7952 	return (NULL);
7953 }
7954 
7955 /*
7956  * Add the specified phci to the vhci cache if not already present.
7957  */
7958 static void
7959 vhcache_phci_add(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7960 {
7961 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7962 	mdi_vhcache_phci_t *cphci;
7963 	char *pathname;
7964 	int cache_updated;
7965 
7966 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7967 
7968 	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
7969 	(void) ddi_pathname(ph->ph_dip, pathname);
7970 	if ((cphci = lookup_vhcache_phci_by_name(vhcache, pathname))
7971 	    != NULL) {
7972 		cphci->cphci_phci = ph;
7973 		cache_updated = 0;
7974 	} else {
7975 		cphci = kmem_zalloc(sizeof (*cphci), KM_SLEEP);
7976 		cphci->cphci_path = i_ddi_strdup(pathname, KM_SLEEP);
7977 		cphci->cphci_phci = ph;
7978 		enqueue_vhcache_phci(vhcache, cphci);
7979 		cache_updated = 1;
7980 	}
7981 
7982 	rw_exit(&vhcache->vhcache_lock);
7983 
7984 	/*
7985 	 * Since a new phci has been added, reset
7986 	 * vhc_path_discovery_cutoff_time to allow for discovery of paths
7987 	 * during next vhcache_discover_paths().
7988 	 */
7989 	mutex_enter(&vhc->vhc_lock);
7990 	vhc->vhc_path_discovery_cutoff_time = 0;
7991 	mutex_exit(&vhc->vhc_lock);
7992 
7993 	kmem_free(pathname, MAXPATHLEN);
7994 	if (cache_updated)
7995 		vhcache_dirty(vhc);
7996 }
7997 
7998 /*
7999  * Remove the reference to the specified phci from the vhci cache.
8000  */
8001 static void
8002 vhcache_phci_remove(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
8003 {
8004 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8005 	mdi_vhcache_phci_t *cphci;
8006 
8007 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8008 	if ((cphci = lookup_vhcache_phci_by_addr(vhcache, ph)) != NULL) {
8009 		/* do not remove the actual mdi_vhcache_phci structure */
8010 		cphci->cphci_phci = NULL;
8011 	}
8012 	rw_exit(&vhcache->vhcache_lock);
8013 }
8014 
8015 static void
8016 init_vhcache_lookup_token(mdi_vhcache_lookup_token_t *dst,
8017     mdi_vhcache_lookup_token_t *src)
8018 {
8019 	if (src == NULL) {
8020 		dst->lt_cct = NULL;
8021 		dst->lt_cct_lookup_time = 0;
8022 	} else {
8023 		dst->lt_cct = src->lt_cct;
8024 		dst->lt_cct_lookup_time = src->lt_cct_lookup_time;
8025 	}
8026 }
8027 
8028 /*
8029  * Look up vhcache client for the specified client.
8030  */
8031 static mdi_vhcache_client_t *
8032 lookup_vhcache_client(mdi_vhci_cache_t *vhcache, char *ct_name, char *ct_addr,
8033     mdi_vhcache_lookup_token_t *token)
8034 {
8035 	mod_hash_val_t hv;
8036 	char *name_addr;
8037 	int len;
8038 
8039 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
8040 
8041 	/*
8042 	 * If no vhcache clean occurred since the last lookup, we can
8043 	 * simply return the cct from the last lookup operation.
8044 	 * It works because ccts are never freed except during the vhcache
8045 	 * cleanup operation.
8046 	 */
8047 	if (token != NULL &&
8048 	    vhcache->vhcache_clean_time < token->lt_cct_lookup_time)
8049 		return (token->lt_cct);
8050 
8051 	name_addr = vhcache_mknameaddr(ct_name, ct_addr, &len);
8052 	if (mod_hash_find(vhcache->vhcache_client_hash,
8053 	    (mod_hash_key_t)name_addr, &hv) == 0) {
8054 		if (token) {
8055 			token->lt_cct = (mdi_vhcache_client_t *)hv;
8056 			token->lt_cct_lookup_time = ddi_get_lbolt64();
8057 		}
8058 	} else {
8059 		if (token) {
8060 			token->lt_cct = NULL;
8061 			token->lt_cct_lookup_time = 0;
8062 		}
8063 		hv = NULL;
8064 	}
8065 	kmem_free(name_addr, len);
8066 	return ((mdi_vhcache_client_t *)hv);
8067 }
8068 
8069 /*
8070  * Add the specified path to the vhci cache if not already present.
8071  * Also add the vhcache client for the client corresponding to this path
8072  * if it doesn't already exist.
8073  */
8074 static void
8075 vhcache_pi_add(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
8076 {
8077 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8078 	mdi_vhcache_client_t *cct;
8079 	mdi_vhcache_pathinfo_t *cpi;
8080 	mdi_phci_t *ph = pip->pi_phci;
8081 	mdi_client_t *ct = pip->pi_client;
8082 	int cache_updated = 0;
8083 
8084 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8085 
8086 	/* if vhcache client for this pip doesn't already exist, add it */
8087 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
8088 	    NULL)) == NULL) {
8089 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
8090 		cct->cct_name_addr = vhcache_mknameaddr(ct->ct_drvname,
8091 		    ct->ct_guid, NULL);
8092 		enqueue_vhcache_client(vhcache, cct);
8093 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
8094 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
8095 		cache_updated = 1;
8096 	}
8097 
8098 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8099 		if (cpi->cpi_cphci->cphci_phci == ph &&
8100 		    strcmp(cpi->cpi_addr, pip->pi_addr) == 0) {
8101 			cpi->cpi_pip = pip;
8102 			if (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST) {
8103 				cpi->cpi_flags &=
8104 				    ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8105 				sort_vhcache_paths(cct);
8106 				cache_updated = 1;
8107 			}
8108 			break;
8109 		}
8110 	}
8111 
8112 	if (cpi == NULL) {
8113 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
8114 		cpi->cpi_addr = i_ddi_strdup(pip->pi_addr, KM_SLEEP);
8115 		cpi->cpi_cphci = lookup_vhcache_phci_by_addr(vhcache, ph);
8116 		ASSERT(cpi->cpi_cphci != NULL);
8117 		cpi->cpi_pip = pip;
8118 		enqueue_vhcache_pathinfo(cct, cpi);
8119 		cache_updated = 1;
8120 	}
8121 
8122 	rw_exit(&vhcache->vhcache_lock);
8123 
8124 	if (cache_updated)
8125 		vhcache_dirty(vhc);
8126 }
8127 
8128 /*
8129  * Remove the reference to the specified path from the vhci cache.
8130  */
8131 static void
8132 vhcache_pi_remove(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
8133 {
8134 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8135 	mdi_client_t *ct = pip->pi_client;
8136 	mdi_vhcache_client_t *cct;
8137 	mdi_vhcache_pathinfo_t *cpi;
8138 
8139 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8140 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
8141 	    NULL)) != NULL) {
8142 		for (cpi = cct->cct_cpi_head; cpi != NULL;
8143 		    cpi = cpi->cpi_next) {
8144 			if (cpi->cpi_pip == pip) {
8145 				cpi->cpi_pip = NULL;
8146 				break;
8147 			}
8148 		}
8149 	}
8150 	rw_exit(&vhcache->vhcache_lock);
8151 }
8152 
8153 /*
8154  * Flush the vhci cache to disk.
8155  * Returns MDI_SUCCESS on success, MDI_FAILURE on failure.
8156  */
8157 static int
8158 flush_vhcache(mdi_vhci_config_t *vhc, int force_flag)
8159 {
8160 	nvlist_t *nvl;
8161 	int err;
8162 	int rv;
8163 
8164 	/*
8165 	 * It is possible that the system may shutdown before
8166 	 * i_ddi_io_initialized (during stmsboot for example). To allow for
8167 	 * flushing the cache in this case do not check for
8168 	 * i_ddi_io_initialized when force flag is set.
8169 	 */
8170 	if (force_flag == 0 && !i_ddi_io_initialized())
8171 		return (MDI_FAILURE);
8172 
8173 	if ((nvl = vhcache_to_mainnvl(&vhc->vhc_vhcache)) != NULL) {
8174 		err = fwrite_nvlist(vhc->vhc_vhcache_filename, nvl);
8175 		nvlist_free(nvl);
8176 	} else
8177 		err = EFAULT;
8178 
8179 	rv = MDI_SUCCESS;
8180 	mutex_enter(&vhc->vhc_lock);
8181 	if (err != 0) {
8182 		if (err == EROFS) {
8183 			vhc->vhc_flags |= MDI_VHC_READONLY_FS;
8184 			vhc->vhc_flags &= ~(MDI_VHC_VHCACHE_FLUSH_ERROR |
8185 			    MDI_VHC_VHCACHE_DIRTY);
8186 		} else {
8187 			if (!(vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR)) {
8188 				cmn_err(CE_CONT, "%s: update failed\n",
8189 				    vhc->vhc_vhcache_filename);
8190 				vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_ERROR;
8191 			}
8192 			rv = MDI_FAILURE;
8193 		}
8194 	} else if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR) {
8195 		cmn_err(CE_CONT,
8196 		    "%s: update now ok\n", vhc->vhc_vhcache_filename);
8197 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_ERROR;
8198 	}
8199 	mutex_exit(&vhc->vhc_lock);
8200 
8201 	return (rv);
8202 }
8203 
8204 /*
8205  * Call flush_vhcache() to flush the vhci cache at the scheduled time.
8206  * Exits itself if left idle for the idle timeout period.
8207  */
8208 static void
8209 vhcache_flush_thread(void *arg)
8210 {
8211 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
8212 	clock_t idle_time, quit_at_ticks;
8213 	callb_cpr_t cprinfo;
8214 
8215 	/* number of seconds to sleep idle before exiting */
8216 	idle_time = mdi_vhcache_flush_daemon_idle_time * TICKS_PER_SECOND;
8217 
8218 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
8219 	    "mdi_vhcache_flush");
8220 	mutex_enter(&vhc->vhc_lock);
8221 	for (; ; ) {
8222 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8223 		    (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY)) {
8224 			if (ddi_get_lbolt() < vhc->vhc_flush_at_ticks) {
8225 				CALLB_CPR_SAFE_BEGIN(&cprinfo);
8226 				(void) cv_timedwait(&vhc->vhc_cv,
8227 				    &vhc->vhc_lock, vhc->vhc_flush_at_ticks);
8228 				CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8229 			} else {
8230 				vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
8231 				mutex_exit(&vhc->vhc_lock);
8232 
8233 				if (flush_vhcache(vhc, 0) != MDI_SUCCESS)
8234 					vhcache_dirty(vhc);
8235 
8236 				mutex_enter(&vhc->vhc_lock);
8237 			}
8238 		}
8239 
8240 		quit_at_ticks = ddi_get_lbolt() + idle_time;
8241 
8242 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8243 		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) &&
8244 		    ddi_get_lbolt() < quit_at_ticks) {
8245 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
8246 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
8247 			    quit_at_ticks);
8248 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8249 		}
8250 
8251 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
8252 		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY))
8253 			goto out;
8254 	}
8255 
8256 out:
8257 	vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_THREAD;
8258 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
8259 	CALLB_CPR_EXIT(&cprinfo);
8260 }
8261 
8262 /*
8263  * Make vhci cache dirty and schedule flushing by vhcache flush thread.
8264  */
8265 static void
8266 vhcache_dirty(mdi_vhci_config_t *vhc)
8267 {
8268 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8269 	int create_thread;
8270 
8271 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8272 	/* do not flush cache until the cache is fully built */
8273 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
8274 		rw_exit(&vhcache->vhcache_lock);
8275 		return;
8276 	}
8277 	rw_exit(&vhcache->vhcache_lock);
8278 
8279 	mutex_enter(&vhc->vhc_lock);
8280 	if (vhc->vhc_flags & MDI_VHC_READONLY_FS) {
8281 		mutex_exit(&vhc->vhc_lock);
8282 		return;
8283 	}
8284 
8285 	vhc->vhc_flags |= MDI_VHC_VHCACHE_DIRTY;
8286 	vhc->vhc_flush_at_ticks = ddi_get_lbolt() +
8287 	    mdi_vhcache_flush_delay * TICKS_PER_SECOND;
8288 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
8289 		cv_broadcast(&vhc->vhc_cv);
8290 		create_thread = 0;
8291 	} else {
8292 		vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_THREAD;
8293 		create_thread = 1;
8294 	}
8295 	mutex_exit(&vhc->vhc_lock);
8296 
8297 	if (create_thread)
8298 		(void) thread_create(NULL, 0, vhcache_flush_thread, vhc,
8299 		    0, &p0, TS_RUN, minclsyspri);
8300 }
8301 
/*
 * phci bus config structure - one for each phci bus config operation that
 * we initiate on behalf of a vhci.
 *
 * bus_config_all_phcis() allocates these and links them through phbc_next;
 * bus_config_phci() consumes one and frees both it and phbc_phci_path.
 */
typedef struct mdi_phci_bus_config_s {
	char *phbc_phci_path;	/* private copy of the phci's pathname */
	struct mdi_vhci_bus_config_s *phbc_vhbusconfig;	/* vhci bus config */
	struct mdi_phci_bus_config_s *phbc_next;	/* next in the list */
} mdi_phci_bus_config_t;
8311 
/*
 * vhci bus config structure - one for each vhci bus config operation.
 *
 * It is shared by all the phci bus config structures of one operation.
 * vhbc_thr_count counts the bus_config_phci() calls still outstanding;
 * each call decrements it under vhbc_lock and the one that brings it to
 * zero broadcasts on vhbc_cv.
 */
typedef struct mdi_vhci_bus_config_s {
	ddi_bus_config_op_t vhbc_op;	/* bus config op */
	major_t vhbc_op_major;		/* bus config op major */
	uint_t vhbc_op_flags;		/* bus config op flags */
	kmutex_t vhbc_lock;		/* held while workers update the count */
	kcondvar_t vhbc_cv;		/* broadcast when the count reaches 0 */
	int vhbc_thr_count;		/* outstanding phci bus configs */
} mdi_vhci_bus_config_t;
8321 
8322 /*
8323  * bus config the specified phci
8324  */
8325 static void
8326 bus_config_phci(void *arg)
8327 {
8328 	mdi_phci_bus_config_t *phbc = (mdi_phci_bus_config_t *)arg;
8329 	mdi_vhci_bus_config_t *vhbc = phbc->phbc_vhbusconfig;
8330 	dev_info_t *ph_dip;
8331 
8332 	/*
8333 	 * first configure all path components upto phci and then configure
8334 	 * the phci children.
8335 	 */
8336 	if ((ph_dip = e_ddi_hold_devi_by_path(phbc->phbc_phci_path, 0))
8337 	    != NULL) {
8338 		if (vhbc->vhbc_op == BUS_CONFIG_DRIVER ||
8339 		    vhbc->vhbc_op == BUS_UNCONFIG_DRIVER) {
8340 			(void) ndi_devi_config_driver(ph_dip,
8341 			    vhbc->vhbc_op_flags,
8342 			    vhbc->vhbc_op_major);
8343 		} else
8344 			(void) ndi_devi_config(ph_dip,
8345 			    vhbc->vhbc_op_flags);
8346 
8347 		/* release the hold that e_ddi_hold_devi_by_path() placed */
8348 		ndi_rele_devi(ph_dip);
8349 	}
8350 
8351 	kmem_free(phbc->phbc_phci_path, strlen(phbc->phbc_phci_path) + 1);
8352 	kmem_free(phbc, sizeof (*phbc));
8353 
8354 	mutex_enter(&vhbc->vhbc_lock);
8355 	vhbc->vhbc_thr_count--;
8356 	if (vhbc->vhbc_thr_count == 0)
8357 		cv_broadcast(&vhbc->vhbc_cv);
8358 	mutex_exit(&vhbc->vhbc_lock);
8359 }
8360 
8361 /*
8362  * Bus config all phcis associated with the vhci in parallel.
8363  * op must be BUS_CONFIG_DRIVER or BUS_CONFIG_ALL.
8364  */
8365 static void
8366 bus_config_all_phcis(mdi_vhci_cache_t *vhcache, uint_t flags,
8367     ddi_bus_config_op_t op, major_t maj)
8368 {
8369 	mdi_phci_bus_config_t *phbc_head = NULL, *phbc, *phbc_next;
8370 	mdi_vhci_bus_config_t *vhbc;
8371 	mdi_vhcache_phci_t *cphci;
8372 
8373 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8374 	if (vhcache->vhcache_phci_head == NULL) {
8375 		rw_exit(&vhcache->vhcache_lock);
8376 		return;
8377 	}
8378 
8379 	vhbc = kmem_zalloc(sizeof (*vhbc), KM_SLEEP);
8380 
8381 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
8382 	    cphci = cphci->cphci_next) {
8383 		/* skip phcis that haven't attached before root is available */
8384 		if (!modrootloaded && (cphci->cphci_phci == NULL))
8385 			continue;
8386 		phbc = kmem_zalloc(sizeof (*phbc), KM_SLEEP);
8387 		phbc->phbc_phci_path = i_ddi_strdup(cphci->cphci_path,
8388 		    KM_SLEEP);
8389 		phbc->phbc_vhbusconfig = vhbc;
8390 		phbc->phbc_next = phbc_head;
8391 		phbc_head = phbc;
8392 		vhbc->vhbc_thr_count++;
8393 	}
8394 	rw_exit(&vhcache->vhcache_lock);
8395 
8396 	vhbc->vhbc_op = op;
8397 	vhbc->vhbc_op_major = maj;
8398 	vhbc->vhbc_op_flags = NDI_NO_EVENT |
8399 	    (flags & (NDI_CONFIG_REPROBE | NDI_DRV_CONF_REPROBE));
8400 	mutex_init(&vhbc->vhbc_lock, NULL, MUTEX_DEFAULT, NULL);
8401 	cv_init(&vhbc->vhbc_cv, NULL, CV_DRIVER, NULL);
8402 
8403 	/* now create threads to initiate bus config on all phcis in parallel */
8404 	for (phbc = phbc_head; phbc != NULL; phbc = phbc_next) {
8405 		phbc_next = phbc->phbc_next;
8406 		if (mdi_mtc_off)
8407 			bus_config_phci((void *)phbc);
8408 		else
8409 			(void) thread_create(NULL, 0, bus_config_phci, phbc,
8410 			    0, &p0, TS_RUN, minclsyspri);
8411 	}
8412 
8413 	mutex_enter(&vhbc->vhbc_lock);
8414 	/* wait until all threads exit */
8415 	while (vhbc->vhbc_thr_count > 0)
8416 		cv_wait(&vhbc->vhbc_cv, &vhbc->vhbc_lock);
8417 	mutex_exit(&vhbc->vhbc_lock);
8418 
8419 	mutex_destroy(&vhbc->vhbc_lock);
8420 	cv_destroy(&vhbc->vhbc_cv);
8421 	kmem_free(vhbc, sizeof (*vhbc));
8422 }
8423 
8424 /*
8425  * Single threaded version of bus_config_all_phcis()
8426  */
8427 static void
8428 st_bus_config_all_phcis(mdi_vhci_config_t *vhc, uint_t flags,
8429     ddi_bus_config_op_t op, major_t maj)
8430 {
8431 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8432 
8433 	single_threaded_vhconfig_enter(vhc);
8434 	bus_config_all_phcis(vhcache, flags, op, maj);
8435 	single_threaded_vhconfig_exit(vhc);
8436 }
8437 
8438 /*
8439  * Perform BUS_CONFIG_ONE on the specified child of the phci.
8440  * The path includes the child component in addition to the phci path.
8441  */
8442 static int
8443 bus_config_one_phci_child(char *path)
8444 {
8445 	dev_info_t *ph_dip, *child;
8446 	char *devnm;
8447 	int rv = MDI_FAILURE;
8448 
8449 	/* extract the child component of the phci */
8450 	devnm = strrchr(path, '/');
8451 	*devnm++ = '\0';
8452 
8453 	/*
8454 	 * first configure all path components upto phci and then
8455 	 * configure the phci child.
8456 	 */
8457 	if ((ph_dip = e_ddi_hold_devi_by_path(path, 0)) != NULL) {
8458 		if (ndi_devi_config_one(ph_dip, devnm, &child, NDI_NO_EVENT) ==
8459 		    NDI_SUCCESS) {
8460 			/*
8461 			 * release the hold that ndi_devi_config_one() placed
8462 			 */
8463 			ndi_rele_devi(child);
8464 			rv = MDI_SUCCESS;
8465 		}
8466 
8467 		/* release the hold that e_ddi_hold_devi_by_path() placed */
8468 		ndi_rele_devi(ph_dip);
8469 	}
8470 
8471 	devnm--;
8472 	*devnm = '/';
8473 	return (rv);
8474 }
8475 
8476 /*
8477  * Build a list of phci client paths for the specified vhci client.
8478  * The list includes only those phci client paths which aren't configured yet.
8479  */
8480 static mdi_phys_path_t *
8481 build_phclient_path_list(mdi_vhcache_client_t *cct, char *ct_name)
8482 {
8483 	mdi_vhcache_pathinfo_t *cpi;
8484 	mdi_phys_path_t *pp_head = NULL, *pp_tail = NULL, *pp;
8485 	int config_path, len;
8486 
8487 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8488 		/*
8489 		 * include only those paths that aren't configured.
8490 		 */
8491 		config_path = 0;
8492 		if (cpi->cpi_pip == NULL)
8493 			config_path = 1;
8494 		else {
8495 			MDI_PI_LOCK(cpi->cpi_pip);
8496 			if (MDI_PI_IS_INIT(cpi->cpi_pip))
8497 				config_path = 1;
8498 			MDI_PI_UNLOCK(cpi->cpi_pip);
8499 		}
8500 
8501 		if (config_path) {
8502 			pp = kmem_alloc(sizeof (*pp), KM_SLEEP);
8503 			len = strlen(cpi->cpi_cphci->cphci_path) +
8504 			    strlen(ct_name) + strlen(cpi->cpi_addr) + 3;
8505 			pp->phys_path = kmem_alloc(len, KM_SLEEP);
8506 			(void) snprintf(pp->phys_path, len, "%s/%s@%s",
8507 			    cpi->cpi_cphci->cphci_path, ct_name,
8508 			    cpi->cpi_addr);
8509 			pp->phys_path_next = NULL;
8510 
8511 			if (pp_head == NULL)
8512 				pp_head = pp;
8513 			else
8514 				pp_tail->phys_path_next = pp;
8515 			pp_tail = pp;
8516 		}
8517 	}
8518 
8519 	return (pp_head);
8520 }
8521 
8522 /*
8523  * Free the memory allocated for phci client path list.
8524  */
8525 static void
8526 free_phclient_path_list(mdi_phys_path_t *pp_head)
8527 {
8528 	mdi_phys_path_t *pp, *pp_next;
8529 
8530 	for (pp = pp_head; pp != NULL; pp = pp_next) {
8531 		pp_next = pp->phys_path_next;
8532 		kmem_free(pp->phys_path, strlen(pp->phys_path) + 1);
8533 		kmem_free(pp, sizeof (*pp));
8534 	}
8535 }
8536 
8537 /*
8538  * Allocated async client structure and initialize with the specified values.
8539  */
8540 static mdi_async_client_config_t *
8541 alloc_async_client_config(char *ct_name, char *ct_addr,
8542     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8543 {
8544 	mdi_async_client_config_t *acc;
8545 
8546 	acc = kmem_alloc(sizeof (*acc), KM_SLEEP);
8547 	acc->acc_ct_name = i_ddi_strdup(ct_name, KM_SLEEP);
8548 	acc->acc_ct_addr = i_ddi_strdup(ct_addr, KM_SLEEP);
8549 	acc->acc_phclient_path_list_head = pp_head;
8550 	init_vhcache_lookup_token(&acc->acc_token, tok);
8551 	acc->acc_next = NULL;
8552 	return (acc);
8553 }
8554 
8555 /*
8556  * Free the memory allocated for the async client structure and their members.
8557  */
8558 static void
8559 free_async_client_config(mdi_async_client_config_t *acc)
8560 {
8561 	if (acc->acc_phclient_path_list_head)
8562 		free_phclient_path_list(acc->acc_phclient_path_list_head);
8563 	kmem_free(acc->acc_ct_name, strlen(acc->acc_ct_name) + 1);
8564 	kmem_free(acc->acc_ct_addr, strlen(acc->acc_ct_addr) + 1);
8565 	kmem_free(acc, sizeof (*acc));
8566 }
8567 
8568 /*
8569  * Sort vhcache pathinfos (cpis) of the specified client.
8570  * All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
8571  * flag set come at the beginning of the list. All cpis which have this
8572  * flag set come at the end of the list.
8573  */
8574 static void
8575 sort_vhcache_paths(mdi_vhcache_client_t *cct)
8576 {
8577 	mdi_vhcache_pathinfo_t *cpi, *cpi_next, *cpi_head;
8578 
8579 	cpi_head = cct->cct_cpi_head;
8580 	cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
8581 	for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
8582 		cpi_next = cpi->cpi_next;
8583 		enqueue_vhcache_pathinfo(cct, cpi);
8584 	}
8585 }
8586 
8587 /*
8588  * Verify whether MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag setting is correct for
8589  * every vhcache pathinfo of the specified client. If not adjust the flag
8590  * setting appropriately.
8591  *
8592  * Note that MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag is persisted in the
8593  * on-disk vhci cache. So every time this flag is updated the cache must be
8594  * flushed.
8595  */
8596 static void
8597 adjust_sort_vhcache_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8598     mdi_vhcache_lookup_token_t *tok)
8599 {
8600 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8601 	mdi_vhcache_client_t *cct;
8602 	mdi_vhcache_pathinfo_t *cpi;
8603 
8604 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8605 	if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, tok))
8606 	    == NULL) {
8607 		rw_exit(&vhcache->vhcache_lock);
8608 		return;
8609 	}
8610 
8611 	/*
8612 	 * to avoid unnecessary on-disk cache updates, first check if an
8613 	 * update is really needed. If no update is needed simply return.
8614 	 */
8615 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8616 		if ((cpi->cpi_pip != NULL &&
8617 		    (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST)) ||
8618 		    (cpi->cpi_pip == NULL &&
8619 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))) {
8620 			break;
8621 		}
8622 	}
8623 	if (cpi == NULL) {
8624 		rw_exit(&vhcache->vhcache_lock);
8625 		return;
8626 	}
8627 
8628 	if (rw_tryupgrade(&vhcache->vhcache_lock) == 0) {
8629 		rw_exit(&vhcache->vhcache_lock);
8630 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8631 		if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr,
8632 		    tok)) == NULL) {
8633 			rw_exit(&vhcache->vhcache_lock);
8634 			return;
8635 		}
8636 	}
8637 
8638 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8639 		if (cpi->cpi_pip != NULL)
8640 			cpi->cpi_flags &= ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8641 		else
8642 			cpi->cpi_flags |= MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8643 	}
8644 	sort_vhcache_paths(cct);
8645 
8646 	rw_exit(&vhcache->vhcache_lock);
8647 	vhcache_dirty(vhc);
8648 }
8649 
8650 /*
8651  * Configure all specified paths of the client.
8652  */
8653 static void
8654 config_client_paths_sync(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8655     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8656 {
8657 	mdi_phys_path_t *pp;
8658 
8659 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next)
8660 		(void) bus_config_one_phci_child(pp->phys_path);
8661 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, tok);
8662 }
8663 
8664 /*
8665  * Dequeue elements from vhci async client config list and bus configure
8666  * their corresponding phci clients.
8667  */
8668 static void
8669 config_client_paths_thread(void *arg)
8670 {
8671 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
8672 	mdi_async_client_config_t *acc;
8673 	clock_t quit_at_ticks;
8674 	clock_t idle_time = mdi_async_config_idle_time * TICKS_PER_SECOND;
8675 	callb_cpr_t cprinfo;
8676 
8677 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
8678 	    "mdi_config_client_paths");
8679 
8680 	for (; ; ) {
8681 		quit_at_ticks = ddi_get_lbolt() + idle_time;
8682 
8683 		mutex_enter(&vhc->vhc_lock);
8684 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8685 		    vhc->vhc_acc_list_head == NULL &&
8686 		    ddi_get_lbolt() < quit_at_ticks) {
8687 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
8688 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
8689 			    quit_at_ticks);
8690 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8691 		}
8692 
8693 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
8694 		    vhc->vhc_acc_list_head == NULL)
8695 			goto out;
8696 
8697 		acc = vhc->vhc_acc_list_head;
8698 		vhc->vhc_acc_list_head = acc->acc_next;
8699 		if (vhc->vhc_acc_list_head == NULL)
8700 			vhc->vhc_acc_list_tail = NULL;
8701 		vhc->vhc_acc_count--;
8702 		mutex_exit(&vhc->vhc_lock);
8703 
8704 		config_client_paths_sync(vhc, acc->acc_ct_name,
8705 		    acc->acc_ct_addr, acc->acc_phclient_path_list_head,
8706 		    &acc->acc_token);
8707 
8708 		free_async_client_config(acc);
8709 	}
8710 
8711 out:
8712 	vhc->vhc_acc_thrcount--;
8713 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
8714 	CALLB_CPR_EXIT(&cprinfo);
8715 }
8716 
8717 /*
8718  * Arrange for all the phci client paths (pp_head) for the specified client
8719  * to be bus configured asynchronously by a thread.
8720  */
8721 static void
8722 config_client_paths_async(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8723     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8724 {
8725 	mdi_async_client_config_t *acc, *newacc;
8726 	int create_thread;
8727 
8728 	if (pp_head == NULL)
8729 		return;
8730 
8731 	if (mdi_mtc_off) {
8732 		config_client_paths_sync(vhc, ct_name, ct_addr, pp_head, tok);
8733 		free_phclient_path_list(pp_head);
8734 		return;
8735 	}
8736 
8737 	newacc = alloc_async_client_config(ct_name, ct_addr, pp_head, tok);
8738 	ASSERT(newacc);
8739 
8740 	mutex_enter(&vhc->vhc_lock);
8741 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc->acc_next) {
8742 		if (strcmp(ct_name, acc->acc_ct_name) == 0 &&
8743 		    strcmp(ct_addr, acc->acc_ct_addr) == 0) {
8744 			free_async_client_config(newacc);
8745 			mutex_exit(&vhc->vhc_lock);
8746 			return;
8747 		}
8748 	}
8749 
8750 	if (vhc->vhc_acc_list_head == NULL)
8751 		vhc->vhc_acc_list_head = newacc;
8752 	else
8753 		vhc->vhc_acc_list_tail->acc_next = newacc;
8754 	vhc->vhc_acc_list_tail = newacc;
8755 	vhc->vhc_acc_count++;
8756 	if (vhc->vhc_acc_count <= vhc->vhc_acc_thrcount) {
8757 		cv_broadcast(&vhc->vhc_cv);
8758 		create_thread = 0;
8759 	} else {
8760 		vhc->vhc_acc_thrcount++;
8761 		create_thread = 1;
8762 	}
8763 	mutex_exit(&vhc->vhc_lock);
8764 
8765 	if (create_thread)
8766 		(void) thread_create(NULL, 0, config_client_paths_thread, vhc,
8767 		    0, &p0, TS_RUN, minclsyspri);
8768 }
8769 
8770 /*
8771  * Return number of online paths for the specified client.
8772  */
8773 static int
8774 nonline_paths(mdi_vhcache_client_t *cct)
8775 {
8776 	mdi_vhcache_pathinfo_t *cpi;
8777 	int online_count = 0;
8778 
8779 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8780 		if (cpi->cpi_pip != NULL) {
8781 			MDI_PI_LOCK(cpi->cpi_pip);
8782 			if (cpi->cpi_pip->pi_state == MDI_PATHINFO_STATE_ONLINE)
8783 				online_count++;
8784 			MDI_PI_UNLOCK(cpi->cpi_pip);
8785 		}
8786 	}
8787 
8788 	return (online_count);
8789 }
8790 
8791 /*
8792  * Bus configure all paths for the specified vhci client.
8793  * If at least one path for the client is already online, the remaining paths
8794  * will be configured asynchronously. Otherwise, it synchronously configures
8795  * the paths until at least one path is online and then rest of the paths
8796  * will be configured asynchronously.
8797  */
8798 static void
8799 config_client_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr)
8800 {
8801 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8802 	mdi_phys_path_t *pp_head, *pp;
8803 	mdi_vhcache_client_t *cct;
8804 	mdi_vhcache_lookup_token_t tok;
8805 
8806 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
8807 
8808 	init_vhcache_lookup_token(&tok, NULL);
8809 
8810 	if (ct_name == NULL || ct_addr == NULL ||
8811 	    (cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, &tok))
8812 	    == NULL ||
8813 	    (pp_head = build_phclient_path_list(cct, ct_name)) == NULL) {
8814 		rw_exit(&vhcache->vhcache_lock);
8815 		return;
8816 	}
8817 
8818 	/* if at least one path is online, configure the rest asynchronously */
8819 	if (nonline_paths(cct) > 0) {
8820 		rw_exit(&vhcache->vhcache_lock);
8821 		config_client_paths_async(vhc, ct_name, ct_addr, pp_head, &tok);
8822 		return;
8823 	}
8824 
8825 	rw_exit(&vhcache->vhcache_lock);
8826 
8827 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next) {
8828 		if (bus_config_one_phci_child(pp->phys_path) == MDI_SUCCESS) {
8829 			rw_enter(&vhcache->vhcache_lock, RW_READER);
8830 
8831 			if ((cct = lookup_vhcache_client(vhcache, ct_name,
8832 			    ct_addr, &tok)) == NULL) {
8833 				rw_exit(&vhcache->vhcache_lock);
8834 				goto out;
8835 			}
8836 
8837 			if (nonline_paths(cct) > 0 &&
8838 			    pp->phys_path_next != NULL) {
8839 				rw_exit(&vhcache->vhcache_lock);
8840 				config_client_paths_async(vhc, ct_name, ct_addr,
8841 				    pp->phys_path_next, &tok);
8842 				pp->phys_path_next = NULL;
8843 				goto out;
8844 			}
8845 
8846 			rw_exit(&vhcache->vhcache_lock);
8847 		}
8848 	}
8849 
8850 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, &tok);
8851 out:
8852 	free_phclient_path_list(pp_head);
8853 }
8854 
8855 static void
8856 single_threaded_vhconfig_enter(mdi_vhci_config_t *vhc)
8857 {
8858 	mutex_enter(&vhc->vhc_lock);
8859 	while (vhc->vhc_flags & MDI_VHC_SINGLE_THREADED)
8860 		cv_wait(&vhc->vhc_cv, &vhc->vhc_lock);
8861 	vhc->vhc_flags |= MDI_VHC_SINGLE_THREADED;
8862 	mutex_exit(&vhc->vhc_lock);
8863 }
8864 
8865 static void
8866 single_threaded_vhconfig_exit(mdi_vhci_config_t *vhc)
8867 {
8868 	mutex_enter(&vhc->vhc_lock);
8869 	vhc->vhc_flags &= ~MDI_VHC_SINGLE_THREADED;
8870 	cv_broadcast(&vhc->vhc_cv);
8871 	mutex_exit(&vhc->vhc_lock);
8872 }
8873 
/*
 * Built-in description of one phci driver.
 */
typedef struct mdi_phci_driver_info {
	char	*phdriver_name;	/* name of the phci driver */

	/* set to non zero if the phci driver supports root device */
	int	phdriver_root_support;
} mdi_phci_driver_info_t;

/*
 * vhci class and root support capability of a phci driver can be
 * specified using ddi-vhci-class and ddi-no-root-support properties in the
 * phci driver.conf file. The built-in tables below contain this information
 * for those phci drivers whose driver.conf files don't yet contain this info.
 *
 * All phci drivers except iscsi have root device support.
 */
static mdi_phci_driver_info_t scsi_phci_driver_list[] = {
	{ "fp", 1 },
	{ "iscsi", 0 },
	{ "ibsrp", 1 }
};

/* one fully braced entry per driver, matching the scsi table above */
static mdi_phci_driver_info_t ib_phci_driver_list[] = {
	{ "tavor", 1 }
};
8896 
8897 static void *
8898 mdi_realloc(void *old_ptr, size_t old_size, size_t new_size)
8899 {
8900 	void *new_ptr;
8901 
8902 	new_ptr = kmem_zalloc(new_size, KM_SLEEP);
8903 	if (old_ptr) {
8904 		bcopy(old_ptr, new_ptr, MIN(old_size, new_size));
8905 		kmem_free(old_ptr, old_size);
8906 	}
8907 	return (new_ptr);
8908 }
8909 
8910 static void
8911 add_to_phci_list(char ***driver_list, int **root_support_list,
8912     int *cur_elements, int *max_elements, char *driver_name, int root_support)
8913 {
8914 	ASSERT(*cur_elements <= *max_elements);
8915 	if (*cur_elements == *max_elements) {
8916 		*max_elements += 10;
8917 		*driver_list = mdi_realloc(*driver_list,
8918 		    sizeof (char *) * (*cur_elements),
8919 		    sizeof (char *) * (*max_elements));
8920 		*root_support_list = mdi_realloc(*root_support_list,
8921 		    sizeof (int) * (*cur_elements),
8922 		    sizeof (int) * (*max_elements));
8923 	}
8924 	(*driver_list)[*cur_elements] = i_ddi_strdup(driver_name, KM_SLEEP);
8925 	(*root_support_list)[*cur_elements] = root_support;
8926 	(*cur_elements)++;
8927 }
8928 
8929 static void
8930 get_phci_driver_list(char *vhci_class, char ***driver_list,
8931     int **root_support_list, int *cur_elements, int *max_elements)
8932 {
8933 	mdi_phci_driver_info_t	*st_driver_list, *p;
8934 	int		st_ndrivers, root_support, i, j, driver_conf_count;
8935 	major_t		m;
8936 	struct devnames	*dnp;
8937 	ddi_prop_t	*propp;
8938 
8939 	*driver_list = NULL;
8940 	*root_support_list = NULL;
8941 	*cur_elements = 0;
8942 	*max_elements = 0;
8943 
8944 	/* add the phci drivers derived from the phci driver.conf files */
8945 	for (m = 0; m < devcnt; m++) {
8946 		dnp = &devnamesp[m];
8947 
8948 		if (dnp->dn_flags & DN_PHCI_DRIVER) {
8949 			LOCK_DEV_OPS(&dnp->dn_lock);
8950 			if (dnp->dn_global_prop_ptr != NULL &&
8951 			    (propp = i_ddi_prop_search(DDI_DEV_T_ANY,
8952 			    DDI_VHCI_CLASS, DDI_PROP_TYPE_STRING,
8953 			    &dnp->dn_global_prop_ptr->prop_list)) != NULL &&
8954 			    strcmp(propp->prop_val, vhci_class) == 0) {
8955 
8956 				root_support = (i_ddi_prop_search(DDI_DEV_T_ANY,
8957 				    DDI_NO_ROOT_SUPPORT, DDI_PROP_TYPE_INT,
8958 				    &dnp->dn_global_prop_ptr->prop_list)
8959 				    == NULL) ? 1 : 0;
8960 
8961 				add_to_phci_list(driver_list, root_support_list,
8962 				    cur_elements, max_elements, dnp->dn_name,
8963 				    root_support);
8964 
8965 				UNLOCK_DEV_OPS(&dnp->dn_lock);
8966 			} else
8967 				UNLOCK_DEV_OPS(&dnp->dn_lock);
8968 		}
8969 	}
8970 
8971 	driver_conf_count = *cur_elements;
8972 
8973 	/* add the phci drivers specified in the built-in tables */
8974 	if (strcmp(vhci_class, MDI_HCI_CLASS_SCSI) == 0) {
8975 		st_driver_list = scsi_phci_driver_list;
8976 		st_ndrivers = sizeof (scsi_phci_driver_list) /
8977 		    sizeof (mdi_phci_driver_info_t);
8978 	} else if (strcmp(vhci_class, MDI_HCI_CLASS_IB) == 0) {
8979 		st_driver_list = ib_phci_driver_list;
8980 		st_ndrivers = sizeof (ib_phci_driver_list) /
8981 		    sizeof (mdi_phci_driver_info_t);
8982 	} else {
8983 		st_driver_list = NULL;
8984 		st_ndrivers = 0;
8985 	}
8986 
8987 	for (i = 0, p = st_driver_list; i < st_ndrivers; i++, p++) {
8988 		/* add this phci driver if not already added before */
8989 		for (j = 0; j < driver_conf_count; j++) {
8990 			if (strcmp((*driver_list)[j], p->phdriver_name) == 0)
8991 				break;
8992 		}
8993 		if (j == driver_conf_count) {
8994 			add_to_phci_list(driver_list, root_support_list,
8995 			    cur_elements, max_elements, p->phdriver_name,
8996 			    p->phdriver_root_support);
8997 		}
8998 	}
8999 }
9000 
9001 /*
9002  * Attach the phci driver instances associated with the specified vhci class.
9003  * If root is mounted attach all phci driver instances.
9004  * If root is not mounted, attach the instances of only those phci
9005  * drivers that have the root support.
9006  */
9007 static void
9008 attach_phci_drivers(char *vhci_class)
9009 {
9010 	char	**driver_list, **p;
9011 	int	*root_support_list;
9012 	int	cur_elements, max_elements, i;
9013 	major_t	m;
9014 
9015 	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
9016 	    &cur_elements, &max_elements);
9017 
9018 	for (i = 0; i < cur_elements; i++) {
9019 		if (modrootloaded || root_support_list[i]) {
9020 			m = ddi_name_to_major(driver_list[i]);
9021 			if (m != DDI_MAJOR_T_NONE &&
9022 			    ddi_hold_installed_driver(m))
9023 				ddi_rele_driver(m);
9024 		}
9025 	}
9026 
9027 	if (driver_list) {
9028 		for (i = 0, p = driver_list; i < cur_elements; i++, p++)
9029 			kmem_free(*p, strlen(*p) + 1);
9030 		kmem_free(driver_list, sizeof (char *) * max_elements);
9031 		kmem_free(root_support_list, sizeof (int) * max_elements);
9032 	}
9033 }
9034 
9035 /*
9036  * Build vhci cache:
9037  *
9038  * Attach phci driver instances and then drive BUS_CONFIG_ALL on
9039  * the phci driver instances. During this process the cache gets built.
9040  *
9041  * Cache is built fully if the root is mounted.
9042  * If the root is not mounted, phci drivers that do not have root support
9043  * are not attached. As a result the cache is built partially. The entries
9044  * in the cache reflect only those phci drivers that have root support.
9045  */
9046 static int
9047 build_vhci_cache(mdi_vhci_t *vh)
9048 {
9049 	mdi_vhci_config_t *vhc = vh->vh_config;
9050 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9051 
9052 	single_threaded_vhconfig_enter(vhc);
9053 
9054 	rw_enter(&vhcache->vhcache_lock, RW_READER);
9055 	if (vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE) {
9056 		rw_exit(&vhcache->vhcache_lock);
9057 		single_threaded_vhconfig_exit(vhc);
9058 		return (0);
9059 	}
9060 	rw_exit(&vhcache->vhcache_lock);
9061 
9062 	attach_phci_drivers(vh->vh_class);
9063 	bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE | NDI_NO_EVENT,
9064 	    BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);
9065 
9066 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
9067 	vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
9068 	rw_exit(&vhcache->vhcache_lock);
9069 
9070 	single_threaded_vhconfig_exit(vhc);
9071 	vhcache_dirty(vhc);
9072 	return (1);
9073 }
9074 
9075 /*
9076  * Determine if discovery of paths is needed.
9077  */
9078 static int
9079 vhcache_do_discovery(mdi_vhci_config_t *vhc)
9080 {
9081 	int rv = 1;
9082 
9083 	mutex_enter(&vhc->vhc_lock);
9084 	if (i_ddi_io_initialized() == 0) {
9085 		if (vhc->vhc_path_discovery_boot > 0) {
9086 			vhc->vhc_path_discovery_boot--;
9087 			goto out;
9088 		}
9089 	} else {
9090 		if (vhc->vhc_path_discovery_postboot > 0) {
9091 			vhc->vhc_path_discovery_postboot--;
9092 			goto out;
9093 		}
9094 	}
9095 
9096 	/*
9097 	 * Do full path discovery at most once per mdi_path_discovery_interval.
9098 	 * This is to avoid a series of full path discoveries when opening
9099 	 * stale /dev/[r]dsk links.
9100 	 */
9101 	if (mdi_path_discovery_interval != -1 &&
9102 	    ddi_get_lbolt64() >= vhc->vhc_path_discovery_cutoff_time)
9103 		goto out;
9104 
9105 	rv = 0;
9106 out:
9107 	mutex_exit(&vhc->vhc_lock);
9108 	return (rv);
9109 }
9110 
9111 /*
9112  * Discover all paths:
9113  *
9114  * Attach phci driver instances and then drive BUS_CONFIG_ALL on all the phci
9115  * driver instances. During this process all paths will be discovered.
9116  */
9117 static int
9118 vhcache_discover_paths(mdi_vhci_t *vh)
9119 {
9120 	mdi_vhci_config_t *vhc = vh->vh_config;
9121 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9122 	int rv = 0;
9123 
9124 	single_threaded_vhconfig_enter(vhc);
9125 
9126 	if (vhcache_do_discovery(vhc)) {
9127 		attach_phci_drivers(vh->vh_class);
9128 		bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE |
9129 		    NDI_NO_EVENT, BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);
9130 
9131 		mutex_enter(&vhc->vhc_lock);
9132 		vhc->vhc_path_discovery_cutoff_time = ddi_get_lbolt64() +
9133 		    mdi_path_discovery_interval * TICKS_PER_SECOND;
9134 		mutex_exit(&vhc->vhc_lock);
9135 		rv = 1;
9136 	}
9137 
9138 	single_threaded_vhconfig_exit(vhc);
9139 	return (rv);
9140 }
9141 
9142 /*
9143  * Generic vhci bus config implementation:
9144  *
9145  * Parameters
9146  *	vdip	vhci dip
9147  *	flags	bus config flags
9148  *	op	bus config operation
9149  *	The remaining parameters are bus config operation specific
9150  *
9151  * for BUS_CONFIG_ONE
9152  *	arg	pointer to name@addr
9153  *	child	upon successful return from this function, *child will be
9154  *		set to the configured and held devinfo child node of vdip.
9155  *	ct_addr	pointer to client address (i.e. GUID)
9156  *
9157  * for BUS_CONFIG_DRIVER
9158  *	arg	major number of the driver
9159  *	child and ct_addr parameters are ignored
9160  *
9161  * for BUS_CONFIG_ALL
9162  *	arg, child, and ct_addr parameters are ignored
9163  *
9164  * Note that for the rest of the bus config operations, this function simply
9165  * calls the framework provided default bus config routine.
9166  */
9167 int
9168 mdi_vhci_bus_config(dev_info_t *vdip, uint_t flags, ddi_bus_config_op_t op,
9169     void *arg, dev_info_t **child, char *ct_addr)
9170 {
9171 	mdi_vhci_t *vh = i_devi_get_vhci(vdip);
9172 	mdi_vhci_config_t *vhc = vh->vh_config;
9173 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9174 	int rv = 0;
9175 	int params_valid = 0;
9176 	char *cp;
9177 
9178 	/*
9179 	 * To bus config vhcis we relay operation, possibly using another
9180 	 * thread, to phcis. The phci driver then interacts with MDI to cause
9181 	 * vhci child nodes to be enumerated under the vhci node.  Adding a
9182 	 * vhci child requires an ndi_devi_enter of the vhci. Since another
9183 	 * thread may be adding the child, to avoid deadlock we can't wait
9184 	 * for the relayed operations to complete if we have already entered
9185 	 * the vhci node.
9186 	 */
9187 	if (DEVI_BUSY_OWNED(vdip)) {
9188 		MDI_DEBUG(2, (MDI_NOTE, vdip,
9189 		    "vhci dip is busy owned %p", (void *)vdip));
9190 		goto default_bus_config;
9191 	}
9192 
9193 	rw_enter(&vhcache->vhcache_lock, RW_READER);
9194 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
9195 		rw_exit(&vhcache->vhcache_lock);
9196 		rv = build_vhci_cache(vh);
9197 		rw_enter(&vhcache->vhcache_lock, RW_READER);
9198 	}
9199 
9200 	switch (op) {
9201 	case BUS_CONFIG_ONE:
9202 		if (arg != NULL && ct_addr != NULL) {
9203 			/* extract node name */
9204 			cp = (char *)arg;
9205 			while (*cp != '\0' && *cp != '@')
9206 				cp++;
9207 			if (*cp == '@') {
9208 				params_valid = 1;
9209 				*cp = '\0';
9210 				config_client_paths(vhc, (char *)arg, ct_addr);
9211 				/* config_client_paths() releases cache_lock */
9212 				*cp = '@';
9213 				break;
9214 			}
9215 		}
9216 
9217 		rw_exit(&vhcache->vhcache_lock);
9218 		break;
9219 
9220 	case BUS_CONFIG_DRIVER:
9221 		rw_exit(&vhcache->vhcache_lock);
9222 		if (rv == 0)
9223 			st_bus_config_all_phcis(vhc, flags, op,
9224 			    (major_t)(uintptr_t)arg);
9225 		break;
9226 
9227 	case BUS_CONFIG_ALL:
9228 		rw_exit(&vhcache->vhcache_lock);
9229 		if (rv == 0)
9230 			st_bus_config_all_phcis(vhc, flags, op, -1);
9231 		break;
9232 
9233 	default:
9234 		rw_exit(&vhcache->vhcache_lock);
9235 		break;
9236 	}
9237 
9238 
9239 default_bus_config:
9240 	/*
9241 	 * All requested child nodes are enumerated under the vhci.
9242 	 * Now configure them.
9243 	 */
9244 	if (ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
9245 	    NDI_SUCCESS) {
9246 		return (MDI_SUCCESS);
9247 	} else if (op == BUS_CONFIG_ONE && rv == 0 && params_valid) {
9248 		/* discover all paths and try configuring again */
9249 		if (vhcache_discover_paths(vh) &&
9250 		    ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
9251 		    NDI_SUCCESS)
9252 			return (MDI_SUCCESS);
9253 	}
9254 
9255 	return (MDI_FAILURE);
9256 }
9257 
9258 /*
9259  * Read the on-disk vhci cache into an nvlist for the specified vhci class.
9260  */
9261 static nvlist_t *
9262 read_on_disk_vhci_cache(char *vhci_class)
9263 {
9264 	nvlist_t *nvl;
9265 	int err;
9266 	char *filename;
9267 
9268 	filename = vhclass2vhcache_filename(vhci_class);
9269 
9270 	if ((err = fread_nvlist(filename, &nvl)) == 0) {
9271 		kmem_free(filename, strlen(filename) + 1);
9272 		return (nvl);
9273 	} else if (err == EIO)
9274 		cmn_err(CE_WARN, "%s: I/O error, will recreate", filename);
9275 	else if (err == EINVAL)
9276 		cmn_err(CE_WARN,
9277 		    "%s: data file corrupted, will recreate", filename);
9278 
9279 	kmem_free(filename, strlen(filename) + 1);
9280 	return (NULL);
9281 }
9282 
9283 /*
9284  * Read on-disk vhci cache into nvlists for all vhci classes.
9285  * Called during booting by i_ddi_read_devices_files().
9286  */
9287 void
9288 mdi_read_devices_files(void)
9289 {
9290 	int i;
9291 
9292 	for (i = 0; i < N_VHCI_CLASSES; i++)
9293 		vhcache_nvl[i] = read_on_disk_vhci_cache(vhci_class_list[i]);
9294 }
9295 
9296 /*
9297  * Remove all stale entries from vhci cache.
9298  */
9299 static void
9300 clean_vhcache(mdi_vhci_config_t *vhc)
9301 {
9302 	mdi_vhci_cache_t	*vhcache = &vhc->vhc_vhcache;
9303 	mdi_vhcache_phci_t	*phci, *nxt_phci;
9304 	mdi_vhcache_client_t	*client, *nxt_client;
9305 	mdi_vhcache_pathinfo_t	*path, *nxt_path;
9306 
9307 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
9308 
9309 	client = vhcache->vhcache_client_head;
9310 	vhcache->vhcache_client_head = vhcache->vhcache_client_tail = NULL;
9311 	for ( ; client != NULL; client = nxt_client) {
9312 		nxt_client = client->cct_next;
9313 
9314 		path = client->cct_cpi_head;
9315 		client->cct_cpi_head = client->cct_cpi_tail = NULL;
9316 		for ( ; path != NULL; path = nxt_path) {
9317 			nxt_path = path->cpi_next;
9318 			if ((path->cpi_cphci->cphci_phci != NULL) &&
9319 			    (path->cpi_pip != NULL)) {
9320 				enqueue_tail_vhcache_pathinfo(client, path);
9321 			} else if (path->cpi_pip != NULL) {
9322 				/* Not valid to have a path without a phci. */
9323 				free_vhcache_pathinfo(path);
9324 			}
9325 		}
9326 
9327 		if (client->cct_cpi_head != NULL)
9328 			enqueue_vhcache_client(vhcache, client);
9329 		else {
9330 			(void) mod_hash_destroy(vhcache->vhcache_client_hash,
9331 			    (mod_hash_key_t)client->cct_name_addr);
9332 			free_vhcache_client(client);
9333 		}
9334 	}
9335 
9336 	phci = vhcache->vhcache_phci_head;
9337 	vhcache->vhcache_phci_head = vhcache->vhcache_phci_tail = NULL;
9338 	for ( ; phci != NULL; phci = nxt_phci) {
9339 
9340 		nxt_phci = phci->cphci_next;
9341 		if (phci->cphci_phci != NULL)
9342 			enqueue_vhcache_phci(vhcache, phci);
9343 		else
9344 			free_vhcache_phci(phci);
9345 	}
9346 
9347 	vhcache->vhcache_clean_time = ddi_get_lbolt64();
9348 	rw_exit(&vhcache->vhcache_lock);
9349 	vhcache_dirty(vhc);
9350 }
9351 
9352 /*
9353  * Remove all stale entries from vhci cache.
9354  * Called by i_ddi_clean_devices_files() during the execution of devfsadm -C
9355  */
9356 void
9357 mdi_clean_vhcache(void)
9358 {
9359 	mdi_vhci_t *vh;
9360 
9361 	mutex_enter(&mdi_mutex);
9362 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9363 		vh->vh_refcnt++;
9364 		mutex_exit(&mdi_mutex);
9365 		clean_vhcache(vh->vh_config);
9366 		mutex_enter(&mdi_mutex);
9367 		vh->vh_refcnt--;
9368 	}
9369 	mutex_exit(&mdi_mutex);
9370 }
9371 
9372 /*
9373  * mdi_vhci_walk_clients():
9374  *		Walker routine to traverse client dev_info nodes
9375  * ddi_walk_devs(ddi_get_child(vdip), f, arg) returns the entire tree
9376  * below the client, including nexus devices, which we dont want.
9377  * So we just traverse the immediate siblings, starting from 1st client.
9378  */
9379 void
9380 mdi_vhci_walk_clients(dev_info_t *vdip,
9381     int (*f)(dev_info_t *, void *), void *arg)
9382 {
9383 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9384 	dev_info_t	*cdip;
9385 	mdi_client_t	*ct;
9386 
9387 	MDI_VHCI_CLIENT_LOCK(vh);
9388 	cdip = ddi_get_child(vdip);
9389 	while (cdip) {
9390 		ct = i_devi_get_client(cdip);
9391 		MDI_CLIENT_LOCK(ct);
9392 
9393 		if (((*f)(cdip, arg)) == DDI_WALK_CONTINUE)
9394 			cdip = ddi_get_next_sibling(cdip);
9395 		else
9396 			cdip = NULL;
9397 
9398 		MDI_CLIENT_UNLOCK(ct);
9399 	}
9400 	MDI_VHCI_CLIENT_UNLOCK(vh);
9401 }
9402 
9403 /*
9404  * mdi_vhci_walk_phcis():
9405  *		Walker routine to traverse phci dev_info nodes
9406  */
9407 void
9408 mdi_vhci_walk_phcis(dev_info_t *vdip,
9409     int (*f)(dev_info_t *, void *), void *arg)
9410 {
9411 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9412 	mdi_phci_t	*ph, *next;
9413 
9414 	MDI_VHCI_PHCI_LOCK(vh);
9415 	ph = vh->vh_phci_head;
9416 	while (ph) {
9417 		MDI_PHCI_LOCK(ph);
9418 
9419 		if (((*f)(ph->ph_dip, arg)) == DDI_WALK_CONTINUE)
9420 			next = ph->ph_next;
9421 		else
9422 			next = NULL;
9423 
9424 		MDI_PHCI_UNLOCK(ph);
9425 		ph = next;
9426 	}
9427 	MDI_VHCI_PHCI_UNLOCK(vh);
9428 }
9429 
9430 
9431 /*
9432  * mdi_walk_vhcis():
9433  *		Walker routine to traverse vhci dev_info nodes
9434  */
9435 void
9436 mdi_walk_vhcis(int (*f)(dev_info_t *, void *), void *arg)
9437 {
9438 	mdi_vhci_t	*vh = NULL;
9439 
9440 	mutex_enter(&mdi_mutex);
9441 	/*
9442 	 * Scan for already registered vhci
9443 	 */
9444 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9445 		vh->vh_refcnt++;
9446 		mutex_exit(&mdi_mutex);
9447 		if (((*f)(vh->vh_dip, arg)) != DDI_WALK_CONTINUE) {
9448 			mutex_enter(&mdi_mutex);
9449 			vh->vh_refcnt--;
9450 			break;
9451 		} else {
9452 			mutex_enter(&mdi_mutex);
9453 			vh->vh_refcnt--;
9454 		}
9455 	}
9456 
9457 	mutex_exit(&mdi_mutex);
9458 }
9459 
9460 /*
9461  * i_mdi_log_sysevent():
9462  *		Logs events for pickup by syseventd
9463  */
9464 static void
9465 i_mdi_log_sysevent(dev_info_t *dip, char *ph_vh_class, char *subclass)
9466 {
9467 	char		*path_name;
9468 	nvlist_t	*attr_list;
9469 
9470 	if (nvlist_alloc(&attr_list, NV_UNIQUE_NAME_TYPE,
9471 	    KM_SLEEP) != DDI_SUCCESS) {
9472 		goto alloc_failed;
9473 	}
9474 
9475 	path_name = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
9476 	(void) ddi_pathname(dip, path_name);
9477 
9478 	if (nvlist_add_string(attr_list, DDI_DRIVER_NAME,
9479 	    ddi_driver_name(dip)) != DDI_SUCCESS) {
9480 		goto error;
9481 	}
9482 
9483 	if (nvlist_add_int32(attr_list, DDI_DRIVER_MAJOR,
9484 	    (int32_t)ddi_driver_major(dip)) != DDI_SUCCESS) {
9485 		goto error;
9486 	}
9487 
9488 	if (nvlist_add_int32(attr_list, DDI_INSTANCE,
9489 	    (int32_t)ddi_get_instance(dip)) != DDI_SUCCESS) {
9490 		goto error;
9491 	}
9492 
9493 	if (nvlist_add_string(attr_list, DDI_PATHNAME,
9494 	    path_name) != DDI_SUCCESS) {
9495 		goto error;
9496 	}
9497 
9498 	if (nvlist_add_string(attr_list, DDI_CLASS,
9499 	    ph_vh_class) != DDI_SUCCESS) {
9500 		goto error;
9501 	}
9502 
9503 	(void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW, EC_DDI, subclass,
9504 	    attr_list, NULL, DDI_SLEEP);
9505 
9506 error:
9507 	kmem_free(path_name, MAXPATHLEN);
9508 	nvlist_free(attr_list);
9509 	return;
9510 
9511 alloc_failed:
9512 	MDI_DEBUG(1, (MDI_WARN, dip, "!unable to send sysevent"));
9513 }
9514 
9515 char **
9516 mdi_get_phci_driver_list(char *vhci_class, int	*ndrivers)
9517 {
9518 	char	**driver_list, **ret_driver_list = NULL;
9519 	int	*root_support_list;
9520 	int	cur_elements, max_elements;
9521 
9522 	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
9523 	    &cur_elements, &max_elements);
9524 
9525 
9526 	if (driver_list) {
9527 		kmem_free(root_support_list, sizeof (int) * max_elements);
9528 		ret_driver_list = mdi_realloc(driver_list, sizeof (char *)
9529 		    * max_elements, sizeof (char *) * cur_elements);
9530 	}
9531 	*ndrivers = cur_elements;
9532 
9533 	return (ret_driver_list);
9534 
9535 }
9536 
9537 void
9538 mdi_free_phci_driver_list(char **driver_list, int ndrivers)
9539 {
9540 	char	**p;
9541 	int	i;
9542 
9543 	if (driver_list) {
9544 		for (i = 0, p = driver_list; i < ndrivers; i++, p++)
9545 			kmem_free(*p, strlen(*p) + 1);
9546 		kmem_free(driver_list, sizeof (char *) * ndrivers);
9547 	}
9548 }
9549 
9550 /*
9551  * mdi_is_dev_supported():
9552  *		function called by pHCI bus config operation to determine if a
9553  *		device should be represented as a child of the vHCI or the
9554  *		pHCI.  This decision is made by the vHCI, using cinfo idenity
9555  *		information passed by the pHCI - specifics of the cinfo
9556  *		representation are by agreement between the pHCI and vHCI.
9557  * Return Values:
9558  *		MDI_SUCCESS
9559  *		MDI_FAILURE
9560  */
9561 int
9562 mdi_is_dev_supported(char *class, dev_info_t *pdip, void *cinfo)
9563 {
9564 	mdi_vhci_t	*vh;
9565 
9566 	ASSERT(class && pdip);
9567 
9568 	/*
9569 	 * For dev_supported, mdi_phci_register() must have established pdip as
9570 	 * a pHCI.
9571 	 *
9572 	 * NOTE: mdi_phci_register() does "mpxio-disable" processing, and
9573 	 * MDI_PHCI(pdip) will return false if mpxio is disabled.
9574 	 */
9575 	if (!MDI_PHCI(pdip))
9576 		return (MDI_FAILURE);
9577 
9578 	/* Return MDI_FAILURE if vHCI does not support asking the question. */
9579 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
9580 	if ((vh == NULL) || (vh->vh_ops->vo_is_dev_supported == NULL)) {
9581 		return (MDI_FAILURE);
9582 	}
9583 
9584 	/* Return vHCI answer */
9585 	return (vh->vh_ops->vo_is_dev_supported(vh->vh_dip, pdip, cinfo));
9586 }
9587 
9588 int
9589 mdi_dc_return_dev_state(mdi_pathinfo_t *pip, struct devctl_iocdata *dcp)
9590 {
9591 	uint_t devstate = 0;
9592 	dev_info_t *cdip;
9593 
9594 	if ((pip == NULL) || (dcp == NULL))
9595 		return (MDI_FAILURE);
9596 
9597 	cdip = mdi_pi_get_client(pip);
9598 
9599 	switch (mdi_pi_get_state(pip)) {
9600 	case MDI_PATHINFO_STATE_INIT:
9601 		devstate = DEVICE_DOWN;
9602 		break;
9603 	case MDI_PATHINFO_STATE_ONLINE:
9604 		devstate = DEVICE_ONLINE;
9605 		if ((cdip) && (devi_stillreferenced(cdip) == DEVI_REFERENCED))
9606 			devstate |= DEVICE_BUSY;
9607 		break;
9608 	case MDI_PATHINFO_STATE_STANDBY:
9609 		devstate = DEVICE_ONLINE;
9610 		break;
9611 	case MDI_PATHINFO_STATE_FAULT:
9612 		devstate = DEVICE_DOWN;
9613 		break;
9614 	case MDI_PATHINFO_STATE_OFFLINE:
9615 		devstate = DEVICE_OFFLINE;
9616 		break;
9617 	default:
9618 		ASSERT(MDI_PI(pip)->pi_state);
9619 	}
9620 
9621 	if (copyout(&devstate, dcp->cpyout_buf, sizeof (uint_t)) != 0)
9622 		return (MDI_FAILURE);
9623 
9624 	return (MDI_SUCCESS);
9625 }
9626