xref: /illumos-gate/usr/src/uts/common/os/sunmdi.c (revision ca4eed8b351c42874d1c1d9360d832914a0ffd1b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more
28  * detailed discussion of the overall mpxio architecture.
29  *
30  * Default locking order:
31  *
32  * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_phci_mutex))
33  * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_client_mutex))
34  * _NOTE(LOCK_ORDER(mdi_vhci::vh_phci_mutex mdi_phci::ph_mutex))
35  * _NOTE(LOCK_ORDER(mdi_vhci::vh_client_mutex mdi_client::ct_mutex))
36  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
37  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
38  * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
39  */
40 
41 #include <sys/note.h>
42 #include <sys/types.h>
43 #include <sys/varargs.h>
44 #include <sys/param.h>
45 #include <sys/errno.h>
46 #include <sys/uio.h>
47 #include <sys/buf.h>
48 #include <sys/modctl.h>
49 #include <sys/open.h>
50 #include <sys/kmem.h>
51 #include <sys/poll.h>
52 #include <sys/conf.h>
53 #include <sys/bootconf.h>
54 #include <sys/cmn_err.h>
55 #include <sys/stat.h>
56 #include <sys/ddi.h>
57 #include <sys/sunddi.h>
58 #include <sys/ddipropdefs.h>
59 #include <sys/sunndi.h>
60 #include <sys/ndi_impldefs.h>
61 #include <sys/promif.h>
62 #include <sys/sunmdi.h>
63 #include <sys/mdi_impldefs.h>
64 #include <sys/taskq.h>
65 #include <sys/epm.h>
66 #include <sys/sunpm.h>
67 #include <sys/modhash.h>
68 #include <sys/disp.h>
69 #include <sys/autoconf.h>
70 #include <sys/sysmacros.h>
71 
#ifdef	DEBUG
#include <sys/debug.h>
int	mdi_debug = 1;
int	mdi_debug_logonly = 0;
/*
 * MDI_DEBUG(level, stmnt):
 *	Debug logging hook.  'stmnt' is a complete, parenthesized argument
 *	list for i_mdi_log(); it is only evaluated when mdi_debug >= level.
 *
 *	The expansion is wrapped in do { } while (0) so that the macro is a
 *	single statement: a bare 'if' expansion would silently capture an
 *	'else' that follows the macro invocation.  Every invocation must be
 *	terminated with a semicolon.
 */
#define	MDI_DEBUG(level, stmnt) \
	do { \
		if (mdi_debug >= (level)) \
			i_mdi_log stmnt; \
	_NOTE(CONSTCOND) } while (0)
static void i_mdi_log(int, dev_info_t *, const char *fmt, ...);
#else	/* !DEBUG */
/* Debug logging compiles away entirely in non-DEBUG builds. */
#define	MDI_DEBUG(level, stmnt)
#endif	/* DEBUG */
82 
83 extern pri_t	minclsyspri;	/* priority used for the mdi_taskq threads */
84 extern int	modrootloaded;
85 
86 /*
87  * Global mutex:
88  * Protects vHCI list and structure members.
89  */
90 kmutex_t	mdi_mutex;
91 
92 /*
93  * Registered vHCI class driver list; updated with mdi_mutex held.
94  */
95 int		mdi_vhci_count;		/* number of vHCIs on the list */
96 mdi_vhci_t	*mdi_vhci_head;		/* first registered vHCI */
97 mdi_vhci_t	*mdi_vhci_tail;		/* last vHCI; new entries append here */
98 
99 /*
100  * Client Hash Table size (entries allocated per vHCI at registration)
101  */
102 static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;
103 
104 /*
105  * taskq interface definitions; consumed by taskq_create() in i_mdi_init()
106  */
107 #define	MDI_TASKQ_N_THREADS	8	/* default for mdi_taskq_n_threads */
108 #define	MDI_TASKQ_PRI		minclsyspri
109 #define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
110 #define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)
111 
112 taskq_t				*mdi_taskq;	/* created once in i_mdi_init() */
113 static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;
114 
115 #define	TICKS_PER_SECOND	(drv_usectohz(1000000))	/* 1000000 usec in ticks */
116 
117 /*
118  * The data should be "quiet" for this interval (in seconds) before the
119  * vhci cached data is flushed to disk.
120  */
121 static int mdi_vhcache_flush_delay = 10;
122 
123 /* number of seconds the vhcache flush daemon will sleep idle before exiting */
124 static int mdi_vhcache_flush_daemon_idle_time = 60;
125 
126 /*
127  * MDI falls back to discovery of all paths when a bus_config_one fails.
128  * The following parameters can be used to tune this operation.
129  *
130  * mdi_path_discovery_boot
131  *	Number of times path discovery will be attempted during early boot.
132  *	Probably there is no reason to ever set this value to greater than one.
133  *
134  * mdi_path_discovery_postboot
135  *	Number of times path discovery will be attempted after early boot.
136  *	Set it to a minimum of two to allow for discovery of iSCSI paths which
137  *	may happen very late during booting.
138  *
139  * mdi_path_discovery_interval
140  *	Minimum number of seconds MDI will wait between successive discovery
141  *	of all paths. Set it to -1 to disable discovery of all paths.
142  */
143 static int mdi_path_discovery_boot = 1;
144 static int mdi_path_discovery_postboot = 2;
145 static int mdi_path_discovery_interval = 10;
146 
147 /*
148  * number of seconds the asynchronous configuration thread will sleep idle
149  * before exiting.
150  */
151 static int mdi_async_config_idle_time = 600;
152 
153 static int mdi_bus_config_cache_hash_size = 256;	/* presumably a hash table size for a bus config cache - TODO confirm at its use */
154 
155 /* turns off multithreaded configuration for certain operations */
156 static int mdi_mtc_off = 0;
157 
158 /*
159  * The "path" to a pathinfo node is identical to the /devices path to a
160  * devinfo node had the device been enumerated under a pHCI instead of
161  * a vHCI.  This pathinfo "path" is associated with a 'path_instance'.
162  * This association persists across create/delete of the pathinfo nodes,
163  * but not across reboot.  The mutex and both maps are set up in i_mdi_init().
164  */
165 static uint_t		mdi_pathmap_instance = 1;	/* 0 -> any path */
166 static int		mdi_pathmap_hash_size = 256;	/* size given to both maps */
167 static kmutex_t		mdi_pathmap_mutex;	/* presumably guards the maps - TODO confirm */
168 static mod_hash_t	*mdi_pathmap_bypath;		/* "path"->instance */
169 static mod_hash_t	*mdi_pathmap_byinstance;	/* instance->"path" */
170 
171 /*
172  * MDI component property name/value string definitions
173  */
174 const char 		*mdi_component_prop = "mpxio-component";
175 const char		*mdi_component_prop_vhci = "vhci";
176 const char		*mdi_component_prop_phci = "phci";
177 const char		*mdi_component_prop_client = "client";
178 
179 /*
180  * MDI client global unique identifier property name
181  */
182 const char		*mdi_client_guid_prop = "client-guid";
183 
184 /*
185  * MDI client load balancing property name/value string definitions
186  */
187 const char		*mdi_load_balance = "load-balance";
188 const char		*mdi_load_balance_none = "none";
189 const char		*mdi_load_balance_rr = "round-robin";
190 const char		*mdi_load_balance_lba = "logical-block";
191 
192 /*
193  * Obsolete vHCI class definition; to be removed after Leadville update
194  */
195 const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;
196 
197 static char vhci_greeting[] =	/* cmn_err() format used by mdi_vhci_register(); %s is the class */
198 	"\tThere already exists one vHCI driver for class %s\n"
199 	"\tOnly one vHCI driver for each class is allowed\n";
200 
201 /*
202  * Static function prototypes
203  */
204 static int		i_mdi_phci_offline(dev_info_t *, uint_t);
205 static int		i_mdi_client_offline(dev_info_t *, uint_t);
206 static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
207 static void		i_mdi_phci_post_detach(dev_info_t *,
208 			    ddi_detach_cmd_t, int);
209 static int		i_mdi_client_pre_detach(dev_info_t *,
210 			    ddi_detach_cmd_t);
211 static void		i_mdi_client_post_detach(dev_info_t *,
212 			    ddi_detach_cmd_t, int);
213 static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
214 static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
215 static int 		i_mdi_lba_lb(mdi_client_t *ct,
216 			    mdi_pathinfo_t **ret_pip, struct buf *buf);
217 static void		i_mdi_pm_hold_client(mdi_client_t *, int);
218 static void		i_mdi_pm_rele_client(mdi_client_t *, int);
219 static void		i_mdi_pm_reset_client(mdi_client_t *);
220 static int		i_mdi_power_all_phci(mdi_client_t *);
221 static void		i_mdi_log_sysevent(dev_info_t *, char *, char *);
222 
223 
224 /*
225  * Internal mdi_pathinfo node functions
226  */
227 static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);
228 
229 static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
230 static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
231 static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
232 static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
233 static void		i_mdi_phci_unlock(mdi_phci_t *);
234 static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
235 static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
236 static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
237 static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
238 			    mdi_client_t *);
239 static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
240 static void		i_mdi_client_remove_path(mdi_client_t *,
241 			    mdi_pathinfo_t *);
242 
243 static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
244 			    mdi_pathinfo_state_t, int);
245 static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
246 static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
247 			    char **, int);
248 static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
249 static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
250 static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
251 static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
252 static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
253 static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
254 static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
255 static void		i_mdi_client_update_state(mdi_client_t *);
256 static int		i_mdi_client_compute_state(mdi_client_t *,
257 			    mdi_phci_t *);
258 static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
259 static void		i_mdi_client_unlock(mdi_client_t *);
260 static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
261 static mdi_client_t	*i_devi_get_client(dev_info_t *);
262 /*
263  * NOTE: this will be removed once the NWS files are changed to use the new
264  * mdi_{enable,disable}_path interfaces
265  */
266 static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *,
267 				int, int);
268 static mdi_pathinfo_t 	*i_mdi_enable_disable_path(mdi_pathinfo_t *pip,
269 				mdi_vhci_t *vh, int flags, int op);
270 /*
271  * Failover related function prototypes
272  */
273 static int		i_mdi_failover(void *);
274 
275 /*
276  * misc internal functions
277  */
278 static int		i_mdi_get_hash_key(char *);
279 static int		i_map_nvlist_error_to_mdi(int);
280 static void		i_mdi_report_path_state(mdi_client_t *,
281 			    mdi_pathinfo_t *);
282 
283 static void		setup_vhci_cache(mdi_vhci_t *);
284 static int		destroy_vhci_cache(mdi_vhci_t *);
285 static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
286 static boolean_t	stop_vhcache_flush_thread(void *, int);
287 static void		free_string_array(char **, int);
288 static void		free_vhcache_phci(mdi_vhcache_phci_t *);
289 static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
290 static void		free_vhcache_client(mdi_vhcache_client_t *);
291 static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
292 static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
293 static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
294 static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
295 static void		vhcache_pi_add(mdi_vhci_config_t *,
296 			    struct mdi_pathinfo *);
297 static void		vhcache_pi_remove(mdi_vhci_config_t *,
298 			    struct mdi_pathinfo *);
299 static void		free_phclient_path_list(mdi_phys_path_t *);
300 static void		sort_vhcache_paths(mdi_vhcache_client_t *);
301 static int		flush_vhcache(mdi_vhci_config_t *, int);
302 static void		vhcache_dirty(mdi_vhci_config_t *);
303 static void		free_async_client_config(mdi_async_client_config_t *);
304 static void		single_threaded_vhconfig_enter(mdi_vhci_config_t *);
305 static void		single_threaded_vhconfig_exit(mdi_vhci_config_t *);
306 static nvlist_t		*read_on_disk_vhci_cache(char *);
307 extern int		fread_nvlist(char *, nvlist_t **);
308 extern int		fwrite_nvlist(char *, nvlist_t *);
309 
310 /* called once when first vhci registers with mdi */
311 static void
312 i_mdi_init()
313 {
314 	static int initialized = 0;
315 
316 	if (initialized)
317 		return;
318 	initialized = 1;
319 
320 	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);
321 
322 	/* Create our taskq resources */
323 	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
324 	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
325 	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
326 	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */
327 
328 	/* Allocate ['path_instance' <-> "path"] maps */
329 	mutex_init(&mdi_pathmap_mutex, NULL, MUTEX_DRIVER, NULL);
330 	mdi_pathmap_bypath = mod_hash_create_strhash(
331 	    "mdi_pathmap_bypath", mdi_pathmap_hash_size,
332 	    mod_hash_null_valdtor);
333 	mdi_pathmap_byinstance = mod_hash_create_idhash(
334 	    "mdi_pathmap_byinstance", mdi_pathmap_hash_size,
335 	    mod_hash_null_valdtor);
336 }
337 
338 /*
339  * mdi_get_component_type():
340  *		Return mpxio component type
341  * Return Values:
342  *		MDI_COMPONENT_NONE
343  *		MDI_COMPONENT_VHCI
344  *		MDI_COMPONENT_PHCI
345  *		MDI_COMPONENT_CLIENT
346  * XXX This doesn't work under multi-level MPxIO and should be
347  *	removed when clients migrate mdi_component_is_*() interfaces.
348  */
349 int
350 mdi_get_component_type(dev_info_t *dip)
351 {
352 	return (DEVI(dip)->devi_mdi_component);
353 }
354 
355 /*
356  * mdi_vhci_register():
357  *		Register a vHCI module with the mpxio framework
358  *		mdi_vhci_register() is called by vHCI drivers to register the
359  *		'class_driver' vHCI driver and its MDI entrypoints with the
360  *		mpxio framework.  The vHCI driver must call this interface as
361  *		part of its attach(9e) handler.
362  *		Competing threads may try to attach mdi_vhci_register() as
363  *		the vHCI drivers are loaded and attached as a result of pHCI
364  *		driver instance registration (mdi_phci_register()) with the
365  *		framework.
366  * Return Values:
367  *		MDI_SUCCESS
368  *		MDI_FAILURE
369  */
370 /*ARGSUSED*/
371 int
372 mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
373     int flags)
374 {
375 	mdi_vhci_t		*vh = NULL;
376 
377 	ASSERT(vops->vo_revision == MDI_VHCI_OPS_REV);
378 #ifdef DEBUG
379 	/*
380 	 * IB nexus driver is loaded only when IB hardware is present.
381 	 * In order to be able to do this there is a need to drive the loading
382 	 * and attaching of the IB nexus driver (especially when an IB hardware
383 	 * is dynamically plugged in) when an IB HCA driver (PHCI)
384 	 * is being attached. Unfortunately this gets into the limitations
385 	 * of devfs as there seems to be no clean way to drive configuration
386 	 * of a subtree from another subtree of a devfs. Hence, do not ASSERT
387 	 * for IB.
388 	 */
389 	if (strcmp(class, MDI_HCI_CLASS_IB) != 0)
390 		ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
391 #endif
392 
393 	i_mdi_init();
394 
395 	mutex_enter(&mdi_mutex);
396 	/*
397 	 * Scan for already registered vhci
398 	 */
399 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
400 		if (strcmp(vh->vh_class, class) == 0) {
401 			/*
402 			 * vHCI has already been created.  Check for valid
403 			 * vHCI ops registration.  We only support one vHCI
404 			 * module per class
405 			 */
406 			if (vh->vh_ops != NULL) {
407 				mutex_exit(&mdi_mutex);
408 				cmn_err(CE_NOTE, vhci_greeting, class);
409 				return (MDI_FAILURE);
410 			}
411 			break;
412 		}
413 	}
414 
415 	/*
416 	 * if not yet created, create the vHCI component
417 	 */
418 	if (vh == NULL) {
419 		struct client_hash	*hash = NULL;
420 		char			*load_balance;
421 
422 		/*
423 		 * Allocate and initialize the mdi extensions
424 		 */
425 		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
426 		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
427 		    KM_SLEEP);
428 		vh->vh_client_table = hash;
429 		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
430 		(void) strcpy(vh->vh_class, class);
431 		vh->vh_lb = LOAD_BALANCE_RR;
432 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
433 		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
434 			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
435 				vh->vh_lb = LOAD_BALANCE_NONE;
436 			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
437 				    == 0) {
438 				vh->vh_lb = LOAD_BALANCE_LBA;
439 			}
440 			ddi_prop_free(load_balance);
441 		}
442 
443 		mutex_init(&vh->vh_phci_mutex, NULL, MUTEX_DEFAULT, NULL);
444 		mutex_init(&vh->vh_client_mutex, NULL, MUTEX_DEFAULT, NULL);
445 
446 		/*
447 		 * Store the vHCI ops vectors
448 		 */
449 		vh->vh_dip = vdip;
450 		vh->vh_ops = vops;
451 
452 		setup_vhci_cache(vh);
453 
454 		if (mdi_vhci_head == NULL) {
455 			mdi_vhci_head = vh;
456 		}
457 		if (mdi_vhci_tail) {
458 			mdi_vhci_tail->vh_next = vh;
459 		}
460 		mdi_vhci_tail = vh;
461 		mdi_vhci_count++;
462 	}
463 
464 	/*
465 	 * Claim the devfs node as a vhci component
466 	 */
467 	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;
468 
469 	/*
470 	 * Initialize our back reference from dev_info node
471 	 */
472 	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
473 	mutex_exit(&mdi_mutex);
474 	return (MDI_SUCCESS);
475 }
476 
477 /*
478  * mdi_vhci_unregister():
479  *		Unregister a vHCI module from mpxio framework
480  *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
481  * 		of a vhci to unregister it from the framework.
482  * Return Values:
483  *		MDI_SUCCESS
484  *		MDI_FAILURE
485  */
486 /*ARGSUSED*/
487 int
488 mdi_vhci_unregister(dev_info_t *vdip, int flags)
489 {
490 	mdi_vhci_t	*found, *vh, *prev = NULL;
491 
492 	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
493 
494 	/*
495 	 * Check for invalid VHCI
496 	 */
497 	if ((vh = i_devi_get_vhci(vdip)) == NULL)
498 		return (MDI_FAILURE);
499 
500 	/*
501 	 * Scan the list of registered vHCIs for a match
502 	 */
503 	mutex_enter(&mdi_mutex);
504 	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
505 		if (found == vh)
506 			break;
507 		prev = found;
508 	}
509 
510 	if (found == NULL) {
511 		mutex_exit(&mdi_mutex);
512 		return (MDI_FAILURE);
513 	}
514 
515 	/*
516 	 * Check the vHCI, pHCI and client count. All the pHCIs and clients
517 	 * should have been unregistered, before a vHCI can be
518 	 * unregistered.
519 	 */
520 	MDI_VHCI_PHCI_LOCK(vh);
521 	if (vh->vh_refcnt || vh->vh_phci_count || vh->vh_client_count) {
522 		MDI_VHCI_PHCI_UNLOCK(vh);
523 		mutex_exit(&mdi_mutex);
524 		return (MDI_FAILURE);
525 	}
526 	MDI_VHCI_PHCI_UNLOCK(vh);
527 
528 	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
529 		mutex_exit(&mdi_mutex);
530 		return (MDI_FAILURE);
531 	}
532 
533 	/*
534 	 * Remove the vHCI from the global list
535 	 */
536 	if (vh == mdi_vhci_head) {
537 		mdi_vhci_head = vh->vh_next;
538 	} else {
539 		prev->vh_next = vh->vh_next;
540 	}
541 	if (vh == mdi_vhci_tail) {
542 		mdi_vhci_tail = prev;
543 	}
544 	mdi_vhci_count--;
545 	mutex_exit(&mdi_mutex);
546 
547 	vh->vh_ops = NULL;
548 	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
549 	DEVI(vdip)->devi_mdi_xhci = NULL;
550 	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
551 	kmem_free(vh->vh_client_table,
552 	    mdi_client_table_size * sizeof (struct client_hash));
553 	mutex_destroy(&vh->vh_phci_mutex);
554 	mutex_destroy(&vh->vh_client_mutex);
555 
556 	kmem_free(vh, sizeof (mdi_vhci_t));
557 	return (MDI_SUCCESS);
558 }
559 
560 /*
561  * i_mdi_vhci_class2vhci():
562  *		Look for a matching vHCI module given a vHCI class name
563  * Return Values:
564  *		Handle to a vHCI component
565  *		NULL
566  */
567 static mdi_vhci_t *
568 i_mdi_vhci_class2vhci(char *class)
569 {
570 	mdi_vhci_t	*vh = NULL;
571 
572 	ASSERT(!MUTEX_HELD(&mdi_mutex));
573 
574 	mutex_enter(&mdi_mutex);
575 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
576 		if (strcmp(vh->vh_class, class) == 0) {
577 			break;
578 		}
579 	}
580 	mutex_exit(&mdi_mutex);
581 	return (vh);
582 }
583 
584 /*
585  * i_devi_get_vhci():
586  *		Utility function to get the handle to a vHCI component
587  * Return Values:
588  *		Handle to a vHCI component
589  *		NULL
590  */
591 mdi_vhci_t *
592 i_devi_get_vhci(dev_info_t *vdip)
593 {
594 	mdi_vhci_t	*vh = NULL;
595 	if (MDI_VHCI(vdip)) {
596 		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
597 	}
598 	return (vh);
599 }
600 
601 /*
602  * mdi_phci_register():
603  *		Register a pHCI module with mpxio framework
604  *		mdi_phci_register() is called by pHCI drivers to register with
605  *		the mpxio framework and a specific 'class_driver' vHCI.  The
606  *		pHCI driver must call this interface as part of its attach(9e)
607  *		handler.
608  * Return Values:
609  *		MDI_SUCCESS
610  *		MDI_FAILURE
611  */
612 /*ARGSUSED*/
613 int
614 mdi_phci_register(char *class, dev_info_t *pdip, int flags)
615 {
616 	mdi_phci_t		*ph;
617 	mdi_vhci_t		*vh;
618 	char			*data;
619 	char			*pathname;
620 
621 	/*
622 	 * Some subsystems, like fcp, perform pHCI registration from a
623 	 * different thread than the one doing the pHCI attach(9E) - the
624 	 * driver attach code is waiting for this other thread to complete.
625 	 * This means we can only ASSERT DEVI_BUSY_CHANGING of parent
626 	 * (indicating that some thread has done an ndi_devi_enter of parent)
627 	 * not DEVI_BUSY_OWNED (which would indicate that we did the enter).
628 	 */
629 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
630 
631 	pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
632 	(void) ddi_pathname(pdip, pathname);
633 
634 	/*
635 	 * Check for mpxio-disable property. Enable mpxio if the property is
636 	 * missing or not set to "yes".
637 	 * If the property is set to "yes" then emit a brief message.
638 	 */
639 	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
640 	    &data) == DDI_SUCCESS)) {
641 		if (strcmp(data, "yes") == 0) {
642 			MDI_DEBUG(1, (CE_CONT, pdip,
643 			    "?%s (%s%d) multipath capabilities "
644 			    "disabled via %s.conf.\n", pathname,
645 			    ddi_driver_name(pdip), ddi_get_instance(pdip),
646 			    ddi_driver_name(pdip)));
647 			ddi_prop_free(data);
648 			kmem_free(pathname, MAXPATHLEN);
649 			return (MDI_FAILURE);
650 		}
651 		ddi_prop_free(data);
652 	}
653 
654 	kmem_free(pathname, MAXPATHLEN);
655 
656 	/*
657 	 * Search for a matching vHCI
658 	 */
659 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
660 	if (vh == NULL) {
661 		return (MDI_FAILURE);
662 	}
663 
664 	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
665 	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
666 	ph->ph_dip = pdip;
667 	ph->ph_vhci = vh;
668 	ph->ph_next = NULL;
669 	ph->ph_unstable = 0;
670 	ph->ph_vprivate = 0;
671 	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);
672 
673 	MDI_PHCI_LOCK(ph);
674 	MDI_PHCI_SET_POWER_UP(ph);
675 	MDI_PHCI_UNLOCK(ph);
676 	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
677 	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;
678 
679 	vhcache_phci_add(vh->vh_config, ph);
680 
681 	MDI_VHCI_PHCI_LOCK(vh);
682 	if (vh->vh_phci_head == NULL) {
683 		vh->vh_phci_head = ph;
684 	}
685 	if (vh->vh_phci_tail) {
686 		vh->vh_phci_tail->ph_next = ph;
687 	}
688 	vh->vh_phci_tail = ph;
689 	vh->vh_phci_count++;
690 	MDI_VHCI_PHCI_UNLOCK(vh);
691 
692 	i_mdi_log_sysevent(pdip, class, ESC_DDI_INITIATOR_REGISTER);
693 	return (MDI_SUCCESS);
694 }
695 
696 /*
697  * mdi_phci_unregister():
698  *		Unregister a pHCI module from mpxio framework
699  *		mdi_phci_unregister() is called by the pHCI drivers from their
700  *		detach(9E) handler to unregister their instances from the
701  *		framework.
702  * Return Values:
703  *		MDI_SUCCESS
704  *		MDI_FAILURE
705  */
706 /*ARGSUSED*/
707 int
708 mdi_phci_unregister(dev_info_t *pdip, int flags)
709 {
710 	mdi_vhci_t		*vh;
711 	mdi_phci_t		*ph;
712 	mdi_phci_t		*tmp;
713 	mdi_phci_t		*prev = NULL;
714 
715 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
716 
717 	ph = i_devi_get_phci(pdip);
718 	if (ph == NULL) {
719 		MDI_DEBUG(1, (CE_WARN, pdip,
720 		    "!pHCI unregister: Not a valid pHCI"));
721 		return (MDI_FAILURE);
722 	}
723 
724 	vh = ph->ph_vhci;
725 	ASSERT(vh != NULL);
726 	if (vh == NULL) {
727 		MDI_DEBUG(1, (CE_WARN, pdip,
728 		    "!pHCI unregister: Not a valid vHCI"));
729 		return (MDI_FAILURE);
730 	}
731 
732 	MDI_VHCI_PHCI_LOCK(vh);
733 	tmp = vh->vh_phci_head;
734 	while (tmp) {
735 		if (tmp == ph) {
736 			break;
737 		}
738 		prev = tmp;
739 		tmp = tmp->ph_next;
740 	}
741 
742 	if (ph == vh->vh_phci_head) {
743 		vh->vh_phci_head = ph->ph_next;
744 	} else {
745 		prev->ph_next = ph->ph_next;
746 	}
747 
748 	if (ph == vh->vh_phci_tail) {
749 		vh->vh_phci_tail = prev;
750 	}
751 
752 	vh->vh_phci_count--;
753 	MDI_VHCI_PHCI_UNLOCK(vh);
754 
755 	i_mdi_log_sysevent(pdip, ph->ph_vhci->vh_class,
756 	    ESC_DDI_INITIATOR_UNREGISTER);
757 	vhcache_phci_remove(vh->vh_config, ph);
758 	cv_destroy(&ph->ph_unstable_cv);
759 	mutex_destroy(&ph->ph_mutex);
760 	kmem_free(ph, sizeof (mdi_phci_t));
761 	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
762 	DEVI(pdip)->devi_mdi_xhci = NULL;
763 	return (MDI_SUCCESS);
764 }
765 
766 /*
767  * i_devi_get_phci():
768  * 		Utility function to return the phci extensions.
769  */
770 static mdi_phci_t *
771 i_devi_get_phci(dev_info_t *pdip)
772 {
773 	mdi_phci_t	*ph = NULL;
774 	if (MDI_PHCI(pdip)) {
775 		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
776 	}
777 	return (ph);
778 }
779 
780 /*
781  * Single thread mdi entry into devinfo node for modifying its children.
782  * If necessary we perform an ndi_devi_enter of the vHCI before doing
783  * an ndi_devi_enter of 'dip'.  We maintain circular in two parts: one
784  * for the vHCI and one for the pHCI.
785  */
786 void
787 mdi_devi_enter(dev_info_t *phci_dip, int *circular)
788 {
789 	dev_info_t	*vdip;
790 	int		vcircular, pcircular;
791 
792 	/* Verify calling context */
793 	ASSERT(MDI_PHCI(phci_dip));
794 	vdip = mdi_devi_get_vdip(phci_dip);
795 	ASSERT(vdip);			/* A pHCI always has a vHCI */
796 
797 	/*
798 	 * If pHCI is detaching then the framework has already entered the
799 	 * vHCI on a threads that went down the code path leading to
800 	 * detach_node().  This framework enter of the vHCI during pHCI
801 	 * detach is done to avoid deadlock with vHCI power management
802 	 * operations which enter the vHCI and the enter down the path
803 	 * to the pHCI. If pHCI is detaching then we piggyback this calls
804 	 * enter of the vHCI on frameworks vHCI enter that has already
805 	 * occurred - this is OK because we know that the framework thread
806 	 * doing detach is waiting for our completion.
807 	 *
808 	 * We should DEVI_IS_DETACHING under an enter of the parent to avoid
809 	 * race with detach - but we can't do that because the framework has
810 	 * already entered the parent, so we have some complexity instead.
811 	 */
812 	for (;;) {
813 		if (ndi_devi_tryenter(vdip, &vcircular)) {
814 			ASSERT(vcircular != -1);
815 			if (DEVI_IS_DETACHING(phci_dip)) {
816 				ndi_devi_exit(vdip, vcircular);
817 				vcircular = -1;
818 			}
819 			break;
820 		} else if (DEVI_IS_DETACHING(phci_dip)) {
821 			vcircular = -1;
822 			break;
823 		} else {
824 			delay(1);
825 		}
826 	}
827 
828 	ndi_devi_enter(phci_dip, &pcircular);
829 	*circular = (vcircular << 16) | (pcircular & 0xFFFF);
830 }
831 
832 /*
833  * Release mdi_devi_enter or successful mdi_devi_tryenter.
834  */
835 void
836 mdi_devi_exit(dev_info_t *phci_dip, int circular)
837 {
838 	dev_info_t	*vdip;
839 	int		vcircular, pcircular;
840 
841 	/* Verify calling context */
842 	ASSERT(MDI_PHCI(phci_dip));
843 	vdip = mdi_devi_get_vdip(phci_dip);
844 	ASSERT(vdip);			/* A pHCI always has a vHCI */
845 
846 	/* extract two circular recursion values from single int */
847 	pcircular = (short)(circular & 0xFFFF);
848 	vcircular = (short)((circular >> 16) & 0xFFFF);
849 
850 	ndi_devi_exit(phci_dip, pcircular);
851 	if (vcircular != -1)
852 		ndi_devi_exit(vdip, vcircular);
853 }
854 
855 /*
856  * The functions mdi_devi_exit_phci() and mdi_devi_enter_phci() are used
857  * around a pHCI drivers calls to mdi_pi_online/offline, after holding
858  * the pathinfo node via mdi_hold_path/mdi_rele_path, to avoid deadlock
859  * with vHCI power management code during path online/offline.  Each
860  * mdi_devi_exit_phci must have a matching mdi_devi_enter_phci, and both must
861  * occur within the scope of an active mdi_devi_enter that establishes the
862  * circular value.
863  */
864 void
865 mdi_devi_exit_phci(dev_info_t *phci_dip, int circular)
866 {
867 	int		pcircular;
868 
869 	/* Verify calling context */
870 	ASSERT(MDI_PHCI(phci_dip));
871 
872 	pcircular = (short)(circular & 0xFFFF);
873 	ndi_devi_exit(phci_dip, pcircular);
874 }
875 
876 void
877 mdi_devi_enter_phci(dev_info_t *phci_dip, int *circular)
878 {
879 	int		pcircular;
880 
881 	/* Verify calling context */
882 	ASSERT(MDI_PHCI(phci_dip));
883 
884 	ndi_devi_enter(phci_dip, &pcircular);
885 
886 	/* verify matching mdi_devi_exit_phci/mdi_devi_enter_phci use */
887 	ASSERT(pcircular == ((short)(*circular & 0xFFFF)));
888 }
889 
890 /*
891  * mdi_devi_get_vdip():
892  *		given a pHCI dip return vHCI dip
893  */
894 dev_info_t *
895 mdi_devi_get_vdip(dev_info_t *pdip)
896 {
897 	mdi_phci_t	*ph;
898 
899 	ph = i_devi_get_phci(pdip);
900 	if (ph && ph->ph_vhci)
901 		return (ph->ph_vhci->vh_dip);
902 	return (NULL);
903 }
904 
905 /*
906  * mdi_devi_pdip_entered():
907  *		Return 1 if we are vHCI and have done an ndi_devi_enter
908  *		of a pHCI
909  */
910 int
911 mdi_devi_pdip_entered(dev_info_t *vdip)
912 {
913 	mdi_vhci_t	*vh;
914 	mdi_phci_t	*ph;
915 
916 	vh = i_devi_get_vhci(vdip);
917 	if (vh == NULL)
918 		return (0);
919 
920 	MDI_VHCI_PHCI_LOCK(vh);
921 	ph = vh->vh_phci_head;
922 	while (ph) {
923 		if (ph->ph_dip && DEVI_BUSY_OWNED(ph->ph_dip)) {
924 			MDI_VHCI_PHCI_UNLOCK(vh);
925 			return (1);
926 		}
927 		ph = ph->ph_next;
928 	}
929 	MDI_VHCI_PHCI_UNLOCK(vh);
930 	return (0);
931 }
932 
933 /*
934  * mdi_phci_path2devinfo():
935  * 		Utility function to search for a valid phci device given
936  *		the devfs pathname.
937  */
938 dev_info_t *
939 mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
940 {
941 	char		*temp_pathname;
942 	mdi_vhci_t	*vh;
943 	mdi_phci_t	*ph;
944 	dev_info_t 	*pdip = NULL;
945 
946 	vh = i_devi_get_vhci(vdip);
947 	ASSERT(vh != NULL);
948 
949 	if (vh == NULL) {
950 		/*
951 		 * Invalid vHCI component, return failure
952 		 */
953 		return (NULL);
954 	}
955 
956 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
957 	MDI_VHCI_PHCI_LOCK(vh);
958 	ph = vh->vh_phci_head;
959 	while (ph != NULL) {
960 		pdip = ph->ph_dip;
961 		ASSERT(pdip != NULL);
962 		*temp_pathname = '\0';
963 		(void) ddi_pathname(pdip, temp_pathname);
964 		if (strcmp(temp_pathname, pathname) == 0) {
965 			break;
966 		}
967 		ph = ph->ph_next;
968 	}
969 	if (ph == NULL) {
970 		pdip = NULL;
971 	}
972 	MDI_VHCI_PHCI_UNLOCK(vh);
973 	kmem_free(temp_pathname, MAXPATHLEN);
974 	return (pdip);
975 }
976 
977 /*
978  * mdi_phci_get_path_count():
979  * 		get number of path information nodes associated with a given
980  *		pHCI device.
981  */
982 int
983 mdi_phci_get_path_count(dev_info_t *pdip)
984 {
985 	mdi_phci_t	*ph;
986 	int		count = 0;
987 
988 	ph = i_devi_get_phci(pdip);
989 	if (ph != NULL) {
990 		count = ph->ph_path_count;
991 	}
992 	return (count);
993 }
994 
995 /*
996  * i_mdi_phci_lock():
997  *		Lock a pHCI device
998  * Return Values:
999  *		None
1000  * Note:
1001  *		The default locking order is:
1002  *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
1003  *		But there are number of situations where locks need to be
1004  *		grabbed in reverse order.  This routine implements try and lock
1005  *		mechanism depending on the requested parameter option.
1006  */
1007 static void
1008 i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
1009 {
1010 	if (pip) {
1011 		/* Reverse locking is requested. */
1012 		while (MDI_PHCI_TRYLOCK(ph) == 0) {
1013 			/*
1014 			 * tryenter failed. Try to grab again
1015 			 * after a small delay
1016 			 */
1017 			MDI_PI_HOLD(pip);
1018 			MDI_PI_UNLOCK(pip);
1019 			delay(1);
1020 			MDI_PI_LOCK(pip);
1021 			MDI_PI_RELE(pip);
1022 		}
1023 	} else {
1024 		MDI_PHCI_LOCK(ph);
1025 	}
1026 }
1027 
1028 /*
1029  * i_mdi_phci_unlock():
1030  *		Unlock the pHCI component
1031  */
1032 static void
1033 i_mdi_phci_unlock(mdi_phci_t *ph)
1034 {
1035 	MDI_PHCI_UNLOCK(ph);
1036 }
1037 
1038 /*
1039  * i_mdi_devinfo_create():
1040  *		create client device's devinfo node
1041  * Return Values:
1042  *		dev_info
1043  *		NULL
1044  * Notes:
1045  */
1046 static dev_info_t *
1047 i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
1048 	char **compatible, int ncompatible)
1049 {
1050 	dev_info_t *cdip = NULL;
1051 
1052 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1053 
1054 	/* Verify for duplicate entry */
1055 	cdip = i_mdi_devinfo_find(vh, name, guid);
1056 	ASSERT(cdip == NULL);
1057 	if (cdip) {
1058 		cmn_err(CE_WARN,
1059 		    "i_mdi_devinfo_create: client dip %p already exists",
1060 			(void *)cdip);
1061 	}
1062 
1063 	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
1064 	if (cdip == NULL)
1065 		goto fail;
1066 
1067 	/*
1068 	 * Create component type and Global unique identifier
1069 	 * properties
1070 	 */
1071 	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
1072 	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
1073 		goto fail;
1074 	}
1075 
1076 	/* Decorate the node with compatible property */
1077 	if (compatible &&
1078 	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
1079 	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
1080 		goto fail;
1081 	}
1082 
1083 	return (cdip);
1084 
1085 fail:
1086 	if (cdip) {
1087 		(void) ndi_prop_remove_all(cdip);
1088 		(void) ndi_devi_free(cdip);
1089 	}
1090 	return (NULL);
1091 }
1092 
1093 /*
1094  * i_mdi_devinfo_find():
1095  *		Find a matching devinfo node for given client node name
1096  *		and its guid.
1097  * Return Values:
1098  *		Handle to a dev_info node or NULL
1099  */
1100 static dev_info_t *
1101 i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
1102 {
1103 	char			*data;
1104 	dev_info_t 		*cdip = NULL;
1105 	dev_info_t 		*ndip = NULL;
1106 	int			circular;
1107 
1108 	ndi_devi_enter(vh->vh_dip, &circular);
1109 	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
1110 	while ((cdip = ndip) != NULL) {
1111 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1112 
1113 		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
1114 			continue;
1115 		}
1116 
1117 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
1118 		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
1119 		    &data) != DDI_PROP_SUCCESS) {
1120 			continue;
1121 		}
1122 
1123 		if (strcmp(data, guid) != 0) {
1124 			ddi_prop_free(data);
1125 			continue;
1126 		}
1127 		ddi_prop_free(data);
1128 		break;
1129 	}
1130 	ndi_devi_exit(vh->vh_dip, circular);
1131 	return (cdip);
1132 }
1133 
1134 /*
1135  * i_mdi_devinfo_remove():
1136  *		Remove a client device node
1137  */
1138 static int
1139 i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
1140 {
1141 	int	rv = MDI_SUCCESS;
1142 
1143 	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
1144 	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
1145 		rv = ndi_devi_offline(cdip, NDI_DEVI_REMOVE);
1146 		if (rv != NDI_SUCCESS) {
1147 			MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_devinfo_remove:"
1148 			    " failed. cdip = %p\n", (void *)cdip));
1149 		}
1150 		/*
1151 		 * Convert to MDI error code
1152 		 */
1153 		switch (rv) {
1154 		case NDI_SUCCESS:
1155 			rv = MDI_SUCCESS;
1156 			break;
1157 		case NDI_BUSY:
1158 			rv = MDI_BUSY;
1159 			break;
1160 		default:
1161 			rv = MDI_FAILURE;
1162 			break;
1163 		}
1164 	}
1165 	return (rv);
1166 }
1167 
1168 /*
1169  * i_devi_get_client()
1170  *		Utility function to get mpxio component extensions
1171  */
1172 static mdi_client_t *
1173 i_devi_get_client(dev_info_t *cdip)
1174 {
1175 	mdi_client_t	*ct = NULL;
1176 
1177 	if (MDI_CLIENT(cdip)) {
1178 		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
1179 	}
1180 	return (ct);
1181 }
1182 
1183 /*
1184  * i_mdi_is_child_present():
1185  *		Search for the presence of client device dev_info node
1186  */
1187 static int
1188 i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
1189 {
1190 	int		rv = MDI_FAILURE;
1191 	struct dev_info	*dip;
1192 	int		circular;
1193 
1194 	ndi_devi_enter(vdip, &circular);
1195 	dip = DEVI(vdip)->devi_child;
1196 	while (dip) {
1197 		if (dip == DEVI(cdip)) {
1198 			rv = MDI_SUCCESS;
1199 			break;
1200 		}
1201 		dip = dip->devi_sibling;
1202 	}
1203 	ndi_devi_exit(vdip, circular);
1204 	return (rv);
1205 }
1206 
1207 
1208 /*
1209  * i_mdi_client_lock():
1210  *		Grab client component lock
1211  * Return Values:
1212  *		None
1213  * Note:
1214  *		The default locking order is:
1215  *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
1216  *		But there are number of situations where locks need to be
1217  *		grabbed in reverse order.  This routine implements try and lock
1218  *		mechanism depending on the requested parameter option.
1219  */
1220 static void
1221 i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
1222 {
1223 	if (pip) {
1224 		/*
1225 		 * Reverse locking is requested.
1226 		 */
1227 		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
1228 			/*
1229 			 * tryenter failed. Try to grab again
1230 			 * after a small delay
1231 			 */
1232 			MDI_PI_HOLD(pip);
1233 			MDI_PI_UNLOCK(pip);
1234 			delay(1);
1235 			MDI_PI_LOCK(pip);
1236 			MDI_PI_RELE(pip);
1237 		}
1238 	} else {
1239 		MDI_CLIENT_LOCK(ct);
1240 	}
1241 }
1242 
1243 /*
1244  * i_mdi_client_unlock():
1245  *		Unlock a client component
1246  */
1247 static void
1248 i_mdi_client_unlock(mdi_client_t *ct)
1249 {
1250 	MDI_CLIENT_UNLOCK(ct);
1251 }
1252 
1253 /*
1254  * i_mdi_client_alloc():
1255  * 		Allocate and initialize a client structure.  Caller should
1256  *		hold the vhci client lock.
1257  * Return Values:
1258  *		Handle to a client component
1259  */
1260 /*ARGSUSED*/
1261 static mdi_client_t *
1262 i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
1263 {
1264 	mdi_client_t	*ct;
1265 
1266 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1267 
1268 	/*
1269 	 * Allocate and initialize a component structure.
1270 	 */
1271 	ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
1272 	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1273 	ct->ct_hnext = NULL;
1274 	ct->ct_hprev = NULL;
1275 	ct->ct_dip = NULL;
1276 	ct->ct_vhci = vh;
1277 	ct->ct_drvname = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1278 	(void) strcpy(ct->ct_drvname, name);
1279 	ct->ct_guid = kmem_alloc(strlen(lguid) + 1, KM_SLEEP);
1280 	(void) strcpy(ct->ct_guid, lguid);
1281 	ct->ct_cprivate = NULL;
1282 	ct->ct_vprivate = NULL;
1283 	ct->ct_flags = 0;
1284 	ct->ct_state = MDI_CLIENT_STATE_FAILED;
1285 	MDI_CLIENT_LOCK(ct);
1286 	MDI_CLIENT_SET_OFFLINE(ct);
1287 	MDI_CLIENT_SET_DETACH(ct);
1288 	MDI_CLIENT_SET_POWER_UP(ct);
1289 	MDI_CLIENT_UNLOCK(ct);
1290 	ct->ct_failover_flags = 0;
1291 	ct->ct_failover_status = 0;
1292 	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
1293 	ct->ct_unstable = 0;
1294 	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
1295 	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
1296 	ct->ct_lb = vh->vh_lb;
1297 	ct->ct_lb_args =  kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
1298 	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
1299 	ct->ct_path_count = 0;
1300 	ct->ct_path_head = NULL;
1301 	ct->ct_path_tail = NULL;
1302 	ct->ct_path_last = NULL;
1303 
1304 	/*
1305 	 * Add this client component to our client hash queue
1306 	 */
1307 	i_mdi_client_enlist_table(vh, ct);
1308 	return (ct);
1309 }
1310 
1311 /*
1312  * i_mdi_client_enlist_table():
1313  *		Attach the client device to the client hash table. Caller
1314  *		should hold the vhci client lock.
1315  */
1316 static void
1317 i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1318 {
1319 	int 			index;
1320 	struct client_hash	*head;
1321 
1322 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1323 
1324 	index = i_mdi_get_hash_key(ct->ct_guid);
1325 	head = &vh->vh_client_table[index];
1326 	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
1327 	head->ct_hash_head = ct;
1328 	head->ct_hash_count++;
1329 	vh->vh_client_count++;
1330 }
1331 
1332 /*
1333  * i_mdi_client_delist_table():
1334  *		Attach the client device to the client hash table.
1335  *		Caller should hold the vhci client lock.
1336  */
1337 static void
1338 i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1339 {
1340 	int			index;
1341 	char			*guid;
1342 	struct client_hash 	*head;
1343 	mdi_client_t		*next;
1344 	mdi_client_t		*last;
1345 
1346 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1347 
1348 	guid = ct->ct_guid;
1349 	index = i_mdi_get_hash_key(guid);
1350 	head = &vh->vh_client_table[index];
1351 
1352 	last = NULL;
1353 	next = (mdi_client_t *)head->ct_hash_head;
1354 	while (next != NULL) {
1355 		if (next == ct) {
1356 			break;
1357 		}
1358 		last = next;
1359 		next = next->ct_hnext;
1360 	}
1361 
1362 	if (next) {
1363 		head->ct_hash_count--;
1364 		if (last == NULL) {
1365 			head->ct_hash_head = ct->ct_hnext;
1366 		} else {
1367 			last->ct_hnext = ct->ct_hnext;
1368 		}
1369 		ct->ct_hnext = NULL;
1370 		vh->vh_client_count--;
1371 	}
1372 }
1373 
1374 
1375 /*
1376  * i_mdi_client_free():
1377  *		Free a client component
1378  */
1379 static int
1380 i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
1381 {
1382 	int		rv = MDI_SUCCESS;
1383 	int		flags = ct->ct_flags;
1384 	dev_info_t	*cdip;
1385 	dev_info_t	*vdip;
1386 
1387 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1388 
1389 	vdip = vh->vh_dip;
1390 	cdip = ct->ct_dip;
1391 
1392 	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
1393 	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
1394 	DEVI(cdip)->devi_mdi_client = NULL;
1395 
1396 	/*
1397 	 * Clear out back ref. to dev_info_t node
1398 	 */
1399 	ct->ct_dip = NULL;
1400 
1401 	/*
1402 	 * Remove this client from our hash queue
1403 	 */
1404 	i_mdi_client_delist_table(vh, ct);
1405 
1406 	/*
1407 	 * Uninitialize and free the component
1408 	 */
1409 	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
1410 	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
1411 	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
1412 	cv_destroy(&ct->ct_failover_cv);
1413 	cv_destroy(&ct->ct_unstable_cv);
1414 	cv_destroy(&ct->ct_powerchange_cv);
1415 	mutex_destroy(&ct->ct_mutex);
1416 	kmem_free(ct, sizeof (*ct));
1417 
1418 	if (cdip != NULL) {
1419 		MDI_VHCI_CLIENT_UNLOCK(vh);
1420 		(void) i_mdi_devinfo_remove(vdip, cdip, flags);
1421 		MDI_VHCI_CLIENT_LOCK(vh);
1422 	}
1423 	return (rv);
1424 }
1425 
1426 /*
1427  * i_mdi_client_find():
1428  * 		Find the client structure corresponding to a given guid
1429  *		Caller should hold the vhci client lock.
1430  */
1431 static mdi_client_t *
1432 i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
1433 {
1434 	int			index;
1435 	struct client_hash	*head;
1436 	mdi_client_t		*ct;
1437 
1438 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1439 
1440 	index = i_mdi_get_hash_key(guid);
1441 	head = &vh->vh_client_table[index];
1442 
1443 	ct = head->ct_hash_head;
1444 	while (ct != NULL) {
1445 		if (strcmp(ct->ct_guid, guid) == 0 &&
1446 		    (cname == NULL || strcmp(ct->ct_drvname, cname) == 0)) {
1447 			break;
1448 		}
1449 		ct = ct->ct_hnext;
1450 	}
1451 	return (ct);
1452 }
1453 
1454 /*
1455  * i_mdi_client_update_state():
1456  *		Compute and update client device state
1457  * Notes:
1458  *		A client device can be in any of three possible states:
1459  *
1460  *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
1461  *		one online/standby paths. Can tolerate failures.
1462  *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
1463  *		no alternate paths available as standby. A failure on the online
1464  *		would result in loss of access to device data.
1465  *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
1466  *		no paths available to access the device.
1467  */
1468 static void
1469 i_mdi_client_update_state(mdi_client_t *ct)
1470 {
1471 	int state;
1472 
1473 	ASSERT(MDI_CLIENT_LOCKED(ct));
1474 	state = i_mdi_client_compute_state(ct, NULL);
1475 	MDI_CLIENT_SET_STATE(ct, state);
1476 }
1477 
1478 /*
1479  * i_mdi_client_compute_state():
1480  *		Compute client device state
1481  *
1482  *		mdi_phci_t *	Pointer to pHCI structure which should
1483  *				while computing the new value.  Used by
1484  *				i_mdi_phci_offline() to find the new
1485  *				client state after DR of a pHCI.
1486  */
1487 static int
1488 i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
1489 {
1490 	int		state;
1491 	int		online_count = 0;
1492 	int		standby_count = 0;
1493 	mdi_pathinfo_t	*pip, *next;
1494 
1495 	ASSERT(MDI_CLIENT_LOCKED(ct));
1496 	pip = ct->ct_path_head;
1497 	while (pip != NULL) {
1498 		MDI_PI_LOCK(pip);
1499 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1500 		if (MDI_PI(pip)->pi_phci == ph) {
1501 			MDI_PI_UNLOCK(pip);
1502 			pip = next;
1503 			continue;
1504 		}
1505 
1506 		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1507 				== MDI_PATHINFO_STATE_ONLINE)
1508 			online_count++;
1509 		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1510 				== MDI_PATHINFO_STATE_STANDBY)
1511 			standby_count++;
1512 		MDI_PI_UNLOCK(pip);
1513 		pip = next;
1514 	}
1515 
1516 	if (online_count == 0) {
1517 		if (standby_count == 0) {
1518 			state = MDI_CLIENT_STATE_FAILED;
1519 			MDI_DEBUG(2, (CE_NOTE, NULL, "!client state: failed"
1520 			    " ct = %p\n", (void *)ct));
1521 		} else if (standby_count == 1) {
1522 			state = MDI_CLIENT_STATE_DEGRADED;
1523 		} else {
1524 			state = MDI_CLIENT_STATE_OPTIMAL;
1525 		}
1526 	} else if (online_count == 1) {
1527 		if (standby_count == 0) {
1528 			state = MDI_CLIENT_STATE_DEGRADED;
1529 		} else {
1530 			state = MDI_CLIENT_STATE_OPTIMAL;
1531 		}
1532 	} else {
1533 		state = MDI_CLIENT_STATE_OPTIMAL;
1534 	}
1535 	return (state);
1536 }
1537 
1538 /*
1539  * i_mdi_client2devinfo():
1540  *		Utility function
1541  */
1542 dev_info_t *
1543 i_mdi_client2devinfo(mdi_client_t *ct)
1544 {
1545 	return (ct->ct_dip);
1546 }
1547 
1548 /*
1549  * mdi_client_path2_devinfo():
1550  * 		Given the parent devinfo and child devfs pathname, search for
1551  *		a valid devfs node handle.
1552  */
1553 dev_info_t *
1554 mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
1555 {
1556 	dev_info_t 	*cdip = NULL;
1557 	dev_info_t 	*ndip = NULL;
1558 	char		*temp_pathname;
1559 	int		circular;
1560 
1561 	/*
1562 	 * Allocate temp buffer
1563 	 */
1564 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1565 
1566 	/*
1567 	 * Lock parent against changes
1568 	 */
1569 	ndi_devi_enter(vdip, &circular);
1570 	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
1571 	while ((cdip = ndip) != NULL) {
1572 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1573 
1574 		*temp_pathname = '\0';
1575 		(void) ddi_pathname(cdip, temp_pathname);
1576 		if (strcmp(temp_pathname, pathname) == 0) {
1577 			break;
1578 		}
1579 	}
1580 	/*
1581 	 * Release devinfo lock
1582 	 */
1583 	ndi_devi_exit(vdip, circular);
1584 
1585 	/*
1586 	 * Free the temp buffer
1587 	 */
1588 	kmem_free(temp_pathname, MAXPATHLEN);
1589 	return (cdip);
1590 }
1591 
1592 /*
1593  * mdi_client_get_path_count():
1594  * 		Utility function to get number of path information nodes
1595  *		associated with a given client device.
1596  */
1597 int
1598 mdi_client_get_path_count(dev_info_t *cdip)
1599 {
1600 	mdi_client_t	*ct;
1601 	int		count = 0;
1602 
1603 	ct = i_devi_get_client(cdip);
1604 	if (ct != NULL) {
1605 		count = ct->ct_path_count;
1606 	}
1607 	return (count);
1608 }
1609 
1610 
1611 /*
1612  * i_mdi_get_hash_key():
1613  * 		Create a hash using strings as keys
1614  *
1615  */
1616 static int
1617 i_mdi_get_hash_key(char *str)
1618 {
1619 	uint32_t	g, hash = 0;
1620 	char		*p;
1621 
1622 	for (p = str; *p != '\0'; p++) {
1623 		g = *p;
1624 		hash += g;
1625 	}
1626 	return (hash % (CLIENT_HASH_TABLE_SIZE - 1));
1627 }
1628 
1629 /*
1630  * mdi_get_lb_policy():
1631  * 		Get current load balancing policy for a given client device
1632  */
1633 client_lb_t
1634 mdi_get_lb_policy(dev_info_t *cdip)
1635 {
1636 	client_lb_t	lb = LOAD_BALANCE_NONE;
1637 	mdi_client_t	*ct;
1638 
1639 	ct = i_devi_get_client(cdip);
1640 	if (ct != NULL) {
1641 		lb = ct->ct_lb;
1642 	}
1643 	return (lb);
1644 }
1645 
1646 /*
1647  * mdi_set_lb_region_size():
1648  * 		Set current region size for the load-balance
1649  */
1650 int
1651 mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
1652 {
1653 	mdi_client_t	*ct;
1654 	int		rv = MDI_FAILURE;
1655 
1656 	ct = i_devi_get_client(cdip);
1657 	if (ct != NULL && ct->ct_lb_args != NULL) {
1658 		ct->ct_lb_args->region_size = region_size;
1659 		rv = MDI_SUCCESS;
1660 	}
1661 	return (rv);
1662 }
1663 
1664 /*
1665  * mdi_Set_lb_policy():
1666  * 		Set current load balancing policy for a given client device
1667  */
1668 int
1669 mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
1670 {
1671 	mdi_client_t	*ct;
1672 	int		rv = MDI_FAILURE;
1673 
1674 	ct = i_devi_get_client(cdip);
1675 	if (ct != NULL) {
1676 		ct->ct_lb = lb;
1677 		rv = MDI_SUCCESS;
1678 	}
1679 	return (rv);
1680 }
1681 
1682 /*
1683  * mdi_failover():
1684  *		failover function called by the vHCI drivers to initiate
1685  *		a failover operation.  This is typically due to non-availability
1686  *		of online paths to route I/O requests.  Failover can be
1687  *		triggered through user application also.
1688  *
1689  *		The vHCI driver calls mdi_failover() to initiate a failover
1690  *		operation. mdi_failover() calls back into the vHCI driver's
1691  *		vo_failover() entry point to perform the actual failover
1692  *		operation.  The reason for requiring the vHCI driver to
1693  *		initiate failover by calling mdi_failover(), instead of directly
1694  *		executing vo_failover() itself, is to ensure that the mdi
1695  *		framework can keep track of the client state properly.
1696  *		Additionally, mdi_failover() provides as a convenience the
1697  *		option of performing the failover operation synchronously or
1698  *		asynchronously
1699  *
1700  *		Upon successful completion of the failover operation, the
1701  *		paths that were previously ONLINE will be in the STANDBY state,
1702  *		and the newly activated paths will be in the ONLINE state.
1703  *
1704  *		The flags modifier determines whether the activation is done
1705  *		synchronously: MDI_FAILOVER_SYNC
1706  * Return Values:
1707  *		MDI_SUCCESS
1708  *		MDI_FAILURE
1709  *		MDI_BUSY
1710  */
1711 /*ARGSUSED*/
1712 int
1713 mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
1714 {
1715 	int			rv;
1716 	mdi_client_t		*ct;
1717 
1718 	ct = i_devi_get_client(cdip);
1719 	ASSERT(ct != NULL);
1720 	if (ct == NULL) {
1721 		/* cdip is not a valid client device. Nothing more to do. */
1722 		return (MDI_FAILURE);
1723 	}
1724 
1725 	MDI_CLIENT_LOCK(ct);
1726 
1727 	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
1728 		/* A path to the client is being freed */
1729 		MDI_CLIENT_UNLOCK(ct);
1730 		return (MDI_BUSY);
1731 	}
1732 
1733 
1734 	if (MDI_CLIENT_IS_FAILED(ct)) {
1735 		/*
1736 		 * Client is in failed state. Nothing more to do.
1737 		 */
1738 		MDI_CLIENT_UNLOCK(ct);
1739 		return (MDI_FAILURE);
1740 	}
1741 
1742 	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1743 		/*
1744 		 * Failover is already in progress; return BUSY
1745 		 */
1746 		MDI_CLIENT_UNLOCK(ct);
1747 		return (MDI_BUSY);
1748 	}
1749 	/*
1750 	 * Make sure that mdi_pathinfo node state changes are processed.
1751 	 * We do not allow failovers to progress while client path state
1752 	 * changes are in progress
1753 	 */
1754 	if (ct->ct_unstable) {
1755 		if (flags == MDI_FAILOVER_ASYNC) {
1756 			MDI_CLIENT_UNLOCK(ct);
1757 			return (MDI_BUSY);
1758 		} else {
1759 			while (ct->ct_unstable)
1760 				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
1761 		}
1762 	}
1763 
1764 	/*
1765 	 * Client device is in stable state. Before proceeding, perform sanity
1766 	 * checks again.
1767 	 */
1768 	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
1769 	    (!i_ddi_devi_attached(ct->ct_dip))) {
1770 		/*
1771 		 * Client is in failed state. Nothing more to do.
1772 		 */
1773 		MDI_CLIENT_UNLOCK(ct);
1774 		return (MDI_FAILURE);
1775 	}
1776 
1777 	/*
1778 	 * Set the client state as failover in progress.
1779 	 */
1780 	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
1781 	ct->ct_failover_flags = flags;
1782 	MDI_CLIENT_UNLOCK(ct);
1783 
1784 	if (flags == MDI_FAILOVER_ASYNC) {
1785 		/*
1786 		 * Submit the initiate failover request via CPR safe
1787 		 * taskq threads.
1788 		 */
1789 		(void) taskq_dispatch(mdi_taskq, (task_func_t *)i_mdi_failover,
1790 		    ct, KM_SLEEP);
1791 		return (MDI_ACCEPT);
1792 	} else {
1793 		/*
1794 		 * Synchronous failover mode.  Typically invoked from the user
1795 		 * land.
1796 		 */
1797 		rv = i_mdi_failover(ct);
1798 	}
1799 	return (rv);
1800 }
1801 
1802 /*
1803  * i_mdi_failover():
1804  *		internal failover function. Invokes vHCI drivers failover
1805  *		callback function and process the failover status
1806  * Return Values:
1807  *		None
1808  *
1809  * Note: A client device in failover state can not be detached or freed.
1810  */
1811 static int
1812 i_mdi_failover(void *arg)
1813 {
1814 	int		rv = MDI_SUCCESS;
1815 	mdi_client_t	*ct = (mdi_client_t *)arg;
1816 	mdi_vhci_t	*vh = ct->ct_vhci;
1817 
1818 	ASSERT(!MDI_CLIENT_LOCKED(ct));
1819 
1820 	if (vh->vh_ops->vo_failover != NULL) {
1821 		/*
1822 		 * Call vHCI drivers callback routine
1823 		 */
1824 		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
1825 		    ct->ct_failover_flags);
1826 	}
1827 
1828 	MDI_CLIENT_LOCK(ct);
1829 	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);
1830 
1831 	/*
1832 	 * Save the failover return status
1833 	 */
1834 	ct->ct_failover_status = rv;
1835 
1836 	/*
1837 	 * As a result of failover, client status would have been changed.
1838 	 * Update the client state and wake up anyone waiting on this client
1839 	 * device.
1840 	 */
1841 	i_mdi_client_update_state(ct);
1842 
1843 	cv_broadcast(&ct->ct_failover_cv);
1844 	MDI_CLIENT_UNLOCK(ct);
1845 	return (rv);
1846 }
1847 
1848 /*
1849  * Load balancing is logical block.
1850  * IOs within the range described by region_size
1851  * would go on the same path. This would improve the
1852  * performance by cache-hit on some of the RAID devices.
1853  * Search only for online paths(At some point we
1854  * may want to balance across target ports).
1855  * If no paths are found then default to round-robin.
1856  */
1857 static int
1858 i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
1859 {
1860 	int		path_index = -1;
1861 	int		online_path_count = 0;
1862 	int		online_nonpref_path_count = 0;
1863 	int 		region_size = ct->ct_lb_args->region_size;
1864 	mdi_pathinfo_t	*pip;
1865 	mdi_pathinfo_t	*next;
1866 	int		preferred, path_cnt;
1867 
1868 	pip = ct->ct_path_head;
1869 	while (pip) {
1870 		MDI_PI_LOCK(pip);
1871 		if (MDI_PI(pip)->pi_state ==
1872 		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
1873 			online_path_count++;
1874 		} else if (MDI_PI(pip)->pi_state ==
1875 		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
1876 			online_nonpref_path_count++;
1877 		}
1878 		next = (mdi_pathinfo_t *)
1879 		    MDI_PI(pip)->pi_client_link;
1880 		MDI_PI_UNLOCK(pip);
1881 		pip = next;
1882 	}
1883 	/* if found any online/preferred then use this type */
1884 	if (online_path_count > 0) {
1885 		path_cnt = online_path_count;
1886 		preferred = 1;
1887 	} else if (online_nonpref_path_count > 0) {
1888 		path_cnt = online_nonpref_path_count;
1889 		preferred = 0;
1890 	} else {
1891 		path_cnt = 0;
1892 	}
1893 	if (path_cnt) {
1894 		path_index = (bp->b_blkno >> region_size) % path_cnt;
1895 		pip = ct->ct_path_head;
1896 		while (pip && path_index != -1) {
1897 			MDI_PI_LOCK(pip);
1898 			if (path_index == 0 &&
1899 			    (MDI_PI(pip)->pi_state ==
1900 			    MDI_PATHINFO_STATE_ONLINE) &&
1901 				MDI_PI(pip)->pi_preferred == preferred) {
1902 				MDI_PI_HOLD(pip);
1903 				MDI_PI_UNLOCK(pip);
1904 				*ret_pip = pip;
1905 				return (MDI_SUCCESS);
1906 			}
1907 			path_index --;
1908 			next = (mdi_pathinfo_t *)
1909 			    MDI_PI(pip)->pi_client_link;
1910 			MDI_PI_UNLOCK(pip);
1911 			pip = next;
1912 		}
1913 		if (pip == NULL) {
1914 			MDI_DEBUG(4, (CE_NOTE, NULL,
1915 			    "!lba %llx, no pip !!\n",
1916 				bp->b_lblkno));
1917 		} else {
1918 			MDI_DEBUG(4, (CE_NOTE, NULL,
1919 			    "!lba %llx, no pip for path_index, "
1920 			    "pip %p\n", bp->b_lblkno, (void *)pip));
1921 		}
1922 	}
1923 	return (MDI_FAILURE);
1924 }
1925 
1926 /*
1927  * mdi_select_path():
1928  *		select a path to access a client device.
1929  *
1930  *		mdi_select_path() function is called by the vHCI drivers to
1931  *		select a path to route the I/O request to.  The caller passes
1932  *		the block I/O data transfer structure ("buf") as one of the
1933  *		parameters.  The mpxio framework uses the buf structure
1934  *		contents to maintain per path statistics (total I/O size /
1935  *		count pending).  If more than one online paths are available to
1936  *		select, the framework automatically selects a suitable path
1937  *		for routing I/O request. If a failover operation is active for
1938  *		this client device the call shall be failed with MDI_BUSY error
1939  *		code.
1940  *
1941  *		By default this function returns a suitable path in online
1942  *		state based on the current load balancing policy.  Currently
1943  *		we support LOAD_BALANCE_NONE (Previously selected online path
1944  *		will continue to be used till the path is usable) and
1945  *		LOAD_BALANCE_RR (Online paths will be selected in a round
1946  *		robin fashion), LOAD_BALANCE_LB(Online paths will be selected
 *		based on the logical block).  The load balancing policy is
 *		configured through the vHCI driver's configuration file
 *		(driver.conf).
1949  *
1950  *		vHCI drivers may override this default behavior by specifying
 *		appropriate flags.  The meaning of the third argument depends
1952  *		on the flags specified. If MDI_SELECT_PATH_INSTANCE is set
1953  *		then the argument is the "path instance" of the path to select.
1954  *		If MDI_SELECT_PATH_INSTANCE is not set then the argument is
1955  *		"start_pip". A non NULL "start_pip" is the starting point to
1956  *		walk and find the next appropriate path.  The following values
1957  *		are currently defined: MDI_SELECT_ONLINE_PATH (to select an
1958  *		ONLINE path) and/or MDI_SELECT_STANDBY_PATH (to select an
1959  *		STANDBY path).
1960  *
1961  *		The non-standard behavior is used by the scsi_vhci driver,
1962  *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
1963  *		attach of client devices (to avoid an unnecessary failover
1964  *		when the STANDBY path comes up first), during failover
1965  *		(to activate a STANDBY path as ONLINE).
1966  *
 *		The selected path is returned in a mdi_hold_path() state
1968  *		(pi_ref_cnt). Caller should release the hold by calling
1969  *		mdi_rele_path().
1970  *
1971  * Return Values:
1972  *		MDI_SUCCESS	- Completed successfully
1973  *		MDI_BUSY 	- Client device is busy failing over
1974  *		MDI_NOPATH	- Client device is online, but no valid path are
1975  *				  available to access this client device
1976  *		MDI_FAILURE	- Invalid client device or state
1977  *		MDI_DEVI_ONLINING
1978  *				- Client device (struct dev_info state) is in
1979  *				  onlining state.
1980  */
1981 
1982 /*ARGSUSED*/
1983 int
1984 mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
1985     void *arg, mdi_pathinfo_t **ret_pip)
1986 {
1987 	mdi_client_t	*ct;
1988 	mdi_pathinfo_t	*pip;
1989 	mdi_pathinfo_t	*next;
1990 	mdi_pathinfo_t	*head;
1991 	mdi_pathinfo_t	*start;
1992 	client_lb_t	lbp;	/* load balancing policy */
1993 	int		sb = 1;	/* standard behavior */
1994 	int		preferred = 1;	/* preferred path */
1995 	int		cond, cont = 1;
1996 	int		retry = 0;
1997 	mdi_pathinfo_t	*start_pip;	/* request starting pathinfo */
1998 	int		path_instance;	/* request specific path instance */
1999 
2000 	/* determine type of arg based on flags */
2001 	if (flags & MDI_SELECT_PATH_INSTANCE) {
2002 		flags &= ~MDI_SELECT_PATH_INSTANCE;
2003 		path_instance = (int)(intptr_t)arg;
2004 		start_pip = NULL;
2005 	} else {
2006 		path_instance = 0;
2007 		start_pip = (mdi_pathinfo_t *)arg;
2008 	}
2009 
2010 	if (flags != 0) {
2011 		/*
2012 		 * disable default behavior
2013 		 */
2014 		sb = 0;
2015 	}
2016 
2017 	*ret_pip = NULL;
2018 	ct = i_devi_get_client(cdip);
2019 	if (ct == NULL) {
2020 		/* mdi extensions are NULL, Nothing more to do */
2021 		return (MDI_FAILURE);
2022 	}
2023 
2024 	MDI_CLIENT_LOCK(ct);
2025 
2026 	if (sb) {
2027 		if (MDI_CLIENT_IS_FAILED(ct)) {
2028 			/*
2029 			 * Client is not ready to accept any I/O requests.
2030 			 * Fail this request.
2031 			 */
2032 			MDI_DEBUG(2, (CE_NOTE, cdip, "!mdi_select_path: "
2033 			    "client state offline ct = %p\n", (void *)ct));
2034 			MDI_CLIENT_UNLOCK(ct);
2035 			return (MDI_FAILURE);
2036 		}
2037 
2038 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
2039 			/*
2040 			 * Check for Failover is in progress. If so tell the
2041 			 * caller that this device is busy.
2042 			 */
2043 			MDI_DEBUG(2, (CE_NOTE, cdip, "!mdi_select_path: "
2044 			    "client failover in progress ct = %p\n",
2045 			    (void *)ct));
2046 			MDI_CLIENT_UNLOCK(ct);
2047 			return (MDI_BUSY);
2048 		}
2049 
2050 		/*
2051 		 * Check to see whether the client device is attached.
2052 		 * If not so, let the vHCI driver manually select a path
2053 		 * (standby) and let the probe/attach process to continue.
2054 		 */
2055 		if (MDI_CLIENT_IS_DETACHED(ct) || !i_ddi_devi_attached(cdip)) {
2056 			MDI_DEBUG(4, (CE_NOTE, cdip, "!Devi is onlining "
2057 			    "ct = %p\n", (void *)ct));
2058 			MDI_CLIENT_UNLOCK(ct);
2059 			return (MDI_DEVI_ONLINING);
2060 		}
2061 	}
2062 
2063 	/*
2064 	 * Cache in the client list head.  If head of the list is NULL
2065 	 * return MDI_NOPATH
2066 	 */
2067 	head = ct->ct_path_head;
2068 	if (head == NULL) {
2069 		MDI_CLIENT_UNLOCK(ct);
2070 		return (MDI_NOPATH);
2071 	}
2072 
2073 	/* Caller is specifying a specific pathinfo path by path_instance */
2074 	if (path_instance) {
2075 		/* search for pathinfo with correct path_instance */
2076 		for (pip = head;
2077 		    pip && (mdi_pi_get_path_instance(pip) != path_instance);
2078 		    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link)
2079 			;
2080 
2081 		/* If path can't be selected then MDI_FAILURE is returned. */
2082 		if (pip == NULL) {
2083 			MDI_CLIENT_UNLOCK(ct);
2084 			return (MDI_FAILURE);
2085 		}
2086 
2087 		/* verify state of path */
2088 		MDI_PI_LOCK(pip);
2089 		if (MDI_PI(pip)->pi_state != MDI_PATHINFO_STATE_ONLINE) {
2090 			MDI_PI_UNLOCK(pip);
2091 			MDI_CLIENT_UNLOCK(ct);
2092 			return (MDI_FAILURE);
2093 		}
2094 
2095 		/*
2096 		 * Return the path in hold state. Caller should release the
2097 		 * lock by calling mdi_rele_path()
2098 		 */
2099 		MDI_PI_HOLD(pip);
2100 		MDI_PI_UNLOCK(pip);
2101 		ct->ct_path_last = pip;
2102 		*ret_pip = pip;
2103 		MDI_CLIENT_UNLOCK(ct);
2104 		return (MDI_SUCCESS);
2105 	}
2106 
2107 	/*
2108 	 * for non default behavior, bypass current
2109 	 * load balancing policy and always use LOAD_BALANCE_RR
2110 	 * except that the start point will be adjusted based
2111 	 * on the provided start_pip
2112 	 */
2113 	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;
2114 
2115 	switch (lbp) {
2116 	case LOAD_BALANCE_NONE:
2117 		/*
2118 		 * Load balancing is None  or Alternate path mode
2119 		 * Start looking for a online mdi_pathinfo node starting from
2120 		 * last known selected path
2121 		 */
2122 		preferred = 1;
2123 		pip = (mdi_pathinfo_t *)ct->ct_path_last;
2124 		if (pip == NULL) {
2125 			pip = head;
2126 		}
2127 		start = pip;
2128 		do {
2129 			MDI_PI_LOCK(pip);
2130 			/*
2131 			 * No need to explicitly check if the path is disabled.
2132 			 * Since we are checking for state == ONLINE and the
2133 			 * same veriable is used for DISABLE/ENABLE information.
2134 			 */
2135 			if ((MDI_PI(pip)->pi_state  ==
2136 				MDI_PATHINFO_STATE_ONLINE) &&
2137 				preferred == MDI_PI(pip)->pi_preferred) {
2138 				/*
2139 				 * Return the path in hold state. Caller should
2140 				 * release the lock by calling mdi_rele_path()
2141 				 */
2142 				MDI_PI_HOLD(pip);
2143 				MDI_PI_UNLOCK(pip);
2144 				ct->ct_path_last = pip;
2145 				*ret_pip = pip;
2146 				MDI_CLIENT_UNLOCK(ct);
2147 				return (MDI_SUCCESS);
2148 			}
2149 
2150 			/*
2151 			 * Path is busy.
2152 			 */
2153 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2154 			    MDI_PI_IS_TRANSIENT(pip))
2155 				retry = 1;
2156 			/*
2157 			 * Keep looking for a next available online path
2158 			 */
2159 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2160 			if (next == NULL) {
2161 				next = head;
2162 			}
2163 			MDI_PI_UNLOCK(pip);
2164 			pip = next;
2165 			if (start == pip && preferred) {
2166 				preferred = 0;
2167 			} else if (start == pip && !preferred) {
2168 				cont = 0;
2169 			}
2170 		} while (cont);
2171 		break;
2172 
2173 	case LOAD_BALANCE_LBA:
2174 		/*
2175 		 * Make sure we are looking
2176 		 * for an online path. Otherwise, if it is for a STANDBY
2177 		 * path request, it will go through and fetch an ONLINE
2178 		 * path which is not desirable.
2179 		 */
2180 		if ((ct->ct_lb_args != NULL) &&
2181 			    (ct->ct_lb_args->region_size) && bp &&
2182 				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
2183 			if (i_mdi_lba_lb(ct, ret_pip, bp)
2184 				    == MDI_SUCCESS) {
2185 				MDI_CLIENT_UNLOCK(ct);
2186 				return (MDI_SUCCESS);
2187 			}
2188 		}
2189 		/*  FALLTHROUGH */
2190 	case LOAD_BALANCE_RR:
2191 		/*
2192 		 * Load balancing is Round Robin. Start looking for a online
2193 		 * mdi_pathinfo node starting from last known selected path
2194 		 * as the start point.  If override flags are specified,
2195 		 * process accordingly.
2196 		 * If the search is already in effect(start_pip not null),
2197 		 * then lets just use the same path preference to continue the
2198 		 * traversal.
2199 		 */
2200 
2201 		if (start_pip != NULL) {
2202 			preferred = MDI_PI(start_pip)->pi_preferred;
2203 		} else {
2204 			preferred = 1;
2205 		}
2206 
2207 		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
2208 		if (start == NULL) {
2209 			pip = head;
2210 		} else {
2211 			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
2212 			if (pip == NULL) {
2213 				if ( flags & MDI_SELECT_NO_PREFERRED) {
2214 					/*
2215 					 * Return since we hit the end of list
2216 					 */
2217 					MDI_CLIENT_UNLOCK(ct);
2218 					return (MDI_NOPATH);
2219 				}
2220 
2221 				if (!sb) {
2222 					if (preferred == 0) {
2223 						/*
2224 						 * Looks like we have completed
2225 						 * the traversal as preferred
2226 						 * value is 0. Time to bail out.
2227 						 */
2228 						*ret_pip = NULL;
2229 						MDI_CLIENT_UNLOCK(ct);
2230 						return (MDI_NOPATH);
2231 					} else {
2232 						/*
2233 						 * Looks like we reached the
2234 						 * end of the list. Lets enable
2235 						 * traversal of non preferred
2236 						 * paths.
2237 						 */
2238 						preferred = 0;
2239 					}
2240 				}
2241 				pip = head;
2242 			}
2243 		}
2244 		start = pip;
2245 		do {
2246 			MDI_PI_LOCK(pip);
2247 			if (sb) {
2248 				cond = ((MDI_PI(pip)->pi_state ==
2249 				    MDI_PATHINFO_STATE_ONLINE &&
2250 					MDI_PI(pip)->pi_preferred ==
2251 						preferred) ? 1 : 0);
2252 			} else {
2253 				if (flags == MDI_SELECT_ONLINE_PATH) {
2254 					cond = ((MDI_PI(pip)->pi_state ==
2255 					    MDI_PATHINFO_STATE_ONLINE &&
2256 						MDI_PI(pip)->pi_preferred ==
2257 						preferred) ? 1 : 0);
2258 				} else if (flags == MDI_SELECT_STANDBY_PATH) {
2259 					cond = ((MDI_PI(pip)->pi_state ==
2260 					    MDI_PATHINFO_STATE_STANDBY &&
2261 						MDI_PI(pip)->pi_preferred ==
2262 						preferred) ? 1 : 0);
2263 				} else if (flags == (MDI_SELECT_ONLINE_PATH |
2264 				    MDI_SELECT_STANDBY_PATH)) {
2265 					cond = (((MDI_PI(pip)->pi_state ==
2266 					    MDI_PATHINFO_STATE_ONLINE ||
2267 					    (MDI_PI(pip)->pi_state ==
2268 					    MDI_PATHINFO_STATE_STANDBY)) &&
2269 						MDI_PI(pip)->pi_preferred ==
2270 						preferred) ? 1 : 0);
2271 				} else if (flags ==
2272 					(MDI_SELECT_STANDBY_PATH |
2273 					MDI_SELECT_ONLINE_PATH |
2274 					MDI_SELECT_USER_DISABLE_PATH)) {
2275 					cond = (((MDI_PI(pip)->pi_state ==
2276 					    MDI_PATHINFO_STATE_ONLINE ||
2277 					    (MDI_PI(pip)->pi_state ==
2278 					    MDI_PATHINFO_STATE_STANDBY) ||
2279 						(MDI_PI(pip)->pi_state ==
2280 					    (MDI_PATHINFO_STATE_ONLINE|
2281 					    MDI_PATHINFO_STATE_USER_DISABLE)) ||
2282 						(MDI_PI(pip)->pi_state ==
2283 					    (MDI_PATHINFO_STATE_STANDBY |
2284 					    MDI_PATHINFO_STATE_USER_DISABLE)))&&
2285 						MDI_PI(pip)->pi_preferred ==
2286 						preferred) ? 1 : 0);
2287 				} else if (flags ==
2288 				    (MDI_SELECT_STANDBY_PATH |
2289 				    MDI_SELECT_ONLINE_PATH |
2290 				    MDI_SELECT_NO_PREFERRED)) {
2291 					cond = (((MDI_PI(pip)->pi_state ==
2292 					    MDI_PATHINFO_STATE_ONLINE) ||
2293 					    (MDI_PI(pip)->pi_state ==
2294 					    MDI_PATHINFO_STATE_STANDBY))
2295 					    ? 1 : 0);
2296 				} else {
2297 					cond = 0;
2298 				}
2299 			}
2300 			/*
2301 			 * No need to explicitly check if the path is disabled.
2302 			 * Since we are checking for state == ONLINE and the
2303 			 * same veriable is used for DISABLE/ENABLE information.
2304 			 */
2305 			if (cond) {
2306 				/*
2307 				 * Return the path in hold state. Caller should
2308 				 * release the lock by calling mdi_rele_path()
2309 				 */
2310 				MDI_PI_HOLD(pip);
2311 				MDI_PI_UNLOCK(pip);
2312 				if (sb)
2313 					ct->ct_path_last = pip;
2314 				*ret_pip = pip;
2315 				MDI_CLIENT_UNLOCK(ct);
2316 				return (MDI_SUCCESS);
2317 			}
2318 			/*
2319 			 * Path is busy.
2320 			 */
2321 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2322 			    MDI_PI_IS_TRANSIENT(pip))
2323 				retry = 1;
2324 
2325 			/*
2326 			 * Keep looking for a next available online path
2327 			 */
2328 do_again:
2329 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2330 			if (next == NULL) {
2331 				if ( flags & MDI_SELECT_NO_PREFERRED) {
2332 					/*
2333 					 * Bail out since we hit the end of list
2334 					 */
2335 					MDI_PI_UNLOCK(pip);
2336 					break;
2337 				}
2338 
2339 				if (!sb) {
2340 					if (preferred == 1) {
2341 						/*
2342 						 * Looks like we reached the
2343 						 * end of the list. Lets enable
2344 						 * traversal of non preferred
2345 						 * paths.
2346 						 */
2347 						preferred = 0;
2348 						next = head;
2349 					} else {
2350 						/*
2351 						 * We have done both the passes
2352 						 * Preferred as well as for
2353 						 * Non-preferred. Bail out now.
2354 						 */
2355 						cont = 0;
2356 					}
2357 				} else {
2358 					/*
2359 					 * Standard behavior case.
2360 					 */
2361 					next = head;
2362 				}
2363 			}
2364 			MDI_PI_UNLOCK(pip);
2365 			if (cont == 0) {
2366 				break;
2367 			}
2368 			pip = next;
2369 
2370 			if (!sb) {
2371 				/*
2372 				 * We need to handle the selection of
2373 				 * non-preferred path in the following
2374 				 * case:
2375 				 *
2376 				 * +------+   +------+   +------+   +-----+
2377 				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
2378 				 * +------+   +------+   +------+   +-----+
2379 				 *
2380 				 * If we start the search with B, we need to
2381 				 * skip beyond B to pick C which is non -
2382 				 * preferred in the second pass. The following
2383 				 * test, if true, will allow us to skip over
2384 				 * the 'start'(B in the example) to select
2385 				 * other non preferred elements.
2386 				 */
2387 				if ((start_pip != NULL) && (start_pip == pip) &&
2388 				    (MDI_PI(start_pip)->pi_preferred
2389 				    != preferred)) {
2390 					/*
2391 					 * try again after going past the start
2392 					 * pip
2393 					 */
2394 					MDI_PI_LOCK(pip);
2395 					goto do_again;
2396 				}
2397 			} else {
2398 				/*
2399 				 * Standard behavior case
2400 				 */
2401 				if (start == pip && preferred) {
2402 					/* look for nonpreferred paths */
2403 					preferred = 0;
2404 				} else if (start == pip && !preferred) {
2405 					/*
2406 					 * Exit condition
2407 					 */
2408 					cont = 0;
2409 				}
2410 			}
2411 		} while (cont);
2412 		break;
2413 	}
2414 
2415 	MDI_CLIENT_UNLOCK(ct);
2416 	if (retry == 1) {
2417 		return (MDI_BUSY);
2418 	} else {
2419 		return (MDI_NOPATH);
2420 	}
2421 }
2422 
2423 /*
2424  * For a client, return the next available path to any phci
2425  *
2426  * Note:
2427  *		Caller should hold the branch's devinfo node to get a consistent
2428  *		snap shot of the mdi_pathinfo nodes.
2429  *
2430  *		Please note that even the list is stable the mdi_pathinfo
2431  *		node state and properties are volatile.  The caller should lock
2432  *		and unlock the nodes by calling mdi_pi_lock() and
2433  *		mdi_pi_unlock() functions to get a stable properties.
2434  *
2435  *		If there is a need to use the nodes beyond the hold of the
2436  *		devinfo node period (For ex. I/O), then mdi_pathinfo node
2437  *		need to be held against unexpected removal by calling
2438  *		mdi_hold_path() and should be released by calling
2439  *		mdi_rele_path() on completion.
2440  */
2441 mdi_pathinfo_t *
2442 mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
2443 {
2444 	mdi_client_t *ct;
2445 
2446 	if (!MDI_CLIENT(ct_dip))
2447 		return (NULL);
2448 
2449 	/*
2450 	 * Walk through client link
2451 	 */
2452 	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
2453 	ASSERT(ct != NULL);
2454 
2455 	if (pip == NULL)
2456 		return ((mdi_pathinfo_t *)ct->ct_path_head);
2457 
2458 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link);
2459 }
2460 
2461 /*
2462  * For a phci, return the next available path to any client
2463  * Note: ditto mdi_get_next_phci_path()
2464  */
2465 mdi_pathinfo_t *
2466 mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
2467 {
2468 	mdi_phci_t *ph;
2469 
2470 	if (!MDI_PHCI(ph_dip))
2471 		return (NULL);
2472 
2473 	/*
2474 	 * Walk through pHCI link
2475 	 */
2476 	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
2477 	ASSERT(ph != NULL);
2478 
2479 	if (pip == NULL)
2480 		return ((mdi_pathinfo_t *)ph->ph_path_head);
2481 
2482 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
2483 }
2484 
2485 /*
2486  * mdi_hold_path():
2487  *		Hold the mdi_pathinfo node against unwanted unexpected free.
2488  * Return Values:
2489  *		None
2490  */
2491 void
2492 mdi_hold_path(mdi_pathinfo_t *pip)
2493 {
2494 	if (pip) {
2495 		MDI_PI_LOCK(pip);
2496 		MDI_PI_HOLD(pip);
2497 		MDI_PI_UNLOCK(pip);
2498 	}
2499 }
2500 
2501 
2502 /*
2503  * mdi_rele_path():
2504  *		Release the mdi_pathinfo node which was selected
2505  *		through mdi_select_path() mechanism or manually held by
2506  *		calling mdi_hold_path().
2507  * Return Values:
2508  *		None
2509  */
2510 void
2511 mdi_rele_path(mdi_pathinfo_t *pip)
2512 {
2513 	if (pip) {
2514 		MDI_PI_LOCK(pip);
2515 		MDI_PI_RELE(pip);
2516 		if (MDI_PI(pip)->pi_ref_cnt == 0) {
2517 			cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
2518 		}
2519 		MDI_PI_UNLOCK(pip);
2520 	}
2521 }
2522 
2523 /*
2524  * mdi_pi_lock():
2525  * 		Lock the mdi_pathinfo node.
2526  * Note:
2527  *		The caller should release the lock by calling mdi_pi_unlock()
2528  */
2529 void
2530 mdi_pi_lock(mdi_pathinfo_t *pip)
2531 {
2532 	ASSERT(pip != NULL);
2533 	if (pip) {
2534 		MDI_PI_LOCK(pip);
2535 	}
2536 }
2537 
2538 
2539 /*
2540  * mdi_pi_unlock():
2541  * 		Unlock the mdi_pathinfo node.
2542  * Note:
2543  *		The mdi_pathinfo node should have been locked with mdi_pi_lock()
2544  */
2545 void
2546 mdi_pi_unlock(mdi_pathinfo_t *pip)
2547 {
2548 	ASSERT(pip != NULL);
2549 	if (pip) {
2550 		MDI_PI_UNLOCK(pip);
2551 	}
2552 }
2553 
2554 /*
2555  * mdi_pi_find():
2556  *		Search the list of mdi_pathinfo nodes attached to the
2557  *		pHCI/Client device node whose path address matches "paddr".
2558  *		Returns a pointer to the mdi_pathinfo node if a matching node is
2559  *		found.
2560  * Return Values:
2561  *		mdi_pathinfo node handle
2562  *		NULL
2563  * Notes:
2564  *		Caller need not hold any locks to call this function.
2565  */
2566 mdi_pathinfo_t *
2567 mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
2568 {
2569 	mdi_phci_t		*ph;
2570 	mdi_vhci_t		*vh;
2571 	mdi_client_t		*ct;
2572 	mdi_pathinfo_t		*pip = NULL;
2573 
2574 	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: %s %s",
2575 	    caddr ? caddr : "NULL", paddr ? paddr : "NULL"));
2576 	if ((pdip == NULL) || (paddr == NULL)) {
2577 		return (NULL);
2578 	}
2579 	ph = i_devi_get_phci(pdip);
2580 	if (ph == NULL) {
2581 		/*
2582 		 * Invalid pHCI device, Nothing more to do.
2583 		 */
2584 		MDI_DEBUG(2, (CE_WARN, pdip,
2585 		    "!mdi_pi_find: invalid phci"));
2586 		return (NULL);
2587 	}
2588 
2589 	vh = ph->ph_vhci;
2590 	if (vh == NULL) {
2591 		/*
2592 		 * Invalid vHCI device, Nothing more to do.
2593 		 */
2594 		MDI_DEBUG(2, (CE_WARN, pdip,
2595 		    "!mdi_pi_find: invalid vhci"));
2596 		return (NULL);
2597 	}
2598 
2599 	/*
2600 	 * Look for pathinfo node identified by paddr.
2601 	 */
2602 	if (caddr == NULL) {
2603 		/*
2604 		 * Find a mdi_pathinfo node under pHCI list for a matching
2605 		 * unit address.
2606 		 */
2607 		MDI_PHCI_LOCK(ph);
2608 		if (MDI_PHCI_IS_OFFLINE(ph)) {
2609 			MDI_DEBUG(2, (CE_WARN, pdip,
2610 			    "!mdi_pi_find: offline phci %p", (void *)ph));
2611 			MDI_PHCI_UNLOCK(ph);
2612 			return (NULL);
2613 		}
2614 		pip = (mdi_pathinfo_t *)ph->ph_path_head;
2615 
2616 		while (pip != NULL) {
2617 			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2618 				break;
2619 			}
2620 			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
2621 		}
2622 		MDI_PHCI_UNLOCK(ph);
2623 		MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: found %p",
2624 		    (void *)pip));
2625 		return (pip);
2626 	}
2627 
2628 	/*
2629 	 * XXX - Is the rest of the code in this function really necessary?
2630 	 * The consumers of mdi_pi_find() can search for the desired pathinfo
2631 	 * node by calling mdi_pi_find(pdip, NULL, paddr). Irrespective of
2632 	 * whether the search is based on the pathinfo nodes attached to
2633 	 * the pHCI or the client node, the result will be the same.
2634 	 */
2635 
2636 	/*
2637 	 * Find the client device corresponding to 'caddr'
2638 	 */
2639 	MDI_VHCI_CLIENT_LOCK(vh);
2640 
2641 	/*
2642 	 * XXX - Passing NULL to the following function works as long as the
2643 	 * the client addresses (caddr) are unique per vhci basis.
2644 	 */
2645 	ct = i_mdi_client_find(vh, NULL, caddr);
2646 	if (ct == NULL) {
2647 		/*
2648 		 * Client not found, Obviously mdi_pathinfo node has not been
2649 		 * created yet.
2650 		 */
2651 		MDI_VHCI_CLIENT_UNLOCK(vh);
2652 		MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: client not "
2653 		    "found for caddr %s", caddr ? caddr : "NULL"));
2654 		return (NULL);
2655 	}
2656 
2657 	/*
2658 	 * Hold the client lock and look for a mdi_pathinfo node with matching
2659 	 * pHCI and paddr
2660 	 */
2661 	MDI_CLIENT_LOCK(ct);
2662 
2663 	/*
2664 	 * Release the global mutex as it is no more needed. Note: We always
2665 	 * respect the locking order while acquiring.
2666 	 */
2667 	MDI_VHCI_CLIENT_UNLOCK(vh);
2668 
2669 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2670 	while (pip != NULL) {
2671 		/*
2672 		 * Compare the unit address
2673 		 */
2674 		if ((MDI_PI(pip)->pi_phci == ph) &&
2675 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2676 			break;
2677 		}
2678 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2679 	}
2680 	MDI_CLIENT_UNLOCK(ct);
2681 	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: found:: %p", (void *)pip));
2682 	return (pip);
2683 }
2684 
2685 /*
2686  * mdi_pi_alloc():
2687  *		Allocate and initialize a new instance of a mdi_pathinfo node.
2688  *		The mdi_pathinfo node returned by this function identifies a
2689  *		unique device path is capable of having properties attached
2690  *		and passed to mdi_pi_online() to fully attach and online the
2691  *		path and client device node.
2692  *		The mdi_pathinfo node returned by this function must be
2693  *		destroyed using mdi_pi_free() if the path is no longer
2694  *		operational or if the caller fails to attach a client device
2695  *		node when calling mdi_pi_online(). The framework will not free
2696  *		the resources allocated.
2697  *		This function can be called from both interrupt and kernel
2698  *		contexts.  DDI_NOSLEEP flag should be used while calling
2699  *		from interrupt contexts.
2700  * Return Values:
2701  *		MDI_SUCCESS
2702  *		MDI_FAILURE
2703  *		MDI_NOMEM
2704  */
2705 /*ARGSUSED*/
2706 int
2707 mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2708     char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
2709 {
2710 	mdi_vhci_t	*vh;
2711 	mdi_phci_t	*ph;
2712 	mdi_client_t	*ct;
2713 	mdi_pathinfo_t	*pip = NULL;
2714 	dev_info_t	*cdip;
2715 	int		rv = MDI_NOMEM;
2716 	int		path_allocated = 0;
2717 
2718 	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_alloc_compatible: %s %s %s",
2719 	    cname ? cname : "NULL", caddr ? caddr : "NULL",
2720 	    paddr ? paddr : "NULL"));
2721 
2722 	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
2723 	    ret_pip == NULL) {
2724 		/* Nothing more to do */
2725 		return (MDI_FAILURE);
2726 	}
2727 
2728 	*ret_pip = NULL;
2729 
2730 	/* No allocations on detaching pHCI */
2731 	if (DEVI_IS_DETACHING(pdip)) {
2732 		/* Invalid pHCI device, return failure */
2733 		MDI_DEBUG(1, (CE_WARN, pdip,
2734 		    "!mdi_pi_alloc: detaching pHCI=%p", (void *)pdip));
2735 		return (MDI_FAILURE);
2736 	}
2737 
2738 	ph = i_devi_get_phci(pdip);
2739 	ASSERT(ph != NULL);
2740 	if (ph == NULL) {
2741 		/* Invalid pHCI device, return failure */
2742 		MDI_DEBUG(1, (CE_WARN, pdip,
2743 		    "!mdi_pi_alloc: invalid pHCI=%p", (void *)pdip));
2744 		return (MDI_FAILURE);
2745 	}
2746 
2747 	MDI_PHCI_LOCK(ph);
2748 	vh = ph->ph_vhci;
2749 	if (vh == NULL) {
2750 		/* Invalid vHCI device, return failure */
2751 		MDI_DEBUG(1, (CE_WARN, pdip,
2752 		    "!mdi_pi_alloc: invalid vHCI=%p", (void *)pdip));
2753 		MDI_PHCI_UNLOCK(ph);
2754 		return (MDI_FAILURE);
2755 	}
2756 
2757 	if (MDI_PHCI_IS_READY(ph) == 0) {
2758 		/*
2759 		 * Do not allow new node creation when pHCI is in
2760 		 * offline/suspended states
2761 		 */
2762 		MDI_DEBUG(1, (CE_WARN, pdip,
2763 		    "mdi_pi_alloc: pHCI=%p is not ready", (void *)ph));
2764 		MDI_PHCI_UNLOCK(ph);
2765 		return (MDI_BUSY);
2766 	}
2767 	MDI_PHCI_UNSTABLE(ph);
2768 	MDI_PHCI_UNLOCK(ph);
2769 
2770 	/* look for a matching client, create one if not found */
2771 	MDI_VHCI_CLIENT_LOCK(vh);
2772 	ct = i_mdi_client_find(vh, cname, caddr);
2773 	if (ct == NULL) {
2774 		ct = i_mdi_client_alloc(vh, cname, caddr);
2775 		ASSERT(ct != NULL);
2776 	}
2777 
2778 	if (ct->ct_dip == NULL) {
2779 		/*
2780 		 * Allocate a devinfo node
2781 		 */
2782 		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
2783 		    compatible, ncompatible);
2784 		if (ct->ct_dip == NULL) {
2785 			(void) i_mdi_client_free(vh, ct);
2786 			goto fail;
2787 		}
2788 	}
2789 	cdip = ct->ct_dip;
2790 
2791 	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
2792 	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;
2793 
2794 	MDI_CLIENT_LOCK(ct);
2795 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2796 	while (pip != NULL) {
2797 		/*
2798 		 * Compare the unit address
2799 		 */
2800 		if ((MDI_PI(pip)->pi_phci == ph) &&
2801 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2802 			break;
2803 		}
2804 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2805 	}
2806 	MDI_CLIENT_UNLOCK(ct);
2807 
2808 	if (pip == NULL) {
2809 		/*
2810 		 * This is a new path for this client device.  Allocate and
2811 		 * initialize a new pathinfo node
2812 		 */
2813 		pip = i_mdi_pi_alloc(ph, paddr, ct);
2814 		ASSERT(pip != NULL);
2815 		path_allocated = 1;
2816 	}
2817 	rv = MDI_SUCCESS;
2818 
2819 fail:
2820 	/*
2821 	 * Release the global mutex.
2822 	 */
2823 	MDI_VHCI_CLIENT_UNLOCK(vh);
2824 
2825 	/*
2826 	 * Mark the pHCI as stable
2827 	 */
2828 	MDI_PHCI_LOCK(ph);
2829 	MDI_PHCI_STABLE(ph);
2830 	MDI_PHCI_UNLOCK(ph);
2831 	*ret_pip = pip;
2832 
2833 	MDI_DEBUG(2, (CE_NOTE, pdip,
2834 	    "!mdi_pi_alloc_compatible: alloc %p", (void *)pip));
2835 
2836 	if (path_allocated)
2837 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
2838 
2839 	return (rv);
2840 }
2841 
2842 /*ARGSUSED*/
2843 int
2844 mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2845     int flags, mdi_pathinfo_t **ret_pip)
2846 {
2847 	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
2848 	    flags, ret_pip));
2849 }
2850 
2851 /*
2852  * i_mdi_pi_alloc():
2853  *		Allocate a mdi_pathinfo node and add to the pHCI path list
2854  * Return Values:
2855  *		mdi_pathinfo
2856  */
2857 /*ARGSUSED*/
2858 static mdi_pathinfo_t *
2859 i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
2860 {
2861 	mdi_pathinfo_t	*pip;
2862 	int		ct_circular;
2863 	int		ph_circular;
2864 	static char	path[MAXPATHLEN];
2865 	char		*path_persistent;
2866 	int		path_instance;
2867 	mod_hash_val_t	hv;
2868 
2869 	ASSERT(MDI_VHCI_CLIENT_LOCKED(ph->ph_vhci));
2870 
2871 	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
2872 	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
2873 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
2874 	    MDI_PATHINFO_STATE_TRANSIENT;
2875 
2876 	if (MDI_PHCI_IS_USER_DISABLED(ph))
2877 		MDI_PI_SET_USER_DISABLE(pip);
2878 
2879 	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
2880 		MDI_PI_SET_DRV_DISABLE_TRANS(pip);
2881 
2882 	if (MDI_PHCI_IS_DRV_DISABLED(ph))
2883 		MDI_PI_SET_DRV_DISABLE(pip);
2884 
2885 	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
2886 	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
2887 	MDI_PI(pip)->pi_client = ct;
2888 	MDI_PI(pip)->pi_phci = ph;
2889 	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
2890 	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);
2891 
2892         /*
2893 	 * We form the "path" to the pathinfo node, and see if we have
2894 	 * already allocated a 'path_instance' for that "path".  If so,
2895 	 * we use the already allocated 'path_instance'.  If not, we
2896 	 * allocate a new 'path_instance' and associate it with a copy of
2897 	 * the "path" string (which is never freed). The association
2898 	 * between a 'path_instance' this "path" string persists until
2899 	 * reboot.
2900 	 */
2901         mutex_enter(&mdi_pathmap_mutex);
2902 	(void) ddi_pathname(ph->ph_dip, path);
2903 	(void) sprintf(path + strlen(path), "/%s@%s",
2904 	    ddi_node_name(ct->ct_dip), MDI_PI(pip)->pi_addr);
2905         if (mod_hash_find(mdi_pathmap_bypath, (mod_hash_key_t)path, &hv) == 0) {
2906                 path_instance = (uint_t)(intptr_t)hv;
2907         } else {
2908 		/* allocate a new 'path_instance' and persistent "path" */
2909 		path_instance = mdi_pathmap_instance++;
2910 		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2911                 (void) mod_hash_insert(mdi_pathmap_bypath,
2912                     (mod_hash_key_t)path_persistent,
2913                     (mod_hash_val_t)(intptr_t)path_instance);
2914 		(void) mod_hash_insert(mdi_pathmap_byinstance,
2915 		    (mod_hash_key_t)(intptr_t)path_instance,
2916 		    (mod_hash_val_t)path_persistent);
2917         }
2918         mutex_exit(&mdi_pathmap_mutex);
2919 	MDI_PI(pip)->pi_path_instance = path_instance;
2920 
2921 	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
2922 	ASSERT(MDI_PI(pip)->pi_prop != NULL);
2923 	MDI_PI(pip)->pi_pprivate = NULL;
2924 	MDI_PI(pip)->pi_cprivate = NULL;
2925 	MDI_PI(pip)->pi_vprivate = NULL;
2926 	MDI_PI(pip)->pi_client_link = NULL;
2927 	MDI_PI(pip)->pi_phci_link = NULL;
2928 	MDI_PI(pip)->pi_ref_cnt = 0;
2929 	MDI_PI(pip)->pi_kstats = NULL;
2930 	MDI_PI(pip)->pi_preferred = 1;
2931 	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);
2932 
2933 	/*
2934 	 * Lock both dev_info nodes against changes in parallel.
2935 	 *
2936 	 * The ndi_devi_enter(Client), is atypical since the client is a leaf.
2937 	 * This atypical operation is done to synchronize pathinfo nodes
2938 	 * during devinfo snapshot (see di_register_pip) by 'pretending' that
2939 	 * the pathinfo nodes are children of the Client.
2940 	 */
2941 	ndi_devi_enter(ct->ct_dip, &ct_circular);
2942 	ndi_devi_enter(ph->ph_dip, &ph_circular);
2943 
2944 	i_mdi_phci_add_path(ph, pip);
2945 	i_mdi_client_add_path(ct, pip);
2946 
2947 	ndi_devi_exit(ph->ph_dip, ph_circular);
2948 	ndi_devi_exit(ct->ct_dip, ct_circular);
2949 
2950 	return (pip);
2951 }
2952 
2953 /*
2954  * mdi_pi_pathname_by_instance():
2955  *	Lookup of "path" by 'path_instance'. Return "path".
2956  *	NOTE: returned "path" remains valid forever (until reboot).
2957  */
2958 char *
2959 mdi_pi_pathname_by_instance(int path_instance)
2960 {
2961 	char		*path;
2962 	mod_hash_val_t	hv;
2963 
2964 	/* mdi_pathmap lookup of "path" by 'path_instance' */
2965 	mutex_enter(&mdi_pathmap_mutex);
2966 	if (mod_hash_find(mdi_pathmap_byinstance,
2967 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
2968 		path = (char *)hv;
2969 	else
2970 		path = NULL;
2971 	mutex_exit(&mdi_pathmap_mutex);
2972 	return (path);
2973 }
2974 
2975 /*
2976  * i_mdi_phci_add_path():
2977  * 		Add a mdi_pathinfo node to pHCI list.
2978  * Notes:
2979  *		Caller should per-pHCI mutex
2980  */
2981 static void
2982 i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
2983 {
2984 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
2985 
2986 	MDI_PHCI_LOCK(ph);
2987 	if (ph->ph_path_head == NULL) {
2988 		ph->ph_path_head = pip;
2989 	} else {
2990 		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
2991 	}
2992 	ph->ph_path_tail = pip;
2993 	ph->ph_path_count++;
2994 	MDI_PHCI_UNLOCK(ph);
2995 }
2996 
2997 /*
2998  * i_mdi_client_add_path():
2999  *		Add mdi_pathinfo node to client list
3000  */
3001 static void
3002 i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3003 {
3004 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3005 
3006 	MDI_CLIENT_LOCK(ct);
3007 	if (ct->ct_path_head == NULL) {
3008 		ct->ct_path_head = pip;
3009 	} else {
3010 		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
3011 	}
3012 	ct->ct_path_tail = pip;
3013 	ct->ct_path_count++;
3014 	MDI_CLIENT_UNLOCK(ct);
3015 }
3016 
3017 /*
3018  * mdi_pi_free():
3019  *		Free the mdi_pathinfo node and also client device node if this
3020  *		is the last path to the device
3021  * Return Values:
3022  *		MDI_SUCCESS
3023  *		MDI_FAILURE
3024  *		MDI_BUSY
3025  */
3026 /*ARGSUSED*/
3027 int
3028 mdi_pi_free(mdi_pathinfo_t *pip, int flags)
3029 {
3030 	int		rv = MDI_FAILURE;
3031 	mdi_vhci_t	*vh;
3032 	mdi_phci_t	*ph;
3033 	mdi_client_t	*ct;
3034 	int		(*f)();
3035 	int		client_held = 0;
3036 
3037 	MDI_PI_LOCK(pip);
3038 	ph = MDI_PI(pip)->pi_phci;
3039 	ASSERT(ph != NULL);
3040 	if (ph == NULL) {
3041 		/*
3042 		 * Invalid pHCI device, return failure
3043 		 */
3044 		MDI_DEBUG(1, (CE_WARN, NULL,
3045 		    "!mdi_pi_free: invalid pHCI pip=%p", (void *)pip));
3046 		MDI_PI_UNLOCK(pip);
3047 		return (MDI_FAILURE);
3048 	}
3049 
3050 	vh = ph->ph_vhci;
3051 	ASSERT(vh != NULL);
3052 	if (vh == NULL) {
3053 		/* Invalid pHCI device, return failure */
3054 		MDI_DEBUG(1, (CE_WARN, NULL,
3055 		    "!mdi_pi_free: invalid vHCI pip=%p", (void *)pip));
3056 		MDI_PI_UNLOCK(pip);
3057 		return (MDI_FAILURE);
3058 	}
3059 
3060 	ct = MDI_PI(pip)->pi_client;
3061 	ASSERT(ct != NULL);
3062 	if (ct == NULL) {
3063 		/*
3064 		 * Invalid Client device, return failure
3065 		 */
3066 		MDI_DEBUG(1, (CE_WARN, NULL,
3067 		    "!mdi_pi_free: invalid client pip=%p", (void *)pip));
3068 		MDI_PI_UNLOCK(pip);
3069 		return (MDI_FAILURE);
3070 	}
3071 
3072 	/*
3073 	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
3074 	 * if the node state is either offline or init and the reference count
3075 	 * is zero.
3076 	 */
3077 	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
3078 	    MDI_PI_IS_INITING(pip))) {
3079 		/*
3080 		 * Node is busy
3081 		 */
3082 		MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
3083 		    "!mdi_pi_free: pathinfo node is busy pip=%p", (void *)pip));
3084 		MDI_PI_UNLOCK(pip);
3085 		return (MDI_BUSY);
3086 	}
3087 
3088 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3089 		/*
3090 		 * Give a chance for pending I/Os to complete.
3091 		 */
3092 		MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!mdi_pi_free: "
3093 		    "%d cmds still pending on path: %p\n",
3094 		    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
3095 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
3096 		    &MDI_PI(pip)->pi_mutex,
3097 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
3098 			/*
3099 			 * The timeout time reached without ref_cnt being zero
3100 			 * being signaled.
3101 			 */
3102 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip,
3103 			    "!mdi_pi_free: "
3104 			    "Timeout reached on path %p without the cond\n",
3105 			    (void *)pip));
3106 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip,
3107 			    "!mdi_pi_free: "
3108 			    "%d cmds still pending on path: %p\n",
3109 			    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
3110 			MDI_PI_UNLOCK(pip);
3111 			return (MDI_BUSY);
3112 		}
3113 	}
3114 	if (MDI_PI(pip)->pi_pm_held) {
3115 		client_held = 1;
3116 	}
3117 	MDI_PI_UNLOCK(pip);
3118 
3119 	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));
3120 
3121 	MDI_CLIENT_LOCK(ct);
3122 
3123 	/* Prevent further failovers till MDI_VHCI_CLIENT_LOCK is held */
3124 	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);
3125 
3126 	/*
3127 	 * Wait till failover is complete before removing this node.
3128 	 */
3129 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3130 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3131 
3132 	MDI_CLIENT_UNLOCK(ct);
3133 	MDI_VHCI_CLIENT_LOCK(vh);
3134 	MDI_CLIENT_LOCK(ct);
3135 	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);
3136 
3137 	if (!MDI_PI_IS_INITING(pip)) {
3138 		f = vh->vh_ops->vo_pi_uninit;
3139 		if (f != NULL) {
3140 			rv = (*f)(vh->vh_dip, pip, 0);
3141 		}
3142 	}
3143 	/*
3144 	 * If vo_pi_uninit() completed successfully.
3145 	 */
3146 	if (rv == MDI_SUCCESS) {
3147 		if (client_held) {
3148 			MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_free "
3149 			    "i_mdi_pm_rele_client\n"));
3150 			i_mdi_pm_rele_client(ct, 1);
3151 		}
3152 		i_mdi_pi_free(ph, pip, ct);
3153 		if (ct->ct_path_count == 0) {
3154 			/*
3155 			 * Client lost its last path.
3156 			 * Clean up the client device
3157 			 */
3158 			MDI_CLIENT_UNLOCK(ct);
3159 			(void) i_mdi_client_free(ct->ct_vhci, ct);
3160 			MDI_VHCI_CLIENT_UNLOCK(vh);
3161 			return (rv);
3162 		}
3163 	}
3164 	MDI_CLIENT_UNLOCK(ct);
3165 	MDI_VHCI_CLIENT_UNLOCK(vh);
3166 
3167 	if (rv == MDI_FAILURE)
3168 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
3169 
3170 	return (rv);
3171 }
3172 
3173 /*
3174  * i_mdi_pi_free():
3175  *		Free the mdi_pathinfo node
3176  */
3177 static void
3178 i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
3179 {
3180 	int	ct_circular;
3181 	int	ph_circular;
3182 
3183 	ASSERT(MDI_CLIENT_LOCKED(ct));
3184 
3185 	/*
3186 	 * remove any per-path kstats
3187 	 */
3188 	i_mdi_pi_kstat_destroy(pip);
3189 
3190 	/* See comments in i_mdi_pi_alloc() */
3191 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3192 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3193 
3194 	i_mdi_client_remove_path(ct, pip);
3195 	i_mdi_phci_remove_path(ph, pip);
3196 
3197 	ndi_devi_exit(ph->ph_dip, ph_circular);
3198 	ndi_devi_exit(ct->ct_dip, ct_circular);
3199 
3200 	mutex_destroy(&MDI_PI(pip)->pi_mutex);
3201 	cv_destroy(&MDI_PI(pip)->pi_state_cv);
3202 	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
3203 	if (MDI_PI(pip)->pi_addr) {
3204 		kmem_free(MDI_PI(pip)->pi_addr,
3205 		    strlen(MDI_PI(pip)->pi_addr) + 1);
3206 		MDI_PI(pip)->pi_addr = NULL;
3207 	}
3208 
3209 	if (MDI_PI(pip)->pi_prop) {
3210 		(void) nvlist_free(MDI_PI(pip)->pi_prop);
3211 		MDI_PI(pip)->pi_prop = NULL;
3212 	}
3213 	kmem_free(pip, sizeof (struct mdi_pathinfo));
3214 }
3215 
3216 
3217 /*
3218  * i_mdi_phci_remove_path():
3219  * 		Remove a mdi_pathinfo node from pHCI list.
3220  * Notes:
3221  *		Caller should hold per-pHCI mutex
3222  */
3223 static void
3224 i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3225 {
3226 	mdi_pathinfo_t	*prev = NULL;
3227 	mdi_pathinfo_t	*path = NULL;
3228 
3229 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3230 
3231 	MDI_PHCI_LOCK(ph);
3232 	path = ph->ph_path_head;
3233 	while (path != NULL) {
3234 		if (path == pip) {
3235 			break;
3236 		}
3237 		prev = path;
3238 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3239 	}
3240 
3241 	if (path) {
3242 		ph->ph_path_count--;
3243 		if (prev) {
3244 			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
3245 		} else {
3246 			ph->ph_path_head =
3247 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3248 		}
3249 		if (ph->ph_path_tail == path) {
3250 			ph->ph_path_tail = prev;
3251 		}
3252 	}
3253 
3254 	/*
3255 	 * Clear the pHCI link
3256 	 */
3257 	MDI_PI(pip)->pi_phci_link = NULL;
3258 	MDI_PI(pip)->pi_phci = NULL;
3259 	MDI_PHCI_UNLOCK(ph);
3260 }
3261 
3262 /*
3263  * i_mdi_client_remove_path():
3264  * 		Remove a mdi_pathinfo node from client path list.
3265  */
3266 static void
3267 i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3268 {
3269 	mdi_pathinfo_t	*prev = NULL;
3270 	mdi_pathinfo_t	*path;
3271 
3272 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3273 
3274 	ASSERT(MDI_CLIENT_LOCKED(ct));
3275 	path = ct->ct_path_head;
3276 	while (path != NULL) {
3277 		if (path == pip) {
3278 			break;
3279 		}
3280 		prev = path;
3281 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3282 	}
3283 
3284 	if (path) {
3285 		ct->ct_path_count--;
3286 		if (prev) {
3287 			MDI_PI(prev)->pi_client_link =
3288 			    MDI_PI(path)->pi_client_link;
3289 		} else {
3290 			ct->ct_path_head =
3291 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3292 		}
3293 		if (ct->ct_path_tail == path) {
3294 			ct->ct_path_tail = prev;
3295 		}
3296 		if (ct->ct_path_last == path) {
3297 			ct->ct_path_last = ct->ct_path_head;
3298 		}
3299 	}
3300 	MDI_PI(pip)->pi_client_link = NULL;
3301 	MDI_PI(pip)->pi_client = NULL;
3302 }
3303 
/*
 * i_mdi_pi_state_change():
 *		Drive a mdi_pathinfo node into the requested state (online,
 *		standby, fault or offline).  The node is marked transient,
 *		the vHCI driver's vo_pi_state_change() entry point is called
 *		with no MDI locks held, and the client state is then
 *		re-evaluated, which may online or offline the client devinfo
 *		node.
 *
 *		pip	- pathinfo node to change
 *		state	- requested MDI_PATHINFO_STATE_* value
 *		flag	- NDI flags; NDI_DEVI_REMOVE is examined here and the
 *			  whole value is passed on to the vHCI callback
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE	- pHCI, vHCI or client linkage missing,
 *				  vo_pi_init() failed, or an NDI operation
 *				  failed
 *		MDI_BUSY	- pHCI is not ready, or an NDI operation
 *				  returned NDI_BUSY
 *		Any other value returned by the vHCI's vo_pi_state_change()
 *		is passed back unchanged.
 */
/*ARGSUSED*/
static int
i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
{
	int		rv = MDI_SUCCESS;
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	int		(*f)();
	dev_info_t	*cdip;

	MDI_PI_LOCK(pip);

	ph = MDI_PI(pip)->pi_phci;
	ASSERT(ph);
	if (ph == NULL) {
		/*
		 * Invalid pHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (CE_WARN, NULL,
		    "!mdi_pi_state_change: invalid phci pip=%p", (void *)pip));
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;
	ASSERT(vh);
	if (vh == NULL) {
		/*
		 * Invalid vHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (CE_WARN, NULL,
		    "!mdi_pi_state_change: invalid vhci pip=%p", (void *)pip));
		return (MDI_FAILURE);
	}

	ct = MDI_PI(pip)->pi_client;
	ASSERT(ct != NULL);
	if (ct == NULL) {
		/*
		 * Invalid client device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (CE_WARN, NULL,
		    "!mdi_pi_state_change: invalid client pip=%p",
		    (void *)pip));
		return (MDI_FAILURE);
	}

	/*
	 * If this path has not been initialized yet, Callback vHCI driver's
	 * pathinfo node initialize entry point.  The pathinfo lock is dropped
	 * around the callback; on failure we return with it still dropped.
	 */

	if (MDI_PI_IS_INITING(pip)) {
		MDI_PI_UNLOCK(pip);
		f = vh->vh_ops->vo_pi_init;
		if (f != NULL) {
			rv = (*f)(vh->vh_dip, pip, 0);
			if (rv != MDI_SUCCESS) {
				MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
				    "!vo_pi_init: failed vHCI=0x%p, pip=0x%p",
				    (void *)vh, (void *)pip));
				return (MDI_FAILURE);
			}
		}
		MDI_PI_LOCK(pip);
		MDI_PI_CLEAR_TRANSIENT(pip);
	}

	/*
	 * Do not allow state transition when pHCI is in offline/suspended
	 * states
	 */
	i_mdi_phci_lock(ph, pip);
	if (MDI_PHCI_IS_READY(ph) == 0) {
		MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
		    "!mdi_pi_state_change: pHCI not ready, pHCI=%p",
		    (void *)ph));
		MDI_PI_UNLOCK(pip);
		i_mdi_phci_unlock(ph);
		return (MDI_BUSY);
	}
	/*
	 * From here on the pHCI is marked unstable; every exit path below
	 * goes through state_change_exit, which marks it stable again.
	 */
	MDI_PHCI_UNSTABLE(ph);
	i_mdi_phci_unlock(ph);

	/*
	 * Check if mdi_pathinfo state is in transient state.
	 * If yes, offlining is in progress and wait till transient state is
	 * cleared.
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		while (MDI_PI_IS_TRANSIENT(pip)) {
			cv_wait(&MDI_PI(pip)->pi_state_cv,
			    &MDI_PI(pip)->pi_mutex);
		}
	}

	/*
	 * Grab the client lock in reverse order sequence and release the
	 * mdi_pathinfo mutex.
	 */
	i_mdi_client_lock(ct, pip);
	MDI_PI_UNLOCK(pip);

	/*
	 * Wait till failover state is cleared
	 */
	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);

	/*
	 * Mark the mdi_pathinfo node state as transient.  There is no default
	 * case: a state other than the four handled below leaves the node's
	 * state untouched before the vHCI callback is made.
	 */
	MDI_PI_LOCK(pip);
	switch (state) {
	case MDI_PATHINFO_STATE_ONLINE:
		MDI_PI_SET_ONLINING(pip);
		break;

	case MDI_PATHINFO_STATE_STANDBY:
		MDI_PI_SET_STANDBYING(pip);
		break;

	case MDI_PATHINFO_STATE_FAULT:
		/*
		 * Mark the pathinfo state as FAULTED
		 */
		MDI_PI_SET_FAULTING(pip);
		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
		break;

	case MDI_PATHINFO_STATE_OFFLINE:
		/*
		 * ndi_devi_offline() cannot hold pip or ct locks.
		 */
		MDI_PI_UNLOCK(pip);
		/*
		 * Don't offline the client dev_info node unless we have
		 * no available paths left at all.
		 */
		cdip = ct->ct_dip;
		if ((flag & NDI_DEVI_REMOVE) &&
		    (ct->ct_path_count == 1)) {
			i_mdi_client_unlock(ct);
			rv = ndi_devi_offline(cdip, 0);
			if (rv != NDI_SUCCESS) {
				/*
				 * Convert to MDI error code.  Neither the
				 * client lock nor the pathinfo lock is held
				 * when we jump to the exit label.
				 */
				switch (rv) {
				case NDI_BUSY:
					rv = MDI_BUSY;
					break;
				default:
					rv = MDI_FAILURE;
					break;
				}
				goto state_change_exit;
			} else {
				i_mdi_client_lock(ct, NULL);
			}
		}
		/*
		 * Mark the mdi_pathinfo node state as transient
		 */
		MDI_PI_LOCK(pip);
		MDI_PI_SET_OFFLINING(pip);
		break;
	}
	MDI_PI_UNLOCK(pip);
	MDI_CLIENT_UNSTABLE(ct);
	i_mdi_client_unlock(ct);

	/*
	 * Call the vHCI's state change entry point with no MDI locks held.
	 * When the vHCI registered none, rv keeps its current value.
	 */
	f = vh->vh_ops->vo_pi_state_change;
	if (f != NULL)
		rv = (*f)(vh->vh_dip, pip, state, 0, flag);

	MDI_CLIENT_LOCK(ct);
	MDI_PI_LOCK(pip);
	if (rv == MDI_NOT_SUPPORTED) {
		MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
	}
	if (rv != MDI_SUCCESS) {
		MDI_DEBUG(2, (CE_WARN, ct->ct_dip,
		    "!vo_pi_state_change: failed rv = %x", rv));
	}
	/*
	 * Settle the transient state: commit it when the callback succeeded,
	 * otherwise fall back to the previous state.
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		if (rv == MDI_SUCCESS) {
			MDI_PI_CLEAR_TRANSIENT(pip);
		} else {
			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
		}
	}

	/*
	 * Wake anyone waiting for this mdi_pathinfo node
	 */
	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
	MDI_PI_UNLOCK(pip);

	/*
	 * Mark the client device as stable
	 */
	MDI_CLIENT_STABLE(ct);
	if (rv == MDI_SUCCESS) {
		if (ct->ct_unstable == 0) {
			cdip = ct->ct_dip;

			/*
			 * Onlining the mdi_pathinfo node will impact the
			 * client state Update the client and dev_info node
			 * state accordingly.  From here until the conversion
			 * switch below, rv holds an NDI_* code.
			 */
			rv = NDI_SUCCESS;
			i_mdi_client_update_state(ct);
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				if (cdip && !i_ddi_devi_attached(cdip) &&
				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
				    (state == MDI_PATHINFO_STATE_STANDBY))) {

					/*
					 * Must do ndi_devi_online() through
					 * hotplug thread for deferred
					 * attach mechanism to work
					 */
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_online(cdip, 0);
					MDI_CLIENT_LOCK(ct);
					if ((rv != NDI_SUCCESS) &&
					    (MDI_CLIENT_STATE(ct) ==
					    MDI_CLIENT_STATE_DEGRADED)) {
						/*
						 * ndi_devi_online failed.
						 * Reset client flags to
						 * offline.
						 */
						MDI_DEBUG(1, (CE_WARN, cdip,
						    "!ndi_devi_online: failed "
						    " Error: %x", rv));
						MDI_CLIENT_SET_OFFLINE(ct);
					}
					if (rv != NDI_SUCCESS) {
						/* Reset the path state */
						MDI_PI_LOCK(pip);
						MDI_PI(pip)->pi_state =
						    MDI_PI_OLD_STATE(pip);
						MDI_PI_UNLOCK(pip);
					}
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				/*
				 * This is the last path case for
				 * non-user initiated events.
				 */
				if (((flag & NDI_DEVI_REMOVE) == 0) &&
				    cdip && (i_ddi_node_state(cdip) >=
				    DS_INITIALIZED)) {
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_offline(cdip, 0);
					MDI_CLIENT_LOCK(ct);

					if (rv != NDI_SUCCESS) {
						/*
						 * ndi_devi_offline failed.
						 * Reset client flags to
						 * online as the path could not
						 * be offlined.
						 */
						MDI_DEBUG(1, (CE_WARN, cdip,
						    "!ndi_devi_offline: failed "
						    " Error: %x", rv));
						MDI_CLIENT_SET_ONLINE(ct);
					}
				}
				break;
			}
			/*
			 * Convert to MDI error code
			 */
			switch (rv) {
			case NDI_SUCCESS:
				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
				i_mdi_report_path_state(ct, pip);
				rv = MDI_SUCCESS;
				break;
			case NDI_BUSY:
				rv = MDI_BUSY;
				break;
			default:
				rv = MDI_FAILURE;
				break;
			}
		}
	}
	MDI_CLIENT_UNLOCK(ct);

state_change_exit:
	/*
	 * Mark the pHCI as stable again.
	 */
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_STABLE(ph);
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}
3623 
3624 /*
3625  * mdi_pi_online():
3626  *		Place the path_info node in the online state.  The path is
3627  *		now available to be selected by mdi_select_path() for
3628  *		transporting I/O requests to client devices.
3629  * Return Values:
3630  *		MDI_SUCCESS
3631  *		MDI_FAILURE
3632  */
3633 int
3634 mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3635 {
3636 	mdi_client_t	*ct = MDI_PI(pip)->pi_client;
3637 	int		client_held = 0;
3638 	int		rv;
3639 	int		se_flag;
3640 	int		kmem_flag;
3641 
3642 	ASSERT(ct != NULL);
3643 	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
3644 	if (rv != MDI_SUCCESS)
3645 		return (rv);
3646 
3647 	MDI_PI_LOCK(pip);
3648 	if (MDI_PI(pip)->pi_pm_held == 0) {
3649 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_online "
3650 		    "i_mdi_pm_hold_pip %p\n", (void *)pip));
3651 		i_mdi_pm_hold_pip(pip);
3652 		client_held = 1;
3653 	}
3654 	MDI_PI_UNLOCK(pip);
3655 
3656 	if (client_held) {
3657 		MDI_CLIENT_LOCK(ct);
3658 		if (ct->ct_power_cnt == 0) {
3659 			rv = i_mdi_power_all_phci(ct);
3660 		}
3661 
3662 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_online "
3663 		    "i_mdi_pm_hold_client %p\n", (void *)ct));
3664 		i_mdi_pm_hold_client(ct, 1);
3665 		MDI_CLIENT_UNLOCK(ct);
3666 	}
3667 
3668 	/* determine interrupt context */
3669 	se_flag = (servicing_interrupt()) ? SE_NOSLEEP : SE_SLEEP;
3670 	kmem_flag = (se_flag == SE_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
3671 
3672 	/* A new path is online.  Invalidate DINFOCACHE snap shot. */
3673 	i_ddi_di_cache_invalidate(kmem_flag);
3674 
3675 	return (rv);
3676 }
3677 
3678 /*
3679  * mdi_pi_standby():
3680  *		Place the mdi_pathinfo node in standby state
3681  *
3682  * Return Values:
3683  *		MDI_SUCCESS
3684  *		MDI_FAILURE
3685  */
3686 int
3687 mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
3688 {
3689 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
3690 }
3691 
3692 /*
3693  * mdi_pi_fault():
3694  *		Place the mdi_pathinfo node in fault'ed state
3695  * Return Values:
3696  *		MDI_SUCCESS
3697  *		MDI_FAILURE
3698  */
3699 int
3700 mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
3701 {
3702 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
3703 }
3704 
3705 /*
3706  * mdi_pi_offline():
3707  *		Offline a mdi_pathinfo node.
3708  * Return Values:
3709  *		MDI_SUCCESS
3710  *		MDI_FAILURE
3711  */
3712 int
3713 mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3714 {
3715 	int	ret, client_held = 0;
3716 	mdi_client_t	*ct;
3717 	int		se_flag;
3718 	int		kmem_flag;
3719 
3720 	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
3721 
3722 	if (ret == MDI_SUCCESS) {
3723 		MDI_PI_LOCK(pip);
3724 		if (MDI_PI(pip)->pi_pm_held) {
3725 			client_held = 1;
3726 		}
3727 		MDI_PI_UNLOCK(pip);
3728 
3729 		if (client_held) {
3730 			ct = MDI_PI(pip)->pi_client;
3731 			MDI_CLIENT_LOCK(ct);
3732 			MDI_DEBUG(4, (CE_NOTE, ct->ct_dip,
3733 			    "mdi_pi_offline i_mdi_pm_rele_client\n"));
3734 			i_mdi_pm_rele_client(ct, 1);
3735 			MDI_CLIENT_UNLOCK(ct);
3736 		}
3737 
3738 		/* determine interrupt context */
3739 		se_flag = (servicing_interrupt()) ? SE_NOSLEEP : SE_SLEEP;
3740 		kmem_flag = (se_flag == SE_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
3741 
3742 		/* pathinfo is offlined. update DINFOCACHE. */
3743 		i_ddi_di_cache_invalidate(kmem_flag);
3744 	}
3745 
3746 	return (ret);
3747 }
3748 
3749 /*
3750  * i_mdi_pi_offline():
3751  *		Offline a mdi_pathinfo node and call the vHCI driver's callback
3752  */
3753 static int
3754 i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3755 {
3756 	dev_info_t	*vdip = NULL;
3757 	mdi_vhci_t	*vh = NULL;
3758 	mdi_client_t	*ct = NULL;
3759 	int		(*f)();
3760 	int		rv;
3761 
3762 	MDI_PI_LOCK(pip);
3763 	ct = MDI_PI(pip)->pi_client;
3764 	ASSERT(ct != NULL);
3765 
3766 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3767 		/*
3768 		 * Give a chance for pending I/Os to complete.
3769 		 */
3770 		MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3771 		    "%d cmds still pending on path: %p\n",
3772 		    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
3773 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
3774 		    &MDI_PI(pip)->pi_mutex,
3775 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
3776 			/*
3777 			 * The timeout time reached without ref_cnt being zero
3778 			 * being signaled.
3779 			 */
3780 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3781 			    "Timeout reached on path %p without the cond\n",
3782 			    (void *)pip));
3783 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3784 			    "%d cmds still pending on path: %p\n",
3785 			    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
3786 		}
3787 	}
3788 	vh = ct->ct_vhci;
3789 	vdip = vh->vh_dip;
3790 
3791 	/*
3792 	 * Notify vHCI that has registered this event
3793 	 */
3794 	ASSERT(vh->vh_ops);
3795 	f = vh->vh_ops->vo_pi_state_change;
3796 
3797 	if (f != NULL) {
3798 		MDI_PI_UNLOCK(pip);
3799 		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
3800 		    flags)) != MDI_SUCCESS) {
3801 			MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
3802 			    "!vo_path_offline failed "
3803 			    "vdip %p, pip %p", (void *)vdip, (void *)pip));
3804 		}
3805 		MDI_PI_LOCK(pip);
3806 	}
3807 
3808 	/*
3809 	 * Set the mdi_pathinfo node state and clear the transient condition
3810 	 */
3811 	MDI_PI_SET_OFFLINE(pip);
3812 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3813 	MDI_PI_UNLOCK(pip);
3814 
3815 	MDI_CLIENT_LOCK(ct);
3816 	if (rv == MDI_SUCCESS) {
3817 		if (ct->ct_unstable == 0) {
3818 			dev_info_t	*cdip = ct->ct_dip;
3819 
3820 			/*
3821 			 * Onlining the mdi_pathinfo node will impact the
3822 			 * client state Update the client and dev_info node
3823 			 * state accordingly
3824 			 */
3825 			i_mdi_client_update_state(ct);
3826 			rv = NDI_SUCCESS;
3827 			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
3828 				if (cdip &&
3829 				    (i_ddi_node_state(cdip) >=
3830 				    DS_INITIALIZED)) {
3831 					MDI_CLIENT_UNLOCK(ct);
3832 					rv = ndi_devi_offline(cdip, 0);
3833 					MDI_CLIENT_LOCK(ct);
3834 					if (rv != NDI_SUCCESS) {
3835 						/*
3836 						 * ndi_devi_offline failed.
3837 						 * Reset client flags to
3838 						 * online.
3839 						 */
3840 						MDI_DEBUG(4, (CE_WARN, cdip,
3841 						    "!ndi_devi_offline: failed "
3842 						    " Error: %x", rv));
3843 						MDI_CLIENT_SET_ONLINE(ct);
3844 					}
3845 				}
3846 			}
3847 			/*
3848 			 * Convert to MDI error code
3849 			 */
3850 			switch (rv) {
3851 			case NDI_SUCCESS:
3852 				rv = MDI_SUCCESS;
3853 				break;
3854 			case NDI_BUSY:
3855 				rv = MDI_BUSY;
3856 				break;
3857 			default:
3858 				rv = MDI_FAILURE;
3859 				break;
3860 			}
3861 		}
3862 		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3863 		i_mdi_report_path_state(ct, pip);
3864 	}
3865 
3866 	MDI_CLIENT_UNLOCK(ct);
3867 
3868 	/*
3869 	 * Change in the mdi_pathinfo node state will impact the client state
3870 	 */
3871 	MDI_DEBUG(2, (CE_NOTE, NULL, "!i_mdi_pi_offline ct = %p pip = %p",
3872 	    (void *)ct, (void *)pip));
3873 	return (rv);
3874 }
3875 
3876 
3877 /*
3878  * mdi_pi_get_addr():
3879  *		Get the unit address associated with a mdi_pathinfo node
3880  *
3881  * Return Values:
3882  *		char *
3883  */
3884 char *
3885 mdi_pi_get_addr(mdi_pathinfo_t *pip)
3886 {
3887 	if (pip == NULL)
3888 		return (NULL);
3889 
3890 	return (MDI_PI(pip)->pi_addr);
3891 }
3892 
3893 /*
3894  * mdi_pi_get_path_instance():
3895  *		Get the 'path_instance' of a mdi_pathinfo node
3896  *
3897  * Return Values:
3898  *		path_instance
3899  */
3900 int
3901 mdi_pi_get_path_instance(mdi_pathinfo_t *pip)
3902 {
3903 	if (pip == NULL)
3904 		return (0);
3905 
3906 	return (MDI_PI(pip)->pi_path_instance);
3907 }
3908 
3909 /*
3910  * mdi_pi_pathname():
3911  *		Return pointer to path to pathinfo node.
3912  */
3913 char *
3914 mdi_pi_pathname(mdi_pathinfo_t *pip)
3915 {
3916 	if (pip == NULL)
3917 		return (NULL);
3918 	return (mdi_pi_pathname_by_instance(mdi_pi_get_path_instance(pip)));
3919 }
3920 
3921 char *
3922 mdi_pi_pathname_obp(mdi_pathinfo_t *pip, char *path)
3923 {
3924 	char *obp_path = NULL;
3925 	if ((pip == NULL) || (path == NULL))
3926 		return (NULL);
3927 
3928 	if (mdi_prop_lookup_string(pip, "obp-path", &obp_path) == MDI_SUCCESS) {
3929 		(void) strcpy(path, obp_path);
3930 		(void) mdi_prop_free(obp_path);
3931 	} else {
3932 		path = NULL;
3933 	}
3934 	return (path);
3935 }
3936 
3937 int
3938 mdi_pi_pathname_obp_set(mdi_pathinfo_t *pip, char *component)
3939 {
3940 	dev_info_t *pdip;
3941 	char *obp_path = NULL;
3942 	int rc = MDI_FAILURE;
3943 
3944 	if (pip == NULL)
3945 		return (MDI_FAILURE);
3946 
3947 	pdip = mdi_pi_get_phci(pip);
3948 	if (pdip == NULL)
3949 		return (MDI_FAILURE);
3950 
3951 	obp_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
3952 
3953 	if (ddi_pathname_obp(pdip, obp_path) == NULL) {
3954 		(void) ddi_pathname(pdip, obp_path);
3955 	}
3956 
3957 	if (component) {
3958 		(void) strncat(obp_path, "/", MAXPATHLEN);
3959 		(void) strncat(obp_path, component, MAXPATHLEN);
3960 	}
3961 	rc = mdi_prop_update_string(pip, "obp-path", obp_path);
3962 
3963 	if (obp_path)
3964 		kmem_free(obp_path, MAXPATHLEN);
3965 	return (rc);
3966 }
3967 
3968 /*
3969  * mdi_pi_get_client():
3970  *		Get the client devinfo associated with a mdi_pathinfo node
3971  *
3972  * Return Values:
3973  *		Handle to client device dev_info node
3974  */
3975 dev_info_t *
3976 mdi_pi_get_client(mdi_pathinfo_t *pip)
3977 {
3978 	dev_info_t	*dip = NULL;
3979 	if (pip) {
3980 		dip = MDI_PI(pip)->pi_client->ct_dip;
3981 	}
3982 	return (dip);
3983 }
3984 
3985 /*
3986  * mdi_pi_get_phci():
3987  *		Get the pHCI devinfo associated with the mdi_pathinfo node
3988  * Return Values:
3989  *		Handle to dev_info node
3990  */
3991 dev_info_t *
3992 mdi_pi_get_phci(mdi_pathinfo_t *pip)
3993 {
3994 	dev_info_t	*dip = NULL;
3995 	if (pip) {
3996 		dip = MDI_PI(pip)->pi_phci->ph_dip;
3997 	}
3998 	return (dip);
3999 }
4000 
4001 /*
4002  * mdi_pi_get_client_private():
4003  *		Get the client private information associated with the
4004  *		mdi_pathinfo node
4005  */
4006 void *
4007 mdi_pi_get_client_private(mdi_pathinfo_t *pip)
4008 {
4009 	void *cprivate = NULL;
4010 	if (pip) {
4011 		cprivate = MDI_PI(pip)->pi_cprivate;
4012 	}
4013 	return (cprivate);
4014 }
4015 
4016 /*
4017  * mdi_pi_set_client_private():
4018  *		Set the client private information in the mdi_pathinfo node
4019  */
4020 void
4021 mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
4022 {
4023 	if (pip) {
4024 		MDI_PI(pip)->pi_cprivate = priv;
4025 	}
4026 }
4027 
4028 /*
4029  * mdi_pi_get_phci_private():
4030  *		Get the pHCI private information associated with the
4031  *		mdi_pathinfo node
4032  */
4033 caddr_t
4034 mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
4035 {
4036 	caddr_t	pprivate = NULL;
4037 	if (pip) {
4038 		pprivate = MDI_PI(pip)->pi_pprivate;
4039 	}
4040 	return (pprivate);
4041 }
4042 
4043 /*
4044  * mdi_pi_set_phci_private():
4045  *		Set the pHCI private information in the mdi_pathinfo node
4046  */
4047 void
4048 mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
4049 {
4050 	if (pip) {
4051 		MDI_PI(pip)->pi_pprivate = priv;
4052 	}
4053 }
4054 
4055 /*
4056  * mdi_pi_get_state():
4057  *		Get the mdi_pathinfo node state. Transient states are internal
4058  *		and not provided to the users
4059  */
4060 mdi_pathinfo_state_t
4061 mdi_pi_get_state(mdi_pathinfo_t *pip)
4062 {
4063 	mdi_pathinfo_state_t    state = MDI_PATHINFO_STATE_INIT;
4064 
4065 	if (pip) {
4066 		if (MDI_PI_IS_TRANSIENT(pip)) {
4067 			/*
4068 			 * mdi_pathinfo is in state transition.  Return the
4069 			 * last good state.
4070 			 */
4071 			state = MDI_PI_OLD_STATE(pip);
4072 		} else {
4073 			state = MDI_PI_STATE(pip);
4074 		}
4075 	}
4076 	return (state);
4077 }
4078 
4079 /*
4080  * Note that the following function needs to be the new interface for
4081  * mdi_pi_get_state when mpxio gets integrated to ON.
4082  */
4083 int
4084 mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
4085 		uint32_t *ext_state)
4086 {
4087 	*state = MDI_PATHINFO_STATE_INIT;
4088 
4089 	if (pip) {
4090 		if (MDI_PI_IS_TRANSIENT(pip)) {
4091 			/*
4092 			 * mdi_pathinfo is in state transition.  Return the
4093 			 * last good state.
4094 			 */
4095 			*state = MDI_PI_OLD_STATE(pip);
4096 			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
4097 		} else {
4098 			*state = MDI_PI_STATE(pip);
4099 			*ext_state = MDI_PI_EXT_STATE(pip);
4100 		}
4101 	}
4102 	return (MDI_SUCCESS);
4103 }
4104 
4105 /*
4106  * mdi_pi_get_preferred:
4107  *	Get the preferred path flag
4108  */
4109 int
4110 mdi_pi_get_preferred(mdi_pathinfo_t *pip)
4111 {
4112 	if (pip) {
4113 		return (MDI_PI(pip)->pi_preferred);
4114 	}
4115 	return (0);
4116 }
4117 
4118 /*
4119  * mdi_pi_set_preferred:
4120  *	Set the preferred path flag
4121  */
4122 void
4123 mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
4124 {
4125 	if (pip) {
4126 		MDI_PI(pip)->pi_preferred = preferred;
4127 	}
4128 }
4129 
4130 /*
4131  * mdi_pi_set_state():
4132  *		Set the mdi_pathinfo node state
4133  */
4134 void
4135 mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
4136 {
4137 	uint32_t	ext_state;
4138 
4139 	if (pip) {
4140 		ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
4141 		MDI_PI(pip)->pi_state = state;
4142 		MDI_PI(pip)->pi_state |= ext_state;
4143 	}
4144 }
4145 
4146 /*
4147  * Property functions:
4148  */
4149 int
4150 i_map_nvlist_error_to_mdi(int val)
4151 {
4152 	int rv;
4153 
4154 	switch (val) {
4155 	case 0:
4156 		rv = DDI_PROP_SUCCESS;
4157 		break;
4158 	case EINVAL:
4159 	case ENOTSUP:
4160 		rv = DDI_PROP_INVAL_ARG;
4161 		break;
4162 	case ENOMEM:
4163 		rv = DDI_PROP_NO_MEMORY;
4164 		break;
4165 	default:
4166 		rv = DDI_PROP_NOT_FOUND;
4167 		break;
4168 	}
4169 	return (rv);
4170 }
4171 
4172 /*
4173  * mdi_pi_get_next_prop():
4174  * 		Property walk function.  The caller should hold mdi_pi_lock()
4175  *		and release by calling mdi_pi_unlock() at the end of walk to
4176  *		get a consistent value.
4177  */
4178 nvpair_t *
4179 mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
4180 {
4181 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4182 		return (NULL);
4183 	}
4184 	ASSERT(MDI_PI_LOCKED(pip));
4185 	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
4186 }
4187 
4188 /*
4189  * mdi_prop_remove():
4190  * 		Remove the named property from the named list.
4191  */
4192 int
4193 mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
4194 {
4195 	if (pip == NULL) {
4196 		return (DDI_PROP_NOT_FOUND);
4197 	}
4198 	ASSERT(!MDI_PI_LOCKED(pip));
4199 	MDI_PI_LOCK(pip);
4200 	if (MDI_PI(pip)->pi_prop == NULL) {
4201 		MDI_PI_UNLOCK(pip);
4202 		return (DDI_PROP_NOT_FOUND);
4203 	}
4204 	if (name) {
4205 		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
4206 	} else {
4207 		char		nvp_name[MAXNAMELEN];
4208 		nvpair_t	*nvp;
4209 		nvp = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
4210 		while (nvp) {
4211 			nvpair_t	*next;
4212 			next = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, nvp);
4213 			(void) snprintf(nvp_name, MAXNAMELEN, "%s",
4214 			    nvpair_name(nvp));
4215 			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
4216 			    nvp_name);
4217 			nvp = next;
4218 		}
4219 	}
4220 	MDI_PI_UNLOCK(pip);
4221 	return (DDI_PROP_SUCCESS);
4222 }
4223 
4224 /*
4225  * mdi_prop_size():
4226  * 		Get buffer size needed to pack the property data.
4227  * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
4228  *		buffer size.
4229  */
4230 int
4231 mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
4232 {
4233 	int	rv;
4234 	size_t	bufsize;
4235 
4236 	*buflenp = 0;
4237 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4238 		return (DDI_PROP_NOT_FOUND);
4239 	}
4240 	ASSERT(MDI_PI_LOCKED(pip));
4241 	rv = nvlist_size(MDI_PI(pip)->pi_prop,
4242 	    &bufsize, NV_ENCODE_NATIVE);
4243 	*buflenp = bufsize;
4244 	return (i_map_nvlist_error_to_mdi(rv));
4245 }
4246 
4247 /*
4248  * mdi_prop_pack():
4249  * 		pack the property list.  The caller should hold the
4250  *		mdi_pathinfo_t node to get a consistent data
4251  */
4252 int
4253 mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
4254 {
4255 	int	rv;
4256 	size_t	bufsize;
4257 
4258 	if ((pip == NULL) || MDI_PI(pip)->pi_prop == NULL) {
4259 		return (DDI_PROP_NOT_FOUND);
4260 	}
4261 
4262 	ASSERT(MDI_PI_LOCKED(pip));
4263 
4264 	bufsize = buflen;
4265 	rv = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, (size_t *)&bufsize,
4266 	    NV_ENCODE_NATIVE, KM_SLEEP);
4267 
4268 	return (i_map_nvlist_error_to_mdi(rv));
4269 }
4270 
4271 /*
4272  * mdi_prop_update_byte():
4273  *		Create/Update a byte property
4274  */
4275 int
4276 mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
4277 {
4278 	int rv;
4279 
4280 	if (pip == NULL) {
4281 		return (DDI_PROP_INVAL_ARG);
4282 	}
4283 	ASSERT(!MDI_PI_LOCKED(pip));
4284 	MDI_PI_LOCK(pip);
4285 	if (MDI_PI(pip)->pi_prop == NULL) {
4286 		MDI_PI_UNLOCK(pip);
4287 		return (DDI_PROP_NOT_FOUND);
4288 	}
4289 	rv = nvlist_add_byte(MDI_PI(pip)->pi_prop, name, data);
4290 	MDI_PI_UNLOCK(pip);
4291 	return (i_map_nvlist_error_to_mdi(rv));
4292 }
4293 
4294 /*
4295  * mdi_prop_update_byte_array():
4296  *		Create/Update a byte array property
4297  */
4298 int
4299 mdi_prop_update_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t *data,
4300     uint_t nelements)
4301 {
4302 	int rv;
4303 
4304 	if (pip == NULL) {
4305 		return (DDI_PROP_INVAL_ARG);
4306 	}
4307 	ASSERT(!MDI_PI_LOCKED(pip));
4308 	MDI_PI_LOCK(pip);
4309 	if (MDI_PI(pip)->pi_prop == NULL) {
4310 		MDI_PI_UNLOCK(pip);
4311 		return (DDI_PROP_NOT_FOUND);
4312 	}
4313 	rv = nvlist_add_byte_array(MDI_PI(pip)->pi_prop, name, data, nelements);
4314 	MDI_PI_UNLOCK(pip);
4315 	return (i_map_nvlist_error_to_mdi(rv));
4316 }
4317 
4318 /*
4319  * mdi_prop_update_int():
4320  *		Create/Update a 32 bit integer property
4321  */
4322 int
4323 mdi_prop_update_int(mdi_pathinfo_t *pip, char *name, int data)
4324 {
4325 	int rv;
4326 
4327 	if (pip == NULL) {
4328 		return (DDI_PROP_INVAL_ARG);
4329 	}
4330 	ASSERT(!MDI_PI_LOCKED(pip));
4331 	MDI_PI_LOCK(pip);
4332 	if (MDI_PI(pip)->pi_prop == NULL) {
4333 		MDI_PI_UNLOCK(pip);
4334 		return (DDI_PROP_NOT_FOUND);
4335 	}
4336 	rv = nvlist_add_int32(MDI_PI(pip)->pi_prop, name, (int32_t)data);
4337 	MDI_PI_UNLOCK(pip);
4338 	return (i_map_nvlist_error_to_mdi(rv));
4339 }
4340 
4341 /*
4342  * mdi_prop_update_int64():
4343  *		Create/Update a 64 bit integer property
4344  */
4345 int
4346 mdi_prop_update_int64(mdi_pathinfo_t *pip, char *name, int64_t data)
4347 {
4348 	int rv;
4349 
4350 	if (pip == NULL) {
4351 		return (DDI_PROP_INVAL_ARG);
4352 	}
4353 	ASSERT(!MDI_PI_LOCKED(pip));
4354 	MDI_PI_LOCK(pip);
4355 	if (MDI_PI(pip)->pi_prop == NULL) {
4356 		MDI_PI_UNLOCK(pip);
4357 		return (DDI_PROP_NOT_FOUND);
4358 	}
4359 	rv = nvlist_add_int64(MDI_PI(pip)->pi_prop, name, data);
4360 	MDI_PI_UNLOCK(pip);
4361 	return (i_map_nvlist_error_to_mdi(rv));
4362 }
4363 
4364 /*
4365  * mdi_prop_update_int_array():
4366  *		Create/Update a int array property
4367  */
4368 int
4369 mdi_prop_update_int_array(mdi_pathinfo_t *pip, char *name, int *data,
4370 	    uint_t nelements)
4371 {
4372 	int rv;
4373 
4374 	if (pip == NULL) {
4375 		return (DDI_PROP_INVAL_ARG);
4376 	}
4377 	ASSERT(!MDI_PI_LOCKED(pip));
4378 	MDI_PI_LOCK(pip);
4379 	if (MDI_PI(pip)->pi_prop == NULL) {
4380 		MDI_PI_UNLOCK(pip);
4381 		return (DDI_PROP_NOT_FOUND);
4382 	}
4383 	rv = nvlist_add_int32_array(MDI_PI(pip)->pi_prop, name, (int32_t *)data,
4384 	    nelements);
4385 	MDI_PI_UNLOCK(pip);
4386 	return (i_map_nvlist_error_to_mdi(rv));
4387 }
4388 
4389 /*
4390  * mdi_prop_update_string():
4391  *		Create/Update a string property
4392  */
4393 int
4394 mdi_prop_update_string(mdi_pathinfo_t *pip, char *name, char *data)
4395 {
4396 	int rv;
4397 
4398 	if (pip == NULL) {
4399 		return (DDI_PROP_INVAL_ARG);
4400 	}
4401 	ASSERT(!MDI_PI_LOCKED(pip));
4402 	MDI_PI_LOCK(pip);
4403 	if (MDI_PI(pip)->pi_prop == NULL) {
4404 		MDI_PI_UNLOCK(pip);
4405 		return (DDI_PROP_NOT_FOUND);
4406 	}
4407 	rv = nvlist_add_string(MDI_PI(pip)->pi_prop, name, data);
4408 	MDI_PI_UNLOCK(pip);
4409 	return (i_map_nvlist_error_to_mdi(rv));
4410 }
4411 
4412 /*
4413  * mdi_prop_update_string_array():
4414  *		Create/Update a string array property
4415  */
4416 int
4417 mdi_prop_update_string_array(mdi_pathinfo_t *pip, char *name, char **data,
4418     uint_t nelements)
4419 {
4420 	int rv;
4421 
4422 	if (pip == NULL) {
4423 		return (DDI_PROP_INVAL_ARG);
4424 	}
4425 	ASSERT(!MDI_PI_LOCKED(pip));
4426 	MDI_PI_LOCK(pip);
4427 	if (MDI_PI(pip)->pi_prop == NULL) {
4428 		MDI_PI_UNLOCK(pip);
4429 		return (DDI_PROP_NOT_FOUND);
4430 	}
4431 	rv = nvlist_add_string_array(MDI_PI(pip)->pi_prop, name, data,
4432 	    nelements);
4433 	MDI_PI_UNLOCK(pip);
4434 	return (i_map_nvlist_error_to_mdi(rv));
4435 }
4436 
4437 /*
4438  * mdi_prop_lookup_byte():
4439  * 		Look for byte property identified by name.  The data returned
4440  *		is the actual property and valid as long as mdi_pathinfo_t node
4441  *		is alive.
4442  */
4443 int
4444 mdi_prop_lookup_byte(mdi_pathinfo_t *pip, char *name, uchar_t *data)
4445 {
4446 	int rv;
4447 
4448 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4449 		return (DDI_PROP_NOT_FOUND);
4450 	}
4451 	rv = nvlist_lookup_byte(MDI_PI(pip)->pi_prop, name, data);
4452 	return (i_map_nvlist_error_to_mdi(rv));
4453 }
4454 
4455 
4456 /*
4457  * mdi_prop_lookup_byte_array():
4458  * 		Look for byte array property identified by name.  The data
4459  *		returned is the actual property and valid as long as
4460  *		mdi_pathinfo_t node is alive.
4461  */
4462 int
4463 mdi_prop_lookup_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t **data,
4464     uint_t *nelements)
4465 {
4466 	int rv;
4467 
4468 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4469 		return (DDI_PROP_NOT_FOUND);
4470 	}
4471 	rv = nvlist_lookup_byte_array(MDI_PI(pip)->pi_prop, name, data,
4472 	    nelements);
4473 	return (i_map_nvlist_error_to_mdi(rv));
4474 }
4475 
4476 /*
4477  * mdi_prop_lookup_int():
4478  * 		Look for int property identified by name.  The data returned
4479  *		is the actual property and valid as long as mdi_pathinfo_t
4480  *		node is alive.
4481  */
4482 int
4483 mdi_prop_lookup_int(mdi_pathinfo_t *pip, char *name, int *data)
4484 {
4485 	int rv;
4486 
4487 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4488 		return (DDI_PROP_NOT_FOUND);
4489 	}
4490 	rv = nvlist_lookup_int32(MDI_PI(pip)->pi_prop, name, (int32_t *)data);
4491 	return (i_map_nvlist_error_to_mdi(rv));
4492 }
4493 
4494 /*
4495  * mdi_prop_lookup_int64():
4496  * 		Look for int64 property identified by name.  The data returned
4497  *		is the actual property and valid as long as mdi_pathinfo_t node
4498  *		is alive.
4499  */
4500 int
4501 mdi_prop_lookup_int64(mdi_pathinfo_t *pip, char *name, int64_t *data)
4502 {
4503 	int rv;
4504 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4505 		return (DDI_PROP_NOT_FOUND);
4506 	}
4507 	rv = nvlist_lookup_int64(MDI_PI(pip)->pi_prop, name, data);
4508 	return (i_map_nvlist_error_to_mdi(rv));
4509 }
4510 
4511 /*
4512  * mdi_prop_lookup_int_array():
4513  * 		Look for int array property identified by name.  The data
4514  *		returned is the actual property and valid as long as
4515  *		mdi_pathinfo_t node is alive.
4516  */
4517 int
4518 mdi_prop_lookup_int_array(mdi_pathinfo_t *pip, char *name, int **data,
4519     uint_t *nelements)
4520 {
4521 	int rv;
4522 
4523 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4524 		return (DDI_PROP_NOT_FOUND);
4525 	}
4526 	rv = nvlist_lookup_int32_array(MDI_PI(pip)->pi_prop, name,
4527 	    (int32_t **)data, nelements);
4528 	return (i_map_nvlist_error_to_mdi(rv));
4529 }
4530 
4531 /*
4532  * mdi_prop_lookup_string():
4533  * 		Look for string property identified by name.  The data
4534  *		returned is the actual property and valid as long as
4535  *		mdi_pathinfo_t node is alive.
4536  */
4537 int
4538 mdi_prop_lookup_string(mdi_pathinfo_t *pip, char *name, char **data)
4539 {
4540 	int rv;
4541 
4542 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4543 		return (DDI_PROP_NOT_FOUND);
4544 	}
4545 	rv = nvlist_lookup_string(MDI_PI(pip)->pi_prop, name, data);
4546 	return (i_map_nvlist_error_to_mdi(rv));
4547 }
4548 
4549 /*
4550  * mdi_prop_lookup_string_array():
4551  * 		Look for string array property identified by name.  The data
4552  *		returned is the actual property and valid as long as
4553  *		mdi_pathinfo_t node is alive.
4554  */
4555 int
4556 mdi_prop_lookup_string_array(mdi_pathinfo_t *pip, char *name, char ***data,
4557     uint_t *nelements)
4558 {
4559 	int rv;
4560 
4561 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4562 		return (DDI_PROP_NOT_FOUND);
4563 	}
4564 	rv = nvlist_lookup_string_array(MDI_PI(pip)->pi_prop, name, data,
4565 	    nelements);
4566 	return (i_map_nvlist_error_to_mdi(rv));
4567 }
4568 
4569 /*
4570  * mdi_prop_free():
4571  * 		Symmetrical function to ddi_prop_free(). nvlist_lookup_xx()
4572  *		functions return the pointer to actual property data and not a
4573  *		copy of it.  So the data returned is valid as long as
4574  *		mdi_pathinfo_t node is valid.
4575  */
4576 /*ARGSUSED*/
4577 int
4578 mdi_prop_free(void *data)
4579 {
4580 	return (DDI_PROP_SUCCESS);
4581 }
4582 
4583 /*ARGSUSED*/
4584 static void
4585 i_mdi_report_path_state(mdi_client_t *ct, mdi_pathinfo_t *pip)
4586 {
4587 	char		*phci_path, *ct_path;
4588 	char		*ct_status;
4589 	char		*status;
4590 	dev_info_t	*dip = ct->ct_dip;
4591 	char		lb_buf[64];
4592 
4593 	ASSERT(MDI_CLIENT_LOCKED(ct));
4594 	if ((dip == NULL) || (ddi_get_instance(dip) == -1) ||
4595 	    (MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) == 0)) {
4596 		return;
4597 	}
4598 	if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL) {
4599 		ct_status = "optimal";
4600 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
4601 		ct_status = "degraded";
4602 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
4603 		ct_status = "failed";
4604 	} else {
4605 		ct_status = "unknown";
4606 	}
4607 
4608 	if (MDI_PI_IS_OFFLINE(pip)) {
4609 		status = "offline";
4610 	} else if (MDI_PI_IS_ONLINE(pip)) {
4611 		status = "online";
4612 	} else if (MDI_PI_IS_STANDBY(pip)) {
4613 		status = "standby";
4614 	} else if (MDI_PI_IS_FAULT(pip)) {
4615 		status = "faulted";
4616 	} else {
4617 		status = "unknown";
4618 	}
4619 
4620 	if (ct->ct_lb == LOAD_BALANCE_LBA) {
4621 		(void) snprintf(lb_buf, sizeof (lb_buf),
4622 		    "%s, region-size: %d", mdi_load_balance_lba,
4623 			ct->ct_lb_args->region_size);
4624 	} else if (ct->ct_lb == LOAD_BALANCE_NONE) {
4625 		(void) snprintf(lb_buf, sizeof (lb_buf),
4626 		    "%s", mdi_load_balance_none);
4627 	} else {
4628 		(void) snprintf(lb_buf, sizeof (lb_buf), "%s",
4629 		    mdi_load_balance_rr);
4630 	}
4631 
4632 	if (dip) {
4633 		ct_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4634 		phci_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4635 		cmn_err(CE_CONT, "?%s (%s%d) multipath status: %s, "
4636 		    "path %s (%s%d) to target address: %s is %s"
4637 		    " Load balancing: %s\n",
4638 		    ddi_pathname(dip, ct_path), ddi_driver_name(dip),
4639 		    ddi_get_instance(dip), ct_status,
4640 		    ddi_pathname(MDI_PI(pip)->pi_phci->ph_dip, phci_path),
4641 		    ddi_driver_name(MDI_PI(pip)->pi_phci->ph_dip),
4642 		    ddi_get_instance(MDI_PI(pip)->pi_phci->ph_dip),
4643 		    MDI_PI(pip)->pi_addr, status, lb_buf);
4644 		kmem_free(phci_path, MAXPATHLEN);
4645 		kmem_free(ct_path, MAXPATHLEN);
4646 		MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct);
4647 	}
4648 }
4649 
#ifdef	DEBUG
/*
 * i_mdi_log():
 *		Debug message helper (DEBUG kernels only).  The message is
 *		formatted into a MAXNAMELEN sized buffer and handed to
 *		cmn_err() prefixed with "mdi: " and, when dip is not NULL,
 *		"<node name><instance>: ".
 *
 *		A leading '!', '?' or '^' in the formatted message is removed
 *		from the text and re-applied in front of the "mdi: " prefix
 *		for CE_CONT, CE_WARN and CE_PANIC messages.  Setting
 *		mdi_debug_logonly forces '!' regardless of what the message
 *		asked for.  CE_NOTE is issued as CE_CONT.  Any other level is
 *		issued without a leading character.
 */
/*PRINTFLIKE3*/
static void
i_mdi_log(int level, dev_info_t *dip, const char *fmt, ...)
{
	char		prefix[MAXNAMELEN];
	char		msg[MAXNAMELEN];
	char		*text = msg;
	char		disp = '\0';	/* '!', '?', '^' or none */
	va_list		args;

	if (dip != NULL) {
		(void) snprintf(prefix, MAXNAMELEN, "%s%d: ",
		    ddi_node_name(dip), ddi_get_instance(dip));
	} else {
		prefix[0] = 0;
	}

	va_start(args, fmt);
	(void) vsnprintf(msg, MAXNAMELEN, fmt, args);
	va_end(args);

	/* Peel the leading character off the text. */
	switch (msg[0]) {
	case '!':
	case '?':
	case '^':
		disp = msg[0];
		text = &msg[1];
		break;
	default:
		break;
	}

	/* The tunable overrides whatever the message requested. */
	if (mdi_debug_logonly)
		disp = '!';

	if (level == CE_NOTE)
		level = CE_CONT;

	/* Only these levels get the leading character re-applied. */
	if ((level != CE_CONT) && (level != CE_WARN) && (level != CE_PANIC))
		disp = '\0';

	switch (disp) {
	case '?':
		cmn_err(level, "?mdi: %s%s", prefix, text);
		break;
	case '^':
		cmn_err(level, "^mdi: %s%s", prefix, text);
		break;
	case '!':
		cmn_err(level, "!mdi: %s%s", prefix, text);
		break;
	default:
		cmn_err(level, "mdi: %s%s", prefix, text);
		break;
	}
}
#endif	/* DEBUG */
4725 
4726 void
4727 i_mdi_client_online(dev_info_t *ct_dip)
4728 {
4729 	mdi_client_t	*ct;
4730 
4731 	/*
4732 	 * Client online notification. Mark client state as online
4733 	 * restore our binding with dev_info node
4734 	 */
4735 	ct = i_devi_get_client(ct_dip);
4736 	ASSERT(ct != NULL);
4737 	MDI_CLIENT_LOCK(ct);
4738 	MDI_CLIENT_SET_ONLINE(ct);
4739 	/* catch for any memory leaks */
4740 	ASSERT((ct->ct_dip == NULL) || (ct->ct_dip == ct_dip));
4741 	ct->ct_dip = ct_dip;
4742 
4743 	if (ct->ct_power_cnt == 0)
4744 		(void) i_mdi_power_all_phci(ct);
4745 
4746 	MDI_DEBUG(4, (CE_NOTE, ct_dip, "i_mdi_client_online "
4747 	    "i_mdi_pm_hold_client %p\n", (void *)ct));
4748 	i_mdi_pm_hold_client(ct, 1);
4749 
4750 	MDI_CLIENT_UNLOCK(ct);
4751 }
4752 
4753 void
4754 i_mdi_phci_online(dev_info_t *ph_dip)
4755 {
4756 	mdi_phci_t	*ph;
4757 
4758 	/* pHCI online notification. Mark state accordingly */
4759 	ph = i_devi_get_phci(ph_dip);
4760 	ASSERT(ph != NULL);
4761 	MDI_PHCI_LOCK(ph);
4762 	MDI_PHCI_SET_ONLINE(ph);
4763 	MDI_PHCI_UNLOCK(ph);
4764 }
4765 
4766 /*
4767  * mdi_devi_online():
4768  * 		Online notification from NDI framework on pHCI/client
4769  *		device online.
4770  * Return Values:
4771  *		NDI_SUCCESS
4772  *		MDI_FAILURE
4773  */
4774 /*ARGSUSED*/
4775 int
4776 mdi_devi_online(dev_info_t *dip, uint_t flags)
4777 {
4778 	if (MDI_PHCI(dip)) {
4779 		i_mdi_phci_online(dip);
4780 	}
4781 
4782 	if (MDI_CLIENT(dip)) {
4783 		i_mdi_client_online(dip);
4784 	}
4785 	return (NDI_SUCCESS);
4786 }
4787 
4788 /*
4789  * mdi_devi_offline():
4790  * 		Offline notification from NDI framework on pHCI/Client device
4791  *		offline.
4792  *
4793  * Return Values:
4794  *		NDI_SUCCESS
4795  *		NDI_FAILURE
4796  */
4797 /*ARGSUSED*/
4798 int
4799 mdi_devi_offline(dev_info_t *dip, uint_t flags)
4800 {
4801 	int		rv = NDI_SUCCESS;
4802 
4803 	if (MDI_CLIENT(dip)) {
4804 		rv = i_mdi_client_offline(dip, flags);
4805 		if (rv != NDI_SUCCESS)
4806 			return (rv);
4807 	}
4808 
4809 	if (MDI_PHCI(dip)) {
4810 		rv = i_mdi_phci_offline(dip, flags);
4811 
4812 		if ((rv != NDI_SUCCESS) && MDI_CLIENT(dip)) {
4813 			/* set client back online */
4814 			i_mdi_client_online(dip);
4815 		}
4816 	}
4817 
4818 	return (rv);
4819 }
4820 
4821 /*ARGSUSED*/
4822 static int
4823 i_mdi_phci_offline(dev_info_t *dip, uint_t flags)
4824 {
4825 	int		rv = NDI_SUCCESS;
4826 	mdi_phci_t	*ph;
4827 	mdi_client_t	*ct;
4828 	mdi_pathinfo_t	*pip;
4829 	mdi_pathinfo_t	*next;
4830 	mdi_pathinfo_t	*failed_pip = NULL;
4831 	dev_info_t	*cdip;
4832 
4833 	/*
4834 	 * pHCI component offline notification
4835 	 * Make sure that this pHCI instance is free to be offlined.
4836 	 * If it is OK to proceed, Offline and remove all the child
4837 	 * mdi_pathinfo nodes.  This process automatically offlines
4838 	 * corresponding client devices, for which this pHCI provides
4839 	 * critical services.
4840 	 */
4841 	ph = i_devi_get_phci(dip);
4842 	MDI_DEBUG(2, (CE_NOTE, dip, "!mdi_phci_offline called %p %p\n",
4843 	    (void *)dip, (void *)ph));
4844 	if (ph == NULL) {
4845 		return (rv);
4846 	}
4847 
4848 	MDI_PHCI_LOCK(ph);
4849 
4850 	if (MDI_PHCI_IS_OFFLINE(ph)) {
4851 		MDI_DEBUG(1, (CE_WARN, dip, "!pHCI %p already offlined",
4852 		    (void *)ph));
4853 		MDI_PHCI_UNLOCK(ph);
4854 		return (NDI_SUCCESS);
4855 	}
4856 
4857 	/*
4858 	 * Check to see if the pHCI can be offlined
4859 	 */
4860 	if (ph->ph_unstable) {
4861 		MDI_DEBUG(1, (CE_WARN, dip,
4862 		    "!One or more target devices are in transient "
4863 		    "state. This device can not be removed at "
4864 		    "this moment. Please try again later."));
4865 		MDI_PHCI_UNLOCK(ph);
4866 		return (NDI_BUSY);
4867 	}
4868 
4869 	pip = ph->ph_path_head;
4870 	while (pip != NULL) {
4871 		MDI_PI_LOCK(pip);
4872 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4873 
4874 		/*
4875 		 * The mdi_pathinfo state is OK. Check the client state.
4876 		 * If failover in progress fail the pHCI from offlining
4877 		 */
4878 		ct = MDI_PI(pip)->pi_client;
4879 		i_mdi_client_lock(ct, pip);
4880 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
4881 		    (ct->ct_unstable)) {
4882 			/*
4883 			 * Failover is in progress, Fail the DR
4884 			 */
4885 			MDI_DEBUG(1, (CE_WARN, dip,
4886 			    "!pHCI device (%s%d) is Busy. %s",
4887 			    ddi_driver_name(dip), ddi_get_instance(dip),
4888 			    "This device can not be removed at "
4889 			    "this moment. Please try again later."));
4890 			MDI_PI_UNLOCK(pip);
4891 			i_mdi_client_unlock(ct);
4892 			MDI_PHCI_UNLOCK(ph);
4893 			return (NDI_BUSY);
4894 		}
4895 		MDI_PI_UNLOCK(pip);
4896 
4897 		/*
4898 		 * Check to see of we are removing the last path of this
4899 		 * client device...
4900 		 */
4901 		cdip = ct->ct_dip;
4902 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
4903 		    (i_mdi_client_compute_state(ct, ph) ==
4904 		    MDI_CLIENT_STATE_FAILED)) {
4905 			i_mdi_client_unlock(ct);
4906 			MDI_PHCI_UNLOCK(ph);
4907 			if (ndi_devi_offline(cdip, 0) != NDI_SUCCESS) {
4908 				/*
4909 				 * ndi_devi_offline() failed.
4910 				 * This pHCI provides the critical path
4911 				 * to one or more client devices.
4912 				 * Return busy.
4913 				 */
4914 				MDI_PHCI_LOCK(ph);
4915 				MDI_DEBUG(1, (CE_WARN, dip,
4916 				    "!pHCI device (%s%d) is Busy. %s",
4917 				    ddi_driver_name(dip), ddi_get_instance(dip),
4918 				    "This device can not be removed at "
4919 				    "this moment. Please try again later."));
4920 				failed_pip = pip;
4921 				break;
4922 			} else {
4923 				MDI_PHCI_LOCK(ph);
4924 				pip = next;
4925 			}
4926 		} else {
4927 			i_mdi_client_unlock(ct);
4928 			pip = next;
4929 		}
4930 	}
4931 
4932 	if (failed_pip) {
4933 		pip = ph->ph_path_head;
4934 		while (pip != failed_pip) {
4935 			MDI_PI_LOCK(pip);
4936 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4937 			ct = MDI_PI(pip)->pi_client;
4938 			i_mdi_client_lock(ct, pip);
4939 			cdip = ct->ct_dip;
4940 			switch (MDI_CLIENT_STATE(ct)) {
4941 			case MDI_CLIENT_STATE_OPTIMAL:
4942 			case MDI_CLIENT_STATE_DEGRADED:
4943 				if (cdip) {
4944 					MDI_PI_UNLOCK(pip);
4945 					i_mdi_client_unlock(ct);
4946 					MDI_PHCI_UNLOCK(ph);
4947 					(void) ndi_devi_online(cdip, 0);
4948 					MDI_PHCI_LOCK(ph);
4949 					pip = next;
4950 					continue;
4951 				}
4952 				break;
4953 
4954 			case MDI_CLIENT_STATE_FAILED:
4955 				if (cdip) {
4956 					MDI_PI_UNLOCK(pip);
4957 					i_mdi_client_unlock(ct);
4958 					MDI_PHCI_UNLOCK(ph);
4959 					(void) ndi_devi_offline(cdip, 0);
4960 					MDI_PHCI_LOCK(ph);
4961 					pip = next;
4962 					continue;
4963 				}
4964 				break;
4965 			}
4966 			MDI_PI_UNLOCK(pip);
4967 			i_mdi_client_unlock(ct);
4968 			pip = next;
4969 		}
4970 		MDI_PHCI_UNLOCK(ph);
4971 		return (NDI_BUSY);
4972 	}
4973 
4974 	/*
4975 	 * Mark the pHCI as offline
4976 	 */
4977 	MDI_PHCI_SET_OFFLINE(ph);
4978 
4979 	/*
4980 	 * Mark the child mdi_pathinfo nodes as transient
4981 	 */
4982 	pip = ph->ph_path_head;
4983 	while (pip != NULL) {
4984 		MDI_PI_LOCK(pip);
4985 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4986 		MDI_PI_SET_OFFLINING(pip);
4987 		MDI_PI_UNLOCK(pip);
4988 		pip = next;
4989 	}
4990 	MDI_PHCI_UNLOCK(ph);
4991 	/*
4992 	 * Give a chance for any pending commands to execute
4993 	 */
4994 	delay(1);
4995 	MDI_PHCI_LOCK(ph);
4996 	pip = ph->ph_path_head;
4997 	while (pip != NULL) {
4998 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4999 		(void) i_mdi_pi_offline(pip, flags);
5000 		MDI_PI_LOCK(pip);
5001 		ct = MDI_PI(pip)->pi_client;
5002 		if (!MDI_PI_IS_OFFLINE(pip)) {
5003 			MDI_DEBUG(1, (CE_WARN, dip,
5004 			    "!pHCI device (%s%d) is Busy. %s",
5005 			    ddi_driver_name(dip), ddi_get_instance(dip),
5006 			    "This device can not be removed at "
5007 			    "this moment. Please try again later."));
5008 			MDI_PI_UNLOCK(pip);
5009 			MDI_PHCI_SET_ONLINE(ph);
5010 			MDI_PHCI_UNLOCK(ph);
5011 			return (NDI_BUSY);
5012 		}
5013 		MDI_PI_UNLOCK(pip);
5014 		pip = next;
5015 	}
5016 	MDI_PHCI_UNLOCK(ph);
5017 
5018 	return (rv);
5019 }
5020 
5021 void
5022 mdi_phci_mark_retiring(dev_info_t *dip, char **cons_array)
5023 {
5024 	mdi_phci_t	*ph;
5025 	mdi_client_t	*ct;
5026 	mdi_pathinfo_t	*pip;
5027 	mdi_pathinfo_t	*next;
5028 	dev_info_t	*cdip;
5029 
5030 	if (!MDI_PHCI(dip))
5031 		return;
5032 
5033 	ph = i_devi_get_phci(dip);
5034 	if (ph == NULL) {
5035 		return;
5036 	}
5037 
5038 	MDI_PHCI_LOCK(ph);
5039 
5040 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5041 		/* has no last path */
5042 		MDI_PHCI_UNLOCK(ph);
5043 		return;
5044 	}
5045 
5046 	pip = ph->ph_path_head;
5047 	while (pip != NULL) {
5048 		MDI_PI_LOCK(pip);
5049 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5050 
5051 		ct = MDI_PI(pip)->pi_client;
5052 		i_mdi_client_lock(ct, pip);
5053 		MDI_PI_UNLOCK(pip);
5054 
5055 		cdip = ct->ct_dip;
5056 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5057 		    (i_mdi_client_compute_state(ct, ph) ==
5058 		    MDI_CLIENT_STATE_FAILED)) {
5059 			/* Last path. Mark client dip as retiring */
5060 			i_mdi_client_unlock(ct);
5061 			MDI_PHCI_UNLOCK(ph);
5062 			(void) e_ddi_mark_retiring(cdip, cons_array);
5063 			MDI_PHCI_LOCK(ph);
5064 			pip = next;
5065 		} else {
5066 			i_mdi_client_unlock(ct);
5067 			pip = next;
5068 		}
5069 	}
5070 
5071 	MDI_PHCI_UNLOCK(ph);
5072 
5073 	return;
5074 }
5075 
5076 void
5077 mdi_phci_retire_notify(dev_info_t *dip, int *constraint)
5078 {
5079 	mdi_phci_t	*ph;
5080 	mdi_client_t	*ct;
5081 	mdi_pathinfo_t	*pip;
5082 	mdi_pathinfo_t	*next;
5083 	dev_info_t	*cdip;
5084 
5085 	if (!MDI_PHCI(dip))
5086 		return;
5087 
5088 	ph = i_devi_get_phci(dip);
5089 	if (ph == NULL)
5090 		return;
5091 
5092 	MDI_PHCI_LOCK(ph);
5093 
5094 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5095 		MDI_PHCI_UNLOCK(ph);
5096 		/* not last path */
5097 		return;
5098 	}
5099 
5100 	if (ph->ph_unstable) {
5101 		MDI_PHCI_UNLOCK(ph);
5102 		/* can't check for constraints */
5103 		*constraint = 0;
5104 		return;
5105 	}
5106 
5107 	pip = ph->ph_path_head;
5108 	while (pip != NULL) {
5109 		MDI_PI_LOCK(pip);
5110 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5111 
5112 		/*
5113 		 * The mdi_pathinfo state is OK. Check the client state.
5114 		 * If failover in progress fail the pHCI from offlining
5115 		 */
5116 		ct = MDI_PI(pip)->pi_client;
5117 		i_mdi_client_lock(ct, pip);
5118 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5119 		    (ct->ct_unstable)) {
5120 			/*
5121 			 * Failover is in progress, can't check for constraints
5122 			 */
5123 			MDI_PI_UNLOCK(pip);
5124 			i_mdi_client_unlock(ct);
5125 			MDI_PHCI_UNLOCK(ph);
5126 			*constraint = 0;
5127 			return;
5128 		}
5129 		MDI_PI_UNLOCK(pip);
5130 
5131 		/*
5132 		 * Check to see of we are retiring the last path of this
5133 		 * client device...
5134 		 */
5135 		cdip = ct->ct_dip;
5136 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5137 		    (i_mdi_client_compute_state(ct, ph) ==
5138 		    MDI_CLIENT_STATE_FAILED)) {
5139 			i_mdi_client_unlock(ct);
5140 			MDI_PHCI_UNLOCK(ph);
5141 			(void) e_ddi_retire_notify(cdip, constraint);
5142 			MDI_PHCI_LOCK(ph);
5143 			pip = next;
5144 		} else {
5145 			i_mdi_client_unlock(ct);
5146 			pip = next;
5147 		}
5148 	}
5149 
5150 	MDI_PHCI_UNLOCK(ph);
5151 
5152 	return;
5153 }
5154 
5155 /*
5156  * offline the path(s) hanging off the PHCI. If the
5157  * last path to any client, check that constraints
5158  * have been applied.
5159  */
5160 void
5161 mdi_phci_retire_finalize(dev_info_t *dip, int phci_only)
5162 {
5163 	mdi_phci_t	*ph;
5164 	mdi_client_t	*ct;
5165 	mdi_pathinfo_t	*pip;
5166 	mdi_pathinfo_t	*next;
5167 	dev_info_t	*cdip;
5168 	int		unstable = 0;
5169 	int		constraint;
5170 
5171 	if (!MDI_PHCI(dip))
5172 		return;
5173 
5174 	ph = i_devi_get_phci(dip);
5175 	if (ph == NULL) {
5176 		/* no last path and no pips */
5177 		return;
5178 	}
5179 
5180 	MDI_PHCI_LOCK(ph);
5181 
5182 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5183 		MDI_PHCI_UNLOCK(ph);
5184 		/* no last path and no pips */
5185 		return;
5186 	}
5187 
5188 	/*
5189 	 * Check to see if the pHCI can be offlined
5190 	 */
5191 	if (ph->ph_unstable) {
5192 		unstable = 1;
5193 	}
5194 
5195 	pip = ph->ph_path_head;
5196 	while (pip != NULL) {
5197 		MDI_PI_LOCK(pip);
5198 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5199 
5200 		/*
5201 		 * if failover in progress fail the pHCI from offlining
5202 		 */
5203 		ct = MDI_PI(pip)->pi_client;
5204 		i_mdi_client_lock(ct, pip);
5205 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5206 		    (ct->ct_unstable)) {
5207 			unstable = 1;
5208 		}
5209 		MDI_PI_UNLOCK(pip);
5210 
5211 		/*
5212 		 * Check to see of we are removing the last path of this
5213 		 * client device...
5214 		 */
5215 		cdip = ct->ct_dip;
5216 		if (!phci_only && cdip &&
5217 		    (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5218 		    (i_mdi_client_compute_state(ct, ph) ==
5219 		    MDI_CLIENT_STATE_FAILED)) {
5220 			i_mdi_client_unlock(ct);
5221 			MDI_PHCI_UNLOCK(ph);
5222 			/*
5223 			 * We don't retire clients we just retire the
5224 			 * path to a client. If it is the last path
5225 			 * to a client, constraints are checked and
5226 			 * if we pass the last path is offlined. MPXIO will
5227 			 * then fail all I/Os to the client. Since we don't
5228 			 * want to retire the client on a path error
5229 			 * set constraint = 0 so that the client dip
5230 			 * is not retired.
5231 			 */
5232 			constraint = 0;
5233 			(void) e_ddi_retire_finalize(cdip, &constraint);
5234 			MDI_PHCI_LOCK(ph);
5235 			pip = next;
5236 		} else {
5237 			i_mdi_client_unlock(ct);
5238 			pip = next;
5239 		}
5240 	}
5241 
5242 	/*
5243 	 * Cannot offline pip(s)
5244 	 */
5245 	if (unstable) {
5246 		cmn_err(CE_WARN, "PHCI in transient state, cannot "
5247 		    "retire, dip = %p", (void *)dip);
5248 		MDI_PHCI_UNLOCK(ph);
5249 		return;
5250 	}
5251 
5252 	/*
5253 	 * Mark the pHCI as offline
5254 	 */
5255 	MDI_PHCI_SET_OFFLINE(ph);
5256 
5257 	/*
5258 	 * Mark the child mdi_pathinfo nodes as transient
5259 	 */
5260 	pip = ph->ph_path_head;
5261 	while (pip != NULL) {
5262 		MDI_PI_LOCK(pip);
5263 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5264 		MDI_PI_SET_OFFLINING(pip);
5265 		MDI_PI_UNLOCK(pip);
5266 		pip = next;
5267 	}
5268 	MDI_PHCI_UNLOCK(ph);
5269 	/*
5270 	 * Give a chance for any pending commands to execute
5271 	 */
5272 	delay(1);
5273 	MDI_PHCI_LOCK(ph);
5274 	pip = ph->ph_path_head;
5275 	while (pip != NULL) {
5276 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5277 		(void) i_mdi_pi_offline(pip, 0);
5278 		MDI_PI_LOCK(pip);
5279 		ct = MDI_PI(pip)->pi_client;
5280 		if (!MDI_PI_IS_OFFLINE(pip)) {
5281 			cmn_err(CE_WARN, "PHCI busy, cannot offline path: "
5282 			    "PHCI dip = %p", (void *)dip);
5283 			MDI_PI_UNLOCK(pip);
5284 			MDI_PHCI_SET_ONLINE(ph);
5285 			MDI_PHCI_UNLOCK(ph);
5286 			return;
5287 		}
5288 		MDI_PI_UNLOCK(pip);
5289 		pip = next;
5290 	}
5291 	MDI_PHCI_UNLOCK(ph);
5292 
5293 	return;
5294 }
5295 
5296 void
5297 mdi_phci_unretire(dev_info_t *dip)
5298 {
5299 	ASSERT(MDI_PHCI(dip));
5300 
5301 	/*
5302 	 * Online the phci
5303 	 */
5304 	i_mdi_phci_online(dip);
5305 }
5306 
5307 /*ARGSUSED*/
5308 static int
5309 i_mdi_client_offline(dev_info_t *dip, uint_t flags)
5310 {
5311 	int		rv = NDI_SUCCESS;
5312 	mdi_client_t	*ct;
5313 
5314 	/*
5315 	 * Client component to go offline.  Make sure that we are
5316 	 * not in failing over state and update client state
5317 	 * accordingly
5318 	 */
5319 	ct = i_devi_get_client(dip);
5320 	MDI_DEBUG(2, (CE_NOTE, dip, "!i_mdi_client_offline called %p %p\n",
5321 	    (void *)dip, (void *)ct));
5322 	if (ct != NULL) {
5323 		MDI_CLIENT_LOCK(ct);
5324 		if (ct->ct_unstable) {
5325 			/*
5326 			 * One or more paths are in transient state,
5327 			 * Dont allow offline of a client device
5328 			 */
5329 			MDI_DEBUG(1, (CE_WARN, dip,
5330 			    "!One or more paths to this device is "
5331 			    "in transient state. This device can not "
5332 			    "be removed at this moment. "
5333 			    "Please try again later."));
5334 			MDI_CLIENT_UNLOCK(ct);
5335 			return (NDI_BUSY);
5336 		}
5337 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
5338 			/*
5339 			 * Failover is in progress, Dont allow DR of
5340 			 * a client device
5341 			 */
5342 			MDI_DEBUG(1, (CE_WARN, dip,
5343 			    "!Client device (%s%d) is Busy. %s",
5344 			    ddi_driver_name(dip), ddi_get_instance(dip),
5345 			    "This device can not be removed at "
5346 			    "this moment. Please try again later."));
5347 			MDI_CLIENT_UNLOCK(ct);
5348 			return (NDI_BUSY);
5349 		}
5350 		MDI_CLIENT_SET_OFFLINE(ct);
5351 
5352 		/*
5353 		 * Unbind our relationship with the dev_info node
5354 		 */
5355 		if (flags & NDI_DEVI_REMOVE) {
5356 			ct->ct_dip = NULL;
5357 		}
5358 		MDI_CLIENT_UNLOCK(ct);
5359 	}
5360 	return (rv);
5361 }
5362 
5363 /*
5364  * mdi_pre_attach():
5365  *		Pre attach() notification handler
5366  */
5367 /*ARGSUSED*/
5368 int
5369 mdi_pre_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5370 {
5371 	/* don't support old DDI_PM_RESUME */
5372 	if ((DEVI(dip)->devi_mdi_component != MDI_COMPONENT_NONE) &&
5373 	    (cmd == DDI_PM_RESUME))
5374 		return (DDI_FAILURE);
5375 
5376 	return (DDI_SUCCESS);
5377 }
5378 
5379 /*
5380  * mdi_post_attach():
5381  *		Post attach() notification handler
5382  */
5383 /*ARGSUSED*/
5384 void
5385 mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
5386 {
5387 	mdi_phci_t	*ph;
5388 	mdi_client_t	*ct;
5389 	mdi_vhci_t	*vh;
5390 
5391 	if (MDI_PHCI(dip)) {
5392 		ph = i_devi_get_phci(dip);
5393 		ASSERT(ph != NULL);
5394 
5395 		MDI_PHCI_LOCK(ph);
5396 		switch (cmd) {
5397 		case DDI_ATTACH:
5398 			MDI_DEBUG(2, (CE_NOTE, dip,
5399 			    "!pHCI post_attach: called %p\n", (void *)ph));
5400 			if (error == DDI_SUCCESS) {
5401 				MDI_PHCI_SET_ATTACH(ph);
5402 			} else {
5403 				MDI_DEBUG(1, (CE_NOTE, dip,
5404 				    "!pHCI post_attach: failed error=%d\n",
5405 				    error));
5406 				MDI_PHCI_SET_DETACH(ph);
5407 			}
5408 			break;
5409 
5410 		case DDI_RESUME:
5411 			MDI_DEBUG(2, (CE_NOTE, dip,
5412 			    "!pHCI post_resume: called %p\n", (void *)ph));
5413 			if (error == DDI_SUCCESS) {
5414 				MDI_PHCI_SET_RESUME(ph);
5415 			} else {
5416 				MDI_DEBUG(1, (CE_NOTE, dip,
5417 				    "!pHCI post_resume: failed error=%d\n",
5418 				    error));
5419 				MDI_PHCI_SET_SUSPEND(ph);
5420 			}
5421 			break;
5422 		}
5423 		MDI_PHCI_UNLOCK(ph);
5424 	}
5425 
5426 	if (MDI_CLIENT(dip)) {
5427 		ct = i_devi_get_client(dip);
5428 		ASSERT(ct != NULL);
5429 
5430 		MDI_CLIENT_LOCK(ct);
5431 		switch (cmd) {
5432 		case DDI_ATTACH:
5433 			MDI_DEBUG(2, (CE_NOTE, dip,
5434 			    "!Client post_attach: called %p\n", (void *)ct));
5435 			if (error != DDI_SUCCESS) {
5436 				MDI_DEBUG(1, (CE_NOTE, dip,
5437 				    "!Client post_attach: failed error=%d\n",
5438 				    error));
5439 				MDI_CLIENT_SET_DETACH(ct);
5440 				MDI_DEBUG(4, (CE_WARN, dip,
5441 				    "mdi_post_attach i_mdi_pm_reset_client\n"));
5442 				i_mdi_pm_reset_client(ct);
5443 				break;
5444 			}
5445 
5446 			/*
5447 			 * Client device has successfully attached, inform
5448 			 * the vhci.
5449 			 */
5450 			vh = ct->ct_vhci;
5451 			if (vh->vh_ops->vo_client_attached)
5452 				(*vh->vh_ops->vo_client_attached)(dip);
5453 
5454 			MDI_CLIENT_SET_ATTACH(ct);
5455 			break;
5456 
5457 		case DDI_RESUME:
5458 			MDI_DEBUG(2, (CE_NOTE, dip,
5459 			    "!Client post_attach: called %p\n", (void *)ct));
5460 			if (error == DDI_SUCCESS) {
5461 				MDI_CLIENT_SET_RESUME(ct);
5462 			} else {
5463 				MDI_DEBUG(1, (CE_NOTE, dip,
5464 				    "!Client post_resume: failed error=%d\n",
5465 				    error));
5466 				MDI_CLIENT_SET_SUSPEND(ct);
5467 			}
5468 			break;
5469 		}
5470 		MDI_CLIENT_UNLOCK(ct);
5471 	}
5472 }
5473 
5474 /*
5475  * mdi_pre_detach():
5476  *		Pre detach notification handler
5477  */
5478 /*ARGSUSED*/
5479 int
5480 mdi_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5481 {
5482 	int rv = DDI_SUCCESS;
5483 
5484 	if (MDI_CLIENT(dip)) {
5485 		(void) i_mdi_client_pre_detach(dip, cmd);
5486 	}
5487 
5488 	if (MDI_PHCI(dip)) {
5489 		rv = i_mdi_phci_pre_detach(dip, cmd);
5490 	}
5491 
5492 	return (rv);
5493 }
5494 
5495 /*ARGSUSED*/
5496 static int
5497 i_mdi_phci_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5498 {
5499 	int		rv = DDI_SUCCESS;
5500 	mdi_phci_t	*ph;
5501 	mdi_client_t	*ct;
5502 	mdi_pathinfo_t	*pip;
5503 	mdi_pathinfo_t	*failed_pip = NULL;
5504 	mdi_pathinfo_t	*next;
5505 
5506 	ph = i_devi_get_phci(dip);
5507 	if (ph == NULL) {
5508 		return (rv);
5509 	}
5510 
5511 	MDI_PHCI_LOCK(ph);
5512 	switch (cmd) {
5513 	case DDI_DETACH:
5514 		MDI_DEBUG(2, (CE_NOTE, dip,
5515 		    "!pHCI pre_detach: called %p\n", (void *)ph));
5516 		if (!MDI_PHCI_IS_OFFLINE(ph)) {
5517 			/*
5518 			 * mdi_pathinfo nodes are still attached to
5519 			 * this pHCI. Fail the detach for this pHCI.
5520 			 */
5521 			MDI_DEBUG(2, (CE_WARN, dip,
5522 			    "!pHCI pre_detach: "
5523 			    "mdi_pathinfo nodes are still attached "
5524 			    "%p\n", (void *)ph));
5525 			rv = DDI_FAILURE;
5526 			break;
5527 		}
5528 		MDI_PHCI_SET_DETACH(ph);
5529 		break;
5530 
5531 	case DDI_SUSPEND:
5532 		/*
5533 		 * pHCI is getting suspended.  Since mpxio client
5534 		 * devices may not be suspended at this point, to avoid
5535 		 * a potential stack overflow, it is important to suspend
5536 		 * client devices before pHCI can be suspended.
5537 		 */
5538 
5539 		MDI_DEBUG(2, (CE_NOTE, dip,
5540 		    "!pHCI pre_suspend: called %p\n", (void *)ph));
5541 		/*
5542 		 * Suspend all the client devices accessible through this pHCI
5543 		 */
5544 		pip = ph->ph_path_head;
5545 		while (pip != NULL && rv == DDI_SUCCESS) {
5546 			dev_info_t *cdip;
5547 			MDI_PI_LOCK(pip);
5548 			next =
5549 			    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5550 			ct = MDI_PI(pip)->pi_client;
5551 			i_mdi_client_lock(ct, pip);
5552 			cdip = ct->ct_dip;
5553 			MDI_PI_UNLOCK(pip);
5554 			if ((MDI_CLIENT_IS_DETACHED(ct) == 0) &&
5555 			    MDI_CLIENT_IS_SUSPENDED(ct) == 0) {
5556 				i_mdi_client_unlock(ct);
5557 				if ((rv = devi_detach(cdip, DDI_SUSPEND)) !=
5558 				    DDI_SUCCESS) {
5559 					/*
5560 					 * Suspend of one of the client
5561 					 * device has failed.
5562 					 */
5563 					MDI_DEBUG(1, (CE_WARN, dip,
5564 					    "!Suspend of device (%s%d) failed.",
5565 					    ddi_driver_name(cdip),
5566 					    ddi_get_instance(cdip)));
5567 					failed_pip = pip;
5568 					break;
5569 				}
5570 			} else {
5571 				i_mdi_client_unlock(ct);
5572 			}
5573 			pip = next;
5574 		}
5575 
5576 		if (rv == DDI_SUCCESS) {
5577 			/*
5578 			 * Suspend of client devices is complete. Proceed
5579 			 * with pHCI suspend.
5580 			 */
5581 			MDI_PHCI_SET_SUSPEND(ph);
5582 		} else {
5583 			/*
5584 			 * Revert back all the suspended client device states
5585 			 * to converse.
5586 			 */
5587 			pip = ph->ph_path_head;
5588 			while (pip != failed_pip) {
5589 				dev_info_t *cdip;
5590 				MDI_PI_LOCK(pip);
5591 				next =
5592 				    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5593 				ct = MDI_PI(pip)->pi_client;
5594 				i_mdi_client_lock(ct, pip);
5595 				cdip = ct->ct_dip;
5596 				MDI_PI_UNLOCK(pip);
5597 				if (MDI_CLIENT_IS_SUSPENDED(ct)) {
5598 					i_mdi_client_unlock(ct);
5599 					(void) devi_attach(cdip, DDI_RESUME);
5600 				} else {
5601 					i_mdi_client_unlock(ct);
5602 				}
5603 				pip = next;
5604 			}
5605 		}
5606 		break;
5607 
5608 	default:
5609 		rv = DDI_FAILURE;
5610 		break;
5611 	}
5612 	MDI_PHCI_UNLOCK(ph);
5613 	return (rv);
5614 }
5615 
5616 /*ARGSUSED*/
5617 static int
5618 i_mdi_client_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5619 {
5620 	int		rv = DDI_SUCCESS;
5621 	mdi_client_t	*ct;
5622 
5623 	ct = i_devi_get_client(dip);
5624 	if (ct == NULL) {
5625 		return (rv);
5626 	}
5627 
5628 	MDI_CLIENT_LOCK(ct);
5629 	switch (cmd) {
5630 	case DDI_DETACH:
5631 		MDI_DEBUG(2, (CE_NOTE, dip,
5632 		    "!Client pre_detach: called %p\n", (void *)ct));
5633 		MDI_CLIENT_SET_DETACH(ct);
5634 		break;
5635 
5636 	case DDI_SUSPEND:
5637 		MDI_DEBUG(2, (CE_NOTE, dip,
5638 		    "!Client pre_suspend: called %p\n", (void *)ct));
5639 		MDI_CLIENT_SET_SUSPEND(ct);
5640 		break;
5641 
5642 	default:
5643 		rv = DDI_FAILURE;
5644 		break;
5645 	}
5646 	MDI_CLIENT_UNLOCK(ct);
5647 	return (rv);
5648 }
5649 
5650 /*
5651  * mdi_post_detach():
5652  *		Post detach notification handler
5653  */
5654 /*ARGSUSED*/
5655 void
5656 mdi_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5657 {
5658 	/*
5659 	 * Detach/Suspend of mpxio component failed. Update our state
5660 	 * too
5661 	 */
5662 	if (MDI_PHCI(dip))
5663 		i_mdi_phci_post_detach(dip, cmd, error);
5664 
5665 	if (MDI_CLIENT(dip))
5666 		i_mdi_client_post_detach(dip, cmd, error);
5667 }
5668 
5669 /*ARGSUSED*/
5670 static void
5671 i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5672 {
5673 	mdi_phci_t	*ph;
5674 
5675 	/*
5676 	 * Detach/Suspend of phci component failed. Update our state
5677 	 * too
5678 	 */
5679 	ph = i_devi_get_phci(dip);
5680 	if (ph == NULL) {
5681 		return;
5682 	}
5683 
5684 	MDI_PHCI_LOCK(ph);
5685 	/*
5686 	 * Detach of pHCI failed. Restore back converse
5687 	 * state
5688 	 */
5689 	switch (cmd) {
5690 	case DDI_DETACH:
5691 		MDI_DEBUG(2, (CE_NOTE, dip,
5692 		    "!pHCI post_detach: called %p\n", (void *)ph));
5693 		if (error != DDI_SUCCESS)
5694 			MDI_PHCI_SET_ATTACH(ph);
5695 		break;
5696 
5697 	case DDI_SUSPEND:
5698 		MDI_DEBUG(2, (CE_NOTE, dip,
5699 		    "!pHCI post_suspend: called %p\n", (void *)ph));
5700 		if (error != DDI_SUCCESS)
5701 			MDI_PHCI_SET_RESUME(ph);
5702 		break;
5703 	}
5704 	MDI_PHCI_UNLOCK(ph);
5705 }
5706 
5707 /*ARGSUSED*/
5708 static void
5709 i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5710 {
5711 	mdi_client_t	*ct;
5712 
5713 	ct = i_devi_get_client(dip);
5714 	if (ct == NULL) {
5715 		return;
5716 	}
5717 	MDI_CLIENT_LOCK(ct);
5718 	/*
5719 	 * Detach of Client failed. Restore back converse
5720 	 * state
5721 	 */
5722 	switch (cmd) {
5723 	case DDI_DETACH:
5724 		MDI_DEBUG(2, (CE_NOTE, dip,
5725 		    "!Client post_detach: called %p\n", (void *)ct));
5726 		if (DEVI_IS_ATTACHING(ct->ct_dip)) {
5727 			MDI_DEBUG(4, (CE_NOTE, dip, "i_mdi_client_post_detach "
5728 			    "i_mdi_pm_rele_client\n"));
5729 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
5730 		} else {
5731 			MDI_DEBUG(4, (CE_NOTE, dip, "i_mdi_client_post_detach "
5732 			    "i_mdi_pm_reset_client\n"));
5733 			i_mdi_pm_reset_client(ct);
5734 		}
5735 		if (error != DDI_SUCCESS)
5736 			MDI_CLIENT_SET_ATTACH(ct);
5737 		break;
5738 
5739 	case DDI_SUSPEND:
5740 		MDI_DEBUG(2, (CE_NOTE, dip,
5741 		    "!Client post_suspend: called %p\n", (void *)ct));
5742 		if (error != DDI_SUCCESS)
5743 			MDI_CLIENT_SET_RESUME(ct);
5744 		break;
5745 	}
5746 	MDI_CLIENT_UNLOCK(ct);
5747 }
5748 
5749 int
5750 mdi_pi_kstat_exists(mdi_pathinfo_t *pip)
5751 {
5752 	return (MDI_PI(pip)->pi_kstats ? 1 : 0);
5753 }
5754 
5755 /*
5756  * create and install per-path (client - pHCI) statistics
5757  * I/O stats supported: nread, nwritten, reads, and writes
5758  * Error stats - hard errors, soft errors, & transport errors
5759  */
5760 int
5761 mdi_pi_kstat_create(mdi_pathinfo_t *pip, char *ksname)
5762 {
5763 	kstat_t			*kiosp, *kerrsp;
5764 	struct pi_errs		*nsp;
5765 	struct mdi_pi_kstats	*mdi_statp;
5766 
5767 	if (MDI_PI(pip)->pi_kstats != NULL)
5768 		return (MDI_SUCCESS);
5769 
5770 	if ((kiosp = kstat_create("mdi", 0, ksname, "iopath",
5771 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
5772 		return (MDI_FAILURE);
5773 	}
5774 
5775 	(void) strcat(ksname, ",err");
5776 	kerrsp = kstat_create("mdi", 0, ksname, "iopath_errors",
5777 	    KSTAT_TYPE_NAMED,
5778 	    sizeof (struct pi_errs) / sizeof (kstat_named_t), 0);
5779 	if (kerrsp == NULL) {
5780 		kstat_delete(kiosp);
5781 		return (MDI_FAILURE);
5782 	}
5783 
5784 	nsp = (struct pi_errs *)kerrsp->ks_data;
5785 	kstat_named_init(&nsp->pi_softerrs, "Soft Errors", KSTAT_DATA_UINT32);
5786 	kstat_named_init(&nsp->pi_harderrs, "Hard Errors", KSTAT_DATA_UINT32);
5787 	kstat_named_init(&nsp->pi_transerrs, "Transport Errors",
5788 	    KSTAT_DATA_UINT32);
5789 	kstat_named_init(&nsp->pi_icnt_busy, "Interconnect Busy",
5790 	    KSTAT_DATA_UINT32);
5791 	kstat_named_init(&nsp->pi_icnt_errors, "Interconnect Errors",
5792 	    KSTAT_DATA_UINT32);
5793 	kstat_named_init(&nsp->pi_phci_rsrc, "pHCI No Resources",
5794 	    KSTAT_DATA_UINT32);
5795 	kstat_named_init(&nsp->pi_phci_localerr, "pHCI Local Errors",
5796 	    KSTAT_DATA_UINT32);
5797 	kstat_named_init(&nsp->pi_phci_invstate, "pHCI Invalid State",
5798 	    KSTAT_DATA_UINT32);
5799 	kstat_named_init(&nsp->pi_failedfrom, "Failed From",
5800 	    KSTAT_DATA_UINT32);
5801 	kstat_named_init(&nsp->pi_failedto, "Failed To", KSTAT_DATA_UINT32);
5802 
5803 	mdi_statp = kmem_alloc(sizeof (*mdi_statp), KM_SLEEP);
5804 	mdi_statp->pi_kstat_ref = 1;
5805 	mdi_statp->pi_kstat_iostats = kiosp;
5806 	mdi_statp->pi_kstat_errstats = kerrsp;
5807 	kstat_install(kiosp);
5808 	kstat_install(kerrsp);
5809 	MDI_PI(pip)->pi_kstats = mdi_statp;
5810 	return (MDI_SUCCESS);
5811 }
5812 
5813 /*
5814  * destroy per-path properties
5815  */
5816 static void
5817 i_mdi_pi_kstat_destroy(mdi_pathinfo_t *pip)
5818 {
5819 
5820 	struct mdi_pi_kstats *mdi_statp;
5821 
5822 	if (MDI_PI(pip)->pi_kstats == NULL)
5823 		return;
5824 	if ((mdi_statp = MDI_PI(pip)->pi_kstats) == NULL)
5825 		return;
5826 
5827 	MDI_PI(pip)->pi_kstats = NULL;
5828 
5829 	/*
5830 	 * the kstat may be shared between multiple pathinfo nodes
5831 	 * decrement this pathinfo's usage, removing the kstats
5832 	 * themselves when the last pathinfo reference is removed.
5833 	 */
5834 	ASSERT(mdi_statp->pi_kstat_ref > 0);
5835 	if (--mdi_statp->pi_kstat_ref != 0)
5836 		return;
5837 
5838 	kstat_delete(mdi_statp->pi_kstat_iostats);
5839 	kstat_delete(mdi_statp->pi_kstat_errstats);
5840 	kmem_free(mdi_statp, sizeof (*mdi_statp));
5841 }
5842 
5843 /*
5844  * update I/O paths KSTATS
5845  */
5846 void
5847 mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp)
5848 {
5849 	kstat_t *iostatp;
5850 	size_t xfer_cnt;
5851 
5852 	ASSERT(pip != NULL);
5853 
5854 	/*
5855 	 * I/O can be driven across a path prior to having path
5856 	 * statistics available, i.e. probe(9e).
5857 	 */
5858 	if (bp != NULL && MDI_PI(pip)->pi_kstats != NULL) {
5859 		iostatp = MDI_PI(pip)->pi_kstats->pi_kstat_iostats;
5860 		xfer_cnt = bp->b_bcount - bp->b_resid;
5861 		if (bp->b_flags & B_READ) {
5862 			KSTAT_IO_PTR(iostatp)->reads++;
5863 			KSTAT_IO_PTR(iostatp)->nread += xfer_cnt;
5864 		} else {
5865 			KSTAT_IO_PTR(iostatp)->writes++;
5866 			KSTAT_IO_PTR(iostatp)->nwritten += xfer_cnt;
5867 		}
5868 	}
5869 }
5870 
5871 /*
5872  * Enable the path(specific client/target/initiator)
5873  * Enabling a path means that MPxIO may select the enabled path for routing
5874  * future I/O requests, subject to other path state constraints.
5875  */
5876 int
5877 mdi_pi_enable_path(mdi_pathinfo_t *pip, int flags)
5878 {
5879 	mdi_phci_t	*ph;
5880 
5881 	ph = i_devi_get_phci(mdi_pi_get_phci(pip));
5882 	if (ph == NULL) {
5883 		MDI_DEBUG(1, (CE_NOTE, NULL, "!mdi_pi_enable_path:"
5884 			" failed. pip: %p ph = NULL\n", (void *)pip));
5885 		return (MDI_FAILURE);
5886 	}
5887 
5888 	(void) i_mdi_enable_disable_path(pip, ph->ph_vhci, flags,
5889 		MDI_ENABLE_OP);
5890 	MDI_DEBUG(5, (CE_NOTE, NULL, "!mdi_pi_enable_path:"
5891 		" Returning success pip = %p. ph = %p\n",
5892 		(void *)pip, (void *)ph));
5893 	return (MDI_SUCCESS);
5894 
5895 }
5896 
5897 /*
5898  * Disable the path (specific client/target/initiator)
5899  * Disabling a path means that MPxIO will not select the disabled path for
5900  * routing any new I/O requests.
5901  */
5902 int
5903 mdi_pi_disable_path(mdi_pathinfo_t *pip, int flags)
5904 {
5905 	mdi_phci_t	*ph;
5906 
5907 	ph = i_devi_get_phci(mdi_pi_get_phci(pip));
5908 	if (ph == NULL) {
5909 		MDI_DEBUG(1, (CE_NOTE, NULL, "!mdi_pi_disable_path:"
5910 			" failed. pip: %p ph = NULL\n", (void *)pip));
5911 		return (MDI_FAILURE);
5912 	}
5913 
5914 	(void) i_mdi_enable_disable_path(pip,
5915 			ph->ph_vhci, flags, MDI_DISABLE_OP);
5916 	MDI_DEBUG(5, (CE_NOTE, NULL, "!mdi_pi_disable_path:"
5917 		"Returning success pip = %p. ph = %p",
5918 		(void *)pip, (void *)ph));
5919 	return (MDI_SUCCESS);
5920 }
5921 
5922 /*
5923  * disable the path to a particular pHCI (pHCI specified in the phci_path
5924  * argument) for a particular client (specified in the client_path argument).
5925  * Disabling a path means that MPxIO will not select the disabled path for
5926  * routing any new I/O requests.
5927  * NOTE: this will be removed once the NWS files are changed to use the new
5928  * mdi_{enable,disable}_path interfaces
5929  */
5930 int
5931 mdi_pi_disable(dev_info_t *cdip, dev_info_t *pdip, int flags)
5932 {
5933 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_DISABLE_OP));
5934 }
5935 
5936 /*
5937  * Enable the path to a particular pHCI (pHCI specified in the phci_path
5938  * argument) for a particular client (specified in the client_path argument).
5939  * Enabling a path means that MPxIO may select the enabled path for routing
5940  * future I/O requests, subject to other path state constraints.
5941  * NOTE: this will be removed once the NWS files are changed to use the new
5942  * mdi_{enable,disable}_path interfaces
5943  */
5944 
5945 int
5946 mdi_pi_enable(dev_info_t *cdip, dev_info_t *pdip, int flags)
5947 {
5948 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_ENABLE_OP));
5949 }
5950 
5951 /*
5952  * Common routine for doing enable/disable.
5953  */
5954 static mdi_pathinfo_t *
5955 i_mdi_enable_disable_path(mdi_pathinfo_t *pip, mdi_vhci_t *vh, int flags,
5956 		int op)
5957 {
5958 	int		sync_flag = 0;
5959 	int		rv;
5960 	mdi_pathinfo_t 	*next;
5961 	int		(*f)() = NULL;
5962 
5963 	f = vh->vh_ops->vo_pi_state_change;
5964 
5965 	sync_flag = (flags << 8) & 0xf00;
5966 
5967 	/*
5968 	 * Do a callback into the mdi consumer to let it
5969 	 * know that path is about to get enabled/disabled.
5970 	 */
5971 	if (f != NULL) {
5972 		rv = (*f)(vh->vh_dip, pip, 0,
5973 			MDI_PI_EXT_STATE(pip),
5974 			MDI_EXT_STATE_CHANGE | sync_flag |
5975 			op | MDI_BEFORE_STATE_CHANGE);
5976 		if (rv != MDI_SUCCESS) {
5977 			MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5978 			"!vo_pi_state_change: failed rv = %x", rv));
5979 		}
5980 	}
5981 	MDI_PI_LOCK(pip);
5982 	next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5983 
5984 	switch (flags) {
5985 		case USER_DISABLE:
5986 			if (op == MDI_DISABLE_OP) {
5987 				MDI_PI_SET_USER_DISABLE(pip);
5988 			} else {
5989 				MDI_PI_SET_USER_ENABLE(pip);
5990 			}
5991 			break;
5992 		case DRIVER_DISABLE:
5993 			if (op == MDI_DISABLE_OP) {
5994 				MDI_PI_SET_DRV_DISABLE(pip);
5995 			} else {
5996 				MDI_PI_SET_DRV_ENABLE(pip);
5997 			}
5998 			break;
5999 		case DRIVER_DISABLE_TRANSIENT:
6000 			if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS) {
6001 				MDI_PI_SET_DRV_DISABLE_TRANS(pip);
6002 			} else {
6003 				MDI_PI_SET_DRV_ENABLE_TRANS(pip);
6004 			}
6005 			break;
6006 	}
6007 	MDI_PI_UNLOCK(pip);
6008 	/*
6009 	 * Do a callback into the mdi consumer to let it
6010 	 * know that path is now enabled/disabled.
6011 	 */
6012 	if (f != NULL) {
6013 		rv = (*f)(vh->vh_dip, pip, 0,
6014 			MDI_PI_EXT_STATE(pip),
6015 			MDI_EXT_STATE_CHANGE | sync_flag |
6016 			op | MDI_AFTER_STATE_CHANGE);
6017 		if (rv != MDI_SUCCESS) {
6018 			MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
6019 			"!vo_pi_state_change: failed rv = %x", rv));
6020 		}
6021 	}
6022 	return (next);
6023 }
6024 
6025 /*
6026  * Common routine for doing enable/disable.
6027  * NOTE: this will be removed once the NWS files are changed to use the new
6028  * mdi_{enable,disable}_path has been putback
6029  */
6030 int
6031 i_mdi_pi_enable_disable(dev_info_t *cdip, dev_info_t *pdip, int flags, int op)
6032 {
6033 
6034 	mdi_phci_t	*ph;
6035 	mdi_vhci_t	*vh = NULL;
6036 	mdi_client_t	*ct;
6037 	mdi_pathinfo_t	*next, *pip;
6038 	int		found_it;
6039 
6040 	ph = i_devi_get_phci(pdip);
6041 	MDI_DEBUG(5, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
6042 		"Op = %d pdip = %p cdip = %p\n", op, (void *)pdip,
6043 		(void *)cdip));
6044 	if (ph == NULL) {
6045 		MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
6046 			"Op %d failed. ph = NULL\n", op));
6047 		return (MDI_FAILURE);
6048 	}
6049 
6050 	if ((op != MDI_ENABLE_OP) && (op != MDI_DISABLE_OP)) {
6051 		MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
6052 			"Op Invalid operation = %d\n", op));
6053 		return (MDI_FAILURE);
6054 	}
6055 
6056 	vh = ph->ph_vhci;
6057 
6058 	if (cdip == NULL) {
6059 		/*
6060 		 * Need to mark the Phci as enabled/disabled.
6061 		 */
6062 		MDI_DEBUG(3, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
6063 		"Op %d for the phci\n", op));
6064 		MDI_PHCI_LOCK(ph);
6065 		switch (flags) {
6066 			case USER_DISABLE:
6067 				if (op == MDI_DISABLE_OP) {
6068 					MDI_PHCI_SET_USER_DISABLE(ph);
6069 				} else {
6070 					MDI_PHCI_SET_USER_ENABLE(ph);
6071 				}
6072 				break;
6073 			case DRIVER_DISABLE:
6074 				if (op == MDI_DISABLE_OP) {
6075 					MDI_PHCI_SET_DRV_DISABLE(ph);
6076 				} else {
6077 					MDI_PHCI_SET_DRV_ENABLE(ph);
6078 				}
6079 				break;
6080 			case DRIVER_DISABLE_TRANSIENT:
6081 				if (op == MDI_DISABLE_OP) {
6082 					MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph);
6083 				} else {
6084 					MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph);
6085 				}
6086 				break;
6087 			default:
6088 				MDI_PHCI_UNLOCK(ph);
6089 				MDI_DEBUG(1, (CE_NOTE, NULL,
6090 				"!i_mdi_pi_enable_disable:"
6091 				" Invalid flag argument= %d\n", flags));
6092 		}
6093 
6094 		/*
6095 		 * Phci has been disabled. Now try to enable/disable
6096 		 * path info's to each client.
6097 		 */
6098 		pip = ph->ph_path_head;
6099 		while (pip != NULL) {
6100 			pip = i_mdi_enable_disable_path(pip, vh, flags, op);
6101 		}
6102 		MDI_PHCI_UNLOCK(ph);
6103 	} else {
6104 
6105 		/*
6106 		 * Disable a specific client.
6107 		 */
6108 		ct = i_devi_get_client(cdip);
6109 		if (ct == NULL) {
6110 			MDI_DEBUG(1, (CE_NOTE, NULL,
6111 			"!i_mdi_pi_enable_disable:"
6112 			" failed. ct = NULL operation = %d\n", op));
6113 			return (MDI_FAILURE);
6114 		}
6115 
6116 		MDI_CLIENT_LOCK(ct);
6117 		pip = ct->ct_path_head;
6118 		found_it = 0;
6119 		while (pip != NULL) {
6120 			MDI_PI_LOCK(pip);
6121 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6122 			if (MDI_PI(pip)->pi_phci == ph) {
6123 				MDI_PI_UNLOCK(pip);
6124 				found_it = 1;
6125 				break;
6126 			}
6127 			MDI_PI_UNLOCK(pip);
6128 			pip = next;
6129 		}
6130 
6131 
6132 		MDI_CLIENT_UNLOCK(ct);
6133 		if (found_it == 0) {
6134 			MDI_DEBUG(1, (CE_NOTE, NULL,
6135 			"!i_mdi_pi_enable_disable:"
6136 			" failed. Could not find corresponding pip\n"));
6137 			return (MDI_FAILURE);
6138 		}
6139 
6140 		(void) i_mdi_enable_disable_path(pip, vh, flags, op);
6141 	}
6142 
6143 	MDI_DEBUG(5, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
6144 		"Op %d Returning success pdip = %p cdip = %p\n",
6145 		op, (void *)pdip, (void *)cdip));
6146 	return (MDI_SUCCESS);
6147 }
6148 
6149 /*
6150  * Ensure phci powered up
6151  */
6152 static void
6153 i_mdi_pm_hold_pip(mdi_pathinfo_t *pip)
6154 {
6155 	dev_info_t	*ph_dip;
6156 
6157 	ASSERT(pip != NULL);
6158 	ASSERT(MDI_PI_LOCKED(pip));
6159 
6160 	if (MDI_PI(pip)->pi_pm_held) {
6161 		return;
6162 	}
6163 
6164 	ph_dip = mdi_pi_get_phci(pip);
6165 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_pm_hold_pip for %s%d %p\n",
6166 	    ddi_get_name(ph_dip), ddi_get_instance(ph_dip), (void *)pip));
6167 	if (ph_dip == NULL) {
6168 		return;
6169 	}
6170 
6171 	MDI_PI_UNLOCK(pip);
6172 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt was %d\n",
6173 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6174 
6175 	pm_hold_power(ph_dip);
6176 
6177 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt is %d\n",
6178 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6179 	MDI_PI_LOCK(pip);
6180 
6181 	/* If PM_GET_PM_INFO is NULL the pm_hold_power above was a noop */
6182 	if (DEVI(ph_dip)->devi_pm_info)
6183 		MDI_PI(pip)->pi_pm_held = 1;
6184 }
6185 
6186 /*
6187  * Allow phci powered down
6188  */
6189 static void
6190 i_mdi_pm_rele_pip(mdi_pathinfo_t *pip)
6191 {
6192 	dev_info_t	*ph_dip = NULL;
6193 
6194 	ASSERT(pip != NULL);
6195 	ASSERT(MDI_PI_LOCKED(pip));
6196 
6197 	if (MDI_PI(pip)->pi_pm_held == 0) {
6198 		return;
6199 	}
6200 
6201 	ph_dip = mdi_pi_get_phci(pip);
6202 	ASSERT(ph_dip != NULL);
6203 
6204 	MDI_PI_UNLOCK(pip);
6205 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_pm_rele_pip for %s%d %p\n",
6206 	    ddi_get_name(ph_dip), ddi_get_instance(ph_dip), (void *)pip));
6207 
6208 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt was %d\n",
6209 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6210 	pm_rele_power(ph_dip);
6211 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt is %d\n",
6212 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6213 
6214 	MDI_PI_LOCK(pip);
6215 	MDI_PI(pip)->pi_pm_held = 0;
6216 }
6217 
6218 static void
6219 i_mdi_pm_hold_client(mdi_client_t *ct, int incr)
6220 {
6221 	ASSERT(MDI_CLIENT_LOCKED(ct));
6222 
6223 	ct->ct_power_cnt += incr;
6224 	MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_hold_client %p "
6225 	    "ct_power_cnt = %d incr = %d\n", (void *)ct,
6226 	    ct->ct_power_cnt, incr));
6227 	ASSERT(ct->ct_power_cnt >= 0);
6228 }
6229 
6230 static void
6231 i_mdi_rele_all_phci(mdi_client_t *ct)
6232 {
6233 	mdi_pathinfo_t  *pip;
6234 
6235 	ASSERT(MDI_CLIENT_LOCKED(ct));
6236 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
6237 	while (pip != NULL) {
6238 		mdi_hold_path(pip);
6239 		MDI_PI_LOCK(pip);
6240 		i_mdi_pm_rele_pip(pip);
6241 		MDI_PI_UNLOCK(pip);
6242 		mdi_rele_path(pip);
6243 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6244 	}
6245 }
6246 
6247 static void
6248 i_mdi_pm_rele_client(mdi_client_t *ct, int decr)
6249 {
6250 	ASSERT(MDI_CLIENT_LOCKED(ct));
6251 
6252 	if (i_ddi_devi_attached(ct->ct_dip)) {
6253 		ct->ct_power_cnt -= decr;
6254 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_rele_client %p "
6255 		    "ct_power_cnt = %d decr = %d\n",
6256 		    (void *)ct, ct->ct_power_cnt, decr));
6257 	}
6258 
6259 	ASSERT(ct->ct_power_cnt >= 0);
6260 	if (ct->ct_power_cnt == 0) {
6261 		i_mdi_rele_all_phci(ct);
6262 		return;
6263 	}
6264 }
6265 
6266 static void
6267 i_mdi_pm_reset_client(mdi_client_t *ct)
6268 {
6269 	MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_reset_client %p "
6270 	    "ct_power_cnt = %d\n", (void *)ct, ct->ct_power_cnt));
6271 	ASSERT(MDI_CLIENT_LOCKED(ct));
6272 	ct->ct_power_cnt = 0;
6273 	i_mdi_rele_all_phci(ct);
6274 	ct->ct_powercnt_config = 0;
6275 	ct->ct_powercnt_unconfig = 0;
6276 	ct->ct_powercnt_reset = 1;
6277 }
6278 
6279 static int
6280 i_mdi_power_one_phci(mdi_pathinfo_t *pip)
6281 {
6282 	int		ret;
6283 	dev_info_t	*ph_dip;
6284 
6285 	MDI_PI_LOCK(pip);
6286 	i_mdi_pm_hold_pip(pip);
6287 
6288 	ph_dip = mdi_pi_get_phci(pip);
6289 	MDI_PI_UNLOCK(pip);
6290 
6291 	/* bring all components of phci to full power */
6292 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_power_one_phci "
6293 	    "pm_powerup for %s%d %p\n", ddi_get_name(ph_dip),
6294 	    ddi_get_instance(ph_dip), (void *)pip));
6295 
6296 	ret = pm_powerup(ph_dip);
6297 
6298 	if (ret == DDI_FAILURE) {
6299 		MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_power_one_phci "
6300 		    "pm_powerup FAILED for %s%d %p\n",
6301 		    ddi_get_name(ph_dip), ddi_get_instance(ph_dip),
6302 		    (void *)pip));
6303 
6304 		MDI_PI_LOCK(pip);
6305 		i_mdi_pm_rele_pip(pip);
6306 		MDI_PI_UNLOCK(pip);
6307 		return (MDI_FAILURE);
6308 	}
6309 
6310 	return (MDI_SUCCESS);
6311 }
6312 
6313 static int
6314 i_mdi_power_all_phci(mdi_client_t *ct)
6315 {
6316 	mdi_pathinfo_t  *pip;
6317 	int		succeeded = 0;
6318 
6319 	ASSERT(MDI_CLIENT_LOCKED(ct));
6320 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
6321 	while (pip != NULL) {
6322 		/*
6323 		 * Don't power if MDI_PATHINFO_STATE_FAULT
6324 		 * or MDI_PATHINFO_STATE_OFFLINE.
6325 		 */
6326 		if (MDI_PI_IS_INIT(pip) ||
6327 		    MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip)) {
6328 			mdi_hold_path(pip);
6329 			MDI_CLIENT_UNLOCK(ct);
6330 			if (i_mdi_power_one_phci(pip) == MDI_SUCCESS)
6331 				succeeded = 1;
6332 
6333 			ASSERT(ct == MDI_PI(pip)->pi_client);
6334 			MDI_CLIENT_LOCK(ct);
6335 			mdi_rele_path(pip);
6336 		}
6337 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6338 	}
6339 
6340 	return (succeeded ? MDI_SUCCESS : MDI_FAILURE);
6341 }
6342 
6343 /*
6344  * mdi_bus_power():
6345  *		1. Place the phci(s) into powered up state so that
6346  *		   client can do power management
6347  *		2. Ensure phci powered up as client power managing
6348  * Return Values:
6349  *		MDI_SUCCESS
6350  *		MDI_FAILURE
6351  */
6352 int
6353 mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
6354     void *arg, void *result)
6355 {
6356 	int			ret = MDI_SUCCESS;
6357 	pm_bp_child_pwrchg_t	*bpc;
6358 	mdi_client_t		*ct;
6359 	dev_info_t		*cdip;
6360 	pm_bp_has_changed_t	*bphc;
6361 
6362 	/*
6363 	 * BUS_POWER_NOINVOL not supported
6364 	 */
6365 	if (op == BUS_POWER_NOINVOL)
6366 		return (MDI_FAILURE);
6367 
6368 	/*
6369 	 * ignore other OPs.
6370 	 * return quickly to save cou cycles on the ct processing
6371 	 */
6372 	switch (op) {
6373 	case BUS_POWER_PRE_NOTIFICATION:
6374 	case BUS_POWER_POST_NOTIFICATION:
6375 		bpc = (pm_bp_child_pwrchg_t *)arg;
6376 		cdip = bpc->bpc_dip;
6377 		break;
6378 	case BUS_POWER_HAS_CHANGED:
6379 		bphc = (pm_bp_has_changed_t *)arg;
6380 		cdip = bphc->bphc_dip;
6381 		break;
6382 	default:
6383 		return (pm_busop_bus_power(parent, impl_arg, op, arg, result));
6384 	}
6385 
6386 	ASSERT(MDI_CLIENT(cdip));
6387 
6388 	ct = i_devi_get_client(cdip);
6389 	if (ct == NULL)
6390 		return (MDI_FAILURE);
6391 
6392 	/*
6393 	 * wait till the mdi_pathinfo node state change are processed
6394 	 */
6395 	MDI_CLIENT_LOCK(ct);
6396 	switch (op) {
6397 	case BUS_POWER_PRE_NOTIFICATION:
6398 		MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip, "mdi_bus_power "
6399 		    "BUS_POWER_PRE_NOTIFICATION:"
6400 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d\n",
6401 		    PM_NAME(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
6402 		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp));
6403 
6404 		/* serialize power level change per client */
6405 		while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6406 			cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6407 
6408 		MDI_CLIENT_SET_POWER_TRANSITION(ct);
6409 
6410 		if (ct->ct_power_cnt == 0) {
6411 			ret = i_mdi_power_all_phci(ct);
6412 		}
6413 
6414 		/*
6415 		 * if new_level > 0:
6416 		 *	- hold phci(s)
6417 		 *	- power up phci(s) if not already
6418 		 * ignore power down
6419 		 */
6420 		if (bpc->bpc_nlevel > 0) {
6421 			if (!DEVI_IS_ATTACHING(ct->ct_dip)) {
6422 				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
6423 				    "mdi_bus_power i_mdi_pm_hold_client\n"));
6424 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
6425 			}
6426 		}
6427 		break;
6428 	case BUS_POWER_POST_NOTIFICATION:
6429 		MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip, "mdi_bus_power "
6430 		    "BUS_POWER_POST_NOTIFICATION:"
6431 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d result=%d\n",
6432 		    PM_NAME(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
6433 		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp,
6434 		    *(int *)result));
6435 
6436 		if (*(int *)result == DDI_SUCCESS) {
6437 			if (bpc->bpc_nlevel > 0) {
6438 				MDI_CLIENT_SET_POWER_UP(ct);
6439 			} else {
6440 				MDI_CLIENT_SET_POWER_DOWN(ct);
6441 			}
6442 		}
6443 
6444 		/* release the hold we did in pre-notification */
6445 		if (bpc->bpc_nlevel > 0 && (*(int *)result != DDI_SUCCESS) &&
6446 		    !DEVI_IS_ATTACHING(ct->ct_dip)) {
6447 			MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
6448 			    "mdi_bus_power i_mdi_pm_rele_client\n"));
6449 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6450 		}
6451 
6452 		if (bpc->bpc_nlevel == 0 && (*(int *)result == DDI_SUCCESS)) {
6453 			/* another thread might started attaching */
6454 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
6455 				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
6456 				    "mdi_bus_power i_mdi_pm_rele_client\n"));
6457 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
6458 			/* detaching has been taken care in pm_post_unconfig */
6459 			} else if (!DEVI_IS_DETACHING(ct->ct_dip)) {
6460 				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
6461 				    "mdi_bus_power i_mdi_pm_reset_client\n"));
6462 				i_mdi_pm_reset_client(ct);
6463 			}
6464 		}
6465 
6466 		MDI_CLIENT_CLEAR_POWER_TRANSITION(ct);
6467 		cv_broadcast(&ct->ct_powerchange_cv);
6468 
6469 		break;
6470 
6471 	/* need to do more */
6472 	case BUS_POWER_HAS_CHANGED:
6473 		MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip, "mdi_bus_power "
6474 		    "BUS_POWER_HAS_CHANGED:"
6475 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d\n",
6476 		    PM_NAME(bphc->bphc_dip), PM_ADDR(bphc->bphc_dip),
6477 		    bphc->bphc_olevel, bphc->bphc_nlevel, bphc->bphc_comp));
6478 
6479 		if (bphc->bphc_nlevel > 0 &&
6480 		    bphc->bphc_nlevel > bphc->bphc_olevel) {
6481 			if (ct->ct_power_cnt == 0) {
6482 				ret = i_mdi_power_all_phci(ct);
6483 			}
6484 			MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip,
6485 			    "mdi_bus_power i_mdi_pm_hold_client\n"));
6486 			i_mdi_pm_hold_client(ct, ct->ct_path_count);
6487 		}
6488 
6489 		if (bphc->bphc_nlevel == 0 && bphc->bphc_olevel != -1) {
6490 			MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip,
6491 			    "mdi_bus_power i_mdi_pm_rele_client\n"));
6492 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6493 		}
6494 		break;
6495 	}
6496 
6497 	MDI_CLIENT_UNLOCK(ct);
6498 	return (ret);
6499 }
6500 
6501 static int
6502 i_mdi_pm_pre_config_one(dev_info_t *child)
6503 {
6504 	int		ret = MDI_SUCCESS;
6505 	mdi_client_t	*ct;
6506 
6507 	ct = i_devi_get_client(child);
6508 	if (ct == NULL)
6509 		return (MDI_FAILURE);
6510 
6511 	MDI_CLIENT_LOCK(ct);
6512 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6513 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6514 
6515 	if (!MDI_CLIENT_IS_FAILED(ct)) {
6516 		MDI_CLIENT_UNLOCK(ct);
6517 		MDI_DEBUG(4, (CE_NOTE, child,
6518 		    "i_mdi_pm_pre_config_one already configured\n"));
6519 		return (MDI_SUCCESS);
6520 	}
6521 
6522 	if (ct->ct_powercnt_config) {
6523 		MDI_CLIENT_UNLOCK(ct);
6524 		MDI_DEBUG(4, (CE_NOTE, child,
6525 		    "i_mdi_pm_pre_config_one ALREADY held\n"));
6526 		return (MDI_SUCCESS);
6527 	}
6528 
6529 	if (ct->ct_power_cnt == 0) {
6530 		ret = i_mdi_power_all_phci(ct);
6531 	}
6532 	MDI_DEBUG(4, (CE_NOTE, child,
6533 	    "i_mdi_pm_pre_config_one i_mdi_pm_hold_client\n"));
6534 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6535 	ct->ct_powercnt_config = 1;
6536 	ct->ct_powercnt_reset = 0;
6537 	MDI_CLIENT_UNLOCK(ct);
6538 	return (ret);
6539 }
6540 
6541 static int
6542 i_mdi_pm_pre_config(dev_info_t *vdip, dev_info_t *child)
6543 {
6544 	int			ret = MDI_SUCCESS;
6545 	dev_info_t		*cdip;
6546 	int			circ;
6547 
6548 	ASSERT(MDI_VHCI(vdip));
6549 
6550 	/* ndi_devi_config_one */
6551 	if (child) {
6552 		ASSERT(DEVI_BUSY_OWNED(vdip));
6553 		return (i_mdi_pm_pre_config_one(child));
6554 	}
6555 
6556 	/* devi_config_common */
6557 	ndi_devi_enter(vdip, &circ);
6558 	cdip = ddi_get_child(vdip);
6559 	while (cdip) {
6560 		dev_info_t *next = ddi_get_next_sibling(cdip);
6561 
6562 		ret = i_mdi_pm_pre_config_one(cdip);
6563 		if (ret != MDI_SUCCESS)
6564 			break;
6565 		cdip = next;
6566 	}
6567 	ndi_devi_exit(vdip, circ);
6568 	return (ret);
6569 }
6570 
6571 static int
6572 i_mdi_pm_pre_unconfig_one(dev_info_t *child, int *held, int flags)
6573 {
6574 	int		ret = MDI_SUCCESS;
6575 	mdi_client_t	*ct;
6576 
6577 	ct = i_devi_get_client(child);
6578 	if (ct == NULL)
6579 		return (MDI_FAILURE);
6580 
6581 	MDI_CLIENT_LOCK(ct);
6582 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6583 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6584 
6585 	if (!i_ddi_devi_attached(ct->ct_dip)) {
6586 		MDI_DEBUG(4, (CE_NOTE, child,
6587 		    "i_mdi_pm_pre_unconfig node detached already\n"));
6588 		MDI_CLIENT_UNLOCK(ct);
6589 		return (MDI_SUCCESS);
6590 	}
6591 
6592 	if (MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6593 	    (flags & NDI_AUTODETACH)) {
6594 		MDI_DEBUG(4, (CE_NOTE, child,
6595 		    "i_mdi_pm_pre_unconfig auto-modunload\n"));
6596 		MDI_CLIENT_UNLOCK(ct);
6597 		return (MDI_FAILURE);
6598 	}
6599 
6600 	if (ct->ct_powercnt_unconfig) {
6601 		MDI_DEBUG(4, (CE_NOTE, child,
6602 		    "i_mdi_pm_pre_unconfig ct_powercnt_held\n"));
6603 		MDI_CLIENT_UNLOCK(ct);
6604 		*held = 1;
6605 		return (MDI_SUCCESS);
6606 	}
6607 
6608 	if (ct->ct_power_cnt == 0) {
6609 		ret = i_mdi_power_all_phci(ct);
6610 	}
6611 	MDI_DEBUG(4, (CE_NOTE, child,
6612 	    "i_mdi_pm_pre_unconfig i_mdi_pm_hold_client\n"));
6613 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6614 	ct->ct_powercnt_unconfig = 1;
6615 	ct->ct_powercnt_reset = 0;
6616 	MDI_CLIENT_UNLOCK(ct);
6617 	if (ret == MDI_SUCCESS)
6618 		*held = 1;
6619 	return (ret);
6620 }
6621 
6622 static int
6623 i_mdi_pm_pre_unconfig(dev_info_t *vdip, dev_info_t *child, int *held,
6624     int flags)
6625 {
6626 	int			ret = MDI_SUCCESS;
6627 	dev_info_t		*cdip;
6628 	int			circ;
6629 
6630 	ASSERT(MDI_VHCI(vdip));
6631 	*held = 0;
6632 
6633 	/* ndi_devi_unconfig_one */
6634 	if (child) {
6635 		ASSERT(DEVI_BUSY_OWNED(vdip));
6636 		return (i_mdi_pm_pre_unconfig_one(child, held, flags));
6637 	}
6638 
6639 	/* devi_unconfig_common */
6640 	ndi_devi_enter(vdip, &circ);
6641 	cdip = ddi_get_child(vdip);
6642 	while (cdip) {
6643 		dev_info_t *next = ddi_get_next_sibling(cdip);
6644 
6645 		ret = i_mdi_pm_pre_unconfig_one(cdip, held, flags);
6646 		cdip = next;
6647 	}
6648 	ndi_devi_exit(vdip, circ);
6649 
6650 	if (*held)
6651 		ret = MDI_SUCCESS;
6652 
6653 	return (ret);
6654 }
6655 
6656 static void
6657 i_mdi_pm_post_config_one(dev_info_t *child)
6658 {
6659 	mdi_client_t	*ct;
6660 
6661 	ct = i_devi_get_client(child);
6662 	if (ct == NULL)
6663 		return;
6664 
6665 	MDI_CLIENT_LOCK(ct);
6666 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6667 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6668 
6669 	if (ct->ct_powercnt_reset || !ct->ct_powercnt_config) {
6670 		MDI_DEBUG(4, (CE_NOTE, child,
6671 		    "i_mdi_pm_post_config_one NOT configured\n"));
6672 		MDI_CLIENT_UNLOCK(ct);
6673 		return;
6674 	}
6675 
6676 	/* client has not been updated */
6677 	if (MDI_CLIENT_IS_FAILED(ct)) {
6678 		MDI_DEBUG(4, (CE_NOTE, child,
6679 		    "i_mdi_pm_post_config_one NOT configured\n"));
6680 		MDI_CLIENT_UNLOCK(ct);
6681 		return;
6682 	}
6683 
6684 	/* another thread might have powered it down or detached it */
6685 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6686 	    !DEVI_IS_ATTACHING(ct->ct_dip)) ||
6687 	    (!i_ddi_devi_attached(ct->ct_dip) &&
6688 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6689 		MDI_DEBUG(4, (CE_NOTE, child,
6690 		    "i_mdi_pm_post_config i_mdi_pm_reset_client\n"));
6691 		i_mdi_pm_reset_client(ct);
6692 	} else {
6693 		mdi_pathinfo_t  *pip, *next;
6694 		int	valid_path_count = 0;
6695 
6696 		MDI_DEBUG(4, (CE_NOTE, child,
6697 		    "i_mdi_pm_post_config i_mdi_pm_rele_client\n"));
6698 		pip = ct->ct_path_head;
6699 		while (pip != NULL) {
6700 			MDI_PI_LOCK(pip);
6701 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6702 			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
6703 				valid_path_count ++;
6704 			MDI_PI_UNLOCK(pip);
6705 			pip = next;
6706 		}
6707 		i_mdi_pm_rele_client(ct, valid_path_count);
6708 	}
6709 	ct->ct_powercnt_config = 0;
6710 	MDI_CLIENT_UNLOCK(ct);
6711 }
6712 
6713 static void
6714 i_mdi_pm_post_config(dev_info_t *vdip, dev_info_t *child)
6715 {
6716 	int		circ;
6717 	dev_info_t	*cdip;
6718 
6719 	ASSERT(MDI_VHCI(vdip));
6720 
6721 	/* ndi_devi_config_one */
6722 	if (child) {
6723 		ASSERT(DEVI_BUSY_OWNED(vdip));
6724 		i_mdi_pm_post_config_one(child);
6725 		return;
6726 	}
6727 
6728 	/* devi_config_common */
6729 	ndi_devi_enter(vdip, &circ);
6730 	cdip = ddi_get_child(vdip);
6731 	while (cdip) {
6732 		dev_info_t *next = ddi_get_next_sibling(cdip);
6733 
6734 		i_mdi_pm_post_config_one(cdip);
6735 		cdip = next;
6736 	}
6737 	ndi_devi_exit(vdip, circ);
6738 }
6739 
6740 static void
6741 i_mdi_pm_post_unconfig_one(dev_info_t *child)
6742 {
6743 	mdi_client_t	*ct;
6744 
6745 	ct = i_devi_get_client(child);
6746 	if (ct == NULL)
6747 		return;
6748 
6749 	MDI_CLIENT_LOCK(ct);
6750 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6751 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6752 
6753 	if (!ct->ct_powercnt_unconfig || ct->ct_powercnt_reset) {
6754 		MDI_DEBUG(4, (CE_NOTE, child,
6755 		    "i_mdi_pm_post_unconfig NOT held\n"));
6756 		MDI_CLIENT_UNLOCK(ct);
6757 		return;
6758 	}
6759 
6760 	/* failure detaching or another thread just attached it */
6761 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6762 	    i_ddi_devi_attached(ct->ct_dip)) ||
6763 	    (!i_ddi_devi_attached(ct->ct_dip) &&
6764 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6765 		MDI_DEBUG(4, (CE_NOTE, child,
6766 		    "i_mdi_pm_post_unconfig i_mdi_pm_reset_client\n"));
6767 		i_mdi_pm_reset_client(ct);
6768 	} else {
6769 		mdi_pathinfo_t  *pip, *next;
6770 		int	valid_path_count = 0;
6771 
6772 		MDI_DEBUG(4, (CE_NOTE, child,
6773 		    "i_mdi_pm_post_unconfig i_mdi_pm_rele_client\n"));
6774 		pip = ct->ct_path_head;
6775 		while (pip != NULL) {
6776 			MDI_PI_LOCK(pip);
6777 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6778 			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
6779 				valid_path_count ++;
6780 			MDI_PI_UNLOCK(pip);
6781 			pip = next;
6782 		}
6783 		i_mdi_pm_rele_client(ct, valid_path_count);
6784 		ct->ct_powercnt_unconfig = 0;
6785 	}
6786 
6787 	MDI_CLIENT_UNLOCK(ct);
6788 }
6789 
6790 static void
6791 i_mdi_pm_post_unconfig(dev_info_t *vdip, dev_info_t *child, int held)
6792 {
6793 	int			circ;
6794 	dev_info_t		*cdip;
6795 
6796 	ASSERT(MDI_VHCI(vdip));
6797 
6798 	if (!held) {
6799 		MDI_DEBUG(4, (CE_NOTE, vdip,
6800 		    "i_mdi_pm_post_unconfig held = %d\n", held));
6801 		return;
6802 	}
6803 
6804 	if (child) {
6805 		ASSERT(DEVI_BUSY_OWNED(vdip));
6806 		i_mdi_pm_post_unconfig_one(child);
6807 		return;
6808 	}
6809 
6810 	ndi_devi_enter(vdip, &circ);
6811 	cdip = ddi_get_child(vdip);
6812 	while (cdip) {
6813 		dev_info_t *next = ddi_get_next_sibling(cdip);
6814 
6815 		i_mdi_pm_post_unconfig_one(cdip);
6816 		cdip = next;
6817 	}
6818 	ndi_devi_exit(vdip, circ);
6819 }
6820 
6821 int
6822 mdi_power(dev_info_t *vdip, mdi_pm_op_t op, void *args, char *devnm, int flags)
6823 {
6824 	int			circ, ret = MDI_SUCCESS;
6825 	dev_info_t		*client_dip = NULL;
6826 	mdi_client_t		*ct;
6827 
6828 	/*
6829 	 * Handling ndi_devi_config_one and ndi_devi_unconfig_one.
6830 	 * Power up pHCI for the named client device.
6831 	 * Note: Before the client is enumerated under vhci by phci,
6832 	 * client_dip can be NULL. Then proceed to power up all the
6833 	 * pHCIs.
6834 	 */
6835 	if (devnm != NULL) {
6836 		ndi_devi_enter(vdip, &circ);
6837 		client_dip = ndi_devi_findchild(vdip, devnm);
6838 	}
6839 
6840 	MDI_DEBUG(4, (CE_NOTE, vdip, "mdi_power op = %d %s %p\n",
6841 	    op, devnm ? devnm : "NULL", (void *)client_dip));
6842 
6843 	switch (op) {
6844 	case MDI_PM_PRE_CONFIG:
6845 		ret = i_mdi_pm_pre_config(vdip, client_dip);
6846 		break;
6847 
6848 	case MDI_PM_PRE_UNCONFIG:
6849 		ret = i_mdi_pm_pre_unconfig(vdip, client_dip, (int *)args,
6850 		    flags);
6851 		break;
6852 
6853 	case MDI_PM_POST_CONFIG:
6854 		i_mdi_pm_post_config(vdip, client_dip);
6855 		break;
6856 
6857 	case MDI_PM_POST_UNCONFIG:
6858 		i_mdi_pm_post_unconfig(vdip, client_dip, *(int *)args);
6859 		break;
6860 
6861 	case MDI_PM_HOLD_POWER:
6862 	case MDI_PM_RELE_POWER:
6863 		ASSERT(args);
6864 
6865 		client_dip = (dev_info_t *)args;
6866 		ASSERT(MDI_CLIENT(client_dip));
6867 
6868 		ct = i_devi_get_client(client_dip);
6869 		MDI_CLIENT_LOCK(ct);
6870 
6871 		if (op == MDI_PM_HOLD_POWER) {
6872 			if (ct->ct_power_cnt == 0) {
6873 				(void) i_mdi_power_all_phci(ct);
6874 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6875 				    "mdi_power i_mdi_pm_hold_client\n"));
6876 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
6877 			}
6878 		} else {
6879 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
6880 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6881 				    "mdi_power i_mdi_pm_rele_client\n"));
6882 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
6883 			} else {
6884 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6885 				    "mdi_power i_mdi_pm_reset_client\n"));
6886 				i_mdi_pm_reset_client(ct);
6887 			}
6888 		}
6889 
6890 		MDI_CLIENT_UNLOCK(ct);
6891 		break;
6892 
6893 	default:
6894 		break;
6895 	}
6896 
6897 	if (devnm)
6898 		ndi_devi_exit(vdip, circ);
6899 
6900 	return (ret);
6901 }
6902 
6903 int
6904 mdi_component_is_vhci(dev_info_t *dip, const char **mdi_class)
6905 {
6906 	mdi_vhci_t *vhci;
6907 
6908 	if (!MDI_VHCI(dip))
6909 		return (MDI_FAILURE);
6910 
6911 	if (mdi_class) {
6912 		vhci = DEVI(dip)->devi_mdi_xhci;
6913 		ASSERT(vhci);
6914 		*mdi_class = vhci->vh_class;
6915 	}
6916 
6917 	return (MDI_SUCCESS);
6918 }
6919 
6920 int
6921 mdi_component_is_phci(dev_info_t *dip, const char **mdi_class)
6922 {
6923 	mdi_phci_t *phci;
6924 
6925 	if (!MDI_PHCI(dip))
6926 		return (MDI_FAILURE);
6927 
6928 	if (mdi_class) {
6929 		phci = DEVI(dip)->devi_mdi_xhci;
6930 		ASSERT(phci);
6931 		*mdi_class = phci->ph_vhci->vh_class;
6932 	}
6933 
6934 	return (MDI_SUCCESS);
6935 }
6936 
6937 int
6938 mdi_component_is_client(dev_info_t *dip, const char **mdi_class)
6939 {
6940 	mdi_client_t *client;
6941 
6942 	if (!MDI_CLIENT(dip))
6943 		return (MDI_FAILURE);
6944 
6945 	if (mdi_class) {
6946 		client = DEVI(dip)->devi_mdi_client;
6947 		ASSERT(client);
6948 		*mdi_class = client->ct_vhci->vh_class;
6949 	}
6950 
6951 	return (MDI_SUCCESS);
6952 }
6953 
6954 void *
6955 mdi_client_get_vhci_private(dev_info_t *dip)
6956 {
6957 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
6958 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
6959 		mdi_client_t	*ct;
6960 		ct = i_devi_get_client(dip);
6961 		return (ct->ct_vprivate);
6962 	}
6963 	return (NULL);
6964 }
6965 
6966 void
6967 mdi_client_set_vhci_private(dev_info_t *dip, void *data)
6968 {
6969 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
6970 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
6971 		mdi_client_t	*ct;
6972 		ct = i_devi_get_client(dip);
6973 		ct->ct_vprivate = data;
6974 	}
6975 }
6976 /*
6977  * mdi_pi_get_vhci_private():
6978  *		Get the vhci private information associated with the
6979  *		mdi_pathinfo node
6980  */
6981 void *
6982 mdi_pi_get_vhci_private(mdi_pathinfo_t *pip)
6983 {
6984 	caddr_t	vprivate = NULL;
6985 	if (pip) {
6986 		vprivate = MDI_PI(pip)->pi_vprivate;
6987 	}
6988 	return (vprivate);
6989 }
6990 
6991 /*
6992  * mdi_pi_set_vhci_private():
6993  *		Set the vhci private information in the mdi_pathinfo node
6994  */
6995 void
6996 mdi_pi_set_vhci_private(mdi_pathinfo_t *pip, void *priv)
6997 {
6998 	if (pip) {
6999 		MDI_PI(pip)->pi_vprivate = priv;
7000 	}
7001 }
7002 
7003 /*
7004  * mdi_phci_get_vhci_private():
7005  *		Get the vhci private information associated with the
7006  *		mdi_phci node
7007  */
7008 void *
7009 mdi_phci_get_vhci_private(dev_info_t *dip)
7010 {
7011 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
7012 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
7013 		mdi_phci_t	*ph;
7014 		ph = i_devi_get_phci(dip);
7015 		return (ph->ph_vprivate);
7016 	}
7017 	return (NULL);
7018 }
7019 
7020 /*
7021  * mdi_phci_set_vhci_private():
7022  *		Set the vhci private information in the mdi_phci node
7023  */
7024 void
7025 mdi_phci_set_vhci_private(dev_info_t *dip, void *priv)
7026 {
7027 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
7028 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
7029 		mdi_phci_t	*ph;
7030 		ph = i_devi_get_phci(dip);
7031 		ph->ph_vprivate = priv;
7032 	}
7033 }
7034 
7035 /*
7036  * List of vhci class names:
7037  * A vhci class name must be in this list only if the corresponding vhci
7038  * driver intends to use the mdi provided bus config implementation
7039  * (i.e., mdi_vhci_bus_config()).
7040  */
7041 static char *vhci_class_list[] = { MDI_HCI_CLASS_SCSI, MDI_HCI_CLASS_IB };
7042 #define	N_VHCI_CLASSES	(sizeof (vhci_class_list) / sizeof (char *))
7043 
7044 /*
7045  * During boot time, the on-disk vhci cache for every vhci class is read
7046  * in the form of an nvlist and stored here.
7047  */
7048 static nvlist_t *vhcache_nvl[N_VHCI_CLASSES];
7049 
7050 /* nvpair names in vhci cache nvlist */
7051 #define	MDI_VHCI_CACHE_VERSION	1
7052 #define	MDI_NVPNAME_VERSION	"version"
7053 #define	MDI_NVPNAME_PHCIS	"phcis"
7054 #define	MDI_NVPNAME_CTADDRMAP	"clientaddrmap"
7055 
7056 /*
7057  * Given vhci class name, return its on-disk vhci cache filename.
7058  * Memory for the returned filename which includes the full path is allocated
7059  * by this function.
7060  */
7061 static char *
7062 vhclass2vhcache_filename(char *vhclass)
7063 {
7064 	char *filename;
7065 	int len;
7066 	static char *fmt = "/etc/devices/mdi_%s_cache";
7067 
7068 	/*
7069 	 * fmt contains the on-disk vhci cache file name format;
7070 	 * for scsi_vhci the filename is "/etc/devices/mdi_scsi_vhci_cache".
7071 	 */
7072 
7073 	/* the -1 below is to account for "%s" in the format string */
7074 	len = strlen(fmt) + strlen(vhclass) - 1;
7075 	filename = kmem_alloc(len, KM_SLEEP);
7076 	(void) snprintf(filename, len, fmt, vhclass);
7077 	ASSERT(len == (strlen(filename) + 1));
7078 	return (filename);
7079 }
7080 
7081 /*
7082  * initialize the vhci cache related data structures and read the on-disk
7083  * vhci cached data into memory.
7084  */
7085 static void
7086 setup_vhci_cache(mdi_vhci_t *vh)
7087 {
7088 	mdi_vhci_config_t *vhc;
7089 	mdi_vhci_cache_t *vhcache;
7090 	int i;
7091 	nvlist_t *nvl = NULL;
7092 
7093 	vhc = kmem_zalloc(sizeof (mdi_vhci_config_t), KM_SLEEP);
7094 	vh->vh_config = vhc;
7095 	vhcache = &vhc->vhc_vhcache;
7096 
7097 	vhc->vhc_vhcache_filename = vhclass2vhcache_filename(vh->vh_class);
7098 
7099 	mutex_init(&vhc->vhc_lock, NULL, MUTEX_DEFAULT, NULL);
7100 	cv_init(&vhc->vhc_cv, NULL, CV_DRIVER, NULL);
7101 
7102 	rw_init(&vhcache->vhcache_lock, NULL, RW_DRIVER, NULL);
7103 
7104 	/*
7105 	 * Create string hash; same as mod_hash_create_strhash() except that
7106 	 * we use NULL key destructor.
7107 	 */
7108 	vhcache->vhcache_client_hash = mod_hash_create_extended(vh->vh_class,
7109 	    mdi_bus_config_cache_hash_size,
7110 	    mod_hash_null_keydtor, mod_hash_null_valdtor,
7111 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
7112 
7113 	/*
7114 	 * The on-disk vhci cache is read during booting prior to the
7115 	 * lights-out period by mdi_read_devices_files().
7116 	 */
7117 	for (i = 0; i < N_VHCI_CLASSES; i++) {
7118 		if (strcmp(vhci_class_list[i], vh->vh_class) == 0) {
7119 			nvl = vhcache_nvl[i];
7120 			vhcache_nvl[i] = NULL;
7121 			break;
7122 		}
7123 	}
7124 
7125 	/*
7126 	 * this is to cover the case of some one manually causing unloading
7127 	 * (or detaching) and reloading (or attaching) of a vhci driver.
7128 	 */
7129 	if (nvl == NULL && modrootloaded)
7130 		nvl = read_on_disk_vhci_cache(vh->vh_class);
7131 
7132 	if (nvl != NULL) {
7133 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7134 		if (mainnvl_to_vhcache(vhcache, nvl) == MDI_SUCCESS)
7135 			vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
7136 		else  {
7137 			cmn_err(CE_WARN,
7138 			    "%s: data file corrupted, will recreate\n",
7139 			    vhc->vhc_vhcache_filename);
7140 		}
7141 		rw_exit(&vhcache->vhcache_lock);
7142 		nvlist_free(nvl);
7143 	}
7144 
7145 	vhc->vhc_cbid = callb_add(stop_vhcache_flush_thread, vhc,
7146 	    CB_CL_UADMIN_PRE_VFS, "mdi_vhcache_flush");
7147 
7148 	vhc->vhc_path_discovery_boot = mdi_path_discovery_boot;
7149 	vhc->vhc_path_discovery_postboot = mdi_path_discovery_postboot;
7150 }
7151 
7152 /*
7153  * free all vhci cache related resources
7154  */
7155 static int
7156 destroy_vhci_cache(mdi_vhci_t *vh)
7157 {
7158 	mdi_vhci_config_t *vhc = vh->vh_config;
7159 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7160 	mdi_vhcache_phci_t *cphci, *cphci_next;
7161 	mdi_vhcache_client_t *cct, *cct_next;
7162 	mdi_vhcache_pathinfo_t *cpi, *cpi_next;
7163 
7164 	if (stop_vhcache_async_threads(vhc) != MDI_SUCCESS)
7165 		return (MDI_FAILURE);
7166 
7167 	kmem_free(vhc->vhc_vhcache_filename,
7168 	    strlen(vhc->vhc_vhcache_filename) + 1);
7169 
7170 	mod_hash_destroy_strhash(vhcache->vhcache_client_hash);
7171 
7172 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7173 	    cphci = cphci_next) {
7174 		cphci_next = cphci->cphci_next;
7175 		free_vhcache_phci(cphci);
7176 	}
7177 
7178 	for (cct = vhcache->vhcache_client_head; cct != NULL; cct = cct_next) {
7179 		cct_next = cct->cct_next;
7180 		for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi_next) {
7181 			cpi_next = cpi->cpi_next;
7182 			free_vhcache_pathinfo(cpi);
7183 		}
7184 		free_vhcache_client(cct);
7185 	}
7186 
7187 	rw_destroy(&vhcache->vhcache_lock);
7188 
7189 	mutex_destroy(&vhc->vhc_lock);
7190 	cv_destroy(&vhc->vhc_cv);
7191 	kmem_free(vhc, sizeof (mdi_vhci_config_t));
7192 	return (MDI_SUCCESS);
7193 }
7194 
7195 /*
7196  * Stop all vhci cache related async threads and free their resources.
7197  */
7198 static int
7199 stop_vhcache_async_threads(mdi_vhci_config_t *vhc)
7200 {
7201 	mdi_async_client_config_t *acc, *acc_next;
7202 
7203 	mutex_enter(&vhc->vhc_lock);
7204 	vhc->vhc_flags |= MDI_VHC_EXIT;
7205 	ASSERT(vhc->vhc_acc_thrcount >= 0);
7206 	cv_broadcast(&vhc->vhc_cv);
7207 
7208 	while ((vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) ||
7209 	    vhc->vhc_acc_thrcount != 0) {
7210 		mutex_exit(&vhc->vhc_lock);
7211 		delay(1);
7212 		mutex_enter(&vhc->vhc_lock);
7213 	}
7214 
7215 	vhc->vhc_flags &= ~MDI_VHC_EXIT;
7216 
7217 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc_next) {
7218 		acc_next = acc->acc_next;
7219 		free_async_client_config(acc);
7220 	}
7221 	vhc->vhc_acc_list_head = NULL;
7222 	vhc->vhc_acc_list_tail = NULL;
7223 	vhc->vhc_acc_count = 0;
7224 
7225 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
7226 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
7227 		mutex_exit(&vhc->vhc_lock);
7228 		if (flush_vhcache(vhc, 0) != MDI_SUCCESS) {
7229 			vhcache_dirty(vhc);
7230 			return (MDI_FAILURE);
7231 		}
7232 	} else
7233 		mutex_exit(&vhc->vhc_lock);
7234 
7235 	if (callb_delete(vhc->vhc_cbid) != 0)
7236 		return (MDI_FAILURE);
7237 
7238 	return (MDI_SUCCESS);
7239 }
7240 
7241 /*
7242  * Stop vhci cache flush thread
7243  */
7244 /* ARGSUSED */
7245 static boolean_t
7246 stop_vhcache_flush_thread(void *arg, int code)
7247 {
7248 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
7249 
7250 	mutex_enter(&vhc->vhc_lock);
7251 	vhc->vhc_flags |= MDI_VHC_EXIT;
7252 	cv_broadcast(&vhc->vhc_cv);
7253 
7254 	while (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
7255 		mutex_exit(&vhc->vhc_lock);
7256 		delay(1);
7257 		mutex_enter(&vhc->vhc_lock);
7258 	}
7259 
7260 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
7261 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
7262 		mutex_exit(&vhc->vhc_lock);
7263 		(void) flush_vhcache(vhc, 1);
7264 	} else
7265 		mutex_exit(&vhc->vhc_lock);
7266 
7267 	return (B_TRUE);
7268 }
7269 
7270 /*
7271  * Enqueue the vhcache phci (cphci) at the tail of the list
7272  */
7273 static void
7274 enqueue_vhcache_phci(mdi_vhci_cache_t *vhcache, mdi_vhcache_phci_t *cphci)
7275 {
7276 	cphci->cphci_next = NULL;
7277 	if (vhcache->vhcache_phci_head == NULL)
7278 		vhcache->vhcache_phci_head = cphci;
7279 	else
7280 		vhcache->vhcache_phci_tail->cphci_next = cphci;
7281 	vhcache->vhcache_phci_tail = cphci;
7282 }
7283 
7284 /*
7285  * Enqueue the vhcache pathinfo (cpi) at the tail of the list
7286  */
7287 static void
7288 enqueue_tail_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7289     mdi_vhcache_pathinfo_t *cpi)
7290 {
7291 	cpi->cpi_next = NULL;
7292 	if (cct->cct_cpi_head == NULL)
7293 		cct->cct_cpi_head = cpi;
7294 	else
7295 		cct->cct_cpi_tail->cpi_next = cpi;
7296 	cct->cct_cpi_tail = cpi;
7297 }
7298 
7299 /*
7300  * Enqueue the vhcache pathinfo (cpi) at the correct location in the
7301  * ordered list. All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
7302  * flag set come at the beginning of the list. All cpis which have this
7303  * flag set come at the end of the list.
7304  */
7305 static void
7306 enqueue_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7307     mdi_vhcache_pathinfo_t *newcpi)
7308 {
7309 	mdi_vhcache_pathinfo_t *cpi, *prev_cpi;
7310 
7311 	if (cct->cct_cpi_head == NULL ||
7312 	    (newcpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))
7313 		enqueue_tail_vhcache_pathinfo(cct, newcpi);
7314 	else {
7315 		for (cpi = cct->cct_cpi_head, prev_cpi = NULL; cpi != NULL &&
7316 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST);
7317 		    prev_cpi = cpi, cpi = cpi->cpi_next)
7318 			;
7319 
7320 		if (prev_cpi == NULL)
7321 			cct->cct_cpi_head = newcpi;
7322 		else
7323 			prev_cpi->cpi_next = newcpi;
7324 
7325 		newcpi->cpi_next = cpi;
7326 
7327 		if (cpi == NULL)
7328 			cct->cct_cpi_tail = newcpi;
7329 	}
7330 }
7331 
7332 /*
7333  * Enqueue the vhcache client (cct) at the tail of the list
7334  */
7335 static void
7336 enqueue_vhcache_client(mdi_vhci_cache_t *vhcache,
7337     mdi_vhcache_client_t *cct)
7338 {
7339 	cct->cct_next = NULL;
7340 	if (vhcache->vhcache_client_head == NULL)
7341 		vhcache->vhcache_client_head = cct;
7342 	else
7343 		vhcache->vhcache_client_tail->cct_next = cct;
7344 	vhcache->vhcache_client_tail = cct;
7345 }
7346 
/*
 * Free an array of nelem strings along with the array itself.  Each
 * non-NULL element must be a kmem allocation of strlen() + 1 bytes.
 * A NULL array is ignored.
 */
static void
free_string_array(char **str, int nelem)
{
	int idx;

	if (str == NULL)
		return;

	for (idx = 0; idx < nelem; idx++) {
		if (str[idx] != NULL)
			kmem_free(str[idx], strlen(str[idx]) + 1);
	}

	kmem_free(str, sizeof (char *) * nelem);
}
7360 
7361 static void
7362 free_vhcache_phci(mdi_vhcache_phci_t *cphci)
7363 {
7364 	kmem_free(cphci->cphci_path, strlen(cphci->cphci_path) + 1);
7365 	kmem_free(cphci, sizeof (*cphci));
7366 }
7367 
7368 static void
7369 free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *cpi)
7370 {
7371 	kmem_free(cpi->cpi_addr, strlen(cpi->cpi_addr) + 1);
7372 	kmem_free(cpi, sizeof (*cpi));
7373 }
7374 
7375 static void
7376 free_vhcache_client(mdi_vhcache_client_t *cct)
7377 {
7378 	kmem_free(cct->cct_name_addr, strlen(cct->cct_name_addr) + 1);
7379 	kmem_free(cct, sizeof (*cct));
7380 }
7381 
7382 static char *
7383 vhcache_mknameaddr(char *ct_name, char *ct_addr, int *ret_len)
7384 {
7385 	char *name_addr;
7386 	int len;
7387 
7388 	len = strlen(ct_name) + strlen(ct_addr) + 2;
7389 	name_addr = kmem_alloc(len, KM_SLEEP);
7390 	(void) snprintf(name_addr, len, "%s@%s", ct_name, ct_addr);
7391 
7392 	if (ret_len)
7393 		*ret_len = len;
7394 	return (name_addr);
7395 }
7396 
7397 /*
7398  * Copy the contents of paddrnvl to vhci cache.
7399  * paddrnvl nvlist contains path information for a vhci client.
7400  * See the comment in mainnvl_to_vhcache() for the format of this nvlist.
7401  */
7402 static void
7403 paddrnvl_to_vhcache(nvlist_t *nvl, mdi_vhcache_phci_t *cphci_list[],
7404     mdi_vhcache_client_t *cct)
7405 {
7406 	nvpair_t *nvp = NULL;
7407 	mdi_vhcache_pathinfo_t *cpi;
7408 	uint_t nelem;
7409 	uint32_t *val;
7410 
7411 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7412 		ASSERT(nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY);
7413 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
7414 		cpi->cpi_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7415 		(void) nvpair_value_uint32_array(nvp, &val, &nelem);
7416 		ASSERT(nelem == 2);
7417 		cpi->cpi_cphci = cphci_list[val[0]];
7418 		cpi->cpi_flags = val[1];
7419 		enqueue_tail_vhcache_pathinfo(cct, cpi);
7420 	}
7421 }
7422 
7423 /*
7424  * Copy the contents of caddrmapnvl to vhci cache.
7425  * caddrmapnvl nvlist contains vhci client address to phci client address
7426  * mappings. See the comment in mainnvl_to_vhcache() for the format of
7427  * this nvlist.
7428  */
7429 static void
7430 caddrmapnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl,
7431     mdi_vhcache_phci_t *cphci_list[])
7432 {
7433 	nvpair_t *nvp = NULL;
7434 	nvlist_t *paddrnvl;
7435 	mdi_vhcache_client_t *cct;
7436 
7437 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7438 		ASSERT(nvpair_type(nvp) == DATA_TYPE_NVLIST);
7439 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
7440 		cct->cct_name_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7441 		(void) nvpair_value_nvlist(nvp, &paddrnvl);
7442 		paddrnvl_to_vhcache(paddrnvl, cphci_list, cct);
7443 		/* the client must contain at least one path */
7444 		ASSERT(cct->cct_cpi_head != NULL);
7445 
7446 		enqueue_vhcache_client(vhcache, cct);
7447 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
7448 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
7449 	}
7450 }
7451 
7452 /*
7453  * Copy the contents of the main nvlist to vhci cache.
7454  *
7455  * VHCI busconfig cached data is stored in the form of a nvlist on the disk.
7456  * The nvlist contains the mappings between the vhci client addresses and
7457  * their corresponding phci client addresses.
7458  *
7459  * The structure of the nvlist is as follows:
7460  *
7461  * Main nvlist:
7462  *	NAME		TYPE		DATA
7463  *	version		int32		version number
7464  *	phcis		string array	array of phci paths
7465  *	clientaddrmap	nvlist_t	c2paddrs_nvl (see below)
7466  *
7467  * structure of c2paddrs_nvl:
7468  *	NAME		TYPE		DATA
7469  *	caddr1		nvlist_t	paddrs_nvl1
7470  *	caddr2		nvlist_t	paddrs_nvl2
7471  *	...
7472  * where caddr1, caddr2, ... are vhci client name and addresses in the
7473  * form of "<clientname>@<clientaddress>".
7474  * (for example: "ssd@2000002037cd9f72");
7475  * paddrs_nvl1, paddrs_nvl2, .. are nvlists that contain path information.
7476  *
7477  * structure of paddrs_nvl:
7478  *	NAME		TYPE		DATA
7479  *	pi_addr1	uint32_array	(phci-id, cpi_flags)
7480  *	pi_addr2	uint32_array	(phci-id, cpi_flags)
7481  *	...
7482  * where pi_addr1, pi_addr2, ... are bus specific addresses of pathinfo nodes
7483  * (so called pi_addrs, for example: "w2100002037cd9f72,0");
7484  * phci-ids are integers that identify PHCIs to which the
7485  * the bus specific address belongs to. These integers are used as an index
7486  * into to the phcis string array in the main nvlist to get the PHCI path.
7487  */
7488 static int
7489 mainnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl)
7490 {
7491 	char **phcis, **phci_namep;
7492 	uint_t nphcis;
7493 	mdi_vhcache_phci_t *cphci, **cphci_list;
7494 	nvlist_t *caddrmapnvl;
7495 	int32_t ver;
7496 	int i;
7497 	size_t cphci_list_size;
7498 
7499 	ASSERT(RW_WRITE_HELD(&vhcache->vhcache_lock));
7500 
7501 	if (nvlist_lookup_int32(nvl, MDI_NVPNAME_VERSION, &ver) != 0 ||
7502 	    ver != MDI_VHCI_CACHE_VERSION)
7503 		return (MDI_FAILURE);
7504 
7505 	if (nvlist_lookup_string_array(nvl, MDI_NVPNAME_PHCIS, &phcis,
7506 	    &nphcis) != 0)
7507 		return (MDI_SUCCESS);
7508 
7509 	ASSERT(nphcis > 0);
7510 
7511 	cphci_list_size = sizeof (mdi_vhcache_phci_t *) * nphcis;
7512 	cphci_list = kmem_alloc(cphci_list_size, KM_SLEEP);
7513 	for (i = 0, phci_namep = phcis; i < nphcis; i++, phci_namep++) {
7514 		cphci = kmem_zalloc(sizeof (mdi_vhcache_phci_t), KM_SLEEP);
7515 		cphci->cphci_path = i_ddi_strdup(*phci_namep, KM_SLEEP);
7516 		enqueue_vhcache_phci(vhcache, cphci);
7517 		cphci_list[i] = cphci;
7518 	}
7519 
7520 	ASSERT(vhcache->vhcache_phci_head != NULL);
7521 
7522 	if (nvlist_lookup_nvlist(nvl, MDI_NVPNAME_CTADDRMAP, &caddrmapnvl) == 0)
7523 		caddrmapnvl_to_vhcache(vhcache, caddrmapnvl, cphci_list);
7524 
7525 	kmem_free(cphci_list, cphci_list_size);
7526 	return (MDI_SUCCESS);
7527 }
7528 
7529 /*
7530  * Build paddrnvl for the specified client using the information in the
7531  * vhci cache and add it to the caddrmapnnvl.
7532  * Returns 0 on success, errno on failure.
7533  */
7534 static int
7535 vhcache_to_paddrnvl(mdi_vhci_cache_t *vhcache, mdi_vhcache_client_t *cct,
7536     nvlist_t *caddrmapnvl)
7537 {
7538 	mdi_vhcache_pathinfo_t *cpi;
7539 	nvlist_t *nvl;
7540 	int err;
7541 	uint32_t val[2];
7542 
7543 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7544 
7545 	if ((err = nvlist_alloc(&nvl, 0, KM_SLEEP)) != 0)
7546 		return (err);
7547 
7548 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7549 		val[0] = cpi->cpi_cphci->cphci_id;
7550 		val[1] = cpi->cpi_flags;
7551 		if ((err = nvlist_add_uint32_array(nvl, cpi->cpi_addr, val, 2))
7552 		    != 0)
7553 			goto out;
7554 	}
7555 
7556 	err = nvlist_add_nvlist(caddrmapnvl, cct->cct_name_addr, nvl);
7557 out:
7558 	nvlist_free(nvl);
7559 	return (err);
7560 }
7561 
7562 /*
7563  * Build caddrmapnvl using the information in the vhci cache
7564  * and add it to the mainnvl.
7565  * Returns 0 on success, errno on failure.
7566  */
7567 static int
7568 vhcache_to_caddrmapnvl(mdi_vhci_cache_t *vhcache, nvlist_t *mainnvl)
7569 {
7570 	mdi_vhcache_client_t *cct;
7571 	nvlist_t *nvl;
7572 	int err;
7573 
7574 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7575 
7576 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0)
7577 		return (err);
7578 
7579 	for (cct = vhcache->vhcache_client_head; cct != NULL;
7580 	    cct = cct->cct_next) {
7581 		if ((err = vhcache_to_paddrnvl(vhcache, cct, nvl)) != 0)
7582 			goto out;
7583 	}
7584 
7585 	err = nvlist_add_nvlist(mainnvl, MDI_NVPNAME_CTADDRMAP, nvl);
7586 out:
7587 	nvlist_free(nvl);
7588 	return (err);
7589 }
7590 
7591 /*
7592  * Build nvlist using the information in the vhci cache.
7593  * See the comment in mainnvl_to_vhcache() for the format of the nvlist.
7594  * Returns nvl on success, NULL on failure.
7595  */
7596 static nvlist_t *
7597 vhcache_to_mainnvl(mdi_vhci_cache_t *vhcache)
7598 {
7599 	mdi_vhcache_phci_t *cphci;
7600 	uint_t phci_count;
7601 	char **phcis;
7602 	nvlist_t *nvl;
7603 	int err, i;
7604 
7605 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0) {
7606 		nvl = NULL;
7607 		goto out;
7608 	}
7609 
7610 	if ((err = nvlist_add_int32(nvl, MDI_NVPNAME_VERSION,
7611 	    MDI_VHCI_CACHE_VERSION)) != 0)
7612 		goto out;
7613 
7614 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7615 	if (vhcache->vhcache_phci_head == NULL) {
7616 		rw_exit(&vhcache->vhcache_lock);
7617 		return (nvl);
7618 	}
7619 
7620 	phci_count = 0;
7621 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7622 	    cphci = cphci->cphci_next)
7623 		cphci->cphci_id = phci_count++;
7624 
7625 	/* build phci pathname list */
7626 	phcis = kmem_alloc(sizeof (char *) * phci_count, KM_SLEEP);
7627 	for (cphci = vhcache->vhcache_phci_head, i = 0; cphci != NULL;
7628 	    cphci = cphci->cphci_next, i++)
7629 		phcis[i] = i_ddi_strdup(cphci->cphci_path, KM_SLEEP);
7630 
7631 	err = nvlist_add_string_array(nvl, MDI_NVPNAME_PHCIS, phcis,
7632 	    phci_count);
7633 	free_string_array(phcis, phci_count);
7634 
7635 	if (err == 0 &&
7636 	    (err = vhcache_to_caddrmapnvl(vhcache, nvl)) == 0) {
7637 		rw_exit(&vhcache->vhcache_lock);
7638 		return (nvl);
7639 	}
7640 
7641 	rw_exit(&vhcache->vhcache_lock);
7642 out:
7643 	if (nvl)
7644 		nvlist_free(nvl);
7645 	return (NULL);
7646 }
7647 
7648 /*
7649  * Lookup vhcache phci structure for the specified phci path.
7650  */
7651 static mdi_vhcache_phci_t *
7652 lookup_vhcache_phci_by_name(mdi_vhci_cache_t *vhcache, char *phci_path)
7653 {
7654 	mdi_vhcache_phci_t *cphci;
7655 
7656 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7657 
7658 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7659 	    cphci = cphci->cphci_next) {
7660 		if (strcmp(cphci->cphci_path, phci_path) == 0)
7661 			return (cphci);
7662 	}
7663 
7664 	return (NULL);
7665 }
7666 
7667 /*
7668  * Lookup vhcache phci structure for the specified phci.
7669  */
7670 static mdi_vhcache_phci_t *
7671 lookup_vhcache_phci_by_addr(mdi_vhci_cache_t *vhcache, mdi_phci_t *ph)
7672 {
7673 	mdi_vhcache_phci_t *cphci;
7674 
7675 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7676 
7677 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7678 	    cphci = cphci->cphci_next) {
7679 		if (cphci->cphci_phci == ph)
7680 			return (cphci);
7681 	}
7682 
7683 	return (NULL);
7684 }
7685 
7686 /*
7687  * Add the specified phci to the vhci cache if not already present.
7688  */
7689 static void
7690 vhcache_phci_add(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7691 {
7692 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7693 	mdi_vhcache_phci_t *cphci;
7694 	char *pathname;
7695 	int cache_updated;
7696 
7697 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7698 
7699 	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
7700 	(void) ddi_pathname(ph->ph_dip, pathname);
7701 	if ((cphci = lookup_vhcache_phci_by_name(vhcache, pathname))
7702 	    != NULL) {
7703 		cphci->cphci_phci = ph;
7704 		cache_updated = 0;
7705 	} else {
7706 		cphci = kmem_zalloc(sizeof (*cphci), KM_SLEEP);
7707 		cphci->cphci_path = i_ddi_strdup(pathname, KM_SLEEP);
7708 		cphci->cphci_phci = ph;
7709 		enqueue_vhcache_phci(vhcache, cphci);
7710 		cache_updated = 1;
7711 	}
7712 
7713 	rw_exit(&vhcache->vhcache_lock);
7714 
7715 	/*
7716 	 * Since a new phci has been added, reset
7717 	 * vhc_path_discovery_cutoff_time to allow for discovery of paths
7718 	 * during next vhcache_discover_paths().
7719 	 */
7720 	mutex_enter(&vhc->vhc_lock);
7721 	vhc->vhc_path_discovery_cutoff_time = 0;
7722 	mutex_exit(&vhc->vhc_lock);
7723 
7724 	kmem_free(pathname, MAXPATHLEN);
7725 	if (cache_updated)
7726 		vhcache_dirty(vhc);
7727 }
7728 
7729 /*
7730  * Remove the reference to the specified phci from the vhci cache.
7731  */
7732 static void
7733 vhcache_phci_remove(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7734 {
7735 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7736 	mdi_vhcache_phci_t *cphci;
7737 
7738 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7739 	if ((cphci = lookup_vhcache_phci_by_addr(vhcache, ph)) != NULL) {
7740 		/* do not remove the actual mdi_vhcache_phci structure */
7741 		cphci->cphci_phci = NULL;
7742 	}
7743 	rw_exit(&vhcache->vhcache_lock);
7744 }
7745 
7746 static void
7747 init_vhcache_lookup_token(mdi_vhcache_lookup_token_t *dst,
7748     mdi_vhcache_lookup_token_t *src)
7749 {
7750 	if (src == NULL) {
7751 		dst->lt_cct = NULL;
7752 		dst->lt_cct_lookup_time = 0;
7753 	} else {
7754 		dst->lt_cct = src->lt_cct;
7755 		dst->lt_cct_lookup_time = src->lt_cct_lookup_time;
7756 	}
7757 }
7758 
7759 /*
7760  * Look up vhcache client for the specified client.
7761  */
7762 static mdi_vhcache_client_t *
7763 lookup_vhcache_client(mdi_vhci_cache_t *vhcache, char *ct_name, char *ct_addr,
7764     mdi_vhcache_lookup_token_t *token)
7765 {
7766 	mod_hash_val_t hv;
7767 	char *name_addr;
7768 	int len;
7769 
7770 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7771 
7772 	/*
7773 	 * If no vhcache clean occurred since the last lookup, we can
7774 	 * simply return the cct from the last lookup operation.
7775 	 * It works because ccts are never freed except during the vhcache
7776 	 * cleanup operation.
7777 	 */
7778 	if (token != NULL &&
7779 	    vhcache->vhcache_clean_time < token->lt_cct_lookup_time)
7780 		return (token->lt_cct);
7781 
7782 	name_addr = vhcache_mknameaddr(ct_name, ct_addr, &len);
7783 	if (mod_hash_find(vhcache->vhcache_client_hash,
7784 	    (mod_hash_key_t)name_addr, &hv) == 0) {
7785 		if (token) {
7786 			token->lt_cct = (mdi_vhcache_client_t *)hv;
7787 			token->lt_cct_lookup_time = lbolt64;
7788 		}
7789 	} else {
7790 		if (token) {
7791 			token->lt_cct = NULL;
7792 			token->lt_cct_lookup_time = 0;
7793 		}
7794 		hv = NULL;
7795 	}
7796 	kmem_free(name_addr, len);
7797 	return ((mdi_vhcache_client_t *)hv);
7798 }
7799 
7800 /*
7801  * Add the specified path to the vhci cache if not already present.
7802  * Also add the vhcache client for the client corresponding to this path
7803  * if it doesn't already exist.
7804  */
7805 static void
7806 vhcache_pi_add(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
7807 {
7808 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7809 	mdi_vhcache_client_t *cct;
7810 	mdi_vhcache_pathinfo_t *cpi;
7811 	mdi_phci_t *ph = pip->pi_phci;
7812 	mdi_client_t *ct = pip->pi_client;
7813 	int cache_updated = 0;
7814 
7815 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7816 
7817 	/* if vhcache client for this pip doesn't already exist, add it */
7818 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
7819 	    NULL)) == NULL) {
7820 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
7821 		cct->cct_name_addr = vhcache_mknameaddr(ct->ct_drvname,
7822 		    ct->ct_guid, NULL);
7823 		enqueue_vhcache_client(vhcache, cct);
7824 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
7825 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
7826 		cache_updated = 1;
7827 	}
7828 
7829 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7830 		if (cpi->cpi_cphci->cphci_phci == ph &&
7831 		    strcmp(cpi->cpi_addr, pip->pi_addr) == 0) {
7832 			cpi->cpi_pip = pip;
7833 			if (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST) {
7834 				cpi->cpi_flags &=
7835 				    ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
7836 				sort_vhcache_paths(cct);
7837 				cache_updated = 1;
7838 			}
7839 			break;
7840 		}
7841 	}
7842 
7843 	if (cpi == NULL) {
7844 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
7845 		cpi->cpi_addr = i_ddi_strdup(pip->pi_addr, KM_SLEEP);
7846 		cpi->cpi_cphci = lookup_vhcache_phci_by_addr(vhcache, ph);
7847 		ASSERT(cpi->cpi_cphci != NULL);
7848 		cpi->cpi_pip = pip;
7849 		enqueue_vhcache_pathinfo(cct, cpi);
7850 		cache_updated = 1;
7851 	}
7852 
7853 	rw_exit(&vhcache->vhcache_lock);
7854 
7855 	if (cache_updated)
7856 		vhcache_dirty(vhc);
7857 }
7858 
7859 /*
7860  * Remove the reference to the specified path from the vhci cache.
7861  */
7862 static void
7863 vhcache_pi_remove(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
7864 {
7865 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7866 	mdi_client_t *ct = pip->pi_client;
7867 	mdi_vhcache_client_t *cct;
7868 	mdi_vhcache_pathinfo_t *cpi;
7869 
7870 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7871 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
7872 	    NULL)) != NULL) {
7873 		for (cpi = cct->cct_cpi_head; cpi != NULL;
7874 		    cpi = cpi->cpi_next) {
7875 			if (cpi->cpi_pip == pip) {
7876 				cpi->cpi_pip = NULL;
7877 				break;
7878 			}
7879 		}
7880 	}
7881 	rw_exit(&vhcache->vhcache_lock);
7882 }
7883 
7884 /*
7885  * Flush the vhci cache to disk.
7886  * Returns MDI_SUCCESS on success, MDI_FAILURE on failure.
7887  */
7888 static int
7889 flush_vhcache(mdi_vhci_config_t *vhc, int force_flag)
7890 {
7891 	nvlist_t *nvl;
7892 	int err;
7893 	int rv;
7894 
7895 	/*
7896 	 * It is possible that the system may shutdown before
7897 	 * i_ddi_io_initialized (during stmsboot for example). To allow for
7898 	 * flushing the cache in this case do not check for
7899 	 * i_ddi_io_initialized when force flag is set.
7900 	 */
7901 	if (force_flag == 0 && !i_ddi_io_initialized())
7902 		return (MDI_FAILURE);
7903 
7904 	if ((nvl = vhcache_to_mainnvl(&vhc->vhc_vhcache)) != NULL) {
7905 		err = fwrite_nvlist(vhc->vhc_vhcache_filename, nvl);
7906 		nvlist_free(nvl);
7907 	} else
7908 		err = EFAULT;
7909 
7910 	rv = MDI_SUCCESS;
7911 	mutex_enter(&vhc->vhc_lock);
7912 	if (err != 0) {
7913 		if (err == EROFS) {
7914 			vhc->vhc_flags |= MDI_VHC_READONLY_FS;
7915 			vhc->vhc_flags &= ~(MDI_VHC_VHCACHE_FLUSH_ERROR |
7916 			    MDI_VHC_VHCACHE_DIRTY);
7917 		} else {
7918 			if (!(vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR)) {
7919 				cmn_err(CE_CONT, "%s: update failed\n",
7920 				    vhc->vhc_vhcache_filename);
7921 				vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_ERROR;
7922 			}
7923 			rv = MDI_FAILURE;
7924 		}
7925 	} else if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR) {
7926 		cmn_err(CE_CONT,
7927 		    "%s: update now ok\n", vhc->vhc_vhcache_filename);
7928 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_ERROR;
7929 	}
7930 	mutex_exit(&vhc->vhc_lock);
7931 
7932 	return (rv);
7933 }
7934 
/*
 * Call flush_vhcache() to flush the vhci cache at the scheduled time.
 * Exits itself if left idle for the idle timeout period.
 *
 * arg is the mdi_vhci_config_t whose cache this thread flushes. vhc_lock is
 * held throughout, except while waiting on vhc_cv and while flush_vhcache()
 * runs. MDI_VHC_VHCACHE_FLUSH_THREAD is cleared from vhc_flags on exit.
 */
static void
vhcache_flush_thread(void *arg)
{
	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
	clock_t idle_time, quit_at_ticks;
	callb_cpr_t cprinfo;

	/* number of seconds to sleep idle before exiting */
	idle_time = mdi_vhcache_flush_daemon_idle_time * TICKS_PER_SECOND;

	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
	    "mdi_vhcache_flush");
	mutex_enter(&vhc->vhc_lock);
	for (; ; ) {
		/* phase 1: while the cache is dirty, wait for and do flushes */
		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
		    (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY)) {
			if (ddi_get_lbolt() < vhc->vhc_flush_at_ticks) {
				/* not yet time to flush: sleep until it is */
				CALLB_CPR_SAFE_BEGIN(&cprinfo);
				(void) cv_timedwait(&vhc->vhc_cv,
				    &vhc->vhc_lock, vhc->vhc_flush_at_ticks);
				CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
			} else {
				/*
				 * Clear the dirty flag before dropping the
				 * lock for the flush; a failed flush marks
				 * the cache dirty again via vhcache_dirty().
				 */
				vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
				mutex_exit(&vhc->vhc_lock);

				if (flush_vhcache(vhc, 0) != MDI_SUCCESS)
					vhcache_dirty(vhc);

				mutex_enter(&vhc->vhc_lock);
			}
		}

		/* phase 2: idle until dirtied, told to exit, or timed out */
		quit_at_ticks = ddi_get_lbolt() + idle_time;

		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) &&
		    ddi_get_lbolt() < quit_at_ticks) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
			    quit_at_ticks);
			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
		}

		/* keep going only if the cache became dirty again */
		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY))
			goto out;
	}

out:
	/* vhc_lock is held here; vhcache_dirty() may now start a new thread */
	vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_THREAD;
	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
	CALLB_CPR_EXIT(&cprinfo);
}
7992 
7993 /*
7994  * Make vhci cache dirty and schedule flushing by vhcache flush thread.
7995  */
7996 static void
7997 vhcache_dirty(mdi_vhci_config_t *vhc)
7998 {
7999 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8000 	int create_thread;
8001 
8002 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8003 	/* do not flush cache until the cache is fully built */
8004 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
8005 		rw_exit(&vhcache->vhcache_lock);
8006 		return;
8007 	}
8008 	rw_exit(&vhcache->vhcache_lock);
8009 
8010 	mutex_enter(&vhc->vhc_lock);
8011 	if (vhc->vhc_flags & MDI_VHC_READONLY_FS) {
8012 		mutex_exit(&vhc->vhc_lock);
8013 		return;
8014 	}
8015 
8016 	vhc->vhc_flags |= MDI_VHC_VHCACHE_DIRTY;
8017 	vhc->vhc_flush_at_ticks = ddi_get_lbolt() +
8018 	    mdi_vhcache_flush_delay * TICKS_PER_SECOND;
8019 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
8020 		cv_broadcast(&vhc->vhc_cv);
8021 		create_thread = 0;
8022 	} else {
8023 		vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_THREAD;
8024 		create_thread = 1;
8025 	}
8026 	mutex_exit(&vhc->vhc_lock);
8027 
8028 	if (create_thread)
8029 		(void) thread_create(NULL, 0, vhcache_flush_thread, vhc,
8030 		    0, &p0, TS_RUN, minclsyspri);
8031 }
8032 
/*
 * phci bus config structure - one for each phci bus config operation that
 * we initiate on behalf of a vhci.
 */
typedef struct mdi_phci_bus_config_s {
	char *phbc_phci_path;		/* device path of the phci */
	struct mdi_vhci_bus_config_s *phbc_vhbusconfig;	/* vhci bus config */
	struct mdi_phci_bus_config_s *phbc_next;	/* next in the list */
} mdi_phci_bus_config_t;
8042 
/*
 * vhci bus config structure - one for each vhci bus config operation.
 * It is shared by all the phci bus config operations started for it.
 */
typedef struct mdi_vhci_bus_config_s {
	ddi_bus_config_op_t vhbc_op;	/* bus config op */
	major_t vhbc_op_major;		/* bus config op major */
	uint_t vhbc_op_flags;		/* bus config op flags */
	kmutex_t vhbc_lock;		/* held to update vhbc_thr_count */
	kcondvar_t vhbc_cv;		/* signalled when count reaches 0 */
	int vhbc_thr_count;		/* phci operations not yet finished */
} mdi_vhci_bus_config_t;
8052 
8053 /*
8054  * bus config the specified phci
8055  */
8056 static void
8057 bus_config_phci(void *arg)
8058 {
8059 	mdi_phci_bus_config_t *phbc = (mdi_phci_bus_config_t *)arg;
8060 	mdi_vhci_bus_config_t *vhbc = phbc->phbc_vhbusconfig;
8061 	dev_info_t *ph_dip;
8062 
8063 	/*
8064 	 * first configure all path components upto phci and then configure
8065 	 * the phci children.
8066 	 */
8067 	if ((ph_dip = e_ddi_hold_devi_by_path(phbc->phbc_phci_path, 0))
8068 	    != NULL) {
8069 		if (vhbc->vhbc_op == BUS_CONFIG_DRIVER ||
8070 		    vhbc->vhbc_op == BUS_UNCONFIG_DRIVER) {
8071 			(void) ndi_devi_config_driver(ph_dip,
8072 			    vhbc->vhbc_op_flags,
8073 			    vhbc->vhbc_op_major);
8074 		} else
8075 			(void) ndi_devi_config(ph_dip,
8076 			    vhbc->vhbc_op_flags);
8077 
8078 		/* release the hold that e_ddi_hold_devi_by_path() placed */
8079 		ndi_rele_devi(ph_dip);
8080 	}
8081 
8082 	kmem_free(phbc->phbc_phci_path, strlen(phbc->phbc_phci_path) + 1);
8083 	kmem_free(phbc, sizeof (*phbc));
8084 
8085 	mutex_enter(&vhbc->vhbc_lock);
8086 	vhbc->vhbc_thr_count--;
8087 	if (vhbc->vhbc_thr_count == 0)
8088 		cv_broadcast(&vhbc->vhbc_cv);
8089 	mutex_exit(&vhbc->vhbc_lock);
8090 }
8091 
/*
 * Bus config all phcis associated with the vhci in parallel.
 * op must be BUS_CONFIG_DRIVER or BUS_CONFIG_ALL.
 *
 * One bus_config_phci() operation is started per phci in the vhci cache,
 * each in its own thread unless mdi_mtc_off is set, in which case they are
 * run one after another in the calling thread. Only the NDI_CONFIG_REPROBE
 * and NDI_DRV_CONF_REPROBE bits of flags are passed on. Does not return
 * until every operation has finished.
 */
static void
bus_config_all_phcis(mdi_vhci_cache_t *vhcache, uint_t flags,
    ddi_bus_config_op_t op, major_t maj)
{
	mdi_phci_bus_config_t *phbc_head = NULL, *phbc, *phbc_next;
	mdi_vhci_bus_config_t *vhbc;
	mdi_vhcache_phci_t *cphci;

	rw_enter(&vhcache->vhcache_lock, RW_READER);
	if (vhcache->vhcache_phci_head == NULL) {
		rw_exit(&vhcache->vhcache_lock);
		return;
	}

	vhbc = kmem_zalloc(sizeof (*vhbc), KM_SLEEP);

	/*
	 * Snapshot the phci paths into a private list while holding the
	 * cache lock, so the lock need not be held during the bus config.
	 */
	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
	    cphci = cphci->cphci_next) {
		/* skip phcis that haven't attached before root is available */
		if (!modrootloaded && (cphci->cphci_phci == NULL))
			continue;
		phbc = kmem_zalloc(sizeof (*phbc), KM_SLEEP);
		phbc->phbc_phci_path = i_ddi_strdup(cphci->cphci_path,
		    KM_SLEEP);
		phbc->phbc_vhbusconfig = vhbc;
		phbc->phbc_next = phbc_head;
		phbc_head = phbc;
		vhbc->vhbc_thr_count++;
	}
	rw_exit(&vhcache->vhcache_lock);

	vhbc->vhbc_op = op;
	vhbc->vhbc_op_major = maj;
	vhbc->vhbc_op_flags = NDI_NO_EVENT |
	    (flags & (NDI_CONFIG_REPROBE | NDI_DRV_CONF_REPROBE));
	mutex_init(&vhbc->vhbc_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vhbc->vhbc_cv, NULL, CV_DRIVER, NULL);

	/* now create threads to initiate bus config on all phcis in parallel */
	for (phbc = phbc_head; phbc != NULL; phbc = phbc_next) {
		/* bus_config_phci() frees phbc, so fetch the link first */
		phbc_next = phbc->phbc_next;
		if (mdi_mtc_off)
			bus_config_phci((void *)phbc);
		else
			(void) thread_create(NULL, 0, bus_config_phci, phbc,
			    0, &p0, TS_RUN, minclsyspri);
	}

	mutex_enter(&vhbc->vhbc_lock);
	/* wait until all threads exit */
	while (vhbc->vhbc_thr_count > 0)
		cv_wait(&vhbc->vhbc_cv, &vhbc->vhbc_lock);
	mutex_exit(&vhbc->vhbc_lock);

	mutex_destroy(&vhbc->vhbc_lock);
	cv_destroy(&vhbc->vhbc_cv);
	kmem_free(vhbc, sizeof (*vhbc));
}
8154 
8155 /*
8156  * Single threaded version of bus_config_all_phcis()
8157  */
8158 static void
8159 st_bus_config_all_phcis(mdi_vhci_config_t *vhc, uint_t flags,
8160     ddi_bus_config_op_t op, major_t maj)
8161 {
8162 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8163 
8164 	single_threaded_vhconfig_enter(vhc);
8165 	bus_config_all_phcis(vhcache, flags, op, maj);
8166 	single_threaded_vhconfig_exit(vhc);
8167 }
8168 
8169 /*
8170  * Perform BUS_CONFIG_ONE on the specified child of the phci.
8171  * The path includes the child component in addition to the phci path.
8172  */
8173 static int
8174 bus_config_one_phci_child(char *path)
8175 {
8176 	dev_info_t *ph_dip, *child;
8177 	char *devnm;
8178 	int rv = MDI_FAILURE;
8179 
8180 	/* extract the child component of the phci */
8181 	devnm = strrchr(path, '/');
8182 	*devnm++ = '\0';
8183 
8184 	/*
8185 	 * first configure all path components upto phci and then
8186 	 * configure the phci child.
8187 	 */
8188 	if ((ph_dip = e_ddi_hold_devi_by_path(path, 0)) != NULL) {
8189 		if (ndi_devi_config_one(ph_dip, devnm, &child, NDI_NO_EVENT) ==
8190 		    NDI_SUCCESS) {
8191 			/*
8192 			 * release the hold that ndi_devi_config_one() placed
8193 			 */
8194 			ndi_rele_devi(child);
8195 			rv = MDI_SUCCESS;
8196 		}
8197 
8198 		/* release the hold that e_ddi_hold_devi_by_path() placed */
8199 		ndi_rele_devi(ph_dip);
8200 	}
8201 
8202 	devnm--;
8203 	*devnm = '/';
8204 	return (rv);
8205 }
8206 
8207 /*
8208  * Build a list of phci client paths for the specified vhci client.
8209  * The list includes only those phci client paths which aren't configured yet.
8210  */
8211 static mdi_phys_path_t *
8212 build_phclient_path_list(mdi_vhcache_client_t *cct, char *ct_name)
8213 {
8214 	mdi_vhcache_pathinfo_t *cpi;
8215 	mdi_phys_path_t *pp_head = NULL, *pp_tail = NULL, *pp;
8216 	int config_path, len;
8217 
8218 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8219 		/*
8220 		 * include only those paths that aren't configured.
8221 		 */
8222 		config_path = 0;
8223 		if (cpi->cpi_pip == NULL)
8224 			config_path = 1;
8225 		else {
8226 			MDI_PI_LOCK(cpi->cpi_pip);
8227 			if (MDI_PI_IS_INIT(cpi->cpi_pip))
8228 				config_path = 1;
8229 			MDI_PI_UNLOCK(cpi->cpi_pip);
8230 		}
8231 
8232 		if (config_path) {
8233 			pp = kmem_alloc(sizeof (*pp), KM_SLEEP);
8234 			len = strlen(cpi->cpi_cphci->cphci_path) +
8235 			    strlen(ct_name) + strlen(cpi->cpi_addr) + 3;
8236 			pp->phys_path = kmem_alloc(len, KM_SLEEP);
8237 			(void) snprintf(pp->phys_path, len, "%s/%s@%s",
8238 			    cpi->cpi_cphci->cphci_path, ct_name,
8239 			    cpi->cpi_addr);
8240 			pp->phys_path_next = NULL;
8241 
8242 			if (pp_head == NULL)
8243 				pp_head = pp;
8244 			else
8245 				pp_tail->phys_path_next = pp;
8246 			pp_tail = pp;
8247 		}
8248 	}
8249 
8250 	return (pp_head);
8251 }
8252 
8253 /*
8254  * Free the memory allocated for phci client path list.
8255  */
8256 static void
8257 free_phclient_path_list(mdi_phys_path_t *pp_head)
8258 {
8259 	mdi_phys_path_t *pp, *pp_next;
8260 
8261 	for (pp = pp_head; pp != NULL; pp = pp_next) {
8262 		pp_next = pp->phys_path_next;
8263 		kmem_free(pp->phys_path, strlen(pp->phys_path) + 1);
8264 		kmem_free(pp, sizeof (*pp));
8265 	}
8266 }
8267 
8268 /*
8269  * Allocated async client structure and initialize with the specified values.
8270  */
8271 static mdi_async_client_config_t *
8272 alloc_async_client_config(char *ct_name, char *ct_addr,
8273     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8274 {
8275 	mdi_async_client_config_t *acc;
8276 
8277 	acc = kmem_alloc(sizeof (*acc), KM_SLEEP);
8278 	acc->acc_ct_name = i_ddi_strdup(ct_name, KM_SLEEP);
8279 	acc->acc_ct_addr = i_ddi_strdup(ct_addr, KM_SLEEP);
8280 	acc->acc_phclient_path_list_head = pp_head;
8281 	init_vhcache_lookup_token(&acc->acc_token, tok);
8282 	acc->acc_next = NULL;
8283 	return (acc);
8284 }
8285 
8286 /*
8287  * Free the memory allocated for the async client structure and their members.
8288  */
8289 static void
8290 free_async_client_config(mdi_async_client_config_t *acc)
8291 {
8292 	if (acc->acc_phclient_path_list_head)
8293 		free_phclient_path_list(acc->acc_phclient_path_list_head);
8294 	kmem_free(acc->acc_ct_name, strlen(acc->acc_ct_name) + 1);
8295 	kmem_free(acc->acc_ct_addr, strlen(acc->acc_ct_addr) + 1);
8296 	kmem_free(acc, sizeof (*acc));
8297 }
8298 
8299 /*
8300  * Sort vhcache pathinfos (cpis) of the specified client.
8301  * All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
8302  * flag set come at the beginning of the list. All cpis which have this
8303  * flag set come at the end of the list.
8304  */
8305 static void
8306 sort_vhcache_paths(mdi_vhcache_client_t *cct)
8307 {
8308 	mdi_vhcache_pathinfo_t *cpi, *cpi_next, *cpi_head;
8309 
8310 	cpi_head = cct->cct_cpi_head;
8311 	cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
8312 	for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
8313 		cpi_next = cpi->cpi_next;
8314 		enqueue_vhcache_pathinfo(cct, cpi);
8315 	}
8316 }
8317 
8318 /*
8319  * Verify whether MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag setting is correct for
8320  * every vhcache pathinfo of the specified client. If not adjust the flag
8321  * setting appropriately.
8322  *
8323  * Note that MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag is persisted in the
8324  * on-disk vhci cache. So every time this flag is updated the cache must be
8325  * flushed.
8326  */
8327 static void
8328 adjust_sort_vhcache_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8329     mdi_vhcache_lookup_token_t *tok)
8330 {
8331 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8332 	mdi_vhcache_client_t *cct;
8333 	mdi_vhcache_pathinfo_t *cpi;
8334 
8335 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8336 	if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, tok))
8337 	    == NULL) {
8338 		rw_exit(&vhcache->vhcache_lock);
8339 		return;
8340 	}
8341 
8342 	/*
8343 	 * to avoid unnecessary on-disk cache updates, first check if an
8344 	 * update is really needed. If no update is needed simply return.
8345 	 */
8346 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8347 		if ((cpi->cpi_pip != NULL &&
8348 		    (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST)) ||
8349 		    (cpi->cpi_pip == NULL &&
8350 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))) {
8351 			break;
8352 		}
8353 	}
8354 	if (cpi == NULL) {
8355 		rw_exit(&vhcache->vhcache_lock);
8356 		return;
8357 	}
8358 
8359 	if (rw_tryupgrade(&vhcache->vhcache_lock) == 0) {
8360 		rw_exit(&vhcache->vhcache_lock);
8361 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8362 		if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr,
8363 		    tok)) == NULL) {
8364 			rw_exit(&vhcache->vhcache_lock);
8365 			return;
8366 		}
8367 	}
8368 
8369 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8370 		if (cpi->cpi_pip != NULL)
8371 			cpi->cpi_flags &= ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8372 		else
8373 			cpi->cpi_flags |= MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8374 	}
8375 	sort_vhcache_paths(cct);
8376 
8377 	rw_exit(&vhcache->vhcache_lock);
8378 	vhcache_dirty(vhc);
8379 }
8380 
8381 /*
8382  * Configure all specified paths of the client.
8383  */
8384 static void
8385 config_client_paths_sync(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8386     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8387 {
8388 	mdi_phys_path_t *pp;
8389 
8390 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next)
8391 		(void) bus_config_one_phci_child(pp->phys_path);
8392 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, tok);
8393 }
8394 
/*
 * Dequeue elements from vhci async client config list and bus configure
 * their corresponding phci clients.
 *
 * arg is the mdi_vhci_config_t owning the list. The thread exits when
 * MDI_VHC_EXIT is set, or when the list has stayed empty for
 * mdi_async_config_idle_time seconds; vhc_acc_thrcount is decremented on
 * exit.
 */
static void
config_client_paths_thread(void *arg)
{
	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
	mdi_async_client_config_t *acc;
	clock_t quit_at_ticks;
	clock_t idle_time = mdi_async_config_idle_time * TICKS_PER_SECOND;
	callb_cpr_t cprinfo;

	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
	    "mdi_config_client_paths");

	for (; ; ) {
		/* the idle timeout restarts after every element processed */
		quit_at_ticks = ddi_get_lbolt() + idle_time;

		mutex_enter(&vhc->vhc_lock);
		/* wait for work, an exit request, or the idle timeout */
		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
		    vhc->vhc_acc_list_head == NULL &&
		    ddi_get_lbolt() < quit_at_ticks) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
			    quit_at_ticks);
			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
		}

		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
		    vhc->vhc_acc_list_head == NULL)
			goto out;

		/* take the first element off the list */
		acc = vhc->vhc_acc_list_head;
		vhc->vhc_acc_list_head = acc->acc_next;
		if (vhc->vhc_acc_list_head == NULL)
			vhc->vhc_acc_list_tail = NULL;
		vhc->vhc_acc_count--;
		mutex_exit(&vhc->vhc_lock);

		/* do the actual configuration without holding vhc_lock */
		config_client_paths_sync(vhc, acc->acc_ct_name,
		    acc->acc_ct_addr, acc->acc_phclient_path_list_head,
		    &acc->acc_token);

		/* also frees the path list that was queued with acc */
		free_async_client_config(acc);
	}

out:
	/* vhc_lock is still held when we get here */
	vhc->vhc_acc_thrcount--;
	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
	CALLB_CPR_EXIT(&cprinfo);
}
8447 
8448 /*
8449  * Arrange for all the phci client paths (pp_head) for the specified client
8450  * to be bus configured asynchronously by a thread.
8451  */
8452 static void
8453 config_client_paths_async(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8454     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8455 {
8456 	mdi_async_client_config_t *acc, *newacc;
8457 	int create_thread;
8458 
8459 	if (pp_head == NULL)
8460 		return;
8461 
8462 	if (mdi_mtc_off) {
8463 		config_client_paths_sync(vhc, ct_name, ct_addr, pp_head, tok);
8464 		free_phclient_path_list(pp_head);
8465 		return;
8466 	}
8467 
8468 	newacc = alloc_async_client_config(ct_name, ct_addr, pp_head, tok);
8469 	ASSERT(newacc);
8470 
8471 	mutex_enter(&vhc->vhc_lock);
8472 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc->acc_next) {
8473 		if (strcmp(ct_name, acc->acc_ct_name) == 0 &&
8474 		    strcmp(ct_addr, acc->acc_ct_addr) == 0) {
8475 			free_async_client_config(newacc);
8476 			mutex_exit(&vhc->vhc_lock);
8477 			return;
8478 		}
8479 	}
8480 
8481 	if (vhc->vhc_acc_list_head == NULL)
8482 		vhc->vhc_acc_list_head = newacc;
8483 	else
8484 		vhc->vhc_acc_list_tail->acc_next = newacc;
8485 	vhc->vhc_acc_list_tail = newacc;
8486 	vhc->vhc_acc_count++;
8487 	if (vhc->vhc_acc_count <= vhc->vhc_acc_thrcount) {
8488 		cv_broadcast(&vhc->vhc_cv);
8489 		create_thread = 0;
8490 	} else {
8491 		vhc->vhc_acc_thrcount++;
8492 		create_thread = 1;
8493 	}
8494 	mutex_exit(&vhc->vhc_lock);
8495 
8496 	if (create_thread)
8497 		(void) thread_create(NULL, 0, config_client_paths_thread, vhc,
8498 		    0, &p0, TS_RUN, minclsyspri);
8499 }
8500 
8501 /*
8502  * Return number of online paths for the specified client.
8503  */
8504 static int
8505 nonline_paths(mdi_vhcache_client_t *cct)
8506 {
8507 	mdi_vhcache_pathinfo_t *cpi;
8508 	int online_count = 0;
8509 
8510 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8511 		if (cpi->cpi_pip != NULL) {
8512 			MDI_PI_LOCK(cpi->cpi_pip);
8513 			if (cpi->cpi_pip->pi_state == MDI_PATHINFO_STATE_ONLINE)
8514 				online_count++;
8515 			MDI_PI_UNLOCK(cpi->cpi_pip);
8516 		}
8517 	}
8518 
8519 	return (online_count);
8520 }
8521 
/*
 * Bus configure all paths for the specified vhci client.
 * If at least one path for the client is already online, the remaining paths
 * will be configured asynchronously. Otherwise, it synchronously configures
 * the paths until at least one path is online and then rest of the paths
 * will be configured asynchronously.
 *
 * Must be called with vhcache_lock held; the lock is released on every
 * return path. Nothing is configured if ct_name or ct_addr is NULL, if the
 * client is not in the cache, or if all its paths are already configured.
 */
static void
config_client_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr)
{
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	mdi_phys_path_t *pp_head, *pp;
	mdi_vhcache_client_t *cct;
	mdi_vhcache_lookup_token_t tok;

	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));

	init_vhcache_lookup_token(&tok, NULL);

	if (ct_name == NULL || ct_addr == NULL ||
	    (cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, &tok))
	    == NULL ||
	    (pp_head = build_phclient_path_list(cct, ct_name)) == NULL) {
		rw_exit(&vhcache->vhcache_lock);
		return;
	}

	/* if at least one path is online, configure the rest asynchronously */
	if (nonline_paths(cct) > 0) {
		rw_exit(&vhcache->vhcache_lock);
		/* the async routine takes over (and frees) the whole list */
		config_client_paths_async(vhc, ct_name, ct_addr, pp_head, &tok);
		return;
	}

	/* the bus config below is done without holding the cache lock */
	rw_exit(&vhcache->vhcache_lock);

	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next) {
		if (bus_config_one_phci_child(pp->phys_path) == MDI_SUCCESS) {
			rw_enter(&vhcache->vhcache_lock, RW_READER);

			/* the lock was dropped, so look the client up again */
			if ((cct = lookup_vhcache_client(vhcache, ct_name,
			    ct_addr, &tok)) == NULL) {
				rw_exit(&vhcache->vhcache_lock);
				goto out;
			}

			if (nonline_paths(cct) > 0 &&
			    pp->phys_path_next != NULL) {
				rw_exit(&vhcache->vhcache_lock);
				/*
				 * A path is online now. Hand the rest of the
				 * list over to the async routine and cut it
				 * off from the part that is freed below.
				 */
				config_client_paths_async(vhc, ct_name, ct_addr,
				    pp->phys_path_next, &tok);
				pp->phys_path_next = NULL;
				goto out;
			}

			rw_exit(&vhcache->vhcache_lock);
		}
	}

	/* every path was tried synchronously: update the cached hints */
	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, &tok);
out:
	free_phclient_path_list(pp_head);
}
8585 
8586 static void
8587 single_threaded_vhconfig_enter(mdi_vhci_config_t *vhc)
8588 {
8589 	mutex_enter(&vhc->vhc_lock);
8590 	while (vhc->vhc_flags & MDI_VHC_SINGLE_THREADED)
8591 		cv_wait(&vhc->vhc_cv, &vhc->vhc_lock);
8592 	vhc->vhc_flags |= MDI_VHC_SINGLE_THREADED;
8593 	mutex_exit(&vhc->vhc_lock);
8594 }
8595 
8596 static void
8597 single_threaded_vhconfig_exit(mdi_vhci_config_t *vhc)
8598 {
8599 	mutex_enter(&vhc->vhc_lock);
8600 	vhc->vhc_flags &= ~MDI_VHC_SINGLE_THREADED;
8601 	cv_broadcast(&vhc->vhc_cv);
8602 	mutex_exit(&vhc->vhc_lock);
8603 }
8604 
/* An entry of the built-in phci driver tables. */
typedef struct mdi_phci_driver_info {
	char	*phdriver_name;	/* name of the phci driver */

	/* set to non zero if the phci driver supports root device */
	int	phdriver_root_support;
} mdi_phci_driver_info_t;
8611 
8612 /*
8613  * vhci class and root support capability of a phci driver can be
8614  * specified using ddi-vhci-class and ddi-no-root-support properties in the
8615  * phci driver.conf file. The built-in tables below contain this information
8616  * for those phci drivers whose driver.conf files don't yet contain this info.
8617  *
8618  * All phci drivers expect iscsi have root device support.
8619  */
8620 static mdi_phci_driver_info_t scsi_phci_driver_list[] = {
8621 	{ "fp", 1 },
8622 	{ "iscsi", 0 },
8623 	{ "ibsrp", 1 }
8624 	};
8625 
8626 static mdi_phci_driver_info_t ib_phci_driver_list[] = { "tavor", 1 };
8627 
8628 static void *
8629 mdi_realloc(void *old_ptr, size_t old_size, size_t new_size)
8630 {
8631 	void *new_ptr;
8632 
8633 	new_ptr = kmem_zalloc(new_size, KM_SLEEP);
8634 	if (old_ptr) {
8635 		bcopy(old_ptr, new_ptr, MIN(old_size, new_size));
8636 		kmem_free(old_ptr, old_size);
8637 	}
8638 	return (new_ptr);
8639 }
8640 
8641 static void
8642 add_to_phci_list(char ***driver_list, int **root_support_list,
8643     int *cur_elements, int *max_elements, char *driver_name, int root_support)
8644 {
8645 	ASSERT(*cur_elements <= *max_elements);
8646 	if (*cur_elements == *max_elements) {
8647 		*max_elements += 10;
8648 		*driver_list = mdi_realloc(*driver_list,
8649 		    sizeof (char *) * (*cur_elements),
8650 		    sizeof (char *) * (*max_elements));
8651 		*root_support_list = mdi_realloc(*root_support_list,
8652 		    sizeof (int) * (*cur_elements),
8653 		    sizeof (int) * (*max_elements));
8654 	}
8655 	(*driver_list)[*cur_elements] = i_ddi_strdup(driver_name, KM_SLEEP);
8656 	(*root_support_list)[*cur_elements] = root_support;
8657 	(*cur_elements)++;
8658 }
8659 
8660 static void
8661 get_phci_driver_list(char *vhci_class, char ***driver_list,
8662     int **root_support_list, int *cur_elements, int *max_elements)
8663 {
8664 	mdi_phci_driver_info_t	*st_driver_list, *p;
8665 	int		st_ndrivers, root_support, i, j, driver_conf_count;
8666 	major_t		m;
8667 	struct devnames	*dnp;
8668 	ddi_prop_t	*propp;
8669 
8670 	*driver_list = NULL;
8671 	*root_support_list = NULL;
8672 	*cur_elements = 0;
8673 	*max_elements = 0;
8674 
8675 	/* add the phci drivers derived from the phci driver.conf files */
8676 	for (m = 0; m < devcnt; m++) {
8677 		dnp = &devnamesp[m];
8678 
8679 		if (dnp->dn_flags & DN_PHCI_DRIVER) {
8680 			LOCK_DEV_OPS(&dnp->dn_lock);
8681 			if (dnp->dn_global_prop_ptr != NULL &&
8682 			    (propp = i_ddi_prop_search(DDI_DEV_T_ANY,
8683 			    DDI_VHCI_CLASS, DDI_PROP_TYPE_STRING,
8684 			    &dnp->dn_global_prop_ptr->prop_list)) != NULL &&
8685 			    strcmp(propp->prop_val, vhci_class) == 0) {
8686 
8687 				root_support = (i_ddi_prop_search(DDI_DEV_T_ANY,
8688 				    DDI_NO_ROOT_SUPPORT, DDI_PROP_TYPE_INT,
8689 				    &dnp->dn_global_prop_ptr->prop_list)
8690 				    == NULL) ? 1 : 0;
8691 
8692 				add_to_phci_list(driver_list, root_support_list,
8693 				    cur_elements, max_elements, dnp->dn_name,
8694 				    root_support);
8695 
8696 				UNLOCK_DEV_OPS(&dnp->dn_lock);
8697 			} else
8698 				UNLOCK_DEV_OPS(&dnp->dn_lock);
8699 		}
8700 	}
8701 
8702 	driver_conf_count = *cur_elements;
8703 
8704 	/* add the phci drivers specified in the built-in tables */
8705 	if (strcmp(vhci_class, MDI_HCI_CLASS_SCSI) == 0) {
8706 		st_driver_list = scsi_phci_driver_list;
8707 		st_ndrivers = sizeof (scsi_phci_driver_list) /
8708 		    sizeof (mdi_phci_driver_info_t);
8709 	} else if (strcmp(vhci_class, MDI_HCI_CLASS_IB) == 0) {
8710 		st_driver_list = ib_phci_driver_list;
8711 		st_ndrivers = sizeof (ib_phci_driver_list) /
8712 		    sizeof (mdi_phci_driver_info_t);
8713 	} else {
8714 		st_driver_list = NULL;
8715 		st_ndrivers = 0;
8716 	}
8717 
8718 	for (i = 0, p = st_driver_list; i < st_ndrivers; i++, p++) {
8719 		/* add this phci driver if not already added before */
8720 		for (j = 0; j < driver_conf_count; j++) {
8721 			if (strcmp((*driver_list)[j], p->phdriver_name) == 0)
8722 				break;
8723 		}
8724 		if (j == driver_conf_count) {
8725 			add_to_phci_list(driver_list, root_support_list,
8726 			    cur_elements, max_elements, p->phdriver_name,
8727 			    p->phdriver_root_support);
8728 		}
8729 	}
8730 }
8731 
8732 /*
8733  * Attach the phci driver instances associated with the specified vhci class.
8734  * If root is mounted attach all phci driver instances.
8735  * If root is not mounted, attach the instances of only those phci
8736  * drivers that have the root support.
8737  */
8738 static void
8739 attach_phci_drivers(char *vhci_class)
8740 {
8741 	char	**driver_list, **p;
8742 	int	*root_support_list;
8743 	int	cur_elements, max_elements, i;
8744 	major_t	m;
8745 
8746 	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
8747 	    &cur_elements, &max_elements);
8748 
8749 	for (i = 0; i < cur_elements; i++) {
8750 		if (modrootloaded || root_support_list[i]) {
8751 			m = ddi_name_to_major(driver_list[i]);
8752 			if (m != DDI_MAJOR_T_NONE &&
8753 			    ddi_hold_installed_driver(m))
8754 				ddi_rele_driver(m);
8755 		}
8756 	}
8757 
8758 	if (driver_list) {
8759 		for (i = 0, p = driver_list; i < cur_elements; i++, p++)
8760 			kmem_free(*p, strlen(*p) + 1);
8761 		kmem_free(driver_list, sizeof (char *) * max_elements);
8762 		kmem_free(root_support_list, sizeof (int) * max_elements);
8763 	}
8764 }
8765 
8766 /*
8767  * Build vhci cache:
8768  *
8769  * Attach phci driver instances and then drive BUS_CONFIG_ALL on
8770  * the phci driver instances. During this process the cache gets built.
8771  *
8772  * Cache is built fully if the root is mounted.
8773  * If the root is not mounted, phci drivers that do not have root support
8774  * are not attached. As a result the cache is built partially. The entries
8775  * in the cache reflect only those phci drivers that have root support.
8776  */
8777 static int
8778 build_vhci_cache(mdi_vhci_t *vh)
8779 {
8780 	mdi_vhci_config_t *vhc = vh->vh_config;
8781 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8782 
8783 	single_threaded_vhconfig_enter(vhc);
8784 
8785 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8786 	if (vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE) {
8787 		rw_exit(&vhcache->vhcache_lock);
8788 		single_threaded_vhconfig_exit(vhc);
8789 		return (0);
8790 	}
8791 	rw_exit(&vhcache->vhcache_lock);
8792 
8793 	attach_phci_drivers(vh->vh_class);
8794 	bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE | NDI_NO_EVENT,
8795 	    BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);
8796 
8797 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8798 	vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
8799 	rw_exit(&vhcache->vhcache_lock);
8800 
8801 	single_threaded_vhconfig_exit(vhc);
8802 	vhcache_dirty(vhc);
8803 	return (1);
8804 }
8805 
8806 /*
8807  * Determine if discovery of paths is needed.
8808  */
8809 static int
8810 vhcache_do_discovery(mdi_vhci_config_t *vhc)
8811 {
8812 	int rv = 1;
8813 
8814 	mutex_enter(&vhc->vhc_lock);
8815 	if (i_ddi_io_initialized() == 0) {
8816 		if (vhc->vhc_path_discovery_boot > 0) {
8817 			vhc->vhc_path_discovery_boot--;
8818 			goto out;
8819 		}
8820 	} else {
8821 		if (vhc->vhc_path_discovery_postboot > 0) {
8822 			vhc->vhc_path_discovery_postboot--;
8823 			goto out;
8824 		}
8825 	}
8826 
8827 	/*
8828 	 * Do full path discovery at most once per mdi_path_discovery_interval.
8829 	 * This is to avoid a series of full path discoveries when opening
8830 	 * stale /dev/[r]dsk links.
8831 	 */
8832 	if (mdi_path_discovery_interval != -1 &&
8833 	    lbolt64 >= vhc->vhc_path_discovery_cutoff_time)
8834 		goto out;
8835 
8836 	rv = 0;
8837 out:
8838 	mutex_exit(&vhc->vhc_lock);
8839 	return (rv);
8840 }
8841 
8842 /*
8843  * Discover all paths:
8844  *
8845  * Attach phci driver instances and then drive BUS_CONFIG_ALL on all the phci
8846  * driver instances. During this process all paths will be discovered.
8847  */
8848 static int
8849 vhcache_discover_paths(mdi_vhci_t *vh)
8850 {
8851 	mdi_vhci_config_t *vhc = vh->vh_config;
8852 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8853 	int rv = 0;
8854 
8855 	single_threaded_vhconfig_enter(vhc);
8856 
8857 	if (vhcache_do_discovery(vhc)) {
8858 		attach_phci_drivers(vh->vh_class);
8859 		bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE |
8860 		    NDI_NO_EVENT, BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);
8861 
8862 		mutex_enter(&vhc->vhc_lock);
8863 		vhc->vhc_path_discovery_cutoff_time = lbolt64 +
8864 		    mdi_path_discovery_interval * TICKS_PER_SECOND;
8865 		mutex_exit(&vhc->vhc_lock);
8866 		rv = 1;
8867 	}
8868 
8869 	single_threaded_vhconfig_exit(vhc);
8870 	return (rv);
8871 }
8872 
8873 /*
8874  * Generic vhci bus config implementation:
8875  *
8876  * Parameters
8877  *	vdip	vhci dip
8878  *	flags	bus config flags
8879  *	op	bus config operation
8880  *	The remaining parameters are bus config operation specific
8881  *
8882  * for BUS_CONFIG_ONE
8883  *	arg	pointer to name@addr
8884  *	child	upon successful return from this function, *child will be
8885  *		set to the configured and held devinfo child node of vdip.
8886  *	ct_addr	pointer to client address (i.e. GUID)
8887  *
8888  * for BUS_CONFIG_DRIVER
8889  *	arg	major number of the driver
8890  *	child and ct_addr parameters are ignored
8891  *
8892  * for BUS_CONFIG_ALL
8893  *	arg, child, and ct_addr parameters are ignored
8894  *
8895  * Note that for the rest of the bus config operations, this function simply
8896  * calls the framework provided default bus config routine.
8897  */
8898 int
8899 mdi_vhci_bus_config(dev_info_t *vdip, uint_t flags, ddi_bus_config_op_t op,
8900     void *arg, dev_info_t **child, char *ct_addr)
8901 {
8902 	mdi_vhci_t *vh = i_devi_get_vhci(vdip);
8903 	mdi_vhci_config_t *vhc = vh->vh_config;
8904 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8905 	int rv = 0;
8906 	int params_valid = 0;
8907 	char *cp;
8908 
8909 	/*
8910 	 * To bus config vhcis we relay operation, possibly using another
8911 	 * thread, to phcis. The phci driver then interacts with MDI to cause
8912 	 * vhci child nodes to be enumerated under the vhci node.  Adding a
8913 	 * vhci child requires an ndi_devi_enter of the vhci. Since another
8914 	 * thread may be adding the child, to avoid deadlock we can't wait
8915 	 * for the relayed operations to complete if we have already entered
8916 	 * the vhci node.
8917 	 */
8918 	if (DEVI_BUSY_OWNED(vdip)) {
8919 		MDI_DEBUG(2, (CE_NOTE, vdip, "!MDI: vhci bus config: "
8920 		    "vhci dip is busy owned %p\n", (void *)vdip));
8921 		goto default_bus_config;
8922 	}
8923 
8924 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8925 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
8926 		rw_exit(&vhcache->vhcache_lock);
8927 		rv = build_vhci_cache(vh);
8928 		rw_enter(&vhcache->vhcache_lock, RW_READER);
8929 	}
8930 
8931 	switch (op) {
8932 	case BUS_CONFIG_ONE:
8933 		if (arg != NULL && ct_addr != NULL) {
8934 			/* extract node name */
8935 			cp = (char *)arg;
8936 			while (*cp != '\0' && *cp != '@')
8937 				cp++;
8938 			if (*cp == '@') {
8939 				params_valid = 1;
8940 				*cp = '\0';
8941 				config_client_paths(vhc, (char *)arg, ct_addr);
8942 				/* config_client_paths() releases cache_lock */
8943 				*cp = '@';
8944 				break;
8945 			}
8946 		}
8947 
8948 		rw_exit(&vhcache->vhcache_lock);
8949 		break;
8950 
8951 	case BUS_CONFIG_DRIVER:
8952 		rw_exit(&vhcache->vhcache_lock);
8953 		if (rv == 0)
8954 			st_bus_config_all_phcis(vhc, flags, op,
8955 			    (major_t)(uintptr_t)arg);
8956 		break;
8957 
8958 	case BUS_CONFIG_ALL:
8959 		rw_exit(&vhcache->vhcache_lock);
8960 		if (rv == 0)
8961 			st_bus_config_all_phcis(vhc, flags, op, -1);
8962 		break;
8963 
8964 	default:
8965 		rw_exit(&vhcache->vhcache_lock);
8966 		break;
8967 	}
8968 
8969 
8970 default_bus_config:
8971 	/*
8972 	 * All requested child nodes are enumerated under the vhci.
8973 	 * Now configure them.
8974 	 */
8975 	if (ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
8976 	    NDI_SUCCESS) {
8977 		return (MDI_SUCCESS);
8978 	} else if (op == BUS_CONFIG_ONE && rv == 0 && params_valid) {
8979 		/* discover all paths and try configuring again */
8980 		if (vhcache_discover_paths(vh) &&
8981 		    ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
8982 		    NDI_SUCCESS)
8983 			return (MDI_SUCCESS);
8984 	}
8985 
8986 	return (MDI_FAILURE);
8987 }
8988 
8989 /*
8990  * Read the on-disk vhci cache into an nvlist for the specified vhci class.
8991  */
8992 static nvlist_t *
8993 read_on_disk_vhci_cache(char *vhci_class)
8994 {
8995 	nvlist_t *nvl;
8996 	int err;
8997 	char *filename;
8998 
8999 	filename = vhclass2vhcache_filename(vhci_class);
9000 
9001 	if ((err = fread_nvlist(filename, &nvl)) == 0) {
9002 		kmem_free(filename, strlen(filename) + 1);
9003 		return (nvl);
9004 	} else if (err == EIO)
9005 		cmn_err(CE_WARN, "%s: I/O error, will recreate\n", filename);
9006 	else if (err == EINVAL)
9007 		cmn_err(CE_WARN,
9008 		    "%s: data file corrupted, will recreate\n", filename);
9009 
9010 	kmem_free(filename, strlen(filename) + 1);
9011 	return (NULL);
9012 }
9013 
9014 /*
9015  * Read on-disk vhci cache into nvlists for all vhci classes.
9016  * Called during booting by i_ddi_read_devices_files().
9017  */
9018 void
9019 mdi_read_devices_files(void)
9020 {
9021 	int i;
9022 
9023 	for (i = 0; i < N_VHCI_CLASSES; i++)
9024 		vhcache_nvl[i] = read_on_disk_vhci_cache(vhci_class_list[i]);
9025 }
9026 
9027 /*
9028  * Remove all stale entries from vhci cache.
9029  */
9030 static void
9031 clean_vhcache(mdi_vhci_config_t *vhc)
9032 {
9033 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9034 	mdi_vhcache_phci_t *cphci, *cphci_head, *cphci_next;
9035 	mdi_vhcache_client_t *cct, *cct_head, *cct_next;
9036 	mdi_vhcache_pathinfo_t *cpi, *cpi_head, *cpi_next;
9037 
9038 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
9039 
9040 	cct_head = vhcache->vhcache_client_head;
9041 	vhcache->vhcache_client_head = vhcache->vhcache_client_tail = NULL;
9042 	for (cct = cct_head; cct != NULL; cct = cct_next) {
9043 		cct_next = cct->cct_next;
9044 
9045 		cpi_head = cct->cct_cpi_head;
9046 		cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
9047 		for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
9048 			cpi_next = cpi->cpi_next;
9049 			if (cpi->cpi_pip != NULL) {
9050 				ASSERT(cpi->cpi_cphci->cphci_phci != NULL);
9051 				enqueue_tail_vhcache_pathinfo(cct, cpi);
9052 			} else
9053 				free_vhcache_pathinfo(cpi);
9054 		}
9055 
9056 		if (cct->cct_cpi_head != NULL)
9057 			enqueue_vhcache_client(vhcache, cct);
9058 		else {
9059 			(void) mod_hash_destroy(vhcache->vhcache_client_hash,
9060 			    (mod_hash_key_t)cct->cct_name_addr);
9061 			free_vhcache_client(cct);
9062 		}
9063 	}
9064 
9065 	cphci_head = vhcache->vhcache_phci_head;
9066 	vhcache->vhcache_phci_head = vhcache->vhcache_phci_tail = NULL;
9067 	for (cphci = cphci_head; cphci != NULL; cphci = cphci_next) {
9068 		cphci_next = cphci->cphci_next;
9069 		if (cphci->cphci_phci != NULL)
9070 			enqueue_vhcache_phci(vhcache, cphci);
9071 		else
9072 			free_vhcache_phci(cphci);
9073 	}
9074 
9075 	vhcache->vhcache_clean_time = lbolt64;
9076 	rw_exit(&vhcache->vhcache_lock);
9077 	vhcache_dirty(vhc);
9078 }
9079 
9080 /*
9081  * Remove all stale entries from vhci cache.
9082  * Called by i_ddi_clean_devices_files() during the execution of devfsadm -C
9083  */
9084 void
9085 mdi_clean_vhcache(void)
9086 {
9087 	mdi_vhci_t *vh;
9088 
9089 	mutex_enter(&mdi_mutex);
9090 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9091 		vh->vh_refcnt++;
9092 		mutex_exit(&mdi_mutex);
9093 		clean_vhcache(vh->vh_config);
9094 		mutex_enter(&mdi_mutex);
9095 		vh->vh_refcnt--;
9096 	}
9097 	mutex_exit(&mdi_mutex);
9098 }
9099 
9100 /*
9101  * mdi_vhci_walk_clients():
9102  *		Walker routine to traverse client dev_info nodes
9103  * ddi_walk_devs(ddi_get_child(vdip), f, arg) returns the entire tree
9104  * below the client, including nexus devices, which we dont want.
9105  * So we just traverse the immediate siblings, starting from 1st client.
9106  */
9107 void
9108 mdi_vhci_walk_clients(dev_info_t *vdip,
9109     int (*f)(dev_info_t *, void *), void *arg)
9110 {
9111 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9112 	dev_info_t	*cdip;
9113 	mdi_client_t	*ct;
9114 
9115 	MDI_VHCI_CLIENT_LOCK(vh);
9116 	cdip = ddi_get_child(vdip);
9117 	while (cdip) {
9118 		ct = i_devi_get_client(cdip);
9119 		MDI_CLIENT_LOCK(ct);
9120 
9121 		if (((*f)(cdip, arg)) == DDI_WALK_CONTINUE)
9122 			cdip = ddi_get_next_sibling(cdip);
9123 		else
9124 			cdip = NULL;
9125 
9126 		MDI_CLIENT_UNLOCK(ct);
9127 	}
9128 	MDI_VHCI_CLIENT_UNLOCK(vh);
9129 }
9130 
9131 /*
9132  * mdi_vhci_walk_phcis():
9133  *		Walker routine to traverse phci dev_info nodes
9134  */
9135 void
9136 mdi_vhci_walk_phcis(dev_info_t *vdip,
9137     int (*f)(dev_info_t *, void *), void *arg)
9138 {
9139 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9140 	mdi_phci_t	*ph, *next;
9141 
9142 	MDI_VHCI_PHCI_LOCK(vh);
9143 	ph = vh->vh_phci_head;
9144 	while (ph) {
9145 		MDI_PHCI_LOCK(ph);
9146 
9147 		if (((*f)(ph->ph_dip, arg)) == DDI_WALK_CONTINUE)
9148 			next = ph->ph_next;
9149 		else
9150 			next = NULL;
9151 
9152 		MDI_PHCI_UNLOCK(ph);
9153 		ph = next;
9154 	}
9155 	MDI_VHCI_PHCI_UNLOCK(vh);
9156 }
9157 
9158 
9159 /*
9160  * mdi_walk_vhcis():
9161  *		Walker routine to traverse vhci dev_info nodes
9162  */
9163 void
9164 mdi_walk_vhcis(int (*f)(dev_info_t *, void *), void *arg)
9165 {
9166 	mdi_vhci_t	*vh = NULL;
9167 
9168 	mutex_enter(&mdi_mutex);
9169 	/*
9170 	 * Scan for already registered vhci
9171 	 */
9172 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9173 		vh->vh_refcnt++;
9174 		mutex_exit(&mdi_mutex);
9175 		if (((*f)(vh->vh_dip, arg)) != DDI_WALK_CONTINUE) {
9176 			mutex_enter(&mdi_mutex);
9177 			vh->vh_refcnt--;
9178 			break;
9179 		} else {
9180 			mutex_enter(&mdi_mutex);
9181 			vh->vh_refcnt--;
9182 		}
9183 	}
9184 
9185 	mutex_exit(&mdi_mutex);
9186 }
9187 
9188 /*
9189  * i_mdi_log_sysevent():
9190  *		Logs events for pickup by syseventd
9191  */
9192 static void
9193 i_mdi_log_sysevent(dev_info_t *dip, char *ph_vh_class, char *subclass)
9194 {
9195 	char		*path_name;
9196 	nvlist_t	*attr_list;
9197 
9198 	if (nvlist_alloc(&attr_list, NV_UNIQUE_NAME_TYPE,
9199 	    KM_SLEEP) != DDI_SUCCESS) {
9200 		goto alloc_failed;
9201 	}
9202 
9203 	path_name = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
9204 	(void) ddi_pathname(dip, path_name);
9205 
9206 	if (nvlist_add_string(attr_list, DDI_DRIVER_NAME,
9207 	    ddi_driver_name(dip)) != DDI_SUCCESS) {
9208 		goto error;
9209 	}
9210 
9211 	if (nvlist_add_int32(attr_list, DDI_DRIVER_MAJOR,
9212 	    (int32_t)ddi_driver_major(dip)) != DDI_SUCCESS) {
9213 		goto error;
9214 	}
9215 
9216 	if (nvlist_add_int32(attr_list, DDI_INSTANCE,
9217 	    (int32_t)ddi_get_instance(dip)) != DDI_SUCCESS) {
9218 		goto error;
9219 	}
9220 
9221 	if (nvlist_add_string(attr_list, DDI_PATHNAME,
9222 	    path_name) != DDI_SUCCESS) {
9223 		goto error;
9224 	}
9225 
9226 	if (nvlist_add_string(attr_list, DDI_CLASS,
9227 	    ph_vh_class) != DDI_SUCCESS) {
9228 		goto error;
9229 	}
9230 
9231 	(void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW, EC_DDI, subclass,
9232 	    attr_list, NULL, DDI_SLEEP);
9233 
9234 error:
9235 	kmem_free(path_name, MAXPATHLEN);
9236 	nvlist_free(attr_list);
9237 	return;
9238 
9239 alloc_failed:
9240 	MDI_DEBUG(1, (CE_WARN, dip,
9241 	    "!i_mdi_log_sysevent: Unable to send sysevent"));
9242 }
9243 
9244 char **
9245 mdi_get_phci_driver_list(char *vhci_class, int	*ndrivers)
9246 {
9247 	char	**driver_list, **ret_driver_list = NULL;
9248 	int	*root_support_list;
9249 	int	cur_elements, max_elements;
9250 
9251 	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
9252 	    &cur_elements, &max_elements);
9253 
9254 
9255 	if (driver_list) {
9256 		kmem_free(root_support_list, sizeof (int) * max_elements);
9257 		ret_driver_list = mdi_realloc(driver_list, sizeof (char *)
9258 		    * max_elements, sizeof (char *) * cur_elements);
9259 	}
9260 	*ndrivers = cur_elements;
9261 
9262 	return (ret_driver_list);
9263 
9264 }
9265 
/*
 * Free a driver name list of ndrivers entries: every name string first,
 * then the array itself.  A NULL driver_list is accepted and ignored.
 */
void
mdi_free_phci_driver_list(char **driver_list, int ndrivers)
{
	int	idx;

	if (driver_list == NULL)
		return;

	for (idx = 0; idx < ndrivers; idx++)
		kmem_free(driver_list[idx], strlen(driver_list[idx]) + 1);
	kmem_free(driver_list, sizeof (char *) * ndrivers);
}
9278