xref: /titanic_50/usr/src/uts/common/os/sunmdi.c (revision cbcb6089bf49be7bed77b8c9c1727b26f2e9c913)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more
30  * detailed discussion of the overall mpxio architecture.
31  *
32  * Default locking order:
33  *
34  * _NOTE(LOCK_ORDER(mdi_mutex, mdi_phci::ph_mutex))
35  * _NOTE(LOCK_ORDER(mdi_mutex, mdi_client::ct_mutex))
36  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
37  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
38  * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
39  */
40 
41 #include <sys/note.h>
42 #include <sys/types.h>
43 #include <sys/varargs.h>
44 #include <sys/param.h>
45 #include <sys/errno.h>
46 #include <sys/uio.h>
47 #include <sys/buf.h>
48 #include <sys/modctl.h>
49 #include <sys/open.h>
50 #include <sys/kmem.h>
51 #include <sys/poll.h>
52 #include <sys/conf.h>
53 #include <sys/bootconf.h>
54 #include <sys/cmn_err.h>
55 #include <sys/stat.h>
56 #include <sys/ddi.h>
57 #include <sys/sunddi.h>
58 #include <sys/ddipropdefs.h>
59 #include <sys/sunndi.h>
60 #include <sys/ndi_impldefs.h>
61 #include <sys/promif.h>
62 #include <sys/sunmdi.h>
63 #include <sys/mdi_impldefs.h>
64 #include <sys/taskq.h>
65 #include <sys/epm.h>
66 #include <sys/sunpm.h>
67 #include <sys/modhash.h>
68 
/*
 * MDI_DEBUG(level, stmnt):
 *	Debug logging hook.  'stmnt' is a fully parenthesized i_mdi_log()
 *	argument list, e.g.
 *		MDI_DEBUG(1, (CE_WARN, dip, "!message %d", value));
 *	On DEBUG kernels the message is logged when mdi_debug >= level; on
 *	non-DEBUG kernels the macro expands to nothing.
 *
 *	The DEBUG expansion is wrapped in do { } while (0) so that the macro
 *	behaves as a single statement.  A bare "if" expansion would capture
 *	the "else" of an unbraced caller such as
 *		if (cond)
 *			MDI_DEBUG(...);
 *		else
 *			...
 */
#ifdef	DEBUG
#include <sys/debug.h>
int	mdi_debug = 1;
static void i_mdi_log(int, dev_info_t *, const char *fmt, ...);
#define	MDI_DEBUG(level, stmnt) \
	do { \
		if (mdi_debug >= (level)) \
			i_mdi_log stmnt; \
		_NOTE(CONSTCOND) \
	} while (0)
#else	/* !DEBUG */
#define	MDI_DEBUG(level, stmnt)
#endif	/* DEBUG */
78 
79 extern pri_t	minclsyspri;
80 extern int	modrootloaded;
81 
82 /*
83  * Global mutex:
84  * Protects vHCI list and structure members, pHCI and Client lists.
85  */
86 kmutex_t	mdi_mutex;
87 
88 /*
89  * Registered vHCI class driver lists
90  */
91 int		mdi_vhci_count;
92 mdi_vhci_t	*mdi_vhci_head;
93 mdi_vhci_t	*mdi_vhci_tail;
94 
95 /*
96  * Client Hash Table size
97  */
98 static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;
99 
100 /*
101  * taskq interface definitions
102  */
103 #define	MDI_TASKQ_N_THREADS	8
104 #define	MDI_TASKQ_PRI		minclsyspri
105 #define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
106 #define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)
107 
108 taskq_t				*mdi_taskq;
109 static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;
110 
111 #define	TICKS_PER_SECOND	(drv_usectohz(1000000))
112 
113 /*
114  * The data should be "quiet" for this interval (in seconds) before the
115  * vhci cached data is flushed to the disk.
116  */
117 static int mdi_vhcache_flush_delay = 10;
118 
119 /* number of seconds the vhcache flush daemon will sleep idle before exiting */
120 static int mdi_vhcache_flush_daemon_idle_time = 60;
121 
122 /*
123  * number of seconds the asynchronous configuration thread will sleep idle
124  * before exiting.
125  */
126 static int mdi_async_config_idle_time = 600;
127 
128 static int mdi_bus_config_cache_hash_size = 256;
129 
130 /* turns off multithreaded configuration for certain operations */
131 static int mdi_mtc_off = 0;
132 
133 /*
134  * MDI component property name/value string definitions
135  */
136 const char 		*mdi_component_prop = "mpxio-component";
137 const char		*mdi_component_prop_vhci = "vhci";
138 const char		*mdi_component_prop_phci = "phci";
139 const char		*mdi_component_prop_client = "client";
140 
141 /*
142  * MDI client global unique identifier property name
143  */
144 const char		*mdi_client_guid_prop = "client-guid";
145 
146 /*
147  * MDI client load balancing property name/value string definitions
148  */
149 const char		*mdi_load_balance = "load-balance";
150 const char		*mdi_load_balance_none = "none";
151 const char		*mdi_load_balance_rr = "round-robin";
152 const char		*mdi_load_balance_lba = "logical-block";
153 
154 /*
155  * Obsolete vHCI class definition; to be removed after Leadville update
156  */
157 const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;
158 
159 static char vhci_greeting[] =
160 	"\tThere already exists one vHCI driver for class %s\n"
161 	"\tOnly one vHCI driver for each class is allowed\n";
162 
163 /*
164  * Static function prototypes
165  */
166 static int		i_mdi_phci_offline(dev_info_t *, uint_t);
167 static int		i_mdi_client_offline(dev_info_t *, uint_t);
168 static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
169 static void		i_mdi_phci_post_detach(dev_info_t *,
170 			    ddi_detach_cmd_t, int);
171 static int		i_mdi_client_pre_detach(dev_info_t *,
172 			    ddi_detach_cmd_t);
173 static void		i_mdi_client_post_detach(dev_info_t *,
174 			    ddi_detach_cmd_t, int);
175 static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
176 static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
177 static int 		i_mdi_lba_lb(mdi_client_t *ct,
178 			    mdi_pathinfo_t **ret_pip, struct buf *buf);
179 static void		i_mdi_pm_hold_client(mdi_client_t *, int);
180 static void		i_mdi_pm_rele_client(mdi_client_t *, int);
181 static void		i_mdi_pm_reset_client(mdi_client_t *);
182 static void		i_mdi_pm_hold_all_phci(mdi_client_t *);
183 static int		i_mdi_power_all_phci(mdi_client_t *);
184 
185 
186 /*
187  * Internal mdi_pathinfo node functions
188  */
189 static int		i_mdi_pi_kstat_create(mdi_pathinfo_t *);
190 static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);
191 
192 static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
193 static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
194 static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
195 static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
196 static void		i_mdi_phci_get_client_lock(mdi_phci_t *,
197 			    mdi_client_t *);
198 static void		i_mdi_phci_unlock(mdi_phci_t *);
199 static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
200 static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
201 static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
202 static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
203 			    mdi_client_t *);
204 static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
205 static void		i_mdi_client_remove_path(mdi_client_t *,
206 			    mdi_pathinfo_t *);
207 
208 static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
209 			    mdi_pathinfo_state_t, int);
210 static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
211 static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
212 			    char **, int);
213 static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
214 static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
215 static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
216 static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
217 static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
218 static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
219 static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
220 static void		i_mdi_client_update_state(mdi_client_t *);
221 static int		i_mdi_client_compute_state(mdi_client_t *,
222 			    mdi_phci_t *);
223 static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
224 static void		i_mdi_client_unlock(mdi_client_t *);
225 static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
226 static mdi_client_t	*i_devi_get_client(dev_info_t *);
227 static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *, int,
228 			int);
229 /*
230  * Failover related function prototypes
231  */
232 static int		i_mdi_failover(void *);
233 
234 /*
235  * misc internal functions
236  */
237 static int		i_mdi_get_hash_key(char *);
238 static int		i_map_nvlist_error_to_mdi(int);
239 static void		i_mdi_report_path_state(mdi_client_t *,
240 			    mdi_pathinfo_t *);
241 
242 static void		setup_vhci_cache(mdi_vhci_t *);
243 static int		destroy_vhci_cache(mdi_vhci_t *);
244 static void		setup_phci_driver_list(mdi_vhci_t *);
245 static void		free_phci_driver_list(mdi_vhci_config_t *);
246 static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
247 static boolean_t	stop_vhcache_flush_thread(void *, int);
248 static void		free_string_array(char **, int);
249 static void		free_vhcache_phci(mdi_vhcache_phci_t *);
250 static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
251 static void		free_vhcache_client(mdi_vhcache_client_t *);
252 static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
253 static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
254 static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
255 static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
256 static void		vhcache_pi_add(mdi_vhci_config_t *,
257 			    struct mdi_pathinfo *);
258 static void		vhcache_pi_remove(mdi_vhci_config_t *,
259 			    struct mdi_pathinfo *);
260 static void		free_phclient_path_list(mdi_phys_path_t *);
261 static void		sort_vhcache_paths(mdi_vhcache_client_t *);
262 static int		flush_vhcache(mdi_vhci_config_t *, int);
263 static void		vhcache_dirty(mdi_vhci_config_t *);
264 static void		free_async_client_config(mdi_async_client_config_t *);
265 static nvlist_t		*read_on_disk_vhci_cache(char *);
266 extern int		fread_nvlist(char *, nvlist_t **);
267 extern int		fwrite_nvlist(char *, nvlist_t *);
268 
269 /* called once when first vhci registers with mdi */
270 static void
271 i_mdi_init()
272 {
273 	static int initialized = 0;
274 
275 	if (initialized)
276 		return;
277 	initialized = 1;
278 
279 	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);
280 	/*
281 	 * Create our taskq resources
282 	 */
283 	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
284 	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
285 	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
286 	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */
287 }
288 
289 /*
290  * mdi_get_component_type():
291  *		Return mpxio component type
292  * Return Values:
293  *		MDI_COMPONENT_NONE
294  *		MDI_COMPONENT_VHCI
295  *		MDI_COMPONENT_PHCI
296  *		MDI_COMPONENT_CLIENT
297  * XXX This doesn't work under multi-level MPxIO and should be
298  *	removed when clients migrate mdi_is_*() interfaces.
299  */
300 int
301 mdi_get_component_type(dev_info_t *dip)
302 {
303 	return (DEVI(dip)->devi_mdi_component);
304 }
305 
306 /*
307  * mdi_vhci_register():
308  *		Register a vHCI module with the mpxio framework
309  *		mdi_vhci_register() is called by vHCI drivers to register the
310  *		'class_driver' vHCI driver and its MDI entrypoints with the
311  *		mpxio framework.  The vHCI driver must call this interface as
312  *		part of its attach(9e) handler.
313  *		Competing threads may try to attach mdi_vhci_register() as
314  *		the vHCI drivers are loaded and attached as a result of pHCI
315  *		driver instance registration (mdi_phci_register()) with the
316  *		framework.
317  * Return Values:
318  *		MDI_SUCCESS
319  *		MDI_FAILURE
320  */
321 
322 /*ARGSUSED*/
323 int
324 mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
325     int flags)
326 {
327 	mdi_vhci_t		*vh = NULL;
328 
329 	ASSERT(vops->vo_revision == MDI_VHCI_OPS_REV);
330 
331 	i_mdi_init();
332 
333 	mutex_enter(&mdi_mutex);
334 	/*
335 	 * Scan for already registered vhci
336 	 */
337 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
338 		if (strcmp(vh->vh_class, class) == 0) {
339 			/*
340 			 * vHCI has already been created.  Check for valid
341 			 * vHCI ops registration.  We only support one vHCI
342 			 * module per class
343 			 */
344 			if (vh->vh_ops != NULL) {
345 				mutex_exit(&mdi_mutex);
346 				cmn_err(CE_NOTE, vhci_greeting, class);
347 				return (MDI_FAILURE);
348 			}
349 			break;
350 		}
351 	}
352 
353 	/*
354 	 * if not yet created, create the vHCI component
355 	 */
356 	if (vh == NULL) {
357 		struct client_hash	*hash = NULL;
358 		char			*load_balance;
359 
360 		/*
361 		 * Allocate and initialize the mdi extensions
362 		 */
363 		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
364 		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
365 		    KM_SLEEP);
366 		vh->vh_client_table = hash;
367 		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
368 		(void) strcpy(vh->vh_class, class);
369 		vh->vh_lb = LOAD_BALANCE_RR;
370 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
371 		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
372 			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
373 				vh->vh_lb = LOAD_BALANCE_NONE;
374 			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
375 				    == 0) {
376 				vh->vh_lb = LOAD_BALANCE_LBA;
377 			}
378 			ddi_prop_free(load_balance);
379 		}
380 
381 		/*
382 		 * Store the vHCI ops vectors
383 		 */
384 		vh->vh_dip = vdip;
385 		vh->vh_ops = vops;
386 
387 		setup_vhci_cache(vh);
388 
389 		if (mdi_vhci_head == NULL) {
390 			mdi_vhci_head = vh;
391 		}
392 		if (mdi_vhci_tail) {
393 			mdi_vhci_tail->vh_next = vh;
394 		}
395 		mdi_vhci_tail = vh;
396 		mdi_vhci_count++;
397 	}
398 
399 	/*
400 	 * Claim the devfs node as a vhci component
401 	 */
402 	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;
403 
404 	/*
405 	 * Initialize our back reference from dev_info node
406 	 */
407 	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
408 	mutex_exit(&mdi_mutex);
409 	return (MDI_SUCCESS);
410 }
411 
412 /*
413  * mdi_vhci_unregister():
414  *		Unregister a vHCI module from mpxio framework
415  *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
416  * 		of a vhci to unregister it from the framework.
417  * Return Values:
418  *		MDI_SUCCESS
419  *		MDI_FAILURE
420  */
421 
422 /*ARGSUSED*/
423 int
424 mdi_vhci_unregister(dev_info_t *vdip, int flags)
425 {
426 	mdi_vhci_t	*found, *vh, *prev = NULL;
427 
428 	/*
429 	 * Check for invalid VHCI
430 	 */
431 	if ((vh = i_devi_get_vhci(vdip)) == NULL)
432 		return (MDI_FAILURE);
433 
434 	mutex_enter(&mdi_mutex);
435 
436 	/*
437 	 * Scan the list of registered vHCIs for a match
438 	 */
439 	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
440 		if (found == vh)
441 			break;
442 		prev = found;
443 	}
444 
445 	if (found == NULL) {
446 		mutex_exit(&mdi_mutex);
447 		return (MDI_FAILURE);
448 	}
449 
450 	/*
451 	 * Check the pHCI and client count. All the pHCIs and clients
452 	 * should have been unregistered, before a vHCI can be
453 	 * unregistered.
454 	 */
455 	if (vh->vh_phci_count || vh->vh_client_count || vh->vh_refcnt) {
456 		mutex_exit(&mdi_mutex);
457 		return (MDI_FAILURE);
458 	}
459 
460 	/*
461 	 * Remove the vHCI from the global list
462 	 */
463 	if (vh == mdi_vhci_head) {
464 		mdi_vhci_head = vh->vh_next;
465 	} else {
466 		prev->vh_next = vh->vh_next;
467 	}
468 	if (vh == mdi_vhci_tail) {
469 		mdi_vhci_tail = prev;
470 	}
471 
472 	mdi_vhci_count--;
473 	mutex_exit(&mdi_mutex);
474 
475 	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
476 		/* add vhci to the global list */
477 		mutex_enter(&mdi_mutex);
478 		if (mdi_vhci_head == NULL)
479 			mdi_vhci_head = vh;
480 		else
481 			mdi_vhci_tail->vh_next = vh;
482 		mdi_vhci_tail = vh;
483 		mdi_vhci_count++;
484 		mutex_exit(&mdi_mutex);
485 		return (MDI_FAILURE);
486 	}
487 
488 	vh->vh_ops = NULL;
489 	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
490 	DEVI(vdip)->devi_mdi_xhci = NULL;
491 	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
492 	kmem_free(vh->vh_client_table,
493 	    mdi_client_table_size * sizeof (struct client_hash));
494 	kmem_free(vh, sizeof (mdi_vhci_t));
495 	return (MDI_SUCCESS);
496 }
497 
498 /*
499  * i_mdi_vhci_class2vhci():
500  *		Look for a matching vHCI module given a vHCI class name
501  * Return Values:
502  *		Handle to a vHCI component
503  *		NULL
504  */
505 static mdi_vhci_t *
506 i_mdi_vhci_class2vhci(char *class)
507 {
508 	mdi_vhci_t	*vh = NULL;
509 
510 	ASSERT(!MUTEX_HELD(&mdi_mutex));
511 
512 	mutex_enter(&mdi_mutex);
513 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
514 		if (strcmp(vh->vh_class, class) == 0) {
515 			break;
516 		}
517 	}
518 	mutex_exit(&mdi_mutex);
519 	return (vh);
520 }
521 
522 /*
523  * i_devi_get_vhci():
524  *		Utility function to get the handle to a vHCI component
525  * Return Values:
526  *		Handle to a vHCI component
527  *		NULL
528  */
529 mdi_vhci_t *
530 i_devi_get_vhci(dev_info_t *vdip)
531 {
532 	mdi_vhci_t	*vh = NULL;
533 	if (MDI_VHCI(vdip)) {
534 		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
535 	}
536 	return (vh);
537 }
538 
539 /*
540  * mdi_phci_register():
541  *		Register a pHCI module with mpxio framework
542  *		mdi_phci_register() is called by pHCI drivers to register with
543  *		the mpxio framework and a specific 'class_driver' vHCI.  The
544  *		pHCI driver must call this interface as part of its attach(9e)
545  *		handler.
546  * Return Values:
547  *		MDI_SUCCESS
548  *		MDI_FAILURE
549  */
550 
551 /*ARGSUSED*/
552 int
553 mdi_phci_register(char *class, dev_info_t *pdip, int flags)
554 {
555 	mdi_phci_t		*ph;
556 	mdi_vhci_t		*vh;
557 	char			*data;
558 	char			*pathname;
559 
560 	pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
561 	(void) ddi_pathname(pdip, pathname);
562 
563 	/*
564 	 * Check for mpxio-disable property. Enable mpxio if the property is
565 	 * missing or not set to "yes".
566 	 * If the property is set to "yes" then emit a brief message.
567 	 */
568 	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
569 	    &data) == DDI_SUCCESS)) {
570 		if (strcmp(data, "yes") == 0) {
571 			MDI_DEBUG(1, (CE_CONT, pdip,
572 			    "?%s (%s%d) multipath capabilities "
573 			    "disabled via %s.conf.\n", pathname,
574 			    ddi_driver_name(pdip), ddi_get_instance(pdip),
575 			    ddi_driver_name(pdip)));
576 			ddi_prop_free(data);
577 			kmem_free(pathname, MAXPATHLEN);
578 			return (MDI_FAILURE);
579 		}
580 		ddi_prop_free(data);
581 	}
582 
583 	kmem_free(pathname, MAXPATHLEN);
584 
585 	/*
586 	 * Search for a matching vHCI
587 	 */
588 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
589 	if (vh == NULL) {
590 		return (MDI_FAILURE);
591 	}
592 
593 	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
594 	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
595 	ph->ph_dip = pdip;
596 	ph->ph_vhci = vh;
597 	ph->ph_next = NULL;
598 	ph->ph_unstable = 0;
599 	ph->ph_vprivate = 0;
600 	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);
601 	cv_init(&ph->ph_powerchange_cv, NULL, CV_DRIVER, NULL);
602 
603 	MDI_PHCI_SET_POWER_UP(ph);
604 	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
605 	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;
606 
607 	vhcache_phci_add(vh->vh_config, ph);
608 
609 	mutex_enter(&mdi_mutex);
610 	if (vh->vh_phci_head == NULL) {
611 		vh->vh_phci_head = ph;
612 	}
613 	if (vh->vh_phci_tail) {
614 		vh->vh_phci_tail->ph_next = ph;
615 	}
616 	vh->vh_phci_tail = ph;
617 	vh->vh_phci_count++;
618 	mutex_exit(&mdi_mutex);
619 	return (MDI_SUCCESS);
620 }
621 
622 /*
623  * mdi_phci_unregister():
624  *		Unregister a pHCI module from mpxio framework
625  *		mdi_phci_unregister() is called by the pHCI drivers from their
626  *		detach(9E) handler to unregister their instances from the
627  *		framework.
628  * Return Values:
629  *		MDI_SUCCESS
630  *		MDI_FAILURE
631  */
632 
633 /*ARGSUSED*/
634 int
635 mdi_phci_unregister(dev_info_t *pdip, int flags)
636 {
637 	mdi_vhci_t		*vh;
638 	mdi_phci_t		*ph;
639 	mdi_phci_t		*tmp;
640 	mdi_phci_t		*prev = NULL;
641 
642 	ph = i_devi_get_phci(pdip);
643 	if (ph == NULL) {
644 		MDI_DEBUG(1, (CE_WARN, pdip,
645 		    "!pHCI unregister: Not a valid pHCI"));
646 		return (MDI_FAILURE);
647 	}
648 
649 	vh = ph->ph_vhci;
650 	ASSERT(vh != NULL);
651 	if (vh == NULL) {
652 		MDI_DEBUG(1, (CE_WARN, pdip,
653 		    "!pHCI unregister: Not a valid vHCI"));
654 		return (MDI_FAILURE);
655 	}
656 
657 	mutex_enter(&mdi_mutex);
658 	tmp = vh->vh_phci_head;
659 	while (tmp) {
660 		if (tmp == ph) {
661 			break;
662 		}
663 		prev = tmp;
664 		tmp = tmp->ph_next;
665 	}
666 
667 	if (ph == vh->vh_phci_head) {
668 		vh->vh_phci_head = ph->ph_next;
669 	} else {
670 		prev->ph_next = ph->ph_next;
671 	}
672 
673 	if (ph == vh->vh_phci_tail) {
674 		vh->vh_phci_tail = prev;
675 	}
676 
677 	vh->vh_phci_count--;
678 
679 	mutex_exit(&mdi_mutex);
680 
681 	vhcache_phci_remove(vh->vh_config, ph);
682 	cv_destroy(&ph->ph_unstable_cv);
683 	cv_destroy(&ph->ph_powerchange_cv);
684 	mutex_destroy(&ph->ph_mutex);
685 	kmem_free(ph, sizeof (mdi_phci_t));
686 	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
687 	DEVI(pdip)->devi_mdi_xhci = NULL;
688 	return (MDI_SUCCESS);
689 }
690 
691 /*
692  * i_devi_get_phci():
693  * 		Utility function to return the phci extensions.
694  */
695 static mdi_phci_t *
696 i_devi_get_phci(dev_info_t *pdip)
697 {
698 	mdi_phci_t	*ph = NULL;
699 	if (MDI_PHCI(pdip)) {
700 		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
701 	}
702 	return (ph);
703 }
704 
705 /*
706  * mdi_phci_path2devinfo():
707  * 		Utility function to search for a valid phci device given
708  *		the devfs pathname.
709  */
710 
711 dev_info_t *
712 mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
713 {
714 	char		*temp_pathname;
715 	mdi_vhci_t	*vh;
716 	mdi_phci_t	*ph;
717 	dev_info_t 	*pdip = NULL;
718 
719 	vh = i_devi_get_vhci(vdip);
720 	ASSERT(vh != NULL);
721 
722 	if (vh == NULL) {
723 		/*
724 		 * Invalid vHCI component, return failure
725 		 */
726 		return (NULL);
727 	}
728 
729 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
730 	mutex_enter(&mdi_mutex);
731 	ph = vh->vh_phci_head;
732 	while (ph != NULL) {
733 		pdip = ph->ph_dip;
734 		ASSERT(pdip != NULL);
735 		*temp_pathname = '\0';
736 		(void) ddi_pathname(pdip, temp_pathname);
737 		if (strcmp(temp_pathname, pathname) == 0) {
738 			break;
739 		}
740 		ph = ph->ph_next;
741 	}
742 	if (ph == NULL) {
743 		pdip = NULL;
744 	}
745 	mutex_exit(&mdi_mutex);
746 	kmem_free(temp_pathname, MAXPATHLEN);
747 	return (pdip);
748 }
749 
750 /*
751  * mdi_phci_get_path_count():
752  * 		get number of path information nodes associated with a given
753  *		pHCI device.
754  */
755 int
756 mdi_phci_get_path_count(dev_info_t *pdip)
757 {
758 	mdi_phci_t	*ph;
759 	int		count = 0;
760 
761 	ph = i_devi_get_phci(pdip);
762 	if (ph != NULL) {
763 		count = ph->ph_path_count;
764 	}
765 	return (count);
766 }
767 
768 /*
769  * i_mdi_phci_lock():
770  *		Lock a pHCI device
771  * Return Values:
772  *		None
773  * Note:
774  *		The default locking order is:
775  *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
776  *		But there are number of situations where locks need to be
777  *		grabbed in reverse order.  This routine implements try and lock
778  *		mechanism depending on the requested parameter option.
779  */
780 static void
781 i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
782 {
783 	if (pip) {
784 		/* Reverse locking is requested. */
785 		while (MDI_PHCI_TRYLOCK(ph) == 0) {
786 			/*
787 			 * tryenter failed. Try to grab again
788 			 * after a small delay
789 			 */
790 			MDI_PI_HOLD(pip);
791 			MDI_PI_UNLOCK(pip);
792 			delay(1);
793 			MDI_PI_LOCK(pip);
794 			MDI_PI_RELE(pip);
795 		}
796 	} else {
797 		MDI_PHCI_LOCK(ph);
798 	}
799 }
800 
801 /*
802  * i_mdi_phci_get_client_lock():
803  *		Lock a pHCI device
804  * Return Values:
805  *		None
806  * Note:
807  *		The default locking order is:
808  *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
809  *		But there are number of situations where locks need to be
810  *		grabbed in reverse order.  This routine implements try and lock
811  *		mechanism depending on the requested parameter option.
812  */
813 static void
814 i_mdi_phci_get_client_lock(mdi_phci_t *ph, mdi_client_t *ct)
815 {
816 	if (ct) {
817 		/* Reverse locking is requested. */
818 		while (MDI_PHCI_TRYLOCK(ph) == 0) {
819 			/*
820 			 * tryenter failed. Try to grab again
821 			 * after a small delay
822 			 */
823 			MDI_CLIENT_UNLOCK(ct);
824 			delay(1);
825 			MDI_CLIENT_LOCK(ct);
826 		}
827 	} else {
828 		MDI_PHCI_LOCK(ph);
829 	}
830 }
831 
832 /*
833  * i_mdi_phci_unlock():
834  *		Unlock the pHCI component
835  */
836 static void
837 i_mdi_phci_unlock(mdi_phci_t *ph)
838 {
839 	MDI_PHCI_UNLOCK(ph);
840 }
841 
842 /*
843  * i_mdi_devinfo_create():
844  *		create client device's devinfo node
845  * Return Values:
846  *		dev_info
847  *		NULL
848  * Notes:
849  */
850 static dev_info_t *
851 i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
852 	char **compatible, int ncompatible)
853 {
854 	dev_info_t *cdip = NULL;
855 
856 	ASSERT(MUTEX_HELD(&mdi_mutex));
857 
858 	/* Verify for duplicate entry */
859 	cdip = i_mdi_devinfo_find(vh, name, guid);
860 	ASSERT(cdip == NULL);
861 	if (cdip) {
862 		cmn_err(CE_WARN,
863 		    "i_mdi_devinfo_create: client dip %p already exists",
864 			(void *)cdip);
865 	}
866 
867 	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
868 	if (cdip == NULL)
869 		goto fail;
870 
871 	/*
872 	 * Create component type and Global unique identifier
873 	 * properties
874 	 */
875 	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
876 	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
877 		goto fail;
878 	}
879 
880 	/* Decorate the node with compatible property */
881 	if (compatible &&
882 	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
883 	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
884 		goto fail;
885 	}
886 
887 	return (cdip);
888 
889 fail:
890 	if (cdip) {
891 		(void) ndi_prop_remove_all(cdip);
892 		(void) ndi_devi_free(cdip);
893 	}
894 	return (NULL);
895 }
896 
897 /*
898  * i_mdi_devinfo_find():
899  *		Find a matching devinfo node for given client node name
900  *		and its guid.
901  * Return Values:
902  *		Handle to a dev_info node or NULL
903  */
904 
905 static dev_info_t *
906 i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
907 {
908 	char			*data;
909 	dev_info_t 		*cdip = NULL;
910 	dev_info_t 		*ndip = NULL;
911 	int			circular;
912 
913 	ndi_devi_enter(vh->vh_dip, &circular);
914 	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
915 	while ((cdip = ndip) != NULL) {
916 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
917 
918 		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
919 			continue;
920 		}
921 
922 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
923 		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
924 		    &data) != DDI_PROP_SUCCESS) {
925 			continue;
926 		}
927 
928 		if (strcmp(data, guid) != 0) {
929 			ddi_prop_free(data);
930 			continue;
931 		}
932 		ddi_prop_free(data);
933 		break;
934 	}
935 	ndi_devi_exit(vh->vh_dip, circular);
936 	return (cdip);
937 }
938 
939 /*
940  * i_mdi_devinfo_remove():
941  *		Remove a client device node
942  */
943 static int
944 i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
945 {
946 	int	rv = MDI_SUCCESS;
947 	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
948 	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
949 		rv = ndi_devi_offline(cdip, NDI_DEVI_REMOVE);
950 		if (rv != NDI_SUCCESS) {
951 			MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_devinfo_remove:"
952 			    " failed. cdip = %p\n", cdip));
953 		}
954 		/*
955 		 * Convert to MDI error code
956 		 */
957 		switch (rv) {
958 		case NDI_SUCCESS:
959 			rv = MDI_SUCCESS;
960 			break;
961 		case NDI_BUSY:
962 			rv = MDI_BUSY;
963 			break;
964 		default:
965 			rv = MDI_FAILURE;
966 			break;
967 		}
968 	}
969 	return (rv);
970 }
971 
972 /*
973  * i_devi_get_client()
974  *		Utility function to get mpxio component extensions
975  */
976 static mdi_client_t *
977 i_devi_get_client(dev_info_t *cdip)
978 {
979 	mdi_client_t	*ct = NULL;
980 	if (MDI_CLIENT(cdip)) {
981 		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
982 	}
983 	return (ct);
984 }
985 
986 /*
987  * i_mdi_is_child_present():
988  *		Search for the presence of client device dev_info node
989  */
990 
991 static int
992 i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
993 {
994 	int		rv = MDI_FAILURE;
995 	struct dev_info	*dip;
996 	int		circular;
997 
998 	ndi_devi_enter(vdip, &circular);
999 	dip = DEVI(vdip)->devi_child;
1000 	while (dip) {
1001 		if (dip == DEVI(cdip)) {
1002 			rv = MDI_SUCCESS;
1003 			break;
1004 		}
1005 		dip = dip->devi_sibling;
1006 	}
1007 	ndi_devi_exit(vdip, circular);
1008 	return (rv);
1009 }
1010 
1011 
1012 /*
1013  * i_mdi_client_lock():
1014  *		Grab client component lock
1015  * Return Values:
1016  *		None
1017  * Note:
1018  *		The default locking order is:
1019  *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
1020  *		But there are number of situations where locks need to be
1021  *		grabbed in reverse order.  This routine implements try and lock
1022  *		mechanism depending on the requested parameter option.
1023  */
1024 
1025 static void
1026 i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
1027 {
1028 	if (pip) {
1029 		/*
1030 		 * Reverse locking is requested.
1031 		 */
1032 		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
1033 			/*
1034 			 * tryenter failed. Try to grab again
1035 			 * after a small delay
1036 			 */
1037 			MDI_PI_HOLD(pip);
1038 			MDI_PI_UNLOCK(pip);
1039 			delay(1);
1040 			MDI_PI_LOCK(pip);
1041 			MDI_PI_RELE(pip);
1042 		}
1043 	} else {
1044 		MDI_CLIENT_LOCK(ct);
1045 	}
1046 }
1047 
1048 /*
1049  * i_mdi_client_unlock():
1050  *		Unlock a client component
1051  */
1052 
1053 static void
1054 i_mdi_client_unlock(mdi_client_t *ct)
1055 {
1056 	MDI_CLIENT_UNLOCK(ct);
1057 }
1058 
1059 /*
1060  * i_mdi_client_alloc():
1061  * 		Allocate and initialize a client structure.  Caller should
1062  *		hold the global mdi_mutex.
1063  * Return Values:
1064  *		Handle to a client component
1065  */
1066 /*ARGSUSED*/
1067 static mdi_client_t *
1068 i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
1069 {
1070 	mdi_client_t	*ct;
1071 
1072 	ASSERT(MUTEX_HELD(&mdi_mutex));
1073 
1074 	/*
1075 	 * Allocate and initialize a component structure.
1076 	 */
1077 	ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
1078 	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1079 	ct->ct_hnext = NULL;
1080 	ct->ct_hprev = NULL;
1081 	ct->ct_dip = NULL;
1082 	ct->ct_vhci = vh;
1083 	ct->ct_drvname = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1084 	(void) strcpy(ct->ct_drvname, name);
1085 	ct->ct_guid = kmem_alloc(strlen(lguid) + 1, KM_SLEEP);
1086 	(void) strcpy(ct->ct_guid, lguid);
1087 	ct->ct_cprivate = NULL;
1088 	ct->ct_vprivate = NULL;
1089 	ct->ct_flags = 0;
1090 	ct->ct_state = MDI_CLIENT_STATE_FAILED;
1091 	MDI_CLIENT_SET_OFFLINE(ct);
1092 	MDI_CLIENT_SET_DETACH(ct);
1093 	MDI_CLIENT_SET_POWER_UP(ct);
1094 	ct->ct_failover_flags = 0;
1095 	ct->ct_failover_status = 0;
1096 	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
1097 	ct->ct_unstable = 0;
1098 	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
1099 	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
1100 	ct->ct_lb = vh->vh_lb;
1101 	ct->ct_lb_args =  kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
1102 	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
1103 	ct->ct_path_count = 0;
1104 	ct->ct_path_head = NULL;
1105 	ct->ct_path_tail = NULL;
1106 	ct->ct_path_last = NULL;
1107 
1108 	/*
1109 	 * Add this client component to our client hash queue
1110 	 */
1111 	i_mdi_client_enlist_table(vh, ct);
1112 	return (ct);
1113 }
1114 
1115 /*
1116  * i_mdi_client_enlist_table():
1117  *		Attach the client device to the client hash table. Caller
1118  *		should hold the mdi_mutex
1119  */
1120 
1121 static void
1122 i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1123 {
1124 	int 			index;
1125 	struct client_hash	*head;
1126 
1127 	ASSERT(MUTEX_HELD(&mdi_mutex));
1128 	index = i_mdi_get_hash_key(ct->ct_guid);
1129 	head = &vh->vh_client_table[index];
1130 	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
1131 	head->ct_hash_head = ct;
1132 	head->ct_hash_count++;
1133 	vh->vh_client_count++;
1134 }
1135 
1136 /*
1137  * i_mdi_client_delist_table():
1138  *		Attach the client device to the client hash table.
1139  *		Caller should hold the mdi_mutex
1140  */
1141 
1142 static void
1143 i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1144 {
1145 	int			index;
1146 	char			*guid;
1147 	struct client_hash 	*head;
1148 	mdi_client_t		*next;
1149 	mdi_client_t		*last;
1150 
1151 	ASSERT(MUTEX_HELD(&mdi_mutex));
1152 	guid = ct->ct_guid;
1153 	index = i_mdi_get_hash_key(guid);
1154 	head = &vh->vh_client_table[index];
1155 
1156 	last = NULL;
1157 	next = (mdi_client_t *)head->ct_hash_head;
1158 	while (next != NULL) {
1159 		if (next == ct) {
1160 			break;
1161 		}
1162 		last = next;
1163 		next = next->ct_hnext;
1164 	}
1165 
1166 	if (next) {
1167 		head->ct_hash_count--;
1168 		if (last == NULL) {
1169 			head->ct_hash_head = ct->ct_hnext;
1170 		} else {
1171 			last->ct_hnext = ct->ct_hnext;
1172 		}
1173 		ct->ct_hnext = NULL;
1174 		vh->vh_client_count--;
1175 	}
1176 }
1177 
1178 
1179 /*
1180  * i_mdi_client_free():
1181  *		Free a client component
1182  */
1183 static int
1184 i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
1185 {
1186 	int		rv = MDI_SUCCESS;
1187 	int		flags = ct->ct_flags;
1188 	dev_info_t	*cdip;
1189 	dev_info_t	*vdip;
1190 
1191 	ASSERT(MUTEX_HELD(&mdi_mutex));
1192 	vdip = vh->vh_dip;
1193 	cdip = ct->ct_dip;
1194 
1195 	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
1196 	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
1197 	DEVI(cdip)->devi_mdi_client = NULL;
1198 
1199 	/*
1200 	 * Clear out back ref. to dev_info_t node
1201 	 */
1202 	ct->ct_dip = NULL;
1203 
1204 	/*
1205 	 * Remove this client from our hash queue
1206 	 */
1207 	i_mdi_client_delist_table(vh, ct);
1208 
1209 	/*
1210 	 * Uninitialize and free the component
1211 	 */
1212 	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
1213 	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
1214 	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
1215 	cv_destroy(&ct->ct_failover_cv);
1216 	cv_destroy(&ct->ct_unstable_cv);
1217 	cv_destroy(&ct->ct_powerchange_cv);
1218 	mutex_destroy(&ct->ct_mutex);
1219 	kmem_free(ct, sizeof (*ct));
1220 
1221 	if (cdip != NULL) {
1222 		mutex_exit(&mdi_mutex);
1223 		(void) i_mdi_devinfo_remove(vdip, cdip, flags);
1224 		mutex_enter(&mdi_mutex);
1225 	}
1226 	return (rv);
1227 }
1228 
1229 /*
1230  * i_mdi_client_find():
1231  * 		Find the client structure corresponding to a given guid
1232  *		Caller should hold the mdi_mutex
1233  */
1234 static mdi_client_t *
1235 i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
1236 {
1237 	int			index;
1238 	struct client_hash	*head;
1239 	mdi_client_t		*ct;
1240 
1241 	ASSERT(MUTEX_HELD(&mdi_mutex));
1242 	index = i_mdi_get_hash_key(guid);
1243 	head = &vh->vh_client_table[index];
1244 
1245 	ct = head->ct_hash_head;
1246 	while (ct != NULL) {
1247 		if (strcmp(ct->ct_guid, guid) == 0 &&
1248 		    (cname == NULL || strcmp(ct->ct_drvname, cname) == 0)) {
1249 			break;
1250 		}
1251 		ct = ct->ct_hnext;
1252 	}
1253 	return (ct);
1254 }
1255 
1256 
1257 
1258 /*
1259  * i_mdi_client_update_state():
1260  *		Compute and update client device state
1261  * Notes:
1262  *		A client device can be in any of three possible states:
1263  *
1264  *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
1265  *		one online/standby paths. Can tolerate failures.
1266  *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
1267  *		no alternate paths available as standby. A failure on the online
1268  *		would result in loss of access to device data.
1269  *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
1270  *		no paths available to access the device.
1271  */
1272 static void
1273 i_mdi_client_update_state(mdi_client_t *ct)
1274 {
1275 	int state;
1276 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1277 	state = i_mdi_client_compute_state(ct, NULL);
1278 	MDI_CLIENT_SET_STATE(ct, state);
1279 }
1280 
1281 /*
1282  * i_mdi_client_compute_state():
1283  *		Compute client device state
1284  *
1285  *		mdi_phci_t *	Pointer to pHCI structure which should
1286  *				while computing the new value.  Used by
1287  *				i_mdi_phci_offline() to find the new
1288  *				client state after DR of a pHCI.
1289  */
1290 static int
1291 i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
1292 {
1293 	int		state;
1294 	int		online_count = 0;
1295 	int		standby_count = 0;
1296 	mdi_pathinfo_t	*pip, *next;
1297 
1298 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1299 	pip = ct->ct_path_head;
1300 	while (pip != NULL) {
1301 		MDI_PI_LOCK(pip);
1302 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1303 		if (MDI_PI(pip)->pi_phci == ph) {
1304 			MDI_PI_UNLOCK(pip);
1305 			pip = next;
1306 			continue;
1307 		}
1308 		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1309 				== MDI_PATHINFO_STATE_ONLINE)
1310 			online_count++;
1311 		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1312 				== MDI_PATHINFO_STATE_STANDBY)
1313 			standby_count++;
1314 		MDI_PI_UNLOCK(pip);
1315 		pip = next;
1316 	}
1317 
1318 	if (online_count == 0) {
1319 		if (standby_count == 0) {
1320 			state = MDI_CLIENT_STATE_FAILED;
1321 			MDI_DEBUG(2, (CE_NOTE, NULL, "!client state: failed"
1322 			    " ct = %p\n", ct));
1323 		} else if (standby_count == 1) {
1324 			state = MDI_CLIENT_STATE_DEGRADED;
1325 		} else {
1326 			state = MDI_CLIENT_STATE_OPTIMAL;
1327 		}
1328 	} else if (online_count == 1) {
1329 		if (standby_count == 0) {
1330 			state = MDI_CLIENT_STATE_DEGRADED;
1331 		} else {
1332 			state = MDI_CLIENT_STATE_OPTIMAL;
1333 		}
1334 	} else {
1335 		state = MDI_CLIENT_STATE_OPTIMAL;
1336 	}
1337 	return (state);
1338 }
1339 
1340 /*
1341  * i_mdi_client2devinfo():
1342  *		Utility function
1343  */
1344 dev_info_t *
1345 i_mdi_client2devinfo(mdi_client_t *ct)
1346 {
1347 	return (ct->ct_dip);
1348 }
1349 
1350 /*
1351  * mdi_client_path2_devinfo():
1352  * 		Given the parent devinfo and child devfs pathname, search for
1353  *		a valid devfs node handle.
1354  */
1355 dev_info_t *
1356 mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
1357 {
1358 	dev_info_t 	*cdip = NULL;
1359 	dev_info_t 	*ndip = NULL;
1360 	char		*temp_pathname;
1361 	int		circular;
1362 
1363 	/*
1364 	 * Allocate temp buffer
1365 	 */
1366 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1367 
1368 	/*
1369 	 * Lock parent against changes
1370 	 */
1371 	ndi_devi_enter(vdip, &circular);
1372 	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
1373 	while ((cdip = ndip) != NULL) {
1374 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1375 
1376 		*temp_pathname = '\0';
1377 		(void) ddi_pathname(cdip, temp_pathname);
1378 		if (strcmp(temp_pathname, pathname) == 0) {
1379 			break;
1380 		}
1381 	}
1382 	/*
1383 	 * Release devinfo lock
1384 	 */
1385 	ndi_devi_exit(vdip, circular);
1386 
1387 	/*
1388 	 * Free the temp buffer
1389 	 */
1390 	kmem_free(temp_pathname, MAXPATHLEN);
1391 	return (cdip);
1392 }
1393 
1394 
1395 /*
1396  * mdi_client_get_path_count():
1397  * 		Utility function to get number of path information nodes
1398  *		associated with a given client device.
1399  */
1400 int
1401 mdi_client_get_path_count(dev_info_t *cdip)
1402 {
1403 	mdi_client_t	*ct;
1404 	int		count = 0;
1405 
1406 	ct = i_devi_get_client(cdip);
1407 	if (ct != NULL) {
1408 		count = ct->ct_path_count;
1409 	}
1410 	return (count);
1411 }
1412 
1413 
1414 /*
1415  * i_mdi_get_hash_key():
1416  * 		Create a hash using strings as keys
1417  *
1418  */
1419 static int
1420 i_mdi_get_hash_key(char *str)
1421 {
1422 	uint32_t	g, hash = 0;
1423 	char		*p;
1424 
1425 	for (p = str; *p != '\0'; p++) {
1426 		g = *p;
1427 		hash += g;
1428 	}
1429 	return (hash % (CLIENT_HASH_TABLE_SIZE - 1));
1430 }
1431 
1432 /*
1433  * mdi_get_lb_policy():
1434  * 		Get current load balancing policy for a given client device
1435  */
1436 client_lb_t
1437 mdi_get_lb_policy(dev_info_t *cdip)
1438 {
1439 	client_lb_t	lb = LOAD_BALANCE_NONE;
1440 	mdi_client_t	*ct;
1441 
1442 	ct = i_devi_get_client(cdip);
1443 	if (ct != NULL) {
1444 		lb = ct->ct_lb;
1445 	}
1446 	return (lb);
1447 }
1448 
1449 /*
1450  * mdi_set_lb_region_size():
1451  * 		Set current region size for the load-balance
1452  */
1453 int
1454 mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
1455 {
1456 	mdi_client_t	*ct;
1457 	int		rv = MDI_FAILURE;
1458 
1459 	ct = i_devi_get_client(cdip);
1460 	if (ct != NULL && ct->ct_lb_args != NULL) {
1461 		ct->ct_lb_args->region_size = region_size;
1462 		rv = MDI_SUCCESS;
1463 	}
1464 	return (rv);
1465 }
1466 
1467 /*
1468  * mdi_Set_lb_policy():
1469  * 		Set current load balancing policy for a given client device
1470  */
1471 int
1472 mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
1473 {
1474 	mdi_client_t	*ct;
1475 	int		rv = MDI_FAILURE;
1476 
1477 	ct = i_devi_get_client(cdip);
1478 	if (ct != NULL) {
1479 		ct->ct_lb = lb;
1480 		rv = MDI_SUCCESS;
1481 	}
1482 	return (rv);
1483 }
1484 
1485 /*
1486  * mdi_failover():
1487  *		failover function called by the vHCI drivers to initiate
1488  *		a failover operation.  This is typically due to non-availability
1489  *		of online paths to route I/O requests.  Failover can be
1490  *		triggered through user application also.
1491  *
1492  *		The vHCI driver calls mdi_failover() to initiate a failover
1493  *		operation. mdi_failover() calls back into the vHCI driver's
1494  *		vo_failover() entry point to perform the actual failover
1495  *		operation.  The reason for requiring the vHCI driver to
1496  *		initiate failover by calling mdi_failover(), instead of directly
1497  *		executing vo_failover() itself, is to ensure that the mdi
1498  *		framework can keep track of the client state properly.
1499  *		Additionally, mdi_failover() provides as a convenience the
1500  *		option of performing the failover operation synchronously or
1501  *		asynchronously
1502  *
1503  *		Upon successful completion of the failover operation, the
1504  *		paths that were previously ONLINE will be in the STANDBY state,
1505  *		and the newly activated paths will be in the ONLINE state.
1506  *
1507  *		The flags modifier determines whether the activation is done
1508  *		synchronously: MDI_FAILOVER_SYNC
1509  * Return Values:
1510  *		MDI_SUCCESS
1511  *		MDI_FAILURE
1512  *		MDI_BUSY
1513  */
1514 /*ARGSUSED*/
1515 int
1516 mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
1517 {
1518 	int			rv;
1519 	mdi_client_t		*ct;
1520 
1521 	ct = i_devi_get_client(cdip);
1522 	ASSERT(ct != NULL);
1523 	if (ct == NULL) {
1524 		/* cdip is not a valid client device. Nothing more to do. */
1525 		return (MDI_FAILURE);
1526 	}
1527 
1528 	MDI_CLIENT_LOCK(ct);
1529 
1530 	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
1531 		/* A path to the client is being freed */
1532 		MDI_CLIENT_UNLOCK(ct);
1533 		return (MDI_BUSY);
1534 	}
1535 
1536 
1537 	if (MDI_CLIENT_IS_FAILED(ct)) {
1538 		/*
1539 		 * Client is in failed state. Nothing more to do.
1540 		 */
1541 		MDI_CLIENT_UNLOCK(ct);
1542 		return (MDI_FAILURE);
1543 	}
1544 
1545 	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1546 		/*
1547 		 * Failover is already in progress; return BUSY
1548 		 */
1549 		MDI_CLIENT_UNLOCK(ct);
1550 		return (MDI_BUSY);
1551 	}
1552 	/*
1553 	 * Make sure that mdi_pathinfo node state changes are processed.
1554 	 * We do not allow failovers to progress while client path state
1555 	 * changes are in progress
1556 	 */
1557 	if (ct->ct_unstable) {
1558 		if (flags == MDI_FAILOVER_ASYNC) {
1559 			MDI_CLIENT_UNLOCK(ct);
1560 			return (MDI_BUSY);
1561 		} else {
1562 			while (ct->ct_unstable)
1563 				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
1564 		}
1565 	}
1566 
1567 	/*
1568 	 * Client device is in stable state. Before proceeding, perform sanity
1569 	 * checks again.
1570 	 */
1571 	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
1572 	    (i_ddi_node_state(ct->ct_dip) < DS_READY)) {
1573 		/*
1574 		 * Client is in failed state. Nothing more to do.
1575 		 */
1576 		MDI_CLIENT_UNLOCK(ct);
1577 		return (MDI_FAILURE);
1578 	}
1579 
1580 	/*
1581 	 * Set the client state as failover in progress.
1582 	 */
1583 	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
1584 	ct->ct_failover_flags = flags;
1585 	MDI_CLIENT_UNLOCK(ct);
1586 
1587 	if (flags == MDI_FAILOVER_ASYNC) {
1588 		/*
1589 		 * Submit the initiate failover request via CPR safe
1590 		 * taskq threads.
1591 		 */
1592 		(void) taskq_dispatch(mdi_taskq, (task_func_t *)i_mdi_failover,
1593 		    ct, KM_SLEEP);
1594 		return (MDI_ACCEPT);
1595 	} else {
1596 		/*
1597 		 * Synchronous failover mode.  Typically invoked from the user
1598 		 * land.
1599 		 */
1600 		rv = i_mdi_failover(ct);
1601 	}
1602 	return (rv);
1603 }
1604 
1605 /*
1606  * i_mdi_failover():
1607  *		internal failover function. Invokes vHCI drivers failover
1608  *		callback function and process the failover status
1609  * Return Values:
1610  *		None
1611  *
1612  * Note: A client device in failover state can not be detached or freed.
1613  */
1614 static int
1615 i_mdi_failover(void *arg)
1616 {
1617 	int		rv = MDI_SUCCESS;
1618 	mdi_client_t	*ct = (mdi_client_t *)arg;
1619 	mdi_vhci_t	*vh = ct->ct_vhci;
1620 
1621 	ASSERT(!MUTEX_HELD(&ct->ct_mutex));
1622 
1623 	if (vh->vh_ops->vo_failover != NULL) {
1624 		/*
1625 		 * Call vHCI drivers callback routine
1626 		 */
1627 		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
1628 		    ct->ct_failover_flags);
1629 	}
1630 
1631 	MDI_CLIENT_LOCK(ct);
1632 	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);
1633 
1634 	/*
1635 	 * Save the failover return status
1636 	 */
1637 	ct->ct_failover_status = rv;
1638 
1639 	/*
1640 	 * As a result of failover, client status would have been changed.
1641 	 * Update the client state and wake up anyone waiting on this client
1642 	 * device.
1643 	 */
1644 	i_mdi_client_update_state(ct);
1645 
1646 	cv_broadcast(&ct->ct_failover_cv);
1647 	MDI_CLIENT_UNLOCK(ct);
1648 	return (rv);
1649 }
1650 
1651 /*
1652  * Load balancing is logical block.
1653  * IOs within the range described by region_size
1654  * would go on the same path. This would improve the
1655  * performance by cache-hit on some of the RAID devices.
1656  * Search only for online paths(At some point we
1657  * may want to balance across target ports).
1658  * If no paths are found then default to round-robin.
1659  */
1660 static int
1661 i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
1662 {
1663 	int		path_index = -1;
1664 	int		online_path_count = 0;
1665 	int		online_nonpref_path_count = 0;
1666 	int 		region_size = ct->ct_lb_args->region_size;
1667 	mdi_pathinfo_t	*pip;
1668 	mdi_pathinfo_t	*next;
1669 	int		preferred, path_cnt;
1670 
1671 	pip = ct->ct_path_head;
1672 	while (pip) {
1673 		MDI_PI_LOCK(pip);
1674 		if (MDI_PI(pip)->pi_state ==
1675 		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
1676 			online_path_count++;
1677 		} else if (MDI_PI(pip)->pi_state ==
1678 		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
1679 			online_nonpref_path_count++;
1680 		}
1681 		next = (mdi_pathinfo_t *)
1682 		    MDI_PI(pip)->pi_client_link;
1683 		MDI_PI_UNLOCK(pip);
1684 		pip = next;
1685 	}
1686 	/* if found any online/preferred then use this type */
1687 	if (online_path_count > 0) {
1688 		path_cnt = online_path_count;
1689 		preferred = 1;
1690 	} else if (online_nonpref_path_count > 0) {
1691 		path_cnt = online_nonpref_path_count;
1692 		preferred = 0;
1693 	} else {
1694 		path_cnt = 0;
1695 	}
1696 	if (path_cnt) {
1697 		path_index = (bp->b_blkno >> region_size) % path_cnt;
1698 		pip = ct->ct_path_head;
1699 		while (pip && path_index != -1) {
1700 			MDI_PI_LOCK(pip);
1701 			if (path_index == 0 &&
1702 			    (MDI_PI(pip)->pi_state ==
1703 			    MDI_PATHINFO_STATE_ONLINE) &&
1704 				MDI_PI(pip)->pi_preferred == preferred) {
1705 				MDI_PI_HOLD(pip);
1706 				MDI_PI_UNLOCK(pip);
1707 				*ret_pip = pip;
1708 				return (MDI_SUCCESS);
1709 			}
1710 			path_index --;
1711 			next = (mdi_pathinfo_t *)
1712 			    MDI_PI(pip)->pi_client_link;
1713 			MDI_PI_UNLOCK(pip);
1714 			pip = next;
1715 		}
1716 		if (pip == NULL) {
1717 			MDI_DEBUG(4, (CE_NOTE, NULL,
1718 			    "!lba %p, no pip !!\n",
1719 				bp->b_blkno));
1720 		} else {
1721 			MDI_DEBUG(4, (CE_NOTE, NULL,
1722 			    "!lba %p, no pip for path_index, "
1723 			    "pip %p\n", pip));
1724 		}
1725 	}
1726 	return (MDI_FAILURE);
1727 }
1728 
/*
 * mdi_select_path():
 *		select a path to access a client device.
 *
 *		mdi_select_path() function is called by the vHCI drivers to
 *		select a path to route the I/O request to.  The caller passes
 *		the block I/O data transfer structure ("buf") as one of the
 *		parameters.  The mpxio framework uses the buf structure
 *		contents to maintain per path statistics (total I/O size /
 *		count pending).  If more than one online paths are available to
 *		select, the framework automatically selects a suitable path
 *		for routing I/O request. If a failover operation is active for
 *		this client device the call shall be failed with MDI_BUSY error
 *		code.
 *
 *		By default this function returns a suitable path in online
 *		state based on the current load balancing policy.  Currently
 *		we support LOAD_BALANCE_NONE (Previously selected online path
 *		will continue to be used till the path is usable),
 *		LOAD_BALANCE_RR (Online paths will be selected in a round
 *		robin fashion) and LOAD_BALANCE_LBA (Online paths will be
 *		selected based on the logical block).  The load balancing
 *		policy is configured through the vHCI driver's configuration
 *		file (driver.conf).
 *
 *		vHCI drivers may override this default behavior by specifying
 *		appropriate flags.  Any non-zero flags value disables the
 *		default behavior: the client state checks are skipped, the
 *		configured policy is bypassed in favor of a round robin walk
 *		and ct_path_last is left untouched.  If start_pip is specified
 *		(non NULL) it is used as start point to walk and find the next
 *		appropriate path.
 *		The following values are currently defined:
 *		MDI_SELECT_ONLINE_PATH (to select an ONLINE path) and/or
 *		MDI_SELECT_STANDBY_PATH (to select an STANDBY path).
 *		With any other non-zero flags value no path matches.
 *
 *		The non-standard behavior is used by the scsi_vhci driver,
 *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
 *		attach of client devices (to avoid an unnecessary failover
 *		when the STANDBY path comes up first), during failover
 *		(to activate a STANDBY path as ONLINE).
 *
 *		Preferred paths are searched first; non preferred paths are
 *		considered only after a complete pass over the preferred ones
 *		did not find a match.
 *
 *		The selected path is returned in a held state (ref_cnt).
 *		Caller should release the hold by calling mdi_rele_path().
 *
 * Arguments:
 *		cdip		- client device
 *		bp		- I/O request; only looked at by the
 *				  LOAD_BALANCE_LBA policy, may be NULL
 *		flags		- 0 for the default behavior, otherwise see
 *				  above
 *		start_pip	- start point of the walk for the non-default
 *				  behavior, may be NULL
 *		ret_pip		- set to the selected path on MDI_SUCCESS and
 *				  to NULL in all other cases
 *
 * Return Values:
 *		MDI_SUCCESS	- Completed successfully
 *		MDI_BUSY 	- Client device is busy failing over, or no
 *				  path matched while at least one of the paths
 *				  looked at was in a transient state
 *		MDI_NOPATH	- Client device is online, but no valid path are
 *				  available to access this client device
 *		MDI_FAILURE	- Invalid client device or state
 *		MDI_DEVI_ONLINING
 *				- Client device (struct dev_info state) is in
 *				  onlining state.
 */

/*ARGSUSED*/
int
mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
    mdi_pathinfo_t *start_pip, mdi_pathinfo_t **ret_pip)
{
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	mdi_pathinfo_t	*head;
	mdi_pathinfo_t	*start;
	client_lb_t	lbp;	/* load balancing policy */
	int		sb = 1;	/* standard behavior */
	int		preferred = 1;	/* preferred path */
	int		cond, cont = 1;	/* path matches / keep walking */
	int		retry = 0;	/* saw a path in transient state */

	if (flags != 0) {
		/*
		 * disable default behavior
		 */
		sb = 0;
	}

	*ret_pip = NULL;
	ct = i_devi_get_client(cdip);
	if (ct == NULL) {
		/* mdi extensions are NULL, Nothing more to do */
		return (MDI_FAILURE);
	}

	/*
	 * The client lock is held for the whole selection; every return
	 * below drops it first.
	 */
	MDI_CLIENT_LOCK(ct);

	if (sb) {
		if (MDI_CLIENT_IS_FAILED(ct)) {
			/*
			 * Client is not ready to accept any I/O requests.
			 * Fail this request.
			 */
			MDI_DEBUG(2, (CE_NOTE, cdip, "!mdi_select_path: "
			    "client state offline ct = %p\n", ct));
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_FAILURE);
		}

		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
			/*
			 * Check for Failover is in progress. If so tell the
			 * caller that this device is busy.
			 */
			MDI_DEBUG(2, (CE_NOTE, cdip, "!mdi_select_path: "
			    "client failover in progress ct = %p\n", ct));
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_BUSY);
		}

		/*
		 * Check to see whether the client device is attached.
		 * If not so, let the vHCI driver manually select a path
		 * (standby) and let the probe/attach process to continue.
		 */
		if ((MDI_CLIENT_IS_DETACHED(ct)) ||
		    i_ddi_node_state(cdip) < DS_READY) {
			MDI_DEBUG(4, (CE_NOTE, cdip, "!Devi is onlining\n"));
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_DEVI_ONLINING);
		}
	}

	/*
	 * Cache in the client list head.  If head of the list is NULL
	 * return MDI_NOPATH
	 */
	head = ct->ct_path_head;
	if (head == NULL) {
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_NOPATH);
	}

	/*
	 * for non default behavior, bypass current
	 * load balancing policy and always use LOAD_BALANCE_RR
	 * except that the start point will be adjusted based
	 * on the provided start_pip
	 */
	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;

	switch (lbp) {
	case LOAD_BALANCE_NONE:
		/*
		 * Load balancing is None  or Alternate path mode
		 * Start looking for a online mdi_pathinfo node starting from
		 * last known selected path
		 */
		preferred = 1;
		pip = (mdi_pathinfo_t *)ct->ct_path_last;
		if (pip == NULL) {
			pip = head;
		}
		start = pip;
		do {
			MDI_PI_LOCK(pip);
			/*
			 * No need to explicitly check if the path is disabled.
			 * Since we are checking for state == ONLINE and the
			 * same variable is used for DISABLE/ENABLE information.
			 */
			if (MDI_PI(pip)->pi_state  ==
				MDI_PATHINFO_STATE_ONLINE &&
				preferred == MDI_PI(pip)->pi_preferred) {
				/*
				 * Return the path in hold state. Caller should
				 * release the hold by calling mdi_rele_path()
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				ct->ct_path_last = pip;
				*ret_pip = pip;
				MDI_CLIENT_UNLOCK(ct);
				return (MDI_SUCCESS);
			}

			/*
			 * Path is busy.  Remember it so that MDI_BUSY is
			 * returned instead of MDI_NOPATH if nothing is found.
			 */
			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
			    MDI_PI_IS_TRANSIENT(pip))
				retry = 1;
			/*
			 * Keep looking for a next available online path;
			 * wrap around at the end of the list.
			 */
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (next == NULL) {
				next = head;
			}
			MDI_PI_UNLOCK(pip);
			pip = next;
			/*
			 * Back at the start point: the first time around
			 * switch over to the non preferred paths, the second
			 * time give up.
			 */
			if (start == pip && preferred) {
				preferred = 0;
			} else if (start == pip && !preferred) {
				cont = 0;
			}
		} while (cont);
		break;

	case LOAD_BALANCE_LBA:
		/*
		 * Make sure we are looking
		 * for an online path. Otherwise, if it is for a STANDBY
		 * path request, it will go through and fetch an ONLINE
		 * path which is not desirable.
		 */
		if ((ct->ct_lb_args != NULL) &&
			    (ct->ct_lb_args->region_size) && bp &&
				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
			if (i_mdi_lba_lb(ct, ret_pip, bp)
				    == MDI_SUCCESS) {
				MDI_CLIENT_UNLOCK(ct);
				return (MDI_SUCCESS);
			}
		}
		/*  FALLTHROUGH */
	case LOAD_BALANCE_RR:
		/*
		 * Load balancing is Round Robin. Start looking for a online
		 * mdi_pathinfo node starting from last known selected path
		 * as the start point.  If override flags are specified,
		 * process accordingly.
		 * If the search is already in effect(start_pip not null),
		 * then lets just use the same path preference to continue the
		 * traversal.
		 */

		if (start_pip != NULL) {
			preferred = MDI_PI(start_pip)->pi_preferred;
		} else {
			preferred = 1;
		}

		/*
		 * The walk begins with the path following the start point
		 * (last selected path for the standard behavior, start_pip
		 * otherwise), or with the list head if there is none.
		 */
		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
		if (start == NULL) {
			pip = head;
		} else {
			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
			if (pip == NULL) {
				if (!sb) {
					if (preferred == 0) {
						/*
						 * Looks like we have completed
						 * the traversal as preferred
						 * value is 0. Time to bail out.
						 */
						*ret_pip = NULL;
						MDI_CLIENT_UNLOCK(ct);
						return (MDI_NOPATH);
					} else {
						/*
						 * Looks like we reached the
						 * end of the list. Lets enable
						 * traversal of non preferred
						 * paths.
						 */
						preferred = 0;
					}
				}
				pip = head;
			}
		}
		start = pip;
		do {
			MDI_PI_LOCK(pip);
			/*
			 * cond is set to 1 when this path is in one of the
			 * requested states and has the preference currently
			 * searched for.
			 */
			if (sb) {
				cond = ((MDI_PI(pip)->pi_state ==
				    MDI_PATHINFO_STATE_ONLINE &&
					MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
			} else {
				if (flags == MDI_SELECT_ONLINE_PATH) {
					cond = ((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_ONLINE &&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else if (flags == MDI_SELECT_STANDBY_PATH) {
					cond = ((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_STANDBY &&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else if (flags == (MDI_SELECT_ONLINE_PATH |
				    MDI_SELECT_STANDBY_PATH)) {
					cond = (((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_ONLINE ||
					    (MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_STANDBY)) &&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else {
					cond = 0;
				}
			}
			/*
			 * No need to explicitly check if the path is disabled.
			 * Since we are checking for state == ONLINE and the
			 * same variable is used for DISABLE/ENABLE information.
			 */
			if (cond) {
				/*
				 * Return the path in hold state. Caller should
				 * release the hold by calling mdi_rele_path()
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				if (sb)
					ct->ct_path_last = pip;
				*ret_pip = pip;
				MDI_CLIENT_UNLOCK(ct);
				return (MDI_SUCCESS);
			}
			/*
			 * Path is busy.  Remember it so that MDI_BUSY is
			 * returned instead of MDI_NOPATH if nothing is found.
			 */
			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
			    MDI_PI_IS_TRANSIENT(pip))
				retry = 1;

			/*
			 * Keep looking for a next available online path.
			 * pip is locked whenever control reaches do_again,
			 * including when it is entered through the goto
			 * below.
			 */
do_again:
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (next == NULL) {
				if (!sb) {
					if (preferred == 1) {
						/*
						 * Looks like we reached the
						 * end of the list. Lets enable
						 * traversal of non preferred
						 * paths.
						 */
						preferred = 0;
						next = head;
					} else {
						/*
						 * We have done both the passes
						 * Preferred as well as for
						 * Non-preferred. Bail out now.
						 */
						cont = 0;
					}
				} else {
					/*
					 * Standard behavior case.
					 */
					next = head;
				}
			}
			MDI_PI_UNLOCK(pip);
			if (cont == 0) {
				break;
			}
			pip = next;

			if (!sb) {
				/*
				 * We need to handle the selection of
				 * non-preferred path in the following
				 * case:
				 *
				 * +------+   +------+   +------+   +-----+
				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
				 * +------+   +------+   +------+   +-----+
				 *
				 * If we start the search with B, we need to
				 * skip beyond B to pick C which is non -
				 * preferred in the second pass. The following
				 * test, if true, will allow us to skip over
				 * the 'start'(B in the example) to select
				 * other non preferred elements.
				 */
				if ((start_pip != NULL) && (start_pip == pip) &&
				    (MDI_PI(start_pip)->pi_preferred
				    != preferred)) {
					/*
					 * try again after going past the start
					 * pip
					 */
					MDI_PI_LOCK(pip);
					goto do_again;
				}
			} else {
				/*
				 * Standard behavior case
				 */
				if (start == pip && preferred) {
					/* look for nonpreferred paths */
					preferred = 0;
				} else if (start == pip && !preferred) {
					/*
					 * Exit condition
					 */
					cont = 0;
				}
			}
		} while (cont);
		break;
	}

	/*
	 * No path was selected.  Report MDI_BUSY if one of the paths looked
	 * at was in a transient state, MDI_NOPATH otherwise.
	 */
	MDI_CLIENT_UNLOCK(ct);
	if (retry == 1) {
		return (MDI_BUSY);
	} else {
		return (MDI_NOPATH);
	}
}
2132 
2133 /*
2134  * For a client, return the next available path to any phci
2135  *
2136  * Note:
2137  *		Caller should hold the branch's devinfo node to get a consistent
2138  *		snap shot of the mdi_pathinfo nodes.
2139  *
2140  *		Please note that even the list is stable the mdi_pathinfo
2141  *		node state and properties are volatile.  The caller should lock
2142  *		and unlock the nodes by calling mdi_pi_lock() and
2143  *		mdi_pi_unlock() functions to get a stable properties.
2144  *
2145  *		If there is a need to use the nodes beyond the hold of the
2146  *		devinfo node period (For ex. I/O), then mdi_pathinfo node
2147  *		need to be held against unexpected removal by calling
2148  *		mdi_hold_path() and should be released by calling
2149  *		mdi_rele_path() on completion.
2150  */
2151 mdi_pathinfo_t *
2152 mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
2153 {
2154 	mdi_client_t *ct;
2155 
2156 	if (!MDI_CLIENT(ct_dip))
2157 		return (NULL);
2158 
2159 	/*
2160 	 * Walk through client link
2161 	 */
2162 	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
2163 	ASSERT(ct != NULL);
2164 
2165 	if (pip == NULL)
2166 		return ((mdi_pathinfo_t *)ct->ct_path_head);
2167 
2168 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link);
2169 }
2170 
2171 /*
2172  * For a phci, return the next available path to any client
2173  * Note: ditto mdi_get_next_phci_path()
2174  */
2175 mdi_pathinfo_t *
2176 mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
2177 {
2178 	mdi_phci_t *ph;
2179 
2180 	if (!MDI_PHCI(ph_dip))
2181 		return (NULL);
2182 
2183 	/*
2184 	 * Walk through pHCI link
2185 	 */
2186 	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
2187 	ASSERT(ph != NULL);
2188 
2189 	if (pip == NULL)
2190 		return ((mdi_pathinfo_t *)ph->ph_path_head);
2191 
2192 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
2193 }
2194 
2195 /*
2196  * mdi_get_nextpath():
2197  *		mdi_pathinfo node walker function.  Get the next node from the
2198  *		client or pHCI device list.
2199  *
2200  * XXX This is wrapper function for compatibility purposes only.
2201  *
2202  *	It doesn't work under Multi-level MPxIO, where a dip
2203  *	is both client and phci (which link should next_path follow?).
2204  *	Once Leadville is modified to call mdi_get_next_phci/client_path,
2205  *	this interface should be removed.
2206  */
2207 void
2208 mdi_get_next_path(dev_info_t *dip, mdi_pathinfo_t *pip,
2209     mdi_pathinfo_t **ret_pip)
2210 {
2211 	if (MDI_CLIENT(dip)) {
2212 		*ret_pip = mdi_get_next_phci_path(dip, pip);
2213 	} else if (MDI_PHCI(dip)) {
2214 		*ret_pip = mdi_get_next_client_path(dip, pip);
2215 	} else {
2216 		*ret_pip = NULL;
2217 	}
2218 }
2219 
2220 /*
2221  * mdi_hold_path():
2222  *		Hold the mdi_pathinfo node against unwanted unexpected free.
2223  * Return Values:
2224  *		None
2225  */
2226 void
2227 mdi_hold_path(mdi_pathinfo_t *pip)
2228 {
2229 	if (pip) {
2230 		MDI_PI_LOCK(pip);
2231 		MDI_PI_HOLD(pip);
2232 		MDI_PI_UNLOCK(pip);
2233 	}
2234 }
2235 
2236 
2237 /*
2238  * mdi_rele_path():
2239  *		Release the mdi_pathinfo node which was selected
2240  *		through mdi_select_path() mechanism or manually held by
2241  *		calling mdi_hold_path().
2242  * Return Values:
2243  *		None
2244  */
2245 void
2246 mdi_rele_path(mdi_pathinfo_t *pip)
2247 {
2248 	if (pip) {
2249 		MDI_PI_LOCK(pip);
2250 		MDI_PI_RELE(pip);
2251 		if (MDI_PI(pip)->pi_ref_cnt == 0) {
2252 			cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
2253 		}
2254 		MDI_PI_UNLOCK(pip);
2255 	}
2256 }
2257 
2258 
2259 /*
2260  * mdi_pi_lock():
2261  * 		Lock the mdi_pathinfo node.
2262  * Note:
2263  *		The caller should release the lock by calling mdi_pi_unlock()
2264  */
2265 void
2266 mdi_pi_lock(mdi_pathinfo_t *pip)
2267 {
2268 	ASSERT(pip != NULL);
2269 	if (pip) {
2270 		MDI_PI_LOCK(pip);
2271 	}
2272 }
2273 
2274 
2275 /*
2276  * mdi_pi_unlock():
2277  * 		Unlock the mdi_pathinfo node.
2278  * Note:
2279  *		The mdi_pathinfo node should have been locked with mdi_pi_lock()
2280  */
2281 void
2282 mdi_pi_unlock(mdi_pathinfo_t *pip)
2283 {
2284 	ASSERT(pip != NULL);
2285 	if (pip) {
2286 		MDI_PI_UNLOCK(pip);
2287 	}
2288 }
2289 
2290 /*
2291  * mdi_pi_find():
2292  *		Search the list of mdi_pathinfo nodes attached to the
2293  *		pHCI/Client device node whose path address matches "paddr".
2294  *		Returns a pointer to the mdi_pathinfo node if a matching node is
2295  *		found.
2296  * Return Values:
2297  *		mdi_pathinfo node handle
2298  *		NULL
2299  * Notes:
2300  *		Caller need not hold any locks to call this function.
2301  */
2302 mdi_pathinfo_t *
2303 mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
2304 {
2305 	mdi_phci_t		*ph;
2306 	mdi_vhci_t		*vh;
2307 	mdi_client_t		*ct;
2308 	mdi_pathinfo_t		*pip = NULL;
2309 
2310 	if ((pdip == NULL) || (paddr == NULL)) {
2311 		return (NULL);
2312 	}
2313 	ph = i_devi_get_phci(pdip);
2314 	if (ph == NULL) {
2315 		/*
2316 		 * Invalid pHCI device, Nothing more to do.
2317 		 */
2318 		MDI_DEBUG(2, (CE_WARN, NULL,
2319 		    "!mdi_pi_find: invalid phci"));
2320 		return (NULL);
2321 	}
2322 
2323 	vh = ph->ph_vhci;
2324 	if (vh == NULL) {
2325 		/*
2326 		 * Invalid vHCI device, Nothing more to do.
2327 		 */
2328 		MDI_DEBUG(2, (CE_WARN, NULL,
2329 		    "!mdi_pi_find: invalid phci"));
2330 		return (NULL);
2331 	}
2332 
2333 	/*
2334 	 * Look for client device identified by caddr (guid)
2335 	 */
2336 	if (caddr == NULL) {
2337 		/*
2338 		 * Find a mdi_pathinfo node under pHCI list for a matching
2339 		 * unit address.
2340 		 */
2341 		mutex_enter(&ph->ph_mutex);
2342 		pip = (mdi_pathinfo_t *)ph->ph_path_head;
2343 
2344 		while (pip != NULL) {
2345 			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2346 				break;
2347 			}
2348 			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
2349 		}
2350 		mutex_exit(&ph->ph_mutex);
2351 		return (pip);
2352 	}
2353 
2354 	/*
2355 	 * XXX - Is the rest of the code in this function really necessary?
2356 	 * The consumers of mdi_pi_find() can search for the desired pathinfo
2357 	 * node by calling mdi_pi_find(pdip, NULL, paddr). Irrespective of
2358 	 * whether the search is based on the pathinfo nodes attached to
2359 	 * the pHCI or the client node, the result will be the same.
2360 	 */
2361 
2362 	/*
2363 	 * Find the client device corresponding to 'caddr'
2364 	 */
2365 	mutex_enter(&mdi_mutex);
2366 
2367 	/*
2368 	 * XXX - Passing NULL to the following function works as long as the
2369 	 * the client addresses (caddr) are unique per vhci basis.
2370 	 */
2371 	ct = i_mdi_client_find(vh, NULL, caddr);
2372 	if (ct == NULL) {
2373 		/*
2374 		 * Client not found, Obviously mdi_pathinfo node has not been
2375 		 * created yet.
2376 		 */
2377 		mutex_exit(&mdi_mutex);
2378 		return (pip);
2379 	}
2380 
2381 	/*
2382 	 * Hold the client lock and look for a mdi_pathinfo node with matching
2383 	 * pHCI and paddr
2384 	 */
2385 	MDI_CLIENT_LOCK(ct);
2386 
2387 	/*
2388 	 * Release the global mutex as it is no more needed. Note: We always
2389 	 * respect the locking order while acquiring.
2390 	 */
2391 	mutex_exit(&mdi_mutex);
2392 
2393 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2394 	while (pip != NULL) {
2395 		/*
2396 		 * Compare the unit address
2397 		 */
2398 		if ((MDI_PI(pip)->pi_phci == ph) &&
2399 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2400 			break;
2401 		}
2402 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2403 	}
2404 	MDI_CLIENT_UNLOCK(ct);
2405 	return (pip);
2406 }
2407 
2408 /*
2409  * mdi_pi_alloc():
2410  *		Allocate and initialize a new instance of a mdi_pathinfo node.
2411  *		The mdi_pathinfo node returned by this function identifies a
2412  *		unique device path is capable of having properties attached
2413  *		and passed to mdi_pi_online() to fully attach and online the
2414  *		path and client device node.
2415  *		The mdi_pathinfo node returned by this function must be
2416  *		destroyed using mdi_pi_free() if the path is no longer
2417  *		operational or if the caller fails to attach a client device
2418  *		node when calling mdi_pi_online(). The framework will not free
2419  *		the resources allocated.
2420  *		This function can be called from both interrupt and kernel
2421  *		contexts.  DDI_NOSLEEP flag should be used while calling
2422  *		from interrupt contexts.
2423  * Return Values:
2424  *		MDI_SUCCESS
2425  *		MDI_FAILURE
2426  *		MDI_NOMEM
2427  */
2428 /*ARGSUSED*/
2429 int
2430 mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2431     char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
2432 {
2433 	mdi_vhci_t	*vh;
2434 	mdi_phci_t	*ph;
2435 	mdi_client_t	*ct;
2436 	mdi_pathinfo_t	*pip = NULL;
2437 	dev_info_t	*cdip;
2438 	int		rv = MDI_NOMEM;
2439 	int		path_allocated = 0;
2440 
2441 	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
2442 	    ret_pip == NULL) {
2443 		/* Nothing more to do */
2444 		return (MDI_FAILURE);
2445 	}
2446 
2447 	*ret_pip = NULL;
2448 	ph = i_devi_get_phci(pdip);
2449 	ASSERT(ph != NULL);
2450 	if (ph == NULL) {
2451 		/* Invalid pHCI device, return failure */
2452 		MDI_DEBUG(1, (CE_WARN, NULL,
2453 		    "!mdi_pi_alloc: invalid pHCI=%p", pdip));
2454 		return (MDI_FAILURE);
2455 	}
2456 
2457 	MDI_PHCI_LOCK(ph);
2458 	vh = ph->ph_vhci;
2459 	if (vh == NULL) {
2460 		/* Invalid vHCI device, return failure */
2461 		MDI_DEBUG(1, (CE_WARN, NULL,
2462 		    "!mdi_pi_alloc: invalid pHCI=%p", pdip));
2463 		MDI_PHCI_UNLOCK(ph);
2464 		return (MDI_FAILURE);
2465 	}
2466 
2467 	if (MDI_PHCI_IS_READY(ph) == 0) {
2468 		/*
2469 		 * Do not allow new node creation when pHCI is in
2470 		 * offline/suspended states
2471 		 */
2472 		MDI_DEBUG(1, (CE_WARN, NULL,
2473 		    "mdi_pi_alloc: pHCI=%p is not ready", ph));
2474 		MDI_PHCI_UNLOCK(ph);
2475 		return (MDI_BUSY);
2476 	}
2477 	MDI_PHCI_UNSTABLE(ph);
2478 	MDI_PHCI_UNLOCK(ph);
2479 
2480 	/* look for a matching client, create one if not found */
2481 	mutex_enter(&mdi_mutex);
2482 	ct = i_mdi_client_find(vh, cname, caddr);
2483 	if (ct == NULL) {
2484 		ct = i_mdi_client_alloc(vh, cname, caddr);
2485 		ASSERT(ct != NULL);
2486 	}
2487 
2488 	if (ct->ct_dip == NULL) {
2489 		/*
2490 		 * Allocate a devinfo node
2491 		 */
2492 		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
2493 		    compatible, ncompatible);
2494 		if (ct->ct_dip == NULL) {
2495 			(void) i_mdi_client_free(vh, ct);
2496 			goto fail;
2497 		}
2498 	}
2499 	cdip = ct->ct_dip;
2500 
2501 	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
2502 	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;
2503 
2504 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2505 	while (pip != NULL) {
2506 		/*
2507 		 * Compare the unit address
2508 		 */
2509 		if ((MDI_PI(pip)->pi_phci == ph) &&
2510 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2511 			break;
2512 		}
2513 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2514 	}
2515 
2516 	if (pip == NULL) {
2517 		/*
2518 		 * This is a new path for this client device.  Allocate and
2519 		 * initialize a new pathinfo node
2520 		 */
2521 		pip = i_mdi_pi_alloc(ph, paddr, ct);
2522 		ASSERT(pip != NULL);
2523 		path_allocated = 1;
2524 	}
2525 	rv = MDI_SUCCESS;
2526 
2527 fail:
2528 	/*
2529 	 * Release the global mutex.
2530 	 */
2531 	mutex_exit(&mdi_mutex);
2532 
2533 	/*
2534 	 * Mark the pHCI as stable
2535 	 */
2536 	MDI_PHCI_LOCK(ph);
2537 	MDI_PHCI_STABLE(ph);
2538 	MDI_PHCI_UNLOCK(ph);
2539 	*ret_pip = pip;
2540 
2541 	if (path_allocated)
2542 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
2543 
2544 	return (rv);
2545 }
2546 
2547 /*ARGSUSED*/
2548 int
2549 mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2550     int flags, mdi_pathinfo_t **ret_pip)
2551 {
2552 	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
2553 	    flags, ret_pip));
2554 }
2555 
2556 /*
2557  * i_mdi_pi_alloc():
2558  *		Allocate a mdi_pathinfo node and add to the pHCI path list
2559  * Return Values:
2560  *		mdi_pathinfo
2561  */
2562 
2563 /*ARGSUSED*/
2564 static mdi_pathinfo_t *
2565 i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
2566 {
2567 	mdi_pathinfo_t	*pip;
2568 	int		ct_circular;
2569 	int		ph_circular;
2570 
2571 	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
2572 	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
2573 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
2574 	    MDI_PATHINFO_STATE_TRANSIENT;
2575 
2576 	if (MDI_PHCI_IS_USER_DISABLED(ph))
2577 		MDI_PI_SET_USER_DISABLE(pip);
2578 
2579 	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
2580 		MDI_PI_SET_DRV_DISABLE_TRANS(pip);
2581 
2582 	if (MDI_PHCI_IS_DRV_DISABLED(ph))
2583 		MDI_PI_SET_DRV_DISABLE(pip);
2584 
2585 	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
2586 	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
2587 	MDI_PI(pip)->pi_client = ct;
2588 	MDI_PI(pip)->pi_phci = ph;
2589 	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
2590 	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);
2591 	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
2592 	ASSERT(MDI_PI(pip)->pi_prop != NULL);
2593 	MDI_PI(pip)->pi_pprivate = NULL;
2594 	MDI_PI(pip)->pi_cprivate = NULL;
2595 	MDI_PI(pip)->pi_vprivate = NULL;
2596 	MDI_PI(pip)->pi_client_link = NULL;
2597 	MDI_PI(pip)->pi_phci_link = NULL;
2598 	MDI_PI(pip)->pi_ref_cnt = 0;
2599 	MDI_PI(pip)->pi_kstats = NULL;
2600 	MDI_PI(pip)->pi_preferred = 1;
2601 	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);
2602 
2603 	/*
2604 	 * Lock both dev_info nodes against changes in parallel.
2605 	 */
2606 	ndi_devi_enter(ct->ct_dip, &ct_circular);
2607 	ndi_devi_enter(ph->ph_dip, &ph_circular);
2608 
2609 	i_mdi_phci_add_path(ph, pip);
2610 	i_mdi_client_add_path(ct, pip);
2611 
2612 	ndi_devi_exit(ph->ph_dip, ph_circular);
2613 	ndi_devi_exit(ct->ct_dip, ct_circular);
2614 
2615 	return (pip);
2616 }
2617 
2618 /*
2619  * i_mdi_phci_add_path():
2620  * 		Add a mdi_pathinfo node to pHCI list.
2621  * Notes:
2622  *		Caller should per-pHCI mutex
2623  */
2624 
2625 static void
2626 i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
2627 {
2628 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
2629 
2630 	if (ph->ph_path_head == NULL) {
2631 		ph->ph_path_head = pip;
2632 	} else {
2633 		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
2634 	}
2635 	ph->ph_path_tail = pip;
2636 	ph->ph_path_count++;
2637 }
2638 
2639 /*
2640  * i_mdi_client_add_path():
2641  *		Add mdi_pathinfo node to client list
2642  */
2643 
2644 static void
2645 i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
2646 {
2647 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
2648 
2649 	if (ct->ct_path_head == NULL) {
2650 		ct->ct_path_head = pip;
2651 	} else {
2652 		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
2653 	}
2654 	ct->ct_path_tail = pip;
2655 	ct->ct_path_count++;
2656 }
2657 
2658 /*
2659  * mdi_pi_free():
2660  *		Free the mdi_pathinfo node and also client device node if this
2661  *		is the last path to the device
2662  * Return Values:
2663  *		MDI_SUCCESS
2664  *		MDI_FAILURE
2665  *		MDI_BUSY
2666  */
2667 
2668 /*ARGSUSED*/
2669 int
2670 mdi_pi_free(mdi_pathinfo_t *pip, int flags)
2671 {
2672 	int		rv = MDI_SUCCESS;
2673 	mdi_vhci_t	*vh;
2674 	mdi_phci_t	*ph;
2675 	mdi_client_t	*ct;
2676 	int		(*f)();
2677 	int		client_held = 0;
2678 
2679 	MDI_PI_LOCK(pip);
2680 	ph = MDI_PI(pip)->pi_phci;
2681 	ASSERT(ph != NULL);
2682 	if (ph == NULL) {
2683 		/*
2684 		 * Invalid pHCI device, return failure
2685 		 */
2686 		MDI_DEBUG(1, (CE_WARN, NULL,
2687 		    "!mdi_pi_free: invalid pHCI"));
2688 		MDI_PI_UNLOCK(pip);
2689 		return (MDI_FAILURE);
2690 	}
2691 
2692 	vh = ph->ph_vhci;
2693 	ASSERT(vh != NULL);
2694 	if (vh == NULL) {
2695 		/* Invalid pHCI device, return failure */
2696 		MDI_DEBUG(1, (CE_WARN, NULL,
2697 		    "!mdi_pi_free: invalid vHCI"));
2698 		MDI_PI_UNLOCK(pip);
2699 		return (MDI_FAILURE);
2700 	}
2701 
2702 	ct = MDI_PI(pip)->pi_client;
2703 	ASSERT(ct != NULL);
2704 	if (ct == NULL) {
2705 		/*
2706 		 * Invalid Client device, return failure
2707 		 */
2708 		MDI_DEBUG(1, (CE_WARN, NULL,
2709 		    "!mdi_pi_free: invalid client"));
2710 		MDI_PI_UNLOCK(pip);
2711 		return (MDI_FAILURE);
2712 	}
2713 
2714 	/*
2715 	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
2716 	 * if the node state is either offline or init and the reference count
2717 	 * is zero.
2718 	 */
2719 	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
2720 	    MDI_PI_IS_INITING(pip))) {
2721 		/*
2722 		 * Node is busy
2723 		 */
2724 		MDI_DEBUG(1, (CE_WARN, NULL,
2725 		    "!mdi_pi_free: pathinfo node is busy pip=%p", pip));
2726 		MDI_PI_UNLOCK(pip);
2727 		return (MDI_BUSY);
2728 	}
2729 
2730 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
2731 		/*
2732 		 * Give a chance for pending I/Os to complete.
2733 		 */
2734 		MDI_DEBUG(1, (CE_NOTE, ct->ct_vhci->vh_dip, "!i_mdi_pi_free: "
2735 		    "%d cmds still pending on path: %p\n",
2736 		    MDI_PI(pip)->pi_ref_cnt, pip));
2737 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
2738 		    &MDI_PI(pip)->pi_mutex,
2739 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
2740 			/*
2741 			 * The timeout time reached without ref_cnt being zero
2742 			 * being signaled.
2743 			 */
2744 			MDI_DEBUG(1, (CE_NOTE, ct->ct_vhci->vh_dip,
2745 			    "!i_mdi_pi_free: "
2746 			    "Timeout reached on path %p without the cond\n",
2747 			    pip));
2748 			MDI_DEBUG(1, (CE_NOTE, ct->ct_vhci->vh_dip,
2749 			    "!i_mdi_pi_free: "
2750 			    "%d cmds still pending on path: %p\n",
2751 			    MDI_PI(pip)->pi_ref_cnt, pip));
2752 			MDI_PI_UNLOCK(pip);
2753 			return (MDI_BUSY);
2754 		}
2755 	}
2756 	if (MDI_PI(pip)->pi_pm_held) {
2757 		client_held = 1;
2758 	}
2759 	MDI_PI_UNLOCK(pip);
2760 
2761 	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));
2762 
2763 	MDI_CLIENT_LOCK(ct);
2764 
2765 	/* Prevent further failovers till mdi_mutex is held */
2766 	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);
2767 
2768 	/*
2769 	 * Wait till failover is complete before removing this node.
2770 	 */
2771 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
2772 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
2773 
2774 	MDI_CLIENT_UNLOCK(ct);
2775 	mutex_enter(&mdi_mutex);
2776 	MDI_CLIENT_LOCK(ct);
2777 	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);
2778 
2779 	if (!MDI_PI_IS_INITING(pip)) {
2780 		f = vh->vh_ops->vo_pi_uninit;
2781 		if (f != NULL) {
2782 			rv = (*f)(vh->vh_dip, pip, 0);
2783 		}
2784 	}
2785 	/*
2786 	 * If vo_pi_uninit() completed successfully.
2787 	 */
2788 	if (rv == MDI_SUCCESS) {
2789 		if (client_held) {
2790 			MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_free "
2791 			    "i_mdi_pm_rele_client\n"));
2792 			i_mdi_pm_rele_client(ct, 1);
2793 		}
2794 		i_mdi_pi_free(ph, pip, ct);
2795 		if (ct->ct_path_count == 0) {
2796 			/*
2797 			 * Client lost its last path.
2798 			 * Clean up the client device
2799 			 */
2800 			MDI_CLIENT_UNLOCK(ct);
2801 			(void) i_mdi_client_free(ct->ct_vhci, ct);
2802 			mutex_exit(&mdi_mutex);
2803 			return (rv);
2804 		}
2805 	}
2806 	MDI_CLIENT_UNLOCK(ct);
2807 	mutex_exit(&mdi_mutex);
2808 
2809 	if (rv == MDI_FAILURE)
2810 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
2811 
2812 	return (rv);
2813 }
2814 
2815 /*
2816  * i_mdi_pi_free():
2817  *		Free the mdi_pathinfo node
2818  */
2819 static void
2820 i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
2821 {
2822 	int	ct_circular;
2823 	int	ph_circular;
2824 
2825 	/*
2826 	 * remove any per-path kstats
2827 	 */
2828 	i_mdi_pi_kstat_destroy(pip);
2829 
2830 	ndi_devi_enter(ct->ct_dip, &ct_circular);
2831 	ndi_devi_enter(ph->ph_dip, &ph_circular);
2832 
2833 	i_mdi_client_remove_path(ct, pip);
2834 	i_mdi_phci_remove_path(ph, pip);
2835 
2836 	ndi_devi_exit(ph->ph_dip, ph_circular);
2837 	ndi_devi_exit(ct->ct_dip, ct_circular);
2838 
2839 	mutex_destroy(&MDI_PI(pip)->pi_mutex);
2840 	cv_destroy(&MDI_PI(pip)->pi_state_cv);
2841 	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
2842 	if (MDI_PI(pip)->pi_addr) {
2843 		kmem_free(MDI_PI(pip)->pi_addr,
2844 		    strlen(MDI_PI(pip)->pi_addr) + 1);
2845 		MDI_PI(pip)->pi_addr = NULL;
2846 	}
2847 
2848 	if (MDI_PI(pip)->pi_prop) {
2849 		(void) nvlist_free(MDI_PI(pip)->pi_prop);
2850 		MDI_PI(pip)->pi_prop = NULL;
2851 	}
2852 	kmem_free(pip, sizeof (struct mdi_pathinfo));
2853 }
2854 
2855 
2856 /*
2857  * i_mdi_phci_remove_path():
2858  * 		Remove a mdi_pathinfo node from pHCI list.
2859  * Notes:
2860  *		Caller should hold per-pHCI mutex
2861  */
2862 
2863 static void
2864 i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
2865 {
2866 	mdi_pathinfo_t	*prev = NULL;
2867 	mdi_pathinfo_t	*path = NULL;
2868 
2869 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
2870 
2871 	path = ph->ph_path_head;
2872 	while (path != NULL) {
2873 		if (path == pip) {
2874 			break;
2875 		}
2876 		prev = path;
2877 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
2878 	}
2879 
2880 	if (path) {
2881 		ph->ph_path_count--;
2882 		if (prev) {
2883 			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
2884 		} else {
2885 			ph->ph_path_head =
2886 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
2887 		}
2888 		if (ph->ph_path_tail == path) {
2889 			ph->ph_path_tail = prev;
2890 		}
2891 	}
2892 
2893 	/*
2894 	 * Clear the pHCI link
2895 	 */
2896 	MDI_PI(pip)->pi_phci_link = NULL;
2897 	MDI_PI(pip)->pi_phci = NULL;
2898 }
2899 
2900 /*
2901  * i_mdi_client_remove_path():
2902  * 		Remove a mdi_pathinfo node from client path list.
2903  */
2904 
2905 static void
2906 i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
2907 {
2908 	mdi_pathinfo_t	*prev = NULL;
2909 	mdi_pathinfo_t	*path;
2910 
2911 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
2912 
2913 	path = ct->ct_path_head;
2914 	while (path != NULL) {
2915 		if (path == pip) {
2916 			break;
2917 		}
2918 		prev = path;
2919 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
2920 	}
2921 
2922 	if (path) {
2923 		ct->ct_path_count--;
2924 		if (prev) {
2925 			MDI_PI(prev)->pi_client_link =
2926 			    MDI_PI(path)->pi_client_link;
2927 		} else {
2928 			ct->ct_path_head =
2929 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
2930 		}
2931 		if (ct->ct_path_tail == path) {
2932 			ct->ct_path_tail = prev;
2933 		}
2934 		if (ct->ct_path_last == path) {
2935 			ct->ct_path_last = ct->ct_path_head;
2936 		}
2937 	}
2938 	MDI_PI(pip)->pi_client_link = NULL;
2939 	MDI_PI(pip)->pi_client = NULL;
2940 }
2941 
/*
 * i_mdi_pi_state_change():
 *		Common worker behind mdi_pi_online(), mdi_pi_standby(),
 *		mdi_pi_fault() and mdi_pi_offline(): move a mdi_pathinfo node
 *		to the requested "state".
 *
 *		Outline:
 *		  1. Validate the pHCI, vHCI and client behind the path.
 *		  2. If the path is still initing, call the vHCI's
 *		     vo_pi_init() entry point.
 *		  3. Refuse (MDI_BUSY) if the pHCI is not ready; otherwise
 *		     mark the pHCI unstable for the duration.
 *		  4. Wait out any transient state and any client failover.
 *		  5. Mark the path transient towards "state" and call the
 *		     vHCI's vo_pi_state_change() entry point.
 *		  6. Commit or roll back the path state, wake waiters, and
 *		     update the client state, onlining or offlining the client
 *		     devinfo node when required.
 *		  7. Mark the pHCI stable again.
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		MDI_BUSY	pHCI not ready, or an NDI call reported busy
 *		A non-success value from vo_pi_state_change() is returned
 *		unchanged.
 */
/*ARGSUSED*/
static int
i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
{
	int		rv = MDI_SUCCESS;
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	int		(*f)();
	dev_info_t	*cdip;

	MDI_PI_LOCK(pip);

	ph = MDI_PI(pip)->pi_phci;
	ASSERT(ph);
	if (ph == NULL) {
		/*
		 * Invalid pHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (CE_WARN, NULL,
		    "!mdi_pi_state_change: invalid phci"));
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;
	ASSERT(vh);
	if (vh == NULL) {
		/*
		 * Invalid vHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (CE_WARN, NULL,
		    "!mdi_pi_state_change: invalid vhci"));
		return (MDI_FAILURE);
	}

	ct = MDI_PI(pip)->pi_client;
	ASSERT(ct != NULL);
	if (ct == NULL) {
		/*
		 * Invalid client device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (CE_WARN, NULL,
		    "!mdi_pi_state_change: invalid client"));
		return (MDI_FAILURE);
	}

	/*
	 * If this path has not been initialized yet, Callback vHCI driver's
	 * pathinfo node initialize entry point
	 */

	if (MDI_PI_IS_INITING(pip)) {
		/* The vHCI callback is made without the pathinfo lock held */
		MDI_PI_UNLOCK(pip);
		f = vh->vh_ops->vo_pi_init;
		if (f != NULL) {
			rv = (*f)(vh->vh_dip, pip, 0);
			if (rv != MDI_SUCCESS) {
				MDI_DEBUG(1, (CE_WARN, vh->vh_dip,
				    "!vo_pi_init: failed vHCI=0x%p, pip=0x%p",
				    vh, pip));
				/* No locks are held at this point */
				return (MDI_FAILURE);
			}
		}
		MDI_PI_LOCK(pip);
		MDI_PI_CLEAR_TRANSIENT(pip);
	}

	/*
	 * Do not allow state transition when pHCI is in offline/suspended
	 * states
	 */
	i_mdi_phci_lock(ph, pip);
	if (MDI_PHCI_IS_READY(ph) == 0) {
		MDI_DEBUG(1, (CE_WARN, NULL,
		    "!mdi_pi_state_change: pHCI not ready, pHCI=%p", ph));
		MDI_PI_UNLOCK(pip);
		i_mdi_phci_unlock(ph);
		return (MDI_BUSY);
	}
	/* Undone at state_change_exit, which every later path goes through */
	MDI_PHCI_UNSTABLE(ph);
	i_mdi_phci_unlock(ph);

	/*
	 * Check if mdi_pathinfo state is in transient state.
	 * If yes, offlining is in progress and wait till transient state is
	 * cleared.
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		while (MDI_PI_IS_TRANSIENT(pip)) {
			cv_wait(&MDI_PI(pip)->pi_state_cv,
			    &MDI_PI(pip)->pi_mutex);
		}
	}

	/*
	 * Grab the client lock in reverse order sequence and release the
	 * mdi_pathinfo mutex.
	 */
	i_mdi_client_lock(ct, pip);
	MDI_PI_UNLOCK(pip);

	/*
	 * Wait till failover state is cleared
	 */
	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);

	/*
	 * Mark the mdi_pathinfo node state as transient
	 */
	MDI_PI_LOCK(pip);
	switch (state) {
	case MDI_PATHINFO_STATE_ONLINE:
		MDI_PI_SET_ONLINING(pip);
		break;

	case MDI_PATHINFO_STATE_STANDBY:
		MDI_PI_SET_STANDBYING(pip);
		break;

	case MDI_PATHINFO_STATE_FAULT:
		/*
		 * Mark the pathinfo state as FAULTED
		 */
		MDI_PI_SET_FAULTING(pip);
		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
		break;

	case MDI_PATHINFO_STATE_OFFLINE:
		/*
		 * ndi_devi_offline() cannot hold pip or ct locks.
		 */
		MDI_PI_UNLOCK(pip);
		/*
		 * Do not offline if path will become last path and path
		 * is busy for user initiated events.
		 */
		cdip = ct->ct_dip;
		if ((flag & NDI_DEVI_REMOVE) &&
		    (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED)) {
			i_mdi_client_unlock(ct);
			rv = ndi_devi_offline(cdip, 0);
			if (rv != NDI_SUCCESS) {
				/*
				 * Convert to MDI error code
				 */
				switch (rv) {
				case NDI_BUSY:
					rv = MDI_BUSY;
					break;
				default:
					rv = MDI_FAILURE;
					break;
				}
				/*
				 * Neither the client nor the pathinfo lock is
				 * held here, so the exit label is safe to use.
				 */
				goto state_change_exit;
			} else {
				i_mdi_client_lock(ct, NULL);
			}
		}
		/*
		 * Mark the mdi_pathinfo node state as transient
		 */
		MDI_PI_LOCK(pip);
		MDI_PI_SET_OFFLINING(pip);
		break;
	}
	MDI_PI_UNLOCK(pip);
	MDI_CLIENT_UNSTABLE(ct);
	i_mdi_client_unlock(ct);

	/* Let the vHCI driver act on the transition, with no locks held */
	f = vh->vh_ops->vo_pi_state_change;
	if (f != NULL) {
		rv = (*f)(vh->vh_dip, pip, state, 0, flag);
		if (rv == MDI_NOT_SUPPORTED) {
			MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
		}
		if (rv != MDI_SUCCESS) {
			MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
			    "!vo_pi_state_change: failed rv = %x", rv));
		}
	}
	MDI_CLIENT_LOCK(ct);
	MDI_PI_LOCK(pip);
	if (MDI_PI_IS_TRANSIENT(pip)) {
		if (rv == MDI_SUCCESS) {
			/* Commit: the path settles in the requested state */
			MDI_PI_CLEAR_TRANSIENT(pip);
		} else {
			/* Roll back to the state saved before the attempt */
			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
		}
	}

	/*
	 * Wake anyone waiting for this mdi_pathinfo node
	 */
	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
	MDI_PI_UNLOCK(pip);

	/*
	 * Mark the client device as stable
	 */
	MDI_CLIENT_STABLE(ct);
	if (rv == MDI_SUCCESS) {
		/* Only update the client once no state change is in flight */
		if (ct->ct_unstable == 0) {
			cdip = ct->ct_dip;

			/*
			 * Onlining the mdi_pathinfo node will impact the
			 * client state Update the client and dev_info node
			 * state accordingly
			 */
			rv = NDI_SUCCESS;
			i_mdi_client_update_state(ct);
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				if (cdip &&
				    (i_ddi_node_state(cdip) < DS_READY) &&
				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
				    (state == MDI_PATHINFO_STATE_STANDBY))) {

					i_mdi_client_unlock(ct);
					/*
					 * Must do ndi_devi_online() through
					 * hotplug thread for deferred
					 * attach mechanism to work
					 */
					rv = ndi_devi_online(cdip, 0);
					i_mdi_client_lock(ct, NULL);
					if ((rv != NDI_SUCCESS) &&
					    (MDI_CLIENT_STATE(ct) ==
					    MDI_CLIENT_STATE_DEGRADED)) {
						/*
						 * ndi_devi_online failed.
						 * Reset client flags to
						 * offline.
						 */
						MDI_DEBUG(1, (CE_WARN, cdip,
						    "!ndi_devi_online: failed "
						    " Error: %x", rv));
						MDI_CLIENT_SET_OFFLINE(ct);
					}
					if (rv != NDI_SUCCESS) {
						/* Reset the path state */
						MDI_PI_LOCK(pip);
						MDI_PI(pip)->pi_state =
						    MDI_PI_OLD_STATE(pip);
						MDI_PI_UNLOCK(pip);
					}
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				/*
				 * This is the last path case for
				 * non-user initiated events.
				 */
				if (((flag & NDI_DEVI_REMOVE) == 0) &&
				    cdip && (i_ddi_node_state(cdip) >=
				    DS_INITIALIZED)) {
					i_mdi_client_unlock(ct);
					rv = ndi_devi_offline(cdip, 0);
					i_mdi_client_lock(ct, NULL);

					if (rv != NDI_SUCCESS) {
						/*
						 * ndi_devi_offline failed.
						 * Reset client flags to
						 * online as the path could not
						 * be offlined.
						 */
						MDI_DEBUG(1, (CE_WARN, cdip,
						    "!ndi_devi_offline: failed "
						    " Error: %x", rv));
						MDI_CLIENT_SET_ONLINE(ct);
					}
				}
				break;
			}
			/*
			 * Convert to MDI error code
			 */
			switch (rv) {
			case NDI_SUCCESS:
				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
				i_mdi_report_path_state(ct, pip);
				rv = MDI_SUCCESS;
				break;
			case NDI_BUSY:
				rv = MDI_BUSY;
				break;
			default:
				rv = MDI_FAILURE;
				break;
			}
		}
	}
	MDI_CLIENT_UNLOCK(ct);

state_change_exit:
	/*
	 * Mark the pHCI as stable again.
	 */
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_STABLE(ph);
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}
3260 
3261 /*
3262  * mdi_pi_online():
3263  *		Place the path_info node in the online state.  The path is
3264  *		now available to be selected by mdi_select_path() for
3265  *		transporting I/O requests to client devices.
3266  * Return Values:
3267  *		MDI_SUCCESS
3268  *		MDI_FAILURE
3269  */
3270 int
3271 mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3272 {
3273 	mdi_client_t *ct = MDI_PI(pip)->pi_client;
3274 	dev_info_t *cdip;
3275 	int		client_held = 0;
3276 	int rv;
3277 
3278 	ASSERT(ct != NULL);
3279 	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
3280 	if (rv != MDI_SUCCESS)
3281 		return (rv);
3282 
3283 	MDI_PI_LOCK(pip);
3284 	if (MDI_PI(pip)->pi_pm_held == 0) {
3285 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_online "
3286 		    "i_mdi_pm_hold_pip\n"));
3287 		i_mdi_pm_hold_pip(pip);
3288 		client_held = 1;
3289 	}
3290 	MDI_PI_UNLOCK(pip);
3291 
3292 	if (client_held) {
3293 		MDI_CLIENT_LOCK(ct);
3294 		if (ct->ct_power_cnt == 0) {
3295 			rv = i_mdi_power_all_phci(ct);
3296 		}
3297 
3298 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_online "
3299 		    "i_mdi_pm_hold_client\n"));
3300 		i_mdi_pm_hold_client(ct, 1);
3301 		MDI_CLIENT_UNLOCK(ct);
3302 	}
3303 
3304 	/*
3305 	 * Create the per-path (pathinfo) IO and error kstats which
3306 	 * are reported via iostat(1m).
3307 	 *
3308 	 * Defer creating the per-path kstats if device is not yet
3309 	 * attached;  the names of the kstats are constructed in part
3310 	 * using the devices instance number which is assigned during
3311 	 * process of attaching the client device.
3312 	 *
3313 	 * The framework post_attach handler, mdi_post_attach(), is
3314 	 * is responsible for initializing the client's pathinfo list
3315 	 * once successfully attached.
3316 	 */
3317 	cdip = ct->ct_dip;
3318 	ASSERT(cdip);
3319 	if (cdip == NULL || (i_ddi_node_state(cdip) < DS_ATTACHED))
3320 		return (rv);
3321 
3322 	MDI_CLIENT_LOCK(ct);
3323 	rv = i_mdi_pi_kstat_create(pip);
3324 	MDI_CLIENT_UNLOCK(ct);
3325 	return (rv);
3326 }
3327 
3328 /*
3329  * mdi_pi_standby():
3330  *		Place the mdi_pathinfo node in standby state
3331  *
3332  * Return Values:
3333  *		MDI_SUCCESS
3334  *		MDI_FAILURE
3335  */
3336 int
3337 mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
3338 {
3339 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
3340 }
3341 
3342 /*
3343  * mdi_pi_fault():
3344  *		Place the mdi_pathinfo node in fault'ed state
3345  * Return Values:
3346  *		MDI_SUCCESS
3347  *		MDI_FAILURE
3348  */
3349 int
3350 mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
3351 {
3352 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
3353 }
3354 
3355 /*
3356  * mdi_pi_offline():
3357  *		Offline a mdi_pathinfo node.
3358  * Return Values:
3359  *		MDI_SUCCESS
3360  *		MDI_FAILURE
3361  */
3362 int
3363 mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3364 {
3365 	int	ret, client_held = 0;
3366 	mdi_client_t	*ct;
3367 
3368 	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
3369 
3370 	if (ret == MDI_SUCCESS) {
3371 		MDI_PI_LOCK(pip);
3372 		if (MDI_PI(pip)->pi_pm_held) {
3373 			client_held = 1;
3374 		}
3375 		MDI_PI_UNLOCK(pip);
3376 
3377 		if (client_held) {
3378 			ct = MDI_PI(pip)->pi_client;
3379 			MDI_CLIENT_LOCK(ct);
3380 			MDI_DEBUG(4, (CE_NOTE, ct->ct_dip,
3381 			    "mdi_pi_offline i_mdi_pm_rele_client\n"));
3382 			i_mdi_pm_rele_client(ct, 1);
3383 			MDI_CLIENT_UNLOCK(ct);
3384 		}
3385 	}
3386 
3387 	return (ret);
3388 }
3389 
3390 /*
3391  * i_mdi_pi_offline():
3392  *		Offline a mdi_pathinfo node and call the vHCI driver's callback
3393  */
3394 static int
3395 i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3396 {
3397 	dev_info_t	*vdip = NULL;
3398 	mdi_vhci_t	*vh = NULL;
3399 	mdi_client_t	*ct = NULL;
3400 	int		(*f)();
3401 	int		rv;
3402 
3403 	MDI_PI_LOCK(pip);
3404 	ct = MDI_PI(pip)->pi_client;
3405 	ASSERT(ct != NULL);
3406 
3407 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3408 		/*
3409 		 * Give a chance for pending I/Os to complete.
3410 		 */
3411 		MDI_DEBUG(1, (CE_NOTE, vdip, "!i_mdi_pi_offline: "
3412 		    "%d cmds still pending on path: %p\n",
3413 		    MDI_PI(pip)->pi_ref_cnt, pip));
3414 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
3415 		    &MDI_PI(pip)->pi_mutex,
3416 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
3417 			/*
3418 			 * The timeout time reached without ref_cnt being zero
3419 			 * being signaled.
3420 			 */
3421 			MDI_DEBUG(1, (CE_NOTE, vdip, "!i_mdi_pi_offline: "
3422 			    "Timeout reached on path %p without the cond\n",
3423 			    pip));
3424 			MDI_DEBUG(1, (CE_NOTE, vdip, "!i_mdi_pi_offline: "
3425 			    "%d cmds still pending on path: %p\n",
3426 			    MDI_PI(pip)->pi_ref_cnt, pip));
3427 		}
3428 	}
3429 	vh = ct->ct_vhci;
3430 	vdip = vh->vh_dip;
3431 
3432 	/*
3433 	 * Notify vHCI that has registered this event
3434 	 */
3435 	ASSERT(vh->vh_ops);
3436 	f = vh->vh_ops->vo_pi_state_change;
3437 
3438 	if (f != NULL) {
3439 		MDI_PI_UNLOCK(pip);
3440 		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
3441 		    flags)) != MDI_SUCCESS) {
3442 			MDI_DEBUG(1, (CE_WARN, vdip, "!vo_path_offline failed "
3443 			    "vdip 0x%x, pip 0x%x", vdip, pip));
3444 		}
3445 		MDI_PI_LOCK(pip);
3446 	}
3447 
3448 	/*
3449 	 * Set the mdi_pathinfo node state and clear the transient condition
3450 	 */
3451 	MDI_PI_SET_OFFLINE(pip);
3452 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3453 	MDI_PI_UNLOCK(pip);
3454 
3455 	MDI_CLIENT_LOCK(ct);
3456 	if (rv == MDI_SUCCESS) {
3457 		if (ct->ct_unstable == 0) {
3458 			dev_info_t	*cdip = ct->ct_dip;
3459 
3460 			/*
3461 			 * Onlining the mdi_pathinfo node will impact the
3462 			 * client state Update the client and dev_info node
3463 			 * state accordingly
3464 			 */
3465 			i_mdi_client_update_state(ct);
3466 			rv = NDI_SUCCESS;
3467 			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
3468 				if (cdip &&
3469 				    (i_ddi_node_state(cdip) >=
3470 				    DS_INITIALIZED)) {
3471 					MDI_CLIENT_UNLOCK(ct);
3472 					rv = ndi_devi_offline(cdip, 0);
3473 					MDI_CLIENT_LOCK(ct);
3474 					if (rv != NDI_SUCCESS) {
3475 						/*
3476 						 * ndi_devi_offline failed.
3477 						 * Reset client flags to
3478 						 * online.
3479 						 */
3480 						MDI_DEBUG(4, (CE_WARN, cdip,
3481 						    "!ndi_devi_offline: failed "
3482 						    " Error: %x", rv));
3483 						MDI_CLIENT_SET_ONLINE(ct);
3484 					}
3485 				}
3486 			}
3487 			/*
3488 			 * Convert to MDI error code
3489 			 */
3490 			switch (rv) {
3491 			case NDI_SUCCESS:
3492 				rv = MDI_SUCCESS;
3493 				break;
3494 			case NDI_BUSY:
3495 				rv = MDI_BUSY;
3496 				break;
3497 			default:
3498 				rv = MDI_FAILURE;
3499 				break;
3500 			}
3501 		}
3502 		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3503 		i_mdi_report_path_state(ct, pip);
3504 	}
3505 
3506 	MDI_CLIENT_UNLOCK(ct);
3507 
3508 	/*
3509 	 * Change in the mdi_pathinfo node state will impact the client state
3510 	 */
3511 	MDI_DEBUG(2, (CE_NOTE, NULL, "!i_mdi_pi_offline ct = %p pip = %p",
3512 	    ct, pip));
3513 	return (rv);
3514 }
3515 
3516 
3517 /*
3518  * mdi_pi_get_addr():
3519  *		Get the unit address associated with a mdi_pathinfo node
3520  *
3521  * Return Values:
3522  *		char *
3523  */
3524 char *
3525 mdi_pi_get_addr(mdi_pathinfo_t *pip)
3526 {
3527 	if (pip == NULL)
3528 		return (NULL);
3529 
3530 	return (MDI_PI(pip)->pi_addr);
3531 }
3532 
3533 /*
3534  * mdi_pi_get_client():
3535  *		Get the client devinfo associated with a mdi_pathinfo node
3536  *
3537  * Return Values:
3538  *		Handle to client device dev_info node
3539  */
3540 dev_info_t *
3541 mdi_pi_get_client(mdi_pathinfo_t *pip)
3542 {
3543 	dev_info_t	*dip = NULL;
3544 	if (pip) {
3545 		dip = MDI_PI(pip)->pi_client->ct_dip;
3546 	}
3547 	return (dip);
3548 }
3549 
3550 /*
3551  * mdi_pi_get_phci():
3552  *		Get the pHCI devinfo associated with the mdi_pathinfo node
3553  * Return Values:
3554  *		Handle to dev_info node
3555  */
3556 dev_info_t *
3557 mdi_pi_get_phci(mdi_pathinfo_t *pip)
3558 {
3559 	dev_info_t	*dip = NULL;
3560 	if (pip) {
3561 		dip = MDI_PI(pip)->pi_phci->ph_dip;
3562 	}
3563 	return (dip);
3564 }
3565 
3566 /*
3567  * mdi_pi_get_client_private():
3568  *		Get the client private information associated with the
3569  *		mdi_pathinfo node
3570  */
3571 void *
3572 mdi_pi_get_client_private(mdi_pathinfo_t *pip)
3573 {
3574 	void *cprivate = NULL;
3575 	if (pip) {
3576 		cprivate = MDI_PI(pip)->pi_cprivate;
3577 	}
3578 	return (cprivate);
3579 }
3580 
3581 /*
3582  * mdi_pi_set_client_private():
3583  *		Set the client private information in the mdi_pathinfo node
3584  */
3585 void
3586 mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
3587 {
3588 	if (pip) {
3589 		MDI_PI(pip)->pi_cprivate = priv;
3590 	}
3591 }
3592 
3593 /*
3594  * mdi_pi_get_phci_private():
3595  *		Get the pHCI private information associated with the
3596  *		mdi_pathinfo node
3597  */
3598 caddr_t
3599 mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
3600 {
3601 	caddr_t	pprivate = NULL;
3602 	if (pip) {
3603 		pprivate = MDI_PI(pip)->pi_pprivate;
3604 	}
3605 	return (pprivate);
3606 }
3607 
3608 /*
3609  * mdi_pi_set_phci_private():
3610  *		Set the pHCI private information in the mdi_pathinfo node
3611  */
3612 void
3613 mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
3614 {
3615 	if (pip) {
3616 		MDI_PI(pip)->pi_pprivate = priv;
3617 	}
3618 }
3619 
3620 /*
3621  * mdi_pi_get_state():
3622  *		Get the mdi_pathinfo node state. Transient states are internal
3623  *		and not provided to the users
3624  */
3625 mdi_pathinfo_state_t
3626 mdi_pi_get_state(mdi_pathinfo_t *pip)
3627 {
3628 	mdi_pathinfo_state_t    state = MDI_PATHINFO_STATE_INIT;
3629 
3630 	if (pip) {
3631 		if (MDI_PI_IS_TRANSIENT(pip)) {
3632 			/*
3633 			 * mdi_pathinfo is in state transition.  Return the
3634 			 * last good state.
3635 			 */
3636 			state = MDI_PI_OLD_STATE(pip);
3637 		} else {
3638 			state = MDI_PI_STATE(pip);
3639 		}
3640 	}
3641 	return (state);
3642 }
3643 
3644 /*
3645  * Note that the following function needs to be the new interface for
3646  * mdi_pi_get_state when mpxio gets integrated to ON.
3647  */
3648 int
3649 mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
3650 		uint32_t *ext_state)
3651 {
3652 	*state = MDI_PATHINFO_STATE_INIT;
3653 
3654 	if (pip) {
3655 		if (MDI_PI_IS_TRANSIENT(pip)) {
3656 			/*
3657 			 * mdi_pathinfo is in state transition.  Return the
3658 			 * last good state.
3659 			 */
3660 			*state = MDI_PI_OLD_STATE(pip);
3661 			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
3662 		} else {
3663 			*state = MDI_PI_STATE(pip);
3664 			*ext_state = MDI_PI_EXT_STATE(pip);
3665 		}
3666 	}
3667 	return (MDI_SUCCESS);
3668 }
3669 
3670 /*
3671  * mdi_pi_get_preferred:
3672  *	Get the preferred path flag
3673  */
3674 int
3675 mdi_pi_get_preferred(mdi_pathinfo_t *pip)
3676 {
3677 	if (pip) {
3678 		return (MDI_PI(pip)->pi_preferred);
3679 	}
3680 	return (0);
3681 }
3682 
3683 /*
3684  * mdi_pi_set_preferred:
3685  *	Set the preferred path flag
3686  */
3687 void
3688 mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
3689 {
3690 	if (pip) {
3691 		MDI_PI(pip)->pi_preferred = preferred;
3692 	}
3693 }
3694 
3695 
3696 /*
3697  * mdi_pi_set_state():
3698  *		Set the mdi_pathinfo node state
3699  */
3700 void
3701 mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
3702 {
3703 	uint32_t	ext_state;
3704 
3705 	if (pip) {
3706 		ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
3707 		MDI_PI(pip)->pi_state = state;
3708 		MDI_PI(pip)->pi_state |= ext_state;
3709 	}
3710 }
3711 
3712 /*
3713  * Property functions:
3714  */
3715 
3716 int
3717 i_map_nvlist_error_to_mdi(int val)
3718 {
3719 	int rv;
3720 
3721 	switch (val) {
3722 	case 0:
3723 		rv = DDI_PROP_SUCCESS;
3724 		break;
3725 	case EINVAL:
3726 	case ENOTSUP:
3727 		rv = DDI_PROP_INVAL_ARG;
3728 		break;
3729 	case ENOMEM:
3730 		rv = DDI_PROP_NO_MEMORY;
3731 		break;
3732 	default:
3733 		rv = DDI_PROP_NOT_FOUND;
3734 		break;
3735 	}
3736 	return (rv);
3737 }
3738 
3739 /*
3740  * mdi_pi_get_next_prop():
3741  * 		Property walk function.  The caller should hold mdi_pi_lock()
3742  *		and release by calling mdi_pi_unlock() at the end of walk to
3743  *		get a consistent value.
3744  */
3745 
3746 nvpair_t *
3747 mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
3748 {
3749 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
3750 		return (NULL);
3751 	}
3752 	ASSERT(MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3753 	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
3754 }
3755 
3756 /*
3757  * mdi_prop_remove():
3758  * 		Remove the named property from the named list.
3759  */
3760 
3761 int
3762 mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
3763 {
3764 	if (pip == NULL) {
3765 		return (DDI_PROP_NOT_FOUND);
3766 	}
3767 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3768 	MDI_PI_LOCK(pip);
3769 	if (MDI_PI(pip)->pi_prop == NULL) {
3770 		MDI_PI_UNLOCK(pip);
3771 		return (DDI_PROP_NOT_FOUND);
3772 	}
3773 	if (name) {
3774 		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
3775 	} else {
3776 		char		nvp_name[MAXNAMELEN];
3777 		nvpair_t	*nvp;
3778 		nvp = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
3779 		while (nvp) {
3780 			nvpair_t	*next;
3781 			next = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, nvp);
3782 			(void) snprintf(nvp_name, MAXNAMELEN, "%s",
3783 			    nvpair_name(nvp));
3784 			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
3785 			    nvp_name);
3786 			nvp = next;
3787 		}
3788 	}
3789 	MDI_PI_UNLOCK(pip);
3790 	return (DDI_PROP_SUCCESS);
3791 }
3792 
3793 /*
3794  * mdi_prop_size():
3795  * 		Get buffer size needed to pack the property data.
3796  * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
3797  *		buffer size.
3798  */
3799 
3800 int
3801 mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
3802 {
3803 	int	rv;
3804 	size_t	bufsize;
3805 
3806 	*buflenp = 0;
3807 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
3808 		return (DDI_PROP_NOT_FOUND);
3809 	}
3810 	ASSERT(MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3811 	rv = nvlist_size(MDI_PI(pip)->pi_prop,
3812 	    &bufsize, NV_ENCODE_NATIVE);
3813 	*buflenp = bufsize;
3814 	return (i_map_nvlist_error_to_mdi(rv));
3815 }
3816 
3817 /*
3818  * mdi_prop_pack():
3819  * 		pack the property list.  The caller should hold the
3820  *		mdi_pathinfo_t node to get a consistent data
3821  */
3822 
3823 int
3824 mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
3825 {
3826 	int	rv;
3827 	size_t	bufsize;
3828 
3829 	if ((pip == NULL) || MDI_PI(pip)->pi_prop == NULL) {
3830 		return (DDI_PROP_NOT_FOUND);
3831 	}
3832 
3833 	ASSERT(MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3834 
3835 	bufsize = buflen;
3836 	rv = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, (size_t *)&bufsize,
3837 	    NV_ENCODE_NATIVE, KM_SLEEP);
3838 
3839 	return (i_map_nvlist_error_to_mdi(rv));
3840 }
3841 
3842 /*
3843  * mdi_prop_update_byte():
3844  *		Create/Update a byte property
3845  */
3846 int
3847 mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
3848 {
3849 	int rv;
3850 
3851 	if (pip == NULL) {
3852 		return (DDI_PROP_INVAL_ARG);
3853 	}
3854 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3855 	MDI_PI_LOCK(pip);
3856 	if (MDI_PI(pip)->pi_prop == NULL) {
3857 		MDI_PI_UNLOCK(pip);
3858 		return (DDI_PROP_NOT_FOUND);
3859 	}
3860 	rv = nvlist_add_byte(MDI_PI(pip)->pi_prop, name, data);
3861 	MDI_PI_UNLOCK(pip);
3862 	return (i_map_nvlist_error_to_mdi(rv));
3863 }
3864 
3865 /*
3866  * mdi_prop_update_byte_array():
3867  *		Create/Update a byte array property
3868  */
3869 int
3870 mdi_prop_update_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t *data,
3871     uint_t nelements)
3872 {
3873 	int rv;
3874 
3875 	if (pip == NULL) {
3876 		return (DDI_PROP_INVAL_ARG);
3877 	}
3878 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3879 	MDI_PI_LOCK(pip);
3880 	if (MDI_PI(pip)->pi_prop == NULL) {
3881 		MDI_PI_UNLOCK(pip);
3882 		return (DDI_PROP_NOT_FOUND);
3883 	}
3884 	rv = nvlist_add_byte_array(MDI_PI(pip)->pi_prop, name, data, nelements);
3885 	MDI_PI_UNLOCK(pip);
3886 	return (i_map_nvlist_error_to_mdi(rv));
3887 }
3888 
3889 /*
3890  * mdi_prop_update_int():
3891  *		Create/Update a 32 bit integer property
3892  */
3893 int
3894 mdi_prop_update_int(mdi_pathinfo_t *pip, char *name, int data)
3895 {
3896 	int rv;
3897 
3898 	if (pip == NULL) {
3899 		return (DDI_PROP_INVAL_ARG);
3900 	}
3901 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3902 	MDI_PI_LOCK(pip);
3903 	if (MDI_PI(pip)->pi_prop == NULL) {
3904 		MDI_PI_UNLOCK(pip);
3905 		return (DDI_PROP_NOT_FOUND);
3906 	}
3907 	rv = nvlist_add_int32(MDI_PI(pip)->pi_prop, name, (int32_t)data);
3908 	MDI_PI_UNLOCK(pip);
3909 	return (i_map_nvlist_error_to_mdi(rv));
3910 }
3911 
3912 /*
3913  * mdi_prop_update_int64():
3914  *		Create/Update a 64 bit integer property
3915  */
3916 int
3917 mdi_prop_update_int64(mdi_pathinfo_t *pip, char *name, int64_t data)
3918 {
3919 	int rv;
3920 
3921 	if (pip == NULL) {
3922 		return (DDI_PROP_INVAL_ARG);
3923 	}
3924 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3925 	MDI_PI_LOCK(pip);
3926 	if (MDI_PI(pip)->pi_prop == NULL) {
3927 		MDI_PI_UNLOCK(pip);
3928 		return (DDI_PROP_NOT_FOUND);
3929 	}
3930 	rv = nvlist_add_int64(MDI_PI(pip)->pi_prop, name, data);
3931 	MDI_PI_UNLOCK(pip);
3932 	return (i_map_nvlist_error_to_mdi(rv));
3933 }
3934 
3935 /*
3936  * mdi_prop_update_int_array():
3937  *		Create/Update a int array property
3938  */
3939 int
3940 mdi_prop_update_int_array(mdi_pathinfo_t *pip, char *name, int *data,
3941 	    uint_t nelements)
3942 {
3943 	int rv;
3944 
3945 	if (pip == NULL) {
3946 		return (DDI_PROP_INVAL_ARG);
3947 	}
3948 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3949 	MDI_PI_LOCK(pip);
3950 	if (MDI_PI(pip)->pi_prop == NULL) {
3951 		MDI_PI_UNLOCK(pip);
3952 		return (DDI_PROP_NOT_FOUND);
3953 	}
3954 	rv = nvlist_add_int32_array(MDI_PI(pip)->pi_prop, name, (int32_t *)data,
3955 	    nelements);
3956 	MDI_PI_UNLOCK(pip);
3957 	return (i_map_nvlist_error_to_mdi(rv));
3958 }
3959 
3960 /*
3961  * mdi_prop_update_string():
3962  *		Create/Update a string property
3963  */
3964 int
3965 mdi_prop_update_string(mdi_pathinfo_t *pip, char *name, char *data)
3966 {
3967 	int rv;
3968 
3969 	if (pip == NULL) {
3970 		return (DDI_PROP_INVAL_ARG);
3971 	}
3972 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3973 	MDI_PI_LOCK(pip);
3974 	if (MDI_PI(pip)->pi_prop == NULL) {
3975 		MDI_PI_UNLOCK(pip);
3976 		return (DDI_PROP_NOT_FOUND);
3977 	}
3978 	rv = nvlist_add_string(MDI_PI(pip)->pi_prop, name, data);
3979 	MDI_PI_UNLOCK(pip);
3980 	return (i_map_nvlist_error_to_mdi(rv));
3981 }
3982 
3983 /*
3984  * mdi_prop_update_string_array():
3985  *		Create/Update a string array property
3986  */
3987 int
3988 mdi_prop_update_string_array(mdi_pathinfo_t *pip, char *name, char **data,
3989     uint_t nelements)
3990 {
3991 	int rv;
3992 
3993 	if (pip == NULL) {
3994 		return (DDI_PROP_INVAL_ARG);
3995 	}
3996 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3997 	MDI_PI_LOCK(pip);
3998 	if (MDI_PI(pip)->pi_prop == NULL) {
3999 		MDI_PI_UNLOCK(pip);
4000 		return (DDI_PROP_NOT_FOUND);
4001 	}
4002 	rv = nvlist_add_string_array(MDI_PI(pip)->pi_prop, name, data,
4003 	    nelements);
4004 	MDI_PI_UNLOCK(pip);
4005 	return (i_map_nvlist_error_to_mdi(rv));
4006 }
4007 
4008 /*
4009  * mdi_prop_lookup_byte():
4010  * 		Look for byte property identified by name.  The data returned
4011  *		is the actual property and valid as long as mdi_pathinfo_t node
4012  *		is alive.
4013  */
4014 int
4015 mdi_prop_lookup_byte(mdi_pathinfo_t *pip, char *name, uchar_t *data)
4016 {
4017 	int rv;
4018 
4019 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4020 		return (DDI_PROP_NOT_FOUND);
4021 	}
4022 	rv = nvlist_lookup_byte(MDI_PI(pip)->pi_prop, name, data);
4023 	return (i_map_nvlist_error_to_mdi(rv));
4024 }
4025 
4026 
4027 /*
4028  * mdi_prop_lookup_byte_array():
4029  * 		Look for byte array property identified by name.  The data
4030  *		returned is the actual property and valid as long as
4031  *		mdi_pathinfo_t node is alive.
4032  */
4033 int
4034 mdi_prop_lookup_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t **data,
4035     uint_t *nelements)
4036 {
4037 	int rv;
4038 
4039 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4040 		return (DDI_PROP_NOT_FOUND);
4041 	}
4042 	rv = nvlist_lookup_byte_array(MDI_PI(pip)->pi_prop, name, data,
4043 	    nelements);
4044 	return (i_map_nvlist_error_to_mdi(rv));
4045 }
4046 
4047 /*
4048  * mdi_prop_lookup_int():
4049  * 		Look for int property identified by name.  The data returned
4050  *		is the actual property and valid as long as mdi_pathinfo_t
4051  *		node is alive.
4052  */
4053 int
4054 mdi_prop_lookup_int(mdi_pathinfo_t *pip, char *name, int *data)
4055 {
4056 	int rv;
4057 
4058 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4059 		return (DDI_PROP_NOT_FOUND);
4060 	}
4061 	rv = nvlist_lookup_int32(MDI_PI(pip)->pi_prop, name, (int32_t *)data);
4062 	return (i_map_nvlist_error_to_mdi(rv));
4063 }
4064 
4065 /*
4066  * mdi_prop_lookup_int64():
4067  * 		Look for int64 property identified by name.  The data returned
4068  *		is the actual property and valid as long as mdi_pathinfo_t node
4069  *		is alive.
4070  */
4071 int
4072 mdi_prop_lookup_int64(mdi_pathinfo_t *pip, char *name, int64_t *data)
4073 {
4074 	int rv;
4075 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4076 		return (DDI_PROP_NOT_FOUND);
4077 	}
4078 	rv = nvlist_lookup_int64(MDI_PI(pip)->pi_prop, name, data);
4079 	return (i_map_nvlist_error_to_mdi(rv));
4080 }
4081 
4082 /*
4083  * mdi_prop_lookup_int_array():
4084  * 		Look for int array property identified by name.  The data
4085  *		returned is the actual property and valid as long as
4086  *		mdi_pathinfo_t node is alive.
4087  */
4088 int
4089 mdi_prop_lookup_int_array(mdi_pathinfo_t *pip, char *name, int **data,
4090     uint_t *nelements)
4091 {
4092 	int rv;
4093 
4094 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4095 		return (DDI_PROP_NOT_FOUND);
4096 	}
4097 	rv = nvlist_lookup_int32_array(MDI_PI(pip)->pi_prop, name,
4098 	    (int32_t **)data, nelements);
4099 	return (i_map_nvlist_error_to_mdi(rv));
4100 }
4101 
4102 /*
4103  * mdi_prop_lookup_string():
4104  * 		Look for string property identified by name.  The data
4105  *		returned is the actual property and valid as long as
4106  *		mdi_pathinfo_t node is alive.
4107  */
4108 int
4109 mdi_prop_lookup_string(mdi_pathinfo_t *pip, char *name, char **data)
4110 {
4111 	int rv;
4112 
4113 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4114 		return (DDI_PROP_NOT_FOUND);
4115 	}
4116 	rv = nvlist_lookup_string(MDI_PI(pip)->pi_prop, name, data);
4117 	return (i_map_nvlist_error_to_mdi(rv));
4118 }
4119 
4120 /*
4121  * mdi_prop_lookup_string_array():
4122  * 		Look for string array property identified by name.  The data
4123  *		returned is the actual property and valid as long as
4124  *		mdi_pathinfo_t node is alive.
4125  */
4126 
4127 int
4128 mdi_prop_lookup_string_array(mdi_pathinfo_t *pip, char *name, char ***data,
4129     uint_t *nelements)
4130 {
4131 	int rv;
4132 
4133 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4134 		return (DDI_PROP_NOT_FOUND);
4135 	}
4136 	rv = nvlist_lookup_string_array(MDI_PI(pip)->pi_prop, name, data,
4137 	    nelements);
4138 	return (i_map_nvlist_error_to_mdi(rv));
4139 }
4140 
4141 /*
4142  * mdi_prop_free():
4143  * 		Symmetrical function to ddi_prop_free(). nvlist_lookup_xx()
4144  *		functions return the pointer to actual property data and not a
4145  *		copy of it.  So the data returned is valid as long as
4146  *		mdi_pathinfo_t node is valid.
4147  */
4148 
4149 /*ARGSUSED*/
4150 int
4151 mdi_prop_free(void *data)
4152 {
4153 	return (DDI_PROP_SUCCESS);
4154 }
4155 
4156 /*ARGSUSED*/
4157 static void
4158 i_mdi_report_path_state(mdi_client_t *ct, mdi_pathinfo_t *pip)
4159 {
4160 	char		*phci_path, *ct_path;
4161 	char		*ct_status;
4162 	char		*status;
4163 	dev_info_t	*dip = ct->ct_dip;
4164 	char		lb_buf[64];
4165 
4166 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
4167 	if ((dip == NULL) || (ddi_get_instance(dip) == -1) ||
4168 	    (MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) == 0)) {
4169 		return;
4170 	}
4171 	if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL) {
4172 		ct_status = "optimal";
4173 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
4174 		ct_status = "degraded";
4175 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
4176 		ct_status = "failed";
4177 	} else {
4178 		ct_status = "unknown";
4179 	}
4180 
4181 	if (MDI_PI_IS_OFFLINE(pip)) {
4182 		status = "offline";
4183 	} else if (MDI_PI_IS_ONLINE(pip)) {
4184 		status = "online";
4185 	} else if (MDI_PI_IS_STANDBY(pip)) {
4186 		status = "standby";
4187 	} else if (MDI_PI_IS_FAULT(pip)) {
4188 		status = "faulted";
4189 	} else {
4190 		status = "unknown";
4191 	}
4192 
4193 	if (ct->ct_lb == LOAD_BALANCE_LBA) {
4194 		(void) snprintf(lb_buf, sizeof (lb_buf),
4195 		    "%s, region-size: %d", mdi_load_balance_lba,
4196 			ct->ct_lb_args->region_size);
4197 	} else if (ct->ct_lb == LOAD_BALANCE_NONE) {
4198 		(void) snprintf(lb_buf, sizeof (lb_buf),
4199 		    "%s", mdi_load_balance_none);
4200 	} else {
4201 		(void) snprintf(lb_buf, sizeof (lb_buf), "%s",
4202 		    mdi_load_balance_rr);
4203 	}
4204 
4205 	if (dip) {
4206 		ct_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4207 		phci_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4208 		cmn_err(CE_CONT, "?%s (%s%d) multipath status: %s, "
4209 		    "path %s (%s%d) to target address: %s is %s"
4210 		    " Load balancing: %s\n",
4211 		    ddi_pathname(dip, ct_path), ddi_driver_name(dip),
4212 		    ddi_get_instance(dip), ct_status,
4213 		    ddi_pathname(MDI_PI(pip)->pi_phci->ph_dip, phci_path),
4214 		    ddi_driver_name(MDI_PI(pip)->pi_phci->ph_dip),
4215 		    ddi_get_instance(MDI_PI(pip)->pi_phci->ph_dip),
4216 		    MDI_PI(pip)->pi_addr, status, lb_buf);
4217 		kmem_free(phci_path, MAXPATHLEN);
4218 		kmem_free(ct_path, MAXPATHLEN);
4219 		MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct);
4220 	}
4221 }
4222 
#ifdef	DEBUG
/*
 * i_mdi_log():
 *		Utility function for error message management (DEBUG
 *		kernels only).  Formats the message, prefixes it with
 *		"<node name><instance>:" when a dev_info node is given and
 *		hands it to cmn_err().  A leading '!', '?' or '^' in the
 *		formatted message is moved to the front of the cmn_err()
 *		format string.  CE_NOTE messages are issued as CE_CONT.
 */

/*VARARGS3*/
static void
i_mdi_log(int level, dev_info_t *dip, const char *fmt, ...)
{
	char		msg[MAXNAMELEN];
	char		prefix[MAXNAMELEN];
	va_list		args;
	char		route;

	/*
	 * Build the "<name><instance>:" prefix.  This is done before level
	 * is rewritten below, so CE_NOTE still gets the newline variant.
	 */
	if (dip == NULL) {
		prefix[0] = '\0';
	} else if (level == CE_PANIC || level == CE_WARN ||
	    level == CE_NOTE) {
		(void) snprintf(prefix, MAXNAMELEN, "%s%d:\n",
		    ddi_node_name(dip), ddi_get_instance(dip));
	} else {
		(void) snprintf(prefix, MAXNAMELEN, "%s%d:",
		    ddi_node_name(dip), ddi_get_instance(dip));
	}

	va_start(args, fmt);
	(void) vsnprintf(msg, MAXNAMELEN, fmt, args);
	va_end(args);

	/* First character of the message may select the destination */
	route = msg[0];

	if (level == CE_NOTE) {
		level = CE_CONT;
	}

	if (level != CE_CONT && level != CE_WARN && level != CE_PANIC) {
		/* Any other level: pass the message through untouched */
		cmn_err(level, "%s\t%s", prefix, msg);
		return;
	}

	switch (route) {
	case '?':
		/* boot only */
		cmn_err(level, "?%s\t%s", prefix, &msg[1]);
		break;
	case '^':
		/* console only */
		cmn_err(level, "^%s\t%s", prefix, &msg[1]);
		break;
	case '!':
		/* log only */
		cmn_err(level, "!%s\t%s", prefix, &msg[1]);
		break;
	default:
		cmn_err(level, "%s\t%s", prefix, msg);
		break;
	}
}
#endif	/* DEBUG */
4292 
4293 void
4294 i_mdi_client_online(dev_info_t *ct_dip)
4295 {
4296 	mdi_client_t	*ct;
4297 
4298 	/*
4299 	 * Client online notification. Mark client state as online
4300 	 * restore our binding with dev_info node
4301 	 */
4302 	ct = i_devi_get_client(ct_dip);
4303 	ASSERT(ct != NULL);
4304 	MDI_CLIENT_LOCK(ct);
4305 	MDI_CLIENT_SET_ONLINE(ct);
4306 	/* catch for any memory leaks */
4307 	ASSERT((ct->ct_dip == NULL) || (ct->ct_dip == ct_dip));
4308 	ct->ct_dip = ct_dip;
4309 
4310 	if (ct->ct_power_cnt == 0)
4311 		(void) i_mdi_power_all_phci(ct);
4312 
4313 	MDI_DEBUG(4, (CE_NOTE, ct_dip, "i_mdi_client_online "
4314 	    "i_mdi_pm_hold_client\n"));
4315 	i_mdi_pm_hold_client(ct, 1);
4316 
4317 	MDI_CLIENT_UNLOCK(ct);
4318 }
4319 
4320 void
4321 i_mdi_phci_online(dev_info_t *ph_dip)
4322 {
4323 	mdi_phci_t	*ph;
4324 
4325 	/* pHCI online notification. Mark state accordingly */
4326 	ph = i_devi_get_phci(ph_dip);
4327 	ASSERT(ph != NULL);
4328 	MDI_PHCI_LOCK(ph);
4329 	MDI_PHCI_SET_ONLINE(ph);
4330 	MDI_PHCI_UNLOCK(ph);
4331 }
4332 
4333 /*
4334  * mdi_devi_online():
4335  * 		Online notification from NDI framework on pHCI/client
4336  *		device online.
4337  * Return Values:
4338  *		NDI_SUCCESS
4339  *		MDI_FAILURE
4340  */
4341 
4342 /*ARGSUSED*/
4343 int
4344 mdi_devi_online(dev_info_t *dip, uint_t flags)
4345 {
4346 	if (MDI_PHCI(dip)) {
4347 		i_mdi_phci_online(dip);
4348 	}
4349 
4350 	if (MDI_CLIENT(dip)) {
4351 		i_mdi_client_online(dip);
4352 	}
4353 	return (NDI_SUCCESS);
4354 }
4355 
4356 /*
4357  * mdi_devi_offline():
4358  * 		Offline notification from NDI framework on pHCI/Client device
4359  *		offline.
4360  *
4361  * Return Values:
4362  *		NDI_SUCCESS
4363  *		NDI_FAILURE
4364  */
4365 
4366 /*ARGSUSED*/
4367 int
4368 mdi_devi_offline(dev_info_t *dip, uint_t flags)
4369 {
4370 	int		rv = NDI_SUCCESS;
4371 
4372 	if (MDI_CLIENT(dip)) {
4373 		rv = i_mdi_client_offline(dip, flags);
4374 		if (rv != NDI_SUCCESS)
4375 			return (rv);
4376 	}
4377 
4378 	if (MDI_PHCI(dip)) {
4379 		rv = i_mdi_phci_offline(dip, flags);
4380 		if ((rv != NDI_SUCCESS) && MDI_CLIENT(dip)) {
4381 			/* set client back online */
4382 			i_mdi_client_online(dip);
4383 		}
4384 	}
4385 
4386 	return (rv);
4387 }
4388 
4389 /*ARGSUSED*/
4390 static int
4391 i_mdi_phci_offline(dev_info_t *dip, uint_t flags)
4392 {
4393 	int		rv = NDI_SUCCESS;
4394 	mdi_phci_t	*ph;
4395 	mdi_client_t	*ct;
4396 	mdi_pathinfo_t	*pip;
4397 	mdi_pathinfo_t	*next;
4398 	mdi_pathinfo_t	*failed_pip = NULL;
4399 	dev_info_t	*cdip;
4400 
4401 	/*
4402 	 * pHCI component offline notification
4403 	 * Make sure that this pHCI instance is free to be offlined.
4404 	 * If it is OK to proceed, Offline and remove all the child
4405 	 * mdi_pathinfo nodes.  This process automatically offlines
4406 	 * corresponding client devices, for which this pHCI provides
4407 	 * critical services.
4408 	 */
4409 	MDI_DEBUG(2, (CE_NOTE, dip, "!mdi_phci_offline called %p\n",
4410 	    dip));
4411 
4412 	ph = i_devi_get_phci(dip);
4413 	if (ph == NULL) {
4414 		return (rv);
4415 	}
4416 
4417 	MDI_PHCI_LOCK(ph);
4418 
4419 	if (MDI_PHCI_IS_OFFLINE(ph)) {
4420 		MDI_DEBUG(1, (CE_WARN, dip, "!pHCI %p already offlined", ph));
4421 		MDI_PHCI_UNLOCK(ph);
4422 		return (NDI_SUCCESS);
4423 	}
4424 
4425 	/*
4426 	 * Check to see if the pHCI can be offlined
4427 	 */
4428 	if (ph->ph_unstable) {
4429 		MDI_DEBUG(1, (CE_WARN, dip,
4430 		    "!One or more target devices are in transient "
4431 		    "state. This device can not be removed at "
4432 		    "this moment. Please try again later."));
4433 		MDI_PHCI_UNLOCK(ph);
4434 		return (NDI_BUSY);
4435 	}
4436 
4437 	pip = ph->ph_path_head;
4438 	while (pip != NULL) {
4439 		MDI_PI_LOCK(pip);
4440 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4441 		/*
4442 		 * The mdi_pathinfo state is OK. Check the client state.
4443 		 * If failover in progress fail the pHCI from offlining
4444 		 */
4445 		ct = MDI_PI(pip)->pi_client;
4446 		i_mdi_client_lock(ct, pip);
4447 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
4448 		    (ct->ct_unstable)) {
4449 			/*
4450 			 * Failover is in progress, Fail the DR
4451 			 */
4452 			MDI_DEBUG(1, (CE_WARN, dip,
4453 			    "!pHCI device (%s%d) is Busy. %s",
4454 			    ddi_driver_name(dip), ddi_get_instance(dip),
4455 			    "This device can not be removed at "
4456 			    "this moment. Please try again later."));
4457 			MDI_PI_UNLOCK(pip);
4458 			MDI_CLIENT_UNLOCK(ct);
4459 			MDI_PHCI_UNLOCK(ph);
4460 			return (NDI_BUSY);
4461 		}
4462 		MDI_PI_UNLOCK(pip);
4463 
4464 		/*
4465 		 * Check to see of we are removing the last path of this
4466 		 * client device...
4467 		 */
4468 		cdip = ct->ct_dip;
4469 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
4470 		    (i_mdi_client_compute_state(ct, ph) ==
4471 		    MDI_CLIENT_STATE_FAILED)) {
4472 			i_mdi_client_unlock(ct);
4473 			MDI_PHCI_UNLOCK(ph);
4474 			if (ndi_devi_offline(cdip, 0) != NDI_SUCCESS) {
4475 				/*
4476 				 * ndi_devi_offline() failed.
4477 				 * This pHCI provides the critical path
4478 				 * to one or more client devices.
4479 				 * Return busy.
4480 				 */
4481 				MDI_PHCI_LOCK(ph);
4482 				MDI_DEBUG(1, (CE_WARN, dip,
4483 				    "!pHCI device (%s%d) is Busy. %s",
4484 				    ddi_driver_name(dip), ddi_get_instance(dip),
4485 				    "This device can not be removed at "
4486 				    "this moment. Please try again later."));
4487 				failed_pip = pip;
4488 				break;
4489 			} else {
4490 				MDI_PHCI_LOCK(ph);
4491 				pip = next;
4492 			}
4493 		} else {
4494 			i_mdi_client_unlock(ct);
4495 			pip = next;
4496 		}
4497 	}
4498 
4499 	if (failed_pip) {
4500 		pip = ph->ph_path_head;
4501 		while (pip != failed_pip) {
4502 			MDI_PI_LOCK(pip);
4503 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4504 			ct = MDI_PI(pip)->pi_client;
4505 			i_mdi_client_lock(ct, pip);
4506 			cdip = ct->ct_dip;
4507 			switch (MDI_CLIENT_STATE(ct)) {
4508 			case MDI_CLIENT_STATE_OPTIMAL:
4509 			case MDI_CLIENT_STATE_DEGRADED:
4510 				if (cdip) {
4511 					MDI_PI_UNLOCK(pip);
4512 					i_mdi_client_unlock(ct);
4513 					MDI_PHCI_UNLOCK(ph);
4514 					(void) ndi_devi_online(cdip, 0);
4515 					MDI_PHCI_LOCK(ph);
4516 					pip = next;
4517 					continue;
4518 				}
4519 				break;
4520 
4521 			case MDI_CLIENT_STATE_FAILED:
4522 				if (cdip) {
4523 					MDI_PI_UNLOCK(pip);
4524 					i_mdi_client_unlock(ct);
4525 					MDI_PHCI_UNLOCK(ph);
4526 					(void) ndi_devi_offline(cdip, 0);
4527 					MDI_PHCI_LOCK(ph);
4528 					pip = next;
4529 					continue;
4530 				}
4531 				break;
4532 			}
4533 			MDI_PI_UNLOCK(pip);
4534 			i_mdi_client_unlock(ct);
4535 			pip = next;
4536 		}
4537 		MDI_PHCI_UNLOCK(ph);
4538 		return (NDI_BUSY);
4539 	}
4540 
4541 	/*
4542 	 * Mark the pHCI as offline
4543 	 */
4544 	MDI_PHCI_SET_OFFLINE(ph);
4545 
4546 	/*
4547 	 * Mark the child mdi_pathinfo nodes as transient
4548 	 */
4549 	pip = ph->ph_path_head;
4550 	while (pip != NULL) {
4551 		MDI_PI_LOCK(pip);
4552 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4553 		MDI_PI_SET_OFFLINING(pip);
4554 		MDI_PI_UNLOCK(pip);
4555 		pip = next;
4556 	}
4557 	MDI_PHCI_UNLOCK(ph);
4558 	/*
4559 	 * Give a chance for any pending commands to execute
4560 	 */
4561 	delay(1);
4562 	MDI_PHCI_LOCK(ph);
4563 	pip = ph->ph_path_head;
4564 	while (pip != NULL) {
4565 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4566 		(void) i_mdi_pi_offline(pip, flags);
4567 		MDI_PI_LOCK(pip);
4568 		ct = MDI_PI(pip)->pi_client;
4569 		if (!MDI_PI_IS_OFFLINE(pip)) {
4570 			MDI_DEBUG(1, (CE_WARN, dip,
4571 			    "!pHCI device (%s%d) is Busy. %s",
4572 			    ddi_driver_name(dip), ddi_get_instance(dip),
4573 			    "This device can not be removed at "
4574 			    "this moment. Please try again later."));
4575 			MDI_PI_UNLOCK(pip);
4576 			MDI_PHCI_SET_ONLINE(ph);
4577 			MDI_PHCI_UNLOCK(ph);
4578 			return (NDI_BUSY);
4579 		}
4580 		MDI_PI_UNLOCK(pip);
4581 		pip = next;
4582 	}
4583 	MDI_PHCI_UNLOCK(ph);
4584 
4585 	return (rv);
4586 }
4587 
4588 /*ARGSUSED*/
4589 static int
4590 i_mdi_client_offline(dev_info_t *dip, uint_t flags)
4591 {
4592 	int		rv = NDI_SUCCESS;
4593 	mdi_client_t	*ct;
4594 
4595 	/*
4596 	 * Client component to go offline.  Make sure that we are
4597 	 * not in failing over state and update client state
4598 	 * accordingly
4599 	 */
4600 	MDI_DEBUG(2, (CE_NOTE, dip, "!i_mdi_client_offline called %p\n",
4601 	    dip));
4602 	ct = i_devi_get_client(dip);
4603 	if (ct != NULL) {
4604 		MDI_CLIENT_LOCK(ct);
4605 		if (ct->ct_unstable) {
4606 			/*
4607 			 * One or more paths are in transient state,
4608 			 * Dont allow offline of a client device
4609 			 */
4610 			MDI_DEBUG(1, (CE_WARN, dip,
4611 			    "!One or more paths to this device is "
4612 			    "in transient state. This device can not "
4613 			    "be removed at this moment. "
4614 			    "Please try again later."));
4615 			MDI_CLIENT_UNLOCK(ct);
4616 			return (NDI_BUSY);
4617 		}
4618 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
4619 			/*
4620 			 * Failover is in progress, Dont allow DR of
4621 			 * a client device
4622 			 */
4623 			MDI_DEBUG(1, (CE_WARN, dip,
4624 			    "!Client device (%s%d) is Busy. %s",
4625 			    ddi_driver_name(dip), ddi_get_instance(dip),
4626 			    "This device can not be removed at "
4627 			    "this moment. Please try again later."));
4628 			MDI_CLIENT_UNLOCK(ct);
4629 			return (NDI_BUSY);
4630 		}
4631 		MDI_CLIENT_SET_OFFLINE(ct);
4632 
4633 		/*
4634 		 * Unbind our relationship with the dev_info node
4635 		 */
4636 		if (flags & NDI_DEVI_REMOVE) {
4637 			ct->ct_dip = NULL;
4638 		}
4639 		MDI_CLIENT_UNLOCK(ct);
4640 	}
4641 	return (rv);
4642 }
4643 
4644 /*
4645  * mdi_pre_attach():
4646  *		Pre attach() notification handler
4647  */
4648 
4649 /*ARGSUSED*/
4650 int
4651 mdi_pre_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
4652 {
4653 	/* don't support old DDI_PM_RESUME */
4654 	if ((DEVI(dip)->devi_mdi_component != MDI_COMPONENT_NONE) &&
4655 	    (cmd == DDI_PM_RESUME))
4656 		return (DDI_FAILURE);
4657 
4658 	return (DDI_SUCCESS);
4659 }
4660 
4661 /*
4662  * mdi_post_attach():
4663  *		Post attach() notification handler
4664  */
4665 
4666 /*ARGSUSED*/
4667 void
4668 mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
4669 {
4670 	mdi_phci_t	*ph;
4671 	mdi_client_t	*ct;
4672 	mdi_pathinfo_t	*pip;
4673 
4674 	if (MDI_PHCI(dip)) {
4675 		ph = i_devi_get_phci(dip);
4676 		ASSERT(ph != NULL);
4677 
4678 		MDI_PHCI_LOCK(ph);
4679 		switch (cmd) {
4680 		case DDI_ATTACH:
4681 			MDI_DEBUG(2, (CE_NOTE, dip,
4682 			    "!pHCI post_attach: called %p\n", ph));
4683 			if (error == DDI_SUCCESS) {
4684 				MDI_PHCI_SET_ATTACH(ph);
4685 			} else {
4686 				MDI_DEBUG(1, (CE_NOTE, dip,
4687 				    "!pHCI post_attach: failed error=%d\n",
4688 				    error));
4689 				MDI_PHCI_SET_DETACH(ph);
4690 			}
4691 			break;
4692 
4693 		case DDI_RESUME:
4694 			MDI_DEBUG(2, (CE_NOTE, dip,
4695 			    "!pHCI post_resume: called %p\n", ph));
4696 			if (error == DDI_SUCCESS) {
4697 				MDI_PHCI_SET_RESUME(ph);
4698 			} else {
4699 				MDI_DEBUG(1, (CE_NOTE, dip,
4700 				    "!pHCI post_resume: failed error=%d\n",
4701 				    error));
4702 				MDI_PHCI_SET_SUSPEND(ph);
4703 			}
4704 			break;
4705 		}
4706 		MDI_PHCI_UNLOCK(ph);
4707 	}
4708 
4709 	if (MDI_CLIENT(dip)) {
4710 		ct = i_devi_get_client(dip);
4711 		ASSERT(ct != NULL);
4712 
4713 		MDI_CLIENT_LOCK(ct);
4714 		switch (cmd) {
4715 		case DDI_ATTACH:
4716 			MDI_DEBUG(2, (CE_NOTE, dip,
4717 			    "!Client post_attach: called %p\n", ct));
4718 			if (error != DDI_SUCCESS) {
4719 				MDI_DEBUG(1, (CE_NOTE, dip,
4720 				    "!Client post_attach: failed error=%d\n",
4721 				    error));
4722 				MDI_CLIENT_SET_DETACH(ct);
4723 				MDI_DEBUG(4, (CE_WARN, dip,
4724 				    "mdi_post_attach i_mdi_pm_reset_client\n"));
4725 				i_mdi_pm_reset_client(ct);
4726 				break;
4727 			}
4728 
4729 			/*
4730 			 * Client device has successfully attached.
4731 			 * Create kstats for any pathinfo structures
4732 			 * initially associated with this client.
4733 			 */
4734 			for (pip = ct->ct_path_head; pip != NULL;
4735 			    pip = (mdi_pathinfo_t *)
4736 			    MDI_PI(pip)->pi_client_link) {
4737 				(void) i_mdi_pi_kstat_create(pip);
4738 				i_mdi_report_path_state(ct, pip);
4739 			}
4740 			MDI_CLIENT_SET_ATTACH(ct);
4741 			break;
4742 
4743 		case DDI_RESUME:
4744 			MDI_DEBUG(2, (CE_NOTE, dip,
4745 			    "!Client post_attach: called %p\n", ct));
4746 			if (error == DDI_SUCCESS) {
4747 				MDI_CLIENT_SET_RESUME(ct);
4748 			} else {
4749 				MDI_DEBUG(1, (CE_NOTE, dip,
4750 				    "!Client post_resume: failed error=%d\n",
4751 				    error));
4752 				MDI_CLIENT_SET_SUSPEND(ct);
4753 			}
4754 			break;
4755 		}
4756 		MDI_CLIENT_UNLOCK(ct);
4757 	}
4758 }
4759 
4760 /*
4761  * mdi_pre_detach():
4762  *		Pre detach notification handler
4763  */
4764 
4765 /*ARGSUSED*/
4766 int
4767 mdi_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
4768 {
4769 	int rv = DDI_SUCCESS;
4770 
4771 	if (MDI_CLIENT(dip)) {
4772 		(void) i_mdi_client_pre_detach(dip, cmd);
4773 	}
4774 
4775 	if (MDI_PHCI(dip)) {
4776 		rv = i_mdi_phci_pre_detach(dip, cmd);
4777 	}
4778 
4779 	return (rv);
4780 }
4781 
4782 /*ARGSUSED*/
4783 static int
4784 i_mdi_phci_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
4785 {
4786 	int		rv = DDI_SUCCESS;
4787 	mdi_phci_t	*ph;
4788 	mdi_client_t	*ct;
4789 	mdi_pathinfo_t	*pip;
4790 	mdi_pathinfo_t	*failed_pip = NULL;
4791 	mdi_pathinfo_t	*next;
4792 
4793 	ph = i_devi_get_phci(dip);
4794 	if (ph == NULL) {
4795 		return (rv);
4796 	}
4797 
4798 	MDI_PHCI_LOCK(ph);
4799 	switch (cmd) {
4800 	case DDI_DETACH:
4801 		MDI_DEBUG(2, (CE_NOTE, dip,
4802 		    "!pHCI pre_detach: called %p\n", ph));
4803 		if (!MDI_PHCI_IS_OFFLINE(ph)) {
4804 			/*
4805 			 * mdi_pathinfo nodes are still attached to
4806 			 * this pHCI. Fail the detach for this pHCI.
4807 			 */
4808 			MDI_DEBUG(2, (CE_WARN, dip,
4809 			    "!pHCI pre_detach: "
4810 			    "mdi_pathinfo nodes are still attached "
4811 			    "%p\n", ph));
4812 			rv = DDI_FAILURE;
4813 			break;
4814 		}
4815 		MDI_PHCI_SET_DETACH(ph);
4816 		break;
4817 
4818 	case DDI_SUSPEND:
4819 		/*
4820 		 * pHCI is getting suspended.  Since mpxio client
4821 		 * devices may not be suspended at this point, to avoid
4822 		 * a potential stack overflow, it is important to suspend
4823 		 * client devices before pHCI can be suspended.
4824 		 */
4825 
4826 		MDI_DEBUG(2, (CE_NOTE, dip,
4827 		    "!pHCI pre_suspend: called %p\n", ph));
4828 		/*
4829 		 * Suspend all the client devices accessible through this pHCI
4830 		 */
4831 		pip = ph->ph_path_head;
4832 		while (pip != NULL && rv == DDI_SUCCESS) {
4833 			dev_info_t *cdip;
4834 			MDI_PI_LOCK(pip);
4835 			next =
4836 			    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4837 			ct = MDI_PI(pip)->pi_client;
4838 			i_mdi_client_lock(ct, pip);
4839 			cdip = ct->ct_dip;
4840 			MDI_PI_UNLOCK(pip);
4841 			if ((MDI_CLIENT_IS_DETACHED(ct) == 0) &&
4842 			    MDI_CLIENT_IS_SUSPENDED(ct) == 0) {
4843 				i_mdi_client_unlock(ct);
4844 				if ((rv = devi_detach(cdip, DDI_SUSPEND)) !=
4845 				    DDI_SUCCESS) {
4846 					/*
4847 					 * Suspend of one of the client
4848 					 * device has failed.
4849 					 */
4850 					MDI_DEBUG(1, (CE_WARN, dip,
4851 					    "!Suspend of device (%s%d) failed.",
4852 					    ddi_driver_name(cdip),
4853 					    ddi_get_instance(cdip)));
4854 					failed_pip = pip;
4855 					break;
4856 				}
4857 			} else {
4858 				i_mdi_client_unlock(ct);
4859 			}
4860 			pip = next;
4861 		}
4862 
4863 		if (rv == DDI_SUCCESS) {
4864 			/*
4865 			 * Suspend of client devices is complete. Proceed
4866 			 * with pHCI suspend.
4867 			 */
4868 			MDI_PHCI_SET_SUSPEND(ph);
4869 		} else {
4870 			/*
4871 			 * Revert back all the suspended client device states
4872 			 * to converse.
4873 			 */
4874 			pip = ph->ph_path_head;
4875 			while (pip != failed_pip) {
4876 				dev_info_t *cdip;
4877 				MDI_PI_LOCK(pip);
4878 				next =
4879 				    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4880 				ct = MDI_PI(pip)->pi_client;
4881 				i_mdi_client_lock(ct, pip);
4882 				cdip = ct->ct_dip;
4883 				MDI_PI_UNLOCK(pip);
4884 				if (MDI_CLIENT_IS_SUSPENDED(ct)) {
4885 					i_mdi_client_unlock(ct);
4886 					(void) devi_attach(cdip, DDI_RESUME);
4887 				} else {
4888 					i_mdi_client_unlock(ct);
4889 				}
4890 				pip = next;
4891 			}
4892 		}
4893 		break;
4894 
4895 	default:
4896 		rv = DDI_FAILURE;
4897 		break;
4898 	}
4899 	MDI_PHCI_UNLOCK(ph);
4900 	return (rv);
4901 }
4902 
4903 /*ARGSUSED*/
4904 static int
4905 i_mdi_client_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
4906 {
4907 	int		rv = DDI_SUCCESS;
4908 	mdi_client_t	*ct;
4909 
4910 	ct = i_devi_get_client(dip);
4911 	if (ct == NULL) {
4912 		return (rv);
4913 	}
4914 
4915 	MDI_CLIENT_LOCK(ct);
4916 	switch (cmd) {
4917 	case DDI_DETACH:
4918 		MDI_DEBUG(2, (CE_NOTE, dip,
4919 		    "!Client pre_detach: called %p\n", ct));
4920 		MDI_CLIENT_SET_DETACH(ct);
4921 		break;
4922 
4923 	case DDI_SUSPEND:
4924 		MDI_DEBUG(2, (CE_NOTE, dip,
4925 		    "!Client pre_suspend: called %p\n", ct));
4926 		MDI_CLIENT_SET_SUSPEND(ct);
4927 		break;
4928 
4929 	default:
4930 		rv = DDI_FAILURE;
4931 		break;
4932 	}
4933 	MDI_CLIENT_UNLOCK(ct);
4934 	return (rv);
4935 }
4936 
4937 /*
4938  * mdi_post_detach():
4939  *		Post detach notification handler
4940  */
4941 
4942 /*ARGSUSED*/
4943 void
4944 mdi_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
4945 {
4946 	/*
4947 	 * Detach/Suspend of mpxio component failed. Update our state
4948 	 * too
4949 	 */
4950 	if (MDI_PHCI(dip))
4951 		i_mdi_phci_post_detach(dip, cmd, error);
4952 
4953 	if (MDI_CLIENT(dip))
4954 		i_mdi_client_post_detach(dip, cmd, error);
4955 }
4956 
4957 /*ARGSUSED*/
4958 static void
4959 i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
4960 {
4961 	mdi_phci_t	*ph;
4962 
4963 	/*
4964 	 * Detach/Suspend of phci component failed. Update our state
4965 	 * too
4966 	 */
4967 	ph = i_devi_get_phci(dip);
4968 	if (ph == NULL) {
4969 		return;
4970 	}
4971 
4972 	MDI_PHCI_LOCK(ph);
4973 	/*
4974 	 * Detach of pHCI failed. Restore back converse
4975 	 * state
4976 	 */
4977 	switch (cmd) {
4978 	case DDI_DETACH:
4979 		MDI_DEBUG(2, (CE_NOTE, dip,
4980 		    "!pHCI post_detach: called %p\n", ph));
4981 		if (error != DDI_SUCCESS)
4982 			MDI_PHCI_SET_ATTACH(ph);
4983 		break;
4984 
4985 	case DDI_SUSPEND:
4986 		MDI_DEBUG(2, (CE_NOTE, dip,
4987 		    "!pHCI post_suspend: called %p\n", ph));
4988 		if (error != DDI_SUCCESS)
4989 			MDI_PHCI_SET_RESUME(ph);
4990 		break;
4991 	}
4992 	MDI_PHCI_UNLOCK(ph);
4993 }
4994 
4995 /*ARGSUSED*/
4996 static void
4997 i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
4998 {
4999 	mdi_client_t	*ct;
5000 
5001 	ct = i_devi_get_client(dip);
5002 	if (ct == NULL) {
5003 		return;
5004 	}
5005 	MDI_CLIENT_LOCK(ct);
5006 	/*
5007 	 * Detach of Client failed. Restore back converse
5008 	 * state
5009 	 */
5010 	switch (cmd) {
5011 	case DDI_DETACH:
5012 		MDI_DEBUG(2, (CE_NOTE, dip,
5013 		    "!Client post_detach: called %p\n", ct));
5014 		if (DEVI_IS_ATTACHING(ct->ct_dip)) {
5015 			MDI_DEBUG(4, (CE_NOTE, dip, "i_mdi_client_post_detach "
5016 			    "i_mdi_pm_rele_client\n"));
5017 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
5018 		} else {
5019 			MDI_DEBUG(4, (CE_NOTE, dip, "i_mdi_client_post_detach "
5020 			    "i_mdi_pm_reset_client\n"));
5021 			i_mdi_pm_reset_client(ct);
5022 		}
5023 		if (error != DDI_SUCCESS)
5024 			MDI_CLIENT_SET_ATTACH(ct);
5025 		break;
5026 
5027 	case DDI_SUSPEND:
5028 		MDI_DEBUG(2, (CE_NOTE, dip,
5029 		    "!Client post_suspend: called %p\n", ct));
5030 		if (error != DDI_SUCCESS)
5031 			MDI_CLIENT_SET_RESUME(ct);
5032 		break;
5033 	}
5034 	MDI_CLIENT_UNLOCK(ct);
5035 }
5036 
5037 /*
5038  * create and install per-path (client - pHCI) statistics
5039  * I/O stats supported: nread, nwritten, reads, and writes
5040  * Error stats - hard errors, soft errors, & transport errors
5041  */
5042 static int
5043 i_mdi_pi_kstat_create(mdi_pathinfo_t *pip)
5044 {
5045 
5046 	dev_info_t *client = MDI_PI(pip)->pi_client->ct_dip;
5047 	dev_info_t *ppath = MDI_PI(pip)->pi_phci->ph_dip;
5048 	char ksname[KSTAT_STRLEN];
5049 	mdi_pathinfo_t *cpip;
5050 	const char *err_postfix = ",err";
5051 	kstat_t	*kiosp, *kerrsp;
5052 	struct pi_errs	*nsp;
5053 	struct mdi_pi_kstats *mdi_statp;
5054 
5055 	ASSERT(client != NULL && ppath != NULL);
5056 
5057 	ASSERT(mutex_owned(&(MDI_PI(pip)->pi_client->ct_mutex)));
5058 
5059 	if (MDI_PI(pip)->pi_kstats != NULL)
5060 		return (MDI_SUCCESS);
5061 
5062 	for (cpip = MDI_PI(pip)->pi_client->ct_path_head; cpip != NULL;
5063 	    cpip = (mdi_pathinfo_t *)(MDI_PI(cpip)->pi_client_link)) {
5064 		if (cpip == pip)
5065 			continue;
5066 		/*
5067 		 * We have found a different path with same parent
5068 		 * kstats for a given client-pHCI are common
5069 		 */
5070 		if ((MDI_PI(cpip)->pi_phci->ph_dip == ppath) &&
5071 		    (MDI_PI(cpip)->pi_kstats != NULL)) {
5072 			MDI_PI(cpip)->pi_kstats->pi_kstat_ref++;
5073 			MDI_PI(pip)->pi_kstats = MDI_PI(cpip)->pi_kstats;
5074 			return (MDI_SUCCESS);
5075 		}
5076 	}
5077 
5078 	/*
5079 	 * stats are named as follows: TGTx.HBAy, e.g. "ssd0.fp0"
5080 	 * clamp length of name against max length of error kstat name
5081 	 */
5082 	if (snprintf(ksname, KSTAT_STRLEN, "%s%d.%s%d",
5083 	    ddi_driver_name(client), ddi_get_instance(client),
5084 	    ddi_driver_name(ppath), ddi_get_instance(ppath)) >
5085 	    (KSTAT_STRLEN - strlen(err_postfix))) {
5086 		return (MDI_FAILURE);
5087 	}
5088 	if ((kiosp = kstat_create("mdi", 0, ksname, "iopath",
5089 	    KSTAT_TYPE_IO, 1, 0)) == NULL) {
5090 		return (MDI_FAILURE);
5091 	}
5092 
5093 	(void) strcat(ksname, err_postfix);
5094 	kerrsp = kstat_create("mdi", 0, ksname, "iopath_errors",
5095 	    KSTAT_TYPE_NAMED,
5096 	    sizeof (struct pi_errs) / sizeof (kstat_named_t), 0);
5097 
5098 	if (kerrsp == NULL) {
5099 		kstat_delete(kiosp);
5100 		return (MDI_FAILURE);
5101 	}
5102 
5103 	nsp = (struct pi_errs *)kerrsp->ks_data;
5104 	kstat_named_init(&nsp->pi_softerrs, "Soft Errors", KSTAT_DATA_UINT32);
5105 	kstat_named_init(&nsp->pi_harderrs, "Hard Errors", KSTAT_DATA_UINT32);
5106 	kstat_named_init(&nsp->pi_transerrs, "Transport Errors",
5107 	    KSTAT_DATA_UINT32);
5108 	kstat_named_init(&nsp->pi_icnt_busy, "Interconnect Busy",
5109 	    KSTAT_DATA_UINT32);
5110 	kstat_named_init(&nsp->pi_icnt_errors, "Interconnect Errors",
5111 	    KSTAT_DATA_UINT32);
5112 	kstat_named_init(&nsp->pi_phci_rsrc, "pHCI No Resources",
5113 	    KSTAT_DATA_UINT32);
5114 	kstat_named_init(&nsp->pi_phci_localerr, "pHCI Local Errors",
5115 	    KSTAT_DATA_UINT32);
5116 	kstat_named_init(&nsp->pi_phci_invstate, "pHCI Invalid State",
5117 	    KSTAT_DATA_UINT32);
5118 	kstat_named_init(&nsp->pi_failedfrom, "Failed From",
5119 	    KSTAT_DATA_UINT32);
5120 	kstat_named_init(&nsp->pi_failedto, "Failed To", KSTAT_DATA_UINT32);
5121 
5122 	mdi_statp = kmem_alloc(sizeof (*mdi_statp), KM_SLEEP);
5123 	mdi_statp->pi_kstat_ref = 1;
5124 	mdi_statp->pi_kstat_iostats = kiosp;
5125 	mdi_statp->pi_kstat_errstats = kerrsp;
5126 	kstat_install(kiosp);
5127 	kstat_install(kerrsp);
5128 	MDI_PI(pip)->pi_kstats = mdi_statp;
5129 	return (MDI_SUCCESS);
5130 }
5131 
5132 /*
5133  * destroy per-path properties
5134  */
5135 static void
5136 i_mdi_pi_kstat_destroy(mdi_pathinfo_t *pip)
5137 {
5138 
5139 	struct mdi_pi_kstats *mdi_statp;
5140 
5141 	if ((mdi_statp = MDI_PI(pip)->pi_kstats) == NULL)
5142 		return;
5143 
5144 	MDI_PI(pip)->pi_kstats = NULL;
5145 
5146 	/*
5147 	 * the kstat may be shared between multiple pathinfo nodes
5148 	 * decrement this pathinfo's usage, removing the kstats
5149 	 * themselves when the last pathinfo reference is removed.
5150 	 */
5151 	ASSERT(mdi_statp->pi_kstat_ref > 0);
5152 	if (--mdi_statp->pi_kstat_ref != 0)
5153 		return;
5154 
5155 	kstat_delete(mdi_statp->pi_kstat_iostats);
5156 	kstat_delete(mdi_statp->pi_kstat_errstats);
5157 	kmem_free(mdi_statp, sizeof (*mdi_statp));
5158 }
5159 
5160 /*
5161  * update I/O paths KSTATS
5162  */
5163 void
5164 mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp)
5165 {
5166 	kstat_t *iostatp;
5167 	size_t xfer_cnt;
5168 
5169 	ASSERT(pip != NULL);
5170 
5171 	/*
5172 	 * I/O can be driven across a path prior to having path
5173 	 * statistics available, i.e. probe(9e).
5174 	 */
5175 	if (bp != NULL && MDI_PI(pip)->pi_kstats != NULL) {
5176 		iostatp = MDI_PI(pip)->pi_kstats->pi_kstat_iostats;
5177 		xfer_cnt = bp->b_bcount - bp->b_resid;
5178 		if (bp->b_flags & B_READ) {
5179 			KSTAT_IO_PTR(iostatp)->reads++;
5180 			KSTAT_IO_PTR(iostatp)->nread += xfer_cnt;
5181 		} else {
5182 			KSTAT_IO_PTR(iostatp)->writes++;
5183 			KSTAT_IO_PTR(iostatp)->nwritten += xfer_cnt;
5184 		}
5185 	}
5186 }
5187 
5188 /*
5189  * disable the path to a particular pHCI (pHCI specified in the phci_path
5190  * argument) for a particular client (specified in the client_path argument).
5191  * Disabling a path means that MPxIO will not select the disabled path for
5192  * routing any new I/O requests.
5193  */
5194 int
5195 mdi_pi_disable(dev_info_t *cdip, dev_info_t *pdip, int flags)
5196 {
5197 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_DISABLE_OP));
5198 }
5199 
5200 /*
5201  * Enable the path to a particular pHCI (pHCI specified in the phci_path
5202  * argument) for a particular client (specified in the client_path argument).
5203  * Enabling a path means that MPxIO may select the enabled path for routing
5204  * future I/O requests, subject to other path state constraints.
5205  */
5206 
5207 int
5208 mdi_pi_enable(dev_info_t *cdip, dev_info_t *pdip, int flags)
5209 {
5210 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_ENABLE_OP));
5211 }
5212 
5213 
5214 /*
5215  * Common routine for doing enable/disable.
5216  */
5217 int
5218 i_mdi_pi_enable_disable(dev_info_t *cdip, dev_info_t *pdip, int flags, int op)
5219 {
5220 
5221 	mdi_phci_t	*ph;
5222 	mdi_vhci_t	*vh = NULL;
5223 	mdi_client_t	*ct;
5224 	mdi_pathinfo_t	*next, *pip;
5225 	int		found_it;
5226 	int		(*f)() = NULL;
5227 	int		rv;
5228 	int		sync_flag = 0;
5229 
5230 	ph = i_devi_get_phci(pdip);
5231 	MDI_DEBUG(5, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
5232 		" Operation = %d pdip = %p cdip = %p\n", op, pdip, cdip));
5233 	if (ph == NULL) {
5234 		MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
5235 			" failed. ph = NULL operation = %d\n", op));
5236 		return (MDI_FAILURE);
5237 	}
5238 
5239 	if ((op != MDI_ENABLE_OP) && (op != MDI_DISABLE_OP)) {
5240 		MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
5241 			" Invalid operation = %d\n", op));
5242 		return (MDI_FAILURE);
5243 	}
5244 
5245 	sync_flag = (flags << 8) & 0xf00;
5246 
5247 	vh = ph->ph_vhci;
5248 	f = vh->vh_ops->vo_pi_state_change;
5249 
5250 	if (cdip == NULL) {
5251 		/*
5252 		 * Need to mark the Phci as enabled/disabled.
5253 		 */
5254 		MDI_DEBUG(3, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
5255 		"Operation %d for the phci\n", op));
5256 		MDI_PHCI_LOCK(ph);
5257 		switch (flags) {
5258 			case USER_DISABLE:
5259 				if (op == MDI_DISABLE_OP)
5260 					MDI_PHCI_SET_USER_DISABLE(ph);
5261 				else
5262 					MDI_PHCI_SET_USER_ENABLE(ph);
5263 				break;
5264 			case DRIVER_DISABLE:
5265 				if (op == MDI_DISABLE_OP)
5266 					MDI_PHCI_SET_DRV_DISABLE(ph);
5267 				else
5268 					MDI_PHCI_SET_DRV_ENABLE(ph);
5269 				break;
5270 			case DRIVER_DISABLE_TRANSIENT:
5271 				if (op == MDI_DISABLE_OP)
5272 					MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph);
5273 				else
5274 					MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph);
5275 				break;
5276 			default:
5277 				MDI_PHCI_UNLOCK(ph);
5278 				MDI_DEBUG(1, (CE_NOTE, NULL,
5279 				"!i_mdi_pi_enable_disable:"
5280 				" Invalid flag argument= %d\n", flags));
5281 		}
5282 
5283 		/*
5284 		 * Phci has been disabled. Now try to enable/disable
5285 		 * path info's to each client.
5286 		 */
5287 		pip = ph->ph_path_head;
5288 		while (pip != NULL) {
5289 			/*
5290 			 * Do a callback into the mdi consumer to let it
5291 			 * know that path is about to be enabled/disabled.
5292 			 */
5293 			if (f != NULL) {
5294 				rv = (*f)(vh->vh_dip, pip, 0,
5295 					MDI_PI_EXT_STATE(pip),
5296 					MDI_EXT_STATE_CHANGE | sync_flag |
5297 					op | MDI_BEFORE_STATE_CHANGE);
5298 				if (rv != MDI_SUCCESS) {
5299 				MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5300 				"!vo_pi_state_change: failed rv = %x", rv));
5301 				}
5302 			}
5303 
5304 			MDI_PI_LOCK(pip);
5305 			next =
5306 				(mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5307 			switch (flags) {
5308 			case USER_DISABLE:
5309 				if (op == MDI_DISABLE_OP)
5310 					MDI_PI_SET_USER_DISABLE(pip);
5311 				else
5312 					MDI_PI_SET_USER_ENABLE(pip);
5313 				break;
5314 			case DRIVER_DISABLE:
5315 				if (op == MDI_DISABLE_OP)
5316 					MDI_PI_SET_DRV_DISABLE(pip);
5317 				else
5318 					MDI_PI_SET_DRV_ENABLE(pip);
5319 				break;
5320 			case DRIVER_DISABLE_TRANSIENT:
5321 				if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS)
5322 					MDI_PI_SET_DRV_DISABLE_TRANS(pip);
5323 				else
5324 					MDI_PI_SET_DRV_ENABLE_TRANS(pip);
5325 				break;
5326 			}
5327 			MDI_PI_UNLOCK(pip);
5328 			/*
5329 			 * Do a callback into the mdi consumer to let it
5330 			 * know that path is now enabled/disabled.
5331 			 */
5332 			if (f != NULL) {
5333 				rv = (*f)(vh->vh_dip, pip, 0,
5334 					MDI_PI_EXT_STATE(pip),
5335 					MDI_EXT_STATE_CHANGE | sync_flag |
5336 					op | MDI_AFTER_STATE_CHANGE);
5337 				if (rv != MDI_SUCCESS) {
5338 				MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5339 				"!vo_pi_state_change: failed rv = %x", rv));
5340 				}
5341 			}
5342 			pip = next;
5343 		}
5344 		MDI_PHCI_UNLOCK(ph);
5345 	} else {
5346 
5347 		/*
5348 		 * Disable a specific client.
5349 		 */
5350 		ct = i_devi_get_client(cdip);
5351 		if (ct == NULL) {
5352 			MDI_DEBUG(1, (CE_NOTE, NULL,
5353 			"!i_mdi_pi_enable_disable:"
5354 			" failed. ct = NULL operation = %d\n", op));
5355 			return (MDI_FAILURE);
5356 		}
5357 
5358 		MDI_CLIENT_LOCK(ct);
5359 		pip = ct->ct_path_head;
5360 		found_it = 0;
5361 		while (pip != NULL) {
5362 			MDI_PI_LOCK(pip);
5363 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
5364 			if (MDI_PI(pip)->pi_phci == ph) {
5365 				MDI_PI_UNLOCK(pip);
5366 				found_it = 1;
5367 				break;
5368 			}
5369 			MDI_PI_UNLOCK(pip);
5370 			pip = next;
5371 		}
5372 
5373 		MDI_CLIENT_UNLOCK(ct);
5374 		if (found_it == 0) {
5375 			MDI_DEBUG(1, (CE_NOTE, NULL,
5376 			"!i_mdi_pi_enable_disable:"
5377 			" failed. Could not find corresponding pip\n"));
5378 			return (MDI_FAILURE);
5379 		}
5380 		/*
5381 		 * Do a callback into the mdi consumer to let it
5382 		 * know that path is about to get enabled/disabled.
5383 		 */
5384 		if (f != NULL) {
5385 			rv = (*f)(vh->vh_dip, pip, 0,
5386 				MDI_PI_EXT_STATE(pip),
5387 				MDI_EXT_STATE_CHANGE | sync_flag |
5388 				op | MDI_BEFORE_STATE_CHANGE);
5389 			if (rv != MDI_SUCCESS) {
5390 				MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5391 				"!vo_pi_state_change: failed rv = %x", rv));
5392 			}
5393 		}
5394 		MDI_PI_LOCK(pip);
5395 		switch (flags) {
5396 			case USER_DISABLE:
5397 				if (op == MDI_DISABLE_OP)
5398 					MDI_PI_SET_USER_DISABLE(pip);
5399 				else
5400 					MDI_PI_SET_USER_ENABLE(pip);
5401 				break;
5402 			case DRIVER_DISABLE:
5403 				if (op == MDI_DISABLE_OP)
5404 					MDI_PI_SET_DRV_DISABLE(pip);
5405 				else
5406 					MDI_PI_SET_DRV_ENABLE(pip);
5407 				break;
5408 			case DRIVER_DISABLE_TRANSIENT:
5409 				if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS)
5410 					MDI_PI_SET_DRV_DISABLE_TRANS(pip);
5411 				else
5412 					MDI_PI_SET_DRV_ENABLE_TRANS(pip);
5413 				break;
5414 		}
5415 		MDI_PI_UNLOCK(pip);
5416 		/*
5417 		 * Do a callback into the mdi consumer to let it
5418 		 * know that path is now enabled/disabled.
5419 		 */
5420 		if (f != NULL) {
5421 			rv = (*f)(vh->vh_dip, pip, 0,
5422 				MDI_PI_EXT_STATE(pip),
5423 				MDI_EXT_STATE_CHANGE | sync_flag |
5424 				op | MDI_AFTER_STATE_CHANGE);
5425 			if (rv != MDI_SUCCESS) {
5426 				MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5427 				"!vo_pi_state_change: failed rv = %x", rv));
5428 			}
5429 		}
5430 	}
5431 
5432 	MDI_DEBUG(5, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
5433 		" Returning success pdip = %p cdip = %p\n", op, pdip, cdip));
5434 	return (MDI_SUCCESS);
5435 }
5436 
5437 /*ARGSUSED3*/
5438 int
5439 mdi_devi_config_one(dev_info_t *pdip, char *devnm, dev_info_t **cdipp,
5440     int flags, clock_t timeout)
5441 {
5442 	mdi_pathinfo_t *pip;
5443 	dev_info_t *dip;
5444 	clock_t interval = drv_usectohz(100000);	/* 0.1 sec */
5445 	char *paddr;
5446 
5447 	MDI_DEBUG(2, (CE_NOTE, NULL, "configure device %s", devnm));
5448 
5449 	if (!MDI_PHCI(pdip))
5450 		return (MDI_FAILURE);
5451 
5452 	paddr = strchr(devnm, '@');
5453 	if (paddr == NULL)
5454 		return (MDI_FAILURE);
5455 
5456 	paddr++;	/* skip '@' */
5457 	pip = mdi_pi_find(pdip, NULL, paddr);
5458 	while (pip == NULL && timeout > 0) {
5459 		if (interval > timeout)
5460 			interval = timeout;
5461 		if (flags & NDI_DEVI_DEBUG) {
5462 			cmn_err(CE_CONT, "%s%d: %s timeout %ld %ld\n",
5463 			    ddi_driver_name(pdip), ddi_get_instance(pdip),
5464 			    paddr, interval, timeout);
5465 		}
5466 		delay(interval);
5467 		timeout -= interval;
5468 		interval += interval;
5469 		pip = mdi_pi_find(pdip, NULL, paddr);
5470 	}
5471 
5472 	if (pip == NULL)
5473 		return (MDI_FAILURE);
5474 	dip = mdi_pi_get_client(pip);
5475 	if (ndi_devi_online(dip, flags) != NDI_SUCCESS)
5476 		return (MDI_FAILURE);
5477 	*cdipp = dip;
5478 
5479 	/* TODO: holding should happen inside search functions */
5480 	ndi_hold_devi(dip);
5481 	return (MDI_SUCCESS);
5482 }
5483 
5484 /*
5485  * Ensure phci powered up
5486  */
5487 static void
5488 i_mdi_pm_hold_pip(mdi_pathinfo_t *pip)
5489 {
5490 	dev_info_t	*ph_dip;
5491 
5492 	ASSERT(pip != NULL);
5493 	ASSERT(MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
5494 
5495 	if (MDI_PI(pip)->pi_pm_held) {
5496 		return;
5497 	}
5498 
5499 	ph_dip = mdi_pi_get_phci(pip);
5500 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_pm_hold_pip for %s%d\n",
5501 	    ddi_get_name(ph_dip), ddi_get_instance(ph_dip)));
5502 	if (ph_dip == NULL) {
5503 		return;
5504 	}
5505 
5506 	MDI_PI_UNLOCK(pip);
5507 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt was %d\n",
5508 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5509 	pm_hold_power(ph_dip);
5510 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt is %d\n",
5511 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5512 	MDI_PI_LOCK(pip);
5513 
5514 	MDI_PI(pip)->pi_pm_held = 1;
5515 }
5516 
5517 /*
5518  * Allow phci powered down
5519  */
5520 static void
5521 i_mdi_pm_rele_pip(mdi_pathinfo_t *pip)
5522 {
5523 	dev_info_t	*ph_dip = NULL;
5524 
5525 	ASSERT(pip != NULL);
5526 	ASSERT(MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
5527 
5528 	if (MDI_PI(pip)->pi_pm_held == 0) {
5529 		return;
5530 	}
5531 
5532 	ph_dip = mdi_pi_get_phci(pip);
5533 	ASSERT(ph_dip != NULL);
5534 
5535 	MDI_PI_UNLOCK(pip);
5536 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_pm_rele_pip for %s%d\n",
5537 	    ddi_get_name(ph_dip), ddi_get_instance(ph_dip)));
5538 
5539 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt was %d\n",
5540 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5541 	pm_rele_power(ph_dip);
5542 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt is %d\n",
5543 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5544 
5545 	MDI_PI_LOCK(pip);
5546 	MDI_PI(pip)->pi_pm_held = 0;
5547 }
5548 
5549 static void
5550 i_mdi_pm_hold_client(mdi_client_t *ct, int incr)
5551 {
5552 	ASSERT(ct);
5553 
5554 	ct->ct_power_cnt += incr;
5555 	MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_hold_client "
5556 	    "ct_power_cnt = %d incr = %d\n", ct->ct_power_cnt, incr));
5557 	ASSERT(ct->ct_power_cnt >= 0);
5558 }
5559 
5560 static void
5561 i_mdi_rele_all_phci(mdi_client_t *ct)
5562 {
5563 	mdi_pathinfo_t  *pip;
5564 
5565 	ASSERT(mutex_owned(&ct->ct_mutex));
5566 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
5567 	while (pip != NULL) {
5568 		mdi_hold_path(pip);
5569 		MDI_PI_LOCK(pip);
5570 		i_mdi_pm_rele_pip(pip);
5571 		MDI_PI_UNLOCK(pip);
5572 		mdi_rele_path(pip);
5573 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
5574 	}
5575 }
5576 
5577 static void
5578 i_mdi_pm_rele_client(mdi_client_t *ct, int decr)
5579 {
5580 	ASSERT(ct);
5581 
5582 	if (i_ddi_node_state(ct->ct_dip) >= DS_READY) {
5583 		ct->ct_power_cnt -= decr;
5584 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_rele_client "
5585 		    "ct_power_cnt = %d decr = %d\n", ct->ct_power_cnt, decr));
5586 	}
5587 
5588 	ASSERT(ct->ct_power_cnt >= 0);
5589 	if (ct->ct_power_cnt == 0) {
5590 		i_mdi_rele_all_phci(ct);
5591 		return;
5592 	}
5593 }
5594 
5595 static void
5596 i_mdi_pm_reset_client(mdi_client_t *ct)
5597 {
5598 	MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_reset_client "
5599 	    "ct_power_cnt = %d\n", ct->ct_power_cnt));
5600 	ct->ct_power_cnt = 0;
5601 	i_mdi_rele_all_phci(ct);
5602 	ct->ct_powercnt_reset = 1;
5603 	ct->ct_powercnt_held = 0;
5604 }
5605 
5606 static void
5607 i_mdi_pm_hold_all_phci(mdi_client_t *ct)
5608 {
5609 	mdi_pathinfo_t  *pip;
5610 	ASSERT(mutex_owned(&ct->ct_mutex));
5611 
5612 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
5613 	while (pip != NULL) {
5614 		mdi_hold_path(pip);
5615 		MDI_PI_LOCK(pip);
5616 		i_mdi_pm_hold_pip(pip);
5617 		MDI_PI_UNLOCK(pip);
5618 		mdi_rele_path(pip);
5619 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
5620 	}
5621 }
5622 
5623 static int
5624 i_mdi_power_one_phci(mdi_pathinfo_t *pip)
5625 {
5626 	int		ret;
5627 	dev_info_t	*ph_dip;
5628 
5629 	MDI_PI_LOCK(pip);
5630 	i_mdi_pm_hold_pip(pip);
5631 
5632 	ph_dip = mdi_pi_get_phci(pip);
5633 	MDI_PI_UNLOCK(pip);
5634 
5635 	/* bring all components of phci to full power */
5636 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_power_one_phci "
5637 	    "pm_powerup for %s%d\n", ddi_get_name(ph_dip),
5638 	    ddi_get_instance(ph_dip)));
5639 
5640 	ret = pm_powerup(ph_dip);
5641 
5642 	if (ret == DDI_FAILURE) {
5643 		MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_power_one_phci "
5644 		    "pm_powerup FAILED for %s%d\n",
5645 		    ddi_get_name(ph_dip), ddi_get_instance(ph_dip)));
5646 
5647 		MDI_PI_LOCK(pip);
5648 		i_mdi_pm_rele_pip(pip);
5649 		MDI_PI_UNLOCK(pip);
5650 		return (MDI_FAILURE);
5651 	}
5652 
5653 	return (MDI_SUCCESS);
5654 }
5655 
5656 static int
5657 i_mdi_power_all_phci(mdi_client_t *ct)
5658 {
5659 	mdi_pathinfo_t  *pip;
5660 	int		succeeded = 0;
5661 
5662 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
5663 	while (pip != NULL) {
5664 		mdi_hold_path(pip);
5665 		MDI_CLIENT_UNLOCK(ct);
5666 		if (i_mdi_power_one_phci(pip) == MDI_SUCCESS)
5667 			succeeded = 1;
5668 
5669 		ASSERT(ct == MDI_PI(pip)->pi_client);
5670 		MDI_CLIENT_LOCK(ct);
5671 		mdi_rele_path(pip);
5672 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
5673 	}
5674 
5675 	return (succeeded ? MDI_SUCCESS : MDI_FAILURE);
5676 }
5677 
/*
 * mdi_bus_power():
 *		1. Place the phci(s) into powered up state so that
 *		   client can do power management
 *		2. Ensure phci powered up as client power managing
 *
 *		Only BUS_POWER_PRE_NOTIFICATION, BUS_POWER_POST_NOTIFICATION
 *		and BUS_POWER_HAS_CHANGED are processed here.
 *		BUS_POWER_NOINVOL is rejected and every other op is handed
 *		straight to pm_busop_bus_power().
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		(for passed-through ops, whatever pm_busop_bus_power()
 *		returns)
 */
int
mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
    void *arg, void *result)
{
	int			ret = MDI_SUCCESS;
	pm_bp_child_pwrchg_t	*bpc;
	mdi_client_t		*ct;
	dev_info_t		*cdip;
	pm_bp_has_changed_t	*bphc;

	/*
	 * BUS_POWER_NOINVOL not supported
	 */
	if (op == BUS_POWER_NOINVOL)
		return (MDI_FAILURE);

	/*
	 * ignore other OPs.
	 * return quickly to save cpu cycles on the ct processing
	 */
	switch (op) {
	case BUS_POWER_PRE_NOTIFICATION:
	case BUS_POWER_POST_NOTIFICATION:
		/* arg is a pm_bp_child_pwrchg_t for both notifications */
		bpc = (pm_bp_child_pwrchg_t *)arg;
		cdip = bpc->bpc_dip;
		break;
	case BUS_POWER_HAS_CHANGED:
		/* arg is a pm_bp_has_changed_t for this op */
		bphc = (pm_bp_has_changed_t *)arg;
		cdip = bphc->bphc_dip;
		break;
	default:
		return (pm_busop_bus_power(parent, impl_arg, op, arg, result));
	}

	ASSERT(MDI_CLIENT(cdip));

	ct = i_devi_get_client(cdip);
	if (ct == NULL)
		return (MDI_FAILURE);

	/*
	 * wait till the mdi_pathinfo node state changes are processed
	 */
	MDI_CLIENT_LOCK(ct);
	switch (op) {
	case BUS_POWER_PRE_NOTIFICATION:
		MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip, "mdi_bus_power "
		    "BUS_POWER_PRE_NOTIFICATION:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d\n",
		    PM_NAME(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp));

		/* serialize power level change per client */
		while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
			cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

		/*
		 * The transition flag set here is cleared (and waiters are
		 * woken) in the BUS_POWER_POST_NOTIFICATION case below.
		 */
		MDI_CLIENT_SET_POWER_TRANSITION(ct);

		/* no holds yet: try to power up the phci(s) first */
		if (ct->ct_power_cnt == 0) {
			ret = i_mdi_power_all_phci(ct);
		}

		/*
		 * if new_level > 0:
		 *	- hold phci(s)
		 *	- power up phci(s) if not already
		 * ignore power down
		 */
		if (bpc->bpc_nlevel > 0) {
			if (!DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
				    "mdi_bus_power i_mdi_pm_hold_client\n"));
				i_mdi_pm_hold_client(ct, ct->ct_path_count);
			}
		}
		break;
	case BUS_POWER_POST_NOTIFICATION:
		MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip, "mdi_bus_power "
		    "BUS_POWER_POST_NOTIFICATION:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d result=%d\n",
		    PM_NAME(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp,
		    *(int *)result));

		/* record the client's new power state on success */
		if (*(int *)result == DDI_SUCCESS) {
			if (bpc->bpc_nlevel > 0) {
				MDI_CLIENT_SET_POWER_UP(ct);
			} else {
				MDI_CLIENT_SET_POWER_DOWN(ct);
			}
		}

		/* release the hold we did in pre-notification */
		if (bpc->bpc_nlevel > 0 && (*(int *)result != DDI_SUCCESS) &&
		    !DEVI_IS_ATTACHING(ct->ct_dip)) {
			MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
			    "mdi_bus_power i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		}

		/* successful power down to level 0 */
		if (bpc->bpc_nlevel == 0 && (*(int *)result == DDI_SUCCESS)) {
			/* another thread might started attaching */
			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
				    "mdi_bus_power i_mdi_pm_rele_client\n"));
				i_mdi_pm_rele_client(ct, ct->ct_path_count);
			/* detaching has been taken care in pm_post_unconfig */
			} else if (!DEVI_IS_DETACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
				    "mdi_bus_power i_mdi_pm_reset_client\n"));
				i_mdi_pm_reset_client(ct);
			}
		}

		/* end of the transition started in pre-notification */
		MDI_CLIENT_CLEAR_POWER_TRANSITION(ct);
		cv_broadcast(&ct->ct_powerchange_cv);

		break;

	/* need to do more */
	case BUS_POWER_HAS_CHANGED:
		MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip, "mdi_bus_power "
		    "BUS_POWER_HAS_CHANGED:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d\n",
		    PM_NAME(bphc->bphc_dip), PM_ADDR(bphc->bphc_dip),
		    bphc->bphc_olevel, bphc->bphc_nlevel, bphc->bphc_comp));

		/* level was raised: power up (if no holds yet) and hold */
		if (bphc->bphc_nlevel > 0 &&
		    bphc->bphc_nlevel > bphc->bphc_olevel) {
			if (ct->ct_power_cnt == 0) {
				ret = i_mdi_power_all_phci(ct);
			}
			MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip,
			    "mdi_bus_power i_mdi_pm_hold_client\n"));
			i_mdi_pm_hold_client(ct, ct->ct_path_count);
		}

		/* level dropped to 0 from an old level other than -1 */
		if (bphc->bphc_nlevel == 0 && bphc->bphc_olevel != -1) {
			MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip,
			    "mdi_bus_power i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		}
		break;
	}

	MDI_CLIENT_UNLOCK(ct);
	return (ret);
}
5835 
/*
 * i_mdi_pm_pre_config_one():
 *		Pre-config power handling for a single client node.  After
 *		waiting out any power transition, a client that is in the
 *		failed state and does not already have its power count held
 *		gets its phci(s) powered up (when ct_power_cnt is 0) and a
 *		hold of ct_path_count taken; ct_powercnt_held is then set.
 * Return Values:
 *		MDI_FAILURE	child has no client, or i_mdi_power_all_phci()
 *				failed
 *		MDI_SUCCESS	otherwise
 */
static int
i_mdi_pm_pre_config_one(dev_info_t *child)
{
	int		ret = MDI_SUCCESS;
	mdi_client_t	*ct;

	ct = i_devi_get_client(child);
	if (ct == NULL)
		return (MDI_FAILURE);

	MDI_CLIENT_LOCK(ct);
	/* wait for any in-progress power level change on this client */
	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

	/* only a client in the failed state is processed further */
	if (!MDI_CLIENT_IS_FAILED(ct)) {
		MDI_CLIENT_UNLOCK(ct);
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_pre_config_one already configured\n"));
		return (MDI_SUCCESS);
	}

	/* a hold is already in place; do not take a second one */
	if (ct->ct_powercnt_held) {
		MDI_CLIENT_UNLOCK(ct);
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_pre_config_one ALREADY held\n"));
		return (MDI_SUCCESS);
	}

	if (ct->ct_power_cnt == 0) {
		ret = i_mdi_power_all_phci(ct);
	}
	MDI_DEBUG(4, (CE_NOTE, child,
	    "i_mdi_pm_pre_config_one i_mdi_pm_hold_client\n"));
	/* the hold is taken even when the power up above failed */
	i_mdi_pm_hold_client(ct, ct->ct_path_count);
	ct->ct_powercnt_held = 1;
	ct->ct_powercnt_reset = 0;
	MDI_CLIENT_UNLOCK(ct);
	return (ret);
}
5875 
5876 static int
5877 i_mdi_pm_pre_config(dev_info_t *parent, dev_info_t *child)
5878 {
5879 	int			ret = MDI_SUCCESS;
5880 	dev_info_t		*cdip;
5881 	int			circ;
5882 
5883 	ASSERT(MDI_VHCI(parent));
5884 
5885 	/* ndi_devi_config_one */
5886 	if (child) {
5887 		return (i_mdi_pm_pre_config_one(child));
5888 	}
5889 
5890 	/* devi_config_common */
5891 	ndi_devi_enter(parent, &circ);
5892 	cdip = ddi_get_child(parent);
5893 	while (cdip) {
5894 		dev_info_t *next = ddi_get_next_sibling(cdip);
5895 
5896 		ret = i_mdi_pm_pre_config_one(cdip);
5897 		if (ret != MDI_SUCCESS)
5898 			break;
5899 		cdip = next;
5900 	}
5901 	ndi_devi_exit(parent, circ);
5902 	return (ret);
5903 }
5904 
/*
 * i_mdi_pm_pre_unconfig_one():
 *		Pre-unconfig power handling for a single client node.  After
 *		waiting out any power transition, a client that is at least
 *		DS_READY and not already held gets its phci(s) powered up
 *		(when ct_power_cnt is 0) and a hold of ct_path_count taken.
 *		*held is set to 1 when a hold is in place on a successful
 *		return; it is never cleared here.
 * Return Values:
 *		MDI_FAILURE	child has no client, the client is powered
 *				down and NDI_AUTODETACH is set in flags, or
 *				i_mdi_power_all_phci() failed
 *		MDI_SUCCESS	otherwise
 */
static int
i_mdi_pm_pre_unconfig_one(dev_info_t *child, int *held, int flags)
{
	int		ret = MDI_SUCCESS;
	mdi_client_t	*ct;

	ct = i_devi_get_client(child);
	if (ct == NULL)
		return (MDI_FAILURE);

	MDI_CLIENT_LOCK(ct);
	/* wait for any in-progress power level change on this client */
	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

	/* node is below DS_READY: nothing to hold */
	if (i_ddi_node_state(ct->ct_dip) < DS_READY) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_pre_unconfig node detached already\n"));
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_SUCCESS);
	}

	/* fail an auto-detach request for a powered down client */
	if (MDI_CLIENT_IS_POWERED_DOWN(ct) &&
	    (flags & NDI_AUTODETACH)) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_pre_unconfig auto-modunload\n"));
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_FAILURE);
	}

	/* a hold is already in place; just report it to the caller */
	if (ct->ct_powercnt_held) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_pre_unconfig ct_powercnt_held\n"));
		MDI_CLIENT_UNLOCK(ct);
		*held = 1;
		return (MDI_SUCCESS);
	}

	if (ct->ct_power_cnt == 0) {
		ret = i_mdi_power_all_phci(ct);
	}
	MDI_DEBUG(4, (CE_NOTE, child,
	    "i_mdi_pm_pre_unconfig i_mdi_pm_hold_client\n"));
	/* the hold is taken even when the power up above failed */
	i_mdi_pm_hold_client(ct, ct->ct_path_count);
	ct->ct_powercnt_held = 1;
	ct->ct_powercnt_reset = 0;
	MDI_CLIENT_UNLOCK(ct);
	/* *held is only reported when the power up did not fail */
	if (ret == MDI_SUCCESS)
		*held = 1;
	return (ret);
}
5955 
5956 static int
5957 i_mdi_pm_pre_unconfig(dev_info_t *parent, dev_info_t *child, int *held,
5958     int flags)
5959 {
5960 	int			ret = MDI_SUCCESS;
5961 	dev_info_t		*cdip;
5962 	int			circ;
5963 
5964 	ASSERT(MDI_VHCI(parent));
5965 	*held = 0;
5966 
5967 	/* ndi_devi_unconfig_one */
5968 	if (child) {
5969 		return (i_mdi_pm_pre_unconfig_one(child, held, flags));
5970 	}
5971 
5972 	/* devi_unconfig_common */
5973 	ndi_devi_enter(parent, &circ);
5974 	cdip = ddi_get_child(parent);
5975 	while (cdip) {
5976 		dev_info_t *next = ddi_get_next_sibling(cdip);
5977 
5978 		ret = i_mdi_pm_pre_unconfig_one(cdip, held, flags);
5979 		cdip = next;
5980 	}
5981 	ndi_devi_exit(parent, circ);
5982 
5983 	if (*held)
5984 		ret = MDI_SUCCESS;
5985 
5986 	return (ret);
5987 }
5988 
/*
 * i_mdi_pm_post_config_one():
 *		Post-config power handling for a single client node.  When
 *		the client's power count is held (and was not reset) and the
 *		client is no longer in the failed state, either reset the
 *		client's power count or release one hold per ONLINE/STANDBY
 *		path, then clear ct_powercnt_held.
 */
static void
i_mdi_pm_post_config_one(dev_info_t *child)
{
	mdi_client_t	*ct;

	ct = i_devi_get_client(child);
	if (ct == NULL)
		return;

	MDI_CLIENT_LOCK(ct);
	/* wait for any in-progress power level change on this client */
	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

	/* nothing to undo if the count was reset or never held */
	if (ct->ct_powercnt_reset || !ct->ct_powercnt_held) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_config_one NOT held\n"));
		MDI_CLIENT_UNLOCK(ct);
		return;
	}

	/* client has not been updated */
	if (MDI_CLIENT_IS_FAILED(ct)) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_config_one NOT configured\n"));
		MDI_CLIENT_UNLOCK(ct);
		return;
	}

	/* another thread might have powered it down or detached it */
	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
	    !DEVI_IS_ATTACHING(ct->ct_dip)) ||
	    (i_ddi_node_state(ct->ct_dip) < DS_READY &&
	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_config i_mdi_pm_reset_client\n"));
		i_mdi_pm_reset_client(ct);
	} else {
		mdi_pathinfo_t	*pip, *next;
		int	valid_path_count = 0;

		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_config i_mdi_pm_rele_client\n"));
		/* count the paths whose state is ONLINE or STANDBY */
		pip = ct->ct_path_head;
		while (pip != NULL) {
			MDI_PI_LOCK(pip);
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
				== MDI_PATHINFO_STATE_ONLINE ||
			    (MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
				== MDI_PATHINFO_STATE_STANDBY)
				valid_path_count ++;
			MDI_PI_UNLOCK(pip);
			pip = next;
		}
		/* release that many holds rather than ct_path_count */
		i_mdi_pm_rele_client(ct, valid_path_count);
	}
	ct->ct_powercnt_held = 0;
	MDI_CLIENT_UNLOCK(ct);
}
6048 
6049 static void
6050 i_mdi_pm_post_config(dev_info_t *parent, dev_info_t *child)
6051 {
6052 	int		circ;
6053 	dev_info_t	*cdip;
6054 	ASSERT(MDI_VHCI(parent));
6055 
6056 	/* ndi_devi_config_one */
6057 	if (child) {
6058 		i_mdi_pm_post_config_one(child);
6059 		return;
6060 	}
6061 
6062 	/* devi_config_common */
6063 	ndi_devi_enter(parent, &circ);
6064 	cdip = ddi_get_child(parent);
6065 	while (cdip) {
6066 		dev_info_t *next = ddi_get_next_sibling(cdip);
6067 
6068 		i_mdi_pm_post_config_one(cdip);
6069 		cdip = next;
6070 	}
6071 	ndi_devi_exit(parent, circ);
6072 }
6073 
/*
 * i_mdi_pm_post_unconfig_one():
 *		Post-unconfig power handling for a single client node.  When
 *		the client's power count is held, reset the client's power
 *		count if the client is powered down while DS_READY, or is
 *		neither DS_READY nor attaching.
 */
static void
i_mdi_pm_post_unconfig_one(dev_info_t *child)
{
	mdi_client_t	*ct;

	ct = i_devi_get_client(child);
	if (ct == NULL)
		return;

	MDI_CLIENT_LOCK(ct);
	/* wait for any in-progress power level change on this client */
	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

	/* no hold in place: nothing to undo */
	if (!ct->ct_powercnt_held) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_unconfig NOT held\n"));
		MDI_CLIENT_UNLOCK(ct);
		return;
	}

	/* failure detaching or another thread just attached it */
	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
	    i_ddi_node_state(ct->ct_dip) == DS_READY) ||
	    (i_ddi_node_state(ct->ct_dip) != DS_READY &&
	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_unconfig i_mdi_pm_reset_client\n"));
		i_mdi_pm_reset_client(ct);
	}

	/*
	 * NOTE(review): this message is emitted on every path that reaches
	 * here, including after the reset just above.
	 */
	MDI_DEBUG(4, (CE_NOTE, child,
	    "i_mdi_pm_post_unconfig not changed\n"));
	MDI_CLIENT_UNLOCK(ct);
}
6108 
6109 static void
6110 i_mdi_pm_post_unconfig(dev_info_t *parent, dev_info_t *child, int held)
6111 {
6112 	int			circ;
6113 	dev_info_t		*cdip;
6114 
6115 	ASSERT(MDI_VHCI(parent));
6116 
6117 	if (!held) {
6118 		MDI_DEBUG(4, (CE_NOTE, parent,
6119 		    "i_mdi_pm_post_unconfig held = %d\n", held));
6120 		return;
6121 	}
6122 
6123 	if (child) {
6124 		i_mdi_pm_post_unconfig_one(child);
6125 		return;
6126 	}
6127 
6128 	ndi_devi_enter(parent, &circ);
6129 	cdip = ddi_get_child(parent);
6130 	while (cdip) {
6131 		dev_info_t *next = ddi_get_next_sibling(cdip);
6132 
6133 		i_mdi_pm_post_unconfig_one(cdip);
6134 		cdip = next;
6135 	}
6136 	ndi_devi_exit(parent, circ);
6137 }
6138 
6139 int
6140 mdi_power(dev_info_t *vdip, mdi_pm_op_t op, void *args, char *devnm, int flags)
6141 {
6142 	int			circ, ret = MDI_SUCCESS;
6143 	dev_info_t		*client_dip = NULL;
6144 	mdi_client_t		*ct;
6145 
6146 	/*
6147 	 * Handling ndi_devi_config_one and ndi_devi_unconfig_one.
6148 	 * Power up pHCI for the named client device.
6149 	 * Note: Before the client is enumerated under vhci by phci,
6150 	 * client_dip can be NULL. Then proceed to power up all the
6151 	 * pHCIs.
6152 	 */
6153 	if (devnm != NULL) {
6154 		ndi_devi_enter(vdip, &circ);
6155 		client_dip = ndi_devi_findchild(vdip, devnm);
6156 		ndi_devi_exit(vdip, circ);
6157 	}
6158 
6159 	MDI_DEBUG(4, (CE_NOTE, vdip, "mdi_power op = %d\n", op));
6160 
6161 	switch (op) {
6162 	case MDI_PM_PRE_CONFIG:
6163 		ret = i_mdi_pm_pre_config(vdip, client_dip);
6164 
6165 		break;
6166 	case MDI_PM_PRE_UNCONFIG:
6167 		ret = i_mdi_pm_pre_unconfig(vdip, client_dip, (int *)args,
6168 		    flags);
6169 
6170 		break;
6171 	case MDI_PM_POST_CONFIG:
6172 		i_mdi_pm_post_config(vdip, client_dip);
6173 
6174 		break;
6175 	case MDI_PM_POST_UNCONFIG:
6176 		i_mdi_pm_post_unconfig(vdip, client_dip, *(int *)args);
6177 
6178 		break;
6179 	case MDI_PM_HOLD_POWER:
6180 	case MDI_PM_RELE_POWER:
6181 		ASSERT(args);
6182 
6183 		client_dip = (dev_info_t *)args;
6184 		ASSERT(MDI_CLIENT(client_dip));
6185 
6186 		ct = i_devi_get_client(client_dip);
6187 		MDI_CLIENT_LOCK(ct);
6188 
6189 		if (op == MDI_PM_HOLD_POWER) {
6190 			if (ct->ct_power_cnt == 0) {
6191 				(void) i_mdi_power_all_phci(ct);
6192 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6193 				    "mdi_power i_mdi_pm_hold_client\n"));
6194 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
6195 			}
6196 		} else {
6197 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
6198 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6199 				    "mdi_power i_mdi_pm_rele_client\n"));
6200 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
6201 			} else {
6202 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6203 				    "mdi_power i_mdi_pm_reset_client\n"));
6204 				i_mdi_pm_reset_client(ct);
6205 			}
6206 		}
6207 
6208 		MDI_CLIENT_UNLOCK(ct);
6209 		break;
6210 	default:
6211 		break;
6212 	}
6213 
6214 	return (ret);
6215 }
6216 
6217 int
6218 mdi_component_is_vhci(dev_info_t *dip, const char **mdi_class)
6219 {
6220 	mdi_vhci_t *vhci;
6221 
6222 	if (!MDI_VHCI(dip))
6223 		return (MDI_FAILURE);
6224 
6225 	if (mdi_class) {
6226 		vhci = DEVI(dip)->devi_mdi_xhci;
6227 		ASSERT(vhci);
6228 		*mdi_class = vhci->vh_class;
6229 	}
6230 
6231 	return (MDI_SUCCESS);
6232 }
6233 
6234 int
6235 mdi_component_is_phci(dev_info_t *dip, const char **mdi_class)
6236 {
6237 	mdi_phci_t *phci;
6238 
6239 	if (!MDI_PHCI(dip))
6240 		return (MDI_FAILURE);
6241 
6242 	if (mdi_class) {
6243 		phci = DEVI(dip)->devi_mdi_xhci;
6244 		ASSERT(phci);
6245 		*mdi_class = phci->ph_vhci->vh_class;
6246 	}
6247 
6248 	return (MDI_SUCCESS);
6249 }
6250 
6251 int
6252 mdi_component_is_client(dev_info_t *dip, const char **mdi_class)
6253 {
6254 	mdi_client_t *client;
6255 
6256 	if (!MDI_CLIENT(dip))
6257 		return (MDI_FAILURE);
6258 
6259 	if (mdi_class) {
6260 		client = DEVI(dip)->devi_mdi_client;
6261 		ASSERT(client);
6262 		*mdi_class = client->ct_vhci->vh_class;
6263 	}
6264 
6265 	return (MDI_SUCCESS);
6266 }
6267 
6268 void *
6269 mdi_client_get_vhci_private(dev_info_t *dip)
6270 {
6271 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
6272 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
6273 		mdi_client_t	*ct;
6274 		ct = i_devi_get_client(dip);
6275 		return (ct->ct_vprivate);
6276 	}
6277 	return (NULL);
6278 }
6279 
6280 void
6281 mdi_client_set_vhci_private(dev_info_t *dip, void *data)
6282 {
6283 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
6284 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
6285 		mdi_client_t	*ct;
6286 		ct = i_devi_get_client(dip);
6287 		ct->ct_vprivate = data;
6288 	}
6289 }
6290 /*
6291  * mdi_pi_get_vhci_private():
6292  *		Get the vhci private information associated with the
6293  *		mdi_pathinfo node
6294  */
6295 void *
6296 mdi_pi_get_vhci_private(mdi_pathinfo_t *pip)
6297 {
6298 	caddr_t	vprivate = NULL;
6299 	if (pip) {
6300 		vprivate = MDI_PI(pip)->pi_vprivate;
6301 	}
6302 	return (vprivate);
6303 }
6304 
6305 /*
6306  * mdi_pi_set_vhci_private():
6307  *		Set the vhci private information in the mdi_pathinfo node
6308  */
6309 void
6310 mdi_pi_set_vhci_private(mdi_pathinfo_t *pip, void *priv)
6311 {
6312 	if (pip) {
6313 		MDI_PI(pip)->pi_vprivate = priv;
6314 	}
6315 }
6316 
6317 /*
6318  * mdi_phci_get_vhci_private():
6319  *		Get the vhci private information associated with the
6320  *		mdi_phci node
6321  */
6322 void *
6323 mdi_phci_get_vhci_private(dev_info_t *dip)
6324 {
6325 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
6326 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
6327 		mdi_phci_t	*ph;
6328 		ph = i_devi_get_phci(dip);
6329 		return (ph->ph_vprivate);
6330 	}
6331 	return (NULL);
6332 }
6333 
6334 /*
6335  * mdi_phci_set_vhci_private():
6336  *		Set the vhci private information in the mdi_phci node
6337  */
6338 void
6339 mdi_phci_set_vhci_private(dev_info_t *dip, void *priv)
6340 {
6341 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
6342 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
6343 		mdi_phci_t	*ph;
6344 		ph = i_devi_get_phci(dip);
6345 		ph->ph_vprivate = priv;
6346 	}
6347 }
6348 
/*
 * List of vhci class names:
 * A vhci class name must be in this list only if the corresponding vhci
 * driver intends to use the mdi provided bus config implementation
 * (i.e., mdi_vhci_bus_config()).
 */
static char *vhci_class_list[] = { MDI_HCI_CLASS_SCSI, MDI_HCI_CLASS_IB };
/* number of entries in vhci_class_list[] (and in vhcache_nvl[] below) */
#define	N_VHCI_CLASSES	(sizeof (vhci_class_list) / sizeof (char *))

/*
 * Built-in list of phci drivers for every vhci class.
 * All phci drivers except iscsi have root device support.
 * Each entry is { driver name, root device support flag }.
 */
static mdi_phci_driver_info_t scsi_phci_driver_list[] = {
	{ "fp", 1 },
	{ "iscsi", 0 },
	{ "ibsrp", 1 }
	};

/* built-in phci driver for the IB vhci class (a single entry) */
static mdi_phci_driver_info_t ib_phci_driver_list[] = { "tavor", 1 };

/*
 * During boot time, the on-disk vhci cache for every vhci class is read
 * in the form of an nvlist and stored here.
 * The array is indexed in the same order as vhci_class_list[].
 */
static nvlist_t *vhcache_nvl[N_VHCI_CLASSES];

/* nvpair names in vhci cache nvlist */
#define	MDI_VHCI_CACHE_VERSION	1
#define	MDI_NVPNAME_VERSION	"version"
#define	MDI_NVPNAME_PHCIS	"phcis"
#define	MDI_NVPNAME_CTADDRMAP	"clientaddrmap"

/* build state of a vhci cache */
typedef enum {
	VHCACHE_NOT_REBUILT,
	VHCACHE_PARTIALLY_BUILT,
	VHCACHE_FULLY_BUILT
} vhcache_build_status_t;
6387 
6388 /*
6389  * Given vhci class name, return its on-disk vhci cache filename.
6390  * Memory for the returned filename which includes the full path is allocated
6391  * by this function.
6392  */
6393 static char *
6394 vhclass2vhcache_filename(char *vhclass)
6395 {
6396 	char *filename;
6397 	int len;
6398 	static char *fmt = "/etc/devices/mdi_%s_cache";
6399 
6400 	/*
6401 	 * fmt contains the on-disk vhci cache file name format;
6402 	 * for scsi_vhci the filename is "/etc/devices/mdi_scsi_vhci_cache".
6403 	 */
6404 
6405 	/* the -1 below is to account for "%s" in the format string */
6406 	len = strlen(fmt) + strlen(vhclass) - 1;
6407 	filename = kmem_alloc(len, KM_SLEEP);
6408 	(void) snprintf(filename, len, fmt, vhclass);
6409 	ASSERT(len == (strlen(filename) + 1));
6410 	return (filename);
6411 }
6412 
/*
 * initialize the vhci cache related data structures and read the on-disk
 * vhci cached data into memory.
 *
 * The newly allocated mdi_vhci_config_t is hung off vh->vh_config before
 * any of its members are set up.
 */
static void
setup_vhci_cache(mdi_vhci_t *vh)
{
	mdi_vhci_config_t *vhc;
	mdi_vhci_cache_t *vhcache;
	int i;
	nvlist_t *nvl = NULL;

	vhc = kmem_zalloc(sizeof (mdi_vhci_config_t), KM_SLEEP);
	vh->vh_config = vhc;
	vhcache = &vhc->vhc_vhcache;

	vhc->vhc_vhcache_filename = vhclass2vhcache_filename(vh->vh_class);

	mutex_init(&vhc->vhc_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vhc->vhc_cv, NULL, CV_DRIVER, NULL);

	rw_init(&vhcache->vhcache_lock, NULL, RW_DRIVER, NULL);

	/*
	 * Create string hash; same as mod_hash_create_strhash() except that
	 * we use NULL key destructor.
	 */
	vhcache->vhcache_client_hash = mod_hash_create_extended(vh->vh_class,
	    mdi_bus_config_cache_hash_size,
	    mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);

	/* reads vh->vh_config, so it must follow the assignment above */
	setup_phci_driver_list(vh);

	/*
	 * The on-disk vhci cache is read during booting prior to the
	 * lights-out period by mdi_read_devices_files().
	 */
	for (i = 0; i < N_VHCI_CLASSES; i++) {
		if (strcmp(vhci_class_list[i], vh->vh_class) == 0) {
			/* take over the nvlist; it is freed below */
			nvl = vhcache_nvl[i];
			vhcache_nvl[i] = NULL;
			break;
		}
	}

	/*
	 * this is to cover the case of some one manually causing unloading
	 * (or detaching) and reloading (or attaching) of a vhci driver.
	 */
	if (nvl == NULL && modrootloaded)
		nvl = read_on_disk_vhci_cache(vh->vh_class);

	if (nvl != NULL) {
		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
		if (mainnvl_to_vhcache(vhcache, nvl) == MDI_SUCCESS)
			vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
		else  {
			cmn_err(CE_WARN,
			    "%s: data file corrupted, will recreate\n",
			    vhc->vhc_vhcache_filename);
		}
		rw_exit(&vhcache->vhcache_lock);
		nvlist_free(nvl);
	}

	/* the callback id is kept in vhc_cbid */
	vhc->vhc_cbid = callb_add(stop_vhcache_flush_thread, vhc,
	    CB_CL_UADMIN_PRE_VFS, "mdi_vhcache_flush");
}
6482 
6483 /*
6484  * free all vhci cache related resources
6485  */
6486 static int
6487 destroy_vhci_cache(mdi_vhci_t *vh)
6488 {
6489 	mdi_vhci_config_t *vhc = vh->vh_config;
6490 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
6491 	mdi_vhcache_phci_t *cphci, *cphci_next;
6492 	mdi_vhcache_client_t *cct, *cct_next;
6493 	mdi_vhcache_pathinfo_t *cpi, *cpi_next;
6494 
6495 	if (stop_vhcache_async_threads(vhc) != MDI_SUCCESS)
6496 		return (MDI_FAILURE);
6497 
6498 	kmem_free(vhc->vhc_vhcache_filename,
6499 	    strlen(vhc->vhc_vhcache_filename) + 1);
6500 
6501 	if (vhc->vhc_phci_driver_list)
6502 		free_phci_driver_list(vhc);
6503 
6504 	mod_hash_destroy_strhash(vhcache->vhcache_client_hash);
6505 
6506 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
6507 	    cphci = cphci_next) {
6508 		cphci_next = cphci->cphci_next;
6509 		free_vhcache_phci(cphci);
6510 	}
6511 
6512 	for (cct = vhcache->vhcache_client_head; cct != NULL; cct = cct_next) {
6513 		cct_next = cct->cct_next;
6514 		for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi_next) {
6515 			cpi_next = cpi->cpi_next;
6516 			free_vhcache_pathinfo(cpi);
6517 		}
6518 		free_vhcache_client(cct);
6519 	}
6520 
6521 	rw_destroy(&vhcache->vhcache_lock);
6522 
6523 	mutex_destroy(&vhc->vhc_lock);
6524 	cv_destroy(&vhc->vhc_cv);
6525 	kmem_free(vhc, sizeof (mdi_vhci_config_t));
6526 	return (MDI_SUCCESS);
6527 }
6528 
6529 /*
6530  * Setup the list of phci drivers associated with the specified vhci class.
6531  * MDI uses this information to rebuild bus config cache if in case the
6532  * cache is not available or corrupted.
6533  */
6534 static void
6535 setup_phci_driver_list(mdi_vhci_t *vh)
6536 {
6537 	mdi_vhci_config_t *vhc = vh->vh_config;
6538 	mdi_phci_driver_info_t *driver_list;
6539 	char **driver_list1;
6540 	uint_t ndrivers, ndrivers1;
6541 	int i, j;
6542 
6543 	if (strcmp(vh->vh_class, MDI_HCI_CLASS_SCSI) == 0) {
6544 		driver_list = scsi_phci_driver_list;
6545 		ndrivers = sizeof (scsi_phci_driver_list) /
6546 		    sizeof (mdi_phci_driver_info_t);
6547 	} else if (strcmp(vh->vh_class, MDI_HCI_CLASS_IB) == 0) {
6548 		driver_list = ib_phci_driver_list;
6549 		ndrivers = sizeof (ib_phci_driver_list) /
6550 		    sizeof (mdi_phci_driver_info_t);
6551 	} else {
6552 		driver_list = NULL;
6553 		ndrivers = 0;
6554 	}
6555 
6556 	/*
6557 	 * The driver.conf file of a vhci driver can specify additional
6558 	 * phci drivers using a project private "phci-drivers" property.
6559 	 */
6560 	if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY, vh->vh_dip,
6561 	    DDI_PROP_DONTPASS, "phci-drivers", &driver_list1,
6562 	    &ndrivers1) != DDI_PROP_SUCCESS)
6563 		ndrivers1 = 0;
6564 
6565 	vhc->vhc_nphci_drivers = ndrivers + ndrivers1;
6566 	if (vhc->vhc_nphci_drivers == 0)
6567 		return;
6568 
6569 	vhc->vhc_phci_driver_list = kmem_alloc(
6570 	    sizeof (mdi_phci_driver_info_t) * vhc->vhc_nphci_drivers, KM_SLEEP);
6571 
6572 	for (i = 0; i < ndrivers; i++) {
6573 		vhc->vhc_phci_driver_list[i].phdriver_name =
6574 		    i_ddi_strdup(driver_list[i].phdriver_name, KM_SLEEP);
6575 		vhc->vhc_phci_driver_list[i].phdriver_root_support =
6576 		    driver_list[i].phdriver_root_support;
6577 	}
6578 
6579 	for (j = 0; j < ndrivers1; j++, i++) {
6580 		vhc->vhc_phci_driver_list[i].phdriver_name =
6581 		    i_ddi_strdup(driver_list1[j], KM_SLEEP);
6582 		vhc->vhc_phci_driver_list[i].phdriver_root_support = 1;
6583 	}
6584 
6585 	if (ndrivers1)
6586 		ddi_prop_free(driver_list1);
6587 }
6588 
6589 /*
6590  * Free the memory allocated for the phci driver list
6591  */
6592 static void
6593 free_phci_driver_list(mdi_vhci_config_t *vhc)
6594 {
6595 	int i;
6596 
6597 	if (vhc->vhc_phci_driver_list == NULL)
6598 		return;
6599 
6600 	for (i = 0; i < vhc->vhc_nphci_drivers; i++) {
6601 		kmem_free(vhc->vhc_phci_driver_list[i].phdriver_name,
6602 		    strlen(vhc->vhc_phci_driver_list[i].phdriver_name) + 1);
6603 	}
6604 
6605 	kmem_free(vhc->vhc_phci_driver_list,
6606 	    sizeof (mdi_phci_driver_info_t) * vhc->vhc_nphci_drivers);
6607 }
6608 
/*
 * Stop all vhci cache related async threads and free their resources.
 *
 * Sets MDI_VHC_EXIT and polls until the flush thread flag, the build cache
 * thread flag and the async client config thread count have all cleared,
 * then frees the queued async client config entries, flushes the cache if
 * it is dirty and deletes the callback registered in vhc_cbid.
 *
 * Returns MDI_SUCCESS, or MDI_FAILURE if the flush or callb_delete() fails.
 */
static int
stop_vhcache_async_threads(mdi_vhci_config_t *vhc)
{
	mdi_async_client_config_t *acc, *acc_next;

	mutex_enter(&vhc->vhc_lock);
	/* ask the threads to exit and wake any that wait on vhc_cv */
	vhc->vhc_flags |= MDI_VHC_EXIT;
	ASSERT(vhc->vhc_acc_thrcount >= 0);
	cv_broadcast(&vhc->vhc_cv);

	/* poll, dropping the lock for one tick at a time */
	while ((vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) ||
	    (vhc->vhc_flags & MDI_VHC_BUILD_VHCI_CACHE_THREAD) ||
	    vhc->vhc_acc_thrcount != 0) {
		mutex_exit(&vhc->vhc_lock);
		delay(1);
		mutex_enter(&vhc->vhc_lock);
	}

	vhc->vhc_flags &= ~MDI_VHC_EXIT;

	/* discard any async client config requests still queued */
	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc_next) {
		acc_next = acc->acc_next;
		free_async_client_config(acc);
	}
	vhc->vhc_acc_list_head = NULL;
	vhc->vhc_acc_list_tail = NULL;
	vhc->vhc_acc_count = 0;

	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
		/* the flush is done without vhc_lock held */
		mutex_exit(&vhc->vhc_lock);
		if (flush_vhcache(vhc, 0) != MDI_SUCCESS) {
			/* flush failed: mark the cache dirty again */
			vhcache_dirty(vhc);
			return (MDI_FAILURE);
		}
	} else
		mutex_exit(&vhc->vhc_lock);

	if (callb_delete(vhc->vhc_cbid) != 0)
		return (MDI_FAILURE);

	return (MDI_SUCCESS);
}
6655 
/*
 * Stop vhci cache flush thread
 *
 * 'arg' is the mdi_vhci_config_t; 'code' is unused.  Sets MDI_VHC_EXIT,
 * polls until the flush thread flag clears and then flushes the cache
 * itself if it is still dirty.  Always returns B_TRUE.
 */
/* ARGSUSED */
static boolean_t
stop_vhcache_flush_thread(void *arg, int code)
{
	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;

	mutex_enter(&vhc->vhc_lock);
	/* ask the thread to exit and wake it if it waits on vhc_cv */
	vhc->vhc_flags |= MDI_VHC_EXIT;
	cv_broadcast(&vhc->vhc_cv);

	/* poll, dropping the lock for one tick at a time */
	while (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
		mutex_exit(&vhc->vhc_lock);
		delay(1);
		mutex_enter(&vhc->vhc_lock);
	}

	/* NOTE: MDI_VHC_EXIT is left set on return from this function */
	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
		mutex_exit(&vhc->vhc_lock);
		/* the result of the flush is deliberately ignored */
		(void) flush_vhcache(vhc, 1);
	} else
		mutex_exit(&vhc->vhc_lock);

	return (B_TRUE);
}
6684 
6685 /*
6686  * Enqueue the vhcache phci (cphci) at the tail of the list
6687  */
6688 static void
6689 enqueue_vhcache_phci(mdi_vhci_cache_t *vhcache, mdi_vhcache_phci_t *cphci)
6690 {
6691 	cphci->cphci_next = NULL;
6692 	if (vhcache->vhcache_phci_head == NULL)
6693 		vhcache->vhcache_phci_head = cphci;
6694 	else
6695 		vhcache->vhcache_phci_tail->cphci_next = cphci;
6696 	vhcache->vhcache_phci_tail = cphci;
6697 }
6698 
6699 /*
6700  * Enqueue the vhcache pathinfo (cpi) at the tail of the list
6701  */
6702 static void
6703 enqueue_tail_vhcache_pathinfo(mdi_vhcache_client_t *cct,
6704     mdi_vhcache_pathinfo_t *cpi)
6705 {
6706 	cpi->cpi_next = NULL;
6707 	if (cct->cct_cpi_head == NULL)
6708 		cct->cct_cpi_head = cpi;
6709 	else
6710 		cct->cct_cpi_tail->cpi_next = cpi;
6711 	cct->cct_cpi_tail = cpi;
6712 }
6713 
6714 /*
6715  * Enqueue the vhcache pathinfo (cpi) at the correct location in the
6716  * ordered list. All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
6717  * flag set come at the beginning of the list. All cpis which have this
6718  * flag set come at the end of the list.
6719  */
6720 static void
6721 enqueue_vhcache_pathinfo(mdi_vhcache_client_t *cct,
6722     mdi_vhcache_pathinfo_t *newcpi)
6723 {
6724 	mdi_vhcache_pathinfo_t *cpi, *prev_cpi;
6725 
6726 	if (cct->cct_cpi_head == NULL ||
6727 	    (newcpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))
6728 		enqueue_tail_vhcache_pathinfo(cct, newcpi);
6729 	else {
6730 		for (cpi = cct->cct_cpi_head, prev_cpi = NULL; cpi != NULL &&
6731 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST);
6732 		    prev_cpi = cpi, cpi = cpi->cpi_next)
6733 			;
6734 
6735 		if (prev_cpi == NULL)
6736 			cct->cct_cpi_head = newcpi;
6737 		else
6738 			prev_cpi->cpi_next = newcpi;
6739 
6740 		newcpi->cpi_next = cpi;
6741 
6742 		if (cpi == NULL)
6743 			cct->cct_cpi_tail = newcpi;
6744 	}
6745 }
6746 
6747 /*
6748  * Enqueue the vhcache client (cct) at the tail of the list
6749  */
6750 static void
6751 enqueue_vhcache_client(mdi_vhci_cache_t *vhcache,
6752     mdi_vhcache_client_t *cct)
6753 {
6754 	cct->cct_next = NULL;
6755 	if (vhcache->vhcache_client_head == NULL)
6756 		vhcache->vhcache_client_head = cct;
6757 	else
6758 		vhcache->vhcache_client_tail->cct_next = cct;
6759 	vhcache->vhcache_client_tail = cct;
6760 }
6761 
/*
 * Free an array of nelem NUL-terminated strings along with the array itself.
 * A NULL array is ignored, as are NULL entries within the array.  Each string
 * is freed with a size of strlen() + 1.
 */
static void
free_string_array(char **str, int nelem)
{
	int idx;

	if (str == NULL)
		return;

	for (idx = 0; idx < nelem; idx++) {
		char *s = str[idx];

		if (s != NULL)
			kmem_free(s, strlen(s) + 1);
	}

	kmem_free(str, sizeof (char *) * nelem);
}
6775 
6776 static void
6777 free_vhcache_phci(mdi_vhcache_phci_t *cphci)
6778 {
6779 	kmem_free(cphci->cphci_path, strlen(cphci->cphci_path) + 1);
6780 	kmem_free(cphci, sizeof (*cphci));
6781 }
6782 
6783 static void
6784 free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *cpi)
6785 {
6786 	kmem_free(cpi->cpi_addr, strlen(cpi->cpi_addr) + 1);
6787 	kmem_free(cpi, sizeof (*cpi));
6788 }
6789 
6790 static void
6791 free_vhcache_client(mdi_vhcache_client_t *cct)
6792 {
6793 	kmem_free(cct->cct_name_addr, strlen(cct->cct_name_addr) + 1);
6794 	kmem_free(cct, sizeof (*cct));
6795 }
6796 
6797 static char *
6798 vhcache_mknameaddr(char *ct_name, char *ct_addr, int *ret_len)
6799 {
6800 	char *name_addr;
6801 	int len;
6802 
6803 	len = strlen(ct_name) + strlen(ct_addr) + 2;
6804 	name_addr = kmem_alloc(len, KM_SLEEP);
6805 	(void) snprintf(name_addr, len, "%s@%s", ct_name, ct_addr);
6806 
6807 	if (ret_len)
6808 		*ret_len = len;
6809 	return (name_addr);
6810 }
6811 
6812 /*
6813  * Copy the contents of paddrnvl to vhci cache.
6814  * paddrnvl nvlist contains path information for a vhci client.
6815  * See the comment in mainnvl_to_vhcache() for the format of this nvlist.
6816  */
6817 static void
6818 paddrnvl_to_vhcache(nvlist_t *nvl, mdi_vhcache_phci_t *cphci_list[],
6819     mdi_vhcache_client_t *cct)
6820 {
6821 	nvpair_t *nvp = NULL;
6822 	mdi_vhcache_pathinfo_t *cpi;
6823 	uint_t nelem;
6824 	uint32_t *val;
6825 
6826 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
6827 		ASSERT(nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY);
6828 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
6829 		cpi->cpi_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
6830 		(void) nvpair_value_uint32_array(nvp, &val, &nelem);
6831 		ASSERT(nelem == 2);
6832 		cpi->cpi_cphci = cphci_list[val[0]];
6833 		cpi->cpi_flags = val[1];
6834 		enqueue_tail_vhcache_pathinfo(cct, cpi);
6835 	}
6836 }
6837 
6838 /*
6839  * Copy the contents of caddrmapnvl to vhci cache.
6840  * caddrmapnvl nvlist contains vhci client address to phci client address
6841  * mappings. See the comment in mainnvl_to_vhcache() for the format of
6842  * this nvlist.
6843  */
6844 static void
6845 caddrmapnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl,
6846     mdi_vhcache_phci_t *cphci_list[])
6847 {
6848 	nvpair_t *nvp = NULL;
6849 	nvlist_t *paddrnvl;
6850 	mdi_vhcache_client_t *cct;
6851 
6852 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
6853 		ASSERT(nvpair_type(nvp) == DATA_TYPE_NVLIST);
6854 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
6855 		cct->cct_name_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
6856 		(void) nvpair_value_nvlist(nvp, &paddrnvl);
6857 		paddrnvl_to_vhcache(paddrnvl, cphci_list, cct);
6858 		/* the client must contain at least one path */
6859 		ASSERT(cct->cct_cpi_head != NULL);
6860 
6861 		enqueue_vhcache_client(vhcache, cct);
6862 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
6863 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
6864 	}
6865 }
6866 
6867 /*
6868  * Copy the contents of the main nvlist to vhci cache.
6869  *
6870  * VHCI busconfig cached data is stored in the form of a nvlist on the disk.
6871  * The nvlist contains the mappings between the vhci client addresses and
6872  * their corresponding phci client addresses.
6873  *
6874  * The structure of the nvlist is as follows:
6875  *
6876  * Main nvlist:
6877  *	NAME		TYPE		DATA
6878  *	version		int32		version number
6879  *	phcis		string array	array of phci paths
6880  *	clientaddrmap	nvlist_t	c2paddrs_nvl (see below)
6881  *
6882  * structure of c2paddrs_nvl:
6883  *	NAME		TYPE		DATA
6884  *	caddr1		nvlist_t	paddrs_nvl1
6885  *	caddr2		nvlist_t	paddrs_nvl2
6886  *	...
6887  * where caddr1, caddr2, ... are vhci client name and addresses in the
6888  * form of "<clientname>@<clientaddress>".
6889  * (for example: "ssd@2000002037cd9f72");
6890  * paddrs_nvl1, paddrs_nvl2, .. are nvlists that contain path information.
6891  *
6892  * structure of paddrs_nvl:
6893  *	NAME		TYPE		DATA
6894  *	pi_addr1	uint32_array	(phci-id, cpi_flags)
6895  *	pi_addr2	uint32_array	(phci-id, cpi_flags)
6896  *	...
6897  * where pi_addr1, pi_addr2, ... are bus specific addresses of pathinfo nodes
6898  * (so called pi_addrs, for example: "w2100002037cd9f72,0");
6899  * phci-ids are integers that identify PHCIs to which the
6900  * the bus specific address belongs to. These integers are used as an index
6901  * into to the phcis string array in the main nvlist to get the PHCI path.
6902  */
6903 static int
6904 mainnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl)
6905 {
6906 	char **phcis, **phci_namep;
6907 	uint_t nphcis;
6908 	mdi_vhcache_phci_t *cphci, **cphci_list;
6909 	nvlist_t *caddrmapnvl;
6910 	int32_t ver;
6911 	int i;
6912 	size_t cphci_list_size;
6913 
6914 	ASSERT(RW_WRITE_HELD(&vhcache->vhcache_lock));
6915 
6916 	if (nvlist_lookup_int32(nvl, MDI_NVPNAME_VERSION, &ver) != 0 ||
6917 	    ver != MDI_VHCI_CACHE_VERSION)
6918 		return (MDI_FAILURE);
6919 
6920 	if (nvlist_lookup_string_array(nvl, MDI_NVPNAME_PHCIS, &phcis,
6921 	    &nphcis) != 0)
6922 		return (MDI_SUCCESS);
6923 
6924 	ASSERT(nphcis > 0);
6925 
6926 	cphci_list_size = sizeof (mdi_vhcache_phci_t *) * nphcis;
6927 	cphci_list = kmem_alloc(cphci_list_size, KM_SLEEP);
6928 	for (i = 0, phci_namep = phcis; i < nphcis; i++, phci_namep++) {
6929 		cphci = kmem_zalloc(sizeof (mdi_vhcache_phci_t), KM_SLEEP);
6930 		cphci->cphci_path = i_ddi_strdup(*phci_namep, KM_SLEEP);
6931 		enqueue_vhcache_phci(vhcache, cphci);
6932 		cphci_list[i] = cphci;
6933 	}
6934 
6935 	ASSERT(vhcache->vhcache_phci_head != NULL);
6936 
6937 	if (nvlist_lookup_nvlist(nvl, MDI_NVPNAME_CTADDRMAP, &caddrmapnvl) == 0)
6938 		caddrmapnvl_to_vhcache(vhcache, caddrmapnvl, cphci_list);
6939 
6940 	kmem_free(cphci_list, cphci_list_size);
6941 	return (MDI_SUCCESS);
6942 }
6943 
6944 /*
6945  * Build paddrnvl for the specified client using the information in the
6946  * vhci cache and add it to the caddrmapnnvl.
6947  * Returns 0 on success, errno on failure.
6948  */
6949 static int
6950 vhcache_to_paddrnvl(mdi_vhci_cache_t *vhcache, mdi_vhcache_client_t *cct,
6951     nvlist_t *caddrmapnvl)
6952 {
6953 	mdi_vhcache_pathinfo_t *cpi;
6954 	nvlist_t *nvl;
6955 	int err;
6956 	uint32_t val[2];
6957 
6958 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
6959 
6960 	if ((err = nvlist_alloc(&nvl, 0, KM_SLEEP)) != 0)
6961 		return (err);
6962 
6963 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
6964 		val[0] = cpi->cpi_cphci->cphci_id;
6965 		val[1] = cpi->cpi_flags;
6966 		if ((err = nvlist_add_uint32_array(nvl, cpi->cpi_addr, val, 2))
6967 		    != 0)
6968 			goto out;
6969 	}
6970 
6971 	err = nvlist_add_nvlist(caddrmapnvl, cct->cct_name_addr, nvl);
6972 out:
6973 	nvlist_free(nvl);
6974 	return (err);
6975 }
6976 
6977 /*
6978  * Build caddrmapnvl using the information in the vhci cache
6979  * and add it to the mainnvl.
6980  * Returns 0 on success, errno on failure.
6981  */
6982 static int
6983 vhcache_to_caddrmapnvl(mdi_vhci_cache_t *vhcache, nvlist_t *mainnvl)
6984 {
6985 	mdi_vhcache_client_t *cct;
6986 	nvlist_t *nvl;
6987 	int err;
6988 
6989 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
6990 
6991 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0)
6992 		return (err);
6993 
6994 	for (cct = vhcache->vhcache_client_head; cct != NULL;
6995 	    cct = cct->cct_next) {
6996 		if ((err = vhcache_to_paddrnvl(vhcache, cct, nvl)) != 0)
6997 			goto out;
6998 	}
6999 
7000 	err = nvlist_add_nvlist(mainnvl, MDI_NVPNAME_CTADDRMAP, nvl);
7001 out:
7002 	nvlist_free(nvl);
7003 	return (err);
7004 }
7005 
7006 /*
7007  * Build nvlist using the information in the vhci cache.
7008  * See the comment in mainnvl_to_vhcache() for the format of the nvlist.
7009  * Returns nvl on success, NULL on failure.
7010  */
7011 static nvlist_t *
7012 vhcache_to_mainnvl(mdi_vhci_cache_t *vhcache)
7013 {
7014 	mdi_vhcache_phci_t *cphci;
7015 	uint_t phci_count;
7016 	char **phcis;
7017 	nvlist_t *nvl;
7018 	int err, i;
7019 
7020 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0) {
7021 		nvl = NULL;
7022 		goto out;
7023 	}
7024 
7025 	if ((err = nvlist_add_int32(nvl, MDI_NVPNAME_VERSION,
7026 	    MDI_VHCI_CACHE_VERSION)) != 0)
7027 		goto out;
7028 
7029 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7030 	if (vhcache->vhcache_phci_head == NULL) {
7031 		rw_exit(&vhcache->vhcache_lock);
7032 		return (nvl);
7033 	}
7034 
7035 	phci_count = 0;
7036 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7037 	    cphci = cphci->cphci_next)
7038 		cphci->cphci_id = phci_count++;
7039 
7040 	/* build phci pathname list */
7041 	phcis = kmem_alloc(sizeof (char *) * phci_count, KM_SLEEP);
7042 	for (cphci = vhcache->vhcache_phci_head, i = 0; cphci != NULL;
7043 	    cphci = cphci->cphci_next, i++)
7044 		phcis[i] = i_ddi_strdup(cphci->cphci_path, KM_SLEEP);
7045 
7046 	err = nvlist_add_string_array(nvl, MDI_NVPNAME_PHCIS, phcis,
7047 	    phci_count);
7048 	free_string_array(phcis, phci_count);
7049 
7050 	if (err == 0 &&
7051 	    (err = vhcache_to_caddrmapnvl(vhcache, nvl)) == 0) {
7052 		rw_exit(&vhcache->vhcache_lock);
7053 		return (nvl);
7054 	}
7055 
7056 	rw_exit(&vhcache->vhcache_lock);
7057 out:
7058 	if (nvl)
7059 		nvlist_free(nvl);
7060 	return (NULL);
7061 }
7062 
7063 /*
7064  * Lookup vhcache phci structure for the specified phci path.
7065  */
7066 static mdi_vhcache_phci_t *
7067 lookup_vhcache_phci_by_name(mdi_vhci_cache_t *vhcache, char *phci_path)
7068 {
7069 	mdi_vhcache_phci_t *cphci;
7070 
7071 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7072 
7073 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7074 	    cphci = cphci->cphci_next) {
7075 		if (strcmp(cphci->cphci_path, phci_path) == 0)
7076 			return (cphci);
7077 	}
7078 
7079 	return (NULL);
7080 }
7081 
7082 /*
7083  * Lookup vhcache phci structure for the specified phci.
7084  */
7085 static mdi_vhcache_phci_t *
7086 lookup_vhcache_phci_by_addr(mdi_vhci_cache_t *vhcache, mdi_phci_t *ph)
7087 {
7088 	mdi_vhcache_phci_t *cphci;
7089 
7090 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7091 
7092 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7093 	    cphci = cphci->cphci_next) {
7094 		if (cphci->cphci_phci == ph)
7095 			return (cphci);
7096 	}
7097 
7098 	return (NULL);
7099 }
7100 
7101 /*
7102  * Add the specified phci to the vhci cache if not already present.
7103  */
7104 static void
7105 vhcache_phci_add(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7106 {
7107 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7108 	mdi_vhcache_phci_t *cphci;
7109 	char *pathname;
7110 	int cache_updated;
7111 
7112 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7113 
7114 	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
7115 	(void) ddi_pathname(ph->ph_dip, pathname);
7116 	if ((cphci = lookup_vhcache_phci_by_name(vhcache, pathname))
7117 	    != NULL) {
7118 		cphci->cphci_phci = ph;
7119 		cache_updated = 0;
7120 	} else {
7121 		cphci = kmem_zalloc(sizeof (*cphci), KM_SLEEP);
7122 		cphci->cphci_path = i_ddi_strdup(pathname, KM_SLEEP);
7123 		cphci->cphci_phci = ph;
7124 		enqueue_vhcache_phci(vhcache, cphci);
7125 		cache_updated = 1;
7126 	}
7127 	rw_exit(&vhcache->vhcache_lock);
7128 
7129 	kmem_free(pathname, MAXPATHLEN);
7130 	if (cache_updated)
7131 		vhcache_dirty(vhc);
7132 }
7133 
7134 /*
7135  * Remove the reference to the specified phci from the vhci cache.
7136  */
7137 static void
7138 vhcache_phci_remove(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7139 {
7140 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7141 	mdi_vhcache_phci_t *cphci;
7142 
7143 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7144 	if ((cphci = lookup_vhcache_phci_by_addr(vhcache, ph)) != NULL) {
7145 		/* do not remove the actual mdi_vhcache_phci structure */
7146 		cphci->cphci_phci = NULL;
7147 	}
7148 	rw_exit(&vhcache->vhcache_lock);
7149 }
7150 
7151 static void
7152 init_vhcache_lookup_token(mdi_vhcache_lookup_token_t *dst,
7153     mdi_vhcache_lookup_token_t *src)
7154 {
7155 	if (src == NULL) {
7156 		dst->lt_cct = NULL;
7157 		dst->lt_cct_lookup_time = 0;
7158 	} else {
7159 		dst->lt_cct = src->lt_cct;
7160 		dst->lt_cct_lookup_time = src->lt_cct_lookup_time;
7161 	}
7162 }
7163 
7164 /*
7165  * Look up vhcache client for the specified client.
7166  */
7167 static mdi_vhcache_client_t *
7168 lookup_vhcache_client(mdi_vhci_cache_t *vhcache, char *ct_name, char *ct_addr,
7169     mdi_vhcache_lookup_token_t *token)
7170 {
7171 	mod_hash_val_t hv;
7172 	char *name_addr;
7173 	int len;
7174 
7175 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7176 
7177 	/*
7178 	 * If no vhcache clean occurred since the last lookup, we can
7179 	 * simply return the cct from the last lookup operation.
7180 	 * It works because ccts are never freed except during the vhcache
7181 	 * cleanup operation.
7182 	 */
7183 	if (token != NULL &&
7184 	    vhcache->vhcache_clean_time < token->lt_cct_lookup_time)
7185 		return (token->lt_cct);
7186 
7187 	name_addr = vhcache_mknameaddr(ct_name, ct_addr, &len);
7188 	if (mod_hash_find(vhcache->vhcache_client_hash,
7189 	    (mod_hash_key_t)name_addr, &hv) == 0) {
7190 		if (token) {
7191 			token->lt_cct = (mdi_vhcache_client_t *)hv;
7192 			token->lt_cct_lookup_time = lbolt64;
7193 		}
7194 	} else {
7195 		if (token) {
7196 			token->lt_cct = NULL;
7197 			token->lt_cct_lookup_time = 0;
7198 		}
7199 		hv = NULL;
7200 	}
7201 	kmem_free(name_addr, len);
7202 	return ((mdi_vhcache_client_t *)hv);
7203 }
7204 
7205 /*
7206  * Add the specified path to the vhci cache if not already present.
7207  * Also add the vhcache client for the client corresponding to this path
7208  * if it doesn't already exist.
7209  */
7210 static void
7211 vhcache_pi_add(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
7212 {
7213 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7214 	mdi_vhcache_client_t *cct;
7215 	mdi_vhcache_pathinfo_t *cpi;
7216 	mdi_phci_t *ph = pip->pi_phci;
7217 	mdi_client_t *ct = pip->pi_client;
7218 	int cache_updated = 0;
7219 
7220 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7221 
7222 	/* if vhcache client for this pip doesn't already exist, add it */
7223 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
7224 	    NULL)) == NULL) {
7225 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
7226 		cct->cct_name_addr = vhcache_mknameaddr(ct->ct_drvname,
7227 		    ct->ct_guid, NULL);
7228 		enqueue_vhcache_client(vhcache, cct);
7229 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
7230 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
7231 		cache_updated = 1;
7232 	}
7233 
7234 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7235 		if (cpi->cpi_cphci->cphci_phci == ph &&
7236 		    strcmp(cpi->cpi_addr, pip->pi_addr) == 0) {
7237 			cpi->cpi_pip = pip;
7238 			if (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST) {
7239 				cpi->cpi_flags &=
7240 				    ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
7241 				sort_vhcache_paths(cct);
7242 				cache_updated = 1;
7243 			}
7244 			break;
7245 		}
7246 	}
7247 
7248 	if (cpi == NULL) {
7249 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
7250 		cpi->cpi_addr = i_ddi_strdup(pip->pi_addr, KM_SLEEP);
7251 		cpi->cpi_cphci = lookup_vhcache_phci_by_addr(vhcache, ph);
7252 		ASSERT(cpi->cpi_cphci != NULL);
7253 		cpi->cpi_pip = pip;
7254 		enqueue_vhcache_pathinfo(cct, cpi);
7255 		cache_updated = 1;
7256 	}
7257 
7258 	rw_exit(&vhcache->vhcache_lock);
7259 
7260 	if (cache_updated)
7261 		vhcache_dirty(vhc);
7262 }
7263 
7264 /*
7265  * Remove the reference to the specified path from the vhci cache.
7266  */
7267 static void
7268 vhcache_pi_remove(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
7269 {
7270 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7271 	mdi_client_t *ct = pip->pi_client;
7272 	mdi_vhcache_client_t *cct;
7273 	mdi_vhcache_pathinfo_t *cpi;
7274 
7275 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7276 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
7277 	    NULL)) != NULL) {
7278 		for (cpi = cct->cct_cpi_head; cpi != NULL;
7279 		    cpi = cpi->cpi_next) {
7280 			if (cpi->cpi_pip == pip) {
7281 				cpi->cpi_pip = NULL;
7282 				break;
7283 			}
7284 		}
7285 	}
7286 	rw_exit(&vhcache->vhcache_lock);
7287 }
7288 
7289 /*
7290  * Flush the vhci cache to disk.
7291  * Returns MDI_SUCCESS on success, MDI_FAILURE on failure.
7292  */
7293 static int
7294 flush_vhcache(mdi_vhci_config_t *vhc, int force_flag)
7295 {
7296 	nvlist_t *nvl;
7297 	int err;
7298 	int rv;
7299 
7300 	/*
7301 	 * It is possible that the system may shutdown before
7302 	 * i_ddi_io_initialized (during stmsboot for example). To allow for
7303 	 * flushing the cache in this case do not check for
7304 	 * i_ddi_io_initialized when force flag is set.
7305 	 */
7306 	if (force_flag == 0 && !i_ddi_io_initialized())
7307 		return (MDI_FAILURE);
7308 
7309 	if ((nvl = vhcache_to_mainnvl(&vhc->vhc_vhcache)) != NULL) {
7310 		err = fwrite_nvlist(vhc->vhc_vhcache_filename, nvl);
7311 		nvlist_free(nvl);
7312 	} else
7313 		err = EFAULT;
7314 
7315 	rv = MDI_SUCCESS;
7316 	mutex_enter(&vhc->vhc_lock);
7317 	if (err != 0) {
7318 		if (err == EROFS) {
7319 			vhc->vhc_flags |= MDI_VHC_READONLY_FS;
7320 			vhc->vhc_flags &= ~(MDI_VHC_VHCACHE_FLUSH_ERROR |
7321 			    MDI_VHC_VHCACHE_DIRTY);
7322 		} else {
7323 			if (!(vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR)) {
7324 				cmn_err(CE_CONT, "%s: update failed\n",
7325 				    vhc->vhc_vhcache_filename);
7326 				vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_ERROR;
7327 			}
7328 			rv = MDI_FAILURE;
7329 		}
7330 	} else if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR) {
7331 		cmn_err(CE_CONT,
7332 		    "%s: update now ok\n", vhc->vhc_vhcache_filename);
7333 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_ERROR;
7334 	}
7335 	mutex_exit(&vhc->vhc_lock);
7336 
7337 	return (rv);
7338 }
7339 
/*
 * Body of the vhcache flush thread; arg is the mdi_vhci_config_t to serve.
 *
 * Call flush_vhcache() to flush the vhci cache at the scheduled time
 * (vhc_flush_at_ticks). Leaves its main loop when MDI_VHC_EXIT is set or
 * when the cache has stayed clean for the idle timeout period
 * (mdi_vhcache_flush_daemon_idle_time seconds).
 *
 * vhc_lock protects vhc_flags and is held throughout, except while
 * sleeping in cv_timedwait() and while flush_vhcache() is running.
 * MDI_VHC_VHCACHE_FLUSH_THREAD is cleared on the way out.
 */
static void
vhcache_flush_thread(void *arg)
{
	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
	clock_t idle_time, quit_at_ticks;
	callb_cpr_t cprinfo;

	/* number of seconds to sleep idle before exiting */
	idle_time = mdi_vhcache_flush_daemon_idle_time * TICKS_PER_SECOND;

	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
	    "mdi_vhcache_flush");
	mutex_enter(&vhc->vhc_lock);
	for (; ; ) {
		/*
		 * Phase 1: while the cache is dirty, wait for the scheduled
		 * flush time and then flush.
		 */
		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
		    (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY)) {
			if (ddi_get_lbolt() < vhc->vhc_flush_at_ticks) {
				/* not due yet; sleep until the deadline */
				CALLB_CPR_SAFE_BEGIN(&cprinfo);
				(void) cv_timedwait(&vhc->vhc_cv,
				    &vhc->vhc_lock, vhc->vhc_flush_at_ticks);
				CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
			} else {
				/*
				 * Clear the dirty flag before dropping the
				 * lock for the flush itself.
				 */
				vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
				mutex_exit(&vhc->vhc_lock);

				/* a failed flush is rescheduled */
				if (flush_vhcache(vhc, 0) != MDI_SUCCESS)
					vhcache_dirty(vhc);

				mutex_enter(&vhc->vhc_lock);
			}
		}

		/*
		 * Phase 2: the cache is clean (or exit was requested); stay
		 * around for idle_time ticks in case it gets dirty again.
		 */
		quit_at_ticks = ddi_get_lbolt() + idle_time;

		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) &&
		    ddi_get_lbolt() < quit_at_ticks) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
			    quit_at_ticks);
			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
		}

		/* leave on exit request or idle timeout; else flush again */
		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY))
			goto out;
	}

out:
	/* cleared under vhc_lock, which is still held here */
	vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_THREAD;
	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
	CALLB_CPR_EXIT(&cprinfo);
}
7397 
7398 /*
7399  * Make vhci cache dirty and schedule flushing by vhcache flush thread.
7400  */
7401 static void
7402 vhcache_dirty(mdi_vhci_config_t *vhc)
7403 {
7404 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7405 	int create_thread;
7406 
7407 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7408 	/* do not flush cache until the cache is fully built */
7409 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
7410 		rw_exit(&vhcache->vhcache_lock);
7411 		return;
7412 	}
7413 	rw_exit(&vhcache->vhcache_lock);
7414 
7415 	mutex_enter(&vhc->vhc_lock);
7416 	if (vhc->vhc_flags & MDI_VHC_READONLY_FS) {
7417 		mutex_exit(&vhc->vhc_lock);
7418 		return;
7419 	}
7420 
7421 	vhc->vhc_flags |= MDI_VHC_VHCACHE_DIRTY;
7422 	vhc->vhc_flush_at_ticks = ddi_get_lbolt() +
7423 	    mdi_vhcache_flush_delay * TICKS_PER_SECOND;
7424 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
7425 		cv_broadcast(&vhc->vhc_cv);
7426 		create_thread = 0;
7427 	} else {
7428 		vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_THREAD;
7429 		create_thread = 1;
7430 	}
7431 	mutex_exit(&vhc->vhc_lock);
7432 
7433 	if (create_thread)
7434 		(void) thread_create(NULL, 0, vhcache_flush_thread, vhc,
7435 		    0, &p0, TS_RUN, minclsyspri);
7436 }
7437 
/*
 * phci bus config structure - one for each phci bus config operation that
 * we initiate on behalf of a vhci.
 *
 * Allocated and linked by bus_config_all_phcis(); consumed and freed by
 * bus_config_phci().
 */
typedef struct mdi_phci_bus_config_s {
	char *phbc_phci_path;		/* private copy of the phci path */
	struct mdi_vhci_bus_config_s *phbc_vhbusconfig;	/* vhci bus config */
	struct mdi_phci_bus_config_s *phbc_next;	/* next in the list */
} mdi_phci_bus_config_t;
7447 
/*
 * vhci bus config structure - one for each vhci bus config operation.
 * Shared by all the per-phci operations started for it.
 */
typedef struct mdi_vhci_bus_config_s {
	ddi_bus_config_op_t vhbc_op;	/* bus config op */
	major_t vhbc_op_major;		/* bus config op major */
	uint_t vhbc_op_flags;		/* bus config op flags */
	kmutex_t vhbc_lock;		/* protects vhbc_thr_count */
	kcondvar_t vhbc_cv;		/* signalled when count reaches 0 */
	int vhbc_thr_count;		/* per-phci operations outstanding */
} mdi_vhci_bus_config_t;
7457 
7458 /*
7459  * bus config the specified phci
7460  */
7461 static void
7462 bus_config_phci(void *arg)
7463 {
7464 	mdi_phci_bus_config_t *phbc = (mdi_phci_bus_config_t *)arg;
7465 	mdi_vhci_bus_config_t *vhbc = phbc->phbc_vhbusconfig;
7466 	dev_info_t *ph_dip;
7467 
7468 	/*
7469 	 * first configure all path components upto phci and then configure
7470 	 * the phci children.
7471 	 */
7472 	if ((ph_dip = e_ddi_hold_devi_by_path(phbc->phbc_phci_path, 0))
7473 	    != NULL) {
7474 		if (vhbc->vhbc_op == BUS_CONFIG_DRIVER ||
7475 		    vhbc->vhbc_op == BUS_UNCONFIG_DRIVER) {
7476 			(void) ndi_devi_config_driver(ph_dip,
7477 			    vhbc->vhbc_op_flags,
7478 			    vhbc->vhbc_op_major);
7479 		} else
7480 			(void) ndi_devi_config(ph_dip,
7481 			    vhbc->vhbc_op_flags);
7482 
7483 		/* release the hold that e_ddi_hold_devi_by_path() placed */
7484 		ndi_rele_devi(ph_dip);
7485 	}
7486 
7487 	kmem_free(phbc->phbc_phci_path, strlen(phbc->phbc_phci_path) + 1);
7488 	kmem_free(phbc, sizeof (*phbc));
7489 
7490 	mutex_enter(&vhbc->vhbc_lock);
7491 	vhbc->vhbc_thr_count--;
7492 	if (vhbc->vhbc_thr_count == 0)
7493 		cv_broadcast(&vhbc->vhbc_cv);
7494 	mutex_exit(&vhbc->vhbc_lock);
7495 }
7496 
/*
 * Bus config all phcis associated with the vhci in parallel.
 * op must be BUS_CONFIG_DRIVER or BUS_CONFIG_ALL.
 *
 * A work item (mdi_phci_bus_config_t) is built for every phci in the cache
 * while vhcache_lock is held as reader; the items are then dispatched with
 * the lock dropped, one thread per item unless mdi_mtc_off is set, in which
 * case they are run one after another in the calling thread. The function
 * returns only after every item has been processed. Of the caller's flags
 * only NDI_CONFIG_REPROBE and NDI_DRV_CONF_REPROBE are passed on;
 * NDI_NO_EVENT is always added.
 */
static void
bus_config_all_phcis(mdi_vhci_cache_t *vhcache, uint_t flags,
    ddi_bus_config_op_t op, major_t maj)
{
	mdi_phci_bus_config_t *phbc_head = NULL, *phbc, *phbc_next;
	mdi_vhci_bus_config_t *vhbc;
	mdi_vhcache_phci_t *cphci;

	rw_enter(&vhcache->vhcache_lock, RW_READER);
	if (vhcache->vhcache_phci_head == NULL) {
		/* no phcis known; nothing to configure */
		rw_exit(&vhcache->vhcache_lock);
		return;
	}

	vhbc = kmem_zalloc(sizeof (*vhbc), KM_SLEEP);

	/*
	 * Build the work list. Each item gets its own copy of the phci path
	 * so it can be used after the cache lock is dropped. Items are
	 * pushed at the head, so the list is in reverse cache order.
	 */
	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
	    cphci = cphci->cphci_next) {
		phbc = kmem_zalloc(sizeof (*phbc), KM_SLEEP);
		phbc->phbc_phci_path = i_ddi_strdup(cphci->cphci_path,
		    KM_SLEEP);
		phbc->phbc_vhbusconfig = vhbc;
		phbc->phbc_next = phbc_head;
		phbc_head = phbc;
		vhbc->vhbc_thr_count++;
	}
	rw_exit(&vhcache->vhcache_lock);

	/* finish setting up the shared state before any item is started */
	vhbc->vhbc_op = op;
	vhbc->vhbc_op_major = maj;
	vhbc->vhbc_op_flags = NDI_NO_EVENT |
	    (flags & (NDI_CONFIG_REPROBE | NDI_DRV_CONF_REPROBE));
	mutex_init(&vhbc->vhbc_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vhbc->vhbc_cv, NULL, CV_DRIVER, NULL);

	/* now create threads to initiate bus config on all phcis in parallel */
	for (phbc = phbc_head; phbc != NULL; phbc = phbc_next) {
		/* bus_config_phci() frees phbc, so fetch the link first */
		phbc_next = phbc->phbc_next;
		if (mdi_mtc_off)
			bus_config_phci((void *)phbc);
		else
			(void) thread_create(NULL, 0, bus_config_phci, phbc,
			    0, &p0, TS_RUN, minclsyspri);
	}

	mutex_enter(&vhbc->vhbc_lock);
	/* wait until all threads exit */
	while (vhbc->vhbc_thr_count > 0)
		cv_wait(&vhbc->vhbc_cv, &vhbc->vhbc_lock);
	mutex_exit(&vhbc->vhbc_lock);

	/* every item has reported completion; tear down the shared state */
	mutex_destroy(&vhbc->vhbc_lock);
	cv_destroy(&vhbc->vhbc_cv);
	kmem_free(vhbc, sizeof (*vhbc));
}
7556 
7557 /*
7558  * Perform BUS_CONFIG_ONE on the specified child of the phci.
7559  * The path includes the child component in addition to the phci path.
7560  */
7561 static int
7562 bus_config_one_phci_child(char *path)
7563 {
7564 	dev_info_t *ph_dip, *child;
7565 	char *devnm;
7566 	int rv = MDI_FAILURE;
7567 
7568 	/* extract the child component of the phci */
7569 	devnm = strrchr(path, '/');
7570 	*devnm++ = '\0';
7571 
7572 	/*
7573 	 * first configure all path components upto phci and then
7574 	 * configure the phci child.
7575 	 */
7576 	if ((ph_dip = e_ddi_hold_devi_by_path(path, 0)) != NULL) {
7577 		if (ndi_devi_config_one(ph_dip, devnm, &child, NDI_NO_EVENT) ==
7578 		    NDI_SUCCESS) {
7579 			/*
7580 			 * release the hold that ndi_devi_config_one() placed
7581 			 */
7582 			ndi_rele_devi(child);
7583 			rv = MDI_SUCCESS;
7584 		}
7585 
7586 		/* release the hold that e_ddi_hold_devi_by_path() placed */
7587 		ndi_rele_devi(ph_dip);
7588 	}
7589 
7590 	devnm--;
7591 	*devnm = '/';
7592 	return (rv);
7593 }
7594 
7595 /*
7596  * Build a list of phci client paths for the specified vhci client.
7597  * The list includes only those phci client paths which aren't configured yet.
7598  */
7599 static mdi_phys_path_t *
7600 build_phclient_path_list(mdi_vhcache_client_t *cct, char *ct_name)
7601 {
7602 	mdi_vhcache_pathinfo_t *cpi;
7603 	mdi_phys_path_t *pp_head = NULL, *pp_tail = NULL, *pp;
7604 	int config_path, len;
7605 
7606 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7607 		/*
7608 		 * include only those paths that aren't configured.
7609 		 */
7610 		config_path = 0;
7611 		if (cpi->cpi_pip == NULL)
7612 			config_path = 1;
7613 		else {
7614 			MDI_PI_LOCK(cpi->cpi_pip);
7615 			if (MDI_PI_IS_INIT(cpi->cpi_pip))
7616 				config_path = 1;
7617 			MDI_PI_UNLOCK(cpi->cpi_pip);
7618 		}
7619 
7620 		if (config_path) {
7621 			pp = kmem_alloc(sizeof (*pp), KM_SLEEP);
7622 			len = strlen(cpi->cpi_cphci->cphci_path) +
7623 			    strlen(ct_name) + strlen(cpi->cpi_addr) + 3;
7624 			pp->phys_path = kmem_alloc(len, KM_SLEEP);
7625 			(void) snprintf(pp->phys_path, len, "%s/%s@%s",
7626 			    cpi->cpi_cphci->cphci_path, ct_name,
7627 			    cpi->cpi_addr);
7628 			pp->phys_path_next = NULL;
7629 
7630 			if (pp_head == NULL)
7631 				pp_head = pp;
7632 			else
7633 				pp_tail->phys_path_next = pp;
7634 			pp_tail = pp;
7635 		}
7636 	}
7637 
7638 	return (pp_head);
7639 }
7640 
7641 /*
7642  * Free the memory allocated for phci client path list.
7643  */
7644 static void
7645 free_phclient_path_list(mdi_phys_path_t *pp_head)
7646 {
7647 	mdi_phys_path_t *pp, *pp_next;
7648 
7649 	for (pp = pp_head; pp != NULL; pp = pp_next) {
7650 		pp_next = pp->phys_path_next;
7651 		kmem_free(pp->phys_path, strlen(pp->phys_path) + 1);
7652 		kmem_free(pp, sizeof (*pp));
7653 	}
7654 }
7655 
7656 /*
7657  * Allocated async client structure and initialize with the specified values.
7658  */
7659 static mdi_async_client_config_t *
7660 alloc_async_client_config(char *ct_name, char *ct_addr,
7661     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
7662 {
7663 	mdi_async_client_config_t *acc;
7664 
7665 	acc = kmem_alloc(sizeof (*acc), KM_SLEEP);
7666 	acc->acc_ct_name = i_ddi_strdup(ct_name, KM_SLEEP);
7667 	acc->acc_ct_addr = i_ddi_strdup(ct_addr, KM_SLEEP);
7668 	acc->acc_phclient_path_list_head = pp_head;
7669 	init_vhcache_lookup_token(&acc->acc_token, tok);
7670 	acc->acc_next = NULL;
7671 	return (acc);
7672 }
7673 
7674 /*
7675  * Free the memory allocated for the async client structure and their members.
7676  */
7677 static void
7678 free_async_client_config(mdi_async_client_config_t *acc)
7679 {
7680 	if (acc->acc_phclient_path_list_head)
7681 		free_phclient_path_list(acc->acc_phclient_path_list_head);
7682 	kmem_free(acc->acc_ct_name, strlen(acc->acc_ct_name) + 1);
7683 	kmem_free(acc->acc_ct_addr, strlen(acc->acc_ct_addr) + 1);
7684 	kmem_free(acc, sizeof (*acc));
7685 }
7686 
7687 /*
7688  * Sort vhcache pathinfos (cpis) of the specified client.
7689  * All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
7690  * flag set come at the beginning of the list. All cpis which have this
7691  * flag set come at the end of the list.
7692  */
7693 static void
7694 sort_vhcache_paths(mdi_vhcache_client_t *cct)
7695 {
7696 	mdi_vhcache_pathinfo_t *cpi, *cpi_next, *cpi_head;
7697 
7698 	cpi_head = cct->cct_cpi_head;
7699 	cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
7700 	for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
7701 		cpi_next = cpi->cpi_next;
7702 		enqueue_vhcache_pathinfo(cct, cpi);
7703 	}
7704 }
7705 
/*
 * Verify whether MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag setting is correct for
 * every vhcache pathinfo of the specified client. If not adjust the flag
 * setting appropriately.
 *
 * The flag is considered correct when it is set exactly on those cpis that
 * have no pathinfo node (cpi_pip == NULL). After an adjustment the client's
 * cpis are re-sorted and the cache is marked dirty.
 *
 * Note that MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag is persisted in the
 * on-disk vhci cache. So every time this flag is updated the cache must be
 * flushed.
 *
 * tok is the optional lookup token handed to lookup_vhcache_client().
 */
static void
adjust_sort_vhcache_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
    mdi_vhcache_lookup_token_t *tok)
{
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	mdi_vhcache_client_t *cct;
	mdi_vhcache_pathinfo_t *cpi;

	/* start as reader; the writer lock is only taken if really needed */
	rw_enter(&vhcache->vhcache_lock, RW_READER);
	if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, tok))
	    == NULL) {
		rw_exit(&vhcache->vhcache_lock);
		return;
	}

	/*
	 * to avoid unnecessary on-disk cache updates, first check if an
	 * update is really needed. If no update is needed simply return.
	 */
	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
		/* stop at the first cpi whose flag disagrees with cpi_pip */
		if ((cpi->cpi_pip != NULL &&
		    (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST)) ||
		    (cpi->cpi_pip == NULL &&
		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))) {
			break;
		}
	}
	if (cpi == NULL) {
		/* every flag is already correct */
		rw_exit(&vhcache->vhcache_lock);
		return;
	}

	/*
	 * Become writer. If the in-place upgrade fails the lock has to be
	 * dropped and re-acquired, so the client is looked up again once the
	 * writer lock is held.
	 */
	if (rw_tryupgrade(&vhcache->vhcache_lock) == 0) {
		rw_exit(&vhcache->vhcache_lock);
		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
		if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr,
		    tok)) == NULL) {
			rw_exit(&vhcache->vhcache_lock);
			return;
		}
	}

	/* make every flag agree with the presence of a pathinfo node */
	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
		if (cpi->cpi_pip != NULL)
			cpi->cpi_flags &= ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
		else
			cpi->cpi_flags |= MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
	}
	sort_vhcache_paths(cct);

	rw_exit(&vhcache->vhcache_lock);
	/* schedule the flush only after the cache lock has been dropped */
	vhcache_dirty(vhc);
}
7768 
7769 /*
7770  * Configure all specified paths of the client.
7771  */
7772 static void
7773 config_client_paths_sync(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
7774     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
7775 {
7776 	mdi_phys_path_t *pp;
7777 
7778 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next)
7779 		(void) bus_config_one_phci_child(pp->phys_path);
7780 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, tok);
7781 }
7782 
7783 /*
7784  * Dequeue elements from vhci async client config list and bus configure
7785  * their corresponding phci clients.
7786  */
7787 static void
7788 config_client_paths_thread(void *arg)
7789 {
7790 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
7791 	mdi_async_client_config_t *acc;
7792 	clock_t quit_at_ticks;
7793 	clock_t idle_time = mdi_async_config_idle_time * TICKS_PER_SECOND;
7794 	callb_cpr_t cprinfo;
7795 
7796 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
7797 	    "mdi_config_client_paths");
7798 
7799 	for (; ; ) {
7800 		quit_at_ticks = ddi_get_lbolt() + idle_time;
7801 
7802 		mutex_enter(&vhc->vhc_lock);
7803 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
7804 		    vhc->vhc_acc_list_head == NULL &&
7805 		    ddi_get_lbolt() < quit_at_ticks) {
7806 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
7807 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
7808 			    quit_at_ticks);
7809 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
7810 		}
7811 
7812 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
7813 		    vhc->vhc_acc_list_head == NULL)
7814 			goto out;
7815 
7816 		acc = vhc->vhc_acc_list_head;
7817 		vhc->vhc_acc_list_head = acc->acc_next;
7818 		if (vhc->vhc_acc_list_head == NULL)
7819 			vhc->vhc_acc_list_tail = NULL;
7820 		vhc->vhc_acc_count--;
7821 		mutex_exit(&vhc->vhc_lock);
7822 
7823 		config_client_paths_sync(vhc, acc->acc_ct_name,
7824 		    acc->acc_ct_addr, acc->acc_phclient_path_list_head,
7825 		    &acc->acc_token);
7826 
7827 		free_async_client_config(acc);
7828 	}
7829 
7830 out:
7831 	vhc->vhc_acc_thrcount--;
7832 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
7833 	CALLB_CPR_EXIT(&cprinfo);
7834 }
7835 
7836 /*
7837  * Arrange for all the phci client paths (pp_head) for the specified client
7838  * to be bus configured asynchronously by a thread.
7839  */
7840 static void
7841 config_client_paths_async(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
7842     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
7843 {
7844 	mdi_async_client_config_t *acc, *newacc;
7845 	int create_thread;
7846 
7847 	if (pp_head == NULL)
7848 		return;
7849 
7850 	if (mdi_mtc_off) {
7851 		config_client_paths_sync(vhc, ct_name, ct_addr, pp_head, tok);
7852 		free_phclient_path_list(pp_head);
7853 		return;
7854 	}
7855 
7856 	newacc = alloc_async_client_config(ct_name, ct_addr, pp_head, tok);
7857 	ASSERT(newacc);
7858 
7859 	mutex_enter(&vhc->vhc_lock);
7860 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc->acc_next) {
7861 		if (strcmp(ct_name, acc->acc_ct_name) == 0 &&
7862 		    strcmp(ct_addr, acc->acc_ct_addr) == 0) {
7863 			free_async_client_config(newacc);
7864 			mutex_exit(&vhc->vhc_lock);
7865 			return;
7866 		}
7867 	}
7868 
7869 	if (vhc->vhc_acc_list_head == NULL)
7870 		vhc->vhc_acc_list_head = newacc;
7871 	else
7872 		vhc->vhc_acc_list_tail->acc_next = newacc;
7873 	vhc->vhc_acc_list_tail = newacc;
7874 	vhc->vhc_acc_count++;
7875 	if (vhc->vhc_acc_count <= vhc->vhc_acc_thrcount) {
7876 		cv_broadcast(&vhc->vhc_cv);
7877 		create_thread = 0;
7878 	} else {
7879 		vhc->vhc_acc_thrcount++;
7880 		create_thread = 1;
7881 	}
7882 	mutex_exit(&vhc->vhc_lock);
7883 
7884 	if (create_thread)
7885 		(void) thread_create(NULL, 0, config_client_paths_thread, vhc,
7886 		    0, &p0, TS_RUN, minclsyspri);
7887 }
7888 
7889 /*
7890  * Return number of online paths for the specified client.
7891  */
7892 static int
7893 nonline_paths(mdi_vhcache_client_t *cct)
7894 {
7895 	mdi_vhcache_pathinfo_t *cpi;
7896 	int online_count = 0;
7897 
7898 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7899 		if (cpi->cpi_pip != NULL) {
7900 			MDI_PI_LOCK(cpi->cpi_pip);
7901 			if (cpi->cpi_pip->pi_state == MDI_PATHINFO_STATE_ONLINE)
7902 				online_count++;
7903 			MDI_PI_UNLOCK(cpi->cpi_pip);
7904 		}
7905 	}
7906 
7907 	return (online_count);
7908 }
7909 
7910 /*
7911  * Bus configure all paths for the specified vhci client.
7912  * If at least one path for the client is already online, the remaining paths
7913  * will be configured asynchronously. Otherwise, it synchronously configures
7914  * the paths until at least one path is online and then rest of the paths
7915  * will be configured asynchronously.
7916  */
7917 static void
7918 config_client_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr)
7919 {
7920 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7921 	mdi_phys_path_t *pp_head, *pp;
7922 	mdi_vhcache_client_t *cct;
7923 	mdi_vhcache_lookup_token_t tok;
7924 
7925 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7926 
7927 	init_vhcache_lookup_token(&tok, NULL);
7928 
7929 	if (ct_name == NULL || ct_addr == NULL ||
7930 	    (cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, &tok))
7931 	    == NULL ||
7932 	    (pp_head = build_phclient_path_list(cct, ct_name)) == NULL) {
7933 		rw_exit(&vhcache->vhcache_lock);
7934 		return;
7935 	}
7936 
7937 	/* if at least one path is online, configure the rest asynchronously */
7938 	if (nonline_paths(cct) > 0) {
7939 		rw_exit(&vhcache->vhcache_lock);
7940 		config_client_paths_async(vhc, ct_name, ct_addr, pp_head, &tok);
7941 		return;
7942 	}
7943 
7944 	rw_exit(&vhcache->vhcache_lock);
7945 
7946 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next) {
7947 		if (bus_config_one_phci_child(pp->phys_path) == MDI_SUCCESS) {
7948 			rw_enter(&vhcache->vhcache_lock, RW_READER);
7949 
7950 			if ((cct = lookup_vhcache_client(vhcache, ct_name,
7951 			    ct_addr, &tok)) == NULL) {
7952 				rw_exit(&vhcache->vhcache_lock);
7953 				goto out;
7954 			}
7955 
7956 			if (nonline_paths(cct) > 0 &&
7957 			    pp->phys_path_next != NULL) {
7958 				rw_exit(&vhcache->vhcache_lock);
7959 				config_client_paths_async(vhc, ct_name, ct_addr,
7960 				    pp->phys_path_next, &tok);
7961 				pp->phys_path_next = NULL;
7962 				goto out;
7963 			}
7964 
7965 			rw_exit(&vhcache->vhcache_lock);
7966 		}
7967 	}
7968 
7969 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, &tok);
7970 out:
7971 	free_phclient_path_list(pp_head);
7972 }
7973 
7974 static void
7975 single_threaded_vhconfig_enter(mdi_vhci_config_t *vhc)
7976 {
7977 	mutex_enter(&vhc->vhc_lock);
7978 	while (vhc->vhc_flags & MDI_VHC_SINGLE_THREADED)
7979 		cv_wait(&vhc->vhc_cv, &vhc->vhc_lock);
7980 	vhc->vhc_flags |= MDI_VHC_SINGLE_THREADED;
7981 	mutex_exit(&vhc->vhc_lock);
7982 }
7983 
7984 static void
7985 single_threaded_vhconfig_exit(mdi_vhci_config_t *vhc)
7986 {
7987 	mutex_enter(&vhc->vhc_lock);
7988 	vhc->vhc_flags &= ~MDI_VHC_SINGLE_THREADED;
7989 	cv_broadcast(&vhc->vhc_cv);
7990 	mutex_exit(&vhc->vhc_lock);
7991 }
7992 
7993 /*
7994  * Attach the phci driver instances associated with the vhci:
7995  * If root is mounted attach all phci driver instances.
7996  * If root is not mounted, attach the instances of only those phci
7997  * drivers that have the root support.
7998  */
7999 static void
8000 attach_phci_drivers(mdi_vhci_config_t *vhc, int root_mounted)
8001 {
8002 	int  i;
8003 	major_t m;
8004 
8005 	for (i = 0; i < vhc->vhc_nphci_drivers; i++) {
8006 		if (root_mounted == 0 &&
8007 		    vhc->vhc_phci_driver_list[i].phdriver_root_support == 0)
8008 			continue;
8009 
8010 		m = ddi_name_to_major(
8011 		    vhc->vhc_phci_driver_list[i].phdriver_name);
8012 		if (m != (major_t)-1) {
8013 			if (ddi_hold_installed_driver(m) != NULL)
8014 				ddi_rele_driver(m);
8015 		}
8016 	}
8017 }
8018 
8019 /*
8020  * Build vhci cache:
8021  *
8022  * Attach phci driver instances and then drive BUS_CONFIG_ALL on
8023  * the phci driver instances. During this process the cache gets built.
8024  *
8025  * Cache is built fully if the root is mounted (i.e., root_mounted is nonzero).
8026  *
8027  * If the root is not mounted, phci drivers that do not have root support
8028  * are not attached. As a result the cache is built partially. The entries
8029  * in the cache reflect only those phci drivers that have root support.
8030  */
8031 static vhcache_build_status_t
8032 build_vhci_cache(mdi_vhci_config_t *vhc, int root_mounted)
8033 {
8034 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8035 
8036 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8037 	if (vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE) {
8038 		rw_exit(&vhcache->vhcache_lock);
8039 		return (VHCACHE_NOT_REBUILT);
8040 	}
8041 	rw_exit(&vhcache->vhcache_lock);
8042 
8043 	attach_phci_drivers(vhc, root_mounted);
8044 	bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE | NDI_NO_EVENT,
8045 	    BUS_CONFIG_ALL, (major_t)-1);
8046 
8047 	if (root_mounted) {
8048 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8049 		vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
8050 		rw_exit(&vhcache->vhcache_lock);
8051 		vhcache_dirty(vhc);
8052 		return (VHCACHE_FULLY_BUILT);
8053 	} else
8054 		return (VHCACHE_PARTIALLY_BUILT);
8055 }
8056 
8057 /*
8058  * Wait until the root is mounted and then build the vhci cache.
8059  */
8060 static void
8061 build_vhci_cache_thread(void *arg)
8062 {
8063 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
8064 
8065 	mutex_enter(&vhc->vhc_lock);
8066 	while (!modrootloaded && !(vhc->vhc_flags & MDI_VHC_EXIT)) {
8067 		(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
8068 		    ddi_get_lbolt() + 10 * TICKS_PER_SECOND);
8069 	}
8070 	if (vhc->vhc_flags & MDI_VHC_EXIT)
8071 		goto out;
8072 
8073 	mutex_exit(&vhc->vhc_lock);
8074 
8075 	/*
8076 	 * Now that the root is mounted. So build_vhci_cache() will build
8077 	 * the full cache.
8078 	 */
8079 	(void) build_vhci_cache(vhc, 1);
8080 
8081 	mutex_enter(&vhc->vhc_lock);
8082 out:
8083 	vhc->vhc_flags &= ~MDI_VHC_BUILD_VHCI_CACHE_THREAD;
8084 	mutex_exit(&vhc->vhc_lock);
8085 }
8086 
8087 /*
8088  * Build vhci cache - a wrapper for build_vhci_cache().
8089  *
8090  * In a normal case on-disk vhci cache is read and setup during booting.
8091  * But if the on-disk vhci cache is not there or deleted or corrupted then
8092  * this function sets up the vhci cache.
8093  *
8094  * The cache is built fully if the root is mounted.
8095  *
8096  * If the root is not mounted, initially the cache is built reflecting only
8097  * those driver entries that have the root support. A separate thread is
8098  * created to handle the creation of full cache. This thread will wait
8099  * until the root is mounted and then rebuilds the cache.
8100  */
8101 static int
8102 e_build_vhci_cache(mdi_vhci_config_t *vhc)
8103 {
8104 	vhcache_build_status_t rv;
8105 
8106 	single_threaded_vhconfig_enter(vhc);
8107 
8108 	mutex_enter(&vhc->vhc_lock);
8109 	if (vhc->vhc_flags & MDI_VHC_BUILD_VHCI_CACHE_THREAD) {
8110 		if (modrootloaded) {
8111 			cv_broadcast(&vhc->vhc_cv);
8112 			/* wait until build vhci cache thread exits */
8113 			while (vhc->vhc_flags & MDI_VHC_BUILD_VHCI_CACHE_THREAD)
8114 				cv_wait(&vhc->vhc_cv, &vhc->vhc_lock);
8115 			rv = VHCACHE_FULLY_BUILT;
8116 		} else {
8117 			/*
8118 			 * The presense of MDI_VHC_BUILD_VHCI_CACHE_THREAD
8119 			 * flag indicates that the cache has already been
8120 			 * partially built.
8121 			 */
8122 			rv = VHCACHE_PARTIALLY_BUILT;
8123 		}
8124 
8125 		mutex_exit(&vhc->vhc_lock);
8126 		single_threaded_vhconfig_exit(vhc);
8127 		return (rv);
8128 	}
8129 	mutex_exit(&vhc->vhc_lock);
8130 
8131 	rv = build_vhci_cache(vhc, modrootloaded);
8132 
8133 	if (rv == VHCACHE_PARTIALLY_BUILT) {
8134 		/*
8135 		 * create a thread; this thread will wait until the root is
8136 		 * mounted and then fully rebuilds the cache.
8137 		 */
8138 		mutex_enter(&vhc->vhc_lock);
8139 		vhc->vhc_flags |= MDI_VHC_BUILD_VHCI_CACHE_THREAD;
8140 		mutex_exit(&vhc->vhc_lock);
8141 		(void) thread_create(NULL, 0, build_vhci_cache_thread,
8142 		    vhc, 0, &p0, TS_RUN, minclsyspri);
8143 	}
8144 
8145 	single_threaded_vhconfig_exit(vhc);
8146 	return (rv);
8147 }
8148 
8149 /*
8150  * Generic vhci bus config implementation:
8151  *
8152  * Parameters
8153  *	vdip	vhci dip
8154  *	flags	bus config flags
8155  *	op	bus config operation
8156  *	The remaining parameters are bus config operation specific
8157  *
8158  * for BUS_CONFIG_ONE
8159  *	arg	pointer to name@addr
8160  *	child	upon successful return from this function, *child will be
8161  *		set to the configured and held devinfo child node of vdip.
8162  *	ct_addr	pointer to client address (i.e. GUID)
8163  *
8164  * for BUS_CONFIG_DRIVER
8165  *	arg	major number of the driver
8166  *	child and ct_addr parameters are ignored
8167  *
8168  * for BUS_CONFIG_ALL
8169  *	arg, child, and ct_addr parameters are ignored
8170  *
8171  * Note that for the rest of the bus config operations, this function simply
8172  * calls the framework provided default bus config routine.
8173  */
8174 int
8175 mdi_vhci_bus_config(dev_info_t *vdip, uint_t flags, ddi_bus_config_op_t op,
8176     void *arg, dev_info_t **child, char *ct_addr)
8177 {
8178 	mdi_vhci_t *vh = i_devi_get_vhci(vdip);
8179 	mdi_vhci_config_t *vhc = vh->vh_config;
8180 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8181 	vhcache_build_status_t rv = VHCACHE_NOT_REBUILT;
8182 	char *cp;
8183 
8184 	/*
8185 	 * While bus configuring phcis, the phci driver interactions with MDI
8186 	 * cause child nodes to be enumerated under the vhci node for which
8187 	 * they need to ndi_devi_enter the vhci node.
8188 	 *
8189 	 * Unfortunately, to avoid the deadlock, we ourself can not wait for
8190 	 * for the bus config operations on phcis to finish while holding the
8191 	 * ndi_devi_enter lock. To avoid this deadlock, skip bus configs on
8192 	 * phcis and call the default framework provided bus config function
8193 	 * if we are called with ndi_devi_enter lock held.
8194 	 */
8195 	if (DEVI_BUSY_OWNED(vdip)) {
8196 		MDI_DEBUG(2, (CE_NOTE, vdip,
8197 		    "!MDI: vhci bus config: vhci dip is busy owned\n"));
8198 		goto default_bus_config;
8199 	}
8200 
8201 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8202 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
8203 		rw_exit(&vhcache->vhcache_lock);
8204 		rv = e_build_vhci_cache(vhc);
8205 		rw_enter(&vhcache->vhcache_lock, RW_READER);
8206 	}
8207 
8208 	switch (op) {
8209 	case BUS_CONFIG_ONE:
8210 		/* extract node name */
8211 		cp = (char *)arg;
8212 		while (*cp != '\0' && *cp != '@')
8213 			cp++;
8214 		if (*cp == '@') {
8215 			*cp = '\0';
8216 			config_client_paths(vhc, (char *)arg, ct_addr);
8217 			/* config_client_paths() releases the cache_lock */
8218 			*cp = '@';
8219 		} else
8220 			rw_exit(&vhcache->vhcache_lock);
8221 		break;
8222 
8223 	case BUS_CONFIG_DRIVER:
8224 		rw_exit(&vhcache->vhcache_lock);
8225 		if (rv == VHCACHE_NOT_REBUILT)
8226 			bus_config_all_phcis(vhcache, flags, op,
8227 			    (major_t)(uintptr_t)arg);
8228 		break;
8229 
8230 	case BUS_CONFIG_ALL:
8231 		rw_exit(&vhcache->vhcache_lock);
8232 		if (rv == VHCACHE_NOT_REBUILT)
8233 			bus_config_all_phcis(vhcache, flags, op, -1);
8234 		break;
8235 
8236 	default:
8237 		rw_exit(&vhcache->vhcache_lock);
8238 		break;
8239 	}
8240 
8241 
8242 default_bus_config:
8243 	/*
8244 	 * All requested child nodes are enumerated under the vhci.
8245 	 * Now configure them.
8246 	 */
8247 	if (ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
8248 	    NDI_SUCCESS) {
8249 		return (MDI_SUCCESS);
8250 	}
8251 
8252 	return (MDI_FAILURE);
8253 }
8254 
8255 /*
8256  * Read the on-disk vhci cache into an nvlist for the specified vhci class.
8257  */
8258 static nvlist_t *
8259 read_on_disk_vhci_cache(char *vhci_class)
8260 {
8261 	nvlist_t *nvl;
8262 	int err;
8263 	char *filename;
8264 
8265 	filename = vhclass2vhcache_filename(vhci_class);
8266 
8267 	if ((err = fread_nvlist(filename, &nvl)) == 0) {
8268 		kmem_free(filename, strlen(filename) + 1);
8269 		return (nvl);
8270 	} else if (err == EIO)
8271 		cmn_err(CE_WARN, "%s: I/O error, will recreate\n", filename);
8272 	else if (err == EINVAL)
8273 		cmn_err(CE_WARN,
8274 		    "%s: data file corrupted, will recreate\n", filename);
8275 
8276 	kmem_free(filename, strlen(filename) + 1);
8277 	return (NULL);
8278 }
8279 
8280 /*
8281  * Read on-disk vhci cache into nvlists for all vhci classes.
8282  * Called during booting by i_ddi_read_devices_files().
8283  */
8284 void
8285 mdi_read_devices_files(void)
8286 {
8287 	int i;
8288 
8289 	for (i = 0; i < N_VHCI_CLASSES; i++)
8290 		vhcache_nvl[i] = read_on_disk_vhci_cache(vhci_class_list[i]);
8291 }
8292 
8293 /*
8294  * Remove all stale entries from vhci cache.
8295  */
8296 static void
8297 clean_vhcache(mdi_vhci_config_t *vhc)
8298 {
8299 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8300 	mdi_vhcache_phci_t *cphci, *cphci_head, *cphci_next;
8301 	mdi_vhcache_client_t *cct, *cct_head, *cct_next;
8302 	mdi_vhcache_pathinfo_t *cpi, *cpi_head, *cpi_next;
8303 
8304 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8305 
8306 	cct_head = vhcache->vhcache_client_head;
8307 	vhcache->vhcache_client_head = vhcache->vhcache_client_tail = NULL;
8308 	for (cct = cct_head; cct != NULL; cct = cct_next) {
8309 		cct_next = cct->cct_next;
8310 
8311 		cpi_head = cct->cct_cpi_head;
8312 		cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
8313 		for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
8314 			cpi_next = cpi->cpi_next;
8315 			if (cpi->cpi_pip != NULL) {
8316 				ASSERT(cpi->cpi_cphci->cphci_phci != NULL);
8317 				enqueue_tail_vhcache_pathinfo(cct, cpi);
8318 			} else
8319 				free_vhcache_pathinfo(cpi);
8320 		}
8321 
8322 		if (cct->cct_cpi_head != NULL)
8323 			enqueue_vhcache_client(vhcache, cct);
8324 		else {
8325 			(void) mod_hash_destroy(vhcache->vhcache_client_hash,
8326 			    (mod_hash_key_t)cct->cct_name_addr);
8327 			free_vhcache_client(cct);
8328 		}
8329 	}
8330 
8331 	cphci_head = vhcache->vhcache_phci_head;
8332 	vhcache->vhcache_phci_head = vhcache->vhcache_phci_tail = NULL;
8333 	for (cphci = cphci_head; cphci != NULL; cphci = cphci_next) {
8334 		cphci_next = cphci->cphci_next;
8335 		if (cphci->cphci_phci != NULL)
8336 			enqueue_vhcache_phci(vhcache, cphci);
8337 		else
8338 			free_vhcache_phci(cphci);
8339 	}
8340 
8341 	vhcache->vhcache_clean_time = lbolt64;
8342 	rw_exit(&vhcache->vhcache_lock);
8343 	vhcache_dirty(vhc);
8344 }
8345 
8346 /*
8347  * Remove all stale entries from vhci cache.
8348  * Called by i_ddi_clean_devices_files() during the execution of devfsadm -C
8349  */
8350 void
8351 mdi_clean_vhcache(void)
8352 {
8353 	mdi_vhci_t *vh;
8354 
8355 	mutex_enter(&mdi_mutex);
8356 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
8357 		vh->vh_refcnt++;
8358 		mutex_exit(&mdi_mutex);
8359 		clean_vhcache(vh->vh_config);
8360 		mutex_enter(&mdi_mutex);
8361 		vh->vh_refcnt--;
8362 	}
8363 	mutex_exit(&mdi_mutex);
8364 }
8365