xref: /illumos-gate/usr/src/uts/common/os/sunmdi.c (revision 0173c38a73f34277e0c97a19fedfd25d81ba8380)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 #pragma ident	"%Z%%M%	%I%	%E% SMI"
26 
27 /*
28  * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more
29  * detailed discussion of the overall mpxio architecture.
30  *
31  * Default locking order:
32  *
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_phci_mutex))
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_client_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_phci_mutex mdi_phci::ph_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_client_mutex mdi_client::ct_mutex))
 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
 * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
40  */
41 
42 #include <sys/note.h>
43 #include <sys/types.h>
44 #include <sys/varargs.h>
45 #include <sys/param.h>
46 #include <sys/errno.h>
47 #include <sys/uio.h>
48 #include <sys/buf.h>
49 #include <sys/modctl.h>
50 #include <sys/open.h>
51 #include <sys/kmem.h>
52 #include <sys/poll.h>
53 #include <sys/conf.h>
54 #include <sys/bootconf.h>
55 #include <sys/cmn_err.h>
56 #include <sys/stat.h>
57 #include <sys/ddi.h>
58 #include <sys/sunddi.h>
59 #include <sys/ddipropdefs.h>
60 #include <sys/sunndi.h>
61 #include <sys/ndi_impldefs.h>
62 #include <sys/promif.h>
63 #include <sys/sunmdi.h>
64 #include <sys/mdi_impldefs.h>
65 #include <sys/taskq.h>
66 #include <sys/epm.h>
67 #include <sys/sunpm.h>
68 #include <sys/modhash.h>
69 #include <sys/disp.h>
70 #include <sys/autoconf.h>
71 #include <sys/sysmacros.h>
72 
#ifdef	DEBUG
#include <sys/debug.h>
int	mdi_debug = 1;
int	mdi_debug_logonly = 0;
/*
 * MDI_DEBUG(level, stmnt):
 *	Call i_mdi_log with the parenthesized argument list 'stmnt' when
 *	mdi_debug is at least 'level'.  The expansion is wrapped in
 *	do { } while (0) so that it acts as exactly one statement; a bare
 *	'if' expansion would capture an 'else' that follows the call site.
 */
#define	MDI_DEBUG(level, stmnt) \
	do { \
		if (mdi_debug >= (level)) \
			i_mdi_log stmnt; \
	_NOTE(CONSTCOND) } while (0)
static void i_mdi_log(int, dev_info_t *, const char *fmt, ...);
#else	/* !DEBUG */
/* Debug logging compiles away entirely in non-DEBUG builds */
#define	MDI_DEBUG(level, stmnt)
#endif	/* DEBUG */
83 
84 extern pri_t	minclsyspri;
85 extern int	modrootloaded;
86 
87 /*
88  * Global mutex:
89  * Protects vHCI list and structure members.
90  */
91 kmutex_t	mdi_mutex;
92 
93 /*
94  * Registered vHCI class driver lists
95  */
96 int		mdi_vhci_count;
97 mdi_vhci_t	*mdi_vhci_head;
98 mdi_vhci_t	*mdi_vhci_tail;
99 
100 /*
101  * Client Hash Table size
102  */
103 static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;
104 
105 /*
106  * taskq interface definitions
107  */
108 #define	MDI_TASKQ_N_THREADS	8
109 #define	MDI_TASKQ_PRI		minclsyspri
110 #define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
111 #define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)
112 
113 taskq_t				*mdi_taskq;
114 static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;
115 
116 #define	TICKS_PER_SECOND	(drv_usectohz(1000000))
117 
118 /*
119  * The data should be "quiet" for this interval (in seconds) before the
120  * vhci cached data is flushed to the disk.
121  */
122 static int mdi_vhcache_flush_delay = 10;
123 
124 /* number of seconds the vhcache flush daemon will sleep idle before exiting */
125 static int mdi_vhcache_flush_daemon_idle_time = 60;
126 
127 /*
128  * MDI falls back to discovery of all paths when a bus_config_one fails.
129  * The following parameters can be used to tune this operation.
130  *
131  * mdi_path_discovery_boot
132  *	Number of times path discovery will be attempted during early boot.
133  *	Probably there is no reason to ever set this value to greater than one.
134  *
135  * mdi_path_discovery_postboot
136  *	Number of times path discovery will be attempted after early boot.
137  *	Set it to a minimum of two to allow for discovery of iscsi paths which
138  *	may happen very late during booting.
139  *
140  * mdi_path_discovery_interval
141  *	Minimum number of seconds MDI will wait between successive discovery
142  *	of all paths. Set it to -1 to disable discovery of all paths.
143  */
144 static int mdi_path_discovery_boot = 1;
145 static int mdi_path_discovery_postboot = 2;
146 static int mdi_path_discovery_interval = 10;
147 
148 /*
149  * number of seconds the asynchronous configuration thread will sleep idle
150  * before exiting.
151  */
152 static int mdi_async_config_idle_time = 600;
153 
154 static int mdi_bus_config_cache_hash_size = 256;
155 
156 /* turns off multithreaded configuration for certain operations */
157 static int mdi_mtc_off = 0;
158 
159 /*
160  * MDI component property name/value string definitions
161  */
162 const char 		*mdi_component_prop = "mpxio-component";
163 const char		*mdi_component_prop_vhci = "vhci";
164 const char		*mdi_component_prop_phci = "phci";
165 const char		*mdi_component_prop_client = "client";
166 
167 /*
168  * MDI client global unique identifier property name
169  */
170 const char		*mdi_client_guid_prop = "client-guid";
171 
172 /*
173  * MDI client load balancing property name/value string definitions
174  */
175 const char		*mdi_load_balance = "load-balance";
176 const char		*mdi_load_balance_none = "none";
177 const char		*mdi_load_balance_rr = "round-robin";
178 const char		*mdi_load_balance_lba = "logical-block";
179 
180 /*
181  * Obsolete vHCI class definition; to be removed after Leadville update
182  */
183 const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;
184 
185 static char vhci_greeting[] =
186 	"\tThere already exists one vHCI driver for class %s\n"
187 	"\tOnly one vHCI driver for each class is allowed\n";
188 
189 /*
190  * Static function prototypes
191  */
192 static int		i_mdi_phci_offline(dev_info_t *, uint_t);
193 static int		i_mdi_client_offline(dev_info_t *, uint_t);
194 static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
195 static void		i_mdi_phci_post_detach(dev_info_t *,
196 			    ddi_detach_cmd_t, int);
197 static int		i_mdi_client_pre_detach(dev_info_t *,
198 			    ddi_detach_cmd_t);
199 static void		i_mdi_client_post_detach(dev_info_t *,
200 			    ddi_detach_cmd_t, int);
201 static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
202 static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
203 static int 		i_mdi_lba_lb(mdi_client_t *ct,
204 			    mdi_pathinfo_t **ret_pip, struct buf *buf);
205 static void		i_mdi_pm_hold_client(mdi_client_t *, int);
206 static void		i_mdi_pm_rele_client(mdi_client_t *, int);
207 static void		i_mdi_pm_reset_client(mdi_client_t *);
208 static int		i_mdi_power_all_phci(mdi_client_t *);
209 static void		i_mdi_log_sysevent(dev_info_t *, char *, char *);
210 
211 
212 /*
213  * Internal mdi_pathinfo node functions
214  */
215 static int		i_mdi_pi_kstat_create(mdi_pathinfo_t *);
216 static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);
217 
218 static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
219 static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
220 static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
221 static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
222 static void		i_mdi_phci_unlock(mdi_phci_t *);
223 static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
224 static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
225 static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
226 static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
227 			    mdi_client_t *);
228 static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
229 static void		i_mdi_client_remove_path(mdi_client_t *,
230 			    mdi_pathinfo_t *);
231 
232 static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
233 			    mdi_pathinfo_state_t, int);
234 static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
235 static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
236 			    char **, int);
237 static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
238 static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
239 static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
240 static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
241 static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
242 static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
243 static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
244 static void		i_mdi_client_update_state(mdi_client_t *);
245 static int		i_mdi_client_compute_state(mdi_client_t *,
246 			    mdi_phci_t *);
247 static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
248 static void		i_mdi_client_unlock(mdi_client_t *);
249 static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
250 static mdi_client_t	*i_devi_get_client(dev_info_t *);
251 /*
252  * NOTE: this will be removed once the NWS files are changed to use the new
253  * mdi_{enable,disable}_path interfaces
254  */
255 static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *,
256 				int, int);
257 static mdi_pathinfo_t 	*i_mdi_enable_disable_path(mdi_pathinfo_t *pip,
258 				mdi_vhci_t *vh, int flags, int op);
259 /*
260  * Failover related function prototypes
261  */
262 static int		i_mdi_failover(void *);
263 
264 /*
265  * misc internal functions
266  */
267 static int		i_mdi_get_hash_key(char *);
268 static int		i_map_nvlist_error_to_mdi(int);
269 static void		i_mdi_report_path_state(mdi_client_t *,
270 			    mdi_pathinfo_t *);
271 
272 static void		setup_vhci_cache(mdi_vhci_t *);
273 static int		destroy_vhci_cache(mdi_vhci_t *);
274 static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
275 static boolean_t	stop_vhcache_flush_thread(void *, int);
276 static void		free_string_array(char **, int);
277 static void		free_vhcache_phci(mdi_vhcache_phci_t *);
278 static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
279 static void		free_vhcache_client(mdi_vhcache_client_t *);
280 static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
281 static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
282 static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
283 static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
284 static void		vhcache_pi_add(mdi_vhci_config_t *,
285 			    struct mdi_pathinfo *);
286 static void		vhcache_pi_remove(mdi_vhci_config_t *,
287 			    struct mdi_pathinfo *);
288 static void		free_phclient_path_list(mdi_phys_path_t *);
289 static void		sort_vhcache_paths(mdi_vhcache_client_t *);
290 static int		flush_vhcache(mdi_vhci_config_t *, int);
291 static void		vhcache_dirty(mdi_vhci_config_t *);
292 static void		free_async_client_config(mdi_async_client_config_t *);
293 static void		single_threaded_vhconfig_enter(mdi_vhci_config_t *);
294 static void		single_threaded_vhconfig_exit(mdi_vhci_config_t *);
295 static nvlist_t		*read_on_disk_vhci_cache(char *);
296 extern int		fread_nvlist(char *, nvlist_t **);
297 extern int		fwrite_nvlist(char *, nvlist_t *);
298 
299 /* called once when first vhci registers with mdi */
300 static void
301 i_mdi_init()
302 {
303 	static int initialized = 0;
304 
305 	if (initialized)
306 		return;
307 	initialized = 1;
308 
309 	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);
310 	/*
311 	 * Create our taskq resources
312 	 */
313 	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
314 	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
315 	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
316 	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */
317 }
318 
319 /*
320  * mdi_get_component_type():
321  *		Return mpxio component type
322  * Return Values:
323  *		MDI_COMPONENT_NONE
324  *		MDI_COMPONENT_VHCI
325  *		MDI_COMPONENT_PHCI
326  *		MDI_COMPONENT_CLIENT
327  * XXX This doesn't work under multi-level MPxIO and should be
328  *	removed when clients migrate mdi_component_is_*() interfaces.
329  */
330 int
331 mdi_get_component_type(dev_info_t *dip)
332 {
333 	return (DEVI(dip)->devi_mdi_component);
334 }
335 
336 /*
337  * mdi_vhci_register():
338  *		Register a vHCI module with the mpxio framework
339  *		mdi_vhci_register() is called by vHCI drivers to register the
340  *		'class_driver' vHCI driver and its MDI entrypoints with the
341  *		mpxio framework.  The vHCI driver must call this interface as
342  *		part of its attach(9e) handler.
343  *		Competing threads may try to attach mdi_vhci_register() as
344  *		the vHCI drivers are loaded and attached as a result of pHCI
345  *		driver instance registration (mdi_phci_register()) with the
346  *		framework.
347  * Return Values:
348  *		MDI_SUCCESS
349  *		MDI_FAILURE
350  */
351 /*ARGSUSED*/
352 int
353 mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
354     int flags)
355 {
356 	mdi_vhci_t		*vh = NULL;
357 
358 	ASSERT(vops->vo_revision == MDI_VHCI_OPS_REV);
359 	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
360 
361 	i_mdi_init();
362 
363 	mutex_enter(&mdi_mutex);
364 	/*
365 	 * Scan for already registered vhci
366 	 */
367 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
368 		if (strcmp(vh->vh_class, class) == 0) {
369 			/*
370 			 * vHCI has already been created.  Check for valid
371 			 * vHCI ops registration.  We only support one vHCI
372 			 * module per class
373 			 */
374 			if (vh->vh_ops != NULL) {
375 				mutex_exit(&mdi_mutex);
376 				cmn_err(CE_NOTE, vhci_greeting, class);
377 				return (MDI_FAILURE);
378 			}
379 			break;
380 		}
381 	}
382 
383 	/*
384 	 * if not yet created, create the vHCI component
385 	 */
386 	if (vh == NULL) {
387 		struct client_hash	*hash = NULL;
388 		char			*load_balance;
389 
390 		/*
391 		 * Allocate and initialize the mdi extensions
392 		 */
393 		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
394 		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
395 		    KM_SLEEP);
396 		vh->vh_client_table = hash;
397 		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
398 		(void) strcpy(vh->vh_class, class);
399 		vh->vh_lb = LOAD_BALANCE_RR;
400 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
401 		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
402 			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
403 				vh->vh_lb = LOAD_BALANCE_NONE;
404 			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
405 				    == 0) {
406 				vh->vh_lb = LOAD_BALANCE_LBA;
407 			}
408 			ddi_prop_free(load_balance);
409 		}
410 
411 		mutex_init(&vh->vh_phci_mutex, NULL, MUTEX_DEFAULT, NULL);
412 		mutex_init(&vh->vh_client_mutex, NULL, MUTEX_DEFAULT, NULL);
413 
414 		/*
415 		 * Store the vHCI ops vectors
416 		 */
417 		vh->vh_dip = vdip;
418 		vh->vh_ops = vops;
419 
420 		setup_vhci_cache(vh);
421 
422 		if (mdi_vhci_head == NULL) {
423 			mdi_vhci_head = vh;
424 		}
425 		if (mdi_vhci_tail) {
426 			mdi_vhci_tail->vh_next = vh;
427 		}
428 		mdi_vhci_tail = vh;
429 		mdi_vhci_count++;
430 	}
431 
432 	/*
433 	 * Claim the devfs node as a vhci component
434 	 */
435 	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;
436 
437 	/*
438 	 * Initialize our back reference from dev_info node
439 	 */
440 	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
441 	mutex_exit(&mdi_mutex);
442 	return (MDI_SUCCESS);
443 }
444 
445 /*
446  * mdi_vhci_unregister():
447  *		Unregister a vHCI module from mpxio framework
448  *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
449  * 		of a vhci to unregister it from the framework.
450  * Return Values:
451  *		MDI_SUCCESS
452  *		MDI_FAILURE
453  */
454 /*ARGSUSED*/
455 int
456 mdi_vhci_unregister(dev_info_t *vdip, int flags)
457 {
458 	mdi_vhci_t	*found, *vh, *prev = NULL;
459 
460 	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
461 
462 	/*
463 	 * Check for invalid VHCI
464 	 */
465 	if ((vh = i_devi_get_vhci(vdip)) == NULL)
466 		return (MDI_FAILURE);
467 
468 	/*
469 	 * Scan the list of registered vHCIs for a match
470 	 */
471 	mutex_enter(&mdi_mutex);
472 	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
473 		if (found == vh)
474 			break;
475 		prev = found;
476 	}
477 
478 	if (found == NULL) {
479 		mutex_exit(&mdi_mutex);
480 		return (MDI_FAILURE);
481 	}
482 
483 	/*
484 	 * Check the vHCI, pHCI and client count. All the pHCIs and clients
485 	 * should have been unregistered, before a vHCI can be
486 	 * unregistered.
487 	 */
488 	MDI_VHCI_PHCI_LOCK(vh);
489 	if (vh->vh_refcnt || vh->vh_phci_count || vh->vh_client_count) {
490 		MDI_VHCI_PHCI_UNLOCK(vh);
491 		mutex_exit(&mdi_mutex);
492 		return (MDI_FAILURE);
493 	}
494 	MDI_VHCI_PHCI_UNLOCK(vh);
495 
496 	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
497 		mutex_exit(&mdi_mutex);
498 		return (MDI_FAILURE);
499 	}
500 
501 	/*
502 	 * Remove the vHCI from the global list
503 	 */
504 	if (vh == mdi_vhci_head) {
505 		mdi_vhci_head = vh->vh_next;
506 	} else {
507 		prev->vh_next = vh->vh_next;
508 	}
509 	if (vh == mdi_vhci_tail) {
510 		mdi_vhci_tail = prev;
511 	}
512 	mdi_vhci_count--;
513 	mutex_exit(&mdi_mutex);
514 
515 	vh->vh_ops = NULL;
516 	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
517 	DEVI(vdip)->devi_mdi_xhci = NULL;
518 	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
519 	kmem_free(vh->vh_client_table,
520 	    mdi_client_table_size * sizeof (struct client_hash));
521 	mutex_destroy(&vh->vh_phci_mutex);
522 	mutex_destroy(&vh->vh_client_mutex);
523 
524 	kmem_free(vh, sizeof (mdi_vhci_t));
525 	return (MDI_SUCCESS);
526 }
527 
528 /*
529  * i_mdi_vhci_class2vhci():
530  *		Look for a matching vHCI module given a vHCI class name
531  * Return Values:
532  *		Handle to a vHCI component
533  *		NULL
534  */
535 static mdi_vhci_t *
536 i_mdi_vhci_class2vhci(char *class)
537 {
538 	mdi_vhci_t	*vh = NULL;
539 
540 	ASSERT(!MUTEX_HELD(&mdi_mutex));
541 
542 	mutex_enter(&mdi_mutex);
543 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
544 		if (strcmp(vh->vh_class, class) == 0) {
545 			break;
546 		}
547 	}
548 	mutex_exit(&mdi_mutex);
549 	return (vh);
550 }
551 
552 /*
553  * i_devi_get_vhci():
554  *		Utility function to get the handle to a vHCI component
555  * Return Values:
556  *		Handle to a vHCI component
557  *		NULL
558  */
559 mdi_vhci_t *
560 i_devi_get_vhci(dev_info_t *vdip)
561 {
562 	mdi_vhci_t	*vh = NULL;
563 	if (MDI_VHCI(vdip)) {
564 		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
565 	}
566 	return (vh);
567 }
568 
569 /*
570  * mdi_phci_register():
571  *		Register a pHCI module with mpxio framework
572  *		mdi_phci_register() is called by pHCI drivers to register with
573  *		the mpxio framework and a specific 'class_driver' vHCI.  The
574  *		pHCI driver must call this interface as part of its attach(9e)
575  *		handler.
576  * Return Values:
577  *		MDI_SUCCESS
578  *		MDI_FAILURE
579  */
580 /*ARGSUSED*/
581 int
582 mdi_phci_register(char *class, dev_info_t *pdip, int flags)
583 {
584 	mdi_phci_t		*ph;
585 	mdi_vhci_t		*vh;
586 	char			*data;
587 	char			*pathname;
588 
589 	/*
590 	 * Some subsystems, like fcp, perform pHCI registration from a
591 	 * different thread than the one doing the pHCI attach(9E) - the
592 	 * driver attach code is waiting for this other thread to complete.
593 	 * This means we can only ASSERT DEVI_BUSY_CHANGING of parent
594 	 * (indicating that some thread has done an ndi_devi_enter of parent)
595 	 * not DEVI_BUSY_OWNED (which would indicate that we did the enter).
596 	 */
597 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
598 
599 	pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
600 	(void) ddi_pathname(pdip, pathname);
601 
602 	/*
603 	 * Check for mpxio-disable property. Enable mpxio if the property is
604 	 * missing or not set to "yes".
605 	 * If the property is set to "yes" then emit a brief message.
606 	 */
607 	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
608 	    &data) == DDI_SUCCESS)) {
609 		if (strcmp(data, "yes") == 0) {
610 			MDI_DEBUG(1, (CE_CONT, pdip,
611 			    "?%s (%s%d) multipath capabilities "
612 			    "disabled via %s.conf.\n", pathname,
613 			    ddi_driver_name(pdip), ddi_get_instance(pdip),
614 			    ddi_driver_name(pdip)));
615 			ddi_prop_free(data);
616 			kmem_free(pathname, MAXPATHLEN);
617 			return (MDI_FAILURE);
618 		}
619 		ddi_prop_free(data);
620 	}
621 
622 	kmem_free(pathname, MAXPATHLEN);
623 
624 	/*
625 	 * Search for a matching vHCI
626 	 */
627 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
628 	if (vh == NULL) {
629 		return (MDI_FAILURE);
630 	}
631 
632 	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
633 	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
634 	ph->ph_dip = pdip;
635 	ph->ph_vhci = vh;
636 	ph->ph_next = NULL;
637 	ph->ph_unstable = 0;
638 	ph->ph_vprivate = 0;
639 	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);
640 
641 	MDI_PHCI_LOCK(ph);
642 	MDI_PHCI_SET_POWER_UP(ph);
643 	MDI_PHCI_UNLOCK(ph);
644 	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
645 	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;
646 
647 	vhcache_phci_add(vh->vh_config, ph);
648 
649 	MDI_VHCI_PHCI_LOCK(vh);
650 	if (vh->vh_phci_head == NULL) {
651 		vh->vh_phci_head = ph;
652 	}
653 	if (vh->vh_phci_tail) {
654 		vh->vh_phci_tail->ph_next = ph;
655 	}
656 	vh->vh_phci_tail = ph;
657 	vh->vh_phci_count++;
658 	MDI_VHCI_PHCI_UNLOCK(vh);
659 
660 	i_mdi_log_sysevent(pdip, class, ESC_DDI_INITIATOR_REGISTER);
661 	return (MDI_SUCCESS);
662 }
663 
664 /*
665  * mdi_phci_unregister():
666  *		Unregister a pHCI module from mpxio framework
667  *		mdi_phci_unregister() is called by the pHCI drivers from their
668  *		detach(9E) handler to unregister their instances from the
669  *		framework.
670  * Return Values:
671  *		MDI_SUCCESS
672  *		MDI_FAILURE
673  */
674 /*ARGSUSED*/
675 int
676 mdi_phci_unregister(dev_info_t *pdip, int flags)
677 {
678 	mdi_vhci_t		*vh;
679 	mdi_phci_t		*ph;
680 	mdi_phci_t		*tmp;
681 	mdi_phci_t		*prev = NULL;
682 
683 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
684 
685 	ph = i_devi_get_phci(pdip);
686 	if (ph == NULL) {
687 		MDI_DEBUG(1, (CE_WARN, pdip,
688 		    "!pHCI unregister: Not a valid pHCI"));
689 		return (MDI_FAILURE);
690 	}
691 
692 	vh = ph->ph_vhci;
693 	ASSERT(vh != NULL);
694 	if (vh == NULL) {
695 		MDI_DEBUG(1, (CE_WARN, pdip,
696 		    "!pHCI unregister: Not a valid vHCI"));
697 		return (MDI_FAILURE);
698 	}
699 
700 	MDI_VHCI_PHCI_LOCK(vh);
701 	tmp = vh->vh_phci_head;
702 	while (tmp) {
703 		if (tmp == ph) {
704 			break;
705 		}
706 		prev = tmp;
707 		tmp = tmp->ph_next;
708 	}
709 
710 	if (ph == vh->vh_phci_head) {
711 		vh->vh_phci_head = ph->ph_next;
712 	} else {
713 		prev->ph_next = ph->ph_next;
714 	}
715 
716 	if (ph == vh->vh_phci_tail) {
717 		vh->vh_phci_tail = prev;
718 	}
719 
720 	vh->vh_phci_count--;
721 	MDI_VHCI_PHCI_UNLOCK(vh);
722 
723 	i_mdi_log_sysevent(pdip, ph->ph_vhci->vh_class,
724 	    ESC_DDI_INITIATOR_UNREGISTER);
725 	vhcache_phci_remove(vh->vh_config, ph);
726 	cv_destroy(&ph->ph_unstable_cv);
727 	mutex_destroy(&ph->ph_mutex);
728 	kmem_free(ph, sizeof (mdi_phci_t));
729 	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
730 	DEVI(pdip)->devi_mdi_xhci = NULL;
731 	return (MDI_SUCCESS);
732 }
733 
734 /*
735  * i_devi_get_phci():
736  * 		Utility function to return the phci extensions.
737  */
738 static mdi_phci_t *
739 i_devi_get_phci(dev_info_t *pdip)
740 {
741 	mdi_phci_t	*ph = NULL;
742 	if (MDI_PHCI(pdip)) {
743 		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
744 	}
745 	return (ph);
746 }
747 
748 /*
749  * Single thread mdi entry into devinfo node for modifying its children.
750  * If necessary we perform an ndi_devi_enter of the vHCI before doing
751  * an ndi_devi_enter of 'dip'.  We maintain circular in two parts: one
752  * for the vHCI and one for the pHCI.
753  */
754 void
755 mdi_devi_enter(dev_info_t *phci_dip, int *circular)
756 {
757 	dev_info_t	*vdip;
758 	int		vcircular, pcircular;
759 
760 	/* Verify calling context */
761 	ASSERT(MDI_PHCI(phci_dip));
762 	vdip = mdi_devi_get_vdip(phci_dip);
763 	ASSERT(vdip);			/* A pHCI always has a vHCI */
764 
765 	/*
766 	 * If pHCI is detaching then the framework has already entered the
767 	 * vHCI on a threads that went down the code path leading to
768 	 * detach_node().  This framework enter of the vHCI during pHCI
769 	 * detach is done to avoid deadlock with vHCI power management
770 	 * operations which enter the vHCI and the enter down the path
771 	 * to the pHCI. If pHCI is detaching then we piggyback this calls
772 	 * enter of the vHCI on frameworks vHCI enter that has already
773 	 * occurred - this is OK because we know that the framework thread
774 	 * doing detach is waiting for our completion.
775 	 *
776 	 * We should DEVI_IS_DETACHING under an enter of the parent to avoid
777 	 * race with detach - but we can't do that because the framework has
778 	 * already entered the parent, so we have some complexity instead.
779 	 */
780 	for (;;) {
781 		if (ndi_devi_tryenter(vdip, &vcircular)) {
782 			ASSERT(vcircular != -1);
783 			if (DEVI_IS_DETACHING(phci_dip)) {
784 				ndi_devi_exit(vdip, vcircular);
785 				vcircular = -1;
786 			}
787 			break;
788 		} else if (DEVI_IS_DETACHING(phci_dip)) {
789 			vcircular = -1;
790 			break;
791 		} else {
792 			delay(1);
793 		}
794 	}
795 
796 	ndi_devi_enter(phci_dip, &pcircular);
797 	*circular = (vcircular << 16) | (pcircular & 0xFFFF);
798 }
799 
800 /*
801  * Release mdi_devi_enter or successful mdi_devi_tryenter.
802  */
803 void
804 mdi_devi_exit(dev_info_t *phci_dip, int circular)
805 {
806 	dev_info_t	*vdip;
807 	int		vcircular, pcircular;
808 
809 	/* Verify calling context */
810 	ASSERT(MDI_PHCI(phci_dip));
811 	vdip = mdi_devi_get_vdip(phci_dip);
812 	ASSERT(vdip);			/* A pHCI always has a vHCI */
813 
814 	/* extract two circular recursion values from single int */
815 	pcircular = (short)(circular & 0xFFFF);
816 	vcircular = (short)((circular >> 16) & 0xFFFF);
817 
818 	ndi_devi_exit(phci_dip, pcircular);
819 	if (vcircular != -1)
820 		ndi_devi_exit(vdip, vcircular);
821 }
822 
823 /*
824  * The functions mdi_devi_exit_phci() and mdi_devi_enter_phci() are used
825  * around a pHCI drivers calls to mdi_pi_online/offline, after holding
826  * the pathinfo node via mdi_hold_path/mdi_rele_path, to avoid deadlock
827  * with vHCI power management code during path online/offline.  Each
828  * mdi_devi_exit_phci must have a matching mdi_devi_enter_phci, and both must
829  * occur within the scope of an active mdi_devi_enter that establishes the
830  * circular value.
831  */
832 void
833 mdi_devi_exit_phci(dev_info_t *phci_dip, int circular)
834 {
835 	int		pcircular;
836 
837 	/* Verify calling context */
838 	ASSERT(MDI_PHCI(phci_dip));
839 
840 	pcircular = (short)(circular & 0xFFFF);
841 	ndi_devi_exit(phci_dip, pcircular);
842 }
843 
844 void
845 mdi_devi_enter_phci(dev_info_t *phci_dip, int *circular)
846 {
847 	int		pcircular;
848 
849 	/* Verify calling context */
850 	ASSERT(MDI_PHCI(phci_dip));
851 
852 	ndi_devi_enter(phci_dip, &pcircular);
853 
854 	/* verify matching mdi_devi_exit_phci/mdi_devi_enter_phci use */
855 	ASSERT(pcircular == ((short)(*circular & 0xFFFF)));
856 }
857 
858 /*
859  * mdi_devi_get_vdip():
860  *		given a pHCI dip return vHCI dip
861  */
862 dev_info_t *
863 mdi_devi_get_vdip(dev_info_t *pdip)
864 {
865 	mdi_phci_t	*ph;
866 
867 	ph = i_devi_get_phci(pdip);
868 	if (ph && ph->ph_vhci)
869 		return (ph->ph_vhci->vh_dip);
870 	return (NULL);
871 }
872 
873 /*
874  * mdi_devi_pdip_entered():
875  *		Return 1 if we are vHCI and have done an ndi_devi_enter
876  *		of a pHCI
877  */
878 int
879 mdi_devi_pdip_entered(dev_info_t *vdip)
880 {
881 	mdi_vhci_t	*vh;
882 	mdi_phci_t	*ph;
883 
884 	vh = i_devi_get_vhci(vdip);
885 	if (vh == NULL)
886 		return (0);
887 
888 	MDI_VHCI_PHCI_LOCK(vh);
889 	ph = vh->vh_phci_head;
890 	while (ph) {
891 		if (ph->ph_dip && DEVI_BUSY_OWNED(ph->ph_dip)) {
892 			MDI_VHCI_PHCI_UNLOCK(vh);
893 			return (1);
894 		}
895 		ph = ph->ph_next;
896 	}
897 	MDI_VHCI_PHCI_UNLOCK(vh);
898 	return (0);
899 }
900 
901 /*
902  * mdi_phci_path2devinfo():
903  * 		Utility function to search for a valid phci device given
904  *		the devfs pathname.
905  */
906 dev_info_t *
907 mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
908 {
909 	char		*temp_pathname;
910 	mdi_vhci_t	*vh;
911 	mdi_phci_t	*ph;
912 	dev_info_t 	*pdip = NULL;
913 
914 	vh = i_devi_get_vhci(vdip);
915 	ASSERT(vh != NULL);
916 
917 	if (vh == NULL) {
918 		/*
919 		 * Invalid vHCI component, return failure
920 		 */
921 		return (NULL);
922 	}
923 
924 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
925 	MDI_VHCI_PHCI_LOCK(vh);
926 	ph = vh->vh_phci_head;
927 	while (ph != NULL) {
928 		pdip = ph->ph_dip;
929 		ASSERT(pdip != NULL);
930 		*temp_pathname = '\0';
931 		(void) ddi_pathname(pdip, temp_pathname);
932 		if (strcmp(temp_pathname, pathname) == 0) {
933 			break;
934 		}
935 		ph = ph->ph_next;
936 	}
937 	if (ph == NULL) {
938 		pdip = NULL;
939 	}
940 	MDI_VHCI_PHCI_UNLOCK(vh);
941 	kmem_free(temp_pathname, MAXPATHLEN);
942 	return (pdip);
943 }
944 
945 /*
946  * mdi_phci_get_path_count():
947  * 		get number of path information nodes associated with a given
948  *		pHCI device.
949  */
950 int
951 mdi_phci_get_path_count(dev_info_t *pdip)
952 {
953 	mdi_phci_t	*ph;
954 	int		count = 0;
955 
956 	ph = i_devi_get_phci(pdip);
957 	if (ph != NULL) {
958 		count = ph->ph_path_count;
959 	}
960 	return (count);
961 }
962 
963 /*
964  * i_mdi_phci_lock():
965  *		Lock a pHCI device
966  * Return Values:
967  *		None
968  * Note:
969  *		The default locking order is:
970  *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
971  *		But there are number of situations where locks need to be
972  *		grabbed in reverse order.  This routine implements try and lock
973  *		mechanism depending on the requested parameter option.
974  */
975 static void
976 i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
977 {
978 	if (pip) {
979 		/* Reverse locking is requested. */
980 		while (MDI_PHCI_TRYLOCK(ph) == 0) {
981 			/*
982 			 * tryenter failed. Try to grab again
983 			 * after a small delay
984 			 */
985 			MDI_PI_HOLD(pip);
986 			MDI_PI_UNLOCK(pip);
987 			delay(1);
988 			MDI_PI_LOCK(pip);
989 			MDI_PI_RELE(pip);
990 		}
991 	} else {
992 		MDI_PHCI_LOCK(ph);
993 	}
994 }
995 
996 /*
997  * i_mdi_phci_unlock():
998  *		Unlock the pHCI component
999  */
1000 static void
1001 i_mdi_phci_unlock(mdi_phci_t *ph)
1002 {
1003 	MDI_PHCI_UNLOCK(ph);
1004 }
1005 
1006 /*
1007  * i_mdi_devinfo_create():
1008  *		create client device's devinfo node
1009  * Return Values:
1010  *		dev_info
1011  *		NULL
1012  * Notes:
1013  */
1014 static dev_info_t *
1015 i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
1016 	char **compatible, int ncompatible)
1017 {
1018 	dev_info_t *cdip = NULL;
1019 
1020 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1021 
1022 	/* Verify for duplicate entry */
1023 	cdip = i_mdi_devinfo_find(vh, name, guid);
1024 	ASSERT(cdip == NULL);
1025 	if (cdip) {
1026 		cmn_err(CE_WARN,
1027 		    "i_mdi_devinfo_create: client dip %p already exists",
1028 			(void *)cdip);
1029 	}
1030 
1031 	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
1032 	if (cdip == NULL)
1033 		goto fail;
1034 
1035 	/*
1036 	 * Create component type and Global unique identifier
1037 	 * properties
1038 	 */
1039 	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
1040 	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
1041 		goto fail;
1042 	}
1043 
1044 	/* Decorate the node with compatible property */
1045 	if (compatible &&
1046 	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
1047 	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
1048 		goto fail;
1049 	}
1050 
1051 	return (cdip);
1052 
1053 fail:
1054 	if (cdip) {
1055 		(void) ndi_prop_remove_all(cdip);
1056 		(void) ndi_devi_free(cdip);
1057 	}
1058 	return (NULL);
1059 }
1060 
1061 /*
1062  * i_mdi_devinfo_find():
1063  *		Find a matching devinfo node for given client node name
1064  *		and its guid.
1065  * Return Values:
1066  *		Handle to a dev_info node or NULL
1067  */
1068 static dev_info_t *
1069 i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
1070 {
1071 	char			*data;
1072 	dev_info_t 		*cdip = NULL;
1073 	dev_info_t 		*ndip = NULL;
1074 	int			circular;
1075 
1076 	ndi_devi_enter(vh->vh_dip, &circular);
1077 	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
1078 	while ((cdip = ndip) != NULL) {
1079 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1080 
1081 		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
1082 			continue;
1083 		}
1084 
1085 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
1086 		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
1087 		    &data) != DDI_PROP_SUCCESS) {
1088 			continue;
1089 		}
1090 
1091 		if (strcmp(data, guid) != 0) {
1092 			ddi_prop_free(data);
1093 			continue;
1094 		}
1095 		ddi_prop_free(data);
1096 		break;
1097 	}
1098 	ndi_devi_exit(vh->vh_dip, circular);
1099 	return (cdip);
1100 }
1101 
1102 /*
1103  * i_mdi_devinfo_remove():
1104  *		Remove a client device node
1105  */
1106 static int
1107 i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
1108 {
1109 	int	rv = MDI_SUCCESS;
1110 
1111 	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
1112 	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
1113 		rv = ndi_devi_offline(cdip, NDI_DEVI_REMOVE);
1114 		if (rv != NDI_SUCCESS) {
1115 			MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_devinfo_remove:"
1116 			    " failed. cdip = %p\n", (void *)cdip));
1117 		}
1118 		/*
1119 		 * Convert to MDI error code
1120 		 */
1121 		switch (rv) {
1122 		case NDI_SUCCESS:
1123 			rv = MDI_SUCCESS;
1124 			break;
1125 		case NDI_BUSY:
1126 			rv = MDI_BUSY;
1127 			break;
1128 		default:
1129 			rv = MDI_FAILURE;
1130 			break;
1131 		}
1132 	}
1133 	return (rv);
1134 }
1135 
1136 /*
1137  * i_devi_get_client()
1138  *		Utility function to get mpxio component extensions
1139  */
1140 static mdi_client_t *
1141 i_devi_get_client(dev_info_t *cdip)
1142 {
1143 	mdi_client_t	*ct = NULL;
1144 
1145 	if (MDI_CLIENT(cdip)) {
1146 		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
1147 	}
1148 	return (ct);
1149 }
1150 
1151 /*
1152  * i_mdi_is_child_present():
1153  *		Search for the presence of client device dev_info node
1154  */
1155 static int
1156 i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
1157 {
1158 	int		rv = MDI_FAILURE;
1159 	struct dev_info	*dip;
1160 	int		circular;
1161 
1162 	ndi_devi_enter(vdip, &circular);
1163 	dip = DEVI(vdip)->devi_child;
1164 	while (dip) {
1165 		if (dip == DEVI(cdip)) {
1166 			rv = MDI_SUCCESS;
1167 			break;
1168 		}
1169 		dip = dip->devi_sibling;
1170 	}
1171 	ndi_devi_exit(vdip, circular);
1172 	return (rv);
1173 }
1174 
1175 
1176 /*
1177  * i_mdi_client_lock():
1178  *		Grab client component lock
1179  * Return Values:
1180  *		None
1181  * Note:
1182  *		The default locking order is:
1183  *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
1184  *		But there are number of situations where locks need to be
1185  *		grabbed in reverse order.  This routine implements try and lock
1186  *		mechanism depending on the requested parameter option.
1187  */
1188 static void
1189 i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
1190 {
1191 	if (pip) {
1192 		/*
1193 		 * Reverse locking is requested.
1194 		 */
1195 		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
1196 			/*
1197 			 * tryenter failed. Try to grab again
1198 			 * after a small delay
1199 			 */
1200 			MDI_PI_HOLD(pip);
1201 			MDI_PI_UNLOCK(pip);
1202 			delay(1);
1203 			MDI_PI_LOCK(pip);
1204 			MDI_PI_RELE(pip);
1205 		}
1206 	} else {
1207 		MDI_CLIENT_LOCK(ct);
1208 	}
1209 }
1210 
1211 /*
1212  * i_mdi_client_unlock():
1213  *		Unlock a client component
1214  */
1215 static void
1216 i_mdi_client_unlock(mdi_client_t *ct)
1217 {
1218 	MDI_CLIENT_UNLOCK(ct);
1219 }
1220 
1221 /*
1222  * i_mdi_client_alloc():
1223  * 		Allocate and initialize a client structure.  Caller should
1224  *		hold the vhci client lock.
1225  * Return Values:
1226  *		Handle to a client component
1227  */
1228 /*ARGSUSED*/
1229 static mdi_client_t *
1230 i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
1231 {
1232 	mdi_client_t	*ct;
1233 
1234 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1235 
1236 	/*
1237 	 * Allocate and initialize a component structure.
1238 	 */
1239 	ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
1240 	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1241 	ct->ct_hnext = NULL;
1242 	ct->ct_hprev = NULL;
1243 	ct->ct_dip = NULL;
1244 	ct->ct_vhci = vh;
1245 	ct->ct_drvname = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1246 	(void) strcpy(ct->ct_drvname, name);
1247 	ct->ct_guid = kmem_alloc(strlen(lguid) + 1, KM_SLEEP);
1248 	(void) strcpy(ct->ct_guid, lguid);
1249 	ct->ct_cprivate = NULL;
1250 	ct->ct_vprivate = NULL;
1251 	ct->ct_flags = 0;
1252 	ct->ct_state = MDI_CLIENT_STATE_FAILED;
1253 	MDI_CLIENT_LOCK(ct);
1254 	MDI_CLIENT_SET_OFFLINE(ct);
1255 	MDI_CLIENT_SET_DETACH(ct);
1256 	MDI_CLIENT_SET_POWER_UP(ct);
1257 	MDI_CLIENT_UNLOCK(ct);
1258 	ct->ct_failover_flags = 0;
1259 	ct->ct_failover_status = 0;
1260 	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
1261 	ct->ct_unstable = 0;
1262 	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
1263 	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
1264 	ct->ct_lb = vh->vh_lb;
1265 	ct->ct_lb_args =  kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
1266 	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
1267 	ct->ct_path_count = 0;
1268 	ct->ct_path_head = NULL;
1269 	ct->ct_path_tail = NULL;
1270 	ct->ct_path_last = NULL;
1271 
1272 	/*
1273 	 * Add this client component to our client hash queue
1274 	 */
1275 	i_mdi_client_enlist_table(vh, ct);
1276 	return (ct);
1277 }
1278 
1279 /*
1280  * i_mdi_client_enlist_table():
1281  *		Attach the client device to the client hash table. Caller
1282  *		should hold the vhci client lock.
1283  */
1284 static void
1285 i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1286 {
1287 	int 			index;
1288 	struct client_hash	*head;
1289 
1290 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1291 
1292 	index = i_mdi_get_hash_key(ct->ct_guid);
1293 	head = &vh->vh_client_table[index];
1294 	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
1295 	head->ct_hash_head = ct;
1296 	head->ct_hash_count++;
1297 	vh->vh_client_count++;
1298 }
1299 
1300 /*
1301  * i_mdi_client_delist_table():
1302  *		Attach the client device to the client hash table.
1303  *		Caller should hold the vhci client lock.
1304  */
1305 static void
1306 i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1307 {
1308 	int			index;
1309 	char			*guid;
1310 	struct client_hash 	*head;
1311 	mdi_client_t		*next;
1312 	mdi_client_t		*last;
1313 
1314 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1315 
1316 	guid = ct->ct_guid;
1317 	index = i_mdi_get_hash_key(guid);
1318 	head = &vh->vh_client_table[index];
1319 
1320 	last = NULL;
1321 	next = (mdi_client_t *)head->ct_hash_head;
1322 	while (next != NULL) {
1323 		if (next == ct) {
1324 			break;
1325 		}
1326 		last = next;
1327 		next = next->ct_hnext;
1328 	}
1329 
1330 	if (next) {
1331 		head->ct_hash_count--;
1332 		if (last == NULL) {
1333 			head->ct_hash_head = ct->ct_hnext;
1334 		} else {
1335 			last->ct_hnext = ct->ct_hnext;
1336 		}
1337 		ct->ct_hnext = NULL;
1338 		vh->vh_client_count--;
1339 	}
1340 }
1341 
1342 
1343 /*
1344  * i_mdi_client_free():
1345  *		Free a client component
1346  */
1347 static int
1348 i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
1349 {
1350 	int		rv = MDI_SUCCESS;
1351 	int		flags = ct->ct_flags;
1352 	dev_info_t	*cdip;
1353 	dev_info_t	*vdip;
1354 
1355 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1356 
1357 	vdip = vh->vh_dip;
1358 	cdip = ct->ct_dip;
1359 
1360 	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
1361 	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
1362 	DEVI(cdip)->devi_mdi_client = NULL;
1363 
1364 	/*
1365 	 * Clear out back ref. to dev_info_t node
1366 	 */
1367 	ct->ct_dip = NULL;
1368 
1369 	/*
1370 	 * Remove this client from our hash queue
1371 	 */
1372 	i_mdi_client_delist_table(vh, ct);
1373 
1374 	/*
1375 	 * Uninitialize and free the component
1376 	 */
1377 	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
1378 	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
1379 	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
1380 	cv_destroy(&ct->ct_failover_cv);
1381 	cv_destroy(&ct->ct_unstable_cv);
1382 	cv_destroy(&ct->ct_powerchange_cv);
1383 	mutex_destroy(&ct->ct_mutex);
1384 	kmem_free(ct, sizeof (*ct));
1385 
1386 	if (cdip != NULL) {
1387 		MDI_VHCI_CLIENT_UNLOCK(vh);
1388 		(void) i_mdi_devinfo_remove(vdip, cdip, flags);
1389 		MDI_VHCI_CLIENT_LOCK(vh);
1390 	}
1391 	return (rv);
1392 }
1393 
1394 /*
1395  * i_mdi_client_find():
1396  * 		Find the client structure corresponding to a given guid
1397  *		Caller should hold the vhci client lock.
1398  */
1399 static mdi_client_t *
1400 i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
1401 {
1402 	int			index;
1403 	struct client_hash	*head;
1404 	mdi_client_t		*ct;
1405 
1406 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1407 
1408 	index = i_mdi_get_hash_key(guid);
1409 	head = &vh->vh_client_table[index];
1410 
1411 	ct = head->ct_hash_head;
1412 	while (ct != NULL) {
1413 		if (strcmp(ct->ct_guid, guid) == 0 &&
1414 		    (cname == NULL || strcmp(ct->ct_drvname, cname) == 0)) {
1415 			break;
1416 		}
1417 		ct = ct->ct_hnext;
1418 	}
1419 	return (ct);
1420 }
1421 
1422 /*
1423  * i_mdi_client_update_state():
1424  *		Compute and update client device state
1425  * Notes:
1426  *		A client device can be in any of three possible states:
1427  *
1428  *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
1429  *		one online/standby paths. Can tolerate failures.
1430  *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
1431  *		no alternate paths available as standby. A failure on the online
1432  *		would result in loss of access to device data.
1433  *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
1434  *		no paths available to access the device.
1435  */
1436 static void
1437 i_mdi_client_update_state(mdi_client_t *ct)
1438 {
1439 	int state;
1440 
1441 	ASSERT(MDI_CLIENT_LOCKED(ct));
1442 	state = i_mdi_client_compute_state(ct, NULL);
1443 	MDI_CLIENT_SET_STATE(ct, state);
1444 }
1445 
1446 /*
1447  * i_mdi_client_compute_state():
1448  *		Compute client device state
1449  *
1450  *		mdi_phci_t *	Pointer to pHCI structure which should
1451  *				while computing the new value.  Used by
1452  *				i_mdi_phci_offline() to find the new
1453  *				client state after DR of a pHCI.
1454  */
1455 static int
1456 i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
1457 {
1458 	int		state;
1459 	int		online_count = 0;
1460 	int		standby_count = 0;
1461 	mdi_pathinfo_t	*pip, *next;
1462 
1463 	ASSERT(MDI_CLIENT_LOCKED(ct));
1464 	pip = ct->ct_path_head;
1465 	while (pip != NULL) {
1466 		MDI_PI_LOCK(pip);
1467 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1468 		if (MDI_PI(pip)->pi_phci == ph) {
1469 			MDI_PI_UNLOCK(pip);
1470 			pip = next;
1471 			continue;
1472 		}
1473 
1474 		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1475 				== MDI_PATHINFO_STATE_ONLINE)
1476 			online_count++;
1477 		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1478 				== MDI_PATHINFO_STATE_STANDBY)
1479 			standby_count++;
1480 		MDI_PI_UNLOCK(pip);
1481 		pip = next;
1482 	}
1483 
1484 	if (online_count == 0) {
1485 		if (standby_count == 0) {
1486 			state = MDI_CLIENT_STATE_FAILED;
1487 			MDI_DEBUG(2, (CE_NOTE, NULL, "!client state: failed"
1488 			    " ct = %p\n", (void *)ct));
1489 		} else if (standby_count == 1) {
1490 			state = MDI_CLIENT_STATE_DEGRADED;
1491 		} else {
1492 			state = MDI_CLIENT_STATE_OPTIMAL;
1493 		}
1494 	} else if (online_count == 1) {
1495 		if (standby_count == 0) {
1496 			state = MDI_CLIENT_STATE_DEGRADED;
1497 		} else {
1498 			state = MDI_CLIENT_STATE_OPTIMAL;
1499 		}
1500 	} else {
1501 		state = MDI_CLIENT_STATE_OPTIMAL;
1502 	}
1503 	return (state);
1504 }
1505 
1506 /*
1507  * i_mdi_client2devinfo():
1508  *		Utility function
1509  */
1510 dev_info_t *
1511 i_mdi_client2devinfo(mdi_client_t *ct)
1512 {
1513 	return (ct->ct_dip);
1514 }
1515 
1516 /*
1517  * mdi_client_path2_devinfo():
1518  * 		Given the parent devinfo and child devfs pathname, search for
1519  *		a valid devfs node handle.
1520  */
1521 dev_info_t *
1522 mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
1523 {
1524 	dev_info_t 	*cdip = NULL;
1525 	dev_info_t 	*ndip = NULL;
1526 	char		*temp_pathname;
1527 	int		circular;
1528 
1529 	/*
1530 	 * Allocate temp buffer
1531 	 */
1532 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1533 
1534 	/*
1535 	 * Lock parent against changes
1536 	 */
1537 	ndi_devi_enter(vdip, &circular);
1538 	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
1539 	while ((cdip = ndip) != NULL) {
1540 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1541 
1542 		*temp_pathname = '\0';
1543 		(void) ddi_pathname(cdip, temp_pathname);
1544 		if (strcmp(temp_pathname, pathname) == 0) {
1545 			break;
1546 		}
1547 	}
1548 	/*
1549 	 * Release devinfo lock
1550 	 */
1551 	ndi_devi_exit(vdip, circular);
1552 
1553 	/*
1554 	 * Free the temp buffer
1555 	 */
1556 	kmem_free(temp_pathname, MAXPATHLEN);
1557 	return (cdip);
1558 }
1559 
1560 /*
1561  * mdi_client_get_path_count():
1562  * 		Utility function to get number of path information nodes
1563  *		associated with a given client device.
1564  */
1565 int
1566 mdi_client_get_path_count(dev_info_t *cdip)
1567 {
1568 	mdi_client_t	*ct;
1569 	int		count = 0;
1570 
1571 	ct = i_devi_get_client(cdip);
1572 	if (ct != NULL) {
1573 		count = ct->ct_path_count;
1574 	}
1575 	return (count);
1576 }
1577 
1578 
1579 /*
1580  * i_mdi_get_hash_key():
1581  * 		Create a hash using strings as keys
1582  *
1583  */
1584 static int
1585 i_mdi_get_hash_key(char *str)
1586 {
1587 	uint32_t	g, hash = 0;
1588 	char		*p;
1589 
1590 	for (p = str; *p != '\0'; p++) {
1591 		g = *p;
1592 		hash += g;
1593 	}
1594 	return (hash % (CLIENT_HASH_TABLE_SIZE - 1));
1595 }
1596 
1597 /*
1598  * mdi_get_lb_policy():
1599  * 		Get current load balancing policy for a given client device
1600  */
1601 client_lb_t
1602 mdi_get_lb_policy(dev_info_t *cdip)
1603 {
1604 	client_lb_t	lb = LOAD_BALANCE_NONE;
1605 	mdi_client_t	*ct;
1606 
1607 	ct = i_devi_get_client(cdip);
1608 	if (ct != NULL) {
1609 		lb = ct->ct_lb;
1610 	}
1611 	return (lb);
1612 }
1613 
1614 /*
1615  * mdi_set_lb_region_size():
1616  * 		Set current region size for the load-balance
1617  */
1618 int
1619 mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
1620 {
1621 	mdi_client_t	*ct;
1622 	int		rv = MDI_FAILURE;
1623 
1624 	ct = i_devi_get_client(cdip);
1625 	if (ct != NULL && ct->ct_lb_args != NULL) {
1626 		ct->ct_lb_args->region_size = region_size;
1627 		rv = MDI_SUCCESS;
1628 	}
1629 	return (rv);
1630 }
1631 
1632 /*
1633  * mdi_Set_lb_policy():
1634  * 		Set current load balancing policy for a given client device
1635  */
1636 int
1637 mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
1638 {
1639 	mdi_client_t	*ct;
1640 	int		rv = MDI_FAILURE;
1641 
1642 	ct = i_devi_get_client(cdip);
1643 	if (ct != NULL) {
1644 		ct->ct_lb = lb;
1645 		rv = MDI_SUCCESS;
1646 	}
1647 	return (rv);
1648 }
1649 
1650 /*
1651  * mdi_failover():
1652  *		failover function called by the vHCI drivers to initiate
1653  *		a failover operation.  This is typically due to non-availability
1654  *		of online paths to route I/O requests.  Failover can be
1655  *		triggered through user application also.
1656  *
1657  *		The vHCI driver calls mdi_failover() to initiate a failover
1658  *		operation. mdi_failover() calls back into the vHCI driver's
1659  *		vo_failover() entry point to perform the actual failover
1660  *		operation.  The reason for requiring the vHCI driver to
1661  *		initiate failover by calling mdi_failover(), instead of directly
1662  *		executing vo_failover() itself, is to ensure that the mdi
1663  *		framework can keep track of the client state properly.
1664  *		Additionally, mdi_failover() provides as a convenience the
1665  *		option of performing the failover operation synchronously or
1666  *		asynchronously
1667  *
1668  *		Upon successful completion of the failover operation, the
1669  *		paths that were previously ONLINE will be in the STANDBY state,
1670  *		and the newly activated paths will be in the ONLINE state.
1671  *
1672  *		The flags modifier determines whether the activation is done
1673  *		synchronously: MDI_FAILOVER_SYNC
1674  * Return Values:
1675  *		MDI_SUCCESS
1676  *		MDI_FAILURE
1677  *		MDI_BUSY
1678  */
1679 /*ARGSUSED*/
1680 int
1681 mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
1682 {
1683 	int			rv;
1684 	mdi_client_t		*ct;
1685 
1686 	ct = i_devi_get_client(cdip);
1687 	ASSERT(ct != NULL);
1688 	if (ct == NULL) {
1689 		/* cdip is not a valid client device. Nothing more to do. */
1690 		return (MDI_FAILURE);
1691 	}
1692 
1693 	MDI_CLIENT_LOCK(ct);
1694 
1695 	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
1696 		/* A path to the client is being freed */
1697 		MDI_CLIENT_UNLOCK(ct);
1698 		return (MDI_BUSY);
1699 	}
1700 
1701 
1702 	if (MDI_CLIENT_IS_FAILED(ct)) {
1703 		/*
1704 		 * Client is in failed state. Nothing more to do.
1705 		 */
1706 		MDI_CLIENT_UNLOCK(ct);
1707 		return (MDI_FAILURE);
1708 	}
1709 
1710 	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1711 		/*
1712 		 * Failover is already in progress; return BUSY
1713 		 */
1714 		MDI_CLIENT_UNLOCK(ct);
1715 		return (MDI_BUSY);
1716 	}
1717 	/*
1718 	 * Make sure that mdi_pathinfo node state changes are processed.
1719 	 * We do not allow failovers to progress while client path state
1720 	 * changes are in progress
1721 	 */
1722 	if (ct->ct_unstable) {
1723 		if (flags == MDI_FAILOVER_ASYNC) {
1724 			MDI_CLIENT_UNLOCK(ct);
1725 			return (MDI_BUSY);
1726 		} else {
1727 			while (ct->ct_unstable)
1728 				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
1729 		}
1730 	}
1731 
1732 	/*
1733 	 * Client device is in stable state. Before proceeding, perform sanity
1734 	 * checks again.
1735 	 */
1736 	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
1737 	    (!i_ddi_devi_attached(ct->ct_dip))) {
1738 		/*
1739 		 * Client is in failed state. Nothing more to do.
1740 		 */
1741 		MDI_CLIENT_UNLOCK(ct);
1742 		return (MDI_FAILURE);
1743 	}
1744 
1745 	/*
1746 	 * Set the client state as failover in progress.
1747 	 */
1748 	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
1749 	ct->ct_failover_flags = flags;
1750 	MDI_CLIENT_UNLOCK(ct);
1751 
1752 	if (flags == MDI_FAILOVER_ASYNC) {
1753 		/*
1754 		 * Submit the initiate failover request via CPR safe
1755 		 * taskq threads.
1756 		 */
1757 		(void) taskq_dispatch(mdi_taskq, (task_func_t *)i_mdi_failover,
1758 		    ct, KM_SLEEP);
1759 		return (MDI_ACCEPT);
1760 	} else {
1761 		/*
1762 		 * Synchronous failover mode.  Typically invoked from the user
1763 		 * land.
1764 		 */
1765 		rv = i_mdi_failover(ct);
1766 	}
1767 	return (rv);
1768 }
1769 
1770 /*
1771  * i_mdi_failover():
1772  *		internal failover function. Invokes vHCI drivers failover
1773  *		callback function and process the failover status
1774  * Return Values:
1775  *		None
1776  *
1777  * Note: A client device in failover state can not be detached or freed.
1778  */
1779 static int
1780 i_mdi_failover(void *arg)
1781 {
1782 	int		rv = MDI_SUCCESS;
1783 	mdi_client_t	*ct = (mdi_client_t *)arg;
1784 	mdi_vhci_t	*vh = ct->ct_vhci;
1785 
1786 	ASSERT(!MDI_CLIENT_LOCKED(ct));
1787 
1788 	if (vh->vh_ops->vo_failover != NULL) {
1789 		/*
1790 		 * Call vHCI drivers callback routine
1791 		 */
1792 		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
1793 		    ct->ct_failover_flags);
1794 	}
1795 
1796 	MDI_CLIENT_LOCK(ct);
1797 	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);
1798 
1799 	/*
1800 	 * Save the failover return status
1801 	 */
1802 	ct->ct_failover_status = rv;
1803 
1804 	/*
1805 	 * As a result of failover, client status would have been changed.
1806 	 * Update the client state and wake up anyone waiting on this client
1807 	 * device.
1808 	 */
1809 	i_mdi_client_update_state(ct);
1810 
1811 	cv_broadcast(&ct->ct_failover_cv);
1812 	MDI_CLIENT_UNLOCK(ct);
1813 	return (rv);
1814 }
1815 
1816 /*
1817  * Load balancing is logical block.
1818  * IOs within the range described by region_size
1819  * would go on the same path. This would improve the
1820  * performance by cache-hit on some of the RAID devices.
1821  * Search only for online paths(At some point we
1822  * may want to balance across target ports).
1823  * If no paths are found then default to round-robin.
1824  */
1825 static int
1826 i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
1827 {
1828 	int		path_index = -1;
1829 	int		online_path_count = 0;
1830 	int		online_nonpref_path_count = 0;
1831 	int 		region_size = ct->ct_lb_args->region_size;
1832 	mdi_pathinfo_t	*pip;
1833 	mdi_pathinfo_t	*next;
1834 	int		preferred, path_cnt;
1835 
1836 	pip = ct->ct_path_head;
1837 	while (pip) {
1838 		MDI_PI_LOCK(pip);
1839 		if (MDI_PI(pip)->pi_state ==
1840 		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
1841 			online_path_count++;
1842 		} else if (MDI_PI(pip)->pi_state ==
1843 		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
1844 			online_nonpref_path_count++;
1845 		}
1846 		next = (mdi_pathinfo_t *)
1847 		    MDI_PI(pip)->pi_client_link;
1848 		MDI_PI_UNLOCK(pip);
1849 		pip = next;
1850 	}
1851 	/* if found any online/preferred then use this type */
1852 	if (online_path_count > 0) {
1853 		path_cnt = online_path_count;
1854 		preferred = 1;
1855 	} else if (online_nonpref_path_count > 0) {
1856 		path_cnt = online_nonpref_path_count;
1857 		preferred = 0;
1858 	} else {
1859 		path_cnt = 0;
1860 	}
1861 	if (path_cnt) {
1862 		path_index = (bp->b_blkno >> region_size) % path_cnt;
1863 		pip = ct->ct_path_head;
1864 		while (pip && path_index != -1) {
1865 			MDI_PI_LOCK(pip);
1866 			if (path_index == 0 &&
1867 			    (MDI_PI(pip)->pi_state ==
1868 			    MDI_PATHINFO_STATE_ONLINE) &&
1869 				MDI_PI(pip)->pi_preferred == preferred) {
1870 				MDI_PI_HOLD(pip);
1871 				MDI_PI_UNLOCK(pip);
1872 				*ret_pip = pip;
1873 				return (MDI_SUCCESS);
1874 			}
1875 			path_index --;
1876 			next = (mdi_pathinfo_t *)
1877 			    MDI_PI(pip)->pi_client_link;
1878 			MDI_PI_UNLOCK(pip);
1879 			pip = next;
1880 		}
1881 		if (pip == NULL) {
1882 			MDI_DEBUG(4, (CE_NOTE, NULL,
1883 			    "!lba %llx, no pip !!\n",
1884 				bp->b_lblkno));
1885 		} else {
1886 			MDI_DEBUG(4, (CE_NOTE, NULL,
1887 			    "!lba %llx, no pip for path_index, "
1888 			    "pip %p\n", bp->b_lblkno, (void *)pip));
1889 		}
1890 	}
1891 	return (MDI_FAILURE);
1892 }
1893 
1894 /*
1895  * mdi_select_path():
1896  *		select a path to access a client device.
1897  *
1898  *		mdi_select_path() function is called by the vHCI drivers to
1899  *		select a path to route the I/O request to.  The caller passes
1900  *		the block I/O data transfer structure ("buf") as one of the
1901  *		parameters.  The mpxio framework uses the buf structure
1902  *		contents to maintain per path statistics (total I/O size /
1903  *		count pending).  If more than one online paths are available to
1904  *		select, the framework automatically selects a suitable path
1905  *		for routing I/O request. If a failover operation is active for
1906  *		this client device the call shall be failed with MDI_BUSY error
1907  *		code.
1908  *
 *		By default this function returns a suitable path in online
 *		state based on the current load balancing policy.  Currently
 *		we support LOAD_BALANCE_NONE (the previously selected online
 *		path continues to be used for as long as it is usable),
 *		LOAD_BALANCE_RR (online paths are selected in a round robin
 *		fashion) and LOAD_BALANCE_LBA (online paths are selected
 *		based on the logical block address).  The load balancing
 *		policy is configured through the vHCI driver's configuration
 *		file (driver.conf).
1917  *
1918  *		vHCI drivers may override this default behavior by specifying
 *		appropriate flags.  If start_pip is specified (non NULL), it
 *		is used as the start point to walk and find the next
 *		appropriate path.
1921  *		The following values are currently defined:
1922  *		MDI_SELECT_ONLINE_PATH (to select an ONLINE path) and/or
1923  *		MDI_SELECT_STANDBY_PATH (to select an STANDBY path).
1924  *
1925  *		The non-standard behavior is used by the scsi_vhci driver,
1926  *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
1927  *		attach of client devices (to avoid an unnecessary failover
1928  *		when the STANDBY path comes up first), during failover
1929  *		(to activate a STANDBY path as ONLINE).
1930  *
 *		The selected path is returned in a mdi_hold_path() state
1932  *		(pi_ref_cnt). Caller should release the hold by calling
1933  *		mdi_rele_path().
1934  *
1935  * Return Values:
1936  *		MDI_SUCCESS	- Completed successfully
1937  *		MDI_BUSY 	- Client device is busy failing over
1938  *		MDI_NOPATH	- Client device is online, but no valid path are
1939  *				  available to access this client device
1940  *		MDI_FAILURE	- Invalid client device or state
1941  *		MDI_DEVI_ONLINING
1942  *				- Client device (struct dev_info state) is in
1943  *				  onlining state.
1944  */
1945 
1946 /*ARGSUSED*/
1947 int
1948 mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
1949     mdi_pathinfo_t *start_pip, mdi_pathinfo_t **ret_pip)
1950 {
1951 	mdi_client_t	*ct;
1952 	mdi_pathinfo_t	*pip;
1953 	mdi_pathinfo_t	*next;
1954 	mdi_pathinfo_t	*head;
1955 	mdi_pathinfo_t	*start;
1956 	client_lb_t	lbp;	/* load balancing policy */
1957 	int		sb = 1;	/* standard behavior */
1958 	int		preferred = 1;	/* preferred path */
1959 	int		cond, cont = 1;
1960 	int		retry = 0;
1961 
1962 	if (flags != 0) {
1963 		/*
1964 		 * disable default behavior
1965 		 */
1966 		sb = 0;
1967 	}
1968 
1969 	*ret_pip = NULL;
1970 	ct = i_devi_get_client(cdip);
1971 	if (ct == NULL) {
1972 		/* mdi extensions are NULL, Nothing more to do */
1973 		return (MDI_FAILURE);
1974 	}
1975 
1976 	MDI_CLIENT_LOCK(ct);
1977 
1978 	if (sb) {
1979 		if (MDI_CLIENT_IS_FAILED(ct)) {
1980 			/*
1981 			 * Client is not ready to accept any I/O requests.
1982 			 * Fail this request.
1983 			 */
1984 			MDI_DEBUG(2, (CE_NOTE, cdip, "!mdi_select_path: "
1985 			    "client state offline ct = %p\n", (void *)ct));
1986 			MDI_CLIENT_UNLOCK(ct);
1987 			return (MDI_FAILURE);
1988 		}
1989 
1990 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1991 			/*
1992 			 * Check for Failover is in progress. If so tell the
1993 			 * caller that this device is busy.
1994 			 */
1995 			MDI_DEBUG(2, (CE_NOTE, cdip, "!mdi_select_path: "
1996 			    "client failover in progress ct = %p\n",
1997 			    (void *)ct));
1998 			MDI_CLIENT_UNLOCK(ct);
1999 			return (MDI_BUSY);
2000 		}
2001 
2002 		/*
2003 		 * Check to see whether the client device is attached.
2004 		 * If not so, let the vHCI driver manually select a path
2005 		 * (standby) and let the probe/attach process to continue.
2006 		 */
2007 		if (MDI_CLIENT_IS_DETACHED(ct) || !i_ddi_devi_attached(cdip)) {
2008 			MDI_DEBUG(4, (CE_NOTE, cdip, "!Devi is onlining "
2009 			    "ct = %p\n", (void *)ct));
2010 			MDI_CLIENT_UNLOCK(ct);
2011 			return (MDI_DEVI_ONLINING);
2012 		}
2013 	}
2014 
2015 	/*
2016 	 * Cache in the client list head.  If head of the list is NULL
2017 	 * return MDI_NOPATH
2018 	 */
2019 	head = ct->ct_path_head;
2020 	if (head == NULL) {
2021 		MDI_CLIENT_UNLOCK(ct);
2022 		return (MDI_NOPATH);
2023 	}
2024 
2025 	/*
2026 	 * for non default behavior, bypass current
2027 	 * load balancing policy and always use LOAD_BALANCE_RR
2028 	 * except that the start point will be adjusted based
2029 	 * on the provided start_pip
2030 	 */
2031 	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;
2032 
2033 	switch (lbp) {
2034 	case LOAD_BALANCE_NONE:
2035 		/*
2036 		 * Load balancing is None  or Alternate path mode
2037 		 * Start looking for a online mdi_pathinfo node starting from
2038 		 * last known selected path
2039 		 */
2040 		preferred = 1;
2041 		pip = (mdi_pathinfo_t *)ct->ct_path_last;
2042 		if (pip == NULL) {
2043 			pip = head;
2044 		}
2045 		start = pip;
2046 		do {
2047 			MDI_PI_LOCK(pip);
2048 			/*
2049 			 * No need to explicitly check if the path is disabled.
2050 			 * Since we are checking for state == ONLINE and the
2051 			 * same veriable is used for DISABLE/ENABLE information.
2052 			 */
2053 			if ((MDI_PI(pip)->pi_state  ==
2054 				MDI_PATHINFO_STATE_ONLINE) &&
2055 				preferred == MDI_PI(pip)->pi_preferred) {
2056 				/*
2057 				 * Return the path in hold state. Caller should
2058 				 * release the lock by calling mdi_rele_path()
2059 				 */
2060 				MDI_PI_HOLD(pip);
2061 				MDI_PI_UNLOCK(pip);
2062 				ct->ct_path_last = pip;
2063 				*ret_pip = pip;
2064 				MDI_CLIENT_UNLOCK(ct);
2065 				return (MDI_SUCCESS);
2066 			}
2067 
2068 			/*
2069 			 * Path is busy.
2070 			 */
2071 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2072 			    MDI_PI_IS_TRANSIENT(pip))
2073 				retry = 1;
2074 			/*
2075 			 * Keep looking for a next available online path
2076 			 */
2077 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2078 			if (next == NULL) {
2079 				next = head;
2080 			}
2081 			MDI_PI_UNLOCK(pip);
2082 			pip = next;
2083 			if (start == pip && preferred) {
2084 				preferred = 0;
2085 			} else if (start == pip && !preferred) {
2086 				cont = 0;
2087 			}
2088 		} while (cont);
2089 		break;
2090 
2091 	case LOAD_BALANCE_LBA:
2092 		/*
2093 		 * Make sure we are looking
2094 		 * for an online path. Otherwise, if it is for a STANDBY
2095 		 * path request, it will go through and fetch an ONLINE
2096 		 * path which is not desirable.
2097 		 */
2098 		if ((ct->ct_lb_args != NULL) &&
2099 			    (ct->ct_lb_args->region_size) && bp &&
2100 				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
2101 			if (i_mdi_lba_lb(ct, ret_pip, bp)
2102 				    == MDI_SUCCESS) {
2103 				MDI_CLIENT_UNLOCK(ct);
2104 				return (MDI_SUCCESS);
2105 			}
2106 		}
2107 		/*  FALLTHROUGH */
2108 	case LOAD_BALANCE_RR:
2109 		/*
2110 		 * Load balancing is Round Robin. Start looking for a online
2111 		 * mdi_pathinfo node starting from last known selected path
2112 		 * as the start point.  If override flags are specified,
2113 		 * process accordingly.
2114 		 * If the search is already in effect(start_pip not null),
2115 		 * then lets just use the same path preference to continue the
2116 		 * traversal.
2117 		 */
2118 
2119 		if (start_pip != NULL) {
2120 			preferred = MDI_PI(start_pip)->pi_preferred;
2121 		} else {
2122 			preferred = 1;
2123 		}
2124 
2125 		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
2126 		if (start == NULL) {
2127 			pip = head;
2128 		} else {
2129 			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
2130 			if (pip == NULL) {
2131 				if (!sb) {
2132 					if (preferred == 0) {
2133 						/*
2134 						 * Looks like we have completed
2135 						 * the traversal as preferred
2136 						 * value is 0. Time to bail out.
2137 						 */
2138 						*ret_pip = NULL;
2139 						MDI_CLIENT_UNLOCK(ct);
2140 						return (MDI_NOPATH);
2141 					} else {
2142 						/*
2143 						 * Looks like we reached the
2144 						 * end of the list. Lets enable
2145 						 * traversal of non preferred
2146 						 * paths.
2147 						 */
2148 						preferred = 0;
2149 					}
2150 				}
2151 				pip = head;
2152 			}
2153 		}
2154 		start = pip;
2155 		do {
2156 			MDI_PI_LOCK(pip);
2157 			if (sb) {
2158 				cond = ((MDI_PI(pip)->pi_state ==
2159 				    MDI_PATHINFO_STATE_ONLINE &&
2160 					MDI_PI(pip)->pi_preferred ==
2161 						preferred) ? 1 : 0);
2162 			} else {
2163 				if (flags == MDI_SELECT_ONLINE_PATH) {
2164 					cond = ((MDI_PI(pip)->pi_state ==
2165 					    MDI_PATHINFO_STATE_ONLINE &&
2166 						MDI_PI(pip)->pi_preferred ==
2167 						preferred) ? 1 : 0);
2168 				} else if (flags == MDI_SELECT_STANDBY_PATH) {
2169 					cond = ((MDI_PI(pip)->pi_state ==
2170 					    MDI_PATHINFO_STATE_STANDBY &&
2171 						MDI_PI(pip)->pi_preferred ==
2172 						preferred) ? 1 : 0);
2173 				} else if (flags == (MDI_SELECT_ONLINE_PATH |
2174 				    MDI_SELECT_STANDBY_PATH)) {
2175 					cond = (((MDI_PI(pip)->pi_state ==
2176 					    MDI_PATHINFO_STATE_ONLINE ||
2177 					    (MDI_PI(pip)->pi_state ==
2178 					    MDI_PATHINFO_STATE_STANDBY)) &&
2179 						MDI_PI(pip)->pi_preferred ==
2180 						preferred) ? 1 : 0);
2181 				} else if (flags ==
2182 					(MDI_SELECT_STANDBY_PATH |
2183 					MDI_SELECT_ONLINE_PATH |
2184 					MDI_SELECT_USER_DISABLE_PATH)) {
2185 					cond = (((MDI_PI(pip)->pi_state ==
2186 					    MDI_PATHINFO_STATE_ONLINE ||
2187 					    (MDI_PI(pip)->pi_state ==
2188 					    MDI_PATHINFO_STATE_STANDBY) ||
2189 						(MDI_PI(pip)->pi_state ==
2190 					    (MDI_PATHINFO_STATE_ONLINE|
2191 					    MDI_PATHINFO_STATE_USER_DISABLE)) ||
2192 						(MDI_PI(pip)->pi_state ==
2193 					    (MDI_PATHINFO_STATE_STANDBY |
2194 					    MDI_PATHINFO_STATE_USER_DISABLE)))&&
2195 						MDI_PI(pip)->pi_preferred ==
2196 						preferred) ? 1 : 0);
2197 				} else {
2198 					cond = 0;
2199 				}
2200 			}
2201 			/*
2202 			 * No need to explicitly check if the path is disabled.
2203 			 * Since we are checking for state == ONLINE and the
2204 			 * same veriable is used for DISABLE/ENABLE information.
2205 			 */
2206 			if (cond) {
2207 				/*
2208 				 * Return the path in hold state. Caller should
2209 				 * release the lock by calling mdi_rele_path()
2210 				 */
2211 				MDI_PI_HOLD(pip);
2212 				MDI_PI_UNLOCK(pip);
2213 				if (sb)
2214 					ct->ct_path_last = pip;
2215 				*ret_pip = pip;
2216 				MDI_CLIENT_UNLOCK(ct);
2217 				return (MDI_SUCCESS);
2218 			}
2219 			/*
2220 			 * Path is busy.
2221 			 */
2222 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2223 			    MDI_PI_IS_TRANSIENT(pip))
2224 				retry = 1;
2225 
2226 			/*
2227 			 * Keep looking for a next available online path
2228 			 */
2229 do_again:
2230 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2231 			if (next == NULL) {
2232 				if (!sb) {
2233 					if (preferred == 1) {
2234 						/*
2235 						 * Looks like we reached the
2236 						 * end of the list. Lets enable
2237 						 * traversal of non preferred
2238 						 * paths.
2239 						 */
2240 						preferred = 0;
2241 						next = head;
2242 					} else {
2243 						/*
2244 						 * We have done both the passes
2245 						 * Preferred as well as for
2246 						 * Non-preferred. Bail out now.
2247 						 */
2248 						cont = 0;
2249 					}
2250 				} else {
2251 					/*
2252 					 * Standard behavior case.
2253 					 */
2254 					next = head;
2255 				}
2256 			}
2257 			MDI_PI_UNLOCK(pip);
2258 			if (cont == 0) {
2259 				break;
2260 			}
2261 			pip = next;
2262 
2263 			if (!sb) {
2264 				/*
2265 				 * We need to handle the selection of
2266 				 * non-preferred path in the following
2267 				 * case:
2268 				 *
2269 				 * +------+   +------+   +------+   +-----+
2270 				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
2271 				 * +------+   +------+   +------+   +-----+
2272 				 *
2273 				 * If we start the search with B, we need to
2274 				 * skip beyond B to pick C which is non -
2275 				 * preferred in the second pass. The following
2276 				 * test, if true, will allow us to skip over
2277 				 * the 'start'(B in the example) to select
2278 				 * other non preferred elements.
2279 				 */
2280 				if ((start_pip != NULL) && (start_pip == pip) &&
2281 				    (MDI_PI(start_pip)->pi_preferred
2282 				    != preferred)) {
2283 					/*
2284 					 * try again after going past the start
2285 					 * pip
2286 					 */
2287 					MDI_PI_LOCK(pip);
2288 					goto do_again;
2289 				}
2290 			} else {
2291 				/*
2292 				 * Standard behavior case
2293 				 */
2294 				if (start == pip && preferred) {
2295 					/* look for nonpreferred paths */
2296 					preferred = 0;
2297 				} else if (start == pip && !preferred) {
2298 					/*
2299 					 * Exit condition
2300 					 */
2301 					cont = 0;
2302 				}
2303 			}
2304 		} while (cont);
2305 		break;
2306 	}
2307 
2308 	MDI_CLIENT_UNLOCK(ct);
2309 	if (retry == 1) {
2310 		return (MDI_BUSY);
2311 	} else {
2312 		return (MDI_NOPATH);
2313 	}
2314 }
2315 
2316 /*
2317  * For a client, return the next available path to any phci
2318  *
2319  * Note:
2320  *		Caller should hold the branch's devinfo node to get a consistent
2321  *		snap shot of the mdi_pathinfo nodes.
2322  *
2323  *		Please note that even the list is stable the mdi_pathinfo
2324  *		node state and properties are volatile.  The caller should lock
2325  *		and unlock the nodes by calling mdi_pi_lock() and
2326  *		mdi_pi_unlock() functions to get a stable properties.
2327  *
2328  *		If there is a need to use the nodes beyond the hold of the
2329  *		devinfo node period (For ex. I/O), then mdi_pathinfo node
2330  *		need to be held against unexpected removal by calling
2331  *		mdi_hold_path() and should be released by calling
2332  *		mdi_rele_path() on completion.
2333  */
2334 mdi_pathinfo_t *
2335 mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
2336 {
2337 	mdi_client_t *ct;
2338 
2339 	if (!MDI_CLIENT(ct_dip))
2340 		return (NULL);
2341 
2342 	/*
2343 	 * Walk through client link
2344 	 */
2345 	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
2346 	ASSERT(ct != NULL);
2347 
2348 	if (pip == NULL)
2349 		return ((mdi_pathinfo_t *)ct->ct_path_head);
2350 
2351 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link);
2352 }
2353 
2354 /*
2355  * For a phci, return the next available path to any client
2356  * Note: ditto mdi_get_next_phci_path()
2357  */
2358 mdi_pathinfo_t *
2359 mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
2360 {
2361 	mdi_phci_t *ph;
2362 
2363 	if (!MDI_PHCI(ph_dip))
2364 		return (NULL);
2365 
2366 	/*
2367 	 * Walk through pHCI link
2368 	 */
2369 	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
2370 	ASSERT(ph != NULL);
2371 
2372 	if (pip == NULL)
2373 		return ((mdi_pathinfo_t *)ph->ph_path_head);
2374 
2375 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
2376 }
2377 
2378 /*
2379  * mdi_hold_path():
2380  *		Hold the mdi_pathinfo node against unwanted unexpected free.
2381  * Return Values:
2382  *		None
2383  */
2384 void
2385 mdi_hold_path(mdi_pathinfo_t *pip)
2386 {
2387 	if (pip) {
2388 		MDI_PI_LOCK(pip);
2389 		MDI_PI_HOLD(pip);
2390 		MDI_PI_UNLOCK(pip);
2391 	}
2392 }
2393 
2394 
2395 /*
2396  * mdi_rele_path():
2397  *		Release the mdi_pathinfo node which was selected
2398  *		through mdi_select_path() mechanism or manually held by
2399  *		calling mdi_hold_path().
2400  * Return Values:
2401  *		None
2402  */
2403 void
2404 mdi_rele_path(mdi_pathinfo_t *pip)
2405 {
2406 	if (pip) {
2407 		MDI_PI_LOCK(pip);
2408 		MDI_PI_RELE(pip);
2409 		if (MDI_PI(pip)->pi_ref_cnt == 0) {
2410 			cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
2411 		}
2412 		MDI_PI_UNLOCK(pip);
2413 	}
2414 }
2415 
2416 /*
2417  * mdi_pi_lock():
2418  * 		Lock the mdi_pathinfo node.
2419  * Note:
2420  *		The caller should release the lock by calling mdi_pi_unlock()
2421  */
2422 void
2423 mdi_pi_lock(mdi_pathinfo_t *pip)
2424 {
2425 	ASSERT(pip != NULL);
2426 	if (pip) {
2427 		MDI_PI_LOCK(pip);
2428 	}
2429 }
2430 
2431 
2432 /*
2433  * mdi_pi_unlock():
2434  * 		Unlock the mdi_pathinfo node.
2435  * Note:
2436  *		The mdi_pathinfo node should have been locked with mdi_pi_lock()
2437  */
2438 void
2439 mdi_pi_unlock(mdi_pathinfo_t *pip)
2440 {
2441 	ASSERT(pip != NULL);
2442 	if (pip) {
2443 		MDI_PI_UNLOCK(pip);
2444 	}
2445 }
2446 
2447 /*
2448  * mdi_pi_find():
2449  *		Search the list of mdi_pathinfo nodes attached to the
2450  *		pHCI/Client device node whose path address matches "paddr".
2451  *		Returns a pointer to the mdi_pathinfo node if a matching node is
2452  *		found.
2453  * Return Values:
2454  *		mdi_pathinfo node handle
2455  *		NULL
2456  * Notes:
2457  *		Caller need not hold any locks to call this function.
2458  */
2459 mdi_pathinfo_t *
2460 mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
2461 {
2462 	mdi_phci_t		*ph;
2463 	mdi_vhci_t		*vh;
2464 	mdi_client_t		*ct;
2465 	mdi_pathinfo_t		*pip = NULL;
2466 
2467 	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: %s %s",
2468 	    caddr ? caddr : "NULL", paddr ? paddr : "NULL"));
2469 	if ((pdip == NULL) || (paddr == NULL)) {
2470 		return (NULL);
2471 	}
2472 	ph = i_devi_get_phci(pdip);
2473 	if (ph == NULL) {
2474 		/*
2475 		 * Invalid pHCI device, Nothing more to do.
2476 		 */
2477 		MDI_DEBUG(2, (CE_WARN, pdip,
2478 		    "!mdi_pi_find: invalid phci"));
2479 		return (NULL);
2480 	}
2481 
2482 	vh = ph->ph_vhci;
2483 	if (vh == NULL) {
2484 		/*
2485 		 * Invalid vHCI device, Nothing more to do.
2486 		 */
2487 		MDI_DEBUG(2, (CE_WARN, pdip,
2488 		    "!mdi_pi_find: invalid vhci"));
2489 		return (NULL);
2490 	}
2491 
2492 	/*
2493 	 * Look for pathinfo node identified by paddr.
2494 	 */
2495 	if (caddr == NULL) {
2496 		/*
2497 		 * Find a mdi_pathinfo node under pHCI list for a matching
2498 		 * unit address.
2499 		 */
2500 		MDI_PHCI_LOCK(ph);
2501 		if (MDI_PHCI_IS_OFFLINE(ph)) {
2502 			MDI_DEBUG(2, (CE_WARN, pdip,
2503 			    "!mdi_pi_find: offline phci %p", (void *)ph));
2504 			MDI_PHCI_UNLOCK(ph);
2505 			return (NULL);
2506 		}
2507 		pip = (mdi_pathinfo_t *)ph->ph_path_head;
2508 
2509 		while (pip != NULL) {
2510 			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2511 				break;
2512 			}
2513 			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
2514 		}
2515 		MDI_PHCI_UNLOCK(ph);
2516 		MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: found %p",
2517 		    (void *)pip));
2518 		return (pip);
2519 	}
2520 
2521 	/*
2522 	 * XXX - Is the rest of the code in this function really necessary?
2523 	 * The consumers of mdi_pi_find() can search for the desired pathinfo
2524 	 * node by calling mdi_pi_find(pdip, NULL, paddr). Irrespective of
2525 	 * whether the search is based on the pathinfo nodes attached to
2526 	 * the pHCI or the client node, the result will be the same.
2527 	 */
2528 
2529 	/*
2530 	 * Find the client device corresponding to 'caddr'
2531 	 */
2532 	MDI_VHCI_CLIENT_LOCK(vh);
2533 
2534 	/*
2535 	 * XXX - Passing NULL to the following function works as long as the
2536 	 * the client addresses (caddr) are unique per vhci basis.
2537 	 */
2538 	ct = i_mdi_client_find(vh, NULL, caddr);
2539 	if (ct == NULL) {
2540 		/*
2541 		 * Client not found, Obviously mdi_pathinfo node has not been
2542 		 * created yet.
2543 		 */
2544 		MDI_VHCI_CLIENT_UNLOCK(vh);
2545 		MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: client not "
2546 		    "found for caddr %s", caddr ? caddr : "NULL"));
2547 		return (NULL);
2548 	}
2549 
2550 	/*
2551 	 * Hold the client lock and look for a mdi_pathinfo node with matching
2552 	 * pHCI and paddr
2553 	 */
2554 	MDI_CLIENT_LOCK(ct);
2555 
2556 	/*
2557 	 * Release the global mutex as it is no more needed. Note: We always
2558 	 * respect the locking order while acquiring.
2559 	 */
2560 	MDI_VHCI_CLIENT_UNLOCK(vh);
2561 
2562 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2563 	while (pip != NULL) {
2564 		/*
2565 		 * Compare the unit address
2566 		 */
2567 		if ((MDI_PI(pip)->pi_phci == ph) &&
2568 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2569 			break;
2570 		}
2571 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2572 	}
2573 	MDI_CLIENT_UNLOCK(ct);
2574 	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: found:: %p", (void *)pip));
2575 	return (pip);
2576 }
2577 
2578 /*
2579  * mdi_pi_alloc():
2580  *		Allocate and initialize a new instance of a mdi_pathinfo node.
2581  *		The mdi_pathinfo node returned by this function identifies a
2582  *		unique device path is capable of having properties attached
2583  *		and passed to mdi_pi_online() to fully attach and online the
2584  *		path and client device node.
2585  *		The mdi_pathinfo node returned by this function must be
2586  *		destroyed using mdi_pi_free() if the path is no longer
2587  *		operational or if the caller fails to attach a client device
2588  *		node when calling mdi_pi_online(). The framework will not free
2589  *		the resources allocated.
2590  *		This function can be called from both interrupt and kernel
2591  *		contexts.  DDI_NOSLEEP flag should be used while calling
2592  *		from interrupt contexts.
2593  * Return Values:
2594  *		MDI_SUCCESS
2595  *		MDI_FAILURE
2596  *		MDI_NOMEM
2597  */
2598 /*ARGSUSED*/
2599 int
2600 mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2601     char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
2602 {
2603 	mdi_vhci_t	*vh;
2604 	mdi_phci_t	*ph;
2605 	mdi_client_t	*ct;
2606 	mdi_pathinfo_t	*pip = NULL;
2607 	dev_info_t	*cdip;
2608 	int		rv = MDI_NOMEM;
2609 	int		path_allocated = 0;
2610 
2611 	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_alloc_compatible: %s %s %s",
2612 	    cname ? cname : "NULL", caddr ? caddr : "NULL",
2613 	    paddr ? paddr : "NULL"));
2614 
2615 	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
2616 	    ret_pip == NULL) {
2617 		/* Nothing more to do */
2618 		return (MDI_FAILURE);
2619 	}
2620 
2621 	*ret_pip = NULL;
2622 
2623 	/* No allocations on detaching pHCI */
2624 	if (DEVI_IS_DETACHING(pdip)) {
2625 		/* Invalid pHCI device, return failure */
2626 		MDI_DEBUG(1, (CE_WARN, pdip,
2627 		    "!mdi_pi_alloc: detaching pHCI=%p", (void *)pdip));
2628 		return (MDI_FAILURE);
2629 	}
2630 
2631 	ph = i_devi_get_phci(pdip);
2632 	ASSERT(ph != NULL);
2633 	if (ph == NULL) {
2634 		/* Invalid pHCI device, return failure */
2635 		MDI_DEBUG(1, (CE_WARN, pdip,
2636 		    "!mdi_pi_alloc: invalid pHCI=%p", (void *)pdip));
2637 		return (MDI_FAILURE);
2638 	}
2639 
2640 	MDI_PHCI_LOCK(ph);
2641 	vh = ph->ph_vhci;
2642 	if (vh == NULL) {
2643 		/* Invalid vHCI device, return failure */
2644 		MDI_DEBUG(1, (CE_WARN, pdip,
2645 		    "!mdi_pi_alloc: invalid vHCI=%p", (void *)pdip));
2646 		MDI_PHCI_UNLOCK(ph);
2647 		return (MDI_FAILURE);
2648 	}
2649 
2650 	if (MDI_PHCI_IS_READY(ph) == 0) {
2651 		/*
2652 		 * Do not allow new node creation when pHCI is in
2653 		 * offline/suspended states
2654 		 */
2655 		MDI_DEBUG(1, (CE_WARN, pdip,
2656 		    "mdi_pi_alloc: pHCI=%p is not ready", (void *)ph));
2657 		MDI_PHCI_UNLOCK(ph);
2658 		return (MDI_BUSY);
2659 	}
2660 	MDI_PHCI_UNSTABLE(ph);
2661 	MDI_PHCI_UNLOCK(ph);
2662 
2663 	/* look for a matching client, create one if not found */
2664 	MDI_VHCI_CLIENT_LOCK(vh);
2665 	ct = i_mdi_client_find(vh, cname, caddr);
2666 	if (ct == NULL) {
2667 		ct = i_mdi_client_alloc(vh, cname, caddr);
2668 		ASSERT(ct != NULL);
2669 	}
2670 
2671 	if (ct->ct_dip == NULL) {
2672 		/*
2673 		 * Allocate a devinfo node
2674 		 */
2675 		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
2676 		    compatible, ncompatible);
2677 		if (ct->ct_dip == NULL) {
2678 			(void) i_mdi_client_free(vh, ct);
2679 			goto fail;
2680 		}
2681 	}
2682 	cdip = ct->ct_dip;
2683 
2684 	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
2685 	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;
2686 
2687 	MDI_CLIENT_LOCK(ct);
2688 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2689 	while (pip != NULL) {
2690 		/*
2691 		 * Compare the unit address
2692 		 */
2693 		if ((MDI_PI(pip)->pi_phci == ph) &&
2694 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2695 			break;
2696 		}
2697 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2698 	}
2699 	MDI_CLIENT_UNLOCK(ct);
2700 
2701 	if (pip == NULL) {
2702 		/*
2703 		 * This is a new path for this client device.  Allocate and
2704 		 * initialize a new pathinfo node
2705 		 */
2706 		pip = i_mdi_pi_alloc(ph, paddr, ct);
2707 		ASSERT(pip != NULL);
2708 		path_allocated = 1;
2709 	}
2710 	rv = MDI_SUCCESS;
2711 
2712 fail:
2713 	/*
2714 	 * Release the global mutex.
2715 	 */
2716 	MDI_VHCI_CLIENT_UNLOCK(vh);
2717 
2718 	/*
2719 	 * Mark the pHCI as stable
2720 	 */
2721 	MDI_PHCI_LOCK(ph);
2722 	MDI_PHCI_STABLE(ph);
2723 	MDI_PHCI_UNLOCK(ph);
2724 	*ret_pip = pip;
2725 
2726 	MDI_DEBUG(2, (CE_NOTE, pdip,
2727 	    "!mdi_pi_alloc_compatible: alloc %p", (void *)pip));
2728 
2729 	if (path_allocated)
2730 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
2731 
2732 	return (rv);
2733 }
2734 
2735 /*ARGSUSED*/
2736 int
2737 mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2738     int flags, mdi_pathinfo_t **ret_pip)
2739 {
2740 	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
2741 	    flags, ret_pip));
2742 }
2743 
2744 /*
2745  * i_mdi_pi_alloc():
2746  *		Allocate a mdi_pathinfo node and add to the pHCI path list
2747  * Return Values:
2748  *		mdi_pathinfo
2749  */
2750 /*ARGSUSED*/
2751 static mdi_pathinfo_t *
2752 i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
2753 {
2754 	mdi_pathinfo_t	*pip;
2755 	int		ct_circular;
2756 	int		ph_circular;
2757 	int		se_flag;
2758 	int		kmem_flag;
2759 
2760 	ASSERT(MDI_VHCI_CLIENT_LOCKED(ph->ph_vhci));
2761 
2762 	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
2763 	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
2764 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
2765 	    MDI_PATHINFO_STATE_TRANSIENT;
2766 
2767 	if (MDI_PHCI_IS_USER_DISABLED(ph))
2768 		MDI_PI_SET_USER_DISABLE(pip);
2769 
2770 	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
2771 		MDI_PI_SET_DRV_DISABLE_TRANS(pip);
2772 
2773 	if (MDI_PHCI_IS_DRV_DISABLED(ph))
2774 		MDI_PI_SET_DRV_DISABLE(pip);
2775 
2776 	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
2777 	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
2778 	MDI_PI(pip)->pi_client = ct;
2779 	MDI_PI(pip)->pi_phci = ph;
2780 	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
2781 	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);
2782 	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
2783 	ASSERT(MDI_PI(pip)->pi_prop != NULL);
2784 	MDI_PI(pip)->pi_pprivate = NULL;
2785 	MDI_PI(pip)->pi_cprivate = NULL;
2786 	MDI_PI(pip)->pi_vprivate = NULL;
2787 	MDI_PI(pip)->pi_client_link = NULL;
2788 	MDI_PI(pip)->pi_phci_link = NULL;
2789 	MDI_PI(pip)->pi_ref_cnt = 0;
2790 	MDI_PI(pip)->pi_kstats = NULL;
2791 	MDI_PI(pip)->pi_preferred = 1;
2792 	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);
2793 
2794 	/*
2795 	 * Lock both dev_info nodes against changes in parallel.
2796 	 *
2797 	 * The ndi_devi_enter(Client), is atypical since the client is a leaf.
2798 	 * This atypical operation is done to synchronize pathinfo nodes
2799 	 * during devinfo snapshot (see di_register_pip) by 'pretending' that
2800 	 * the pathinfo nodes are children of the Client.
2801 	 */
2802 	ndi_devi_enter(ct->ct_dip, &ct_circular);
2803 	ndi_devi_enter(ph->ph_dip, &ph_circular);
2804 
2805 	i_mdi_phci_add_path(ph, pip);
2806 	i_mdi_client_add_path(ct, pip);
2807 
2808 	ndi_devi_exit(ph->ph_dip, ph_circular);
2809 	ndi_devi_exit(ct->ct_dip, ct_circular);
2810 
2811 	/* determine interrupt context */
2812 	se_flag = (servicing_interrupt()) ? SE_NOSLEEP : SE_SLEEP;
2813 	kmem_flag = (se_flag == SE_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
2814 
2815 	i_ddi_di_cache_invalidate(kmem_flag);
2816 
2817 	return (pip);
2818 }
2819 
2820 /*
2821  * i_mdi_phci_add_path():
2822  * 		Add a mdi_pathinfo node to pHCI list.
2823  * Notes:
2824  *		Caller should per-pHCI mutex
2825  */
2826 static void
2827 i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
2828 {
2829 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
2830 
2831 	MDI_PHCI_LOCK(ph);
2832 	if (ph->ph_path_head == NULL) {
2833 		ph->ph_path_head = pip;
2834 	} else {
2835 		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
2836 	}
2837 	ph->ph_path_tail = pip;
2838 	ph->ph_path_count++;
2839 	MDI_PHCI_UNLOCK(ph);
2840 }
2841 
2842 /*
2843  * i_mdi_client_add_path():
2844  *		Add mdi_pathinfo node to client list
2845  */
2846 static void
2847 i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
2848 {
2849 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
2850 
2851 	MDI_CLIENT_LOCK(ct);
2852 	if (ct->ct_path_head == NULL) {
2853 		ct->ct_path_head = pip;
2854 	} else {
2855 		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
2856 	}
2857 	ct->ct_path_tail = pip;
2858 	ct->ct_path_count++;
2859 	MDI_CLIENT_UNLOCK(ct);
2860 }
2861 
2862 /*
2863  * mdi_pi_free():
2864  *		Free the mdi_pathinfo node and also client device node if this
2865  *		is the last path to the device
2866  * Return Values:
2867  *		MDI_SUCCESS
2868  *		MDI_FAILURE
2869  *		MDI_BUSY
2870  */
2871 /*ARGSUSED*/
2872 int
2873 mdi_pi_free(mdi_pathinfo_t *pip, int flags)
2874 {
2875 	int		rv = MDI_SUCCESS;
2876 	mdi_vhci_t	*vh;
2877 	mdi_phci_t	*ph;
2878 	mdi_client_t	*ct;
2879 	int		(*f)();
2880 	int		client_held = 0;
2881 
2882 	MDI_PI_LOCK(pip);
2883 	ph = MDI_PI(pip)->pi_phci;
2884 	ASSERT(ph != NULL);
2885 	if (ph == NULL) {
2886 		/*
2887 		 * Invalid pHCI device, return failure
2888 		 */
2889 		MDI_DEBUG(1, (CE_WARN, NULL,
2890 		    "!mdi_pi_free: invalid pHCI pip=%p", (void *)pip));
2891 		MDI_PI_UNLOCK(pip);
2892 		return (MDI_FAILURE);
2893 	}
2894 
2895 	vh = ph->ph_vhci;
2896 	ASSERT(vh != NULL);
2897 	if (vh == NULL) {
2898 		/* Invalid pHCI device, return failure */
2899 		MDI_DEBUG(1, (CE_WARN, NULL,
2900 		    "!mdi_pi_free: invalid vHCI pip=%p", (void *)pip));
2901 		MDI_PI_UNLOCK(pip);
2902 		return (MDI_FAILURE);
2903 	}
2904 
2905 	ct = MDI_PI(pip)->pi_client;
2906 	ASSERT(ct != NULL);
2907 	if (ct == NULL) {
2908 		/*
2909 		 * Invalid Client device, return failure
2910 		 */
2911 		MDI_DEBUG(1, (CE_WARN, NULL,
2912 		    "!mdi_pi_free: invalid client pip=%p", (void *)pip));
2913 		MDI_PI_UNLOCK(pip);
2914 		return (MDI_FAILURE);
2915 	}
2916 
2917 	/*
2918 	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
2919 	 * if the node state is either offline or init and the reference count
2920 	 * is zero.
2921 	 */
2922 	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
2923 	    MDI_PI_IS_INITING(pip))) {
2924 		/*
2925 		 * Node is busy
2926 		 */
2927 		MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
2928 		    "!mdi_pi_free: pathinfo node is busy pip=%p", (void *)pip));
2929 		MDI_PI_UNLOCK(pip);
2930 		return (MDI_BUSY);
2931 	}
2932 
2933 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
2934 		/*
2935 		 * Give a chance for pending I/Os to complete.
2936 		 */
2937 		MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!mdi_pi_free: "
2938 		    "%d cmds still pending on path: %p\n",
2939 		    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
2940 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
2941 		    &MDI_PI(pip)->pi_mutex,
2942 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
2943 			/*
2944 			 * The timeout time reached without ref_cnt being zero
2945 			 * being signaled.
2946 			 */
2947 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip,
2948 			    "!mdi_pi_free: "
2949 			    "Timeout reached on path %p without the cond\n",
2950 			    (void *)pip));
2951 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip,
2952 			    "!mdi_pi_free: "
2953 			    "%d cmds still pending on path: %p\n",
2954 			    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
2955 			MDI_PI_UNLOCK(pip);
2956 			return (MDI_BUSY);
2957 		}
2958 	}
2959 	if (MDI_PI(pip)->pi_pm_held) {
2960 		client_held = 1;
2961 	}
2962 	MDI_PI_UNLOCK(pip);
2963 
2964 	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));
2965 
2966 	MDI_CLIENT_LOCK(ct);
2967 
2968 	/* Prevent further failovers till MDI_VHCI_CLIENT_LOCK is held */
2969 	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);
2970 
2971 	/*
2972 	 * Wait till failover is complete before removing this node.
2973 	 */
2974 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
2975 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
2976 
2977 	MDI_CLIENT_UNLOCK(ct);
2978 	MDI_VHCI_CLIENT_LOCK(vh);
2979 	MDI_CLIENT_LOCK(ct);
2980 	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);
2981 
2982 	if (!MDI_PI_IS_INITING(pip)) {
2983 		f = vh->vh_ops->vo_pi_uninit;
2984 		if (f != NULL) {
2985 			rv = (*f)(vh->vh_dip, pip, 0);
2986 		}
2987 	}
2988 	/*
2989 	 * If vo_pi_uninit() completed successfully.
2990 	 */
2991 	if (rv == MDI_SUCCESS) {
2992 		if (client_held) {
2993 			MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_free "
2994 			    "i_mdi_pm_rele_client\n"));
2995 			i_mdi_pm_rele_client(ct, 1);
2996 		}
2997 		i_mdi_pi_free(ph, pip, ct);
2998 		if (ct->ct_path_count == 0) {
2999 			/*
3000 			 * Client lost its last path.
3001 			 * Clean up the client device
3002 			 */
3003 			MDI_CLIENT_UNLOCK(ct);
3004 			(void) i_mdi_client_free(ct->ct_vhci, ct);
3005 			MDI_VHCI_CLIENT_UNLOCK(vh);
3006 			return (rv);
3007 		}
3008 	}
3009 	MDI_CLIENT_UNLOCK(ct);
3010 	MDI_VHCI_CLIENT_UNLOCK(vh);
3011 
3012 	if (rv == MDI_FAILURE)
3013 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
3014 
3015 	return (rv);
3016 }
3017 
3018 /*
3019  * i_mdi_pi_free():
3020  *		Free the mdi_pathinfo node
3021  */
3022 static void
3023 i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
3024 {
3025 	int	ct_circular;
3026 	int	ph_circular;
3027 	int	se_flag;
3028 	int	kmem_flag;
3029 
3030 	ASSERT(MDI_CLIENT_LOCKED(ct));
3031 
3032 	/*
3033 	 * remove any per-path kstats
3034 	 */
3035 	i_mdi_pi_kstat_destroy(pip);
3036 
3037 	/* See comments in i_mdi_pi_alloc() */
3038 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3039 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3040 
3041 	i_mdi_client_remove_path(ct, pip);
3042 	i_mdi_phci_remove_path(ph, pip);
3043 
3044 	ndi_devi_exit(ph->ph_dip, ph_circular);
3045 	ndi_devi_exit(ct->ct_dip, ct_circular);
3046 
3047 	/* determine interrupt context */
3048 	se_flag = (servicing_interrupt()) ? SE_NOSLEEP : SE_SLEEP;
3049 	kmem_flag = (se_flag == SE_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
3050 
3051 	i_ddi_di_cache_invalidate(kmem_flag);
3052 
3053 	mutex_destroy(&MDI_PI(pip)->pi_mutex);
3054 	cv_destroy(&MDI_PI(pip)->pi_state_cv);
3055 	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
3056 	if (MDI_PI(pip)->pi_addr) {
3057 		kmem_free(MDI_PI(pip)->pi_addr,
3058 		    strlen(MDI_PI(pip)->pi_addr) + 1);
3059 		MDI_PI(pip)->pi_addr = NULL;
3060 	}
3061 
3062 	if (MDI_PI(pip)->pi_prop) {
3063 		(void) nvlist_free(MDI_PI(pip)->pi_prop);
3064 		MDI_PI(pip)->pi_prop = NULL;
3065 	}
3066 	kmem_free(pip, sizeof (struct mdi_pathinfo));
3067 }
3068 
3069 
3070 /*
3071  * i_mdi_phci_remove_path():
3072  * 		Remove a mdi_pathinfo node from pHCI list.
3073  * Notes:
3074  *		Caller should hold per-pHCI mutex
3075  */
3076 static void
3077 i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3078 {
3079 	mdi_pathinfo_t	*prev = NULL;
3080 	mdi_pathinfo_t	*path = NULL;
3081 
3082 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3083 
3084 	MDI_PHCI_LOCK(ph);
3085 	path = ph->ph_path_head;
3086 	while (path != NULL) {
3087 		if (path == pip) {
3088 			break;
3089 		}
3090 		prev = path;
3091 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3092 	}
3093 
3094 	if (path) {
3095 		ph->ph_path_count--;
3096 		if (prev) {
3097 			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
3098 		} else {
3099 			ph->ph_path_head =
3100 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3101 		}
3102 		if (ph->ph_path_tail == path) {
3103 			ph->ph_path_tail = prev;
3104 		}
3105 	}
3106 
3107 	/*
3108 	 * Clear the pHCI link
3109 	 */
3110 	MDI_PI(pip)->pi_phci_link = NULL;
3111 	MDI_PI(pip)->pi_phci = NULL;
3112 	MDI_PHCI_UNLOCK(ph);
3113 }
3114 
3115 /*
3116  * i_mdi_client_remove_path():
3117  * 		Remove a mdi_pathinfo node from client path list.
3118  */
3119 static void
3120 i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3121 {
3122 	mdi_pathinfo_t	*prev = NULL;
3123 	mdi_pathinfo_t	*path;
3124 
3125 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3126 
3127 	ASSERT(MDI_CLIENT_LOCKED(ct));
3128 	path = ct->ct_path_head;
3129 	while (path != NULL) {
3130 		if (path == pip) {
3131 			break;
3132 		}
3133 		prev = path;
3134 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3135 	}
3136 
3137 	if (path) {
3138 		ct->ct_path_count--;
3139 		if (prev) {
3140 			MDI_PI(prev)->pi_client_link =
3141 			    MDI_PI(path)->pi_client_link;
3142 		} else {
3143 			ct->ct_path_head =
3144 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3145 		}
3146 		if (ct->ct_path_tail == path) {
3147 			ct->ct_path_tail = prev;
3148 		}
3149 		if (ct->ct_path_last == path) {
3150 			ct->ct_path_last = ct->ct_path_head;
3151 		}
3152 	}
3153 	MDI_PI(pip)->pi_client_link = NULL;
3154 	MDI_PI(pip)->pi_client = NULL;
3155 }
3156 
3157 /*
3158  * i_mdi_pi_state_change():
3159  *		online a mdi_pathinfo node
3160  *
3161  * Return Values:
3162  *		MDI_SUCCESS
3163  *		MDI_FAILURE
3164  */
3165 /*ARGSUSED*/
3166 static int
3167 i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
3168 {
3169 	int		rv = MDI_SUCCESS;
3170 	mdi_vhci_t	*vh;
3171 	mdi_phci_t	*ph;
3172 	mdi_client_t	*ct;
3173 	int		(*f)();
3174 	dev_info_t	*cdip;
3175 
3176 	MDI_PI_LOCK(pip);
3177 
3178 	ph = MDI_PI(pip)->pi_phci;
3179 	ASSERT(ph);
3180 	if (ph == NULL) {
3181 		/*
3182 		 * Invalid pHCI device, fail the request
3183 		 */
3184 		MDI_PI_UNLOCK(pip);
3185 		MDI_DEBUG(1, (CE_WARN, NULL,
3186 		    "!mdi_pi_state_change: invalid phci pip=%p", (void *)pip));
3187 		return (MDI_FAILURE);
3188 	}
3189 
3190 	vh = ph->ph_vhci;
3191 	ASSERT(vh);
3192 	if (vh == NULL) {
3193 		/*
3194 		 * Invalid vHCI device, fail the request
3195 		 */
3196 		MDI_PI_UNLOCK(pip);
3197 		MDI_DEBUG(1, (CE_WARN, NULL,
3198 		    "!mdi_pi_state_change: invalid vhci pip=%p", (void *)pip));
3199 		return (MDI_FAILURE);
3200 	}
3201 
3202 	ct = MDI_PI(pip)->pi_client;
3203 	ASSERT(ct != NULL);
3204 	if (ct == NULL) {
3205 		/*
3206 		 * Invalid client device, fail the request
3207 		 */
3208 		MDI_PI_UNLOCK(pip);
3209 		MDI_DEBUG(1, (CE_WARN, NULL,
3210 		    "!mdi_pi_state_change: invalid client pip=%p",
3211 		    (void *)pip));
3212 		return (MDI_FAILURE);
3213 	}
3214 
3215 	/*
3216 	 * If this path has not been initialized yet, Callback vHCI driver's
3217 	 * pathinfo node initialize entry point
3218 	 */
3219 
3220 	if (MDI_PI_IS_INITING(pip)) {
3221 		MDI_PI_UNLOCK(pip);
3222 		f = vh->vh_ops->vo_pi_init;
3223 		if (f != NULL) {
3224 			rv = (*f)(vh->vh_dip, pip, 0);
3225 			if (rv != MDI_SUCCESS) {
3226 				MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
3227 				    "!vo_pi_init: failed vHCI=0x%p, pip=0x%p",
3228 				    (void *)vh, (void *)pip));
3229 				return (MDI_FAILURE);
3230 			}
3231 		}
3232 		MDI_PI_LOCK(pip);
3233 		MDI_PI_CLEAR_TRANSIENT(pip);
3234 	}
3235 
3236 	/*
3237 	 * Do not allow state transition when pHCI is in offline/suspended
3238 	 * states
3239 	 */
3240 	i_mdi_phci_lock(ph, pip);
3241 	if (MDI_PHCI_IS_READY(ph) == 0) {
3242 		MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
3243 		    "!mdi_pi_state_change: pHCI not ready, pHCI=%p",
3244 		    (void *)ph));
3245 		MDI_PI_UNLOCK(pip);
3246 		i_mdi_phci_unlock(ph);
3247 		return (MDI_BUSY);
3248 	}
3249 	MDI_PHCI_UNSTABLE(ph);
3250 	i_mdi_phci_unlock(ph);
3251 
3252 	/*
3253 	 * Check if mdi_pathinfo state is in transient state.
3254 	 * If yes, offlining is in progress and wait till transient state is
3255 	 * cleared.
3256 	 */
3257 	if (MDI_PI_IS_TRANSIENT(pip)) {
3258 		while (MDI_PI_IS_TRANSIENT(pip)) {
3259 			cv_wait(&MDI_PI(pip)->pi_state_cv,
3260 			    &MDI_PI(pip)->pi_mutex);
3261 		}
3262 	}
3263 
3264 	/*
3265 	 * Grab the client lock in reverse order sequence and release the
3266 	 * mdi_pathinfo mutex.
3267 	 */
3268 	i_mdi_client_lock(ct, pip);
3269 	MDI_PI_UNLOCK(pip);
3270 
3271 	/*
3272 	 * Wait till failover state is cleared
3273 	 */
3274 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3275 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3276 
3277 	/*
3278 	 * Mark the mdi_pathinfo node state as transient
3279 	 */
3280 	MDI_PI_LOCK(pip);
3281 	switch (state) {
3282 	case MDI_PATHINFO_STATE_ONLINE:
3283 		MDI_PI_SET_ONLINING(pip);
3284 		break;
3285 
3286 	case MDI_PATHINFO_STATE_STANDBY:
3287 		MDI_PI_SET_STANDBYING(pip);
3288 		break;
3289 
3290 	case MDI_PATHINFO_STATE_FAULT:
3291 		/*
3292 		 * Mark the pathinfo state as FAULTED
3293 		 */
3294 		MDI_PI_SET_FAULTING(pip);
3295 		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
3296 		break;
3297 
3298 	case MDI_PATHINFO_STATE_OFFLINE:
3299 		/*
3300 		 * ndi_devi_offline() cannot hold pip or ct locks.
3301 		 */
3302 		MDI_PI_UNLOCK(pip);
3303 		/*
3304 		 * Do not offline if path will become last path and path
3305 		 * is busy for user initiated events.
3306 		 */
3307 		cdip = ct->ct_dip;
3308 		if ((flag & NDI_DEVI_REMOVE) &&
3309 		    (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED)) {
3310 			i_mdi_client_unlock(ct);
3311 			rv = ndi_devi_offline(cdip, 0);
3312 			if (rv != NDI_SUCCESS) {
3313 				/*
3314 				 * Convert to MDI error code
3315 				 */
3316 				switch (rv) {
3317 				case NDI_BUSY:
3318 					rv = MDI_BUSY;
3319 					break;
3320 				default:
3321 					rv = MDI_FAILURE;
3322 					break;
3323 				}
3324 				goto state_change_exit;
3325 			} else {
3326 				i_mdi_client_lock(ct, NULL);
3327 			}
3328 		}
3329 		/*
3330 		 * Mark the mdi_pathinfo node state as transient
3331 		 */
3332 		MDI_PI_LOCK(pip);
3333 		MDI_PI_SET_OFFLINING(pip);
3334 		break;
3335 	}
3336 	MDI_PI_UNLOCK(pip);
3337 	MDI_CLIENT_UNSTABLE(ct);
3338 	i_mdi_client_unlock(ct);
3339 
3340 	f = vh->vh_ops->vo_pi_state_change;
3341 	if (f != NULL)
3342 		rv = (*f)(vh->vh_dip, pip, state, 0, flag);
3343 
3344 	MDI_CLIENT_LOCK(ct);
3345 	MDI_PI_LOCK(pip);
3346 	if (rv == MDI_NOT_SUPPORTED) {
3347 		MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
3348 	}
3349 	if (rv != MDI_SUCCESS) {
3350 		MDI_DEBUG(2, (CE_WARN, ct->ct_dip,
3351 		    "!vo_pi_state_change: failed rv = %x", rv));
3352 	}
3353 	if (MDI_PI_IS_TRANSIENT(pip)) {
3354 		if (rv == MDI_SUCCESS) {
3355 			MDI_PI_CLEAR_TRANSIENT(pip);
3356 		} else {
3357 			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
3358 		}
3359 	}
3360 
3361 	/*
3362 	 * Wake anyone waiting for this mdi_pathinfo node
3363 	 */
3364 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3365 	MDI_PI_UNLOCK(pip);
3366 
3367 	/*
3368 	 * Mark the client device as stable
3369 	 */
3370 	MDI_CLIENT_STABLE(ct);
3371 	if (rv == MDI_SUCCESS) {
3372 		if (ct->ct_unstable == 0) {
3373 			cdip = ct->ct_dip;
3374 
3375 			/*
3376 			 * Onlining the mdi_pathinfo node will impact the
3377 			 * client state Update the client and dev_info node
3378 			 * state accordingly
3379 			 */
3380 			rv = NDI_SUCCESS;
3381 			i_mdi_client_update_state(ct);
3382 			switch (MDI_CLIENT_STATE(ct)) {
3383 			case MDI_CLIENT_STATE_OPTIMAL:
3384 			case MDI_CLIENT_STATE_DEGRADED:
3385 				if (cdip && !i_ddi_devi_attached(cdip) &&
3386 				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
3387 				    (state == MDI_PATHINFO_STATE_STANDBY))) {
3388 
3389 					/*
3390 					 * Must do ndi_devi_online() through
3391 					 * hotplug thread for deferred
3392 					 * attach mechanism to work
3393 					 */
3394 					MDI_CLIENT_UNLOCK(ct);
3395 					rv = ndi_devi_online(cdip, 0);
3396 					MDI_CLIENT_LOCK(ct);
3397 					if ((rv != NDI_SUCCESS) &&
3398 					    (MDI_CLIENT_STATE(ct) ==
3399 					    MDI_CLIENT_STATE_DEGRADED)) {
3400 						/*
3401 						 * ndi_devi_online failed.
3402 						 * Reset client flags to
3403 						 * offline.
3404 						 */
3405 						MDI_DEBUG(1, (CE_WARN, cdip,
3406 						    "!ndi_devi_online: failed "
3407 						    " Error: %x", rv));
3408 						MDI_CLIENT_SET_OFFLINE(ct);
3409 					}
3410 					if (rv != NDI_SUCCESS) {
3411 						/* Reset the path state */
3412 						MDI_PI_LOCK(pip);
3413 						MDI_PI(pip)->pi_state =
3414 						    MDI_PI_OLD_STATE(pip);
3415 						MDI_PI_UNLOCK(pip);
3416 					}
3417 				}
3418 				break;
3419 
3420 			case MDI_CLIENT_STATE_FAILED:
3421 				/*
3422 				 * This is the last path case for
3423 				 * non-user initiated events.
3424 				 */
3425 				if (((flag & NDI_DEVI_REMOVE) == 0) &&
3426 				    cdip && (i_ddi_node_state(cdip) >=
3427 				    DS_INITIALIZED)) {
3428 					MDI_CLIENT_UNLOCK(ct);
3429 					rv = ndi_devi_offline(cdip, 0);
3430 					MDI_CLIENT_LOCK(ct);
3431 
3432 					if (rv != NDI_SUCCESS) {
3433 						/*
3434 						 * ndi_devi_offline failed.
3435 						 * Reset client flags to
3436 						 * online as the path could not
3437 						 * be offlined.
3438 						 */
3439 						MDI_DEBUG(1, (CE_WARN, cdip,
3440 						    "!ndi_devi_offline: failed "
3441 						    " Error: %x", rv));
3442 						MDI_CLIENT_SET_ONLINE(ct);
3443 					}
3444 				}
3445 				break;
3446 			}
3447 			/*
3448 			 * Convert to MDI error code
3449 			 */
3450 			switch (rv) {
3451 			case NDI_SUCCESS:
3452 				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3453 				i_mdi_report_path_state(ct, pip);
3454 				rv = MDI_SUCCESS;
3455 				break;
3456 			case NDI_BUSY:
3457 				rv = MDI_BUSY;
3458 				break;
3459 			default:
3460 				rv = MDI_FAILURE;
3461 				break;
3462 			}
3463 		}
3464 	}
3465 	MDI_CLIENT_UNLOCK(ct);
3466 
3467 state_change_exit:
3468 	/*
3469 	 * Mark the pHCI as stable again.
3470 	 */
3471 	MDI_PHCI_LOCK(ph);
3472 	MDI_PHCI_STABLE(ph);
3473 	MDI_PHCI_UNLOCK(ph);
3474 	return (rv);
3475 }
3476 
3477 /*
3478  * mdi_pi_online():
3479  *		Place the path_info node in the online state.  The path is
3480  *		now available to be selected by mdi_select_path() for
3481  *		transporting I/O requests to client devices.
3482  * Return Values:
3483  *		MDI_SUCCESS
3484  *		MDI_FAILURE
3485  */
3486 int
3487 mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3488 {
3489 	mdi_client_t *ct = MDI_PI(pip)->pi_client;
3490 	dev_info_t *cdip;
3491 	int		client_held = 0;
3492 	int rv;
3493 
3494 	ASSERT(ct != NULL);
3495 	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
3496 	if (rv != MDI_SUCCESS)
3497 		return (rv);
3498 
3499 	MDI_PI_LOCK(pip);
3500 	if (MDI_PI(pip)->pi_pm_held == 0) {
3501 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_online "
3502 		    "i_mdi_pm_hold_pip %p\n", (void *)pip));
3503 		i_mdi_pm_hold_pip(pip);
3504 		client_held = 1;
3505 	}
3506 	MDI_PI_UNLOCK(pip);
3507 
3508 	if (client_held) {
3509 		MDI_CLIENT_LOCK(ct);
3510 		if (ct->ct_power_cnt == 0) {
3511 			rv = i_mdi_power_all_phci(ct);
3512 		}
3513 
3514 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_online "
3515 		    "i_mdi_pm_hold_client %p\n", (void *)ct));
3516 		i_mdi_pm_hold_client(ct, 1);
3517 		MDI_CLIENT_UNLOCK(ct);
3518 	}
3519 
3520 	/*
3521 	 * Create the per-path (pathinfo) IO and error kstats which
3522 	 * are reported via iostat(1m).
3523 	 *
3524 	 * Defer creating the per-path kstats if device is not yet
3525 	 * attached;  the names of the kstats are constructed in part
3526 	 * using the devices instance number which is assigned during
3527 	 * process of attaching the client device.
3528 	 *
3529 	 * The framework post_attach handler, mdi_post_attach(), is
3530 	 * is responsible for initializing the client's pathinfo list
3531 	 * once successfully attached.
3532 	 */
3533 	cdip = ct->ct_dip;
3534 	ASSERT(cdip);
3535 	if (cdip == NULL || !i_ddi_devi_attached(cdip))
3536 		return (rv);
3537 
3538 	MDI_CLIENT_LOCK(ct);
3539 	rv = i_mdi_pi_kstat_create(pip);
3540 	MDI_CLIENT_UNLOCK(ct);
3541 	return (rv);
3542 }
3543 
3544 /*
3545  * mdi_pi_standby():
3546  *		Place the mdi_pathinfo node in standby state
3547  *
3548  * Return Values:
3549  *		MDI_SUCCESS
3550  *		MDI_FAILURE
3551  */
3552 int
3553 mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
3554 {
3555 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
3556 }
3557 
3558 /*
3559  * mdi_pi_fault():
3560  *		Place the mdi_pathinfo node in fault'ed state
3561  * Return Values:
3562  *		MDI_SUCCESS
3563  *		MDI_FAILURE
3564  */
3565 int
3566 mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
3567 {
3568 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
3569 }
3570 
3571 /*
3572  * mdi_pi_offline():
3573  *		Offline a mdi_pathinfo node.
3574  * Return Values:
3575  *		MDI_SUCCESS
3576  *		MDI_FAILURE
3577  */
3578 int
3579 mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3580 {
3581 	int	ret, client_held = 0;
3582 	mdi_client_t	*ct;
3583 
3584 	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
3585 
3586 	if (ret == MDI_SUCCESS) {
3587 		MDI_PI_LOCK(pip);
3588 		if (MDI_PI(pip)->pi_pm_held) {
3589 			client_held = 1;
3590 		}
3591 		MDI_PI_UNLOCK(pip);
3592 
3593 		if (client_held) {
3594 			ct = MDI_PI(pip)->pi_client;
3595 			MDI_CLIENT_LOCK(ct);
3596 			MDI_DEBUG(4, (CE_NOTE, ct->ct_dip,
3597 			    "mdi_pi_offline i_mdi_pm_rele_client\n"));
3598 			i_mdi_pm_rele_client(ct, 1);
3599 			MDI_CLIENT_UNLOCK(ct);
3600 		}
3601 	}
3602 
3603 	return (ret);
3604 }
3605 
3606 /*
3607  * i_mdi_pi_offline():
3608  *		Offline a mdi_pathinfo node and call the vHCI driver's callback
3609  */
3610 static int
3611 i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3612 {
3613 	dev_info_t	*vdip = NULL;
3614 	mdi_vhci_t	*vh = NULL;
3615 	mdi_client_t	*ct = NULL;
3616 	int		(*f)();
3617 	int		rv;
3618 
3619 	MDI_PI_LOCK(pip);
3620 	ct = MDI_PI(pip)->pi_client;
3621 	ASSERT(ct != NULL);
3622 
3623 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3624 		/*
3625 		 * Give a chance for pending I/Os to complete.
3626 		 */
3627 		MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3628 		    "%d cmds still pending on path: %p\n",
3629 		    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
3630 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
3631 		    &MDI_PI(pip)->pi_mutex,
3632 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
3633 			/*
3634 			 * The timeout time reached without ref_cnt being zero
3635 			 * being signaled.
3636 			 */
3637 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3638 			    "Timeout reached on path %p without the cond\n",
3639 			    (void *)pip));
3640 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3641 			    "%d cmds still pending on path: %p\n",
3642 			    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
3643 		}
3644 	}
3645 	vh = ct->ct_vhci;
3646 	vdip = vh->vh_dip;
3647 
3648 	/*
3649 	 * Notify vHCI that has registered this event
3650 	 */
3651 	ASSERT(vh->vh_ops);
3652 	f = vh->vh_ops->vo_pi_state_change;
3653 
3654 	if (f != NULL) {
3655 		MDI_PI_UNLOCK(pip);
3656 		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
3657 		    flags)) != MDI_SUCCESS) {
3658 			MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
3659 			    "!vo_path_offline failed "
3660 			    "vdip %p, pip %p", (void *)vdip, (void *)pip));
3661 		}
3662 		MDI_PI_LOCK(pip);
3663 	}
3664 
3665 	/*
3666 	 * Set the mdi_pathinfo node state and clear the transient condition
3667 	 */
3668 	MDI_PI_SET_OFFLINE(pip);
3669 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3670 	MDI_PI_UNLOCK(pip);
3671 
3672 	MDI_CLIENT_LOCK(ct);
3673 	if (rv == MDI_SUCCESS) {
3674 		if (ct->ct_unstable == 0) {
3675 			dev_info_t	*cdip = ct->ct_dip;
3676 
3677 			/*
3678 			 * Onlining the mdi_pathinfo node will impact the
3679 			 * client state Update the client and dev_info node
3680 			 * state accordingly
3681 			 */
3682 			i_mdi_client_update_state(ct);
3683 			rv = NDI_SUCCESS;
3684 			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
3685 				if (cdip &&
3686 				    (i_ddi_node_state(cdip) >=
3687 				    DS_INITIALIZED)) {
3688 					MDI_CLIENT_UNLOCK(ct);
3689 					rv = ndi_devi_offline(cdip, 0);
3690 					MDI_CLIENT_LOCK(ct);
3691 					if (rv != NDI_SUCCESS) {
3692 						/*
3693 						 * ndi_devi_offline failed.
3694 						 * Reset client flags to
3695 						 * online.
3696 						 */
3697 						MDI_DEBUG(4, (CE_WARN, cdip,
3698 						    "!ndi_devi_offline: failed "
3699 						    " Error: %x", rv));
3700 						MDI_CLIENT_SET_ONLINE(ct);
3701 					}
3702 				}
3703 			}
3704 			/*
3705 			 * Convert to MDI error code
3706 			 */
3707 			switch (rv) {
3708 			case NDI_SUCCESS:
3709 				rv = MDI_SUCCESS;
3710 				break;
3711 			case NDI_BUSY:
3712 				rv = MDI_BUSY;
3713 				break;
3714 			default:
3715 				rv = MDI_FAILURE;
3716 				break;
3717 			}
3718 		}
3719 		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3720 		i_mdi_report_path_state(ct, pip);
3721 	}
3722 
3723 	MDI_CLIENT_UNLOCK(ct);
3724 
3725 	/*
3726 	 * Change in the mdi_pathinfo node state will impact the client state
3727 	 */
3728 	MDI_DEBUG(2, (CE_NOTE, NULL, "!i_mdi_pi_offline ct = %p pip = %p",
3729 	    (void *)ct, (void *)pip));
3730 	return (rv);
3731 }
3732 
3733 
3734 /*
3735  * mdi_pi_get_addr():
3736  *		Get the unit address associated with a mdi_pathinfo node
3737  *
3738  * Return Values:
3739  *		char *
3740  */
3741 char *
3742 mdi_pi_get_addr(mdi_pathinfo_t *pip)
3743 {
3744 	if (pip == NULL)
3745 		return (NULL);
3746 
3747 	return (MDI_PI(pip)->pi_addr);
3748 }
3749 
3750 /*
3751  * mdi_pi_get_client():
3752  *		Get the client devinfo associated with a mdi_pathinfo node
3753  *
3754  * Return Values:
3755  *		Handle to client device dev_info node
3756  */
3757 dev_info_t *
3758 mdi_pi_get_client(mdi_pathinfo_t *pip)
3759 {
3760 	dev_info_t	*dip = NULL;
3761 	if (pip) {
3762 		dip = MDI_PI(pip)->pi_client->ct_dip;
3763 	}
3764 	return (dip);
3765 }
3766 
3767 /*
3768  * mdi_pi_get_phci():
3769  *		Get the pHCI devinfo associated with the mdi_pathinfo node
3770  * Return Values:
3771  *		Handle to dev_info node
3772  */
3773 dev_info_t *
3774 mdi_pi_get_phci(mdi_pathinfo_t *pip)
3775 {
3776 	dev_info_t	*dip = NULL;
3777 	if (pip) {
3778 		dip = MDI_PI(pip)->pi_phci->ph_dip;
3779 	}
3780 	return (dip);
3781 }
3782 
3783 /*
3784  * mdi_pi_get_client_private():
3785  *		Get the client private information associated with the
3786  *		mdi_pathinfo node
3787  */
3788 void *
3789 mdi_pi_get_client_private(mdi_pathinfo_t *pip)
3790 {
3791 	void *cprivate = NULL;
3792 	if (pip) {
3793 		cprivate = MDI_PI(pip)->pi_cprivate;
3794 	}
3795 	return (cprivate);
3796 }
3797 
3798 /*
3799  * mdi_pi_set_client_private():
3800  *		Set the client private information in the mdi_pathinfo node
3801  */
3802 void
3803 mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
3804 {
3805 	if (pip) {
3806 		MDI_PI(pip)->pi_cprivate = priv;
3807 	}
3808 }
3809 
3810 /*
3811  * mdi_pi_get_phci_private():
3812  *		Get the pHCI private information associated with the
3813  *		mdi_pathinfo node
3814  */
3815 caddr_t
3816 mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
3817 {
3818 	caddr_t	pprivate = NULL;
3819 	if (pip) {
3820 		pprivate = MDI_PI(pip)->pi_pprivate;
3821 	}
3822 	return (pprivate);
3823 }
3824 
3825 /*
3826  * mdi_pi_set_phci_private():
3827  *		Set the pHCI private information in the mdi_pathinfo node
3828  */
3829 void
3830 mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
3831 {
3832 	if (pip) {
3833 		MDI_PI(pip)->pi_pprivate = priv;
3834 	}
3835 }
3836 
3837 /*
3838  * mdi_pi_get_state():
3839  *		Get the mdi_pathinfo node state. Transient states are internal
3840  *		and not provided to the users
3841  */
3842 mdi_pathinfo_state_t
3843 mdi_pi_get_state(mdi_pathinfo_t *pip)
3844 {
3845 	mdi_pathinfo_state_t    state = MDI_PATHINFO_STATE_INIT;
3846 
3847 	if (pip) {
3848 		if (MDI_PI_IS_TRANSIENT(pip)) {
3849 			/*
3850 			 * mdi_pathinfo is in state transition.  Return the
3851 			 * last good state.
3852 			 */
3853 			state = MDI_PI_OLD_STATE(pip);
3854 		} else {
3855 			state = MDI_PI_STATE(pip);
3856 		}
3857 	}
3858 	return (state);
3859 }
3860 
3861 /*
3862  * Note that the following function needs to be the new interface for
3863  * mdi_pi_get_state when mpxio gets integrated to ON.
3864  */
3865 int
3866 mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
3867 		uint32_t *ext_state)
3868 {
3869 	*state = MDI_PATHINFO_STATE_INIT;
3870 
3871 	if (pip) {
3872 		if (MDI_PI_IS_TRANSIENT(pip)) {
3873 			/*
3874 			 * mdi_pathinfo is in state transition.  Return the
3875 			 * last good state.
3876 			 */
3877 			*state = MDI_PI_OLD_STATE(pip);
3878 			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
3879 		} else {
3880 			*state = MDI_PI_STATE(pip);
3881 			*ext_state = MDI_PI_EXT_STATE(pip);
3882 		}
3883 	}
3884 	return (MDI_SUCCESS);
3885 }
3886 
3887 /*
3888  * mdi_pi_get_preferred:
3889  *	Get the preferred path flag
3890  */
3891 int
3892 mdi_pi_get_preferred(mdi_pathinfo_t *pip)
3893 {
3894 	if (pip) {
3895 		return (MDI_PI(pip)->pi_preferred);
3896 	}
3897 	return (0);
3898 }
3899 
3900 /*
3901  * mdi_pi_set_preferred:
3902  *	Set the preferred path flag
3903  */
3904 void
3905 mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
3906 {
3907 	if (pip) {
3908 		MDI_PI(pip)->pi_preferred = preferred;
3909 	}
3910 }
3911 
3912 /*
3913  * mdi_pi_set_state():
3914  *		Set the mdi_pathinfo node state
3915  */
3916 void
3917 mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
3918 {
3919 	uint32_t	ext_state;
3920 
3921 	if (pip) {
3922 		ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
3923 		MDI_PI(pip)->pi_state = state;
3924 		MDI_PI(pip)->pi_state |= ext_state;
3925 	}
3926 }
3927 
3928 /*
3929  * Property functions:
3930  */
3931 int
3932 i_map_nvlist_error_to_mdi(int val)
3933 {
3934 	int rv;
3935 
3936 	switch (val) {
3937 	case 0:
3938 		rv = DDI_PROP_SUCCESS;
3939 		break;
3940 	case EINVAL:
3941 	case ENOTSUP:
3942 		rv = DDI_PROP_INVAL_ARG;
3943 		break;
3944 	case ENOMEM:
3945 		rv = DDI_PROP_NO_MEMORY;
3946 		break;
3947 	default:
3948 		rv = DDI_PROP_NOT_FOUND;
3949 		break;
3950 	}
3951 	return (rv);
3952 }
3953 
3954 /*
3955  * mdi_pi_get_next_prop():
3956  * 		Property walk function.  The caller should hold mdi_pi_lock()
3957  *		and release by calling mdi_pi_unlock() at the end of walk to
3958  *		get a consistent value.
3959  */
3960 nvpair_t *
3961 mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
3962 {
3963 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
3964 		return (NULL);
3965 	}
3966 	ASSERT(MDI_PI_LOCKED(pip));
3967 	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
3968 }
3969 
3970 /*
3971  * mdi_prop_remove():
3972  * 		Remove the named property from the named list.
3973  */
3974 int
3975 mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
3976 {
3977 	if (pip == NULL) {
3978 		return (DDI_PROP_NOT_FOUND);
3979 	}
3980 	ASSERT(!MDI_PI_LOCKED(pip));
3981 	MDI_PI_LOCK(pip);
3982 	if (MDI_PI(pip)->pi_prop == NULL) {
3983 		MDI_PI_UNLOCK(pip);
3984 		return (DDI_PROP_NOT_FOUND);
3985 	}
3986 	if (name) {
3987 		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
3988 	} else {
3989 		char		nvp_name[MAXNAMELEN];
3990 		nvpair_t	*nvp;
3991 		nvp = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
3992 		while (nvp) {
3993 			nvpair_t	*next;
3994 			next = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, nvp);
3995 			(void) snprintf(nvp_name, MAXNAMELEN, "%s",
3996 			    nvpair_name(nvp));
3997 			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
3998 			    nvp_name);
3999 			nvp = next;
4000 		}
4001 	}
4002 	MDI_PI_UNLOCK(pip);
4003 	return (DDI_PROP_SUCCESS);
4004 }
4005 
4006 /*
4007  * mdi_prop_size():
4008  * 		Get buffer size needed to pack the property data.
4009  * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
4010  *		buffer size.
4011  */
4012 int
4013 mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
4014 {
4015 	int	rv;
4016 	size_t	bufsize;
4017 
4018 	*buflenp = 0;
4019 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4020 		return (DDI_PROP_NOT_FOUND);
4021 	}
4022 	ASSERT(MDI_PI_LOCKED(pip));
4023 	rv = nvlist_size(MDI_PI(pip)->pi_prop,
4024 	    &bufsize, NV_ENCODE_NATIVE);
4025 	*buflenp = bufsize;
4026 	return (i_map_nvlist_error_to_mdi(rv));
4027 }
4028 
4029 /*
4030  * mdi_prop_pack():
4031  * 		pack the property list.  The caller should hold the
4032  *		mdi_pathinfo_t node to get a consistent data
4033  */
4034 int
4035 mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
4036 {
4037 	int	rv;
4038 	size_t	bufsize;
4039 
4040 	if ((pip == NULL) || MDI_PI(pip)->pi_prop == NULL) {
4041 		return (DDI_PROP_NOT_FOUND);
4042 	}
4043 
4044 	ASSERT(MDI_PI_LOCKED(pip));
4045 
4046 	bufsize = buflen;
4047 	rv = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, (size_t *)&bufsize,
4048 	    NV_ENCODE_NATIVE, KM_SLEEP);
4049 
4050 	return (i_map_nvlist_error_to_mdi(rv));
4051 }
4052 
4053 /*
4054  * mdi_prop_update_byte():
4055  *		Create/Update a byte property
4056  */
4057 int
4058 mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
4059 {
4060 	int rv;
4061 
4062 	if (pip == NULL) {
4063 		return (DDI_PROP_INVAL_ARG);
4064 	}
4065 	ASSERT(!MDI_PI_LOCKED(pip));
4066 	MDI_PI_LOCK(pip);
4067 	if (MDI_PI(pip)->pi_prop == NULL) {
4068 		MDI_PI_UNLOCK(pip);
4069 		return (DDI_PROP_NOT_FOUND);
4070 	}
4071 	rv = nvlist_add_byte(MDI_PI(pip)->pi_prop, name, data);
4072 	MDI_PI_UNLOCK(pip);
4073 	return (i_map_nvlist_error_to_mdi(rv));
4074 }
4075 
4076 /*
4077  * mdi_prop_update_byte_array():
4078  *		Create/Update a byte array property
4079  */
4080 int
4081 mdi_prop_update_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t *data,
4082     uint_t nelements)
4083 {
4084 	int rv;
4085 
4086 	if (pip == NULL) {
4087 		return (DDI_PROP_INVAL_ARG);
4088 	}
4089 	ASSERT(!MDI_PI_LOCKED(pip));
4090 	MDI_PI_LOCK(pip);
4091 	if (MDI_PI(pip)->pi_prop == NULL) {
4092 		MDI_PI_UNLOCK(pip);
4093 		return (DDI_PROP_NOT_FOUND);
4094 	}
4095 	rv = nvlist_add_byte_array(MDI_PI(pip)->pi_prop, name, data, nelements);
4096 	MDI_PI_UNLOCK(pip);
4097 	return (i_map_nvlist_error_to_mdi(rv));
4098 }
4099 
4100 /*
4101  * mdi_prop_update_int():
4102  *		Create/Update a 32 bit integer property
4103  */
4104 int
4105 mdi_prop_update_int(mdi_pathinfo_t *pip, char *name, int data)
4106 {
4107 	int rv;
4108 
4109 	if (pip == NULL) {
4110 		return (DDI_PROP_INVAL_ARG);
4111 	}
4112 	ASSERT(!MDI_PI_LOCKED(pip));
4113 	MDI_PI_LOCK(pip);
4114 	if (MDI_PI(pip)->pi_prop == NULL) {
4115 		MDI_PI_UNLOCK(pip);
4116 		return (DDI_PROP_NOT_FOUND);
4117 	}
4118 	rv = nvlist_add_int32(MDI_PI(pip)->pi_prop, name, (int32_t)data);
4119 	MDI_PI_UNLOCK(pip);
4120 	return (i_map_nvlist_error_to_mdi(rv));
4121 }
4122 
4123 /*
4124  * mdi_prop_update_int64():
4125  *		Create/Update a 64 bit integer property
4126  */
4127 int
4128 mdi_prop_update_int64(mdi_pathinfo_t *pip, char *name, int64_t data)
4129 {
4130 	int rv;
4131 
4132 	if (pip == NULL) {
4133 		return (DDI_PROP_INVAL_ARG);
4134 	}
4135 	ASSERT(!MDI_PI_LOCKED(pip));
4136 	MDI_PI_LOCK(pip);
4137 	if (MDI_PI(pip)->pi_prop == NULL) {
4138 		MDI_PI_UNLOCK(pip);
4139 		return (DDI_PROP_NOT_FOUND);
4140 	}
4141 	rv = nvlist_add_int64(MDI_PI(pip)->pi_prop, name, data);
4142 	MDI_PI_UNLOCK(pip);
4143 	return (i_map_nvlist_error_to_mdi(rv));
4144 }
4145 
4146 /*
4147  * mdi_prop_update_int_array():
4148  *		Create/Update a int array property
4149  */
4150 int
4151 mdi_prop_update_int_array(mdi_pathinfo_t *pip, char *name, int *data,
4152 	    uint_t nelements)
4153 {
4154 	int rv;
4155 
4156 	if (pip == NULL) {
4157 		return (DDI_PROP_INVAL_ARG);
4158 	}
4159 	ASSERT(!MDI_PI_LOCKED(pip));
4160 	MDI_PI_LOCK(pip);
4161 	if (MDI_PI(pip)->pi_prop == NULL) {
4162 		MDI_PI_UNLOCK(pip);
4163 		return (DDI_PROP_NOT_FOUND);
4164 	}
4165 	rv = nvlist_add_int32_array(MDI_PI(pip)->pi_prop, name, (int32_t *)data,
4166 	    nelements);
4167 	MDI_PI_UNLOCK(pip);
4168 	return (i_map_nvlist_error_to_mdi(rv));
4169 }
4170 
4171 /*
4172  * mdi_prop_update_string():
4173  *		Create/Update a string property
4174  */
4175 int
4176 mdi_prop_update_string(mdi_pathinfo_t *pip, char *name, char *data)
4177 {
4178 	int rv;
4179 
4180 	if (pip == NULL) {
4181 		return (DDI_PROP_INVAL_ARG);
4182 	}
4183 	ASSERT(!MDI_PI_LOCKED(pip));
4184 	MDI_PI_LOCK(pip);
4185 	if (MDI_PI(pip)->pi_prop == NULL) {
4186 		MDI_PI_UNLOCK(pip);
4187 		return (DDI_PROP_NOT_FOUND);
4188 	}
4189 	rv = nvlist_add_string(MDI_PI(pip)->pi_prop, name, data);
4190 	MDI_PI_UNLOCK(pip);
4191 	return (i_map_nvlist_error_to_mdi(rv));
4192 }
4193 
4194 /*
4195  * mdi_prop_update_string_array():
4196  *		Create/Update a string array property
4197  */
4198 int
4199 mdi_prop_update_string_array(mdi_pathinfo_t *pip, char *name, char **data,
4200     uint_t nelements)
4201 {
4202 	int rv;
4203 
4204 	if (pip == NULL) {
4205 		return (DDI_PROP_INVAL_ARG);
4206 	}
4207 	ASSERT(!MDI_PI_LOCKED(pip));
4208 	MDI_PI_LOCK(pip);
4209 	if (MDI_PI(pip)->pi_prop == NULL) {
4210 		MDI_PI_UNLOCK(pip);
4211 		return (DDI_PROP_NOT_FOUND);
4212 	}
4213 	rv = nvlist_add_string_array(MDI_PI(pip)->pi_prop, name, data,
4214 	    nelements);
4215 	MDI_PI_UNLOCK(pip);
4216 	return (i_map_nvlist_error_to_mdi(rv));
4217 }
4218 
4219 /*
4220  * mdi_prop_lookup_byte():
4221  * 		Look for byte property identified by name.  The data returned
4222  *		is the actual property and valid as long as mdi_pathinfo_t node
4223  *		is alive.
4224  */
4225 int
4226 mdi_prop_lookup_byte(mdi_pathinfo_t *pip, char *name, uchar_t *data)
4227 {
4228 	int rv;
4229 
4230 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4231 		return (DDI_PROP_NOT_FOUND);
4232 	}
4233 	rv = nvlist_lookup_byte(MDI_PI(pip)->pi_prop, name, data);
4234 	return (i_map_nvlist_error_to_mdi(rv));
4235 }
4236 
4237 
4238 /*
4239  * mdi_prop_lookup_byte_array():
4240  * 		Look for byte array property identified by name.  The data
4241  *		returned is the actual property and valid as long as
4242  *		mdi_pathinfo_t node is alive.
4243  */
4244 int
4245 mdi_prop_lookup_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t **data,
4246     uint_t *nelements)
4247 {
4248 	int rv;
4249 
4250 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4251 		return (DDI_PROP_NOT_FOUND);
4252 	}
4253 	rv = nvlist_lookup_byte_array(MDI_PI(pip)->pi_prop, name, data,
4254 	    nelements);
4255 	return (i_map_nvlist_error_to_mdi(rv));
4256 }
4257 
4258 /*
4259  * mdi_prop_lookup_int():
4260  * 		Look for int property identified by name.  The data returned
4261  *		is the actual property and valid as long as mdi_pathinfo_t
4262  *		node is alive.
4263  */
4264 int
4265 mdi_prop_lookup_int(mdi_pathinfo_t *pip, char *name, int *data)
4266 {
4267 	int rv;
4268 
4269 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4270 		return (DDI_PROP_NOT_FOUND);
4271 	}
4272 	rv = nvlist_lookup_int32(MDI_PI(pip)->pi_prop, name, (int32_t *)data);
4273 	return (i_map_nvlist_error_to_mdi(rv));
4274 }
4275 
4276 /*
4277  * mdi_prop_lookup_int64():
4278  * 		Look for int64 property identified by name.  The data returned
4279  *		is the actual property and valid as long as mdi_pathinfo_t node
4280  *		is alive.
4281  */
4282 int
4283 mdi_prop_lookup_int64(mdi_pathinfo_t *pip, char *name, int64_t *data)
4284 {
4285 	int rv;
4286 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4287 		return (DDI_PROP_NOT_FOUND);
4288 	}
4289 	rv = nvlist_lookup_int64(MDI_PI(pip)->pi_prop, name, data);
4290 	return (i_map_nvlist_error_to_mdi(rv));
4291 }
4292 
4293 /*
4294  * mdi_prop_lookup_int_array():
4295  * 		Look for int array property identified by name.  The data
4296  *		returned is the actual property and valid as long as
4297  *		mdi_pathinfo_t node is alive.
4298  */
4299 int
4300 mdi_prop_lookup_int_array(mdi_pathinfo_t *pip, char *name, int **data,
4301     uint_t *nelements)
4302 {
4303 	int rv;
4304 
4305 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4306 		return (DDI_PROP_NOT_FOUND);
4307 	}
4308 	rv = nvlist_lookup_int32_array(MDI_PI(pip)->pi_prop, name,
4309 	    (int32_t **)data, nelements);
4310 	return (i_map_nvlist_error_to_mdi(rv));
4311 }
4312 
4313 /*
4314  * mdi_prop_lookup_string():
4315  * 		Look for string property identified by name.  The data
4316  *		returned is the actual property and valid as long as
4317  *		mdi_pathinfo_t node is alive.
4318  */
4319 int
4320 mdi_prop_lookup_string(mdi_pathinfo_t *pip, char *name, char **data)
4321 {
4322 	int rv;
4323 
4324 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4325 		return (DDI_PROP_NOT_FOUND);
4326 	}
4327 	rv = nvlist_lookup_string(MDI_PI(pip)->pi_prop, name, data);
4328 	return (i_map_nvlist_error_to_mdi(rv));
4329 }
4330 
4331 /*
4332  * mdi_prop_lookup_string_array():
4333  * 		Look for string array property identified by name.  The data
4334  *		returned is the actual property and valid as long as
4335  *		mdi_pathinfo_t node is alive.
4336  */
4337 int
4338 mdi_prop_lookup_string_array(mdi_pathinfo_t *pip, char *name, char ***data,
4339     uint_t *nelements)
4340 {
4341 	int rv;
4342 
4343 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4344 		return (DDI_PROP_NOT_FOUND);
4345 	}
4346 	rv = nvlist_lookup_string_array(MDI_PI(pip)->pi_prop, name, data,
4347 	    nelements);
4348 	return (i_map_nvlist_error_to_mdi(rv));
4349 }
4350 
4351 /*
4352  * mdi_prop_free():
4353  * 		Symmetrical function to ddi_prop_free(). nvlist_lookup_xx()
4354  *		functions return the pointer to actual property data and not a
4355  *		copy of it.  So the data returned is valid as long as
4356  *		mdi_pathinfo_t node is valid.
4357  */
4358 /*ARGSUSED*/
4359 int
4360 mdi_prop_free(void *data)
4361 {
4362 	return (DDI_PROP_SUCCESS);
4363 }
4364 
4365 /*ARGSUSED*/
4366 static void
4367 i_mdi_report_path_state(mdi_client_t *ct, mdi_pathinfo_t *pip)
4368 {
4369 	char		*phci_path, *ct_path;
4370 	char		*ct_status;
4371 	char		*status;
4372 	dev_info_t	*dip = ct->ct_dip;
4373 	char		lb_buf[64];
4374 
4375 	ASSERT(MDI_CLIENT_LOCKED(ct));
4376 	if ((dip == NULL) || (ddi_get_instance(dip) == -1) ||
4377 	    (MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) == 0)) {
4378 		return;
4379 	}
4380 	if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL) {
4381 		ct_status = "optimal";
4382 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
4383 		ct_status = "degraded";
4384 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
4385 		ct_status = "failed";
4386 	} else {
4387 		ct_status = "unknown";
4388 	}
4389 
4390 	if (MDI_PI_IS_OFFLINE(pip)) {
4391 		status = "offline";
4392 	} else if (MDI_PI_IS_ONLINE(pip)) {
4393 		status = "online";
4394 	} else if (MDI_PI_IS_STANDBY(pip)) {
4395 		status = "standby";
4396 	} else if (MDI_PI_IS_FAULT(pip)) {
4397 		status = "faulted";
4398 	} else {
4399 		status = "unknown";
4400 	}
4401 
4402 	if (ct->ct_lb == LOAD_BALANCE_LBA) {
4403 		(void) snprintf(lb_buf, sizeof (lb_buf),
4404 		    "%s, region-size: %d", mdi_load_balance_lba,
4405 			ct->ct_lb_args->region_size);
4406 	} else if (ct->ct_lb == LOAD_BALANCE_NONE) {
4407 		(void) snprintf(lb_buf, sizeof (lb_buf),
4408 		    "%s", mdi_load_balance_none);
4409 	} else {
4410 		(void) snprintf(lb_buf, sizeof (lb_buf), "%s",
4411 		    mdi_load_balance_rr);
4412 	}
4413 
4414 	if (dip) {
4415 		ct_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4416 		phci_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4417 		cmn_err(CE_CONT, "?%s (%s%d) multipath status: %s, "
4418 		    "path %s (%s%d) to target address: %s is %s"
4419 		    " Load balancing: %s\n",
4420 		    ddi_pathname(dip, ct_path), ddi_driver_name(dip),
4421 		    ddi_get_instance(dip), ct_status,
4422 		    ddi_pathname(MDI_PI(pip)->pi_phci->ph_dip, phci_path),
4423 		    ddi_driver_name(MDI_PI(pip)->pi_phci->ph_dip),
4424 		    ddi_get_instance(MDI_PI(pip)->pi_phci->ph_dip),
4425 		    MDI_PI(pip)->pi_addr, status, lb_buf);
4426 		kmem_free(phci_path, MAXPATHLEN);
4427 		kmem_free(ct_path, MAXPATHLEN);
4428 		MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct);
4429 	}
4430 }
4431 
#ifdef	DEBUG
/*
 * i_mdi_log():
 *		Debug message helper.  Formats the caller's message, prefixes
 *		it with "mdi: " and, when a dev_info node is supplied, with
 *		"<node name><instance>: ", and hands it to cmn_err().
 *
 *		A leading '!', '?' or '^' in the formatted message is removed
 *		from the text and re-applied as the first character of the
 *		cmn_err() format.  When mdi_debug_logonly is set, '!' is
 *		always used.  CE_NOTE is issued as CE_CONT; levels other than
 *		CE_NOTE, CE_CONT, CE_WARN and CE_PANIC are passed through
 *		without any destination prefix.
 */
/*PRINTFLIKE3*/
static void
i_mdi_log(int level, dev_info_t *dip, const char *fmt, ...)
{
	char		tag[MAXNAMELEN];
	char		msg[MAXNAMELEN];
	char		*text = msg;
	char		route = '\0';	/* '!', '?', '^' or none */
	va_list		args;

	/* "<name><instance>: " tag, or an empty tag without a node */
	if (dip != NULL) {
		(void) snprintf(tag, MAXNAMELEN, "%s%d: ",
		    ddi_node_name(dip), ddi_get_instance(dip));
	} else {
		tag[0] = 0;
	}

	va_start(args, fmt);
	(void) vsnprintf(msg, MAXNAMELEN, fmt, args);
	va_end(args);

	/* peel a destination character off the front of the message */
	if (msg[0] == '!' || msg[0] == '?' || msg[0] == '^') {
		route = msg[0];
		text = &msg[1];
	}

	/* global override: everything goes to the log only */
	if (mdi_debug_logonly) {
		route = '!';
	}

	if (level == CE_NOTE) {
		level = CE_CONT;
	}

	/* unrecognized levels never get a destination prefix */
	if (level != CE_CONT && level != CE_WARN && level != CE_PANIC) {
		cmn_err(level, "mdi: %s%s", tag, text);
		return;
	}

	/*
	 * The destination character has to be the first character of the
	 * format itself, hence one literal format per destination.
	 */
	switch (route) {
	case '?':
		cmn_err(level, "?mdi: %s%s", tag, text);
		break;
	case '^':
		cmn_err(level, "^mdi: %s%s", tag, text);
		break;
	case '!':
		cmn_err(level, "!mdi: %s%s", tag, text);
		break;
	default:
		cmn_err(level, "mdi: %s%s", tag, text);
		break;
	}
}
#endif	/* DEBUG */
4507 
4508 void
4509 i_mdi_client_online(dev_info_t *ct_dip)
4510 {
4511 	mdi_client_t	*ct;
4512 
4513 	/*
4514 	 * Client online notification. Mark client state as online
4515 	 * restore our binding with dev_info node
4516 	 */
4517 	ct = i_devi_get_client(ct_dip);
4518 	ASSERT(ct != NULL);
4519 	MDI_CLIENT_LOCK(ct);
4520 	MDI_CLIENT_SET_ONLINE(ct);
4521 	/* catch for any memory leaks */
4522 	ASSERT((ct->ct_dip == NULL) || (ct->ct_dip == ct_dip));
4523 	ct->ct_dip = ct_dip;
4524 
4525 	if (ct->ct_power_cnt == 0)
4526 		(void) i_mdi_power_all_phci(ct);
4527 
4528 	MDI_DEBUG(4, (CE_NOTE, ct_dip, "i_mdi_client_online "
4529 	    "i_mdi_pm_hold_client %p\n", (void *)ct));
4530 	i_mdi_pm_hold_client(ct, 1);
4531 
4532 	MDI_CLIENT_UNLOCK(ct);
4533 }
4534 
4535 void
4536 i_mdi_phci_online(dev_info_t *ph_dip)
4537 {
4538 	mdi_phci_t	*ph;
4539 
4540 	/* pHCI online notification. Mark state accordingly */
4541 	ph = i_devi_get_phci(ph_dip);
4542 	ASSERT(ph != NULL);
4543 	MDI_PHCI_LOCK(ph);
4544 	MDI_PHCI_SET_ONLINE(ph);
4545 	MDI_PHCI_UNLOCK(ph);
4546 }
4547 
4548 /*
4549  * mdi_devi_online():
4550  * 		Online notification from NDI framework on pHCI/client
4551  *		device online.
4552  * Return Values:
4553  *		NDI_SUCCESS
4554  *		MDI_FAILURE
4555  */
4556 /*ARGSUSED*/
4557 int
4558 mdi_devi_online(dev_info_t *dip, uint_t flags)
4559 {
4560 	if (MDI_PHCI(dip)) {
4561 		i_mdi_phci_online(dip);
4562 	}
4563 
4564 	if (MDI_CLIENT(dip)) {
4565 		i_mdi_client_online(dip);
4566 	}
4567 	return (NDI_SUCCESS);
4568 }
4569 
4570 /*
4571  * mdi_devi_offline():
4572  * 		Offline notification from NDI framework on pHCI/Client device
4573  *		offline.
4574  *
4575  * Return Values:
4576  *		NDI_SUCCESS
4577  *		NDI_FAILURE
4578  */
4579 /*ARGSUSED*/
4580 int
4581 mdi_devi_offline(dev_info_t *dip, uint_t flags)
4582 {
4583 	int		rv = NDI_SUCCESS;
4584 
4585 	if (MDI_CLIENT(dip)) {
4586 		rv = i_mdi_client_offline(dip, flags);
4587 		if (rv != NDI_SUCCESS)
4588 			return (rv);
4589 	}
4590 
4591 	if (MDI_PHCI(dip)) {
4592 		rv = i_mdi_phci_offline(dip, flags);
4593 
4594 		if ((rv != NDI_SUCCESS) && MDI_CLIENT(dip)) {
4595 			/* set client back online */
4596 			i_mdi_client_online(dip);
4597 		}
4598 	}
4599 
4600 	return (rv);
4601 }
4602 
/*
 * i_mdi_phci_offline():
 *		Offline handler for the pHCI side of a dev_info node.
 *
 *		Refuses with NDI_BUSY when the pHCI is marked unstable, when
 *		a client reached through one of its paths is unstable or has
 *		a failover in progress, when a client that
 *		i_mdi_client_compute_state() reports as
 *		MDI_CLIENT_STATE_FAILED cannot be offlined, or when a path is
 *		still not offline after i_mdi_pi_offline().  Otherwise the
 *		pHCI is marked offline, every path is flagged as offlining,
 *		and each path is offlined in turn.
 *
 * Return Values:
 *		NDI_SUCCESS	(also when dip has no pHCI, or the pHCI is
 *				already offline)
 *		NDI_BUSY
 */
4603 /*ARGSUSED*/
4604 static int
4605 i_mdi_phci_offline(dev_info_t *dip, uint_t flags)
4606 {
4607 	int		rv = NDI_SUCCESS;
4608 	mdi_phci_t	*ph;
4609 	mdi_client_t	*ct;
4610 	mdi_pathinfo_t	*pip;
4611 	mdi_pathinfo_t	*next;
4612 	mdi_pathinfo_t	*failed_pip = NULL;
4613 	dev_info_t	*cdip;
4614 
4615 	/*
4616 	 * pHCI component offline notification
4617 	 * Make sure that this pHCI instance is free to be offlined.
4618 	 * If it is OK to proceed, Offline and remove all the child
4619 	 * mdi_pathinfo nodes.  This process automatically offlines
4620 	 * corresponding client devices, for which this pHCI provides
4621 	 * critical services.
4622 	 */
4623 	ph = i_devi_get_phci(dip);
4624 	MDI_DEBUG(2, (CE_NOTE, dip, "!mdi_phci_offline called %p %p\n",
4625 	    (void *)dip, (void *)ph));
4626 	if (ph == NULL) {
4627 		return (rv);
4628 	}
4629 
4630 	MDI_PHCI_LOCK(ph);
4631 
4632 	if (MDI_PHCI_IS_OFFLINE(ph)) {
4633 		MDI_DEBUG(1, (CE_WARN, dip, "!pHCI %p already offlined",
4634 		    (void *)ph));
4635 		MDI_PHCI_UNLOCK(ph);
4636 		return (NDI_SUCCESS);
4637 	}
4638 
4639 	/*
4640 	 * Check to see if the pHCI can be offlined
4641 	 */
4642 	if (ph->ph_unstable) {
4643 		MDI_DEBUG(1, (CE_WARN, dip,
4644 		    "!One or more target devices are in transient "
4645 		    "state. This device can not be removed at "
4646 		    "this moment. Please try again later."));
4647 		MDI_PHCI_UNLOCK(ph);
4648 		return (NDI_BUSY);
4649 	}
4650 
	/*
	 * Pass 1: walk every path on this pHCI, checking each client and
	 * offlining those clients that would be left in the failed state.
	 */
4651 	pip = ph->ph_path_head;
4652 	while (pip != NULL) {
4653 		MDI_PI_LOCK(pip);
4654 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4655 
4656 		/*
4657 		 * The mdi_pathinfo state is OK. Check the client state.
4658 		 * If failover in progress fail the pHCI from offlining
4659 		 */
4660 		ct = MDI_PI(pip)->pi_client;
4661 		i_mdi_client_lock(ct, pip);
4662 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
4663 		    (ct->ct_unstable)) {
4664 			/*
4665 			 * Failover is in progress, Fail the DR
4666 			 */
4667 			MDI_DEBUG(1, (CE_WARN, dip,
4668 			    "!pHCI device (%s%d) is Busy. %s",
4669 			    ddi_driver_name(dip), ddi_get_instance(dip),
4670 			    "This device can not be removed at "
4671 			    "this moment. Please try again later."));
4672 			MDI_PI_UNLOCK(pip);
4673 			i_mdi_client_unlock(ct);
4674 			MDI_PHCI_UNLOCK(ph);
4675 			return (NDI_BUSY);
4676 		}
4677 		MDI_PI_UNLOCK(pip);
4678 
4679 		/*
4680 		 * Check to see of we are removing the last path of this
4681 		 * client device...
4682 		 */
4683 		cdip = ct->ct_dip;
4684 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
4685 		    (i_mdi_client_compute_state(ct, ph) ==
4686 		    MDI_CLIENT_STATE_FAILED)) {
			/*
			 * Neither the client lock nor the pHCI lock is held
			 * across ndi_devi_offline(); the pHCI lock is taken
			 * again on both outcomes below.
			 */
4687 			i_mdi_client_unlock(ct);
4688 			MDI_PHCI_UNLOCK(ph);
4689 			if (ndi_devi_offline(cdip, 0) != NDI_SUCCESS) {
4690 				/*
4691 				 * ndi_devi_offline() failed.
4692 				 * This pHCI provides the critical path
4693 				 * to one or more client devices.
4694 				 * Return busy.
4695 				 */
4696 				MDI_PHCI_LOCK(ph);
4697 				MDI_DEBUG(1, (CE_WARN, dip,
4698 				    "!pHCI device (%s%d) is Busy. %s",
4699 				    ddi_driver_name(dip), ddi_get_instance(dip),
4700 				    "This device can not be removed at "
4701 				    "this moment. Please try again later."));
4702 				failed_pip = pip;
4703 				break;
4704 			} else {
4705 				MDI_PHCI_LOCK(ph);
4706 				pip = next;
4707 			}
4708 		} else {
4709 			i_mdi_client_unlock(ct);
4710 			pip = next;
4711 		}
4712 	}
4713 
	/*
	 * Undo pass: revisit the paths ahead of the one whose client could
	 * not be offlined and online or offline each client's dev_info node
	 * according to the client's current state, then report busy.
	 */
4714 	if (failed_pip) {
4715 		pip = ph->ph_path_head;
4716 		while (pip != failed_pip) {
4717 			MDI_PI_LOCK(pip);
4718 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4719 			ct = MDI_PI(pip)->pi_client;
4720 			i_mdi_client_lock(ct, pip);
4721 			cdip = ct->ct_dip;
4722 			switch (MDI_CLIENT_STATE(ct)) {
4723 			case MDI_CLIENT_STATE_OPTIMAL:
4724 			case MDI_CLIENT_STATE_DEGRADED:
4725 				if (cdip) {
4726 					MDI_PI_UNLOCK(pip);
4727 					i_mdi_client_unlock(ct);
4728 					MDI_PHCI_UNLOCK(ph);
4729 					(void) ndi_devi_online(cdip, 0);
4730 					MDI_PHCI_LOCK(ph);
4731 					pip = next;
4732 					continue;
4733 				}
4734 				break;
4735 
4736 			case MDI_CLIENT_STATE_FAILED:
4737 				if (cdip) {
4738 					MDI_PI_UNLOCK(pip);
4739 					i_mdi_client_unlock(ct);
4740 					MDI_PHCI_UNLOCK(ph);
4741 					(void) ndi_devi_offline(cdip, 0);
4742 					MDI_PHCI_LOCK(ph);
4743 					pip = next;
4744 					continue;
4745 				}
4746 				break;
4747 			}
			/* no dev_info node (or other state): just unlock */
4748 			MDI_PI_UNLOCK(pip);
4749 			i_mdi_client_unlock(ct);
4750 			pip = next;
4751 		}
4752 		MDI_PHCI_UNLOCK(ph);
4753 		return (NDI_BUSY);
4754 	}
4755 
4756 	/*
4757 	 * Mark the pHCI as offline
4758 	 */
4759 	MDI_PHCI_SET_OFFLINE(ph);
4760 
4761 	/*
4762 	 * Mark the child mdi_pathinfo nodes as transient
4763 	 */
4764 	pip = ph->ph_path_head;
4765 	while (pip != NULL) {
4766 		MDI_PI_LOCK(pip);
4767 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4768 		MDI_PI_SET_OFFLINING(pip);
4769 		MDI_PI_UNLOCK(pip);
4770 		pip = next;
4771 	}
4772 	MDI_PHCI_UNLOCK(ph);
4773 	/*
4774 	 * Give a chance for any pending commands to execute
4775 	 */
4776 	delay(1);
4777 	MDI_PHCI_LOCK(ph);
	/*
	 * Final pass: offline each path and verify that it really reached
	 * the offline state.
	 */
4778 	pip = ph->ph_path_head;
4779 	while (pip != NULL) {
4780 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4781 		(void) i_mdi_pi_offline(pip, flags);
4782 		MDI_PI_LOCK(pip);
		/* NOTE(review): ct is assigned here but not read afterwards */
4783 		ct = MDI_PI(pip)->pi_client;
4784 		if (!MDI_PI_IS_OFFLINE(pip)) {
4785 			MDI_DEBUG(1, (CE_WARN, dip,
4786 			    "!pHCI device (%s%d) is Busy. %s",
4787 			    ddi_driver_name(dip), ddi_get_instance(dip),
4788 			    "This device can not be removed at "
4789 			    "this moment. Please try again later."));
4790 			MDI_PI_UNLOCK(pip);
			/*
			 * Take back the offline mark set above.
			 * NOTE(review): paths already handled by this loop
			 * are not reverted here - confirm that is intended.
			 */
4791 			MDI_PHCI_SET_ONLINE(ph);
4792 			MDI_PHCI_UNLOCK(ph);
4793 			return (NDI_BUSY);
4794 		}
4795 		MDI_PI_UNLOCK(pip);
4796 		pip = next;
4797 	}
4798 	MDI_PHCI_UNLOCK(ph);
4799 
4800 	return (rv);
4801 }
4802 
4803 /*ARGSUSED*/
4804 static int
4805 i_mdi_client_offline(dev_info_t *dip, uint_t flags)
4806 {
4807 	int		rv = NDI_SUCCESS;
4808 	mdi_client_t	*ct;
4809 
4810 	/*
4811 	 * Client component to go offline.  Make sure that we are
4812 	 * not in failing over state and update client state
4813 	 * accordingly
4814 	 */
4815 	ct = i_devi_get_client(dip);
4816 	MDI_DEBUG(2, (CE_NOTE, dip, "!i_mdi_client_offline called %p %p\n",
4817 	    (void *)dip, (void *)ct));
4818 	if (ct != NULL) {
4819 		MDI_CLIENT_LOCK(ct);
4820 		if (ct->ct_unstable) {
4821 			/*
4822 			 * One or more paths are in transient state,
4823 			 * Dont allow offline of a client device
4824 			 */
4825 			MDI_DEBUG(1, (CE_WARN, dip,
4826 			    "!One or more paths to this device is "
4827 			    "in transient state. This device can not "
4828 			    "be removed at this moment. "
4829 			    "Please try again later."));
4830 			MDI_CLIENT_UNLOCK(ct);
4831 			return (NDI_BUSY);
4832 		}
4833 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
4834 			/*
4835 			 * Failover is in progress, Dont allow DR of
4836 			 * a client device
4837 			 */
4838 			MDI_DEBUG(1, (CE_WARN, dip,
4839 			    "!Client device (%s%d) is Busy. %s",
4840 			    ddi_driver_name(dip), ddi_get_instance(dip),
4841 			    "This device can not be removed at "
4842 			    "this moment. Please try again later."));
4843 			MDI_CLIENT_UNLOCK(ct);
4844 			return (NDI_BUSY);
4845 		}
4846 		MDI_CLIENT_SET_OFFLINE(ct);
4847 
4848 		/*
4849 		 * Unbind our relationship with the dev_info node
4850 		 */
4851 		if (flags & NDI_DEVI_REMOVE) {
4852 			ct->ct_dip = NULL;
4853 		}
4854 		MDI_CLIENT_UNLOCK(ct);
4855 	}
4856 	return (rv);
4857 }
4858 
4859 /*
4860  * mdi_pre_attach():
4861  *		Pre attach() notification handler
4862  */
4863 /*ARGSUSED*/
4864 int
4865 mdi_pre_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
4866 {
4867 	/* don't support old DDI_PM_RESUME */
4868 	if ((DEVI(dip)->devi_mdi_component != MDI_COMPONENT_NONE) &&
4869 	    (cmd == DDI_PM_RESUME))
4870 		return (DDI_FAILURE);
4871 
4872 	return (DDI_SUCCESS);
4873 }
4874 
4875 /*
4876  * mdi_post_attach():
4877  *		Post attach() notification handler
4878  */
4879 /*ARGSUSED*/
4880 void
4881 mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
4882 {
4883 	mdi_phci_t	*ph;
4884 	mdi_client_t	*ct;
4885 	mdi_pathinfo_t	*pip;
4886 
4887 	if (MDI_PHCI(dip)) {
4888 		ph = i_devi_get_phci(dip);
4889 		ASSERT(ph != NULL);
4890 
4891 		MDI_PHCI_LOCK(ph);
4892 		switch (cmd) {
4893 		case DDI_ATTACH:
4894 			MDI_DEBUG(2, (CE_NOTE, dip,
4895 			    "!pHCI post_attach: called %p\n", (void *)ph));
4896 			if (error == DDI_SUCCESS) {
4897 				MDI_PHCI_SET_ATTACH(ph);
4898 			} else {
4899 				MDI_DEBUG(1, (CE_NOTE, dip,
4900 				    "!pHCI post_attach: failed error=%d\n",
4901 				    error));
4902 				MDI_PHCI_SET_DETACH(ph);
4903 			}
4904 			break;
4905 
4906 		case DDI_RESUME:
4907 			MDI_DEBUG(2, (CE_NOTE, dip,
4908 			    "!pHCI post_resume: called %p\n", (void *)ph));
4909 			if (error == DDI_SUCCESS) {
4910 				MDI_PHCI_SET_RESUME(ph);
4911 			} else {
4912 				MDI_DEBUG(1, (CE_NOTE, dip,
4913 				    "!pHCI post_resume: failed error=%d\n",
4914 				    error));
4915 				MDI_PHCI_SET_SUSPEND(ph);
4916 			}
4917 			break;
4918 		}
4919 		MDI_PHCI_UNLOCK(ph);
4920 	}
4921 
4922 	if (MDI_CLIENT(dip)) {
4923 		ct = i_devi_get_client(dip);
4924 		ASSERT(ct != NULL);
4925 
4926 		MDI_CLIENT_LOCK(ct);
4927 		switch (cmd) {
4928 		case DDI_ATTACH:
4929 			MDI_DEBUG(2, (CE_NOTE, dip,
4930 			    "!Client post_attach: called %p\n", (void *)ct));
4931 			if (error != DDI_SUCCESS) {
4932 				MDI_DEBUG(1, (CE_NOTE, dip,
4933 				    "!Client post_attach: failed error=%d\n",
4934 				    error));
4935 				MDI_CLIENT_SET_DETACH(ct);
4936 				MDI_DEBUG(4, (CE_WARN, dip,
4937 				    "mdi_post_attach i_mdi_pm_reset_client\n"));
4938 				i_mdi_pm_reset_client(ct);
4939 				break;
4940 			}
4941 
4942 			/*
4943 			 * Client device has successfully attached.
4944 			 * Create kstats for any pathinfo structures
4945 			 * initially associated with this client.
4946 			 */
4947 			for (pip = ct->ct_path_head; pip != NULL;
4948 			    pip = (mdi_pathinfo_t *)
4949 			    MDI_PI(pip)->pi_client_link) {
4950 				if (!MDI_PI_IS_OFFLINE(pip)) {
4951 					(void) i_mdi_pi_kstat_create(pip);
4952 					i_mdi_report_path_state(ct, pip);
4953 				}
4954 			}
4955 			MDI_CLIENT_SET_ATTACH(ct);
4956 			break;
4957 
4958 		case DDI_RESUME:
4959 			MDI_DEBUG(2, (CE_NOTE, dip,
4960 			    "!Client post_attach: called %p\n", (void *)ct));
4961 			if (error == DDI_SUCCESS) {
4962 				MDI_CLIENT_SET_RESUME(ct);
4963 			} else {
4964 				MDI_DEBUG(1, (CE_NOTE, dip,
4965 				    "!Client post_resume: failed error=%d\n",
4966 				    error));
4967 				MDI_CLIENT_SET_SUSPEND(ct);
4968 			}
4969 			break;
4970 		}
4971 		MDI_CLIENT_UNLOCK(ct);
4972 	}
4973 }
4974 
4975 /*
4976  * mdi_pre_detach():
4977  *		Pre detach notification handler
4978  */
4979 /*ARGSUSED*/
4980 int
4981 mdi_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
4982 {
4983 	int rv = DDI_SUCCESS;
4984 
4985 	if (MDI_CLIENT(dip)) {
4986 		(void) i_mdi_client_pre_detach(dip, cmd);
4987 	}
4988 
4989 	if (MDI_PHCI(dip)) {
4990 		rv = i_mdi_phci_pre_detach(dip, cmd);
4991 	}
4992 
4993 	return (rv);
4994 }
4995 
/*
 * i_mdi_phci_pre_detach():
 *		pHCI half of the pre-detach notification.
 *
 *		DDI_DETACH is accepted only when the pHCI is already offline,
 *		in which case it is flagged detached.
 *
 *		DDI_SUSPEND first suspends, through devi_detach(DDI_SUSPEND),
 *		every client reached over this pHCI's paths that is neither
 *		detached nor suspended.  If one of them fails, the clients on
 *		the paths ahead of the failing one that are flagged suspended
 *		are resumed through devi_attach(DDI_RESUME); otherwise the
 *		pHCI is flagged suspended.
 *
 *		Any other command is rejected.
 *
 * Return Values:
 *		DDI_SUCCESS	(also when dip has no pHCI)
 *		DDI_FAILURE
 *		the failing status returned by devi_detach()
 */
4996 /*ARGSUSED*/
4997 static int
4998 i_mdi_phci_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
4999 {
5000 	int		rv = DDI_SUCCESS;
5001 	mdi_phci_t	*ph;
5002 	mdi_client_t	*ct;
5003 	mdi_pathinfo_t	*pip;
5004 	mdi_pathinfo_t	*failed_pip = NULL;
5005 	mdi_pathinfo_t	*next;
5006 
5007 	ph = i_devi_get_phci(dip);
5008 	if (ph == NULL) {
5009 		return (rv);
5010 	}
5011 
5012 	MDI_PHCI_LOCK(ph);
5013 	switch (cmd) {
5014 	case DDI_DETACH:
5015 		MDI_DEBUG(2, (CE_NOTE, dip,
5016 		    "!pHCI pre_detach: called %p\n", (void *)ph));
5017 		if (!MDI_PHCI_IS_OFFLINE(ph)) {
5018 			/*
5019 			 * mdi_pathinfo nodes are still attached to
5020 			 * this pHCI. Fail the detach for this pHCI.
5021 			 */
5022 			MDI_DEBUG(2, (CE_WARN, dip,
5023 			    "!pHCI pre_detach: "
5024 			    "mdi_pathinfo nodes are still attached "
5025 			    "%p\n", (void *)ph));
5026 			rv = DDI_FAILURE;
5027 			break;
5028 		}
5029 		MDI_PHCI_SET_DETACH(ph);
5030 		break;
5031 
5032 	case DDI_SUSPEND:
5033 		/*
5034 		 * pHCI is getting suspended.  Since mpxio client
5035 		 * devices may not be suspended at this point, to avoid
5036 		 * a potential stack overflow, it is important to suspend
5037 		 * client devices before pHCI can be suspended.
5038 		 */
5039 
5040 		MDI_DEBUG(2, (CE_NOTE, dip,
5041 		    "!pHCI pre_suspend: called %p\n", (void *)ph));
5042 		/*
5043 		 * Suspend all the client devices accessible through this pHCI
5044 		 */
5045 		pip = ph->ph_path_head;
5046 		while (pip != NULL && rv == DDI_SUCCESS) {
5047 			dev_info_t *cdip;
5048 			MDI_PI_LOCK(pip);
5049 			next =
5050 			    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5051 			ct = MDI_PI(pip)->pi_client;
5052 			i_mdi_client_lock(ct, pip);
5053 			cdip = ct->ct_dip;
5054 			MDI_PI_UNLOCK(pip);
5055 			if ((MDI_CLIENT_IS_DETACHED(ct) == 0) &&
5056 			    MDI_CLIENT_IS_SUSPENDED(ct) == 0) {
				/*
				 * The client lock is dropped before calling
				 * devi_detach().  NOTE(review): the pHCI lock
				 * taken at the top of this function is not
				 * released around the call.
				 */
5057 				i_mdi_client_unlock(ct);
5058 				if ((rv = devi_detach(cdip, DDI_SUSPEND)) !=
5059 				    DDI_SUCCESS) {
5060 					/*
5061 					 * Suspend of one of the client
5062 					 * device has failed.
5063 					 */
5064 					MDI_DEBUG(1, (CE_WARN, dip,
5065 					    "!Suspend of device (%s%d) failed.",
5066 					    ddi_driver_name(cdip),
5067 					    ddi_get_instance(cdip)));
					/* remember where the rollback stops */
5068 					failed_pip = pip;
5069 					break;
5070 				}
5071 			} else {
5072 				i_mdi_client_unlock(ct);
5073 			}
5074 			pip = next;
5075 		}
5076 
5077 		if (rv == DDI_SUCCESS) {
5078 			/*
5079 			 * Suspend of client devices is complete. Proceed
5080 			 * with pHCI suspend.
5081 			 */
5082 			MDI_PHCI_SET_SUSPEND(ph);
5083 		} else {
5084 			/*
5085 			 * Revert back all the suspended client device states
5086 			 * to converse.
5087 			 */
			/*
			 * Only the paths ahead of failed_pip are visited; a
			 * client is resumed only if it is flagged suspended.
			 */
5088 			pip = ph->ph_path_head;
5089 			while (pip != failed_pip) {
5090 				dev_info_t *cdip;
5091 				MDI_PI_LOCK(pip);
5092 				next =
5093 				    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5094 				ct = MDI_PI(pip)->pi_client;
5095 				i_mdi_client_lock(ct, pip);
5096 				cdip = ct->ct_dip;
5097 				MDI_PI_UNLOCK(pip);
5098 				if (MDI_CLIENT_IS_SUSPENDED(ct)) {
5099 					i_mdi_client_unlock(ct);
5100 					(void) devi_attach(cdip, DDI_RESUME);
5101 				} else {
5102 					i_mdi_client_unlock(ct);
5103 				}
5104 				pip = next;
5105 			}
5106 		}
5107 		break;
5108 
5109 	default:
5110 		rv = DDI_FAILURE;
5111 		break;
5112 	}
5113 	MDI_PHCI_UNLOCK(ph);
5114 	return (rv);
5115 }
5116 
5117 /*ARGSUSED*/
5118 static int
5119 i_mdi_client_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5120 {
5121 	int		rv = DDI_SUCCESS;
5122 	mdi_client_t	*ct;
5123 
5124 	ct = i_devi_get_client(dip);
5125 	if (ct == NULL) {
5126 		return (rv);
5127 	}
5128 
5129 	MDI_CLIENT_LOCK(ct);
5130 	switch (cmd) {
5131 	case DDI_DETACH:
5132 		MDI_DEBUG(2, (CE_NOTE, dip,
5133 		    "!Client pre_detach: called %p\n", (void *)ct));
5134 		MDI_CLIENT_SET_DETACH(ct);
5135 		break;
5136 
5137 	case DDI_SUSPEND:
5138 		MDI_DEBUG(2, (CE_NOTE, dip,
5139 		    "!Client pre_suspend: called %p\n", (void *)ct));
5140 		MDI_CLIENT_SET_SUSPEND(ct);
5141 		break;
5142 
5143 	default:
5144 		rv = DDI_FAILURE;
5145 		break;
5146 	}
5147 	MDI_CLIENT_UNLOCK(ct);
5148 	return (rv);
5149 }
5150 
5151 /*
5152  * mdi_post_detach():
5153  *		Post detach notification handler
5154  */
5155 /*ARGSUSED*/
5156 void
5157 mdi_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5158 {
5159 	/*
5160 	 * Detach/Suspend of mpxio component failed. Update our state
5161 	 * too
5162 	 */
5163 	if (MDI_PHCI(dip))
5164 		i_mdi_phci_post_detach(dip, cmd, error);
5165 
5166 	if (MDI_CLIENT(dip))
5167 		i_mdi_client_post_detach(dip, cmd, error);
5168 }
5169 
5170 /*ARGSUSED*/
5171 static void
5172 i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5173 {
5174 	mdi_phci_t	*ph;
5175 
5176 	/*
5177 	 * Detach/Suspend of phci component failed. Update our state
5178 	 * too
5179 	 */
5180 	ph = i_devi_get_phci(dip);
5181 	if (ph == NULL) {
5182 		return;
5183 	}
5184 
5185 	MDI_PHCI_LOCK(ph);
5186 	/*
5187 	 * Detach of pHCI failed. Restore back converse
5188 	 * state
5189 	 */
5190 	switch (cmd) {
5191 	case DDI_DETACH:
5192 		MDI_DEBUG(2, (CE_NOTE, dip,
5193 		    "!pHCI post_detach: called %p\n", (void *)ph));
5194 		if (error != DDI_SUCCESS)
5195 			MDI_PHCI_SET_ATTACH(ph);
5196 		break;
5197 
5198 	case DDI_SUSPEND:
5199 		MDI_DEBUG(2, (CE_NOTE, dip,
5200 		    "!pHCI post_suspend: called %p\n", (void *)ph));
5201 		if (error != DDI_SUCCESS)
5202 			MDI_PHCI_SET_RESUME(ph);
5203 		break;
5204 	}
5205 	MDI_PHCI_UNLOCK(ph);
5206 }
5207 
5208 /*ARGSUSED*/
5209 static void
5210 i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5211 {
5212 	mdi_client_t	*ct;
5213 
5214 	ct = i_devi_get_client(dip);
5215 	if (ct == NULL) {
5216 		return;
5217 	}
5218 	MDI_CLIENT_LOCK(ct);
5219 	/*
5220 	 * Detach of Client failed. Restore back converse
5221 	 * state
5222 	 */
5223 	switch (cmd) {
5224 	case DDI_DETACH:
5225 		MDI_DEBUG(2, (CE_NOTE, dip,
5226 		    "!Client post_detach: called %p\n", (void *)ct));
5227 		if (DEVI_IS_ATTACHING(ct->ct_dip)) {
5228 			MDI_DEBUG(4, (CE_NOTE, dip, "i_mdi_client_post_detach "
5229 			    "i_mdi_pm_rele_client\n"));
5230 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
5231 		} else {
5232 			MDI_DEBUG(4, (CE_NOTE, dip, "i_mdi_client_post_detach "
5233 			    "i_mdi_pm_reset_client\n"));
5234 			i_mdi_pm_reset_client(ct);
5235 		}
5236 		if (error != DDI_SUCCESS)
5237 			MDI_CLIENT_SET_ATTACH(ct);
5238 		break;
5239 
5240 	case DDI_SUSPEND:
5241 		MDI_DEBUG(2, (CE_NOTE, dip,
5242 		    "!Client post_suspend: called %p\n", (void *)ct));
5243 		if (error != DDI_SUCCESS)
5244 			MDI_CLIENT_SET_RESUME(ct);
5245 		break;
5246 	}
5247 	MDI_CLIENT_UNLOCK(ct);
5248 }
5249 
5250 /*
5251  * create and install per-path (client - pHCI) statistics
5252  * I/O stats supported: nread, nwritten, reads, and writes
5253  * Error stats - hard errors, soft errors, & transport errors
5254  */
5255 static int
5256 i_mdi_pi_kstat_create(mdi_pathinfo_t *pip)
5257 {
5258 
5259 	dev_info_t *client = MDI_PI(pip)->pi_client->ct_dip;
5260 	dev_info_t *ppath = MDI_PI(pip)->pi_phci->ph_dip;
5261 	char ksname[KSTAT_STRLEN];
5262 	mdi_pathinfo_t *cpip;
5263 	const char *err_postfix = ",err";
5264 	kstat_t	*kiosp, *kerrsp;
5265 	struct pi_errs	*nsp;
5266 	struct mdi_pi_kstats *mdi_statp;
5267 
5268 	ASSERT(client != NULL && ppath != NULL);
5269 
5270 	ASSERT(MDI_CLIENT_LOCKED(MDI_PI(pip)->pi_client));
5271 
5272 	if (MDI_PI(pip)->pi_kstats != NULL)
5273 		return (MDI_SUCCESS);
5274 
5275 	for (cpip = MDI_PI(pip)->pi_client->ct_path_head; cpip != NULL;
5276 	    cpip = (mdi_pathinfo_t *)(MDI_PI(cpip)->pi_client_link)) {
5277 		if ((cpip == pip) || MDI_PI_IS_OFFLINE(pip))
5278 			continue;
5279 		/*
5280 		 * We have found a different path with same parent
5281 		 * kstats for a given client-pHCI are common
5282 		 */
5283 		if ((MDI_PI(cpip)->pi_phci->ph_dip == ppath) &&
5284 		    (MDI_PI(cpip)->pi_kstats != NULL)) {
5285 			MDI_PI(cpip)->pi_kstats->pi_kstat_ref++;
5286 			MDI_PI(pip)->pi_kstats = MDI_PI(cpip)->pi_kstats;
5287 			return (MDI_SUCCESS);
5288 		}
5289 	}
5290 
5291 	/*
5292 	 * stats are named as follows: TGTx.HBAy, e.g. "ssd0.fp0"
5293 	 * clamp length of name against max length of error kstat name
5294 	 */
5295 	if (snprintf(ksname, KSTAT_STRLEN, "%s%d.%s%d",
5296 	    ddi_driver_name(client), ddi_get_instance(client),
5297 	    ddi_driver_name(ppath), ddi_get_instance(ppath)) >
5298 	    (KSTAT_STRLEN - strlen(err_postfix))) {
5299 		return (MDI_FAILURE);
5300 	}
5301 	if ((kiosp = kstat_create("mdi", 0, ksname, "iopath",
5302 	    KSTAT_TYPE_IO, 1, 0)) == NULL) {
5303 		return (MDI_FAILURE);
5304 	}
5305 
5306 	(void) strcat(ksname, err_postfix);
5307 	kerrsp = kstat_create("mdi", 0, ksname, "iopath_errors",
5308 	    KSTAT_TYPE_NAMED,
5309 	    sizeof (struct pi_errs) / sizeof (kstat_named_t), 0);
5310 
5311 	if (kerrsp == NULL) {
5312 		kstat_delete(kiosp);
5313 		return (MDI_FAILURE);
5314 	}
5315 
5316 	nsp = (struct pi_errs *)kerrsp->ks_data;
5317 	kstat_named_init(&nsp->pi_softerrs, "Soft Errors", KSTAT_DATA_UINT32);
5318 	kstat_named_init(&nsp->pi_harderrs, "Hard Errors", KSTAT_DATA_UINT32);
5319 	kstat_named_init(&nsp->pi_transerrs, "Transport Errors",
5320 	    KSTAT_DATA_UINT32);
5321 	kstat_named_init(&nsp->pi_icnt_busy, "Interconnect Busy",
5322 	    KSTAT_DATA_UINT32);
5323 	kstat_named_init(&nsp->pi_icnt_errors, "Interconnect Errors",
5324 	    KSTAT_DATA_UINT32);
5325 	kstat_named_init(&nsp->pi_phci_rsrc, "pHCI No Resources",
5326 	    KSTAT_DATA_UINT32);
5327 	kstat_named_init(&nsp->pi_phci_localerr, "pHCI Local Errors",
5328 	    KSTAT_DATA_UINT32);
5329 	kstat_named_init(&nsp->pi_phci_invstate, "pHCI Invalid State",
5330 	    KSTAT_DATA_UINT32);
5331 	kstat_named_init(&nsp->pi_failedfrom, "Failed From",
5332 	    KSTAT_DATA_UINT32);
5333 	kstat_named_init(&nsp->pi_failedto, "Failed To", KSTAT_DATA_UINT32);
5334 
5335 	mdi_statp = kmem_alloc(sizeof (*mdi_statp), KM_SLEEP);
5336 	mdi_statp->pi_kstat_ref = 1;
5337 	mdi_statp->pi_kstat_iostats = kiosp;
5338 	mdi_statp->pi_kstat_errstats = kerrsp;
5339 	kstat_install(kiosp);
5340 	kstat_install(kerrsp);
5341 	MDI_PI(pip)->pi_kstats = mdi_statp;
5342 	return (MDI_SUCCESS);
5343 }
5344 
5345 /*
5346  * destroy per-path properties
5347  */
5348 static void
5349 i_mdi_pi_kstat_destroy(mdi_pathinfo_t *pip)
5350 {
5351 
5352 	struct mdi_pi_kstats *mdi_statp;
5353 
5354 	if ((mdi_statp = MDI_PI(pip)->pi_kstats) == NULL)
5355 		return;
5356 
5357 	MDI_PI(pip)->pi_kstats = NULL;
5358 
5359 	/*
5360 	 * the kstat may be shared between multiple pathinfo nodes
5361 	 * decrement this pathinfo's usage, removing the kstats
5362 	 * themselves when the last pathinfo reference is removed.
5363 	 */
5364 	ASSERT(mdi_statp->pi_kstat_ref > 0);
5365 	if (--mdi_statp->pi_kstat_ref != 0)
5366 		return;
5367 
5368 	kstat_delete(mdi_statp->pi_kstat_iostats);
5369 	kstat_delete(mdi_statp->pi_kstat_errstats);
5370 	kmem_free(mdi_statp, sizeof (*mdi_statp));
5371 }
5372 
5373 /*
5374  * update I/O paths KSTATS
5375  */
5376 void
5377 mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp)
5378 {
5379 	kstat_t *iostatp;
5380 	size_t xfer_cnt;
5381 
5382 	ASSERT(pip != NULL);
5383 
5384 	/*
5385 	 * I/O can be driven across a path prior to having path
5386 	 * statistics available, i.e. probe(9e).
5387 	 */
5388 	if (bp != NULL && MDI_PI(pip)->pi_kstats != NULL) {
5389 		iostatp = MDI_PI(pip)->pi_kstats->pi_kstat_iostats;
5390 		xfer_cnt = bp->b_bcount - bp->b_resid;
5391 		if (bp->b_flags & B_READ) {
5392 			KSTAT_IO_PTR(iostatp)->reads++;
5393 			KSTAT_IO_PTR(iostatp)->nread += xfer_cnt;
5394 		} else {
5395 			KSTAT_IO_PTR(iostatp)->writes++;
5396 			KSTAT_IO_PTR(iostatp)->nwritten += xfer_cnt;
5397 		}
5398 	}
5399 }
5400 
5401 /*
5402  * Enable the path(specific client/target/initiator)
5403  * Enabling a path means that MPxIO may select the enabled path for routing
5404  * future I/O requests, subject to other path state constraints.
5405  */
5406 int
5407 mdi_pi_enable_path(mdi_pathinfo_t *pip, int flags)
5408 {
5409 	mdi_phci_t	*ph;
5410 
5411 	ph = i_devi_get_phci(mdi_pi_get_phci(pip));
5412 	if (ph == NULL) {
5413 		MDI_DEBUG(1, (CE_NOTE, NULL, "!mdi_pi_enable_path:"
5414 			" failed. pip: %p ph = NULL\n", (void *)pip));
5415 		return (MDI_FAILURE);
5416 	}
5417 
5418 	(void) i_mdi_enable_disable_path(pip, ph->ph_vhci, flags,
5419 		MDI_ENABLE_OP);
5420 	MDI_DEBUG(5, (CE_NOTE, NULL, "!mdi_pi_enable_path:"
5421 		" Returning success pip = %p. ph = %p\n",
5422 		(void *)pip, (void *)ph));
5423 	return (MDI_SUCCESS);
5424 
5425 }
5426 
5427 /*
5428  * Disable the path (specific client/target/initiator)
5429  * Disabling a path means that MPxIO will not select the disabled path for
5430  * routing any new I/O requests.
5431  */
5432 int
5433 mdi_pi_disable_path(mdi_pathinfo_t *pip, int flags)
5434 {
5435 	mdi_phci_t	*ph;
5436 
5437 	ph = i_devi_get_phci(mdi_pi_get_phci(pip));
5438 	if (ph == NULL) {
5439 		MDI_DEBUG(1, (CE_NOTE, NULL, "!mdi_pi_disable_path:"
5440 			" failed. pip: %p ph = NULL\n", (void *)pip));
5441 		return (MDI_FAILURE);
5442 	}
5443 
5444 	(void) i_mdi_enable_disable_path(pip,
5445 			ph->ph_vhci, flags, MDI_DISABLE_OP);
5446 	MDI_DEBUG(5, (CE_NOTE, NULL, "!mdi_pi_disable_path:"
5447 		"Returning success pip = %p. ph = %p",
5448 		(void *)pip, (void *)ph));
5449 	return (MDI_SUCCESS);
5450 }
5451 
5452 /*
5453  * disable the path to a particular pHCI (pHCI specified in the phci_path
5454  * argument) for a particular client (specified in the client_path argument).
5455  * Disabling a path means that MPxIO will not select the disabled path for
5456  * routing any new I/O requests.
5457  * NOTE: this will be removed once the NWS files are changed to use the new
5458  * mdi_{enable,disable}_path interfaces
5459  */
5460 int
5461 mdi_pi_disable(dev_info_t *cdip, dev_info_t *pdip, int flags)
5462 {
5463 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_DISABLE_OP));
5464 }
5465 
5466 /*
5467  * Enable the path to a particular pHCI (pHCI specified in the phci_path
5468  * argument) for a particular client (specified in the client_path argument).
5469  * Enabling a path means that MPxIO may select the enabled path for routing
5470  * future I/O requests, subject to other path state constraints.
5471  * NOTE: this will be removed once the NWS files are changed to use the new
5472  * mdi_{enable,disable}_path interfaces
5473  */
5474 
5475 int
5476 mdi_pi_enable(dev_info_t *cdip, dev_info_t *pdip, int flags)
5477 {
5478 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_ENABLE_OP));
5479 }
5480 
5481 /*
5482  * Common routine for doing enable/disable.
5483  */
5484 static mdi_pathinfo_t *
5485 i_mdi_enable_disable_path(mdi_pathinfo_t *pip, mdi_vhci_t *vh, int flags,
5486 		int op)
5487 {
5488 	int		sync_flag = 0;
5489 	int		rv;
5490 	mdi_pathinfo_t 	*next;
5491 	int		(*f)() = NULL;
5492 
5493 	f = vh->vh_ops->vo_pi_state_change;
5494 
5495 	sync_flag = (flags << 8) & 0xf00;
5496 
5497 	/*
5498 	 * Do a callback into the mdi consumer to let it
5499 	 * know that path is about to get enabled/disabled.
5500 	 */
5501 	if (f != NULL) {
5502 		rv = (*f)(vh->vh_dip, pip, 0,
5503 			MDI_PI_EXT_STATE(pip),
5504 			MDI_EXT_STATE_CHANGE | sync_flag |
5505 			op | MDI_BEFORE_STATE_CHANGE);
5506 		if (rv != MDI_SUCCESS) {
5507 			MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5508 			"!vo_pi_state_change: failed rv = %x", rv));
5509 		}
5510 	}
5511 	MDI_PI_LOCK(pip);
5512 	next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5513 
5514 	switch (flags) {
5515 		case USER_DISABLE:
5516 			if (op == MDI_DISABLE_OP) {
5517 				MDI_PI_SET_USER_DISABLE(pip);
5518 			} else {
5519 				MDI_PI_SET_USER_ENABLE(pip);
5520 			}
5521 			break;
5522 		case DRIVER_DISABLE:
5523 			if (op == MDI_DISABLE_OP) {
5524 				MDI_PI_SET_DRV_DISABLE(pip);
5525 			} else {
5526 				MDI_PI_SET_DRV_ENABLE(pip);
5527 			}
5528 			break;
5529 		case DRIVER_DISABLE_TRANSIENT:
5530 			if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS) {
5531 				MDI_PI_SET_DRV_DISABLE_TRANS(pip);
5532 			} else {
5533 				MDI_PI_SET_DRV_ENABLE_TRANS(pip);
5534 			}
5535 			break;
5536 	}
5537 	MDI_PI_UNLOCK(pip);
5538 	/*
5539 	 * Do a callback into the mdi consumer to let it
5540 	 * know that path is now enabled/disabled.
5541 	 */
5542 	if (f != NULL) {
5543 		rv = (*f)(vh->vh_dip, pip, 0,
5544 			MDI_PI_EXT_STATE(pip),
5545 			MDI_EXT_STATE_CHANGE | sync_flag |
5546 			op | MDI_AFTER_STATE_CHANGE);
5547 		if (rv != MDI_SUCCESS) {
5548 			MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5549 			"!vo_pi_state_change: failed rv = %x", rv));
5550 		}
5551 	}
5552 	return (next);
5553 }
5554 
5555 /*
5556  * Common routine for doing enable/disable.
5557  * NOTE: this will be removed once the NWS files are changed to use the new
5558  * mdi_{enable,disable}_path has been putback
5559  */
5560 int
5561 i_mdi_pi_enable_disable(dev_info_t *cdip, dev_info_t *pdip, int flags, int op)
5562 {
5563 
5564 	mdi_phci_t	*ph;
5565 	mdi_vhci_t	*vh = NULL;
5566 	mdi_client_t	*ct;
5567 	mdi_pathinfo_t	*next, *pip;
5568 	int		found_it;
5569 
5570 	ph = i_devi_get_phci(pdip);
5571 	MDI_DEBUG(5, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
5572 		"Op = %d pdip = %p cdip = %p\n", op, (void *)pdip,
5573 		(void *)cdip));
5574 	if (ph == NULL) {
5575 		MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
5576 			"Op %d failed. ph = NULL\n", op));
5577 		return (MDI_FAILURE);
5578 	}
5579 
5580 	if ((op != MDI_ENABLE_OP) && (op != MDI_DISABLE_OP)) {
5581 		MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
5582 			"Op Invalid operation = %d\n", op));
5583 		return (MDI_FAILURE);
5584 	}
5585 
5586 	vh = ph->ph_vhci;
5587 
5588 	if (cdip == NULL) {
5589 		/*
5590 		 * Need to mark the Phci as enabled/disabled.
5591 		 */
5592 		MDI_DEBUG(3, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
5593 		"Op %d for the phci\n", op));
5594 		MDI_PHCI_LOCK(ph);
5595 		switch (flags) {
5596 			case USER_DISABLE:
5597 				if (op == MDI_DISABLE_OP) {
5598 					MDI_PHCI_SET_USER_DISABLE(ph);
5599 				} else {
5600 					MDI_PHCI_SET_USER_ENABLE(ph);
5601 				}
5602 				break;
5603 			case DRIVER_DISABLE:
5604 				if (op == MDI_DISABLE_OP) {
5605 					MDI_PHCI_SET_DRV_DISABLE(ph);
5606 				} else {
5607 					MDI_PHCI_SET_DRV_ENABLE(ph);
5608 				}
5609 				break;
5610 			case DRIVER_DISABLE_TRANSIENT:
5611 				if (op == MDI_DISABLE_OP) {
5612 					MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph);
5613 				} else {
5614 					MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph);
5615 				}
5616 				break;
5617 			default:
5618 				MDI_PHCI_UNLOCK(ph);
5619 				MDI_DEBUG(1, (CE_NOTE, NULL,
5620 				"!i_mdi_pi_enable_disable:"
5621 				" Invalid flag argument= %d\n", flags));
5622 		}
5623 
5624 		/*
5625 		 * Phci has been disabled. Now try to enable/disable
5626 		 * path info's to each client.
5627 		 */
5628 		pip = ph->ph_path_head;
5629 		while (pip != NULL) {
5630 			pip = i_mdi_enable_disable_path(pip, vh, flags, op);
5631 		}
5632 		MDI_PHCI_UNLOCK(ph);
5633 	} else {
5634 
5635 		/*
5636 		 * Disable a specific client.
5637 		 */
5638 		ct = i_devi_get_client(cdip);
5639 		if (ct == NULL) {
5640 			MDI_DEBUG(1, (CE_NOTE, NULL,
5641 			"!i_mdi_pi_enable_disable:"
5642 			" failed. ct = NULL operation = %d\n", op));
5643 			return (MDI_FAILURE);
5644 		}
5645 
5646 		MDI_CLIENT_LOCK(ct);
5647 		pip = ct->ct_path_head;
5648 		found_it = 0;
5649 		while (pip != NULL) {
5650 			MDI_PI_LOCK(pip);
5651 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
5652 			if (MDI_PI(pip)->pi_phci == ph) {
5653 				MDI_PI_UNLOCK(pip);
5654 				found_it = 1;
5655 				break;
5656 			}
5657 			MDI_PI_UNLOCK(pip);
5658 			pip = next;
5659 		}
5660 
5661 
5662 		MDI_CLIENT_UNLOCK(ct);
5663 		if (found_it == 0) {
5664 			MDI_DEBUG(1, (CE_NOTE, NULL,
5665 			"!i_mdi_pi_enable_disable:"
5666 			" failed. Could not find corresponding pip\n"));
5667 			return (MDI_FAILURE);
5668 		}
5669 
5670 		(void) i_mdi_enable_disable_path(pip, vh, flags, op);
5671 	}
5672 
5673 	MDI_DEBUG(5, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
5674 		"Op %d Returning success pdip = %p cdip = %p\n",
5675 		op, (void *)pdip, (void *)cdip));
5676 	return (MDI_SUCCESS);
5677 }
5678 
5679 /*
5680  * Ensure phci powered up
5681  */
5682 static void
5683 i_mdi_pm_hold_pip(mdi_pathinfo_t *pip)
5684 {
5685 	dev_info_t	*ph_dip;
5686 
5687 	ASSERT(pip != NULL);
5688 	ASSERT(MDI_PI_LOCKED(pip));
5689 
5690 	if (MDI_PI(pip)->pi_pm_held) {
5691 		return;
5692 	}
5693 
5694 	ph_dip = mdi_pi_get_phci(pip);
5695 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_pm_hold_pip for %s%d %p\n",
5696 	    ddi_get_name(ph_dip), ddi_get_instance(ph_dip), (void *)pip));
5697 	if (ph_dip == NULL) {
5698 		return;
5699 	}
5700 
5701 	MDI_PI_UNLOCK(pip);
5702 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt was %d\n",
5703 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5704 
5705 	pm_hold_power(ph_dip);
5706 
5707 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt is %d\n",
5708 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5709 	MDI_PI_LOCK(pip);
5710 
5711 	/* If PM_GET_PM_INFO is NULL the pm_hold_power above was a noop */
5712 	if (DEVI(ph_dip)->devi_pm_info)
5713 		MDI_PI(pip)->pi_pm_held = 1;
5714 }
5715 
5716 /*
5717  * Allow phci powered down
5718  */
5719 static void
5720 i_mdi_pm_rele_pip(mdi_pathinfo_t *pip)
5721 {
5722 	dev_info_t	*ph_dip = NULL;
5723 
5724 	ASSERT(pip != NULL);
5725 	ASSERT(MDI_PI_LOCKED(pip));
5726 
5727 	if (MDI_PI(pip)->pi_pm_held == 0) {
5728 		return;
5729 	}
5730 
5731 	ph_dip = mdi_pi_get_phci(pip);
5732 	ASSERT(ph_dip != NULL);
5733 
5734 	MDI_PI_UNLOCK(pip);
5735 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_pm_rele_pip for %s%d %p\n",
5736 	    ddi_get_name(ph_dip), ddi_get_instance(ph_dip), (void *)pip));
5737 
5738 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt was %d\n",
5739 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5740 	pm_rele_power(ph_dip);
5741 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt is %d\n",
5742 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5743 
5744 	MDI_PI_LOCK(pip);
5745 	MDI_PI(pip)->pi_pm_held = 0;
5746 }
5747 
5748 static void
5749 i_mdi_pm_hold_client(mdi_client_t *ct, int incr)
5750 {
5751 	ASSERT(MDI_CLIENT_LOCKED(ct));
5752 
5753 	ct->ct_power_cnt += incr;
5754 	MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_hold_client %p "
5755 	    "ct_power_cnt = %d incr = %d\n", (void *)ct,
5756 	    ct->ct_power_cnt, incr));
5757 	ASSERT(ct->ct_power_cnt >= 0);
5758 }
5759 
5760 static void
5761 i_mdi_rele_all_phci(mdi_client_t *ct)
5762 {
5763 	mdi_pathinfo_t  *pip;
5764 
5765 	ASSERT(MDI_CLIENT_LOCKED(ct));
5766 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
5767 	while (pip != NULL) {
5768 		mdi_hold_path(pip);
5769 		MDI_PI_LOCK(pip);
5770 		i_mdi_pm_rele_pip(pip);
5771 		MDI_PI_UNLOCK(pip);
5772 		mdi_rele_path(pip);
5773 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
5774 	}
5775 }
5776 
5777 static void
5778 i_mdi_pm_rele_client(mdi_client_t *ct, int decr)
5779 {
5780 	ASSERT(MDI_CLIENT_LOCKED(ct));
5781 
5782 	if (i_ddi_devi_attached(ct->ct_dip)) {
5783 		ct->ct_power_cnt -= decr;
5784 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_rele_client %p "
5785 		    "ct_power_cnt = %d decr = %d\n",
5786 		    (void *)ct, ct->ct_power_cnt, decr));
5787 	}
5788 
5789 	ASSERT(ct->ct_power_cnt >= 0);
5790 	if (ct->ct_power_cnt == 0) {
5791 		i_mdi_rele_all_phci(ct);
5792 		return;
5793 	}
5794 }
5795 
5796 static void
5797 i_mdi_pm_reset_client(mdi_client_t *ct)
5798 {
5799 	MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_reset_client %p "
5800 	    "ct_power_cnt = %d\n", (void *)ct, ct->ct_power_cnt));
5801 	ASSERT(MDI_CLIENT_LOCKED(ct));
5802 	ct->ct_power_cnt = 0;
5803 	i_mdi_rele_all_phci(ct);
5804 	ct->ct_powercnt_config = 0;
5805 	ct->ct_powercnt_unconfig = 0;
5806 	ct->ct_powercnt_reset = 1;
5807 }
5808 
5809 static int
5810 i_mdi_power_one_phci(mdi_pathinfo_t *pip)
5811 {
5812 	int		ret;
5813 	dev_info_t	*ph_dip;
5814 
5815 	MDI_PI_LOCK(pip);
5816 	i_mdi_pm_hold_pip(pip);
5817 
5818 	ph_dip = mdi_pi_get_phci(pip);
5819 	MDI_PI_UNLOCK(pip);
5820 
5821 	/* bring all components of phci to full power */
5822 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_power_one_phci "
5823 	    "pm_powerup for %s%d %p\n", ddi_get_name(ph_dip),
5824 	    ddi_get_instance(ph_dip), (void *)pip));
5825 
5826 	ret = pm_powerup(ph_dip);
5827 
5828 	if (ret == DDI_FAILURE) {
5829 		MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_power_one_phci "
5830 		    "pm_powerup FAILED for %s%d %p\n",
5831 		    ddi_get_name(ph_dip), ddi_get_instance(ph_dip),
5832 		    (void *)pip));
5833 
5834 		MDI_PI_LOCK(pip);
5835 		i_mdi_pm_rele_pip(pip);
5836 		MDI_PI_UNLOCK(pip);
5837 		return (MDI_FAILURE);
5838 	}
5839 
5840 	return (MDI_SUCCESS);
5841 }
5842 
5843 static int
5844 i_mdi_power_all_phci(mdi_client_t *ct)
5845 {
5846 	mdi_pathinfo_t  *pip;
5847 	int		succeeded = 0;
5848 
5849 	ASSERT(MDI_CLIENT_LOCKED(ct));
5850 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
5851 	while (pip != NULL) {
5852 		/*
5853 		 * Don't power if MDI_PATHINFO_STATE_FAULT
5854 		 * or MDI_PATHINFO_STATE_OFFLINE.
5855 		 */
5856 		if (MDI_PI_IS_INIT(pip) ||
5857 		    MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip)) {
5858 			mdi_hold_path(pip);
5859 			MDI_CLIENT_UNLOCK(ct);
5860 			if (i_mdi_power_one_phci(pip) == MDI_SUCCESS)
5861 				succeeded = 1;
5862 
5863 			ASSERT(ct == MDI_PI(pip)->pi_client);
5864 			MDI_CLIENT_LOCK(ct);
5865 			mdi_rele_path(pip);
5866 		}
5867 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
5868 	}
5869 
5870 	return (succeeded ? MDI_SUCCESS : MDI_FAILURE);
5871 }
5872 
/*
 * mdi_bus_power():
 *		1. Place the phci(s) into powered up state so that
 *		   client can do power management
 *		2. Ensure phci powered up as client power managing
 *
 *		Only BUS_POWER_PRE_NOTIFICATION, BUS_POWER_POST_NOTIFICATION
 *		and BUS_POWER_HAS_CHANGED are handled here.
 *		BUS_POWER_NOINVOL is rejected and every other op is passed
 *		on to pm_busop_bus_power().
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		(or whatever pm_busop_bus_power() returns for ops that are
 *		passed through)
 */
int
mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
    void *arg, void *result)
{
	int			ret = MDI_SUCCESS;
	pm_bp_child_pwrchg_t	*bpc;
	mdi_client_t		*ct;
	dev_info_t		*cdip;
	pm_bp_has_changed_t	*bphc;

	/*
	 * BUS_POWER_NOINVOL not supported
	 */
	if (op == BUS_POWER_NOINVOL)
		return (MDI_FAILURE);

	/*
	 * ignore other OPs.
	 * return quickly to save cpu cycles on the ct processing
	 */
	switch (op) {
	case BUS_POWER_PRE_NOTIFICATION:
	case BUS_POWER_POST_NOTIFICATION:
		/* arg describes a child power level change */
		bpc = (pm_bp_child_pwrchg_t *)arg;
		cdip = bpc->bpc_dip;
		break;
	case BUS_POWER_HAS_CHANGED:
		/* arg describes a power level change that already happened */
		bphc = (pm_bp_has_changed_t *)arg;
		cdip = bphc->bphc_dip;
		break;
	default:
		return (pm_busop_bus_power(parent, impl_arg, op, arg, result));
	}

	ASSERT(MDI_CLIENT(cdip));

	ct = i_devi_get_client(cdip);
	if (ct == NULL)
		return (MDI_FAILURE);

	/*
	 * wait till the mdi_pathinfo node state change are processed
	 */
	MDI_CLIENT_LOCK(ct);
	switch (op) {
	case BUS_POWER_PRE_NOTIFICATION:
		MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip, "mdi_bus_power "
		    "BUS_POWER_PRE_NOTIFICATION:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d\n",
		    PM_NAME(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp));

		/* serialize power level change per client */
		while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
			cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

		/* cleared again by the matching POST_NOTIFICATION below */
		MDI_CLIENT_SET_POWER_TRANSITION(ct);

		/* no holds outstanding: power up the pHCIs first */
		if (ct->ct_power_cnt == 0) {
			ret = i_mdi_power_all_phci(ct);
		}

		/*
		 * if new_level > 0:
		 *	- hold phci(s)
		 *	- power up phci(s) if not already
		 * ignore power down
		 */
		if (bpc->bpc_nlevel > 0) {
			if (!DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
				    "mdi_bus_power i_mdi_pm_hold_client\n"));
				i_mdi_pm_hold_client(ct, ct->ct_path_count);
			}
		}
		break;
	case BUS_POWER_POST_NOTIFICATION:
		MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip, "mdi_bus_power "
		    "BUS_POWER_POST_NOTIFICATION:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d result=%d\n",
		    PM_NAME(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp,
		    *(int *)result));

		/* record the new power state only if the change succeeded */
		if (*(int *)result == DDI_SUCCESS) {
			if (bpc->bpc_nlevel > 0) {
				MDI_CLIENT_SET_POWER_UP(ct);
			} else {
				MDI_CLIENT_SET_POWER_DOWN(ct);
			}
		}

		/* release the hold we did in pre-notification */
		if (bpc->bpc_nlevel > 0 && (*(int *)result != DDI_SUCCESS) &&
		    !DEVI_IS_ATTACHING(ct->ct_dip)) {
			MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
			    "mdi_bus_power i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		}

		/* successful power down */
		if (bpc->bpc_nlevel == 0 && (*(int *)result == DDI_SUCCESS)) {
			/* another thread might started attaching */
			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
				    "mdi_bus_power i_mdi_pm_rele_client\n"));
				i_mdi_pm_rele_client(ct, ct->ct_path_count);
			/* detaching has been taken care in pm_post_unconfig */
			} else if (!DEVI_IS_DETACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
				    "mdi_bus_power i_mdi_pm_reset_client\n"));
				i_mdi_pm_reset_client(ct);
			}
		}

		/* end of the transition; wake up everybody waiting on it */
		MDI_CLIENT_CLEAR_POWER_TRANSITION(ct);
		cv_broadcast(&ct->ct_powerchange_cv);

		break;

	/* need to do more */
	case BUS_POWER_HAS_CHANGED:
		MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip, "mdi_bus_power "
		    "BUS_POWER_HAS_CHANGED:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d\n",
		    PM_NAME(bphc->bphc_dip), PM_ADDR(bphc->bphc_dip),
		    bphc->bphc_olevel, bphc->bphc_nlevel, bphc->bphc_comp));

		/* level was raised: power up (if needed) and hold the pHCIs */
		if (bphc->bphc_nlevel > 0 &&
		    bphc->bphc_nlevel > bphc->bphc_olevel) {
			if (ct->ct_power_cnt == 0) {
				ret = i_mdi_power_all_phci(ct);
			}
			MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip,
			    "mdi_bus_power i_mdi_pm_hold_client\n"));
			i_mdi_pm_hold_client(ct, ct->ct_path_count);
		}

		/* dropped to level 0 from an old level other than -1 */
		if (bphc->bphc_nlevel == 0 && bphc->bphc_olevel != -1) {
			MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip,
			    "mdi_bus_power i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		}
		break;
	}

	MDI_CLIENT_UNLOCK(ct);
	return (ret);
}
6030 
6031 static int
6032 i_mdi_pm_pre_config_one(dev_info_t *child)
6033 {
6034 	int		ret = MDI_SUCCESS;
6035 	mdi_client_t	*ct;
6036 
6037 	ct = i_devi_get_client(child);
6038 	if (ct == NULL)
6039 		return (MDI_FAILURE);
6040 
6041 	MDI_CLIENT_LOCK(ct);
6042 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6043 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6044 
6045 	if (!MDI_CLIENT_IS_FAILED(ct)) {
6046 		MDI_CLIENT_UNLOCK(ct);
6047 		MDI_DEBUG(4, (CE_NOTE, child,
6048 		    "i_mdi_pm_pre_config_one already configured\n"));
6049 		return (MDI_SUCCESS);
6050 	}
6051 
6052 	if (ct->ct_powercnt_config) {
6053 		MDI_CLIENT_UNLOCK(ct);
6054 		MDI_DEBUG(4, (CE_NOTE, child,
6055 		    "i_mdi_pm_pre_config_one ALREADY held\n"));
6056 		return (MDI_SUCCESS);
6057 	}
6058 
6059 	if (ct->ct_power_cnt == 0) {
6060 		ret = i_mdi_power_all_phci(ct);
6061 	}
6062 	MDI_DEBUG(4, (CE_NOTE, child,
6063 	    "i_mdi_pm_pre_config_one i_mdi_pm_hold_client\n"));
6064 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6065 	ct->ct_powercnt_config = 1;
6066 	ct->ct_powercnt_reset = 0;
6067 	MDI_CLIENT_UNLOCK(ct);
6068 	return (ret);
6069 }
6070 
6071 static int
6072 i_mdi_pm_pre_config(dev_info_t *vdip, dev_info_t *child)
6073 {
6074 	int			ret = MDI_SUCCESS;
6075 	dev_info_t		*cdip;
6076 	int			circ;
6077 
6078 	ASSERT(MDI_VHCI(vdip));
6079 
6080 	/* ndi_devi_config_one */
6081 	if (child) {
6082 		ASSERT(DEVI_BUSY_OWNED(vdip));
6083 		return (i_mdi_pm_pre_config_one(child));
6084 	}
6085 
6086 	/* devi_config_common */
6087 	ndi_devi_enter(vdip, &circ);
6088 	cdip = ddi_get_child(vdip);
6089 	while (cdip) {
6090 		dev_info_t *next = ddi_get_next_sibling(cdip);
6091 
6092 		ret = i_mdi_pm_pre_config_one(cdip);
6093 		if (ret != MDI_SUCCESS)
6094 			break;
6095 		cdip = next;
6096 	}
6097 	ndi_devi_exit(vdip, circ);
6098 	return (ret);
6099 }
6100 
6101 static int
6102 i_mdi_pm_pre_unconfig_one(dev_info_t *child, int *held, int flags)
6103 {
6104 	int		ret = MDI_SUCCESS;
6105 	mdi_client_t	*ct;
6106 
6107 	ct = i_devi_get_client(child);
6108 	if (ct == NULL)
6109 		return (MDI_FAILURE);
6110 
6111 	MDI_CLIENT_LOCK(ct);
6112 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6113 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6114 
6115 	if (!i_ddi_devi_attached(ct->ct_dip)) {
6116 		MDI_DEBUG(4, (CE_NOTE, child,
6117 		    "i_mdi_pm_pre_unconfig node detached already\n"));
6118 		MDI_CLIENT_UNLOCK(ct);
6119 		return (MDI_SUCCESS);
6120 	}
6121 
6122 	if (MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6123 	    (flags & NDI_AUTODETACH)) {
6124 		MDI_DEBUG(4, (CE_NOTE, child,
6125 		    "i_mdi_pm_pre_unconfig auto-modunload\n"));
6126 		MDI_CLIENT_UNLOCK(ct);
6127 		return (MDI_FAILURE);
6128 	}
6129 
6130 	if (ct->ct_powercnt_unconfig) {
6131 		MDI_DEBUG(4, (CE_NOTE, child,
6132 		    "i_mdi_pm_pre_unconfig ct_powercnt_held\n"));
6133 		MDI_CLIENT_UNLOCK(ct);
6134 		*held = 1;
6135 		return (MDI_SUCCESS);
6136 	}
6137 
6138 	if (ct->ct_power_cnt == 0) {
6139 		ret = i_mdi_power_all_phci(ct);
6140 	}
6141 	MDI_DEBUG(4, (CE_NOTE, child,
6142 	    "i_mdi_pm_pre_unconfig i_mdi_pm_hold_client\n"));
6143 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6144 	ct->ct_powercnt_unconfig = 1;
6145 	ct->ct_powercnt_reset = 0;
6146 	MDI_CLIENT_UNLOCK(ct);
6147 	if (ret == MDI_SUCCESS)
6148 		*held = 1;
6149 	return (ret);
6150 }
6151 
6152 static int
6153 i_mdi_pm_pre_unconfig(dev_info_t *vdip, dev_info_t *child, int *held,
6154     int flags)
6155 {
6156 	int			ret = MDI_SUCCESS;
6157 	dev_info_t		*cdip;
6158 	int			circ;
6159 
6160 	ASSERT(MDI_VHCI(vdip));
6161 	*held = 0;
6162 
6163 	/* ndi_devi_unconfig_one */
6164 	if (child) {
6165 		ASSERT(DEVI_BUSY_OWNED(vdip));
6166 		return (i_mdi_pm_pre_unconfig_one(child, held, flags));
6167 	}
6168 
6169 	/* devi_unconfig_common */
6170 	ndi_devi_enter(vdip, &circ);
6171 	cdip = ddi_get_child(vdip);
6172 	while (cdip) {
6173 		dev_info_t *next = ddi_get_next_sibling(cdip);
6174 
6175 		ret = i_mdi_pm_pre_unconfig_one(cdip, held, flags);
6176 		cdip = next;
6177 	}
6178 	ndi_devi_exit(vdip, circ);
6179 
6180 	if (*held)
6181 		ret = MDI_SUCCESS;
6182 
6183 	return (ret);
6184 }
6185 
6186 static void
6187 i_mdi_pm_post_config_one(dev_info_t *child)
6188 {
6189 	mdi_client_t	*ct;
6190 
6191 	ct = i_devi_get_client(child);
6192 	if (ct == NULL)
6193 		return;
6194 
6195 	MDI_CLIENT_LOCK(ct);
6196 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6197 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6198 
6199 	if (ct->ct_powercnt_reset || !ct->ct_powercnt_config) {
6200 		MDI_DEBUG(4, (CE_NOTE, child,
6201 		    "i_mdi_pm_post_config_one NOT configured\n"));
6202 		MDI_CLIENT_UNLOCK(ct);
6203 		return;
6204 	}
6205 
6206 	/* client has not been updated */
6207 	if (MDI_CLIENT_IS_FAILED(ct)) {
6208 		MDI_DEBUG(4, (CE_NOTE, child,
6209 		    "i_mdi_pm_post_config_one NOT configured\n"));
6210 		MDI_CLIENT_UNLOCK(ct);
6211 		return;
6212 	}
6213 
6214 	/* another thread might have powered it down or detached it */
6215 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6216 	    !DEVI_IS_ATTACHING(ct->ct_dip)) ||
6217 	    (!i_ddi_devi_attached(ct->ct_dip) &&
6218 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6219 		MDI_DEBUG(4, (CE_NOTE, child,
6220 		    "i_mdi_pm_post_config i_mdi_pm_reset_client\n"));
6221 		i_mdi_pm_reset_client(ct);
6222 	} else {
6223 		mdi_pathinfo_t  *pip, *next;
6224 		int	valid_path_count = 0;
6225 
6226 		MDI_DEBUG(4, (CE_NOTE, child,
6227 		    "i_mdi_pm_post_config i_mdi_pm_rele_client\n"));
6228 		pip = ct->ct_path_head;
6229 		while (pip != NULL) {
6230 			MDI_PI_LOCK(pip);
6231 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6232 			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
6233 				valid_path_count ++;
6234 			MDI_PI_UNLOCK(pip);
6235 			pip = next;
6236 		}
6237 		i_mdi_pm_rele_client(ct, valid_path_count);
6238 	}
6239 	ct->ct_powercnt_config = 0;
6240 	MDI_CLIENT_UNLOCK(ct);
6241 }
6242 
6243 static void
6244 i_mdi_pm_post_config(dev_info_t *vdip, dev_info_t *child)
6245 {
6246 	int		circ;
6247 	dev_info_t	*cdip;
6248 
6249 	ASSERT(MDI_VHCI(vdip));
6250 
6251 	/* ndi_devi_config_one */
6252 	if (child) {
6253 		ASSERT(DEVI_BUSY_OWNED(vdip));
6254 		i_mdi_pm_post_config_one(child);
6255 		return;
6256 	}
6257 
6258 	/* devi_config_common */
6259 	ndi_devi_enter(vdip, &circ);
6260 	cdip = ddi_get_child(vdip);
6261 	while (cdip) {
6262 		dev_info_t *next = ddi_get_next_sibling(cdip);
6263 
6264 		i_mdi_pm_post_config_one(cdip);
6265 		cdip = next;
6266 	}
6267 	ndi_devi_exit(vdip, circ);
6268 }
6269 
6270 static void
6271 i_mdi_pm_post_unconfig_one(dev_info_t *child)
6272 {
6273 	mdi_client_t	*ct;
6274 
6275 	ct = i_devi_get_client(child);
6276 	if (ct == NULL)
6277 		return;
6278 
6279 	MDI_CLIENT_LOCK(ct);
6280 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6281 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6282 
6283 	if (!ct->ct_powercnt_unconfig || ct->ct_powercnt_reset) {
6284 		MDI_DEBUG(4, (CE_NOTE, child,
6285 		    "i_mdi_pm_post_unconfig NOT held\n"));
6286 		MDI_CLIENT_UNLOCK(ct);
6287 		return;
6288 	}
6289 
6290 	/* failure detaching or another thread just attached it */
6291 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6292 	    i_ddi_devi_attached(ct->ct_dip)) ||
6293 	    (!i_ddi_devi_attached(ct->ct_dip) &&
6294 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6295 		MDI_DEBUG(4, (CE_NOTE, child,
6296 		    "i_mdi_pm_post_unconfig i_mdi_pm_reset_client\n"));
6297 		i_mdi_pm_reset_client(ct);
6298 	} else {
6299 		mdi_pathinfo_t  *pip, *next;
6300 		int	valid_path_count = 0;
6301 
6302 		MDI_DEBUG(4, (CE_NOTE, child,
6303 		    "i_mdi_pm_post_unconfig i_mdi_pm_rele_client\n"));
6304 		pip = ct->ct_path_head;
6305 		while (pip != NULL) {
6306 			MDI_PI_LOCK(pip);
6307 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6308 			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
6309 				valid_path_count ++;
6310 			MDI_PI_UNLOCK(pip);
6311 			pip = next;
6312 		}
6313 		i_mdi_pm_rele_client(ct, valid_path_count);
6314 		ct->ct_powercnt_unconfig = 0;
6315 	}
6316 
6317 	MDI_CLIENT_UNLOCK(ct);
6318 }
6319 
6320 static void
6321 i_mdi_pm_post_unconfig(dev_info_t *vdip, dev_info_t *child, int held)
6322 {
6323 	int			circ;
6324 	dev_info_t		*cdip;
6325 
6326 	ASSERT(MDI_VHCI(vdip));
6327 
6328 	if (!held) {
6329 		MDI_DEBUG(4, (CE_NOTE, vdip,
6330 		    "i_mdi_pm_post_unconfig held = %d\n", held));
6331 		return;
6332 	}
6333 
6334 	if (child) {
6335 		ASSERT(DEVI_BUSY_OWNED(vdip));
6336 		i_mdi_pm_post_unconfig_one(child);
6337 		return;
6338 	}
6339 
6340 	ndi_devi_enter(vdip, &circ);
6341 	cdip = ddi_get_child(vdip);
6342 	while (cdip) {
6343 		dev_info_t *next = ddi_get_next_sibling(cdip);
6344 
6345 		i_mdi_pm_post_unconfig_one(cdip);
6346 		cdip = next;
6347 	}
6348 	ndi_devi_exit(vdip, circ);
6349 }
6350 
6351 int
6352 mdi_power(dev_info_t *vdip, mdi_pm_op_t op, void *args, char *devnm, int flags)
6353 {
6354 	int			circ, ret = MDI_SUCCESS;
6355 	dev_info_t		*client_dip = NULL;
6356 	mdi_client_t		*ct;
6357 
6358 	/*
6359 	 * Handling ndi_devi_config_one and ndi_devi_unconfig_one.
6360 	 * Power up pHCI for the named client device.
6361 	 * Note: Before the client is enumerated under vhci by phci,
6362 	 * client_dip can be NULL. Then proceed to power up all the
6363 	 * pHCIs.
6364 	 */
6365 	if (devnm != NULL) {
6366 		ndi_devi_enter(vdip, &circ);
6367 		client_dip = ndi_devi_findchild(vdip, devnm);
6368 	}
6369 
6370 	MDI_DEBUG(4, (CE_NOTE, vdip, "mdi_power op = %d %s %p\n",
6371 	    op, devnm ? devnm : "NULL", (void *)client_dip));
6372 
6373 	switch (op) {
6374 	case MDI_PM_PRE_CONFIG:
6375 		ret = i_mdi_pm_pre_config(vdip, client_dip);
6376 		break;
6377 
6378 	case MDI_PM_PRE_UNCONFIG:
6379 		ret = i_mdi_pm_pre_unconfig(vdip, client_dip, (int *)args,
6380 		    flags);
6381 		break;
6382 
6383 	case MDI_PM_POST_CONFIG:
6384 		i_mdi_pm_post_config(vdip, client_dip);
6385 		break;
6386 
6387 	case MDI_PM_POST_UNCONFIG:
6388 		i_mdi_pm_post_unconfig(vdip, client_dip, *(int *)args);
6389 		break;
6390 
6391 	case MDI_PM_HOLD_POWER:
6392 	case MDI_PM_RELE_POWER:
6393 		ASSERT(args);
6394 
6395 		client_dip = (dev_info_t *)args;
6396 		ASSERT(MDI_CLIENT(client_dip));
6397 
6398 		ct = i_devi_get_client(client_dip);
6399 		MDI_CLIENT_LOCK(ct);
6400 
6401 		if (op == MDI_PM_HOLD_POWER) {
6402 			if (ct->ct_power_cnt == 0) {
6403 				(void) i_mdi_power_all_phci(ct);
6404 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6405 				    "mdi_power i_mdi_pm_hold_client\n"));
6406 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
6407 			}
6408 		} else {
6409 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
6410 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6411 				    "mdi_power i_mdi_pm_rele_client\n"));
6412 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
6413 			} else {
6414 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6415 				    "mdi_power i_mdi_pm_reset_client\n"));
6416 				i_mdi_pm_reset_client(ct);
6417 			}
6418 		}
6419 
6420 		MDI_CLIENT_UNLOCK(ct);
6421 		break;
6422 
6423 	default:
6424 		break;
6425 	}
6426 
6427 	if (devnm)
6428 		ndi_devi_exit(vdip, circ);
6429 
6430 	return (ret);
6431 }
6432 
6433 int
6434 mdi_component_is_vhci(dev_info_t *dip, const char **mdi_class)
6435 {
6436 	mdi_vhci_t *vhci;
6437 
6438 	if (!MDI_VHCI(dip))
6439 		return (MDI_FAILURE);
6440 
6441 	if (mdi_class) {
6442 		vhci = DEVI(dip)->devi_mdi_xhci;
6443 		ASSERT(vhci);
6444 		*mdi_class = vhci->vh_class;
6445 	}
6446 
6447 	return (MDI_SUCCESS);
6448 }
6449 
6450 int
6451 mdi_component_is_phci(dev_info_t *dip, const char **mdi_class)
6452 {
6453 	mdi_phci_t *phci;
6454 
6455 	if (!MDI_PHCI(dip))
6456 		return (MDI_FAILURE);
6457 
6458 	if (mdi_class) {
6459 		phci = DEVI(dip)->devi_mdi_xhci;
6460 		ASSERT(phci);
6461 		*mdi_class = phci->ph_vhci->vh_class;
6462 	}
6463 
6464 	return (MDI_SUCCESS);
6465 }
6466 
6467 int
6468 mdi_component_is_client(dev_info_t *dip, const char **mdi_class)
6469 {
6470 	mdi_client_t *client;
6471 
6472 	if (!MDI_CLIENT(dip))
6473 		return (MDI_FAILURE);
6474 
6475 	if (mdi_class) {
6476 		client = DEVI(dip)->devi_mdi_client;
6477 		ASSERT(client);
6478 		*mdi_class = client->ct_vhci->vh_class;
6479 	}
6480 
6481 	return (MDI_SUCCESS);
6482 }
6483 
6484 void *
6485 mdi_client_get_vhci_private(dev_info_t *dip)
6486 {
6487 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
6488 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
6489 		mdi_client_t	*ct;
6490 		ct = i_devi_get_client(dip);
6491 		return (ct->ct_vprivate);
6492 	}
6493 	return (NULL);
6494 }
6495 
6496 void
6497 mdi_client_set_vhci_private(dev_info_t *dip, void *data)
6498 {
6499 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
6500 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
6501 		mdi_client_t	*ct;
6502 		ct = i_devi_get_client(dip);
6503 		ct->ct_vprivate = data;
6504 	}
6505 }
6506 /*
6507  * mdi_pi_get_vhci_private():
6508  *		Get the vhci private information associated with the
6509  *		mdi_pathinfo node
6510  */
6511 void *
6512 mdi_pi_get_vhci_private(mdi_pathinfo_t *pip)
6513 {
6514 	caddr_t	vprivate = NULL;
6515 	if (pip) {
6516 		vprivate = MDI_PI(pip)->pi_vprivate;
6517 	}
6518 	return (vprivate);
6519 }
6520 
6521 /*
6522  * mdi_pi_set_vhci_private():
6523  *		Set the vhci private information in the mdi_pathinfo node
6524  */
6525 void
6526 mdi_pi_set_vhci_private(mdi_pathinfo_t *pip, void *priv)
6527 {
6528 	if (pip) {
6529 		MDI_PI(pip)->pi_vprivate = priv;
6530 	}
6531 }
6532 
6533 /*
6534  * mdi_phci_get_vhci_private():
6535  *		Get the vhci private information associated with the
6536  *		mdi_phci node
6537  */
6538 void *
6539 mdi_phci_get_vhci_private(dev_info_t *dip)
6540 {
6541 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
6542 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
6543 		mdi_phci_t	*ph;
6544 		ph = i_devi_get_phci(dip);
6545 		return (ph->ph_vprivate);
6546 	}
6547 	return (NULL);
6548 }
6549 
6550 /*
6551  * mdi_phci_set_vhci_private():
6552  *		Set the vhci private information in the mdi_phci node
6553  */
6554 void
6555 mdi_phci_set_vhci_private(dev_info_t *dip, void *priv)
6556 {
6557 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
6558 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
6559 		mdi_phci_t	*ph;
6560 		ph = i_devi_get_phci(dip);
6561 		ph->ph_vprivate = priv;
6562 	}
6563 }
6564 
/*
 * List of vhci class names:
 * A vhci class name must be in this list only if the corresponding vhci
 * driver intends to use the mdi provided bus config implementation
 * (i.e., mdi_vhci_bus_config()).
 */
static char *vhci_class_list[] = { MDI_HCI_CLASS_SCSI, MDI_HCI_CLASS_IB };
/* number of entries in vhci_class_list[] */
#define	N_VHCI_CLASSES	(sizeof (vhci_class_list) / sizeof (char *))

/*
 * During boot time, the on-disk vhci cache for every vhci class is read
 * in the form of an nvlist and stored here.
 * The array is indexed in parallel with vhci_class_list[];
 * setup_vhci_cache() takes over the entry matching its class and resets
 * the slot to NULL.
 */
static nvlist_t *vhcache_nvl[N_VHCI_CLASSES];

/* version number and nvpair names used in the vhci cache nvlist */
#define	MDI_VHCI_CACHE_VERSION	1
#define	MDI_NVPNAME_VERSION	"version"
#define	MDI_NVPNAME_PHCIS	"phcis"
#define	MDI_NVPNAME_CTADDRMAP	"clientaddrmap"
6585 
6586 /*
6587  * Given vhci class name, return its on-disk vhci cache filename.
6588  * Memory for the returned filename which includes the full path is allocated
6589  * by this function.
6590  */
6591 static char *
6592 vhclass2vhcache_filename(char *vhclass)
6593 {
6594 	char *filename;
6595 	int len;
6596 	static char *fmt = "/etc/devices/mdi_%s_cache";
6597 
6598 	/*
6599 	 * fmt contains the on-disk vhci cache file name format;
6600 	 * for scsi_vhci the filename is "/etc/devices/mdi_scsi_vhci_cache".
6601 	 */
6602 
6603 	/* the -1 below is to account for "%s" in the format string */
6604 	len = strlen(fmt) + strlen(vhclass) - 1;
6605 	filename = kmem_alloc(len, KM_SLEEP);
6606 	(void) snprintf(filename, len, fmt, vhclass);
6607 	ASSERT(len == (strlen(filename) + 1));
6608 	return (filename);
6609 }
6610 
6611 /*
6612  * initialize the vhci cache related data structures and read the on-disk
6613  * vhci cached data into memory.
6614  */
6615 static void
6616 setup_vhci_cache(mdi_vhci_t *vh)
6617 {
6618 	mdi_vhci_config_t *vhc;
6619 	mdi_vhci_cache_t *vhcache;
6620 	int i;
6621 	nvlist_t *nvl = NULL;
6622 
6623 	vhc = kmem_zalloc(sizeof (mdi_vhci_config_t), KM_SLEEP);
6624 	vh->vh_config = vhc;
6625 	vhcache = &vhc->vhc_vhcache;
6626 
6627 	vhc->vhc_vhcache_filename = vhclass2vhcache_filename(vh->vh_class);
6628 
6629 	mutex_init(&vhc->vhc_lock, NULL, MUTEX_DEFAULT, NULL);
6630 	cv_init(&vhc->vhc_cv, NULL, CV_DRIVER, NULL);
6631 
6632 	rw_init(&vhcache->vhcache_lock, NULL, RW_DRIVER, NULL);
6633 
6634 	/*
6635 	 * Create string hash; same as mod_hash_create_strhash() except that
6636 	 * we use NULL key destructor.
6637 	 */
6638 	vhcache->vhcache_client_hash = mod_hash_create_extended(vh->vh_class,
6639 	    mdi_bus_config_cache_hash_size,
6640 	    mod_hash_null_keydtor, mod_hash_null_valdtor,
6641 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
6642 
6643 	/*
6644 	 * The on-disk vhci cache is read during booting prior to the
6645 	 * lights-out period by mdi_read_devices_files().
6646 	 */
6647 	for (i = 0; i < N_VHCI_CLASSES; i++) {
6648 		if (strcmp(vhci_class_list[i], vh->vh_class) == 0) {
6649 			nvl = vhcache_nvl[i];
6650 			vhcache_nvl[i] = NULL;
6651 			break;
6652 		}
6653 	}
6654 
6655 	/*
6656 	 * this is to cover the case of some one manually causing unloading
6657 	 * (or detaching) and reloading (or attaching) of a vhci driver.
6658 	 */
6659 	if (nvl == NULL && modrootloaded)
6660 		nvl = read_on_disk_vhci_cache(vh->vh_class);
6661 
6662 	if (nvl != NULL) {
6663 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
6664 		if (mainnvl_to_vhcache(vhcache, nvl) == MDI_SUCCESS)
6665 			vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
6666 		else  {
6667 			cmn_err(CE_WARN,
6668 			    "%s: data file corrupted, will recreate\n",
6669 			    vhc->vhc_vhcache_filename);
6670 		}
6671 		rw_exit(&vhcache->vhcache_lock);
6672 		nvlist_free(nvl);
6673 	}
6674 
6675 	vhc->vhc_cbid = callb_add(stop_vhcache_flush_thread, vhc,
6676 	    CB_CL_UADMIN_PRE_VFS, "mdi_vhcache_flush");
6677 
6678 	vhc->vhc_path_discovery_boot = mdi_path_discovery_boot;
6679 	vhc->vhc_path_discovery_postboot = mdi_path_discovery_postboot;
6680 }
6681 
6682 /*
6683  * free all vhci cache related resources
6684  */
6685 static int
6686 destroy_vhci_cache(mdi_vhci_t *vh)
6687 {
6688 	mdi_vhci_config_t *vhc = vh->vh_config;
6689 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
6690 	mdi_vhcache_phci_t *cphci, *cphci_next;
6691 	mdi_vhcache_client_t *cct, *cct_next;
6692 	mdi_vhcache_pathinfo_t *cpi, *cpi_next;
6693 
6694 	if (stop_vhcache_async_threads(vhc) != MDI_SUCCESS)
6695 		return (MDI_FAILURE);
6696 
6697 	kmem_free(vhc->vhc_vhcache_filename,
6698 	    strlen(vhc->vhc_vhcache_filename) + 1);
6699 
6700 	mod_hash_destroy_strhash(vhcache->vhcache_client_hash);
6701 
6702 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
6703 	    cphci = cphci_next) {
6704 		cphci_next = cphci->cphci_next;
6705 		free_vhcache_phci(cphci);
6706 	}
6707 
6708 	for (cct = vhcache->vhcache_client_head; cct != NULL; cct = cct_next) {
6709 		cct_next = cct->cct_next;
6710 		for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi_next) {
6711 			cpi_next = cpi->cpi_next;
6712 			free_vhcache_pathinfo(cpi);
6713 		}
6714 		free_vhcache_client(cct);
6715 	}
6716 
6717 	rw_destroy(&vhcache->vhcache_lock);
6718 
6719 	mutex_destroy(&vhc->vhc_lock);
6720 	cv_destroy(&vhc->vhc_cv);
6721 	kmem_free(vhc, sizeof (mdi_vhci_config_t));
6722 	return (MDI_SUCCESS);
6723 }
6724 
6725 /*
6726  * Stop all vhci cache related async threads and free their resources.
6727  */
6728 static int
6729 stop_vhcache_async_threads(mdi_vhci_config_t *vhc)
6730 {
6731 	mdi_async_client_config_t *acc, *acc_next;
6732 
6733 	mutex_enter(&vhc->vhc_lock);
6734 	vhc->vhc_flags |= MDI_VHC_EXIT;
6735 	ASSERT(vhc->vhc_acc_thrcount >= 0);
6736 	cv_broadcast(&vhc->vhc_cv);
6737 
6738 	while ((vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) ||
6739 	    vhc->vhc_acc_thrcount != 0) {
6740 		mutex_exit(&vhc->vhc_lock);
6741 		delay(1);
6742 		mutex_enter(&vhc->vhc_lock);
6743 	}
6744 
6745 	vhc->vhc_flags &= ~MDI_VHC_EXIT;
6746 
6747 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc_next) {
6748 		acc_next = acc->acc_next;
6749 		free_async_client_config(acc);
6750 	}
6751 	vhc->vhc_acc_list_head = NULL;
6752 	vhc->vhc_acc_list_tail = NULL;
6753 	vhc->vhc_acc_count = 0;
6754 
6755 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
6756 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
6757 		mutex_exit(&vhc->vhc_lock);
6758 		if (flush_vhcache(vhc, 0) != MDI_SUCCESS) {
6759 			vhcache_dirty(vhc);
6760 			return (MDI_FAILURE);
6761 		}
6762 	} else
6763 		mutex_exit(&vhc->vhc_lock);
6764 
6765 	if (callb_delete(vhc->vhc_cbid) != 0)
6766 		return (MDI_FAILURE);
6767 
6768 	return (MDI_SUCCESS);
6769 }
6770 
6771 /*
6772  * Stop vhci cache flush thread
6773  */
6774 /* ARGSUSED */
6775 static boolean_t
6776 stop_vhcache_flush_thread(void *arg, int code)
6777 {
6778 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
6779 
6780 	mutex_enter(&vhc->vhc_lock);
6781 	vhc->vhc_flags |= MDI_VHC_EXIT;
6782 	cv_broadcast(&vhc->vhc_cv);
6783 
6784 	while (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
6785 		mutex_exit(&vhc->vhc_lock);
6786 		delay(1);
6787 		mutex_enter(&vhc->vhc_lock);
6788 	}
6789 
6790 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
6791 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
6792 		mutex_exit(&vhc->vhc_lock);
6793 		(void) flush_vhcache(vhc, 1);
6794 	} else
6795 		mutex_exit(&vhc->vhc_lock);
6796 
6797 	return (B_TRUE);
6798 }
6799 
6800 /*
6801  * Enqueue the vhcache phci (cphci) at the tail of the list
6802  */
6803 static void
6804 enqueue_vhcache_phci(mdi_vhci_cache_t *vhcache, mdi_vhcache_phci_t *cphci)
6805 {
6806 	cphci->cphci_next = NULL;
6807 	if (vhcache->vhcache_phci_head == NULL)
6808 		vhcache->vhcache_phci_head = cphci;
6809 	else
6810 		vhcache->vhcache_phci_tail->cphci_next = cphci;
6811 	vhcache->vhcache_phci_tail = cphci;
6812 }
6813 
6814 /*
6815  * Enqueue the vhcache pathinfo (cpi) at the tail of the list
6816  */
6817 static void
6818 enqueue_tail_vhcache_pathinfo(mdi_vhcache_client_t *cct,
6819     mdi_vhcache_pathinfo_t *cpi)
6820 {
6821 	cpi->cpi_next = NULL;
6822 	if (cct->cct_cpi_head == NULL)
6823 		cct->cct_cpi_head = cpi;
6824 	else
6825 		cct->cct_cpi_tail->cpi_next = cpi;
6826 	cct->cct_cpi_tail = cpi;
6827 }
6828 
6829 /*
6830  * Enqueue the vhcache pathinfo (cpi) at the correct location in the
6831  * ordered list. All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
6832  * flag set come at the beginning of the list. All cpis which have this
6833  * flag set come at the end of the list.
6834  */
6835 static void
6836 enqueue_vhcache_pathinfo(mdi_vhcache_client_t *cct,
6837     mdi_vhcache_pathinfo_t *newcpi)
6838 {
6839 	mdi_vhcache_pathinfo_t *cpi, *prev_cpi;
6840 
6841 	if (cct->cct_cpi_head == NULL ||
6842 	    (newcpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))
6843 		enqueue_tail_vhcache_pathinfo(cct, newcpi);
6844 	else {
6845 		for (cpi = cct->cct_cpi_head, prev_cpi = NULL; cpi != NULL &&
6846 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST);
6847 		    prev_cpi = cpi, cpi = cpi->cpi_next)
6848 			;
6849 
6850 		if (prev_cpi == NULL)
6851 			cct->cct_cpi_head = newcpi;
6852 		else
6853 			prev_cpi->cpi_next = newcpi;
6854 
6855 		newcpi->cpi_next = cpi;
6856 
6857 		if (cpi == NULL)
6858 			cct->cct_cpi_tail = newcpi;
6859 	}
6860 }
6861 
6862 /*
6863  * Enqueue the vhcache client (cct) at the tail of the list
6864  */
6865 static void
6866 enqueue_vhcache_client(mdi_vhci_cache_t *vhcache,
6867     mdi_vhcache_client_t *cct)
6868 {
6869 	cct->cct_next = NULL;
6870 	if (vhcache->vhcache_client_head == NULL)
6871 		vhcache->vhcache_client_head = cct;
6872 	else
6873 		vhcache->vhcache_client_tail->cct_next = cct;
6874 	vhcache->vhcache_client_tail = cct;
6875 }
6876 
/*
 * Free an array of nelem strings and then the array itself.  NULL slots
 * are skipped and a NULL array pointer makes the call a no-op.  Each string
 * is freed with a size of strlen() + 1.
 */
static void
free_string_array(char **str, int nelem)
{
	int n;

	if (str == NULL)
		return;

	for (n = 0; n < nelem; n++) {
		char *s = str[n];

		if (s != NULL)
			kmem_free(s, strlen(s) + 1);
	}
	kmem_free(str, sizeof (char *) * nelem);
}
6890 
6891 static void
6892 free_vhcache_phci(mdi_vhcache_phci_t *cphci)
6893 {
6894 	kmem_free(cphci->cphci_path, strlen(cphci->cphci_path) + 1);
6895 	kmem_free(cphci, sizeof (*cphci));
6896 }
6897 
6898 static void
6899 free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *cpi)
6900 {
6901 	kmem_free(cpi->cpi_addr, strlen(cpi->cpi_addr) + 1);
6902 	kmem_free(cpi, sizeof (*cpi));
6903 }
6904 
6905 static void
6906 free_vhcache_client(mdi_vhcache_client_t *cct)
6907 {
6908 	kmem_free(cct->cct_name_addr, strlen(cct->cct_name_addr) + 1);
6909 	kmem_free(cct, sizeof (*cct));
6910 }
6911 
6912 static char *
6913 vhcache_mknameaddr(char *ct_name, char *ct_addr, int *ret_len)
6914 {
6915 	char *name_addr;
6916 	int len;
6917 
6918 	len = strlen(ct_name) + strlen(ct_addr) + 2;
6919 	name_addr = kmem_alloc(len, KM_SLEEP);
6920 	(void) snprintf(name_addr, len, "%s@%s", ct_name, ct_addr);
6921 
6922 	if (ret_len)
6923 		*ret_len = len;
6924 	return (name_addr);
6925 }
6926 
6927 /*
6928  * Copy the contents of paddrnvl to vhci cache.
6929  * paddrnvl nvlist contains path information for a vhci client.
6930  * See the comment in mainnvl_to_vhcache() for the format of this nvlist.
6931  */
6932 static void
6933 paddrnvl_to_vhcache(nvlist_t *nvl, mdi_vhcache_phci_t *cphci_list[],
6934     mdi_vhcache_client_t *cct)
6935 {
6936 	nvpair_t *nvp = NULL;
6937 	mdi_vhcache_pathinfo_t *cpi;
6938 	uint_t nelem;
6939 	uint32_t *val;
6940 
6941 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
6942 		ASSERT(nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY);
6943 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
6944 		cpi->cpi_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
6945 		(void) nvpair_value_uint32_array(nvp, &val, &nelem);
6946 		ASSERT(nelem == 2);
6947 		cpi->cpi_cphci = cphci_list[val[0]];
6948 		cpi->cpi_flags = val[1];
6949 		enqueue_tail_vhcache_pathinfo(cct, cpi);
6950 	}
6951 }
6952 
6953 /*
6954  * Copy the contents of caddrmapnvl to vhci cache.
6955  * caddrmapnvl nvlist contains vhci client address to phci client address
6956  * mappings. See the comment in mainnvl_to_vhcache() for the format of
6957  * this nvlist.
6958  */
6959 static void
6960 caddrmapnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl,
6961     mdi_vhcache_phci_t *cphci_list[])
6962 {
6963 	nvpair_t *nvp = NULL;
6964 	nvlist_t *paddrnvl;
6965 	mdi_vhcache_client_t *cct;
6966 
6967 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
6968 		ASSERT(nvpair_type(nvp) == DATA_TYPE_NVLIST);
6969 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
6970 		cct->cct_name_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
6971 		(void) nvpair_value_nvlist(nvp, &paddrnvl);
6972 		paddrnvl_to_vhcache(paddrnvl, cphci_list, cct);
6973 		/* the client must contain at least one path */
6974 		ASSERT(cct->cct_cpi_head != NULL);
6975 
6976 		enqueue_vhcache_client(vhcache, cct);
6977 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
6978 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
6979 	}
6980 }
6981 
6982 /*
6983  * Copy the contents of the main nvlist to vhci cache.
6984  *
6985  * VHCI busconfig cached data is stored in the form of a nvlist on the disk.
6986  * The nvlist contains the mappings between the vhci client addresses and
6987  * their corresponding phci client addresses.
6988  *
6989  * The structure of the nvlist is as follows:
6990  *
6991  * Main nvlist:
6992  *	NAME		TYPE		DATA
6993  *	version		int32		version number
6994  *	phcis		string array	array of phci paths
6995  *	clientaddrmap	nvlist_t	c2paddrs_nvl (see below)
6996  *
6997  * structure of c2paddrs_nvl:
6998  *	NAME		TYPE		DATA
6999  *	caddr1		nvlist_t	paddrs_nvl1
7000  *	caddr2		nvlist_t	paddrs_nvl2
7001  *	...
7002  * where caddr1, caddr2, ... are vhci client name and addresses in the
7003  * form of "<clientname>@<clientaddress>".
7004  * (for example: "ssd@2000002037cd9f72");
7005  * paddrs_nvl1, paddrs_nvl2, .. are nvlists that contain path information.
7006  *
7007  * structure of paddrs_nvl:
7008  *	NAME		TYPE		DATA
7009  *	pi_addr1	uint32_array	(phci-id, cpi_flags)
7010  *	pi_addr2	uint32_array	(phci-id, cpi_flags)
7011  *	...
7012  * where pi_addr1, pi_addr2, ... are bus specific addresses of pathinfo nodes
7013  * (so called pi_addrs, for example: "w2100002037cd9f72,0");
7014  * phci-ids are integers that identify PHCIs to which the
7015  * the bus specific address belongs to. These integers are used as an index
7016  * into to the phcis string array in the main nvlist to get the PHCI path.
7017  */
7018 static int
7019 mainnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl)
7020 {
7021 	char **phcis, **phci_namep;
7022 	uint_t nphcis;
7023 	mdi_vhcache_phci_t *cphci, **cphci_list;
7024 	nvlist_t *caddrmapnvl;
7025 	int32_t ver;
7026 	int i;
7027 	size_t cphci_list_size;
7028 
7029 	ASSERT(RW_WRITE_HELD(&vhcache->vhcache_lock));
7030 
7031 	if (nvlist_lookup_int32(nvl, MDI_NVPNAME_VERSION, &ver) != 0 ||
7032 	    ver != MDI_VHCI_CACHE_VERSION)
7033 		return (MDI_FAILURE);
7034 
7035 	if (nvlist_lookup_string_array(nvl, MDI_NVPNAME_PHCIS, &phcis,
7036 	    &nphcis) != 0)
7037 		return (MDI_SUCCESS);
7038 
7039 	ASSERT(nphcis > 0);
7040 
7041 	cphci_list_size = sizeof (mdi_vhcache_phci_t *) * nphcis;
7042 	cphci_list = kmem_alloc(cphci_list_size, KM_SLEEP);
7043 	for (i = 0, phci_namep = phcis; i < nphcis; i++, phci_namep++) {
7044 		cphci = kmem_zalloc(sizeof (mdi_vhcache_phci_t), KM_SLEEP);
7045 		cphci->cphci_path = i_ddi_strdup(*phci_namep, KM_SLEEP);
7046 		enqueue_vhcache_phci(vhcache, cphci);
7047 		cphci_list[i] = cphci;
7048 	}
7049 
7050 	ASSERT(vhcache->vhcache_phci_head != NULL);
7051 
7052 	if (nvlist_lookup_nvlist(nvl, MDI_NVPNAME_CTADDRMAP, &caddrmapnvl) == 0)
7053 		caddrmapnvl_to_vhcache(vhcache, caddrmapnvl, cphci_list);
7054 
7055 	kmem_free(cphci_list, cphci_list_size);
7056 	return (MDI_SUCCESS);
7057 }
7058 
7059 /*
7060  * Build paddrnvl for the specified client using the information in the
7061  * vhci cache and add it to the caddrmapnnvl.
7062  * Returns 0 on success, errno on failure.
7063  */
7064 static int
7065 vhcache_to_paddrnvl(mdi_vhci_cache_t *vhcache, mdi_vhcache_client_t *cct,
7066     nvlist_t *caddrmapnvl)
7067 {
7068 	mdi_vhcache_pathinfo_t *cpi;
7069 	nvlist_t *nvl;
7070 	int err;
7071 	uint32_t val[2];
7072 
7073 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7074 
7075 	if ((err = nvlist_alloc(&nvl, 0, KM_SLEEP)) != 0)
7076 		return (err);
7077 
7078 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7079 		val[0] = cpi->cpi_cphci->cphci_id;
7080 		val[1] = cpi->cpi_flags;
7081 		if ((err = nvlist_add_uint32_array(nvl, cpi->cpi_addr, val, 2))
7082 		    != 0)
7083 			goto out;
7084 	}
7085 
7086 	err = nvlist_add_nvlist(caddrmapnvl, cct->cct_name_addr, nvl);
7087 out:
7088 	nvlist_free(nvl);
7089 	return (err);
7090 }
7091 
7092 /*
7093  * Build caddrmapnvl using the information in the vhci cache
7094  * and add it to the mainnvl.
7095  * Returns 0 on success, errno on failure.
7096  */
7097 static int
7098 vhcache_to_caddrmapnvl(mdi_vhci_cache_t *vhcache, nvlist_t *mainnvl)
7099 {
7100 	mdi_vhcache_client_t *cct;
7101 	nvlist_t *nvl;
7102 	int err;
7103 
7104 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7105 
7106 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0)
7107 		return (err);
7108 
7109 	for (cct = vhcache->vhcache_client_head; cct != NULL;
7110 	    cct = cct->cct_next) {
7111 		if ((err = vhcache_to_paddrnvl(vhcache, cct, nvl)) != 0)
7112 			goto out;
7113 	}
7114 
7115 	err = nvlist_add_nvlist(mainnvl, MDI_NVPNAME_CTADDRMAP, nvl);
7116 out:
7117 	nvlist_free(nvl);
7118 	return (err);
7119 }
7120 
7121 /*
7122  * Build nvlist using the information in the vhci cache.
7123  * See the comment in mainnvl_to_vhcache() for the format of the nvlist.
7124  * Returns nvl on success, NULL on failure.
7125  */
7126 static nvlist_t *
7127 vhcache_to_mainnvl(mdi_vhci_cache_t *vhcache)
7128 {
7129 	mdi_vhcache_phci_t *cphci;
7130 	uint_t phci_count;
7131 	char **phcis;
7132 	nvlist_t *nvl;
7133 	int err, i;
7134 
7135 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0) {
7136 		nvl = NULL;
7137 		goto out;
7138 	}
7139 
7140 	if ((err = nvlist_add_int32(nvl, MDI_NVPNAME_VERSION,
7141 	    MDI_VHCI_CACHE_VERSION)) != 0)
7142 		goto out;
7143 
7144 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7145 	if (vhcache->vhcache_phci_head == NULL) {
7146 		rw_exit(&vhcache->vhcache_lock);
7147 		return (nvl);
7148 	}
7149 
7150 	phci_count = 0;
7151 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7152 	    cphci = cphci->cphci_next)
7153 		cphci->cphci_id = phci_count++;
7154 
7155 	/* build phci pathname list */
7156 	phcis = kmem_alloc(sizeof (char *) * phci_count, KM_SLEEP);
7157 	for (cphci = vhcache->vhcache_phci_head, i = 0; cphci != NULL;
7158 	    cphci = cphci->cphci_next, i++)
7159 		phcis[i] = i_ddi_strdup(cphci->cphci_path, KM_SLEEP);
7160 
7161 	err = nvlist_add_string_array(nvl, MDI_NVPNAME_PHCIS, phcis,
7162 	    phci_count);
7163 	free_string_array(phcis, phci_count);
7164 
7165 	if (err == 0 &&
7166 	    (err = vhcache_to_caddrmapnvl(vhcache, nvl)) == 0) {
7167 		rw_exit(&vhcache->vhcache_lock);
7168 		return (nvl);
7169 	}
7170 
7171 	rw_exit(&vhcache->vhcache_lock);
7172 out:
7173 	if (nvl)
7174 		nvlist_free(nvl);
7175 	return (NULL);
7176 }
7177 
7178 /*
7179  * Lookup vhcache phci structure for the specified phci path.
7180  */
7181 static mdi_vhcache_phci_t *
7182 lookup_vhcache_phci_by_name(mdi_vhci_cache_t *vhcache, char *phci_path)
7183 {
7184 	mdi_vhcache_phci_t *cphci;
7185 
7186 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7187 
7188 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7189 	    cphci = cphci->cphci_next) {
7190 		if (strcmp(cphci->cphci_path, phci_path) == 0)
7191 			return (cphci);
7192 	}
7193 
7194 	return (NULL);
7195 }
7196 
7197 /*
7198  * Lookup vhcache phci structure for the specified phci.
7199  */
7200 static mdi_vhcache_phci_t *
7201 lookup_vhcache_phci_by_addr(mdi_vhci_cache_t *vhcache, mdi_phci_t *ph)
7202 {
7203 	mdi_vhcache_phci_t *cphci;
7204 
7205 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7206 
7207 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7208 	    cphci = cphci->cphci_next) {
7209 		if (cphci->cphci_phci == ph)
7210 			return (cphci);
7211 	}
7212 
7213 	return (NULL);
7214 }
7215 
7216 /*
7217  * Add the specified phci to the vhci cache if not already present.
7218  */
7219 static void
7220 vhcache_phci_add(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7221 {
7222 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7223 	mdi_vhcache_phci_t *cphci;
7224 	char *pathname;
7225 	int cache_updated;
7226 
7227 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7228 
7229 	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
7230 	(void) ddi_pathname(ph->ph_dip, pathname);
7231 	if ((cphci = lookup_vhcache_phci_by_name(vhcache, pathname))
7232 	    != NULL) {
7233 		cphci->cphci_phci = ph;
7234 		cache_updated = 0;
7235 	} else {
7236 		cphci = kmem_zalloc(sizeof (*cphci), KM_SLEEP);
7237 		cphci->cphci_path = i_ddi_strdup(pathname, KM_SLEEP);
7238 		cphci->cphci_phci = ph;
7239 		enqueue_vhcache_phci(vhcache, cphci);
7240 		cache_updated = 1;
7241 	}
7242 
7243 	rw_exit(&vhcache->vhcache_lock);
7244 
7245 	/*
7246 	 * Since a new phci has been added, reset
7247 	 * vhc_path_discovery_cutoff_time to allow for discovery of paths
7248 	 * during next vhcache_discover_paths().
7249 	 */
7250 	mutex_enter(&vhc->vhc_lock);
7251 	vhc->vhc_path_discovery_cutoff_time = 0;
7252 	mutex_exit(&vhc->vhc_lock);
7253 
7254 	kmem_free(pathname, MAXPATHLEN);
7255 	if (cache_updated)
7256 		vhcache_dirty(vhc);
7257 }
7258 
7259 /*
7260  * Remove the reference to the specified phci from the vhci cache.
7261  */
7262 static void
7263 vhcache_phci_remove(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7264 {
7265 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7266 	mdi_vhcache_phci_t *cphci;
7267 
7268 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7269 	if ((cphci = lookup_vhcache_phci_by_addr(vhcache, ph)) != NULL) {
7270 		/* do not remove the actual mdi_vhcache_phci structure */
7271 		cphci->cphci_phci = NULL;
7272 	}
7273 	rw_exit(&vhcache->vhcache_lock);
7274 }
7275 
7276 static void
7277 init_vhcache_lookup_token(mdi_vhcache_lookup_token_t *dst,
7278     mdi_vhcache_lookup_token_t *src)
7279 {
7280 	if (src == NULL) {
7281 		dst->lt_cct = NULL;
7282 		dst->lt_cct_lookup_time = 0;
7283 	} else {
7284 		dst->lt_cct = src->lt_cct;
7285 		dst->lt_cct_lookup_time = src->lt_cct_lookup_time;
7286 	}
7287 }
7288 
7289 /*
7290  * Look up vhcache client for the specified client.
7291  */
7292 static mdi_vhcache_client_t *
7293 lookup_vhcache_client(mdi_vhci_cache_t *vhcache, char *ct_name, char *ct_addr,
7294     mdi_vhcache_lookup_token_t *token)
7295 {
7296 	mod_hash_val_t hv;
7297 	char *name_addr;
7298 	int len;
7299 
7300 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7301 
7302 	/*
7303 	 * If no vhcache clean occurred since the last lookup, we can
7304 	 * simply return the cct from the last lookup operation.
7305 	 * It works because ccts are never freed except during the vhcache
7306 	 * cleanup operation.
7307 	 */
7308 	if (token != NULL &&
7309 	    vhcache->vhcache_clean_time < token->lt_cct_lookup_time)
7310 		return (token->lt_cct);
7311 
7312 	name_addr = vhcache_mknameaddr(ct_name, ct_addr, &len);
7313 	if (mod_hash_find(vhcache->vhcache_client_hash,
7314 	    (mod_hash_key_t)name_addr, &hv) == 0) {
7315 		if (token) {
7316 			token->lt_cct = (mdi_vhcache_client_t *)hv;
7317 			token->lt_cct_lookup_time = lbolt64;
7318 		}
7319 	} else {
7320 		if (token) {
7321 			token->lt_cct = NULL;
7322 			token->lt_cct_lookup_time = 0;
7323 		}
7324 		hv = NULL;
7325 	}
7326 	kmem_free(name_addr, len);
7327 	return ((mdi_vhcache_client_t *)hv);
7328 }
7329 
7330 /*
7331  * Add the specified path to the vhci cache if not already present.
7332  * Also add the vhcache client for the client corresponding to this path
7333  * if it doesn't already exist.
7334  */
7335 static void
7336 vhcache_pi_add(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
7337 {
7338 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7339 	mdi_vhcache_client_t *cct;
7340 	mdi_vhcache_pathinfo_t *cpi;
7341 	mdi_phci_t *ph = pip->pi_phci;
7342 	mdi_client_t *ct = pip->pi_client;
7343 	int cache_updated = 0;
7344 
7345 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7346 
7347 	/* if vhcache client for this pip doesn't already exist, add it */
7348 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
7349 	    NULL)) == NULL) {
7350 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
7351 		cct->cct_name_addr = vhcache_mknameaddr(ct->ct_drvname,
7352 		    ct->ct_guid, NULL);
7353 		enqueue_vhcache_client(vhcache, cct);
7354 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
7355 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
7356 		cache_updated = 1;
7357 	}
7358 
7359 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7360 		if (cpi->cpi_cphci->cphci_phci == ph &&
7361 		    strcmp(cpi->cpi_addr, pip->pi_addr) == 0) {
7362 			cpi->cpi_pip = pip;
7363 			if (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST) {
7364 				cpi->cpi_flags &=
7365 				    ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
7366 				sort_vhcache_paths(cct);
7367 				cache_updated = 1;
7368 			}
7369 			break;
7370 		}
7371 	}
7372 
7373 	if (cpi == NULL) {
7374 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
7375 		cpi->cpi_addr = i_ddi_strdup(pip->pi_addr, KM_SLEEP);
7376 		cpi->cpi_cphci = lookup_vhcache_phci_by_addr(vhcache, ph);
7377 		ASSERT(cpi->cpi_cphci != NULL);
7378 		cpi->cpi_pip = pip;
7379 		enqueue_vhcache_pathinfo(cct, cpi);
7380 		cache_updated = 1;
7381 	}
7382 
7383 	rw_exit(&vhcache->vhcache_lock);
7384 
7385 	if (cache_updated)
7386 		vhcache_dirty(vhc);
7387 }
7388 
7389 /*
7390  * Remove the reference to the specified path from the vhci cache.
7391  */
7392 static void
7393 vhcache_pi_remove(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
7394 {
7395 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7396 	mdi_client_t *ct = pip->pi_client;
7397 	mdi_vhcache_client_t *cct;
7398 	mdi_vhcache_pathinfo_t *cpi;
7399 
7400 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7401 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
7402 	    NULL)) != NULL) {
7403 		for (cpi = cct->cct_cpi_head; cpi != NULL;
7404 		    cpi = cpi->cpi_next) {
7405 			if (cpi->cpi_pip == pip) {
7406 				cpi->cpi_pip = NULL;
7407 				break;
7408 			}
7409 		}
7410 	}
7411 	rw_exit(&vhcache->vhcache_lock);
7412 }
7413 
7414 /*
7415  * Flush the vhci cache to disk.
7416  * Returns MDI_SUCCESS on success, MDI_FAILURE on failure.
7417  */
7418 static int
7419 flush_vhcache(mdi_vhci_config_t *vhc, int force_flag)
7420 {
7421 	nvlist_t *nvl;
7422 	int err;
7423 	int rv;
7424 
7425 	/*
7426 	 * It is possible that the system may shutdown before
7427 	 * i_ddi_io_initialized (during stmsboot for example). To allow for
7428 	 * flushing the cache in this case do not check for
7429 	 * i_ddi_io_initialized when force flag is set.
7430 	 */
7431 	if (force_flag == 0 && !i_ddi_io_initialized())
7432 		return (MDI_FAILURE);
7433 
7434 	if ((nvl = vhcache_to_mainnvl(&vhc->vhc_vhcache)) != NULL) {
7435 		err = fwrite_nvlist(vhc->vhc_vhcache_filename, nvl);
7436 		nvlist_free(nvl);
7437 	} else
7438 		err = EFAULT;
7439 
7440 	rv = MDI_SUCCESS;
7441 	mutex_enter(&vhc->vhc_lock);
7442 	if (err != 0) {
7443 		if (err == EROFS) {
7444 			vhc->vhc_flags |= MDI_VHC_READONLY_FS;
7445 			vhc->vhc_flags &= ~(MDI_VHC_VHCACHE_FLUSH_ERROR |
7446 			    MDI_VHC_VHCACHE_DIRTY);
7447 		} else {
7448 			if (!(vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR)) {
7449 				cmn_err(CE_CONT, "%s: update failed\n",
7450 				    vhc->vhc_vhcache_filename);
7451 				vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_ERROR;
7452 			}
7453 			rv = MDI_FAILURE;
7454 		}
7455 	} else if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR) {
7456 		cmn_err(CE_CONT,
7457 		    "%s: update now ok\n", vhc->vhc_vhcache_filename);
7458 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_ERROR;
7459 	}
7460 	mutex_exit(&vhc->vhc_lock);
7461 
7462 	return (rv);
7463 }
7464 
/*
 * Body of the vhci cache flush thread (created by vhcache_dirty()).
 *
 * While the cache is dirty, wait until vhc_flush_at_ticks and then call
 * flush_vhcache(); a failed flush marks the cache dirty again.  Once the
 * cache is clean, linger for the idle timeout period waiting for more work.
 * The thread exits when MDI_VHC_EXIT is set or when the idle period ends
 * with the cache still clean, clearing MDI_VHC_VHCACHE_FLUSH_THREAD on the
 * way out.
 */
static void
vhcache_flush_thread(void *arg)
{
	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
	clock_t idle_time, quit_at_ticks;
	callb_cpr_t cprinfo;

	/* idle period (seconds converted to ticks) to wait before exiting */
	idle_time = mdi_vhcache_flush_daemon_idle_time * TICKS_PER_SECOND;

	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
	    "mdi_vhcache_flush");
	mutex_enter(&vhc->vhc_lock);
	for (; ; ) {
		/* phase 1: the cache is dirty; flush once the time is due */
		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
		    (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY)) {
			if (ddi_get_lbolt() < vhc->vhc_flush_at_ticks) {
				/* not due yet; sleep until the flush time */
				CALLB_CPR_SAFE_BEGIN(&cprinfo);
				(void) cv_timedwait(&vhc->vhc_cv,
				    &vhc->vhc_lock, vhc->vhc_flush_at_ticks);
				CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
			} else {
				/*
				 * Clear the dirty flag before dropping the
				 * lock for the flush.
				 */
				vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
				mutex_exit(&vhc->vhc_lock);

				/* a failed flush re-marks the cache dirty */
				if (flush_vhcache(vhc, 0) != MDI_SUCCESS)
					vhcache_dirty(vhc);

				mutex_enter(&vhc->vhc_lock);
			}
		}

		quit_at_ticks = ddi_get_lbolt() + idle_time;

		/* phase 2: the cache is clean; idle until work or timeout */
		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) &&
		    ddi_get_lbolt() < quit_at_ticks) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
			    quit_at_ticks);
			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
		}

		/* leave on an exit request or after an uneventful idle wait */
		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY))
			goto out;
	}

out:
	/* cleared under vhc_lock, which is still held at this point */
	vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_THREAD;
	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
	CALLB_CPR_EXIT(&cprinfo);
}
7522 
7523 /*
7524  * Make vhci cache dirty and schedule flushing by vhcache flush thread.
7525  */
7526 static void
7527 vhcache_dirty(mdi_vhci_config_t *vhc)
7528 {
7529 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7530 	int create_thread;
7531 
7532 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7533 	/* do not flush cache until the cache is fully built */
7534 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
7535 		rw_exit(&vhcache->vhcache_lock);
7536 		return;
7537 	}
7538 	rw_exit(&vhcache->vhcache_lock);
7539 
7540 	mutex_enter(&vhc->vhc_lock);
7541 	if (vhc->vhc_flags & MDI_VHC_READONLY_FS) {
7542 		mutex_exit(&vhc->vhc_lock);
7543 		return;
7544 	}
7545 
7546 	vhc->vhc_flags |= MDI_VHC_VHCACHE_DIRTY;
7547 	vhc->vhc_flush_at_ticks = ddi_get_lbolt() +
7548 	    mdi_vhcache_flush_delay * TICKS_PER_SECOND;
7549 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
7550 		cv_broadcast(&vhc->vhc_cv);
7551 		create_thread = 0;
7552 	} else {
7553 		vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_THREAD;
7554 		create_thread = 1;
7555 	}
7556 	mutex_exit(&vhc->vhc_lock);
7557 
7558 	if (create_thread)
7559 		(void) thread_create(NULL, 0, vhcache_flush_thread, vhc,
7560 		    0, &p0, TS_RUN, minclsyspri);
7561 }
7562 
/*
 * phci bus config structure - one for each phci bus config operation that
 * we initiate on behalf of a vhci.  Allocated by bus_config_all_phcis()
 * and freed, together with phbc_phci_path, by bus_config_phci().
 */
typedef struct mdi_phci_bus_config_s {
	char *phbc_phci_path;		/* path of the phci to configure */
	struct mdi_vhci_bus_config_s *phbc_vhbusconfig;	/* vhci bus config */
	struct mdi_phci_bus_config_s *phbc_next;	/* next in the list */
} mdi_phci_bus_config_t;
7572 
/*
 * vhci bus config structure - one for each vhci bus config operation.
 * It is shared by all the phci bus config structures of that operation.
 */
typedef struct mdi_vhci_bus_config_s {
	ddi_bus_config_op_t vhbc_op;	/* bus config op */
	major_t vhbc_op_major;		/* bus config op major */
	uint_t vhbc_op_flags;		/* bus config op flags */
	kmutex_t vhbc_lock;		/* held to update vhbc_thr_count */
	kcondvar_t vhbc_cv;		/* broadcast when count reaches 0 */
	int vhbc_thr_count;		/* phci configs still outstanding */
} mdi_vhci_bus_config_t;
7582 
7583 /*
7584  * bus config the specified phci
7585  */
7586 static void
7587 bus_config_phci(void *arg)
7588 {
7589 	mdi_phci_bus_config_t *phbc = (mdi_phci_bus_config_t *)arg;
7590 	mdi_vhci_bus_config_t *vhbc = phbc->phbc_vhbusconfig;
7591 	dev_info_t *ph_dip;
7592 
7593 	/*
7594 	 * first configure all path components upto phci and then configure
7595 	 * the phci children.
7596 	 */
7597 	if ((ph_dip = e_ddi_hold_devi_by_path(phbc->phbc_phci_path, 0))
7598 	    != NULL) {
7599 		if (vhbc->vhbc_op == BUS_CONFIG_DRIVER ||
7600 		    vhbc->vhbc_op == BUS_UNCONFIG_DRIVER) {
7601 			(void) ndi_devi_config_driver(ph_dip,
7602 			    vhbc->vhbc_op_flags,
7603 			    vhbc->vhbc_op_major);
7604 		} else
7605 			(void) ndi_devi_config(ph_dip,
7606 			    vhbc->vhbc_op_flags);
7607 
7608 		/* release the hold that e_ddi_hold_devi_by_path() placed */
7609 		ndi_rele_devi(ph_dip);
7610 	}
7611 
7612 	kmem_free(phbc->phbc_phci_path, strlen(phbc->phbc_phci_path) + 1);
7613 	kmem_free(phbc, sizeof (*phbc));
7614 
7615 	mutex_enter(&vhbc->vhbc_lock);
7616 	vhbc->vhbc_thr_count--;
7617 	if (vhbc->vhbc_thr_count == 0)
7618 		cv_broadcast(&vhbc->vhbc_cv);
7619 	mutex_exit(&vhbc->vhbc_lock);
7620 }
7621 
/*
 * Bus config all phcis associated with the vhci in parallel.
 * op must be BUS_CONFIG_DRIVER or BUS_CONFIG_ALL.
 *
 * One bus_config_phci() invocation is made per phci recorded in the vhci
 * cache: in a thread of its own, or inline in the calling thread when
 * mdi_mtc_off is set.  The function returns only after all of them have
 * finished.  Nothing is done if the cache holds no phcis.
 */
static void
bus_config_all_phcis(mdi_vhci_cache_t *vhcache, uint_t flags,
    ddi_bus_config_op_t op, major_t maj)
{
	mdi_phci_bus_config_t *phbc_head = NULL, *phbc, *phbc_next;
	mdi_vhci_bus_config_t *vhbc;
	mdi_vhcache_phci_t *cphci;

	rw_enter(&vhcache->vhcache_lock, RW_READER);
	if (vhcache->vhcache_phci_head == NULL) {
		rw_exit(&vhcache->vhcache_lock);
		return;
	}

	vhbc = kmem_zalloc(sizeof (*vhbc), KM_SLEEP);

	/*
	 * Build a private work list with a copy of every phci path, so that
	 * the cache lock need not be held while the phcis are configured.
	 */
	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
	    cphci = cphci->cphci_next) {
		phbc = kmem_zalloc(sizeof (*phbc), KM_SLEEP);
		phbc->phbc_phci_path = i_ddi_strdup(cphci->cphci_path,
		    KM_SLEEP);
		phbc->phbc_vhbusconfig = vhbc;
		phbc->phbc_next = phbc_head;
		phbc_head = phbc;
		vhbc->vhbc_thr_count++;
	}
	rw_exit(&vhcache->vhcache_lock);

	vhbc->vhbc_op = op;
	vhbc->vhbc_op_major = maj;
	/* of the caller's flags only the two reprobe flags are passed on */
	vhbc->vhbc_op_flags = NDI_NO_EVENT |
	    (flags & (NDI_CONFIG_REPROBE | NDI_DRV_CONF_REPROBE));
	mutex_init(&vhbc->vhbc_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vhbc->vhbc_cv, NULL, CV_DRIVER, NULL);

	/* now create threads to initiate bus config on all phcis in parallel */
	for (phbc = phbc_head; phbc != NULL; phbc = phbc_next) {
		/* bus_config_phci() frees phbc, so fetch the link first */
		phbc_next = phbc->phbc_next;
		if (mdi_mtc_off)
			bus_config_phci((void *)phbc);
		else
			(void) thread_create(NULL, 0, bus_config_phci, phbc,
			    0, &p0, TS_RUN, minclsyspri);
	}

	mutex_enter(&vhbc->vhbc_lock);
	/* wait until all threads exit */
	while (vhbc->vhbc_thr_count > 0)
		cv_wait(&vhbc->vhbc_cv, &vhbc->vhbc_lock);
	mutex_exit(&vhbc->vhbc_lock);

	mutex_destroy(&vhbc->vhbc_lock);
	cv_destroy(&vhbc->vhbc_cv);
	kmem_free(vhbc, sizeof (*vhbc));
}
7681 
7682 /*
7683  * Single threaded version of bus_config_all_phcis()
7684  */
7685 static void
7686 st_bus_config_all_phcis(mdi_vhci_config_t *vhc, uint_t flags,
7687     ddi_bus_config_op_t op, major_t maj)
7688 {
7689 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7690 
7691 	single_threaded_vhconfig_enter(vhc);
7692 	bus_config_all_phcis(vhcache, flags, op, maj);
7693 	single_threaded_vhconfig_exit(vhc);
7694 }
7695 
7696 /*
7697  * Perform BUS_CONFIG_ONE on the specified child of the phci.
7698  * The path includes the child component in addition to the phci path.
7699  */
7700 static int
7701 bus_config_one_phci_child(char *path)
7702 {
7703 	dev_info_t *ph_dip, *child;
7704 	char *devnm;
7705 	int rv = MDI_FAILURE;
7706 
7707 	/* extract the child component of the phci */
7708 	devnm = strrchr(path, '/');
7709 	*devnm++ = '\0';
7710 
7711 	/*
7712 	 * first configure all path components upto phci and then
7713 	 * configure the phci child.
7714 	 */
7715 	if ((ph_dip = e_ddi_hold_devi_by_path(path, 0)) != NULL) {
7716 		if (ndi_devi_config_one(ph_dip, devnm, &child, NDI_NO_EVENT) ==
7717 		    NDI_SUCCESS) {
7718 			/*
7719 			 * release the hold that ndi_devi_config_one() placed
7720 			 */
7721 			ndi_rele_devi(child);
7722 			rv = MDI_SUCCESS;
7723 		}
7724 
7725 		/* release the hold that e_ddi_hold_devi_by_path() placed */
7726 		ndi_rele_devi(ph_dip);
7727 	}
7728 
7729 	devnm--;
7730 	*devnm = '/';
7731 	return (rv);
7732 }
7733 
7734 /*
7735  * Build a list of phci client paths for the specified vhci client.
7736  * The list includes only those phci client paths which aren't configured yet.
7737  */
7738 static mdi_phys_path_t *
7739 build_phclient_path_list(mdi_vhcache_client_t *cct, char *ct_name)
7740 {
7741 	mdi_vhcache_pathinfo_t *cpi;
7742 	mdi_phys_path_t *pp_head = NULL, *pp_tail = NULL, *pp;
7743 	int config_path, len;
7744 
7745 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7746 		/*
7747 		 * include only those paths that aren't configured.
7748 		 */
7749 		config_path = 0;
7750 		if (cpi->cpi_pip == NULL)
7751 			config_path = 1;
7752 		else {
7753 			MDI_PI_LOCK(cpi->cpi_pip);
7754 			if (MDI_PI_IS_INIT(cpi->cpi_pip))
7755 				config_path = 1;
7756 			MDI_PI_UNLOCK(cpi->cpi_pip);
7757 		}
7758 
7759 		if (config_path) {
7760 			pp = kmem_alloc(sizeof (*pp), KM_SLEEP);
7761 			len = strlen(cpi->cpi_cphci->cphci_path) +
7762 			    strlen(ct_name) + strlen(cpi->cpi_addr) + 3;
7763 			pp->phys_path = kmem_alloc(len, KM_SLEEP);
7764 			(void) snprintf(pp->phys_path, len, "%s/%s@%s",
7765 			    cpi->cpi_cphci->cphci_path, ct_name,
7766 			    cpi->cpi_addr);
7767 			pp->phys_path_next = NULL;
7768 
7769 			if (pp_head == NULL)
7770 				pp_head = pp;
7771 			else
7772 				pp_tail->phys_path_next = pp;
7773 			pp_tail = pp;
7774 		}
7775 	}
7776 
7777 	return (pp_head);
7778 }
7779 
7780 /*
7781  * Free the memory allocated for phci client path list.
7782  */
7783 static void
7784 free_phclient_path_list(mdi_phys_path_t *pp_head)
7785 {
7786 	mdi_phys_path_t *pp, *pp_next;
7787 
7788 	for (pp = pp_head; pp != NULL; pp = pp_next) {
7789 		pp_next = pp->phys_path_next;
7790 		kmem_free(pp->phys_path, strlen(pp->phys_path) + 1);
7791 		kmem_free(pp, sizeof (*pp));
7792 	}
7793 }
7794 
7795 /*
7796  * Allocated async client structure and initialize with the specified values.
7797  */
7798 static mdi_async_client_config_t *
7799 alloc_async_client_config(char *ct_name, char *ct_addr,
7800     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
7801 {
7802 	mdi_async_client_config_t *acc;
7803 
7804 	acc = kmem_alloc(sizeof (*acc), KM_SLEEP);
7805 	acc->acc_ct_name = i_ddi_strdup(ct_name, KM_SLEEP);
7806 	acc->acc_ct_addr = i_ddi_strdup(ct_addr, KM_SLEEP);
7807 	acc->acc_phclient_path_list_head = pp_head;
7808 	init_vhcache_lookup_token(&acc->acc_token, tok);
7809 	acc->acc_next = NULL;
7810 	return (acc);
7811 }
7812 
7813 /*
7814  * Free the memory allocated for the async client structure and their members.
7815  */
7816 static void
7817 free_async_client_config(mdi_async_client_config_t *acc)
7818 {
7819 	if (acc->acc_phclient_path_list_head)
7820 		free_phclient_path_list(acc->acc_phclient_path_list_head);
7821 	kmem_free(acc->acc_ct_name, strlen(acc->acc_ct_name) + 1);
7822 	kmem_free(acc->acc_ct_addr, strlen(acc->acc_ct_addr) + 1);
7823 	kmem_free(acc, sizeof (*acc));
7824 }
7825 
7826 /*
7827  * Sort vhcache pathinfos (cpis) of the specified client.
7828  * All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
7829  * flag set come at the beginning of the list. All cpis which have this
7830  * flag set come at the end of the list.
7831  */
7832 static void
7833 sort_vhcache_paths(mdi_vhcache_client_t *cct)
7834 {
7835 	mdi_vhcache_pathinfo_t *cpi, *cpi_next, *cpi_head;
7836 
7837 	cpi_head = cct->cct_cpi_head;
7838 	cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
7839 	for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
7840 		cpi_next = cpi->cpi_next;
7841 		enqueue_vhcache_pathinfo(cct, cpi);
7842 	}
7843 }
7844 
7845 /*
7846  * Verify whether MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag setting is correct for
7847  * every vhcache pathinfo of the specified client. If not adjust the flag
7848  * setting appropriately.
7849  *
7850  * Note that MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag is persisted in the
7851  * on-disk vhci cache. So every time this flag is updated the cache must be
7852  * flushed.
7853  */
7854 static void
7855 adjust_sort_vhcache_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
7856     mdi_vhcache_lookup_token_t *tok)
7857 {
7858 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7859 	mdi_vhcache_client_t *cct;
7860 	mdi_vhcache_pathinfo_t *cpi;
7861 
7862 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7863 	if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, tok))
7864 	    == NULL) {
7865 		rw_exit(&vhcache->vhcache_lock);
7866 		return;
7867 	}
7868 
7869 	/*
7870 	 * to avoid unnecessary on-disk cache updates, first check if an
7871 	 * update is really needed. If no update is needed simply return.
7872 	 */
7873 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7874 		if ((cpi->cpi_pip != NULL &&
7875 		    (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST)) ||
7876 		    (cpi->cpi_pip == NULL &&
7877 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))) {
7878 			break;
7879 		}
7880 	}
7881 	if (cpi == NULL) {
7882 		rw_exit(&vhcache->vhcache_lock);
7883 		return;
7884 	}
7885 
7886 	if (rw_tryupgrade(&vhcache->vhcache_lock) == 0) {
7887 		rw_exit(&vhcache->vhcache_lock);
7888 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7889 		if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr,
7890 		    tok)) == NULL) {
7891 			rw_exit(&vhcache->vhcache_lock);
7892 			return;
7893 		}
7894 	}
7895 
7896 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7897 		if (cpi->cpi_pip != NULL)
7898 			cpi->cpi_flags &= ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
7899 		else
7900 			cpi->cpi_flags |= MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
7901 	}
7902 	sort_vhcache_paths(cct);
7903 
7904 	rw_exit(&vhcache->vhcache_lock);
7905 	vhcache_dirty(vhc);
7906 }
7907 
7908 /*
7909  * Configure all specified paths of the client.
7910  */
7911 static void
7912 config_client_paths_sync(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
7913     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
7914 {
7915 	mdi_phys_path_t *pp;
7916 
7917 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next)
7918 		(void) bus_config_one_phci_child(pp->phys_path);
7919 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, tok);
7920 }
7921 
7922 /*
7923  * Dequeue elements from vhci async client config list and bus configure
7924  * their corresponding phci clients.
7925  */
7926 static void
7927 config_client_paths_thread(void *arg)
7928 {
7929 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
7930 	mdi_async_client_config_t *acc;
7931 	clock_t quit_at_ticks;
7932 	clock_t idle_time = mdi_async_config_idle_time * TICKS_PER_SECOND;
7933 	callb_cpr_t cprinfo;
7934 
7935 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
7936 	    "mdi_config_client_paths");
7937 
7938 	for (; ; ) {
7939 		quit_at_ticks = ddi_get_lbolt() + idle_time;
7940 
7941 		mutex_enter(&vhc->vhc_lock);
7942 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
7943 		    vhc->vhc_acc_list_head == NULL &&
7944 		    ddi_get_lbolt() < quit_at_ticks) {
7945 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
7946 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
7947 			    quit_at_ticks);
7948 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
7949 		}
7950 
7951 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
7952 		    vhc->vhc_acc_list_head == NULL)
7953 			goto out;
7954 
7955 		acc = vhc->vhc_acc_list_head;
7956 		vhc->vhc_acc_list_head = acc->acc_next;
7957 		if (vhc->vhc_acc_list_head == NULL)
7958 			vhc->vhc_acc_list_tail = NULL;
7959 		vhc->vhc_acc_count--;
7960 		mutex_exit(&vhc->vhc_lock);
7961 
7962 		config_client_paths_sync(vhc, acc->acc_ct_name,
7963 		    acc->acc_ct_addr, acc->acc_phclient_path_list_head,
7964 		    &acc->acc_token);
7965 
7966 		free_async_client_config(acc);
7967 	}
7968 
7969 out:
7970 	vhc->vhc_acc_thrcount--;
7971 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
7972 	CALLB_CPR_EXIT(&cprinfo);
7973 }
7974 
7975 /*
7976  * Arrange for all the phci client paths (pp_head) for the specified client
7977  * to be bus configured asynchronously by a thread.
7978  */
7979 static void
7980 config_client_paths_async(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
7981     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
7982 {
7983 	mdi_async_client_config_t *acc, *newacc;
7984 	int create_thread;
7985 
7986 	if (pp_head == NULL)
7987 		return;
7988 
7989 	if (mdi_mtc_off) {
7990 		config_client_paths_sync(vhc, ct_name, ct_addr, pp_head, tok);
7991 		free_phclient_path_list(pp_head);
7992 		return;
7993 	}
7994 
7995 	newacc = alloc_async_client_config(ct_name, ct_addr, pp_head, tok);
7996 	ASSERT(newacc);
7997 
7998 	mutex_enter(&vhc->vhc_lock);
7999 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc->acc_next) {
8000 		if (strcmp(ct_name, acc->acc_ct_name) == 0 &&
8001 		    strcmp(ct_addr, acc->acc_ct_addr) == 0) {
8002 			free_async_client_config(newacc);
8003 			mutex_exit(&vhc->vhc_lock);
8004 			return;
8005 		}
8006 	}
8007 
8008 	if (vhc->vhc_acc_list_head == NULL)
8009 		vhc->vhc_acc_list_head = newacc;
8010 	else
8011 		vhc->vhc_acc_list_tail->acc_next = newacc;
8012 	vhc->vhc_acc_list_tail = newacc;
8013 	vhc->vhc_acc_count++;
8014 	if (vhc->vhc_acc_count <= vhc->vhc_acc_thrcount) {
8015 		cv_broadcast(&vhc->vhc_cv);
8016 		create_thread = 0;
8017 	} else {
8018 		vhc->vhc_acc_thrcount++;
8019 		create_thread = 1;
8020 	}
8021 	mutex_exit(&vhc->vhc_lock);
8022 
8023 	if (create_thread)
8024 		(void) thread_create(NULL, 0, config_client_paths_thread, vhc,
8025 		    0, &p0, TS_RUN, minclsyspri);
8026 }
8027 
8028 /*
8029  * Return number of online paths for the specified client.
8030  */
8031 static int
8032 nonline_paths(mdi_vhcache_client_t *cct)
8033 {
8034 	mdi_vhcache_pathinfo_t *cpi;
8035 	int online_count = 0;
8036 
8037 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8038 		if (cpi->cpi_pip != NULL) {
8039 			MDI_PI_LOCK(cpi->cpi_pip);
8040 			if (cpi->cpi_pip->pi_state == MDI_PATHINFO_STATE_ONLINE)
8041 				online_count++;
8042 			MDI_PI_UNLOCK(cpi->cpi_pip);
8043 		}
8044 	}
8045 
8046 	return (online_count);
8047 }
8048 
8049 /*
8050  * Bus configure all paths for the specified vhci client.
8051  * If at least one path for the client is already online, the remaining paths
8052  * will be configured asynchronously. Otherwise, it synchronously configures
8053  * the paths until at least one path is online and then rest of the paths
8054  * will be configured asynchronously.
8055  */
8056 static void
8057 config_client_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr)
8058 {
8059 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8060 	mdi_phys_path_t *pp_head, *pp;
8061 	mdi_vhcache_client_t *cct;
8062 	mdi_vhcache_lookup_token_t tok;
8063 
8064 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
8065 
8066 	init_vhcache_lookup_token(&tok, NULL);
8067 
8068 	if (ct_name == NULL || ct_addr == NULL ||
8069 	    (cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, &tok))
8070 	    == NULL ||
8071 	    (pp_head = build_phclient_path_list(cct, ct_name)) == NULL) {
8072 		rw_exit(&vhcache->vhcache_lock);
8073 		return;
8074 	}
8075 
8076 	/* if at least one path is online, configure the rest asynchronously */
8077 	if (nonline_paths(cct) > 0) {
8078 		rw_exit(&vhcache->vhcache_lock);
8079 		config_client_paths_async(vhc, ct_name, ct_addr, pp_head, &tok);
8080 		return;
8081 	}
8082 
8083 	rw_exit(&vhcache->vhcache_lock);
8084 
8085 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next) {
8086 		if (bus_config_one_phci_child(pp->phys_path) == MDI_SUCCESS) {
8087 			rw_enter(&vhcache->vhcache_lock, RW_READER);
8088 
8089 			if ((cct = lookup_vhcache_client(vhcache, ct_name,
8090 			    ct_addr, &tok)) == NULL) {
8091 				rw_exit(&vhcache->vhcache_lock);
8092 				goto out;
8093 			}
8094 
8095 			if (nonline_paths(cct) > 0 &&
8096 			    pp->phys_path_next != NULL) {
8097 				rw_exit(&vhcache->vhcache_lock);
8098 				config_client_paths_async(vhc, ct_name, ct_addr,
8099 				    pp->phys_path_next, &tok);
8100 				pp->phys_path_next = NULL;
8101 				goto out;
8102 			}
8103 
8104 			rw_exit(&vhcache->vhcache_lock);
8105 		}
8106 	}
8107 
8108 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, &tok);
8109 out:
8110 	free_phclient_path_list(pp_head);
8111 }
8112 
8113 static void
8114 single_threaded_vhconfig_enter(mdi_vhci_config_t *vhc)
8115 {
8116 	mutex_enter(&vhc->vhc_lock);
8117 	while (vhc->vhc_flags & MDI_VHC_SINGLE_THREADED)
8118 		cv_wait(&vhc->vhc_cv, &vhc->vhc_lock);
8119 	vhc->vhc_flags |= MDI_VHC_SINGLE_THREADED;
8120 	mutex_exit(&vhc->vhc_lock);
8121 }
8122 
8123 static void
8124 single_threaded_vhconfig_exit(mdi_vhci_config_t *vhc)
8125 {
8126 	mutex_enter(&vhc->vhc_lock);
8127 	vhc->vhc_flags &= ~MDI_VHC_SINGLE_THREADED;
8128 	cv_broadcast(&vhc->vhc_cv);
8129 	mutex_exit(&vhc->vhc_lock);
8130 }
8131 
/*
 * Describes one phci driver: its name and whether it supports the root
 * device.
 */
typedef struct mdi_phci_driver_info {
	/* name of the phci driver */
	char	*phdriver_name;

	/* set to non zero if the phci driver supports root device */
	int	phdriver_root_support;
} mdi_phci_driver_info_t;
8138 
8139 /*
8140  * vhci class and root support capability of a phci driver can be
8141  * specified using ddi-vhci-class and ddi-no-root-support properties in the
8142  * phci driver.conf file. The built-in tables below contain this information
8143  * for those phci drivers whose driver.conf files don't yet contain this info.
8144  *
 * All phci drivers except iscsi have root device support.
8146  */
8147 static mdi_phci_driver_info_t scsi_phci_driver_list[] = {
8148 	{ "fp", 1 },
8149 	{ "iscsi", 0 },
8150 	{ "ibsrp", 1 }
8151 	};
8152 
8153 static mdi_phci_driver_info_t ib_phci_driver_list[] = { "tavor", 1 };
8154 
8155 static void *
8156 mdi_realloc(void *old_ptr, size_t old_size, size_t new_size)
8157 {
8158 	void *new_ptr;
8159 
8160 	new_ptr = kmem_zalloc(new_size, KM_SLEEP);
8161 	if (old_ptr) {
8162 		bcopy(old_ptr, new_ptr, MIN(old_size, new_size));
8163 		kmem_free(old_ptr, old_size);
8164 	}
8165 	return (new_ptr);
8166 }
8167 
8168 static void
8169 add_to_phci_list(char ***driver_list, int **root_support_list,
8170     int *cur_elements, int *max_elements, char *driver_name, int root_support)
8171 {
8172 	ASSERT(*cur_elements <= *max_elements);
8173 	if (*cur_elements == *max_elements) {
8174 		*max_elements += 10;
8175 		*driver_list = mdi_realloc(*driver_list,
8176 		    sizeof (char *) * (*cur_elements),
8177 		    sizeof (char *) * (*max_elements));
8178 		*root_support_list = mdi_realloc(*root_support_list,
8179 		    sizeof (int) * (*cur_elements),
8180 		    sizeof (int) * (*max_elements));
8181 	}
8182 	(*driver_list)[*cur_elements] = i_ddi_strdup(driver_name, KM_SLEEP);
8183 	(*root_support_list)[*cur_elements] = root_support;
8184 	(*cur_elements)++;
8185 }
8186 
8187 static void
8188 get_phci_driver_list(char *vhci_class, char ***driver_list,
8189     int **root_support_list, int *cur_elements, int *max_elements)
8190 {
8191 	mdi_phci_driver_info_t	*st_driver_list, *p;
8192 	int		st_ndrivers, root_support, i, j, driver_conf_count;
8193 	major_t		m;
8194 	struct devnames	*dnp;
8195 	ddi_prop_t	*propp;
8196 
8197 	*driver_list = NULL;
8198 	*root_support_list = NULL;
8199 	*cur_elements = 0;
8200 	*max_elements = 0;
8201 
8202 	/* add the phci drivers derived from the phci driver.conf files */
8203 	for (m = 0; m < devcnt; m++) {
8204 		dnp = &devnamesp[m];
8205 
8206 		if (dnp->dn_flags & DN_PHCI_DRIVER) {
8207 			LOCK_DEV_OPS(&dnp->dn_lock);
8208 			if (dnp->dn_global_prop_ptr != NULL &&
8209 			    (propp = i_ddi_prop_search(DDI_DEV_T_ANY,
8210 			    DDI_VHCI_CLASS, DDI_PROP_TYPE_STRING,
8211 			    &dnp->dn_global_prop_ptr->prop_list)) != NULL &&
8212 			    strcmp(propp->prop_val, vhci_class) == 0) {
8213 
8214 				root_support = (i_ddi_prop_search(DDI_DEV_T_ANY,
8215 				    DDI_NO_ROOT_SUPPORT, DDI_PROP_TYPE_INT,
8216 				    &dnp->dn_global_prop_ptr->prop_list)
8217 				    == NULL) ? 1 : 0;
8218 
8219 				add_to_phci_list(driver_list, root_support_list,
8220 				    cur_elements, max_elements, dnp->dn_name,
8221 				    root_support);
8222 
8223 				UNLOCK_DEV_OPS(&dnp->dn_lock);
8224 			} else
8225 				UNLOCK_DEV_OPS(&dnp->dn_lock);
8226 		}
8227 	}
8228 
8229 	driver_conf_count = *cur_elements;
8230 
8231 	/* add the phci drivers specified in the built-in tables */
8232 	if (strcmp(vhci_class, MDI_HCI_CLASS_SCSI) == 0) {
8233 		st_driver_list = scsi_phci_driver_list;
8234 		st_ndrivers = sizeof (scsi_phci_driver_list) /
8235 		    sizeof (mdi_phci_driver_info_t);
8236 	} else if (strcmp(vhci_class, MDI_HCI_CLASS_IB) == 0) {
8237 		st_driver_list = ib_phci_driver_list;
8238 		st_ndrivers = sizeof (ib_phci_driver_list) /
8239 		    sizeof (mdi_phci_driver_info_t);
8240 	} else {
8241 		st_driver_list = NULL;
8242 		st_ndrivers = 0;
8243 	}
8244 
8245 	for (i = 0, p = st_driver_list; i < st_ndrivers; i++, p++) {
8246 		/* add this phci driver if not already added before */
8247 		for (j = 0; j < driver_conf_count; j++) {
8248 			if (strcmp((*driver_list)[j], p->phdriver_name) == 0)
8249 				break;
8250 		}
8251 		if (j == driver_conf_count) {
8252 			add_to_phci_list(driver_list, root_support_list,
8253 			    cur_elements, max_elements, p->phdriver_name,
8254 			    p->phdriver_root_support);
8255 		}
8256 	}
8257 }
8258 
8259 /*
8260  * Attach the phci driver instances associated with the specified vhci class.
8261  * If root is mounted attach all phci driver instances.
8262  * If root is not mounted, attach the instances of only those phci
8263  * drivers that have the root support.
8264  */
8265 static void
8266 attach_phci_drivers(char *vhci_class)
8267 {
8268 	char	**driver_list, **p;
8269 	int	*root_support_list;
8270 	int	cur_elements, max_elements, i;
8271 	major_t	m;
8272 
8273 	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
8274 	    &cur_elements, &max_elements);
8275 
8276 	for (i = 0; i < cur_elements; i++) {
8277 		if (modrootloaded || root_support_list[i]) {
8278 			m = ddi_name_to_major(driver_list[i]);
8279 			if (m != (major_t)-1 && ddi_hold_installed_driver(m))
8280 				ddi_rele_driver(m);
8281 		}
8282 	}
8283 
8284 	if (driver_list) {
8285 		for (i = 0, p = driver_list; i < cur_elements; i++, p++)
8286 			kmem_free(*p, strlen(*p) + 1);
8287 		kmem_free(driver_list, sizeof (char *) * max_elements);
8288 		kmem_free(root_support_list, sizeof (int) * max_elements);
8289 	}
8290 }
8291 
8292 /*
8293  * Build vhci cache:
8294  *
8295  * Attach phci driver instances and then drive BUS_CONFIG_ALL on
8296  * the phci driver instances. During this process the cache gets built.
8297  *
8298  * Cache is built fully if the root is mounted.
8299  * If the root is not mounted, phci drivers that do not have root support
8300  * are not attached. As a result the cache is built partially. The entries
8301  * in the cache reflect only those phci drivers that have root support.
8302  */
8303 static int
8304 build_vhci_cache(mdi_vhci_t *vh)
8305 {
8306 	mdi_vhci_config_t *vhc = vh->vh_config;
8307 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8308 
8309 	single_threaded_vhconfig_enter(vhc);
8310 
8311 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8312 	if (vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE) {
8313 		rw_exit(&vhcache->vhcache_lock);
8314 		single_threaded_vhconfig_exit(vhc);
8315 		return (0);
8316 	}
8317 	rw_exit(&vhcache->vhcache_lock);
8318 
8319 	attach_phci_drivers(vh->vh_class);
8320 	bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE | NDI_NO_EVENT,
8321 	    BUS_CONFIG_ALL, (major_t)-1);
8322 
8323 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8324 	vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
8325 	rw_exit(&vhcache->vhcache_lock);
8326 
8327 	single_threaded_vhconfig_exit(vhc);
8328 	vhcache_dirty(vhc);
8329 	return (1);
8330 }
8331 
8332 /*
8333  * Determine if discovery of paths is needed.
8334  */
8335 static int
8336 vhcache_do_discovery(mdi_vhci_config_t *vhc)
8337 {
8338 	int rv = 1;
8339 
8340 	mutex_enter(&vhc->vhc_lock);
8341 	if (i_ddi_io_initialized() == 0) {
8342 		if (vhc->vhc_path_discovery_boot > 0) {
8343 			vhc->vhc_path_discovery_boot--;
8344 			goto out;
8345 		}
8346 	} else {
8347 		if (vhc->vhc_path_discovery_postboot > 0) {
8348 			vhc->vhc_path_discovery_postboot--;
8349 			goto out;
8350 		}
8351 	}
8352 
8353 	/*
8354 	 * Do full path discovery at most once per mdi_path_discovery_interval.
8355 	 * This is to avoid a series of full path discoveries when opening
8356 	 * stale /dev/[r]dsk links.
8357 	 */
8358 	if (mdi_path_discovery_interval != -1 &&
8359 	    lbolt64 >= vhc->vhc_path_discovery_cutoff_time)
8360 		goto out;
8361 
8362 	rv = 0;
8363 out:
8364 	mutex_exit(&vhc->vhc_lock);
8365 	return (rv);
8366 }
8367 
8368 /*
8369  * Discover all paths:
8370  *
8371  * Attach phci driver instances and then drive BUS_CONFIG_ALL on all the phci
8372  * driver instances. During this process all paths will be discovered.
8373  */
8374 static int
8375 vhcache_discover_paths(mdi_vhci_t *vh)
8376 {
8377 	mdi_vhci_config_t *vhc = vh->vh_config;
8378 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8379 	int rv = 0;
8380 
8381 	single_threaded_vhconfig_enter(vhc);
8382 
8383 	if (vhcache_do_discovery(vhc)) {
8384 		attach_phci_drivers(vh->vh_class);
8385 		bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE |
8386 		    NDI_NO_EVENT, BUS_CONFIG_ALL, (major_t)-1);
8387 
8388 		mutex_enter(&vhc->vhc_lock);
8389 		vhc->vhc_path_discovery_cutoff_time = lbolt64 +
8390 		    mdi_path_discovery_interval * TICKS_PER_SECOND;
8391 		mutex_exit(&vhc->vhc_lock);
8392 		rv = 1;
8393 	}
8394 
8395 	single_threaded_vhconfig_exit(vhc);
8396 	return (rv);
8397 }
8398 
8399 /*
8400  * Generic vhci bus config implementation:
8401  *
8402  * Parameters
8403  *	vdip	vhci dip
8404  *	flags	bus config flags
8405  *	op	bus config operation
8406  *	The remaining parameters are bus config operation specific
8407  *
8408  * for BUS_CONFIG_ONE
8409  *	arg	pointer to name@addr
8410  *	child	upon successful return from this function, *child will be
8411  *		set to the configured and held devinfo child node of vdip.
8412  *	ct_addr	pointer to client address (i.e. GUID)
8413  *
8414  * for BUS_CONFIG_DRIVER
8415  *	arg	major number of the driver
8416  *	child and ct_addr parameters are ignored
8417  *
8418  * for BUS_CONFIG_ALL
8419  *	arg, child, and ct_addr parameters are ignored
8420  *
8421  * Note that for the rest of the bus config operations, this function simply
8422  * calls the framework provided default bus config routine.
8423  */
8424 int
8425 mdi_vhci_bus_config(dev_info_t *vdip, uint_t flags, ddi_bus_config_op_t op,
8426     void *arg, dev_info_t **child, char *ct_addr)
8427 {
8428 	mdi_vhci_t *vh = i_devi_get_vhci(vdip);
8429 	mdi_vhci_config_t *vhc = vh->vh_config;
8430 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8431 	int rv = 0;
8432 	int params_valid = 0;
8433 	char *cp;
8434 
8435 	/*
8436 	 * To bus config vhcis we relay operation, possibly using another
8437 	 * thread, to phcis. The phci driver then interacts with MDI to cause
8438 	 * vhci child nodes to be enumerated under the vhci node.  Adding a
8439 	 * vhci child requires an ndi_devi_enter of the vhci. Since another
8440 	 * thread may be adding the child, to avoid deadlock we can't wait
8441 	 * for the relayed operations to complete if we have already entered
8442 	 * the vhci node.
8443 	 */
8444 	if (DEVI_BUSY_OWNED(vdip)) {
8445 		MDI_DEBUG(2, (CE_NOTE, vdip, "!MDI: vhci bus config: "
8446 		    "vhci dip is busy owned %p\n", (void *)vdip));
8447 		goto default_bus_config;
8448 	}
8449 
8450 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8451 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
8452 		rw_exit(&vhcache->vhcache_lock);
8453 		rv = build_vhci_cache(vh);
8454 		rw_enter(&vhcache->vhcache_lock, RW_READER);
8455 	}
8456 
8457 	switch (op) {
8458 	case BUS_CONFIG_ONE:
8459 		if (arg != NULL && ct_addr != NULL) {
8460 			/* extract node name */
8461 			cp = (char *)arg;
8462 			while (*cp != '\0' && *cp != '@')
8463 				cp++;
8464 			if (*cp == '@') {
8465 				params_valid = 1;
8466 				*cp = '\0';
8467 				config_client_paths(vhc, (char *)arg, ct_addr);
8468 				/* config_client_paths() releases cache_lock */
8469 				*cp = '@';
8470 				break;
8471 			}
8472 		}
8473 
8474 		rw_exit(&vhcache->vhcache_lock);
8475 		break;
8476 
8477 	case BUS_CONFIG_DRIVER:
8478 		rw_exit(&vhcache->vhcache_lock);
8479 		if (rv == 0)
8480 			st_bus_config_all_phcis(vhc, flags, op,
8481 			    (major_t)(uintptr_t)arg);
8482 		break;
8483 
8484 	case BUS_CONFIG_ALL:
8485 		rw_exit(&vhcache->vhcache_lock);
8486 		if (rv == 0)
8487 			st_bus_config_all_phcis(vhc, flags, op, -1);
8488 		break;
8489 
8490 	default:
8491 		rw_exit(&vhcache->vhcache_lock);
8492 		break;
8493 	}
8494 
8495 
8496 default_bus_config:
8497 	/*
8498 	 * All requested child nodes are enumerated under the vhci.
8499 	 * Now configure them.
8500 	 */
8501 	if (ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
8502 	    NDI_SUCCESS) {
8503 		return (MDI_SUCCESS);
8504 	} else if (op == BUS_CONFIG_ONE && rv == 0 && params_valid) {
8505 		/* discover all paths and try configuring again */
8506 		if (vhcache_discover_paths(vh) &&
8507 		    ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
8508 		    NDI_SUCCESS)
8509 			return (MDI_SUCCESS);
8510 	}
8511 
8512 	return (MDI_FAILURE);
8513 }
8514 
8515 /*
8516  * Read the on-disk vhci cache into an nvlist for the specified vhci class.
8517  */
8518 static nvlist_t *
8519 read_on_disk_vhci_cache(char *vhci_class)
8520 {
8521 	nvlist_t *nvl;
8522 	int err;
8523 	char *filename;
8524 
8525 	filename = vhclass2vhcache_filename(vhci_class);
8526 
8527 	if ((err = fread_nvlist(filename, &nvl)) == 0) {
8528 		kmem_free(filename, strlen(filename) + 1);
8529 		return (nvl);
8530 	} else if (err == EIO)
8531 		cmn_err(CE_WARN, "%s: I/O error, will recreate\n", filename);
8532 	else if (err == EINVAL)
8533 		cmn_err(CE_WARN,
8534 		    "%s: data file corrupted, will recreate\n", filename);
8535 
8536 	kmem_free(filename, strlen(filename) + 1);
8537 	return (NULL);
8538 }
8539 
8540 /*
8541  * Read on-disk vhci cache into nvlists for all vhci classes.
8542  * Called during booting by i_ddi_read_devices_files().
8543  */
8544 void
8545 mdi_read_devices_files(void)
8546 {
8547 	int i;
8548 
8549 	for (i = 0; i < N_VHCI_CLASSES; i++)
8550 		vhcache_nvl[i] = read_on_disk_vhci_cache(vhci_class_list[i]);
8551 }
8552 
8553 /*
8554  * Remove all stale entries from vhci cache.
8555  */
8556 static void
8557 clean_vhcache(mdi_vhci_config_t *vhc)
8558 {
8559 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8560 	mdi_vhcache_phci_t *cphci, *cphci_head, *cphci_next;
8561 	mdi_vhcache_client_t *cct, *cct_head, *cct_next;
8562 	mdi_vhcache_pathinfo_t *cpi, *cpi_head, *cpi_next;
8563 
8564 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8565 
8566 	cct_head = vhcache->vhcache_client_head;
8567 	vhcache->vhcache_client_head = vhcache->vhcache_client_tail = NULL;
8568 	for (cct = cct_head; cct != NULL; cct = cct_next) {
8569 		cct_next = cct->cct_next;
8570 
8571 		cpi_head = cct->cct_cpi_head;
8572 		cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
8573 		for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
8574 			cpi_next = cpi->cpi_next;
8575 			if (cpi->cpi_pip != NULL) {
8576 				ASSERT(cpi->cpi_cphci->cphci_phci != NULL);
8577 				enqueue_tail_vhcache_pathinfo(cct, cpi);
8578 			} else
8579 				free_vhcache_pathinfo(cpi);
8580 		}
8581 
8582 		if (cct->cct_cpi_head != NULL)
8583 			enqueue_vhcache_client(vhcache, cct);
8584 		else {
8585 			(void) mod_hash_destroy(vhcache->vhcache_client_hash,
8586 			    (mod_hash_key_t)cct->cct_name_addr);
8587 			free_vhcache_client(cct);
8588 		}
8589 	}
8590 
8591 	cphci_head = vhcache->vhcache_phci_head;
8592 	vhcache->vhcache_phci_head = vhcache->vhcache_phci_tail = NULL;
8593 	for (cphci = cphci_head; cphci != NULL; cphci = cphci_next) {
8594 		cphci_next = cphci->cphci_next;
8595 		if (cphci->cphci_phci != NULL)
8596 			enqueue_vhcache_phci(vhcache, cphci);
8597 		else
8598 			free_vhcache_phci(cphci);
8599 	}
8600 
8601 	vhcache->vhcache_clean_time = lbolt64;
8602 	rw_exit(&vhcache->vhcache_lock);
8603 	vhcache_dirty(vhc);
8604 }
8605 
8606 /*
8607  * Remove all stale entries from vhci cache.
8608  * Called by i_ddi_clean_devices_files() during the execution of devfsadm -C
8609  */
8610 void
8611 mdi_clean_vhcache(void)
8612 {
8613 	mdi_vhci_t *vh;
8614 
8615 	mutex_enter(&mdi_mutex);
8616 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
8617 		vh->vh_refcnt++;
8618 		mutex_exit(&mdi_mutex);
8619 		clean_vhcache(vh->vh_config);
8620 		mutex_enter(&mdi_mutex);
8621 		vh->vh_refcnt--;
8622 	}
8623 	mutex_exit(&mdi_mutex);
8624 }
8625 
8626 /*
8627  * mdi_vhci_walk_clients():
8628  *		Walker routine to traverse client dev_info nodes
8629  * ddi_walk_devs(ddi_get_child(vdip), f, arg) returns the entire tree
8630  * below the client, including nexus devices, which we dont want.
8631  * So we just traverse the immediate siblings, starting from 1st client.
8632  */
8633 void
8634 mdi_vhci_walk_clients(dev_info_t *vdip,
8635     int (*f)(dev_info_t *, void *), void *arg)
8636 {
8637 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
8638 	dev_info_t	*cdip;
8639 	mdi_client_t	*ct;
8640 
8641 	MDI_VHCI_CLIENT_LOCK(vh);
8642 	cdip = ddi_get_child(vdip);
8643 	while (cdip) {
8644 		ct = i_devi_get_client(cdip);
8645 		MDI_CLIENT_LOCK(ct);
8646 
8647 		if (((*f)(cdip, arg)) == DDI_WALK_CONTINUE)
8648 			cdip = ddi_get_next_sibling(cdip);
8649 		else
8650 			cdip = NULL;
8651 
8652 		MDI_CLIENT_UNLOCK(ct);
8653 	}
8654 	MDI_VHCI_CLIENT_UNLOCK(vh);
8655 }
8656 
8657 /*
8658  * mdi_vhci_walk_phcis():
8659  *		Walker routine to traverse phci dev_info nodes
8660  */
8661 void
8662 mdi_vhci_walk_phcis(dev_info_t *vdip,
8663     int (*f)(dev_info_t *, void *), void *arg)
8664 {
8665 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
8666 	mdi_phci_t	*ph, *next;
8667 
8668 	MDI_VHCI_PHCI_LOCK(vh);
8669 	ph = vh->vh_phci_head;
8670 	while (ph) {
8671 		MDI_PHCI_LOCK(ph);
8672 
8673 		if (((*f)(ph->ph_dip, arg)) == DDI_WALK_CONTINUE)
8674 			next = ph->ph_next;
8675 		else
8676 			next = NULL;
8677 
8678 		MDI_PHCI_UNLOCK(ph);
8679 		ph = next;
8680 	}
8681 	MDI_VHCI_PHCI_UNLOCK(vh);
8682 }
8683 
8684 
8685 /*
8686  * mdi_walk_vhcis():
8687  *		Walker routine to traverse vhci dev_info nodes
8688  */
8689 void
8690 mdi_walk_vhcis(int (*f)(dev_info_t *, void *), void *arg)
8691 {
8692 	mdi_vhci_t	*vh = NULL;
8693 
8694 	mutex_enter(&mdi_mutex);
8695 	/*
8696 	 * Scan for already registered vhci
8697 	 */
8698 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
8699 		vh->vh_refcnt++;
8700 		mutex_exit(&mdi_mutex);
8701 		if (((*f)(vh->vh_dip, arg)) != DDI_WALK_CONTINUE) {
8702 			mutex_enter(&mdi_mutex);
8703 			vh->vh_refcnt--;
8704 			break;
8705 		} else {
8706 			mutex_enter(&mdi_mutex);
8707 			vh->vh_refcnt--;
8708 		}
8709 	}
8710 
8711 	mutex_exit(&mdi_mutex);
8712 }
8713 
8714 /*
8715  * i_mdi_log_sysevent():
8716  *		Logs events for pickup by syseventd
8717  */
8718 static void
8719 i_mdi_log_sysevent(dev_info_t *dip, char *ph_vh_class, char *subclass)
8720 {
8721 	char		*path_name;
8722 	nvlist_t	*attr_list;
8723 
8724 	if (nvlist_alloc(&attr_list, NV_UNIQUE_NAME_TYPE,
8725 	    KM_SLEEP) != DDI_SUCCESS) {
8726 		goto alloc_failed;
8727 	}
8728 
8729 	path_name = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
8730 	(void) ddi_pathname(dip, path_name);
8731 
8732 	if (nvlist_add_string(attr_list, DDI_DRIVER_NAME,
8733 	    ddi_driver_name(dip)) != DDI_SUCCESS) {
8734 		goto error;
8735 	}
8736 
8737 	if (nvlist_add_int32(attr_list, DDI_DRIVER_MAJOR,
8738 	    (int32_t)ddi_driver_major(dip)) != DDI_SUCCESS) {
8739 		goto error;
8740 	}
8741 
8742 	if (nvlist_add_int32(attr_list, DDI_INSTANCE,
8743 	    (int32_t)ddi_get_instance(dip)) != DDI_SUCCESS) {
8744 		goto error;
8745 	}
8746 
8747 	if (nvlist_add_string(attr_list, DDI_PATHNAME,
8748 	    path_name) != DDI_SUCCESS) {
8749 		goto error;
8750 	}
8751 
8752 	if (nvlist_add_string(attr_list, DDI_CLASS,
8753 	    ph_vh_class) != DDI_SUCCESS) {
8754 		goto error;
8755 	}
8756 
8757 	(void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW, EC_DDI, subclass,
8758 	    attr_list, NULL, DDI_SLEEP);
8759 
8760 error:
8761 	kmem_free(path_name, MAXPATHLEN);
8762 	nvlist_free(attr_list);
8763 	return;
8764 
8765 alloc_failed:
8766 	MDI_DEBUG(1, (CE_WARN, dip,
8767 	    "!i_mdi_log_sysevent: Unable to send sysevent"));
8768 }
8769 
/*
 * mdi_get_phci_driver_list():
 *		Return the phci driver name list for the given vhci class.
 *
 * The list and its companion root-support array are obtained from
 * get_phci_driver_list().  When a list is returned, the root-support array
 * is freed and the list is passed through mdi_realloc() to go from
 * max_elements slots to cur_elements slots.
 *
 * *ndrivers is always set to the current element count reported by
 * get_phci_driver_list().  The return value is the resized list, or NULL
 * when get_phci_driver_list() produced no list.
 */
char **
mdi_get_phci_driver_list(char *vhci_class, int	*ndrivers)
{
	char	**drivers;
	int	*root_support;
	int	nused, nalloc;

	get_phci_driver_list(vhci_class, &drivers, &root_support,
	    &nused, &nalloc);

	*ndrivers = nused;
	if (drivers == NULL)
		return (NULL);

	kmem_free(root_support, sizeof (int) * nalloc);
	return (mdi_realloc(drivers, sizeof (char *) * nalloc,
	    sizeof (char *) * nused));
}
8791 
/*
 * mdi_free_phci_driver_list():
 *		Free a driver name list of ndrivers entries.
 *
 * Each of the ndrivers strings is freed using its strlen() + 1 as the
 * allocation size, and then the array of ndrivers pointers itself is freed.
 * A NULL driver_list is a no-op.
 */
void
mdi_free_phci_driver_list(char **driver_list, int ndrivers)
{
	int	idx;

	if (driver_list == NULL)
		return;

	for (idx = 0; idx < ndrivers; idx++)
		kmem_free(driver_list[idx], strlen(driver_list[idx]) + 1);

	kmem_free(driver_list, sizeof (char *) * ndrivers);
}
8804