xref: /titanic_50/usr/src/uts/common/os/sunmdi.c (revision 910cba4f2f1e94daf355ee8635285732ac47326c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 #pragma ident	"%Z%%M%	%I%	%E% SMI"
26 
27 /*
28  * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more
29  * detailed discussion of the overall mpxio architecture.
30  *
31  * Default locking order:
32  *
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_phci_mutex))
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_client_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_phci_mutex mdi_phci::ph_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_client_mutex mdi_client::ct_mutex))
37  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
38  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
39  * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
40  */
41 
42 #include <sys/note.h>
43 #include <sys/types.h>
44 #include <sys/varargs.h>
45 #include <sys/param.h>
46 #include <sys/errno.h>
47 #include <sys/uio.h>
48 #include <sys/buf.h>
49 #include <sys/modctl.h>
50 #include <sys/open.h>
51 #include <sys/kmem.h>
52 #include <sys/poll.h>
53 #include <sys/conf.h>
54 #include <sys/bootconf.h>
55 #include <sys/cmn_err.h>
56 #include <sys/stat.h>
57 #include <sys/ddi.h>
58 #include <sys/sunddi.h>
59 #include <sys/ddipropdefs.h>
60 #include <sys/sunndi.h>
61 #include <sys/ndi_impldefs.h>
62 #include <sys/promif.h>
63 #include <sys/sunmdi.h>
64 #include <sys/mdi_impldefs.h>
65 #include <sys/taskq.h>
66 #include <sys/epm.h>
67 #include <sys/sunpm.h>
68 #include <sys/modhash.h>
69 #include <sys/disp.h>
70 #include <sys/autoconf.h>
71 
#ifdef	DEBUG
#include <sys/debug.h>
/* messages with a level <= mdi_debug are passed to i_mdi_log() */
int	mdi_debug = 1;
int	mdi_debug_logonly = 0;
/*
 * MDI_DEBUG(level, (args)):
 *		Call i_mdi_log with the parenthesized argument list 'stmnt'
 *		when mdi_debug is at least 'level'.  The expansion is wrapped
 *		in do { } while (0) so that it is a single statement and an
 *		'else' following a call site binds to the caller's own 'if'
 *		rather than to the 'if' inside the macro.
 */
#define	MDI_DEBUG(level, stmnt) \
	    do { \
		    if (mdi_debug >= (level)) \
			    i_mdi_log stmnt; \
		    _NOTE(CONSTCOND) \
	    } while (0)
static void i_mdi_log(int, dev_info_t *, const char *fmt, ...);
#else	/* !DEBUG */
/* compiled out entirely on non-DEBUG kernels */
#define	MDI_DEBUG(level, stmnt)
#endif	/* DEBUG */
82 
/* thread priority used for the mdi taskq (see MDI_TASKQ_PRI) */
extern pri_t	minclsyspri;
/* NOTE(review): defined elsewhere; not referenced in this part of the file */
extern int	modrootloaded;

/*
 * Global mutex:
 * Protects vHCI list and structure members.
 * Initialized once by i_mdi_init().
 */
kmutex_t	mdi_mutex;

/*
 * Registered vHCI class driver lists
 * Singly linked through vh_next; mdi_vhci_register() appends at the tail
 * and mdi_vhci_unregister() unlinks, both while holding mdi_mutex.
 */
int		mdi_vhci_count;
mdi_vhci_t	*mdi_vhci_head;
mdi_vhci_t	*mdi_vhci_tail;
98 
/*
 * Client Hash Table size
 * Number of struct client_hash entries allocated for each vHCI's
 * vh_client_table in mdi_vhci_register() and freed in
 * mdi_vhci_unregister().
 */
static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;

/*
 * taskq interface definitions
 * These are the arguments handed to taskq_create() by i_mdi_init(): thread
 * count, thread priority and the minalloc/maxalloc values, the latter two
 * scaled by the configured thread count.
 */
#define	MDI_TASKQ_N_THREADS	8
#define	MDI_TASKQ_PRI		minclsyspri
#define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
#define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)

/* the framework taskq, created by i_mdi_init() */
taskq_t				*mdi_taskq;
static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;

/* one second (1000000 microseconds) converted by drv_usectohz() */
#define	TICKS_PER_SECOND	(drv_usectohz(1000000))
116 
/*
 * The data should be "quiet" for this interval (in seconds) before the
 * vhci cached data is flushed to the disk.
 */
static int mdi_vhcache_flush_delay = 10;

/* number of seconds the vhcache flush daemon will sleep idle before exiting */
static int mdi_vhcache_flush_daemon_idle_time = 60;

/*
 * MDI falls back to discovery of all paths when a bus_config_one fails.
 * The following parameters can be used to tune this operation.
 *
 * mdi_path_discovery_boot
 *	Number of times path discovery will be attempted during early boot.
 *	Probably there is no reason to ever set this value to greater than one.
 *
 * mdi_path_discovery_postboot
 *	Number of times path discovery will be attempted after early boot.
 *	Set it to a minimum of two to allow for discovery of iscsi paths which
 *	may happen very late during booting.
 *
 * mdi_path_discovery_interval
 *	Minimum number of seconds MDI will wait between successive discovery
 *	of all paths. Set it to -1 to disable discovery of all paths.
 */
static int mdi_path_discovery_boot = 1;
static int mdi_path_discovery_postboot = 2;
static int mdi_path_discovery_interval = 10;

/*
 * number of seconds the asynchronous configuration thread will sleep idle
 * before exiting.
 */
static int mdi_async_config_idle_time = 600;

/*
 * NOTE(review): presumably the number of buckets in the bus config cache
 * hash; its use is not visible in this part of the file - confirm.
 */
static int mdi_bus_config_cache_hash_size = 256;

/* turns off multithreaded configuration for certain operations */
static int mdi_mtc_off = 0;
157 
/*
 * MDI component property name/value string definitions
 */
const char 		*mdi_component_prop = "mpxio-component";
const char		*mdi_component_prop_vhci = "vhci";
const char		*mdi_component_prop_phci = "phci";
const char		*mdi_component_prop_client = "client";

/*
 * MDI client global unique identifier property name
 */
const char		*mdi_client_guid_prop = "client-guid";

/*
 * MDI client load balancing property name/value string definitions
 */
const char		*mdi_load_balance = "load-balance";
const char		*mdi_load_balance_none = "none";
const char		*mdi_load_balance_rr = "round-robin";
const char		*mdi_load_balance_lba = "logical-block";

/*
 * Obsolete vHCI class definition; to be removed after Leadville update
 */
const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;

/*
 * cmn_err() format used by mdi_vhci_register() when a vHCI with valid ops
 * is already registered for the requested class; %s is the class name.
 */
static char vhci_greeting[] =
	"\tThere already exists one vHCI driver for class %s\n"
	"\tOnly one vHCI driver for each class is allowed\n";
187 
188 /*
189  * Static function prototypes
190  */
191 static int		i_mdi_phci_offline(dev_info_t *, uint_t);
192 static int		i_mdi_client_offline(dev_info_t *, uint_t);
193 static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
194 static void		i_mdi_phci_post_detach(dev_info_t *,
195 			    ddi_detach_cmd_t, int);
196 static int		i_mdi_client_pre_detach(dev_info_t *,
197 			    ddi_detach_cmd_t);
198 static void		i_mdi_client_post_detach(dev_info_t *,
199 			    ddi_detach_cmd_t, int);
200 static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
201 static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
202 static int 		i_mdi_lba_lb(mdi_client_t *ct,
203 			    mdi_pathinfo_t **ret_pip, struct buf *buf);
204 static void		i_mdi_pm_hold_client(mdi_client_t *, int);
205 static void		i_mdi_pm_rele_client(mdi_client_t *, int);
206 static void		i_mdi_pm_reset_client(mdi_client_t *);
207 static int		i_mdi_power_all_phci(mdi_client_t *);
208 static void		i_mdi_log_sysevent(dev_info_t *, char *, char *);
209 
210 
211 /*
212  * Internal mdi_pathinfo node functions
213  */
214 static int		i_mdi_pi_kstat_create(mdi_pathinfo_t *);
215 static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);
216 
217 static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
218 static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
219 static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
220 static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
221 static void		i_mdi_phci_unlock(mdi_phci_t *);
222 static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
223 static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
224 static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
225 static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
226 			    mdi_client_t *);
227 static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
228 static void		i_mdi_client_remove_path(mdi_client_t *,
229 			    mdi_pathinfo_t *);
230 
231 static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
232 			    mdi_pathinfo_state_t, int);
233 static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
234 static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
235 			    char **, int);
236 static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
237 static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
238 static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
239 static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
240 static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
241 static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
242 static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
243 static void		i_mdi_client_update_state(mdi_client_t *);
244 static int		i_mdi_client_compute_state(mdi_client_t *,
245 			    mdi_phci_t *);
246 static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
247 static void		i_mdi_client_unlock(mdi_client_t *);
248 static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
249 static mdi_client_t	*i_devi_get_client(dev_info_t *);
250 /*
251  * NOTE: this will be removed once the NWS files are changed to use the new
252  * mdi_{enable,disable}_path interfaces
253  */
254 static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *,
255 				int, int);
256 static mdi_pathinfo_t 	*i_mdi_enable_disable_path(mdi_pathinfo_t *pip,
257 				mdi_vhci_t *vh, int flags, int op);
258 /*
259  * Failover related function prototypes
260  */
261 static int		i_mdi_failover(void *);
262 
263 /*
264  * misc internal functions
265  */
266 static int		i_mdi_get_hash_key(char *);
267 static int		i_map_nvlist_error_to_mdi(int);
268 static void		i_mdi_report_path_state(mdi_client_t *,
269 			    mdi_pathinfo_t *);
270 
271 static void		setup_vhci_cache(mdi_vhci_t *);
272 static int		destroy_vhci_cache(mdi_vhci_t *);
273 static void		setup_phci_driver_list(mdi_vhci_t *);
274 static void		free_phci_driver_list(mdi_vhci_config_t *);
275 static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
276 static boolean_t	stop_vhcache_flush_thread(void *, int);
277 static void		free_string_array(char **, int);
278 static void		free_vhcache_phci(mdi_vhcache_phci_t *);
279 static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
280 static void		free_vhcache_client(mdi_vhcache_client_t *);
281 static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
282 static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
283 static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
284 static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
285 static void		vhcache_pi_add(mdi_vhci_config_t *,
286 			    struct mdi_pathinfo *);
287 static void		vhcache_pi_remove(mdi_vhci_config_t *,
288 			    struct mdi_pathinfo *);
289 static void		free_phclient_path_list(mdi_phys_path_t *);
290 static void		sort_vhcache_paths(mdi_vhcache_client_t *);
291 static int		flush_vhcache(mdi_vhci_config_t *, int);
292 static void		vhcache_dirty(mdi_vhci_config_t *);
293 static void		free_async_client_config(mdi_async_client_config_t *);
294 static void		single_threaded_vhconfig_enter(mdi_vhci_config_t *);
295 static void		single_threaded_vhconfig_exit(mdi_vhci_config_t *);
296 static nvlist_t		*read_on_disk_vhci_cache(char *);
297 extern int		fread_nvlist(char *, nvlist_t **);
298 extern int		fwrite_nvlist(char *, nvlist_t *);
299 
300 /* called once when first vhci registers with mdi */
301 static void
302 i_mdi_init()
303 {
304 	static int initialized = 0;
305 
306 	if (initialized)
307 		return;
308 	initialized = 1;
309 
310 	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);
311 	/*
312 	 * Create our taskq resources
313 	 */
314 	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
315 	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
316 	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
317 	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */
318 }
319 
320 /*
321  * mdi_get_component_type():
322  *		Return mpxio component type
323  * Return Values:
324  *		MDI_COMPONENT_NONE
325  *		MDI_COMPONENT_VHCI
326  *		MDI_COMPONENT_PHCI
327  *		MDI_COMPONENT_CLIENT
328  * XXX This doesn't work under multi-level MPxIO and should be
329  *	removed when clients migrate mdi_component_is_*() interfaces.
330  */
331 int
332 mdi_get_component_type(dev_info_t *dip)
333 {
334 	return (DEVI(dip)->devi_mdi_component);
335 }
336 
337 /*
338  * mdi_vhci_register():
339  *		Register a vHCI module with the mpxio framework
340  *		mdi_vhci_register() is called by vHCI drivers to register the
341  *		'class_driver' vHCI driver and its MDI entrypoints with the
342  *		mpxio framework.  The vHCI driver must call this interface as
343  *		part of its attach(9e) handler.
344  *		Competing threads may try to attach mdi_vhci_register() as
345  *		the vHCI drivers are loaded and attached as a result of pHCI
346  *		driver instance registration (mdi_phci_register()) with the
347  *		framework.
348  * Return Values:
349  *		MDI_SUCCESS
350  *		MDI_FAILURE
351  */
352 /*ARGSUSED*/
353 int
354 mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
355     int flags)
356 {
357 	mdi_vhci_t		*vh = NULL;
358 
359 	ASSERT(vops->vo_revision == MDI_VHCI_OPS_REV);
360 	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
361 
362 	i_mdi_init();
363 
364 	mutex_enter(&mdi_mutex);
365 	/*
366 	 * Scan for already registered vhci
367 	 */
368 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
369 		if (strcmp(vh->vh_class, class) == 0) {
370 			/*
371 			 * vHCI has already been created.  Check for valid
372 			 * vHCI ops registration.  We only support one vHCI
373 			 * module per class
374 			 */
375 			if (vh->vh_ops != NULL) {
376 				mutex_exit(&mdi_mutex);
377 				cmn_err(CE_NOTE, vhci_greeting, class);
378 				return (MDI_FAILURE);
379 			}
380 			break;
381 		}
382 	}
383 
384 	/*
385 	 * if not yet created, create the vHCI component
386 	 */
387 	if (vh == NULL) {
388 		struct client_hash	*hash = NULL;
389 		char			*load_balance;
390 
391 		/*
392 		 * Allocate and initialize the mdi extensions
393 		 */
394 		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
395 		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
396 		    KM_SLEEP);
397 		vh->vh_client_table = hash;
398 		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
399 		(void) strcpy(vh->vh_class, class);
400 		vh->vh_lb = LOAD_BALANCE_RR;
401 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
402 		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
403 			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
404 				vh->vh_lb = LOAD_BALANCE_NONE;
405 			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
406 				    == 0) {
407 				vh->vh_lb = LOAD_BALANCE_LBA;
408 			}
409 			ddi_prop_free(load_balance);
410 		}
411 
412 		mutex_init(&vh->vh_phci_mutex, NULL, MUTEX_DEFAULT, NULL);
413 		mutex_init(&vh->vh_client_mutex, NULL, MUTEX_DEFAULT, NULL);
414 
415 		/*
416 		 * Store the vHCI ops vectors
417 		 */
418 		vh->vh_dip = vdip;
419 		vh->vh_ops = vops;
420 
421 		setup_vhci_cache(vh);
422 
423 		if (mdi_vhci_head == NULL) {
424 			mdi_vhci_head = vh;
425 		}
426 		if (mdi_vhci_tail) {
427 			mdi_vhci_tail->vh_next = vh;
428 		}
429 		mdi_vhci_tail = vh;
430 		mdi_vhci_count++;
431 	}
432 
433 	/*
434 	 * Claim the devfs node as a vhci component
435 	 */
436 	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;
437 
438 	/*
439 	 * Initialize our back reference from dev_info node
440 	 */
441 	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
442 	mutex_exit(&mdi_mutex);
443 	return (MDI_SUCCESS);
444 }
445 
446 /*
447  * mdi_vhci_unregister():
448  *		Unregister a vHCI module from mpxio framework
449  *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
450  * 		of a vhci to unregister it from the framework.
451  * Return Values:
452  *		MDI_SUCCESS
453  *		MDI_FAILURE
454  */
455 /*ARGSUSED*/
456 int
457 mdi_vhci_unregister(dev_info_t *vdip, int flags)
458 {
459 	mdi_vhci_t	*found, *vh, *prev = NULL;
460 
461 	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
462 
463 	/*
464 	 * Check for invalid VHCI
465 	 */
466 	if ((vh = i_devi_get_vhci(vdip)) == NULL)
467 		return (MDI_FAILURE);
468 
469 	/*
470 	 * Scan the list of registered vHCIs for a match
471 	 */
472 	mutex_enter(&mdi_mutex);
473 	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
474 		if (found == vh)
475 			break;
476 		prev = found;
477 	}
478 
479 	if (found == NULL) {
480 		mutex_exit(&mdi_mutex);
481 		return (MDI_FAILURE);
482 	}
483 
484 	/*
485 	 * Check the vHCI, pHCI and client count. All the pHCIs and clients
486 	 * should have been unregistered, before a vHCI can be
487 	 * unregistered.
488 	 */
489 	MDI_VHCI_PHCI_LOCK(vh);
490 	if (vh->vh_refcnt || vh->vh_phci_count || vh->vh_client_count) {
491 		MDI_VHCI_PHCI_UNLOCK(vh);
492 		mutex_exit(&mdi_mutex);
493 		return (MDI_FAILURE);
494 	}
495 	MDI_VHCI_PHCI_UNLOCK(vh);
496 
497 	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
498 		mutex_exit(&mdi_mutex);
499 		return (MDI_FAILURE);
500 	}
501 
502 	/*
503 	 * Remove the vHCI from the global list
504 	 */
505 	if (vh == mdi_vhci_head) {
506 		mdi_vhci_head = vh->vh_next;
507 	} else {
508 		prev->vh_next = vh->vh_next;
509 	}
510 	if (vh == mdi_vhci_tail) {
511 		mdi_vhci_tail = prev;
512 	}
513 	mdi_vhci_count--;
514 	mutex_exit(&mdi_mutex);
515 
516 	vh->vh_ops = NULL;
517 	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
518 	DEVI(vdip)->devi_mdi_xhci = NULL;
519 	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
520 	kmem_free(vh->vh_client_table,
521 	    mdi_client_table_size * sizeof (struct client_hash));
522 	mutex_destroy(&vh->vh_phci_mutex);
523 	mutex_destroy(&vh->vh_client_mutex);
524 
525 	kmem_free(vh, sizeof (mdi_vhci_t));
526 	return (MDI_SUCCESS);
527 }
528 
529 /*
530  * i_mdi_vhci_class2vhci():
531  *		Look for a matching vHCI module given a vHCI class name
532  * Return Values:
533  *		Handle to a vHCI component
534  *		NULL
535  */
536 static mdi_vhci_t *
537 i_mdi_vhci_class2vhci(char *class)
538 {
539 	mdi_vhci_t	*vh = NULL;
540 
541 	ASSERT(!MUTEX_HELD(&mdi_mutex));
542 
543 	mutex_enter(&mdi_mutex);
544 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
545 		if (strcmp(vh->vh_class, class) == 0) {
546 			break;
547 		}
548 	}
549 	mutex_exit(&mdi_mutex);
550 	return (vh);
551 }
552 
553 /*
554  * i_devi_get_vhci():
555  *		Utility function to get the handle to a vHCI component
556  * Return Values:
557  *		Handle to a vHCI component
558  *		NULL
559  */
560 mdi_vhci_t *
561 i_devi_get_vhci(dev_info_t *vdip)
562 {
563 	mdi_vhci_t	*vh = NULL;
564 	if (MDI_VHCI(vdip)) {
565 		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
566 	}
567 	return (vh);
568 }
569 
570 /*
571  * mdi_phci_register():
572  *		Register a pHCI module with mpxio framework
573  *		mdi_phci_register() is called by pHCI drivers to register with
574  *		the mpxio framework and a specific 'class_driver' vHCI.  The
575  *		pHCI driver must call this interface as part of its attach(9e)
576  *		handler.
577  * Return Values:
578  *		MDI_SUCCESS
579  *		MDI_FAILURE
580  */
581 /*ARGSUSED*/
582 int
583 mdi_phci_register(char *class, dev_info_t *pdip, int flags)
584 {
585 	mdi_phci_t		*ph;
586 	mdi_vhci_t		*vh;
587 	char			*data;
588 	char			*pathname;
589 
590 	/*
591 	 * Some subsystems, like fcp, perform pHCI registration from a
592 	 * different thread than the one doing the pHCI attach(9E) - the
593 	 * driver attach code is waiting for this other thread to complete.
594 	 * This means we can only ASSERT DEVI_BUSY_CHANGING of parent
595 	 * (indicating that some thread has done an ndi_devi_enter of parent)
596 	 * not DEVI_BUSY_OWNED (which would indicate that we did the enter).
597 	 */
598 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
599 
600 	pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
601 	(void) ddi_pathname(pdip, pathname);
602 
603 	/*
604 	 * Check for mpxio-disable property. Enable mpxio if the property is
605 	 * missing or not set to "yes".
606 	 * If the property is set to "yes" then emit a brief message.
607 	 */
608 	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
609 	    &data) == DDI_SUCCESS)) {
610 		if (strcmp(data, "yes") == 0) {
611 			MDI_DEBUG(1, (CE_CONT, pdip,
612 			    "?%s (%s%d) multipath capabilities "
613 			    "disabled via %s.conf.\n", pathname,
614 			    ddi_driver_name(pdip), ddi_get_instance(pdip),
615 			    ddi_driver_name(pdip)));
616 			ddi_prop_free(data);
617 			kmem_free(pathname, MAXPATHLEN);
618 			return (MDI_FAILURE);
619 		}
620 		ddi_prop_free(data);
621 	}
622 
623 	kmem_free(pathname, MAXPATHLEN);
624 
625 	/*
626 	 * Search for a matching vHCI
627 	 */
628 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
629 	if (vh == NULL) {
630 		return (MDI_FAILURE);
631 	}
632 
633 	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
634 	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
635 	ph->ph_dip = pdip;
636 	ph->ph_vhci = vh;
637 	ph->ph_next = NULL;
638 	ph->ph_unstable = 0;
639 	ph->ph_vprivate = 0;
640 	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);
641 
642 	MDI_PHCI_LOCK(ph);
643 	MDI_PHCI_SET_POWER_UP(ph);
644 	MDI_PHCI_UNLOCK(ph);
645 	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
646 	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;
647 
648 	vhcache_phci_add(vh->vh_config, ph);
649 
650 	MDI_VHCI_PHCI_LOCK(vh);
651 	if (vh->vh_phci_head == NULL) {
652 		vh->vh_phci_head = ph;
653 	}
654 	if (vh->vh_phci_tail) {
655 		vh->vh_phci_tail->ph_next = ph;
656 	}
657 	vh->vh_phci_tail = ph;
658 	vh->vh_phci_count++;
659 	MDI_VHCI_PHCI_UNLOCK(vh);
660 
661 	i_mdi_log_sysevent(pdip, class, ESC_DDI_INITIATOR_REGISTER);
662 	return (MDI_SUCCESS);
663 }
664 
665 /*
666  * mdi_phci_unregister():
667  *		Unregister a pHCI module from mpxio framework
668  *		mdi_phci_unregister() is called by the pHCI drivers from their
669  *		detach(9E) handler to unregister their instances from the
670  *		framework.
671  * Return Values:
672  *		MDI_SUCCESS
673  *		MDI_FAILURE
674  */
675 /*ARGSUSED*/
676 int
677 mdi_phci_unregister(dev_info_t *pdip, int flags)
678 {
679 	mdi_vhci_t		*vh;
680 	mdi_phci_t		*ph;
681 	mdi_phci_t		*tmp;
682 	mdi_phci_t		*prev = NULL;
683 
684 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
685 
686 	ph = i_devi_get_phci(pdip);
687 	if (ph == NULL) {
688 		MDI_DEBUG(1, (CE_WARN, pdip,
689 		    "!pHCI unregister: Not a valid pHCI"));
690 		return (MDI_FAILURE);
691 	}
692 
693 	vh = ph->ph_vhci;
694 	ASSERT(vh != NULL);
695 	if (vh == NULL) {
696 		MDI_DEBUG(1, (CE_WARN, pdip,
697 		    "!pHCI unregister: Not a valid vHCI"));
698 		return (MDI_FAILURE);
699 	}
700 
701 	MDI_VHCI_PHCI_LOCK(vh);
702 	tmp = vh->vh_phci_head;
703 	while (tmp) {
704 		if (tmp == ph) {
705 			break;
706 		}
707 		prev = tmp;
708 		tmp = tmp->ph_next;
709 	}
710 
711 	if (ph == vh->vh_phci_head) {
712 		vh->vh_phci_head = ph->ph_next;
713 	} else {
714 		prev->ph_next = ph->ph_next;
715 	}
716 
717 	if (ph == vh->vh_phci_tail) {
718 		vh->vh_phci_tail = prev;
719 	}
720 
721 	vh->vh_phci_count--;
722 	MDI_VHCI_PHCI_UNLOCK(vh);
723 
724 	i_mdi_log_sysevent(pdip, ph->ph_vhci->vh_class,
725 	    ESC_DDI_INITIATOR_UNREGISTER);
726 	vhcache_phci_remove(vh->vh_config, ph);
727 	cv_destroy(&ph->ph_unstable_cv);
728 	mutex_destroy(&ph->ph_mutex);
729 	kmem_free(ph, sizeof (mdi_phci_t));
730 	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
731 	DEVI(pdip)->devi_mdi_xhci = NULL;
732 	return (MDI_SUCCESS);
733 }
734 
735 /*
736  * i_devi_get_phci():
737  * 		Utility function to return the phci extensions.
738  */
739 static mdi_phci_t *
740 i_devi_get_phci(dev_info_t *pdip)
741 {
742 	mdi_phci_t	*ph = NULL;
743 	if (MDI_PHCI(pdip)) {
744 		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
745 	}
746 	return (ph);
747 }
748 
749 /*
750  * Single thread mdi entry into devinfo node for modifying its children.
751  * If necessary we perform an ndi_devi_enter of the vHCI before doing
752  * an ndi_devi_enter of 'dip'.  We maintain circular in two parts: one
753  * for the vHCI and one for the pHCI.
754  */
755 void
756 mdi_devi_enter(dev_info_t *phci_dip, int *circular)
757 {
758 	dev_info_t	*vdip;
759 	int		vcircular, pcircular;
760 
761 	/* Verify calling context */
762 	ASSERT(MDI_PHCI(phci_dip));
763 	vdip = mdi_devi_get_vdip(phci_dip);
764 	ASSERT(vdip);			/* A pHCI always has a vHCI */
765 
766 	/*
767 	 * If pHCI is detaching then the framework has already entered the
768 	 * vHCI on a threads that went down the code path leading to
769 	 * detach_node().  This framework enter of the vHCI during pHCI
770 	 * detach is done to avoid deadlock with vHCI power management
771 	 * operations which enter the vHCI and the enter down the path
772 	 * to the pHCI. If pHCI is detaching then we piggyback this calls
773 	 * enter of the vHCI on frameworks vHCI enter that has already
774 	 * occurred - this is OK because we know that the framework thread
775 	 * doing detach is waiting for our completion.
776 	 *
777 	 * We should DEVI_IS_DETACHING under an enter of the parent to avoid
778 	 * race with detach - but we can't do that because the framework has
779 	 * already entered the parent, so we have some complexity instead.
780 	 */
781 	for (;;) {
782 		if (ndi_devi_tryenter(vdip, &vcircular)) {
783 			ASSERT(vcircular != -1);
784 			if (DEVI_IS_DETACHING(phci_dip)) {
785 				ndi_devi_exit(vdip, vcircular);
786 				vcircular = -1;
787 			}
788 			break;
789 		} else if (DEVI_IS_DETACHING(phci_dip)) {
790 			vcircular = -1;
791 			break;
792 		} else {
793 			delay(1);
794 		}
795 	}
796 
797 	ndi_devi_enter(phci_dip, &pcircular);
798 	*circular = (vcircular << 16) | (pcircular & 0xFFFF);
799 }
800 
801 /*
802  * Release mdi_devi_enter or successful mdi_devi_tryenter.
803  */
804 void
805 mdi_devi_exit(dev_info_t *phci_dip, int circular)
806 {
807 	dev_info_t	*vdip;
808 	int		vcircular, pcircular;
809 
810 	/* Verify calling context */
811 	ASSERT(MDI_PHCI(phci_dip));
812 	vdip = mdi_devi_get_vdip(phci_dip);
813 	ASSERT(vdip);			/* A pHCI always has a vHCI */
814 
815 	/* extract two circular recursion values from single int */
816 	pcircular = (short)(circular & 0xFFFF);
817 	vcircular = (short)((circular >> 16) & 0xFFFF);
818 
819 	ndi_devi_exit(phci_dip, pcircular);
820 	if (vcircular != -1)
821 		ndi_devi_exit(vdip, vcircular);
822 }
823 
824 /*
825  * The functions mdi_devi_exit_phci() and mdi_devi_enter_phci() are used
826  * around a pHCI drivers calls to mdi_pi_online/offline, after holding
827  * the pathinfo node via mdi_hold_path/mdi_rele_path, to avoid deadlock
828  * with vHCI power management code during path online/offline.  Each
829  * mdi_devi_exit_phci must have a matching mdi_devi_enter_phci, and both must
830  * occur within the scope of an active mdi_devi_enter that establishes the
831  * circular value.
832  */
833 void
834 mdi_devi_exit_phci(dev_info_t *phci_dip, int circular)
835 {
836 	int		pcircular;
837 
838 	/* Verify calling context */
839 	ASSERT(MDI_PHCI(phci_dip));
840 
841 	pcircular = (short)(circular & 0xFFFF);
842 	ndi_devi_exit(phci_dip, pcircular);
843 }
844 
845 void
846 mdi_devi_enter_phci(dev_info_t *phci_dip, int *circular)
847 {
848 	int		pcircular;
849 
850 	/* Verify calling context */
851 	ASSERT(MDI_PHCI(phci_dip));
852 
853 	ndi_devi_enter(phci_dip, &pcircular);
854 
855 	/* verify matching mdi_devi_exit_phci/mdi_devi_enter_phci use */
856 	ASSERT(pcircular == ((short)(*circular & 0xFFFF)));
857 }
858 
859 /*
860  * mdi_devi_get_vdip():
861  *		given a pHCI dip return vHCI dip
862  */
863 dev_info_t *
864 mdi_devi_get_vdip(dev_info_t *pdip)
865 {
866 	mdi_phci_t	*ph;
867 
868 	ph = i_devi_get_phci(pdip);
869 	if (ph && ph->ph_vhci)
870 		return (ph->ph_vhci->vh_dip);
871 	return (NULL);
872 }
873 
874 /*
875  * mdi_devi_pdip_entered():
876  *		Return 1 if we are vHCI and have done an ndi_devi_enter
877  *		of a pHCI
878  */
879 int
880 mdi_devi_pdip_entered(dev_info_t *vdip)
881 {
882 	mdi_vhci_t	*vh;
883 	mdi_phci_t	*ph;
884 
885 	vh = i_devi_get_vhci(vdip);
886 	if (vh == NULL)
887 		return (0);
888 
889 	MDI_VHCI_PHCI_LOCK(vh);
890 	ph = vh->vh_phci_head;
891 	while (ph) {
892 		if (ph->ph_dip && DEVI_BUSY_OWNED(ph->ph_dip)) {
893 			MDI_VHCI_PHCI_UNLOCK(vh);
894 			return (1);
895 		}
896 		ph = ph->ph_next;
897 	}
898 	MDI_VHCI_PHCI_UNLOCK(vh);
899 	return (0);
900 }
901 
902 /*
903  * mdi_phci_path2devinfo():
904  * 		Utility function to search for a valid phci device given
905  *		the devfs pathname.
906  */
907 dev_info_t *
908 mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
909 {
910 	char		*temp_pathname;
911 	mdi_vhci_t	*vh;
912 	mdi_phci_t	*ph;
913 	dev_info_t 	*pdip = NULL;
914 
915 	vh = i_devi_get_vhci(vdip);
916 	ASSERT(vh != NULL);
917 
918 	if (vh == NULL) {
919 		/*
920 		 * Invalid vHCI component, return failure
921 		 */
922 		return (NULL);
923 	}
924 
925 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
926 	MDI_VHCI_PHCI_LOCK(vh);
927 	ph = vh->vh_phci_head;
928 	while (ph != NULL) {
929 		pdip = ph->ph_dip;
930 		ASSERT(pdip != NULL);
931 		*temp_pathname = '\0';
932 		(void) ddi_pathname(pdip, temp_pathname);
933 		if (strcmp(temp_pathname, pathname) == 0) {
934 			break;
935 		}
936 		ph = ph->ph_next;
937 	}
938 	if (ph == NULL) {
939 		pdip = NULL;
940 	}
941 	MDI_VHCI_PHCI_UNLOCK(vh);
942 	kmem_free(temp_pathname, MAXPATHLEN);
943 	return (pdip);
944 }
945 
946 /*
947  * mdi_phci_get_path_count():
948  * 		get number of path information nodes associated with a given
949  *		pHCI device.
950  */
951 int
952 mdi_phci_get_path_count(dev_info_t *pdip)
953 {
954 	mdi_phci_t	*ph;
955 	int		count = 0;
956 
957 	ph = i_devi_get_phci(pdip);
958 	if (ph != NULL) {
959 		count = ph->ph_path_count;
960 	}
961 	return (count);
962 }
963 
964 /*
965  * i_mdi_phci_lock():
966  *		Lock a pHCI device
967  * Return Values:
968  *		None
969  * Note:
970  *		The default locking order is:
971  *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
972  *		But there are number of situations where locks need to be
973  *		grabbed in reverse order.  This routine implements try and lock
974  *		mechanism depending on the requested parameter option.
975  */
976 static void
977 i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
978 {
979 	if (pip) {
980 		/* Reverse locking is requested. */
981 		while (MDI_PHCI_TRYLOCK(ph) == 0) {
982 			/*
983 			 * tryenter failed. Try to grab again
984 			 * after a small delay
985 			 */
986 			MDI_PI_HOLD(pip);
987 			MDI_PI_UNLOCK(pip);
988 			delay(1);
989 			MDI_PI_LOCK(pip);
990 			MDI_PI_RELE(pip);
991 		}
992 	} else {
993 		MDI_PHCI_LOCK(ph);
994 	}
995 }
996 
/*
 * i_mdi_phci_unlock():
 *		Unlock the pHCI component
 *		Thin wrapper around MDI_PHCI_UNLOCK(); the counterpart of
 *		i_mdi_phci_lock().
 */
static void
i_mdi_phci_unlock(mdi_phci_t *ph)
{
	MDI_PHCI_UNLOCK(ph);
}
1006 
1007 /*
1008  * i_mdi_devinfo_create():
1009  *		create client device's devinfo node
1010  * Return Values:
1011  *		dev_info
1012  *		NULL
1013  * Notes:
1014  */
1015 static dev_info_t *
1016 i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
1017 	char **compatible, int ncompatible)
1018 {
1019 	dev_info_t *cdip = NULL;
1020 
1021 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1022 
1023 	/* Verify for duplicate entry */
1024 	cdip = i_mdi_devinfo_find(vh, name, guid);
1025 	ASSERT(cdip == NULL);
1026 	if (cdip) {
1027 		cmn_err(CE_WARN,
1028 		    "i_mdi_devinfo_create: client dip %p already exists",
1029 			(void *)cdip);
1030 	}
1031 
1032 	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
1033 	if (cdip == NULL)
1034 		goto fail;
1035 
1036 	/*
1037 	 * Create component type and Global unique identifier
1038 	 * properties
1039 	 */
1040 	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
1041 	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
1042 		goto fail;
1043 	}
1044 
1045 	/* Decorate the node with compatible property */
1046 	if (compatible &&
1047 	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
1048 	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
1049 		goto fail;
1050 	}
1051 
1052 	return (cdip);
1053 
1054 fail:
1055 	if (cdip) {
1056 		(void) ndi_prop_remove_all(cdip);
1057 		(void) ndi_devi_free(cdip);
1058 	}
1059 	return (NULL);
1060 }
1061 
1062 /*
1063  * i_mdi_devinfo_find():
1064  *		Find a matching devinfo node for given client node name
1065  *		and its guid.
1066  * Return Values:
1067  *		Handle to a dev_info node or NULL
1068  */
1069 static dev_info_t *
1070 i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
1071 {
1072 	char			*data;
1073 	dev_info_t 		*cdip = NULL;
1074 	dev_info_t 		*ndip = NULL;
1075 	int			circular;
1076 
1077 	ndi_devi_enter(vh->vh_dip, &circular);
1078 	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
1079 	while ((cdip = ndip) != NULL) {
1080 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1081 
1082 		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
1083 			continue;
1084 		}
1085 
1086 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
1087 		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
1088 		    &data) != DDI_PROP_SUCCESS) {
1089 			continue;
1090 		}
1091 
1092 		if (strcmp(data, guid) != 0) {
1093 			ddi_prop_free(data);
1094 			continue;
1095 		}
1096 		ddi_prop_free(data);
1097 		break;
1098 	}
1099 	ndi_devi_exit(vh->vh_dip, circular);
1100 	return (cdip);
1101 }
1102 
1103 /*
1104  * i_mdi_devinfo_remove():
1105  *		Remove a client device node
1106  */
1107 static int
1108 i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
1109 {
1110 	int	rv = MDI_SUCCESS;
1111 
1112 	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
1113 	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
1114 		rv = ndi_devi_offline(cdip, NDI_DEVI_REMOVE);
1115 		if (rv != NDI_SUCCESS) {
1116 			MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_devinfo_remove:"
1117 			    " failed. cdip = %p\n", (void *)cdip));
1118 		}
1119 		/*
1120 		 * Convert to MDI error code
1121 		 */
1122 		switch (rv) {
1123 		case NDI_SUCCESS:
1124 			rv = MDI_SUCCESS;
1125 			break;
1126 		case NDI_BUSY:
1127 			rv = MDI_BUSY;
1128 			break;
1129 		default:
1130 			rv = MDI_FAILURE;
1131 			break;
1132 		}
1133 	}
1134 	return (rv);
1135 }
1136 
1137 /*
1138  * i_devi_get_client()
1139  *		Utility function to get mpxio component extensions
1140  */
1141 static mdi_client_t *
1142 i_devi_get_client(dev_info_t *cdip)
1143 {
1144 	mdi_client_t	*ct = NULL;
1145 
1146 	if (MDI_CLIENT(cdip)) {
1147 		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
1148 	}
1149 	return (ct);
1150 }
1151 
1152 /*
1153  * i_mdi_is_child_present():
1154  *		Search for the presence of client device dev_info node
1155  */
1156 static int
1157 i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
1158 {
1159 	int		rv = MDI_FAILURE;
1160 	struct dev_info	*dip;
1161 	int		circular;
1162 
1163 	ndi_devi_enter(vdip, &circular);
1164 	dip = DEVI(vdip)->devi_child;
1165 	while (dip) {
1166 		if (dip == DEVI(cdip)) {
1167 			rv = MDI_SUCCESS;
1168 			break;
1169 		}
1170 		dip = dip->devi_sibling;
1171 	}
1172 	ndi_devi_exit(vdip, circular);
1173 	return (rv);
1174 }
1175 
1176 
1177 /*
1178  * i_mdi_client_lock():
1179  *		Grab client component lock
1180  * Return Values:
1181  *		None
1182  * Note:
1183  *		The default locking order is:
1184  *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
1185  *		But there are number of situations where locks need to be
1186  *		grabbed in reverse order.  This routine implements try and lock
1187  *		mechanism depending on the requested parameter option.
1188  */
1189 static void
1190 i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
1191 {
1192 	if (pip) {
1193 		/*
1194 		 * Reverse locking is requested.
1195 		 */
1196 		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
1197 			/*
1198 			 * tryenter failed. Try to grab again
1199 			 * after a small delay
1200 			 */
1201 			MDI_PI_HOLD(pip);
1202 			MDI_PI_UNLOCK(pip);
1203 			delay(1);
1204 			MDI_PI_LOCK(pip);
1205 			MDI_PI_RELE(pip);
1206 		}
1207 	} else {
1208 		MDI_CLIENT_LOCK(ct);
1209 	}
1210 }
1211 
1212 /*
1213  * i_mdi_client_unlock():
1214  *		Unlock a client component
1215  */
1216 static void
1217 i_mdi_client_unlock(mdi_client_t *ct)
1218 {
1219 	MDI_CLIENT_UNLOCK(ct);
1220 }
1221 
1222 /*
1223  * i_mdi_client_alloc():
1224  * 		Allocate and initialize a client structure.  Caller should
1225  *		hold the vhci client lock.
1226  * Return Values:
1227  *		Handle to a client component
1228  */
1229 /*ARGSUSED*/
1230 static mdi_client_t *
1231 i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
1232 {
1233 	mdi_client_t	*ct;
1234 
1235 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1236 
1237 	/*
1238 	 * Allocate and initialize a component structure.
1239 	 */
1240 	ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
1241 	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1242 	ct->ct_hnext = NULL;
1243 	ct->ct_hprev = NULL;
1244 	ct->ct_dip = NULL;
1245 	ct->ct_vhci = vh;
1246 	ct->ct_drvname = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1247 	(void) strcpy(ct->ct_drvname, name);
1248 	ct->ct_guid = kmem_alloc(strlen(lguid) + 1, KM_SLEEP);
1249 	(void) strcpy(ct->ct_guid, lguid);
1250 	ct->ct_cprivate = NULL;
1251 	ct->ct_vprivate = NULL;
1252 	ct->ct_flags = 0;
1253 	ct->ct_state = MDI_CLIENT_STATE_FAILED;
1254 	MDI_CLIENT_LOCK(ct);
1255 	MDI_CLIENT_SET_OFFLINE(ct);
1256 	MDI_CLIENT_SET_DETACH(ct);
1257 	MDI_CLIENT_SET_POWER_UP(ct);
1258 	MDI_CLIENT_UNLOCK(ct);
1259 	ct->ct_failover_flags = 0;
1260 	ct->ct_failover_status = 0;
1261 	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
1262 	ct->ct_unstable = 0;
1263 	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
1264 	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
1265 	ct->ct_lb = vh->vh_lb;
1266 	ct->ct_lb_args =  kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
1267 	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
1268 	ct->ct_path_count = 0;
1269 	ct->ct_path_head = NULL;
1270 	ct->ct_path_tail = NULL;
1271 	ct->ct_path_last = NULL;
1272 
1273 	/*
1274 	 * Add this client component to our client hash queue
1275 	 */
1276 	i_mdi_client_enlist_table(vh, ct);
1277 	return (ct);
1278 }
1279 
1280 /*
1281  * i_mdi_client_enlist_table():
1282  *		Attach the client device to the client hash table. Caller
1283  *		should hold the vhci client lock.
1284  */
1285 static void
1286 i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1287 {
1288 	int 			index;
1289 	struct client_hash	*head;
1290 
1291 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1292 
1293 	index = i_mdi_get_hash_key(ct->ct_guid);
1294 	head = &vh->vh_client_table[index];
1295 	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
1296 	head->ct_hash_head = ct;
1297 	head->ct_hash_count++;
1298 	vh->vh_client_count++;
1299 }
1300 
1301 /*
1302  * i_mdi_client_delist_table():
1303  *		Attach the client device to the client hash table.
1304  *		Caller should hold the vhci client lock.
1305  */
1306 static void
1307 i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1308 {
1309 	int			index;
1310 	char			*guid;
1311 	struct client_hash 	*head;
1312 	mdi_client_t		*next;
1313 	mdi_client_t		*last;
1314 
1315 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1316 
1317 	guid = ct->ct_guid;
1318 	index = i_mdi_get_hash_key(guid);
1319 	head = &vh->vh_client_table[index];
1320 
1321 	last = NULL;
1322 	next = (mdi_client_t *)head->ct_hash_head;
1323 	while (next != NULL) {
1324 		if (next == ct) {
1325 			break;
1326 		}
1327 		last = next;
1328 		next = next->ct_hnext;
1329 	}
1330 
1331 	if (next) {
1332 		head->ct_hash_count--;
1333 		if (last == NULL) {
1334 			head->ct_hash_head = ct->ct_hnext;
1335 		} else {
1336 			last->ct_hnext = ct->ct_hnext;
1337 		}
1338 		ct->ct_hnext = NULL;
1339 		vh->vh_client_count--;
1340 	}
1341 }
1342 
1343 
1344 /*
1345  * i_mdi_client_free():
1346  *		Free a client component
1347  */
1348 static int
1349 i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
1350 {
1351 	int		rv = MDI_SUCCESS;
1352 	int		flags = ct->ct_flags;
1353 	dev_info_t	*cdip;
1354 	dev_info_t	*vdip;
1355 
1356 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1357 
1358 	vdip = vh->vh_dip;
1359 	cdip = ct->ct_dip;
1360 
1361 	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
1362 	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
1363 	DEVI(cdip)->devi_mdi_client = NULL;
1364 
1365 	/*
1366 	 * Clear out back ref. to dev_info_t node
1367 	 */
1368 	ct->ct_dip = NULL;
1369 
1370 	/*
1371 	 * Remove this client from our hash queue
1372 	 */
1373 	i_mdi_client_delist_table(vh, ct);
1374 
1375 	/*
1376 	 * Uninitialize and free the component
1377 	 */
1378 	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
1379 	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
1380 	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
1381 	cv_destroy(&ct->ct_failover_cv);
1382 	cv_destroy(&ct->ct_unstable_cv);
1383 	cv_destroy(&ct->ct_powerchange_cv);
1384 	mutex_destroy(&ct->ct_mutex);
1385 	kmem_free(ct, sizeof (*ct));
1386 
1387 	if (cdip != NULL) {
1388 		MDI_VHCI_CLIENT_UNLOCK(vh);
1389 		(void) i_mdi_devinfo_remove(vdip, cdip, flags);
1390 		MDI_VHCI_CLIENT_LOCK(vh);
1391 	}
1392 	return (rv);
1393 }
1394 
1395 /*
1396  * i_mdi_client_find():
1397  * 		Find the client structure corresponding to a given guid
1398  *		Caller should hold the vhci client lock.
1399  */
1400 static mdi_client_t *
1401 i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
1402 {
1403 	int			index;
1404 	struct client_hash	*head;
1405 	mdi_client_t		*ct;
1406 
1407 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1408 
1409 	index = i_mdi_get_hash_key(guid);
1410 	head = &vh->vh_client_table[index];
1411 
1412 	ct = head->ct_hash_head;
1413 	while (ct != NULL) {
1414 		if (strcmp(ct->ct_guid, guid) == 0 &&
1415 		    (cname == NULL || strcmp(ct->ct_drvname, cname) == 0)) {
1416 			break;
1417 		}
1418 		ct = ct->ct_hnext;
1419 	}
1420 	return (ct);
1421 }
1422 
1423 /*
1424  * i_mdi_client_update_state():
1425  *		Compute and update client device state
1426  * Notes:
1427  *		A client device can be in any of three possible states:
1428  *
1429  *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
1430  *		one online/standby paths. Can tolerate failures.
1431  *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
1432  *		no alternate paths available as standby. A failure on the online
1433  *		would result in loss of access to device data.
1434  *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
1435  *		no paths available to access the device.
1436  */
1437 static void
1438 i_mdi_client_update_state(mdi_client_t *ct)
1439 {
1440 	int state;
1441 
1442 	ASSERT(MDI_CLIENT_LOCKED(ct));
1443 	state = i_mdi_client_compute_state(ct, NULL);
1444 	MDI_CLIENT_SET_STATE(ct, state);
1445 }
1446 
1447 /*
1448  * i_mdi_client_compute_state():
1449  *		Compute client device state
1450  *
1451  *		mdi_phci_t *	Pointer to pHCI structure which should
1452  *				while computing the new value.  Used by
1453  *				i_mdi_phci_offline() to find the new
1454  *				client state after DR of a pHCI.
1455  */
1456 static int
1457 i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
1458 {
1459 	int		state;
1460 	int		online_count = 0;
1461 	int		standby_count = 0;
1462 	mdi_pathinfo_t	*pip, *next;
1463 
1464 	ASSERT(MDI_CLIENT_LOCKED(ct));
1465 	pip = ct->ct_path_head;
1466 	while (pip != NULL) {
1467 		MDI_PI_LOCK(pip);
1468 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1469 		if (MDI_PI(pip)->pi_phci == ph) {
1470 			MDI_PI_UNLOCK(pip);
1471 			pip = next;
1472 			continue;
1473 		}
1474 
1475 		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1476 				== MDI_PATHINFO_STATE_ONLINE)
1477 			online_count++;
1478 		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1479 				== MDI_PATHINFO_STATE_STANDBY)
1480 			standby_count++;
1481 		MDI_PI_UNLOCK(pip);
1482 		pip = next;
1483 	}
1484 
1485 	if (online_count == 0) {
1486 		if (standby_count == 0) {
1487 			state = MDI_CLIENT_STATE_FAILED;
1488 			MDI_DEBUG(2, (CE_NOTE, NULL, "!client state: failed"
1489 			    " ct = %p\n", (void *)ct));
1490 		} else if (standby_count == 1) {
1491 			state = MDI_CLIENT_STATE_DEGRADED;
1492 		} else {
1493 			state = MDI_CLIENT_STATE_OPTIMAL;
1494 		}
1495 	} else if (online_count == 1) {
1496 		if (standby_count == 0) {
1497 			state = MDI_CLIENT_STATE_DEGRADED;
1498 		} else {
1499 			state = MDI_CLIENT_STATE_OPTIMAL;
1500 		}
1501 	} else {
1502 		state = MDI_CLIENT_STATE_OPTIMAL;
1503 	}
1504 	return (state);
1505 }
1506 
1507 /*
1508  * i_mdi_client2devinfo():
1509  *		Utility function
1510  */
1511 dev_info_t *
1512 i_mdi_client2devinfo(mdi_client_t *ct)
1513 {
1514 	return (ct->ct_dip);
1515 }
1516 
1517 /*
1518  * mdi_client_path2_devinfo():
1519  * 		Given the parent devinfo and child devfs pathname, search for
1520  *		a valid devfs node handle.
1521  */
1522 dev_info_t *
1523 mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
1524 {
1525 	dev_info_t 	*cdip = NULL;
1526 	dev_info_t 	*ndip = NULL;
1527 	char		*temp_pathname;
1528 	int		circular;
1529 
1530 	/*
1531 	 * Allocate temp buffer
1532 	 */
1533 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1534 
1535 	/*
1536 	 * Lock parent against changes
1537 	 */
1538 	ndi_devi_enter(vdip, &circular);
1539 	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
1540 	while ((cdip = ndip) != NULL) {
1541 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1542 
1543 		*temp_pathname = '\0';
1544 		(void) ddi_pathname(cdip, temp_pathname);
1545 		if (strcmp(temp_pathname, pathname) == 0) {
1546 			break;
1547 		}
1548 	}
1549 	/*
1550 	 * Release devinfo lock
1551 	 */
1552 	ndi_devi_exit(vdip, circular);
1553 
1554 	/*
1555 	 * Free the temp buffer
1556 	 */
1557 	kmem_free(temp_pathname, MAXPATHLEN);
1558 	return (cdip);
1559 }
1560 
1561 /*
1562  * mdi_client_get_path_count():
1563  * 		Utility function to get number of path information nodes
1564  *		associated with a given client device.
1565  */
1566 int
1567 mdi_client_get_path_count(dev_info_t *cdip)
1568 {
1569 	mdi_client_t	*ct;
1570 	int		count = 0;
1571 
1572 	ct = i_devi_get_client(cdip);
1573 	if (ct != NULL) {
1574 		count = ct->ct_path_count;
1575 	}
1576 	return (count);
1577 }
1578 
1579 
1580 /*
1581  * i_mdi_get_hash_key():
1582  * 		Create a hash using strings as keys
1583  *
1584  */
1585 static int
1586 i_mdi_get_hash_key(char *str)
1587 {
1588 	uint32_t	g, hash = 0;
1589 	char		*p;
1590 
1591 	for (p = str; *p != '\0'; p++) {
1592 		g = *p;
1593 		hash += g;
1594 	}
1595 	return (hash % (CLIENT_HASH_TABLE_SIZE - 1));
1596 }
1597 
1598 /*
1599  * mdi_get_lb_policy():
1600  * 		Get current load balancing policy for a given client device
1601  */
1602 client_lb_t
1603 mdi_get_lb_policy(dev_info_t *cdip)
1604 {
1605 	client_lb_t	lb = LOAD_BALANCE_NONE;
1606 	mdi_client_t	*ct;
1607 
1608 	ct = i_devi_get_client(cdip);
1609 	if (ct != NULL) {
1610 		lb = ct->ct_lb;
1611 	}
1612 	return (lb);
1613 }
1614 
1615 /*
1616  * mdi_set_lb_region_size():
1617  * 		Set current region size for the load-balance
1618  */
1619 int
1620 mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
1621 {
1622 	mdi_client_t	*ct;
1623 	int		rv = MDI_FAILURE;
1624 
1625 	ct = i_devi_get_client(cdip);
1626 	if (ct != NULL && ct->ct_lb_args != NULL) {
1627 		ct->ct_lb_args->region_size = region_size;
1628 		rv = MDI_SUCCESS;
1629 	}
1630 	return (rv);
1631 }
1632 
1633 /*
1634  * mdi_Set_lb_policy():
1635  * 		Set current load balancing policy for a given client device
1636  */
1637 int
1638 mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
1639 {
1640 	mdi_client_t	*ct;
1641 	int		rv = MDI_FAILURE;
1642 
1643 	ct = i_devi_get_client(cdip);
1644 	if (ct != NULL) {
1645 		ct->ct_lb = lb;
1646 		rv = MDI_SUCCESS;
1647 	}
1648 	return (rv);
1649 }
1650 
1651 /*
1652  * mdi_failover():
1653  *		failover function called by the vHCI drivers to initiate
1654  *		a failover operation.  This is typically due to non-availability
1655  *		of online paths to route I/O requests.  Failover can be
1656  *		triggered through user application also.
1657  *
1658  *		The vHCI driver calls mdi_failover() to initiate a failover
1659  *		operation. mdi_failover() calls back into the vHCI driver's
1660  *		vo_failover() entry point to perform the actual failover
1661  *		operation.  The reason for requiring the vHCI driver to
1662  *		initiate failover by calling mdi_failover(), instead of directly
1663  *		executing vo_failover() itself, is to ensure that the mdi
1664  *		framework can keep track of the client state properly.
1665  *		Additionally, mdi_failover() provides as a convenience the
1666  *		option of performing the failover operation synchronously or
1667  *		asynchronously
1668  *
1669  *		Upon successful completion of the failover operation, the
1670  *		paths that were previously ONLINE will be in the STANDBY state,
1671  *		and the newly activated paths will be in the ONLINE state.
1672  *
1673  *		The flags modifier determines whether the activation is done
1674  *		synchronously: MDI_FAILOVER_SYNC
1675  * Return Values:
1676  *		MDI_SUCCESS
1677  *		MDI_FAILURE
1678  *		MDI_BUSY
1679  */
1680 /*ARGSUSED*/
1681 int
1682 mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
1683 {
1684 	int			rv;
1685 	mdi_client_t		*ct;
1686 
1687 	ct = i_devi_get_client(cdip);
1688 	ASSERT(ct != NULL);
1689 	if (ct == NULL) {
1690 		/* cdip is not a valid client device. Nothing more to do. */
1691 		return (MDI_FAILURE);
1692 	}
1693 
1694 	MDI_CLIENT_LOCK(ct);
1695 
1696 	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
1697 		/* A path to the client is being freed */
1698 		MDI_CLIENT_UNLOCK(ct);
1699 		return (MDI_BUSY);
1700 	}
1701 
1702 
1703 	if (MDI_CLIENT_IS_FAILED(ct)) {
1704 		/*
1705 		 * Client is in failed state. Nothing more to do.
1706 		 */
1707 		MDI_CLIENT_UNLOCK(ct);
1708 		return (MDI_FAILURE);
1709 	}
1710 
1711 	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1712 		/*
1713 		 * Failover is already in progress; return BUSY
1714 		 */
1715 		MDI_CLIENT_UNLOCK(ct);
1716 		return (MDI_BUSY);
1717 	}
1718 	/*
1719 	 * Make sure that mdi_pathinfo node state changes are processed.
1720 	 * We do not allow failovers to progress while client path state
1721 	 * changes are in progress
1722 	 */
1723 	if (ct->ct_unstable) {
1724 		if (flags == MDI_FAILOVER_ASYNC) {
1725 			MDI_CLIENT_UNLOCK(ct);
1726 			return (MDI_BUSY);
1727 		} else {
1728 			while (ct->ct_unstable)
1729 				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
1730 		}
1731 	}
1732 
1733 	/*
1734 	 * Client device is in stable state. Before proceeding, perform sanity
1735 	 * checks again.
1736 	 */
1737 	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
1738 	    (!i_ddi_devi_attached(ct->ct_dip))) {
1739 		/*
1740 		 * Client is in failed state. Nothing more to do.
1741 		 */
1742 		MDI_CLIENT_UNLOCK(ct);
1743 		return (MDI_FAILURE);
1744 	}
1745 
1746 	/*
1747 	 * Set the client state as failover in progress.
1748 	 */
1749 	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
1750 	ct->ct_failover_flags = flags;
1751 	MDI_CLIENT_UNLOCK(ct);
1752 
1753 	if (flags == MDI_FAILOVER_ASYNC) {
1754 		/*
1755 		 * Submit the initiate failover request via CPR safe
1756 		 * taskq threads.
1757 		 */
1758 		(void) taskq_dispatch(mdi_taskq, (task_func_t *)i_mdi_failover,
1759 		    ct, KM_SLEEP);
1760 		return (MDI_ACCEPT);
1761 	} else {
1762 		/*
1763 		 * Synchronous failover mode.  Typically invoked from the user
1764 		 * land.
1765 		 */
1766 		rv = i_mdi_failover(ct);
1767 	}
1768 	return (rv);
1769 }
1770 
1771 /*
1772  * i_mdi_failover():
1773  *		internal failover function. Invokes vHCI drivers failover
1774  *		callback function and process the failover status
1775  * Return Values:
1776  *		None
1777  *
1778  * Note: A client device in failover state can not be detached or freed.
1779  */
1780 static int
1781 i_mdi_failover(void *arg)
1782 {
1783 	int		rv = MDI_SUCCESS;
1784 	mdi_client_t	*ct = (mdi_client_t *)arg;
1785 	mdi_vhci_t	*vh = ct->ct_vhci;
1786 
1787 	ASSERT(!MDI_CLIENT_LOCKED(ct));
1788 
1789 	if (vh->vh_ops->vo_failover != NULL) {
1790 		/*
1791 		 * Call vHCI drivers callback routine
1792 		 */
1793 		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
1794 		    ct->ct_failover_flags);
1795 	}
1796 
1797 	MDI_CLIENT_LOCK(ct);
1798 	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);
1799 
1800 	/*
1801 	 * Save the failover return status
1802 	 */
1803 	ct->ct_failover_status = rv;
1804 
1805 	/*
1806 	 * As a result of failover, client status would have been changed.
1807 	 * Update the client state and wake up anyone waiting on this client
1808 	 * device.
1809 	 */
1810 	i_mdi_client_update_state(ct);
1811 
1812 	cv_broadcast(&ct->ct_failover_cv);
1813 	MDI_CLIENT_UNLOCK(ct);
1814 	return (rv);
1815 }
1816 
1817 /*
1818  * Load balancing is logical block.
1819  * IOs within the range described by region_size
1820  * would go on the same path. This would improve the
1821  * performance by cache-hit on some of the RAID devices.
1822  * Search only for online paths(At some point we
1823  * may want to balance across target ports).
1824  * If no paths are found then default to round-robin.
1825  */
1826 static int
1827 i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
1828 {
1829 	int		path_index = -1;
1830 	int		online_path_count = 0;
1831 	int		online_nonpref_path_count = 0;
1832 	int 		region_size = ct->ct_lb_args->region_size;
1833 	mdi_pathinfo_t	*pip;
1834 	mdi_pathinfo_t	*next;
1835 	int		preferred, path_cnt;
1836 
1837 	pip = ct->ct_path_head;
1838 	while (pip) {
1839 		MDI_PI_LOCK(pip);
1840 		if (MDI_PI(pip)->pi_state ==
1841 		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
1842 			online_path_count++;
1843 		} else if (MDI_PI(pip)->pi_state ==
1844 		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
1845 			online_nonpref_path_count++;
1846 		}
1847 		next = (mdi_pathinfo_t *)
1848 		    MDI_PI(pip)->pi_client_link;
1849 		MDI_PI_UNLOCK(pip);
1850 		pip = next;
1851 	}
1852 	/* if found any online/preferred then use this type */
1853 	if (online_path_count > 0) {
1854 		path_cnt = online_path_count;
1855 		preferred = 1;
1856 	} else if (online_nonpref_path_count > 0) {
1857 		path_cnt = online_nonpref_path_count;
1858 		preferred = 0;
1859 	} else {
1860 		path_cnt = 0;
1861 	}
1862 	if (path_cnt) {
1863 		path_index = (bp->b_blkno >> region_size) % path_cnt;
1864 		pip = ct->ct_path_head;
1865 		while (pip && path_index != -1) {
1866 			MDI_PI_LOCK(pip);
1867 			if (path_index == 0 &&
1868 			    (MDI_PI(pip)->pi_state ==
1869 			    MDI_PATHINFO_STATE_ONLINE) &&
1870 				MDI_PI(pip)->pi_preferred == preferred) {
1871 				MDI_PI_HOLD(pip);
1872 				MDI_PI_UNLOCK(pip);
1873 				*ret_pip = pip;
1874 				return (MDI_SUCCESS);
1875 			}
1876 			path_index --;
1877 			next = (mdi_pathinfo_t *)
1878 			    MDI_PI(pip)->pi_client_link;
1879 			MDI_PI_UNLOCK(pip);
1880 			pip = next;
1881 		}
1882 		if (pip == NULL) {
1883 			MDI_DEBUG(4, (CE_NOTE, NULL,
1884 			    "!lba %llx, no pip !!\n",
1885 				bp->b_lblkno));
1886 		} else {
1887 			MDI_DEBUG(4, (CE_NOTE, NULL,
1888 			    "!lba %llx, no pip for path_index, "
1889 			    "pip %p\n", bp->b_lblkno, (void *)pip));
1890 		}
1891 	}
1892 	return (MDI_FAILURE);
1893 }
1894 
1895 /*
1896  * mdi_select_path():
1897  *		select a path to access a client device.
1898  *
1899  *		mdi_select_path() function is called by the vHCI drivers to
1900  *		select a path to route the I/O request to.  The caller passes
1901  *		the block I/O data transfer structure ("buf") as one of the
1902  *		parameters.  The mpxio framework uses the buf structure
1903  *		contents to maintain per path statistics (total I/O size /
1904  *		count pending).  If more than one online paths are available to
1905  *		select, the framework automatically selects a suitable path
1906  *		for routing I/O request. If a failover operation is active for
1907  *		this client device the call shall be failed with MDI_BUSY error
1908  *		code.
1909  *
1910  *		By default this function returns a suitable path in online
1911  *		state based on the current load balancing policy.  Currently
1912  *		we support LOAD_BALANCE_NONE (Previously selected online path
1913  *		will continue to be used till the path is usable) and
1914  *		LOAD_BALANCE_RR (Online paths will be selected in a round
1915  *		robin fashion), LOAD_BALANCE_LBA (Online paths will be selected
1916  *		based on the logical block).  The load balancing policy is set
1917  *		through the vHCI driver's configuration file (driver.conf).
1918  *
1919  *		vHCI drivers may override this default behavior by specifying
1920  *		appropriate flags.  If start_pip is specified (non NULL) is
1921  *		used as start point to walk and find the next appropriate path.
1922  *		The following values are currently defined:
1923  *		MDI_SELECT_ONLINE_PATH (to select an ONLINE path) and/or
1924  *		MDI_SELECT_STANDBY_PATH (to select an STANDBY path).
1925  *
1926  *		The non-standard behavior is used by the scsi_vhci driver,
1927  *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
1928  *		attach of client devices (to avoid an unnecessary failover
1929  *		when the STANDBY path comes up first), during failover
1930  *		(to activate a STANDBY path as ONLINE).
1931  *
1932  *		The selected path is returned in a mdi_hold_path() state
1933  *		(pi_ref_cnt). Caller should release the hold by calling
1934  *		mdi_rele_path().
1935  *
1936  * Return Values:
1937  *		MDI_SUCCESS	- Completed successfully
1938  *		MDI_BUSY 	- Client device is busy failing over
1939  *		MDI_NOPATH	- Client device is online, but no valid path are
1940  *				  available to access this client device
1941  *		MDI_FAILURE	- Invalid client device or state
1942  *		MDI_DEVI_ONLINING
1943  *				- Client device (struct dev_info state) is in
1944  *				  onlining state.
1945  */
1946 
1947 /*ARGSUSED*/
1948 int
1949 mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
1950     mdi_pathinfo_t *start_pip, mdi_pathinfo_t **ret_pip)
1951 {
1952 	mdi_client_t	*ct;
1953 	mdi_pathinfo_t	*pip;
1954 	mdi_pathinfo_t	*next;
1955 	mdi_pathinfo_t	*head;
1956 	mdi_pathinfo_t	*start;
1957 	client_lb_t	lbp;	/* load balancing policy */
1958 	int		sb = 1;	/* standard behavior */
1959 	int		preferred = 1;	/* preferred path */
1960 	int		cond, cont = 1;
1961 	int		retry = 0;
1962 
1963 	if (flags != 0) {
1964 		/*
1965 		 * disable default behavior
1966 		 */
1967 		sb = 0;
1968 	}
1969 
1970 	*ret_pip = NULL;
1971 	ct = i_devi_get_client(cdip);
1972 	if (ct == NULL) {
1973 		/* mdi extensions are NULL, Nothing more to do */
1974 		return (MDI_FAILURE);
1975 	}
1976 
1977 	MDI_CLIENT_LOCK(ct);
1978 
1979 	if (sb) {
1980 		if (MDI_CLIENT_IS_FAILED(ct)) {
1981 			/*
1982 			 * Client is not ready to accept any I/O requests.
1983 			 * Fail this request.
1984 			 */
1985 			MDI_DEBUG(2, (CE_NOTE, cdip, "!mdi_select_path: "
1986 			    "client state offline ct = %p\n", (void *)ct));
1987 			MDI_CLIENT_UNLOCK(ct);
1988 			return (MDI_FAILURE);
1989 		}
1990 
1991 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1992 			/*
1993 			 * Check for Failover is in progress. If so tell the
1994 			 * caller that this device is busy.
1995 			 */
1996 			MDI_DEBUG(2, (CE_NOTE, cdip, "!mdi_select_path: "
1997 			    "client failover in progress ct = %p\n",
1998 			    (void *)ct));
1999 			MDI_CLIENT_UNLOCK(ct);
2000 			return (MDI_BUSY);
2001 		}
2002 
2003 		/*
2004 		 * Check to see whether the client device is attached.
2005 		 * If not so, let the vHCI driver manually select a path
2006 		 * (standby) and let the probe/attach process to continue.
2007 		 */
2008 		if (MDI_CLIENT_IS_DETACHED(ct) || !i_ddi_devi_attached(cdip)) {
2009 			MDI_DEBUG(4, (CE_NOTE, cdip, "!Devi is onlining "
2010 			    "ct = %p\n", (void *)ct));
2011 			MDI_CLIENT_UNLOCK(ct);
2012 			return (MDI_DEVI_ONLINING);
2013 		}
2014 	}
2015 
2016 	/*
2017 	 * Cache in the client list head.  If head of the list is NULL
2018 	 * return MDI_NOPATH
2019 	 */
2020 	head = ct->ct_path_head;
2021 	if (head == NULL) {
2022 		MDI_CLIENT_UNLOCK(ct);
2023 		return (MDI_NOPATH);
2024 	}
2025 
2026 	/*
2027 	 * for non default behavior, bypass current
2028 	 * load balancing policy and always use LOAD_BALANCE_RR
2029 	 * except that the start point will be adjusted based
2030 	 * on the provided start_pip
2031 	 */
2032 	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;
2033 
2034 	switch (lbp) {
2035 	case LOAD_BALANCE_NONE:
2036 		/*
2037 		 * Load balancing is None  or Alternate path mode
2038 		 * Start looking for a online mdi_pathinfo node starting from
2039 		 * last known selected path
2040 		 */
2041 		preferred = 1;
2042 		pip = (mdi_pathinfo_t *)ct->ct_path_last;
2043 		if (pip == NULL) {
2044 			pip = head;
2045 		}
2046 		start = pip;
2047 		do {
2048 			MDI_PI_LOCK(pip);
2049 			/*
2050 			 * No need to explicitly check if the path is disabled.
2051 			 * Since we are checking for state == ONLINE and the
2052 			 * same veriable is used for DISABLE/ENABLE information.
2053 			 */
2054 			if ((MDI_PI(pip)->pi_state  ==
2055 				MDI_PATHINFO_STATE_ONLINE) &&
2056 				preferred == MDI_PI(pip)->pi_preferred) {
2057 				/*
2058 				 * Return the path in hold state. Caller should
2059 				 * release the lock by calling mdi_rele_path()
2060 				 */
2061 				MDI_PI_HOLD(pip);
2062 				MDI_PI_UNLOCK(pip);
2063 				ct->ct_path_last = pip;
2064 				*ret_pip = pip;
2065 				MDI_CLIENT_UNLOCK(ct);
2066 				return (MDI_SUCCESS);
2067 			}
2068 
2069 			/*
2070 			 * Path is busy.
2071 			 */
2072 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2073 			    MDI_PI_IS_TRANSIENT(pip))
2074 				retry = 1;
2075 			/*
2076 			 * Keep looking for a next available online path
2077 			 */
2078 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2079 			if (next == NULL) {
2080 				next = head;
2081 			}
2082 			MDI_PI_UNLOCK(pip);
2083 			pip = next;
2084 			if (start == pip && preferred) {
2085 				preferred = 0;
2086 			} else if (start == pip && !preferred) {
2087 				cont = 0;
2088 			}
2089 		} while (cont);
2090 		break;
2091 
2092 	case LOAD_BALANCE_LBA:
2093 		/*
2094 		 * Make sure we are looking
2095 		 * for an online path. Otherwise, if it is for a STANDBY
2096 		 * path request, it will go through and fetch an ONLINE
2097 		 * path which is not desirable.
2098 		 */
2099 		if ((ct->ct_lb_args != NULL) &&
2100 			    (ct->ct_lb_args->region_size) && bp &&
2101 				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
2102 			if (i_mdi_lba_lb(ct, ret_pip, bp)
2103 				    == MDI_SUCCESS) {
2104 				MDI_CLIENT_UNLOCK(ct);
2105 				return (MDI_SUCCESS);
2106 			}
2107 		}
2108 		/*  FALLTHROUGH */
2109 	case LOAD_BALANCE_RR:
2110 		/*
2111 		 * Load balancing is Round Robin. Start looking for a online
2112 		 * mdi_pathinfo node starting from last known selected path
2113 		 * as the start point.  If override flags are specified,
2114 		 * process accordingly.
2115 		 * If the search is already in effect(start_pip not null),
2116 		 * then lets just use the same path preference to continue the
2117 		 * traversal.
2118 		 */
2119 
2120 		if (start_pip != NULL) {
2121 			preferred = MDI_PI(start_pip)->pi_preferred;
2122 		} else {
2123 			preferred = 1;
2124 		}
2125 
2126 		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
2127 		if (start == NULL) {
2128 			pip = head;
2129 		} else {
2130 			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
2131 			if (pip == NULL) {
2132 				if (!sb) {
2133 					if (preferred == 0) {
2134 						/*
2135 						 * Looks like we have completed
2136 						 * the traversal as preferred
2137 						 * value is 0. Time to bail out.
2138 						 */
2139 						*ret_pip = NULL;
2140 						MDI_CLIENT_UNLOCK(ct);
2141 						return (MDI_NOPATH);
2142 					} else {
2143 						/*
2144 						 * Looks like we reached the
2145 						 * end of the list. Lets enable
2146 						 * traversal of non preferred
2147 						 * paths.
2148 						 */
2149 						preferred = 0;
2150 					}
2151 				}
2152 				pip = head;
2153 			}
2154 		}
2155 		start = pip;
2156 		do {
2157 			MDI_PI_LOCK(pip);
2158 			if (sb) {
2159 				cond = ((MDI_PI(pip)->pi_state ==
2160 				    MDI_PATHINFO_STATE_ONLINE &&
2161 					MDI_PI(pip)->pi_preferred ==
2162 						preferred) ? 1 : 0);
2163 			} else {
2164 				if (flags == MDI_SELECT_ONLINE_PATH) {
2165 					cond = ((MDI_PI(pip)->pi_state ==
2166 					    MDI_PATHINFO_STATE_ONLINE &&
2167 						MDI_PI(pip)->pi_preferred ==
2168 						preferred) ? 1 : 0);
2169 				} else if (flags == MDI_SELECT_STANDBY_PATH) {
2170 					cond = ((MDI_PI(pip)->pi_state ==
2171 					    MDI_PATHINFO_STATE_STANDBY &&
2172 						MDI_PI(pip)->pi_preferred ==
2173 						preferred) ? 1 : 0);
2174 				} else if (flags == (MDI_SELECT_ONLINE_PATH |
2175 				    MDI_SELECT_STANDBY_PATH)) {
2176 					cond = (((MDI_PI(pip)->pi_state ==
2177 					    MDI_PATHINFO_STATE_ONLINE ||
2178 					    (MDI_PI(pip)->pi_state ==
2179 					    MDI_PATHINFO_STATE_STANDBY)) &&
2180 						MDI_PI(pip)->pi_preferred ==
2181 						preferred) ? 1 : 0);
2182 				} else if (flags ==
2183 					(MDI_SELECT_STANDBY_PATH |
2184 					MDI_SELECT_ONLINE_PATH |
2185 					MDI_SELECT_USER_DISABLE_PATH)) {
2186 					cond = (((MDI_PI(pip)->pi_state ==
2187 					    MDI_PATHINFO_STATE_ONLINE ||
2188 					    (MDI_PI(pip)->pi_state ==
2189 					    MDI_PATHINFO_STATE_STANDBY) ||
2190 						(MDI_PI(pip)->pi_state ==
2191 					    (MDI_PATHINFO_STATE_ONLINE|
2192 					    MDI_PATHINFO_STATE_USER_DISABLE)) ||
2193 						(MDI_PI(pip)->pi_state ==
2194 					    (MDI_PATHINFO_STATE_STANDBY |
2195 					    MDI_PATHINFO_STATE_USER_DISABLE)))&&
2196 						MDI_PI(pip)->pi_preferred ==
2197 						preferred) ? 1 : 0);
2198 				} else {
2199 					cond = 0;
2200 				}
2201 			}
2202 			/*
2203 			 * No need to explicitly check if the path is disabled.
2204 			 * Since we are checking for state == ONLINE and the
2205 			 * same veriable is used for DISABLE/ENABLE information.
2206 			 */
2207 			if (cond) {
2208 				/*
2209 				 * Return the path in hold state. Caller should
2210 				 * release the lock by calling mdi_rele_path()
2211 				 */
2212 				MDI_PI_HOLD(pip);
2213 				MDI_PI_UNLOCK(pip);
2214 				if (sb)
2215 					ct->ct_path_last = pip;
2216 				*ret_pip = pip;
2217 				MDI_CLIENT_UNLOCK(ct);
2218 				return (MDI_SUCCESS);
2219 			}
2220 			/*
2221 			 * Path is busy.
2222 			 */
2223 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2224 			    MDI_PI_IS_TRANSIENT(pip))
2225 				retry = 1;
2226 
2227 			/*
2228 			 * Keep looking for a next available online path
2229 			 */
2230 do_again:
2231 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2232 			if (next == NULL) {
2233 				if (!sb) {
2234 					if (preferred == 1) {
2235 						/*
2236 						 * Looks like we reached the
2237 						 * end of the list. Lets enable
2238 						 * traversal of non preferred
2239 						 * paths.
2240 						 */
2241 						preferred = 0;
2242 						next = head;
2243 					} else {
2244 						/*
2245 						 * We have done both the passes
2246 						 * Preferred as well as for
2247 						 * Non-preferred. Bail out now.
2248 						 */
2249 						cont = 0;
2250 					}
2251 				} else {
2252 					/*
2253 					 * Standard behavior case.
2254 					 */
2255 					next = head;
2256 				}
2257 			}
2258 			MDI_PI_UNLOCK(pip);
2259 			if (cont == 0) {
2260 				break;
2261 			}
2262 			pip = next;
2263 
2264 			if (!sb) {
2265 				/*
2266 				 * We need to handle the selection of
2267 				 * non-preferred path in the following
2268 				 * case:
2269 				 *
2270 				 * +------+   +------+   +------+   +-----+
2271 				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
2272 				 * +------+   +------+   +------+   +-----+
2273 				 *
2274 				 * If we start the search with B, we need to
2275 				 * skip beyond B to pick C which is non -
2276 				 * preferred in the second pass. The following
2277 				 * test, if true, will allow us to skip over
2278 				 * the 'start'(B in the example) to select
2279 				 * other non preferred elements.
2280 				 */
2281 				if ((start_pip != NULL) && (start_pip == pip) &&
2282 				    (MDI_PI(start_pip)->pi_preferred
2283 				    != preferred)) {
2284 					/*
2285 					 * try again after going past the start
2286 					 * pip
2287 					 */
2288 					MDI_PI_LOCK(pip);
2289 					goto do_again;
2290 				}
2291 			} else {
2292 				/*
2293 				 * Standard behavior case
2294 				 */
2295 				if (start == pip && preferred) {
2296 					/* look for nonpreferred paths */
2297 					preferred = 0;
2298 				} else if (start == pip && !preferred) {
2299 					/*
2300 					 * Exit condition
2301 					 */
2302 					cont = 0;
2303 				}
2304 			}
2305 		} while (cont);
2306 		break;
2307 	}
2308 
2309 	MDI_CLIENT_UNLOCK(ct);
2310 	if (retry == 1) {
2311 		return (MDI_BUSY);
2312 	} else {
2313 		return (MDI_NOPATH);
2314 	}
2315 }
2316 
2317 /*
2318  * For a client, return the next available path to any phci
2319  *
2320  * Note:
2321  *		Caller should hold the branch's devinfo node to get a consistent
2322  *		snap shot of the mdi_pathinfo nodes.
2323  *
2324  *		Please note that even the list is stable the mdi_pathinfo
2325  *		node state and properties are volatile.  The caller should lock
2326  *		and unlock the nodes by calling mdi_pi_lock() and
2327  *		mdi_pi_unlock() functions to get a stable properties.
2328  *
2329  *		If there is a need to use the nodes beyond the hold of the
2330  *		devinfo node period (For ex. I/O), then mdi_pathinfo node
2331  *		need to be held against unexpected removal by calling
2332  *		mdi_hold_path() and should be released by calling
2333  *		mdi_rele_path() on completion.
2334  */
2335 mdi_pathinfo_t *
2336 mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
2337 {
2338 	mdi_client_t *ct;
2339 
2340 	if (!MDI_CLIENT(ct_dip))
2341 		return (NULL);
2342 
2343 	/*
2344 	 * Walk through client link
2345 	 */
2346 	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
2347 	ASSERT(ct != NULL);
2348 
2349 	if (pip == NULL)
2350 		return ((mdi_pathinfo_t *)ct->ct_path_head);
2351 
2352 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link);
2353 }
2354 
2355 /*
2356  * For a phci, return the next available path to any client
2357  * Note: ditto mdi_get_next_phci_path()
2358  */
2359 mdi_pathinfo_t *
2360 mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
2361 {
2362 	mdi_phci_t *ph;
2363 
2364 	if (!MDI_PHCI(ph_dip))
2365 		return (NULL);
2366 
2367 	/*
2368 	 * Walk through pHCI link
2369 	 */
2370 	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
2371 	ASSERT(ph != NULL);
2372 
2373 	if (pip == NULL)
2374 		return ((mdi_pathinfo_t *)ph->ph_path_head);
2375 
2376 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
2377 }
2378 
2379 /*
2380  * mdi_hold_path():
2381  *		Hold the mdi_pathinfo node against unwanted unexpected free.
2382  * Return Values:
2383  *		None
2384  */
2385 void
2386 mdi_hold_path(mdi_pathinfo_t *pip)
2387 {
2388 	if (pip) {
2389 		MDI_PI_LOCK(pip);
2390 		MDI_PI_HOLD(pip);
2391 		MDI_PI_UNLOCK(pip);
2392 	}
2393 }
2394 
2395 
2396 /*
2397  * mdi_rele_path():
2398  *		Release the mdi_pathinfo node which was selected
2399  *		through mdi_select_path() mechanism or manually held by
2400  *		calling mdi_hold_path().
2401  * Return Values:
2402  *		None
2403  */
2404 void
2405 mdi_rele_path(mdi_pathinfo_t *pip)
2406 {
2407 	if (pip) {
2408 		MDI_PI_LOCK(pip);
2409 		MDI_PI_RELE(pip);
2410 		if (MDI_PI(pip)->pi_ref_cnt == 0) {
2411 			cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
2412 		}
2413 		MDI_PI_UNLOCK(pip);
2414 	}
2415 }
2416 
2417 /*
2418  * mdi_pi_lock():
2419  * 		Lock the mdi_pathinfo node.
2420  * Note:
2421  *		The caller should release the lock by calling mdi_pi_unlock()
2422  */
2423 void
2424 mdi_pi_lock(mdi_pathinfo_t *pip)
2425 {
2426 	ASSERT(pip != NULL);
2427 	if (pip) {
2428 		MDI_PI_LOCK(pip);
2429 	}
2430 }
2431 
2432 
2433 /*
2434  * mdi_pi_unlock():
2435  * 		Unlock the mdi_pathinfo node.
2436  * Note:
2437  *		The mdi_pathinfo node should have been locked with mdi_pi_lock()
2438  */
2439 void
2440 mdi_pi_unlock(mdi_pathinfo_t *pip)
2441 {
2442 	ASSERT(pip != NULL);
2443 	if (pip) {
2444 		MDI_PI_UNLOCK(pip);
2445 	}
2446 }
2447 
2448 /*
2449  * mdi_pi_find():
2450  *		Search the list of mdi_pathinfo nodes attached to the
2451  *		pHCI/Client device node whose path address matches "paddr".
2452  *		Returns a pointer to the mdi_pathinfo node if a matching node is
2453  *		found.
2454  * Return Values:
2455  *		mdi_pathinfo node handle
2456  *		NULL
2457  * Notes:
2458  *		Caller need not hold any locks to call this function.
2459  */
2460 mdi_pathinfo_t *
2461 mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
2462 {
2463 	mdi_phci_t		*ph;
2464 	mdi_vhci_t		*vh;
2465 	mdi_client_t		*ct;
2466 	mdi_pathinfo_t		*pip = NULL;
2467 
2468 	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: %s %s",
2469 	    caddr ? caddr : "NULL", paddr ? paddr : "NULL"));
2470 	if ((pdip == NULL) || (paddr == NULL)) {
2471 		return (NULL);
2472 	}
2473 	ph = i_devi_get_phci(pdip);
2474 	if (ph == NULL) {
2475 		/*
2476 		 * Invalid pHCI device, Nothing more to do.
2477 		 */
2478 		MDI_DEBUG(2, (CE_WARN, pdip,
2479 		    "!mdi_pi_find: invalid phci"));
2480 		return (NULL);
2481 	}
2482 
2483 	vh = ph->ph_vhci;
2484 	if (vh == NULL) {
2485 		/*
2486 		 * Invalid vHCI device, Nothing more to do.
2487 		 */
2488 		MDI_DEBUG(2, (CE_WARN, pdip,
2489 		    "!mdi_pi_find: invalid vhci"));
2490 		return (NULL);
2491 	}
2492 
2493 	/*
2494 	 * Look for pathinfo node identified by paddr.
2495 	 */
2496 	if (caddr == NULL) {
2497 		/*
2498 		 * Find a mdi_pathinfo node under pHCI list for a matching
2499 		 * unit address.
2500 		 */
2501 		MDI_PHCI_LOCK(ph);
2502 		if (MDI_PHCI_IS_OFFLINE(ph)) {
2503 			MDI_DEBUG(2, (CE_WARN, pdip,
2504 			    "!mdi_pi_find: offline phci %p", (void *)ph));
2505 			MDI_PHCI_UNLOCK(ph);
2506 			return (NULL);
2507 		}
2508 		pip = (mdi_pathinfo_t *)ph->ph_path_head;
2509 
2510 		while (pip != NULL) {
2511 			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2512 				break;
2513 			}
2514 			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
2515 		}
2516 		MDI_PHCI_UNLOCK(ph);
2517 		MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: found %p",
2518 		    (void *)pip));
2519 		return (pip);
2520 	}
2521 
2522 	/*
2523 	 * XXX - Is the rest of the code in this function really necessary?
2524 	 * The consumers of mdi_pi_find() can search for the desired pathinfo
2525 	 * node by calling mdi_pi_find(pdip, NULL, paddr). Irrespective of
2526 	 * whether the search is based on the pathinfo nodes attached to
2527 	 * the pHCI or the client node, the result will be the same.
2528 	 */
2529 
2530 	/*
2531 	 * Find the client device corresponding to 'caddr'
2532 	 */
2533 	MDI_VHCI_CLIENT_LOCK(vh);
2534 
2535 	/*
2536 	 * XXX - Passing NULL to the following function works as long as the
2537 	 * the client addresses (caddr) are unique per vhci basis.
2538 	 */
2539 	ct = i_mdi_client_find(vh, NULL, caddr);
2540 	if (ct == NULL) {
2541 		/*
2542 		 * Client not found, Obviously mdi_pathinfo node has not been
2543 		 * created yet.
2544 		 */
2545 		MDI_VHCI_CLIENT_UNLOCK(vh);
2546 		MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: client not "
2547 		    "found for caddr %s", caddr ? caddr : "NULL"));
2548 		return (NULL);
2549 	}
2550 
2551 	/*
2552 	 * Hold the client lock and look for a mdi_pathinfo node with matching
2553 	 * pHCI and paddr
2554 	 */
2555 	MDI_CLIENT_LOCK(ct);
2556 
2557 	/*
2558 	 * Release the global mutex as it is no more needed. Note: We always
2559 	 * respect the locking order while acquiring.
2560 	 */
2561 	MDI_VHCI_CLIENT_UNLOCK(vh);
2562 
2563 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2564 	while (pip != NULL) {
2565 		/*
2566 		 * Compare the unit address
2567 		 */
2568 		if ((MDI_PI(pip)->pi_phci == ph) &&
2569 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2570 			break;
2571 		}
2572 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2573 	}
2574 	MDI_CLIENT_UNLOCK(ct);
2575 	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: found:: %p", (void *)pip));
2576 	return (pip);
2577 }
2578 
2579 /*
2580  * mdi_pi_alloc():
2581  *		Allocate and initialize a new instance of a mdi_pathinfo node.
2582  *		The mdi_pathinfo node returned by this function identifies a
2583  *		unique device path is capable of having properties attached
2584  *		and passed to mdi_pi_online() to fully attach and online the
2585  *		path and client device node.
2586  *		The mdi_pathinfo node returned by this function must be
2587  *		destroyed using mdi_pi_free() if the path is no longer
2588  *		operational or if the caller fails to attach a client device
2589  *		node when calling mdi_pi_online(). The framework will not free
2590  *		the resources allocated.
2591  *		This function can be called from both interrupt and kernel
2592  *		contexts.  DDI_NOSLEEP flag should be used while calling
2593  *		from interrupt contexts.
2594  * Return Values:
2595  *		MDI_SUCCESS
2596  *		MDI_FAILURE
2597  *		MDI_NOMEM
2598  */
2599 /*ARGSUSED*/
2600 int
2601 mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2602     char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
2603 {
2604 	mdi_vhci_t	*vh;
2605 	mdi_phci_t	*ph;
2606 	mdi_client_t	*ct;
2607 	mdi_pathinfo_t	*pip = NULL;
2608 	dev_info_t	*cdip;
2609 	int		rv = MDI_NOMEM;
2610 	int		path_allocated = 0;
2611 
2612 	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_alloc_compatible: %s %s %s",
2613 	    cname ? cname : "NULL", caddr ? caddr : "NULL",
2614 	    paddr ? paddr : "NULL"));
2615 
2616 	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
2617 	    ret_pip == NULL) {
2618 		/* Nothing more to do */
2619 		return (MDI_FAILURE);
2620 	}
2621 
2622 	*ret_pip = NULL;
2623 
2624 	/* No allocations on detaching pHCI */
2625 	if (DEVI_IS_DETACHING(pdip)) {
2626 		/* Invalid pHCI device, return failure */
2627 		MDI_DEBUG(1, (CE_WARN, pdip,
2628 		    "!mdi_pi_alloc: detaching pHCI=%p", (void *)pdip));
2629 		return (MDI_FAILURE);
2630 	}
2631 
2632 	ph = i_devi_get_phci(pdip);
2633 	ASSERT(ph != NULL);
2634 	if (ph == NULL) {
2635 		/* Invalid pHCI device, return failure */
2636 		MDI_DEBUG(1, (CE_WARN, pdip,
2637 		    "!mdi_pi_alloc: invalid pHCI=%p", (void *)pdip));
2638 		return (MDI_FAILURE);
2639 	}
2640 
2641 	MDI_PHCI_LOCK(ph);
2642 	vh = ph->ph_vhci;
2643 	if (vh == NULL) {
2644 		/* Invalid vHCI device, return failure */
2645 		MDI_DEBUG(1, (CE_WARN, pdip,
2646 		    "!mdi_pi_alloc: invalid vHCI=%p", (void *)pdip));
2647 		MDI_PHCI_UNLOCK(ph);
2648 		return (MDI_FAILURE);
2649 	}
2650 
2651 	if (MDI_PHCI_IS_READY(ph) == 0) {
2652 		/*
2653 		 * Do not allow new node creation when pHCI is in
2654 		 * offline/suspended states
2655 		 */
2656 		MDI_DEBUG(1, (CE_WARN, pdip,
2657 		    "mdi_pi_alloc: pHCI=%p is not ready", (void *)ph));
2658 		MDI_PHCI_UNLOCK(ph);
2659 		return (MDI_BUSY);
2660 	}
2661 	MDI_PHCI_UNSTABLE(ph);
2662 	MDI_PHCI_UNLOCK(ph);
2663 
2664 	/* look for a matching client, create one if not found */
2665 	MDI_VHCI_CLIENT_LOCK(vh);
2666 	ct = i_mdi_client_find(vh, cname, caddr);
2667 	if (ct == NULL) {
2668 		ct = i_mdi_client_alloc(vh, cname, caddr);
2669 		ASSERT(ct != NULL);
2670 	}
2671 
2672 	if (ct->ct_dip == NULL) {
2673 		/*
2674 		 * Allocate a devinfo node
2675 		 */
2676 		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
2677 		    compatible, ncompatible);
2678 		if (ct->ct_dip == NULL) {
2679 			(void) i_mdi_client_free(vh, ct);
2680 			goto fail;
2681 		}
2682 	}
2683 	cdip = ct->ct_dip;
2684 
2685 	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
2686 	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;
2687 
2688 	MDI_CLIENT_LOCK(ct);
2689 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2690 	while (pip != NULL) {
2691 		/*
2692 		 * Compare the unit address
2693 		 */
2694 		if ((MDI_PI(pip)->pi_phci == ph) &&
2695 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2696 			break;
2697 		}
2698 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2699 	}
2700 	MDI_CLIENT_UNLOCK(ct);
2701 
2702 	if (pip == NULL) {
2703 		/*
2704 		 * This is a new path for this client device.  Allocate and
2705 		 * initialize a new pathinfo node
2706 		 */
2707 		pip = i_mdi_pi_alloc(ph, paddr, ct);
2708 		ASSERT(pip != NULL);
2709 		path_allocated = 1;
2710 	}
2711 	rv = MDI_SUCCESS;
2712 
2713 fail:
2714 	/*
2715 	 * Release the global mutex.
2716 	 */
2717 	MDI_VHCI_CLIENT_UNLOCK(vh);
2718 
2719 	/*
2720 	 * Mark the pHCI as stable
2721 	 */
2722 	MDI_PHCI_LOCK(ph);
2723 	MDI_PHCI_STABLE(ph);
2724 	MDI_PHCI_UNLOCK(ph);
2725 	*ret_pip = pip;
2726 
2727 	MDI_DEBUG(2, (CE_NOTE, pdip,
2728 	    "!mdi_pi_alloc_compatible: alloc %p", (void *)pip));
2729 
2730 	if (path_allocated)
2731 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
2732 
2733 	return (rv);
2734 }
2735 
2736 /*ARGSUSED*/
2737 int
2738 mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2739     int flags, mdi_pathinfo_t **ret_pip)
2740 {
2741 	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
2742 	    flags, ret_pip));
2743 }
2744 
2745 /*
2746  * i_mdi_pi_alloc():
2747  *		Allocate a mdi_pathinfo node and add to the pHCI path list
2748  * Return Values:
2749  *		mdi_pathinfo
2750  */
2751 /*ARGSUSED*/
2752 static mdi_pathinfo_t *
2753 i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
2754 {
2755 	mdi_pathinfo_t	*pip;
2756 	int		ct_circular;
2757 	int		ph_circular;
2758 	int		se_flag;
2759 	int		kmem_flag;
2760 
2761 	ASSERT(MDI_VHCI_CLIENT_LOCKED(ph->ph_vhci));
2762 
2763 	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
2764 	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
2765 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
2766 	    MDI_PATHINFO_STATE_TRANSIENT;
2767 
2768 	if (MDI_PHCI_IS_USER_DISABLED(ph))
2769 		MDI_PI_SET_USER_DISABLE(pip);
2770 
2771 	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
2772 		MDI_PI_SET_DRV_DISABLE_TRANS(pip);
2773 
2774 	if (MDI_PHCI_IS_DRV_DISABLED(ph))
2775 		MDI_PI_SET_DRV_DISABLE(pip);
2776 
2777 	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
2778 	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
2779 	MDI_PI(pip)->pi_client = ct;
2780 	MDI_PI(pip)->pi_phci = ph;
2781 	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
2782 	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);
2783 	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
2784 	ASSERT(MDI_PI(pip)->pi_prop != NULL);
2785 	MDI_PI(pip)->pi_pprivate = NULL;
2786 	MDI_PI(pip)->pi_cprivate = NULL;
2787 	MDI_PI(pip)->pi_vprivate = NULL;
2788 	MDI_PI(pip)->pi_client_link = NULL;
2789 	MDI_PI(pip)->pi_phci_link = NULL;
2790 	MDI_PI(pip)->pi_ref_cnt = 0;
2791 	MDI_PI(pip)->pi_kstats = NULL;
2792 	MDI_PI(pip)->pi_preferred = 1;
2793 	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);
2794 
2795 	/*
2796 	 * Lock both dev_info nodes against changes in parallel.
2797 	 *
2798 	 * The ndi_devi_enter(Client), is atypical since the client is a leaf.
2799 	 * This atypical operation is done to synchronize pathinfo nodes
2800 	 * during devinfo snapshot (see di_register_pip) by 'pretending' that
2801 	 * the pathinfo nodes are children of the Client.
2802 	 */
2803 	ndi_devi_enter(ct->ct_dip, &ct_circular);
2804 	ndi_devi_enter(ph->ph_dip, &ph_circular);
2805 
2806 	i_mdi_phci_add_path(ph, pip);
2807 	i_mdi_client_add_path(ct, pip);
2808 
2809 	ndi_devi_exit(ph->ph_dip, ph_circular);
2810 	ndi_devi_exit(ct->ct_dip, ct_circular);
2811 
2812 	/* determine interrupt context */
2813 	se_flag = (servicing_interrupt()) ? SE_NOSLEEP : SE_SLEEP;
2814 	kmem_flag = (se_flag == SE_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
2815 
2816 	i_ddi_di_cache_invalidate(kmem_flag);
2817 
2818 	return (pip);
2819 }
2820 
2821 /*
2822  * i_mdi_phci_add_path():
2823  * 		Add a mdi_pathinfo node to pHCI list.
2824  * Notes:
2825  *		Caller should per-pHCI mutex
2826  */
2827 static void
2828 i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
2829 {
2830 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
2831 
2832 	MDI_PHCI_LOCK(ph);
2833 	if (ph->ph_path_head == NULL) {
2834 		ph->ph_path_head = pip;
2835 	} else {
2836 		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
2837 	}
2838 	ph->ph_path_tail = pip;
2839 	ph->ph_path_count++;
2840 	MDI_PHCI_UNLOCK(ph);
2841 }
2842 
2843 /*
2844  * i_mdi_client_add_path():
2845  *		Add mdi_pathinfo node to client list
2846  */
2847 static void
2848 i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
2849 {
2850 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
2851 
2852 	MDI_CLIENT_LOCK(ct);
2853 	if (ct->ct_path_head == NULL) {
2854 		ct->ct_path_head = pip;
2855 	} else {
2856 		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
2857 	}
2858 	ct->ct_path_tail = pip;
2859 	ct->ct_path_count++;
2860 	MDI_CLIENT_UNLOCK(ct);
2861 }
2862 
2863 /*
2864  * mdi_pi_free():
2865  *		Free the mdi_pathinfo node and also client device node if this
2866  *		is the last path to the device
2867  * Return Values:
2868  *		MDI_SUCCESS
2869  *		MDI_FAILURE
2870  *		MDI_BUSY
2871  */
2872 /*ARGSUSED*/
2873 int
2874 mdi_pi_free(mdi_pathinfo_t *pip, int flags)
2875 {
2876 	int		rv = MDI_SUCCESS;
2877 	mdi_vhci_t	*vh;
2878 	mdi_phci_t	*ph;
2879 	mdi_client_t	*ct;
2880 	int		(*f)();
2881 	int		client_held = 0;
2882 
2883 	MDI_PI_LOCK(pip);
2884 	ph = MDI_PI(pip)->pi_phci;
2885 	ASSERT(ph != NULL);
2886 	if (ph == NULL) {
2887 		/*
2888 		 * Invalid pHCI device, return failure
2889 		 */
2890 		MDI_DEBUG(1, (CE_WARN, NULL,
2891 		    "!mdi_pi_free: invalid pHCI pip=%p", (void *)pip));
2892 		MDI_PI_UNLOCK(pip);
2893 		return (MDI_FAILURE);
2894 	}
2895 
2896 	vh = ph->ph_vhci;
2897 	ASSERT(vh != NULL);
2898 	if (vh == NULL) {
2899 		/* Invalid pHCI device, return failure */
2900 		MDI_DEBUG(1, (CE_WARN, NULL,
2901 		    "!mdi_pi_free: invalid vHCI pip=%p", (void *)pip));
2902 		MDI_PI_UNLOCK(pip);
2903 		return (MDI_FAILURE);
2904 	}
2905 
2906 	ct = MDI_PI(pip)->pi_client;
2907 	ASSERT(ct != NULL);
2908 	if (ct == NULL) {
2909 		/*
2910 		 * Invalid Client device, return failure
2911 		 */
2912 		MDI_DEBUG(1, (CE_WARN, NULL,
2913 		    "!mdi_pi_free: invalid client pip=%p", (void *)pip));
2914 		MDI_PI_UNLOCK(pip);
2915 		return (MDI_FAILURE);
2916 	}
2917 
2918 	/*
2919 	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
2920 	 * if the node state is either offline or init and the reference count
2921 	 * is zero.
2922 	 */
2923 	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
2924 	    MDI_PI_IS_INITING(pip))) {
2925 		/*
2926 		 * Node is busy
2927 		 */
2928 		MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
2929 		    "!mdi_pi_free: pathinfo node is busy pip=%p", (void *)pip));
2930 		MDI_PI_UNLOCK(pip);
2931 		return (MDI_BUSY);
2932 	}
2933 
2934 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
2935 		/*
2936 		 * Give a chance for pending I/Os to complete.
2937 		 */
2938 		MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!mdi_pi_free: "
2939 		    "%d cmds still pending on path: %p\n",
2940 		    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
2941 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
2942 		    &MDI_PI(pip)->pi_mutex,
2943 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
2944 			/*
2945 			 * The timeout time reached without ref_cnt being zero
2946 			 * being signaled.
2947 			 */
2948 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip,
2949 			    "!mdi_pi_free: "
2950 			    "Timeout reached on path %p without the cond\n",
2951 			    (void *)pip));
2952 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip,
2953 			    "!mdi_pi_free: "
2954 			    "%d cmds still pending on path: %p\n",
2955 			    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
2956 			MDI_PI_UNLOCK(pip);
2957 			return (MDI_BUSY);
2958 		}
2959 	}
2960 	if (MDI_PI(pip)->pi_pm_held) {
2961 		client_held = 1;
2962 	}
2963 	MDI_PI_UNLOCK(pip);
2964 
2965 	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));
2966 
2967 	MDI_CLIENT_LOCK(ct);
2968 
2969 	/* Prevent further failovers till MDI_VHCI_CLIENT_LOCK is held */
2970 	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);
2971 
2972 	/*
2973 	 * Wait till failover is complete before removing this node.
2974 	 */
2975 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
2976 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
2977 
2978 	MDI_CLIENT_UNLOCK(ct);
2979 	MDI_VHCI_CLIENT_LOCK(vh);
2980 	MDI_CLIENT_LOCK(ct);
2981 	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);
2982 
2983 	if (!MDI_PI_IS_INITING(pip)) {
2984 		f = vh->vh_ops->vo_pi_uninit;
2985 		if (f != NULL) {
2986 			rv = (*f)(vh->vh_dip, pip, 0);
2987 		}
2988 	}
2989 	/*
2990 	 * If vo_pi_uninit() completed successfully.
2991 	 */
2992 	if (rv == MDI_SUCCESS) {
2993 		if (client_held) {
2994 			MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_free "
2995 			    "i_mdi_pm_rele_client\n"));
2996 			i_mdi_pm_rele_client(ct, 1);
2997 		}
2998 		i_mdi_pi_free(ph, pip, ct);
2999 		if (ct->ct_path_count == 0) {
3000 			/*
3001 			 * Client lost its last path.
3002 			 * Clean up the client device
3003 			 */
3004 			MDI_CLIENT_UNLOCK(ct);
3005 			(void) i_mdi_client_free(ct->ct_vhci, ct);
3006 			MDI_VHCI_CLIENT_UNLOCK(vh);
3007 			return (rv);
3008 		}
3009 	}
3010 	MDI_CLIENT_UNLOCK(ct);
3011 	MDI_VHCI_CLIENT_UNLOCK(vh);
3012 
3013 	if (rv == MDI_FAILURE)
3014 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
3015 
3016 	return (rv);
3017 }
3018 
3019 /*
3020  * i_mdi_pi_free():
3021  *		Free the mdi_pathinfo node
3022  */
3023 static void
3024 i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
3025 {
3026 	int	ct_circular;
3027 	int	ph_circular;
3028 	int	se_flag;
3029 	int	kmem_flag;
3030 
3031 	ASSERT(MDI_CLIENT_LOCKED(ct));
3032 
3033 	/*
3034 	 * remove any per-path kstats
3035 	 */
3036 	i_mdi_pi_kstat_destroy(pip);
3037 
3038 	/* See comments in i_mdi_pi_alloc() */
3039 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3040 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3041 
3042 	i_mdi_client_remove_path(ct, pip);
3043 	i_mdi_phci_remove_path(ph, pip);
3044 
3045 	ndi_devi_exit(ph->ph_dip, ph_circular);
3046 	ndi_devi_exit(ct->ct_dip, ct_circular);
3047 
3048 	/* determine interrupt context */
3049 	se_flag = (servicing_interrupt()) ? SE_NOSLEEP : SE_SLEEP;
3050 	kmem_flag = (se_flag == SE_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
3051 
3052 	i_ddi_di_cache_invalidate(kmem_flag);
3053 
3054 	mutex_destroy(&MDI_PI(pip)->pi_mutex);
3055 	cv_destroy(&MDI_PI(pip)->pi_state_cv);
3056 	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
3057 	if (MDI_PI(pip)->pi_addr) {
3058 		kmem_free(MDI_PI(pip)->pi_addr,
3059 		    strlen(MDI_PI(pip)->pi_addr) + 1);
3060 		MDI_PI(pip)->pi_addr = NULL;
3061 	}
3062 
3063 	if (MDI_PI(pip)->pi_prop) {
3064 		(void) nvlist_free(MDI_PI(pip)->pi_prop);
3065 		MDI_PI(pip)->pi_prop = NULL;
3066 	}
3067 	kmem_free(pip, sizeof (struct mdi_pathinfo));
3068 }
3069 
3070 
3071 /*
3072  * i_mdi_phci_remove_path():
3073  * 		Remove a mdi_pathinfo node from pHCI list.
3074  * Notes:
3075  *		Caller should hold per-pHCI mutex
3076  */
3077 static void
3078 i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3079 {
3080 	mdi_pathinfo_t	*prev = NULL;
3081 	mdi_pathinfo_t	*path = NULL;
3082 
3083 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3084 
3085 	MDI_PHCI_LOCK(ph);
3086 	path = ph->ph_path_head;
3087 	while (path != NULL) {
3088 		if (path == pip) {
3089 			break;
3090 		}
3091 		prev = path;
3092 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3093 	}
3094 
3095 	if (path) {
3096 		ph->ph_path_count--;
3097 		if (prev) {
3098 			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
3099 		} else {
3100 			ph->ph_path_head =
3101 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3102 		}
3103 		if (ph->ph_path_tail == path) {
3104 			ph->ph_path_tail = prev;
3105 		}
3106 	}
3107 
3108 	/*
3109 	 * Clear the pHCI link
3110 	 */
3111 	MDI_PI(pip)->pi_phci_link = NULL;
3112 	MDI_PI(pip)->pi_phci = NULL;
3113 	MDI_PHCI_UNLOCK(ph);
3114 }
3115 
3116 /*
3117  * i_mdi_client_remove_path():
3118  * 		Remove a mdi_pathinfo node from client path list.
3119  */
3120 static void
3121 i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3122 {
3123 	mdi_pathinfo_t	*prev = NULL;
3124 	mdi_pathinfo_t	*path;
3125 
3126 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3127 
3128 	ASSERT(MDI_CLIENT_LOCKED(ct));
3129 	path = ct->ct_path_head;
3130 	while (path != NULL) {
3131 		if (path == pip) {
3132 			break;
3133 		}
3134 		prev = path;
3135 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3136 	}
3137 
3138 	if (path) {
3139 		ct->ct_path_count--;
3140 		if (prev) {
3141 			MDI_PI(prev)->pi_client_link =
3142 			    MDI_PI(path)->pi_client_link;
3143 		} else {
3144 			ct->ct_path_head =
3145 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3146 		}
3147 		if (ct->ct_path_tail == path) {
3148 			ct->ct_path_tail = prev;
3149 		}
3150 		if (ct->ct_path_last == path) {
3151 			ct->ct_path_last = ct->ct_path_head;
3152 		}
3153 	}
3154 	MDI_PI(pip)->pi_client_link = NULL;
3155 	MDI_PI(pip)->pi_client = NULL;
3156 }
3157 
3158 /*
3159  * i_mdi_pi_state_change():
3160  *		online a mdi_pathinfo node
3161  *
3162  * Return Values:
3163  *		MDI_SUCCESS
3164  *		MDI_FAILURE
3165  */
3166 /*ARGSUSED*/
3167 static int
3168 i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
3169 {
3170 	int		rv = MDI_SUCCESS;
3171 	mdi_vhci_t	*vh;
3172 	mdi_phci_t	*ph;
3173 	mdi_client_t	*ct;
3174 	int		(*f)();
3175 	dev_info_t	*cdip;
3176 
3177 	MDI_PI_LOCK(pip);
3178 
3179 	ph = MDI_PI(pip)->pi_phci;
3180 	ASSERT(ph);
3181 	if (ph == NULL) {
3182 		/*
3183 		 * Invalid pHCI device, fail the request
3184 		 */
3185 		MDI_PI_UNLOCK(pip);
3186 		MDI_DEBUG(1, (CE_WARN, NULL,
3187 		    "!mdi_pi_state_change: invalid phci pip=%p", (void *)pip));
3188 		return (MDI_FAILURE);
3189 	}
3190 
3191 	vh = ph->ph_vhci;
3192 	ASSERT(vh);
3193 	if (vh == NULL) {
3194 		/*
3195 		 * Invalid vHCI device, fail the request
3196 		 */
3197 		MDI_PI_UNLOCK(pip);
3198 		MDI_DEBUG(1, (CE_WARN, NULL,
3199 		    "!mdi_pi_state_change: invalid vhci pip=%p", (void *)pip));
3200 		return (MDI_FAILURE);
3201 	}
3202 
3203 	ct = MDI_PI(pip)->pi_client;
3204 	ASSERT(ct != NULL);
3205 	if (ct == NULL) {
3206 		/*
3207 		 * Invalid client device, fail the request
3208 		 */
3209 		MDI_PI_UNLOCK(pip);
3210 		MDI_DEBUG(1, (CE_WARN, NULL,
3211 		    "!mdi_pi_state_change: invalid client pip=%p",
3212 		    (void *)pip));
3213 		return (MDI_FAILURE);
3214 	}
3215 
3216 	/*
3217 	 * If this path has not been initialized yet, Callback vHCI driver's
3218 	 * pathinfo node initialize entry point
3219 	 */
3220 
3221 	if (MDI_PI_IS_INITING(pip)) {
3222 		MDI_PI_UNLOCK(pip);
3223 		f = vh->vh_ops->vo_pi_init;
3224 		if (f != NULL) {
3225 			rv = (*f)(vh->vh_dip, pip, 0);
3226 			if (rv != MDI_SUCCESS) {
3227 				MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
3228 				    "!vo_pi_init: failed vHCI=0x%p, pip=0x%p",
3229 				    (void *)vh, (void *)pip));
3230 				return (MDI_FAILURE);
3231 			}
3232 		}
3233 		MDI_PI_LOCK(pip);
3234 		MDI_PI_CLEAR_TRANSIENT(pip);
3235 	}
3236 
3237 	/*
3238 	 * Do not allow state transition when pHCI is in offline/suspended
3239 	 * states
3240 	 */
3241 	i_mdi_phci_lock(ph, pip);
3242 	if (MDI_PHCI_IS_READY(ph) == 0) {
3243 		MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
3244 		    "!mdi_pi_state_change: pHCI not ready, pHCI=%p",
3245 		    (void *)ph));
3246 		MDI_PI_UNLOCK(pip);
3247 		i_mdi_phci_unlock(ph);
3248 		return (MDI_BUSY);
3249 	}
3250 	MDI_PHCI_UNSTABLE(ph);
3251 	i_mdi_phci_unlock(ph);
3252 
3253 	/*
3254 	 * Check if mdi_pathinfo state is in transient state.
3255 	 * If yes, offlining is in progress and wait till transient state is
3256 	 * cleared.
3257 	 */
3258 	if (MDI_PI_IS_TRANSIENT(pip)) {
3259 		while (MDI_PI_IS_TRANSIENT(pip)) {
3260 			cv_wait(&MDI_PI(pip)->pi_state_cv,
3261 			    &MDI_PI(pip)->pi_mutex);
3262 		}
3263 	}
3264 
3265 	/*
3266 	 * Grab the client lock in reverse order sequence and release the
3267 	 * mdi_pathinfo mutex.
3268 	 */
3269 	i_mdi_client_lock(ct, pip);
3270 	MDI_PI_UNLOCK(pip);
3271 
3272 	/*
3273 	 * Wait till failover state is cleared
3274 	 */
3275 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3276 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3277 
3278 	/*
3279 	 * Mark the mdi_pathinfo node state as transient
3280 	 */
3281 	MDI_PI_LOCK(pip);
3282 	switch (state) {
3283 	case MDI_PATHINFO_STATE_ONLINE:
3284 		MDI_PI_SET_ONLINING(pip);
3285 		break;
3286 
3287 	case MDI_PATHINFO_STATE_STANDBY:
3288 		MDI_PI_SET_STANDBYING(pip);
3289 		break;
3290 
3291 	case MDI_PATHINFO_STATE_FAULT:
3292 		/*
3293 		 * Mark the pathinfo state as FAULTED
3294 		 */
3295 		MDI_PI_SET_FAULTING(pip);
3296 		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
3297 		break;
3298 
3299 	case MDI_PATHINFO_STATE_OFFLINE:
3300 		/*
3301 		 * ndi_devi_offline() cannot hold pip or ct locks.
3302 		 */
3303 		MDI_PI_UNLOCK(pip);
3304 		/*
3305 		 * Do not offline if path will become last path and path
3306 		 * is busy for user initiated events.
3307 		 */
3308 		cdip = ct->ct_dip;
3309 		if ((flag & NDI_DEVI_REMOVE) &&
3310 		    (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED)) {
3311 			i_mdi_client_unlock(ct);
3312 			rv = ndi_devi_offline(cdip, 0);
3313 			if (rv != NDI_SUCCESS) {
3314 				/*
3315 				 * Convert to MDI error code
3316 				 */
3317 				switch (rv) {
3318 				case NDI_BUSY:
3319 					rv = MDI_BUSY;
3320 					break;
3321 				default:
3322 					rv = MDI_FAILURE;
3323 					break;
3324 				}
3325 				goto state_change_exit;
3326 			} else {
3327 				i_mdi_client_lock(ct, NULL);
3328 			}
3329 		}
3330 		/*
3331 		 * Mark the mdi_pathinfo node state as transient
3332 		 */
3333 		MDI_PI_LOCK(pip);
3334 		MDI_PI_SET_OFFLINING(pip);
3335 		break;
3336 	}
3337 	MDI_PI_UNLOCK(pip);
3338 	MDI_CLIENT_UNSTABLE(ct);
3339 	i_mdi_client_unlock(ct);
3340 
3341 	f = vh->vh_ops->vo_pi_state_change;
3342 	if (f != NULL)
3343 		rv = (*f)(vh->vh_dip, pip, state, 0, flag);
3344 
3345 	MDI_CLIENT_LOCK(ct);
3346 	MDI_PI_LOCK(pip);
3347 	if (rv == MDI_NOT_SUPPORTED) {
3348 		MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
3349 	}
3350 	if (rv != MDI_SUCCESS) {
3351 		MDI_DEBUG(2, (CE_WARN, ct->ct_dip,
3352 		    "!vo_pi_state_change: failed rv = %x", rv));
3353 	}
3354 	if (MDI_PI_IS_TRANSIENT(pip)) {
3355 		if (rv == MDI_SUCCESS) {
3356 			MDI_PI_CLEAR_TRANSIENT(pip);
3357 		} else {
3358 			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
3359 		}
3360 	}
3361 
3362 	/*
3363 	 * Wake anyone waiting for this mdi_pathinfo node
3364 	 */
3365 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3366 	MDI_PI_UNLOCK(pip);
3367 
3368 	/*
3369 	 * Mark the client device as stable
3370 	 */
3371 	MDI_CLIENT_STABLE(ct);
3372 	if (rv == MDI_SUCCESS) {
3373 		if (ct->ct_unstable == 0) {
3374 			cdip = ct->ct_dip;
3375 
3376 			/*
3377 			 * Onlining the mdi_pathinfo node will impact the
3378 			 * client state Update the client and dev_info node
3379 			 * state accordingly
3380 			 */
3381 			rv = NDI_SUCCESS;
3382 			i_mdi_client_update_state(ct);
3383 			switch (MDI_CLIENT_STATE(ct)) {
3384 			case MDI_CLIENT_STATE_OPTIMAL:
3385 			case MDI_CLIENT_STATE_DEGRADED:
3386 				if (cdip && !i_ddi_devi_attached(cdip) &&
3387 				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
3388 				    (state == MDI_PATHINFO_STATE_STANDBY))) {
3389 
3390 					/*
3391 					 * Must do ndi_devi_online() through
3392 					 * hotplug thread for deferred
3393 					 * attach mechanism to work
3394 					 */
3395 					MDI_CLIENT_UNLOCK(ct);
3396 					rv = ndi_devi_online(cdip, 0);
3397 					MDI_CLIENT_LOCK(ct);
3398 					if ((rv != NDI_SUCCESS) &&
3399 					    (MDI_CLIENT_STATE(ct) ==
3400 					    MDI_CLIENT_STATE_DEGRADED)) {
3401 						/*
3402 						 * ndi_devi_online failed.
3403 						 * Reset client flags to
3404 						 * offline.
3405 						 */
3406 						MDI_DEBUG(1, (CE_WARN, cdip,
3407 						    "!ndi_devi_online: failed "
3408 						    " Error: %x", rv));
3409 						MDI_CLIENT_SET_OFFLINE(ct);
3410 					}
3411 					if (rv != NDI_SUCCESS) {
3412 						/* Reset the path state */
3413 						MDI_PI_LOCK(pip);
3414 						MDI_PI(pip)->pi_state =
3415 						    MDI_PI_OLD_STATE(pip);
3416 						MDI_PI_UNLOCK(pip);
3417 					}
3418 				}
3419 				break;
3420 
3421 			case MDI_CLIENT_STATE_FAILED:
3422 				/*
3423 				 * This is the last path case for
3424 				 * non-user initiated events.
3425 				 */
3426 				if (((flag & NDI_DEVI_REMOVE) == 0) &&
3427 				    cdip && (i_ddi_node_state(cdip) >=
3428 				    DS_INITIALIZED)) {
3429 					MDI_CLIENT_UNLOCK(ct);
3430 					rv = ndi_devi_offline(cdip, 0);
3431 					MDI_CLIENT_LOCK(ct);
3432 
3433 					if (rv != NDI_SUCCESS) {
3434 						/*
3435 						 * ndi_devi_offline failed.
3436 						 * Reset client flags to
3437 						 * online as the path could not
3438 						 * be offlined.
3439 						 */
3440 						MDI_DEBUG(1, (CE_WARN, cdip,
3441 						    "!ndi_devi_offline: failed "
3442 						    " Error: %x", rv));
3443 						MDI_CLIENT_SET_ONLINE(ct);
3444 					}
3445 				}
3446 				break;
3447 			}
3448 			/*
3449 			 * Convert to MDI error code
3450 			 */
3451 			switch (rv) {
3452 			case NDI_SUCCESS:
3453 				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3454 				i_mdi_report_path_state(ct, pip);
3455 				rv = MDI_SUCCESS;
3456 				break;
3457 			case NDI_BUSY:
3458 				rv = MDI_BUSY;
3459 				break;
3460 			default:
3461 				rv = MDI_FAILURE;
3462 				break;
3463 			}
3464 		}
3465 	}
3466 	MDI_CLIENT_UNLOCK(ct);
3467 
3468 state_change_exit:
3469 	/*
3470 	 * Mark the pHCI as stable again.
3471 	 */
3472 	MDI_PHCI_LOCK(ph);
3473 	MDI_PHCI_STABLE(ph);
3474 	MDI_PHCI_UNLOCK(ph);
3475 	return (rv);
3476 }
3477 
3478 /*
3479  * mdi_pi_online():
3480  *		Place the path_info node in the online state.  The path is
3481  *		now available to be selected by mdi_select_path() for
3482  *		transporting I/O requests to client devices.
3483  * Return Values:
3484  *		MDI_SUCCESS
3485  *		MDI_FAILURE
3486  */
3487 int
3488 mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3489 {
3490 	mdi_client_t *ct = MDI_PI(pip)->pi_client;
3491 	dev_info_t *cdip;
3492 	int		client_held = 0;
3493 	int rv;
3494 
3495 	ASSERT(ct != NULL);
3496 	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
3497 	if (rv != MDI_SUCCESS)
3498 		return (rv);
3499 
3500 	MDI_PI_LOCK(pip);
3501 	if (MDI_PI(pip)->pi_pm_held == 0) {
3502 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_online "
3503 		    "i_mdi_pm_hold_pip %p\n", (void *)pip));
3504 		i_mdi_pm_hold_pip(pip);
3505 		client_held = 1;
3506 	}
3507 	MDI_PI_UNLOCK(pip);
3508 
3509 	if (client_held) {
3510 		MDI_CLIENT_LOCK(ct);
3511 		if (ct->ct_power_cnt == 0) {
3512 			rv = i_mdi_power_all_phci(ct);
3513 		}
3514 
3515 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_online "
3516 		    "i_mdi_pm_hold_client %p\n", (void *)ct));
3517 		i_mdi_pm_hold_client(ct, 1);
3518 		MDI_CLIENT_UNLOCK(ct);
3519 	}
3520 
3521 	/*
3522 	 * Create the per-path (pathinfo) IO and error kstats which
3523 	 * are reported via iostat(1m).
3524 	 *
3525 	 * Defer creating the per-path kstats if device is not yet
3526 	 * attached;  the names of the kstats are constructed in part
3527 	 * using the devices instance number which is assigned during
3528 	 * process of attaching the client device.
3529 	 *
3530 	 * The framework post_attach handler, mdi_post_attach(), is
3531 	 * is responsible for initializing the client's pathinfo list
3532 	 * once successfully attached.
3533 	 */
3534 	cdip = ct->ct_dip;
3535 	ASSERT(cdip);
3536 	if (cdip == NULL || !i_ddi_devi_attached(cdip))
3537 		return (rv);
3538 
3539 	MDI_CLIENT_LOCK(ct);
3540 	rv = i_mdi_pi_kstat_create(pip);
3541 	MDI_CLIENT_UNLOCK(ct);
3542 	return (rv);
3543 }
3544 
3545 /*
3546  * mdi_pi_standby():
3547  *		Place the mdi_pathinfo node in standby state
3548  *
3549  * Return Values:
3550  *		MDI_SUCCESS
3551  *		MDI_FAILURE
3552  */
3553 int
3554 mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
3555 {
3556 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
3557 }
3558 
3559 /*
3560  * mdi_pi_fault():
3561  *		Place the mdi_pathinfo node in fault'ed state
3562  * Return Values:
3563  *		MDI_SUCCESS
3564  *		MDI_FAILURE
3565  */
3566 int
3567 mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
3568 {
3569 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
3570 }
3571 
3572 /*
3573  * mdi_pi_offline():
3574  *		Offline a mdi_pathinfo node.
3575  * Return Values:
3576  *		MDI_SUCCESS
3577  *		MDI_FAILURE
3578  */
3579 int
3580 mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3581 {
3582 	int	ret, client_held = 0;
3583 	mdi_client_t	*ct;
3584 
3585 	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
3586 
3587 	if (ret == MDI_SUCCESS) {
3588 		MDI_PI_LOCK(pip);
3589 		if (MDI_PI(pip)->pi_pm_held) {
3590 			client_held = 1;
3591 		}
3592 		MDI_PI_UNLOCK(pip);
3593 
3594 		if (client_held) {
3595 			ct = MDI_PI(pip)->pi_client;
3596 			MDI_CLIENT_LOCK(ct);
3597 			MDI_DEBUG(4, (CE_NOTE, ct->ct_dip,
3598 			    "mdi_pi_offline i_mdi_pm_rele_client\n"));
3599 			i_mdi_pm_rele_client(ct, 1);
3600 			MDI_CLIENT_UNLOCK(ct);
3601 		}
3602 	}
3603 
3604 	return (ret);
3605 }
3606 
3607 /*
3608  * i_mdi_pi_offline():
3609  *		Offline a mdi_pathinfo node and call the vHCI driver's callback
3610  */
3611 static int
3612 i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3613 {
3614 	dev_info_t	*vdip = NULL;
3615 	mdi_vhci_t	*vh = NULL;
3616 	mdi_client_t	*ct = NULL;
3617 	int		(*f)();
3618 	int		rv;
3619 
3620 	MDI_PI_LOCK(pip);
3621 	ct = MDI_PI(pip)->pi_client;
3622 	ASSERT(ct != NULL);
3623 
3624 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3625 		/*
3626 		 * Give a chance for pending I/Os to complete.
3627 		 */
3628 		MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3629 		    "%d cmds still pending on path: %p\n",
3630 		    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
3631 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
3632 		    &MDI_PI(pip)->pi_mutex,
3633 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
3634 			/*
3635 			 * The timeout time reached without ref_cnt being zero
3636 			 * being signaled.
3637 			 */
3638 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3639 			    "Timeout reached on path %p without the cond\n",
3640 			    (void *)pip));
3641 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3642 			    "%d cmds still pending on path: %p\n",
3643 			    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
3644 		}
3645 	}
3646 	vh = ct->ct_vhci;
3647 	vdip = vh->vh_dip;
3648 
3649 	/*
3650 	 * Notify vHCI that has registered this event
3651 	 */
3652 	ASSERT(vh->vh_ops);
3653 	f = vh->vh_ops->vo_pi_state_change;
3654 
3655 	if (f != NULL) {
3656 		MDI_PI_UNLOCK(pip);
3657 		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
3658 		    flags)) != MDI_SUCCESS) {
3659 			MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
3660 			    "!vo_path_offline failed "
3661 			    "vdip %p, pip %p", (void *)vdip, (void *)pip));
3662 		}
3663 		MDI_PI_LOCK(pip);
3664 	}
3665 
3666 	/*
3667 	 * Set the mdi_pathinfo node state and clear the transient condition
3668 	 */
3669 	MDI_PI_SET_OFFLINE(pip);
3670 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3671 	MDI_PI_UNLOCK(pip);
3672 
3673 	MDI_CLIENT_LOCK(ct);
3674 	if (rv == MDI_SUCCESS) {
3675 		if (ct->ct_unstable == 0) {
3676 			dev_info_t	*cdip = ct->ct_dip;
3677 
3678 			/*
3679 			 * Onlining the mdi_pathinfo node will impact the
3680 			 * client state Update the client and dev_info node
3681 			 * state accordingly
3682 			 */
3683 			i_mdi_client_update_state(ct);
3684 			rv = NDI_SUCCESS;
3685 			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
3686 				if (cdip &&
3687 				    (i_ddi_node_state(cdip) >=
3688 				    DS_INITIALIZED)) {
3689 					MDI_CLIENT_UNLOCK(ct);
3690 					rv = ndi_devi_offline(cdip, 0);
3691 					MDI_CLIENT_LOCK(ct);
3692 					if (rv != NDI_SUCCESS) {
3693 						/*
3694 						 * ndi_devi_offline failed.
3695 						 * Reset client flags to
3696 						 * online.
3697 						 */
3698 						MDI_DEBUG(4, (CE_WARN, cdip,
3699 						    "!ndi_devi_offline: failed "
3700 						    " Error: %x", rv));
3701 						MDI_CLIENT_SET_ONLINE(ct);
3702 					}
3703 				}
3704 			}
3705 			/*
3706 			 * Convert to MDI error code
3707 			 */
3708 			switch (rv) {
3709 			case NDI_SUCCESS:
3710 				rv = MDI_SUCCESS;
3711 				break;
3712 			case NDI_BUSY:
3713 				rv = MDI_BUSY;
3714 				break;
3715 			default:
3716 				rv = MDI_FAILURE;
3717 				break;
3718 			}
3719 		}
3720 		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3721 		i_mdi_report_path_state(ct, pip);
3722 	}
3723 
3724 	MDI_CLIENT_UNLOCK(ct);
3725 
3726 	/*
3727 	 * Change in the mdi_pathinfo node state will impact the client state
3728 	 */
3729 	MDI_DEBUG(2, (CE_NOTE, NULL, "!i_mdi_pi_offline ct = %p pip = %p",
3730 	    (void *)ct, (void *)pip));
3731 	return (rv);
3732 }
3733 
3734 
3735 /*
3736  * mdi_pi_get_addr():
3737  *		Get the unit address associated with a mdi_pathinfo node
3738  *
3739  * Return Values:
3740  *		char *
3741  */
3742 char *
3743 mdi_pi_get_addr(mdi_pathinfo_t *pip)
3744 {
3745 	if (pip == NULL)
3746 		return (NULL);
3747 
3748 	return (MDI_PI(pip)->pi_addr);
3749 }
3750 
3751 /*
3752  * mdi_pi_get_client():
3753  *		Get the client devinfo associated with a mdi_pathinfo node
3754  *
3755  * Return Values:
3756  *		Handle to client device dev_info node
3757  */
3758 dev_info_t *
3759 mdi_pi_get_client(mdi_pathinfo_t *pip)
3760 {
3761 	dev_info_t	*dip = NULL;
3762 	if (pip) {
3763 		dip = MDI_PI(pip)->pi_client->ct_dip;
3764 	}
3765 	return (dip);
3766 }
3767 
3768 /*
3769  * mdi_pi_get_phci():
3770  *		Get the pHCI devinfo associated with the mdi_pathinfo node
3771  * Return Values:
3772  *		Handle to dev_info node
3773  */
3774 dev_info_t *
3775 mdi_pi_get_phci(mdi_pathinfo_t *pip)
3776 {
3777 	dev_info_t	*dip = NULL;
3778 	if (pip) {
3779 		dip = MDI_PI(pip)->pi_phci->ph_dip;
3780 	}
3781 	return (dip);
3782 }
3783 
3784 /*
3785  * mdi_pi_get_client_private():
3786  *		Get the client private information associated with the
3787  *		mdi_pathinfo node
3788  */
3789 void *
3790 mdi_pi_get_client_private(mdi_pathinfo_t *pip)
3791 {
3792 	void *cprivate = NULL;
3793 	if (pip) {
3794 		cprivate = MDI_PI(pip)->pi_cprivate;
3795 	}
3796 	return (cprivate);
3797 }
3798 
3799 /*
3800  * mdi_pi_set_client_private():
3801  *		Set the client private information in the mdi_pathinfo node
3802  */
3803 void
3804 mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
3805 {
3806 	if (pip) {
3807 		MDI_PI(pip)->pi_cprivate = priv;
3808 	}
3809 }
3810 
3811 /*
3812  * mdi_pi_get_phci_private():
3813  *		Get the pHCI private information associated with the
3814  *		mdi_pathinfo node
3815  */
3816 caddr_t
3817 mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
3818 {
3819 	caddr_t	pprivate = NULL;
3820 	if (pip) {
3821 		pprivate = MDI_PI(pip)->pi_pprivate;
3822 	}
3823 	return (pprivate);
3824 }
3825 
3826 /*
3827  * mdi_pi_set_phci_private():
3828  *		Set the pHCI private information in the mdi_pathinfo node
3829  */
3830 void
3831 mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
3832 {
3833 	if (pip) {
3834 		MDI_PI(pip)->pi_pprivate = priv;
3835 	}
3836 }
3837 
3838 /*
3839  * mdi_pi_get_state():
3840  *		Get the mdi_pathinfo node state. Transient states are internal
3841  *		and not provided to the users
3842  */
3843 mdi_pathinfo_state_t
3844 mdi_pi_get_state(mdi_pathinfo_t *pip)
3845 {
3846 	mdi_pathinfo_state_t    state = MDI_PATHINFO_STATE_INIT;
3847 
3848 	if (pip) {
3849 		if (MDI_PI_IS_TRANSIENT(pip)) {
3850 			/*
3851 			 * mdi_pathinfo is in state transition.  Return the
3852 			 * last good state.
3853 			 */
3854 			state = MDI_PI_OLD_STATE(pip);
3855 		} else {
3856 			state = MDI_PI_STATE(pip);
3857 		}
3858 	}
3859 	return (state);
3860 }
3861 
3862 /*
3863  * Note that the following function needs to be the new interface for
3864  * mdi_pi_get_state when mpxio gets integrated to ON.
3865  */
3866 int
3867 mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
3868 		uint32_t *ext_state)
3869 {
3870 	*state = MDI_PATHINFO_STATE_INIT;
3871 
3872 	if (pip) {
3873 		if (MDI_PI_IS_TRANSIENT(pip)) {
3874 			/*
3875 			 * mdi_pathinfo is in state transition.  Return the
3876 			 * last good state.
3877 			 */
3878 			*state = MDI_PI_OLD_STATE(pip);
3879 			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
3880 		} else {
3881 			*state = MDI_PI_STATE(pip);
3882 			*ext_state = MDI_PI_EXT_STATE(pip);
3883 		}
3884 	}
3885 	return (MDI_SUCCESS);
3886 }
3887 
3888 /*
3889  * mdi_pi_get_preferred:
3890  *	Get the preferred path flag
3891  */
3892 int
3893 mdi_pi_get_preferred(mdi_pathinfo_t *pip)
3894 {
3895 	if (pip) {
3896 		return (MDI_PI(pip)->pi_preferred);
3897 	}
3898 	return (0);
3899 }
3900 
3901 /*
3902  * mdi_pi_set_preferred:
3903  *	Set the preferred path flag
3904  */
3905 void
3906 mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
3907 {
3908 	if (pip) {
3909 		MDI_PI(pip)->pi_preferred = preferred;
3910 	}
3911 }
3912 
3913 /*
3914  * mdi_pi_set_state():
3915  *		Set the mdi_pathinfo node state
3916  */
3917 void
3918 mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
3919 {
3920 	uint32_t	ext_state;
3921 
3922 	if (pip) {
3923 		ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
3924 		MDI_PI(pip)->pi_state = state;
3925 		MDI_PI(pip)->pi_state |= ext_state;
3926 	}
3927 }
3928 
3929 /*
3930  * Property functions:
3931  */
3932 int
3933 i_map_nvlist_error_to_mdi(int val)
3934 {
3935 	int rv;
3936 
3937 	switch (val) {
3938 	case 0:
3939 		rv = DDI_PROP_SUCCESS;
3940 		break;
3941 	case EINVAL:
3942 	case ENOTSUP:
3943 		rv = DDI_PROP_INVAL_ARG;
3944 		break;
3945 	case ENOMEM:
3946 		rv = DDI_PROP_NO_MEMORY;
3947 		break;
3948 	default:
3949 		rv = DDI_PROP_NOT_FOUND;
3950 		break;
3951 	}
3952 	return (rv);
3953 }
3954 
3955 /*
3956  * mdi_pi_get_next_prop():
3957  * 		Property walk function.  The caller should hold mdi_pi_lock()
3958  *		and release by calling mdi_pi_unlock() at the end of walk to
3959  *		get a consistent value.
3960  */
3961 nvpair_t *
3962 mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
3963 {
3964 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
3965 		return (NULL);
3966 	}
3967 	ASSERT(MDI_PI_LOCKED(pip));
3968 	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
3969 }
3970 
3971 /*
3972  * mdi_prop_remove():
3973  * 		Remove the named property from the named list.
3974  */
3975 int
3976 mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
3977 {
3978 	if (pip == NULL) {
3979 		return (DDI_PROP_NOT_FOUND);
3980 	}
3981 	ASSERT(!MDI_PI_LOCKED(pip));
3982 	MDI_PI_LOCK(pip);
3983 	if (MDI_PI(pip)->pi_prop == NULL) {
3984 		MDI_PI_UNLOCK(pip);
3985 		return (DDI_PROP_NOT_FOUND);
3986 	}
3987 	if (name) {
3988 		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
3989 	} else {
3990 		char		nvp_name[MAXNAMELEN];
3991 		nvpair_t	*nvp;
3992 		nvp = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
3993 		while (nvp) {
3994 			nvpair_t	*next;
3995 			next = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, nvp);
3996 			(void) snprintf(nvp_name, MAXNAMELEN, "%s",
3997 			    nvpair_name(nvp));
3998 			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
3999 			    nvp_name);
4000 			nvp = next;
4001 		}
4002 	}
4003 	MDI_PI_UNLOCK(pip);
4004 	return (DDI_PROP_SUCCESS);
4005 }
4006 
4007 /*
4008  * mdi_prop_size():
4009  * 		Get buffer size needed to pack the property data.
4010  * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
4011  *		buffer size.
4012  */
4013 int
4014 mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
4015 {
4016 	int	rv;
4017 	size_t	bufsize;
4018 
4019 	*buflenp = 0;
4020 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4021 		return (DDI_PROP_NOT_FOUND);
4022 	}
4023 	ASSERT(MDI_PI_LOCKED(pip));
4024 	rv = nvlist_size(MDI_PI(pip)->pi_prop,
4025 	    &bufsize, NV_ENCODE_NATIVE);
4026 	*buflenp = bufsize;
4027 	return (i_map_nvlist_error_to_mdi(rv));
4028 }
4029 
4030 /*
4031  * mdi_prop_pack():
4032  * 		pack the property list.  The caller should hold the
4033  *		mdi_pathinfo_t node to get a consistent data
4034  */
4035 int
4036 mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
4037 {
4038 	int	rv;
4039 	size_t	bufsize;
4040 
4041 	if ((pip == NULL) || MDI_PI(pip)->pi_prop == NULL) {
4042 		return (DDI_PROP_NOT_FOUND);
4043 	}
4044 
4045 	ASSERT(MDI_PI_LOCKED(pip));
4046 
4047 	bufsize = buflen;
4048 	rv = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, (size_t *)&bufsize,
4049 	    NV_ENCODE_NATIVE, KM_SLEEP);
4050 
4051 	return (i_map_nvlist_error_to_mdi(rv));
4052 }
4053 
4054 /*
4055  * mdi_prop_update_byte():
4056  *		Create/Update a byte property
4057  */
4058 int
4059 mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
4060 {
4061 	int rv;
4062 
4063 	if (pip == NULL) {
4064 		return (DDI_PROP_INVAL_ARG);
4065 	}
4066 	ASSERT(!MDI_PI_LOCKED(pip));
4067 	MDI_PI_LOCK(pip);
4068 	if (MDI_PI(pip)->pi_prop == NULL) {
4069 		MDI_PI_UNLOCK(pip);
4070 		return (DDI_PROP_NOT_FOUND);
4071 	}
4072 	rv = nvlist_add_byte(MDI_PI(pip)->pi_prop, name, data);
4073 	MDI_PI_UNLOCK(pip);
4074 	return (i_map_nvlist_error_to_mdi(rv));
4075 }
4076 
4077 /*
4078  * mdi_prop_update_byte_array():
4079  *		Create/Update a byte array property
4080  */
4081 int
4082 mdi_prop_update_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t *data,
4083     uint_t nelements)
4084 {
4085 	int rv;
4086 
4087 	if (pip == NULL) {
4088 		return (DDI_PROP_INVAL_ARG);
4089 	}
4090 	ASSERT(!MDI_PI_LOCKED(pip));
4091 	MDI_PI_LOCK(pip);
4092 	if (MDI_PI(pip)->pi_prop == NULL) {
4093 		MDI_PI_UNLOCK(pip);
4094 		return (DDI_PROP_NOT_FOUND);
4095 	}
4096 	rv = nvlist_add_byte_array(MDI_PI(pip)->pi_prop, name, data, nelements);
4097 	MDI_PI_UNLOCK(pip);
4098 	return (i_map_nvlist_error_to_mdi(rv));
4099 }
4100 
4101 /*
4102  * mdi_prop_update_int():
4103  *		Create/Update a 32 bit integer property
4104  */
4105 int
4106 mdi_prop_update_int(mdi_pathinfo_t *pip, char *name, int data)
4107 {
4108 	int rv;
4109 
4110 	if (pip == NULL) {
4111 		return (DDI_PROP_INVAL_ARG);
4112 	}
4113 	ASSERT(!MDI_PI_LOCKED(pip));
4114 	MDI_PI_LOCK(pip);
4115 	if (MDI_PI(pip)->pi_prop == NULL) {
4116 		MDI_PI_UNLOCK(pip);
4117 		return (DDI_PROP_NOT_FOUND);
4118 	}
4119 	rv = nvlist_add_int32(MDI_PI(pip)->pi_prop, name, (int32_t)data);
4120 	MDI_PI_UNLOCK(pip);
4121 	return (i_map_nvlist_error_to_mdi(rv));
4122 }
4123 
4124 /*
4125  * mdi_prop_update_int64():
4126  *		Create/Update a 64 bit integer property
4127  */
4128 int
4129 mdi_prop_update_int64(mdi_pathinfo_t *pip, char *name, int64_t data)
4130 {
4131 	int rv;
4132 
4133 	if (pip == NULL) {
4134 		return (DDI_PROP_INVAL_ARG);
4135 	}
4136 	ASSERT(!MDI_PI_LOCKED(pip));
4137 	MDI_PI_LOCK(pip);
4138 	if (MDI_PI(pip)->pi_prop == NULL) {
4139 		MDI_PI_UNLOCK(pip);
4140 		return (DDI_PROP_NOT_FOUND);
4141 	}
4142 	rv = nvlist_add_int64(MDI_PI(pip)->pi_prop, name, data);
4143 	MDI_PI_UNLOCK(pip);
4144 	return (i_map_nvlist_error_to_mdi(rv));
4145 }
4146 
4147 /*
4148  * mdi_prop_update_int_array():
4149  *		Create/Update a int array property
4150  */
4151 int
4152 mdi_prop_update_int_array(mdi_pathinfo_t *pip, char *name, int *data,
4153 	    uint_t nelements)
4154 {
4155 	int rv;
4156 
4157 	if (pip == NULL) {
4158 		return (DDI_PROP_INVAL_ARG);
4159 	}
4160 	ASSERT(!MDI_PI_LOCKED(pip));
4161 	MDI_PI_LOCK(pip);
4162 	if (MDI_PI(pip)->pi_prop == NULL) {
4163 		MDI_PI_UNLOCK(pip);
4164 		return (DDI_PROP_NOT_FOUND);
4165 	}
4166 	rv = nvlist_add_int32_array(MDI_PI(pip)->pi_prop, name, (int32_t *)data,
4167 	    nelements);
4168 	MDI_PI_UNLOCK(pip);
4169 	return (i_map_nvlist_error_to_mdi(rv));
4170 }
4171 
4172 /*
4173  * mdi_prop_update_string():
4174  *		Create/Update a string property
4175  */
4176 int
4177 mdi_prop_update_string(mdi_pathinfo_t *pip, char *name, char *data)
4178 {
4179 	int rv;
4180 
4181 	if (pip == NULL) {
4182 		return (DDI_PROP_INVAL_ARG);
4183 	}
4184 	ASSERT(!MDI_PI_LOCKED(pip));
4185 	MDI_PI_LOCK(pip);
4186 	if (MDI_PI(pip)->pi_prop == NULL) {
4187 		MDI_PI_UNLOCK(pip);
4188 		return (DDI_PROP_NOT_FOUND);
4189 	}
4190 	rv = nvlist_add_string(MDI_PI(pip)->pi_prop, name, data);
4191 	MDI_PI_UNLOCK(pip);
4192 	return (i_map_nvlist_error_to_mdi(rv));
4193 }
4194 
4195 /*
4196  * mdi_prop_update_string_array():
4197  *		Create/Update a string array property
4198  */
4199 int
4200 mdi_prop_update_string_array(mdi_pathinfo_t *pip, char *name, char **data,
4201     uint_t nelements)
4202 {
4203 	int rv;
4204 
4205 	if (pip == NULL) {
4206 		return (DDI_PROP_INVAL_ARG);
4207 	}
4208 	ASSERT(!MDI_PI_LOCKED(pip));
4209 	MDI_PI_LOCK(pip);
4210 	if (MDI_PI(pip)->pi_prop == NULL) {
4211 		MDI_PI_UNLOCK(pip);
4212 		return (DDI_PROP_NOT_FOUND);
4213 	}
4214 	rv = nvlist_add_string_array(MDI_PI(pip)->pi_prop, name, data,
4215 	    nelements);
4216 	MDI_PI_UNLOCK(pip);
4217 	return (i_map_nvlist_error_to_mdi(rv));
4218 }
4219 
4220 /*
4221  * mdi_prop_lookup_byte():
4222  * 		Look for byte property identified by name.  The data returned
4223  *		is the actual property and valid as long as mdi_pathinfo_t node
4224  *		is alive.
4225  */
4226 int
4227 mdi_prop_lookup_byte(mdi_pathinfo_t *pip, char *name, uchar_t *data)
4228 {
4229 	int rv;
4230 
4231 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4232 		return (DDI_PROP_NOT_FOUND);
4233 	}
4234 	rv = nvlist_lookup_byte(MDI_PI(pip)->pi_prop, name, data);
4235 	return (i_map_nvlist_error_to_mdi(rv));
4236 }
4237 
4238 
4239 /*
4240  * mdi_prop_lookup_byte_array():
4241  * 		Look for byte array property identified by name.  The data
4242  *		returned is the actual property and valid as long as
4243  *		mdi_pathinfo_t node is alive.
4244  */
4245 int
4246 mdi_prop_lookup_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t **data,
4247     uint_t *nelements)
4248 {
4249 	int rv;
4250 
4251 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4252 		return (DDI_PROP_NOT_FOUND);
4253 	}
4254 	rv = nvlist_lookup_byte_array(MDI_PI(pip)->pi_prop, name, data,
4255 	    nelements);
4256 	return (i_map_nvlist_error_to_mdi(rv));
4257 }
4258 
4259 /*
4260  * mdi_prop_lookup_int():
4261  * 		Look for int property identified by name.  The data returned
4262  *		is the actual property and valid as long as mdi_pathinfo_t
4263  *		node is alive.
4264  */
4265 int
4266 mdi_prop_lookup_int(mdi_pathinfo_t *pip, char *name, int *data)
4267 {
4268 	int rv;
4269 
4270 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4271 		return (DDI_PROP_NOT_FOUND);
4272 	}
4273 	rv = nvlist_lookup_int32(MDI_PI(pip)->pi_prop, name, (int32_t *)data);
4274 	return (i_map_nvlist_error_to_mdi(rv));
4275 }
4276 
4277 /*
4278  * mdi_prop_lookup_int64():
4279  * 		Look for int64 property identified by name.  The data returned
4280  *		is the actual property and valid as long as mdi_pathinfo_t node
4281  *		is alive.
4282  */
4283 int
4284 mdi_prop_lookup_int64(mdi_pathinfo_t *pip, char *name, int64_t *data)
4285 {
4286 	int rv;
4287 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4288 		return (DDI_PROP_NOT_FOUND);
4289 	}
4290 	rv = nvlist_lookup_int64(MDI_PI(pip)->pi_prop, name, data);
4291 	return (i_map_nvlist_error_to_mdi(rv));
4292 }
4293 
4294 /*
4295  * mdi_prop_lookup_int_array():
4296  * 		Look for int array property identified by name.  The data
4297  *		returned is the actual property and valid as long as
4298  *		mdi_pathinfo_t node is alive.
4299  */
4300 int
4301 mdi_prop_lookup_int_array(mdi_pathinfo_t *pip, char *name, int **data,
4302     uint_t *nelements)
4303 {
4304 	int rv;
4305 
4306 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4307 		return (DDI_PROP_NOT_FOUND);
4308 	}
4309 	rv = nvlist_lookup_int32_array(MDI_PI(pip)->pi_prop, name,
4310 	    (int32_t **)data, nelements);
4311 	return (i_map_nvlist_error_to_mdi(rv));
4312 }
4313 
4314 /*
4315  * mdi_prop_lookup_string():
4316  * 		Look for string property identified by name.  The data
4317  *		returned is the actual property and valid as long as
4318  *		mdi_pathinfo_t node is alive.
4319  */
4320 int
4321 mdi_prop_lookup_string(mdi_pathinfo_t *pip, char *name, char **data)
4322 {
4323 	int rv;
4324 
4325 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4326 		return (DDI_PROP_NOT_FOUND);
4327 	}
4328 	rv = nvlist_lookup_string(MDI_PI(pip)->pi_prop, name, data);
4329 	return (i_map_nvlist_error_to_mdi(rv));
4330 }
4331 
4332 /*
4333  * mdi_prop_lookup_string_array():
4334  * 		Look for string array property identified by name.  The data
4335  *		returned is the actual property and valid as long as
4336  *		mdi_pathinfo_t node is alive.
4337  */
4338 int
4339 mdi_prop_lookup_string_array(mdi_pathinfo_t *pip, char *name, char ***data,
4340     uint_t *nelements)
4341 {
4342 	int rv;
4343 
4344 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4345 		return (DDI_PROP_NOT_FOUND);
4346 	}
4347 	rv = nvlist_lookup_string_array(MDI_PI(pip)->pi_prop, name, data,
4348 	    nelements);
4349 	return (i_map_nvlist_error_to_mdi(rv));
4350 }
4351 
4352 /*
4353  * mdi_prop_free():
4354  * 		Symmetrical function to ddi_prop_free(). nvlist_lookup_xx()
4355  *		functions return the pointer to actual property data and not a
4356  *		copy of it.  So the data returned is valid as long as
4357  *		mdi_pathinfo_t node is valid.
4358  */
4359 /*ARGSUSED*/
4360 int
4361 mdi_prop_free(void *data)
4362 {
4363 	return (DDI_PROP_SUCCESS);
4364 }
4365 
4366 /*ARGSUSED*/
4367 static void
4368 i_mdi_report_path_state(mdi_client_t *ct, mdi_pathinfo_t *pip)
4369 {
4370 	char		*phci_path, *ct_path;
4371 	char		*ct_status;
4372 	char		*status;
4373 	dev_info_t	*dip = ct->ct_dip;
4374 	char		lb_buf[64];
4375 
4376 	ASSERT(MDI_CLIENT_LOCKED(ct));
4377 	if ((dip == NULL) || (ddi_get_instance(dip) == -1) ||
4378 	    (MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) == 0)) {
4379 		return;
4380 	}
4381 	if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL) {
4382 		ct_status = "optimal";
4383 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
4384 		ct_status = "degraded";
4385 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
4386 		ct_status = "failed";
4387 	} else {
4388 		ct_status = "unknown";
4389 	}
4390 
4391 	if (MDI_PI_IS_OFFLINE(pip)) {
4392 		status = "offline";
4393 	} else if (MDI_PI_IS_ONLINE(pip)) {
4394 		status = "online";
4395 	} else if (MDI_PI_IS_STANDBY(pip)) {
4396 		status = "standby";
4397 	} else if (MDI_PI_IS_FAULT(pip)) {
4398 		status = "faulted";
4399 	} else {
4400 		status = "unknown";
4401 	}
4402 
4403 	if (ct->ct_lb == LOAD_BALANCE_LBA) {
4404 		(void) snprintf(lb_buf, sizeof (lb_buf),
4405 		    "%s, region-size: %d", mdi_load_balance_lba,
4406 			ct->ct_lb_args->region_size);
4407 	} else if (ct->ct_lb == LOAD_BALANCE_NONE) {
4408 		(void) snprintf(lb_buf, sizeof (lb_buf),
4409 		    "%s", mdi_load_balance_none);
4410 	} else {
4411 		(void) snprintf(lb_buf, sizeof (lb_buf), "%s",
4412 		    mdi_load_balance_rr);
4413 	}
4414 
4415 	if (dip) {
4416 		ct_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4417 		phci_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4418 		cmn_err(CE_CONT, "?%s (%s%d) multipath status: %s, "
4419 		    "path %s (%s%d) to target address: %s is %s"
4420 		    " Load balancing: %s\n",
4421 		    ddi_pathname(dip, ct_path), ddi_driver_name(dip),
4422 		    ddi_get_instance(dip), ct_status,
4423 		    ddi_pathname(MDI_PI(pip)->pi_phci->ph_dip, phci_path),
4424 		    ddi_driver_name(MDI_PI(pip)->pi_phci->ph_dip),
4425 		    ddi_get_instance(MDI_PI(pip)->pi_phci->ph_dip),
4426 		    MDI_PI(pip)->pi_addr, status, lb_buf);
4427 		kmem_free(phci_path, MAXPATHLEN);
4428 		kmem_free(ct_path, MAXPATHLEN);
4429 		MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct);
4430 	}
4431 }
4432 
#ifdef	DEBUG
/*
 * i_mdi_log():
 *		Debug message helper (DEBUG kernels only).
 *
 *		Formats the caller's message, prefixes it with "mdi: " and,
 *		when a dev_info node is supplied, with "<nodename><instance>: ".
 *		A leading '!', '?' or '^' in the formatted message is stripped
 *		and re-applied in front of the "mdi: " prefix handed to
 *		cmn_err().  When mdi_debug_logonly is set the '!' form is used
 *		regardless of what the message asked for.  CE_NOTE is issued
 *		as CE_CONT.  Levels other than CE_NOTE, CE_CONT, CE_WARN and
 *		CE_PANIC are passed to cmn_err() without any such prefix
 *		character.
 */
/*PRINTFLIKE3*/
static void
i_mdi_log(int level, dev_info_t *dip, const char *fmt, ...)
{
	char		name[MAXNAMELEN];
	char		buf[MAXNAMELEN];
	char		*msg;
	char		dest;	/* '!', '?', '^' or 0 for none */
	va_list		ap;

	/* Device identification, if we were given a node. */
	if (dip != NULL) {
		(void) snprintf(name, sizeof (name), "%s%d: ",
		    ddi_node_name(dip), ddi_get_instance(dip));
	} else {
		name[0] = 0;
	}

	va_start(ap, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
	va_end(ap);

	/* Peel off a destination character if the message carries one. */
	switch (buf[0]) {
	case '!':
	case '?':
	case '^':
		dest = buf[0];
		msg = &buf[1];
		break;
	default:
		dest = 0;
		msg = buf;
		break;
	}

	/* The tunable overrides whatever the message requested. */
	if (mdi_debug_logonly)
		dest = '!';

	if (level == CE_NOTE)
		level = CE_CONT;

	if ((level != CE_CONT) && (level != CE_WARN) && (level != CE_PANIC)) {
		cmn_err(level, "mdi: %s%s", name, msg);
		return;
	}

	/*
	 * The destination character has to be part of the literal format
	 * string, hence one call per variant.
	 */
	switch (dest) {
	case '?':
		cmn_err(level, "?mdi: %s%s", name, msg);
		break;
	case '^':
		cmn_err(level, "^mdi: %s%s", name, msg);
		break;
	case '!':
		cmn_err(level, "!mdi: %s%s", name, msg);
		break;
	default:
		cmn_err(level, "mdi: %s%s", name, msg);
		break;
	}
}
#endif	/* DEBUG */
4508 
4509 void
4510 i_mdi_client_online(dev_info_t *ct_dip)
4511 {
4512 	mdi_client_t	*ct;
4513 
4514 	/*
4515 	 * Client online notification. Mark client state as online
4516 	 * restore our binding with dev_info node
4517 	 */
4518 	ct = i_devi_get_client(ct_dip);
4519 	ASSERT(ct != NULL);
4520 	MDI_CLIENT_LOCK(ct);
4521 	MDI_CLIENT_SET_ONLINE(ct);
4522 	/* catch for any memory leaks */
4523 	ASSERT((ct->ct_dip == NULL) || (ct->ct_dip == ct_dip));
4524 	ct->ct_dip = ct_dip;
4525 
4526 	if (ct->ct_power_cnt == 0)
4527 		(void) i_mdi_power_all_phci(ct);
4528 
4529 	MDI_DEBUG(4, (CE_NOTE, ct_dip, "i_mdi_client_online "
4530 	    "i_mdi_pm_hold_client %p\n", (void *)ct));
4531 	i_mdi_pm_hold_client(ct, 1);
4532 
4533 	MDI_CLIENT_UNLOCK(ct);
4534 }
4535 
4536 void
4537 i_mdi_phci_online(dev_info_t *ph_dip)
4538 {
4539 	mdi_phci_t	*ph;
4540 
4541 	/* pHCI online notification. Mark state accordingly */
4542 	ph = i_devi_get_phci(ph_dip);
4543 	ASSERT(ph != NULL);
4544 	MDI_PHCI_LOCK(ph);
4545 	MDI_PHCI_SET_ONLINE(ph);
4546 	MDI_PHCI_UNLOCK(ph);
4547 }
4548 
4549 /*
4550  * mdi_devi_online():
4551  * 		Online notification from NDI framework on pHCI/client
4552  *		device online.
4553  * Return Values:
4554  *		NDI_SUCCESS
4555  *		MDI_FAILURE
4556  */
4557 /*ARGSUSED*/
4558 int
4559 mdi_devi_online(dev_info_t *dip, uint_t flags)
4560 {
4561 	if (MDI_PHCI(dip)) {
4562 		i_mdi_phci_online(dip);
4563 	}
4564 
4565 	if (MDI_CLIENT(dip)) {
4566 		i_mdi_client_online(dip);
4567 	}
4568 	return (NDI_SUCCESS);
4569 }
4570 
4571 /*
4572  * mdi_devi_offline():
4573  * 		Offline notification from NDI framework on pHCI/Client device
4574  *		offline.
4575  *
4576  * Return Values:
4577  *		NDI_SUCCESS
4578  *		NDI_FAILURE
4579  */
4580 /*ARGSUSED*/
4581 int
4582 mdi_devi_offline(dev_info_t *dip, uint_t flags)
4583 {
4584 	int		rv = NDI_SUCCESS;
4585 
4586 	if (MDI_CLIENT(dip)) {
4587 		rv = i_mdi_client_offline(dip, flags);
4588 		if (rv != NDI_SUCCESS)
4589 			return (rv);
4590 	}
4591 
4592 	if (MDI_PHCI(dip)) {
4593 		rv = i_mdi_phci_offline(dip, flags);
4594 
4595 		if ((rv != NDI_SUCCESS) && MDI_CLIENT(dip)) {
4596 			/* set client back online */
4597 			i_mdi_client_online(dip);
4598 		}
4599 	}
4600 
4601 	return (rv);
4602 }
4603 
4604 /*ARGSUSED*/
4605 static int
4606 i_mdi_phci_offline(dev_info_t *dip, uint_t flags)
4607 {
4608 	int		rv = NDI_SUCCESS;
4609 	mdi_phci_t	*ph;
4610 	mdi_client_t	*ct;
4611 	mdi_pathinfo_t	*pip;
4612 	mdi_pathinfo_t	*next;
4613 	mdi_pathinfo_t	*failed_pip = NULL;
4614 	dev_info_t	*cdip;
4615 
4616 	/*
4617 	 * pHCI component offline notification
4618 	 * Make sure that this pHCI instance is free to be offlined.
4619 	 * If it is OK to proceed, Offline and remove all the child
4620 	 * mdi_pathinfo nodes.  This process automatically offlines
4621 	 * corresponding client devices, for which this pHCI provides
4622 	 * critical services.
4623 	 */
4624 	ph = i_devi_get_phci(dip);
4625 	MDI_DEBUG(2, (CE_NOTE, dip, "!mdi_phci_offline called %p %p\n",
4626 	    (void *)dip, (void *)ph));
4627 	if (ph == NULL) {
4628 		return (rv);
4629 	}
4630 
4631 	MDI_PHCI_LOCK(ph);
4632 
4633 	if (MDI_PHCI_IS_OFFLINE(ph)) {
4634 		MDI_DEBUG(1, (CE_WARN, dip, "!pHCI %p already offlined",
4635 		    (void *)ph));
4636 		MDI_PHCI_UNLOCK(ph);
4637 		return (NDI_SUCCESS);
4638 	}
4639 
4640 	/*
4641 	 * Check to see if the pHCI can be offlined
4642 	 */
4643 	if (ph->ph_unstable) {
4644 		MDI_DEBUG(1, (CE_WARN, dip,
4645 		    "!One or more target devices are in transient "
4646 		    "state. This device can not be removed at "
4647 		    "this moment. Please try again later."));
4648 		MDI_PHCI_UNLOCK(ph);
4649 		return (NDI_BUSY);
4650 	}
4651 
4652 	pip = ph->ph_path_head;
4653 	while (pip != NULL) {
4654 		MDI_PI_LOCK(pip);
4655 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4656 
4657 		/*
4658 		 * The mdi_pathinfo state is OK. Check the client state.
4659 		 * If failover in progress fail the pHCI from offlining
4660 		 */
4661 		ct = MDI_PI(pip)->pi_client;
4662 		i_mdi_client_lock(ct, pip);
4663 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
4664 		    (ct->ct_unstable)) {
4665 			/*
4666 			 * Failover is in progress, Fail the DR
4667 			 */
4668 			MDI_DEBUG(1, (CE_WARN, dip,
4669 			    "!pHCI device (%s%d) is Busy. %s",
4670 			    ddi_driver_name(dip), ddi_get_instance(dip),
4671 			    "This device can not be removed at "
4672 			    "this moment. Please try again later."));
4673 			MDI_PI_UNLOCK(pip);
4674 			i_mdi_client_unlock(ct);
4675 			MDI_PHCI_UNLOCK(ph);
4676 			return (NDI_BUSY);
4677 		}
4678 		MDI_PI_UNLOCK(pip);
4679 
4680 		/*
4681 		 * Check to see of we are removing the last path of this
4682 		 * client device...
4683 		 */
4684 		cdip = ct->ct_dip;
4685 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
4686 		    (i_mdi_client_compute_state(ct, ph) ==
4687 		    MDI_CLIENT_STATE_FAILED)) {
4688 			i_mdi_client_unlock(ct);
4689 			MDI_PHCI_UNLOCK(ph);
4690 			if (ndi_devi_offline(cdip, 0) != NDI_SUCCESS) {
4691 				/*
4692 				 * ndi_devi_offline() failed.
4693 				 * This pHCI provides the critical path
4694 				 * to one or more client devices.
4695 				 * Return busy.
4696 				 */
4697 				MDI_PHCI_LOCK(ph);
4698 				MDI_DEBUG(1, (CE_WARN, dip,
4699 				    "!pHCI device (%s%d) is Busy. %s",
4700 				    ddi_driver_name(dip), ddi_get_instance(dip),
4701 				    "This device can not be removed at "
4702 				    "this moment. Please try again later."));
4703 				failed_pip = pip;
4704 				break;
4705 			} else {
4706 				MDI_PHCI_LOCK(ph);
4707 				pip = next;
4708 			}
4709 		} else {
4710 			i_mdi_client_unlock(ct);
4711 			pip = next;
4712 		}
4713 	}
4714 
4715 	if (failed_pip) {
4716 		pip = ph->ph_path_head;
4717 		while (pip != failed_pip) {
4718 			MDI_PI_LOCK(pip);
4719 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4720 			ct = MDI_PI(pip)->pi_client;
4721 			i_mdi_client_lock(ct, pip);
4722 			cdip = ct->ct_dip;
4723 			switch (MDI_CLIENT_STATE(ct)) {
4724 			case MDI_CLIENT_STATE_OPTIMAL:
4725 			case MDI_CLIENT_STATE_DEGRADED:
4726 				if (cdip) {
4727 					MDI_PI_UNLOCK(pip);
4728 					i_mdi_client_unlock(ct);
4729 					MDI_PHCI_UNLOCK(ph);
4730 					(void) ndi_devi_online(cdip, 0);
4731 					MDI_PHCI_LOCK(ph);
4732 					pip = next;
4733 					continue;
4734 				}
4735 				break;
4736 
4737 			case MDI_CLIENT_STATE_FAILED:
4738 				if (cdip) {
4739 					MDI_PI_UNLOCK(pip);
4740 					i_mdi_client_unlock(ct);
4741 					MDI_PHCI_UNLOCK(ph);
4742 					(void) ndi_devi_offline(cdip, 0);
4743 					MDI_PHCI_LOCK(ph);
4744 					pip = next;
4745 					continue;
4746 				}
4747 				break;
4748 			}
4749 			MDI_PI_UNLOCK(pip);
4750 			i_mdi_client_unlock(ct);
4751 			pip = next;
4752 		}
4753 		MDI_PHCI_UNLOCK(ph);
4754 		return (NDI_BUSY);
4755 	}
4756 
4757 	/*
4758 	 * Mark the pHCI as offline
4759 	 */
4760 	MDI_PHCI_SET_OFFLINE(ph);
4761 
4762 	/*
4763 	 * Mark the child mdi_pathinfo nodes as transient
4764 	 */
4765 	pip = ph->ph_path_head;
4766 	while (pip != NULL) {
4767 		MDI_PI_LOCK(pip);
4768 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4769 		MDI_PI_SET_OFFLINING(pip);
4770 		MDI_PI_UNLOCK(pip);
4771 		pip = next;
4772 	}
4773 	MDI_PHCI_UNLOCK(ph);
4774 	/*
4775 	 * Give a chance for any pending commands to execute
4776 	 */
4777 	delay(1);
4778 	MDI_PHCI_LOCK(ph);
4779 	pip = ph->ph_path_head;
4780 	while (pip != NULL) {
4781 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
4782 		(void) i_mdi_pi_offline(pip, flags);
4783 		MDI_PI_LOCK(pip);
4784 		ct = MDI_PI(pip)->pi_client;
4785 		if (!MDI_PI_IS_OFFLINE(pip)) {
4786 			MDI_DEBUG(1, (CE_WARN, dip,
4787 			    "!pHCI device (%s%d) is Busy. %s",
4788 			    ddi_driver_name(dip), ddi_get_instance(dip),
4789 			    "This device can not be removed at "
4790 			    "this moment. Please try again later."));
4791 			MDI_PI_UNLOCK(pip);
4792 			MDI_PHCI_SET_ONLINE(ph);
4793 			MDI_PHCI_UNLOCK(ph);
4794 			return (NDI_BUSY);
4795 		}
4796 		MDI_PI_UNLOCK(pip);
4797 		pip = next;
4798 	}
4799 	MDI_PHCI_UNLOCK(ph);
4800 
4801 	return (rv);
4802 }
4803 
4804 /*ARGSUSED*/
4805 static int
4806 i_mdi_client_offline(dev_info_t *dip, uint_t flags)
4807 {
4808 	int		rv = NDI_SUCCESS;
4809 	mdi_client_t	*ct;
4810 
4811 	/*
4812 	 * Client component to go offline.  Make sure that we are
4813 	 * not in failing over state and update client state
4814 	 * accordingly
4815 	 */
4816 	ct = i_devi_get_client(dip);
4817 	MDI_DEBUG(2, (CE_NOTE, dip, "!i_mdi_client_offline called %p %p\n",
4818 	    (void *)dip, (void *)ct));
4819 	if (ct != NULL) {
4820 		MDI_CLIENT_LOCK(ct);
4821 		if (ct->ct_unstable) {
4822 			/*
4823 			 * One or more paths are in transient state,
4824 			 * Dont allow offline of a client device
4825 			 */
4826 			MDI_DEBUG(1, (CE_WARN, dip,
4827 			    "!One or more paths to this device is "
4828 			    "in transient state. This device can not "
4829 			    "be removed at this moment. "
4830 			    "Please try again later."));
4831 			MDI_CLIENT_UNLOCK(ct);
4832 			return (NDI_BUSY);
4833 		}
4834 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
4835 			/*
4836 			 * Failover is in progress, Dont allow DR of
4837 			 * a client device
4838 			 */
4839 			MDI_DEBUG(1, (CE_WARN, dip,
4840 			    "!Client device (%s%d) is Busy. %s",
4841 			    ddi_driver_name(dip), ddi_get_instance(dip),
4842 			    "This device can not be removed at "
4843 			    "this moment. Please try again later."));
4844 			MDI_CLIENT_UNLOCK(ct);
4845 			return (NDI_BUSY);
4846 		}
4847 		MDI_CLIENT_SET_OFFLINE(ct);
4848 
4849 		/*
4850 		 * Unbind our relationship with the dev_info node
4851 		 */
4852 		if (flags & NDI_DEVI_REMOVE) {
4853 			ct->ct_dip = NULL;
4854 		}
4855 		MDI_CLIENT_UNLOCK(ct);
4856 	}
4857 	return (rv);
4858 }
4859 
4860 /*
4861  * mdi_pre_attach():
4862  *		Pre attach() notification handler
4863  */
4864 /*ARGSUSED*/
4865 int
4866 mdi_pre_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
4867 {
4868 	/* don't support old DDI_PM_RESUME */
4869 	if ((DEVI(dip)->devi_mdi_component != MDI_COMPONENT_NONE) &&
4870 	    (cmd == DDI_PM_RESUME))
4871 		return (DDI_FAILURE);
4872 
4873 	return (DDI_SUCCESS);
4874 }
4875 
4876 /*
4877  * mdi_post_attach():
4878  *		Post attach() notification handler
4879  */
4880 /*ARGSUSED*/
4881 void
4882 mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
4883 {
4884 	mdi_phci_t	*ph;
4885 	mdi_client_t	*ct;
4886 	mdi_pathinfo_t	*pip;
4887 
4888 	if (MDI_PHCI(dip)) {
4889 		ph = i_devi_get_phci(dip);
4890 		ASSERT(ph != NULL);
4891 
4892 		MDI_PHCI_LOCK(ph);
4893 		switch (cmd) {
4894 		case DDI_ATTACH:
4895 			MDI_DEBUG(2, (CE_NOTE, dip,
4896 			    "!pHCI post_attach: called %p\n", (void *)ph));
4897 			if (error == DDI_SUCCESS) {
4898 				MDI_PHCI_SET_ATTACH(ph);
4899 			} else {
4900 				MDI_DEBUG(1, (CE_NOTE, dip,
4901 				    "!pHCI post_attach: failed error=%d\n",
4902 				    error));
4903 				MDI_PHCI_SET_DETACH(ph);
4904 			}
4905 			break;
4906 
4907 		case DDI_RESUME:
4908 			MDI_DEBUG(2, (CE_NOTE, dip,
4909 			    "!pHCI post_resume: called %p\n", (void *)ph));
4910 			if (error == DDI_SUCCESS) {
4911 				MDI_PHCI_SET_RESUME(ph);
4912 			} else {
4913 				MDI_DEBUG(1, (CE_NOTE, dip,
4914 				    "!pHCI post_resume: failed error=%d\n",
4915 				    error));
4916 				MDI_PHCI_SET_SUSPEND(ph);
4917 			}
4918 			break;
4919 		}
4920 		MDI_PHCI_UNLOCK(ph);
4921 	}
4922 
4923 	if (MDI_CLIENT(dip)) {
4924 		ct = i_devi_get_client(dip);
4925 		ASSERT(ct != NULL);
4926 
4927 		MDI_CLIENT_LOCK(ct);
4928 		switch (cmd) {
4929 		case DDI_ATTACH:
4930 			MDI_DEBUG(2, (CE_NOTE, dip,
4931 			    "!Client post_attach: called %p\n", (void *)ct));
4932 			if (error != DDI_SUCCESS) {
4933 				MDI_DEBUG(1, (CE_NOTE, dip,
4934 				    "!Client post_attach: failed error=%d\n",
4935 				    error));
4936 				MDI_CLIENT_SET_DETACH(ct);
4937 				MDI_DEBUG(4, (CE_WARN, dip,
4938 				    "mdi_post_attach i_mdi_pm_reset_client\n"));
4939 				i_mdi_pm_reset_client(ct);
4940 				break;
4941 			}
4942 
4943 			/*
4944 			 * Client device has successfully attached.
4945 			 * Create kstats for any pathinfo structures
4946 			 * initially associated with this client.
4947 			 */
4948 			for (pip = ct->ct_path_head; pip != NULL;
4949 			    pip = (mdi_pathinfo_t *)
4950 			    MDI_PI(pip)->pi_client_link) {
4951 				if (!MDI_PI_IS_OFFLINE(pip)) {
4952 					(void) i_mdi_pi_kstat_create(pip);
4953 					i_mdi_report_path_state(ct, pip);
4954 				}
4955 			}
4956 			MDI_CLIENT_SET_ATTACH(ct);
4957 			break;
4958 
4959 		case DDI_RESUME:
4960 			MDI_DEBUG(2, (CE_NOTE, dip,
4961 			    "!Client post_attach: called %p\n", (void *)ct));
4962 			if (error == DDI_SUCCESS) {
4963 				MDI_CLIENT_SET_RESUME(ct);
4964 			} else {
4965 				MDI_DEBUG(1, (CE_NOTE, dip,
4966 				    "!Client post_resume: failed error=%d\n",
4967 				    error));
4968 				MDI_CLIENT_SET_SUSPEND(ct);
4969 			}
4970 			break;
4971 		}
4972 		MDI_CLIENT_UNLOCK(ct);
4973 	}
4974 }
4975 
4976 /*
4977  * mdi_pre_detach():
4978  *		Pre detach notification handler
4979  */
4980 /*ARGSUSED*/
4981 int
4982 mdi_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
4983 {
4984 	int rv = DDI_SUCCESS;
4985 
4986 	if (MDI_CLIENT(dip)) {
4987 		(void) i_mdi_client_pre_detach(dip, cmd);
4988 	}
4989 
4990 	if (MDI_PHCI(dip)) {
4991 		rv = i_mdi_phci_pre_detach(dip, cmd);
4992 	}
4993 
4994 	return (rv);
4995 }
4996 
4997 /*ARGSUSED*/
4998 static int
4999 i_mdi_phci_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5000 {
5001 	int		rv = DDI_SUCCESS;
5002 	mdi_phci_t	*ph;
5003 	mdi_client_t	*ct;
5004 	mdi_pathinfo_t	*pip;
5005 	mdi_pathinfo_t	*failed_pip = NULL;
5006 	mdi_pathinfo_t	*next;
5007 
5008 	ph = i_devi_get_phci(dip);
5009 	if (ph == NULL) {
5010 		return (rv);
5011 	}
5012 
5013 	MDI_PHCI_LOCK(ph);
5014 	switch (cmd) {
5015 	case DDI_DETACH:
5016 		MDI_DEBUG(2, (CE_NOTE, dip,
5017 		    "!pHCI pre_detach: called %p\n", (void *)ph));
5018 		if (!MDI_PHCI_IS_OFFLINE(ph)) {
5019 			/*
5020 			 * mdi_pathinfo nodes are still attached to
5021 			 * this pHCI. Fail the detach for this pHCI.
5022 			 */
5023 			MDI_DEBUG(2, (CE_WARN, dip,
5024 			    "!pHCI pre_detach: "
5025 			    "mdi_pathinfo nodes are still attached "
5026 			    "%p\n", (void *)ph));
5027 			rv = DDI_FAILURE;
5028 			break;
5029 		}
5030 		MDI_PHCI_SET_DETACH(ph);
5031 		break;
5032 
5033 	case DDI_SUSPEND:
5034 		/*
5035 		 * pHCI is getting suspended.  Since mpxio client
5036 		 * devices may not be suspended at this point, to avoid
5037 		 * a potential stack overflow, it is important to suspend
5038 		 * client devices before pHCI can be suspended.
5039 		 */
5040 
5041 		MDI_DEBUG(2, (CE_NOTE, dip,
5042 		    "!pHCI pre_suspend: called %p\n", (void *)ph));
5043 		/*
5044 		 * Suspend all the client devices accessible through this pHCI
5045 		 */
5046 		pip = ph->ph_path_head;
5047 		while (pip != NULL && rv == DDI_SUCCESS) {
5048 			dev_info_t *cdip;
5049 			MDI_PI_LOCK(pip);
5050 			next =
5051 			    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5052 			ct = MDI_PI(pip)->pi_client;
5053 			i_mdi_client_lock(ct, pip);
5054 			cdip = ct->ct_dip;
5055 			MDI_PI_UNLOCK(pip);
5056 			if ((MDI_CLIENT_IS_DETACHED(ct) == 0) &&
5057 			    MDI_CLIENT_IS_SUSPENDED(ct) == 0) {
5058 				i_mdi_client_unlock(ct);
5059 				if ((rv = devi_detach(cdip, DDI_SUSPEND)) !=
5060 				    DDI_SUCCESS) {
5061 					/*
5062 					 * Suspend of one of the client
5063 					 * device has failed.
5064 					 */
5065 					MDI_DEBUG(1, (CE_WARN, dip,
5066 					    "!Suspend of device (%s%d) failed.",
5067 					    ddi_driver_name(cdip),
5068 					    ddi_get_instance(cdip)));
5069 					failed_pip = pip;
5070 					break;
5071 				}
5072 			} else {
5073 				i_mdi_client_unlock(ct);
5074 			}
5075 			pip = next;
5076 		}
5077 
5078 		if (rv == DDI_SUCCESS) {
5079 			/*
5080 			 * Suspend of client devices is complete. Proceed
5081 			 * with pHCI suspend.
5082 			 */
5083 			MDI_PHCI_SET_SUSPEND(ph);
5084 		} else {
5085 			/*
5086 			 * Revert back all the suspended client device states
5087 			 * to converse.
5088 			 */
5089 			pip = ph->ph_path_head;
5090 			while (pip != failed_pip) {
5091 				dev_info_t *cdip;
5092 				MDI_PI_LOCK(pip);
5093 				next =
5094 				    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5095 				ct = MDI_PI(pip)->pi_client;
5096 				i_mdi_client_lock(ct, pip);
5097 				cdip = ct->ct_dip;
5098 				MDI_PI_UNLOCK(pip);
5099 				if (MDI_CLIENT_IS_SUSPENDED(ct)) {
5100 					i_mdi_client_unlock(ct);
5101 					(void) devi_attach(cdip, DDI_RESUME);
5102 				} else {
5103 					i_mdi_client_unlock(ct);
5104 				}
5105 				pip = next;
5106 			}
5107 		}
5108 		break;
5109 
5110 	default:
5111 		rv = DDI_FAILURE;
5112 		break;
5113 	}
5114 	MDI_PHCI_UNLOCK(ph);
5115 	return (rv);
5116 }
5117 
5118 /*ARGSUSED*/
5119 static int
5120 i_mdi_client_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5121 {
5122 	int		rv = DDI_SUCCESS;
5123 	mdi_client_t	*ct;
5124 
5125 	ct = i_devi_get_client(dip);
5126 	if (ct == NULL) {
5127 		return (rv);
5128 	}
5129 
5130 	MDI_CLIENT_LOCK(ct);
5131 	switch (cmd) {
5132 	case DDI_DETACH:
5133 		MDI_DEBUG(2, (CE_NOTE, dip,
5134 		    "!Client pre_detach: called %p\n", (void *)ct));
5135 		MDI_CLIENT_SET_DETACH(ct);
5136 		break;
5137 
5138 	case DDI_SUSPEND:
5139 		MDI_DEBUG(2, (CE_NOTE, dip,
5140 		    "!Client pre_suspend: called %p\n", (void *)ct));
5141 		MDI_CLIENT_SET_SUSPEND(ct);
5142 		break;
5143 
5144 	default:
5145 		rv = DDI_FAILURE;
5146 		break;
5147 	}
5148 	MDI_CLIENT_UNLOCK(ct);
5149 	return (rv);
5150 }
5151 
5152 /*
5153  * mdi_post_detach():
5154  *		Post detach notification handler
5155  */
5156 /*ARGSUSED*/
5157 void
5158 mdi_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5159 {
5160 	/*
5161 	 * Detach/Suspend of mpxio component failed. Update our state
5162 	 * too
5163 	 */
5164 	if (MDI_PHCI(dip))
5165 		i_mdi_phci_post_detach(dip, cmd, error);
5166 
5167 	if (MDI_CLIENT(dip))
5168 		i_mdi_client_post_detach(dip, cmd, error);
5169 }
5170 
5171 /*ARGSUSED*/
5172 static void
5173 i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5174 {
5175 	mdi_phci_t	*ph;
5176 
5177 	/*
5178 	 * Detach/Suspend of phci component failed. Update our state
5179 	 * too
5180 	 */
5181 	ph = i_devi_get_phci(dip);
5182 	if (ph == NULL) {
5183 		return;
5184 	}
5185 
5186 	MDI_PHCI_LOCK(ph);
5187 	/*
5188 	 * Detach of pHCI failed. Restore back converse
5189 	 * state
5190 	 */
5191 	switch (cmd) {
5192 	case DDI_DETACH:
5193 		MDI_DEBUG(2, (CE_NOTE, dip,
5194 		    "!pHCI post_detach: called %p\n", (void *)ph));
5195 		if (error != DDI_SUCCESS)
5196 			MDI_PHCI_SET_ATTACH(ph);
5197 		break;
5198 
5199 	case DDI_SUSPEND:
5200 		MDI_DEBUG(2, (CE_NOTE, dip,
5201 		    "!pHCI post_suspend: called %p\n", (void *)ph));
5202 		if (error != DDI_SUCCESS)
5203 			MDI_PHCI_SET_RESUME(ph);
5204 		break;
5205 	}
5206 	MDI_PHCI_UNLOCK(ph);
5207 }
5208 
5209 /*ARGSUSED*/
5210 static void
5211 i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5212 {
5213 	mdi_client_t	*ct;
5214 
5215 	ct = i_devi_get_client(dip);
5216 	if (ct == NULL) {
5217 		return;
5218 	}
5219 	MDI_CLIENT_LOCK(ct);
5220 	/*
5221 	 * Detach of Client failed. Restore back converse
5222 	 * state
5223 	 */
5224 	switch (cmd) {
5225 	case DDI_DETACH:
5226 		MDI_DEBUG(2, (CE_NOTE, dip,
5227 		    "!Client post_detach: called %p\n", (void *)ct));
5228 		if (DEVI_IS_ATTACHING(ct->ct_dip)) {
5229 			MDI_DEBUG(4, (CE_NOTE, dip, "i_mdi_client_post_detach "
5230 			    "i_mdi_pm_rele_client\n"));
5231 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
5232 		} else {
5233 			MDI_DEBUG(4, (CE_NOTE, dip, "i_mdi_client_post_detach "
5234 			    "i_mdi_pm_reset_client\n"));
5235 			i_mdi_pm_reset_client(ct);
5236 		}
5237 		if (error != DDI_SUCCESS)
5238 			MDI_CLIENT_SET_ATTACH(ct);
5239 		break;
5240 
5241 	case DDI_SUSPEND:
5242 		MDI_DEBUG(2, (CE_NOTE, dip,
5243 		    "!Client post_suspend: called %p\n", (void *)ct));
5244 		if (error != DDI_SUCCESS)
5245 			MDI_CLIENT_SET_RESUME(ct);
5246 		break;
5247 	}
5248 	MDI_CLIENT_UNLOCK(ct);
5249 }
5250 
5251 /*
5252  * create and install per-path (client - pHCI) statistics
5253  * I/O stats supported: nread, nwritten, reads, and writes
5254  * Error stats - hard errors, soft errors, & transport errors
5255  */
5256 static int
5257 i_mdi_pi_kstat_create(mdi_pathinfo_t *pip)
5258 {
5259 
5260 	dev_info_t *client = MDI_PI(pip)->pi_client->ct_dip;
5261 	dev_info_t *ppath = MDI_PI(pip)->pi_phci->ph_dip;
5262 	char ksname[KSTAT_STRLEN];
5263 	mdi_pathinfo_t *cpip;
5264 	const char *err_postfix = ",err";
5265 	kstat_t	*kiosp, *kerrsp;
5266 	struct pi_errs	*nsp;
5267 	struct mdi_pi_kstats *mdi_statp;
5268 
5269 	ASSERT(client != NULL && ppath != NULL);
5270 
5271 	ASSERT(MDI_CLIENT_LOCKED(MDI_PI(pip)->pi_client));
5272 
5273 	if (MDI_PI(pip)->pi_kstats != NULL)
5274 		return (MDI_SUCCESS);
5275 
5276 	for (cpip = MDI_PI(pip)->pi_client->ct_path_head; cpip != NULL;
5277 	    cpip = (mdi_pathinfo_t *)(MDI_PI(cpip)->pi_client_link)) {
5278 		if ((cpip == pip) || MDI_PI_IS_OFFLINE(pip))
5279 			continue;
5280 		/*
5281 		 * We have found a different path with same parent
5282 		 * kstats for a given client-pHCI are common
5283 		 */
5284 		if ((MDI_PI(cpip)->pi_phci->ph_dip == ppath) &&
5285 		    (MDI_PI(cpip)->pi_kstats != NULL)) {
5286 			MDI_PI(cpip)->pi_kstats->pi_kstat_ref++;
5287 			MDI_PI(pip)->pi_kstats = MDI_PI(cpip)->pi_kstats;
5288 			return (MDI_SUCCESS);
5289 		}
5290 	}
5291 
5292 	/*
5293 	 * stats are named as follows: TGTx.HBAy, e.g. "ssd0.fp0"
5294 	 * clamp length of name against max length of error kstat name
5295 	 */
5296 	if (snprintf(ksname, KSTAT_STRLEN, "%s%d.%s%d",
5297 	    ddi_driver_name(client), ddi_get_instance(client),
5298 	    ddi_driver_name(ppath), ddi_get_instance(ppath)) >
5299 	    (KSTAT_STRLEN - strlen(err_postfix))) {
5300 		return (MDI_FAILURE);
5301 	}
5302 	if ((kiosp = kstat_create("mdi", 0, ksname, "iopath",
5303 	    KSTAT_TYPE_IO, 1, 0)) == NULL) {
5304 		return (MDI_FAILURE);
5305 	}
5306 
5307 	(void) strcat(ksname, err_postfix);
5308 	kerrsp = kstat_create("mdi", 0, ksname, "iopath_errors",
5309 	    KSTAT_TYPE_NAMED,
5310 	    sizeof (struct pi_errs) / sizeof (kstat_named_t), 0);
5311 
5312 	if (kerrsp == NULL) {
5313 		kstat_delete(kiosp);
5314 		return (MDI_FAILURE);
5315 	}
5316 
5317 	nsp = (struct pi_errs *)kerrsp->ks_data;
5318 	kstat_named_init(&nsp->pi_softerrs, "Soft Errors", KSTAT_DATA_UINT32);
5319 	kstat_named_init(&nsp->pi_harderrs, "Hard Errors", KSTAT_DATA_UINT32);
5320 	kstat_named_init(&nsp->pi_transerrs, "Transport Errors",
5321 	    KSTAT_DATA_UINT32);
5322 	kstat_named_init(&nsp->pi_icnt_busy, "Interconnect Busy",
5323 	    KSTAT_DATA_UINT32);
5324 	kstat_named_init(&nsp->pi_icnt_errors, "Interconnect Errors",
5325 	    KSTAT_DATA_UINT32);
5326 	kstat_named_init(&nsp->pi_phci_rsrc, "pHCI No Resources",
5327 	    KSTAT_DATA_UINT32);
5328 	kstat_named_init(&nsp->pi_phci_localerr, "pHCI Local Errors",
5329 	    KSTAT_DATA_UINT32);
5330 	kstat_named_init(&nsp->pi_phci_invstate, "pHCI Invalid State",
5331 	    KSTAT_DATA_UINT32);
5332 	kstat_named_init(&nsp->pi_failedfrom, "Failed From",
5333 	    KSTAT_DATA_UINT32);
5334 	kstat_named_init(&nsp->pi_failedto, "Failed To", KSTAT_DATA_UINT32);
5335 
5336 	mdi_statp = kmem_alloc(sizeof (*mdi_statp), KM_SLEEP);
5337 	mdi_statp->pi_kstat_ref = 1;
5338 	mdi_statp->pi_kstat_iostats = kiosp;
5339 	mdi_statp->pi_kstat_errstats = kerrsp;
5340 	kstat_install(kiosp);
5341 	kstat_install(kerrsp);
5342 	MDI_PI(pip)->pi_kstats = mdi_statp;
5343 	return (MDI_SUCCESS);
5344 }
5345 
5346 /*
5347  * destroy per-path properties
5348  */
5349 static void
5350 i_mdi_pi_kstat_destroy(mdi_pathinfo_t *pip)
5351 {
5352 
5353 	struct mdi_pi_kstats *mdi_statp;
5354 
5355 	if ((mdi_statp = MDI_PI(pip)->pi_kstats) == NULL)
5356 		return;
5357 
5358 	MDI_PI(pip)->pi_kstats = NULL;
5359 
5360 	/*
5361 	 * the kstat may be shared between multiple pathinfo nodes
5362 	 * decrement this pathinfo's usage, removing the kstats
5363 	 * themselves when the last pathinfo reference is removed.
5364 	 */
5365 	ASSERT(mdi_statp->pi_kstat_ref > 0);
5366 	if (--mdi_statp->pi_kstat_ref != 0)
5367 		return;
5368 
5369 	kstat_delete(mdi_statp->pi_kstat_iostats);
5370 	kstat_delete(mdi_statp->pi_kstat_errstats);
5371 	kmem_free(mdi_statp, sizeof (*mdi_statp));
5372 }
5373 
5374 /*
5375  * update I/O paths KSTATS
5376  */
5377 void
5378 mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp)
5379 {
5380 	kstat_t *iostatp;
5381 	size_t xfer_cnt;
5382 
5383 	ASSERT(pip != NULL);
5384 
5385 	/*
5386 	 * I/O can be driven across a path prior to having path
5387 	 * statistics available, i.e. probe(9e).
5388 	 */
5389 	if (bp != NULL && MDI_PI(pip)->pi_kstats != NULL) {
5390 		iostatp = MDI_PI(pip)->pi_kstats->pi_kstat_iostats;
5391 		xfer_cnt = bp->b_bcount - bp->b_resid;
5392 		if (bp->b_flags & B_READ) {
5393 			KSTAT_IO_PTR(iostatp)->reads++;
5394 			KSTAT_IO_PTR(iostatp)->nread += xfer_cnt;
5395 		} else {
5396 			KSTAT_IO_PTR(iostatp)->writes++;
5397 			KSTAT_IO_PTR(iostatp)->nwritten += xfer_cnt;
5398 		}
5399 	}
5400 }
5401 
5402 /*
5403  * Enable the path(specific client/target/initiator)
5404  * Enabling a path means that MPxIO may select the enabled path for routing
5405  * future I/O requests, subject to other path state constraints.
5406  */
5407 int
5408 mdi_pi_enable_path(mdi_pathinfo_t *pip, int flags)
5409 {
5410 	mdi_phci_t	*ph;
5411 
5412 	ph = i_devi_get_phci(mdi_pi_get_phci(pip));
5413 	if (ph == NULL) {
5414 		MDI_DEBUG(1, (CE_NOTE, NULL, "!mdi_pi_enable_path:"
5415 			" failed. pip: %p ph = NULL\n", (void *)pip));
5416 		return (MDI_FAILURE);
5417 	}
5418 
5419 	(void) i_mdi_enable_disable_path(pip, ph->ph_vhci, flags,
5420 		MDI_ENABLE_OP);
5421 	MDI_DEBUG(5, (CE_NOTE, NULL, "!mdi_pi_enable_path:"
5422 		" Returning success pip = %p. ph = %p\n",
5423 		(void *)pip, (void *)ph));
5424 	return (MDI_SUCCESS);
5425 
5426 }
5427 
5428 /*
5429  * Disable the path (specific client/target/initiator)
5430  * Disabling a path means that MPxIO will not select the disabled path for
5431  * routing any new I/O requests.
5432  */
5433 int
5434 mdi_pi_disable_path(mdi_pathinfo_t *pip, int flags)
5435 {
5436 	mdi_phci_t	*ph;
5437 
5438 	ph = i_devi_get_phci(mdi_pi_get_phci(pip));
5439 	if (ph == NULL) {
5440 		MDI_DEBUG(1, (CE_NOTE, NULL, "!mdi_pi_disable_path:"
5441 			" failed. pip: %p ph = NULL\n", (void *)pip));
5442 		return (MDI_FAILURE);
5443 	}
5444 
5445 	(void) i_mdi_enable_disable_path(pip,
5446 			ph->ph_vhci, flags, MDI_DISABLE_OP);
5447 	MDI_DEBUG(5, (CE_NOTE, NULL, "!mdi_pi_disable_path:"
5448 		"Returning success pip = %p. ph = %p",
5449 		(void *)pip, (void *)ph));
5450 	return (MDI_SUCCESS);
5451 }
5452 
5453 /*
5454  * disable the path to a particular pHCI (pHCI specified in the phci_path
5455  * argument) for a particular client (specified in the client_path argument).
5456  * Disabling a path means that MPxIO will not select the disabled path for
5457  * routing any new I/O requests.
5458  * NOTE: this will be removed once the NWS files are changed to use the new
5459  * mdi_{enable,disable}_path interfaces
5460  */
5461 int
5462 mdi_pi_disable(dev_info_t *cdip, dev_info_t *pdip, int flags)
5463 {
5464 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_DISABLE_OP));
5465 }
5466 
5467 /*
5468  * Enable the path to a particular pHCI (pHCI specified in the phci_path
5469  * argument) for a particular client (specified in the client_path argument).
5470  * Enabling a path means that MPxIO may select the enabled path for routing
5471  * future I/O requests, subject to other path state constraints.
5472  * NOTE: this will be removed once the NWS files are changed to use the new
5473  * mdi_{enable,disable}_path interfaces
5474  */
5475 
5476 int
5477 mdi_pi_enable(dev_info_t *cdip, dev_info_t *pdip, int flags)
5478 {
5479 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_ENABLE_OP));
5480 }
5481 
5482 /*
5483  * Common routine for doing enable/disable.
5484  */
5485 static mdi_pathinfo_t *
5486 i_mdi_enable_disable_path(mdi_pathinfo_t *pip, mdi_vhci_t *vh, int flags,
5487 		int op)
5488 {
5489 	int		sync_flag = 0;
5490 	int		rv;
5491 	mdi_pathinfo_t 	*next;
5492 	int		(*f)() = NULL;
5493 
5494 	f = vh->vh_ops->vo_pi_state_change;
5495 
5496 	sync_flag = (flags << 8) & 0xf00;
5497 
5498 	/*
5499 	 * Do a callback into the mdi consumer to let it
5500 	 * know that path is about to get enabled/disabled.
5501 	 */
5502 	if (f != NULL) {
5503 		rv = (*f)(vh->vh_dip, pip, 0,
5504 			MDI_PI_EXT_STATE(pip),
5505 			MDI_EXT_STATE_CHANGE | sync_flag |
5506 			op | MDI_BEFORE_STATE_CHANGE);
5507 		if (rv != MDI_SUCCESS) {
5508 			MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5509 			"!vo_pi_state_change: failed rv = %x", rv));
5510 		}
5511 	}
5512 	MDI_PI_LOCK(pip);
5513 	next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5514 
5515 	switch (flags) {
5516 		case USER_DISABLE:
5517 			if (op == MDI_DISABLE_OP) {
5518 				MDI_PI_SET_USER_DISABLE(pip);
5519 			} else {
5520 				MDI_PI_SET_USER_ENABLE(pip);
5521 			}
5522 			break;
5523 		case DRIVER_DISABLE:
5524 			if (op == MDI_DISABLE_OP) {
5525 				MDI_PI_SET_DRV_DISABLE(pip);
5526 			} else {
5527 				MDI_PI_SET_DRV_ENABLE(pip);
5528 			}
5529 			break;
5530 		case DRIVER_DISABLE_TRANSIENT:
5531 			if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS) {
5532 				MDI_PI_SET_DRV_DISABLE_TRANS(pip);
5533 			} else {
5534 				MDI_PI_SET_DRV_ENABLE_TRANS(pip);
5535 			}
5536 			break;
5537 	}
5538 	MDI_PI_UNLOCK(pip);
5539 	/*
5540 	 * Do a callback into the mdi consumer to let it
5541 	 * know that path is now enabled/disabled.
5542 	 */
5543 	if (f != NULL) {
5544 		rv = (*f)(vh->vh_dip, pip, 0,
5545 			MDI_PI_EXT_STATE(pip),
5546 			MDI_EXT_STATE_CHANGE | sync_flag |
5547 			op | MDI_AFTER_STATE_CHANGE);
5548 		if (rv != MDI_SUCCESS) {
5549 			MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5550 			"!vo_pi_state_change: failed rv = %x", rv));
5551 		}
5552 	}
5553 	return (next);
5554 }
5555 
5556 /*
5557  * Common routine for doing enable/disable.
5558  * NOTE: this will be removed once the NWS files are changed to use the new
5559  * mdi_{enable,disable}_path has been putback
5560  */
5561 int
5562 i_mdi_pi_enable_disable(dev_info_t *cdip, dev_info_t *pdip, int flags, int op)
5563 {
5564 
5565 	mdi_phci_t	*ph;
5566 	mdi_vhci_t	*vh = NULL;
5567 	mdi_client_t	*ct;
5568 	mdi_pathinfo_t	*next, *pip;
5569 	int		found_it;
5570 
5571 	ph = i_devi_get_phci(pdip);
5572 	MDI_DEBUG(5, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
5573 		"Op = %d pdip = %p cdip = %p\n", op, (void *)pdip,
5574 		(void *)cdip));
5575 	if (ph == NULL) {
5576 		MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
5577 			"Op %d failed. ph = NULL\n", op));
5578 		return (MDI_FAILURE);
5579 	}
5580 
5581 	if ((op != MDI_ENABLE_OP) && (op != MDI_DISABLE_OP)) {
5582 		MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
5583 			"Op Invalid operation = %d\n", op));
5584 		return (MDI_FAILURE);
5585 	}
5586 
5587 	vh = ph->ph_vhci;
5588 
5589 	if (cdip == NULL) {
5590 		/*
5591 		 * Need to mark the Phci as enabled/disabled.
5592 		 */
5593 		MDI_DEBUG(3, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
5594 		"Op %d for the phci\n", op));
5595 		MDI_PHCI_LOCK(ph);
5596 		switch (flags) {
5597 			case USER_DISABLE:
5598 				if (op == MDI_DISABLE_OP) {
5599 					MDI_PHCI_SET_USER_DISABLE(ph);
5600 				} else {
5601 					MDI_PHCI_SET_USER_ENABLE(ph);
5602 				}
5603 				break;
5604 			case DRIVER_DISABLE:
5605 				if (op == MDI_DISABLE_OP) {
5606 					MDI_PHCI_SET_DRV_DISABLE(ph);
5607 				} else {
5608 					MDI_PHCI_SET_DRV_ENABLE(ph);
5609 				}
5610 				break;
5611 			case DRIVER_DISABLE_TRANSIENT:
5612 				if (op == MDI_DISABLE_OP) {
5613 					MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph);
5614 				} else {
5615 					MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph);
5616 				}
5617 				break;
5618 			default:
5619 				MDI_PHCI_UNLOCK(ph);
5620 				MDI_DEBUG(1, (CE_NOTE, NULL,
5621 				"!i_mdi_pi_enable_disable:"
5622 				" Invalid flag argument= %d\n", flags));
5623 		}
5624 
5625 		/*
5626 		 * Phci has been disabled. Now try to enable/disable
5627 		 * path info's to each client.
5628 		 */
5629 		pip = ph->ph_path_head;
5630 		while (pip != NULL) {
5631 			pip = i_mdi_enable_disable_path(pip, vh, flags, op);
5632 		}
5633 		MDI_PHCI_UNLOCK(ph);
5634 	} else {
5635 
5636 		/*
5637 		 * Disable a specific client.
5638 		 */
5639 		ct = i_devi_get_client(cdip);
5640 		if (ct == NULL) {
5641 			MDI_DEBUG(1, (CE_NOTE, NULL,
5642 			"!i_mdi_pi_enable_disable:"
5643 			" failed. ct = NULL operation = %d\n", op));
5644 			return (MDI_FAILURE);
5645 		}
5646 
5647 		MDI_CLIENT_LOCK(ct);
5648 		pip = ct->ct_path_head;
5649 		found_it = 0;
5650 		while (pip != NULL) {
5651 			MDI_PI_LOCK(pip);
5652 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
5653 			if (MDI_PI(pip)->pi_phci == ph) {
5654 				MDI_PI_UNLOCK(pip);
5655 				found_it = 1;
5656 				break;
5657 			}
5658 			MDI_PI_UNLOCK(pip);
5659 			pip = next;
5660 		}
5661 
5662 
5663 		MDI_CLIENT_UNLOCK(ct);
5664 		if (found_it == 0) {
5665 			MDI_DEBUG(1, (CE_NOTE, NULL,
5666 			"!i_mdi_pi_enable_disable:"
5667 			" failed. Could not find corresponding pip\n"));
5668 			return (MDI_FAILURE);
5669 		}
5670 
5671 		(void) i_mdi_enable_disable_path(pip, vh, flags, op);
5672 	}
5673 
5674 	MDI_DEBUG(5, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
5675 		"Op %d Returning success pdip = %p cdip = %p\n",
5676 		op, (void *)pdip, (void *)cdip));
5677 	return (MDI_SUCCESS);
5678 }
5679 
5680 /*
5681  * Ensure phci powered up
5682  */
5683 static void
5684 i_mdi_pm_hold_pip(mdi_pathinfo_t *pip)
5685 {
5686 	dev_info_t	*ph_dip;
5687 
5688 	ASSERT(pip != NULL);
5689 	ASSERT(MDI_PI_LOCKED(pip));
5690 
5691 	if (MDI_PI(pip)->pi_pm_held) {
5692 		return;
5693 	}
5694 
5695 	ph_dip = mdi_pi_get_phci(pip);
5696 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_pm_hold_pip for %s%d %p\n",
5697 	    ddi_get_name(ph_dip), ddi_get_instance(ph_dip), (void *)pip));
5698 	if (ph_dip == NULL) {
5699 		return;
5700 	}
5701 
5702 	MDI_PI_UNLOCK(pip);
5703 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt was %d\n",
5704 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5705 
5706 	pm_hold_power(ph_dip);
5707 
5708 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt is %d\n",
5709 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5710 	MDI_PI_LOCK(pip);
5711 
5712 	MDI_PI(pip)->pi_pm_held = 1;
5713 }
5714 
5715 /*
5716  * Allow phci powered down
5717  */
5718 static void
5719 i_mdi_pm_rele_pip(mdi_pathinfo_t *pip)
5720 {
5721 	dev_info_t	*ph_dip = NULL;
5722 
5723 	ASSERT(pip != NULL);
5724 	ASSERT(MDI_PI_LOCKED(pip));
5725 
5726 	if (MDI_PI(pip)->pi_pm_held == 0) {
5727 		return;
5728 	}
5729 
5730 	ph_dip = mdi_pi_get_phci(pip);
5731 	ASSERT(ph_dip != NULL);
5732 
5733 	MDI_PI_UNLOCK(pip);
5734 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_pm_rele_pip for %s%d %p\n",
5735 	    ddi_get_name(ph_dip), ddi_get_instance(ph_dip), (void *)pip));
5736 
5737 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt was %d\n",
5738 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5739 	pm_rele_power(ph_dip);
5740 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt is %d\n",
5741 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5742 
5743 	MDI_PI_LOCK(pip);
5744 	MDI_PI(pip)->pi_pm_held = 0;
5745 }
5746 
5747 static void
5748 i_mdi_pm_hold_client(mdi_client_t *ct, int incr)
5749 {
5750 	ASSERT(MDI_CLIENT_LOCKED(ct));
5751 
5752 	ct->ct_power_cnt += incr;
5753 	MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_hold_client %p "
5754 	    "ct_power_cnt = %d incr = %d\n", (void *)ct,
5755 	    ct->ct_power_cnt, incr));
5756 	ASSERT(ct->ct_power_cnt >= 0);
5757 }
5758 
5759 static void
5760 i_mdi_rele_all_phci(mdi_client_t *ct)
5761 {
5762 	mdi_pathinfo_t  *pip;
5763 
5764 	ASSERT(MDI_CLIENT_LOCKED(ct));
5765 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
5766 	while (pip != NULL) {
5767 		mdi_hold_path(pip);
5768 		MDI_PI_LOCK(pip);
5769 		i_mdi_pm_rele_pip(pip);
5770 		MDI_PI_UNLOCK(pip);
5771 		mdi_rele_path(pip);
5772 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
5773 	}
5774 }
5775 
5776 static void
5777 i_mdi_pm_rele_client(mdi_client_t *ct, int decr)
5778 {
5779 	ASSERT(MDI_CLIENT_LOCKED(ct));
5780 
5781 	if (i_ddi_devi_attached(ct->ct_dip)) {
5782 		ct->ct_power_cnt -= decr;
5783 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_rele_client %p "
5784 		    "ct_power_cnt = %d decr = %d\n",
5785 		    (void *)ct, ct->ct_power_cnt, decr));
5786 	}
5787 
5788 	ASSERT(ct->ct_power_cnt >= 0);
5789 	if (ct->ct_power_cnt == 0) {
5790 		i_mdi_rele_all_phci(ct);
5791 		return;
5792 	}
5793 }
5794 
5795 static void
5796 i_mdi_pm_reset_client(mdi_client_t *ct)
5797 {
5798 	MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_reset_client %p "
5799 	    "ct_power_cnt = %d\n", (void *)ct, ct->ct_power_cnt));
5800 	ASSERT(MDI_CLIENT_LOCKED(ct));
5801 	ct->ct_power_cnt = 0;
5802 	i_mdi_rele_all_phci(ct);
5803 	ct->ct_powercnt_config = 0;
5804 	ct->ct_powercnt_unconfig = 0;
5805 	ct->ct_powercnt_reset = 1;
5806 }
5807 
5808 static int
5809 i_mdi_power_one_phci(mdi_pathinfo_t *pip)
5810 {
5811 	int		ret;
5812 	dev_info_t	*ph_dip;
5813 
5814 	MDI_PI_LOCK(pip);
5815 	i_mdi_pm_hold_pip(pip);
5816 
5817 	ph_dip = mdi_pi_get_phci(pip);
5818 	MDI_PI_UNLOCK(pip);
5819 
5820 	/* bring all components of phci to full power */
5821 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_power_one_phci "
5822 	    "pm_powerup for %s%d %p\n", ddi_get_name(ph_dip),
5823 	    ddi_get_instance(ph_dip), (void *)pip));
5824 
5825 	ret = pm_powerup(ph_dip);
5826 
5827 	if (ret == DDI_FAILURE) {
5828 		MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_power_one_phci "
5829 		    "pm_powerup FAILED for %s%d %p\n",
5830 		    ddi_get_name(ph_dip), ddi_get_instance(ph_dip),
5831 		    (void *)pip));
5832 
5833 		MDI_PI_LOCK(pip);
5834 		i_mdi_pm_rele_pip(pip);
5835 		MDI_PI_UNLOCK(pip);
5836 		return (MDI_FAILURE);
5837 	}
5838 
5839 	return (MDI_SUCCESS);
5840 }
5841 
5842 static int
5843 i_mdi_power_all_phci(mdi_client_t *ct)
5844 {
5845 	mdi_pathinfo_t  *pip;
5846 	int		succeeded = 0;
5847 
5848 	ASSERT(MDI_CLIENT_LOCKED(ct));
5849 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
5850 	while (pip != NULL) {
5851 		if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip)) {
5852 			mdi_hold_path(pip);
5853 			MDI_CLIENT_UNLOCK(ct);
5854 			if (i_mdi_power_one_phci(pip) == MDI_SUCCESS)
5855 				succeeded = 1;
5856 
5857 			ASSERT(ct == MDI_PI(pip)->pi_client);
5858 			MDI_CLIENT_LOCK(ct);
5859 			mdi_rele_path(pip);
5860 		}
5861 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
5862 	}
5863 
5864 	return (succeeded ? MDI_SUCCESS : MDI_FAILURE);
5865 }
5866 
5867 /*
5868  * mdi_bus_power():
5869  *		1. Place the phci(s) into powered up state so that
5870  *		   client can do power management
5871  *		2. Ensure phci powered up as client power managing
5872  * Return Values:
5873  *		MDI_SUCCESS
5874  *		MDI_FAILURE
5875  */
5876 int
5877 mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
5878     void *arg, void *result)
5879 {
5880 	int			ret = MDI_SUCCESS;
5881 	pm_bp_child_pwrchg_t	*bpc;
5882 	mdi_client_t		*ct;
5883 	dev_info_t		*cdip;
5884 	pm_bp_has_changed_t	*bphc;
5885 
5886 	/*
5887 	 * BUS_POWER_NOINVOL not supported
5888 	 */
5889 	if (op == BUS_POWER_NOINVOL)
5890 		return (MDI_FAILURE);
5891 
5892 	/*
5893 	 * ignore other OPs.
5894 	 * return quickly to save cou cycles on the ct processing
5895 	 */
5896 	switch (op) {
5897 	case BUS_POWER_PRE_NOTIFICATION:
5898 	case BUS_POWER_POST_NOTIFICATION:
5899 		bpc = (pm_bp_child_pwrchg_t *)arg;
5900 		cdip = bpc->bpc_dip;
5901 		break;
5902 	case BUS_POWER_HAS_CHANGED:
5903 		bphc = (pm_bp_has_changed_t *)arg;
5904 		cdip = bphc->bphc_dip;
5905 		break;
5906 	default:
5907 		return (pm_busop_bus_power(parent, impl_arg, op, arg, result));
5908 	}
5909 
5910 	ASSERT(MDI_CLIENT(cdip));
5911 
5912 	ct = i_devi_get_client(cdip);
5913 	if (ct == NULL)
5914 		return (MDI_FAILURE);
5915 
5916 	/*
5917 	 * wait till the mdi_pathinfo node state change are processed
5918 	 */
5919 	MDI_CLIENT_LOCK(ct);
5920 	switch (op) {
5921 	case BUS_POWER_PRE_NOTIFICATION:
5922 		MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip, "mdi_bus_power "
5923 		    "BUS_POWER_PRE_NOTIFICATION:"
5924 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d\n",
5925 		    PM_NAME(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
5926 		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp));
5927 
5928 		/* serialize power level change per client */
5929 		while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
5930 			cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
5931 
5932 		MDI_CLIENT_SET_POWER_TRANSITION(ct);
5933 
5934 		if (ct->ct_power_cnt == 0) {
5935 			ret = i_mdi_power_all_phci(ct);
5936 		}
5937 
5938 		/*
5939 		 * if new_level > 0:
5940 		 *	- hold phci(s)
5941 		 *	- power up phci(s) if not already
5942 		 * ignore power down
5943 		 */
5944 		if (bpc->bpc_nlevel > 0) {
5945 			if (!DEVI_IS_ATTACHING(ct->ct_dip)) {
5946 				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
5947 				    "mdi_bus_power i_mdi_pm_hold_client\n"));
5948 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
5949 			}
5950 		}
5951 		break;
5952 	case BUS_POWER_POST_NOTIFICATION:
5953 		MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip, "mdi_bus_power "
5954 		    "BUS_POWER_POST_NOTIFICATION:"
5955 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d result=%d\n",
5956 		    PM_NAME(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
5957 		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp,
5958 		    *(int *)result));
5959 
5960 		if (*(int *)result == DDI_SUCCESS) {
5961 			if (bpc->bpc_nlevel > 0) {
5962 				MDI_CLIENT_SET_POWER_UP(ct);
5963 			} else {
5964 				MDI_CLIENT_SET_POWER_DOWN(ct);
5965 			}
5966 		}
5967 
5968 		/* release the hold we did in pre-notification */
5969 		if (bpc->bpc_nlevel > 0 && (*(int *)result != DDI_SUCCESS) &&
5970 		    !DEVI_IS_ATTACHING(ct->ct_dip)) {
5971 			MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
5972 			    "mdi_bus_power i_mdi_pm_rele_client\n"));
5973 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
5974 		}
5975 
5976 		if (bpc->bpc_nlevel == 0 && (*(int *)result == DDI_SUCCESS)) {
5977 			/* another thread might started attaching */
5978 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
5979 				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
5980 				    "mdi_bus_power i_mdi_pm_rele_client\n"));
5981 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
5982 			/* detaching has been taken care in pm_post_unconfig */
5983 			} else if (!DEVI_IS_DETACHING(ct->ct_dip)) {
5984 				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
5985 				    "mdi_bus_power i_mdi_pm_reset_client\n"));
5986 				i_mdi_pm_reset_client(ct);
5987 			}
5988 		}
5989 
5990 		MDI_CLIENT_CLEAR_POWER_TRANSITION(ct);
5991 		cv_broadcast(&ct->ct_powerchange_cv);
5992 
5993 		break;
5994 
5995 	/* need to do more */
5996 	case BUS_POWER_HAS_CHANGED:
5997 		MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip, "mdi_bus_power "
5998 		    "BUS_POWER_HAS_CHANGED:"
5999 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d\n",
6000 		    PM_NAME(bphc->bphc_dip), PM_ADDR(bphc->bphc_dip),
6001 		    bphc->bphc_olevel, bphc->bphc_nlevel, bphc->bphc_comp));
6002 
6003 		if (bphc->bphc_nlevel > 0 &&
6004 		    bphc->bphc_nlevel > bphc->bphc_olevel) {
6005 			if (ct->ct_power_cnt == 0) {
6006 				ret = i_mdi_power_all_phci(ct);
6007 			}
6008 			MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip,
6009 			    "mdi_bus_power i_mdi_pm_hold_client\n"));
6010 			i_mdi_pm_hold_client(ct, ct->ct_path_count);
6011 		}
6012 
6013 		if (bphc->bphc_nlevel == 0 && bphc->bphc_olevel != -1) {
6014 			MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip,
6015 			    "mdi_bus_power i_mdi_pm_rele_client\n"));
6016 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6017 		}
6018 		break;
6019 	}
6020 
6021 	MDI_CLIENT_UNLOCK(ct);
6022 	return (ret);
6023 }
6024 
6025 static int
6026 i_mdi_pm_pre_config_one(dev_info_t *child)
6027 {
6028 	int		ret = MDI_SUCCESS;
6029 	mdi_client_t	*ct;
6030 
6031 	ct = i_devi_get_client(child);
6032 	if (ct == NULL)
6033 		return (MDI_FAILURE);
6034 
6035 	MDI_CLIENT_LOCK(ct);
6036 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6037 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6038 
6039 	if (!MDI_CLIENT_IS_FAILED(ct)) {
6040 		MDI_CLIENT_UNLOCK(ct);
6041 		MDI_DEBUG(4, (CE_NOTE, child,
6042 		    "i_mdi_pm_pre_config_one already configured\n"));
6043 		return (MDI_SUCCESS);
6044 	}
6045 
6046 	if (ct->ct_powercnt_config) {
6047 		MDI_CLIENT_UNLOCK(ct);
6048 		MDI_DEBUG(4, (CE_NOTE, child,
6049 		    "i_mdi_pm_pre_config_one ALREADY held\n"));
6050 		return (MDI_SUCCESS);
6051 	}
6052 
6053 	if (ct->ct_power_cnt == 0) {
6054 		ret = i_mdi_power_all_phci(ct);
6055 	}
6056 	MDI_DEBUG(4, (CE_NOTE, child,
6057 	    "i_mdi_pm_pre_config_one i_mdi_pm_hold_client\n"));
6058 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6059 	ct->ct_powercnt_config = 1;
6060 	ct->ct_powercnt_reset = 0;
6061 	MDI_CLIENT_UNLOCK(ct);
6062 	return (ret);
6063 }
6064 
6065 static int
6066 i_mdi_pm_pre_config(dev_info_t *vdip, dev_info_t *child)
6067 {
6068 	int			ret = MDI_SUCCESS;
6069 	dev_info_t		*cdip;
6070 	int			circ;
6071 
6072 	ASSERT(MDI_VHCI(vdip));
6073 
6074 	/* ndi_devi_config_one */
6075 	if (child) {
6076 		ASSERT(DEVI_BUSY_OWNED(vdip));
6077 		return (i_mdi_pm_pre_config_one(child));
6078 	}
6079 
6080 	/* devi_config_common */
6081 	ndi_devi_enter(vdip, &circ);
6082 	cdip = ddi_get_child(vdip);
6083 	while (cdip) {
6084 		dev_info_t *next = ddi_get_next_sibling(cdip);
6085 
6086 		ret = i_mdi_pm_pre_config_one(cdip);
6087 		if (ret != MDI_SUCCESS)
6088 			break;
6089 		cdip = next;
6090 	}
6091 	ndi_devi_exit(vdip, circ);
6092 	return (ret);
6093 }
6094 
6095 static int
6096 i_mdi_pm_pre_unconfig_one(dev_info_t *child, int *held, int flags)
6097 {
6098 	int		ret = MDI_SUCCESS;
6099 	mdi_client_t	*ct;
6100 
6101 	ct = i_devi_get_client(child);
6102 	if (ct == NULL)
6103 		return (MDI_FAILURE);
6104 
6105 	MDI_CLIENT_LOCK(ct);
6106 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6107 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6108 
6109 	if (!i_ddi_devi_attached(ct->ct_dip)) {
6110 		MDI_DEBUG(4, (CE_NOTE, child,
6111 		    "i_mdi_pm_pre_unconfig node detached already\n"));
6112 		MDI_CLIENT_UNLOCK(ct);
6113 		return (MDI_SUCCESS);
6114 	}
6115 
6116 	if (MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6117 	    (flags & NDI_AUTODETACH)) {
6118 		MDI_DEBUG(4, (CE_NOTE, child,
6119 		    "i_mdi_pm_pre_unconfig auto-modunload\n"));
6120 		MDI_CLIENT_UNLOCK(ct);
6121 		return (MDI_FAILURE);
6122 	}
6123 
6124 	if (ct->ct_powercnt_unconfig) {
6125 		MDI_DEBUG(4, (CE_NOTE, child,
6126 		    "i_mdi_pm_pre_unconfig ct_powercnt_held\n"));
6127 		MDI_CLIENT_UNLOCK(ct);
6128 		*held = 1;
6129 		return (MDI_SUCCESS);
6130 	}
6131 
6132 	if (ct->ct_power_cnt == 0) {
6133 		ret = i_mdi_power_all_phci(ct);
6134 	}
6135 	MDI_DEBUG(4, (CE_NOTE, child,
6136 	    "i_mdi_pm_pre_unconfig i_mdi_pm_hold_client\n"));
6137 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6138 	ct->ct_powercnt_unconfig = 1;
6139 	ct->ct_powercnt_reset = 0;
6140 	MDI_CLIENT_UNLOCK(ct);
6141 	if (ret == MDI_SUCCESS)
6142 		*held = 1;
6143 	return (ret);
6144 }
6145 
6146 static int
6147 i_mdi_pm_pre_unconfig(dev_info_t *vdip, dev_info_t *child, int *held,
6148     int flags)
6149 {
6150 	int			ret = MDI_SUCCESS;
6151 	dev_info_t		*cdip;
6152 	int			circ;
6153 
6154 	ASSERT(MDI_VHCI(vdip));
6155 	*held = 0;
6156 
6157 	/* ndi_devi_unconfig_one */
6158 	if (child) {
6159 		ASSERT(DEVI_BUSY_OWNED(vdip));
6160 		return (i_mdi_pm_pre_unconfig_one(child, held, flags));
6161 	}
6162 
6163 	/* devi_unconfig_common */
6164 	ndi_devi_enter(vdip, &circ);
6165 	cdip = ddi_get_child(vdip);
6166 	while (cdip) {
6167 		dev_info_t *next = ddi_get_next_sibling(cdip);
6168 
6169 		ret = i_mdi_pm_pre_unconfig_one(cdip, held, flags);
6170 		cdip = next;
6171 	}
6172 	ndi_devi_exit(vdip, circ);
6173 
6174 	if (*held)
6175 		ret = MDI_SUCCESS;
6176 
6177 	return (ret);
6178 }
6179 
6180 static void
6181 i_mdi_pm_post_config_one(dev_info_t *child)
6182 {
6183 	mdi_client_t	*ct;
6184 
6185 	ct = i_devi_get_client(child);
6186 	if (ct == NULL)
6187 		return;
6188 
6189 	MDI_CLIENT_LOCK(ct);
6190 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6191 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6192 
6193 	if (ct->ct_powercnt_reset || !ct->ct_powercnt_config) {
6194 		MDI_DEBUG(4, (CE_NOTE, child,
6195 		    "i_mdi_pm_post_config_one NOT configured\n"));
6196 		MDI_CLIENT_UNLOCK(ct);
6197 		return;
6198 	}
6199 
6200 	/* client has not been updated */
6201 	if (MDI_CLIENT_IS_FAILED(ct)) {
6202 		MDI_DEBUG(4, (CE_NOTE, child,
6203 		    "i_mdi_pm_post_config_one NOT configured\n"));
6204 		MDI_CLIENT_UNLOCK(ct);
6205 		return;
6206 	}
6207 
6208 	/* another thread might have powered it down or detached it */
6209 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6210 	    !DEVI_IS_ATTACHING(ct->ct_dip)) ||
6211 	    (!i_ddi_devi_attached(ct->ct_dip) &&
6212 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6213 		MDI_DEBUG(4, (CE_NOTE, child,
6214 		    "i_mdi_pm_post_config i_mdi_pm_reset_client\n"));
6215 		i_mdi_pm_reset_client(ct);
6216 	} else {
6217 		mdi_pathinfo_t  *pip, *next;
6218 		int	valid_path_count = 0;
6219 
6220 		MDI_DEBUG(4, (CE_NOTE, child,
6221 		    "i_mdi_pm_post_config i_mdi_pm_rele_client\n"));
6222 		pip = ct->ct_path_head;
6223 		while (pip != NULL) {
6224 			MDI_PI_LOCK(pip);
6225 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6226 			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
6227 				valid_path_count ++;
6228 			MDI_PI_UNLOCK(pip);
6229 			pip = next;
6230 		}
6231 		i_mdi_pm_rele_client(ct, valid_path_count);
6232 	}
6233 	ct->ct_powercnt_config = 0;
6234 	MDI_CLIENT_UNLOCK(ct);
6235 }
6236 
6237 static void
6238 i_mdi_pm_post_config(dev_info_t *vdip, dev_info_t *child)
6239 {
6240 	int		circ;
6241 	dev_info_t	*cdip;
6242 
6243 	ASSERT(MDI_VHCI(vdip));
6244 
6245 	/* ndi_devi_config_one */
6246 	if (child) {
6247 		ASSERT(DEVI_BUSY_OWNED(vdip));
6248 		i_mdi_pm_post_config_one(child);
6249 		return;
6250 	}
6251 
6252 	/* devi_config_common */
6253 	ndi_devi_enter(vdip, &circ);
6254 	cdip = ddi_get_child(vdip);
6255 	while (cdip) {
6256 		dev_info_t *next = ddi_get_next_sibling(cdip);
6257 
6258 		i_mdi_pm_post_config_one(cdip);
6259 		cdip = next;
6260 	}
6261 	ndi_devi_exit(vdip, circ);
6262 }
6263 
6264 static void
6265 i_mdi_pm_post_unconfig_one(dev_info_t *child)
6266 {
6267 	mdi_client_t	*ct;
6268 
6269 	ct = i_devi_get_client(child);
6270 	if (ct == NULL)
6271 		return;
6272 
6273 	MDI_CLIENT_LOCK(ct);
6274 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6275 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6276 
6277 	if (!ct->ct_powercnt_unconfig || ct->ct_powercnt_reset) {
6278 		MDI_DEBUG(4, (CE_NOTE, child,
6279 		    "i_mdi_pm_post_unconfig NOT held\n"));
6280 		MDI_CLIENT_UNLOCK(ct);
6281 		return;
6282 	}
6283 
6284 	/* failure detaching or another thread just attached it */
6285 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6286 	    i_ddi_devi_attached(ct->ct_dip)) ||
6287 	    (!i_ddi_devi_attached(ct->ct_dip) &&
6288 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6289 		MDI_DEBUG(4, (CE_NOTE, child,
6290 		    "i_mdi_pm_post_unconfig i_mdi_pm_reset_client\n"));
6291 		i_mdi_pm_reset_client(ct);
6292 	} else {
6293 		mdi_pathinfo_t  *pip, *next;
6294 		int	valid_path_count = 0;
6295 
6296 		MDI_DEBUG(4, (CE_NOTE, child,
6297 		    "i_mdi_pm_post_unconfig i_mdi_pm_rele_client\n"));
6298 		pip = ct->ct_path_head;
6299 		while (pip != NULL) {
6300 			MDI_PI_LOCK(pip);
6301 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6302 			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
6303 				valid_path_count ++;
6304 			MDI_PI_UNLOCK(pip);
6305 			pip = next;
6306 		}
6307 		i_mdi_pm_rele_client(ct, valid_path_count);
6308 		ct->ct_powercnt_unconfig = 0;
6309 	}
6310 
6311 	MDI_CLIENT_UNLOCK(ct);
6312 }
6313 
6314 static void
6315 i_mdi_pm_post_unconfig(dev_info_t *vdip, dev_info_t *child, int held)
6316 {
6317 	int			circ;
6318 	dev_info_t		*cdip;
6319 
6320 	ASSERT(MDI_VHCI(vdip));
6321 
6322 	if (!held) {
6323 		MDI_DEBUG(4, (CE_NOTE, vdip,
6324 		    "i_mdi_pm_post_unconfig held = %d\n", held));
6325 		return;
6326 	}
6327 
6328 	if (child) {
6329 		ASSERT(DEVI_BUSY_OWNED(vdip));
6330 		i_mdi_pm_post_unconfig_one(child);
6331 		return;
6332 	}
6333 
6334 	ndi_devi_enter(vdip, &circ);
6335 	cdip = ddi_get_child(vdip);
6336 	while (cdip) {
6337 		dev_info_t *next = ddi_get_next_sibling(cdip);
6338 
6339 		i_mdi_pm_post_unconfig_one(cdip);
6340 		cdip = next;
6341 	}
6342 	ndi_devi_exit(vdip, circ);
6343 }
6344 
6345 int
6346 mdi_power(dev_info_t *vdip, mdi_pm_op_t op, void *args, char *devnm, int flags)
6347 {
6348 	int			circ, ret = MDI_SUCCESS;
6349 	dev_info_t		*client_dip = NULL;
6350 	mdi_client_t		*ct;
6351 
6352 	/*
6353 	 * Handling ndi_devi_config_one and ndi_devi_unconfig_one.
6354 	 * Power up pHCI for the named client device.
6355 	 * Note: Before the client is enumerated under vhci by phci,
6356 	 * client_dip can be NULL. Then proceed to power up all the
6357 	 * pHCIs.
6358 	 */
6359 	if (devnm != NULL) {
6360 		ndi_devi_enter(vdip, &circ);
6361 		client_dip = ndi_devi_findchild(vdip, devnm);
6362 	}
6363 
6364 	MDI_DEBUG(4, (CE_NOTE, vdip, "mdi_power op = %d %s %p\n",
6365 	    op, devnm ? devnm : "NULL", (void *)client_dip));
6366 
6367 	switch (op) {
6368 	case MDI_PM_PRE_CONFIG:
6369 		ret = i_mdi_pm_pre_config(vdip, client_dip);
6370 		break;
6371 
6372 	case MDI_PM_PRE_UNCONFIG:
6373 		ret = i_mdi_pm_pre_unconfig(vdip, client_dip, (int *)args,
6374 		    flags);
6375 		break;
6376 
6377 	case MDI_PM_POST_CONFIG:
6378 		i_mdi_pm_post_config(vdip, client_dip);
6379 		break;
6380 
6381 	case MDI_PM_POST_UNCONFIG:
6382 		i_mdi_pm_post_unconfig(vdip, client_dip, *(int *)args);
6383 		break;
6384 
6385 	case MDI_PM_HOLD_POWER:
6386 	case MDI_PM_RELE_POWER:
6387 		ASSERT(args);
6388 
6389 		client_dip = (dev_info_t *)args;
6390 		ASSERT(MDI_CLIENT(client_dip));
6391 
6392 		ct = i_devi_get_client(client_dip);
6393 		MDI_CLIENT_LOCK(ct);
6394 
6395 		if (op == MDI_PM_HOLD_POWER) {
6396 			if (ct->ct_power_cnt == 0) {
6397 				(void) i_mdi_power_all_phci(ct);
6398 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6399 				    "mdi_power i_mdi_pm_hold_client\n"));
6400 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
6401 			}
6402 		} else {
6403 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
6404 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6405 				    "mdi_power i_mdi_pm_rele_client\n"));
6406 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
6407 			} else {
6408 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6409 				    "mdi_power i_mdi_pm_reset_client\n"));
6410 				i_mdi_pm_reset_client(ct);
6411 			}
6412 		}
6413 
6414 		MDI_CLIENT_UNLOCK(ct);
6415 		break;
6416 
6417 	default:
6418 		break;
6419 	}
6420 
6421 	if (devnm)
6422 		ndi_devi_exit(vdip, circ);
6423 
6424 	return (ret);
6425 }
6426 
6427 int
6428 mdi_component_is_vhci(dev_info_t *dip, const char **mdi_class)
6429 {
6430 	mdi_vhci_t *vhci;
6431 
6432 	if (!MDI_VHCI(dip))
6433 		return (MDI_FAILURE);
6434 
6435 	if (mdi_class) {
6436 		vhci = DEVI(dip)->devi_mdi_xhci;
6437 		ASSERT(vhci);
6438 		*mdi_class = vhci->vh_class;
6439 	}
6440 
6441 	return (MDI_SUCCESS);
6442 }
6443 
6444 int
6445 mdi_component_is_phci(dev_info_t *dip, const char **mdi_class)
6446 {
6447 	mdi_phci_t *phci;
6448 
6449 	if (!MDI_PHCI(dip))
6450 		return (MDI_FAILURE);
6451 
6452 	if (mdi_class) {
6453 		phci = DEVI(dip)->devi_mdi_xhci;
6454 		ASSERT(phci);
6455 		*mdi_class = phci->ph_vhci->vh_class;
6456 	}
6457 
6458 	return (MDI_SUCCESS);
6459 }
6460 
6461 int
6462 mdi_component_is_client(dev_info_t *dip, const char **mdi_class)
6463 {
6464 	mdi_client_t *client;
6465 
6466 	if (!MDI_CLIENT(dip))
6467 		return (MDI_FAILURE);
6468 
6469 	if (mdi_class) {
6470 		client = DEVI(dip)->devi_mdi_client;
6471 		ASSERT(client);
6472 		*mdi_class = client->ct_vhci->vh_class;
6473 	}
6474 
6475 	return (MDI_SUCCESS);
6476 }
6477 
6478 void *
6479 mdi_client_get_vhci_private(dev_info_t *dip)
6480 {
6481 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
6482 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
6483 		mdi_client_t	*ct;
6484 		ct = i_devi_get_client(dip);
6485 		return (ct->ct_vprivate);
6486 	}
6487 	return (NULL);
6488 }
6489 
6490 void
6491 mdi_client_set_vhci_private(dev_info_t *dip, void *data)
6492 {
6493 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
6494 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
6495 		mdi_client_t	*ct;
6496 		ct = i_devi_get_client(dip);
6497 		ct->ct_vprivate = data;
6498 	}
6499 }
6500 /*
6501  * mdi_pi_get_vhci_private():
6502  *		Get the vhci private information associated with the
6503  *		mdi_pathinfo node
6504  */
6505 void *
6506 mdi_pi_get_vhci_private(mdi_pathinfo_t *pip)
6507 {
6508 	caddr_t	vprivate = NULL;
6509 	if (pip) {
6510 		vprivate = MDI_PI(pip)->pi_vprivate;
6511 	}
6512 	return (vprivate);
6513 }
6514 
6515 /*
6516  * mdi_pi_set_vhci_private():
6517  *		Set the vhci private information in the mdi_pathinfo node
6518  */
6519 void
6520 mdi_pi_set_vhci_private(mdi_pathinfo_t *pip, void *priv)
6521 {
6522 	if (pip) {
6523 		MDI_PI(pip)->pi_vprivate = priv;
6524 	}
6525 }
6526 
6527 /*
6528  * mdi_phci_get_vhci_private():
6529  *		Get the vhci private information associated with the
6530  *		mdi_phci node
6531  */
6532 void *
6533 mdi_phci_get_vhci_private(dev_info_t *dip)
6534 {
6535 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
6536 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
6537 		mdi_phci_t	*ph;
6538 		ph = i_devi_get_phci(dip);
6539 		return (ph->ph_vprivate);
6540 	}
6541 	return (NULL);
6542 }
6543 
6544 /*
6545  * mdi_phci_set_vhci_private():
6546  *		Set the vhci private information in the mdi_phci node
6547  */
6548 void
6549 mdi_phci_set_vhci_private(dev_info_t *dip, void *priv)
6550 {
6551 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
6552 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
6553 		mdi_phci_t	*ph;
6554 		ph = i_devi_get_phci(dip);
6555 		ph->ph_vprivate = priv;
6556 	}
6557 }
6558 
6559 /*
6560  * List of vhci class names:
6561  * A vhci class name must be in this list only if the corresponding vhci
6562  * driver intends to use the mdi provided bus config implementation
6563  * (i.e., mdi_vhci_bus_config()).
6564  */
6565 static char *vhci_class_list[] = { MDI_HCI_CLASS_SCSI, MDI_HCI_CLASS_IB };
6566 #define	N_VHCI_CLASSES	(sizeof (vhci_class_list) / sizeof (char *))
6567 
6568 /*
6569  * Built-in list of phci drivers for every vhci class.
6570  * All phci drivers expect iscsi have root device support.
6571  */
6572 static mdi_phci_driver_info_t scsi_phci_driver_list[] = {
6573 	{ "fp", 1 },
6574 	{ "iscsi", 0 },
6575 	{ "ibsrp", 1 }
6576 	};
6577 
6578 static mdi_phci_driver_info_t ib_phci_driver_list[] = { "tavor", 1 };
6579 
6580 /*
6581  * During boot time, the on-disk vhci cache for every vhci class is read
6582  * in the form of an nvlist and stored here.
6583  */
6584 static nvlist_t *vhcache_nvl[N_VHCI_CLASSES];
6585 
6586 /* nvpair names in vhci cache nvlist */
6587 #define	MDI_VHCI_CACHE_VERSION	1
6588 #define	MDI_NVPNAME_VERSION	"version"
6589 #define	MDI_NVPNAME_PHCIS	"phcis"
6590 #define	MDI_NVPNAME_CTADDRMAP	"clientaddrmap"
6591 
6592 /*
6593  * Given vhci class name, return its on-disk vhci cache filename.
6594  * Memory for the returned filename which includes the full path is allocated
6595  * by this function.
6596  */
6597 static char *
6598 vhclass2vhcache_filename(char *vhclass)
6599 {
6600 	char *filename;
6601 	int len;
6602 	static char *fmt = "/etc/devices/mdi_%s_cache";
6603 
6604 	/*
6605 	 * fmt contains the on-disk vhci cache file name format;
6606 	 * for scsi_vhci the filename is "/etc/devices/mdi_scsi_vhci_cache".
6607 	 */
6608 
6609 	/* the -1 below is to account for "%s" in the format string */
6610 	len = strlen(fmt) + strlen(vhclass) - 1;
6611 	filename = kmem_alloc(len, KM_SLEEP);
6612 	(void) snprintf(filename, len, fmt, vhclass);
6613 	ASSERT(len == (strlen(filename) + 1));
6614 	return (filename);
6615 }
6616 
6617 /*
6618  * initialize the vhci cache related data structures and read the on-disk
6619  * vhci cached data into memory.
6620  */
6621 static void
6622 setup_vhci_cache(mdi_vhci_t *vh)
6623 {
6624 	mdi_vhci_config_t *vhc;
6625 	mdi_vhci_cache_t *vhcache;
6626 	int i;
6627 	nvlist_t *nvl = NULL;
6628 
6629 	vhc = kmem_zalloc(sizeof (mdi_vhci_config_t), KM_SLEEP);
6630 	vh->vh_config = vhc;
6631 	vhcache = &vhc->vhc_vhcache;
6632 
6633 	vhc->vhc_vhcache_filename = vhclass2vhcache_filename(vh->vh_class);
6634 
6635 	mutex_init(&vhc->vhc_lock, NULL, MUTEX_DEFAULT, NULL);
6636 	cv_init(&vhc->vhc_cv, NULL, CV_DRIVER, NULL);
6637 
6638 	rw_init(&vhcache->vhcache_lock, NULL, RW_DRIVER, NULL);
6639 
6640 	/*
6641 	 * Create string hash; same as mod_hash_create_strhash() except that
6642 	 * we use NULL key destructor.
6643 	 */
6644 	vhcache->vhcache_client_hash = mod_hash_create_extended(vh->vh_class,
6645 	    mdi_bus_config_cache_hash_size,
6646 	    mod_hash_null_keydtor, mod_hash_null_valdtor,
6647 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
6648 
6649 	setup_phci_driver_list(vh);
6650 
6651 	/*
6652 	 * The on-disk vhci cache is read during booting prior to the
6653 	 * lights-out period by mdi_read_devices_files().
6654 	 */
6655 	for (i = 0; i < N_VHCI_CLASSES; i++) {
6656 		if (strcmp(vhci_class_list[i], vh->vh_class) == 0) {
6657 			nvl = vhcache_nvl[i];
6658 			vhcache_nvl[i] = NULL;
6659 			break;
6660 		}
6661 	}
6662 
6663 	/*
6664 	 * this is to cover the case of some one manually causing unloading
6665 	 * (or detaching) and reloading (or attaching) of a vhci driver.
6666 	 */
6667 	if (nvl == NULL && modrootloaded)
6668 		nvl = read_on_disk_vhci_cache(vh->vh_class);
6669 
6670 	if (nvl != NULL) {
6671 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
6672 		if (mainnvl_to_vhcache(vhcache, nvl) == MDI_SUCCESS)
6673 			vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
6674 		else  {
6675 			cmn_err(CE_WARN,
6676 			    "%s: data file corrupted, will recreate\n",
6677 			    vhc->vhc_vhcache_filename);
6678 		}
6679 		rw_exit(&vhcache->vhcache_lock);
6680 		nvlist_free(nvl);
6681 	}
6682 
6683 	vhc->vhc_cbid = callb_add(stop_vhcache_flush_thread, vhc,
6684 	    CB_CL_UADMIN_PRE_VFS, "mdi_vhcache_flush");
6685 
6686 	vhc->vhc_path_discovery_boot = mdi_path_discovery_boot;
6687 	vhc->vhc_path_discovery_postboot = mdi_path_discovery_postboot;
6688 }
6689 
6690 /*
6691  * free all vhci cache related resources
6692  */
6693 static int
6694 destroy_vhci_cache(mdi_vhci_t *vh)
6695 {
6696 	mdi_vhci_config_t *vhc = vh->vh_config;
6697 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
6698 	mdi_vhcache_phci_t *cphci, *cphci_next;
6699 	mdi_vhcache_client_t *cct, *cct_next;
6700 	mdi_vhcache_pathinfo_t *cpi, *cpi_next;
6701 
6702 	if (stop_vhcache_async_threads(vhc) != MDI_SUCCESS)
6703 		return (MDI_FAILURE);
6704 
6705 	kmem_free(vhc->vhc_vhcache_filename,
6706 	    strlen(vhc->vhc_vhcache_filename) + 1);
6707 
6708 	if (vhc->vhc_phci_driver_list)
6709 		free_phci_driver_list(vhc);
6710 
6711 	mod_hash_destroy_strhash(vhcache->vhcache_client_hash);
6712 
6713 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
6714 	    cphci = cphci_next) {
6715 		cphci_next = cphci->cphci_next;
6716 		free_vhcache_phci(cphci);
6717 	}
6718 
6719 	for (cct = vhcache->vhcache_client_head; cct != NULL; cct = cct_next) {
6720 		cct_next = cct->cct_next;
6721 		for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi_next) {
6722 			cpi_next = cpi->cpi_next;
6723 			free_vhcache_pathinfo(cpi);
6724 		}
6725 		free_vhcache_client(cct);
6726 	}
6727 
6728 	rw_destroy(&vhcache->vhcache_lock);
6729 
6730 	mutex_destroy(&vhc->vhc_lock);
6731 	cv_destroy(&vhc->vhc_cv);
6732 	kmem_free(vhc, sizeof (mdi_vhci_config_t));
6733 	return (MDI_SUCCESS);
6734 }
6735 
6736 /*
6737  * Setup the list of phci drivers associated with the specified vhci class.
6738  * MDI uses this information to rebuild bus config cache if in case the
6739  * cache is not available or corrupted.
6740  */
6741 static void
6742 setup_phci_driver_list(mdi_vhci_t *vh)
6743 {
6744 	mdi_vhci_config_t *vhc = vh->vh_config;
6745 	mdi_phci_driver_info_t *driver_list;
6746 	char **driver_list1;
6747 	uint_t ndrivers, ndrivers1;
6748 	int i, j;
6749 
6750 	if (strcmp(vh->vh_class, MDI_HCI_CLASS_SCSI) == 0) {
6751 		driver_list = scsi_phci_driver_list;
6752 		ndrivers = sizeof (scsi_phci_driver_list) /
6753 		    sizeof (mdi_phci_driver_info_t);
6754 	} else if (strcmp(vh->vh_class, MDI_HCI_CLASS_IB) == 0) {
6755 		driver_list = ib_phci_driver_list;
6756 		ndrivers = sizeof (ib_phci_driver_list) /
6757 		    sizeof (mdi_phci_driver_info_t);
6758 	} else {
6759 		driver_list = NULL;
6760 		ndrivers = 0;
6761 	}
6762 
6763 	/*
6764 	 * The driver.conf file of a vhci driver can specify additional
6765 	 * phci drivers using a project private "phci-drivers" property.
6766 	 */
6767 	if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY, vh->vh_dip,
6768 	    DDI_PROP_DONTPASS, "phci-drivers", &driver_list1,
6769 	    &ndrivers1) != DDI_PROP_SUCCESS)
6770 		ndrivers1 = 0;
6771 
6772 	vhc->vhc_nphci_drivers = ndrivers + ndrivers1;
6773 	if (vhc->vhc_nphci_drivers == 0)
6774 		return;
6775 
6776 	vhc->vhc_phci_driver_list = kmem_alloc(
6777 	    sizeof (mdi_phci_driver_info_t) * vhc->vhc_nphci_drivers, KM_SLEEP);
6778 
6779 	for (i = 0; i < ndrivers; i++) {
6780 		vhc->vhc_phci_driver_list[i].phdriver_name =
6781 		    i_ddi_strdup(driver_list[i].phdriver_name, KM_SLEEP);
6782 		vhc->vhc_phci_driver_list[i].phdriver_root_support =
6783 		    driver_list[i].phdriver_root_support;
6784 	}
6785 
6786 	for (j = 0; j < ndrivers1; j++, i++) {
6787 		vhc->vhc_phci_driver_list[i].phdriver_name =
6788 		    i_ddi_strdup(driver_list1[j], KM_SLEEP);
6789 		vhc->vhc_phci_driver_list[i].phdriver_root_support = 1;
6790 	}
6791 
6792 	if (ndrivers1)
6793 		ddi_prop_free(driver_list1);
6794 }
6795 
6796 /*
6797  * Free the memory allocated for the phci driver list
6798  */
6799 static void
6800 free_phci_driver_list(mdi_vhci_config_t *vhc)
6801 {
6802 	int i;
6803 
6804 	if (vhc->vhc_phci_driver_list == NULL)
6805 		return;
6806 
6807 	for (i = 0; i < vhc->vhc_nphci_drivers; i++) {
6808 		kmem_free(vhc->vhc_phci_driver_list[i].phdriver_name,
6809 		    strlen(vhc->vhc_phci_driver_list[i].phdriver_name) + 1);
6810 	}
6811 
6812 	kmem_free(vhc->vhc_phci_driver_list,
6813 	    sizeof (mdi_phci_driver_info_t) * vhc->vhc_nphci_drivers);
6814 }
6815 
6816 /*
6817  * Stop all vhci cache related async threads and free their resources.
6818  */
6819 static int
6820 stop_vhcache_async_threads(mdi_vhci_config_t *vhc)
6821 {
6822 	mdi_async_client_config_t *acc, *acc_next;
6823 
6824 	mutex_enter(&vhc->vhc_lock);
6825 	vhc->vhc_flags |= MDI_VHC_EXIT;
6826 	ASSERT(vhc->vhc_acc_thrcount >= 0);
6827 	cv_broadcast(&vhc->vhc_cv);
6828 
6829 	while ((vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) ||
6830 	    vhc->vhc_acc_thrcount != 0) {
6831 		mutex_exit(&vhc->vhc_lock);
6832 		delay(1);
6833 		mutex_enter(&vhc->vhc_lock);
6834 	}
6835 
6836 	vhc->vhc_flags &= ~MDI_VHC_EXIT;
6837 
6838 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc_next) {
6839 		acc_next = acc->acc_next;
6840 		free_async_client_config(acc);
6841 	}
6842 	vhc->vhc_acc_list_head = NULL;
6843 	vhc->vhc_acc_list_tail = NULL;
6844 	vhc->vhc_acc_count = 0;
6845 
6846 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
6847 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
6848 		mutex_exit(&vhc->vhc_lock);
6849 		if (flush_vhcache(vhc, 0) != MDI_SUCCESS) {
6850 			vhcache_dirty(vhc);
6851 			return (MDI_FAILURE);
6852 		}
6853 	} else
6854 		mutex_exit(&vhc->vhc_lock);
6855 
6856 	if (callb_delete(vhc->vhc_cbid) != 0)
6857 		return (MDI_FAILURE);
6858 
6859 	return (MDI_SUCCESS);
6860 }
6861 
6862 /*
6863  * Stop vhci cache flush thread
6864  */
6865 /* ARGSUSED */
6866 static boolean_t
6867 stop_vhcache_flush_thread(void *arg, int code)
6868 {
6869 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
6870 
6871 	mutex_enter(&vhc->vhc_lock);
6872 	vhc->vhc_flags |= MDI_VHC_EXIT;
6873 	cv_broadcast(&vhc->vhc_cv);
6874 
6875 	while (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
6876 		mutex_exit(&vhc->vhc_lock);
6877 		delay(1);
6878 		mutex_enter(&vhc->vhc_lock);
6879 	}
6880 
6881 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
6882 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
6883 		mutex_exit(&vhc->vhc_lock);
6884 		(void) flush_vhcache(vhc, 1);
6885 	} else
6886 		mutex_exit(&vhc->vhc_lock);
6887 
6888 	return (B_TRUE);
6889 }
6890 
6891 /*
6892  * Enqueue the vhcache phci (cphci) at the tail of the list
6893  */
6894 static void
6895 enqueue_vhcache_phci(mdi_vhci_cache_t *vhcache, mdi_vhcache_phci_t *cphci)
6896 {
6897 	cphci->cphci_next = NULL;
6898 	if (vhcache->vhcache_phci_head == NULL)
6899 		vhcache->vhcache_phci_head = cphci;
6900 	else
6901 		vhcache->vhcache_phci_tail->cphci_next = cphci;
6902 	vhcache->vhcache_phci_tail = cphci;
6903 }
6904 
6905 /*
6906  * Enqueue the vhcache pathinfo (cpi) at the tail of the list
6907  */
6908 static void
6909 enqueue_tail_vhcache_pathinfo(mdi_vhcache_client_t *cct,
6910     mdi_vhcache_pathinfo_t *cpi)
6911 {
6912 	cpi->cpi_next = NULL;
6913 	if (cct->cct_cpi_head == NULL)
6914 		cct->cct_cpi_head = cpi;
6915 	else
6916 		cct->cct_cpi_tail->cpi_next = cpi;
6917 	cct->cct_cpi_tail = cpi;
6918 }
6919 
6920 /*
6921  * Enqueue the vhcache pathinfo (cpi) at the correct location in the
6922  * ordered list. All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
6923  * flag set come at the beginning of the list. All cpis which have this
6924  * flag set come at the end of the list.
6925  */
6926 static void
6927 enqueue_vhcache_pathinfo(mdi_vhcache_client_t *cct,
6928     mdi_vhcache_pathinfo_t *newcpi)
6929 {
6930 	mdi_vhcache_pathinfo_t *cpi, *prev_cpi;
6931 
6932 	if (cct->cct_cpi_head == NULL ||
6933 	    (newcpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))
6934 		enqueue_tail_vhcache_pathinfo(cct, newcpi);
6935 	else {
6936 		for (cpi = cct->cct_cpi_head, prev_cpi = NULL; cpi != NULL &&
6937 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST);
6938 		    prev_cpi = cpi, cpi = cpi->cpi_next)
6939 			;
6940 
6941 		if (prev_cpi == NULL)
6942 			cct->cct_cpi_head = newcpi;
6943 		else
6944 			prev_cpi->cpi_next = newcpi;
6945 
6946 		newcpi->cpi_next = cpi;
6947 
6948 		if (cpi == NULL)
6949 			cct->cct_cpi_tail = newcpi;
6950 	}
6951 }
6952 
6953 /*
6954  * Enqueue the vhcache client (cct) at the tail of the list
6955  */
6956 static void
6957 enqueue_vhcache_client(mdi_vhci_cache_t *vhcache,
6958     mdi_vhcache_client_t *cct)
6959 {
6960 	cct->cct_next = NULL;
6961 	if (vhcache->vhcache_client_head == NULL)
6962 		vhcache->vhcache_client_head = cct;
6963 	else
6964 		vhcache->vhcache_client_tail->cct_next = cct;
6965 	vhcache->vhcache_client_tail = cct;
6966 }
6967 
/*
 * Free an array of nelem strings: every non-NULL string in it and then the
 * array itself. A NULL array is ignored.
 */
static void
free_string_array(char **str, int nelem)
{
	int idx;

	if (str == NULL)
		return;

	for (idx = 0; idx < nelem; idx++) {
		if (str[idx] == NULL)
			continue;
		kmem_free(str[idx], strlen(str[idx]) + 1);
	}

	kmem_free(str, sizeof (char *) * nelem);
}
6981 
6982 static void
6983 free_vhcache_phci(mdi_vhcache_phci_t *cphci)
6984 {
6985 	kmem_free(cphci->cphci_path, strlen(cphci->cphci_path) + 1);
6986 	kmem_free(cphci, sizeof (*cphci));
6987 }
6988 
6989 static void
6990 free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *cpi)
6991 {
6992 	kmem_free(cpi->cpi_addr, strlen(cpi->cpi_addr) + 1);
6993 	kmem_free(cpi, sizeof (*cpi));
6994 }
6995 
6996 static void
6997 free_vhcache_client(mdi_vhcache_client_t *cct)
6998 {
6999 	kmem_free(cct->cct_name_addr, strlen(cct->cct_name_addr) + 1);
7000 	kmem_free(cct, sizeof (*cct));
7001 }
7002 
7003 static char *
7004 vhcache_mknameaddr(char *ct_name, char *ct_addr, int *ret_len)
7005 {
7006 	char *name_addr;
7007 	int len;
7008 
7009 	len = strlen(ct_name) + strlen(ct_addr) + 2;
7010 	name_addr = kmem_alloc(len, KM_SLEEP);
7011 	(void) snprintf(name_addr, len, "%s@%s", ct_name, ct_addr);
7012 
7013 	if (ret_len)
7014 		*ret_len = len;
7015 	return (name_addr);
7016 }
7017 
7018 /*
7019  * Copy the contents of paddrnvl to vhci cache.
7020  * paddrnvl nvlist contains path information for a vhci client.
7021  * See the comment in mainnvl_to_vhcache() for the format of this nvlist.
7022  */
7023 static void
7024 paddrnvl_to_vhcache(nvlist_t *nvl, mdi_vhcache_phci_t *cphci_list[],
7025     mdi_vhcache_client_t *cct)
7026 {
7027 	nvpair_t *nvp = NULL;
7028 	mdi_vhcache_pathinfo_t *cpi;
7029 	uint_t nelem;
7030 	uint32_t *val;
7031 
7032 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7033 		ASSERT(nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY);
7034 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
7035 		cpi->cpi_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7036 		(void) nvpair_value_uint32_array(nvp, &val, &nelem);
7037 		ASSERT(nelem == 2);
7038 		cpi->cpi_cphci = cphci_list[val[0]];
7039 		cpi->cpi_flags = val[1];
7040 		enqueue_tail_vhcache_pathinfo(cct, cpi);
7041 	}
7042 }
7043 
7044 /*
7045  * Copy the contents of caddrmapnvl to vhci cache.
7046  * caddrmapnvl nvlist contains vhci client address to phci client address
7047  * mappings. See the comment in mainnvl_to_vhcache() for the format of
7048  * this nvlist.
7049  */
7050 static void
7051 caddrmapnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl,
7052     mdi_vhcache_phci_t *cphci_list[])
7053 {
7054 	nvpair_t *nvp = NULL;
7055 	nvlist_t *paddrnvl;
7056 	mdi_vhcache_client_t *cct;
7057 
7058 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7059 		ASSERT(nvpair_type(nvp) == DATA_TYPE_NVLIST);
7060 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
7061 		cct->cct_name_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7062 		(void) nvpair_value_nvlist(nvp, &paddrnvl);
7063 		paddrnvl_to_vhcache(paddrnvl, cphci_list, cct);
7064 		/* the client must contain at least one path */
7065 		ASSERT(cct->cct_cpi_head != NULL);
7066 
7067 		enqueue_vhcache_client(vhcache, cct);
7068 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
7069 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
7070 	}
7071 }
7072 
7073 /*
7074  * Copy the contents of the main nvlist to vhci cache.
7075  *
7076  * VHCI busconfig cached data is stored in the form of a nvlist on the disk.
7077  * The nvlist contains the mappings between the vhci client addresses and
7078  * their corresponding phci client addresses.
7079  *
7080  * The structure of the nvlist is as follows:
7081  *
7082  * Main nvlist:
7083  *	NAME		TYPE		DATA
7084  *	version		int32		version number
7085  *	phcis		string array	array of phci paths
7086  *	clientaddrmap	nvlist_t	c2paddrs_nvl (see below)
7087  *
7088  * structure of c2paddrs_nvl:
7089  *	NAME		TYPE		DATA
7090  *	caddr1		nvlist_t	paddrs_nvl1
7091  *	caddr2		nvlist_t	paddrs_nvl2
7092  *	...
7093  * where caddr1, caddr2, ... are vhci client name and addresses in the
7094  * form of "<clientname>@<clientaddress>".
7095  * (for example: "ssd@2000002037cd9f72");
7096  * paddrs_nvl1, paddrs_nvl2, .. are nvlists that contain path information.
7097  *
7098  * structure of paddrs_nvl:
7099  *	NAME		TYPE		DATA
7100  *	pi_addr1	uint32_array	(phci-id, cpi_flags)
7101  *	pi_addr2	uint32_array	(phci-id, cpi_flags)
7102  *	...
7103  * where pi_addr1, pi_addr2, ... are bus specific addresses of pathinfo nodes
7104  * (so called pi_addrs, for example: "w2100002037cd9f72,0");
 * phci-ids are integers that identify the PHCIs to which the
 * bus specific addresses belong. These integers are used as an index
 * into the phcis string array in the main nvlist to get the PHCI path.
7108  */
7109 static int
7110 mainnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl)
7111 {
7112 	char **phcis, **phci_namep;
7113 	uint_t nphcis;
7114 	mdi_vhcache_phci_t *cphci, **cphci_list;
7115 	nvlist_t *caddrmapnvl;
7116 	int32_t ver;
7117 	int i;
7118 	size_t cphci_list_size;
7119 
7120 	ASSERT(RW_WRITE_HELD(&vhcache->vhcache_lock));
7121 
7122 	if (nvlist_lookup_int32(nvl, MDI_NVPNAME_VERSION, &ver) != 0 ||
7123 	    ver != MDI_VHCI_CACHE_VERSION)
7124 		return (MDI_FAILURE);
7125 
7126 	if (nvlist_lookup_string_array(nvl, MDI_NVPNAME_PHCIS, &phcis,
7127 	    &nphcis) != 0)
7128 		return (MDI_SUCCESS);
7129 
7130 	ASSERT(nphcis > 0);
7131 
7132 	cphci_list_size = sizeof (mdi_vhcache_phci_t *) * nphcis;
7133 	cphci_list = kmem_alloc(cphci_list_size, KM_SLEEP);
7134 	for (i = 0, phci_namep = phcis; i < nphcis; i++, phci_namep++) {
7135 		cphci = kmem_zalloc(sizeof (mdi_vhcache_phci_t), KM_SLEEP);
7136 		cphci->cphci_path = i_ddi_strdup(*phci_namep, KM_SLEEP);
7137 		enqueue_vhcache_phci(vhcache, cphci);
7138 		cphci_list[i] = cphci;
7139 	}
7140 
7141 	ASSERT(vhcache->vhcache_phci_head != NULL);
7142 
7143 	if (nvlist_lookup_nvlist(nvl, MDI_NVPNAME_CTADDRMAP, &caddrmapnvl) == 0)
7144 		caddrmapnvl_to_vhcache(vhcache, caddrmapnvl, cphci_list);
7145 
7146 	kmem_free(cphci_list, cphci_list_size);
7147 	return (MDI_SUCCESS);
7148 }
7149 
7150 /*
7151  * Build paddrnvl for the specified client using the information in the
7152  * vhci cache and add it to the caddrmapnnvl.
7153  * Returns 0 on success, errno on failure.
7154  */
7155 static int
7156 vhcache_to_paddrnvl(mdi_vhci_cache_t *vhcache, mdi_vhcache_client_t *cct,
7157     nvlist_t *caddrmapnvl)
7158 {
7159 	mdi_vhcache_pathinfo_t *cpi;
7160 	nvlist_t *nvl;
7161 	int err;
7162 	uint32_t val[2];
7163 
7164 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7165 
7166 	if ((err = nvlist_alloc(&nvl, 0, KM_SLEEP)) != 0)
7167 		return (err);
7168 
7169 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7170 		val[0] = cpi->cpi_cphci->cphci_id;
7171 		val[1] = cpi->cpi_flags;
7172 		if ((err = nvlist_add_uint32_array(nvl, cpi->cpi_addr, val, 2))
7173 		    != 0)
7174 			goto out;
7175 	}
7176 
7177 	err = nvlist_add_nvlist(caddrmapnvl, cct->cct_name_addr, nvl);
7178 out:
7179 	nvlist_free(nvl);
7180 	return (err);
7181 }
7182 
7183 /*
7184  * Build caddrmapnvl using the information in the vhci cache
7185  * and add it to the mainnvl.
7186  * Returns 0 on success, errno on failure.
7187  */
7188 static int
7189 vhcache_to_caddrmapnvl(mdi_vhci_cache_t *vhcache, nvlist_t *mainnvl)
7190 {
7191 	mdi_vhcache_client_t *cct;
7192 	nvlist_t *nvl;
7193 	int err;
7194 
7195 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7196 
7197 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0)
7198 		return (err);
7199 
7200 	for (cct = vhcache->vhcache_client_head; cct != NULL;
7201 	    cct = cct->cct_next) {
7202 		if ((err = vhcache_to_paddrnvl(vhcache, cct, nvl)) != 0)
7203 			goto out;
7204 	}
7205 
7206 	err = nvlist_add_nvlist(mainnvl, MDI_NVPNAME_CTADDRMAP, nvl);
7207 out:
7208 	nvlist_free(nvl);
7209 	return (err);
7210 }
7211 
7212 /*
7213  * Build nvlist using the information in the vhci cache.
7214  * See the comment in mainnvl_to_vhcache() for the format of the nvlist.
7215  * Returns nvl on success, NULL on failure.
7216  */
7217 static nvlist_t *
7218 vhcache_to_mainnvl(mdi_vhci_cache_t *vhcache)
7219 {
7220 	mdi_vhcache_phci_t *cphci;
7221 	uint_t phci_count;
7222 	char **phcis;
7223 	nvlist_t *nvl;
7224 	int err, i;
7225 
7226 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0) {
7227 		nvl = NULL;
7228 		goto out;
7229 	}
7230 
7231 	if ((err = nvlist_add_int32(nvl, MDI_NVPNAME_VERSION,
7232 	    MDI_VHCI_CACHE_VERSION)) != 0)
7233 		goto out;
7234 
7235 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7236 	if (vhcache->vhcache_phci_head == NULL) {
7237 		rw_exit(&vhcache->vhcache_lock);
7238 		return (nvl);
7239 	}
7240 
7241 	phci_count = 0;
7242 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7243 	    cphci = cphci->cphci_next)
7244 		cphci->cphci_id = phci_count++;
7245 
7246 	/* build phci pathname list */
7247 	phcis = kmem_alloc(sizeof (char *) * phci_count, KM_SLEEP);
7248 	for (cphci = vhcache->vhcache_phci_head, i = 0; cphci != NULL;
7249 	    cphci = cphci->cphci_next, i++)
7250 		phcis[i] = i_ddi_strdup(cphci->cphci_path, KM_SLEEP);
7251 
7252 	err = nvlist_add_string_array(nvl, MDI_NVPNAME_PHCIS, phcis,
7253 	    phci_count);
7254 	free_string_array(phcis, phci_count);
7255 
7256 	if (err == 0 &&
7257 	    (err = vhcache_to_caddrmapnvl(vhcache, nvl)) == 0) {
7258 		rw_exit(&vhcache->vhcache_lock);
7259 		return (nvl);
7260 	}
7261 
7262 	rw_exit(&vhcache->vhcache_lock);
7263 out:
7264 	if (nvl)
7265 		nvlist_free(nvl);
7266 	return (NULL);
7267 }
7268 
7269 /*
7270  * Lookup vhcache phci structure for the specified phci path.
7271  */
7272 static mdi_vhcache_phci_t *
7273 lookup_vhcache_phci_by_name(mdi_vhci_cache_t *vhcache, char *phci_path)
7274 {
7275 	mdi_vhcache_phci_t *cphci;
7276 
7277 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7278 
7279 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7280 	    cphci = cphci->cphci_next) {
7281 		if (strcmp(cphci->cphci_path, phci_path) == 0)
7282 			return (cphci);
7283 	}
7284 
7285 	return (NULL);
7286 }
7287 
7288 /*
7289  * Lookup vhcache phci structure for the specified phci.
7290  */
7291 static mdi_vhcache_phci_t *
7292 lookup_vhcache_phci_by_addr(mdi_vhci_cache_t *vhcache, mdi_phci_t *ph)
7293 {
7294 	mdi_vhcache_phci_t *cphci;
7295 
7296 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7297 
7298 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7299 	    cphci = cphci->cphci_next) {
7300 		if (cphci->cphci_phci == ph)
7301 			return (cphci);
7302 	}
7303 
7304 	return (NULL);
7305 }
7306 
7307 /*
7308  * Add the specified phci to the vhci cache if not already present.
7309  */
7310 static void
7311 vhcache_phci_add(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7312 {
7313 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7314 	mdi_vhcache_phci_t *cphci;
7315 	char *pathname;
7316 	int cache_updated;
7317 
7318 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7319 
7320 	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
7321 	(void) ddi_pathname(ph->ph_dip, pathname);
7322 	if ((cphci = lookup_vhcache_phci_by_name(vhcache, pathname))
7323 	    != NULL) {
7324 		cphci->cphci_phci = ph;
7325 		cache_updated = 0;
7326 	} else {
7327 		cphci = kmem_zalloc(sizeof (*cphci), KM_SLEEP);
7328 		cphci->cphci_path = i_ddi_strdup(pathname, KM_SLEEP);
7329 		cphci->cphci_phci = ph;
7330 		enqueue_vhcache_phci(vhcache, cphci);
7331 		cache_updated = 1;
7332 	}
7333 
7334 	rw_exit(&vhcache->vhcache_lock);
7335 
7336 	/*
7337 	 * Since a new phci has been added, reset
7338 	 * vhc_path_discovery_cutoff_time to allow for discovery of paths
7339 	 * during next vhcache_discover_paths().
7340 	 */
7341 	mutex_enter(&vhc->vhc_lock);
7342 	vhc->vhc_path_discovery_cutoff_time = 0;
7343 	mutex_exit(&vhc->vhc_lock);
7344 
7345 	kmem_free(pathname, MAXPATHLEN);
7346 	if (cache_updated)
7347 		vhcache_dirty(vhc);
7348 }
7349 
7350 /*
7351  * Remove the reference to the specified phci from the vhci cache.
7352  */
7353 static void
7354 vhcache_phci_remove(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7355 {
7356 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7357 	mdi_vhcache_phci_t *cphci;
7358 
7359 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7360 	if ((cphci = lookup_vhcache_phci_by_addr(vhcache, ph)) != NULL) {
7361 		/* do not remove the actual mdi_vhcache_phci structure */
7362 		cphci->cphci_phci = NULL;
7363 	}
7364 	rw_exit(&vhcache->vhcache_lock);
7365 }
7366 
7367 static void
7368 init_vhcache_lookup_token(mdi_vhcache_lookup_token_t *dst,
7369     mdi_vhcache_lookup_token_t *src)
7370 {
7371 	if (src == NULL) {
7372 		dst->lt_cct = NULL;
7373 		dst->lt_cct_lookup_time = 0;
7374 	} else {
7375 		dst->lt_cct = src->lt_cct;
7376 		dst->lt_cct_lookup_time = src->lt_cct_lookup_time;
7377 	}
7378 }
7379 
7380 /*
7381  * Look up vhcache client for the specified client.
7382  */
7383 static mdi_vhcache_client_t *
7384 lookup_vhcache_client(mdi_vhci_cache_t *vhcache, char *ct_name, char *ct_addr,
7385     mdi_vhcache_lookup_token_t *token)
7386 {
7387 	mod_hash_val_t hv;
7388 	char *name_addr;
7389 	int len;
7390 
7391 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7392 
7393 	/*
7394 	 * If no vhcache clean occurred since the last lookup, we can
7395 	 * simply return the cct from the last lookup operation.
7396 	 * It works because ccts are never freed except during the vhcache
7397 	 * cleanup operation.
7398 	 */
7399 	if (token != NULL &&
7400 	    vhcache->vhcache_clean_time < token->lt_cct_lookup_time)
7401 		return (token->lt_cct);
7402 
7403 	name_addr = vhcache_mknameaddr(ct_name, ct_addr, &len);
7404 	if (mod_hash_find(vhcache->vhcache_client_hash,
7405 	    (mod_hash_key_t)name_addr, &hv) == 0) {
7406 		if (token) {
7407 			token->lt_cct = (mdi_vhcache_client_t *)hv;
7408 			token->lt_cct_lookup_time = lbolt64;
7409 		}
7410 	} else {
7411 		if (token) {
7412 			token->lt_cct = NULL;
7413 			token->lt_cct_lookup_time = 0;
7414 		}
7415 		hv = NULL;
7416 	}
7417 	kmem_free(name_addr, len);
7418 	return ((mdi_vhcache_client_t *)hv);
7419 }
7420 
7421 /*
7422  * Add the specified path to the vhci cache if not already present.
7423  * Also add the vhcache client for the client corresponding to this path
7424  * if it doesn't already exist.
7425  */
7426 static void
7427 vhcache_pi_add(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
7428 {
7429 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7430 	mdi_vhcache_client_t *cct;
7431 	mdi_vhcache_pathinfo_t *cpi;
7432 	mdi_phci_t *ph = pip->pi_phci;
7433 	mdi_client_t *ct = pip->pi_client;
7434 	int cache_updated = 0;
7435 
7436 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7437 
7438 	/* if vhcache client for this pip doesn't already exist, add it */
7439 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
7440 	    NULL)) == NULL) {
7441 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
7442 		cct->cct_name_addr = vhcache_mknameaddr(ct->ct_drvname,
7443 		    ct->ct_guid, NULL);
7444 		enqueue_vhcache_client(vhcache, cct);
7445 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
7446 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
7447 		cache_updated = 1;
7448 	}
7449 
7450 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7451 		if (cpi->cpi_cphci->cphci_phci == ph &&
7452 		    strcmp(cpi->cpi_addr, pip->pi_addr) == 0) {
7453 			cpi->cpi_pip = pip;
7454 			if (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST) {
7455 				cpi->cpi_flags &=
7456 				    ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
7457 				sort_vhcache_paths(cct);
7458 				cache_updated = 1;
7459 			}
7460 			break;
7461 		}
7462 	}
7463 
7464 	if (cpi == NULL) {
7465 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
7466 		cpi->cpi_addr = i_ddi_strdup(pip->pi_addr, KM_SLEEP);
7467 		cpi->cpi_cphci = lookup_vhcache_phci_by_addr(vhcache, ph);
7468 		ASSERT(cpi->cpi_cphci != NULL);
7469 		cpi->cpi_pip = pip;
7470 		enqueue_vhcache_pathinfo(cct, cpi);
7471 		cache_updated = 1;
7472 	}
7473 
7474 	rw_exit(&vhcache->vhcache_lock);
7475 
7476 	if (cache_updated)
7477 		vhcache_dirty(vhc);
7478 }
7479 
7480 /*
7481  * Remove the reference to the specified path from the vhci cache.
7482  */
7483 static void
7484 vhcache_pi_remove(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
7485 {
7486 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7487 	mdi_client_t *ct = pip->pi_client;
7488 	mdi_vhcache_client_t *cct;
7489 	mdi_vhcache_pathinfo_t *cpi;
7490 
7491 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7492 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
7493 	    NULL)) != NULL) {
7494 		for (cpi = cct->cct_cpi_head; cpi != NULL;
7495 		    cpi = cpi->cpi_next) {
7496 			if (cpi->cpi_pip == pip) {
7497 				cpi->cpi_pip = NULL;
7498 				break;
7499 			}
7500 		}
7501 	}
7502 	rw_exit(&vhcache->vhcache_lock);
7503 }
7504 
7505 /*
7506  * Flush the vhci cache to disk.
7507  * Returns MDI_SUCCESS on success, MDI_FAILURE on failure.
7508  */
7509 static int
7510 flush_vhcache(mdi_vhci_config_t *vhc, int force_flag)
7511 {
7512 	nvlist_t *nvl;
7513 	int err;
7514 	int rv;
7515 
7516 	/*
7517 	 * It is possible that the system may shutdown before
7518 	 * i_ddi_io_initialized (during stmsboot for example). To allow for
7519 	 * flushing the cache in this case do not check for
7520 	 * i_ddi_io_initialized when force flag is set.
7521 	 */
7522 	if (force_flag == 0 && !i_ddi_io_initialized())
7523 		return (MDI_FAILURE);
7524 
7525 	if ((nvl = vhcache_to_mainnvl(&vhc->vhc_vhcache)) != NULL) {
7526 		err = fwrite_nvlist(vhc->vhc_vhcache_filename, nvl);
7527 		nvlist_free(nvl);
7528 	} else
7529 		err = EFAULT;
7530 
7531 	rv = MDI_SUCCESS;
7532 	mutex_enter(&vhc->vhc_lock);
7533 	if (err != 0) {
7534 		if (err == EROFS) {
7535 			vhc->vhc_flags |= MDI_VHC_READONLY_FS;
7536 			vhc->vhc_flags &= ~(MDI_VHC_VHCACHE_FLUSH_ERROR |
7537 			    MDI_VHC_VHCACHE_DIRTY);
7538 		} else {
7539 			if (!(vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR)) {
7540 				cmn_err(CE_CONT, "%s: update failed\n",
7541 				    vhc->vhc_vhcache_filename);
7542 				vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_ERROR;
7543 			}
7544 			rv = MDI_FAILURE;
7545 		}
7546 	} else if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR) {
7547 		cmn_err(CE_CONT,
7548 		    "%s: update now ok\n", vhc->vhc_vhcache_filename);
7549 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_ERROR;
7550 	}
7551 	mutex_exit(&vhc->vhc_lock);
7552 
7553 	return (rv);
7554 }
7555 
7556 /*
7557  * Call flush_vhcache() to flush the vhci cache at the scheduled time.
7558  * Exits itself if left idle for the idle timeout period.
7559  */
7560 static void
7561 vhcache_flush_thread(void *arg)
7562 {
7563 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
7564 	clock_t idle_time, quit_at_ticks;
7565 	callb_cpr_t cprinfo;
7566 
7567 	/* number of seconds to sleep idle before exiting */
7568 	idle_time = mdi_vhcache_flush_daemon_idle_time * TICKS_PER_SECOND;
7569 
7570 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
7571 	    "mdi_vhcache_flush");
7572 	mutex_enter(&vhc->vhc_lock);
7573 	for (; ; ) {
7574 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
7575 		    (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY)) {
7576 			if (ddi_get_lbolt() < vhc->vhc_flush_at_ticks) {
7577 				CALLB_CPR_SAFE_BEGIN(&cprinfo);
7578 				(void) cv_timedwait(&vhc->vhc_cv,
7579 				    &vhc->vhc_lock, vhc->vhc_flush_at_ticks);
7580 				CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
7581 			} else {
7582 				vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
7583 				mutex_exit(&vhc->vhc_lock);
7584 
7585 				if (flush_vhcache(vhc, 0) != MDI_SUCCESS)
7586 					vhcache_dirty(vhc);
7587 
7588 				mutex_enter(&vhc->vhc_lock);
7589 			}
7590 		}
7591 
7592 		quit_at_ticks = ddi_get_lbolt() + idle_time;
7593 
7594 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
7595 		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) &&
7596 		    ddi_get_lbolt() < quit_at_ticks) {
7597 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
7598 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
7599 			    quit_at_ticks);
7600 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
7601 		}
7602 
7603 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
7604 		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY))
7605 			goto out;
7606 	}
7607 
7608 out:
7609 	vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_THREAD;
7610 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
7611 	CALLB_CPR_EXIT(&cprinfo);
7612 }
7613 
7614 /*
7615  * Make vhci cache dirty and schedule flushing by vhcache flush thread.
7616  */
7617 static void
7618 vhcache_dirty(mdi_vhci_config_t *vhc)
7619 {
7620 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7621 	int create_thread;
7622 
7623 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7624 	/* do not flush cache until the cache is fully built */
7625 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
7626 		rw_exit(&vhcache->vhcache_lock);
7627 		return;
7628 	}
7629 	rw_exit(&vhcache->vhcache_lock);
7630 
7631 	mutex_enter(&vhc->vhc_lock);
7632 	if (vhc->vhc_flags & MDI_VHC_READONLY_FS) {
7633 		mutex_exit(&vhc->vhc_lock);
7634 		return;
7635 	}
7636 
7637 	vhc->vhc_flags |= MDI_VHC_VHCACHE_DIRTY;
7638 	vhc->vhc_flush_at_ticks = ddi_get_lbolt() +
7639 	    mdi_vhcache_flush_delay * TICKS_PER_SECOND;
7640 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
7641 		cv_broadcast(&vhc->vhc_cv);
7642 		create_thread = 0;
7643 	} else {
7644 		vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_THREAD;
7645 		create_thread = 1;
7646 	}
7647 	mutex_exit(&vhc->vhc_lock);
7648 
7649 	if (create_thread)
7650 		(void) thread_create(NULL, 0, vhcache_flush_thread, vhc,
7651 		    0, &p0, TS_RUN, minclsyspri);
7652 }
7653 
7654 /*
7655  * phci bus config structure - one for for each phci bus config operation that
7656  * we initiate on behalf of a vhci.
7657  */
7658 typedef struct mdi_phci_bus_config_s {
7659 	char *phbc_phci_path;
7660 	struct mdi_vhci_bus_config_s *phbc_vhbusconfig;	/* vhci bus config */
7661 	struct mdi_phci_bus_config_s *phbc_next;
7662 } mdi_phci_bus_config_t;
7663 
7664 /* vhci bus config structure - one for each vhci bus config operation */
7665 typedef struct mdi_vhci_bus_config_s {
7666 	ddi_bus_config_op_t vhbc_op;	/* bus config op */
7667 	major_t vhbc_op_major;		/* bus config op major */
7668 	uint_t vhbc_op_flags;		/* bus config op flags */
7669 	kmutex_t vhbc_lock;
7670 	kcondvar_t vhbc_cv;
7671 	int vhbc_thr_count;
7672 } mdi_vhci_bus_config_t;
7673 
7674 /*
7675  * bus config the specified phci
7676  */
7677 static void
7678 bus_config_phci(void *arg)
7679 {
7680 	mdi_phci_bus_config_t *phbc = (mdi_phci_bus_config_t *)arg;
7681 	mdi_vhci_bus_config_t *vhbc = phbc->phbc_vhbusconfig;
7682 	dev_info_t *ph_dip;
7683 
7684 	/*
7685 	 * first configure all path components upto phci and then configure
7686 	 * the phci children.
7687 	 */
7688 	if ((ph_dip = e_ddi_hold_devi_by_path(phbc->phbc_phci_path, 0))
7689 	    != NULL) {
7690 		if (vhbc->vhbc_op == BUS_CONFIG_DRIVER ||
7691 		    vhbc->vhbc_op == BUS_UNCONFIG_DRIVER) {
7692 			(void) ndi_devi_config_driver(ph_dip,
7693 			    vhbc->vhbc_op_flags,
7694 			    vhbc->vhbc_op_major);
7695 		} else
7696 			(void) ndi_devi_config(ph_dip,
7697 			    vhbc->vhbc_op_flags);
7698 
7699 		/* release the hold that e_ddi_hold_devi_by_path() placed */
7700 		ndi_rele_devi(ph_dip);
7701 	}
7702 
7703 	kmem_free(phbc->phbc_phci_path, strlen(phbc->phbc_phci_path) + 1);
7704 	kmem_free(phbc, sizeof (*phbc));
7705 
7706 	mutex_enter(&vhbc->vhbc_lock);
7707 	vhbc->vhbc_thr_count--;
7708 	if (vhbc->vhbc_thr_count == 0)
7709 		cv_broadcast(&vhbc->vhbc_cv);
7710 	mutex_exit(&vhbc->vhbc_lock);
7711 }
7712 
7713 /*
7714  * Bus config all phcis associated with the vhci in parallel.
7715  * op must be BUS_CONFIG_DRIVER or BUS_CONFIG_ALL.
7716  */
7717 static void
7718 bus_config_all_phcis(mdi_vhci_cache_t *vhcache, uint_t flags,
7719     ddi_bus_config_op_t op, major_t maj)
7720 {
7721 	mdi_phci_bus_config_t *phbc_head = NULL, *phbc, *phbc_next;
7722 	mdi_vhci_bus_config_t *vhbc;
7723 	mdi_vhcache_phci_t *cphci;
7724 
7725 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7726 	if (vhcache->vhcache_phci_head == NULL) {
7727 		rw_exit(&vhcache->vhcache_lock);
7728 		return;
7729 	}
7730 
7731 	vhbc = kmem_zalloc(sizeof (*vhbc), KM_SLEEP);
7732 
7733 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7734 	    cphci = cphci->cphci_next) {
7735 		phbc = kmem_zalloc(sizeof (*phbc), KM_SLEEP);
7736 		phbc->phbc_phci_path = i_ddi_strdup(cphci->cphci_path,
7737 		    KM_SLEEP);
7738 		phbc->phbc_vhbusconfig = vhbc;
7739 		phbc->phbc_next = phbc_head;
7740 		phbc_head = phbc;
7741 		vhbc->vhbc_thr_count++;
7742 	}
7743 	rw_exit(&vhcache->vhcache_lock);
7744 
7745 	vhbc->vhbc_op = op;
7746 	vhbc->vhbc_op_major = maj;
7747 	vhbc->vhbc_op_flags = NDI_NO_EVENT |
7748 	    (flags & (NDI_CONFIG_REPROBE | NDI_DRV_CONF_REPROBE));
7749 	mutex_init(&vhbc->vhbc_lock, NULL, MUTEX_DEFAULT, NULL);
7750 	cv_init(&vhbc->vhbc_cv, NULL, CV_DRIVER, NULL);
7751 
7752 	/* now create threads to initiate bus config on all phcis in parallel */
7753 	for (phbc = phbc_head; phbc != NULL; phbc = phbc_next) {
7754 		phbc_next = phbc->phbc_next;
7755 		if (mdi_mtc_off)
7756 			bus_config_phci((void *)phbc);
7757 		else
7758 			(void) thread_create(NULL, 0, bus_config_phci, phbc,
7759 			    0, &p0, TS_RUN, minclsyspri);
7760 	}
7761 
7762 	mutex_enter(&vhbc->vhbc_lock);
7763 	/* wait until all threads exit */
7764 	while (vhbc->vhbc_thr_count > 0)
7765 		cv_wait(&vhbc->vhbc_cv, &vhbc->vhbc_lock);
7766 	mutex_exit(&vhbc->vhbc_lock);
7767 
7768 	mutex_destroy(&vhbc->vhbc_lock);
7769 	cv_destroy(&vhbc->vhbc_cv);
7770 	kmem_free(vhbc, sizeof (*vhbc));
7771 }
7772 
7773 /*
7774  * Single threaded version of bus_config_all_phcis()
7775  */
7776 static void
7777 st_bus_config_all_phcis(mdi_vhci_config_t *vhc, uint_t flags,
7778     ddi_bus_config_op_t op, major_t maj)
7779 {
7780 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7781 
7782 	single_threaded_vhconfig_enter(vhc);
7783 	bus_config_all_phcis(vhcache, flags, op, maj);
7784 	single_threaded_vhconfig_exit(vhc);
7785 }
7786 
7787 /*
7788  * Perform BUS_CONFIG_ONE on the specified child of the phci.
7789  * The path includes the child component in addition to the phci path.
7790  */
7791 static int
7792 bus_config_one_phci_child(char *path)
7793 {
7794 	dev_info_t *ph_dip, *child;
7795 	char *devnm;
7796 	int rv = MDI_FAILURE;
7797 
7798 	/* extract the child component of the phci */
7799 	devnm = strrchr(path, '/');
7800 	*devnm++ = '\0';
7801 
7802 	/*
7803 	 * first configure all path components upto phci and then
7804 	 * configure the phci child.
7805 	 */
7806 	if ((ph_dip = e_ddi_hold_devi_by_path(path, 0)) != NULL) {
7807 		if (ndi_devi_config_one(ph_dip, devnm, &child, NDI_NO_EVENT) ==
7808 		    NDI_SUCCESS) {
7809 			/*
7810 			 * release the hold that ndi_devi_config_one() placed
7811 			 */
7812 			ndi_rele_devi(child);
7813 			rv = MDI_SUCCESS;
7814 		}
7815 
7816 		/* release the hold that e_ddi_hold_devi_by_path() placed */
7817 		ndi_rele_devi(ph_dip);
7818 	}
7819 
7820 	devnm--;
7821 	*devnm = '/';
7822 	return (rv);
7823 }
7824 
7825 /*
7826  * Build a list of phci client paths for the specified vhci client.
7827  * The list includes only those phci client paths which aren't configured yet.
7828  */
7829 static mdi_phys_path_t *
7830 build_phclient_path_list(mdi_vhcache_client_t *cct, char *ct_name)
7831 {
7832 	mdi_vhcache_pathinfo_t *cpi;
7833 	mdi_phys_path_t *pp_head = NULL, *pp_tail = NULL, *pp;
7834 	int config_path, len;
7835 
7836 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7837 		/*
7838 		 * include only those paths that aren't configured.
7839 		 */
7840 		config_path = 0;
7841 		if (cpi->cpi_pip == NULL)
7842 			config_path = 1;
7843 		else {
7844 			MDI_PI_LOCK(cpi->cpi_pip);
7845 			if (MDI_PI_IS_INIT(cpi->cpi_pip))
7846 				config_path = 1;
7847 			MDI_PI_UNLOCK(cpi->cpi_pip);
7848 		}
7849 
7850 		if (config_path) {
7851 			pp = kmem_alloc(sizeof (*pp), KM_SLEEP);
7852 			len = strlen(cpi->cpi_cphci->cphci_path) +
7853 			    strlen(ct_name) + strlen(cpi->cpi_addr) + 3;
7854 			pp->phys_path = kmem_alloc(len, KM_SLEEP);
7855 			(void) snprintf(pp->phys_path, len, "%s/%s@%s",
7856 			    cpi->cpi_cphci->cphci_path, ct_name,
7857 			    cpi->cpi_addr);
7858 			pp->phys_path_next = NULL;
7859 
7860 			if (pp_head == NULL)
7861 				pp_head = pp;
7862 			else
7863 				pp_tail->phys_path_next = pp;
7864 			pp_tail = pp;
7865 		}
7866 	}
7867 
7868 	return (pp_head);
7869 }
7870 
7871 /*
7872  * Free the memory allocated for phci client path list.
7873  */
7874 static void
7875 free_phclient_path_list(mdi_phys_path_t *pp_head)
7876 {
7877 	mdi_phys_path_t *pp, *pp_next;
7878 
7879 	for (pp = pp_head; pp != NULL; pp = pp_next) {
7880 		pp_next = pp->phys_path_next;
7881 		kmem_free(pp->phys_path, strlen(pp->phys_path) + 1);
7882 		kmem_free(pp, sizeof (*pp));
7883 	}
7884 }
7885 
7886 /*
7887  * Allocated async client structure and initialize with the specified values.
7888  */
7889 static mdi_async_client_config_t *
7890 alloc_async_client_config(char *ct_name, char *ct_addr,
7891     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
7892 {
7893 	mdi_async_client_config_t *acc;
7894 
7895 	acc = kmem_alloc(sizeof (*acc), KM_SLEEP);
7896 	acc->acc_ct_name = i_ddi_strdup(ct_name, KM_SLEEP);
7897 	acc->acc_ct_addr = i_ddi_strdup(ct_addr, KM_SLEEP);
7898 	acc->acc_phclient_path_list_head = pp_head;
7899 	init_vhcache_lookup_token(&acc->acc_token, tok);
7900 	acc->acc_next = NULL;
7901 	return (acc);
7902 }
7903 
7904 /*
7905  * Free the memory allocated for the async client structure and their members.
7906  */
7907 static void
7908 free_async_client_config(mdi_async_client_config_t *acc)
7909 {
7910 	if (acc->acc_phclient_path_list_head)
7911 		free_phclient_path_list(acc->acc_phclient_path_list_head);
7912 	kmem_free(acc->acc_ct_name, strlen(acc->acc_ct_name) + 1);
7913 	kmem_free(acc->acc_ct_addr, strlen(acc->acc_ct_addr) + 1);
7914 	kmem_free(acc, sizeof (*acc));
7915 }
7916 
7917 /*
7918  * Sort vhcache pathinfos (cpis) of the specified client.
7919  * All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
7920  * flag set come at the beginning of the list. All cpis which have this
7921  * flag set come at the end of the list.
7922  */
7923 static void
7924 sort_vhcache_paths(mdi_vhcache_client_t *cct)
7925 {
7926 	mdi_vhcache_pathinfo_t *cpi, *cpi_next, *cpi_head;
7927 
7928 	cpi_head = cct->cct_cpi_head;
7929 	cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
7930 	for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
7931 		cpi_next = cpi->cpi_next;
7932 		enqueue_vhcache_pathinfo(cct, cpi);
7933 	}
7934 }
7935 
7936 /*
7937  * Verify whether MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag setting is correct for
7938  * every vhcache pathinfo of the specified client. If not adjust the flag
7939  * setting appropriately.
7940  *
7941  * Note that MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag is persisted in the
7942  * on-disk vhci cache. So every time this flag is updated the cache must be
7943  * flushed.
7944  */
7945 static void
7946 adjust_sort_vhcache_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
7947     mdi_vhcache_lookup_token_t *tok)
7948 {
7949 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7950 	mdi_vhcache_client_t *cct;
7951 	mdi_vhcache_pathinfo_t *cpi;
7952 
7953 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7954 	if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, tok))
7955 	    == NULL) {
7956 		rw_exit(&vhcache->vhcache_lock);
7957 		return;
7958 	}
7959 
7960 	/*
7961 	 * to avoid unnecessary on-disk cache updates, first check if an
7962 	 * update is really needed. If no update is needed simply return.
7963 	 */
7964 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7965 		if ((cpi->cpi_pip != NULL &&
7966 		    (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST)) ||
7967 		    (cpi->cpi_pip == NULL &&
7968 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))) {
7969 			break;
7970 		}
7971 	}
7972 	if (cpi == NULL) {
7973 		rw_exit(&vhcache->vhcache_lock);
7974 		return;
7975 	}
7976 
7977 	if (rw_tryupgrade(&vhcache->vhcache_lock) == 0) {
7978 		rw_exit(&vhcache->vhcache_lock);
7979 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7980 		if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr,
7981 		    tok)) == NULL) {
7982 			rw_exit(&vhcache->vhcache_lock);
7983 			return;
7984 		}
7985 	}
7986 
7987 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7988 		if (cpi->cpi_pip != NULL)
7989 			cpi->cpi_flags &= ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
7990 		else
7991 			cpi->cpi_flags |= MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
7992 	}
7993 	sort_vhcache_paths(cct);
7994 
7995 	rw_exit(&vhcache->vhcache_lock);
7996 	vhcache_dirty(vhc);
7997 }
7998 
7999 /*
8000  * Configure all specified paths of the client.
8001  */
8002 static void
8003 config_client_paths_sync(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8004     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8005 {
8006 	mdi_phys_path_t *pp;
8007 
8008 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next)
8009 		(void) bus_config_one_phci_child(pp->phys_path);
8010 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, tok);
8011 }
8012 
8013 /*
8014  * Dequeue elements from vhci async client config list and bus configure
8015  * their corresponding phci clients.
8016  */
8017 static void
8018 config_client_paths_thread(void *arg)
8019 {
8020 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
8021 	mdi_async_client_config_t *acc;
8022 	clock_t quit_at_ticks;
8023 	clock_t idle_time = mdi_async_config_idle_time * TICKS_PER_SECOND;
8024 	callb_cpr_t cprinfo;
8025 
8026 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
8027 	    "mdi_config_client_paths");
8028 
8029 	for (; ; ) {
8030 		quit_at_ticks = ddi_get_lbolt() + idle_time;
8031 
8032 		mutex_enter(&vhc->vhc_lock);
8033 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8034 		    vhc->vhc_acc_list_head == NULL &&
8035 		    ddi_get_lbolt() < quit_at_ticks) {
8036 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
8037 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
8038 			    quit_at_ticks);
8039 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8040 		}
8041 
8042 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
8043 		    vhc->vhc_acc_list_head == NULL)
8044 			goto out;
8045 
8046 		acc = vhc->vhc_acc_list_head;
8047 		vhc->vhc_acc_list_head = acc->acc_next;
8048 		if (vhc->vhc_acc_list_head == NULL)
8049 			vhc->vhc_acc_list_tail = NULL;
8050 		vhc->vhc_acc_count--;
8051 		mutex_exit(&vhc->vhc_lock);
8052 
8053 		config_client_paths_sync(vhc, acc->acc_ct_name,
8054 		    acc->acc_ct_addr, acc->acc_phclient_path_list_head,
8055 		    &acc->acc_token);
8056 
8057 		free_async_client_config(acc);
8058 	}
8059 
8060 out:
8061 	vhc->vhc_acc_thrcount--;
8062 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
8063 	CALLB_CPR_EXIT(&cprinfo);
8064 }
8065 
8066 /*
8067  * Arrange for all the phci client paths (pp_head) for the specified client
8068  * to be bus configured asynchronously by a thread.
8069  */
8070 static void
8071 config_client_paths_async(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8072     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8073 {
8074 	mdi_async_client_config_t *acc, *newacc;
8075 	int create_thread;
8076 
8077 	if (pp_head == NULL)
8078 		return;
8079 
8080 	if (mdi_mtc_off) {
8081 		config_client_paths_sync(vhc, ct_name, ct_addr, pp_head, tok);
8082 		free_phclient_path_list(pp_head);
8083 		return;
8084 	}
8085 
8086 	newacc = alloc_async_client_config(ct_name, ct_addr, pp_head, tok);
8087 	ASSERT(newacc);
8088 
8089 	mutex_enter(&vhc->vhc_lock);
8090 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc->acc_next) {
8091 		if (strcmp(ct_name, acc->acc_ct_name) == 0 &&
8092 		    strcmp(ct_addr, acc->acc_ct_addr) == 0) {
8093 			free_async_client_config(newacc);
8094 			mutex_exit(&vhc->vhc_lock);
8095 			return;
8096 		}
8097 	}
8098 
8099 	if (vhc->vhc_acc_list_head == NULL)
8100 		vhc->vhc_acc_list_head = newacc;
8101 	else
8102 		vhc->vhc_acc_list_tail->acc_next = newacc;
8103 	vhc->vhc_acc_list_tail = newacc;
8104 	vhc->vhc_acc_count++;
8105 	if (vhc->vhc_acc_count <= vhc->vhc_acc_thrcount) {
8106 		cv_broadcast(&vhc->vhc_cv);
8107 		create_thread = 0;
8108 	} else {
8109 		vhc->vhc_acc_thrcount++;
8110 		create_thread = 1;
8111 	}
8112 	mutex_exit(&vhc->vhc_lock);
8113 
8114 	if (create_thread)
8115 		(void) thread_create(NULL, 0, config_client_paths_thread, vhc,
8116 		    0, &p0, TS_RUN, minclsyspri);
8117 }
8118 
8119 /*
8120  * Return number of online paths for the specified client.
8121  */
8122 static int
8123 nonline_paths(mdi_vhcache_client_t *cct)
8124 {
8125 	mdi_vhcache_pathinfo_t *cpi;
8126 	int online_count = 0;
8127 
8128 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8129 		if (cpi->cpi_pip != NULL) {
8130 			MDI_PI_LOCK(cpi->cpi_pip);
8131 			if (cpi->cpi_pip->pi_state == MDI_PATHINFO_STATE_ONLINE)
8132 				online_count++;
8133 			MDI_PI_UNLOCK(cpi->cpi_pip);
8134 		}
8135 	}
8136 
8137 	return (online_count);
8138 }
8139 
8140 /*
8141  * Bus configure all paths for the specified vhci client.
8142  * If at least one path for the client is already online, the remaining paths
8143  * will be configured asynchronously. Otherwise, it synchronously configures
8144  * the paths until at least one path is online and then rest of the paths
8145  * will be configured asynchronously.
8146  */
8147 static void
8148 config_client_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr)
8149 {
8150 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8151 	mdi_phys_path_t *pp_head, *pp;
8152 	mdi_vhcache_client_t *cct;
8153 	mdi_vhcache_lookup_token_t tok;
8154 
8155 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
8156 
8157 	init_vhcache_lookup_token(&tok, NULL);
8158 
8159 	if (ct_name == NULL || ct_addr == NULL ||
8160 	    (cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, &tok))
8161 	    == NULL ||
8162 	    (pp_head = build_phclient_path_list(cct, ct_name)) == NULL) {
8163 		rw_exit(&vhcache->vhcache_lock);
8164 		return;
8165 	}
8166 
8167 	/* if at least one path is online, configure the rest asynchronously */
8168 	if (nonline_paths(cct) > 0) {
8169 		rw_exit(&vhcache->vhcache_lock);
8170 		config_client_paths_async(vhc, ct_name, ct_addr, pp_head, &tok);
8171 		return;
8172 	}
8173 
8174 	rw_exit(&vhcache->vhcache_lock);
8175 
8176 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next) {
8177 		if (bus_config_one_phci_child(pp->phys_path) == MDI_SUCCESS) {
8178 			rw_enter(&vhcache->vhcache_lock, RW_READER);
8179 
8180 			if ((cct = lookup_vhcache_client(vhcache, ct_name,
8181 			    ct_addr, &tok)) == NULL) {
8182 				rw_exit(&vhcache->vhcache_lock);
8183 				goto out;
8184 			}
8185 
8186 			if (nonline_paths(cct) > 0 &&
8187 			    pp->phys_path_next != NULL) {
8188 				rw_exit(&vhcache->vhcache_lock);
8189 				config_client_paths_async(vhc, ct_name, ct_addr,
8190 				    pp->phys_path_next, &tok);
8191 				pp->phys_path_next = NULL;
8192 				goto out;
8193 			}
8194 
8195 			rw_exit(&vhcache->vhcache_lock);
8196 		}
8197 	}
8198 
8199 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, &tok);
8200 out:
8201 	free_phclient_path_list(pp_head);
8202 }
8203 
8204 static void
8205 single_threaded_vhconfig_enter(mdi_vhci_config_t *vhc)
8206 {
8207 	mutex_enter(&vhc->vhc_lock);
8208 	while (vhc->vhc_flags & MDI_VHC_SINGLE_THREADED)
8209 		cv_wait(&vhc->vhc_cv, &vhc->vhc_lock);
8210 	vhc->vhc_flags |= MDI_VHC_SINGLE_THREADED;
8211 	mutex_exit(&vhc->vhc_lock);
8212 }
8213 
8214 static void
8215 single_threaded_vhconfig_exit(mdi_vhci_config_t *vhc)
8216 {
8217 	mutex_enter(&vhc->vhc_lock);
8218 	vhc->vhc_flags &= ~MDI_VHC_SINGLE_THREADED;
8219 	cv_broadcast(&vhc->vhc_cv);
8220 	mutex_exit(&vhc->vhc_lock);
8221 }
8222 
8223 /*
8224  * Attach the phci driver instances associated with the vhci:
8225  * If root is mounted attach all phci driver instances.
8226  * If root is not mounted, attach the instances of only those phci
8227  * drivers that have the root support.
8228  */
8229 static void
8230 attach_phci_drivers(mdi_vhci_config_t *vhc)
8231 {
8232 	int  i;
8233 	major_t m;
8234 
8235 	for (i = 0; i < vhc->vhc_nphci_drivers; i++) {
8236 		if (modrootloaded == 0 &&
8237 		    vhc->vhc_phci_driver_list[i].phdriver_root_support == 0)
8238 			continue;
8239 
8240 		m = ddi_name_to_major(
8241 		    vhc->vhc_phci_driver_list[i].phdriver_name);
8242 		if (m != (major_t)-1) {
8243 			if (ddi_hold_installed_driver(m) != NULL)
8244 				ddi_rele_driver(m);
8245 		}
8246 	}
8247 }
8248 
8249 /*
8250  * Build vhci cache:
8251  *
8252  * Attach phci driver instances and then drive BUS_CONFIG_ALL on
8253  * the phci driver instances. During this process the cache gets built.
8254  *
8255  * Cache is built fully if the root is mounted.
8256  * If the root is not mounted, phci drivers that do not have root support
8257  * are not attached. As a result the cache is built partially. The entries
8258  * in the cache reflect only those phci drivers that have root support.
8259  */
8260 static int
8261 build_vhci_cache(mdi_vhci_config_t *vhc)
8262 {
8263 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8264 
8265 	single_threaded_vhconfig_enter(vhc);
8266 
8267 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8268 	if (vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE) {
8269 		rw_exit(&vhcache->vhcache_lock);
8270 		single_threaded_vhconfig_exit(vhc);
8271 		return (0);
8272 	}
8273 	rw_exit(&vhcache->vhcache_lock);
8274 
8275 	attach_phci_drivers(vhc);
8276 	bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE | NDI_NO_EVENT,
8277 	    BUS_CONFIG_ALL, (major_t)-1);
8278 
8279 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8280 	vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
8281 	rw_exit(&vhcache->vhcache_lock);
8282 
8283 	single_threaded_vhconfig_exit(vhc);
8284 	vhcache_dirty(vhc);
8285 	return (1);
8286 }
8287 
8288 /*
8289  * Determine if discovery of paths is needed.
8290  */
8291 static int
8292 vhcache_do_discovery(mdi_vhci_config_t *vhc)
8293 {
8294 	int rv = 1;
8295 
8296 	mutex_enter(&vhc->vhc_lock);
8297 	if (i_ddi_io_initialized() == 0) {
8298 		if (vhc->vhc_path_discovery_boot > 0) {
8299 			vhc->vhc_path_discovery_boot--;
8300 			goto out;
8301 		}
8302 	} else {
8303 		if (vhc->vhc_path_discovery_postboot > 0) {
8304 			vhc->vhc_path_discovery_postboot--;
8305 			goto out;
8306 		}
8307 	}
8308 
8309 	/*
8310 	 * Do full path discovery at most once per mdi_path_discovery_interval.
8311 	 * This is to avoid a series of full path discoveries when opening
8312 	 * stale /dev/[r]dsk links.
8313 	 */
8314 	if (mdi_path_discovery_interval != -1 &&
8315 	    lbolt64 >= vhc->vhc_path_discovery_cutoff_time)
8316 		goto out;
8317 
8318 	rv = 0;
8319 out:
8320 	mutex_exit(&vhc->vhc_lock);
8321 	return (rv);
8322 }
8323 
8324 /*
8325  * Discover all paths:
8326  *
8327  * Attach phci driver instances and then drive BUS_CONFIG_ALL on all the phci
8328  * driver instances. During this process all paths will be discovered.
8329  */
8330 static int
8331 vhcache_discover_paths(mdi_vhci_config_t *vhc)
8332 {
8333 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8334 	int rv = 0;
8335 
8336 	single_threaded_vhconfig_enter(vhc);
8337 
8338 	if (vhcache_do_discovery(vhc)) {
8339 		attach_phci_drivers(vhc);
8340 		bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE |
8341 		    NDI_NO_EVENT, BUS_CONFIG_ALL, (major_t)-1);
8342 
8343 		mutex_enter(&vhc->vhc_lock);
8344 		vhc->vhc_path_discovery_cutoff_time = lbolt64 +
8345 		    mdi_path_discovery_interval * TICKS_PER_SECOND;
8346 		mutex_exit(&vhc->vhc_lock);
8347 		rv = 1;
8348 	}
8349 
8350 	single_threaded_vhconfig_exit(vhc);
8351 	return (rv);
8352 }
8353 
8354 /*
8355  * Generic vhci bus config implementation:
8356  *
8357  * Parameters
8358  *	vdip	vhci dip
8359  *	flags	bus config flags
8360  *	op	bus config operation
8361  *	The remaining parameters are bus config operation specific
8362  *
8363  * for BUS_CONFIG_ONE
8364  *	arg	pointer to name@addr
8365  *	child	upon successful return from this function, *child will be
8366  *		set to the configured and held devinfo child node of vdip.
8367  *	ct_addr	pointer to client address (i.e. GUID)
8368  *
8369  * for BUS_CONFIG_DRIVER
8370  *	arg	major number of the driver
8371  *	child and ct_addr parameters are ignored
8372  *
8373  * for BUS_CONFIG_ALL
8374  *	arg, child, and ct_addr parameters are ignored
8375  *
8376  * Note that for the rest of the bus config operations, this function simply
8377  * calls the framework provided default bus config routine.
8378  */
8379 int
8380 mdi_vhci_bus_config(dev_info_t *vdip, uint_t flags, ddi_bus_config_op_t op,
8381     void *arg, dev_info_t **child, char *ct_addr)
8382 {
8383 	mdi_vhci_t *vh = i_devi_get_vhci(vdip);
8384 	mdi_vhci_config_t *vhc = vh->vh_config;
8385 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8386 	int rv = 0;
8387 	int params_valid = 0;
8388 	char *cp;
8389 
8390 	/*
8391 	 * To bus config vhcis we relay operation, possibly using another
8392 	 * thread, to phcis. The phci driver then interacts with MDI to cause
8393 	 * vhci child nodes to be enumerated under the vhci node.  Adding a
8394 	 * vhci child requires an ndi_devi_enter of the vhci. Since another
8395 	 * thread may be adding the child, to avoid deadlock we can't wait
8396 	 * for the relayed operations to complete if we have already entered
8397 	 * the vhci node.
8398 	 */
8399 	if (DEVI_BUSY_OWNED(vdip)) {
8400 		MDI_DEBUG(2, (CE_NOTE, vdip, "!MDI: vhci bus config: "
8401 		    "vhci dip is busy owned %p\n", (void *)vdip));
8402 		goto default_bus_config;
8403 	}
8404 
8405 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8406 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
8407 		rw_exit(&vhcache->vhcache_lock);
8408 		rv = build_vhci_cache(vhc);
8409 		rw_enter(&vhcache->vhcache_lock, RW_READER);
8410 	}
8411 
8412 	switch (op) {
8413 	case BUS_CONFIG_ONE:
8414 		if (arg != NULL && ct_addr != NULL) {
8415 			/* extract node name */
8416 			cp = (char *)arg;
8417 			while (*cp != '\0' && *cp != '@')
8418 				cp++;
8419 			if (*cp == '@') {
8420 				params_valid = 1;
8421 				*cp = '\0';
8422 				config_client_paths(vhc, (char *)arg, ct_addr);
8423 				/* config_client_paths() releases cache_lock */
8424 				*cp = '@';
8425 				break;
8426 			}
8427 		}
8428 
8429 		rw_exit(&vhcache->vhcache_lock);
8430 		break;
8431 
8432 	case BUS_CONFIG_DRIVER:
8433 		rw_exit(&vhcache->vhcache_lock);
8434 		if (rv == 0)
8435 			st_bus_config_all_phcis(vhc, flags, op,
8436 			    (major_t)(uintptr_t)arg);
8437 		break;
8438 
8439 	case BUS_CONFIG_ALL:
8440 		rw_exit(&vhcache->vhcache_lock);
8441 		if (rv == 0)
8442 			st_bus_config_all_phcis(vhc, flags, op, -1);
8443 		break;
8444 
8445 	default:
8446 		rw_exit(&vhcache->vhcache_lock);
8447 		break;
8448 	}
8449 
8450 
8451 default_bus_config:
8452 	/*
8453 	 * All requested child nodes are enumerated under the vhci.
8454 	 * Now configure them.
8455 	 */
8456 	if (ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
8457 	    NDI_SUCCESS) {
8458 		return (MDI_SUCCESS);
8459 	} else if (op == BUS_CONFIG_ONE && rv == 0 && params_valid) {
8460 		/* discover all paths and try configuring again */
8461 		if (vhcache_discover_paths(vhc) &&
8462 		    ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
8463 		    NDI_SUCCESS)
8464 			return (MDI_SUCCESS);
8465 	}
8466 
8467 	return (MDI_FAILURE);
8468 }
8469 
8470 /*
8471  * Read the on-disk vhci cache into an nvlist for the specified vhci class.
8472  */
8473 static nvlist_t *
8474 read_on_disk_vhci_cache(char *vhci_class)
8475 {
8476 	nvlist_t *nvl;
8477 	int err;
8478 	char *filename;
8479 
8480 	filename = vhclass2vhcache_filename(vhci_class);
8481 
8482 	if ((err = fread_nvlist(filename, &nvl)) == 0) {
8483 		kmem_free(filename, strlen(filename) + 1);
8484 		return (nvl);
8485 	} else if (err == EIO)
8486 		cmn_err(CE_WARN, "%s: I/O error, will recreate\n", filename);
8487 	else if (err == EINVAL)
8488 		cmn_err(CE_WARN,
8489 		    "%s: data file corrupted, will recreate\n", filename);
8490 
8491 	kmem_free(filename, strlen(filename) + 1);
8492 	return (NULL);
8493 }
8494 
8495 /*
8496  * Read on-disk vhci cache into nvlists for all vhci classes.
8497  * Called during booting by i_ddi_read_devices_files().
8498  */
8499 void
8500 mdi_read_devices_files(void)
8501 {
8502 	int i;
8503 
8504 	for (i = 0; i < N_VHCI_CLASSES; i++)
8505 		vhcache_nvl[i] = read_on_disk_vhci_cache(vhci_class_list[i]);
8506 }
8507 
8508 /*
8509  * Remove all stale entries from vhci cache.
8510  */
8511 static void
8512 clean_vhcache(mdi_vhci_config_t *vhc)
8513 {
8514 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8515 	mdi_vhcache_phci_t *cphci, *cphci_head, *cphci_next;
8516 	mdi_vhcache_client_t *cct, *cct_head, *cct_next;
8517 	mdi_vhcache_pathinfo_t *cpi, *cpi_head, *cpi_next;
8518 
8519 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8520 
8521 	cct_head = vhcache->vhcache_client_head;
8522 	vhcache->vhcache_client_head = vhcache->vhcache_client_tail = NULL;
8523 	for (cct = cct_head; cct != NULL; cct = cct_next) {
8524 		cct_next = cct->cct_next;
8525 
8526 		cpi_head = cct->cct_cpi_head;
8527 		cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
8528 		for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
8529 			cpi_next = cpi->cpi_next;
8530 			if (cpi->cpi_pip != NULL) {
8531 				ASSERT(cpi->cpi_cphci->cphci_phci != NULL);
8532 				enqueue_tail_vhcache_pathinfo(cct, cpi);
8533 			} else
8534 				free_vhcache_pathinfo(cpi);
8535 		}
8536 
8537 		if (cct->cct_cpi_head != NULL)
8538 			enqueue_vhcache_client(vhcache, cct);
8539 		else {
8540 			(void) mod_hash_destroy(vhcache->vhcache_client_hash,
8541 			    (mod_hash_key_t)cct->cct_name_addr);
8542 			free_vhcache_client(cct);
8543 		}
8544 	}
8545 
8546 	cphci_head = vhcache->vhcache_phci_head;
8547 	vhcache->vhcache_phci_head = vhcache->vhcache_phci_tail = NULL;
8548 	for (cphci = cphci_head; cphci != NULL; cphci = cphci_next) {
8549 		cphci_next = cphci->cphci_next;
8550 		if (cphci->cphci_phci != NULL)
8551 			enqueue_vhcache_phci(vhcache, cphci);
8552 		else
8553 			free_vhcache_phci(cphci);
8554 	}
8555 
8556 	vhcache->vhcache_clean_time = lbolt64;
8557 	rw_exit(&vhcache->vhcache_lock);
8558 	vhcache_dirty(vhc);
8559 }
8560 
8561 /*
8562  * Remove all stale entries from vhci cache.
8563  * Called by i_ddi_clean_devices_files() during the execution of devfsadm -C
8564  */
8565 void
8566 mdi_clean_vhcache(void)
8567 {
8568 	mdi_vhci_t *vh;
8569 
8570 	mutex_enter(&mdi_mutex);
8571 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
8572 		vh->vh_refcnt++;
8573 		mutex_exit(&mdi_mutex);
8574 		clean_vhcache(vh->vh_config);
8575 		mutex_enter(&mdi_mutex);
8576 		vh->vh_refcnt--;
8577 	}
8578 	mutex_exit(&mdi_mutex);
8579 }
8580 
8581 /*
8582  * mdi_vhci_walk_clients():
8583  *		Walker routine to traverse client dev_info nodes
8584  * ddi_walk_devs(ddi_get_child(vdip), f, arg) returns the entire tree
8585  * below the client, including nexus devices, which we dont want.
8586  * So we just traverse the immediate siblings, starting from 1st client.
8587  */
8588 void
8589 mdi_vhci_walk_clients(dev_info_t *vdip,
8590     int (*f)(dev_info_t *, void *), void *arg)
8591 {
8592 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
8593 	dev_info_t	*cdip;
8594 	mdi_client_t	*ct;
8595 
8596 	MDI_VHCI_CLIENT_LOCK(vh);
8597 	cdip = ddi_get_child(vdip);
8598 	while (cdip) {
8599 		ct = i_devi_get_client(cdip);
8600 		MDI_CLIENT_LOCK(ct);
8601 
8602 		if (((*f)(cdip, arg)) == DDI_WALK_CONTINUE)
8603 			cdip = ddi_get_next_sibling(cdip);
8604 		else
8605 			cdip = NULL;
8606 
8607 		MDI_CLIENT_UNLOCK(ct);
8608 	}
8609 	MDI_VHCI_CLIENT_UNLOCK(vh);
8610 }
8611 
8612 /*
8613  * mdi_vhci_walk_phcis():
8614  *		Walker routine to traverse phci dev_info nodes
8615  */
8616 void
8617 mdi_vhci_walk_phcis(dev_info_t *vdip,
8618     int (*f)(dev_info_t *, void *), void *arg)
8619 {
8620 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
8621 	mdi_phci_t	*ph, *next;
8622 
8623 	MDI_VHCI_PHCI_LOCK(vh);
8624 	ph = vh->vh_phci_head;
8625 	while (ph) {
8626 		MDI_PHCI_LOCK(ph);
8627 
8628 		if (((*f)(ph->ph_dip, arg)) == DDI_WALK_CONTINUE)
8629 			next = ph->ph_next;
8630 		else
8631 			next = NULL;
8632 
8633 		MDI_PHCI_UNLOCK(ph);
8634 		ph = next;
8635 	}
8636 	MDI_VHCI_PHCI_UNLOCK(vh);
8637 }
8638 
8639 
8640 /*
8641  * mdi_walk_vhcis():
8642  *		Walker routine to traverse vhci dev_info nodes
8643  */
8644 void
8645 mdi_walk_vhcis(int (*f)(dev_info_t *, void *), void *arg)
8646 {
8647 	mdi_vhci_t	*vh = NULL;
8648 
8649 	mutex_enter(&mdi_mutex);
8650 	/*
8651 	 * Scan for already registered vhci
8652 	 */
8653 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
8654 		vh->vh_refcnt++;
8655 		mutex_exit(&mdi_mutex);
8656 		if (((*f)(vh->vh_dip, arg)) != DDI_WALK_CONTINUE) {
8657 			mutex_enter(&mdi_mutex);
8658 			vh->vh_refcnt--;
8659 			break;
8660 		} else {
8661 			mutex_enter(&mdi_mutex);
8662 			vh->vh_refcnt--;
8663 		}
8664 	}
8665 
8666 	mutex_exit(&mdi_mutex);
8667 }
8668 
8669 /*
8670  * i_mdi_log_sysevent():
8671  *		Logs events for pickup by syseventd
8672  */
8673 static void
8674 i_mdi_log_sysevent(dev_info_t *dip, char *ph_vh_class, char *subclass)
8675 {
8676 	char		*path_name;
8677 	nvlist_t	*attr_list;
8678 
8679 	if (nvlist_alloc(&attr_list, NV_UNIQUE_NAME_TYPE,
8680 	    KM_SLEEP) != DDI_SUCCESS) {
8681 		goto alloc_failed;
8682 	}
8683 
8684 	path_name = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
8685 	(void) ddi_pathname(dip, path_name);
8686 
8687 	if (nvlist_add_string(attr_list, DDI_DRIVER_NAME,
8688 	    ddi_driver_name(dip)) != DDI_SUCCESS) {
8689 		goto error;
8690 	}
8691 
8692 	if (nvlist_add_int32(attr_list, DDI_DRIVER_MAJOR,
8693 	    (int32_t)ddi_driver_major(dip)) != DDI_SUCCESS) {
8694 		goto error;
8695 	}
8696 
8697 	if (nvlist_add_int32(attr_list, DDI_INSTANCE,
8698 	    (int32_t)ddi_get_instance(dip)) != DDI_SUCCESS) {
8699 		goto error;
8700 	}
8701 
8702 	if (nvlist_add_string(attr_list, DDI_PATHNAME,
8703 	    path_name) != DDI_SUCCESS) {
8704 		goto error;
8705 	}
8706 
8707 	if (nvlist_add_string(attr_list, DDI_CLASS,
8708 	    ph_vh_class) != DDI_SUCCESS) {
8709 		goto error;
8710 	}
8711 
8712 	(void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW, EC_DDI, subclass,
8713 	    attr_list, NULL, DDI_SLEEP);
8714 
8715 error:
8716 	kmem_free(path_name, MAXPATHLEN);
8717 	nvlist_free(attr_list);
8718 	return;
8719 
8720 alloc_failed:
8721 	MDI_DEBUG(1, (CE_WARN, dip,
8722 	    "!i_mdi_log_sysevent: Unable to send sysevent"));
8723 }
8724