xref: /titanic_44/usr/src/uts/common/os/sunmdi.c (revision 66e150d7d3c0cb2de3c45c74612784ffd3e73de6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more
28  * detailed discussion of the overall mpxio architecture.
29  *
30  * Default locking order:
31  *
32  * _NOTE(LOCK_ORDER(mdi_mutex, mdi_vhci:vh_phci_mutex);
33  * _NOTE(LOCK_ORDER(mdi_mutex, mdi_vhci:vh_client_mutex);
34  * _NOTE(LOCK_ORDER(mdi_vhci:vh_phci_mutex, mdi_phci::ph_mutex);
35  * _NOTE(LOCK_ORDER(mdi_vhci:vh_client_mutex, mdi_client::ct_mutex);
36  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
37  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
38  * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
39  */
40 
41 #include <sys/note.h>
42 #include <sys/types.h>
43 #include <sys/varargs.h>
44 #include <sys/param.h>
45 #include <sys/errno.h>
46 #include <sys/uio.h>
47 #include <sys/buf.h>
48 #include <sys/modctl.h>
49 #include <sys/open.h>
50 #include <sys/kmem.h>
51 #include <sys/poll.h>
52 #include <sys/conf.h>
53 #include <sys/bootconf.h>
54 #include <sys/cmn_err.h>
55 #include <sys/stat.h>
56 #include <sys/ddi.h>
57 #include <sys/sunddi.h>
58 #include <sys/ddipropdefs.h>
59 #include <sys/sunndi.h>
60 #include <sys/ndi_impldefs.h>
61 #include <sys/promif.h>
62 #include <sys/sunmdi.h>
63 #include <sys/mdi_impldefs.h>
64 #include <sys/taskq.h>
65 #include <sys/epm.h>
66 #include <sys/sunpm.h>
67 #include <sys/modhash.h>
68 #include <sys/disp.h>
69 #include <sys/autoconf.h>
70 #include <sys/sysmacros.h>
71 
72 #ifdef	DEBUG
73 #include <sys/debug.h>
74 int	mdi_debug = 1;
75 int	mdi_debug_logonly = 0;
76 #define	MDI_DEBUG(level, stmnt) \
77 	    if (mdi_debug >= (level)) i_mdi_log stmnt
78 static void i_mdi_log(int, dev_info_t *, const char *fmt, ...);
79 #else	/* !DEBUG */
80 #define	MDI_DEBUG(level, stmnt)
81 #endif	/* DEBUG */
82 
83 extern pri_t	minclsyspri;
84 extern int	modrootloaded;
85 
86 /*
87  * Global mutex:
88  * Protects vHCI list and structure members.
89  */
90 kmutex_t	mdi_mutex;
91 
92 /*
93  * Registered vHCI class driver lists
94  */
95 int		mdi_vhci_count;
96 mdi_vhci_t	*mdi_vhci_head;
97 mdi_vhci_t	*mdi_vhci_tail;
98 
99 /*
100  * Client Hash Table size
101  */
102 static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;
103 
104 /*
105  * taskq interface definitions
106  */
107 #define	MDI_TASKQ_N_THREADS	8
108 #define	MDI_TASKQ_PRI		minclsyspri
109 #define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
110 #define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)
111 
112 taskq_t				*mdi_taskq;
113 static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;
114 
115 #define	TICKS_PER_SECOND	(drv_usectohz(1000000))
116 
117 /*
118  * The data should be "quiet" for this interval (in seconds) before the
119  * vhci cached data is flushed to the disk.
120  */
121 static int mdi_vhcache_flush_delay = 10;
122 
123 /* number of seconds the vhcache flush daemon will sleep idle before exiting */
124 static int mdi_vhcache_flush_daemon_idle_time = 60;
125 
126 /*
127  * MDI falls back to discovery of all paths when a bus_config_one fails.
128  * The following parameters can be used to tune this operation.
129  *
130  * mdi_path_discovery_boot
131  *	Number of times path discovery will be attempted during early boot.
132  *	Probably there is no reason to ever set this value to greater than one.
133  *
134  * mdi_path_discovery_postboot
135  *	Number of times path discovery will be attempted after early boot.
136  *	Set it to a minimum of two to allow for discovery of iscsi paths which
137  *	may happen very late during booting.
138  *
139  * mdi_path_discovery_interval
140  *	Minimum number of seconds MDI will wait between successive discovery
141  *	of all paths. Set it to -1 to disable discovery of all paths.
142  */
143 static int mdi_path_discovery_boot = 1;
144 static int mdi_path_discovery_postboot = 2;
145 static int mdi_path_discovery_interval = 10;
146 
147 /*
148  * number of seconds the asynchronous configuration thread will sleep idle
149  * before exiting.
150  */
151 static int mdi_async_config_idle_time = 600;
152 
153 static int mdi_bus_config_cache_hash_size = 256;
154 
155 /* turns off multithreaded configuration for certain operations */
156 static int mdi_mtc_off = 0;
157 
158 /*
159  * The "path" to a pathinfo node is identical to the /devices path to a
160  * devinfo node had the device been enumerated under a pHCI instead of
161  * a vHCI.  This pathinfo "path" is associated with a 'path_instance'.
162  * This association persists across create/delete of the pathinfo nodes,
163  * but not across reboot.
164  */
165 static uint_t		mdi_pathmap_instance = 1;	/* 0 -> any path */
166 static int		mdi_pathmap_hash_size = 256;
167 static kmutex_t		mdi_pathmap_mutex;
168 static mod_hash_t	*mdi_pathmap_bypath;		/* "path"->instance */
169 static mod_hash_t	*mdi_pathmap_byinstance;	/* instance->"path" */
170 
171 /*
172  * MDI component property name/value string definitions
173  */
174 const char 		*mdi_component_prop = "mpxio-component";
175 const char		*mdi_component_prop_vhci = "vhci";
176 const char		*mdi_component_prop_phci = "phci";
177 const char		*mdi_component_prop_client = "client";
178 
179 /*
180  * MDI client global unique identifier property name
181  */
182 const char		*mdi_client_guid_prop = "client-guid";
183 
184 /*
185  * MDI client load balancing property name/value string definitions
186  */
187 const char		*mdi_load_balance = "load-balance";
188 const char		*mdi_load_balance_none = "none";
189 const char		*mdi_load_balance_rr = "round-robin";
190 const char		*mdi_load_balance_lba = "logical-block";
191 
192 /*
193  * Obsolete vHCI class definition; to be removed after Leadville update
194  */
195 const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;
196 
197 static char vhci_greeting[] =
198 	"\tThere already exists one vHCI driver for class %s\n"
199 	"\tOnly one vHCI driver for each class is allowed\n";
200 
201 /*
202  * Static function prototypes
203  */
204 static int		i_mdi_phci_offline(dev_info_t *, uint_t);
205 static int		i_mdi_client_offline(dev_info_t *, uint_t);
206 static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
207 static void		i_mdi_phci_post_detach(dev_info_t *,
208 			    ddi_detach_cmd_t, int);
209 static int		i_mdi_client_pre_detach(dev_info_t *,
210 			    ddi_detach_cmd_t);
211 static void		i_mdi_client_post_detach(dev_info_t *,
212 			    ddi_detach_cmd_t, int);
213 static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
214 static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
215 static int 		i_mdi_lba_lb(mdi_client_t *ct,
216 			    mdi_pathinfo_t **ret_pip, struct buf *buf);
217 static void		i_mdi_pm_hold_client(mdi_client_t *, int);
218 static void		i_mdi_pm_rele_client(mdi_client_t *, int);
219 static void		i_mdi_pm_reset_client(mdi_client_t *);
220 static int		i_mdi_power_all_phci(mdi_client_t *);
221 static void		i_mdi_log_sysevent(dev_info_t *, char *, char *);
222 
223 
224 /*
225  * Internal mdi_pathinfo node functions
226  */
227 static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);
228 
229 static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
230 static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
231 static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
232 static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
233 static void		i_mdi_phci_unlock(mdi_phci_t *);
234 static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
235 static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
236 static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
237 static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
238 			    mdi_client_t *);
239 static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
240 static void		i_mdi_client_remove_path(mdi_client_t *,
241 			    mdi_pathinfo_t *);
242 
243 static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
244 			    mdi_pathinfo_state_t, int);
245 static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
246 static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
247 			    char **, int);
248 static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
249 static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
250 static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
251 static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
252 static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
253 static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
254 static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
255 static void		i_mdi_client_update_state(mdi_client_t *);
256 static int		i_mdi_client_compute_state(mdi_client_t *,
257 			    mdi_phci_t *);
258 static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
259 static void		i_mdi_client_unlock(mdi_client_t *);
260 static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
261 static mdi_client_t	*i_devi_get_client(dev_info_t *);
262 /*
263  * NOTE: this will be removed once the NWS files are changed to use the new
264  * mdi_{enable,disable}_path interfaces
265  */
266 static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *,
267 				int, int);
268 static mdi_pathinfo_t 	*i_mdi_enable_disable_path(mdi_pathinfo_t *pip,
269 				mdi_vhci_t *vh, int flags, int op);
270 /*
271  * Failover related function prototypes
272  */
273 static int		i_mdi_failover(void *);
274 
275 /*
276  * misc internal functions
277  */
278 static int		i_mdi_get_hash_key(char *);
279 static int		i_map_nvlist_error_to_mdi(int);
280 static void		i_mdi_report_path_state(mdi_client_t *,
281 			    mdi_pathinfo_t *);
282 
283 static void		setup_vhci_cache(mdi_vhci_t *);
284 static int		destroy_vhci_cache(mdi_vhci_t *);
285 static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
286 static boolean_t	stop_vhcache_flush_thread(void *, int);
287 static void		free_string_array(char **, int);
288 static void		free_vhcache_phci(mdi_vhcache_phci_t *);
289 static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
290 static void		free_vhcache_client(mdi_vhcache_client_t *);
291 static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
292 static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
293 static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
294 static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
295 static void		vhcache_pi_add(mdi_vhci_config_t *,
296 			    struct mdi_pathinfo *);
297 static void		vhcache_pi_remove(mdi_vhci_config_t *,
298 			    struct mdi_pathinfo *);
299 static void		free_phclient_path_list(mdi_phys_path_t *);
300 static void		sort_vhcache_paths(mdi_vhcache_client_t *);
301 static int		flush_vhcache(mdi_vhci_config_t *, int);
302 static void		vhcache_dirty(mdi_vhci_config_t *);
303 static void		free_async_client_config(mdi_async_client_config_t *);
304 static void		single_threaded_vhconfig_enter(mdi_vhci_config_t *);
305 static void		single_threaded_vhconfig_exit(mdi_vhci_config_t *);
306 static nvlist_t		*read_on_disk_vhci_cache(char *);
307 extern int		fread_nvlist(char *, nvlist_t **);
308 extern int		fwrite_nvlist(char *, nvlist_t *);
309 
/* called once when first vhci registers with mdi */
static void
i_mdi_init()
{
	static int initialized = 0;

	/*
	 * One-shot guard.  The flag itself is not protected by any lock
	 * (mdi_mutex cannot be used before it is initialized here), so this
	 * relies on callers being serialized externally.
	 * NOTE(review): mdi_vhci_register() calls this before taking
	 * mdi_mutex -- confirm vHCI registration is single-threaded at
	 * this point (callers hold the parent busy per the ASSERT there).
	 */
	if (initialized)
		return;
	initialized = 1;

	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);

	/* Create our taskq resources */
	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */

	/* Allocate ['path_instance' <-> "path"] maps */
	mutex_init(&mdi_pathmap_mutex, NULL, MUTEX_DRIVER, NULL);
	mdi_pathmap_bypath = mod_hash_create_strhash(
	    "mdi_pathmap_bypath", mdi_pathmap_hash_size,
	    mod_hash_null_valdtor);
	mdi_pathmap_byinstance = mod_hash_create_idhash(
	    "mdi_pathmap_byinstance", mdi_pathmap_hash_size,
	    mod_hash_null_valdtor);
}
337 
338 /*
339  * mdi_get_component_type():
340  *		Return mpxio component type
341  * Return Values:
342  *		MDI_COMPONENT_NONE
343  *		MDI_COMPONENT_VHCI
344  *		MDI_COMPONENT_PHCI
345  *		MDI_COMPONENT_CLIENT
346  * XXX This doesn't work under multi-level MPxIO and should be
347  *	removed when clients migrate mdi_component_is_*() interfaces.
348  */
349 int
350 mdi_get_component_type(dev_info_t *dip)
351 {
352 	return (DEVI(dip)->devi_mdi_component);
353 }
354 
/*
 * mdi_vhci_register():
 *		Register a vHCI module with the mpxio framework
 *		mdi_vhci_register() is called by vHCI drivers to register the
 *		'class_driver' vHCI driver and its MDI entrypoints with the
 *		mpxio framework.  The vHCI driver must call this interface as
 *		part of its attach(9e) handler.
 *		Competing threads may call mdi_vhci_register() concurrently
 *		as the vHCI drivers are loaded and attached as a result of
 *		pHCI driver instance registration (mdi_phci_register()) with
 *		the framework.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 */
/*ARGSUSED*/
int
mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
    int flags)
{
	mdi_vhci_t		*vh = NULL;

	/* Ops vector must match the MDI revision this framework implements */
	ASSERT(vops->vo_revision == MDI_VHCI_OPS_REV);
	/* Caller is in attach(9E) context with the parent held busy */
	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));

	/* One-time global initialization (mdi_mutex, taskq, path maps) */
	i_mdi_init();

	mutex_enter(&mdi_mutex);
	/*
	 * Scan for already registered vhci
	 */
	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
		if (strcmp(vh->vh_class, class) == 0) {
			/*
			 * vHCI has already been created.  Check for valid
			 * vHCI ops registration.  We only support one vHCI
			 * module per class
			 */
			if (vh->vh_ops != NULL) {
				mutex_exit(&mdi_mutex);
				cmn_err(CE_NOTE, vhci_greeting, class);
				return (MDI_FAILURE);
			}
			break;
		}
	}

	/*
	 * if not yet created, create the vHCI component
	 *
	 * NOTE(review): when a matching entry with vh_ops == NULL is found
	 * above, vh_dip/vh_ops are not (re)assigned below -- confirm that
	 * pre-existing-entry case is unreachable in practice.
	 */
	if (vh == NULL) {
		struct client_hash	*hash = NULL;
		char			*load_balance;

		/*
		 * Allocate and initialize the mdi extensions
		 */
		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
		    KM_SLEEP);
		vh->vh_client_table = hash;
		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
		(void) strcpy(vh->vh_class, class);
		/*
		 * Load balancing defaults to round-robin; the vHCI node's
		 * "load-balance" property may select "none" or logical-block.
		 */
		vh->vh_lb = LOAD_BALANCE_RR;
		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
				vh->vh_lb = LOAD_BALANCE_NONE;
			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
				    == 0) {
				vh->vh_lb = LOAD_BALANCE_LBA;
			}
			ddi_prop_free(load_balance);
		}

		mutex_init(&vh->vh_phci_mutex, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&vh->vh_client_mutex, NULL, MUTEX_DEFAULT, NULL);

		/*
		 * Store the vHCI ops vectors
		 */
		vh->vh_dip = vdip;
		vh->vh_ops = vops;

		setup_vhci_cache(vh);

		/* Append the new vHCI to the global list */
		if (mdi_vhci_head == NULL) {
			mdi_vhci_head = vh;
		}
		if (mdi_vhci_tail) {
			mdi_vhci_tail->vh_next = vh;
		}
		mdi_vhci_tail = vh;
		mdi_vhci_count++;
	}

	/*
	 * Claim the devfs node as a vhci component
	 */
	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;

	/*
	 * Initialize our back reference from dev_info node
	 */
	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
	mutex_exit(&mdi_mutex);
	return (MDI_SUCCESS);
}
463 
/*
 * mdi_vhci_unregister():
 *		Unregister a vHCI module from mpxio framework
 *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
 * 		of a vhci to unregister it from the framework.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 */
/*ARGSUSED*/
int
mdi_vhci_unregister(dev_info_t *vdip, int flags)
{
	mdi_vhci_t	*found, *vh, *prev = NULL;

	/* Caller is in detach(9E) context with the parent held busy */
	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));

	/*
	 * Check for invalid VHCI
	 */
	if ((vh = i_devi_get_vhci(vdip)) == NULL)
		return (MDI_FAILURE);

	/*
	 * Scan the list of registered vHCIs for a match
	 */
	mutex_enter(&mdi_mutex);
	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
		if (found == vh)
			break;
		prev = found;	/* remember predecessor for the unlink below */
	}

	if (found == NULL) {
		mutex_exit(&mdi_mutex);
		return (MDI_FAILURE);
	}

	/*
	 * Check the vHCI, pHCI and client count. All the pHCIs and clients
	 * should have been unregistered, before a vHCI can be
	 * unregistered.
	 */
	MDI_VHCI_PHCI_LOCK(vh);
	if (vh->vh_refcnt || vh->vh_phci_count || vh->vh_client_count) {
		MDI_VHCI_PHCI_UNLOCK(vh);
		mutex_exit(&mdi_mutex);
		return (MDI_FAILURE);
	}
	MDI_VHCI_PHCI_UNLOCK(vh);

	/* Cache teardown can fail; if so the vHCI stays registered */
	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
		mutex_exit(&mdi_mutex);
		return (MDI_FAILURE);
	}

	/*
	 * Remove the vHCI from the global list
	 */
	if (vh == mdi_vhci_head) {
		mdi_vhci_head = vh->vh_next;
	} else {
		prev->vh_next = vh->vh_next;
	}
	if (vh == mdi_vhci_tail) {
		mdi_vhci_tail = prev;
	}
	mdi_vhci_count--;
	mutex_exit(&mdi_mutex);

	/*
	 * Clear the devinfo back-references, then free all vHCI extension
	 * resources (class string, client hash table, mutexes, structure).
	 */
	vh->vh_ops = NULL;
	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
	DEVI(vdip)->devi_mdi_xhci = NULL;
	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
	kmem_free(vh->vh_client_table,
	    mdi_client_table_size * sizeof (struct client_hash));
	mutex_destroy(&vh->vh_phci_mutex);
	mutex_destroy(&vh->vh_client_mutex);

	kmem_free(vh, sizeof (mdi_vhci_t));
	return (MDI_SUCCESS);
}
546 
547 /*
548  * i_mdi_vhci_class2vhci():
549  *		Look for a matching vHCI module given a vHCI class name
550  * Return Values:
551  *		Handle to a vHCI component
552  *		NULL
553  */
554 static mdi_vhci_t *
555 i_mdi_vhci_class2vhci(char *class)
556 {
557 	mdi_vhci_t	*vh = NULL;
558 
559 	ASSERT(!MUTEX_HELD(&mdi_mutex));
560 
561 	mutex_enter(&mdi_mutex);
562 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
563 		if (strcmp(vh->vh_class, class) == 0) {
564 			break;
565 		}
566 	}
567 	mutex_exit(&mdi_mutex);
568 	return (vh);
569 }
570 
571 /*
572  * i_devi_get_vhci():
573  *		Utility function to get the handle to a vHCI component
574  * Return Values:
575  *		Handle to a vHCI component
576  *		NULL
577  */
578 mdi_vhci_t *
579 i_devi_get_vhci(dev_info_t *vdip)
580 {
581 	mdi_vhci_t	*vh = NULL;
582 	if (MDI_VHCI(vdip)) {
583 		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
584 	}
585 	return (vh);
586 }
587 
/*
 * mdi_phci_register():
 *		Register a pHCI module with mpxio framework
 *		mdi_phci_register() is called by pHCI drivers to register with
 *		the mpxio framework and a specific 'class_driver' vHCI.  The
 *		pHCI driver must call this interface as part of its attach(9e)
 *		handler.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 */
/*ARGSUSED*/
int
mdi_phci_register(char *class, dev_info_t *pdip, int flags)
{
	mdi_phci_t		*ph;
	mdi_vhci_t		*vh;
	char			*data;
	char			*pathname;	/* devfs path, for messages */

	/*
	 * Some subsystems, like fcp, perform pHCI registration from a
	 * different thread than the one doing the pHCI attach(9E) - the
	 * driver attach code is waiting for this other thread to complete.
	 * This means we can only ASSERT DEVI_BUSY_CHANGING of parent
	 * (indicating that some thread has done an ndi_devi_enter of parent)
	 * not DEVI_BUSY_OWNED (which would indicate that we did the enter).
	 */
	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));

	pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	(void) ddi_pathname(pdip, pathname);

	/*
	 * Check for mpxio-disable property. Enable mpxio if the property is
	 * missing or not set to "yes".
	 * If the property is set to "yes" then emit a brief message.
	 */
	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
	    &data) == DDI_SUCCESS)) {
		if (strcmp(data, "yes") == 0) {
			MDI_DEBUG(1, (CE_CONT, pdip,
			    "?%s (%s%d) multipath capabilities "
			    "disabled via %s.conf.\n", pathname,
			    ddi_driver_name(pdip), ddi_get_instance(pdip),
			    ddi_driver_name(pdip)));
			/* Release property and scratch buffer before bailing */
			ddi_prop_free(data);
			kmem_free(pathname, MAXPATHLEN);
			return (MDI_FAILURE);
		}
		ddi_prop_free(data);
	}

	kmem_free(pathname, MAXPATHLEN);

	/*
	 * Search for a matching vHCI
	 */
	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
	if (vh == NULL) {
		/* No vHCI of this class has registered yet */
		return (MDI_FAILURE);
	}

	/* Allocate and initialize the pHCI extension */
	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
	ph->ph_dip = pdip;
	ph->ph_vhci = vh;
	ph->ph_next = NULL;
	ph->ph_unstable = 0;
	ph->ph_vprivate = 0;
	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);

	/* Mark the pHCI powered-up, and link it to its devinfo node */
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_SET_POWER_UP(ph);
	MDI_PHCI_UNLOCK(ph);
	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;

	/* Record the pHCI in the vHCI's on-disk configuration cache */
	vhcache_phci_add(vh->vh_config, ph);

	/* Append the pHCI to the vHCI's pHCI list */
	MDI_VHCI_PHCI_LOCK(vh);
	if (vh->vh_phci_head == NULL) {
		vh->vh_phci_head = ph;
	}
	if (vh->vh_phci_tail) {
		vh->vh_phci_tail->ph_next = ph;
	}
	vh->vh_phci_tail = ph;
	vh->vh_phci_count++;
	MDI_VHCI_PHCI_UNLOCK(vh);

	/* Announce the new initiator to userland via sysevent */
	i_mdi_log_sysevent(pdip, class, ESC_DDI_INITIATOR_REGISTER);
	return (MDI_SUCCESS);
}
682 
/*
 * mdi_phci_unregister():
 *		Unregister a pHCI module from mpxio framework
 *		mdi_phci_unregister() is called by the pHCI drivers from their
 *		detach(9E) handler to unregister their instances from the
 *		framework.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 */
/*ARGSUSED*/
int
mdi_phci_unregister(dev_info_t *pdip, int flags)
{
	mdi_vhci_t		*vh;
	mdi_phci_t		*ph;
	mdi_phci_t		*tmp;
	mdi_phci_t		*prev = NULL;

	/* See mdi_phci_register() for why this is CHANGING, not OWNED */
	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));

	ph = i_devi_get_phci(pdip);
	if (ph == NULL) {
		MDI_DEBUG(1, (CE_WARN, pdip,
		    "!pHCI unregister: Not a valid pHCI"));
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;
	ASSERT(vh != NULL);
	if (vh == NULL) {
		/* Defensive for non-DEBUG kernels; DEBUG asserts above */
		MDI_DEBUG(1, (CE_WARN, pdip,
		    "!pHCI unregister: Not a valid vHCI"));
		return (MDI_FAILURE);
	}

	/* Locate ph on the vHCI's pHCI list, tracking its predecessor */
	MDI_VHCI_PHCI_LOCK(vh);
	tmp = vh->vh_phci_head;
	while (tmp) {
		if (tmp == ph) {
			break;
		}
		prev = tmp;
		tmp = tmp->ph_next;
	}

	/* Unlink ph from the singly-linked list, fixing head/tail */
	if (ph == vh->vh_phci_head) {
		vh->vh_phci_head = ph->ph_next;
	} else {
		prev->ph_next = ph->ph_next;
	}

	if (ph == vh->vh_phci_tail) {
		vh->vh_phci_tail = prev;
	}

	vh->vh_phci_count--;
	MDI_VHCI_PHCI_UNLOCK(vh);

	/*
	 * Announce the departure, drop the pHCI from the config cache,
	 * then free the extension and clear the devinfo back-references.
	 */
	i_mdi_log_sysevent(pdip, ph->ph_vhci->vh_class,
	    ESC_DDI_INITIATOR_UNREGISTER);
	vhcache_phci_remove(vh->vh_config, ph);
	cv_destroy(&ph->ph_unstable_cv);
	mutex_destroy(&ph->ph_mutex);
	kmem_free(ph, sizeof (mdi_phci_t));
	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
	DEVI(pdip)->devi_mdi_xhci = NULL;
	return (MDI_SUCCESS);
}
752 
753 /*
754  * i_devi_get_phci():
755  * 		Utility function to return the phci extensions.
756  */
757 static mdi_phci_t *
758 i_devi_get_phci(dev_info_t *pdip)
759 {
760 	mdi_phci_t	*ph = NULL;
761 	if (MDI_PHCI(pdip)) {
762 		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
763 	}
764 	return (ph);
765 }
766 
/*
 * Single thread mdi entry into devinfo node for modifying its children.
 * If necessary we perform an ndi_devi_enter of the vHCI before doing
 * an ndi_devi_enter of 'dip'.  We maintain circular in two parts: one
 * for the vHCI and one for the pHCI.
 */
void
mdi_devi_enter(dev_info_t *phci_dip, int *circular)
{
	dev_info_t	*vdip;
	int		vcircular, pcircular;

	/* Verify calling context */
	ASSERT(MDI_PHCI(phci_dip));
	vdip = mdi_devi_get_vdip(phci_dip);
	ASSERT(vdip);			/* A pHCI always has a vHCI */

	/*
	 * If pHCI is detaching then the framework has already entered the
	 * vHCI on a threads that went down the code path leading to
	 * detach_node().  This framework enter of the vHCI during pHCI
	 * detach is done to avoid deadlock with vHCI power management
	 * operations which enter the vHCI and the enter down the path
	 * to the pHCI. If pHCI is detaching then we piggyback this calls
	 * enter of the vHCI on frameworks vHCI enter that has already
	 * occurred - this is OK because we know that the framework thread
	 * doing detach is waiting for our completion.
	 *
	 * We should DEVI_IS_DETACHING under an enter of the parent to avoid
	 * race with detach - but we can't do that because the framework has
	 * already entered the parent, so we have some complexity instead.
	 */
	for (;;) {
		if (ndi_devi_tryenter(vdip, &vcircular)) {
			/* Got the vHCI; -1 is reserved as the sentinel */
			ASSERT(vcircular != -1);
			if (DEVI_IS_DETACHING(phci_dip)) {
				/* Piggyback on the framework's vHCI enter */
				ndi_devi_exit(vdip, vcircular);
				vcircular = -1;
			}
			break;
		} else if (DEVI_IS_DETACHING(phci_dip)) {
			/* Framework already holds the vHCI for us */
			vcircular = -1;
			break;
		} else {
			/* vHCI busy; back off briefly and retry */
			delay(1);
		}
	}

	ndi_devi_enter(phci_dip, &pcircular);
	/*
	 * Pack both circular values into one int: the vHCI value (or -1
	 * when we did not enter the vHCI) in the high 16 bits, the pHCI
	 * value in the low 16 bits.  mdi_devi_exit() unpacks this.
	 */
	*circular = (vcircular << 16) | (pcircular & 0xFFFF);
}
818 
819 /*
820  * Release mdi_devi_enter or successful mdi_devi_tryenter.
821  */
822 void
823 mdi_devi_exit(dev_info_t *phci_dip, int circular)
824 {
825 	dev_info_t	*vdip;
826 	int		vcircular, pcircular;
827 
828 	/* Verify calling context */
829 	ASSERT(MDI_PHCI(phci_dip));
830 	vdip = mdi_devi_get_vdip(phci_dip);
831 	ASSERT(vdip);			/* A pHCI always has a vHCI */
832 
833 	/* extract two circular recursion values from single int */
834 	pcircular = (short)(circular & 0xFFFF);
835 	vcircular = (short)((circular >> 16) & 0xFFFF);
836 
837 	ndi_devi_exit(phci_dip, pcircular);
838 	if (vcircular != -1)
839 		ndi_devi_exit(vdip, vcircular);
840 }
841 
842 /*
843  * The functions mdi_devi_exit_phci() and mdi_devi_enter_phci() are used
844  * around a pHCI drivers calls to mdi_pi_online/offline, after holding
845  * the pathinfo node via mdi_hold_path/mdi_rele_path, to avoid deadlock
846  * with vHCI power management code during path online/offline.  Each
847  * mdi_devi_exit_phci must have a matching mdi_devi_enter_phci, and both must
848  * occur within the scope of an active mdi_devi_enter that establishes the
849  * circular value.
850  */
851 void
852 mdi_devi_exit_phci(dev_info_t *phci_dip, int circular)
853 {
854 	int		pcircular;
855 
856 	/* Verify calling context */
857 	ASSERT(MDI_PHCI(phci_dip));
858 
859 	pcircular = (short)(circular & 0xFFFF);
860 	ndi_devi_exit(phci_dip, pcircular);
861 }
862 
863 void
864 mdi_devi_enter_phci(dev_info_t *phci_dip, int *circular)
865 {
866 	int		pcircular;
867 
868 	/* Verify calling context */
869 	ASSERT(MDI_PHCI(phci_dip));
870 
871 	ndi_devi_enter(phci_dip, &pcircular);
872 
873 	/* verify matching mdi_devi_exit_phci/mdi_devi_enter_phci use */
874 	ASSERT(pcircular == ((short)(*circular & 0xFFFF)));
875 }
876 
877 /*
878  * mdi_devi_get_vdip():
879  *		given a pHCI dip return vHCI dip
880  */
881 dev_info_t *
882 mdi_devi_get_vdip(dev_info_t *pdip)
883 {
884 	mdi_phci_t	*ph;
885 
886 	ph = i_devi_get_phci(pdip);
887 	if (ph && ph->ph_vhci)
888 		return (ph->ph_vhci->vh_dip);
889 	return (NULL);
890 }
891 
892 /*
893  * mdi_devi_pdip_entered():
894  *		Return 1 if we are vHCI and have done an ndi_devi_enter
895  *		of a pHCI
896  */
897 int
898 mdi_devi_pdip_entered(dev_info_t *vdip)
899 {
900 	mdi_vhci_t	*vh;
901 	mdi_phci_t	*ph;
902 
903 	vh = i_devi_get_vhci(vdip);
904 	if (vh == NULL)
905 		return (0);
906 
907 	MDI_VHCI_PHCI_LOCK(vh);
908 	ph = vh->vh_phci_head;
909 	while (ph) {
910 		if (ph->ph_dip && DEVI_BUSY_OWNED(ph->ph_dip)) {
911 			MDI_VHCI_PHCI_UNLOCK(vh);
912 			return (1);
913 		}
914 		ph = ph->ph_next;
915 	}
916 	MDI_VHCI_PHCI_UNLOCK(vh);
917 	return (0);
918 }
919 
/*
 * mdi_phci_path2devinfo():
 * 		Utility function to search for a valid phci device given
 *		the devfs pathname.
 */
dev_info_t *
mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
{
	char		*temp_pathname;	/* scratch buffer for ddi_pathname */
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	dev_info_t 	*pdip = NULL;

	vh = i_devi_get_vhci(vdip);
	ASSERT(vh != NULL);

	if (vh == NULL) {
		/*
		 * Invalid vHCI component, return failure
		 */
		return (NULL);
	}

	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	/* Hold the pHCI list stable while we compare paths */
	MDI_VHCI_PHCI_LOCK(vh);
	ph = vh->vh_phci_head;
	while (ph != NULL) {
		pdip = ph->ph_dip;
		ASSERT(pdip != NULL);
		/* Build this pHCI's devfs path and compare */
		*temp_pathname = '\0';
		(void) ddi_pathname(pdip, temp_pathname);
		if (strcmp(temp_pathname, pathname) == 0) {
			break;
		}
		ph = ph->ph_next;
	}
	/* Loop ran off the end: no pHCI matched, discard the last pdip */
	if (ph == NULL) {
		pdip = NULL;
	}
	MDI_VHCI_PHCI_UNLOCK(vh);
	kmem_free(temp_pathname, MAXPATHLEN);
	return (pdip);
}
963 
964 /*
965  * mdi_phci_get_path_count():
966  * 		get number of path information nodes associated with a given
967  *		pHCI device.
968  */
969 int
970 mdi_phci_get_path_count(dev_info_t *pdip)
971 {
972 	mdi_phci_t	*ph;
973 	int		count = 0;
974 
975 	ph = i_devi_get_phci(pdip);
976 	if (ph != NULL) {
977 		count = ph->ph_path_count;
978 	}
979 	return (count);
980 }
981 
/*
 * i_mdi_phci_lock():
 *		Lock a pHCI device
 * Return Values:
 *		None
 * Note:
 *		The default locking order is:
 *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
 *		But there are number of situations where locks need to be
 *		grabbed in reverse order.  This routine implements try and lock
 *		mechanism depending on the requested parameter option.
 */
static void
i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
{
	if (pip) {
		/* Reverse locking is requested. */
		while (MDI_PHCI_TRYLOCK(ph) == 0) {
			/*
			 * tryenter failed. Try to grab again
			 * after a small delay
			 */
			/*
			 * Hold the pathinfo so it cannot go away, drop its
			 * lock so the pHCI holder can make progress, delay,
			 * then retake the pathinfo lock and drop the hold
			 * before retrying the pHCI trylock.
			 */
			MDI_PI_HOLD(pip);
			MDI_PI_UNLOCK(pip);
			delay(1);
			MDI_PI_LOCK(pip);
			MDI_PI_RELE(pip);
		}
	} else {
		/* Normal lock order: block until the pHCI lock is ours */
		MDI_PHCI_LOCK(ph);
	}
}
1014 
1015 /*
1016  * i_mdi_phci_unlock():
1017  *		Unlock the pHCI component
1018  */
1019 static void
1020 i_mdi_phci_unlock(mdi_phci_t *ph)
1021 {
1022 	MDI_PHCI_UNLOCK(ph);
1023 }
1024 
1025 /*
1026  * i_mdi_devinfo_create():
1027  *		create client device's devinfo node
1028  * Return Values:
1029  *		dev_info
1030  *		NULL
1031  * Notes:
1032  */
1033 static dev_info_t *
1034 i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
1035 	char **compatible, int ncompatible)
1036 {
1037 	dev_info_t *cdip = NULL;
1038 
1039 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1040 
1041 	/* Verify for duplicate entry */
1042 	cdip = i_mdi_devinfo_find(vh, name, guid);
1043 	ASSERT(cdip == NULL);
1044 	if (cdip) {
1045 		cmn_err(CE_WARN,
1046 		    "i_mdi_devinfo_create: client dip %p already exists",
1047 			(void *)cdip);
1048 	}
1049 
1050 	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
1051 	if (cdip == NULL)
1052 		goto fail;
1053 
1054 	/*
1055 	 * Create component type and Global unique identifier
1056 	 * properties
1057 	 */
1058 	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
1059 	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
1060 		goto fail;
1061 	}
1062 
1063 	/* Decorate the node with compatible property */
1064 	if (compatible &&
1065 	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
1066 	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
1067 		goto fail;
1068 	}
1069 
1070 	return (cdip);
1071 
1072 fail:
1073 	if (cdip) {
1074 		(void) ndi_prop_remove_all(cdip);
1075 		(void) ndi_devi_free(cdip);
1076 	}
1077 	return (NULL);
1078 }
1079 
1080 /*
1081  * i_mdi_devinfo_find():
1082  *		Find a matching devinfo node for given client node name
1083  *		and its guid.
1084  * Return Values:
1085  *		Handle to a dev_info node or NULL
1086  */
1087 static dev_info_t *
1088 i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
1089 {
1090 	char			*data;
1091 	dev_info_t 		*cdip = NULL;
1092 	dev_info_t 		*ndip = NULL;
1093 	int			circular;
1094 
1095 	ndi_devi_enter(vh->vh_dip, &circular);
1096 	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
1097 	while ((cdip = ndip) != NULL) {
1098 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1099 
1100 		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
1101 			continue;
1102 		}
1103 
1104 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
1105 		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
1106 		    &data) != DDI_PROP_SUCCESS) {
1107 			continue;
1108 		}
1109 
1110 		if (strcmp(data, guid) != 0) {
1111 			ddi_prop_free(data);
1112 			continue;
1113 		}
1114 		ddi_prop_free(data);
1115 		break;
1116 	}
1117 	ndi_devi_exit(vh->vh_dip, circular);
1118 	return (cdip);
1119 }
1120 
1121 /*
1122  * i_mdi_devinfo_remove():
1123  *		Remove a client device node
1124  */
1125 static int
1126 i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
1127 {
1128 	int	rv = MDI_SUCCESS;
1129 
1130 	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
1131 	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
1132 		rv = ndi_devi_offline(cdip, NDI_DEVI_REMOVE);
1133 		if (rv != NDI_SUCCESS) {
1134 			MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_devinfo_remove:"
1135 			    " failed. cdip = %p\n", (void *)cdip));
1136 		}
1137 		/*
1138 		 * Convert to MDI error code
1139 		 */
1140 		switch (rv) {
1141 		case NDI_SUCCESS:
1142 			rv = MDI_SUCCESS;
1143 			break;
1144 		case NDI_BUSY:
1145 			rv = MDI_BUSY;
1146 			break;
1147 		default:
1148 			rv = MDI_FAILURE;
1149 			break;
1150 		}
1151 	}
1152 	return (rv);
1153 }
1154 
1155 /*
1156  * i_devi_get_client()
1157  *		Utility function to get mpxio component extensions
1158  */
1159 static mdi_client_t *
1160 i_devi_get_client(dev_info_t *cdip)
1161 {
1162 	mdi_client_t	*ct = NULL;
1163 
1164 	if (MDI_CLIENT(cdip)) {
1165 		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
1166 	}
1167 	return (ct);
1168 }
1169 
1170 /*
1171  * i_mdi_is_child_present():
1172  *		Search for the presence of client device dev_info node
1173  */
1174 static int
1175 i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
1176 {
1177 	int		rv = MDI_FAILURE;
1178 	struct dev_info	*dip;
1179 	int		circular;
1180 
1181 	ndi_devi_enter(vdip, &circular);
1182 	dip = DEVI(vdip)->devi_child;
1183 	while (dip) {
1184 		if (dip == DEVI(cdip)) {
1185 			rv = MDI_SUCCESS;
1186 			break;
1187 		}
1188 		dip = dip->devi_sibling;
1189 	}
1190 	ndi_devi_exit(vdip, circular);
1191 	return (rv);
1192 }
1193 
1194 
1195 /*
1196  * i_mdi_client_lock():
1197  *		Grab client component lock
1198  * Return Values:
1199  *		None
1200  * Note:
1201  *		The default locking order is:
1202  *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
1203  *		But there are number of situations where locks need to be
1204  *		grabbed in reverse order.  This routine implements try and lock
1205  *		mechanism depending on the requested parameter option.
1206  */
1207 static void
1208 i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
1209 {
1210 	if (pip) {
1211 		/*
1212 		 * Reverse locking is requested.
1213 		 */
1214 		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
1215 			/*
1216 			 * tryenter failed. Try to grab again
1217 			 * after a small delay
1218 			 */
1219 			MDI_PI_HOLD(pip);
1220 			MDI_PI_UNLOCK(pip);
1221 			delay(1);
1222 			MDI_PI_LOCK(pip);
1223 			MDI_PI_RELE(pip);
1224 		}
1225 	} else {
1226 		MDI_CLIENT_LOCK(ct);
1227 	}
1228 }
1229 
1230 /*
1231  * i_mdi_client_unlock():
1232  *		Unlock a client component
1233  */
1234 static void
1235 i_mdi_client_unlock(mdi_client_t *ct)
1236 {
1237 	MDI_CLIENT_UNLOCK(ct);
1238 }
1239 
1240 /*
1241  * i_mdi_client_alloc():
1242  * 		Allocate and initialize a client structure.  Caller should
1243  *		hold the vhci client lock.
1244  * Return Values:
1245  *		Handle to a client component
1246  */
1247 /*ARGSUSED*/
1248 static mdi_client_t *
1249 i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
1250 {
1251 	mdi_client_t	*ct;
1252 
1253 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1254 
1255 	/*
1256 	 * Allocate and initialize a component structure.
1257 	 */
1258 	ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
1259 	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1260 	ct->ct_hnext = NULL;
1261 	ct->ct_hprev = NULL;
1262 	ct->ct_dip = NULL;
1263 	ct->ct_vhci = vh;
1264 	ct->ct_drvname = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1265 	(void) strcpy(ct->ct_drvname, name);
1266 	ct->ct_guid = kmem_alloc(strlen(lguid) + 1, KM_SLEEP);
1267 	(void) strcpy(ct->ct_guid, lguid);
1268 	ct->ct_cprivate = NULL;
1269 	ct->ct_vprivate = NULL;
1270 	ct->ct_flags = 0;
1271 	ct->ct_state = MDI_CLIENT_STATE_FAILED;
1272 	MDI_CLIENT_LOCK(ct);
1273 	MDI_CLIENT_SET_OFFLINE(ct);
1274 	MDI_CLIENT_SET_DETACH(ct);
1275 	MDI_CLIENT_SET_POWER_UP(ct);
1276 	MDI_CLIENT_UNLOCK(ct);
1277 	ct->ct_failover_flags = 0;
1278 	ct->ct_failover_status = 0;
1279 	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
1280 	ct->ct_unstable = 0;
1281 	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
1282 	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
1283 	ct->ct_lb = vh->vh_lb;
1284 	ct->ct_lb_args =  kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
1285 	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
1286 	ct->ct_path_count = 0;
1287 	ct->ct_path_head = NULL;
1288 	ct->ct_path_tail = NULL;
1289 	ct->ct_path_last = NULL;
1290 
1291 	/*
1292 	 * Add this client component to our client hash queue
1293 	 */
1294 	i_mdi_client_enlist_table(vh, ct);
1295 	return (ct);
1296 }
1297 
1298 /*
1299  * i_mdi_client_enlist_table():
1300  *		Attach the client device to the client hash table. Caller
1301  *		should hold the vhci client lock.
1302  */
1303 static void
1304 i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1305 {
1306 	int 			index;
1307 	struct client_hash	*head;
1308 
1309 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1310 
1311 	index = i_mdi_get_hash_key(ct->ct_guid);
1312 	head = &vh->vh_client_table[index];
1313 	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
1314 	head->ct_hash_head = ct;
1315 	head->ct_hash_count++;
1316 	vh->vh_client_count++;
1317 }
1318 
1319 /*
1320  * i_mdi_client_delist_table():
1321  *		Attach the client device to the client hash table.
1322  *		Caller should hold the vhci client lock.
1323  */
1324 static void
1325 i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1326 {
1327 	int			index;
1328 	char			*guid;
1329 	struct client_hash 	*head;
1330 	mdi_client_t		*next;
1331 	mdi_client_t		*last;
1332 
1333 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1334 
1335 	guid = ct->ct_guid;
1336 	index = i_mdi_get_hash_key(guid);
1337 	head = &vh->vh_client_table[index];
1338 
1339 	last = NULL;
1340 	next = (mdi_client_t *)head->ct_hash_head;
1341 	while (next != NULL) {
1342 		if (next == ct) {
1343 			break;
1344 		}
1345 		last = next;
1346 		next = next->ct_hnext;
1347 	}
1348 
1349 	if (next) {
1350 		head->ct_hash_count--;
1351 		if (last == NULL) {
1352 			head->ct_hash_head = ct->ct_hnext;
1353 		} else {
1354 			last->ct_hnext = ct->ct_hnext;
1355 		}
1356 		ct->ct_hnext = NULL;
1357 		vh->vh_client_count--;
1358 	}
1359 }
1360 
1361 
1362 /*
1363  * i_mdi_client_free():
1364  *		Free a client component
1365  */
1366 static int
1367 i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
1368 {
1369 	int		rv = MDI_SUCCESS;
1370 	int		flags = ct->ct_flags;
1371 	dev_info_t	*cdip;
1372 	dev_info_t	*vdip;
1373 
1374 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1375 
1376 	vdip = vh->vh_dip;
1377 	cdip = ct->ct_dip;
1378 
1379 	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
1380 	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
1381 	DEVI(cdip)->devi_mdi_client = NULL;
1382 
1383 	/*
1384 	 * Clear out back ref. to dev_info_t node
1385 	 */
1386 	ct->ct_dip = NULL;
1387 
1388 	/*
1389 	 * Remove this client from our hash queue
1390 	 */
1391 	i_mdi_client_delist_table(vh, ct);
1392 
1393 	/*
1394 	 * Uninitialize and free the component
1395 	 */
1396 	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
1397 	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
1398 	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
1399 	cv_destroy(&ct->ct_failover_cv);
1400 	cv_destroy(&ct->ct_unstable_cv);
1401 	cv_destroy(&ct->ct_powerchange_cv);
1402 	mutex_destroy(&ct->ct_mutex);
1403 	kmem_free(ct, sizeof (*ct));
1404 
1405 	if (cdip != NULL) {
1406 		MDI_VHCI_CLIENT_UNLOCK(vh);
1407 		(void) i_mdi_devinfo_remove(vdip, cdip, flags);
1408 		MDI_VHCI_CLIENT_LOCK(vh);
1409 	}
1410 	return (rv);
1411 }
1412 
1413 /*
1414  * i_mdi_client_find():
1415  * 		Find the client structure corresponding to a given guid
1416  *		Caller should hold the vhci client lock.
1417  */
1418 static mdi_client_t *
1419 i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
1420 {
1421 	int			index;
1422 	struct client_hash	*head;
1423 	mdi_client_t		*ct;
1424 
1425 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1426 
1427 	index = i_mdi_get_hash_key(guid);
1428 	head = &vh->vh_client_table[index];
1429 
1430 	ct = head->ct_hash_head;
1431 	while (ct != NULL) {
1432 		if (strcmp(ct->ct_guid, guid) == 0 &&
1433 		    (cname == NULL || strcmp(ct->ct_drvname, cname) == 0)) {
1434 			break;
1435 		}
1436 		ct = ct->ct_hnext;
1437 	}
1438 	return (ct);
1439 }
1440 
1441 /*
1442  * i_mdi_client_update_state():
1443  *		Compute and update client device state
1444  * Notes:
1445  *		A client device can be in any of three possible states:
1446  *
1447  *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
1448  *		one online/standby paths. Can tolerate failures.
1449  *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
1450  *		no alternate paths available as standby. A failure on the online
1451  *		would result in loss of access to device data.
1452  *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
1453  *		no paths available to access the device.
1454  */
1455 static void
1456 i_mdi_client_update_state(mdi_client_t *ct)
1457 {
1458 	int state;
1459 
1460 	ASSERT(MDI_CLIENT_LOCKED(ct));
1461 	state = i_mdi_client_compute_state(ct, NULL);
1462 	MDI_CLIENT_SET_STATE(ct, state);
1463 }
1464 
1465 /*
1466  * i_mdi_client_compute_state():
1467  *		Compute client device state
1468  *
1469  *		mdi_phci_t *	Pointer to pHCI structure which should
1470  *				while computing the new value.  Used by
1471  *				i_mdi_phci_offline() to find the new
1472  *				client state after DR of a pHCI.
1473  */
1474 static int
1475 i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
1476 {
1477 	int		state;
1478 	int		online_count = 0;
1479 	int		standby_count = 0;
1480 	mdi_pathinfo_t	*pip, *next;
1481 
1482 	ASSERT(MDI_CLIENT_LOCKED(ct));
1483 	pip = ct->ct_path_head;
1484 	while (pip != NULL) {
1485 		MDI_PI_LOCK(pip);
1486 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1487 		if (MDI_PI(pip)->pi_phci == ph) {
1488 			MDI_PI_UNLOCK(pip);
1489 			pip = next;
1490 			continue;
1491 		}
1492 
1493 		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1494 				== MDI_PATHINFO_STATE_ONLINE)
1495 			online_count++;
1496 		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1497 				== MDI_PATHINFO_STATE_STANDBY)
1498 			standby_count++;
1499 		MDI_PI_UNLOCK(pip);
1500 		pip = next;
1501 	}
1502 
1503 	if (online_count == 0) {
1504 		if (standby_count == 0) {
1505 			state = MDI_CLIENT_STATE_FAILED;
1506 			MDI_DEBUG(2, (CE_NOTE, NULL, "!client state: failed"
1507 			    " ct = %p\n", (void *)ct));
1508 		} else if (standby_count == 1) {
1509 			state = MDI_CLIENT_STATE_DEGRADED;
1510 		} else {
1511 			state = MDI_CLIENT_STATE_OPTIMAL;
1512 		}
1513 	} else if (online_count == 1) {
1514 		if (standby_count == 0) {
1515 			state = MDI_CLIENT_STATE_DEGRADED;
1516 		} else {
1517 			state = MDI_CLIENT_STATE_OPTIMAL;
1518 		}
1519 	} else {
1520 		state = MDI_CLIENT_STATE_OPTIMAL;
1521 	}
1522 	return (state);
1523 }
1524 
1525 /*
1526  * i_mdi_client2devinfo():
1527  *		Utility function
1528  */
1529 dev_info_t *
1530 i_mdi_client2devinfo(mdi_client_t *ct)
1531 {
1532 	return (ct->ct_dip);
1533 }
1534 
1535 /*
1536  * mdi_client_path2_devinfo():
1537  * 		Given the parent devinfo and child devfs pathname, search for
1538  *		a valid devfs node handle.
1539  */
1540 dev_info_t *
1541 mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
1542 {
1543 	dev_info_t 	*cdip = NULL;
1544 	dev_info_t 	*ndip = NULL;
1545 	char		*temp_pathname;
1546 	int		circular;
1547 
1548 	/*
1549 	 * Allocate temp buffer
1550 	 */
1551 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1552 
1553 	/*
1554 	 * Lock parent against changes
1555 	 */
1556 	ndi_devi_enter(vdip, &circular);
1557 	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
1558 	while ((cdip = ndip) != NULL) {
1559 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1560 
1561 		*temp_pathname = '\0';
1562 		(void) ddi_pathname(cdip, temp_pathname);
1563 		if (strcmp(temp_pathname, pathname) == 0) {
1564 			break;
1565 		}
1566 	}
1567 	/*
1568 	 * Release devinfo lock
1569 	 */
1570 	ndi_devi_exit(vdip, circular);
1571 
1572 	/*
1573 	 * Free the temp buffer
1574 	 */
1575 	kmem_free(temp_pathname, MAXPATHLEN);
1576 	return (cdip);
1577 }
1578 
1579 /*
1580  * mdi_client_get_path_count():
1581  * 		Utility function to get number of path information nodes
1582  *		associated with a given client device.
1583  */
1584 int
1585 mdi_client_get_path_count(dev_info_t *cdip)
1586 {
1587 	mdi_client_t	*ct;
1588 	int		count = 0;
1589 
1590 	ct = i_devi_get_client(cdip);
1591 	if (ct != NULL) {
1592 		count = ct->ct_path_count;
1593 	}
1594 	return (count);
1595 }
1596 
1597 
1598 /*
1599  * i_mdi_get_hash_key():
1600  * 		Create a hash using strings as keys
1601  *
1602  */
1603 static int
1604 i_mdi_get_hash_key(char *str)
1605 {
1606 	uint32_t	g, hash = 0;
1607 	char		*p;
1608 
1609 	for (p = str; *p != '\0'; p++) {
1610 		g = *p;
1611 		hash += g;
1612 	}
1613 	return (hash % (CLIENT_HASH_TABLE_SIZE - 1));
1614 }
1615 
1616 /*
1617  * mdi_get_lb_policy():
1618  * 		Get current load balancing policy for a given client device
1619  */
1620 client_lb_t
1621 mdi_get_lb_policy(dev_info_t *cdip)
1622 {
1623 	client_lb_t	lb = LOAD_BALANCE_NONE;
1624 	mdi_client_t	*ct;
1625 
1626 	ct = i_devi_get_client(cdip);
1627 	if (ct != NULL) {
1628 		lb = ct->ct_lb;
1629 	}
1630 	return (lb);
1631 }
1632 
1633 /*
1634  * mdi_set_lb_region_size():
1635  * 		Set current region size for the load-balance
1636  */
1637 int
1638 mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
1639 {
1640 	mdi_client_t	*ct;
1641 	int		rv = MDI_FAILURE;
1642 
1643 	ct = i_devi_get_client(cdip);
1644 	if (ct != NULL && ct->ct_lb_args != NULL) {
1645 		ct->ct_lb_args->region_size = region_size;
1646 		rv = MDI_SUCCESS;
1647 	}
1648 	return (rv);
1649 }
1650 
1651 /*
1652  * mdi_Set_lb_policy():
1653  * 		Set current load balancing policy for a given client device
1654  */
1655 int
1656 mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
1657 {
1658 	mdi_client_t	*ct;
1659 	int		rv = MDI_FAILURE;
1660 
1661 	ct = i_devi_get_client(cdip);
1662 	if (ct != NULL) {
1663 		ct->ct_lb = lb;
1664 		rv = MDI_SUCCESS;
1665 	}
1666 	return (rv);
1667 }
1668 
1669 /*
1670  * mdi_failover():
1671  *		failover function called by the vHCI drivers to initiate
1672  *		a failover operation.  This is typically due to non-availability
1673  *		of online paths to route I/O requests.  Failover can be
1674  *		triggered through user application also.
1675  *
1676  *		The vHCI driver calls mdi_failover() to initiate a failover
1677  *		operation. mdi_failover() calls back into the vHCI driver's
1678  *		vo_failover() entry point to perform the actual failover
1679  *		operation.  The reason for requiring the vHCI driver to
1680  *		initiate failover by calling mdi_failover(), instead of directly
1681  *		executing vo_failover() itself, is to ensure that the mdi
1682  *		framework can keep track of the client state properly.
1683  *		Additionally, mdi_failover() provides as a convenience the
1684  *		option of performing the failover operation synchronously or
1685  *		asynchronously
1686  *
1687  *		Upon successful completion of the failover operation, the
1688  *		paths that were previously ONLINE will be in the STANDBY state,
1689  *		and the newly activated paths will be in the ONLINE state.
1690  *
1691  *		The flags modifier determines whether the activation is done
1692  *		synchronously: MDI_FAILOVER_SYNC
1693  * Return Values:
1694  *		MDI_SUCCESS
1695  *		MDI_FAILURE
1696  *		MDI_BUSY
1697  */
1698 /*ARGSUSED*/
1699 int
1700 mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
1701 {
1702 	int			rv;
1703 	mdi_client_t		*ct;
1704 
1705 	ct = i_devi_get_client(cdip);
1706 	ASSERT(ct != NULL);
1707 	if (ct == NULL) {
1708 		/* cdip is not a valid client device. Nothing more to do. */
1709 		return (MDI_FAILURE);
1710 	}
1711 
1712 	MDI_CLIENT_LOCK(ct);
1713 
1714 	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
1715 		/* A path to the client is being freed */
1716 		MDI_CLIENT_UNLOCK(ct);
1717 		return (MDI_BUSY);
1718 	}
1719 
1720 
1721 	if (MDI_CLIENT_IS_FAILED(ct)) {
1722 		/*
1723 		 * Client is in failed state. Nothing more to do.
1724 		 */
1725 		MDI_CLIENT_UNLOCK(ct);
1726 		return (MDI_FAILURE);
1727 	}
1728 
1729 	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1730 		/*
1731 		 * Failover is already in progress; return BUSY
1732 		 */
1733 		MDI_CLIENT_UNLOCK(ct);
1734 		return (MDI_BUSY);
1735 	}
1736 	/*
1737 	 * Make sure that mdi_pathinfo node state changes are processed.
1738 	 * We do not allow failovers to progress while client path state
1739 	 * changes are in progress
1740 	 */
1741 	if (ct->ct_unstable) {
1742 		if (flags == MDI_FAILOVER_ASYNC) {
1743 			MDI_CLIENT_UNLOCK(ct);
1744 			return (MDI_BUSY);
1745 		} else {
1746 			while (ct->ct_unstable)
1747 				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
1748 		}
1749 	}
1750 
1751 	/*
1752 	 * Client device is in stable state. Before proceeding, perform sanity
1753 	 * checks again.
1754 	 */
1755 	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
1756 	    (!i_ddi_devi_attached(ct->ct_dip))) {
1757 		/*
1758 		 * Client is in failed state. Nothing more to do.
1759 		 */
1760 		MDI_CLIENT_UNLOCK(ct);
1761 		return (MDI_FAILURE);
1762 	}
1763 
1764 	/*
1765 	 * Set the client state as failover in progress.
1766 	 */
1767 	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
1768 	ct->ct_failover_flags = flags;
1769 	MDI_CLIENT_UNLOCK(ct);
1770 
1771 	if (flags == MDI_FAILOVER_ASYNC) {
1772 		/*
1773 		 * Submit the initiate failover request via CPR safe
1774 		 * taskq threads.
1775 		 */
1776 		(void) taskq_dispatch(mdi_taskq, (task_func_t *)i_mdi_failover,
1777 		    ct, KM_SLEEP);
1778 		return (MDI_ACCEPT);
1779 	} else {
1780 		/*
1781 		 * Synchronous failover mode.  Typically invoked from the user
1782 		 * land.
1783 		 */
1784 		rv = i_mdi_failover(ct);
1785 	}
1786 	return (rv);
1787 }
1788 
1789 /*
1790  * i_mdi_failover():
1791  *		internal failover function. Invokes vHCI drivers failover
1792  *		callback function and process the failover status
1793  * Return Values:
1794  *		None
1795  *
1796  * Note: A client device in failover state can not be detached or freed.
1797  */
1798 static int
1799 i_mdi_failover(void *arg)
1800 {
1801 	int		rv = MDI_SUCCESS;
1802 	mdi_client_t	*ct = (mdi_client_t *)arg;
1803 	mdi_vhci_t	*vh = ct->ct_vhci;
1804 
1805 	ASSERT(!MDI_CLIENT_LOCKED(ct));
1806 
1807 	if (vh->vh_ops->vo_failover != NULL) {
1808 		/*
1809 		 * Call vHCI drivers callback routine
1810 		 */
1811 		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
1812 		    ct->ct_failover_flags);
1813 	}
1814 
1815 	MDI_CLIENT_LOCK(ct);
1816 	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);
1817 
1818 	/*
1819 	 * Save the failover return status
1820 	 */
1821 	ct->ct_failover_status = rv;
1822 
1823 	/*
1824 	 * As a result of failover, client status would have been changed.
1825 	 * Update the client state and wake up anyone waiting on this client
1826 	 * device.
1827 	 */
1828 	i_mdi_client_update_state(ct);
1829 
1830 	cv_broadcast(&ct->ct_failover_cv);
1831 	MDI_CLIENT_UNLOCK(ct);
1832 	return (rv);
1833 }
1834 
1835 /*
1836  * Load balancing is logical block.
1837  * IOs within the range described by region_size
1838  * would go on the same path. This would improve the
1839  * performance by cache-hit on some of the RAID devices.
1840  * Search only for online paths(At some point we
1841  * may want to balance across target ports).
1842  * If no paths are found then default to round-robin.
1843  */
1844 static int
1845 i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
1846 {
1847 	int		path_index = -1;
1848 	int		online_path_count = 0;
1849 	int		online_nonpref_path_count = 0;
1850 	int 		region_size = ct->ct_lb_args->region_size;
1851 	mdi_pathinfo_t	*pip;
1852 	mdi_pathinfo_t	*next;
1853 	int		preferred, path_cnt;
1854 
1855 	pip = ct->ct_path_head;
1856 	while (pip) {
1857 		MDI_PI_LOCK(pip);
1858 		if (MDI_PI(pip)->pi_state ==
1859 		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
1860 			online_path_count++;
1861 		} else if (MDI_PI(pip)->pi_state ==
1862 		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
1863 			online_nonpref_path_count++;
1864 		}
1865 		next = (mdi_pathinfo_t *)
1866 		    MDI_PI(pip)->pi_client_link;
1867 		MDI_PI_UNLOCK(pip);
1868 		pip = next;
1869 	}
1870 	/* if found any online/preferred then use this type */
1871 	if (online_path_count > 0) {
1872 		path_cnt = online_path_count;
1873 		preferred = 1;
1874 	} else if (online_nonpref_path_count > 0) {
1875 		path_cnt = online_nonpref_path_count;
1876 		preferred = 0;
1877 	} else {
1878 		path_cnt = 0;
1879 	}
1880 	if (path_cnt) {
1881 		path_index = (bp->b_blkno >> region_size) % path_cnt;
1882 		pip = ct->ct_path_head;
1883 		while (pip && path_index != -1) {
1884 			MDI_PI_LOCK(pip);
1885 			if (path_index == 0 &&
1886 			    (MDI_PI(pip)->pi_state ==
1887 			    MDI_PATHINFO_STATE_ONLINE) &&
1888 				MDI_PI(pip)->pi_preferred == preferred) {
1889 				MDI_PI_HOLD(pip);
1890 				MDI_PI_UNLOCK(pip);
1891 				*ret_pip = pip;
1892 				return (MDI_SUCCESS);
1893 			}
1894 			path_index --;
1895 			next = (mdi_pathinfo_t *)
1896 			    MDI_PI(pip)->pi_client_link;
1897 			MDI_PI_UNLOCK(pip);
1898 			pip = next;
1899 		}
1900 		if (pip == NULL) {
1901 			MDI_DEBUG(4, (CE_NOTE, NULL,
1902 			    "!lba %llx, no pip !!\n",
1903 				bp->b_lblkno));
1904 		} else {
1905 			MDI_DEBUG(4, (CE_NOTE, NULL,
1906 			    "!lba %llx, no pip for path_index, "
1907 			    "pip %p\n", bp->b_lblkno, (void *)pip));
1908 		}
1909 	}
1910 	return (MDI_FAILURE);
1911 }
1912 
1913 /*
1914  * mdi_select_path():
1915  *		select a path to access a client device.
1916  *
1917  *		mdi_select_path() function is called by the vHCI drivers to
1918  *		select a path to route the I/O request to.  The caller passes
1919  *		the block I/O data transfer structure ("buf") as one of the
1920  *		parameters.  The mpxio framework uses the buf structure
1921  *		contents to maintain per path statistics (total I/O size /
1922  *		count pending).  If more than one online paths are available to
1923  *		select, the framework automatically selects a suitable path
1924  *		for routing I/O request. If a failover operation is active for
1925  *		this client device the call shall be failed with MDI_BUSY error
1926  *		code.
1927  *
1928  *		By default this function returns a suitable path in online
1929  *		state based on the current load balancing policy.  Currently
1930  *		we support LOAD_BALANCE_NONE (Previously selected online path
1931  *		will continue to be used till the path is usable) and
1932  *		LOAD_BALANCE_RR (Online paths will be selected in a round
1933  *		robin fashion), LOAD_BALANCE_LB(Online paths will be selected
1934  *		based on the logical block).  The load balancing
1935  *		through vHCI drivers configuration file (driver.conf).
1936  *
1937  *		vHCI drivers may override this default behavior by specifying
1938  *		appropriate flags.  The meaning of the thrid argument depends
1939  *		on the flags specified. If MDI_SELECT_PATH_INSTANCE is set
1940  *		then the argument is the "path instance" of the path to select.
1941  *		If MDI_SELECT_PATH_INSTANCE is not set then the argument is
1942  *		"start_pip". A non NULL "start_pip" is the starting point to
1943  *		walk and find the next appropriate path.  The following values
1944  *		are currently defined: MDI_SELECT_ONLINE_PATH (to select an
1945  *		ONLINE path) and/or MDI_SELECT_STANDBY_PATH (to select an
1946  *		STANDBY path).
1947  *
1948  *		The non-standard behavior is used by the scsi_vhci driver,
1949  *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
1950  *		attach of client devices (to avoid an unnecessary failover
1951  *		when the STANDBY path comes up first), during failover
1952  *		(to activate a STANDBY path as ONLINE).
1953  *
1954  *		The selected path is returned in a a mdi_hold_path() state
1955  *		(pi_ref_cnt). Caller should release the hold by calling
1956  *		mdi_rele_path().
1957  *
1958  * Return Values:
1959  *		MDI_SUCCESS	- Completed successfully
1960  *		MDI_BUSY 	- Client device is busy failing over
1961  *		MDI_NOPATH	- Client device is online, but no valid path are
1962  *				  available to access this client device
1963  *		MDI_FAILURE	- Invalid client device or state
1964  *		MDI_DEVI_ONLINING
1965  *				- Client device (struct dev_info state) is in
1966  *				  onlining state.
1967  */
1968 
1969 /*ARGSUSED*/
1970 int
1971 mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
1972     void *arg, mdi_pathinfo_t **ret_pip)
1973 {
1974 	mdi_client_t	*ct;
1975 	mdi_pathinfo_t	*pip;
1976 	mdi_pathinfo_t	*next;
1977 	mdi_pathinfo_t	*head;
1978 	mdi_pathinfo_t	*start;
1979 	client_lb_t	lbp;	/* load balancing policy */
1980 	int		sb = 1;	/* standard behavior */
1981 	int		preferred = 1;	/* preferred path */
1982 	int		cond, cont = 1;
1983 	int		retry = 0;
1984 	mdi_pathinfo_t	*start_pip;	/* request starting pathinfo */
1985 	int		path_instance;	/* request specific path instance */
1986 
1987 	/* determine type of arg based on flags */
1988 	if (flags & MDI_SELECT_PATH_INSTANCE) {
1989 		flags &= ~MDI_SELECT_PATH_INSTANCE;
1990 		path_instance = (int)(intptr_t)arg;
1991 		start_pip = NULL;
1992 	} else {
1993 		path_instance = 0;
1994 		start_pip = (mdi_pathinfo_t *)arg;
1995 	}
1996 
1997 	if (flags != 0) {
1998 		/*
1999 		 * disable default behavior
2000 		 */
2001 		sb = 0;
2002 	}
2003 
2004 	*ret_pip = NULL;
2005 	ct = i_devi_get_client(cdip);
2006 	if (ct == NULL) {
2007 		/* mdi extensions are NULL, Nothing more to do */
2008 		return (MDI_FAILURE);
2009 	}
2010 
2011 	MDI_CLIENT_LOCK(ct);
2012 
2013 	if (sb) {
2014 		if (MDI_CLIENT_IS_FAILED(ct)) {
2015 			/*
2016 			 * Client is not ready to accept any I/O requests.
2017 			 * Fail this request.
2018 			 */
2019 			MDI_DEBUG(2, (CE_NOTE, cdip, "!mdi_select_path: "
2020 			    "client state offline ct = %p\n", (void *)ct));
2021 			MDI_CLIENT_UNLOCK(ct);
2022 			return (MDI_FAILURE);
2023 		}
2024 
2025 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
2026 			/*
2027 			 * Check for Failover is in progress. If so tell the
2028 			 * caller that this device is busy.
2029 			 */
2030 			MDI_DEBUG(2, (CE_NOTE, cdip, "!mdi_select_path: "
2031 			    "client failover in progress ct = %p\n",
2032 			    (void *)ct));
2033 			MDI_CLIENT_UNLOCK(ct);
2034 			return (MDI_BUSY);
2035 		}
2036 
2037 		/*
2038 		 * Check to see whether the client device is attached.
2039 		 * If not so, let the vHCI driver manually select a path
2040 		 * (standby) and let the probe/attach process to continue.
2041 		 */
2042 		if (MDI_CLIENT_IS_DETACHED(ct) || !i_ddi_devi_attached(cdip)) {
2043 			MDI_DEBUG(4, (CE_NOTE, cdip, "!Devi is onlining "
2044 			    "ct = %p\n", (void *)ct));
2045 			MDI_CLIENT_UNLOCK(ct);
2046 			return (MDI_DEVI_ONLINING);
2047 		}
2048 	}
2049 
2050 	/*
2051 	 * Cache in the client list head.  If head of the list is NULL
2052 	 * return MDI_NOPATH
2053 	 */
2054 	head = ct->ct_path_head;
2055 	if (head == NULL) {
2056 		MDI_CLIENT_UNLOCK(ct);
2057 		return (MDI_NOPATH);
2058 	}
2059 
2060 	/* Caller is specifying a specific pathinfo path by path_instance */
2061 	if (path_instance) {
2062 		/* search for pathinfo with correct path_instance */
2063 		for (pip = head;
2064 		    pip && (mdi_pi_get_path_instance(pip) != path_instance);
2065 		    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link)
2066 			;
2067 
2068 		/* If path can't be selected then MDI_FAILURE is returned. */
2069 		if (pip == NULL) {
2070 			MDI_CLIENT_UNLOCK(ct);
2071 			return (MDI_FAILURE);
2072 		}
2073 
2074 		/* verify state of path */
2075 		MDI_PI_LOCK(pip);
2076 		if (MDI_PI(pip)->pi_state != MDI_PATHINFO_STATE_ONLINE) {
2077 			MDI_PI_UNLOCK(pip);
2078 			MDI_CLIENT_UNLOCK(ct);
2079 			return (MDI_FAILURE);
2080 		}
2081 
2082 		/*
2083 		 * Return the path in hold state. Caller should release the
2084 		 * lock by calling mdi_rele_path()
2085 		 */
2086 		MDI_PI_HOLD(pip);
2087 		MDI_PI_UNLOCK(pip);
2088 		ct->ct_path_last = pip;
2089 		*ret_pip = pip;
2090 		MDI_CLIENT_UNLOCK(ct);
2091 		return (MDI_SUCCESS);
2092 	}
2093 
2094 	/*
2095 	 * for non default behavior, bypass current
2096 	 * load balancing policy and always use LOAD_BALANCE_RR
2097 	 * except that the start point will be adjusted based
2098 	 * on the provided start_pip
2099 	 */
2100 	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;
2101 
2102 	switch (lbp) {
2103 	case LOAD_BALANCE_NONE:
2104 		/*
2105 		 * Load balancing is None  or Alternate path mode
2106 		 * Start looking for a online mdi_pathinfo node starting from
2107 		 * last known selected path
2108 		 */
2109 		preferred = 1;
2110 		pip = (mdi_pathinfo_t *)ct->ct_path_last;
2111 		if (pip == NULL) {
2112 			pip = head;
2113 		}
2114 		start = pip;
2115 		do {
2116 			MDI_PI_LOCK(pip);
2117 			/*
2118 			 * No need to explicitly check if the path is disabled.
2119 			 * Since we are checking for state == ONLINE and the
			 * same variable is used for DISABLE/ENABLE information.
2121 			 */
2122 			if ((MDI_PI(pip)->pi_state  ==
2123 				MDI_PATHINFO_STATE_ONLINE) &&
2124 				preferred == MDI_PI(pip)->pi_preferred) {
2125 				/*
2126 				 * Return the path in hold state. Caller should
2127 				 * release the lock by calling mdi_rele_path()
2128 				 */
2129 				MDI_PI_HOLD(pip);
2130 				MDI_PI_UNLOCK(pip);
2131 				ct->ct_path_last = pip;
2132 				*ret_pip = pip;
2133 				MDI_CLIENT_UNLOCK(ct);
2134 				return (MDI_SUCCESS);
2135 			}
2136 
2137 			/*
2138 			 * Path is busy.
2139 			 */
2140 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2141 			    MDI_PI_IS_TRANSIENT(pip))
2142 				retry = 1;
2143 			/*
2144 			 * Keep looking for a next available online path
2145 			 */
2146 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2147 			if (next == NULL) {
2148 				next = head;
2149 			}
2150 			MDI_PI_UNLOCK(pip);
2151 			pip = next;
2152 			if (start == pip && preferred) {
2153 				preferred = 0;
2154 			} else if (start == pip && !preferred) {
2155 				cont = 0;
2156 			}
2157 		} while (cont);
2158 		break;
2159 
2160 	case LOAD_BALANCE_LBA:
2161 		/*
2162 		 * Make sure we are looking
2163 		 * for an online path. Otherwise, if it is for a STANDBY
2164 		 * path request, it will go through and fetch an ONLINE
2165 		 * path which is not desirable.
2166 		 */
2167 		if ((ct->ct_lb_args != NULL) &&
2168 			    (ct->ct_lb_args->region_size) && bp &&
2169 				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
2170 			if (i_mdi_lba_lb(ct, ret_pip, bp)
2171 				    == MDI_SUCCESS) {
2172 				MDI_CLIENT_UNLOCK(ct);
2173 				return (MDI_SUCCESS);
2174 			}
2175 		}
2176 		/*  FALLTHROUGH */
2177 	case LOAD_BALANCE_RR:
2178 		/*
2179 		 * Load balancing is Round Robin. Start looking for a online
2180 		 * mdi_pathinfo node starting from last known selected path
2181 		 * as the start point.  If override flags are specified,
2182 		 * process accordingly.
2183 		 * If the search is already in effect(start_pip not null),
2184 		 * then lets just use the same path preference to continue the
2185 		 * traversal.
2186 		 */
2187 
2188 		if (start_pip != NULL) {
2189 			preferred = MDI_PI(start_pip)->pi_preferred;
2190 		} else {
2191 			preferred = 1;
2192 		}
2193 
2194 		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
2195 		if (start == NULL) {
2196 			pip = head;
2197 		} else {
2198 			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
2199 			if (pip == NULL) {
2200 				if (!sb) {
2201 					if (preferred == 0) {
2202 						/*
2203 						 * Looks like we have completed
2204 						 * the traversal as preferred
2205 						 * value is 0. Time to bail out.
2206 						 */
2207 						*ret_pip = NULL;
2208 						MDI_CLIENT_UNLOCK(ct);
2209 						return (MDI_NOPATH);
2210 					} else {
2211 						/*
2212 						 * Looks like we reached the
2213 						 * end of the list. Lets enable
2214 						 * traversal of non preferred
2215 						 * paths.
2216 						 */
2217 						preferred = 0;
2218 					}
2219 				}
2220 				pip = head;
2221 			}
2222 		}
2223 		start = pip;
2224 		do {
2225 			MDI_PI_LOCK(pip);
2226 			if (sb) {
2227 				cond = ((MDI_PI(pip)->pi_state ==
2228 				    MDI_PATHINFO_STATE_ONLINE &&
2229 					MDI_PI(pip)->pi_preferred ==
2230 						preferred) ? 1 : 0);
2231 			} else {
2232 				if (flags == MDI_SELECT_ONLINE_PATH) {
2233 					cond = ((MDI_PI(pip)->pi_state ==
2234 					    MDI_PATHINFO_STATE_ONLINE &&
2235 						MDI_PI(pip)->pi_preferred ==
2236 						preferred) ? 1 : 0);
2237 				} else if (flags == MDI_SELECT_STANDBY_PATH) {
2238 					cond = ((MDI_PI(pip)->pi_state ==
2239 					    MDI_PATHINFO_STATE_STANDBY &&
2240 						MDI_PI(pip)->pi_preferred ==
2241 						preferred) ? 1 : 0);
2242 				} else if (flags == (MDI_SELECT_ONLINE_PATH |
2243 				    MDI_SELECT_STANDBY_PATH)) {
2244 					cond = (((MDI_PI(pip)->pi_state ==
2245 					    MDI_PATHINFO_STATE_ONLINE ||
2246 					    (MDI_PI(pip)->pi_state ==
2247 					    MDI_PATHINFO_STATE_STANDBY)) &&
2248 						MDI_PI(pip)->pi_preferred ==
2249 						preferred) ? 1 : 0);
2250 				} else if (flags ==
2251 					(MDI_SELECT_STANDBY_PATH |
2252 					MDI_SELECT_ONLINE_PATH |
2253 					MDI_SELECT_USER_DISABLE_PATH)) {
2254 					cond = (((MDI_PI(pip)->pi_state ==
2255 					    MDI_PATHINFO_STATE_ONLINE ||
2256 					    (MDI_PI(pip)->pi_state ==
2257 					    MDI_PATHINFO_STATE_STANDBY) ||
2258 						(MDI_PI(pip)->pi_state ==
2259 					    (MDI_PATHINFO_STATE_ONLINE|
2260 					    MDI_PATHINFO_STATE_USER_DISABLE)) ||
2261 						(MDI_PI(pip)->pi_state ==
2262 					    (MDI_PATHINFO_STATE_STANDBY |
2263 					    MDI_PATHINFO_STATE_USER_DISABLE)))&&
2264 						MDI_PI(pip)->pi_preferred ==
2265 						preferred) ? 1 : 0);
2266 				} else {
2267 					cond = 0;
2268 				}
2269 			}
2270 			/*
2271 			 * No need to explicitly check if the path is disabled.
2272 			 * Since we are checking for state == ONLINE and the
			 * same variable is used for DISABLE/ENABLE information.
2274 			 */
2275 			if (cond) {
2276 				/*
2277 				 * Return the path in hold state. Caller should
2278 				 * release the lock by calling mdi_rele_path()
2279 				 */
2280 				MDI_PI_HOLD(pip);
2281 				MDI_PI_UNLOCK(pip);
2282 				if (sb)
2283 					ct->ct_path_last = pip;
2284 				*ret_pip = pip;
2285 				MDI_CLIENT_UNLOCK(ct);
2286 				return (MDI_SUCCESS);
2287 			}
2288 			/*
2289 			 * Path is busy.
2290 			 */
2291 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2292 			    MDI_PI_IS_TRANSIENT(pip))
2293 				retry = 1;
2294 
2295 			/*
2296 			 * Keep looking for a next available online path
2297 			 */
2298 do_again:
2299 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2300 			if (next == NULL) {
2301 				if (!sb) {
2302 					if (preferred == 1) {
2303 						/*
2304 						 * Looks like we reached the
2305 						 * end of the list. Lets enable
2306 						 * traversal of non preferred
2307 						 * paths.
2308 						 */
2309 						preferred = 0;
2310 						next = head;
2311 					} else {
2312 						/*
2313 						 * We have done both the passes
2314 						 * Preferred as well as for
2315 						 * Non-preferred. Bail out now.
2316 						 */
2317 						cont = 0;
2318 					}
2319 				} else {
2320 					/*
2321 					 * Standard behavior case.
2322 					 */
2323 					next = head;
2324 				}
2325 			}
2326 			MDI_PI_UNLOCK(pip);
2327 			if (cont == 0) {
2328 				break;
2329 			}
2330 			pip = next;
2331 
2332 			if (!sb) {
2333 				/*
2334 				 * We need to handle the selection of
2335 				 * non-preferred path in the following
2336 				 * case:
2337 				 *
2338 				 * +------+   +------+   +------+   +-----+
2339 				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
2340 				 * +------+   +------+   +------+   +-----+
2341 				 *
2342 				 * If we start the search with B, we need to
2343 				 * skip beyond B to pick C which is non -
2344 				 * preferred in the second pass. The following
2345 				 * test, if true, will allow us to skip over
2346 				 * the 'start'(B in the example) to select
2347 				 * other non preferred elements.
2348 				 */
2349 				if ((start_pip != NULL) && (start_pip == pip) &&
2350 				    (MDI_PI(start_pip)->pi_preferred
2351 				    != preferred)) {
2352 					/*
2353 					 * try again after going past the start
2354 					 * pip
2355 					 */
2356 					MDI_PI_LOCK(pip);
2357 					goto do_again;
2358 				}
2359 			} else {
2360 				/*
2361 				 * Standard behavior case
2362 				 */
2363 				if (start == pip && preferred) {
2364 					/* look for nonpreferred paths */
2365 					preferred = 0;
2366 				} else if (start == pip && !preferred) {
2367 					/*
2368 					 * Exit condition
2369 					 */
2370 					cont = 0;
2371 				}
2372 			}
2373 		} while (cont);
2374 		break;
2375 	}
2376 
2377 	MDI_CLIENT_UNLOCK(ct);
2378 	if (retry == 1) {
2379 		return (MDI_BUSY);
2380 	} else {
2381 		return (MDI_NOPATH);
2382 	}
2383 }
2384 
2385 /*
2386  * For a client, return the next available path to any phci
2387  *
2388  * Note:
2389  *		Caller should hold the branch's devinfo node to get a consistent
2390  *		snap shot of the mdi_pathinfo nodes.
2391  *
2392  *		Please note that even the list is stable the mdi_pathinfo
2393  *		node state and properties are volatile.  The caller should lock
2394  *		and unlock the nodes by calling mdi_pi_lock() and
 *		mdi_pi_unlock() functions to get stable properties.
2396  *
2397  *		If there is a need to use the nodes beyond the hold of the
2398  *		devinfo node period (For ex. I/O), then mdi_pathinfo node
2399  *		need to be held against unexpected removal by calling
2400  *		mdi_hold_path() and should be released by calling
2401  *		mdi_rele_path() on completion.
2402  */
2403 mdi_pathinfo_t *
2404 mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
2405 {
2406 	mdi_client_t *ct;
2407 
2408 	if (!MDI_CLIENT(ct_dip))
2409 		return (NULL);
2410 
2411 	/*
2412 	 * Walk through client link
2413 	 */
2414 	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
2415 	ASSERT(ct != NULL);
2416 
2417 	if (pip == NULL)
2418 		return ((mdi_pathinfo_t *)ct->ct_path_head);
2419 
2420 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link);
2421 }
2422 
2423 /*
2424  * For a phci, return the next available path to any client
2425  * Note: ditto mdi_get_next_phci_path()
2426  */
2427 mdi_pathinfo_t *
2428 mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
2429 {
2430 	mdi_phci_t *ph;
2431 
2432 	if (!MDI_PHCI(ph_dip))
2433 		return (NULL);
2434 
2435 	/*
2436 	 * Walk through pHCI link
2437 	 */
2438 	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
2439 	ASSERT(ph != NULL);
2440 
2441 	if (pip == NULL)
2442 		return ((mdi_pathinfo_t *)ph->ph_path_head);
2443 
2444 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
2445 }
2446 
2447 /*
2448  * mdi_hold_path():
2449  *		Hold the mdi_pathinfo node against unwanted unexpected free.
2450  * Return Values:
2451  *		None
2452  */
2453 void
2454 mdi_hold_path(mdi_pathinfo_t *pip)
2455 {
2456 	if (pip) {
2457 		MDI_PI_LOCK(pip);
2458 		MDI_PI_HOLD(pip);
2459 		MDI_PI_UNLOCK(pip);
2460 	}
2461 }
2462 
2463 
2464 /*
2465  * mdi_rele_path():
2466  *		Release the mdi_pathinfo node which was selected
2467  *		through mdi_select_path() mechanism or manually held by
2468  *		calling mdi_hold_path().
2469  * Return Values:
2470  *		None
2471  */
2472 void
2473 mdi_rele_path(mdi_pathinfo_t *pip)
2474 {
2475 	if (pip) {
2476 		MDI_PI_LOCK(pip);
2477 		MDI_PI_RELE(pip);
2478 		if (MDI_PI(pip)->pi_ref_cnt == 0) {
2479 			cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
2480 		}
2481 		MDI_PI_UNLOCK(pip);
2482 	}
2483 }
2484 
2485 /*
2486  * mdi_pi_lock():
2487  * 		Lock the mdi_pathinfo node.
2488  * Note:
2489  *		The caller should release the lock by calling mdi_pi_unlock()
2490  */
2491 void
2492 mdi_pi_lock(mdi_pathinfo_t *pip)
2493 {
2494 	ASSERT(pip != NULL);
2495 	if (pip) {
2496 		MDI_PI_LOCK(pip);
2497 	}
2498 }
2499 
2500 
2501 /*
2502  * mdi_pi_unlock():
2503  * 		Unlock the mdi_pathinfo node.
2504  * Note:
2505  *		The mdi_pathinfo node should have been locked with mdi_pi_lock()
2506  */
2507 void
2508 mdi_pi_unlock(mdi_pathinfo_t *pip)
2509 {
2510 	ASSERT(pip != NULL);
2511 	if (pip) {
2512 		MDI_PI_UNLOCK(pip);
2513 	}
2514 }
2515 
2516 /*
2517  * mdi_pi_find():
2518  *		Search the list of mdi_pathinfo nodes attached to the
2519  *		pHCI/Client device node whose path address matches "paddr".
2520  *		Returns a pointer to the mdi_pathinfo node if a matching node is
2521  *		found.
2522  * Return Values:
2523  *		mdi_pathinfo node handle
2524  *		NULL
2525  * Notes:
2526  *		Caller need not hold any locks to call this function.
2527  */
mdi_pathinfo_t *
mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
{
	mdi_phci_t		*ph;
	mdi_vhci_t		*vh;
	mdi_client_t		*ct;
	mdi_pathinfo_t		*pip = NULL;

	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: %s %s",
	    caddr ? caddr : "NULL", paddr ? paddr : "NULL"));
	/* A pHCI dip and a path unit-address are mandatory; caddr is not. */
	if ((pdip == NULL) || (paddr == NULL)) {
		return (NULL);
	}
	ph = i_devi_get_phci(pdip);
	if (ph == NULL) {
		/*
		 * Invalid pHCI device, Nothing more to do.
		 */
		MDI_DEBUG(2, (CE_WARN, pdip,
		    "!mdi_pi_find: invalid phci"));
		return (NULL);
	}

	vh = ph->ph_vhci;
	if (vh == NULL) {
		/*
		 * Invalid vHCI device, Nothing more to do.
		 */
		MDI_DEBUG(2, (CE_WARN, pdip,
		    "!mdi_pi_find: invalid vhci"));
		return (NULL);
	}

	/*
	 * Look for pathinfo node identified by paddr.
	 */
	if (caddr == NULL) {
		/*
		 * Find a mdi_pathinfo node under pHCI list for a matching
		 * unit address.
		 */
		MDI_PHCI_LOCK(ph);
		if (MDI_PHCI_IS_OFFLINE(ph)) {
			MDI_DEBUG(2, (CE_WARN, pdip,
			    "!mdi_pi_find: offline phci %p", (void *)ph));
			MDI_PHCI_UNLOCK(ph);
			return (NULL);
		}
		pip = (mdi_pathinfo_t *)ph->ph_path_head;

		/* Linear scan of the pHCI's path list for 'paddr'. */
		while (pip != NULL) {
			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
				break;
			}
			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		}
		MDI_PHCI_UNLOCK(ph);
		MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: found %p",
		    (void *)pip));
		return (pip);
	}

	/*
	 * XXX - Is the rest of the code in this function really necessary?
	 * The consumers of mdi_pi_find() can search for the desired pathinfo
	 * node by calling mdi_pi_find(pdip, NULL, paddr). Irrespective of
	 * whether the search is based on the pathinfo nodes attached to
	 * the pHCI or the client node, the result will be the same.
	 */

	/*
	 * Find the client device corresponding to 'caddr'
	 */
	MDI_VHCI_CLIENT_LOCK(vh);

	/*
	 * XXX - Passing NULL to the following function works as long as the
	 * the client addresses (caddr) are unique per vhci basis.
	 */
	ct = i_mdi_client_find(vh, NULL, caddr);
	if (ct == NULL) {
		/*
		 * Client not found, Obviously mdi_pathinfo node has not been
		 * created yet.
		 */
		MDI_VHCI_CLIENT_UNLOCK(vh);
		MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: client not "
		    "found for caddr %s", caddr ? caddr : "NULL"));
		return (NULL);
	}

	/*
	 * Hold the client lock and look for a mdi_pathinfo node with matching
	 * pHCI and paddr
	 */
	MDI_CLIENT_LOCK(ct);

	/*
	 * Release the global mutex as it is no more needed. Note: We always
	 * respect the locking order while acquiring.
	 */
	MDI_VHCI_CLIENT_UNLOCK(vh);

	/*
	 * Linear scan of the client's path list; match requires both the
	 * originating pHCI and the path unit-address to agree.
	 */
	pip = (mdi_pathinfo_t *)ct->ct_path_head;
	while (pip != NULL) {
		/*
		 * Compare the unit address
		 */
		if ((MDI_PI(pip)->pi_phci == ph) &&
		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
			break;
		}
		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
	}
	MDI_CLIENT_UNLOCK(ct);
	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_find: found:: %p", (void *)pip));
	return (pip);
}
2646 
2647 /*
2648  * mdi_pi_alloc():
2649  *		Allocate and initialize a new instance of a mdi_pathinfo node.
2650  *		The mdi_pathinfo node returned by this function identifies a
2651  *		unique device path is capable of having properties attached
2652  *		and passed to mdi_pi_online() to fully attach and online the
2653  *		path and client device node.
2654  *		The mdi_pathinfo node returned by this function must be
2655  *		destroyed using mdi_pi_free() if the path is no longer
2656  *		operational or if the caller fails to attach a client device
2657  *		node when calling mdi_pi_online(). The framework will not free
2658  *		the resources allocated.
2659  *		This function can be called from both interrupt and kernel
2660  *		contexts.  DDI_NOSLEEP flag should be used while calling
2661  *		from interrupt contexts.
2662  * Return Values:
2663  *		MDI_SUCCESS
2664  *		MDI_FAILURE
2665  *		MDI_NOMEM
2666  */
/*ARGSUSED*/
int
mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
    char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
{
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip = NULL;
	dev_info_t	*cdip;
	int		rv = MDI_NOMEM;
	int		path_allocated = 0;

	MDI_DEBUG(2, (CE_NOTE, pdip, "!mdi_pi_alloc_compatible: %s %s %s",
	    cname ? cname : "NULL", caddr ? caddr : "NULL",
	    paddr ? paddr : "NULL"));

	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
	    ret_pip == NULL) {
		/* Nothing more to do */
		return (MDI_FAILURE);
	}

	*ret_pip = NULL;

	/* No allocations on detaching pHCI */
	if (DEVI_IS_DETACHING(pdip)) {
		/* Invalid pHCI device, return failure */
		MDI_DEBUG(1, (CE_WARN, pdip,
		    "!mdi_pi_alloc: detaching pHCI=%p", (void *)pdip));
		return (MDI_FAILURE);
	}

	ph = i_devi_get_phci(pdip);
	/* Trip in debug kernels; guard against NULL in production. */
	ASSERT(ph != NULL);
	if (ph == NULL) {
		/* Invalid pHCI device, return failure */
		MDI_DEBUG(1, (CE_WARN, pdip,
		    "!mdi_pi_alloc: invalid pHCI=%p", (void *)pdip));
		return (MDI_FAILURE);
	}

	MDI_PHCI_LOCK(ph);
	vh = ph->ph_vhci;
	if (vh == NULL) {
		/* Invalid vHCI device, return failure */
		MDI_DEBUG(1, (CE_WARN, pdip,
		    "!mdi_pi_alloc: invalid vHCI=%p", (void *)pdip));
		MDI_PHCI_UNLOCK(ph);
		return (MDI_FAILURE);
	}

	if (MDI_PHCI_IS_READY(ph) == 0) {
		/*
		 * Do not allow new node creation when pHCI is in
		 * offline/suspended states
		 */
		MDI_DEBUG(1, (CE_WARN, pdip,
		    "mdi_pi_alloc: pHCI=%p is not ready", (void *)ph));
		MDI_PHCI_UNLOCK(ph);
		return (MDI_BUSY);
	}
	/*
	 * Mark the pHCI unstable while the path list is manipulated; it is
	 * marked stable again on the way out (see the 'fail' label path).
	 */
	MDI_PHCI_UNSTABLE(ph);
	MDI_PHCI_UNLOCK(ph);

	/* look for a matching client, create one if not found */
	MDI_VHCI_CLIENT_LOCK(vh);
	ct = i_mdi_client_find(vh, cname, caddr);
	if (ct == NULL) {
		/*
		 * NOTE(review): i_mdi_client_alloc() is assumed never to
		 * return NULL (only ASSERTed) — verify against its
		 * implementation before relying on this in new code.
		 */
		ct = i_mdi_client_alloc(vh, cname, caddr);
		ASSERT(ct != NULL);
	}

	if (ct->ct_dip == NULL) {
		/*
		 * Allocate a devinfo node
		 */
		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
		    compatible, ncompatible);
		if (ct->ct_dip == NULL) {
			/* rv is still MDI_NOMEM here; pip remains NULL */
			(void) i_mdi_client_free(vh, ct);
			goto fail;
		}
	}
	cdip = ct->ct_dip;

	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;

	/*
	 * Check whether a pathinfo already exists for this (pHCI, paddr)
	 * pair on the client's path list; if so it is reused.
	 */
	MDI_CLIENT_LOCK(ct);
	pip = (mdi_pathinfo_t *)ct->ct_path_head;
	while (pip != NULL) {
		/*
		 * Compare the unit address
		 */
		if ((MDI_PI(pip)->pi_phci == ph) &&
		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
			break;
		}
		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
	}
	MDI_CLIENT_UNLOCK(ct);

	if (pip == NULL) {
		/*
		 * This is a new path for this client device.  Allocate and
		 * initialize a new pathinfo node
		 */
		pip = i_mdi_pi_alloc(ph, paddr, ct);
		ASSERT(pip != NULL);
		path_allocated = 1;
	}
	rv = MDI_SUCCESS;

fail:
	/*
	 * Release the global mutex.
	 */
	MDI_VHCI_CLIENT_UNLOCK(vh);

	/*
	 * Mark the pHCI as stable
	 */
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_STABLE(ph);
	MDI_PHCI_UNLOCK(ph);
	*ret_pip = pip;

	MDI_DEBUG(2, (CE_NOTE, pdip,
	    "!mdi_pi_alloc_compatible: alloc %p", (void *)pip));

	/* Newly created paths are recorded in the vHCI's persistent cache. */
	if (path_allocated)
		vhcache_pi_add(vh->vh_config, MDI_PI(pip));

	return (rv);
}
2803 
2804 /*ARGSUSED*/
2805 int
2806 mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2807     int flags, mdi_pathinfo_t **ret_pip)
2808 {
2809 	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
2810 	    flags, ret_pip));
2811 }
2812 
2813 /*
2814  * i_mdi_pi_alloc():
2815  *		Allocate a mdi_pathinfo node and add to the pHCI path list
2816  * Return Values:
2817  *		mdi_pathinfo
2818  */
/*ARGSUSED*/
static mdi_pathinfo_t *
i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
{
	mdi_pathinfo_t	*pip;
	int		ct_circular;
	int		ph_circular;
	/*
	 * Static scratch buffer for forming the pathname below; its use is
	 * serialized by mdi_pathmap_mutex (taken before the buffer is
	 * written).
	 */
	static char	path[MAXPATHLEN];
	char		*path_persistent;
	int		path_instance;
	mod_hash_val_t	hv;

	ASSERT(MDI_VHCI_CLIENT_LOCKED(ph->ph_vhci));

	/* New nodes start life in INIT|TRANSIENT state. */
	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
	    MDI_PATHINFO_STATE_TRANSIENT;

	/* Inherit any disable conditions currently set on the pHCI. */
	if (MDI_PHCI_IS_USER_DISABLED(ph))
		MDI_PI_SET_USER_DISABLE(pip);

	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
		MDI_PI_SET_DRV_DISABLE_TRANS(pip);

	if (MDI_PHCI_IS_DRV_DISABLED(ph))
		MDI_PI_SET_DRV_DISABLE(pip);

	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
	MDI_PI(pip)->pi_client = ct;
	MDI_PI(pip)->pi_phci = ph;
	/* pi_addr is an owned copy; freed in i_mdi_pi_free(). */
	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);

	/*
	 * We form the "path" to the pathinfo node, and see if we have
	 * already allocated a 'path_instance' for that "path".  If so,
	 * we use the already allocated 'path_instance'.  If not, we
	 * allocate a new 'path_instance' and associate it with a copy of
	 * the "path" string (which is never freed). The association
	 * between a 'path_instance' this "path" string persists until
	 * reboot.
	 */
        mutex_enter(&mdi_pathmap_mutex);
	(void) ddi_pathname(ph->ph_dip, path);
	(void) sprintf(path + strlen(path), "/%s@%s",
	    ddi_node_name(ct->ct_dip), MDI_PI(pip)->pi_addr);
        if (mod_hash_find(mdi_pathmap_bypath, (mod_hash_key_t)path, &hv) == 0) {
                path_instance = (uint_t)(intptr_t)hv;
        } else {
		/* allocate a new 'path_instance' and persistent "path" */
		path_instance = mdi_pathmap_instance++;
		path_persistent = i_ddi_strdup(path, KM_SLEEP);
                (void) mod_hash_insert(mdi_pathmap_bypath,
                    (mod_hash_key_t)path_persistent,
                    (mod_hash_val_t)(intptr_t)path_instance);
		(void) mod_hash_insert(mdi_pathmap_byinstance,
		    (mod_hash_key_t)(intptr_t)path_instance,
		    (mod_hash_val_t)path_persistent);
        }
        mutex_exit(&mdi_pathmap_mutex);
	MDI_PI(pip)->pi_path_instance = path_instance;

	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
	ASSERT(MDI_PI(pip)->pi_prop != NULL);
	MDI_PI(pip)->pi_pprivate = NULL;
	MDI_PI(pip)->pi_cprivate = NULL;
	MDI_PI(pip)->pi_vprivate = NULL;
	MDI_PI(pip)->pi_client_link = NULL;
	MDI_PI(pip)->pi_phci_link = NULL;
	MDI_PI(pip)->pi_ref_cnt = 0;
	MDI_PI(pip)->pi_kstats = NULL;
	/* Paths default to preferred; see the load-balancing code. */
	MDI_PI(pip)->pi_preferred = 1;
	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Lock both dev_info nodes against changes in parallel.
	 *
	 * The ndi_devi_enter(Client), is atypical since the client is a leaf.
	 * This atypical operation is done to synchronize pathinfo nodes
	 * during devinfo snapshot (see di_register_pip) by 'pretending' that
	 * the pathinfo nodes are children of the Client.
	 */
	ndi_devi_enter(ct->ct_dip, &ct_circular);
	ndi_devi_enter(ph->ph_dip, &ph_circular);

	i_mdi_phci_add_path(ph, pip);
	i_mdi_client_add_path(ct, pip);

	ndi_devi_exit(ph->ph_dip, ph_circular);
	ndi_devi_exit(ct->ct_dip, ct_circular);

	return (pip);
}
2914 
2915 /*
2916  * mdi_pi_pathname_by_instance():
2917  *	Lookup of "path" by 'path_instance'. Return "path".
2918  *	NOTE: returned "path" remains valid forever (until reboot).
2919  */
2920 char *
2921 mdi_pi_pathname_by_instance(int path_instance)
2922 {
2923 	char		*path;
2924 	mod_hash_val_t	hv;
2925 
2926 	/* mdi_pathmap lookup of "path" by 'path_instance' */
2927 	mutex_enter(&mdi_pathmap_mutex);
2928 	if (mod_hash_find(mdi_pathmap_byinstance,
2929 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
2930 		path = (char *)hv;
2931 	else
2932 		path = NULL;
2933 	mutex_exit(&mdi_pathmap_mutex);
2934 	return (path);
2935 }
2936 
2937 /*
2938  * i_mdi_phci_add_path():
2939  * 		Add a mdi_pathinfo node to pHCI list.
2940  * Notes:
 *		Caller should hold the per-pHCI mutex
2942  */
2943 static void
2944 i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
2945 {
2946 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
2947 
2948 	MDI_PHCI_LOCK(ph);
2949 	if (ph->ph_path_head == NULL) {
2950 		ph->ph_path_head = pip;
2951 	} else {
2952 		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
2953 	}
2954 	ph->ph_path_tail = pip;
2955 	ph->ph_path_count++;
2956 	MDI_PHCI_UNLOCK(ph);
2957 }
2958 
2959 /*
2960  * i_mdi_client_add_path():
2961  *		Add mdi_pathinfo node to client list
2962  */
2963 static void
2964 i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
2965 {
2966 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
2967 
2968 	MDI_CLIENT_LOCK(ct);
2969 	if (ct->ct_path_head == NULL) {
2970 		ct->ct_path_head = pip;
2971 	} else {
2972 		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
2973 	}
2974 	ct->ct_path_tail = pip;
2975 	ct->ct_path_count++;
2976 	MDI_CLIENT_UNLOCK(ct);
2977 }
2978 
2979 /*
2980  * mdi_pi_free():
2981  *		Free the mdi_pathinfo node and also client device node if this
2982  *		is the last path to the device
2983  * Return Values:
2984  *		MDI_SUCCESS
2985  *		MDI_FAILURE
2986  *		MDI_BUSY
2987  */
/*ARGSUSED*/
int
mdi_pi_free(mdi_pathinfo_t *pip, int flags)
{
	int		rv = MDI_FAILURE;
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	int		(*f)();
	int		client_held = 0;

	MDI_PI_LOCK(pip);
	ph = MDI_PI(pip)->pi_phci;
	ASSERT(ph != NULL);
	if (ph == NULL) {
		/*
		 * Invalid pHCI device, return failure
		 */
		MDI_DEBUG(1, (CE_WARN, NULL,
		    "!mdi_pi_free: invalid pHCI pip=%p", (void *)pip));
		MDI_PI_UNLOCK(pip);
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;
	ASSERT(vh != NULL);
	if (vh == NULL) {
		/* Invalid vHCI device, return failure */
		MDI_DEBUG(1, (CE_WARN, NULL,
		    "!mdi_pi_free: invalid vHCI pip=%p", (void *)pip));
		MDI_PI_UNLOCK(pip);
		return (MDI_FAILURE);
	}

	ct = MDI_PI(pip)->pi_client;
	ASSERT(ct != NULL);
	if (ct == NULL) {
		/*
		 * Invalid Client device, return failure
		 */
		MDI_DEBUG(1, (CE_WARN, NULL,
		    "!mdi_pi_free: invalid client pip=%p", (void *)pip));
		MDI_PI_UNLOCK(pip);
		return (MDI_FAILURE);
	}

	/*
	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
	 * if the node state is either offline or init and the reference count
	 * is zero.
	 */
	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
	    MDI_PI_IS_INITING(pip))) {
		/*
		 * Node is busy
		 */
		MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
		    "!mdi_pi_free: pathinfo node is busy pip=%p", (void *)pip));
		MDI_PI_UNLOCK(pip);
		return (MDI_BUSY);
	}

	/*
	 * Drain outstanding holds: mdi_rele_path() broadcasts pi_ref_cv
	 * when the count hits zero.  Each wait is bounded at 60 seconds;
	 * a timeout without the count draining returns MDI_BUSY.
	 */
	while (MDI_PI(pip)->pi_ref_cnt != 0) {
		/*
		 * Give a chance for pending I/Os to complete.
		 */
		MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!mdi_pi_free: "
		    "%d cmds still pending on path: %p\n",
		    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
		    &MDI_PI(pip)->pi_mutex,
		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
			/*
			 * The timeout time reached without ref_cnt being zero
			 * being signaled.
			 */
			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip,
			    "!mdi_pi_free: "
			    "Timeout reached on path %p without the cond\n",
			    (void *)pip));
			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip,
			    "!mdi_pi_free: "
			    "%d cmds still pending on path: %p\n",
			    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
			MDI_PI_UNLOCK(pip);
			return (MDI_BUSY);
		}
	}
	/* Remember whether a power-management hold must be released later. */
	if (MDI_PI(pip)->pi_pm_held) {
		client_held = 1;
	}
	MDI_PI_UNLOCK(pip);

	/* Drop the path from the persistent vHCI cache up front. */
	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));

	MDI_CLIENT_LOCK(ct);

	/* Prevent further failovers till MDI_VHCI_CLIENT_LOCK is held */
	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);

	/*
	 * Wait till failover is complete before removing this node.
	 */
	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);

	/*
	 * Re-take the locks in the required order: the vHCI client lock
	 * must be acquired before the per-client lock, so the client lock
	 * is dropped and reacquired around it.
	 */
	MDI_CLIENT_UNLOCK(ct);
	MDI_VHCI_CLIENT_LOCK(vh);
	MDI_CLIENT_LOCK(ct);
	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);

	/*
	 * NOTE(review): when the path is still INITING, vo_pi_uninit is
	 * skipped and rv remains MDI_FAILURE, so the node is NOT freed and
	 * is re-added to the vhcache below — verify this is intended given
	 * that INITING paths pass the busy check above.
	 */
	if (!MDI_PI_IS_INITING(pip)) {
		f = vh->vh_ops->vo_pi_uninit;
		if (f != NULL) {
			rv = (*f)(vh->vh_dip, pip, 0);
		}
	}
	/*
	 * If vo_pi_uninit() completed successfully.
	 */
	if (rv == MDI_SUCCESS) {
		if (client_held) {
			MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_free "
			    "i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, 1);
		}
		i_mdi_pi_free(ph, pip, ct);
		if (ct->ct_path_count == 0) {
			/*
			 * Client lost its last path.
			 * Clean up the client device
			 */
			MDI_CLIENT_UNLOCK(ct);
			(void) i_mdi_client_free(ct->ct_vhci, ct);
			MDI_VHCI_CLIENT_UNLOCK(vh);
			return (rv);
		}
	}
	MDI_CLIENT_UNLOCK(ct);
	MDI_VHCI_CLIENT_UNLOCK(vh);

	/* Undo the early vhcache removal if the free did not happen. */
	if (rv == MDI_FAILURE)
		vhcache_pi_add(vh->vh_config, MDI_PI(pip));

	return (rv);
}
3134 
3135 /*
3136  * i_mdi_pi_free():
3137  *		Free the mdi_pathinfo node
3138  */
3139 static void
3140 i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
3141 {
3142 	int	ct_circular;
3143 	int	ph_circular;
3144 
3145 	ASSERT(MDI_CLIENT_LOCKED(ct));
3146 
3147 	/*
3148 	 * remove any per-path kstats
3149 	 */
3150 	i_mdi_pi_kstat_destroy(pip);
3151 
3152 	/* See comments in i_mdi_pi_alloc() */
3153 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3154 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3155 
3156 	i_mdi_client_remove_path(ct, pip);
3157 	i_mdi_phci_remove_path(ph, pip);
3158 
3159 	ndi_devi_exit(ph->ph_dip, ph_circular);
3160 	ndi_devi_exit(ct->ct_dip, ct_circular);
3161 
3162 	mutex_destroy(&MDI_PI(pip)->pi_mutex);
3163 	cv_destroy(&MDI_PI(pip)->pi_state_cv);
3164 	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
3165 	if (MDI_PI(pip)->pi_addr) {
3166 		kmem_free(MDI_PI(pip)->pi_addr,
3167 		    strlen(MDI_PI(pip)->pi_addr) + 1);
3168 		MDI_PI(pip)->pi_addr = NULL;
3169 	}
3170 
3171 	if (MDI_PI(pip)->pi_prop) {
3172 		(void) nvlist_free(MDI_PI(pip)->pi_prop);
3173 		MDI_PI(pip)->pi_prop = NULL;
3174 	}
3175 	kmem_free(pip, sizeof (struct mdi_pathinfo));
3176 }
3177 
3178 
3179 /*
3180  * i_mdi_phci_remove_path():
3181  * 		Remove a mdi_pathinfo node from pHCI list.
3182  * Notes:
3183  *		Caller should hold per-pHCI mutex
3184  */
3185 static void
3186 i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3187 {
3188 	mdi_pathinfo_t	*prev = NULL;
3189 	mdi_pathinfo_t	*path = NULL;
3190 
3191 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3192 
3193 	MDI_PHCI_LOCK(ph);
3194 	path = ph->ph_path_head;
3195 	while (path != NULL) {
3196 		if (path == pip) {
3197 			break;
3198 		}
3199 		prev = path;
3200 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3201 	}
3202 
3203 	if (path) {
3204 		ph->ph_path_count--;
3205 		if (prev) {
3206 			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
3207 		} else {
3208 			ph->ph_path_head =
3209 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3210 		}
3211 		if (ph->ph_path_tail == path) {
3212 			ph->ph_path_tail = prev;
3213 		}
3214 	}
3215 
3216 	/*
3217 	 * Clear the pHCI link
3218 	 */
3219 	MDI_PI(pip)->pi_phci_link = NULL;
3220 	MDI_PI(pip)->pi_phci = NULL;
3221 	MDI_PHCI_UNLOCK(ph);
3222 }
3223 
3224 /*
3225  * i_mdi_client_remove_path():
3226  * 		Remove a mdi_pathinfo node from client path list.
3227  */
3228 static void
3229 i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3230 {
3231 	mdi_pathinfo_t	*prev = NULL;
3232 	mdi_pathinfo_t	*path;
3233 
3234 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3235 
3236 	ASSERT(MDI_CLIENT_LOCKED(ct));
3237 	path = ct->ct_path_head;
3238 	while (path != NULL) {
3239 		if (path == pip) {
3240 			break;
3241 		}
3242 		prev = path;
3243 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3244 	}
3245 
3246 	if (path) {
3247 		ct->ct_path_count--;
3248 		if (prev) {
3249 			MDI_PI(prev)->pi_client_link =
3250 			    MDI_PI(path)->pi_client_link;
3251 		} else {
3252 			ct->ct_path_head =
3253 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3254 		}
3255 		if (ct->ct_path_tail == path) {
3256 			ct->ct_path_tail = prev;
3257 		}
3258 		if (ct->ct_path_last == path) {
3259 			ct->ct_path_last = ct->ct_path_head;
3260 		}
3261 	}
3262 	MDI_PI(pip)->pi_client_link = NULL;
3263 	MDI_PI(pip)->pi_client = NULL;
3264 }
3265 
3266 /*
3267  * i_mdi_pi_state_change():
3268  *		online a mdi_pathinfo node
3269  *
3270  * Return Values:
3271  *		MDI_SUCCESS
3272  *		MDI_FAILURE
3273  */
3274 /*ARGSUSED*/
3275 static int
3276 i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
3277 {
3278 	int		rv = MDI_SUCCESS;
3279 	mdi_vhci_t	*vh;
3280 	mdi_phci_t	*ph;
3281 	mdi_client_t	*ct;
3282 	int		(*f)();
3283 	dev_info_t	*cdip;
3284 
3285 	MDI_PI_LOCK(pip);
3286 
3287 	ph = MDI_PI(pip)->pi_phci;
3288 	ASSERT(ph);
3289 	if (ph == NULL) {
3290 		/*
3291 		 * Invalid pHCI device, fail the request
3292 		 */
3293 		MDI_PI_UNLOCK(pip);
3294 		MDI_DEBUG(1, (CE_WARN, NULL,
3295 		    "!mdi_pi_state_change: invalid phci pip=%p", (void *)pip));
3296 		return (MDI_FAILURE);
3297 	}
3298 
3299 	vh = ph->ph_vhci;
3300 	ASSERT(vh);
3301 	if (vh == NULL) {
3302 		/*
3303 		 * Invalid vHCI device, fail the request
3304 		 */
3305 		MDI_PI_UNLOCK(pip);
3306 		MDI_DEBUG(1, (CE_WARN, NULL,
3307 		    "!mdi_pi_state_change: invalid vhci pip=%p", (void *)pip));
3308 		return (MDI_FAILURE);
3309 	}
3310 
3311 	ct = MDI_PI(pip)->pi_client;
3312 	ASSERT(ct != NULL);
3313 	if (ct == NULL) {
3314 		/*
3315 		 * Invalid client device, fail the request
3316 		 */
3317 		MDI_PI_UNLOCK(pip);
3318 		MDI_DEBUG(1, (CE_WARN, NULL,
3319 		    "!mdi_pi_state_change: invalid client pip=%p",
3320 		    (void *)pip));
3321 		return (MDI_FAILURE);
3322 	}
3323 
3324 	/*
3325 	 * If this path has not been initialized yet, Callback vHCI driver's
3326 	 * pathinfo node initialize entry point
3327 	 */
3328 
3329 	if (MDI_PI_IS_INITING(pip)) {
3330 		MDI_PI_UNLOCK(pip);
3331 		f = vh->vh_ops->vo_pi_init;
3332 		if (f != NULL) {
3333 			rv = (*f)(vh->vh_dip, pip, 0);
3334 			if (rv != MDI_SUCCESS) {
3335 				MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
3336 				    "!vo_pi_init: failed vHCI=0x%p, pip=0x%p",
3337 				    (void *)vh, (void *)pip));
3338 				return (MDI_FAILURE);
3339 			}
3340 		}
3341 		MDI_PI_LOCK(pip);
3342 		MDI_PI_CLEAR_TRANSIENT(pip);
3343 	}
3344 
3345 	/*
3346 	 * Do not allow state transition when pHCI is in offline/suspended
3347 	 * states
3348 	 */
3349 	i_mdi_phci_lock(ph, pip);
3350 	if (MDI_PHCI_IS_READY(ph) == 0) {
3351 		MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
3352 		    "!mdi_pi_state_change: pHCI not ready, pHCI=%p",
3353 		    (void *)ph));
3354 		MDI_PI_UNLOCK(pip);
3355 		i_mdi_phci_unlock(ph);
3356 		return (MDI_BUSY);
3357 	}
3358 	MDI_PHCI_UNSTABLE(ph);
3359 	i_mdi_phci_unlock(ph);
3360 
3361 	/*
3362 	 * Check if mdi_pathinfo state is in transient state.
3363 	 * If yes, offlining is in progress and wait till transient state is
3364 	 * cleared.
3365 	 */
3366 	if (MDI_PI_IS_TRANSIENT(pip)) {
3367 		while (MDI_PI_IS_TRANSIENT(pip)) {
3368 			cv_wait(&MDI_PI(pip)->pi_state_cv,
3369 			    &MDI_PI(pip)->pi_mutex);
3370 		}
3371 	}
3372 
3373 	/*
3374 	 * Grab the client lock in reverse order sequence and release the
3375 	 * mdi_pathinfo mutex.
3376 	 */
3377 	i_mdi_client_lock(ct, pip);
3378 	MDI_PI_UNLOCK(pip);
3379 
3380 	/*
3381 	 * Wait till failover state is cleared
3382 	 */
3383 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3384 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3385 
3386 	/*
3387 	 * Mark the mdi_pathinfo node state as transient
3388 	 */
3389 	MDI_PI_LOCK(pip);
3390 	switch (state) {
3391 	case MDI_PATHINFO_STATE_ONLINE:
3392 		MDI_PI_SET_ONLINING(pip);
3393 		break;
3394 
3395 	case MDI_PATHINFO_STATE_STANDBY:
3396 		MDI_PI_SET_STANDBYING(pip);
3397 		break;
3398 
3399 	case MDI_PATHINFO_STATE_FAULT:
3400 		/*
3401 		 * Mark the pathinfo state as FAULTED
3402 		 */
3403 		MDI_PI_SET_FAULTING(pip);
3404 		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
3405 		break;
3406 
3407 	case MDI_PATHINFO_STATE_OFFLINE:
3408 		/*
3409 		 * ndi_devi_offline() cannot hold pip or ct locks.
3410 		 */
3411 		MDI_PI_UNLOCK(pip);
3412 		/*
3413 		 * Don't offline the client dev_info node unless we have
3414 		 * no available paths left at all.
3415 		 */
3416 		cdip = ct->ct_dip;
3417 		if ((flag & NDI_DEVI_REMOVE) &&
3418 		    (ct->ct_path_count == 1)) {
3419 			i_mdi_client_unlock(ct);
3420 			rv = ndi_devi_offline(cdip, 0);
3421 			if (rv != NDI_SUCCESS) {
3422 				/*
3423 				 * Convert to MDI error code
3424 				 */
3425 				switch (rv) {
3426 				case NDI_BUSY:
3427 					rv = MDI_BUSY;
3428 					break;
3429 				default:
3430 					rv = MDI_FAILURE;
3431 					break;
3432 				}
3433 				goto state_change_exit;
3434 			} else {
3435 				i_mdi_client_lock(ct, NULL);
3436 			}
3437 		}
3438 		/*
3439 		 * Mark the mdi_pathinfo node state as transient
3440 		 */
3441 		MDI_PI_LOCK(pip);
3442 		MDI_PI_SET_OFFLINING(pip);
3443 		break;
3444 	}
3445 	MDI_PI_UNLOCK(pip);
3446 	MDI_CLIENT_UNSTABLE(ct);
3447 	i_mdi_client_unlock(ct);
3448 
3449 	f = vh->vh_ops->vo_pi_state_change;
3450 	if (f != NULL)
3451 		rv = (*f)(vh->vh_dip, pip, state, 0, flag);
3452 
3453 	MDI_CLIENT_LOCK(ct);
3454 	MDI_PI_LOCK(pip);
3455 	if (rv == MDI_NOT_SUPPORTED) {
3456 		MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
3457 	}
3458 	if (rv != MDI_SUCCESS) {
3459 		MDI_DEBUG(2, (CE_WARN, ct->ct_dip,
3460 		    "!vo_pi_state_change: failed rv = %x", rv));
3461 	}
3462 	if (MDI_PI_IS_TRANSIENT(pip)) {
3463 		if (rv == MDI_SUCCESS) {
3464 			MDI_PI_CLEAR_TRANSIENT(pip);
3465 		} else {
3466 			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
3467 		}
3468 	}
3469 
3470 	/*
3471 	 * Wake anyone waiting for this mdi_pathinfo node
3472 	 */
3473 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3474 	MDI_PI_UNLOCK(pip);
3475 
3476 	/*
3477 	 * Mark the client device as stable
3478 	 */
3479 	MDI_CLIENT_STABLE(ct);
3480 	if (rv == MDI_SUCCESS) {
3481 		if (ct->ct_unstable == 0) {
3482 			cdip = ct->ct_dip;
3483 
3484 			/*
3485 			 * Onlining the mdi_pathinfo node will impact the
3486 			 * client state Update the client and dev_info node
3487 			 * state accordingly
3488 			 */
3489 			rv = NDI_SUCCESS;
3490 			i_mdi_client_update_state(ct);
3491 			switch (MDI_CLIENT_STATE(ct)) {
3492 			case MDI_CLIENT_STATE_OPTIMAL:
3493 			case MDI_CLIENT_STATE_DEGRADED:
3494 				if (cdip && !i_ddi_devi_attached(cdip) &&
3495 				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
3496 				    (state == MDI_PATHINFO_STATE_STANDBY))) {
3497 
3498 					/*
3499 					 * Must do ndi_devi_online() through
3500 					 * hotplug thread for deferred
3501 					 * attach mechanism to work
3502 					 */
3503 					MDI_CLIENT_UNLOCK(ct);
3504 					rv = ndi_devi_online(cdip, 0);
3505 					MDI_CLIENT_LOCK(ct);
3506 					if ((rv != NDI_SUCCESS) &&
3507 					    (MDI_CLIENT_STATE(ct) ==
3508 					    MDI_CLIENT_STATE_DEGRADED)) {
3509 						/*
3510 						 * ndi_devi_online failed.
3511 						 * Reset client flags to
3512 						 * offline.
3513 						 */
3514 						MDI_DEBUG(1, (CE_WARN, cdip,
3515 						    "!ndi_devi_online: failed "
3516 						    " Error: %x", rv));
3517 						MDI_CLIENT_SET_OFFLINE(ct);
3518 					}
3519 					if (rv != NDI_SUCCESS) {
3520 						/* Reset the path state */
3521 						MDI_PI_LOCK(pip);
3522 						MDI_PI(pip)->pi_state =
3523 						    MDI_PI_OLD_STATE(pip);
3524 						MDI_PI_UNLOCK(pip);
3525 					}
3526 				}
3527 				break;
3528 
3529 			case MDI_CLIENT_STATE_FAILED:
3530 				/*
3531 				 * This is the last path case for
3532 				 * non-user initiated events.
3533 				 */
3534 				if (((flag & NDI_DEVI_REMOVE) == 0) &&
3535 				    cdip && (i_ddi_node_state(cdip) >=
3536 				    DS_INITIALIZED)) {
3537 					MDI_CLIENT_UNLOCK(ct);
3538 					rv = ndi_devi_offline(cdip, 0);
3539 					MDI_CLIENT_LOCK(ct);
3540 
3541 					if (rv != NDI_SUCCESS) {
3542 						/*
3543 						 * ndi_devi_offline failed.
3544 						 * Reset client flags to
3545 						 * online as the path could not
3546 						 * be offlined.
3547 						 */
3548 						MDI_DEBUG(1, (CE_WARN, cdip,
3549 						    "!ndi_devi_offline: failed "
3550 						    " Error: %x", rv));
3551 						MDI_CLIENT_SET_ONLINE(ct);
3552 					}
3553 				}
3554 				break;
3555 			}
3556 			/*
3557 			 * Convert to MDI error code
3558 			 */
3559 			switch (rv) {
3560 			case NDI_SUCCESS:
3561 				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3562 				i_mdi_report_path_state(ct, pip);
3563 				rv = MDI_SUCCESS;
3564 				break;
3565 			case NDI_BUSY:
3566 				rv = MDI_BUSY;
3567 				break;
3568 			default:
3569 				rv = MDI_FAILURE;
3570 				break;
3571 			}
3572 		}
3573 	}
3574 	MDI_CLIENT_UNLOCK(ct);
3575 
3576 state_change_exit:
3577 	/*
3578 	 * Mark the pHCI as stable again.
3579 	 */
3580 	MDI_PHCI_LOCK(ph);
3581 	MDI_PHCI_STABLE(ph);
3582 	MDI_PHCI_UNLOCK(ph);
3583 	return (rv);
3584 }
3585 
3586 /*
3587  * mdi_pi_online():
3588  *		Place the path_info node in the online state.  The path is
3589  *		now available to be selected by mdi_select_path() for
3590  *		transporting I/O requests to client devices.
3591  * Return Values:
3592  *		MDI_SUCCESS
3593  *		MDI_FAILURE
3594  */
3595 int
3596 mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3597 {
3598 	mdi_client_t	*ct = MDI_PI(pip)->pi_client;
3599 	int		client_held = 0;
3600 	int		rv;
3601 	int		se_flag;
3602 	int		kmem_flag;
3603 
3604 	ASSERT(ct != NULL);
3605 	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
3606 	if (rv != MDI_SUCCESS)
3607 		return (rv);
3608 
3609 	MDI_PI_LOCK(pip);
3610 	if (MDI_PI(pip)->pi_pm_held == 0) {
3611 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_online "
3612 		    "i_mdi_pm_hold_pip %p\n", (void *)pip));
3613 		i_mdi_pm_hold_pip(pip);
3614 		client_held = 1;
3615 	}
3616 	MDI_PI_UNLOCK(pip);
3617 
3618 	if (client_held) {
3619 		MDI_CLIENT_LOCK(ct);
3620 		if (ct->ct_power_cnt == 0) {
3621 			rv = i_mdi_power_all_phci(ct);
3622 		}
3623 
3624 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_online "
3625 		    "i_mdi_pm_hold_client %p\n", (void *)ct));
3626 		i_mdi_pm_hold_client(ct, 1);
3627 		MDI_CLIENT_UNLOCK(ct);
3628 	}
3629 
3630 	/* determine interrupt context */
3631 	se_flag = (servicing_interrupt()) ? SE_NOSLEEP : SE_SLEEP;
3632 	kmem_flag = (se_flag == SE_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
3633 
3634 	/* A new path is online.  Invalidate DINFOCACHE snap shot. */
3635 	i_ddi_di_cache_invalidate(kmem_flag);
3636 
3637 	return (rv);
3638 }
3639 
3640 /*
3641  * mdi_pi_standby():
3642  *		Place the mdi_pathinfo node in standby state
3643  *
3644  * Return Values:
3645  *		MDI_SUCCESS
3646  *		MDI_FAILURE
3647  */
3648 int
3649 mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
3650 {
3651 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
3652 }
3653 
3654 /*
3655  * mdi_pi_fault():
3656  *		Place the mdi_pathinfo node in fault'ed state
3657  * Return Values:
3658  *		MDI_SUCCESS
3659  *		MDI_FAILURE
3660  */
3661 int
3662 mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
3663 {
3664 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
3665 }
3666 
3667 /*
3668  * mdi_pi_offline():
3669  *		Offline a mdi_pathinfo node.
3670  * Return Values:
3671  *		MDI_SUCCESS
3672  *		MDI_FAILURE
3673  */
3674 int
3675 mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3676 {
3677 	int	ret, client_held = 0;
3678 	mdi_client_t	*ct;
3679 	int		se_flag;
3680 	int		kmem_flag;
3681 
3682 	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
3683 
3684 	if (ret == MDI_SUCCESS) {
3685 		MDI_PI_LOCK(pip);
3686 		if (MDI_PI(pip)->pi_pm_held) {
3687 			client_held = 1;
3688 		}
3689 		MDI_PI_UNLOCK(pip);
3690 
3691 		if (client_held) {
3692 			ct = MDI_PI(pip)->pi_client;
3693 			MDI_CLIENT_LOCK(ct);
3694 			MDI_DEBUG(4, (CE_NOTE, ct->ct_dip,
3695 			    "mdi_pi_offline i_mdi_pm_rele_client\n"));
3696 			i_mdi_pm_rele_client(ct, 1);
3697 			MDI_CLIENT_UNLOCK(ct);
3698 		}
3699 
3700 		/* determine interrupt context */
3701 		se_flag = (servicing_interrupt()) ? SE_NOSLEEP : SE_SLEEP;
3702 		kmem_flag = (se_flag == SE_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
3703 
3704 		/* pathinfo is offlined. update DINFOCACHE. */
3705 		i_ddi_di_cache_invalidate(kmem_flag);
3706 	}
3707 
3708 	return (ret);
3709 }
3710 
3711 /*
3712  * i_mdi_pi_offline():
3713  *		Offline a mdi_pathinfo node and call the vHCI driver's callback
3714  */
3715 static int
3716 i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3717 {
3718 	dev_info_t	*vdip = NULL;
3719 	mdi_vhci_t	*vh = NULL;
3720 	mdi_client_t	*ct = NULL;
3721 	int		(*f)();
3722 	int		rv;
3723 
3724 	MDI_PI_LOCK(pip);
3725 	ct = MDI_PI(pip)->pi_client;
3726 	ASSERT(ct != NULL);
3727 
3728 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3729 		/*
3730 		 * Give a chance for pending I/Os to complete.
3731 		 */
3732 		MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3733 		    "%d cmds still pending on path: %p\n",
3734 		    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
3735 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
3736 		    &MDI_PI(pip)->pi_mutex,
3737 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
3738 			/*
3739 			 * The timeout time reached without ref_cnt being zero
3740 			 * being signaled.
3741 			 */
3742 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3743 			    "Timeout reached on path %p without the cond\n",
3744 			    (void *)pip));
3745 			MDI_DEBUG(1, (CE_NOTE, ct->ct_dip, "!i_mdi_pi_offline: "
3746 			    "%d cmds still pending on path: %p\n",
3747 			    MDI_PI(pip)->pi_ref_cnt, (void *)pip));
3748 		}
3749 	}
3750 	vh = ct->ct_vhci;
3751 	vdip = vh->vh_dip;
3752 
3753 	/*
3754 	 * Notify vHCI that has registered this event
3755 	 */
3756 	ASSERT(vh->vh_ops);
3757 	f = vh->vh_ops->vo_pi_state_change;
3758 
3759 	if (f != NULL) {
3760 		MDI_PI_UNLOCK(pip);
3761 		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
3762 		    flags)) != MDI_SUCCESS) {
3763 			MDI_DEBUG(1, (CE_WARN, ct->ct_dip,
3764 			    "!vo_path_offline failed "
3765 			    "vdip %p, pip %p", (void *)vdip, (void *)pip));
3766 		}
3767 		MDI_PI_LOCK(pip);
3768 	}
3769 
3770 	/*
3771 	 * Set the mdi_pathinfo node state and clear the transient condition
3772 	 */
3773 	MDI_PI_SET_OFFLINE(pip);
3774 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3775 	MDI_PI_UNLOCK(pip);
3776 
3777 	MDI_CLIENT_LOCK(ct);
3778 	if (rv == MDI_SUCCESS) {
3779 		if (ct->ct_unstable == 0) {
3780 			dev_info_t	*cdip = ct->ct_dip;
3781 
3782 			/*
3783 			 * Onlining the mdi_pathinfo node will impact the
3784 			 * client state Update the client and dev_info node
3785 			 * state accordingly
3786 			 */
3787 			i_mdi_client_update_state(ct);
3788 			rv = NDI_SUCCESS;
3789 			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
3790 				if (cdip &&
3791 				    (i_ddi_node_state(cdip) >=
3792 				    DS_INITIALIZED)) {
3793 					MDI_CLIENT_UNLOCK(ct);
3794 					rv = ndi_devi_offline(cdip, 0);
3795 					MDI_CLIENT_LOCK(ct);
3796 					if (rv != NDI_SUCCESS) {
3797 						/*
3798 						 * ndi_devi_offline failed.
3799 						 * Reset client flags to
3800 						 * online.
3801 						 */
3802 						MDI_DEBUG(4, (CE_WARN, cdip,
3803 						    "!ndi_devi_offline: failed "
3804 						    " Error: %x", rv));
3805 						MDI_CLIENT_SET_ONLINE(ct);
3806 					}
3807 				}
3808 			}
3809 			/*
3810 			 * Convert to MDI error code
3811 			 */
3812 			switch (rv) {
3813 			case NDI_SUCCESS:
3814 				rv = MDI_SUCCESS;
3815 				break;
3816 			case NDI_BUSY:
3817 				rv = MDI_BUSY;
3818 				break;
3819 			default:
3820 				rv = MDI_FAILURE;
3821 				break;
3822 			}
3823 		}
3824 		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3825 		i_mdi_report_path_state(ct, pip);
3826 	}
3827 
3828 	MDI_CLIENT_UNLOCK(ct);
3829 
3830 	/*
3831 	 * Change in the mdi_pathinfo node state will impact the client state
3832 	 */
3833 	MDI_DEBUG(2, (CE_NOTE, NULL, "!i_mdi_pi_offline ct = %p pip = %p",
3834 	    (void *)ct, (void *)pip));
3835 	return (rv);
3836 }
3837 
3838 
3839 /*
3840  * mdi_pi_get_addr():
3841  *		Get the unit address associated with a mdi_pathinfo node
3842  *
3843  * Return Values:
3844  *		char *
3845  */
3846 char *
3847 mdi_pi_get_addr(mdi_pathinfo_t *pip)
3848 {
3849 	if (pip == NULL)
3850 		return (NULL);
3851 
3852 	return (MDI_PI(pip)->pi_addr);
3853 }
3854 
3855 /*
3856  * mdi_pi_get_path_instance():
3857  *		Get the 'path_instance' of a mdi_pathinfo node
3858  *
3859  * Return Values:
3860  *		path_instance
3861  */
3862 int
3863 mdi_pi_get_path_instance(mdi_pathinfo_t *pip)
3864 {
3865 	if (pip == NULL)
3866 		return (0);
3867 
3868 	return (MDI_PI(pip)->pi_path_instance);
3869 }
3870 
3871 /*
3872  * mdi_pi_pathname():
3873  *		Return pointer to path to pathinfo node.
3874  */
3875 char *
3876 mdi_pi_pathname(mdi_pathinfo_t *pip)
3877 {
3878 	if (pip == NULL)
3879 		return (NULL);
3880 	return (mdi_pi_pathname_by_instance(mdi_pi_get_path_instance(pip)));
3881 }
3882 
3883 char *
3884 mdi_pi_pathname_obp(mdi_pathinfo_t *pip, char *path)
3885 {
3886 	char *obp_path = NULL;
3887 	if ((pip == NULL) || (path == NULL))
3888 		return (NULL);
3889 
3890 	if (mdi_prop_lookup_string(pip, "obp-path", &obp_path) == MDI_SUCCESS) {
3891 		(void) strcpy(path, obp_path);
3892 		(void) mdi_prop_free(obp_path);
3893 	} else {
3894 		path = NULL;
3895 	}
3896 	return (path);
3897 }
3898 
3899 int
3900 mdi_pi_pathname_obp_set(mdi_pathinfo_t *pip, char *component)
3901 {
3902 	dev_info_t *pdip;
3903 	char obp_path[MAXPATHLEN];
3904 
3905 	if (pip == NULL)
3906 		return (MDI_FAILURE);
3907 	bzero(obp_path, sizeof (obp_path));
3908 
3909 	pdip = mdi_pi_get_phci(pip);
3910 	if (pdip == NULL)
3911 		return (MDI_FAILURE);
3912 
3913 	if (ddi_pathname_obp(pdip, obp_path) == NULL) {
3914 		(void) ddi_pathname(pdip, obp_path);
3915 	}
3916 
3917 	if (component) {
3918 		(void) strncat(obp_path, "/", sizeof (obp_path));
3919 		(void) strncat(obp_path, component, sizeof (obp_path));
3920 	}
3921 
3922 	return (mdi_prop_update_string(pip, "obp-path", obp_path));
3923 }
3924 
3925 /*
3926  * mdi_pi_get_client():
3927  *		Get the client devinfo associated with a mdi_pathinfo node
3928  *
3929  * Return Values:
3930  *		Handle to client device dev_info node
3931  */
3932 dev_info_t *
3933 mdi_pi_get_client(mdi_pathinfo_t *pip)
3934 {
3935 	dev_info_t	*dip = NULL;
3936 	if (pip) {
3937 		dip = MDI_PI(pip)->pi_client->ct_dip;
3938 	}
3939 	return (dip);
3940 }
3941 
3942 /*
3943  * mdi_pi_get_phci():
3944  *		Get the pHCI devinfo associated with the mdi_pathinfo node
3945  * Return Values:
3946  *		Handle to dev_info node
3947  */
3948 dev_info_t *
3949 mdi_pi_get_phci(mdi_pathinfo_t *pip)
3950 {
3951 	dev_info_t	*dip = NULL;
3952 	if (pip) {
3953 		dip = MDI_PI(pip)->pi_phci->ph_dip;
3954 	}
3955 	return (dip);
3956 }
3957 
3958 /*
3959  * mdi_pi_get_client_private():
3960  *		Get the client private information associated with the
3961  *		mdi_pathinfo node
3962  */
3963 void *
3964 mdi_pi_get_client_private(mdi_pathinfo_t *pip)
3965 {
3966 	void *cprivate = NULL;
3967 	if (pip) {
3968 		cprivate = MDI_PI(pip)->pi_cprivate;
3969 	}
3970 	return (cprivate);
3971 }
3972 
3973 /*
3974  * mdi_pi_set_client_private():
3975  *		Set the client private information in the mdi_pathinfo node
3976  */
3977 void
3978 mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
3979 {
3980 	if (pip) {
3981 		MDI_PI(pip)->pi_cprivate = priv;
3982 	}
3983 }
3984 
3985 /*
3986  * mdi_pi_get_phci_private():
3987  *		Get the pHCI private information associated with the
3988  *		mdi_pathinfo node
3989  */
3990 caddr_t
3991 mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
3992 {
3993 	caddr_t	pprivate = NULL;
3994 	if (pip) {
3995 		pprivate = MDI_PI(pip)->pi_pprivate;
3996 	}
3997 	return (pprivate);
3998 }
3999 
4000 /*
4001  * mdi_pi_set_phci_private():
4002  *		Set the pHCI private information in the mdi_pathinfo node
4003  */
4004 void
4005 mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
4006 {
4007 	if (pip) {
4008 		MDI_PI(pip)->pi_pprivate = priv;
4009 	}
4010 }
4011 
4012 /*
4013  * mdi_pi_get_state():
4014  *		Get the mdi_pathinfo node state. Transient states are internal
4015  *		and not provided to the users
4016  */
4017 mdi_pathinfo_state_t
4018 mdi_pi_get_state(mdi_pathinfo_t *pip)
4019 {
4020 	mdi_pathinfo_state_t    state = MDI_PATHINFO_STATE_INIT;
4021 
4022 	if (pip) {
4023 		if (MDI_PI_IS_TRANSIENT(pip)) {
4024 			/*
4025 			 * mdi_pathinfo is in state transition.  Return the
4026 			 * last good state.
4027 			 */
4028 			state = MDI_PI_OLD_STATE(pip);
4029 		} else {
4030 			state = MDI_PI_STATE(pip);
4031 		}
4032 	}
4033 	return (state);
4034 }
4035 
4036 /*
4037  * Note that the following function needs to be the new interface for
4038  * mdi_pi_get_state when mpxio gets integrated to ON.
4039  */
4040 int
4041 mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
4042 		uint32_t *ext_state)
4043 {
4044 	*state = MDI_PATHINFO_STATE_INIT;
4045 
4046 	if (pip) {
4047 		if (MDI_PI_IS_TRANSIENT(pip)) {
4048 			/*
4049 			 * mdi_pathinfo is in state transition.  Return the
4050 			 * last good state.
4051 			 */
4052 			*state = MDI_PI_OLD_STATE(pip);
4053 			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
4054 		} else {
4055 			*state = MDI_PI_STATE(pip);
4056 			*ext_state = MDI_PI_EXT_STATE(pip);
4057 		}
4058 	}
4059 	return (MDI_SUCCESS);
4060 }
4061 
4062 /*
4063  * mdi_pi_get_preferred:
4064  *	Get the preferred path flag
4065  */
4066 int
4067 mdi_pi_get_preferred(mdi_pathinfo_t *pip)
4068 {
4069 	if (pip) {
4070 		return (MDI_PI(pip)->pi_preferred);
4071 	}
4072 	return (0);
4073 }
4074 
4075 /*
4076  * mdi_pi_set_preferred:
4077  *	Set the preferred path flag
4078  */
4079 void
4080 mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
4081 {
4082 	if (pip) {
4083 		MDI_PI(pip)->pi_preferred = preferred;
4084 	}
4085 }
4086 
4087 /*
4088  * mdi_pi_set_state():
4089  *		Set the mdi_pathinfo node state
4090  */
4091 void
4092 mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
4093 {
4094 	uint32_t	ext_state;
4095 
4096 	if (pip) {
4097 		ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
4098 		MDI_PI(pip)->pi_state = state;
4099 		MDI_PI(pip)->pi_state |= ext_state;
4100 	}
4101 }
4102 
4103 /*
4104  * Property functions:
4105  */
4106 int
4107 i_map_nvlist_error_to_mdi(int val)
4108 {
4109 	int rv;
4110 
4111 	switch (val) {
4112 	case 0:
4113 		rv = DDI_PROP_SUCCESS;
4114 		break;
4115 	case EINVAL:
4116 	case ENOTSUP:
4117 		rv = DDI_PROP_INVAL_ARG;
4118 		break;
4119 	case ENOMEM:
4120 		rv = DDI_PROP_NO_MEMORY;
4121 		break;
4122 	default:
4123 		rv = DDI_PROP_NOT_FOUND;
4124 		break;
4125 	}
4126 	return (rv);
4127 }
4128 
4129 /*
4130  * mdi_pi_get_next_prop():
4131  * 		Property walk function.  The caller should hold mdi_pi_lock()
4132  *		and release by calling mdi_pi_unlock() at the end of walk to
4133  *		get a consistent value.
4134  */
4135 nvpair_t *
4136 mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
4137 {
4138 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4139 		return (NULL);
4140 	}
4141 	ASSERT(MDI_PI_LOCKED(pip));
4142 	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
4143 }
4144 
4145 /*
4146  * mdi_prop_remove():
4147  * 		Remove the named property from the named list.
4148  */
4149 int
4150 mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
4151 {
4152 	if (pip == NULL) {
4153 		return (DDI_PROP_NOT_FOUND);
4154 	}
4155 	ASSERT(!MDI_PI_LOCKED(pip));
4156 	MDI_PI_LOCK(pip);
4157 	if (MDI_PI(pip)->pi_prop == NULL) {
4158 		MDI_PI_UNLOCK(pip);
4159 		return (DDI_PROP_NOT_FOUND);
4160 	}
4161 	if (name) {
4162 		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
4163 	} else {
4164 		char		nvp_name[MAXNAMELEN];
4165 		nvpair_t	*nvp;
4166 		nvp = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
4167 		while (nvp) {
4168 			nvpair_t	*next;
4169 			next = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, nvp);
4170 			(void) snprintf(nvp_name, MAXNAMELEN, "%s",
4171 			    nvpair_name(nvp));
4172 			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
4173 			    nvp_name);
4174 			nvp = next;
4175 		}
4176 	}
4177 	MDI_PI_UNLOCK(pip);
4178 	return (DDI_PROP_SUCCESS);
4179 }
4180 
4181 /*
4182  * mdi_prop_size():
4183  * 		Get buffer size needed to pack the property data.
4184  * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
4185  *		buffer size.
4186  */
4187 int
4188 mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
4189 {
4190 	int	rv;
4191 	size_t	bufsize;
4192 
4193 	*buflenp = 0;
4194 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4195 		return (DDI_PROP_NOT_FOUND);
4196 	}
4197 	ASSERT(MDI_PI_LOCKED(pip));
4198 	rv = nvlist_size(MDI_PI(pip)->pi_prop,
4199 	    &bufsize, NV_ENCODE_NATIVE);
4200 	*buflenp = bufsize;
4201 	return (i_map_nvlist_error_to_mdi(rv));
4202 }
4203 
4204 /*
4205  * mdi_prop_pack():
4206  * 		pack the property list.  The caller should hold the
4207  *		mdi_pathinfo_t node to get a consistent data
4208  */
4209 int
4210 mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
4211 {
4212 	int	rv;
4213 	size_t	bufsize;
4214 
4215 	if ((pip == NULL) || MDI_PI(pip)->pi_prop == NULL) {
4216 		return (DDI_PROP_NOT_FOUND);
4217 	}
4218 
4219 	ASSERT(MDI_PI_LOCKED(pip));
4220 
4221 	bufsize = buflen;
4222 	rv = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, (size_t *)&bufsize,
4223 	    NV_ENCODE_NATIVE, KM_SLEEP);
4224 
4225 	return (i_map_nvlist_error_to_mdi(rv));
4226 }
4227 
4228 /*
4229  * mdi_prop_update_byte():
4230  *		Create/Update a byte property
4231  */
4232 int
4233 mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
4234 {
4235 	int rv;
4236 
4237 	if (pip == NULL) {
4238 		return (DDI_PROP_INVAL_ARG);
4239 	}
4240 	ASSERT(!MDI_PI_LOCKED(pip));
4241 	MDI_PI_LOCK(pip);
4242 	if (MDI_PI(pip)->pi_prop == NULL) {
4243 		MDI_PI_UNLOCK(pip);
4244 		return (DDI_PROP_NOT_FOUND);
4245 	}
4246 	rv = nvlist_add_byte(MDI_PI(pip)->pi_prop, name, data);
4247 	MDI_PI_UNLOCK(pip);
4248 	return (i_map_nvlist_error_to_mdi(rv));
4249 }
4250 
4251 /*
4252  * mdi_prop_update_byte_array():
4253  *		Create/Update a byte array property
4254  */
4255 int
4256 mdi_prop_update_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t *data,
4257     uint_t nelements)
4258 {
4259 	int rv;
4260 
4261 	if (pip == NULL) {
4262 		return (DDI_PROP_INVAL_ARG);
4263 	}
4264 	ASSERT(!MDI_PI_LOCKED(pip));
4265 	MDI_PI_LOCK(pip);
4266 	if (MDI_PI(pip)->pi_prop == NULL) {
4267 		MDI_PI_UNLOCK(pip);
4268 		return (DDI_PROP_NOT_FOUND);
4269 	}
4270 	rv = nvlist_add_byte_array(MDI_PI(pip)->pi_prop, name, data, nelements);
4271 	MDI_PI_UNLOCK(pip);
4272 	return (i_map_nvlist_error_to_mdi(rv));
4273 }
4274 
4275 /*
4276  * mdi_prop_update_int():
4277  *		Create/Update a 32 bit integer property
4278  */
4279 int
4280 mdi_prop_update_int(mdi_pathinfo_t *pip, char *name, int data)
4281 {
4282 	int rv;
4283 
4284 	if (pip == NULL) {
4285 		return (DDI_PROP_INVAL_ARG);
4286 	}
4287 	ASSERT(!MDI_PI_LOCKED(pip));
4288 	MDI_PI_LOCK(pip);
4289 	if (MDI_PI(pip)->pi_prop == NULL) {
4290 		MDI_PI_UNLOCK(pip);
4291 		return (DDI_PROP_NOT_FOUND);
4292 	}
4293 	rv = nvlist_add_int32(MDI_PI(pip)->pi_prop, name, (int32_t)data);
4294 	MDI_PI_UNLOCK(pip);
4295 	return (i_map_nvlist_error_to_mdi(rv));
4296 }
4297 
4298 /*
4299  * mdi_prop_update_int64():
4300  *		Create/Update a 64 bit integer property
4301  */
4302 int
4303 mdi_prop_update_int64(mdi_pathinfo_t *pip, char *name, int64_t data)
4304 {
4305 	int rv;
4306 
4307 	if (pip == NULL) {
4308 		return (DDI_PROP_INVAL_ARG);
4309 	}
4310 	ASSERT(!MDI_PI_LOCKED(pip));
4311 	MDI_PI_LOCK(pip);
4312 	if (MDI_PI(pip)->pi_prop == NULL) {
4313 		MDI_PI_UNLOCK(pip);
4314 		return (DDI_PROP_NOT_FOUND);
4315 	}
4316 	rv = nvlist_add_int64(MDI_PI(pip)->pi_prop, name, data);
4317 	MDI_PI_UNLOCK(pip);
4318 	return (i_map_nvlist_error_to_mdi(rv));
4319 }
4320 
4321 /*
4322  * mdi_prop_update_int_array():
4323  *		Create/Update a int array property
4324  */
4325 int
4326 mdi_prop_update_int_array(mdi_pathinfo_t *pip, char *name, int *data,
4327 	    uint_t nelements)
4328 {
4329 	int rv;
4330 
4331 	if (pip == NULL) {
4332 		return (DDI_PROP_INVAL_ARG);
4333 	}
4334 	ASSERT(!MDI_PI_LOCKED(pip));
4335 	MDI_PI_LOCK(pip);
4336 	if (MDI_PI(pip)->pi_prop == NULL) {
4337 		MDI_PI_UNLOCK(pip);
4338 		return (DDI_PROP_NOT_FOUND);
4339 	}
4340 	rv = nvlist_add_int32_array(MDI_PI(pip)->pi_prop, name, (int32_t *)data,
4341 	    nelements);
4342 	MDI_PI_UNLOCK(pip);
4343 	return (i_map_nvlist_error_to_mdi(rv));
4344 }
4345 
4346 /*
4347  * mdi_prop_update_string():
4348  *		Create/Update a string property
4349  */
4350 int
4351 mdi_prop_update_string(mdi_pathinfo_t *pip, char *name, char *data)
4352 {
4353 	int rv;
4354 
4355 	if (pip == NULL) {
4356 		return (DDI_PROP_INVAL_ARG);
4357 	}
4358 	ASSERT(!MDI_PI_LOCKED(pip));
4359 	MDI_PI_LOCK(pip);
4360 	if (MDI_PI(pip)->pi_prop == NULL) {
4361 		MDI_PI_UNLOCK(pip);
4362 		return (DDI_PROP_NOT_FOUND);
4363 	}
4364 	rv = nvlist_add_string(MDI_PI(pip)->pi_prop, name, data);
4365 	MDI_PI_UNLOCK(pip);
4366 	return (i_map_nvlist_error_to_mdi(rv));
4367 }
4368 
4369 /*
4370  * mdi_prop_update_string_array():
4371  *		Create/Update a string array property
4372  */
4373 int
4374 mdi_prop_update_string_array(mdi_pathinfo_t *pip, char *name, char **data,
4375     uint_t nelements)
4376 {
4377 	int rv;
4378 
4379 	if (pip == NULL) {
4380 		return (DDI_PROP_INVAL_ARG);
4381 	}
4382 	ASSERT(!MDI_PI_LOCKED(pip));
4383 	MDI_PI_LOCK(pip);
4384 	if (MDI_PI(pip)->pi_prop == NULL) {
4385 		MDI_PI_UNLOCK(pip);
4386 		return (DDI_PROP_NOT_FOUND);
4387 	}
4388 	rv = nvlist_add_string_array(MDI_PI(pip)->pi_prop, name, data,
4389 	    nelements);
4390 	MDI_PI_UNLOCK(pip);
4391 	return (i_map_nvlist_error_to_mdi(rv));
4392 }
4393 
4394 /*
4395  * mdi_prop_lookup_byte():
4396  * 		Look for byte property identified by name.  The data returned
4397  *		is the actual property and valid as long as mdi_pathinfo_t node
4398  *		is alive.
4399  */
4400 int
4401 mdi_prop_lookup_byte(mdi_pathinfo_t *pip, char *name, uchar_t *data)
4402 {
4403 	int rv;
4404 
4405 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4406 		return (DDI_PROP_NOT_FOUND);
4407 	}
4408 	rv = nvlist_lookup_byte(MDI_PI(pip)->pi_prop, name, data);
4409 	return (i_map_nvlist_error_to_mdi(rv));
4410 }
4411 
4412 
4413 /*
4414  * mdi_prop_lookup_byte_array():
4415  * 		Look for byte array property identified by name.  The data
4416  *		returned is the actual property and valid as long as
4417  *		mdi_pathinfo_t node is alive.
4418  */
4419 int
4420 mdi_prop_lookup_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t **data,
4421     uint_t *nelements)
4422 {
4423 	int rv;
4424 
4425 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4426 		return (DDI_PROP_NOT_FOUND);
4427 	}
4428 	rv = nvlist_lookup_byte_array(MDI_PI(pip)->pi_prop, name, data,
4429 	    nelements);
4430 	return (i_map_nvlist_error_to_mdi(rv));
4431 }
4432 
4433 /*
4434  * mdi_prop_lookup_int():
4435  * 		Look for int property identified by name.  The data returned
4436  *		is the actual property and valid as long as mdi_pathinfo_t
4437  *		node is alive.
4438  */
4439 int
4440 mdi_prop_lookup_int(mdi_pathinfo_t *pip, char *name, int *data)
4441 {
4442 	int rv;
4443 
4444 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4445 		return (DDI_PROP_NOT_FOUND);
4446 	}
4447 	rv = nvlist_lookup_int32(MDI_PI(pip)->pi_prop, name, (int32_t *)data);
4448 	return (i_map_nvlist_error_to_mdi(rv));
4449 }
4450 
4451 /*
4452  * mdi_prop_lookup_int64():
4453  * 		Look for int64 property identified by name.  The data returned
4454  *		is the actual property and valid as long as mdi_pathinfo_t node
4455  *		is alive.
4456  */
4457 int
4458 mdi_prop_lookup_int64(mdi_pathinfo_t *pip, char *name, int64_t *data)
4459 {
4460 	int rv;
4461 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4462 		return (DDI_PROP_NOT_FOUND);
4463 	}
4464 	rv = nvlist_lookup_int64(MDI_PI(pip)->pi_prop, name, data);
4465 	return (i_map_nvlist_error_to_mdi(rv));
4466 }
4467 
4468 /*
4469  * mdi_prop_lookup_int_array():
4470  * 		Look for int array property identified by name.  The data
4471  *		returned is the actual property and valid as long as
4472  *		mdi_pathinfo_t node is alive.
4473  */
4474 int
4475 mdi_prop_lookup_int_array(mdi_pathinfo_t *pip, char *name, int **data,
4476     uint_t *nelements)
4477 {
4478 	int rv;
4479 
4480 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4481 		return (DDI_PROP_NOT_FOUND);
4482 	}
4483 	rv = nvlist_lookup_int32_array(MDI_PI(pip)->pi_prop, name,
4484 	    (int32_t **)data, nelements);
4485 	return (i_map_nvlist_error_to_mdi(rv));
4486 }
4487 
4488 /*
4489  * mdi_prop_lookup_string():
4490  * 		Look for string property identified by name.  The data
4491  *		returned is the actual property and valid as long as
4492  *		mdi_pathinfo_t node is alive.
4493  */
4494 int
4495 mdi_prop_lookup_string(mdi_pathinfo_t *pip, char *name, char **data)
4496 {
4497 	int rv;
4498 
4499 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4500 		return (DDI_PROP_NOT_FOUND);
4501 	}
4502 	rv = nvlist_lookup_string(MDI_PI(pip)->pi_prop, name, data);
4503 	return (i_map_nvlist_error_to_mdi(rv));
4504 }
4505 
4506 /*
4507  * mdi_prop_lookup_string_array():
4508  * 		Look for string array property identified by name.  The data
4509  *		returned is the actual property and valid as long as
4510  *		mdi_pathinfo_t node is alive.
4511  */
4512 int
4513 mdi_prop_lookup_string_array(mdi_pathinfo_t *pip, char *name, char ***data,
4514     uint_t *nelements)
4515 {
4516 	int rv;
4517 
4518 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4519 		return (DDI_PROP_NOT_FOUND);
4520 	}
4521 	rv = nvlist_lookup_string_array(MDI_PI(pip)->pi_prop, name, data,
4522 	    nelements);
4523 	return (i_map_nvlist_error_to_mdi(rv));
4524 }
4525 
4526 /*
4527  * mdi_prop_free():
4528  * 		Symmetrical function to ddi_prop_free(). nvlist_lookup_xx()
4529  *		functions return the pointer to actual property data and not a
4530  *		copy of it.  So the data returned is valid as long as
4531  *		mdi_pathinfo_t node is valid.
4532  */
4533 /*ARGSUSED*/
4534 int
4535 mdi_prop_free(void *data)
4536 {
4537 	return (DDI_PROP_SUCCESS);
4538 }
4539 
/*
 * i_mdi_report_path_state():
 *		Emit a console/syslog message describing the multipath state
 *		of client 'ct' and the state of one path 'pip' to it.
 *		Caller must hold the client lock.  The REPORT_DEV_NEEDED
 *		flag is cleared once the message has been printed.
 */
/*ARGSUSED*/
static void
i_mdi_report_path_state(mdi_client_t *ct, mdi_pathinfo_t *pip)
{
	char		*phci_path, *ct_path;
	char		*ct_status;
	char		*status;
	dev_info_t	*dip = ct->ct_dip;
	char		lb_buf[64];

	ASSERT(MDI_CLIENT_LOCKED(ct));
	/* Nothing to report without an instance dip or a pending request */
	if ((dip == NULL) || (ddi_get_instance(dip) == -1) ||
	    (MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) == 0)) {
		return;
	}
	/* Map the client multipath state to a human-readable string */
	if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL) {
		ct_status = "optimal";
	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
		ct_status = "degraded";
	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
		ct_status = "failed";
	} else {
		ct_status = "unknown";
	}

	/* Map the individual path state to a human-readable string */
	if (MDI_PI_IS_OFFLINE(pip)) {
		status = "offline";
	} else if (MDI_PI_IS_ONLINE(pip)) {
		status = "online";
	} else if (MDI_PI_IS_STANDBY(pip)) {
		status = "standby";
	} else if (MDI_PI_IS_FAULT(pip)) {
		status = "faulted";
	} else {
		status = "unknown";
	}

	/* Describe the client's load-balancing policy */
	if (ct->ct_lb == LOAD_BALANCE_LBA) {
		(void) snprintf(lb_buf, sizeof (lb_buf),
		    "%s, region-size: %d", mdi_load_balance_lba,
			ct->ct_lb_args->region_size);
	} else if (ct->ct_lb == LOAD_BALANCE_NONE) {
		(void) snprintf(lb_buf, sizeof (lb_buf),
		    "%s", mdi_load_balance_none);
	} else {
		(void) snprintf(lb_buf, sizeof (lb_buf), "%s",
		    mdi_load_balance_rr);
	}

	if (dip) {
		/* "?" prefix: message goes to the log/boot stream only */
		ct_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		phci_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		cmn_err(CE_CONT, "?%s (%s%d) multipath status: %s, "
		    "path %s (%s%d) to target address: %s is %s"
		    " Load balancing: %s\n",
		    ddi_pathname(dip, ct_path), ddi_driver_name(dip),
		    ddi_get_instance(dip), ct_status,
		    ddi_pathname(MDI_PI(pip)->pi_phci->ph_dip, phci_path),
		    ddi_driver_name(MDI_PI(pip)->pi_phci->ph_dip),
		    ddi_get_instance(MDI_PI(pip)->pi_phci->ph_dip),
		    MDI_PI(pip)->pi_addr, status, lb_buf);
		kmem_free(phci_path, MAXPATHLEN);
		kmem_free(ct_path, MAXPATHLEN);
		MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct);
	}
}
4606 
4607 #ifdef	DEBUG
4608 /*
4609  * i_mdi_log():
4610  *		Utility function for error message management
4611  *
4612  */
4613 /*PRINTFLIKE3*/
4614 static void
4615 i_mdi_log(int level, dev_info_t *dip, const char *fmt, ...)
4616 {
4617 	char		name[MAXNAMELEN];
4618 	char		buf[MAXNAMELEN];
4619 	char		*bp;
4620 	va_list		ap;
4621 	int		log_only = 0;
4622 	int		boot_only = 0;
4623 	int		console_only = 0;
4624 
4625 	if (dip) {
4626 		(void) snprintf(name, MAXNAMELEN, "%s%d: ",
4627 		    ddi_node_name(dip), ddi_get_instance(dip));
4628 	} else {
4629 		name[0] = 0;
4630 	}
4631 
4632 	va_start(ap, fmt);
4633 	(void) vsnprintf(buf, MAXNAMELEN, fmt, ap);
4634 	va_end(ap);
4635 
4636 	switch (buf[0]) {
4637 	case '!':
4638 		bp = &buf[1];
4639 		log_only = 1;
4640 		break;
4641 	case '?':
4642 		bp = &buf[1];
4643 		boot_only = 1;
4644 		break;
4645 	case '^':
4646 		bp = &buf[1];
4647 		console_only = 1;
4648 		break;
4649 	default:
4650 		bp = buf;
4651 		break;
4652 	}
4653 	if (mdi_debug_logonly) {
4654 		log_only = 1;
4655 		boot_only = 0;
4656 		console_only = 0;
4657 	}
4658 
4659 	switch (level) {
4660 	case CE_NOTE:
4661 		level = CE_CONT;
4662 		/* FALLTHROUGH */
4663 	case CE_CONT:
4664 	case CE_WARN:
4665 	case CE_PANIC:
4666 		if (boot_only) {
4667 			cmn_err(level, "?mdi: %s%s", name, bp);
4668 		} else if (console_only) {
4669 			cmn_err(level, "^mdi: %s%s", name, bp);
4670 		} else if (log_only) {
4671 			cmn_err(level, "!mdi: %s%s", name, bp);
4672 		} else {
4673 			cmn_err(level, "mdi: %s%s", name, bp);
4674 		}
4675 		break;
4676 	default:
4677 		cmn_err(level, "mdi: %s%s", name, bp);
4678 		break;
4679 	}
4680 }
4681 #endif	/* DEBUG */
4682 
/*
 * i_mdi_client_online():
 *		Online notification for a client device: mark the client
 *		online, rebind it to its dev_info node, power up pHCIs if
 *		needed and take a PM hold on the client's behalf.
 */
void
i_mdi_client_online(dev_info_t *ct_dip)
{
	mdi_client_t	*ct;

	/*
	 * Client online notification. Mark client state as online
	 * restore our binding with dev_info node
	 */
	ct = i_devi_get_client(ct_dip);
	ASSERT(ct != NULL);
	MDI_CLIENT_LOCK(ct);
	MDI_CLIENT_SET_ONLINE(ct);
	/* catch for any memory leaks */
	ASSERT((ct->ct_dip == NULL) || (ct->ct_dip == ct_dip));
	ct->ct_dip = ct_dip;

	/* No power holds yet: power up all pHCIs that provide paths */
	if (ct->ct_power_cnt == 0)
		(void) i_mdi_power_all_phci(ct);

	MDI_DEBUG(4, (CE_NOTE, ct_dip, "i_mdi_client_online "
	    "i_mdi_pm_hold_client %p\n", (void *)ct));
	i_mdi_pm_hold_client(ct, 1);

	MDI_CLIENT_UNLOCK(ct);
}
4709 
/*
 * i_mdi_phci_online():
 *		Online notification for a pHCI: simply mark its MDI state
 *		online under the pHCI lock.
 */
void
i_mdi_phci_online(dev_info_t *ph_dip)
{
	mdi_phci_t	*ph;

	/* pHCI online notification. Mark state accordingly */
	ph = i_devi_get_phci(ph_dip);
	ASSERT(ph != NULL);
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_SET_ONLINE(ph);
	MDI_PHCI_UNLOCK(ph);
}
4722 
4723 /*
4724  * mdi_devi_online():
4725  * 		Online notification from NDI framework on pHCI/client
4726  *		device online.
4727  * Return Values:
4728  *		NDI_SUCCESS
4729  *		MDI_FAILURE
4730  */
4731 /*ARGSUSED*/
4732 int
4733 mdi_devi_online(dev_info_t *dip, uint_t flags)
4734 {
4735 	if (MDI_PHCI(dip)) {
4736 		i_mdi_phci_online(dip);
4737 	}
4738 
4739 	if (MDI_CLIENT(dip)) {
4740 		i_mdi_client_online(dip);
4741 	}
4742 	return (NDI_SUCCESS);
4743 }
4744 
4745 /*
4746  * mdi_devi_offline():
4747  * 		Offline notification from NDI framework on pHCI/Client device
4748  *		offline.
4749  *
4750  * Return Values:
4751  *		NDI_SUCCESS
4752  *		NDI_FAILURE
4753  */
4754 /*ARGSUSED*/
4755 int
4756 mdi_devi_offline(dev_info_t *dip, uint_t flags)
4757 {
4758 	int		rv = NDI_SUCCESS;
4759 
4760 	if (MDI_CLIENT(dip)) {
4761 		rv = i_mdi_client_offline(dip, flags);
4762 		if (rv != NDI_SUCCESS)
4763 			return (rv);
4764 	}
4765 
4766 	if (MDI_PHCI(dip)) {
4767 		rv = i_mdi_phci_offline(dip, flags);
4768 
4769 		if ((rv != NDI_SUCCESS) && MDI_CLIENT(dip)) {
4770 			/* set client back online */
4771 			i_mdi_client_online(dip);
4772 		}
4773 	}
4774 
4775 	return (rv);
4776 }
4777 
/*
 * i_mdi_phci_offline():
 *		Attempt to offline a pHCI.  Fails with NDI_BUSY if any path
 *		or client is in a transient/failover state.  Clients for
 *		which this pHCI provides the last usable path are offlined
 *		first; on failure, already-offlined clients are re-onlined
 *		before returning NDI_BUSY.
 */
/*ARGSUSED*/
static int
i_mdi_phci_offline(dev_info_t *dip, uint_t flags)
{
	int		rv = NDI_SUCCESS;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	mdi_pathinfo_t	*failed_pip = NULL;
	dev_info_t	*cdip;

	/*
	 * pHCI component offline notification
	 * Make sure that this pHCI instance is free to be offlined.
	 * If it is OK to proceed, Offline and remove all the child
	 * mdi_pathinfo nodes.  This process automatically offlines
	 * corresponding client devices, for which this pHCI provides
	 * critical services.
	 */
	ph = i_devi_get_phci(dip);
	MDI_DEBUG(2, (CE_NOTE, dip, "!mdi_phci_offline called %p %p\n",
	    (void *)dip, (void *)ph));
	if (ph == NULL) {
		return (rv);
	}

	MDI_PHCI_LOCK(ph);

	/* Already offline: nothing to do */
	if (MDI_PHCI_IS_OFFLINE(ph)) {
		MDI_DEBUG(1, (CE_WARN, dip, "!pHCI %p already offlined",
		    (void *)ph));
		MDI_PHCI_UNLOCK(ph);
		return (NDI_SUCCESS);
	}

	/*
	 * Check to see if the pHCI can be offlined
	 */
	if (ph->ph_unstable) {
		MDI_DEBUG(1, (CE_WARN, dip,
		    "!One or more target devices are in transient "
		    "state. This device can not be removed at "
		    "this moment. Please try again later."));
		MDI_PHCI_UNLOCK(ph);
		return (NDI_BUSY);
	}

	/*
	 * First pass: veto the offline if any client is mid-failover or
	 * unstable; offline any client losing its last usable path.
	 */
	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;

		/*
		 * The mdi_pathinfo state is OK. Check the client state.
		 * If failover in progress fail the pHCI from offlining
		 */
		ct = MDI_PI(pip)->pi_client;
		i_mdi_client_lock(ct, pip);
		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
		    (ct->ct_unstable)) {
			/*
			 * Failover is in progress, Fail the DR
			 */
			MDI_DEBUG(1, (CE_WARN, dip,
			    "!pHCI device (%s%d) is Busy. %s",
			    ddi_driver_name(dip), ddi_get_instance(dip),
			    "This device can not be removed at "
			    "this moment. Please try again later."));
			MDI_PI_UNLOCK(pip);
			i_mdi_client_unlock(ct);
			MDI_PHCI_UNLOCK(ph);
			return (NDI_BUSY);
		}
		MDI_PI_UNLOCK(pip);

		/*
		 * Check to see of we are removing the last path of this
		 * client device...
		 */
		cdip = ct->ct_dip;
		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
		    (i_mdi_client_compute_state(ct, ph) ==
		    MDI_CLIENT_STATE_FAILED)) {
			/* drop locks across the potentially-blocking offline */
			i_mdi_client_unlock(ct);
			MDI_PHCI_UNLOCK(ph);
			if (ndi_devi_offline(cdip, 0) != NDI_SUCCESS) {
				/*
				 * ndi_devi_offline() failed.
				 * This pHCI provides the critical path
				 * to one or more client devices.
				 * Return busy.
				 */
				MDI_PHCI_LOCK(ph);
				MDI_DEBUG(1, (CE_WARN, dip,
				    "!pHCI device (%s%d) is Busy. %s",
				    ddi_driver_name(dip), ddi_get_instance(dip),
				    "This device can not be removed at "
				    "this moment. Please try again later."));
				failed_pip = pip;
				break;
			} else {
				MDI_PHCI_LOCK(ph);
				pip = next;
			}
		} else {
			i_mdi_client_unlock(ct);
			pip = next;
		}
	}

	/*
	 * A client offline failed: walk the list again up to the failing
	 * path and restore each client to its computed state (re-online
	 * usable clients, keep failed ones offline), then return busy.
	 */
	if (failed_pip) {
		pip = ph->ph_path_head;
		while (pip != failed_pip) {
			MDI_PI_LOCK(pip);
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
			ct = MDI_PI(pip)->pi_client;
			i_mdi_client_lock(ct, pip);
			cdip = ct->ct_dip;
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				if (cdip) {
					MDI_PI_UNLOCK(pip);
					i_mdi_client_unlock(ct);
					MDI_PHCI_UNLOCK(ph);
					(void) ndi_devi_online(cdip, 0);
					MDI_PHCI_LOCK(ph);
					pip = next;
					continue;
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				if (cdip) {
					MDI_PI_UNLOCK(pip);
					i_mdi_client_unlock(ct);
					MDI_PHCI_UNLOCK(ph);
					(void) ndi_devi_offline(cdip, 0);
					MDI_PHCI_LOCK(ph);
					pip = next;
					continue;
				}
				break;
			}
			MDI_PI_UNLOCK(pip);
			i_mdi_client_unlock(ct);
			pip = next;
		}
		MDI_PHCI_UNLOCK(ph);
		return (NDI_BUSY);
	}

	/*
	 * Mark the pHCI as offline
	 */
	MDI_PHCI_SET_OFFLINE(ph);

	/*
	 * Mark the child mdi_pathinfo nodes as transient
	 */
	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		MDI_PI_SET_OFFLINING(pip);
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	MDI_PHCI_UNLOCK(ph);
	/*
	 * Give a chance for any pending commands to execute
	 */
	delay(1);
	MDI_PHCI_LOCK(ph);
	/* Final pass: offline each path; any path still online aborts */
	pip = ph->ph_path_head;
	while (pip != NULL) {
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		(void) i_mdi_pi_offline(pip, flags);
		MDI_PI_LOCK(pip);
		/* NOTE(review): 'ct' is assigned here but not used below */
		ct = MDI_PI(pip)->pi_client;
		if (!MDI_PI_IS_OFFLINE(pip)) {
			MDI_DEBUG(1, (CE_WARN, dip,
			    "!pHCI device (%s%d) is Busy. %s",
			    ddi_driver_name(dip), ddi_get_instance(dip),
			    "This device can not be removed at "
			    "this moment. Please try again later."));
			MDI_PI_UNLOCK(pip);
			/* revert the pHCI state set above */
			MDI_PHCI_SET_ONLINE(ph);
			MDI_PHCI_UNLOCK(ph);
			return (NDI_BUSY);
		}
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	MDI_PHCI_UNLOCK(ph);

	return (rv);
}
4977 
/*
 * mdi_phci_mark_retiring():
 *		Walk all paths of a pHCI being retired.  For each client for
 *		which this pHCI provides the last usable path, mark the
 *		client dip as retiring, accumulating constraint information
 *		in cons_array.  No-op if the dip is not a pHCI or is already
 *		offline.
 */
void
mdi_phci_mark_retiring(dev_info_t *dip, char **cons_array)
{
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	dev_info_t	*cdip;

	if (!MDI_PHCI(dip))
		return;

	ph = i_devi_get_phci(dip);
	if (ph == NULL) {
		return;
	}

	MDI_PHCI_LOCK(ph);

	if (MDI_PHCI_IS_OFFLINE(ph)) {
		/* has no last path */
		MDI_PHCI_UNLOCK(ph);
		return;
	}

	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;

		ct = MDI_PI(pip)->pi_client;
		i_mdi_client_lock(ct, pip);
		MDI_PI_UNLOCK(pip);

		cdip = ct->ct_dip;
		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
		    (i_mdi_client_compute_state(ct, ph) ==
		    MDI_CLIENT_STATE_FAILED)) {
			/* Last path. Mark client dip as retiring */
			i_mdi_client_unlock(ct);
			/* drop the pHCI lock across the framework call */
			MDI_PHCI_UNLOCK(ph);
			(void) e_ddi_mark_retiring(cdip, cons_array);
			MDI_PHCI_LOCK(ph);
			pip = next;
		} else {
			i_mdi_client_unlock(ct);
			pip = next;
		}
	}

	MDI_PHCI_UNLOCK(ph);

	return;
}
5032 
/*
 * mdi_phci_retire_notify():
 *		Retire-notify phase for a pHCI.  Clears *constraint (vetoes
 *		the retire) if the pHCI or any client is in a transient or
 *		failover state; otherwise forwards the notify to each client
 *		for which this pHCI holds the last usable path.
 */
void
mdi_phci_retire_notify(dev_info_t *dip, int *constraint)
{
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	dev_info_t	*cdip;

	if (!MDI_PHCI(dip))
		return;

	ph = i_devi_get_phci(dip);
	if (ph == NULL)
		return;

	MDI_PHCI_LOCK(ph);

	if (MDI_PHCI_IS_OFFLINE(ph)) {
		MDI_PHCI_UNLOCK(ph);
		/* not last path */
		return;
	}

	if (ph->ph_unstable) {
		MDI_PHCI_UNLOCK(ph);
		/* can't check for constraints */
		*constraint = 0;
		return;
	}

	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;

		/*
		 * The mdi_pathinfo state is OK. Check the client state.
		 * If failover in progress fail the pHCI from offlining
		 */
		ct = MDI_PI(pip)->pi_client;
		i_mdi_client_lock(ct, pip);
		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
		    (ct->ct_unstable)) {
			/*
			 * Failover is in progress, can't check for constraints
			 */
			MDI_PI_UNLOCK(pip);
			i_mdi_client_unlock(ct);
			MDI_PHCI_UNLOCK(ph);
			*constraint = 0;
			return;
		}
		MDI_PI_UNLOCK(pip);

		/*
		 * Check to see of we are retiring the last path of this
		 * client device...
		 */
		cdip = ct->ct_dip;
		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
		    (i_mdi_client_compute_state(ct, ph) ==
		    MDI_CLIENT_STATE_FAILED)) {
			i_mdi_client_unlock(ct);
			/* drop the pHCI lock across the framework call */
			MDI_PHCI_UNLOCK(ph);
			(void) e_ddi_retire_notify(cdip, constraint);
			MDI_PHCI_LOCK(ph);
			pip = next;
		} else {
			i_mdi_client_unlock(ct);
			pip = next;
		}
	}

	MDI_PHCI_UNLOCK(ph);

	return;
}
5111 
5112 /*
5113  * offline the path(s) hanging off the PHCI. If the
5114  * last path to any client, check that constraints
5115  * have been applied.
5116  */
5117 void
5118 mdi_phci_retire_finalize(dev_info_t *dip, int phci_only)
5119 {
5120 	mdi_phci_t	*ph;
5121 	mdi_client_t	*ct;
5122 	mdi_pathinfo_t	*pip;
5123 	mdi_pathinfo_t	*next;
5124 	dev_info_t	*cdip;
5125 	int		unstable = 0;
5126 	int		constraint;
5127 
5128 	if (!MDI_PHCI(dip))
5129 		return;
5130 
5131 	ph = i_devi_get_phci(dip);
5132 	if (ph == NULL) {
5133 		/* no last path and no pips */
5134 		return;
5135 	}
5136 
5137 	MDI_PHCI_LOCK(ph);
5138 
5139 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5140 		MDI_PHCI_UNLOCK(ph);
5141 		/* no last path and no pips */
5142 		return;
5143 	}
5144 
5145 	/*
5146 	 * Check to see if the pHCI can be offlined
5147 	 */
5148 	if (ph->ph_unstable) {
5149 		unstable = 1;
5150 	}
5151 
5152 	pip = ph->ph_path_head;
5153 	while (pip != NULL) {
5154 		MDI_PI_LOCK(pip);
5155 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5156 
5157 		/*
5158 		 * if failover in progress fail the pHCI from offlining
5159 		 */
5160 		ct = MDI_PI(pip)->pi_client;
5161 		i_mdi_client_lock(ct, pip);
5162 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5163 		    (ct->ct_unstable)) {
5164 			unstable = 1;
5165 		}
5166 		MDI_PI_UNLOCK(pip);
5167 
5168 		/*
5169 		 * Check to see of we are removing the last path of this
5170 		 * client device...
5171 		 */
5172 		cdip = ct->ct_dip;
5173 		if (!phci_only && cdip &&
5174 		    (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5175 		    (i_mdi_client_compute_state(ct, ph) ==
5176 		    MDI_CLIENT_STATE_FAILED)) {
5177 			i_mdi_client_unlock(ct);
5178 			MDI_PHCI_UNLOCK(ph);
5179 			/*
5180 			 * We don't retire clients we just retire the
5181 			 * path to a client. If it is the last path
5182 			 * to a client, constraints are checked and
5183 			 * if we pass the last path is offlined. MPXIO will
5184 			 * then fail all I/Os to the client. Since we don't
5185 			 * want to retire the client on a path error
5186 			 * set constraint = 0 so that the client dip
5187 			 * is not retired.
5188 			 */
5189 			constraint = 0;
5190 			(void) e_ddi_retire_finalize(cdip, &constraint);
5191 			MDI_PHCI_LOCK(ph);
5192 			pip = next;
5193 		} else {
5194 			i_mdi_client_unlock(ct);
5195 			pip = next;
5196 		}
5197 	}
5198 
5199 	/*
5200 	 * Cannot offline pip(s)
5201 	 */
5202 	if (unstable) {
5203 		cmn_err(CE_WARN, "PHCI in transient state, cannot "
5204 		    "retire, dip = %p", (void *)dip);
5205 		MDI_PHCI_UNLOCK(ph);
5206 		return;
5207 	}
5208 
5209 	/*
5210 	 * Mark the pHCI as offline
5211 	 */
5212 	MDI_PHCI_SET_OFFLINE(ph);
5213 
5214 	/*
5215 	 * Mark the child mdi_pathinfo nodes as transient
5216 	 */
5217 	pip = ph->ph_path_head;
5218 	while (pip != NULL) {
5219 		MDI_PI_LOCK(pip);
5220 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5221 		MDI_PI_SET_OFFLINING(pip);
5222 		MDI_PI_UNLOCK(pip);
5223 		pip = next;
5224 	}
5225 	MDI_PHCI_UNLOCK(ph);
5226 	/*
5227 	 * Give a chance for any pending commands to execute
5228 	 */
5229 	delay(1);
5230 	MDI_PHCI_LOCK(ph);
5231 	pip = ph->ph_path_head;
5232 	while (pip != NULL) {
5233 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5234 		(void) i_mdi_pi_offline(pip, 0);
5235 		MDI_PI_LOCK(pip);
5236 		ct = MDI_PI(pip)->pi_client;
5237 		if (!MDI_PI_IS_OFFLINE(pip)) {
5238 			cmn_err(CE_WARN, "PHCI busy, cannot offline path: "
5239 			    "PHCI dip = %p", (void *)dip);
5240 			MDI_PI_UNLOCK(pip);
5241 			MDI_PHCI_SET_ONLINE(ph);
5242 			MDI_PHCI_UNLOCK(ph);
5243 			return;
5244 		}
5245 		MDI_PI_UNLOCK(pip);
5246 		pip = next;
5247 	}
5248 	MDI_PHCI_UNLOCK(ph);
5249 
5250 	return;
5251 }
5252 
/*
 * mdi_phci_unretire():
 *		Undo a pHCI retire by simply marking the pHCI online again.
 */
void
mdi_phci_unretire(dev_info_t *dip)
{
	ASSERT(MDI_PHCI(dip));

	/*
	 * Online the phci
	 */
	i_mdi_phci_online(dip);
}
5263 
/*
 * i_mdi_client_offline():
 *		Offline notification for a client device.  Fails with
 *		NDI_BUSY if the client is unstable or mid-failover;
 *		otherwise marks it offline and, on NDI_DEVI_REMOVE,
 *		drops the binding to its dev_info node.
 */
/*ARGSUSED*/
static int
i_mdi_client_offline(dev_info_t *dip, uint_t flags)
{
	int		rv = NDI_SUCCESS;
	mdi_client_t	*ct;

	/*
	 * Client component to go offline.  Make sure that we are
	 * not in failing over state and update client state
	 * accordingly
	 */
	ct = i_devi_get_client(dip);
	MDI_DEBUG(2, (CE_NOTE, dip, "!i_mdi_client_offline called %p %p\n",
	    (void *)dip, (void *)ct));
	if (ct != NULL) {
		MDI_CLIENT_LOCK(ct);
		if (ct->ct_unstable) {
			/*
			 * One or more paths are in transient state,
			 * Dont allow offline of a client device
			 */
			MDI_DEBUG(1, (CE_WARN, dip,
			    "!One or more paths to this device is "
			    "in transient state. This device can not "
			    "be removed at this moment. "
			    "Please try again later."));
			MDI_CLIENT_UNLOCK(ct);
			return (NDI_BUSY);
		}
		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
			/*
			 * Failover is in progress, Dont allow DR of
			 * a client device
			 */
			MDI_DEBUG(1, (CE_WARN, dip,
			    "!Client device (%s%d) is Busy. %s",
			    ddi_driver_name(dip), ddi_get_instance(dip),
			    "This device can not be removed at "
			    "this moment. Please try again later."));
			MDI_CLIENT_UNLOCK(ct);
			return (NDI_BUSY);
		}
		MDI_CLIENT_SET_OFFLINE(ct);

		/*
		 * Unbind our relationship with the dev_info node
		 */
		if (flags & NDI_DEVI_REMOVE) {
			ct->ct_dip = NULL;
		}
		MDI_CLIENT_UNLOCK(ct);
	}
	return (rv);
}
5319 
5320 /*
5321  * mdi_pre_attach():
5322  *		Pre attach() notification handler
5323  */
5324 /*ARGSUSED*/
5325 int
5326 mdi_pre_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5327 {
5328 	/* don't support old DDI_PM_RESUME */
5329 	if ((DEVI(dip)->devi_mdi_component != MDI_COMPONENT_NONE) &&
5330 	    (cmd == DDI_PM_RESUME))
5331 		return (DDI_FAILURE);
5332 
5333 	return (DDI_SUCCESS);
5334 }
5335 
5336 /*
5337  * mdi_post_attach():
5338  *		Post attach() notification handler
5339  */
5340 /*ARGSUSED*/
5341 void
5342 mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
5343 {
5344 	mdi_phci_t	*ph;
5345 	mdi_client_t	*ct;
5346 	mdi_vhci_t	*vh;
5347 
5348 	if (MDI_PHCI(dip)) {
5349 		ph = i_devi_get_phci(dip);
5350 		ASSERT(ph != NULL);
5351 
5352 		MDI_PHCI_LOCK(ph);
5353 		switch (cmd) {
5354 		case DDI_ATTACH:
5355 			MDI_DEBUG(2, (CE_NOTE, dip,
5356 			    "!pHCI post_attach: called %p\n", (void *)ph));
5357 			if (error == DDI_SUCCESS) {
5358 				MDI_PHCI_SET_ATTACH(ph);
5359 			} else {
5360 				MDI_DEBUG(1, (CE_NOTE, dip,
5361 				    "!pHCI post_attach: failed error=%d\n",
5362 				    error));
5363 				MDI_PHCI_SET_DETACH(ph);
5364 			}
5365 			break;
5366 
5367 		case DDI_RESUME:
5368 			MDI_DEBUG(2, (CE_NOTE, dip,
5369 			    "!pHCI post_resume: called %p\n", (void *)ph));
5370 			if (error == DDI_SUCCESS) {
5371 				MDI_PHCI_SET_RESUME(ph);
5372 			} else {
5373 				MDI_DEBUG(1, (CE_NOTE, dip,
5374 				    "!pHCI post_resume: failed error=%d\n",
5375 				    error));
5376 				MDI_PHCI_SET_SUSPEND(ph);
5377 			}
5378 			break;
5379 		}
5380 		MDI_PHCI_UNLOCK(ph);
5381 	}
5382 
5383 	if (MDI_CLIENT(dip)) {
5384 		ct = i_devi_get_client(dip);
5385 		ASSERT(ct != NULL);
5386 
5387 		MDI_CLIENT_LOCK(ct);
5388 		switch (cmd) {
5389 		case DDI_ATTACH:
5390 			MDI_DEBUG(2, (CE_NOTE, dip,
5391 			    "!Client post_attach: called %p\n", (void *)ct));
5392 			if (error != DDI_SUCCESS) {
5393 				MDI_DEBUG(1, (CE_NOTE, dip,
5394 				    "!Client post_attach: failed error=%d\n",
5395 				    error));
5396 				MDI_CLIENT_SET_DETACH(ct);
5397 				MDI_DEBUG(4, (CE_WARN, dip,
5398 				    "mdi_post_attach i_mdi_pm_reset_client\n"));
5399 				i_mdi_pm_reset_client(ct);
5400 				break;
5401 			}
5402 
5403 			/*
5404 			 * Client device has successfully attached, inform
5405 			 * the vhci.
5406 			 */
5407 			vh = ct->ct_vhci;
5408 			if (vh->vh_ops->vo_client_attached)
5409 				(*vh->vh_ops->vo_client_attached)(dip);
5410 
5411 			MDI_CLIENT_SET_ATTACH(ct);
5412 			break;
5413 
5414 		case DDI_RESUME:
5415 			MDI_DEBUG(2, (CE_NOTE, dip,
5416 			    "!Client post_attach: called %p\n", (void *)ct));
5417 			if (error == DDI_SUCCESS) {
5418 				MDI_CLIENT_SET_RESUME(ct);
5419 			} else {
5420 				MDI_DEBUG(1, (CE_NOTE, dip,
5421 				    "!Client post_resume: failed error=%d\n",
5422 				    error));
5423 				MDI_CLIENT_SET_SUSPEND(ct);
5424 			}
5425 			break;
5426 		}
5427 		MDI_CLIENT_UNLOCK(ct);
5428 	}
5429 }
5430 
5431 /*
5432  * mdi_pre_detach():
5433  *		Pre detach notification handler
5434  */
5435 /*ARGSUSED*/
5436 int
5437 mdi_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5438 {
5439 	int rv = DDI_SUCCESS;
5440 
5441 	if (MDI_CLIENT(dip)) {
5442 		(void) i_mdi_client_pre_detach(dip, cmd);
5443 	}
5444 
5445 	if (MDI_PHCI(dip)) {
5446 		rv = i_mdi_phci_pre_detach(dip, cmd);
5447 	}
5448 
5449 	return (rv);
5450 }
5451 
/*
 * i_mdi_phci_pre_detach():
 *		Pre-detach handling for a pHCI.  DDI_DETACH fails unless
 *		the pHCI is already offline (no pathinfo nodes attached).
 *		DDI_SUSPEND first suspends every client reachable through
 *		this pHCI; if any client suspend fails, the clients
 *		suspended so far are resumed and the suspend fails.
 */
/*ARGSUSED*/
static int
i_mdi_phci_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int		rv = DDI_SUCCESS;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*failed_pip = NULL;
	mdi_pathinfo_t	*next;

	ph = i_devi_get_phci(dip);
	if (ph == NULL) {
		return (rv);
	}

	MDI_PHCI_LOCK(ph);
	switch (cmd) {
	case DDI_DETACH:
		MDI_DEBUG(2, (CE_NOTE, dip,
		    "!pHCI pre_detach: called %p\n", (void *)ph));
		if (!MDI_PHCI_IS_OFFLINE(ph)) {
			/*
			 * mdi_pathinfo nodes are still attached to
			 * this pHCI. Fail the detach for this pHCI.
			 */
			MDI_DEBUG(2, (CE_WARN, dip,
			    "!pHCI pre_detach: "
			    "mdi_pathinfo nodes are still attached "
			    "%p\n", (void *)ph));
			rv = DDI_FAILURE;
			break;
		}
		MDI_PHCI_SET_DETACH(ph);
		break;

	case DDI_SUSPEND:
		/*
		 * pHCI is getting suspended.  Since mpxio client
		 * devices may not be suspended at this point, to avoid
		 * a potential stack overflow, it is important to suspend
		 * client devices before pHCI can be suspended.
		 */

		MDI_DEBUG(2, (CE_NOTE, dip,
		    "!pHCI pre_suspend: called %p\n", (void *)ph));
		/*
		 * Suspend all the client devices accessible through this pHCI
		 */
		pip = ph->ph_path_head;
		while (pip != NULL && rv == DDI_SUCCESS) {
			dev_info_t *cdip;
			MDI_PI_LOCK(pip);
			next =
			    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
			ct = MDI_PI(pip)->pi_client;
			i_mdi_client_lock(ct, pip);
			cdip = ct->ct_dip;
			MDI_PI_UNLOCK(pip);
			/* skip clients already detached or suspended */
			if ((MDI_CLIENT_IS_DETACHED(ct) == 0) &&
			    MDI_CLIENT_IS_SUSPENDED(ct) == 0) {
				i_mdi_client_unlock(ct);
				if ((rv = devi_detach(cdip, DDI_SUSPEND)) !=
				    DDI_SUCCESS) {
					/*
					 * Suspend of one of the client
					 * device has failed.
					 */
					MDI_DEBUG(1, (CE_WARN, dip,
					    "!Suspend of device (%s%d) failed.",
					    ddi_driver_name(cdip),
					    ddi_get_instance(cdip)));
					failed_pip = pip;
					break;
				}
			} else {
				i_mdi_client_unlock(ct);
			}
			pip = next;
		}

		if (rv == DDI_SUCCESS) {
			/*
			 * Suspend of client devices is complete. Proceed
			 * with pHCI suspend.
			 */
			MDI_PHCI_SET_SUSPEND(ph);
		} else {
			/*
			 * Revert back all the suspended client device states
			 * to converse.
			 */
			pip = ph->ph_path_head;
			while (pip != failed_pip) {
				dev_info_t *cdip;
				MDI_PI_LOCK(pip);
				next =
				    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
				ct = MDI_PI(pip)->pi_client;
				i_mdi_client_lock(ct, pip);
				cdip = ct->ct_dip;
				MDI_PI_UNLOCK(pip);
				if (MDI_CLIENT_IS_SUSPENDED(ct)) {
					i_mdi_client_unlock(ct);
					(void) devi_attach(cdip, DDI_RESUME);
				} else {
					i_mdi_client_unlock(ct);
				}
				pip = next;
			}
		}
		break;

	default:
		rv = DDI_FAILURE;
		break;
	}
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}
5572 
5573 /*ARGSUSED*/
5574 static int
5575 i_mdi_client_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5576 {
5577 	int		rv = DDI_SUCCESS;
5578 	mdi_client_t	*ct;
5579 
5580 	ct = i_devi_get_client(dip);
5581 	if (ct == NULL) {
5582 		return (rv);
5583 	}
5584 
5585 	MDI_CLIENT_LOCK(ct);
5586 	switch (cmd) {
5587 	case DDI_DETACH:
5588 		MDI_DEBUG(2, (CE_NOTE, dip,
5589 		    "!Client pre_detach: called %p\n", (void *)ct));
5590 		MDI_CLIENT_SET_DETACH(ct);
5591 		break;
5592 
5593 	case DDI_SUSPEND:
5594 		MDI_DEBUG(2, (CE_NOTE, dip,
5595 		    "!Client pre_suspend: called %p\n", (void *)ct));
5596 		MDI_CLIENT_SET_SUSPEND(ct);
5597 		break;
5598 
5599 	default:
5600 		rv = DDI_FAILURE;
5601 		break;
5602 	}
5603 	MDI_CLIENT_UNLOCK(ct);
5604 	return (rv);
5605 }
5606 
5607 /*
5608  * mdi_post_detach():
5609  *		Post detach notification handler
5610  */
5611 /*ARGSUSED*/
5612 void
5613 mdi_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5614 {
5615 	/*
5616 	 * Detach/Suspend of mpxio component failed. Update our state
5617 	 * too
5618 	 */
5619 	if (MDI_PHCI(dip))
5620 		i_mdi_phci_post_detach(dip, cmd, error);
5621 
5622 	if (MDI_CLIENT(dip))
5623 		i_mdi_client_post_detach(dip, cmd, error);
5624 }
5625 
5626 /*ARGSUSED*/
5627 static void
5628 i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5629 {
5630 	mdi_phci_t	*ph;
5631 
5632 	/*
5633 	 * Detach/Suspend of phci component failed. Update our state
5634 	 * too
5635 	 */
5636 	ph = i_devi_get_phci(dip);
5637 	if (ph == NULL) {
5638 		return;
5639 	}
5640 
5641 	MDI_PHCI_LOCK(ph);
5642 	/*
5643 	 * Detach of pHCI failed. Restore back converse
5644 	 * state
5645 	 */
5646 	switch (cmd) {
5647 	case DDI_DETACH:
5648 		MDI_DEBUG(2, (CE_NOTE, dip,
5649 		    "!pHCI post_detach: called %p\n", (void *)ph));
5650 		if (error != DDI_SUCCESS)
5651 			MDI_PHCI_SET_ATTACH(ph);
5652 		break;
5653 
5654 	case DDI_SUSPEND:
5655 		MDI_DEBUG(2, (CE_NOTE, dip,
5656 		    "!pHCI post_suspend: called %p\n", (void *)ph));
5657 		if (error != DDI_SUCCESS)
5658 			MDI_PHCI_SET_RESUME(ph);
5659 		break;
5660 	}
5661 	MDI_PHCI_UNLOCK(ph);
5662 }
5663 
/*
 * Post-detach/post-suspend handler for an MDI client node: adjust the
 * client's power-management accounting and, if the operation failed,
 * restore the converse (attached/resumed) state.
 */
/*ARGSUSED*/
static void
i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
{
	mdi_client_t	*ct;

	ct = i_devi_get_client(dip);
	if (ct == NULL) {
		return;
	}
	MDI_CLIENT_LOCK(ct);
	/*
	 * Detach of Client failed. Restore back converse
	 * state
	 */
	switch (cmd) {
	case DDI_DETACH:
		MDI_DEBUG(2, (CE_NOTE, dip,
		    "!Client post_detach: called %p\n", (void *)ct));
		/*
		 * If another thread is already re-attaching the client,
		 * only drop the per-path pm holds; otherwise zero out the
		 * client's pm accounting entirely.
		 */
		if (DEVI_IS_ATTACHING(ct->ct_dip)) {
			MDI_DEBUG(4, (CE_NOTE, dip, "i_mdi_client_post_detach "
			    "i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		} else {
			MDI_DEBUG(4, (CE_NOTE, dip, "i_mdi_client_post_detach "
			    "i_mdi_pm_reset_client\n"));
			i_mdi_pm_reset_client(ct);
		}
		/* detach failed: mark the client as attached again */
		if (error != DDI_SUCCESS)
			MDI_CLIENT_SET_ATTACH(ct);
		break;

	case DDI_SUSPEND:
		MDI_DEBUG(2, (CE_NOTE, dip,
		    "!Client post_suspend: called %p\n", (void *)ct));
		/* suspend failed: mark the client as resumed again */
		if (error != DDI_SUCCESS)
			MDI_CLIENT_SET_RESUME(ct);
		break;
	}
	MDI_CLIENT_UNLOCK(ct);
}
5705 
5706 int
5707 mdi_pi_kstat_exists(mdi_pathinfo_t *pip)
5708 {
5709 	return (MDI_PI(pip)->pi_kstats ? 1 : 0);
5710 }
5711 
5712 /*
5713  * create and install per-path (client - pHCI) statistics
5714  * I/O stats supported: nread, nwritten, reads, and writes
5715  * Error stats - hard errors, soft errors, & transport errors
5716  */
5717 int
5718 mdi_pi_kstat_create(mdi_pathinfo_t *pip, char *ksname)
5719 {
5720 	kstat_t			*kiosp, *kerrsp;
5721 	struct pi_errs		*nsp;
5722 	struct mdi_pi_kstats	*mdi_statp;
5723 
5724 	if (MDI_PI(pip)->pi_kstats != NULL)
5725 		return (MDI_SUCCESS);
5726 
5727 	if ((kiosp = kstat_create("mdi", 0, ksname, "iopath",
5728 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
5729 		return (MDI_FAILURE);
5730 	}
5731 
5732 	(void) strcat(ksname, ",err");
5733 	kerrsp = kstat_create("mdi", 0, ksname, "iopath_errors",
5734 	    KSTAT_TYPE_NAMED,
5735 	    sizeof (struct pi_errs) / sizeof (kstat_named_t), 0);
5736 	if (kerrsp == NULL) {
5737 		kstat_delete(kiosp);
5738 		return (MDI_FAILURE);
5739 	}
5740 
5741 	nsp = (struct pi_errs *)kerrsp->ks_data;
5742 	kstat_named_init(&nsp->pi_softerrs, "Soft Errors", KSTAT_DATA_UINT32);
5743 	kstat_named_init(&nsp->pi_harderrs, "Hard Errors", KSTAT_DATA_UINT32);
5744 	kstat_named_init(&nsp->pi_transerrs, "Transport Errors",
5745 	    KSTAT_DATA_UINT32);
5746 	kstat_named_init(&nsp->pi_icnt_busy, "Interconnect Busy",
5747 	    KSTAT_DATA_UINT32);
5748 	kstat_named_init(&nsp->pi_icnt_errors, "Interconnect Errors",
5749 	    KSTAT_DATA_UINT32);
5750 	kstat_named_init(&nsp->pi_phci_rsrc, "pHCI No Resources",
5751 	    KSTAT_DATA_UINT32);
5752 	kstat_named_init(&nsp->pi_phci_localerr, "pHCI Local Errors",
5753 	    KSTAT_DATA_UINT32);
5754 	kstat_named_init(&nsp->pi_phci_invstate, "pHCI Invalid State",
5755 	    KSTAT_DATA_UINT32);
5756 	kstat_named_init(&nsp->pi_failedfrom, "Failed From",
5757 	    KSTAT_DATA_UINT32);
5758 	kstat_named_init(&nsp->pi_failedto, "Failed To", KSTAT_DATA_UINT32);
5759 
5760 	mdi_statp = kmem_alloc(sizeof (*mdi_statp), KM_SLEEP);
5761 	mdi_statp->pi_kstat_ref = 1;
5762 	mdi_statp->pi_kstat_iostats = kiosp;
5763 	mdi_statp->pi_kstat_errstats = kerrsp;
5764 	kstat_install(kiosp);
5765 	kstat_install(kerrsp);
5766 	MDI_PI(pip)->pi_kstats = mdi_statp;
5767 	return (MDI_SUCCESS);
5768 }
5769 
5770 /*
5771  * destroy per-path properties
5772  */
5773 static void
5774 i_mdi_pi_kstat_destroy(mdi_pathinfo_t *pip)
5775 {
5776 
5777 	struct mdi_pi_kstats *mdi_statp;
5778 
5779 	if (MDI_PI(pip)->pi_kstats == NULL)
5780 		return;
5781 	if ((mdi_statp = MDI_PI(pip)->pi_kstats) == NULL)
5782 		return;
5783 
5784 	MDI_PI(pip)->pi_kstats = NULL;
5785 
5786 	/*
5787 	 * the kstat may be shared between multiple pathinfo nodes
5788 	 * decrement this pathinfo's usage, removing the kstats
5789 	 * themselves when the last pathinfo reference is removed.
5790 	 */
5791 	ASSERT(mdi_statp->pi_kstat_ref > 0);
5792 	if (--mdi_statp->pi_kstat_ref != 0)
5793 		return;
5794 
5795 	kstat_delete(mdi_statp->pi_kstat_iostats);
5796 	kstat_delete(mdi_statp->pi_kstat_errstats);
5797 	kmem_free(mdi_statp, sizeof (*mdi_statp));
5798 }
5799 
5800 /*
5801  * update I/O paths KSTATS
5802  */
5803 void
5804 mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp)
5805 {
5806 	kstat_t *iostatp;
5807 	size_t xfer_cnt;
5808 
5809 	ASSERT(pip != NULL);
5810 
5811 	/*
5812 	 * I/O can be driven across a path prior to having path
5813 	 * statistics available, i.e. probe(9e).
5814 	 */
5815 	if (bp != NULL && MDI_PI(pip)->pi_kstats != NULL) {
5816 		iostatp = MDI_PI(pip)->pi_kstats->pi_kstat_iostats;
5817 		xfer_cnt = bp->b_bcount - bp->b_resid;
5818 		if (bp->b_flags & B_READ) {
5819 			KSTAT_IO_PTR(iostatp)->reads++;
5820 			KSTAT_IO_PTR(iostatp)->nread += xfer_cnt;
5821 		} else {
5822 			KSTAT_IO_PTR(iostatp)->writes++;
5823 			KSTAT_IO_PTR(iostatp)->nwritten += xfer_cnt;
5824 		}
5825 	}
5826 }
5827 
5828 /*
5829  * Enable the path(specific client/target/initiator)
5830  * Enabling a path means that MPxIO may select the enabled path for routing
5831  * future I/O requests, subject to other path state constraints.
5832  */
5833 int
5834 mdi_pi_enable_path(mdi_pathinfo_t *pip, int flags)
5835 {
5836 	mdi_phci_t	*ph;
5837 
5838 	ph = i_devi_get_phci(mdi_pi_get_phci(pip));
5839 	if (ph == NULL) {
5840 		MDI_DEBUG(1, (CE_NOTE, NULL, "!mdi_pi_enable_path:"
5841 			" failed. pip: %p ph = NULL\n", (void *)pip));
5842 		return (MDI_FAILURE);
5843 	}
5844 
5845 	(void) i_mdi_enable_disable_path(pip, ph->ph_vhci, flags,
5846 		MDI_ENABLE_OP);
5847 	MDI_DEBUG(5, (CE_NOTE, NULL, "!mdi_pi_enable_path:"
5848 		" Returning success pip = %p. ph = %p\n",
5849 		(void *)pip, (void *)ph));
5850 	return (MDI_SUCCESS);
5851 
5852 }
5853 
5854 /*
5855  * Disable the path (specific client/target/initiator)
5856  * Disabling a path means that MPxIO will not select the disabled path for
5857  * routing any new I/O requests.
5858  */
5859 int
5860 mdi_pi_disable_path(mdi_pathinfo_t *pip, int flags)
5861 {
5862 	mdi_phci_t	*ph;
5863 
5864 	ph = i_devi_get_phci(mdi_pi_get_phci(pip));
5865 	if (ph == NULL) {
5866 		MDI_DEBUG(1, (CE_NOTE, NULL, "!mdi_pi_disable_path:"
5867 			" failed. pip: %p ph = NULL\n", (void *)pip));
5868 		return (MDI_FAILURE);
5869 	}
5870 
5871 	(void) i_mdi_enable_disable_path(pip,
5872 			ph->ph_vhci, flags, MDI_DISABLE_OP);
5873 	MDI_DEBUG(5, (CE_NOTE, NULL, "!mdi_pi_disable_path:"
5874 		"Returning success pip = %p. ph = %p",
5875 		(void *)pip, (void *)ph));
5876 	return (MDI_SUCCESS);
5877 }
5878 
5879 /*
5880  * disable the path to a particular pHCI (pHCI specified in the phci_path
5881  * argument) for a particular client (specified in the client_path argument).
5882  * Disabling a path means that MPxIO will not select the disabled path for
5883  * routing any new I/O requests.
5884  * NOTE: this will be removed once the NWS files are changed to use the new
5885  * mdi_{enable,disable}_path interfaces
5886  */
5887 int
5888 mdi_pi_disable(dev_info_t *cdip, dev_info_t *pdip, int flags)
5889 {
5890 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_DISABLE_OP));
5891 }
5892 
5893 /*
5894  * Enable the path to a particular pHCI (pHCI specified in the phci_path
5895  * argument) for a particular client (specified in the client_path argument).
5896  * Enabling a path means that MPxIO may select the enabled path for routing
5897  * future I/O requests, subject to other path state constraints.
5898  * NOTE: this will be removed once the NWS files are changed to use the new
5899  * mdi_{enable,disable}_path interfaces
5900  */
5901 
5902 int
5903 mdi_pi_enable(dev_info_t *cdip, dev_info_t *pdip, int flags)
5904 {
5905 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_ENABLE_OP));
5906 }
5907 
5908 /*
5909  * Common routine for doing enable/disable.
5910  */
5911 static mdi_pathinfo_t *
5912 i_mdi_enable_disable_path(mdi_pathinfo_t *pip, mdi_vhci_t *vh, int flags,
5913 		int op)
5914 {
5915 	int		sync_flag = 0;
5916 	int		rv;
5917 	mdi_pathinfo_t 	*next;
5918 	int		(*f)() = NULL;
5919 
5920 	f = vh->vh_ops->vo_pi_state_change;
5921 
5922 	sync_flag = (flags << 8) & 0xf00;
5923 
5924 	/*
5925 	 * Do a callback into the mdi consumer to let it
5926 	 * know that path is about to get enabled/disabled.
5927 	 */
5928 	if (f != NULL) {
5929 		rv = (*f)(vh->vh_dip, pip, 0,
5930 			MDI_PI_EXT_STATE(pip),
5931 			MDI_EXT_STATE_CHANGE | sync_flag |
5932 			op | MDI_BEFORE_STATE_CHANGE);
5933 		if (rv != MDI_SUCCESS) {
5934 			MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5935 			"!vo_pi_state_change: failed rv = %x", rv));
5936 		}
5937 	}
5938 	MDI_PI_LOCK(pip);
5939 	next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5940 
5941 	switch (flags) {
5942 		case USER_DISABLE:
5943 			if (op == MDI_DISABLE_OP) {
5944 				MDI_PI_SET_USER_DISABLE(pip);
5945 			} else {
5946 				MDI_PI_SET_USER_ENABLE(pip);
5947 			}
5948 			break;
5949 		case DRIVER_DISABLE:
5950 			if (op == MDI_DISABLE_OP) {
5951 				MDI_PI_SET_DRV_DISABLE(pip);
5952 			} else {
5953 				MDI_PI_SET_DRV_ENABLE(pip);
5954 			}
5955 			break;
5956 		case DRIVER_DISABLE_TRANSIENT:
5957 			if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS) {
5958 				MDI_PI_SET_DRV_DISABLE_TRANS(pip);
5959 			} else {
5960 				MDI_PI_SET_DRV_ENABLE_TRANS(pip);
5961 			}
5962 			break;
5963 	}
5964 	MDI_PI_UNLOCK(pip);
5965 	/*
5966 	 * Do a callback into the mdi consumer to let it
5967 	 * know that path is now enabled/disabled.
5968 	 */
5969 	if (f != NULL) {
5970 		rv = (*f)(vh->vh_dip, pip, 0,
5971 			MDI_PI_EXT_STATE(pip),
5972 			MDI_EXT_STATE_CHANGE | sync_flag |
5973 			op | MDI_AFTER_STATE_CHANGE);
5974 		if (rv != MDI_SUCCESS) {
5975 			MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5976 			"!vo_pi_state_change: failed rv = %x", rv));
5977 		}
5978 	}
5979 	return (next);
5980 }
5981 
5982 /*
5983  * Common routine for doing enable/disable.
5984  * NOTE: this will be removed once the NWS files are changed to use the new
5985  * mdi_{enable,disable}_path has been putback
5986  */
5987 int
5988 i_mdi_pi_enable_disable(dev_info_t *cdip, dev_info_t *pdip, int flags, int op)
5989 {
5990 
5991 	mdi_phci_t	*ph;
5992 	mdi_vhci_t	*vh = NULL;
5993 	mdi_client_t	*ct;
5994 	mdi_pathinfo_t	*next, *pip;
5995 	int		found_it;
5996 
5997 	ph = i_devi_get_phci(pdip);
5998 	MDI_DEBUG(5, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
5999 		"Op = %d pdip = %p cdip = %p\n", op, (void *)pdip,
6000 		(void *)cdip));
6001 	if (ph == NULL) {
6002 		MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
6003 			"Op %d failed. ph = NULL\n", op));
6004 		return (MDI_FAILURE);
6005 	}
6006 
6007 	if ((op != MDI_ENABLE_OP) && (op != MDI_DISABLE_OP)) {
6008 		MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
6009 			"Op Invalid operation = %d\n", op));
6010 		return (MDI_FAILURE);
6011 	}
6012 
6013 	vh = ph->ph_vhci;
6014 
6015 	if (cdip == NULL) {
6016 		/*
6017 		 * Need to mark the Phci as enabled/disabled.
6018 		 */
6019 		MDI_DEBUG(3, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
6020 		"Op %d for the phci\n", op));
6021 		MDI_PHCI_LOCK(ph);
6022 		switch (flags) {
6023 			case USER_DISABLE:
6024 				if (op == MDI_DISABLE_OP) {
6025 					MDI_PHCI_SET_USER_DISABLE(ph);
6026 				} else {
6027 					MDI_PHCI_SET_USER_ENABLE(ph);
6028 				}
6029 				break;
6030 			case DRIVER_DISABLE:
6031 				if (op == MDI_DISABLE_OP) {
6032 					MDI_PHCI_SET_DRV_DISABLE(ph);
6033 				} else {
6034 					MDI_PHCI_SET_DRV_ENABLE(ph);
6035 				}
6036 				break;
6037 			case DRIVER_DISABLE_TRANSIENT:
6038 				if (op == MDI_DISABLE_OP) {
6039 					MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph);
6040 				} else {
6041 					MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph);
6042 				}
6043 				break;
6044 			default:
6045 				MDI_PHCI_UNLOCK(ph);
6046 				MDI_DEBUG(1, (CE_NOTE, NULL,
6047 				"!i_mdi_pi_enable_disable:"
6048 				" Invalid flag argument= %d\n", flags));
6049 		}
6050 
6051 		/*
6052 		 * Phci has been disabled. Now try to enable/disable
6053 		 * path info's to each client.
6054 		 */
6055 		pip = ph->ph_path_head;
6056 		while (pip != NULL) {
6057 			pip = i_mdi_enable_disable_path(pip, vh, flags, op);
6058 		}
6059 		MDI_PHCI_UNLOCK(ph);
6060 	} else {
6061 
6062 		/*
6063 		 * Disable a specific client.
6064 		 */
6065 		ct = i_devi_get_client(cdip);
6066 		if (ct == NULL) {
6067 			MDI_DEBUG(1, (CE_NOTE, NULL,
6068 			"!i_mdi_pi_enable_disable:"
6069 			" failed. ct = NULL operation = %d\n", op));
6070 			return (MDI_FAILURE);
6071 		}
6072 
6073 		MDI_CLIENT_LOCK(ct);
6074 		pip = ct->ct_path_head;
6075 		found_it = 0;
6076 		while (pip != NULL) {
6077 			MDI_PI_LOCK(pip);
6078 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6079 			if (MDI_PI(pip)->pi_phci == ph) {
6080 				MDI_PI_UNLOCK(pip);
6081 				found_it = 1;
6082 				break;
6083 			}
6084 			MDI_PI_UNLOCK(pip);
6085 			pip = next;
6086 		}
6087 
6088 
6089 		MDI_CLIENT_UNLOCK(ct);
6090 		if (found_it == 0) {
6091 			MDI_DEBUG(1, (CE_NOTE, NULL,
6092 			"!i_mdi_pi_enable_disable:"
6093 			" failed. Could not find corresponding pip\n"));
6094 			return (MDI_FAILURE);
6095 		}
6096 
6097 		(void) i_mdi_enable_disable_path(pip, vh, flags, op);
6098 	}
6099 
6100 	MDI_DEBUG(5, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable: "
6101 		"Op %d Returning success pdip = %p cdip = %p\n",
6102 		op, (void *)pdip, (void *)cdip));
6103 	return (MDI_SUCCESS);
6104 }
6105 
6106 /*
6107  * Ensure phci powered up
6108  */
6109 static void
6110 i_mdi_pm_hold_pip(mdi_pathinfo_t *pip)
6111 {
6112 	dev_info_t	*ph_dip;
6113 
6114 	ASSERT(pip != NULL);
6115 	ASSERT(MDI_PI_LOCKED(pip));
6116 
6117 	if (MDI_PI(pip)->pi_pm_held) {
6118 		return;
6119 	}
6120 
6121 	ph_dip = mdi_pi_get_phci(pip);
6122 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_pm_hold_pip for %s%d %p\n",
6123 	    ddi_get_name(ph_dip), ddi_get_instance(ph_dip), (void *)pip));
6124 	if (ph_dip == NULL) {
6125 		return;
6126 	}
6127 
6128 	MDI_PI_UNLOCK(pip);
6129 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt was %d\n",
6130 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6131 
6132 	pm_hold_power(ph_dip);
6133 
6134 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt is %d\n",
6135 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6136 	MDI_PI_LOCK(pip);
6137 
6138 	/* If PM_GET_PM_INFO is NULL the pm_hold_power above was a noop */
6139 	if (DEVI(ph_dip)->devi_pm_info)
6140 		MDI_PI(pip)->pi_pm_held = 1;
6141 }
6142 
6143 /*
6144  * Allow phci powered down
6145  */
6146 static void
6147 i_mdi_pm_rele_pip(mdi_pathinfo_t *pip)
6148 {
6149 	dev_info_t	*ph_dip = NULL;
6150 
6151 	ASSERT(pip != NULL);
6152 	ASSERT(MDI_PI_LOCKED(pip));
6153 
6154 	if (MDI_PI(pip)->pi_pm_held == 0) {
6155 		return;
6156 	}
6157 
6158 	ph_dip = mdi_pi_get_phci(pip);
6159 	ASSERT(ph_dip != NULL);
6160 
6161 	MDI_PI_UNLOCK(pip);
6162 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_pm_rele_pip for %s%d %p\n",
6163 	    ddi_get_name(ph_dip), ddi_get_instance(ph_dip), (void *)pip));
6164 
6165 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt was %d\n",
6166 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6167 	pm_rele_power(ph_dip);
6168 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt is %d\n",
6169 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6170 
6171 	MDI_PI_LOCK(pip);
6172 	MDI_PI(pip)->pi_pm_held = 0;
6173 }
6174 
/*
 * Bump the client's pm reference count by 'incr'.
 * Caller must hold the client lock.
 */
static void
i_mdi_pm_hold_client(mdi_client_t *ct, int incr)
{
	ASSERT(MDI_CLIENT_LOCKED(ct));

	ct->ct_power_cnt += incr;
	MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_hold_client %p "
	    "ct_power_cnt = %d incr = %d\n", (void *)ct,
	    ct->ct_power_cnt, incr));
	ASSERT(ct->ct_power_cnt >= 0);
}
6186 
/*
 * Drop the pm hold on the pHCI of every path of this client.
 * Caller must hold the client lock.
 */
static void
i_mdi_rele_all_phci(mdi_client_t *ct)
{
	mdi_pathinfo_t  *pip;

	ASSERT(MDI_CLIENT_LOCKED(ct));
	pip = (mdi_pathinfo_t *)ct->ct_path_head;
	while (pip != NULL) {
		/* hold the path so it can't go away across the pm call */
		mdi_hold_path(pip);
		MDI_PI_LOCK(pip);
		i_mdi_pm_rele_pip(pip);
		MDI_PI_UNLOCK(pip);
		mdi_rele_path(pip);
		/*
		 * NOTE(review): pi_client_link is read after mdi_rele_path();
		 * this looks safe only because the client lock is held for
		 * the whole walk, keeping the path list stable — confirm.
		 */
		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
	}
}
6203 
6204 static void
6205 i_mdi_pm_rele_client(mdi_client_t *ct, int decr)
6206 {
6207 	ASSERT(MDI_CLIENT_LOCKED(ct));
6208 
6209 	if (i_ddi_devi_attached(ct->ct_dip)) {
6210 		ct->ct_power_cnt -= decr;
6211 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_rele_client %p "
6212 		    "ct_power_cnt = %d decr = %d\n",
6213 		    (void *)ct, ct->ct_power_cnt, decr));
6214 	}
6215 
6216 	ASSERT(ct->ct_power_cnt >= 0);
6217 	if (ct->ct_power_cnt == 0) {
6218 		i_mdi_rele_all_phci(ct);
6219 		return;
6220 	}
6221 }
6222 
/*
 * Zero out all pm accounting on the client and release every pHCI hold.
 * Caller must hold the client lock.
 */
static void
i_mdi_pm_reset_client(mdi_client_t *ct)
{
	MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_reset_client %p "
	    "ct_power_cnt = %d\n", (void *)ct, ct->ct_power_cnt));
	ASSERT(MDI_CLIENT_LOCKED(ct));
	ct->ct_power_cnt = 0;
	i_mdi_rele_all_phci(ct);
	/* clear config/unconfig hold markers; record that we reset */
	ct->ct_powercnt_config = 0;
	ct->ct_powercnt_unconfig = 0;
	ct->ct_powercnt_reset = 1;
}
6235 
6236 static int
6237 i_mdi_power_one_phci(mdi_pathinfo_t *pip)
6238 {
6239 	int		ret;
6240 	dev_info_t	*ph_dip;
6241 
6242 	MDI_PI_LOCK(pip);
6243 	i_mdi_pm_hold_pip(pip);
6244 
6245 	ph_dip = mdi_pi_get_phci(pip);
6246 	MDI_PI_UNLOCK(pip);
6247 
6248 	/* bring all components of phci to full power */
6249 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_power_one_phci "
6250 	    "pm_powerup for %s%d %p\n", ddi_get_name(ph_dip),
6251 	    ddi_get_instance(ph_dip), (void *)pip));
6252 
6253 	ret = pm_powerup(ph_dip);
6254 
6255 	if (ret == DDI_FAILURE) {
6256 		MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_power_one_phci "
6257 		    "pm_powerup FAILED for %s%d %p\n",
6258 		    ddi_get_name(ph_dip), ddi_get_instance(ph_dip),
6259 		    (void *)pip));
6260 
6261 		MDI_PI_LOCK(pip);
6262 		i_mdi_pm_rele_pip(pip);
6263 		MDI_PI_UNLOCK(pip);
6264 		return (MDI_FAILURE);
6265 	}
6266 
6267 	return (MDI_SUCCESS);
6268 }
6269 
/*
 * Power up the pHCI of every usable path of this client.  Returns
 * MDI_SUCCESS if at least one pHCI powered up, MDI_FAILURE otherwise.
 * Called and returns with the client lock held; the lock is dropped
 * around each per-pHCI powerup.
 */
static int
i_mdi_power_all_phci(mdi_client_t *ct)
{
	mdi_pathinfo_t  *pip;
	int		succeeded = 0;

	ASSERT(MDI_CLIENT_LOCKED(ct));
	pip = (mdi_pathinfo_t *)ct->ct_path_head;
	while (pip != NULL) {
		/*
		 * Don't power if MDI_PATHINFO_STATE_FAULT
		 * or MDI_PATHINFO_STATE_OFFLINE.
		 */
		if (MDI_PI_IS_INIT(pip) ||
		    MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip)) {
			/* hold the path across the unlocked powerup call */
			mdi_hold_path(pip);
			MDI_CLIENT_UNLOCK(ct);
			if (i_mdi_power_one_phci(pip) == MDI_SUCCESS)
				succeeded = 1;

			ASSERT(ct == MDI_PI(pip)->pi_client);
			MDI_CLIENT_LOCK(ct);
			mdi_rele_path(pip);
		}
		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
	}

	return (succeeded ? MDI_SUCCESS : MDI_FAILURE);
}
6299 
6300 /*
6301  * mdi_bus_power():
6302  *		1. Place the phci(s) into powered up state so that
6303  *		   client can do power management
6304  *		2. Ensure phci powered up as client power managing
6305  * Return Values:
6306  *		MDI_SUCCESS
6307  *		MDI_FAILURE
6308  */
6309 int
6310 mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
6311     void *arg, void *result)
6312 {
6313 	int			ret = MDI_SUCCESS;
6314 	pm_bp_child_pwrchg_t	*bpc;
6315 	mdi_client_t		*ct;
6316 	dev_info_t		*cdip;
6317 	pm_bp_has_changed_t	*bphc;
6318 
6319 	/*
6320 	 * BUS_POWER_NOINVOL not supported
6321 	 */
6322 	if (op == BUS_POWER_NOINVOL)
6323 		return (MDI_FAILURE);
6324 
6325 	/*
6326 	 * ignore other OPs.
6327 	 * return quickly to save cou cycles on the ct processing
6328 	 */
6329 	switch (op) {
6330 	case BUS_POWER_PRE_NOTIFICATION:
6331 	case BUS_POWER_POST_NOTIFICATION:
6332 		bpc = (pm_bp_child_pwrchg_t *)arg;
6333 		cdip = bpc->bpc_dip;
6334 		break;
6335 	case BUS_POWER_HAS_CHANGED:
6336 		bphc = (pm_bp_has_changed_t *)arg;
6337 		cdip = bphc->bphc_dip;
6338 		break;
6339 	default:
6340 		return (pm_busop_bus_power(parent, impl_arg, op, arg, result));
6341 	}
6342 
6343 	ASSERT(MDI_CLIENT(cdip));
6344 
6345 	ct = i_devi_get_client(cdip);
6346 	if (ct == NULL)
6347 		return (MDI_FAILURE);
6348 
6349 	/*
6350 	 * wait till the mdi_pathinfo node state change are processed
6351 	 */
6352 	MDI_CLIENT_LOCK(ct);
6353 	switch (op) {
6354 	case BUS_POWER_PRE_NOTIFICATION:
6355 		MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip, "mdi_bus_power "
6356 		    "BUS_POWER_PRE_NOTIFICATION:"
6357 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d\n",
6358 		    PM_NAME(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
6359 		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp));
6360 
6361 		/* serialize power level change per client */
6362 		while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6363 			cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6364 
6365 		MDI_CLIENT_SET_POWER_TRANSITION(ct);
6366 
6367 		if (ct->ct_power_cnt == 0) {
6368 			ret = i_mdi_power_all_phci(ct);
6369 		}
6370 
6371 		/*
6372 		 * if new_level > 0:
6373 		 *	- hold phci(s)
6374 		 *	- power up phci(s) if not already
6375 		 * ignore power down
6376 		 */
6377 		if (bpc->bpc_nlevel > 0) {
6378 			if (!DEVI_IS_ATTACHING(ct->ct_dip)) {
6379 				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
6380 				    "mdi_bus_power i_mdi_pm_hold_client\n"));
6381 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
6382 			}
6383 		}
6384 		break;
6385 	case BUS_POWER_POST_NOTIFICATION:
6386 		MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip, "mdi_bus_power "
6387 		    "BUS_POWER_POST_NOTIFICATION:"
6388 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d result=%d\n",
6389 		    PM_NAME(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
6390 		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp,
6391 		    *(int *)result));
6392 
6393 		if (*(int *)result == DDI_SUCCESS) {
6394 			if (bpc->bpc_nlevel > 0) {
6395 				MDI_CLIENT_SET_POWER_UP(ct);
6396 			} else {
6397 				MDI_CLIENT_SET_POWER_DOWN(ct);
6398 			}
6399 		}
6400 
6401 		/* release the hold we did in pre-notification */
6402 		if (bpc->bpc_nlevel > 0 && (*(int *)result != DDI_SUCCESS) &&
6403 		    !DEVI_IS_ATTACHING(ct->ct_dip)) {
6404 			MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
6405 			    "mdi_bus_power i_mdi_pm_rele_client\n"));
6406 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6407 		}
6408 
6409 		if (bpc->bpc_nlevel == 0 && (*(int *)result == DDI_SUCCESS)) {
6410 			/* another thread might started attaching */
6411 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
6412 				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
6413 				    "mdi_bus_power i_mdi_pm_rele_client\n"));
6414 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
6415 			/* detaching has been taken care in pm_post_unconfig */
6416 			} else if (!DEVI_IS_DETACHING(ct->ct_dip)) {
6417 				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
6418 				    "mdi_bus_power i_mdi_pm_reset_client\n"));
6419 				i_mdi_pm_reset_client(ct);
6420 			}
6421 		}
6422 
6423 		MDI_CLIENT_CLEAR_POWER_TRANSITION(ct);
6424 		cv_broadcast(&ct->ct_powerchange_cv);
6425 
6426 		break;
6427 
6428 	/* need to do more */
6429 	case BUS_POWER_HAS_CHANGED:
6430 		MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip, "mdi_bus_power "
6431 		    "BUS_POWER_HAS_CHANGED:"
6432 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d\n",
6433 		    PM_NAME(bphc->bphc_dip), PM_ADDR(bphc->bphc_dip),
6434 		    bphc->bphc_olevel, bphc->bphc_nlevel, bphc->bphc_comp));
6435 
6436 		if (bphc->bphc_nlevel > 0 &&
6437 		    bphc->bphc_nlevel > bphc->bphc_olevel) {
6438 			if (ct->ct_power_cnt == 0) {
6439 				ret = i_mdi_power_all_phci(ct);
6440 			}
6441 			MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip,
6442 			    "mdi_bus_power i_mdi_pm_hold_client\n"));
6443 			i_mdi_pm_hold_client(ct, ct->ct_path_count);
6444 		}
6445 
6446 		if (bphc->bphc_nlevel == 0 && bphc->bphc_olevel != -1) {
6447 			MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip,
6448 			    "mdi_bus_power i_mdi_pm_rele_client\n"));
6449 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6450 		}
6451 		break;
6452 	}
6453 
6454 	MDI_CLIENT_UNLOCK(ct);
6455 	return (ret);
6456 }
6457 
6458 static int
6459 i_mdi_pm_pre_config_one(dev_info_t *child)
6460 {
6461 	int		ret = MDI_SUCCESS;
6462 	mdi_client_t	*ct;
6463 
6464 	ct = i_devi_get_client(child);
6465 	if (ct == NULL)
6466 		return (MDI_FAILURE);
6467 
6468 	MDI_CLIENT_LOCK(ct);
6469 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6470 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6471 
6472 	if (!MDI_CLIENT_IS_FAILED(ct)) {
6473 		MDI_CLIENT_UNLOCK(ct);
6474 		MDI_DEBUG(4, (CE_NOTE, child,
6475 		    "i_mdi_pm_pre_config_one already configured\n"));
6476 		return (MDI_SUCCESS);
6477 	}
6478 
6479 	if (ct->ct_powercnt_config) {
6480 		MDI_CLIENT_UNLOCK(ct);
6481 		MDI_DEBUG(4, (CE_NOTE, child,
6482 		    "i_mdi_pm_pre_config_one ALREADY held\n"));
6483 		return (MDI_SUCCESS);
6484 	}
6485 
6486 	if (ct->ct_power_cnt == 0) {
6487 		ret = i_mdi_power_all_phci(ct);
6488 	}
6489 	MDI_DEBUG(4, (CE_NOTE, child,
6490 	    "i_mdi_pm_pre_config_one i_mdi_pm_hold_client\n"));
6491 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6492 	ct->ct_powercnt_config = 1;
6493 	ct->ct_powercnt_reset = 0;
6494 	MDI_CLIENT_UNLOCK(ct);
6495 	return (ret);
6496 }
6497 
6498 static int
6499 i_mdi_pm_pre_config(dev_info_t *vdip, dev_info_t *child)
6500 {
6501 	int			ret = MDI_SUCCESS;
6502 	dev_info_t		*cdip;
6503 	int			circ;
6504 
6505 	ASSERT(MDI_VHCI(vdip));
6506 
6507 	/* ndi_devi_config_one */
6508 	if (child) {
6509 		ASSERT(DEVI_BUSY_OWNED(vdip));
6510 		return (i_mdi_pm_pre_config_one(child));
6511 	}
6512 
6513 	/* devi_config_common */
6514 	ndi_devi_enter(vdip, &circ);
6515 	cdip = ddi_get_child(vdip);
6516 	while (cdip) {
6517 		dev_info_t *next = ddi_get_next_sibling(cdip);
6518 
6519 		ret = i_mdi_pm_pre_config_one(cdip);
6520 		if (ret != MDI_SUCCESS)
6521 			break;
6522 		cdip = next;
6523 	}
6524 	ndi_devi_exit(vdip, circ);
6525 	return (ret);
6526 }
6527 
/*
 * Pre-unconfig pm handling for a single client: power up its pHCIs so
 * the detach can proceed, and record the hold in ct_powercnt_unconfig.
 * *held is set when a hold was (or already had been) taken.
 */
static int
i_mdi_pm_pre_unconfig_one(dev_info_t *child, int *held, int flags)
{
	int		ret = MDI_SUCCESS;
	mdi_client_t	*ct;

	ct = i_devi_get_client(child);
	if (ct == NULL)
		return (MDI_FAILURE);

	MDI_CLIENT_LOCK(ct);
	/* serialize against an in-flight power-level transition */
	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

	if (!i_ddi_devi_attached(ct->ct_dip)) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_pre_unconfig node detached already\n"));
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_SUCCESS);
	}

	/* don't let auto-modunload power up an already powered-down client */
	if (MDI_CLIENT_IS_POWERED_DOWN(ct) &&
	    (flags & NDI_AUTODETACH)) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_pre_unconfig auto-modunload\n"));
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_FAILURE);
	}

	if (ct->ct_powercnt_unconfig) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_pre_unconfig ct_powercnt_held\n"));
		MDI_CLIENT_UNLOCK(ct);
		*held = 1;
		return (MDI_SUCCESS);
	}

	/* no references yet: bring every usable pHCI to full power */
	if (ct->ct_power_cnt == 0) {
		ret = i_mdi_power_all_phci(ct);
	}
	MDI_DEBUG(4, (CE_NOTE, child,
	    "i_mdi_pm_pre_unconfig i_mdi_pm_hold_client\n"));
	i_mdi_pm_hold_client(ct, ct->ct_path_count);
	ct->ct_powercnt_unconfig = 1;
	ct->ct_powercnt_reset = 0;
	MDI_CLIENT_UNLOCK(ct);
	if (ret == MDI_SUCCESS)
		*held = 1;
	return (ret);
}
6578 
6579 static int
6580 i_mdi_pm_pre_unconfig(dev_info_t *vdip, dev_info_t *child, int *held,
6581     int flags)
6582 {
6583 	int			ret = MDI_SUCCESS;
6584 	dev_info_t		*cdip;
6585 	int			circ;
6586 
6587 	ASSERT(MDI_VHCI(vdip));
6588 	*held = 0;
6589 
6590 	/* ndi_devi_unconfig_one */
6591 	if (child) {
6592 		ASSERT(DEVI_BUSY_OWNED(vdip));
6593 		return (i_mdi_pm_pre_unconfig_one(child, held, flags));
6594 	}
6595 
6596 	/* devi_unconfig_common */
6597 	ndi_devi_enter(vdip, &circ);
6598 	cdip = ddi_get_child(vdip);
6599 	while (cdip) {
6600 		dev_info_t *next = ddi_get_next_sibling(cdip);
6601 
6602 		ret = i_mdi_pm_pre_unconfig_one(cdip, held, flags);
6603 		cdip = next;
6604 	}
6605 	ndi_devi_exit(vdip, circ);
6606 
6607 	if (*held)
6608 		ret = MDI_SUCCESS;
6609 
6610 	return (ret);
6611 }
6612 
/*
 * Post-config pm handling for a single client: release (or reset) the
 * pm holds taken in i_mdi_pm_pre_config_one(), depending on how the
 * configuration attempt turned out.
 */
static void
i_mdi_pm_post_config_one(dev_info_t *child)
{
	mdi_client_t	*ct;

	ct = i_devi_get_client(child);
	if (ct == NULL)
		return;

	MDI_CLIENT_LOCK(ct);
	/* serialize against an in-flight power-level transition */
	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

	/* pre-config never took a hold (or it was reset): nothing to undo */
	if (ct->ct_powercnt_reset || !ct->ct_powercnt_config) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_config_one NOT configured\n"));
		MDI_CLIENT_UNLOCK(ct);
		return;
	}

	/* client has not been updated */
	if (MDI_CLIENT_IS_FAILED(ct)) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_config_one NOT configured\n"));
		MDI_CLIENT_UNLOCK(ct);
		return;
	}

	/* another thread might have powered it down or detached it */
	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
	    !DEVI_IS_ATTACHING(ct->ct_dip)) ||
	    (!i_ddi_devi_attached(ct->ct_dip) &&
	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_config i_mdi_pm_reset_client\n"));
		i_mdi_pm_reset_client(ct);
	} else {
		mdi_pathinfo_t  *pip, *next;
		int	valid_path_count = 0;

		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_config i_mdi_pm_rele_client\n"));
		/*
		 * Release one reference per currently usable path;
		 * references for faulted/offline paths remain.
		 */
		pip = ct->ct_path_head;
		while (pip != NULL) {
			MDI_PI_LOCK(pip);
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
				valid_path_count ++;
			MDI_PI_UNLOCK(pip);
			pip = next;
		}
		i_mdi_pm_rele_client(ct, valid_path_count);
	}
	ct->ct_powercnt_config = 0;
	MDI_CLIENT_UNLOCK(ct);
}
6669 
6670 static void
6671 i_mdi_pm_post_config(dev_info_t *vdip, dev_info_t *child)
6672 {
6673 	int		circ;
6674 	dev_info_t	*cdip;
6675 
6676 	ASSERT(MDI_VHCI(vdip));
6677 
6678 	/* ndi_devi_config_one */
6679 	if (child) {
6680 		ASSERT(DEVI_BUSY_OWNED(vdip));
6681 		i_mdi_pm_post_config_one(child);
6682 		return;
6683 	}
6684 
6685 	/* devi_config_common */
6686 	ndi_devi_enter(vdip, &circ);
6687 	cdip = ddi_get_child(vdip);
6688 	while (cdip) {
6689 		dev_info_t *next = ddi_get_next_sibling(cdip);
6690 
6691 		i_mdi_pm_post_config_one(cdip);
6692 		cdip = next;
6693 	}
6694 	ndi_devi_exit(vdip, circ);
6695 }
6696 
/*
 * Post-bus-unconfig power management bookkeeping for a single client:
 * release the power holds taken by the matching pre-unconfig step, or
 * reset the client's PM state when detach failed or the device went
 * away.
 */
static void
i_mdi_pm_post_unconfig_one(dev_info_t *child)
{
	mdi_client_t	*ct;

	ct = i_devi_get_client(child);
	if (ct == NULL)
		return;

	MDI_CLIENT_LOCK(ct);
	/* wait out any power transition in progress on this client */
	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

	/* nothing to undo if pre-unconfig never took a hold, or it was reset */
	if (!ct->ct_powercnt_unconfig || ct->ct_powercnt_reset) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_unconfig NOT held\n"));
		MDI_CLIENT_UNLOCK(ct);
		return;
	}

	/* failure detaching or another thread just attached it */
	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
	    i_ddi_devi_attached(ct->ct_dip)) ||
	    (!i_ddi_devi_attached(ct->ct_dip) &&
	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_unconfig i_mdi_pm_reset_client\n"));
		i_mdi_pm_reset_client(ct);
	} else {
		mdi_pathinfo_t  *pip, *next;
		int	valid_path_count = 0;

		MDI_DEBUG(4, (CE_NOTE, child,
		    "i_mdi_pm_post_unconfig i_mdi_pm_rele_client\n"));
		/*
		 * Count the online/standby paths and release one power
		 * hold per valid path.
		 */
		pip = ct->ct_path_head;
		while (pip != NULL) {
			MDI_PI_LOCK(pip);
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
				valid_path_count ++;
			MDI_PI_UNLOCK(pip);
			pip = next;
		}
		i_mdi_pm_rele_client(ct, valid_path_count);
		/*
		 * NOTE(review): unlike i_mdi_pm_post_config_one(), the
		 * unconfig count is cleared only in this branch; presumably
		 * i_mdi_pm_reset_client() resets the counters in the branch
		 * above -- confirm.
		 */
		ct->ct_powercnt_unconfig = 0;
	}

	MDI_CLIENT_UNLOCK(ct);
}
6746 
6747 static void
6748 i_mdi_pm_post_unconfig(dev_info_t *vdip, dev_info_t *child, int held)
6749 {
6750 	int			circ;
6751 	dev_info_t		*cdip;
6752 
6753 	ASSERT(MDI_VHCI(vdip));
6754 
6755 	if (!held) {
6756 		MDI_DEBUG(4, (CE_NOTE, vdip,
6757 		    "i_mdi_pm_post_unconfig held = %d\n", held));
6758 		return;
6759 	}
6760 
6761 	if (child) {
6762 		ASSERT(DEVI_BUSY_OWNED(vdip));
6763 		i_mdi_pm_post_unconfig_one(child);
6764 		return;
6765 	}
6766 
6767 	ndi_devi_enter(vdip, &circ);
6768 	cdip = ddi_get_child(vdip);
6769 	while (cdip) {
6770 		dev_info_t *next = ddi_get_next_sibling(cdip);
6771 
6772 		i_mdi_pm_post_unconfig_one(cdip);
6773 		cdip = next;
6774 	}
6775 	ndi_devi_exit(vdip, circ);
6776 }
6777 
/*
 * mdi_power():
 *		MDI power management entry point.  Dispatches the PM
 *		pre/post (un)config steps and explicit power hold/release
 *		operations on a vHCI.  When devnm is non-NULL it names a
 *		specific client child; the vHCI is held busy across the
 *		lookup and the operation.
 */
int
mdi_power(dev_info_t *vdip, mdi_pm_op_t op, void *args, char *devnm, int flags)
{
	int			circ, ret = MDI_SUCCESS;
	dev_info_t		*client_dip = NULL;
	mdi_client_t		*ct;

	/*
	 * Handling ndi_devi_config_one and ndi_devi_unconfig_one.
	 * Power up pHCI for the named client device.
	 * Note: Before the client is enumerated under vhci by phci,
	 * client_dip can be NULL. Then proceed to power up all the
	 * pHCIs.
	 */
	if (devnm != NULL) {
		/* matched by the ndi_devi_exit() at the bottom */
		ndi_devi_enter(vdip, &circ);
		client_dip = ndi_devi_findchild(vdip, devnm);
	}

	MDI_DEBUG(4, (CE_NOTE, vdip, "mdi_power op = %d %s %p\n",
	    op, devnm ? devnm : "NULL", (void *)client_dip));

	switch (op) {
	case MDI_PM_PRE_CONFIG:
		ret = i_mdi_pm_pre_config(vdip, client_dip);
		break;

	case MDI_PM_PRE_UNCONFIG:
		/* args returns the "held" count consumed by POST_UNCONFIG */
		ret = i_mdi_pm_pre_unconfig(vdip, client_dip, (int *)args,
		    flags);
		break;

	case MDI_PM_POST_CONFIG:
		i_mdi_pm_post_config(vdip, client_dip);
		break;

	case MDI_PM_POST_UNCONFIG:
		i_mdi_pm_post_unconfig(vdip, client_dip, *(int *)args);
		break;

	case MDI_PM_HOLD_POWER:
	case MDI_PM_RELE_POWER:
		/* for these ops args is the client dev_info_t, not a count */
		ASSERT(args);

		client_dip = (dev_info_t *)args;
		ASSERT(MDI_CLIENT(client_dip));

		ct = i_devi_get_client(client_dip);
		MDI_CLIENT_LOCK(ct);

		if (op == MDI_PM_HOLD_POWER) {
			/* take holds only if none are currently held */
			if (ct->ct_power_cnt == 0) {
				(void) i_mdi_power_all_phci(ct);
				MDI_DEBUG(4, (CE_NOTE, client_dip,
				    "mdi_power i_mdi_pm_hold_client\n"));
				i_mdi_pm_hold_client(ct, ct->ct_path_count);
			}
		} else {
			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (CE_NOTE, client_dip,
				    "mdi_power i_mdi_pm_rele_client\n"));
				i_mdi_pm_rele_client(ct, ct->ct_path_count);
			} else {
				MDI_DEBUG(4, (CE_NOTE, client_dip,
				    "mdi_power i_mdi_pm_reset_client\n"));
				i_mdi_pm_reset_client(ct);
			}
		}

		MDI_CLIENT_UNLOCK(ct);
		break;

	default:
		break;
	}

	if (devnm)
		ndi_devi_exit(vdip, circ);

	return (ret);
}
6859 
6860 int
6861 mdi_component_is_vhci(dev_info_t *dip, const char **mdi_class)
6862 {
6863 	mdi_vhci_t *vhci;
6864 
6865 	if (!MDI_VHCI(dip))
6866 		return (MDI_FAILURE);
6867 
6868 	if (mdi_class) {
6869 		vhci = DEVI(dip)->devi_mdi_xhci;
6870 		ASSERT(vhci);
6871 		*mdi_class = vhci->vh_class;
6872 	}
6873 
6874 	return (MDI_SUCCESS);
6875 }
6876 
6877 int
6878 mdi_component_is_phci(dev_info_t *dip, const char **mdi_class)
6879 {
6880 	mdi_phci_t *phci;
6881 
6882 	if (!MDI_PHCI(dip))
6883 		return (MDI_FAILURE);
6884 
6885 	if (mdi_class) {
6886 		phci = DEVI(dip)->devi_mdi_xhci;
6887 		ASSERT(phci);
6888 		*mdi_class = phci->ph_vhci->vh_class;
6889 	}
6890 
6891 	return (MDI_SUCCESS);
6892 }
6893 
6894 int
6895 mdi_component_is_client(dev_info_t *dip, const char **mdi_class)
6896 {
6897 	mdi_client_t *client;
6898 
6899 	if (!MDI_CLIENT(dip))
6900 		return (MDI_FAILURE);
6901 
6902 	if (mdi_class) {
6903 		client = DEVI(dip)->devi_mdi_client;
6904 		ASSERT(client);
6905 		*mdi_class = client->ct_vhci->vh_class;
6906 	}
6907 
6908 	return (MDI_SUCCESS);
6909 }
6910 
6911 void *
6912 mdi_client_get_vhci_private(dev_info_t *dip)
6913 {
6914 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
6915 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
6916 		mdi_client_t	*ct;
6917 		ct = i_devi_get_client(dip);
6918 		return (ct->ct_vprivate);
6919 	}
6920 	return (NULL);
6921 }
6922 
6923 void
6924 mdi_client_set_vhci_private(dev_info_t *dip, void *data)
6925 {
6926 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
6927 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
6928 		mdi_client_t	*ct;
6929 		ct = i_devi_get_client(dip);
6930 		ct->ct_vprivate = data;
6931 	}
6932 }
6933 /*
6934  * mdi_pi_get_vhci_private():
6935  *		Get the vhci private information associated with the
6936  *		mdi_pathinfo node
6937  */
6938 void *
6939 mdi_pi_get_vhci_private(mdi_pathinfo_t *pip)
6940 {
6941 	caddr_t	vprivate = NULL;
6942 	if (pip) {
6943 		vprivate = MDI_PI(pip)->pi_vprivate;
6944 	}
6945 	return (vprivate);
6946 }
6947 
6948 /*
6949  * mdi_pi_set_vhci_private():
6950  *		Set the vhci private information in the mdi_pathinfo node
6951  */
6952 void
6953 mdi_pi_set_vhci_private(mdi_pathinfo_t *pip, void *priv)
6954 {
6955 	if (pip) {
6956 		MDI_PI(pip)->pi_vprivate = priv;
6957 	}
6958 }
6959 
6960 /*
6961  * mdi_phci_get_vhci_private():
6962  *		Get the vhci private information associated with the
6963  *		mdi_phci node
6964  */
6965 void *
6966 mdi_phci_get_vhci_private(dev_info_t *dip)
6967 {
6968 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
6969 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
6970 		mdi_phci_t	*ph;
6971 		ph = i_devi_get_phci(dip);
6972 		return (ph->ph_vprivate);
6973 	}
6974 	return (NULL);
6975 }
6976 
6977 /*
6978  * mdi_phci_set_vhci_private():
6979  *		Set the vhci private information in the mdi_phci node
6980  */
6981 void
6982 mdi_phci_set_vhci_private(dev_info_t *dip, void *priv)
6983 {
6984 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
6985 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
6986 		mdi_phci_t	*ph;
6987 		ph = i_devi_get_phci(dip);
6988 		ph->ph_vprivate = priv;
6989 	}
6990 }
6991 
6992 /*
6993  * List of vhci class names:
6994  * A vhci class name must be in this list only if the corresponding vhci
6995  * driver intends to use the mdi provided bus config implementation
6996  * (i.e., mdi_vhci_bus_config()).
6997  */
6998 static char *vhci_class_list[] = { MDI_HCI_CLASS_SCSI, MDI_HCI_CLASS_IB };
6999 #define	N_VHCI_CLASSES	(sizeof (vhci_class_list) / sizeof (char *))
7000 
7001 /*
7002  * During boot time, the on-disk vhci cache for every vhci class is read
7003  * in the form of an nvlist and stored here.
7004  */
7005 static nvlist_t *vhcache_nvl[N_VHCI_CLASSES];
7006 
7007 /* nvpair names in vhci cache nvlist */
7008 #define	MDI_VHCI_CACHE_VERSION	1
7009 #define	MDI_NVPNAME_VERSION	"version"
7010 #define	MDI_NVPNAME_PHCIS	"phcis"
7011 #define	MDI_NVPNAME_CTADDRMAP	"clientaddrmap"
7012 
7013 /*
7014  * Given vhci class name, return its on-disk vhci cache filename.
7015  * Memory for the returned filename which includes the full path is allocated
7016  * by this function.
7017  */
7018 static char *
7019 vhclass2vhcache_filename(char *vhclass)
7020 {
7021 	char *filename;
7022 	int len;
7023 	static char *fmt = "/etc/devices/mdi_%s_cache";
7024 
7025 	/*
7026 	 * fmt contains the on-disk vhci cache file name format;
7027 	 * for scsi_vhci the filename is "/etc/devices/mdi_scsi_vhci_cache".
7028 	 */
7029 
7030 	/* the -1 below is to account for "%s" in the format string */
7031 	len = strlen(fmt) + strlen(vhclass) - 1;
7032 	filename = kmem_alloc(len, KM_SLEEP);
7033 	(void) snprintf(filename, len, fmt, vhclass);
7034 	ASSERT(len == (strlen(filename) + 1));
7035 	return (filename);
7036 }
7037 
7038 /*
7039  * initialize the vhci cache related data structures and read the on-disk
7040  * vhci cached data into memory.
7041  */
7042 static void
7043 setup_vhci_cache(mdi_vhci_t *vh)
7044 {
7045 	mdi_vhci_config_t *vhc;
7046 	mdi_vhci_cache_t *vhcache;
7047 	int i;
7048 	nvlist_t *nvl = NULL;
7049 
7050 	vhc = kmem_zalloc(sizeof (mdi_vhci_config_t), KM_SLEEP);
7051 	vh->vh_config = vhc;
7052 	vhcache = &vhc->vhc_vhcache;
7053 
7054 	vhc->vhc_vhcache_filename = vhclass2vhcache_filename(vh->vh_class);
7055 
7056 	mutex_init(&vhc->vhc_lock, NULL, MUTEX_DEFAULT, NULL);
7057 	cv_init(&vhc->vhc_cv, NULL, CV_DRIVER, NULL);
7058 
7059 	rw_init(&vhcache->vhcache_lock, NULL, RW_DRIVER, NULL);
7060 
7061 	/*
7062 	 * Create string hash; same as mod_hash_create_strhash() except that
7063 	 * we use NULL key destructor.
7064 	 */
7065 	vhcache->vhcache_client_hash = mod_hash_create_extended(vh->vh_class,
7066 	    mdi_bus_config_cache_hash_size,
7067 	    mod_hash_null_keydtor, mod_hash_null_valdtor,
7068 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
7069 
7070 	/*
7071 	 * The on-disk vhci cache is read during booting prior to the
7072 	 * lights-out period by mdi_read_devices_files().
7073 	 */
7074 	for (i = 0; i < N_VHCI_CLASSES; i++) {
7075 		if (strcmp(vhci_class_list[i], vh->vh_class) == 0) {
7076 			nvl = vhcache_nvl[i];
7077 			vhcache_nvl[i] = NULL;
7078 			break;
7079 		}
7080 	}
7081 
7082 	/*
7083 	 * this is to cover the case of some one manually causing unloading
7084 	 * (or detaching) and reloading (or attaching) of a vhci driver.
7085 	 */
7086 	if (nvl == NULL && modrootloaded)
7087 		nvl = read_on_disk_vhci_cache(vh->vh_class);
7088 
7089 	if (nvl != NULL) {
7090 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7091 		if (mainnvl_to_vhcache(vhcache, nvl) == MDI_SUCCESS)
7092 			vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
7093 		else  {
7094 			cmn_err(CE_WARN,
7095 			    "%s: data file corrupted, will recreate\n",
7096 			    vhc->vhc_vhcache_filename);
7097 		}
7098 		rw_exit(&vhcache->vhcache_lock);
7099 		nvlist_free(nvl);
7100 	}
7101 
7102 	vhc->vhc_cbid = callb_add(stop_vhcache_flush_thread, vhc,
7103 	    CB_CL_UADMIN_PRE_VFS, "mdi_vhcache_flush");
7104 
7105 	vhc->vhc_path_discovery_boot = mdi_path_discovery_boot;
7106 	vhc->vhc_path_discovery_postboot = mdi_path_discovery_postboot;
7107 }
7108 
7109 /*
7110  * free all vhci cache related resources
7111  */
7112 static int
7113 destroy_vhci_cache(mdi_vhci_t *vh)
7114 {
7115 	mdi_vhci_config_t *vhc = vh->vh_config;
7116 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7117 	mdi_vhcache_phci_t *cphci, *cphci_next;
7118 	mdi_vhcache_client_t *cct, *cct_next;
7119 	mdi_vhcache_pathinfo_t *cpi, *cpi_next;
7120 
7121 	if (stop_vhcache_async_threads(vhc) != MDI_SUCCESS)
7122 		return (MDI_FAILURE);
7123 
7124 	kmem_free(vhc->vhc_vhcache_filename,
7125 	    strlen(vhc->vhc_vhcache_filename) + 1);
7126 
7127 	mod_hash_destroy_strhash(vhcache->vhcache_client_hash);
7128 
7129 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7130 	    cphci = cphci_next) {
7131 		cphci_next = cphci->cphci_next;
7132 		free_vhcache_phci(cphci);
7133 	}
7134 
7135 	for (cct = vhcache->vhcache_client_head; cct != NULL; cct = cct_next) {
7136 		cct_next = cct->cct_next;
7137 		for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi_next) {
7138 			cpi_next = cpi->cpi_next;
7139 			free_vhcache_pathinfo(cpi);
7140 		}
7141 		free_vhcache_client(cct);
7142 	}
7143 
7144 	rw_destroy(&vhcache->vhcache_lock);
7145 
7146 	mutex_destroy(&vhc->vhc_lock);
7147 	cv_destroy(&vhc->vhc_cv);
7148 	kmem_free(vhc, sizeof (mdi_vhci_config_t));
7149 	return (MDI_SUCCESS);
7150 }
7151 
7152 /*
7153  * Stop all vhci cache related async threads and free their resources.
7154  */
7155 static int
7156 stop_vhcache_async_threads(mdi_vhci_config_t *vhc)
7157 {
7158 	mdi_async_client_config_t *acc, *acc_next;
7159 
7160 	mutex_enter(&vhc->vhc_lock);
7161 	vhc->vhc_flags |= MDI_VHC_EXIT;
7162 	ASSERT(vhc->vhc_acc_thrcount >= 0);
7163 	cv_broadcast(&vhc->vhc_cv);
7164 
7165 	while ((vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) ||
7166 	    vhc->vhc_acc_thrcount != 0) {
7167 		mutex_exit(&vhc->vhc_lock);
7168 		delay(1);
7169 		mutex_enter(&vhc->vhc_lock);
7170 	}
7171 
7172 	vhc->vhc_flags &= ~MDI_VHC_EXIT;
7173 
7174 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc_next) {
7175 		acc_next = acc->acc_next;
7176 		free_async_client_config(acc);
7177 	}
7178 	vhc->vhc_acc_list_head = NULL;
7179 	vhc->vhc_acc_list_tail = NULL;
7180 	vhc->vhc_acc_count = 0;
7181 
7182 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
7183 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
7184 		mutex_exit(&vhc->vhc_lock);
7185 		if (flush_vhcache(vhc, 0) != MDI_SUCCESS) {
7186 			vhcache_dirty(vhc);
7187 			return (MDI_FAILURE);
7188 		}
7189 	} else
7190 		mutex_exit(&vhc->vhc_lock);
7191 
7192 	if (callb_delete(vhc->vhc_cbid) != 0)
7193 		return (MDI_FAILURE);
7194 
7195 	return (MDI_SUCCESS);
7196 }
7197 
7198 /*
7199  * Stop vhci cache flush thread
7200  */
7201 /* ARGSUSED */
7202 static boolean_t
7203 stop_vhcache_flush_thread(void *arg, int code)
7204 {
7205 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
7206 
7207 	mutex_enter(&vhc->vhc_lock);
7208 	vhc->vhc_flags |= MDI_VHC_EXIT;
7209 	cv_broadcast(&vhc->vhc_cv);
7210 
7211 	while (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
7212 		mutex_exit(&vhc->vhc_lock);
7213 		delay(1);
7214 		mutex_enter(&vhc->vhc_lock);
7215 	}
7216 
7217 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
7218 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
7219 		mutex_exit(&vhc->vhc_lock);
7220 		(void) flush_vhcache(vhc, 1);
7221 	} else
7222 		mutex_exit(&vhc->vhc_lock);
7223 
7224 	return (B_TRUE);
7225 }
7226 
7227 /*
7228  * Enqueue the vhcache phci (cphci) at the tail of the list
7229  */
7230 static void
7231 enqueue_vhcache_phci(mdi_vhci_cache_t *vhcache, mdi_vhcache_phci_t *cphci)
7232 {
7233 	cphci->cphci_next = NULL;
7234 	if (vhcache->vhcache_phci_head == NULL)
7235 		vhcache->vhcache_phci_head = cphci;
7236 	else
7237 		vhcache->vhcache_phci_tail->cphci_next = cphci;
7238 	vhcache->vhcache_phci_tail = cphci;
7239 }
7240 
7241 /*
7242  * Enqueue the vhcache pathinfo (cpi) at the tail of the list
7243  */
7244 static void
7245 enqueue_tail_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7246     mdi_vhcache_pathinfo_t *cpi)
7247 {
7248 	cpi->cpi_next = NULL;
7249 	if (cct->cct_cpi_head == NULL)
7250 		cct->cct_cpi_head = cpi;
7251 	else
7252 		cct->cct_cpi_tail->cpi_next = cpi;
7253 	cct->cct_cpi_tail = cpi;
7254 }
7255 
7256 /*
7257  * Enqueue the vhcache pathinfo (cpi) at the correct location in the
7258  * ordered list. All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
7259  * flag set come at the beginning of the list. All cpis which have this
7260  * flag set come at the end of the list.
7261  */
7262 static void
7263 enqueue_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7264     mdi_vhcache_pathinfo_t *newcpi)
7265 {
7266 	mdi_vhcache_pathinfo_t *cpi, *prev_cpi;
7267 
7268 	if (cct->cct_cpi_head == NULL ||
7269 	    (newcpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))
7270 		enqueue_tail_vhcache_pathinfo(cct, newcpi);
7271 	else {
7272 		for (cpi = cct->cct_cpi_head, prev_cpi = NULL; cpi != NULL &&
7273 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST);
7274 		    prev_cpi = cpi, cpi = cpi->cpi_next)
7275 			;
7276 
7277 		if (prev_cpi == NULL)
7278 			cct->cct_cpi_head = newcpi;
7279 		else
7280 			prev_cpi->cpi_next = newcpi;
7281 
7282 		newcpi->cpi_next = cpi;
7283 
7284 		if (cpi == NULL)
7285 			cct->cct_cpi_tail = newcpi;
7286 	}
7287 }
7288 
7289 /*
7290  * Enqueue the vhcache client (cct) at the tail of the list
7291  */
7292 static void
7293 enqueue_vhcache_client(mdi_vhci_cache_t *vhcache,
7294     mdi_vhcache_client_t *cct)
7295 {
7296 	cct->cct_next = NULL;
7297 	if (vhcache->vhcache_client_head == NULL)
7298 		vhcache->vhcache_client_head = cct;
7299 	else
7300 		vhcache->vhcache_client_tail->cct_next = cct;
7301 	vhcache->vhcache_client_tail = cct;
7302 }
7303 
7304 static void
7305 free_string_array(char **str, int nelem)
7306 {
7307 	int i;
7308 
7309 	if (str) {
7310 		for (i = 0; i < nelem; i++) {
7311 			if (str[i])
7312 				kmem_free(str[i], strlen(str[i]) + 1);
7313 		}
7314 		kmem_free(str, sizeof (char *) * nelem);
7315 	}
7316 }
7317 
7318 static void
7319 free_vhcache_phci(mdi_vhcache_phci_t *cphci)
7320 {
7321 	kmem_free(cphci->cphci_path, strlen(cphci->cphci_path) + 1);
7322 	kmem_free(cphci, sizeof (*cphci));
7323 }
7324 
7325 static void
7326 free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *cpi)
7327 {
7328 	kmem_free(cpi->cpi_addr, strlen(cpi->cpi_addr) + 1);
7329 	kmem_free(cpi, sizeof (*cpi));
7330 }
7331 
7332 static void
7333 free_vhcache_client(mdi_vhcache_client_t *cct)
7334 {
7335 	kmem_free(cct->cct_name_addr, strlen(cct->cct_name_addr) + 1);
7336 	kmem_free(cct, sizeof (*cct));
7337 }
7338 
7339 static char *
7340 vhcache_mknameaddr(char *ct_name, char *ct_addr, int *ret_len)
7341 {
7342 	char *name_addr;
7343 	int len;
7344 
7345 	len = strlen(ct_name) + strlen(ct_addr) + 2;
7346 	name_addr = kmem_alloc(len, KM_SLEEP);
7347 	(void) snprintf(name_addr, len, "%s@%s", ct_name, ct_addr);
7348 
7349 	if (ret_len)
7350 		*ret_len = len;
7351 	return (name_addr);
7352 }
7353 
7354 /*
7355  * Copy the contents of paddrnvl to vhci cache.
7356  * paddrnvl nvlist contains path information for a vhci client.
7357  * See the comment in mainnvl_to_vhcache() for the format of this nvlist.
7358  */
7359 static void
7360 paddrnvl_to_vhcache(nvlist_t *nvl, mdi_vhcache_phci_t *cphci_list[],
7361     mdi_vhcache_client_t *cct)
7362 {
7363 	nvpair_t *nvp = NULL;
7364 	mdi_vhcache_pathinfo_t *cpi;
7365 	uint_t nelem;
7366 	uint32_t *val;
7367 
7368 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7369 		ASSERT(nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY);
7370 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
7371 		cpi->cpi_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7372 		(void) nvpair_value_uint32_array(nvp, &val, &nelem);
7373 		ASSERT(nelem == 2);
7374 		cpi->cpi_cphci = cphci_list[val[0]];
7375 		cpi->cpi_flags = val[1];
7376 		enqueue_tail_vhcache_pathinfo(cct, cpi);
7377 	}
7378 }
7379 
7380 /*
7381  * Copy the contents of caddrmapnvl to vhci cache.
7382  * caddrmapnvl nvlist contains vhci client address to phci client address
7383  * mappings. See the comment in mainnvl_to_vhcache() for the format of
7384  * this nvlist.
7385  */
7386 static void
7387 caddrmapnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl,
7388     mdi_vhcache_phci_t *cphci_list[])
7389 {
7390 	nvpair_t *nvp = NULL;
7391 	nvlist_t *paddrnvl;
7392 	mdi_vhcache_client_t *cct;
7393 
7394 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7395 		ASSERT(nvpair_type(nvp) == DATA_TYPE_NVLIST);
7396 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
7397 		cct->cct_name_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7398 		(void) nvpair_value_nvlist(nvp, &paddrnvl);
7399 		paddrnvl_to_vhcache(paddrnvl, cphci_list, cct);
7400 		/* the client must contain at least one path */
7401 		ASSERT(cct->cct_cpi_head != NULL);
7402 
7403 		enqueue_vhcache_client(vhcache, cct);
7404 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
7405 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
7406 	}
7407 }
7408 
7409 /*
7410  * Copy the contents of the main nvlist to vhci cache.
7411  *
7412  * VHCI busconfig cached data is stored in the form of a nvlist on the disk.
7413  * The nvlist contains the mappings between the vhci client addresses and
7414  * their corresponding phci client addresses.
7415  *
7416  * The structure of the nvlist is as follows:
7417  *
7418  * Main nvlist:
7419  *	NAME		TYPE		DATA
7420  *	version		int32		version number
7421  *	phcis		string array	array of phci paths
7422  *	clientaddrmap	nvlist_t	c2paddrs_nvl (see below)
7423  *
7424  * structure of c2paddrs_nvl:
7425  *	NAME		TYPE		DATA
7426  *	caddr1		nvlist_t	paddrs_nvl1
7427  *	caddr2		nvlist_t	paddrs_nvl2
7428  *	...
7429  * where caddr1, caddr2, ... are vhci client name and addresses in the
7430  * form of "<clientname>@<clientaddress>".
7431  * (for example: "ssd@2000002037cd9f72");
7432  * paddrs_nvl1, paddrs_nvl2, .. are nvlists that contain path information.
7433  *
7434  * structure of paddrs_nvl:
7435  *	NAME		TYPE		DATA
7436  *	pi_addr1	uint32_array	(phci-id, cpi_flags)
7437  *	pi_addr2	uint32_array	(phci-id, cpi_flags)
7438  *	...
7439  * where pi_addr1, pi_addr2, ... are bus specific addresses of pathinfo nodes
7440  * (so called pi_addrs, for example: "w2100002037cd9f72,0");
7441  * phci-ids are integers that identify PHCIs to which the
7442  * the bus specific address belongs to. These integers are used as an index
7443  * into to the phcis string array in the main nvlist to get the PHCI path.
7444  */
7445 static int
7446 mainnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl)
7447 {
7448 	char **phcis, **phci_namep;
7449 	uint_t nphcis;
7450 	mdi_vhcache_phci_t *cphci, **cphci_list;
7451 	nvlist_t *caddrmapnvl;
7452 	int32_t ver;
7453 	int i;
7454 	size_t cphci_list_size;
7455 
7456 	ASSERT(RW_WRITE_HELD(&vhcache->vhcache_lock));
7457 
7458 	if (nvlist_lookup_int32(nvl, MDI_NVPNAME_VERSION, &ver) != 0 ||
7459 	    ver != MDI_VHCI_CACHE_VERSION)
7460 		return (MDI_FAILURE);
7461 
7462 	if (nvlist_lookup_string_array(nvl, MDI_NVPNAME_PHCIS, &phcis,
7463 	    &nphcis) != 0)
7464 		return (MDI_SUCCESS);
7465 
7466 	ASSERT(nphcis > 0);
7467 
7468 	cphci_list_size = sizeof (mdi_vhcache_phci_t *) * nphcis;
7469 	cphci_list = kmem_alloc(cphci_list_size, KM_SLEEP);
7470 	for (i = 0, phci_namep = phcis; i < nphcis; i++, phci_namep++) {
7471 		cphci = kmem_zalloc(sizeof (mdi_vhcache_phci_t), KM_SLEEP);
7472 		cphci->cphci_path = i_ddi_strdup(*phci_namep, KM_SLEEP);
7473 		enqueue_vhcache_phci(vhcache, cphci);
7474 		cphci_list[i] = cphci;
7475 	}
7476 
7477 	ASSERT(vhcache->vhcache_phci_head != NULL);
7478 
7479 	if (nvlist_lookup_nvlist(nvl, MDI_NVPNAME_CTADDRMAP, &caddrmapnvl) == 0)
7480 		caddrmapnvl_to_vhcache(vhcache, caddrmapnvl, cphci_list);
7481 
7482 	kmem_free(cphci_list, cphci_list_size);
7483 	return (MDI_SUCCESS);
7484 }
7485 
7486 /*
7487  * Build paddrnvl for the specified client using the information in the
7488  * vhci cache and add it to the caddrmapnnvl.
7489  * Returns 0 on success, errno on failure.
7490  */
7491 static int
7492 vhcache_to_paddrnvl(mdi_vhci_cache_t *vhcache, mdi_vhcache_client_t *cct,
7493     nvlist_t *caddrmapnvl)
7494 {
7495 	mdi_vhcache_pathinfo_t *cpi;
7496 	nvlist_t *nvl;
7497 	int err;
7498 	uint32_t val[2];
7499 
7500 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7501 
7502 	if ((err = nvlist_alloc(&nvl, 0, KM_SLEEP)) != 0)
7503 		return (err);
7504 
7505 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7506 		val[0] = cpi->cpi_cphci->cphci_id;
7507 		val[1] = cpi->cpi_flags;
7508 		if ((err = nvlist_add_uint32_array(nvl, cpi->cpi_addr, val, 2))
7509 		    != 0)
7510 			goto out;
7511 	}
7512 
7513 	err = nvlist_add_nvlist(caddrmapnvl, cct->cct_name_addr, nvl);
7514 out:
7515 	nvlist_free(nvl);
7516 	return (err);
7517 }
7518 
7519 /*
7520  * Build caddrmapnvl using the information in the vhci cache
7521  * and add it to the mainnvl.
7522  * Returns 0 on success, errno on failure.
7523  */
7524 static int
7525 vhcache_to_caddrmapnvl(mdi_vhci_cache_t *vhcache, nvlist_t *mainnvl)
7526 {
7527 	mdi_vhcache_client_t *cct;
7528 	nvlist_t *nvl;
7529 	int err;
7530 
7531 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7532 
7533 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0)
7534 		return (err);
7535 
7536 	for (cct = vhcache->vhcache_client_head; cct != NULL;
7537 	    cct = cct->cct_next) {
7538 		if ((err = vhcache_to_paddrnvl(vhcache, cct, nvl)) != 0)
7539 			goto out;
7540 	}
7541 
7542 	err = nvlist_add_nvlist(mainnvl, MDI_NVPNAME_CTADDRMAP, nvl);
7543 out:
7544 	nvlist_free(nvl);
7545 	return (err);
7546 }
7547 
7548 /*
7549  * Build nvlist using the information in the vhci cache.
7550  * See the comment in mainnvl_to_vhcache() for the format of the nvlist.
7551  * Returns nvl on success, NULL on failure.
7552  */
7553 static nvlist_t *
7554 vhcache_to_mainnvl(mdi_vhci_cache_t *vhcache)
7555 {
7556 	mdi_vhcache_phci_t *cphci;
7557 	uint_t phci_count;
7558 	char **phcis;
7559 	nvlist_t *nvl;
7560 	int err, i;
7561 
7562 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0) {
7563 		nvl = NULL;
7564 		goto out;
7565 	}
7566 
7567 	if ((err = nvlist_add_int32(nvl, MDI_NVPNAME_VERSION,
7568 	    MDI_VHCI_CACHE_VERSION)) != 0)
7569 		goto out;
7570 
7571 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7572 	if (vhcache->vhcache_phci_head == NULL) {
7573 		rw_exit(&vhcache->vhcache_lock);
7574 		return (nvl);
7575 	}
7576 
7577 	phci_count = 0;
7578 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7579 	    cphci = cphci->cphci_next)
7580 		cphci->cphci_id = phci_count++;
7581 
7582 	/* build phci pathname list */
7583 	phcis = kmem_alloc(sizeof (char *) * phci_count, KM_SLEEP);
7584 	for (cphci = vhcache->vhcache_phci_head, i = 0; cphci != NULL;
7585 	    cphci = cphci->cphci_next, i++)
7586 		phcis[i] = i_ddi_strdup(cphci->cphci_path, KM_SLEEP);
7587 
7588 	err = nvlist_add_string_array(nvl, MDI_NVPNAME_PHCIS, phcis,
7589 	    phci_count);
7590 	free_string_array(phcis, phci_count);
7591 
7592 	if (err == 0 &&
7593 	    (err = vhcache_to_caddrmapnvl(vhcache, nvl)) == 0) {
7594 		rw_exit(&vhcache->vhcache_lock);
7595 		return (nvl);
7596 	}
7597 
7598 	rw_exit(&vhcache->vhcache_lock);
7599 out:
7600 	if (nvl)
7601 		nvlist_free(nvl);
7602 	return (NULL);
7603 }
7604 
7605 /*
7606  * Lookup vhcache phci structure for the specified phci path.
7607  */
7608 static mdi_vhcache_phci_t *
7609 lookup_vhcache_phci_by_name(mdi_vhci_cache_t *vhcache, char *phci_path)
7610 {
7611 	mdi_vhcache_phci_t *cphci;
7612 
7613 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7614 
7615 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7616 	    cphci = cphci->cphci_next) {
7617 		if (strcmp(cphci->cphci_path, phci_path) == 0)
7618 			return (cphci);
7619 	}
7620 
7621 	return (NULL);
7622 }
7623 
7624 /*
7625  * Lookup vhcache phci structure for the specified phci.
7626  */
7627 static mdi_vhcache_phci_t *
7628 lookup_vhcache_phci_by_addr(mdi_vhci_cache_t *vhcache, mdi_phci_t *ph)
7629 {
7630 	mdi_vhcache_phci_t *cphci;
7631 
7632 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7633 
7634 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7635 	    cphci = cphci->cphci_next) {
7636 		if (cphci->cphci_phci == ph)
7637 			return (cphci);
7638 	}
7639 
7640 	return (NULL);
7641 }
7642 
7643 /*
7644  * Add the specified phci to the vhci cache if not already present.
7645  */
7646 static void
7647 vhcache_phci_add(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7648 {
7649 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7650 	mdi_vhcache_phci_t *cphci;
7651 	char *pathname;
7652 	int cache_updated;
7653 
7654 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7655 
7656 	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
7657 	(void) ddi_pathname(ph->ph_dip, pathname);
7658 	if ((cphci = lookup_vhcache_phci_by_name(vhcache, pathname))
7659 	    != NULL) {
7660 		cphci->cphci_phci = ph;
7661 		cache_updated = 0;
7662 	} else {
7663 		cphci = kmem_zalloc(sizeof (*cphci), KM_SLEEP);
7664 		cphci->cphci_path = i_ddi_strdup(pathname, KM_SLEEP);
7665 		cphci->cphci_phci = ph;
7666 		enqueue_vhcache_phci(vhcache, cphci);
7667 		cache_updated = 1;
7668 	}
7669 
7670 	rw_exit(&vhcache->vhcache_lock);
7671 
7672 	/*
7673 	 * Since a new phci has been added, reset
7674 	 * vhc_path_discovery_cutoff_time to allow for discovery of paths
7675 	 * during next vhcache_discover_paths().
7676 	 */
7677 	mutex_enter(&vhc->vhc_lock);
7678 	vhc->vhc_path_discovery_cutoff_time = 0;
7679 	mutex_exit(&vhc->vhc_lock);
7680 
7681 	kmem_free(pathname, MAXPATHLEN);
7682 	if (cache_updated)
7683 		vhcache_dirty(vhc);
7684 }
7685 
7686 /*
7687  * Remove the reference to the specified phci from the vhci cache.
7688  */
7689 static void
7690 vhcache_phci_remove(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7691 {
7692 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7693 	mdi_vhcache_phci_t *cphci;
7694 
7695 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7696 	if ((cphci = lookup_vhcache_phci_by_addr(vhcache, ph)) != NULL) {
7697 		/* do not remove the actual mdi_vhcache_phci structure */
7698 		cphci->cphci_phci = NULL;
7699 	}
7700 	rw_exit(&vhcache->vhcache_lock);
7701 }
7702 
7703 static void
7704 init_vhcache_lookup_token(mdi_vhcache_lookup_token_t *dst,
7705     mdi_vhcache_lookup_token_t *src)
7706 {
7707 	if (src == NULL) {
7708 		dst->lt_cct = NULL;
7709 		dst->lt_cct_lookup_time = 0;
7710 	} else {
7711 		dst->lt_cct = src->lt_cct;
7712 		dst->lt_cct_lookup_time = src->lt_cct_lookup_time;
7713 	}
7714 }
7715 
7716 /*
7717  * Look up vhcache client for the specified client.
7718  */
7719 static mdi_vhcache_client_t *
7720 lookup_vhcache_client(mdi_vhci_cache_t *vhcache, char *ct_name, char *ct_addr,
7721     mdi_vhcache_lookup_token_t *token)
7722 {
7723 	mod_hash_val_t hv;
7724 	char *name_addr;
7725 	int len;
7726 
7727 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7728 
7729 	/*
7730 	 * If no vhcache clean occurred since the last lookup, we can
7731 	 * simply return the cct from the last lookup operation.
7732 	 * It works because ccts are never freed except during the vhcache
7733 	 * cleanup operation.
7734 	 */
7735 	if (token != NULL &&
7736 	    vhcache->vhcache_clean_time < token->lt_cct_lookup_time)
7737 		return (token->lt_cct);
7738 
7739 	name_addr = vhcache_mknameaddr(ct_name, ct_addr, &len);
7740 	if (mod_hash_find(vhcache->vhcache_client_hash,
7741 	    (mod_hash_key_t)name_addr, &hv) == 0) {
7742 		if (token) {
7743 			token->lt_cct = (mdi_vhcache_client_t *)hv;
7744 			token->lt_cct_lookup_time = lbolt64;
7745 		}
7746 	} else {
7747 		if (token) {
7748 			token->lt_cct = NULL;
7749 			token->lt_cct_lookup_time = 0;
7750 		}
7751 		hv = NULL;
7752 	}
7753 	kmem_free(name_addr, len);
7754 	return ((mdi_vhcache_client_t *)hv);
7755 }
7756 
7757 /*
7758  * Add the specified path to the vhci cache if not already present.
7759  * Also add the vhcache client for the client corresponding to this path
7760  * if it doesn't already exist.
7761  */
7762 static void
7763 vhcache_pi_add(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
7764 {
7765 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7766 	mdi_vhcache_client_t *cct;
7767 	mdi_vhcache_pathinfo_t *cpi;
7768 	mdi_phci_t *ph = pip->pi_phci;
7769 	mdi_client_t *ct = pip->pi_client;
7770 	int cache_updated = 0;
7771 
7772 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7773 
7774 	/* if vhcache client for this pip doesn't already exist, add it */
7775 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
7776 	    NULL)) == NULL) {
7777 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
7778 		cct->cct_name_addr = vhcache_mknameaddr(ct->ct_drvname,
7779 		    ct->ct_guid, NULL);
7780 		enqueue_vhcache_client(vhcache, cct);
7781 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
7782 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
7783 		cache_updated = 1;
7784 	}
7785 
7786 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7787 		if (cpi->cpi_cphci->cphci_phci == ph &&
7788 		    strcmp(cpi->cpi_addr, pip->pi_addr) == 0) {
7789 			cpi->cpi_pip = pip;
7790 			if (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST) {
7791 				cpi->cpi_flags &=
7792 				    ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
7793 				sort_vhcache_paths(cct);
7794 				cache_updated = 1;
7795 			}
7796 			break;
7797 		}
7798 	}
7799 
7800 	if (cpi == NULL) {
7801 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
7802 		cpi->cpi_addr = i_ddi_strdup(pip->pi_addr, KM_SLEEP);
7803 		cpi->cpi_cphci = lookup_vhcache_phci_by_addr(vhcache, ph);
7804 		ASSERT(cpi->cpi_cphci != NULL);
7805 		cpi->cpi_pip = pip;
7806 		enqueue_vhcache_pathinfo(cct, cpi);
7807 		cache_updated = 1;
7808 	}
7809 
7810 	rw_exit(&vhcache->vhcache_lock);
7811 
7812 	if (cache_updated)
7813 		vhcache_dirty(vhc);
7814 }
7815 
7816 /*
7817  * Remove the reference to the specified path from the vhci cache.
7818  */
7819 static void
7820 vhcache_pi_remove(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
7821 {
7822 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7823 	mdi_client_t *ct = pip->pi_client;
7824 	mdi_vhcache_client_t *cct;
7825 	mdi_vhcache_pathinfo_t *cpi;
7826 
7827 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7828 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
7829 	    NULL)) != NULL) {
7830 		for (cpi = cct->cct_cpi_head; cpi != NULL;
7831 		    cpi = cpi->cpi_next) {
7832 			if (cpi->cpi_pip == pip) {
7833 				cpi->cpi_pip = NULL;
7834 				break;
7835 			}
7836 		}
7837 	}
7838 	rw_exit(&vhcache->vhcache_lock);
7839 }
7840 
7841 /*
7842  * Flush the vhci cache to disk.
7843  * Returns MDI_SUCCESS on success, MDI_FAILURE on failure.
7844  */
7845 static int
7846 flush_vhcache(mdi_vhci_config_t *vhc, int force_flag)
7847 {
7848 	nvlist_t *nvl;
7849 	int err;
7850 	int rv;
7851 
7852 	/*
7853 	 * It is possible that the system may shutdown before
7854 	 * i_ddi_io_initialized (during stmsboot for example). To allow for
7855 	 * flushing the cache in this case do not check for
7856 	 * i_ddi_io_initialized when force flag is set.
7857 	 */
7858 	if (force_flag == 0 && !i_ddi_io_initialized())
7859 		return (MDI_FAILURE);
7860 
7861 	if ((nvl = vhcache_to_mainnvl(&vhc->vhc_vhcache)) != NULL) {
7862 		err = fwrite_nvlist(vhc->vhc_vhcache_filename, nvl);
7863 		nvlist_free(nvl);
7864 	} else
7865 		err = EFAULT;
7866 
7867 	rv = MDI_SUCCESS;
7868 	mutex_enter(&vhc->vhc_lock);
7869 	if (err != 0) {
7870 		if (err == EROFS) {
7871 			vhc->vhc_flags |= MDI_VHC_READONLY_FS;
7872 			vhc->vhc_flags &= ~(MDI_VHC_VHCACHE_FLUSH_ERROR |
7873 			    MDI_VHC_VHCACHE_DIRTY);
7874 		} else {
7875 			if (!(vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR)) {
7876 				cmn_err(CE_CONT, "%s: update failed\n",
7877 				    vhc->vhc_vhcache_filename);
7878 				vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_ERROR;
7879 			}
7880 			rv = MDI_FAILURE;
7881 		}
7882 	} else if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR) {
7883 		cmn_err(CE_CONT,
7884 		    "%s: update now ok\n", vhc->vhc_vhcache_filename);
7885 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_ERROR;
7886 	}
7887 	mutex_exit(&vhc->vhc_lock);
7888 
7889 	return (rv);
7890 }
7891 
7892 /*
7893  * Call flush_vhcache() to flush the vhci cache at the scheduled time.
7894  * Exits itself if left idle for the idle timeout period.
7895  */
7896 static void
7897 vhcache_flush_thread(void *arg)
7898 {
7899 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
7900 	clock_t idle_time, quit_at_ticks;
7901 	callb_cpr_t cprinfo;
7902 
7903 	/* number of seconds to sleep idle before exiting */
7904 	idle_time = mdi_vhcache_flush_daemon_idle_time * TICKS_PER_SECOND;
7905 
7906 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
7907 	    "mdi_vhcache_flush");
7908 	mutex_enter(&vhc->vhc_lock);
7909 	for (; ; ) {
7910 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
7911 		    (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY)) {
7912 			if (ddi_get_lbolt() < vhc->vhc_flush_at_ticks) {
7913 				CALLB_CPR_SAFE_BEGIN(&cprinfo);
7914 				(void) cv_timedwait(&vhc->vhc_cv,
7915 				    &vhc->vhc_lock, vhc->vhc_flush_at_ticks);
7916 				CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
7917 			} else {
7918 				vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
7919 				mutex_exit(&vhc->vhc_lock);
7920 
7921 				if (flush_vhcache(vhc, 0) != MDI_SUCCESS)
7922 					vhcache_dirty(vhc);
7923 
7924 				mutex_enter(&vhc->vhc_lock);
7925 			}
7926 		}
7927 
7928 		quit_at_ticks = ddi_get_lbolt() + idle_time;
7929 
7930 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
7931 		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) &&
7932 		    ddi_get_lbolt() < quit_at_ticks) {
7933 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
7934 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
7935 			    quit_at_ticks);
7936 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
7937 		}
7938 
7939 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
7940 		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY))
7941 			goto out;
7942 	}
7943 
7944 out:
7945 	vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_THREAD;
7946 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
7947 	CALLB_CPR_EXIT(&cprinfo);
7948 }
7949 
7950 /*
7951  * Make vhci cache dirty and schedule flushing by vhcache flush thread.
7952  */
7953 static void
7954 vhcache_dirty(mdi_vhci_config_t *vhc)
7955 {
7956 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7957 	int create_thread;
7958 
7959 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7960 	/* do not flush cache until the cache is fully built */
7961 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
7962 		rw_exit(&vhcache->vhcache_lock);
7963 		return;
7964 	}
7965 	rw_exit(&vhcache->vhcache_lock);
7966 
7967 	mutex_enter(&vhc->vhc_lock);
7968 	if (vhc->vhc_flags & MDI_VHC_READONLY_FS) {
7969 		mutex_exit(&vhc->vhc_lock);
7970 		return;
7971 	}
7972 
7973 	vhc->vhc_flags |= MDI_VHC_VHCACHE_DIRTY;
7974 	vhc->vhc_flush_at_ticks = ddi_get_lbolt() +
7975 	    mdi_vhcache_flush_delay * TICKS_PER_SECOND;
7976 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
7977 		cv_broadcast(&vhc->vhc_cv);
7978 		create_thread = 0;
7979 	} else {
7980 		vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_THREAD;
7981 		create_thread = 1;
7982 	}
7983 	mutex_exit(&vhc->vhc_lock);
7984 
7985 	if (create_thread)
7986 		(void) thread_create(NULL, 0, vhcache_flush_thread, vhc,
7987 		    0, &p0, TS_RUN, minclsyspri);
7988 }
7989 
7990 /*
7991  * phci bus config structure - one for for each phci bus config operation that
7992  * we initiate on behalf of a vhci.
7993  */
7994 typedef struct mdi_phci_bus_config_s {
7995 	char *phbc_phci_path;
7996 	struct mdi_vhci_bus_config_s *phbc_vhbusconfig;	/* vhci bus config */
7997 	struct mdi_phci_bus_config_s *phbc_next;
7998 } mdi_phci_bus_config_t;
7999 
8000 /* vhci bus config structure - one for each vhci bus config operation */
8001 typedef struct mdi_vhci_bus_config_s {
8002 	ddi_bus_config_op_t vhbc_op;	/* bus config op */
8003 	major_t vhbc_op_major;		/* bus config op major */
8004 	uint_t vhbc_op_flags;		/* bus config op flags */
8005 	kmutex_t vhbc_lock;
8006 	kcondvar_t vhbc_cv;
8007 	int vhbc_thr_count;
8008 } mdi_vhci_bus_config_t;
8009 
8010 /*
8011  * bus config the specified phci
8012  */
8013 static void
8014 bus_config_phci(void *arg)
8015 {
8016 	mdi_phci_bus_config_t *phbc = (mdi_phci_bus_config_t *)arg;
8017 	mdi_vhci_bus_config_t *vhbc = phbc->phbc_vhbusconfig;
8018 	dev_info_t *ph_dip;
8019 
8020 	/*
8021 	 * first configure all path components upto phci and then configure
8022 	 * the phci children.
8023 	 */
8024 	if ((ph_dip = e_ddi_hold_devi_by_path(phbc->phbc_phci_path, 0))
8025 	    != NULL) {
8026 		if (vhbc->vhbc_op == BUS_CONFIG_DRIVER ||
8027 		    vhbc->vhbc_op == BUS_UNCONFIG_DRIVER) {
8028 			(void) ndi_devi_config_driver(ph_dip,
8029 			    vhbc->vhbc_op_flags,
8030 			    vhbc->vhbc_op_major);
8031 		} else
8032 			(void) ndi_devi_config(ph_dip,
8033 			    vhbc->vhbc_op_flags);
8034 
8035 		/* release the hold that e_ddi_hold_devi_by_path() placed */
8036 		ndi_rele_devi(ph_dip);
8037 	}
8038 
8039 	kmem_free(phbc->phbc_phci_path, strlen(phbc->phbc_phci_path) + 1);
8040 	kmem_free(phbc, sizeof (*phbc));
8041 
8042 	mutex_enter(&vhbc->vhbc_lock);
8043 	vhbc->vhbc_thr_count--;
8044 	if (vhbc->vhbc_thr_count == 0)
8045 		cv_broadcast(&vhbc->vhbc_cv);
8046 	mutex_exit(&vhbc->vhbc_lock);
8047 }
8048 
8049 /*
8050  * Bus config all phcis associated with the vhci in parallel.
8051  * op must be BUS_CONFIG_DRIVER or BUS_CONFIG_ALL.
8052  */
8053 static void
8054 bus_config_all_phcis(mdi_vhci_cache_t *vhcache, uint_t flags,
8055     ddi_bus_config_op_t op, major_t maj)
8056 {
8057 	mdi_phci_bus_config_t *phbc_head = NULL, *phbc, *phbc_next;
8058 	mdi_vhci_bus_config_t *vhbc;
8059 	mdi_vhcache_phci_t *cphci;
8060 
8061 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8062 	if (vhcache->vhcache_phci_head == NULL) {
8063 		rw_exit(&vhcache->vhcache_lock);
8064 		return;
8065 	}
8066 
8067 	vhbc = kmem_zalloc(sizeof (*vhbc), KM_SLEEP);
8068 
8069 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
8070 	    cphci = cphci->cphci_next) {
8071 		/* skip phcis that haven't attached before root is available */
8072 		if (!modrootloaded && (cphci->cphci_phci == NULL))
8073 			continue;
8074 		phbc = kmem_zalloc(sizeof (*phbc), KM_SLEEP);
8075 		phbc->phbc_phci_path = i_ddi_strdup(cphci->cphci_path,
8076 		    KM_SLEEP);
8077 		phbc->phbc_vhbusconfig = vhbc;
8078 		phbc->phbc_next = phbc_head;
8079 		phbc_head = phbc;
8080 		vhbc->vhbc_thr_count++;
8081 	}
8082 	rw_exit(&vhcache->vhcache_lock);
8083 
8084 	vhbc->vhbc_op = op;
8085 	vhbc->vhbc_op_major = maj;
8086 	vhbc->vhbc_op_flags = NDI_NO_EVENT |
8087 	    (flags & (NDI_CONFIG_REPROBE | NDI_DRV_CONF_REPROBE));
8088 	mutex_init(&vhbc->vhbc_lock, NULL, MUTEX_DEFAULT, NULL);
8089 	cv_init(&vhbc->vhbc_cv, NULL, CV_DRIVER, NULL);
8090 
8091 	/* now create threads to initiate bus config on all phcis in parallel */
8092 	for (phbc = phbc_head; phbc != NULL; phbc = phbc_next) {
8093 		phbc_next = phbc->phbc_next;
8094 		if (mdi_mtc_off)
8095 			bus_config_phci((void *)phbc);
8096 		else
8097 			(void) thread_create(NULL, 0, bus_config_phci, phbc,
8098 			    0, &p0, TS_RUN, minclsyspri);
8099 	}
8100 
8101 	mutex_enter(&vhbc->vhbc_lock);
8102 	/* wait until all threads exit */
8103 	while (vhbc->vhbc_thr_count > 0)
8104 		cv_wait(&vhbc->vhbc_cv, &vhbc->vhbc_lock);
8105 	mutex_exit(&vhbc->vhbc_lock);
8106 
8107 	mutex_destroy(&vhbc->vhbc_lock);
8108 	cv_destroy(&vhbc->vhbc_cv);
8109 	kmem_free(vhbc, sizeof (*vhbc));
8110 }
8111 
8112 /*
8113  * Single threaded version of bus_config_all_phcis()
8114  */
8115 static void
8116 st_bus_config_all_phcis(mdi_vhci_config_t *vhc, uint_t flags,
8117     ddi_bus_config_op_t op, major_t maj)
8118 {
8119 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8120 
8121 	single_threaded_vhconfig_enter(vhc);
8122 	bus_config_all_phcis(vhcache, flags, op, maj);
8123 	single_threaded_vhconfig_exit(vhc);
8124 }
8125 
8126 /*
8127  * Perform BUS_CONFIG_ONE on the specified child of the phci.
8128  * The path includes the child component in addition to the phci path.
8129  */
8130 static int
8131 bus_config_one_phci_child(char *path)
8132 {
8133 	dev_info_t *ph_dip, *child;
8134 	char *devnm;
8135 	int rv = MDI_FAILURE;
8136 
8137 	/* extract the child component of the phci */
8138 	devnm = strrchr(path, '/');
8139 	*devnm++ = '\0';
8140 
8141 	/*
8142 	 * first configure all path components upto phci and then
8143 	 * configure the phci child.
8144 	 */
8145 	if ((ph_dip = e_ddi_hold_devi_by_path(path, 0)) != NULL) {
8146 		if (ndi_devi_config_one(ph_dip, devnm, &child, NDI_NO_EVENT) ==
8147 		    NDI_SUCCESS) {
8148 			/*
8149 			 * release the hold that ndi_devi_config_one() placed
8150 			 */
8151 			ndi_rele_devi(child);
8152 			rv = MDI_SUCCESS;
8153 		}
8154 
8155 		/* release the hold that e_ddi_hold_devi_by_path() placed */
8156 		ndi_rele_devi(ph_dip);
8157 	}
8158 
8159 	devnm--;
8160 	*devnm = '/';
8161 	return (rv);
8162 }
8163 
8164 /*
8165  * Build a list of phci client paths for the specified vhci client.
8166  * The list includes only those phci client paths which aren't configured yet.
8167  */
8168 static mdi_phys_path_t *
8169 build_phclient_path_list(mdi_vhcache_client_t *cct, char *ct_name)
8170 {
8171 	mdi_vhcache_pathinfo_t *cpi;
8172 	mdi_phys_path_t *pp_head = NULL, *pp_tail = NULL, *pp;
8173 	int config_path, len;
8174 
8175 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8176 		/*
8177 		 * include only those paths that aren't configured.
8178 		 */
8179 		config_path = 0;
8180 		if (cpi->cpi_pip == NULL)
8181 			config_path = 1;
8182 		else {
8183 			MDI_PI_LOCK(cpi->cpi_pip);
8184 			if (MDI_PI_IS_INIT(cpi->cpi_pip))
8185 				config_path = 1;
8186 			MDI_PI_UNLOCK(cpi->cpi_pip);
8187 		}
8188 
8189 		if (config_path) {
8190 			pp = kmem_alloc(sizeof (*pp), KM_SLEEP);
8191 			len = strlen(cpi->cpi_cphci->cphci_path) +
8192 			    strlen(ct_name) + strlen(cpi->cpi_addr) + 3;
8193 			pp->phys_path = kmem_alloc(len, KM_SLEEP);
8194 			(void) snprintf(pp->phys_path, len, "%s/%s@%s",
8195 			    cpi->cpi_cphci->cphci_path, ct_name,
8196 			    cpi->cpi_addr);
8197 			pp->phys_path_next = NULL;
8198 
8199 			if (pp_head == NULL)
8200 				pp_head = pp;
8201 			else
8202 				pp_tail->phys_path_next = pp;
8203 			pp_tail = pp;
8204 		}
8205 	}
8206 
8207 	return (pp_head);
8208 }
8209 
8210 /*
8211  * Free the memory allocated for phci client path list.
8212  */
8213 static void
8214 free_phclient_path_list(mdi_phys_path_t *pp_head)
8215 {
8216 	mdi_phys_path_t *pp, *pp_next;
8217 
8218 	for (pp = pp_head; pp != NULL; pp = pp_next) {
8219 		pp_next = pp->phys_path_next;
8220 		kmem_free(pp->phys_path, strlen(pp->phys_path) + 1);
8221 		kmem_free(pp, sizeof (*pp));
8222 	}
8223 }
8224 
8225 /*
8226  * Allocated async client structure and initialize with the specified values.
8227  */
8228 static mdi_async_client_config_t *
8229 alloc_async_client_config(char *ct_name, char *ct_addr,
8230     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8231 {
8232 	mdi_async_client_config_t *acc;
8233 
8234 	acc = kmem_alloc(sizeof (*acc), KM_SLEEP);
8235 	acc->acc_ct_name = i_ddi_strdup(ct_name, KM_SLEEP);
8236 	acc->acc_ct_addr = i_ddi_strdup(ct_addr, KM_SLEEP);
8237 	acc->acc_phclient_path_list_head = pp_head;
8238 	init_vhcache_lookup_token(&acc->acc_token, tok);
8239 	acc->acc_next = NULL;
8240 	return (acc);
8241 }
8242 
8243 /*
8244  * Free the memory allocated for the async client structure and their members.
8245  */
8246 static void
8247 free_async_client_config(mdi_async_client_config_t *acc)
8248 {
8249 	if (acc->acc_phclient_path_list_head)
8250 		free_phclient_path_list(acc->acc_phclient_path_list_head);
8251 	kmem_free(acc->acc_ct_name, strlen(acc->acc_ct_name) + 1);
8252 	kmem_free(acc->acc_ct_addr, strlen(acc->acc_ct_addr) + 1);
8253 	kmem_free(acc, sizeof (*acc));
8254 }
8255 
8256 /*
8257  * Sort vhcache pathinfos (cpis) of the specified client.
8258  * All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
8259  * flag set come at the beginning of the list. All cpis which have this
8260  * flag set come at the end of the list.
8261  */
8262 static void
8263 sort_vhcache_paths(mdi_vhcache_client_t *cct)
8264 {
8265 	mdi_vhcache_pathinfo_t *cpi, *cpi_next, *cpi_head;
8266 
8267 	cpi_head = cct->cct_cpi_head;
8268 	cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
8269 	for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
8270 		cpi_next = cpi->cpi_next;
8271 		enqueue_vhcache_pathinfo(cct, cpi);
8272 	}
8273 }
8274 
8275 /*
8276  * Verify whether MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag setting is correct for
8277  * every vhcache pathinfo of the specified client. If not adjust the flag
8278  * setting appropriately.
8279  *
8280  * Note that MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag is persisted in the
8281  * on-disk vhci cache. So every time this flag is updated the cache must be
8282  * flushed.
8283  */
8284 static void
8285 adjust_sort_vhcache_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8286     mdi_vhcache_lookup_token_t *tok)
8287 {
8288 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8289 	mdi_vhcache_client_t *cct;
8290 	mdi_vhcache_pathinfo_t *cpi;
8291 
8292 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8293 	if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, tok))
8294 	    == NULL) {
8295 		rw_exit(&vhcache->vhcache_lock);
8296 		return;
8297 	}
8298 
8299 	/*
8300 	 * to avoid unnecessary on-disk cache updates, first check if an
8301 	 * update is really needed. If no update is needed simply return.
8302 	 */
8303 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8304 		if ((cpi->cpi_pip != NULL &&
8305 		    (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST)) ||
8306 		    (cpi->cpi_pip == NULL &&
8307 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))) {
8308 			break;
8309 		}
8310 	}
8311 	if (cpi == NULL) {
8312 		rw_exit(&vhcache->vhcache_lock);
8313 		return;
8314 	}
8315 
8316 	if (rw_tryupgrade(&vhcache->vhcache_lock) == 0) {
8317 		rw_exit(&vhcache->vhcache_lock);
8318 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8319 		if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr,
8320 		    tok)) == NULL) {
8321 			rw_exit(&vhcache->vhcache_lock);
8322 			return;
8323 		}
8324 	}
8325 
8326 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8327 		if (cpi->cpi_pip != NULL)
8328 			cpi->cpi_flags &= ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8329 		else
8330 			cpi->cpi_flags |= MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8331 	}
8332 	sort_vhcache_paths(cct);
8333 
8334 	rw_exit(&vhcache->vhcache_lock);
8335 	vhcache_dirty(vhc);
8336 }
8337 
8338 /*
8339  * Configure all specified paths of the client.
8340  */
8341 static void
8342 config_client_paths_sync(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8343     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8344 {
8345 	mdi_phys_path_t *pp;
8346 
8347 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next)
8348 		(void) bus_config_one_phci_child(pp->phys_path);
8349 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, tok);
8350 }
8351 
8352 /*
8353  * Dequeue elements from vhci async client config list and bus configure
8354  * their corresponding phci clients.
8355  */
8356 static void
8357 config_client_paths_thread(void *arg)
8358 {
8359 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
8360 	mdi_async_client_config_t *acc;
8361 	clock_t quit_at_ticks;
8362 	clock_t idle_time = mdi_async_config_idle_time * TICKS_PER_SECOND;
8363 	callb_cpr_t cprinfo;
8364 
8365 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
8366 	    "mdi_config_client_paths");
8367 
8368 	for (; ; ) {
8369 		quit_at_ticks = ddi_get_lbolt() + idle_time;
8370 
8371 		mutex_enter(&vhc->vhc_lock);
8372 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8373 		    vhc->vhc_acc_list_head == NULL &&
8374 		    ddi_get_lbolt() < quit_at_ticks) {
8375 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
8376 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
8377 			    quit_at_ticks);
8378 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8379 		}
8380 
8381 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
8382 		    vhc->vhc_acc_list_head == NULL)
8383 			goto out;
8384 
8385 		acc = vhc->vhc_acc_list_head;
8386 		vhc->vhc_acc_list_head = acc->acc_next;
8387 		if (vhc->vhc_acc_list_head == NULL)
8388 			vhc->vhc_acc_list_tail = NULL;
8389 		vhc->vhc_acc_count--;
8390 		mutex_exit(&vhc->vhc_lock);
8391 
8392 		config_client_paths_sync(vhc, acc->acc_ct_name,
8393 		    acc->acc_ct_addr, acc->acc_phclient_path_list_head,
8394 		    &acc->acc_token);
8395 
8396 		free_async_client_config(acc);
8397 	}
8398 
8399 out:
8400 	vhc->vhc_acc_thrcount--;
8401 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
8402 	CALLB_CPR_EXIT(&cprinfo);
8403 }
8404 
8405 /*
8406  * Arrange for all the phci client paths (pp_head) for the specified client
8407  * to be bus configured asynchronously by a thread.
8408  */
8409 static void
8410 config_client_paths_async(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8411     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8412 {
8413 	mdi_async_client_config_t *acc, *newacc;
8414 	int create_thread;
8415 
8416 	if (pp_head == NULL)
8417 		return;
8418 
8419 	if (mdi_mtc_off) {
8420 		config_client_paths_sync(vhc, ct_name, ct_addr, pp_head, tok);
8421 		free_phclient_path_list(pp_head);
8422 		return;
8423 	}
8424 
8425 	newacc = alloc_async_client_config(ct_name, ct_addr, pp_head, tok);
8426 	ASSERT(newacc);
8427 
8428 	mutex_enter(&vhc->vhc_lock);
8429 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc->acc_next) {
8430 		if (strcmp(ct_name, acc->acc_ct_name) == 0 &&
8431 		    strcmp(ct_addr, acc->acc_ct_addr) == 0) {
8432 			free_async_client_config(newacc);
8433 			mutex_exit(&vhc->vhc_lock);
8434 			return;
8435 		}
8436 	}
8437 
8438 	if (vhc->vhc_acc_list_head == NULL)
8439 		vhc->vhc_acc_list_head = newacc;
8440 	else
8441 		vhc->vhc_acc_list_tail->acc_next = newacc;
8442 	vhc->vhc_acc_list_tail = newacc;
8443 	vhc->vhc_acc_count++;
8444 	if (vhc->vhc_acc_count <= vhc->vhc_acc_thrcount) {
8445 		cv_broadcast(&vhc->vhc_cv);
8446 		create_thread = 0;
8447 	} else {
8448 		vhc->vhc_acc_thrcount++;
8449 		create_thread = 1;
8450 	}
8451 	mutex_exit(&vhc->vhc_lock);
8452 
8453 	if (create_thread)
8454 		(void) thread_create(NULL, 0, config_client_paths_thread, vhc,
8455 		    0, &p0, TS_RUN, minclsyspri);
8456 }
8457 
8458 /*
8459  * Return number of online paths for the specified client.
8460  */
8461 static int
8462 nonline_paths(mdi_vhcache_client_t *cct)
8463 {
8464 	mdi_vhcache_pathinfo_t *cpi;
8465 	int online_count = 0;
8466 
8467 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8468 		if (cpi->cpi_pip != NULL) {
8469 			MDI_PI_LOCK(cpi->cpi_pip);
8470 			if (cpi->cpi_pip->pi_state == MDI_PATHINFO_STATE_ONLINE)
8471 				online_count++;
8472 			MDI_PI_UNLOCK(cpi->cpi_pip);
8473 		}
8474 	}
8475 
8476 	return (online_count);
8477 }
8478 
8479 /*
8480  * Bus configure all paths for the specified vhci client.
8481  * If at least one path for the client is already online, the remaining paths
8482  * will be configured asynchronously. Otherwise, it synchronously configures
8483  * the paths until at least one path is online and then rest of the paths
8484  * will be configured asynchronously.
8485  */
8486 static void
8487 config_client_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr)
8488 {
8489 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8490 	mdi_phys_path_t *pp_head, *pp;
8491 	mdi_vhcache_client_t *cct;
8492 	mdi_vhcache_lookup_token_t tok;
8493 
8494 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
8495 
8496 	init_vhcache_lookup_token(&tok, NULL);
8497 
8498 	if (ct_name == NULL || ct_addr == NULL ||
8499 	    (cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, &tok))
8500 	    == NULL ||
8501 	    (pp_head = build_phclient_path_list(cct, ct_name)) == NULL) {
8502 		rw_exit(&vhcache->vhcache_lock);
8503 		return;
8504 	}
8505 
8506 	/* if at least one path is online, configure the rest asynchronously */
8507 	if (nonline_paths(cct) > 0) {
8508 		rw_exit(&vhcache->vhcache_lock);
8509 		config_client_paths_async(vhc, ct_name, ct_addr, pp_head, &tok);
8510 		return;
8511 	}
8512 
8513 	rw_exit(&vhcache->vhcache_lock);
8514 
8515 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next) {
8516 		if (bus_config_one_phci_child(pp->phys_path) == MDI_SUCCESS) {
8517 			rw_enter(&vhcache->vhcache_lock, RW_READER);
8518 
8519 			if ((cct = lookup_vhcache_client(vhcache, ct_name,
8520 			    ct_addr, &tok)) == NULL) {
8521 				rw_exit(&vhcache->vhcache_lock);
8522 				goto out;
8523 			}
8524 
8525 			if (nonline_paths(cct) > 0 &&
8526 			    pp->phys_path_next != NULL) {
8527 				rw_exit(&vhcache->vhcache_lock);
8528 				config_client_paths_async(vhc, ct_name, ct_addr,
8529 				    pp->phys_path_next, &tok);
8530 				pp->phys_path_next = NULL;
8531 				goto out;
8532 			}
8533 
8534 			rw_exit(&vhcache->vhcache_lock);
8535 		}
8536 	}
8537 
8538 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, &tok);
8539 out:
8540 	free_phclient_path_list(pp_head);
8541 }
8542 
/*
 * Acquire the single-threaded vhci config "lock": wait until no other
 * thread holds MDI_VHC_SINGLE_THREADED, then claim it.  Released by
 * single_threaded_vhconfig_exit().
 */
static void
single_threaded_vhconfig_enter(mdi_vhci_config_t *vhc)
{
	mutex_enter(&vhc->vhc_lock);
	while (vhc->vhc_flags & MDI_VHC_SINGLE_THREADED)
		cv_wait(&vhc->vhc_cv, &vhc->vhc_lock);
	vhc->vhc_flags |= MDI_VHC_SINGLE_THREADED;
	mutex_exit(&vhc->vhc_lock);
}
8551 }
8552 
8553 static void
8554 single_threaded_vhconfig_exit(mdi_vhci_config_t *vhc)
8555 {
8556 	mutex_enter(&vhc->vhc_lock);
8557 	vhc->vhc_flags &= ~MDI_VHC_SINGLE_THREADED;
8558 	cv_broadcast(&vhc->vhc_cv);
8559 	mutex_exit(&vhc->vhc_lock);
8560 }
8561 
/* built-in per-phci-driver capability record; see tables below */
typedef struct mdi_phci_driver_info {
	char	*phdriver_name;	/* name of the phci driver */

	/* set to non zero if the phci driver supports root device */
	int	phdriver_root_support;
} mdi_phci_driver_info_t;
8568 
8569 /*
8570  * vhci class and root support capability of a phci driver can be
8571  * specified using ddi-vhci-class and ddi-no-root-support properties in the
8572  * phci driver.conf file. The built-in tables below contain this information
8573  * for those phci drivers whose driver.conf files don't yet contain this info.
8574  *
8575  * All phci drivers expect iscsi have root device support.
8576  */
8577 static mdi_phci_driver_info_t scsi_phci_driver_list[] = {
8578 	{ "fp", 1 },
8579 	{ "iscsi", 0 },
8580 	{ "ibsrp", 1 }
8581 	};
8582 
8583 static mdi_phci_driver_info_t ib_phci_driver_list[] = { "tavor", 1 };
8584 
8585 static void *
8586 mdi_realloc(void *old_ptr, size_t old_size, size_t new_size)
8587 {
8588 	void *new_ptr;
8589 
8590 	new_ptr = kmem_zalloc(new_size, KM_SLEEP);
8591 	if (old_ptr) {
8592 		bcopy(old_ptr, new_ptr, MIN(old_size, new_size));
8593 		kmem_free(old_ptr, old_size);
8594 	}
8595 	return (new_ptr);
8596 }
8597 
/*
 * Append a (driver name, root support) entry to the parallel
 * driver_list/root_support_list arrays, growing both arrays in
 * increments of 10 elements as needed.  cur_elements and max_elements
 * track the in-use and allocated sizes of both arrays.
 */
static void
add_to_phci_list(char ***driver_list, int **root_support_list,
    int *cur_elements, int *max_elements, char *driver_name, int root_support)
{
	ASSERT(*cur_elements <= *max_elements);
	if (*cur_elements == *max_elements) {
		*max_elements += 10;
		*driver_list = mdi_realloc(*driver_list,
		    sizeof (char *) * (*cur_elements),
		    sizeof (char *) * (*max_elements));
		*root_support_list = mdi_realloc(*root_support_list,
		    sizeof (int) * (*cur_elements),
		    sizeof (int) * (*max_elements));
	}
	(*driver_list)[*cur_elements] = i_ddi_strdup(driver_name, KM_SLEEP);
	(*root_support_list)[*cur_elements] = root_support;
	(*cur_elements)++;
}
8616 
/*
 * Build the list of phci drivers for the specified vhci class, as
 * parallel driver-name / root-support arrays.
 *
 * Drivers are collected from two sources: (1) drivers whose driver.conf
 * declares a matching ddi-vhci-class property, and (2) the built-in
 * tables above (skipping drivers already found via driver.conf).
 *
 * Caller frees the returned arrays (see attach_phci_drivers()).
 */
static void
get_phci_driver_list(char *vhci_class, char ***driver_list,
    int **root_support_list, int *cur_elements, int *max_elements)
{
	mdi_phci_driver_info_t	*st_driver_list, *p;
	int		st_ndrivers, root_support, i, j, driver_conf_count;
	major_t		m;
	struct devnames	*dnp;
	ddi_prop_t	*propp;

	*driver_list = NULL;
	*root_support_list = NULL;
	*cur_elements = 0;
	*max_elements = 0;

	/* add the phci drivers derived from the phci driver.conf files */
	for (m = 0; m < devcnt; m++) {
		dnp = &devnamesp[m];

		if (dnp->dn_flags & DN_PHCI_DRIVER) {
			LOCK_DEV_OPS(&dnp->dn_lock);
			if (dnp->dn_global_prop_ptr != NULL &&
			    (propp = i_ddi_prop_search(DDI_DEV_T_ANY,
			    DDI_VHCI_CLASS, DDI_PROP_TYPE_STRING,
			    &dnp->dn_global_prop_ptr->prop_list)) != NULL &&
			    strcmp(propp->prop_val, vhci_class) == 0) {

				/*
				 * root support unless the driver.conf has
				 * a ddi-no-root-support property
				 */
				root_support = (i_ddi_prop_search(DDI_DEV_T_ANY,
				    DDI_NO_ROOT_SUPPORT, DDI_PROP_TYPE_INT,
				    &dnp->dn_global_prop_ptr->prop_list)
				    == NULL) ? 1 : 0;

				add_to_phci_list(driver_list, root_support_list,
				    cur_elements, max_elements, dnp->dn_name,
				    root_support);

				UNLOCK_DEV_OPS(&dnp->dn_lock);
			} else
				UNLOCK_DEV_OPS(&dnp->dn_lock);
		}
	}

	driver_conf_count = *cur_elements;

	/* add the phci drivers specified in the built-in tables */
	if (strcmp(vhci_class, MDI_HCI_CLASS_SCSI) == 0) {
		st_driver_list = scsi_phci_driver_list;
		st_ndrivers = sizeof (scsi_phci_driver_list) /
		    sizeof (mdi_phci_driver_info_t);
	} else if (strcmp(vhci_class, MDI_HCI_CLASS_IB) == 0) {
		st_driver_list = ib_phci_driver_list;
		st_ndrivers = sizeof (ib_phci_driver_list) /
		    sizeof (mdi_phci_driver_info_t);
	} else {
		st_driver_list = NULL;
		st_ndrivers = 0;
	}

	for (i = 0, p = st_driver_list; i < st_ndrivers; i++, p++) {
		/* add this phci driver if not already added before */
		for (j = 0; j < driver_conf_count; j++) {
			if (strcmp((*driver_list)[j], p->phdriver_name) == 0)
				break;
		}
		if (j == driver_conf_count) {
			add_to_phci_list(driver_list, root_support_list,
			    cur_elements, max_elements, p->phdriver_name,
			    p->phdriver_root_support);
		}
	}
}
8688 
8689 /*
8690  * Attach the phci driver instances associated with the specified vhci class.
8691  * If root is mounted attach all phci driver instances.
8692  * If root is not mounted, attach the instances of only those phci
8693  * drivers that have the root support.
8694  */
8695 static void
8696 attach_phci_drivers(char *vhci_class)
8697 {
8698 	char	**driver_list, **p;
8699 	int	*root_support_list;
8700 	int	cur_elements, max_elements, i;
8701 	major_t	m;
8702 
8703 	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
8704 	    &cur_elements, &max_elements);
8705 
8706 	for (i = 0; i < cur_elements; i++) {
8707 		if (modrootloaded || root_support_list[i]) {
8708 			m = ddi_name_to_major(driver_list[i]);
8709 			if (m != DDI_MAJOR_T_NONE &&
8710 			    ddi_hold_installed_driver(m))
8711 				ddi_rele_driver(m);
8712 		}
8713 	}
8714 
8715 	if (driver_list) {
8716 		for (i = 0, p = driver_list; i < cur_elements; i++, p++)
8717 			kmem_free(*p, strlen(*p) + 1);
8718 		kmem_free(driver_list, sizeof (char *) * max_elements);
8719 		kmem_free(root_support_list, sizeof (int) * max_elements);
8720 	}
8721 }
8722 
8723 /*
8724  * Build vhci cache:
8725  *
8726  * Attach phci driver instances and then drive BUS_CONFIG_ALL on
8727  * the phci driver instances. During this process the cache gets built.
8728  *
8729  * Cache is built fully if the root is mounted.
8730  * If the root is not mounted, phci drivers that do not have root support
8731  * are not attached. As a result the cache is built partially. The entries
8732  * in the cache reflect only those phci drivers that have root support.
8733  */
8734 static int
8735 build_vhci_cache(mdi_vhci_t *vh)
8736 {
8737 	mdi_vhci_config_t *vhc = vh->vh_config;
8738 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8739 
8740 	single_threaded_vhconfig_enter(vhc);
8741 
8742 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8743 	if (vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE) {
8744 		rw_exit(&vhcache->vhcache_lock);
8745 		single_threaded_vhconfig_exit(vhc);
8746 		return (0);
8747 	}
8748 	rw_exit(&vhcache->vhcache_lock);
8749 
8750 	attach_phci_drivers(vh->vh_class);
8751 	bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE | NDI_NO_EVENT,
8752 	    BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);
8753 
8754 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8755 	vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
8756 	rw_exit(&vhcache->vhcache_lock);
8757 
8758 	single_threaded_vhconfig_exit(vhc);
8759 	vhcache_dirty(vhc);
8760 	return (1);
8761 }
8762 
8763 /*
8764  * Determine if discovery of paths is needed.
8765  */
8766 static int
8767 vhcache_do_discovery(mdi_vhci_config_t *vhc)
8768 {
8769 	int rv = 1;
8770 
8771 	mutex_enter(&vhc->vhc_lock);
8772 	if (i_ddi_io_initialized() == 0) {
8773 		if (vhc->vhc_path_discovery_boot > 0) {
8774 			vhc->vhc_path_discovery_boot--;
8775 			goto out;
8776 		}
8777 	} else {
8778 		if (vhc->vhc_path_discovery_postboot > 0) {
8779 			vhc->vhc_path_discovery_postboot--;
8780 			goto out;
8781 		}
8782 	}
8783 
8784 	/*
8785 	 * Do full path discovery at most once per mdi_path_discovery_interval.
8786 	 * This is to avoid a series of full path discoveries when opening
8787 	 * stale /dev/[r]dsk links.
8788 	 */
8789 	if (mdi_path_discovery_interval != -1 &&
8790 	    lbolt64 >= vhc->vhc_path_discovery_cutoff_time)
8791 		goto out;
8792 
8793 	rv = 0;
8794 out:
8795 	mutex_exit(&vhc->vhc_lock);
8796 	return (rv);
8797 }
8798 
8799 /*
8800  * Discover all paths:
8801  *
8802  * Attach phci driver instances and then drive BUS_CONFIG_ALL on all the phci
8803  * driver instances. During this process all paths will be discovered.
8804  */
8805 static int
8806 vhcache_discover_paths(mdi_vhci_t *vh)
8807 {
8808 	mdi_vhci_config_t *vhc = vh->vh_config;
8809 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8810 	int rv = 0;
8811 
8812 	single_threaded_vhconfig_enter(vhc);
8813 
8814 	if (vhcache_do_discovery(vhc)) {
8815 		attach_phci_drivers(vh->vh_class);
8816 		bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE |
8817 		    NDI_NO_EVENT, BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);
8818 
8819 		mutex_enter(&vhc->vhc_lock);
8820 		vhc->vhc_path_discovery_cutoff_time = lbolt64 +
8821 		    mdi_path_discovery_interval * TICKS_PER_SECOND;
8822 		mutex_exit(&vhc->vhc_lock);
8823 		rv = 1;
8824 	}
8825 
8826 	single_threaded_vhconfig_exit(vhc);
8827 	return (rv);
8828 }
8829 
8830 /*
8831  * Generic vhci bus config implementation:
8832  *
8833  * Parameters
8834  *	vdip	vhci dip
8835  *	flags	bus config flags
8836  *	op	bus config operation
8837  *	The remaining parameters are bus config operation specific
8838  *
8839  * for BUS_CONFIG_ONE
8840  *	arg	pointer to name@addr
8841  *	child	upon successful return from this function, *child will be
8842  *		set to the configured and held devinfo child node of vdip.
8843  *	ct_addr	pointer to client address (i.e. GUID)
8844  *
8845  * for BUS_CONFIG_DRIVER
8846  *	arg	major number of the driver
8847  *	child and ct_addr parameters are ignored
8848  *
8849  * for BUS_CONFIG_ALL
8850  *	arg, child, and ct_addr parameters are ignored
8851  *
8852  * Note that for the rest of the bus config operations, this function simply
8853  * calls the framework provided default bus config routine.
8854  */
8855 int
8856 mdi_vhci_bus_config(dev_info_t *vdip, uint_t flags, ddi_bus_config_op_t op,
8857     void *arg, dev_info_t **child, char *ct_addr)
8858 {
8859 	mdi_vhci_t *vh = i_devi_get_vhci(vdip);
8860 	mdi_vhci_config_t *vhc = vh->vh_config;
8861 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8862 	int rv = 0;
8863 	int params_valid = 0;
8864 	char *cp;
8865 
8866 	/*
8867 	 * To bus config vhcis we relay operation, possibly using another
8868 	 * thread, to phcis. The phci driver then interacts with MDI to cause
8869 	 * vhci child nodes to be enumerated under the vhci node.  Adding a
8870 	 * vhci child requires an ndi_devi_enter of the vhci. Since another
8871 	 * thread may be adding the child, to avoid deadlock we can't wait
8872 	 * for the relayed operations to complete if we have already entered
8873 	 * the vhci node.
8874 	 */
8875 	if (DEVI_BUSY_OWNED(vdip)) {
8876 		MDI_DEBUG(2, (CE_NOTE, vdip, "!MDI: vhci bus config: "
8877 		    "vhci dip is busy owned %p\n", (void *)vdip));
8878 		goto default_bus_config;
8879 	}
8880 
8881 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8882 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
8883 		rw_exit(&vhcache->vhcache_lock);
8884 		rv = build_vhci_cache(vh);
8885 		rw_enter(&vhcache->vhcache_lock, RW_READER);
8886 	}
8887 
8888 	switch (op) {
8889 	case BUS_CONFIG_ONE:
8890 		if (arg != NULL && ct_addr != NULL) {
8891 			/* extract node name */
8892 			cp = (char *)arg;
8893 			while (*cp != '\0' && *cp != '@')
8894 				cp++;
8895 			if (*cp == '@') {
8896 				params_valid = 1;
8897 				*cp = '\0';
8898 				config_client_paths(vhc, (char *)arg, ct_addr);
8899 				/* config_client_paths() releases cache_lock */
8900 				*cp = '@';
8901 				break;
8902 			}
8903 		}
8904 
8905 		rw_exit(&vhcache->vhcache_lock);
8906 		break;
8907 
8908 	case BUS_CONFIG_DRIVER:
8909 		rw_exit(&vhcache->vhcache_lock);
8910 		if (rv == 0)
8911 			st_bus_config_all_phcis(vhc, flags, op,
8912 			    (major_t)(uintptr_t)arg);
8913 		break;
8914 
8915 	case BUS_CONFIG_ALL:
8916 		rw_exit(&vhcache->vhcache_lock);
8917 		if (rv == 0)
8918 			st_bus_config_all_phcis(vhc, flags, op, -1);
8919 		break;
8920 
8921 	default:
8922 		rw_exit(&vhcache->vhcache_lock);
8923 		break;
8924 	}
8925 
8926 
8927 default_bus_config:
8928 	/*
8929 	 * All requested child nodes are enumerated under the vhci.
8930 	 * Now configure them.
8931 	 */
8932 	if (ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
8933 	    NDI_SUCCESS) {
8934 		return (MDI_SUCCESS);
8935 	} else if (op == BUS_CONFIG_ONE && rv == 0 && params_valid) {
8936 		/* discover all paths and try configuring again */
8937 		if (vhcache_discover_paths(vh) &&
8938 		    ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
8939 		    NDI_SUCCESS)
8940 			return (MDI_SUCCESS);
8941 	}
8942 
8943 	return (MDI_FAILURE);
8944 }
8945 
8946 /*
8947  * Read the on-disk vhci cache into an nvlist for the specified vhci class.
8948  */
8949 static nvlist_t *
8950 read_on_disk_vhci_cache(char *vhci_class)
8951 {
8952 	nvlist_t *nvl;
8953 	int err;
8954 	char *filename;
8955 
8956 	filename = vhclass2vhcache_filename(vhci_class);
8957 
8958 	if ((err = fread_nvlist(filename, &nvl)) == 0) {
8959 		kmem_free(filename, strlen(filename) + 1);
8960 		return (nvl);
8961 	} else if (err == EIO)
8962 		cmn_err(CE_WARN, "%s: I/O error, will recreate\n", filename);
8963 	else if (err == EINVAL)
8964 		cmn_err(CE_WARN,
8965 		    "%s: data file corrupted, will recreate\n", filename);
8966 
8967 	kmem_free(filename, strlen(filename) + 1);
8968 	return (NULL);
8969 }
8970 
8971 /*
8972  * Read on-disk vhci cache into nvlists for all vhci classes.
8973  * Called during booting by i_ddi_read_devices_files().
8974  */
8975 void
8976 mdi_read_devices_files(void)
8977 {
8978 	int i;
8979 
8980 	for (i = 0; i < N_VHCI_CLASSES; i++)
8981 		vhcache_nvl[i] = read_on_disk_vhci_cache(vhci_class_list[i]);
8982 }
8983 
8984 /*
8985  * Remove all stale entries from vhci cache.
8986  */
8987 static void
8988 clean_vhcache(mdi_vhci_config_t *vhc)
8989 {
8990 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8991 	mdi_vhcache_phci_t *cphci, *cphci_head, *cphci_next;
8992 	mdi_vhcache_client_t *cct, *cct_head, *cct_next;
8993 	mdi_vhcache_pathinfo_t *cpi, *cpi_head, *cpi_next;
8994 
8995 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8996 
8997 	cct_head = vhcache->vhcache_client_head;
8998 	vhcache->vhcache_client_head = vhcache->vhcache_client_tail = NULL;
8999 	for (cct = cct_head; cct != NULL; cct = cct_next) {
9000 		cct_next = cct->cct_next;
9001 
9002 		cpi_head = cct->cct_cpi_head;
9003 		cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
9004 		for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
9005 			cpi_next = cpi->cpi_next;
9006 			if (cpi->cpi_pip != NULL) {
9007 				ASSERT(cpi->cpi_cphci->cphci_phci != NULL);
9008 				enqueue_tail_vhcache_pathinfo(cct, cpi);
9009 			} else
9010 				free_vhcache_pathinfo(cpi);
9011 		}
9012 
9013 		if (cct->cct_cpi_head != NULL)
9014 			enqueue_vhcache_client(vhcache, cct);
9015 		else {
9016 			(void) mod_hash_destroy(vhcache->vhcache_client_hash,
9017 			    (mod_hash_key_t)cct->cct_name_addr);
9018 			free_vhcache_client(cct);
9019 		}
9020 	}
9021 
9022 	cphci_head = vhcache->vhcache_phci_head;
9023 	vhcache->vhcache_phci_head = vhcache->vhcache_phci_tail = NULL;
9024 	for (cphci = cphci_head; cphci != NULL; cphci = cphci_next) {
9025 		cphci_next = cphci->cphci_next;
9026 		if (cphci->cphci_phci != NULL)
9027 			enqueue_vhcache_phci(vhcache, cphci);
9028 		else
9029 			free_vhcache_phci(cphci);
9030 	}
9031 
9032 	vhcache->vhcache_clean_time = lbolt64;
9033 	rw_exit(&vhcache->vhcache_lock);
9034 	vhcache_dirty(vhc);
9035 }
9036 
9037 /*
9038  * Remove all stale entries from vhci cache.
9039  * Called by i_ddi_clean_devices_files() during the execution of devfsadm -C
9040  */
9041 void
9042 mdi_clean_vhcache(void)
9043 {
9044 	mdi_vhci_t *vh;
9045 
9046 	mutex_enter(&mdi_mutex);
9047 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9048 		vh->vh_refcnt++;
9049 		mutex_exit(&mdi_mutex);
9050 		clean_vhcache(vh->vh_config);
9051 		mutex_enter(&mdi_mutex);
9052 		vh->vh_refcnt--;
9053 	}
9054 	mutex_exit(&mdi_mutex);
9055 }
9056 
9057 /*
9058  * mdi_vhci_walk_clients():
9059  *		Walker routine to traverse client dev_info nodes
9060  * ddi_walk_devs(ddi_get_child(vdip), f, arg) returns the entire tree
9061  * below the client, including nexus devices, which we dont want.
9062  * So we just traverse the immediate siblings, starting from 1st client.
9063  */
9064 void
9065 mdi_vhci_walk_clients(dev_info_t *vdip,
9066     int (*f)(dev_info_t *, void *), void *arg)
9067 {
9068 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9069 	dev_info_t	*cdip;
9070 	mdi_client_t	*ct;
9071 
9072 	MDI_VHCI_CLIENT_LOCK(vh);
9073 	cdip = ddi_get_child(vdip);
9074 	while (cdip) {
9075 		ct = i_devi_get_client(cdip);
9076 		MDI_CLIENT_LOCK(ct);
9077 
9078 		if (((*f)(cdip, arg)) == DDI_WALK_CONTINUE)
9079 			cdip = ddi_get_next_sibling(cdip);
9080 		else
9081 			cdip = NULL;
9082 
9083 		MDI_CLIENT_UNLOCK(ct);
9084 	}
9085 	MDI_VHCI_CLIENT_UNLOCK(vh);
9086 }
9087 
9088 /*
9089  * mdi_vhci_walk_phcis():
9090  *		Walker routine to traverse phci dev_info nodes
9091  */
9092 void
9093 mdi_vhci_walk_phcis(dev_info_t *vdip,
9094     int (*f)(dev_info_t *, void *), void *arg)
9095 {
9096 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9097 	mdi_phci_t	*ph, *next;
9098 
9099 	MDI_VHCI_PHCI_LOCK(vh);
9100 	ph = vh->vh_phci_head;
9101 	while (ph) {
9102 		MDI_PHCI_LOCK(ph);
9103 
9104 		if (((*f)(ph->ph_dip, arg)) == DDI_WALK_CONTINUE)
9105 			next = ph->ph_next;
9106 		else
9107 			next = NULL;
9108 
9109 		MDI_PHCI_UNLOCK(ph);
9110 		ph = next;
9111 	}
9112 	MDI_VHCI_PHCI_UNLOCK(vh);
9113 }
9114 
9115 
9116 /*
9117  * mdi_walk_vhcis():
9118  *		Walker routine to traverse vhci dev_info nodes
9119  */
9120 void
9121 mdi_walk_vhcis(int (*f)(dev_info_t *, void *), void *arg)
9122 {
9123 	mdi_vhci_t	*vh = NULL;
9124 
9125 	mutex_enter(&mdi_mutex);
9126 	/*
9127 	 * Scan for already registered vhci
9128 	 */
9129 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9130 		vh->vh_refcnt++;
9131 		mutex_exit(&mdi_mutex);
9132 		if (((*f)(vh->vh_dip, arg)) != DDI_WALK_CONTINUE) {
9133 			mutex_enter(&mdi_mutex);
9134 			vh->vh_refcnt--;
9135 			break;
9136 		} else {
9137 			mutex_enter(&mdi_mutex);
9138 			vh->vh_refcnt--;
9139 		}
9140 	}
9141 
9142 	mutex_exit(&mdi_mutex);
9143 }
9144 
9145 /*
9146  * i_mdi_log_sysevent():
9147  *		Logs events for pickup by syseventd
9148  */
9149 static void
9150 i_mdi_log_sysevent(dev_info_t *dip, char *ph_vh_class, char *subclass)
9151 {
9152 	char		*path_name;
9153 	nvlist_t	*attr_list;
9154 
9155 	if (nvlist_alloc(&attr_list, NV_UNIQUE_NAME_TYPE,
9156 	    KM_SLEEP) != DDI_SUCCESS) {
9157 		goto alloc_failed;
9158 	}
9159 
9160 	path_name = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
9161 	(void) ddi_pathname(dip, path_name);
9162 
9163 	if (nvlist_add_string(attr_list, DDI_DRIVER_NAME,
9164 	    ddi_driver_name(dip)) != DDI_SUCCESS) {
9165 		goto error;
9166 	}
9167 
9168 	if (nvlist_add_int32(attr_list, DDI_DRIVER_MAJOR,
9169 	    (int32_t)ddi_driver_major(dip)) != DDI_SUCCESS) {
9170 		goto error;
9171 	}
9172 
9173 	if (nvlist_add_int32(attr_list, DDI_INSTANCE,
9174 	    (int32_t)ddi_get_instance(dip)) != DDI_SUCCESS) {
9175 		goto error;
9176 	}
9177 
9178 	if (nvlist_add_string(attr_list, DDI_PATHNAME,
9179 	    path_name) != DDI_SUCCESS) {
9180 		goto error;
9181 	}
9182 
9183 	if (nvlist_add_string(attr_list, DDI_CLASS,
9184 	    ph_vh_class) != DDI_SUCCESS) {
9185 		goto error;
9186 	}
9187 
9188 	(void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW, EC_DDI, subclass,
9189 	    attr_list, NULL, DDI_SLEEP);
9190 
9191 error:
9192 	kmem_free(path_name, MAXPATHLEN);
9193 	nvlist_free(attr_list);
9194 	return;
9195 
9196 alloc_failed:
9197 	MDI_DEBUG(1, (CE_WARN, dip,
9198 	    "!i_mdi_log_sysevent: Unable to send sysevent"));
9199 }
9200 
9201 char **
9202 mdi_get_phci_driver_list(char *vhci_class, int	*ndrivers)
9203 {
9204 	char	**driver_list, **ret_driver_list = NULL;
9205 	int	*root_support_list;
9206 	int	cur_elements, max_elements;
9207 
9208 	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
9209 	    &cur_elements, &max_elements);
9210 
9211 
9212 	if (driver_list) {
9213 		kmem_free(root_support_list, sizeof (int) * max_elements);
9214 		ret_driver_list = mdi_realloc(driver_list, sizeof (char *)
9215 		    * max_elements, sizeof (char *) * cur_elements);
9216 	}
9217 	*ndrivers = cur_elements;
9218 
9219 	return (ret_driver_list);
9220 
9221 }
9222 
9223 void
9224 mdi_free_phci_driver_list(char **driver_list, int ndrivers)
9225 {
9226 	char	**p;
9227 	int	i;
9228 
9229 	if (driver_list) {
9230 		for (i = 0, p = driver_list; i < ndrivers; i++, p++)
9231 			kmem_free(*p, strlen(*p) + 1);
9232 		kmem_free(driver_list, sizeof (char *) * ndrivers);
9233 	}
9234 }
9235