xref: /titanic_41/usr/src/uts/common/os/sunmdi.c (revision 38b58fe3ef2f055821cf2e96d60de0b76624522e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
/*
 * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more
 * detailed discussion of the overall mpxio architecture.
 *
 * Default locking order:
 *
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_phci_mutex))
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_client_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_phci_mutex mdi_phci::ph_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_client_mutex mdi_client::ct_mutex))
 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
 * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
 */
40 
41 #include <sys/note.h>
42 #include <sys/types.h>
43 #include <sys/varargs.h>
44 #include <sys/param.h>
45 #include <sys/errno.h>
46 #include <sys/uio.h>
47 #include <sys/buf.h>
48 #include <sys/modctl.h>
49 #include <sys/open.h>
50 #include <sys/kmem.h>
51 #include <sys/poll.h>
52 #include <sys/conf.h>
53 #include <sys/bootconf.h>
54 #include <sys/cmn_err.h>
55 #include <sys/stat.h>
56 #include <sys/ddi.h>
57 #include <sys/sunddi.h>
58 #include <sys/ddipropdefs.h>
59 #include <sys/sunndi.h>
60 #include <sys/ndi_impldefs.h>
61 #include <sys/promif.h>
62 #include <sys/sunmdi.h>
63 #include <sys/mdi_impldefs.h>
64 #include <sys/taskq.h>
65 #include <sys/epm.h>
66 #include <sys/sunpm.h>
67 #include <sys/modhash.h>
68 #include <sys/disp.h>
69 #include <sys/autoconf.h>
70 #include <sys/sysmacros.h>
71 
#ifdef	DEBUG
#include <sys/debug.h>
int	mdi_debug = 1;
int	mdi_debug_logonly = 0;
/*
 * MDI_DEBUG(dbglevel, (args)):
 *	Call i_mdi_log() with the parenthesized argument list 'pargs' when
 *	mdi_debug >= dbglevel.  The argument list normally starts with one of
 *	MDI_WARN, MDI_NOTE or MDI_CONT, each of which supplies the cmn_err
 *	level followed by the name of the calling function.
 *
 *	The body is wrapped in do { } while (0) so that the macro behaves as
 *	one statement: used as the body of an unbraced 'if', its internal
 *	'if' can no longer capture the caller's 'else'.
 */
#define	MDI_DEBUG(dbglevel, pargs)	do {			\
	if (mdi_debug >= (dbglevel))				\
		i_mdi_log pargs;				\
_NOTE(CONSTCOND) } while (0)
#define	MDI_WARN	CE_WARN, __func__
#define	MDI_NOTE	CE_NOTE, __func__
#define	MDI_CONT	CE_CONT, __func__
static void i_mdi_log(int, const char *, dev_info_t *, const char *, ...);
#else	/* !DEBUG */
/* Non-DEBUG kernels compile all MDI_DEBUG() calls away. */
#define	MDI_DEBUG(dbglevel, pargs)
#endif	/* DEBUG */
int	mdi_debug_consoleonly = 0;
85 
86 extern pri_t	minclsyspri;
87 extern int	modrootloaded;
88 
89 /*
90  * Global mutex:
91  * Protects vHCI list and structure members.
92  */
93 kmutex_t	mdi_mutex;
94 
95 /*
96  * Registered vHCI class driver lists
97  */
98 int		mdi_vhci_count;
99 mdi_vhci_t	*mdi_vhci_head;
100 mdi_vhci_t	*mdi_vhci_tail;
101 
102 /*
103  * Client Hash Table size
104  */
105 static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;
106 
107 /*
108  * taskq interface definitions
109  */
110 #define	MDI_TASKQ_N_THREADS	8
111 #define	MDI_TASKQ_PRI		minclsyspri
112 #define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
113 #define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)
114 
115 taskq_t				*mdi_taskq;
116 static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;
117 
118 #define	TICKS_PER_SECOND	(drv_usectohz(1000000))
119 
120 /*
121  * The data should be "quiet" for this interval (in seconds) before the
122  * vhci cached data is flushed to the disk.
123  */
124 static int mdi_vhcache_flush_delay = 10;
125 
126 /* number of seconds the vhcache flush daemon will sleep idle before exiting */
127 static int mdi_vhcache_flush_daemon_idle_time = 60;
128 
129 /*
130  * MDI falls back to discovery of all paths when a bus_config_one fails.
131  * The following parameters can be used to tune this operation.
132  *
133  * mdi_path_discovery_boot
134  *	Number of times path discovery will be attempted during early boot.
135  *	Probably there is no reason to ever set this value to greater than one.
136  *
137  * mdi_path_discovery_postboot
138  *	Number of times path discovery will be attempted after early boot.
139  *	Set it to a minimum of two to allow for discovery of iscsi paths which
140  *	may happen very late during booting.
141  *
142  * mdi_path_discovery_interval
143  *	Minimum number of seconds MDI will wait between successive discovery
144  *	of all paths. Set it to -1 to disable discovery of all paths.
145  */
146 static int mdi_path_discovery_boot = 1;
147 static int mdi_path_discovery_postboot = 2;
148 static int mdi_path_discovery_interval = 10;
149 
150 /*
151  * number of seconds the asynchronous configuration thread will sleep idle
152  * before exiting.
153  */
154 static int mdi_async_config_idle_time = 600;
155 
156 static int mdi_bus_config_cache_hash_size = 256;
157 
158 /* turns off multithreaded configuration for certain operations */
159 static int mdi_mtc_off = 0;
160 
161 /*
162  * The "path" to a pathinfo node is identical to the /devices path to a
163  * devinfo node had the device been enumerated under a pHCI instead of
164  * a vHCI.  This pathinfo "path" is associated with a 'path_instance'.
165  * This association persists across create/delete of the pathinfo nodes,
166  * but not across reboot.
167  */
168 static uint_t		mdi_pathmap_instance = 1;	/* 0 -> any path */
169 static int		mdi_pathmap_hash_size = 256;
170 static kmutex_t		mdi_pathmap_mutex;
171 static mod_hash_t	*mdi_pathmap_bypath;		/* "path"->instance */
172 static mod_hash_t	*mdi_pathmap_byinstance;	/* instance->"path" */
173 static mod_hash_t	*mdi_pathmap_sbyinstance;	/* inst->shortpath */
174 
175 /*
176  * MDI component property name/value string definitions
177  */
178 const char 		*mdi_component_prop = "mpxio-component";
179 const char		*mdi_component_prop_vhci = "vhci";
180 const char		*mdi_component_prop_phci = "phci";
181 const char		*mdi_component_prop_client = "client";
182 
183 /*
184  * MDI client global unique identifier property name
185  */
186 const char		*mdi_client_guid_prop = "client-guid";
187 
188 /*
189  * MDI client load balancing property name/value string definitions
190  */
191 const char		*mdi_load_balance = "load-balance";
192 const char		*mdi_load_balance_none = "none";
193 const char		*mdi_load_balance_rr = "round-robin";
194 const char		*mdi_load_balance_lba = "logical-block";
195 
196 /*
197  * Obsolete vHCI class definition; to be removed after Leadville update
198  */
199 const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;
200 
201 static char vhci_greeting[] =
202 	"\tThere already exists one vHCI driver for class %s\n"
203 	"\tOnly one vHCI driver for each class is allowed\n";
204 
205 /*
206  * Static function prototypes
207  */
208 static int		i_mdi_phci_offline(dev_info_t *, uint_t);
209 static int		i_mdi_client_offline(dev_info_t *, uint_t);
210 static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
211 static void		i_mdi_phci_post_detach(dev_info_t *,
212 			    ddi_detach_cmd_t, int);
213 static int		i_mdi_client_pre_detach(dev_info_t *,
214 			    ddi_detach_cmd_t);
215 static void		i_mdi_client_post_detach(dev_info_t *,
216 			    ddi_detach_cmd_t, int);
217 static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
218 static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
219 static int 		i_mdi_lba_lb(mdi_client_t *ct,
220 			    mdi_pathinfo_t **ret_pip, struct buf *buf);
221 static void		i_mdi_pm_hold_client(mdi_client_t *, int);
222 static void		i_mdi_pm_rele_client(mdi_client_t *, int);
223 static void		i_mdi_pm_reset_client(mdi_client_t *);
224 static int		i_mdi_power_all_phci(mdi_client_t *);
225 static void		i_mdi_log_sysevent(dev_info_t *, char *, char *);
226 
227 
228 /*
229  * Internal mdi_pathinfo node functions
230  */
231 static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);
232 
233 static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
234 static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
235 static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
236 static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
237 static void		i_mdi_phci_unlock(mdi_phci_t *);
238 static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
239 static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
240 static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
241 static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
242 			    mdi_client_t *);
243 static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
244 static void		i_mdi_client_remove_path(mdi_client_t *,
245 			    mdi_pathinfo_t *);
246 
247 static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
248 			    mdi_pathinfo_state_t, int);
249 static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
250 static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
251 			    char **, int);
252 static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
253 static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
254 static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
255 static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
256 static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
257 static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
258 static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
259 static void		i_mdi_client_update_state(mdi_client_t *);
260 static int		i_mdi_client_compute_state(mdi_client_t *,
261 			    mdi_phci_t *);
262 static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
263 static void		i_mdi_client_unlock(mdi_client_t *);
264 static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
265 static mdi_client_t	*i_devi_get_client(dev_info_t *);
266 /*
267  * NOTE: this will be removed once the NWS files are changed to use the new
268  * mdi_{enable,disable}_path interfaces
269  */
270 static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *,
271 				int, int);
272 static mdi_pathinfo_t 	*i_mdi_enable_disable_path(mdi_pathinfo_t *pip,
273 				mdi_vhci_t *vh, int flags, int op);
274 /*
275  * Failover related function prototypes
276  */
277 static int		i_mdi_failover(void *);
278 
279 /*
280  * misc internal functions
281  */
282 static int		i_mdi_get_hash_key(char *);
283 static int		i_map_nvlist_error_to_mdi(int);
284 static void		i_mdi_report_path_state(mdi_client_t *,
285 			    mdi_pathinfo_t *);
286 
287 static void		setup_vhci_cache(mdi_vhci_t *);
288 static int		destroy_vhci_cache(mdi_vhci_t *);
289 static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
290 static boolean_t	stop_vhcache_flush_thread(void *, int);
291 static void		free_string_array(char **, int);
292 static void		free_vhcache_phci(mdi_vhcache_phci_t *);
293 static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
294 static void		free_vhcache_client(mdi_vhcache_client_t *);
295 static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
296 static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
297 static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
298 static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
299 static void		vhcache_pi_add(mdi_vhci_config_t *,
300 			    struct mdi_pathinfo *);
301 static void		vhcache_pi_remove(mdi_vhci_config_t *,
302 			    struct mdi_pathinfo *);
303 static void		free_phclient_path_list(mdi_phys_path_t *);
304 static void		sort_vhcache_paths(mdi_vhcache_client_t *);
305 static int		flush_vhcache(mdi_vhci_config_t *, int);
306 static void		vhcache_dirty(mdi_vhci_config_t *);
307 static void		free_async_client_config(mdi_async_client_config_t *);
308 static void		single_threaded_vhconfig_enter(mdi_vhci_config_t *);
309 static void		single_threaded_vhconfig_exit(mdi_vhci_config_t *);
310 static nvlist_t		*read_on_disk_vhci_cache(char *);
311 extern int		fread_nvlist(char *, nvlist_t **);
312 extern int		fwrite_nvlist(char *, nvlist_t *);
313 
314 /* called once when first vhci registers with mdi */
315 static void
316 i_mdi_init()
317 {
318 	static int initialized = 0;
319 
320 	if (initialized)
321 		return;
322 	initialized = 1;
323 
324 	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);
325 
326 	/* Create our taskq resources */
327 	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
328 	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
329 	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
330 	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */
331 
332 	/* Allocate ['path_instance' <-> "path"] maps */
333 	mutex_init(&mdi_pathmap_mutex, NULL, MUTEX_DRIVER, NULL);
334 	mdi_pathmap_bypath = mod_hash_create_strhash(
335 	    "mdi_pathmap_bypath", mdi_pathmap_hash_size,
336 	    mod_hash_null_valdtor);
337 	mdi_pathmap_byinstance = mod_hash_create_idhash(
338 	    "mdi_pathmap_byinstance", mdi_pathmap_hash_size,
339 	    mod_hash_null_valdtor);
340 	mdi_pathmap_sbyinstance = mod_hash_create_idhash(
341 	    "mdi_pathmap_sbyinstance", mdi_pathmap_hash_size,
342 	    mod_hash_null_valdtor);
343 }
344 
345 /*
346  * mdi_get_component_type():
347  *		Return mpxio component type
348  * Return Values:
349  *		MDI_COMPONENT_NONE
350  *		MDI_COMPONENT_VHCI
351  *		MDI_COMPONENT_PHCI
352  *		MDI_COMPONENT_CLIENT
353  * XXX This doesn't work under multi-level MPxIO and should be
354  *	removed when clients migrate mdi_component_is_*() interfaces.
355  */
356 int
357 mdi_get_component_type(dev_info_t *dip)
358 {
359 	return (DEVI(dip)->devi_mdi_component);
360 }
361 
362 /*
363  * mdi_vhci_register():
364  *		Register a vHCI module with the mpxio framework
365  *		mdi_vhci_register() is called by vHCI drivers to register the
366  *		'class_driver' vHCI driver and its MDI entrypoints with the
367  *		mpxio framework.  The vHCI driver must call this interface as
368  *		part of its attach(9e) handler.
369  *		Competing threads may try to attach mdi_vhci_register() as
370  *		the vHCI drivers are loaded and attached as a result of pHCI
371  *		driver instance registration (mdi_phci_register()) with the
372  *		framework.
373  * Return Values:
374  *		MDI_SUCCESS
375  *		MDI_FAILURE
376  */
377 /*ARGSUSED*/
378 int
379 mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
380     int flags)
381 {
382 	mdi_vhci_t		*vh = NULL;
383 
384 	/* Registrant can't be older */
385 	ASSERT(vops->vo_revision <= MDI_VHCI_OPS_REV);
386 
387 #ifdef DEBUG
388 	/*
389 	 * IB nexus driver is loaded only when IB hardware is present.
390 	 * In order to be able to do this there is a need to drive the loading
391 	 * and attaching of the IB nexus driver (especially when an IB hardware
392 	 * is dynamically plugged in) when an IB HCA driver (PHCI)
393 	 * is being attached. Unfortunately this gets into the limitations
394 	 * of devfs as there seems to be no clean way to drive configuration
395 	 * of a subtree from another subtree of a devfs. Hence, do not ASSERT
396 	 * for IB.
397 	 */
398 	if (strcmp(class, MDI_HCI_CLASS_IB) != 0)
399 		ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
400 #endif
401 
402 	i_mdi_init();
403 
404 	mutex_enter(&mdi_mutex);
405 	/*
406 	 * Scan for already registered vhci
407 	 */
408 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
409 		if (strcmp(vh->vh_class, class) == 0) {
410 			/*
411 			 * vHCI has already been created.  Check for valid
412 			 * vHCI ops registration.  We only support one vHCI
413 			 * module per class
414 			 */
415 			if (vh->vh_ops != NULL) {
416 				mutex_exit(&mdi_mutex);
417 				cmn_err(CE_NOTE, vhci_greeting, class);
418 				return (MDI_FAILURE);
419 			}
420 			break;
421 		}
422 	}
423 
424 	/*
425 	 * if not yet created, create the vHCI component
426 	 */
427 	if (vh == NULL) {
428 		struct client_hash	*hash = NULL;
429 		char			*load_balance;
430 
431 		/*
432 		 * Allocate and initialize the mdi extensions
433 		 */
434 		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
435 		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
436 		    KM_SLEEP);
437 		vh->vh_client_table = hash;
438 		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
439 		(void) strcpy(vh->vh_class, class);
440 		vh->vh_lb = LOAD_BALANCE_RR;
441 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
442 		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
443 			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
444 				vh->vh_lb = LOAD_BALANCE_NONE;
445 			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
446 				    == 0) {
447 				vh->vh_lb = LOAD_BALANCE_LBA;
448 			}
449 			ddi_prop_free(load_balance);
450 		}
451 
452 		mutex_init(&vh->vh_phci_mutex, NULL, MUTEX_DEFAULT, NULL);
453 		mutex_init(&vh->vh_client_mutex, NULL, MUTEX_DEFAULT, NULL);
454 
455 		/*
456 		 * Store the vHCI ops vectors
457 		 */
458 		vh->vh_dip = vdip;
459 		vh->vh_ops = vops;
460 
461 		setup_vhci_cache(vh);
462 
463 		if (mdi_vhci_head == NULL) {
464 			mdi_vhci_head = vh;
465 		}
466 		if (mdi_vhci_tail) {
467 			mdi_vhci_tail->vh_next = vh;
468 		}
469 		mdi_vhci_tail = vh;
470 		mdi_vhci_count++;
471 	}
472 
473 	/*
474 	 * Claim the devfs node as a vhci component
475 	 */
476 	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;
477 
478 	/*
479 	 * Initialize our back reference from dev_info node
480 	 */
481 	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
482 	mutex_exit(&mdi_mutex);
483 	return (MDI_SUCCESS);
484 }
485 
486 /*
487  * mdi_vhci_unregister():
488  *		Unregister a vHCI module from mpxio framework
489  *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
490  * 		of a vhci to unregister it from the framework.
491  * Return Values:
492  *		MDI_SUCCESS
493  *		MDI_FAILURE
494  */
495 /*ARGSUSED*/
496 int
497 mdi_vhci_unregister(dev_info_t *vdip, int flags)
498 {
499 	mdi_vhci_t	*found, *vh, *prev = NULL;
500 
501 	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
502 
503 	/*
504 	 * Check for invalid VHCI
505 	 */
506 	if ((vh = i_devi_get_vhci(vdip)) == NULL)
507 		return (MDI_FAILURE);
508 
509 	/*
510 	 * Scan the list of registered vHCIs for a match
511 	 */
512 	mutex_enter(&mdi_mutex);
513 	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
514 		if (found == vh)
515 			break;
516 		prev = found;
517 	}
518 
519 	if (found == NULL) {
520 		mutex_exit(&mdi_mutex);
521 		return (MDI_FAILURE);
522 	}
523 
524 	/*
525 	 * Check the vHCI, pHCI and client count. All the pHCIs and clients
526 	 * should have been unregistered, before a vHCI can be
527 	 * unregistered.
528 	 */
529 	MDI_VHCI_PHCI_LOCK(vh);
530 	if (vh->vh_refcnt || vh->vh_phci_count || vh->vh_client_count) {
531 		MDI_VHCI_PHCI_UNLOCK(vh);
532 		mutex_exit(&mdi_mutex);
533 		return (MDI_FAILURE);
534 	}
535 	MDI_VHCI_PHCI_UNLOCK(vh);
536 
537 	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
538 		mutex_exit(&mdi_mutex);
539 		return (MDI_FAILURE);
540 	}
541 
542 	/*
543 	 * Remove the vHCI from the global list
544 	 */
545 	if (vh == mdi_vhci_head) {
546 		mdi_vhci_head = vh->vh_next;
547 	} else {
548 		prev->vh_next = vh->vh_next;
549 	}
550 	if (vh == mdi_vhci_tail) {
551 		mdi_vhci_tail = prev;
552 	}
553 	mdi_vhci_count--;
554 	mutex_exit(&mdi_mutex);
555 
556 	vh->vh_ops = NULL;
557 	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
558 	DEVI(vdip)->devi_mdi_xhci = NULL;
559 	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
560 	kmem_free(vh->vh_client_table,
561 	    mdi_client_table_size * sizeof (struct client_hash));
562 	mutex_destroy(&vh->vh_phci_mutex);
563 	mutex_destroy(&vh->vh_client_mutex);
564 
565 	kmem_free(vh, sizeof (mdi_vhci_t));
566 	return (MDI_SUCCESS);
567 }
568 
569 /*
570  * i_mdi_vhci_class2vhci():
571  *		Look for a matching vHCI module given a vHCI class name
572  * Return Values:
573  *		Handle to a vHCI component
574  *		NULL
575  */
576 static mdi_vhci_t *
577 i_mdi_vhci_class2vhci(char *class)
578 {
579 	mdi_vhci_t	*vh = NULL;
580 
581 	ASSERT(!MUTEX_HELD(&mdi_mutex));
582 
583 	mutex_enter(&mdi_mutex);
584 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
585 		if (strcmp(vh->vh_class, class) == 0) {
586 			break;
587 		}
588 	}
589 	mutex_exit(&mdi_mutex);
590 	return (vh);
591 }
592 
593 /*
594  * i_devi_get_vhci():
595  *		Utility function to get the handle to a vHCI component
596  * Return Values:
597  *		Handle to a vHCI component
598  *		NULL
599  */
600 mdi_vhci_t *
601 i_devi_get_vhci(dev_info_t *vdip)
602 {
603 	mdi_vhci_t	*vh = NULL;
604 	if (MDI_VHCI(vdip)) {
605 		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
606 	}
607 	return (vh);
608 }
609 
610 /*
611  * mdi_phci_register():
612  *		Register a pHCI module with mpxio framework
613  *		mdi_phci_register() is called by pHCI drivers to register with
614  *		the mpxio framework and a specific 'class_driver' vHCI.  The
615  *		pHCI driver must call this interface as part of its attach(9e)
616  *		handler.
617  * Return Values:
618  *		MDI_SUCCESS
619  *		MDI_FAILURE
620  */
621 /*ARGSUSED*/
622 int
623 mdi_phci_register(char *class, dev_info_t *pdip, int flags)
624 {
625 	mdi_phci_t		*ph;
626 	mdi_vhci_t		*vh;
627 	char			*data;
628 
629 	/*
630 	 * Some subsystems, like fcp, perform pHCI registration from a
631 	 * different thread than the one doing the pHCI attach(9E) - the
632 	 * driver attach code is waiting for this other thread to complete.
633 	 * This means we can only ASSERT DEVI_BUSY_CHANGING of parent
634 	 * (indicating that some thread has done an ndi_devi_enter of parent)
635 	 * not DEVI_BUSY_OWNED (which would indicate that we did the enter).
636 	 */
637 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
638 
639 	/*
640 	 * Check for mpxio-disable property. Enable mpxio if the property is
641 	 * missing or not set to "yes".
642 	 * If the property is set to "yes" then emit a brief message.
643 	 */
644 	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
645 	    &data) == DDI_SUCCESS)) {
646 		if (strcmp(data, "yes") == 0) {
647 			MDI_DEBUG(1, (MDI_CONT, pdip,
648 			    "?multipath capabilities disabled via %s.conf.",
649 			    ddi_driver_name(pdip)));
650 			ddi_prop_free(data);
651 			return (MDI_FAILURE);
652 		}
653 		ddi_prop_free(data);
654 	}
655 
656 	/*
657 	 * Search for a matching vHCI
658 	 */
659 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
660 	if (vh == NULL) {
661 		return (MDI_FAILURE);
662 	}
663 
664 	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
665 	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
666 	ph->ph_dip = pdip;
667 	ph->ph_vhci = vh;
668 	ph->ph_next = NULL;
669 	ph->ph_unstable = 0;
670 	ph->ph_vprivate = 0;
671 	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);
672 
673 	MDI_PHCI_LOCK(ph);
674 	MDI_PHCI_SET_POWER_UP(ph);
675 	MDI_PHCI_UNLOCK(ph);
676 	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
677 	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;
678 
679 	vhcache_phci_add(vh->vh_config, ph);
680 
681 	MDI_VHCI_PHCI_LOCK(vh);
682 	if (vh->vh_phci_head == NULL) {
683 		vh->vh_phci_head = ph;
684 	}
685 	if (vh->vh_phci_tail) {
686 		vh->vh_phci_tail->ph_next = ph;
687 	}
688 	vh->vh_phci_tail = ph;
689 	vh->vh_phci_count++;
690 	MDI_VHCI_PHCI_UNLOCK(vh);
691 
692 	i_mdi_log_sysevent(pdip, class, ESC_DDI_INITIATOR_REGISTER);
693 	return (MDI_SUCCESS);
694 }
695 
696 /*
697  * mdi_phci_unregister():
698  *		Unregister a pHCI module from mpxio framework
699  *		mdi_phci_unregister() is called by the pHCI drivers from their
700  *		detach(9E) handler to unregister their instances from the
701  *		framework.
702  * Return Values:
703  *		MDI_SUCCESS
704  *		MDI_FAILURE
705  */
706 /*ARGSUSED*/
707 int
708 mdi_phci_unregister(dev_info_t *pdip, int flags)
709 {
710 	mdi_vhci_t		*vh;
711 	mdi_phci_t		*ph;
712 	mdi_phci_t		*tmp;
713 	mdi_phci_t		*prev = NULL;
714 	mdi_pathinfo_t		*pip;
715 
716 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
717 
718 	ph = i_devi_get_phci(pdip);
719 	if (ph == NULL) {
720 		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid pHCI"));
721 		return (MDI_FAILURE);
722 	}
723 
724 	vh = ph->ph_vhci;
725 	ASSERT(vh != NULL);
726 	if (vh == NULL) {
727 		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid vHCI"));
728 		return (MDI_FAILURE);
729 	}
730 
731 	MDI_VHCI_PHCI_LOCK(vh);
732 	tmp = vh->vh_phci_head;
733 	while (tmp) {
734 		if (tmp == ph) {
735 			break;
736 		}
737 		prev = tmp;
738 		tmp = tmp->ph_next;
739 	}
740 
741 	if (ph == vh->vh_phci_head) {
742 		vh->vh_phci_head = ph->ph_next;
743 	} else {
744 		prev->ph_next = ph->ph_next;
745 	}
746 
747 	if (ph == vh->vh_phci_tail) {
748 		vh->vh_phci_tail = prev;
749 	}
750 
751 	vh->vh_phci_count--;
752 	MDI_VHCI_PHCI_UNLOCK(vh);
753 
754 	/* Walk remaining pathinfo nodes and disassociate them from pHCI */
755 	MDI_PHCI_LOCK(ph);
756 	for (pip = (mdi_pathinfo_t *)ph->ph_path_head; pip;
757 	    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link)
758 		MDI_PI(pip)->pi_phci = NULL;
759 	MDI_PHCI_UNLOCK(ph);
760 
761 	i_mdi_log_sysevent(pdip, ph->ph_vhci->vh_class,
762 	    ESC_DDI_INITIATOR_UNREGISTER);
763 	vhcache_phci_remove(vh->vh_config, ph);
764 	cv_destroy(&ph->ph_unstable_cv);
765 	mutex_destroy(&ph->ph_mutex);
766 	kmem_free(ph, sizeof (mdi_phci_t));
767 	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
768 	DEVI(pdip)->devi_mdi_xhci = NULL;
769 	return (MDI_SUCCESS);
770 }
771 
772 /*
773  * i_devi_get_phci():
774  * 		Utility function to return the phci extensions.
775  */
776 static mdi_phci_t *
777 i_devi_get_phci(dev_info_t *pdip)
778 {
779 	mdi_phci_t	*ph = NULL;
780 
781 	if (MDI_PHCI(pdip)) {
782 		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
783 	}
784 	return (ph);
785 }
786 
787 /*
788  * Single thread mdi entry into devinfo node for modifying its children.
789  * If necessary we perform an ndi_devi_enter of the vHCI before doing
790  * an ndi_devi_enter of 'dip'.  We maintain circular in two parts: one
791  * for the vHCI and one for the pHCI.
792  */
793 void
794 mdi_devi_enter(dev_info_t *phci_dip, int *circular)
795 {
796 	dev_info_t	*vdip;
797 	int		vcircular, pcircular;
798 
799 	/* Verify calling context */
800 	ASSERT(MDI_PHCI(phci_dip));
801 	vdip = mdi_devi_get_vdip(phci_dip);
802 	ASSERT(vdip);			/* A pHCI always has a vHCI */
803 
804 	/*
805 	 * If pHCI is detaching then the framework has already entered the
806 	 * vHCI on a threads that went down the code path leading to
807 	 * detach_node().  This framework enter of the vHCI during pHCI
808 	 * detach is done to avoid deadlock with vHCI power management
809 	 * operations which enter the vHCI and the enter down the path
810 	 * to the pHCI. If pHCI is detaching then we piggyback this calls
811 	 * enter of the vHCI on frameworks vHCI enter that has already
812 	 * occurred - this is OK because we know that the framework thread
813 	 * doing detach is waiting for our completion.
814 	 *
815 	 * We should DEVI_IS_DETACHING under an enter of the parent to avoid
816 	 * race with detach - but we can't do that because the framework has
817 	 * already entered the parent, so we have some complexity instead.
818 	 */
819 	for (;;) {
820 		if (ndi_devi_tryenter(vdip, &vcircular)) {
821 			ASSERT(vcircular != -1);
822 			if (DEVI_IS_DETACHING(phci_dip)) {
823 				ndi_devi_exit(vdip, vcircular);
824 				vcircular = -1;
825 			}
826 			break;
827 		} else if (DEVI_IS_DETACHING(phci_dip)) {
828 			vcircular = -1;
829 			break;
830 		} else if (servicing_interrupt()) {
831 			/*
832 			 * Don't delay an interrupt (and ensure adaptive
833 			 * mutex inversion support).
834 			 */
835 			ndi_devi_enter(vdip, &vcircular);
836 			break;
837 		} else {
838 			delay_random(2);
839 		}
840 	}
841 
842 	ndi_devi_enter(phci_dip, &pcircular);
843 	*circular = (vcircular << 16) | (pcircular & 0xFFFF);
844 }
845 
846 /*
847  * Attempt to mdi_devi_enter.
848  */
849 int
850 mdi_devi_tryenter(dev_info_t *phci_dip, int *circular)
851 {
852 	dev_info_t	*vdip;
853 	int		vcircular, pcircular;
854 
855 	/* Verify calling context */
856 	ASSERT(MDI_PHCI(phci_dip));
857 	vdip = mdi_devi_get_vdip(phci_dip);
858 	ASSERT(vdip);			/* A pHCI always has a vHCI */
859 
860 	if (ndi_devi_tryenter(vdip, &vcircular)) {
861 		if (ndi_devi_tryenter(phci_dip, &pcircular)) {
862 			*circular = (vcircular << 16) | (pcircular & 0xFFFF);
863 			return (1);	/* locked */
864 		}
865 		ndi_devi_exit(vdip, vcircular);
866 	}
867 	return (0);			/* busy */
868 }
869 
870 /*
871  * Release mdi_devi_enter or successful mdi_devi_tryenter.
872  */
873 void
874 mdi_devi_exit(dev_info_t *phci_dip, int circular)
875 {
876 	dev_info_t	*vdip;
877 	int		vcircular, pcircular;
878 
879 	/* Verify calling context */
880 	ASSERT(MDI_PHCI(phci_dip));
881 	vdip = mdi_devi_get_vdip(phci_dip);
882 	ASSERT(vdip);			/* A pHCI always has a vHCI */
883 
884 	/* extract two circular recursion values from single int */
885 	pcircular = (short)(circular & 0xFFFF);
886 	vcircular = (short)((circular >> 16) & 0xFFFF);
887 
888 	ndi_devi_exit(phci_dip, pcircular);
889 	if (vcircular != -1)
890 		ndi_devi_exit(vdip, vcircular);
891 }
892 
/*
 * The functions mdi_devi_exit_phci() and mdi_devi_enter_phci() are used
 * around a pHCI driver's calls to mdi_pi_online/offline, after holding
 * the pathinfo node via mdi_hold_path/mdi_rele_path, to avoid deadlock
 * with vHCI power management code during path online/offline.  Each
 * mdi_devi_exit_phci must have a matching mdi_devi_enter_phci, and both must
 * occur within the scope of an active mdi_devi_enter that establishes the
 * circular value.
 */
901  */
902 void
903 mdi_devi_exit_phci(dev_info_t *phci_dip, int circular)
904 {
905 	int		pcircular;
906 
907 	/* Verify calling context */
908 	ASSERT(MDI_PHCI(phci_dip));
909 
910 	/* Keep hold on pHCI until we reenter in mdi_devi_enter_phci */
911 	ndi_hold_devi(phci_dip);
912 
913 	pcircular = (short)(circular & 0xFFFF);
914 	ndi_devi_exit(phci_dip, pcircular);
915 }
916 
917 void
918 mdi_devi_enter_phci(dev_info_t *phci_dip, int *circular)
919 {
920 	int		pcircular;
921 
922 	/* Verify calling context */
923 	ASSERT(MDI_PHCI(phci_dip));
924 
925 	ndi_devi_enter(phci_dip, &pcircular);
926 
927 	/* Drop hold from mdi_devi_exit_phci. */
928 	ndi_rele_devi(phci_dip);
929 
930 	/* verify matching mdi_devi_exit_phci/mdi_devi_enter_phci use */
931 	ASSERT(pcircular == ((short)(*circular & 0xFFFF)));
932 }
933 
934 /*
935  * mdi_devi_get_vdip():
936  *		given a pHCI dip return vHCI dip
937  */
938 dev_info_t *
939 mdi_devi_get_vdip(dev_info_t *pdip)
940 {
941 	mdi_phci_t	*ph;
942 
943 	ph = i_devi_get_phci(pdip);
944 	if (ph && ph->ph_vhci)
945 		return (ph->ph_vhci->vh_dip);
946 	return (NULL);
947 }
948 
949 /*
950  * mdi_devi_pdip_entered():
951  *		Return 1 if we are vHCI and have done an ndi_devi_enter
952  *		of a pHCI
953  */
954 int
955 mdi_devi_pdip_entered(dev_info_t *vdip)
956 {
957 	mdi_vhci_t	*vh;
958 	mdi_phci_t	*ph;
959 
960 	vh = i_devi_get_vhci(vdip);
961 	if (vh == NULL)
962 		return (0);
963 
964 	MDI_VHCI_PHCI_LOCK(vh);
965 	ph = vh->vh_phci_head;
966 	while (ph) {
967 		if (ph->ph_dip && DEVI_BUSY_OWNED(ph->ph_dip)) {
968 			MDI_VHCI_PHCI_UNLOCK(vh);
969 			return (1);
970 		}
971 		ph = ph->ph_next;
972 	}
973 	MDI_VHCI_PHCI_UNLOCK(vh);
974 	return (0);
975 }
976 
977 /*
978  * mdi_phci_path2devinfo():
979  * 		Utility function to search for a valid phci device given
980  *		the devfs pathname.
981  */
982 dev_info_t *
983 mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
984 {
985 	char		*temp_pathname;
986 	mdi_vhci_t	*vh;
987 	mdi_phci_t	*ph;
988 	dev_info_t 	*pdip = NULL;
989 
990 	vh = i_devi_get_vhci(vdip);
991 	ASSERT(vh != NULL);
992 
993 	if (vh == NULL) {
994 		/*
995 		 * Invalid vHCI component, return failure
996 		 */
997 		return (NULL);
998 	}
999 
1000 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1001 	MDI_VHCI_PHCI_LOCK(vh);
1002 	ph = vh->vh_phci_head;
1003 	while (ph != NULL) {
1004 		pdip = ph->ph_dip;
1005 		ASSERT(pdip != NULL);
1006 		*temp_pathname = '\0';
1007 		(void) ddi_pathname(pdip, temp_pathname);
1008 		if (strcmp(temp_pathname, pathname) == 0) {
1009 			break;
1010 		}
1011 		ph = ph->ph_next;
1012 	}
1013 	if (ph == NULL) {
1014 		pdip = NULL;
1015 	}
1016 	MDI_VHCI_PHCI_UNLOCK(vh);
1017 	kmem_free(temp_pathname, MAXPATHLEN);
1018 	return (pdip);
1019 }
1020 
1021 /*
1022  * mdi_phci_get_path_count():
1023  * 		get number of path information nodes associated with a given
1024  *		pHCI device.
1025  */
1026 int
1027 mdi_phci_get_path_count(dev_info_t *pdip)
1028 {
1029 	mdi_phci_t	*ph;
1030 	int		count = 0;
1031 
1032 	ph = i_devi_get_phci(pdip);
1033 	if (ph != NULL) {
1034 		count = ph->ph_path_count;
1035 	}
1036 	return (count);
1037 }
1038 
1039 /*
1040  * i_mdi_phci_lock():
1041  *		Lock a pHCI device
1042  * Return Values:
1043  *		None
1044  * Note:
1045  *		The default locking order is:
1046  *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
1047  *		But there are number of situations where locks need to be
1048  *		grabbed in reverse order.  This routine implements try and lock
1049  *		mechanism depending on the requested parameter option.
1050  */
1051 static void
1052 i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
1053 {
1054 	if (pip) {
1055 		/* Reverse locking is requested. */
1056 		while (MDI_PHCI_TRYLOCK(ph) == 0) {
1057 			if (servicing_interrupt()) {
1058 				MDI_PI_HOLD(pip);
1059 				MDI_PI_UNLOCK(pip);
1060 				MDI_PHCI_LOCK(ph);
1061 				MDI_PI_LOCK(pip);
1062 				MDI_PI_RELE(pip);
1063 				break;
1064 			} else {
1065 				/*
1066 				 * tryenter failed. Try to grab again
1067 				 * after a small delay
1068 				 */
1069 				MDI_PI_HOLD(pip);
1070 				MDI_PI_UNLOCK(pip);
1071 				delay_random(2);
1072 				MDI_PI_LOCK(pip);
1073 				MDI_PI_RELE(pip);
1074 			}
1075 		}
1076 	} else {
1077 		MDI_PHCI_LOCK(ph);
1078 	}
1079 }
1080 
1081 /*
1082  * i_mdi_phci_unlock():
1083  *		Unlock the pHCI component
1084  */
1085 static void
1086 i_mdi_phci_unlock(mdi_phci_t *ph)
1087 {
1088 	MDI_PHCI_UNLOCK(ph);
1089 }
1090 
1091 /*
1092  * i_mdi_devinfo_create():
1093  *		create client device's devinfo node
1094  * Return Values:
1095  *		dev_info
1096  *		NULL
1097  * Notes:
1098  */
1099 static dev_info_t *
1100 i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
1101 	char **compatible, int ncompatible)
1102 {
1103 	dev_info_t *cdip = NULL;
1104 
1105 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1106 
1107 	/* Verify for duplicate entry */
1108 	cdip = i_mdi_devinfo_find(vh, name, guid);
1109 	ASSERT(cdip == NULL);
1110 	if (cdip) {
1111 		cmn_err(CE_WARN,
1112 		    "i_mdi_devinfo_create: client %s@%s already exists",
1113 			name ? name : "", guid ? guid : "");
1114 	}
1115 
1116 	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
1117 	if (cdip == NULL)
1118 		goto fail;
1119 
1120 	/*
1121 	 * Create component type and Global unique identifier
1122 	 * properties
1123 	 */
1124 	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
1125 	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
1126 		goto fail;
1127 	}
1128 
1129 	/* Decorate the node with compatible property */
1130 	if (compatible &&
1131 	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
1132 	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
1133 		goto fail;
1134 	}
1135 
1136 	return (cdip);
1137 
1138 fail:
1139 	if (cdip) {
1140 		(void) ndi_prop_remove_all(cdip);
1141 		(void) ndi_devi_free(cdip);
1142 	}
1143 	return (NULL);
1144 }
1145 
1146 /*
1147  * i_mdi_devinfo_find():
1148  *		Find a matching devinfo node for given client node name
1149  *		and its guid.
1150  * Return Values:
1151  *		Handle to a dev_info node or NULL
1152  */
1153 static dev_info_t *
1154 i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
1155 {
1156 	char			*data;
1157 	dev_info_t 		*cdip = NULL;
1158 	dev_info_t 		*ndip = NULL;
1159 	int			circular;
1160 
1161 	ndi_devi_enter(vh->vh_dip, &circular);
1162 	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
1163 	while ((cdip = ndip) != NULL) {
1164 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1165 
1166 		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
1167 			continue;
1168 		}
1169 
1170 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
1171 		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
1172 		    &data) != DDI_PROP_SUCCESS) {
1173 			continue;
1174 		}
1175 
1176 		if (strcmp(data, guid) != 0) {
1177 			ddi_prop_free(data);
1178 			continue;
1179 		}
1180 		ddi_prop_free(data);
1181 		break;
1182 	}
1183 	ndi_devi_exit(vh->vh_dip, circular);
1184 	return (cdip);
1185 }
1186 
1187 /*
1188  * i_mdi_devinfo_remove():
1189  *		Remove a client device node
1190  */
1191 static int
1192 i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
1193 {
1194 	int	rv = MDI_SUCCESS;
1195 
1196 	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
1197 	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
1198 		rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN | NDI_DEVI_REMOVE);
1199 		if (rv != NDI_SUCCESS) {
1200 			MDI_DEBUG(1, (MDI_NOTE, cdip,
1201 			    "!failed: cdip %p", (void *)cdip));
1202 		}
1203 		/*
1204 		 * Convert to MDI error code
1205 		 */
1206 		switch (rv) {
1207 		case NDI_SUCCESS:
1208 			rv = MDI_SUCCESS;
1209 			break;
1210 		case NDI_BUSY:
1211 			rv = MDI_BUSY;
1212 			break;
1213 		default:
1214 			rv = MDI_FAILURE;
1215 			break;
1216 		}
1217 	}
1218 	return (rv);
1219 }
1220 
1221 /*
1222  * i_devi_get_client()
1223  *		Utility function to get mpxio component extensions
1224  */
1225 static mdi_client_t *
1226 i_devi_get_client(dev_info_t *cdip)
1227 {
1228 	mdi_client_t	*ct = NULL;
1229 
1230 	if (MDI_CLIENT(cdip)) {
1231 		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
1232 	}
1233 	return (ct);
1234 }
1235 
1236 /*
1237  * i_mdi_is_child_present():
1238  *		Search for the presence of client device dev_info node
1239  */
1240 static int
1241 i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
1242 {
1243 	int		rv = MDI_FAILURE;
1244 	struct dev_info	*dip;
1245 	int		circular;
1246 
1247 	ndi_devi_enter(vdip, &circular);
1248 	dip = DEVI(vdip)->devi_child;
1249 	while (dip) {
1250 		if (dip == DEVI(cdip)) {
1251 			rv = MDI_SUCCESS;
1252 			break;
1253 		}
1254 		dip = dip->devi_sibling;
1255 	}
1256 	ndi_devi_exit(vdip, circular);
1257 	return (rv);
1258 }
1259 
1260 
1261 /*
1262  * i_mdi_client_lock():
1263  *		Grab client component lock
1264  * Return Values:
1265  *		None
1266  * Note:
1267  *		The default locking order is:
1268  *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
1269  *		But there are number of situations where locks need to be
1270  *		grabbed in reverse order.  This routine implements try and lock
1271  *		mechanism depending on the requested parameter option.
1272  */
1273 static void
1274 i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
1275 {
1276 	if (pip) {
1277 		/*
1278 		 * Reverse locking is requested.
1279 		 */
1280 		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
1281 			if (servicing_interrupt()) {
1282 				MDI_PI_HOLD(pip);
1283 				MDI_PI_UNLOCK(pip);
1284 				MDI_CLIENT_LOCK(ct);
1285 				MDI_PI_LOCK(pip);
1286 				MDI_PI_RELE(pip);
1287 				break;
1288 			} else {
1289 				/*
1290 				 * tryenter failed. Try to grab again
1291 				 * after a small delay
1292 				 */
1293 				MDI_PI_HOLD(pip);
1294 				MDI_PI_UNLOCK(pip);
1295 				delay_random(2);
1296 				MDI_PI_LOCK(pip);
1297 				MDI_PI_RELE(pip);
1298 			}
1299 		}
1300 	} else {
1301 		MDI_CLIENT_LOCK(ct);
1302 	}
1303 }
1304 
1305 /*
1306  * i_mdi_client_unlock():
1307  *		Unlock a client component
1308  */
1309 static void
1310 i_mdi_client_unlock(mdi_client_t *ct)
1311 {
1312 	MDI_CLIENT_UNLOCK(ct);
1313 }
1314 
1315 /*
1316  * i_mdi_client_alloc():
1317  * 		Allocate and initialize a client structure.  Caller should
1318  *		hold the vhci client lock.
1319  * Return Values:
1320  *		Handle to a client component
1321  */
1322 /*ARGSUSED*/
1323 static mdi_client_t *
1324 i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
1325 {
1326 	mdi_client_t	*ct;
1327 
1328 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1329 
1330 	/*
1331 	 * Allocate and initialize a component structure.
1332 	 */
1333 	ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
1334 	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1335 	ct->ct_hnext = NULL;
1336 	ct->ct_hprev = NULL;
1337 	ct->ct_dip = NULL;
1338 	ct->ct_vhci = vh;
1339 	ct->ct_drvname = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1340 	(void) strcpy(ct->ct_drvname, name);
1341 	ct->ct_guid = kmem_alloc(strlen(lguid) + 1, KM_SLEEP);
1342 	(void) strcpy(ct->ct_guid, lguid);
1343 	ct->ct_cprivate = NULL;
1344 	ct->ct_vprivate = NULL;
1345 	ct->ct_flags = 0;
1346 	ct->ct_state = MDI_CLIENT_STATE_FAILED;
1347 	MDI_CLIENT_LOCK(ct);
1348 	MDI_CLIENT_SET_OFFLINE(ct);
1349 	MDI_CLIENT_SET_DETACH(ct);
1350 	MDI_CLIENT_SET_POWER_UP(ct);
1351 	MDI_CLIENT_UNLOCK(ct);
1352 	ct->ct_failover_flags = 0;
1353 	ct->ct_failover_status = 0;
1354 	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
1355 	ct->ct_unstable = 0;
1356 	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
1357 	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
1358 	ct->ct_lb = vh->vh_lb;
1359 	ct->ct_lb_args =  kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
1360 	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
1361 	ct->ct_path_count = 0;
1362 	ct->ct_path_head = NULL;
1363 	ct->ct_path_tail = NULL;
1364 	ct->ct_path_last = NULL;
1365 
1366 	/*
1367 	 * Add this client component to our client hash queue
1368 	 */
1369 	i_mdi_client_enlist_table(vh, ct);
1370 	return (ct);
1371 }
1372 
1373 /*
1374  * i_mdi_client_enlist_table():
1375  *		Attach the client device to the client hash table. Caller
1376  *		should hold the vhci client lock.
1377  */
1378 static void
1379 i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1380 {
1381 	int 			index;
1382 	struct client_hash	*head;
1383 
1384 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1385 
1386 	index = i_mdi_get_hash_key(ct->ct_guid);
1387 	head = &vh->vh_client_table[index];
1388 	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
1389 	head->ct_hash_head = ct;
1390 	head->ct_hash_count++;
1391 	vh->vh_client_count++;
1392 }
1393 
1394 /*
1395  * i_mdi_client_delist_table():
1396  *		Attach the client device to the client hash table.
1397  *		Caller should hold the vhci client lock.
1398  */
1399 static void
1400 i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1401 {
1402 	int			index;
1403 	char			*guid;
1404 	struct client_hash 	*head;
1405 	mdi_client_t		*next;
1406 	mdi_client_t		*last;
1407 
1408 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1409 
1410 	guid = ct->ct_guid;
1411 	index = i_mdi_get_hash_key(guid);
1412 	head = &vh->vh_client_table[index];
1413 
1414 	last = NULL;
1415 	next = (mdi_client_t *)head->ct_hash_head;
1416 	while (next != NULL) {
1417 		if (next == ct) {
1418 			break;
1419 		}
1420 		last = next;
1421 		next = next->ct_hnext;
1422 	}
1423 
1424 	if (next) {
1425 		head->ct_hash_count--;
1426 		if (last == NULL) {
1427 			head->ct_hash_head = ct->ct_hnext;
1428 		} else {
1429 			last->ct_hnext = ct->ct_hnext;
1430 		}
1431 		ct->ct_hnext = NULL;
1432 		vh->vh_client_count--;
1433 	}
1434 }
1435 
1436 
1437 /*
1438  * i_mdi_client_free():
1439  *		Free a client component
1440  */
1441 static int
1442 i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
1443 {
1444 	int		rv = MDI_SUCCESS;
1445 	int		flags = ct->ct_flags;
1446 	dev_info_t	*cdip;
1447 	dev_info_t	*vdip;
1448 
1449 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1450 
1451 	vdip = vh->vh_dip;
1452 	cdip = ct->ct_dip;
1453 
1454 	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
1455 	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
1456 	DEVI(cdip)->devi_mdi_client = NULL;
1457 
1458 	/*
1459 	 * Clear out back ref. to dev_info_t node
1460 	 */
1461 	ct->ct_dip = NULL;
1462 
1463 	/*
1464 	 * Remove this client from our hash queue
1465 	 */
1466 	i_mdi_client_delist_table(vh, ct);
1467 
1468 	/*
1469 	 * Uninitialize and free the component
1470 	 */
1471 	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
1472 	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
1473 	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
1474 	cv_destroy(&ct->ct_failover_cv);
1475 	cv_destroy(&ct->ct_unstable_cv);
1476 	cv_destroy(&ct->ct_powerchange_cv);
1477 	mutex_destroy(&ct->ct_mutex);
1478 	kmem_free(ct, sizeof (*ct));
1479 
1480 	if (cdip != NULL) {
1481 		MDI_VHCI_CLIENT_UNLOCK(vh);
1482 		(void) i_mdi_devinfo_remove(vdip, cdip, flags);
1483 		MDI_VHCI_CLIENT_LOCK(vh);
1484 	}
1485 	return (rv);
1486 }
1487 
1488 /*
1489  * i_mdi_client_find():
1490  * 		Find the client structure corresponding to a given guid
1491  *		Caller should hold the vhci client lock.
1492  */
1493 static mdi_client_t *
1494 i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
1495 {
1496 	int			index;
1497 	struct client_hash	*head;
1498 	mdi_client_t		*ct;
1499 
1500 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1501 
1502 	index = i_mdi_get_hash_key(guid);
1503 	head = &vh->vh_client_table[index];
1504 
1505 	ct = head->ct_hash_head;
1506 	while (ct != NULL) {
1507 		if (strcmp(ct->ct_guid, guid) == 0 &&
1508 		    (cname == NULL || strcmp(ct->ct_drvname, cname) == 0)) {
1509 			break;
1510 		}
1511 		ct = ct->ct_hnext;
1512 	}
1513 	return (ct);
1514 }
1515 
1516 /*
1517  * i_mdi_client_update_state():
1518  *		Compute and update client device state
1519  * Notes:
1520  *		A client device can be in any of three possible states:
1521  *
1522  *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
1523  *		one online/standby paths. Can tolerate failures.
1524  *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
1525  *		no alternate paths available as standby. A failure on the online
1526  *		would result in loss of access to device data.
1527  *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
1528  *		no paths available to access the device.
1529  */
1530 static void
1531 i_mdi_client_update_state(mdi_client_t *ct)
1532 {
1533 	int state;
1534 
1535 	ASSERT(MDI_CLIENT_LOCKED(ct));
1536 	state = i_mdi_client_compute_state(ct, NULL);
1537 	MDI_CLIENT_SET_STATE(ct, state);
1538 }
1539 
1540 /*
1541  * i_mdi_client_compute_state():
1542  *		Compute client device state
1543  *
1544  *		mdi_phci_t *	Pointer to pHCI structure which should
1545  *				while computing the new value.  Used by
1546  *				i_mdi_phci_offline() to find the new
1547  *				client state after DR of a pHCI.
1548  */
1549 static int
1550 i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
1551 {
1552 	int		state;
1553 	int		online_count = 0;
1554 	int		standby_count = 0;
1555 	mdi_pathinfo_t	*pip, *next;
1556 
1557 	ASSERT(MDI_CLIENT_LOCKED(ct));
1558 	pip = ct->ct_path_head;
1559 	while (pip != NULL) {
1560 		MDI_PI_LOCK(pip);
1561 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1562 		if (MDI_PI(pip)->pi_phci == ph) {
1563 			MDI_PI_UNLOCK(pip);
1564 			pip = next;
1565 			continue;
1566 		}
1567 
1568 		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1569 				== MDI_PATHINFO_STATE_ONLINE)
1570 			online_count++;
1571 		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1572 				== MDI_PATHINFO_STATE_STANDBY)
1573 			standby_count++;
1574 		MDI_PI_UNLOCK(pip);
1575 		pip = next;
1576 	}
1577 
1578 	if (online_count == 0) {
1579 		if (standby_count == 0) {
1580 			state = MDI_CLIENT_STATE_FAILED;
1581 			MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
1582 			    "client state failed: ct = %p", (void *)ct));
1583 		} else if (standby_count == 1) {
1584 			state = MDI_CLIENT_STATE_DEGRADED;
1585 		} else {
1586 			state = MDI_CLIENT_STATE_OPTIMAL;
1587 		}
1588 	} else if (online_count == 1) {
1589 		if (standby_count == 0) {
1590 			state = MDI_CLIENT_STATE_DEGRADED;
1591 		} else {
1592 			state = MDI_CLIENT_STATE_OPTIMAL;
1593 		}
1594 	} else {
1595 		state = MDI_CLIENT_STATE_OPTIMAL;
1596 	}
1597 	return (state);
1598 }
1599 
1600 /*
1601  * i_mdi_client2devinfo():
1602  *		Utility function
1603  */
1604 dev_info_t *
1605 i_mdi_client2devinfo(mdi_client_t *ct)
1606 {
1607 	return (ct->ct_dip);
1608 }
1609 
1610 /*
1611  * mdi_client_path2_devinfo():
1612  * 		Given the parent devinfo and child devfs pathname, search for
1613  *		a valid devfs node handle.
1614  */
1615 dev_info_t *
1616 mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
1617 {
1618 	dev_info_t 	*cdip = NULL;
1619 	dev_info_t 	*ndip = NULL;
1620 	char		*temp_pathname;
1621 	int		circular;
1622 
1623 	/*
1624 	 * Allocate temp buffer
1625 	 */
1626 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1627 
1628 	/*
1629 	 * Lock parent against changes
1630 	 */
1631 	ndi_devi_enter(vdip, &circular);
1632 	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
1633 	while ((cdip = ndip) != NULL) {
1634 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1635 
1636 		*temp_pathname = '\0';
1637 		(void) ddi_pathname(cdip, temp_pathname);
1638 		if (strcmp(temp_pathname, pathname) == 0) {
1639 			break;
1640 		}
1641 	}
1642 	/*
1643 	 * Release devinfo lock
1644 	 */
1645 	ndi_devi_exit(vdip, circular);
1646 
1647 	/*
1648 	 * Free the temp buffer
1649 	 */
1650 	kmem_free(temp_pathname, MAXPATHLEN);
1651 	return (cdip);
1652 }
1653 
1654 /*
1655  * mdi_client_get_path_count():
1656  * 		Utility function to get number of path information nodes
1657  *		associated with a given client device.
1658  */
1659 int
1660 mdi_client_get_path_count(dev_info_t *cdip)
1661 {
1662 	mdi_client_t	*ct;
1663 	int		count = 0;
1664 
1665 	ct = i_devi_get_client(cdip);
1666 	if (ct != NULL) {
1667 		count = ct->ct_path_count;
1668 	}
1669 	return (count);
1670 }
1671 
1672 
1673 /*
1674  * i_mdi_get_hash_key():
1675  * 		Create a hash using strings as keys
1676  *
1677  */
1678 static int
1679 i_mdi_get_hash_key(char *str)
1680 {
1681 	uint32_t	g, hash = 0;
1682 	char		*p;
1683 
1684 	for (p = str; *p != '\0'; p++) {
1685 		g = *p;
1686 		hash += g;
1687 	}
1688 	return (hash % (CLIENT_HASH_TABLE_SIZE - 1));
1689 }
1690 
1691 /*
1692  * mdi_get_lb_policy():
1693  * 		Get current load balancing policy for a given client device
1694  */
1695 client_lb_t
1696 mdi_get_lb_policy(dev_info_t *cdip)
1697 {
1698 	client_lb_t	lb = LOAD_BALANCE_NONE;
1699 	mdi_client_t	*ct;
1700 
1701 	ct = i_devi_get_client(cdip);
1702 	if (ct != NULL) {
1703 		lb = ct->ct_lb;
1704 	}
1705 	return (lb);
1706 }
1707 
1708 /*
1709  * mdi_set_lb_region_size():
1710  * 		Set current region size for the load-balance
1711  */
1712 int
1713 mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
1714 {
1715 	mdi_client_t	*ct;
1716 	int		rv = MDI_FAILURE;
1717 
1718 	ct = i_devi_get_client(cdip);
1719 	if (ct != NULL && ct->ct_lb_args != NULL) {
1720 		ct->ct_lb_args->region_size = region_size;
1721 		rv = MDI_SUCCESS;
1722 	}
1723 	return (rv);
1724 }
1725 
1726 /*
1727  * mdi_Set_lb_policy():
1728  * 		Set current load balancing policy for a given client device
1729  */
1730 int
1731 mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
1732 {
1733 	mdi_client_t	*ct;
1734 	int		rv = MDI_FAILURE;
1735 
1736 	ct = i_devi_get_client(cdip);
1737 	if (ct != NULL) {
1738 		ct->ct_lb = lb;
1739 		rv = MDI_SUCCESS;
1740 	}
1741 	return (rv);
1742 }
1743 
1744 /*
1745  * mdi_failover():
1746  *		failover function called by the vHCI drivers to initiate
1747  *		a failover operation.  This is typically due to non-availability
1748  *		of online paths to route I/O requests.  Failover can be
1749  *		triggered through user application also.
1750  *
1751  *		The vHCI driver calls mdi_failover() to initiate a failover
1752  *		operation. mdi_failover() calls back into the vHCI driver's
1753  *		vo_failover() entry point to perform the actual failover
1754  *		operation.  The reason for requiring the vHCI driver to
1755  *		initiate failover by calling mdi_failover(), instead of directly
1756  *		executing vo_failover() itself, is to ensure that the mdi
1757  *		framework can keep track of the client state properly.
1758  *		Additionally, mdi_failover() provides as a convenience the
1759  *		option of performing the failover operation synchronously or
1760  *		asynchronously
1761  *
1762  *		Upon successful completion of the failover operation, the
1763  *		paths that were previously ONLINE will be in the STANDBY state,
1764  *		and the newly activated paths will be in the ONLINE state.
1765  *
1766  *		The flags modifier determines whether the activation is done
1767  *		synchronously: MDI_FAILOVER_SYNC
1768  * Return Values:
1769  *		MDI_SUCCESS
1770  *		MDI_FAILURE
1771  *		MDI_BUSY
1772  */
1773 /*ARGSUSED*/
1774 int
1775 mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
1776 {
1777 	int			rv;
1778 	mdi_client_t		*ct;
1779 
1780 	ct = i_devi_get_client(cdip);
1781 	ASSERT(ct != NULL);
1782 	if (ct == NULL) {
1783 		/* cdip is not a valid client device. Nothing more to do. */
1784 		return (MDI_FAILURE);
1785 	}
1786 
1787 	MDI_CLIENT_LOCK(ct);
1788 
1789 	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
1790 		/* A path to the client is being freed */
1791 		MDI_CLIENT_UNLOCK(ct);
1792 		return (MDI_BUSY);
1793 	}
1794 
1795 
1796 	if (MDI_CLIENT_IS_FAILED(ct)) {
1797 		/*
1798 		 * Client is in failed state. Nothing more to do.
1799 		 */
1800 		MDI_CLIENT_UNLOCK(ct);
1801 		return (MDI_FAILURE);
1802 	}
1803 
1804 	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1805 		/*
1806 		 * Failover is already in progress; return BUSY
1807 		 */
1808 		MDI_CLIENT_UNLOCK(ct);
1809 		return (MDI_BUSY);
1810 	}
1811 	/*
1812 	 * Make sure that mdi_pathinfo node state changes are processed.
1813 	 * We do not allow failovers to progress while client path state
1814 	 * changes are in progress
1815 	 */
1816 	if (ct->ct_unstable) {
1817 		if (flags == MDI_FAILOVER_ASYNC) {
1818 			MDI_CLIENT_UNLOCK(ct);
1819 			return (MDI_BUSY);
1820 		} else {
1821 			while (ct->ct_unstable)
1822 				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
1823 		}
1824 	}
1825 
1826 	/*
1827 	 * Client device is in stable state. Before proceeding, perform sanity
1828 	 * checks again.
1829 	 */
1830 	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
1831 	    (!i_ddi_devi_attached(ct->ct_dip))) {
1832 		/*
1833 		 * Client is in failed state. Nothing more to do.
1834 		 */
1835 		MDI_CLIENT_UNLOCK(ct);
1836 		return (MDI_FAILURE);
1837 	}
1838 
1839 	/*
1840 	 * Set the client state as failover in progress.
1841 	 */
1842 	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
1843 	ct->ct_failover_flags = flags;
1844 	MDI_CLIENT_UNLOCK(ct);
1845 
1846 	if (flags == MDI_FAILOVER_ASYNC) {
1847 		/*
1848 		 * Submit the initiate failover request via CPR safe
1849 		 * taskq threads.
1850 		 */
1851 		(void) taskq_dispatch(mdi_taskq, (task_func_t *)i_mdi_failover,
1852 		    ct, KM_SLEEP);
1853 		return (MDI_ACCEPT);
1854 	} else {
1855 		/*
1856 		 * Synchronous failover mode.  Typically invoked from the user
1857 		 * land.
1858 		 */
1859 		rv = i_mdi_failover(ct);
1860 	}
1861 	return (rv);
1862 }
1863 
1864 /*
1865  * i_mdi_failover():
1866  *		internal failover function. Invokes vHCI drivers failover
1867  *		callback function and process the failover status
1868  * Return Values:
1869  *		None
1870  *
1871  * Note: A client device in failover state can not be detached or freed.
1872  */
1873 static int
1874 i_mdi_failover(void *arg)
1875 {
1876 	int		rv = MDI_SUCCESS;
1877 	mdi_client_t	*ct = (mdi_client_t *)arg;
1878 	mdi_vhci_t	*vh = ct->ct_vhci;
1879 
1880 	ASSERT(!MDI_CLIENT_LOCKED(ct));
1881 
1882 	if (vh->vh_ops->vo_failover != NULL) {
1883 		/*
1884 		 * Call vHCI drivers callback routine
1885 		 */
1886 		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
1887 		    ct->ct_failover_flags);
1888 	}
1889 
1890 	MDI_CLIENT_LOCK(ct);
1891 	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);
1892 
1893 	/*
1894 	 * Save the failover return status
1895 	 */
1896 	ct->ct_failover_status = rv;
1897 
1898 	/*
1899 	 * As a result of failover, client status would have been changed.
1900 	 * Update the client state and wake up anyone waiting on this client
1901 	 * device.
1902 	 */
1903 	i_mdi_client_update_state(ct);
1904 
1905 	cv_broadcast(&ct->ct_failover_cv);
1906 	MDI_CLIENT_UNLOCK(ct);
1907 	return (rv);
1908 }
1909 
1910 /*
1911  * Load balancing is logical block.
1912  * IOs within the range described by region_size
1913  * would go on the same path. This would improve the
1914  * performance by cache-hit on some of the RAID devices.
1915  * Search only for online paths(At some point we
1916  * may want to balance across target ports).
1917  * If no paths are found then default to round-robin.
1918  */
1919 static int
1920 i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
1921 {
1922 	int		path_index = -1;
1923 	int		online_path_count = 0;
1924 	int		online_nonpref_path_count = 0;
1925 	int 		region_size = ct->ct_lb_args->region_size;
1926 	mdi_pathinfo_t	*pip;
1927 	mdi_pathinfo_t	*next;
1928 	int		preferred, path_cnt;
1929 
1930 	pip = ct->ct_path_head;
1931 	while (pip) {
1932 		MDI_PI_LOCK(pip);
1933 		if (MDI_PI(pip)->pi_state ==
1934 		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
1935 			online_path_count++;
1936 		} else if (MDI_PI(pip)->pi_state ==
1937 		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
1938 			online_nonpref_path_count++;
1939 		}
1940 		next = (mdi_pathinfo_t *)
1941 		    MDI_PI(pip)->pi_client_link;
1942 		MDI_PI_UNLOCK(pip);
1943 		pip = next;
1944 	}
1945 	/* if found any online/preferred then use this type */
1946 	if (online_path_count > 0) {
1947 		path_cnt = online_path_count;
1948 		preferred = 1;
1949 	} else if (online_nonpref_path_count > 0) {
1950 		path_cnt = online_nonpref_path_count;
1951 		preferred = 0;
1952 	} else {
1953 		path_cnt = 0;
1954 	}
1955 	if (path_cnt) {
1956 		path_index = (bp->b_blkno >> region_size) % path_cnt;
1957 		pip = ct->ct_path_head;
1958 		while (pip && path_index != -1) {
1959 			MDI_PI_LOCK(pip);
1960 			if (path_index == 0 &&
1961 			    (MDI_PI(pip)->pi_state ==
1962 			    MDI_PATHINFO_STATE_ONLINE) &&
1963 				MDI_PI(pip)->pi_preferred == preferred) {
1964 				MDI_PI_HOLD(pip);
1965 				MDI_PI_UNLOCK(pip);
1966 				*ret_pip = pip;
1967 				return (MDI_SUCCESS);
1968 			}
1969 			path_index --;
1970 			next = (mdi_pathinfo_t *)
1971 			    MDI_PI(pip)->pi_client_link;
1972 			MDI_PI_UNLOCK(pip);
1973 			pip = next;
1974 		}
1975 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
1976 		    "lba %llx: path %s %p",
1977 		    bp->b_lblkno, mdi_pi_spathname(pip), (void *)pip));
1978 	}
1979 	return (MDI_FAILURE);
1980 }
1981 
1982 /*
1983  * mdi_select_path():
1984  *		select a path to access a client device.
1985  *
1986  *		mdi_select_path() function is called by the vHCI drivers to
1987  *		select a path to route the I/O request to.  The caller passes
1988  *		the block I/O data transfer structure ("buf") as one of the
1989  *		parameters.  The mpxio framework uses the buf structure
1990  *		contents to maintain per path statistics (total I/O size /
1991  *		count pending).  If more than one online paths are available to
1992  *		select, the framework automatically selects a suitable path
1993  *		for routing I/O request. If a failover operation is active for
1994  *		this client device the call shall be failed with MDI_BUSY error
1995  *		code.
1996  *
1997  *		By default this function returns a suitable path in online
1998  *		state based on the current load balancing policy.  Currently
1999  *		we support LOAD_BALANCE_NONE (Previously selected online path
2000  *		will continue to be used till the path is usable) and
2001  *		LOAD_BALANCE_RR (Online paths will be selected in a round
2002  *		robin fashion), LOAD_BALANCE_LB(Online paths will be selected
2003  *		based on the logical block).  The load balancing policy is set
2004  *		through the vHCI driver's configuration file (driver.conf).
2005  *
2006  *		vHCI drivers may override this default behavior by specifying
2007  *		appropriate flags.  The meaning of the "arg" argument depends
2008  *		on the flags specified. If MDI_SELECT_PATH_INSTANCE is set
2009  *		then the argument is the "path instance" of the path to select.
2010  *		If MDI_SELECT_PATH_INSTANCE is not set then the argument is
2011  *		"start_pip". A non NULL "start_pip" is the starting point to
2012  *		walk and find the next appropriate path.  The following values
2013  *		are currently defined: MDI_SELECT_ONLINE_PATH (to select an
2014  *		ONLINE path) and/or MDI_SELECT_STANDBY_PATH (to select an
2015  *		STANDBY path).
2016  *
2017  *		The non-standard behavior is used by the scsi_vhci driver,
2018  *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
2019  *		attach of client devices (to avoid an unnecessary failover
2020  *		when the STANDBY path comes up first), during failover
2021  *		(to activate a STANDBY path as ONLINE).
2022  *
2023  *		The selected path is returned in a mdi_hold_path() state
2024  *		(pi_ref_cnt). Caller should release the hold by calling
2025  *		mdi_rele_path().
2026  *
2027  * Return Values:
2028  *		MDI_SUCCESS	- Completed successfully
2029  *		MDI_BUSY 	- Client device is busy failing over
2030  *		MDI_NOPATH	- Client device is online, but no valid path are
2031  *				  available to access this client device
2032  *		MDI_FAILURE	- Invalid client device or state
2033  *		MDI_DEVI_ONLINING
2034  *				- Client device (struct dev_info state) is in
2035  *				  onlining state.
2036  */
2037 
/*ARGSUSED*/
int
mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
    void *arg, mdi_pathinfo_t **ret_pip)
{
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	mdi_pathinfo_t	*head;
	mdi_pathinfo_t	*start;
	client_lb_t	lbp;	/* load balancing policy */
	int		sb = 1;	/* standard behavior */
	int		preferred = 1;	/* preferred path */
	int		cond, cont = 1;	/* cond: path matches; cont: keep scanning */
	int		retry = 0;	/* set when a busy (transient) path is seen */
	mdi_pathinfo_t	*start_pip;	/* request starting pathinfo */
	int		path_instance;	/* request specific path instance */

	/*
	 * Determine the type of arg based on flags: with
	 * MDI_SELECT_PATH_INSTANCE it carries an integer path_instance,
	 * otherwise it is the pathinfo node to start the search from
	 * (may be NULL).
	 */
	if (flags & MDI_SELECT_PATH_INSTANCE) {
		path_instance = (int)(intptr_t)arg;
		start_pip = NULL;
	} else {
		path_instance = 0;
		start_pip = (mdi_pathinfo_t *)arg;
	}

	if (flags != 0) {
		/*
		 * Any flag disables the default (standard) behavior.
		 */
		sb = 0;
	}

	*ret_pip = NULL;
	ct = i_devi_get_client(cdip);
	if (ct == NULL) {
		/* mdi extensions are NULL, Nothing more to do */
		return (MDI_FAILURE);
	}

	MDI_CLIENT_LOCK(ct);

	/*
	 * The client state checks below are applied only for the standard
	 * behavior; they are skipped when the caller passed any flag.
	 */
	if (sb) {
		if (MDI_CLIENT_IS_FAILED(ct)) {
			/*
			 * Client is not ready to accept any I/O requests.
			 * Fail this request.
			 */
			MDI_DEBUG(2, (MDI_NOTE, cdip,
			    "client state offline ct = %p", (void *)ct));
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_FAILURE);
		}

		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
			/*
			 * Check whether a failover is in progress. If so tell
			 * the caller that this device is busy.
			 */
			MDI_DEBUG(2, (MDI_NOTE, cdip,
			    "client failover in progress ct = %p",
			    (void *)ct));
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_BUSY);
		}

		/*
		 * Check to see whether the client device is attached.
		 * If it is not, let the vHCI driver manually select a path
		 * (standby) and let the probe/attach process continue.
		 */
		if (MDI_CLIENT_IS_DETACHED(ct) || !i_ddi_devi_attached(cdip)) {
			MDI_DEBUG(4, (MDI_NOTE, cdip,
			    "devi is onlining ct = %p", (void *)ct));
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_DEVI_ONLINING);
		}
	}

	/*
	 * Cache the client's path list head.  If the head of the list is
	 * NULL return MDI_NOPATH.
	 */
	head = ct->ct_path_head;
	if (head == NULL) {
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_NOPATH);
	}

	/* Caller is specifying a specific pathinfo path by path_instance */
	if (path_instance) {
		/* search for pathinfo with correct path_instance */
		for (pip = head;
		    pip && (mdi_pi_get_path_instance(pip) != path_instance);
		    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link)
			;

		/* If path can't be selected then MDI_NOPATH is returned. */
		if (pip == NULL) {
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_NOPATH);
		}

		/*
		 * Verify state of path. When asked to select a specific
		 * path_instance, we select the requested path in any
		 * state (ONLINE, OFFLINE, STANDBY, FAULT) other than INIT.
		 * We don't however select paths where the pHCI has detached.
		 * NOTE: last pathinfo node of an opened client device may
		 * exist in an OFFLINE state after the pHCI associated with
		 * that path has detached (but pi_phci will be NULL if that
		 * has occurred).
		 */
		MDI_PI_LOCK(pip);
		if ((MDI_PI(pip)->pi_state == MDI_PATHINFO_STATE_INIT) ||
		    (MDI_PI(pip)->pi_phci == NULL)) {
			MDI_PI_UNLOCK(pip);
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_FAILURE);
		}

		/* Return MDI_BUSY if we have a transient condition */
		if (MDI_PI_IS_TRANSIENT(pip)) {
			MDI_PI_UNLOCK(pip);
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_BUSY);
		}

		/*
		 * Return the path in hold state. Caller should release the
		 * hold by calling mdi_rele_path()
		 */
		MDI_PI_HOLD(pip);
		MDI_PI_UNLOCK(pip);
		*ret_pip = pip;
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_SUCCESS);
	}

	/*
	 * for non default behavior, bypass current
	 * load balancing policy and always use LOAD_BALANCE_RR
	 * except that the start point will be adjusted based
	 * on the provided start_pip
	 */
	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;

	switch (lbp) {
	case LOAD_BALANCE_NONE:
		/*
		 * Load balancing is None  or Alternate path mode
		 * Start looking for a online mdi_pathinfo node starting from
		 * last known selected path
		 */
		preferred = 1;
		pip = (mdi_pathinfo_t *)ct->ct_path_last;
		if (pip == NULL) {
			pip = head;
		}
		start = pip;
		do {
			MDI_PI_LOCK(pip);
			/*
			 * No need to explicitly check if the path is disabled.
			 * Since we are checking for state == ONLINE and the
			 * same variable is used for DISABLE/ENABLE information.
			 */
			if ((MDI_PI(pip)->pi_state  ==
				MDI_PATHINFO_STATE_ONLINE) &&
				preferred == MDI_PI(pip)->pi_preferred) {
				/*
				 * Return the path in hold state. Caller should
				 * release the hold by calling mdi_rele_path()
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				ct->ct_path_last = pip;
				*ret_pip = pip;
				MDI_CLIENT_UNLOCK(ct);
				return (MDI_SUCCESS);
			}

			/*
			 * Path is busy; remember it so that MDI_BUSY is
			 * returned if no usable path is found.
			 */
			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
			    MDI_PI_IS_TRANSIENT(pip))
				retry = 1;
			/*
			 * Keep looking for a next available online path,
			 * wrapping around to the head at the end of the list.
			 */
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (next == NULL) {
				next = head;
			}
			MDI_PI_UNLOCK(pip);
			pip = next;
			/*
			 * Back at the starting node: the first time around
			 * switch to non-preferred paths, the second time stop.
			 */
			if (start == pip && preferred) {
				preferred = 0;
			} else if (start == pip && !preferred) {
				cont = 0;
			}
		} while (cont);
		break;

	case LOAD_BALANCE_LBA:
		/*
		 * Make sure we are looking
		 * for an online path. Otherwise, if it is for a STANDBY
		 * path request, it will go through and fetch an ONLINE
		 * path which is not desirable.
		 *
		 * If i_mdi_lba_lb() is not tried or does not return
		 * MDI_SUCCESS, fall through to the round robin selection.
		 */
		if ((ct->ct_lb_args != NULL) &&
			    (ct->ct_lb_args->region_size) && bp &&
				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
			if (i_mdi_lba_lb(ct, ret_pip, bp)
				    == MDI_SUCCESS) {
				MDI_CLIENT_UNLOCK(ct);
				return (MDI_SUCCESS);
			}
		}
		/* FALLTHROUGH */
	case LOAD_BALANCE_RR:
		/*
		 * Load balancing is Round Robin. Start looking for a online
		 * mdi_pathinfo node starting from last known selected path
		 * as the start point.  If override flags are specified,
		 * process accordingly.
		 * If the search is already in effect (start_pip not NULL),
		 * then let's just use the same path preference to continue
		 * the traversal.
		 */

		if (start_pip != NULL) {
			preferred = MDI_PI(start_pip)->pi_preferred;
		} else {
			preferred = 1;
		}

		/*
		 * The scan begins at the node following the reference node
		 * (last selected path for standard behavior, start_pip
		 * otherwise), or at the head when there is no reference node.
		 */
		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
		if (start == NULL) {
			pip = head;
		} else {
			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
			if (pip == NULL) {
				if ( flags & MDI_SELECT_NO_PREFERRED) {
					/*
					 * Return since we hit the end of list
					 */
					MDI_CLIENT_UNLOCK(ct);
					return (MDI_NOPATH);
				}

				if (!sb) {
					if (preferred == 0) {
						/*
						 * Looks like we have completed
						 * the traversal as preferred
						 * value is 0. Time to bail out.
						 */
						*ret_pip = NULL;
						MDI_CLIENT_UNLOCK(ct);
						return (MDI_NOPATH);
					} else {
						/*
						 * Looks like we reached the
						 * end of the list. Let's enable
						 * traversal of non preferred
						 * paths.
						 */
						preferred = 0;
					}
				}
				pip = head;
			}
		}
		start = pip;
		do {
			MDI_PI_LOCK(pip);
			/*
			 * Compute 'cond': non-zero when this path satisfies
			 * the request.  Only the exact flag combinations
			 * tested below are recognized; any other combination
			 * never matches a path.
			 */
			if (sb) {
				cond = ((MDI_PI(pip)->pi_state ==
				    MDI_PATHINFO_STATE_ONLINE &&
					MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
			} else {
				if (flags == MDI_SELECT_ONLINE_PATH) {
					cond = ((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_ONLINE &&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else if (flags == MDI_SELECT_STANDBY_PATH) {
					cond = ((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_STANDBY &&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else if (flags == (MDI_SELECT_ONLINE_PATH |
				    MDI_SELECT_STANDBY_PATH)) {
					cond = (((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_ONLINE ||
					    (MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_STANDBY)) &&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else if (flags ==
					(MDI_SELECT_STANDBY_PATH |
					MDI_SELECT_ONLINE_PATH |
					MDI_SELECT_USER_DISABLE_PATH)) {
					cond = (((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_ONLINE ||
					    (MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_STANDBY) ||
						(MDI_PI(pip)->pi_state ==
					    (MDI_PATHINFO_STATE_ONLINE|
					    MDI_PATHINFO_STATE_USER_DISABLE)) ||
						(MDI_PI(pip)->pi_state ==
					    (MDI_PATHINFO_STATE_STANDBY |
					    MDI_PATHINFO_STATE_USER_DISABLE)))&&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else if (flags ==
				    (MDI_SELECT_STANDBY_PATH |
				    MDI_SELECT_ONLINE_PATH |
				    MDI_SELECT_NO_PREFERRED)) {
					/* pi_preferred is ignored here */
					cond = (((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_ONLINE) ||
					    (MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_STANDBY))
					    ? 1 : 0);
				} else {
					cond = 0;
				}
			}
			/*
			 * No need to explicitly check if the path is disabled.
			 * Since we are checking for state == ONLINE and the
			 * same variable is used for DISABLE/ENABLE information.
			 */
			if (cond) {
				/*
				 * Return the path in hold state. Caller should
				 * release the hold by calling mdi_rele_path().
				 * The last selected path is only recorded for
				 * the standard behavior.
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				if (sb)
					ct->ct_path_last = pip;
				*ret_pip = pip;
				MDI_CLIENT_UNLOCK(ct);
				return (MDI_SUCCESS);
			}
			/*
			 * Path is busy; remember it so that MDI_BUSY is
			 * returned if no usable path is found.
			 */
			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
			    MDI_PI_IS_TRANSIENT(pip))
				retry = 1;

			/*
			 * Keep looking for a next available online path.
			 * NOTE: do_again is entered with pip locked, both on
			 * fall-through and from the goto below.
			 */
do_again:
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (next == NULL) {
				if ( flags & MDI_SELECT_NO_PREFERRED) {
					/*
					 * Bail out since we hit the end of list
					 */
					MDI_PI_UNLOCK(pip);
					break;
				}

				if (!sb) {
					if (preferred == 1) {
						/*
						 * Looks like we reached the
						 * end of the list. Let's enable
						 * traversal of non preferred
						 * paths.
						 */
						preferred = 0;
						next = head;
					} else {
						/*
						 * We have done both the passes
						 * Preferred as well as for
						 * Non-preferred. Bail out now.
						 */
						cont = 0;
					}
				} else {
					/*
					 * Standard behavior case.
					 */
					next = head;
				}
			}
			MDI_PI_UNLOCK(pip);
			if (cont == 0) {
				break;
			}
			pip = next;

			if (!sb) {
				/*
				 * We need to handle the selection of
				 * non-preferred path in the following
				 * case:
				 *
				 * +------+   +------+   +------+   +-----+
				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
				 * +------+   +------+   +------+   +-----+
				 *
				 * If we start the search with B, we need to
				 * skip beyond B to pick C which is non -
				 * preferred in the second pass. The following
				 * test, if true, will allow us to skip over
				 * the 'start'(B in the example) to select
				 * other non preferred elements.
				 */
				if ((start_pip != NULL) && (start_pip == pip) &&
				    (MDI_PI(start_pip)->pi_preferred
				    != preferred)) {
					/*
					 * try again after going past the start
					 * pip
					 */
					MDI_PI_LOCK(pip);
					goto do_again;
				}
			} else {
				/*
				 * Standard behavior case
				 */
				if (start == pip && preferred) {
					/* look for nonpreferred paths */
					preferred = 0;
				} else if (start == pip && !preferred) {
					/*
					 * Exit condition
					 */
					cont = 0;
				}
			}
		} while (cont);
		break;
	}

	/*
	 * No path was selected.  Report MDI_BUSY if any path visited was in
	 * a transient state (the caller may retry), otherwise MDI_NOPATH.
	 */
	MDI_CLIENT_UNLOCK(ct);
	if (retry == 1) {
		return (MDI_BUSY);
	} else {
		return (MDI_NOPATH);
	}
}
2493 
2494 /*
2495  * For a client, return the next available path to any phci
2496  *
2497  * Note:
2498  *		Caller should hold the branch's devinfo node to get a consistent
2499  *		snap shot of the mdi_pathinfo nodes.
2500  *
2501  *		Please note that even the list is stable the mdi_pathinfo
2502  *		node state and properties are volatile.  The caller should lock
2503  *		and unlock the nodes by calling mdi_pi_lock() and
2504  *		mdi_pi_unlock() functions to get a stable properties.
2505  *
2506  *		If there is a need to use the nodes beyond the hold of the
2507  *		devinfo node period (For ex. I/O), then mdi_pathinfo node
2508  *		need to be held against unexpected removal by calling
2509  *		mdi_hold_path() and should be released by calling
2510  *		mdi_rele_path() on completion.
2511  */
2512 mdi_pathinfo_t *
2513 mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
2514 {
2515 	mdi_client_t *ct;
2516 
2517 	if (!MDI_CLIENT(ct_dip))
2518 		return (NULL);
2519 
2520 	/*
2521 	 * Walk through client link
2522 	 */
2523 	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
2524 	ASSERT(ct != NULL);
2525 
2526 	if (pip == NULL)
2527 		return ((mdi_pathinfo_t *)ct->ct_path_head);
2528 
2529 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link);
2530 }
2531 
2532 /*
2533  * For a phci, return the next available path to any client
2534  * Note: ditto mdi_get_next_phci_path()
2535  */
2536 mdi_pathinfo_t *
2537 mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
2538 {
2539 	mdi_phci_t *ph;
2540 
2541 	if (!MDI_PHCI(ph_dip))
2542 		return (NULL);
2543 
2544 	/*
2545 	 * Walk through pHCI link
2546 	 */
2547 	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
2548 	ASSERT(ph != NULL);
2549 
2550 	if (pip == NULL)
2551 		return ((mdi_pathinfo_t *)ph->ph_path_head);
2552 
2553 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
2554 }
2555 
2556 /*
2557  * mdi_hold_path():
2558  *		Hold the mdi_pathinfo node against unwanted unexpected free.
2559  * Return Values:
2560  *		None
2561  */
2562 void
2563 mdi_hold_path(mdi_pathinfo_t *pip)
2564 {
2565 	if (pip) {
2566 		MDI_PI_LOCK(pip);
2567 		MDI_PI_HOLD(pip);
2568 		MDI_PI_UNLOCK(pip);
2569 	}
2570 }
2571 
2572 
2573 /*
2574  * mdi_rele_path():
2575  *		Release the mdi_pathinfo node which was selected
2576  *		through mdi_select_path() mechanism or manually held by
2577  *		calling mdi_hold_path().
2578  * Return Values:
2579  *		None
2580  */
2581 void
2582 mdi_rele_path(mdi_pathinfo_t *pip)
2583 {
2584 	if (pip) {
2585 		MDI_PI_LOCK(pip);
2586 		MDI_PI_RELE(pip);
2587 		if (MDI_PI(pip)->pi_ref_cnt == 0) {
2588 			cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
2589 		}
2590 		MDI_PI_UNLOCK(pip);
2591 	}
2592 }
2593 
2594 /*
2595  * mdi_pi_lock():
2596  * 		Lock the mdi_pathinfo node.
2597  * Note:
2598  *		The caller should release the lock by calling mdi_pi_unlock()
2599  */
2600 void
2601 mdi_pi_lock(mdi_pathinfo_t *pip)
2602 {
2603 	ASSERT(pip != NULL);
2604 	if (pip) {
2605 		MDI_PI_LOCK(pip);
2606 	}
2607 }
2608 
2609 
2610 /*
2611  * mdi_pi_unlock():
2612  * 		Unlock the mdi_pathinfo node.
2613  * Note:
2614  *		The mdi_pathinfo node should have been locked with mdi_pi_lock()
2615  */
2616 void
2617 mdi_pi_unlock(mdi_pathinfo_t *pip)
2618 {
2619 	ASSERT(pip != NULL);
2620 	if (pip) {
2621 		MDI_PI_UNLOCK(pip);
2622 	}
2623 }
2624 
2625 /*
2626  * mdi_pi_find():
2627  *		Search the list of mdi_pathinfo nodes attached to the
2628  *		pHCI/Client device node whose path address matches "paddr".
2629  *		Returns a pointer to the mdi_pathinfo node if a matching node is
2630  *		found.
2631  * Return Values:
2632  *		mdi_pathinfo node handle
2633  *		NULL
2634  * Notes:
2635  *		Caller need not hold any locks to call this function.
2636  */
2637 mdi_pathinfo_t *
2638 mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
2639 {
2640 	mdi_phci_t		*ph;
2641 	mdi_vhci_t		*vh;
2642 	mdi_client_t		*ct;
2643 	mdi_pathinfo_t		*pip = NULL;
2644 
2645 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2646 	    "caddr@%s paddr@%s", caddr ? caddr : "", paddr ? paddr : ""));
2647 	if ((pdip == NULL) || (paddr == NULL)) {
2648 		return (NULL);
2649 	}
2650 	ph = i_devi_get_phci(pdip);
2651 	if (ph == NULL) {
2652 		/*
2653 		 * Invalid pHCI device, Nothing more to do.
2654 		 */
2655 		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid phci"));
2656 		return (NULL);
2657 	}
2658 
2659 	vh = ph->ph_vhci;
2660 	if (vh == NULL) {
2661 		/*
2662 		 * Invalid vHCI device, Nothing more to do.
2663 		 */
2664 		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid vhci"));
2665 		return (NULL);
2666 	}
2667 
2668 	/*
2669 	 * Look for pathinfo node identified by paddr.
2670 	 */
2671 	if (caddr == NULL) {
2672 		/*
2673 		 * Find a mdi_pathinfo node under pHCI list for a matching
2674 		 * unit address.
2675 		 */
2676 		MDI_PHCI_LOCK(ph);
2677 		if (MDI_PHCI_IS_OFFLINE(ph)) {
2678 			MDI_DEBUG(2, (MDI_WARN, pdip,
2679 			    "offline phci %p", (void *)ph));
2680 			MDI_PHCI_UNLOCK(ph);
2681 			return (NULL);
2682 		}
2683 		pip = (mdi_pathinfo_t *)ph->ph_path_head;
2684 
2685 		while (pip != NULL) {
2686 			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2687 				break;
2688 			}
2689 			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
2690 		}
2691 		MDI_PHCI_UNLOCK(ph);
2692 		MDI_DEBUG(2, (MDI_NOTE, pdip,
2693 		    "found %s %p", mdi_pi_spathname(pip), (void *)pip));
2694 		return (pip);
2695 	}
2696 
2697 	/*
2698 	 * XXX - Is the rest of the code in this function really necessary?
2699 	 * The consumers of mdi_pi_find() can search for the desired pathinfo
2700 	 * node by calling mdi_pi_find(pdip, NULL, paddr). Irrespective of
2701 	 * whether the search is based on the pathinfo nodes attached to
2702 	 * the pHCI or the client node, the result will be the same.
2703 	 */
2704 
2705 	/*
2706 	 * Find the client device corresponding to 'caddr'
2707 	 */
2708 	MDI_VHCI_CLIENT_LOCK(vh);
2709 
2710 	/*
2711 	 * XXX - Passing NULL to the following function works as long as the
2712 	 * the client addresses (caddr) are unique per vhci basis.
2713 	 */
2714 	ct = i_mdi_client_find(vh, NULL, caddr);
2715 	if (ct == NULL) {
2716 		/*
2717 		 * Client not found, Obviously mdi_pathinfo node has not been
2718 		 * created yet.
2719 		 */
2720 		MDI_VHCI_CLIENT_UNLOCK(vh);
2721 		MDI_DEBUG(2, (MDI_NOTE, pdip,
2722 		    "client not found for caddr @%s", caddr ? caddr : ""));
2723 		return (NULL);
2724 	}
2725 
2726 	/*
2727 	 * Hold the client lock and look for a mdi_pathinfo node with matching
2728 	 * pHCI and paddr
2729 	 */
2730 	MDI_CLIENT_LOCK(ct);
2731 
2732 	/*
2733 	 * Release the global mutex as it is no more needed. Note: We always
2734 	 * respect the locking order while acquiring.
2735 	 */
2736 	MDI_VHCI_CLIENT_UNLOCK(vh);
2737 
2738 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2739 	while (pip != NULL) {
2740 		/*
2741 		 * Compare the unit address
2742 		 */
2743 		if ((MDI_PI(pip)->pi_phci == ph) &&
2744 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2745 			break;
2746 		}
2747 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2748 	}
2749 	MDI_CLIENT_UNLOCK(ct);
2750 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2751 	    "found: %s %p", mdi_pi_spathname(pip), (void *)pip));
2752 	return (pip);
2753 }
2754 
2755 /*
2756  * mdi_pi_alloc():
2757  *		Allocate and initialize a new instance of a mdi_pathinfo node.
2758  *		The mdi_pathinfo node returned by this function identifies a
2759  *		unique device path is capable of having properties attached
2760  *		and passed to mdi_pi_online() to fully attach and online the
2761  *		path and client device node.
2762  *		The mdi_pathinfo node returned by this function must be
2763  *		destroyed using mdi_pi_free() if the path is no longer
2764  *		operational or if the caller fails to attach a client device
2765  *		node when calling mdi_pi_online(). The framework will not free
2766  *		the resources allocated.
2767  *		This function can be called from both interrupt and kernel
2768  *		contexts.  DDI_NOSLEEP flag should be used while calling
2769  *		from interrupt contexts.
2770  * Return Values:
2771  *		MDI_SUCCESS
2772  *		MDI_FAILURE
2773  *		MDI_NOMEM
2774  */
2775 /*ARGSUSED*/
2776 int
2777 mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2778     char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
2779 {
2780 	mdi_vhci_t	*vh;
2781 	mdi_phci_t	*ph;
2782 	mdi_client_t	*ct;
2783 	mdi_pathinfo_t	*pip = NULL;
2784 	dev_info_t	*cdip;
2785 	int		rv = MDI_NOMEM;
2786 	int		path_allocated = 0;
2787 
2788 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2789 	    "cname %s: caddr@%s paddr@%s",
2790 	    cname ? cname : "", caddr ? caddr : "", paddr ? paddr : ""));
2791 
2792 	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
2793 	    ret_pip == NULL) {
2794 		/* Nothing more to do */
2795 		return (MDI_FAILURE);
2796 	}
2797 
2798 	*ret_pip = NULL;
2799 
2800 	/* No allocations on detaching pHCI */
2801 	if (DEVI_IS_DETACHING(pdip)) {
2802 		/* Invalid pHCI device, return failure */
2803 		MDI_DEBUG(1, (MDI_WARN, pdip,
2804 		    "!detaching pHCI=%p", (void *)pdip));
2805 		return (MDI_FAILURE);
2806 	}
2807 
2808 	ph = i_devi_get_phci(pdip);
2809 	ASSERT(ph != NULL);
2810 	if (ph == NULL) {
2811 		/* Invalid pHCI device, return failure */
2812 		MDI_DEBUG(1, (MDI_WARN, pdip,
2813 		    "!invalid pHCI=%p", (void *)pdip));
2814 		return (MDI_FAILURE);
2815 	}
2816 
2817 	MDI_PHCI_LOCK(ph);
2818 	vh = ph->ph_vhci;
2819 	if (vh == NULL) {
2820 		/* Invalid vHCI device, return failure */
2821 		MDI_DEBUG(1, (MDI_WARN, pdip,
2822 		    "!invalid vHCI=%p", (void *)pdip));
2823 		MDI_PHCI_UNLOCK(ph);
2824 		return (MDI_FAILURE);
2825 	}
2826 
2827 	if (MDI_PHCI_IS_READY(ph) == 0) {
2828 		/*
2829 		 * Do not allow new node creation when pHCI is in
2830 		 * offline/suspended states
2831 		 */
2832 		MDI_DEBUG(1, (MDI_WARN, pdip,
2833 		    "pHCI=%p is not ready", (void *)ph));
2834 		MDI_PHCI_UNLOCK(ph);
2835 		return (MDI_BUSY);
2836 	}
2837 	MDI_PHCI_UNSTABLE(ph);
2838 	MDI_PHCI_UNLOCK(ph);
2839 
2840 	/* look for a matching client, create one if not found */
2841 	MDI_VHCI_CLIENT_LOCK(vh);
2842 	ct = i_mdi_client_find(vh, cname, caddr);
2843 	if (ct == NULL) {
2844 		ct = i_mdi_client_alloc(vh, cname, caddr);
2845 		ASSERT(ct != NULL);
2846 	}
2847 
2848 	if (ct->ct_dip == NULL) {
2849 		/*
2850 		 * Allocate a devinfo node
2851 		 */
2852 		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
2853 		    compatible, ncompatible);
2854 		if (ct->ct_dip == NULL) {
2855 			(void) i_mdi_client_free(vh, ct);
2856 			goto fail;
2857 		}
2858 	}
2859 	cdip = ct->ct_dip;
2860 
2861 	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
2862 	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;
2863 
2864 	MDI_CLIENT_LOCK(ct);
2865 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2866 	while (pip != NULL) {
2867 		/*
2868 		 * Compare the unit address
2869 		 */
2870 		if ((MDI_PI(pip)->pi_phci == ph) &&
2871 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2872 			break;
2873 		}
2874 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2875 	}
2876 	MDI_CLIENT_UNLOCK(ct);
2877 
2878 	if (pip == NULL) {
2879 		/*
2880 		 * This is a new path for this client device.  Allocate and
2881 		 * initialize a new pathinfo node
2882 		 */
2883 		pip = i_mdi_pi_alloc(ph, paddr, ct);
2884 		ASSERT(pip != NULL);
2885 		path_allocated = 1;
2886 	}
2887 	rv = MDI_SUCCESS;
2888 
2889 fail:
2890 	/*
2891 	 * Release the global mutex.
2892 	 */
2893 	MDI_VHCI_CLIENT_UNLOCK(vh);
2894 
2895 	/*
2896 	 * Mark the pHCI as stable
2897 	 */
2898 	MDI_PHCI_LOCK(ph);
2899 	MDI_PHCI_STABLE(ph);
2900 	MDI_PHCI_UNLOCK(ph);
2901 	*ret_pip = pip;
2902 
2903 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2904 	    "alloc %s %p", mdi_pi_spathname(pip), (void *)pip));
2905 
2906 	if (path_allocated)
2907 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
2908 
2909 	return (rv);
2910 }
2911 
2912 /*ARGSUSED*/
2913 int
2914 mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2915     int flags, mdi_pathinfo_t **ret_pip)
2916 {
2917 	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
2918 	    flags, ret_pip));
2919 }
2920 
2921 /*
2922  * i_mdi_pi_alloc():
2923  *		Allocate a mdi_pathinfo node and add to the pHCI path list
2924  * Return Values:
2925  *		mdi_pathinfo
2926  */
2927 /*ARGSUSED*/
2928 static mdi_pathinfo_t *
2929 i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
2930 {
2931 	mdi_pathinfo_t	*pip;
2932 	int		ct_circular;
2933 	int		ph_circular;
2934 	static char	path[MAXPATHLEN];	/* mdi_pathmap_mutex protects */
2935 	char		*path_persistent;
2936 	int		path_instance;
2937 	mod_hash_val_t	hv;
2938 
2939 	ASSERT(MDI_VHCI_CLIENT_LOCKED(ph->ph_vhci));
2940 
2941 	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
2942 	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
2943 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
2944 	    MDI_PATHINFO_STATE_TRANSIENT;
2945 
2946 	if (MDI_PHCI_IS_USER_DISABLED(ph))
2947 		MDI_PI_SET_USER_DISABLE(pip);
2948 
2949 	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
2950 		MDI_PI_SET_DRV_DISABLE_TRANS(pip);
2951 
2952 	if (MDI_PHCI_IS_DRV_DISABLED(ph))
2953 		MDI_PI_SET_DRV_DISABLE(pip);
2954 
2955 	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
2956 	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
2957 	MDI_PI(pip)->pi_client = ct;
2958 	MDI_PI(pip)->pi_phci = ph;
2959 	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
2960 	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);
2961 
2962         /*
2963 	 * We form the "path" to the pathinfo node, and see if we have
2964 	 * already allocated a 'path_instance' for that "path".  If so,
2965 	 * we use the already allocated 'path_instance'.  If not, we
2966 	 * allocate a new 'path_instance' and associate it with a copy of
2967 	 * the "path" string (which is never freed). The association
2968 	 * between a 'path_instance' this "path" string persists until
2969 	 * reboot.
2970 	 */
2971         mutex_enter(&mdi_pathmap_mutex);
2972 	(void) ddi_pathname(ph->ph_dip, path);
2973 	(void) sprintf(path + strlen(path), "/%s@%s",
2974 	    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2975         if (mod_hash_find(mdi_pathmap_bypath, (mod_hash_key_t)path, &hv) == 0) {
2976                 path_instance = (uint_t)(intptr_t)hv;
2977         } else {
2978 		/* allocate a new 'path_instance' and persistent "path" */
2979 		path_instance = mdi_pathmap_instance++;
2980 		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2981                 (void) mod_hash_insert(mdi_pathmap_bypath,
2982                     (mod_hash_key_t)path_persistent,
2983                     (mod_hash_val_t)(intptr_t)path_instance);
2984 		(void) mod_hash_insert(mdi_pathmap_byinstance,
2985 		    (mod_hash_key_t)(intptr_t)path_instance,
2986 		    (mod_hash_val_t)path_persistent);
2987 
2988 		/* create shortpath name */
2989 		(void) snprintf(path, sizeof(path), "%s%d/%s@%s",
2990 		    ddi_driver_name(ph->ph_dip), ddi_get_instance(ph->ph_dip),
2991 		    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2992 		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2993 		(void) mod_hash_insert(mdi_pathmap_sbyinstance,
2994 		    (mod_hash_key_t)(intptr_t)path_instance,
2995 		    (mod_hash_val_t)path_persistent);
2996         }
2997         mutex_exit(&mdi_pathmap_mutex);
2998 	MDI_PI(pip)->pi_path_instance = path_instance;
2999 
3000 	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
3001 	ASSERT(MDI_PI(pip)->pi_prop != NULL);
3002 	MDI_PI(pip)->pi_pprivate = NULL;
3003 	MDI_PI(pip)->pi_cprivate = NULL;
3004 	MDI_PI(pip)->pi_vprivate = NULL;
3005 	MDI_PI(pip)->pi_client_link = NULL;
3006 	MDI_PI(pip)->pi_phci_link = NULL;
3007 	MDI_PI(pip)->pi_ref_cnt = 0;
3008 	MDI_PI(pip)->pi_kstats = NULL;
3009 	MDI_PI(pip)->pi_preferred = 1;
3010 	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);
3011 
3012 	/*
3013 	 * Lock both dev_info nodes against changes in parallel.
3014 	 *
3015 	 * The ndi_devi_enter(Client), is atypical since the client is a leaf.
3016 	 * This atypical operation is done to synchronize pathinfo nodes
3017 	 * during devinfo snapshot (see di_register_pip) by 'pretending' that
3018 	 * the pathinfo nodes are children of the Client.
3019 	 */
3020 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3021 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3022 
3023 	i_mdi_phci_add_path(ph, pip);
3024 	i_mdi_client_add_path(ct, pip);
3025 
3026 	ndi_devi_exit(ph->ph_dip, ph_circular);
3027 	ndi_devi_exit(ct->ct_dip, ct_circular);
3028 
3029 	return (pip);
3030 }
3031 
3032 /*
3033  * mdi_pi_pathname_by_instance():
3034  *	Lookup of "path" by 'path_instance'. Return "path".
3035  *	NOTE: returned "path" remains valid forever (until reboot).
3036  */
3037 char *
3038 mdi_pi_pathname_by_instance(int path_instance)
3039 {
3040 	char		*path;
3041 	mod_hash_val_t	hv;
3042 
3043 	/* mdi_pathmap lookup of "path" by 'path_instance' */
3044 	mutex_enter(&mdi_pathmap_mutex);
3045 	if (mod_hash_find(mdi_pathmap_byinstance,
3046 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
3047 		path = (char *)hv;
3048 	else
3049 		path = NULL;
3050 	mutex_exit(&mdi_pathmap_mutex);
3051 	return (path);
3052 }
3053 
3054 /*
3055  * mdi_pi_spathname_by_instance():
3056  *	Lookup of "shortpath" by 'path_instance'. Return "shortpath".
3057  *	NOTE: returned "shortpath" remains valid forever (until reboot).
3058  */
3059 char *
3060 mdi_pi_spathname_by_instance(int path_instance)
3061 {
3062 	char		*path;
3063 	mod_hash_val_t	hv;
3064 
3065 	/* mdi_pathmap lookup of "path" by 'path_instance' */
3066 	mutex_enter(&mdi_pathmap_mutex);
3067 	if (mod_hash_find(mdi_pathmap_sbyinstance,
3068 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
3069 		path = (char *)hv;
3070 	else
3071 		path = NULL;
3072 	mutex_exit(&mdi_pathmap_mutex);
3073 	return (path);
3074 }
3075 
3076 
3077 /*
3078  * i_mdi_phci_add_path():
3079  * 		Add a mdi_pathinfo node to pHCI list.
3080  * Notes:
3081  *		Caller should per-pHCI mutex
3082  */
3083 static void
3084 i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3085 {
3086 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3087 
3088 	MDI_PHCI_LOCK(ph);
3089 	if (ph->ph_path_head == NULL) {
3090 		ph->ph_path_head = pip;
3091 	} else {
3092 		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
3093 	}
3094 	ph->ph_path_tail = pip;
3095 	ph->ph_path_count++;
3096 	MDI_PHCI_UNLOCK(ph);
3097 }
3098 
3099 /*
3100  * i_mdi_client_add_path():
3101  *		Add mdi_pathinfo node to client list
3102  */
3103 static void
3104 i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3105 {
3106 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3107 
3108 	MDI_CLIENT_LOCK(ct);
3109 	if (ct->ct_path_head == NULL) {
3110 		ct->ct_path_head = pip;
3111 	} else {
3112 		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
3113 	}
3114 	ct->ct_path_tail = pip;
3115 	ct->ct_path_count++;
3116 	MDI_CLIENT_UNLOCK(ct);
3117 }
3118 
3119 /*
3120  * mdi_pi_free():
3121  *		Free the mdi_pathinfo node and also client device node if this
3122  *		is the last path to the device
3123  * Return Values:
3124  *		MDI_SUCCESS
3125  *		MDI_FAILURE
3126  *		MDI_BUSY
3127  */
3128 /*ARGSUSED*/
3129 int
3130 mdi_pi_free(mdi_pathinfo_t *pip, int flags)
3131 {
3132 	int		rv = MDI_FAILURE;
3133 	mdi_vhci_t	*vh;
3134 	mdi_phci_t	*ph;
3135 	mdi_client_t	*ct;
3136 	int		(*f)();
3137 	int		client_held = 0;
3138 
3139 	MDI_PI_LOCK(pip);
3140 	ph = MDI_PI(pip)->pi_phci;
3141 	ASSERT(ph != NULL);
3142 	if (ph == NULL) {
3143 		/*
3144 		 * Invalid pHCI device, return failure
3145 		 */
3146 		MDI_DEBUG(1, (MDI_WARN, NULL,
3147 		    "!invalid pHCI: pip %s %p",
3148 		    mdi_pi_spathname(pip), (void *)pip));
3149 		MDI_PI_UNLOCK(pip);
3150 		return (MDI_FAILURE);
3151 	}
3152 
3153 	vh = ph->ph_vhci;
3154 	ASSERT(vh != NULL);
3155 	if (vh == NULL) {
3156 		/* Invalid pHCI device, return failure */
3157 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3158 		    "!invalid vHCI: pip %s %p",
3159 		    mdi_pi_spathname(pip), (void *)pip));
3160 		MDI_PI_UNLOCK(pip);
3161 		return (MDI_FAILURE);
3162 	}
3163 
3164 	ct = MDI_PI(pip)->pi_client;
3165 	ASSERT(ct != NULL);
3166 	if (ct == NULL) {
3167 		/*
3168 		 * Invalid Client device, return failure
3169 		 */
3170 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3171 		    "!invalid client: pip %s %p",
3172 		    mdi_pi_spathname(pip), (void *)pip));
3173 		MDI_PI_UNLOCK(pip);
3174 		return (MDI_FAILURE);
3175 	}
3176 
3177 	/*
3178 	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
3179 	 * if the node state is either offline or init and the reference count
3180 	 * is zero.
3181 	 */
3182 	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
3183 	    MDI_PI_IS_INITING(pip))) {
3184 		/*
3185 		 * Node is busy
3186 		 */
3187 		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3188 		    "!busy: pip %s %p", mdi_pi_spathname(pip), (void *)pip));
3189 		MDI_PI_UNLOCK(pip);
3190 		return (MDI_BUSY);
3191 	}
3192 
3193 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3194 		/*
3195 		 * Give a chance for pending I/Os to complete.
3196 		 */
3197 		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3198 		    "!%d cmds still pending on path: %s %p",
3199 		    MDI_PI(pip)->pi_ref_cnt,
3200 		    mdi_pi_spathname(pip), (void *)pip));
3201 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
3202 		    &MDI_PI(pip)->pi_mutex,
3203 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
3204 			/*
3205 			 * The timeout time reached without ref_cnt being zero
3206 			 * being signaled.
3207 			 */
3208 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3209 			    "!Timeout reached on path %s %p without the cond",
3210 			    mdi_pi_spathname(pip), (void *)pip));
3211 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3212 			    "!%d cmds still pending on path %s %p",
3213 			    MDI_PI(pip)->pi_ref_cnt,
3214 			    mdi_pi_spathname(pip), (void *)pip));
3215 			MDI_PI_UNLOCK(pip);
3216 			return (MDI_BUSY);
3217 		}
3218 	}
3219 	if (MDI_PI(pip)->pi_pm_held) {
3220 		client_held = 1;
3221 	}
3222 	MDI_PI_UNLOCK(pip);
3223 
3224 	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));
3225 
3226 	MDI_CLIENT_LOCK(ct);
3227 
3228 	/* Prevent further failovers till MDI_VHCI_CLIENT_LOCK is held */
3229 	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);
3230 
3231 	/*
3232 	 * Wait till failover is complete before removing this node.
3233 	 */
3234 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3235 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3236 
3237 	MDI_CLIENT_UNLOCK(ct);
3238 	MDI_VHCI_CLIENT_LOCK(vh);
3239 	MDI_CLIENT_LOCK(ct);
3240 	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);
3241 
3242 	if (!MDI_PI_IS_INITING(pip)) {
3243 		f = vh->vh_ops->vo_pi_uninit;
3244 		if (f != NULL) {
3245 			rv = (*f)(vh->vh_dip, pip, 0);
3246 		}
3247 	}
3248 	/*
3249 	 * If vo_pi_uninit() completed successfully.
3250 	 */
3251 	if (rv == MDI_SUCCESS) {
3252 		if (client_held) {
3253 			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3254 			    "i_mdi_pm_rele_client\n"));
3255 			i_mdi_pm_rele_client(ct, 1);
3256 		}
3257 		i_mdi_pi_free(ph, pip, ct);
3258 		if (ct->ct_path_count == 0) {
3259 			/*
3260 			 * Client lost its last path.
3261 			 * Clean up the client device
3262 			 */
3263 			MDI_CLIENT_UNLOCK(ct);
3264 			(void) i_mdi_client_free(ct->ct_vhci, ct);
3265 			MDI_VHCI_CLIENT_UNLOCK(vh);
3266 			return (rv);
3267 		}
3268 	}
3269 	MDI_CLIENT_UNLOCK(ct);
3270 	MDI_VHCI_CLIENT_UNLOCK(vh);
3271 
3272 	if (rv == MDI_FAILURE)
3273 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
3274 
3275 	return (rv);
3276 }
3277 
3278 /*
3279  * i_mdi_pi_free():
3280  *		Free the mdi_pathinfo node
3281  */
3282 static void
3283 i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
3284 {
3285 	int	ct_circular;
3286 	int	ph_circular;
3287 
3288 	ASSERT(MDI_CLIENT_LOCKED(ct));
3289 
3290 	/*
3291 	 * remove any per-path kstats
3292 	 */
3293 	i_mdi_pi_kstat_destroy(pip);
3294 
3295 	/* See comments in i_mdi_pi_alloc() */
3296 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3297 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3298 
3299 	i_mdi_client_remove_path(ct, pip);
3300 	i_mdi_phci_remove_path(ph, pip);
3301 
3302 	ndi_devi_exit(ph->ph_dip, ph_circular);
3303 	ndi_devi_exit(ct->ct_dip, ct_circular);
3304 
3305 	mutex_destroy(&MDI_PI(pip)->pi_mutex);
3306 	cv_destroy(&MDI_PI(pip)->pi_state_cv);
3307 	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
3308 	if (MDI_PI(pip)->pi_addr) {
3309 		kmem_free(MDI_PI(pip)->pi_addr,
3310 		    strlen(MDI_PI(pip)->pi_addr) + 1);
3311 		MDI_PI(pip)->pi_addr = NULL;
3312 	}
3313 
3314 	if (MDI_PI(pip)->pi_prop) {
3315 		(void) nvlist_free(MDI_PI(pip)->pi_prop);
3316 		MDI_PI(pip)->pi_prop = NULL;
3317 	}
3318 	kmem_free(pip, sizeof (struct mdi_pathinfo));
3319 }
3320 
3321 
3322 /*
3323  * i_mdi_phci_remove_path():
3324  * 		Remove a mdi_pathinfo node from pHCI list.
3325  * Notes:
3326  *		Caller should hold per-pHCI mutex
3327  */
3328 static void
3329 i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3330 {
3331 	mdi_pathinfo_t	*prev = NULL;
3332 	mdi_pathinfo_t	*path = NULL;
3333 
3334 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3335 
3336 	MDI_PHCI_LOCK(ph);
3337 	path = ph->ph_path_head;
3338 	while (path != NULL) {
3339 		if (path == pip) {
3340 			break;
3341 		}
3342 		prev = path;
3343 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3344 	}
3345 
3346 	if (path) {
3347 		ph->ph_path_count--;
3348 		if (prev) {
3349 			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
3350 		} else {
3351 			ph->ph_path_head =
3352 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3353 		}
3354 		if (ph->ph_path_tail == path) {
3355 			ph->ph_path_tail = prev;
3356 		}
3357 	}
3358 
3359 	/*
3360 	 * Clear the pHCI link
3361 	 */
3362 	MDI_PI(pip)->pi_phci_link = NULL;
3363 	MDI_PI(pip)->pi_phci = NULL;
3364 	MDI_PHCI_UNLOCK(ph);
3365 }
3366 
3367 /*
3368  * i_mdi_client_remove_path():
3369  * 		Remove a mdi_pathinfo node from client path list.
3370  */
3371 static void
3372 i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3373 {
3374 	mdi_pathinfo_t	*prev = NULL;
3375 	mdi_pathinfo_t	*path;
3376 
3377 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3378 
3379 	ASSERT(MDI_CLIENT_LOCKED(ct));
3380 	path = ct->ct_path_head;
3381 	while (path != NULL) {
3382 		if (path == pip) {
3383 			break;
3384 		}
3385 		prev = path;
3386 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3387 	}
3388 
3389 	if (path) {
3390 		ct->ct_path_count--;
3391 		if (prev) {
3392 			MDI_PI(prev)->pi_client_link =
3393 			    MDI_PI(path)->pi_client_link;
3394 		} else {
3395 			ct->ct_path_head =
3396 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3397 		}
3398 		if (ct->ct_path_tail == path) {
3399 			ct->ct_path_tail = prev;
3400 		}
3401 		if (ct->ct_path_last == path) {
3402 			ct->ct_path_last = ct->ct_path_head;
3403 		}
3404 	}
3405 	MDI_PI(pip)->pi_client_link = NULL;
3406 	MDI_PI(pip)->pi_client = NULL;
3407 }
3408 
/*
 * i_mdi_pi_state_change():
 *		Move a mdi_pathinfo node to the requested state (online,
 *		standby, fault or offline).  This is the common worker behind
 *		mdi_pi_online(), mdi_pi_standby(), mdi_pi_fault() and
 *		mdi_pi_offline().
 *
 *		The sequence is:
 *		  - validate the pHCI, vHCI and client behind the path;
 *		  - if the path is still initing, call the vHCI's vo_pi_init();
 *		  - refuse with MDI_BUSY when the pHCI is not ready, otherwise
 *		    mark the pHCI unstable for the duration of the change;
 *		  - wait out any transient path state and any client failover;
 *		  - mark the path with the transient state matching 'state';
 *		  - call the vHCI's vo_pi_state_change();
 *		  - commit or roll back the path state, update the client
 *		    state and online/offline the client devinfo node if needed;
 *		  - mark the pHCI stable again.
 *
 *		'flag' is passed through to vo_pi_state_change(); NDI_USER_REQ
 *		in it selects the user initiated handling of last-path offline.
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		MDI_BUSY
 *		or any other value returned by vo_pi_state_change()
 */
/*ARGSUSED*/
static int
i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
{
	int		rv = MDI_SUCCESS;
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	int		(*f)();
	dev_info_t	*cdip;

	MDI_PI_LOCK(pip);

	ph = MDI_PI(pip)->pi_phci;
	ASSERT(ph);
	if (ph == NULL) {
		/*
		 * Invalid pHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, NULL,
		    "!invalid phci: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;
	ASSERT(vh);
	if (vh == NULL) {
		/*
		 * Invalid vHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
		    "!invalid vhci: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	ct = MDI_PI(pip)->pi_client;
	ASSERT(ct != NULL);
	if (ct == NULL) {
		/*
		 * Invalid client device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
		    "!invalid client: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	/*
	 * If this path has not been initialized yet, Callback vHCI driver's
	 * pathinfo node initialize entry point.  The callback runs without
	 * the pathinfo mutex held; on failure the path is left in the initing
	 * state and the request fails before the pHCI is marked unstable.
	 */

	if (MDI_PI_IS_INITING(pip)) {
		MDI_PI_UNLOCK(pip);
		f = vh->vh_ops->vo_pi_init;
		if (f != NULL) {
			rv = (*f)(vh->vh_dip, pip, 0);
			if (rv != MDI_SUCCESS) {
				MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
				    "!vo_pi_init failed: vHCI %p, pip %s %p",
				    (void *)vh, mdi_pi_spathname(pip),
				    (void *)pip));
				return (MDI_FAILURE);
			}
		}
		MDI_PI_LOCK(pip);
		MDI_PI_CLEAR_TRANSIENT(pip);
	}

	/*
	 * Do not allow state transition when pHCI is in offline/suspended
	 * states.
	 * NOTE(review): i_mdi_phci_lock() is defined elsewhere; presumably it
	 * acquires the pHCI mutex while the pathinfo mutex is already held
	 * (the reverse of the default lock order) -- confirm.
	 */
	i_mdi_phci_lock(ph, pip);
	if (MDI_PHCI_IS_READY(ph) == 0) {
		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
		    "!pHCI not ready, pHCI=%p", (void *)ph));
		MDI_PI_UNLOCK(pip);
		i_mdi_phci_unlock(ph);
		return (MDI_BUSY);
	}
	/* Balanced by MDI_PHCI_STABLE() at state_change_exit. */
	MDI_PHCI_UNSTABLE(ph);
	i_mdi_phci_unlock(ph);

	/*
	 * Check if mdi_pathinfo state is in transient state.
	 * If yes, offlining is in progress and wait till transient state is
	 * cleared.
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		while (MDI_PI_IS_TRANSIENT(pip)) {
			cv_wait(&MDI_PI(pip)->pi_state_cv,
			    &MDI_PI(pip)->pi_mutex);
		}
	}

	/*
	 * Grab the client lock in reverse order sequence and release the
	 * mdi_pathinfo mutex.
	 */
	i_mdi_client_lock(ct, pip);
	MDI_PI_UNLOCK(pip);

	/*
	 * Wait till failover state is cleared
	 */
	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);

	/*
	 * Mark the mdi_pathinfo node state as transient
	 */
	MDI_PI_LOCK(pip);
	switch (state) {
	case MDI_PATHINFO_STATE_ONLINE:
		MDI_PI_SET_ONLINING(pip);
		break;

	case MDI_PATHINFO_STATE_STANDBY:
		MDI_PI_SET_STANDBYING(pip);
		break;

	case MDI_PATHINFO_STATE_FAULT:
		/*
		 * Mark the pathinfo state as FAULTED
		 */
		MDI_PI_SET_FAULTING(pip);
		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
		break;

	case MDI_PATHINFO_STATE_OFFLINE:
		/*
		 * ndi_devi_offline() cannot hold pip or ct locks.
		 */
		MDI_PI_UNLOCK(pip);

		/*
		 * If this is a user initiated path online->offline operation
		 * who's success would transition a client from DEGRADED to
		 * FAILED then only proceed if we can offline the client first.
		 */
		cdip = ct->ct_dip;
		if ((flag & NDI_USER_REQ) &&
		    MDI_PI_IS_ONLINE(pip) &&
		    (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED)) {
			i_mdi_client_unlock(ct);
			rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN);
			if (rv != NDI_SUCCESS) {
				/*
				 * Convert to MDI error code.  Neither the
				 * client nor the pathinfo lock is held here,
				 * so only the pHCI needs to be marked stable
				 * on the way out.
				 */
				switch (rv) {
				case NDI_BUSY:
					rv = MDI_BUSY;
					break;
				default:
					rv = MDI_FAILURE;
					break;
				}
				goto state_change_exit;
			} else {
				i_mdi_client_lock(ct, NULL);
			}
		}
		/*
		 * Mark the mdi_pathinfo node state as transient
		 */
		MDI_PI_LOCK(pip);
		MDI_PI_SET_OFFLINING(pip);
		break;
	}
	MDI_PI_UNLOCK(pip);
	MDI_CLIENT_UNSTABLE(ct);
	i_mdi_client_unlock(ct);

	/*
	 * Notify the vHCI of the state change with no MDI locks held.  When
	 * the vHCI has no vo_pi_state_change() entry point, rv keeps the
	 * value it had on entry to this step.
	 */
	f = vh->vh_ops->vo_pi_state_change;
	if (f != NULL)
		rv = (*f)(vh->vh_dip, pip, state, 0, flag);

	MDI_CLIENT_LOCK(ct);
	MDI_PI_LOCK(pip);
	if (rv == MDI_NOT_SUPPORTED) {
		MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
	}
	if (rv != MDI_SUCCESS) {
		MDI_DEBUG(2, (MDI_WARN, ct->ct_dip,
		    "vo_pi_state_change failed: rv %x", rv));
	}
	/*
	 * Commit the transition on success, otherwise fall back to the state
	 * the path had before it was marked transient.
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		if (rv == MDI_SUCCESS) {
			MDI_PI_CLEAR_TRANSIENT(pip);
		} else {
			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
		}
	}

	/*
	 * Wake anyone waiting for this mdi_pathinfo node
	 */
	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
	MDI_PI_UNLOCK(pip);

	/*
	 * Mark the client device as stable
	 */
	MDI_CLIENT_STABLE(ct);
	if (rv == MDI_SUCCESS) {
		/*
		 * Only re-evaluate the client while no other state change is
		 * keeping it unstable.
		 */
		if (ct->ct_unstable == 0) {
			cdip = ct->ct_dip;

			/*
			 * Onlining the mdi_pathinfo node will impact the
			 * client state Update the client and dev_info node
			 * state accordingly.  From here until the final
			 * conversion below, rv carries an NDI_* code.
			 */
			rv = NDI_SUCCESS;
			i_mdi_client_update_state(ct);
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				if (cdip && !i_ddi_devi_attached(cdip) &&
				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
				    (state == MDI_PATHINFO_STATE_STANDBY))) {

					/*
					 * Must do ndi_devi_online() through
					 * hotplug thread for deferred
					 * attach mechanism to work
					 */
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_online(cdip, 0);
					MDI_CLIENT_LOCK(ct);
					if ((rv != NDI_SUCCESS) &&
					    (MDI_CLIENT_STATE(ct) ==
					    MDI_CLIENT_STATE_DEGRADED)) {
						/*
						 * ndi_devi_online failed.
						 * Reset client flags to
						 * offline.
						 */
						MDI_DEBUG(1, (MDI_WARN, cdip,
						    "!ndi_devi_online failed "
						    "error %x", rv));
						MDI_CLIENT_SET_OFFLINE(ct);
					}
					if (rv != NDI_SUCCESS) {
						/* Reset the path state */
						MDI_PI_LOCK(pip);
						MDI_PI(pip)->pi_state =
						    MDI_PI_OLD_STATE(pip);
						MDI_PI_UNLOCK(pip);
					}
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				/*
				 * This is the last path case for
				 * non-user initiated events.
				 */
				if (((flag & NDI_USER_REQ) == 0) &&
				    cdip && (i_ddi_node_state(cdip) >=
				    DS_INITIALIZED)) {
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_offline(cdip,
					    NDI_DEVFS_CLEAN);
					MDI_CLIENT_LOCK(ct);

					if (rv != NDI_SUCCESS) {
						/*
						 * ndi_devi_offline failed.
						 * Reset client flags to
						 * online as the path could not
						 * be offlined.
						 */
						MDI_DEBUG(1, (MDI_WARN, cdip,
						    "!ndi_devi_offline failed: "
						    "error %x", rv));
						MDI_CLIENT_SET_ONLINE(ct);
					}
				}
				break;
			}
			/*
			 * Convert to MDI error code
			 */
			switch (rv) {
			case NDI_SUCCESS:
				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
				i_mdi_report_path_state(ct, pip);
				rv = MDI_SUCCESS;
				break;
			case NDI_BUSY:
				rv = MDI_BUSY;
				break;
			default:
				rv = MDI_FAILURE;
				break;
			}
		}
	}
	MDI_CLIENT_UNLOCK(ct);

state_change_exit:
	/*
	 * Mark the pHCI as stable again.
	 */
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_STABLE(ph);
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}
3734 
3735 /*
3736  * mdi_pi_online():
3737  *		Place the path_info node in the online state.  The path is
3738  *		now available to be selected by mdi_select_path() for
3739  *		transporting I/O requests to client devices.
3740  * Return Values:
3741  *		MDI_SUCCESS
3742  *		MDI_FAILURE
3743  */
3744 int
3745 mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3746 {
3747 	mdi_client_t	*ct = MDI_PI(pip)->pi_client;
3748 	int		client_held = 0;
3749 	int		rv;
3750 
3751 	ASSERT(ct != NULL);
3752 	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
3753 	if (rv != MDI_SUCCESS)
3754 		return (rv);
3755 
3756 	MDI_PI_LOCK(pip);
3757 	if (MDI_PI(pip)->pi_pm_held == 0) {
3758 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3759 		    "i_mdi_pm_hold_pip %p", (void *)pip));
3760 		i_mdi_pm_hold_pip(pip);
3761 		client_held = 1;
3762 	}
3763 	MDI_PI_UNLOCK(pip);
3764 
3765 	if (client_held) {
3766 		MDI_CLIENT_LOCK(ct);
3767 		if (ct->ct_power_cnt == 0) {
3768 			rv = i_mdi_power_all_phci(ct);
3769 		}
3770 
3771 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3772 		    "i_mdi_pm_hold_client %p", (void *)ct));
3773 		i_mdi_pm_hold_client(ct, 1);
3774 		MDI_CLIENT_UNLOCK(ct);
3775 	}
3776 
3777 	return (rv);
3778 }
3779 
3780 /*
3781  * mdi_pi_standby():
3782  *		Place the mdi_pathinfo node in standby state
3783  *
3784  * Return Values:
3785  *		MDI_SUCCESS
3786  *		MDI_FAILURE
3787  */
3788 int
3789 mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
3790 {
3791 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
3792 }
3793 
3794 /*
3795  * mdi_pi_fault():
3796  *		Place the mdi_pathinfo node in fault'ed state
3797  * Return Values:
3798  *		MDI_SUCCESS
3799  *		MDI_FAILURE
3800  */
3801 int
3802 mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
3803 {
3804 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
3805 }
3806 
3807 /*
3808  * mdi_pi_offline():
3809  *		Offline a mdi_pathinfo node.
3810  * Return Values:
3811  *		MDI_SUCCESS
3812  *		MDI_FAILURE
3813  */
3814 int
3815 mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3816 {
3817 	int	ret, client_held = 0;
3818 	mdi_client_t	*ct;
3819 
3820 	/*
3821 	 * Original code overloaded NDI_DEVI_REMOVE to this interface, and
3822 	 * used it to mean "user initiated operation" (i.e. devctl). Callers
3823 	 * should now just use NDI_USER_REQ.
3824 	 */
3825 	if (flags & NDI_DEVI_REMOVE) {
3826 		flags &= ~NDI_DEVI_REMOVE;
3827 		flags |= NDI_USER_REQ;
3828 	}
3829 
3830 	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
3831 
3832 	if (ret == MDI_SUCCESS) {
3833 		MDI_PI_LOCK(pip);
3834 		if (MDI_PI(pip)->pi_pm_held) {
3835 			client_held = 1;
3836 		}
3837 		MDI_PI_UNLOCK(pip);
3838 
3839 		if (client_held) {
3840 			ct = MDI_PI(pip)->pi_client;
3841 			MDI_CLIENT_LOCK(ct);
3842 			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3843 			    "i_mdi_pm_rele_client\n"));
3844 			i_mdi_pm_rele_client(ct, 1);
3845 			MDI_CLIENT_UNLOCK(ct);
3846 		}
3847 	}
3848 
3849 	return (ret);
3850 }
3851 
3852 /*
3853  * i_mdi_pi_offline():
3854  *		Offline a mdi_pathinfo node and call the vHCI driver's callback
3855  */
3856 static int
3857 i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3858 {
3859 	dev_info_t	*vdip = NULL;
3860 	mdi_vhci_t	*vh = NULL;
3861 	mdi_client_t	*ct = NULL;
3862 	int		(*f)();
3863 	int		rv;
3864 
3865 	MDI_PI_LOCK(pip);
3866 	ct = MDI_PI(pip)->pi_client;
3867 	ASSERT(ct != NULL);
3868 
3869 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3870 		/*
3871 		 * Give a chance for pending I/Os to complete.
3872 		 */
3873 		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3874 		    "!%d cmds still pending on path %s %p",
3875 		    MDI_PI(pip)->pi_ref_cnt, mdi_pi_spathname(pip),
3876 		    (void *)pip));
3877 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
3878 		    &MDI_PI(pip)->pi_mutex,
3879 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
3880 			/*
3881 			 * The timeout time reached without ref_cnt being zero
3882 			 * being signaled.
3883 			 */
3884 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3885 			    "!Timeout reached on path %s %p without the cond",
3886 			    mdi_pi_spathname(pip), (void *)pip));
3887 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3888 			    "!%d cmds still pending on path %s %p",
3889 			    MDI_PI(pip)->pi_ref_cnt,
3890 			    mdi_pi_spathname(pip), (void *)pip));
3891 		}
3892 	}
3893 	vh = ct->ct_vhci;
3894 	vdip = vh->vh_dip;
3895 
3896 	/*
3897 	 * Notify vHCI that has registered this event
3898 	 */
3899 	ASSERT(vh->vh_ops);
3900 	f = vh->vh_ops->vo_pi_state_change;
3901 
3902 	if (f != NULL) {
3903 		MDI_PI_UNLOCK(pip);
3904 		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
3905 		    flags)) != MDI_SUCCESS) {
3906 			MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3907 			    "!vo_path_offline failed: vdip %s%d %p: path %s %p",
3908 			    ddi_driver_name(vdip), ddi_get_instance(vdip),
3909 			    (void *)vdip, mdi_pi_spathname(pip), (void *)pip));
3910 		}
3911 		MDI_PI_LOCK(pip);
3912 	}
3913 
3914 	/*
3915 	 * Set the mdi_pathinfo node state and clear the transient condition
3916 	 */
3917 	MDI_PI_SET_OFFLINE(pip);
3918 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3919 	MDI_PI_UNLOCK(pip);
3920 
3921 	MDI_CLIENT_LOCK(ct);
3922 	if (rv == MDI_SUCCESS) {
3923 		if (ct->ct_unstable == 0) {
3924 			dev_info_t	*cdip = ct->ct_dip;
3925 
3926 			/*
3927 			 * Onlining the mdi_pathinfo node will impact the
3928 			 * client state Update the client and dev_info node
3929 			 * state accordingly
3930 			 */
3931 			i_mdi_client_update_state(ct);
3932 			rv = NDI_SUCCESS;
3933 			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
3934 				if (cdip &&
3935 				    (i_ddi_node_state(cdip) >=
3936 				    DS_INITIALIZED)) {
3937 					MDI_CLIENT_UNLOCK(ct);
3938 					rv = ndi_devi_offline(cdip,
3939 					    NDI_DEVFS_CLEAN);
3940 					MDI_CLIENT_LOCK(ct);
3941 					if (rv != NDI_SUCCESS) {
3942 						/*
3943 						 * ndi_devi_offline failed.
3944 						 * Reset client flags to
3945 						 * online.
3946 						 */
3947 						MDI_DEBUG(4, (MDI_WARN, cdip,
3948 						    "ndi_devi_offline failed: "
3949 						    "error %x", rv));
3950 						MDI_CLIENT_SET_ONLINE(ct);
3951 					}
3952 				}
3953 			}
3954 			/*
3955 			 * Convert to MDI error code
3956 			 */
3957 			switch (rv) {
3958 			case NDI_SUCCESS:
3959 				rv = MDI_SUCCESS;
3960 				break;
3961 			case NDI_BUSY:
3962 				rv = MDI_BUSY;
3963 				break;
3964 			default:
3965 				rv = MDI_FAILURE;
3966 				break;
3967 			}
3968 		}
3969 		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3970 		i_mdi_report_path_state(ct, pip);
3971 	}
3972 
3973 	MDI_CLIENT_UNLOCK(ct);
3974 
3975 	/*
3976 	 * Change in the mdi_pathinfo node state will impact the client state
3977 	 */
3978 	MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
3979 	    "ct = %p pip = %p", (void *)ct, (void *)pip));
3980 	return (rv);
3981 }
3982 
3983 /*
3984  * mdi_pi_get_node_name():
3985  *              Get the name associated with a mdi_pathinfo node.
3986  *              Since pathinfo nodes are not directly named, we
3987  *              return the node_name of the client.
3988  *
3989  * Return Values:
3990  *              char *
3991  */
3992 char *
3993 mdi_pi_get_node_name(mdi_pathinfo_t *pip)
3994 {
3995 	mdi_client_t    *ct;
3996 
3997 	if (pip == NULL)
3998 		return (NULL);
3999 	ct = MDI_PI(pip)->pi_client;
4000 	if ((ct == NULL) || (ct->ct_dip == NULL))
4001 		return (NULL);
4002 	return (ddi_node_name(ct->ct_dip));
4003 }
4004 
4005 /*
4006  * mdi_pi_get_addr():
4007  *		Get the unit address associated with a mdi_pathinfo node
4008  *
4009  * Return Values:
4010  *		char *
4011  */
4012 char *
4013 mdi_pi_get_addr(mdi_pathinfo_t *pip)
4014 {
4015 	if (pip == NULL)
4016 		return (NULL);
4017 
4018 	return (MDI_PI(pip)->pi_addr);
4019 }
4020 
4021 /*
4022  * mdi_pi_get_path_instance():
4023  *		Get the 'path_instance' of a mdi_pathinfo node
4024  *
4025  * Return Values:
4026  *		path_instance
4027  */
4028 int
4029 mdi_pi_get_path_instance(mdi_pathinfo_t *pip)
4030 {
4031 	if (pip == NULL)
4032 		return (0);
4033 
4034 	return (MDI_PI(pip)->pi_path_instance);
4035 }
4036 
4037 /*
4038  * mdi_pi_pathname():
4039  *		Return pointer to path to pathinfo node.
4040  */
4041 char *
4042 mdi_pi_pathname(mdi_pathinfo_t *pip)
4043 {
4044 	if (pip == NULL)
4045 		return (NULL);
4046 	return (mdi_pi_pathname_by_instance(mdi_pi_get_path_instance(pip)));
4047 }
4048 
4049 /*
4050  * mdi_pi_spathname():
4051  *		Return pointer to shortpath to pathinfo node. Used for debug
4052  *		messages, so return "" instead of NULL when unknown.
4053  */
4054 char *
4055 mdi_pi_spathname(mdi_pathinfo_t *pip)
4056 {
4057 	char	*spath = "";
4058 
4059 	if (pip) {
4060 		spath = mdi_pi_spathname_by_instance(
4061 		    mdi_pi_get_path_instance(pip));
4062 		if (spath == NULL)
4063 			spath = "";
4064 	}
4065 	return (spath);
4066 }
4067 
4068 char *
4069 mdi_pi_pathname_obp(mdi_pathinfo_t *pip, char *path)
4070 {
4071 	char *obp_path = NULL;
4072 	if ((pip == NULL) || (path == NULL))
4073 		return (NULL);
4074 
4075 	if (mdi_prop_lookup_string(pip, "obp-path", &obp_path) == MDI_SUCCESS) {
4076 		(void) strcpy(path, obp_path);
4077 		(void) mdi_prop_free(obp_path);
4078 	} else {
4079 		path = NULL;
4080 	}
4081 	return (path);
4082 }
4083 
4084 int
4085 mdi_pi_pathname_obp_set(mdi_pathinfo_t *pip, char *component)
4086 {
4087 	dev_info_t *pdip;
4088 	char *obp_path = NULL;
4089 	int rc = MDI_FAILURE;
4090 
4091 	if (pip == NULL)
4092 		return (MDI_FAILURE);
4093 
4094 	pdip = mdi_pi_get_phci(pip);
4095 	if (pdip == NULL)
4096 		return (MDI_FAILURE);
4097 
4098 	obp_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
4099 
4100 	if (ddi_pathname_obp(pdip, obp_path) == NULL) {
4101 		(void) ddi_pathname(pdip, obp_path);
4102 	}
4103 
4104 	if (component) {
4105 		(void) strncat(obp_path, "/", MAXPATHLEN);
4106 		(void) strncat(obp_path, component, MAXPATHLEN);
4107 	}
4108 	rc = mdi_prop_update_string(pip, "obp-path", obp_path);
4109 
4110 	if (obp_path)
4111 		kmem_free(obp_path, MAXPATHLEN);
4112 	return (rc);
4113 }
4114 
4115 /*
4116  * mdi_pi_get_client():
4117  *		Get the client devinfo associated with a mdi_pathinfo node
4118  *
4119  * Return Values:
4120  *		Handle to client device dev_info node
4121  */
4122 dev_info_t *
4123 mdi_pi_get_client(mdi_pathinfo_t *pip)
4124 {
4125 	dev_info_t	*dip = NULL;
4126 	if (pip) {
4127 		dip = MDI_PI(pip)->pi_client->ct_dip;
4128 	}
4129 	return (dip);
4130 }
4131 
4132 /*
4133  * mdi_pi_get_phci():
4134  *		Get the pHCI devinfo associated with the mdi_pathinfo node
4135  * Return Values:
4136  *		Handle to dev_info node
4137  */
4138 dev_info_t *
4139 mdi_pi_get_phci(mdi_pathinfo_t *pip)
4140 {
4141 	dev_info_t	*dip = NULL;
4142 	mdi_phci_t	*ph;
4143 
4144 	if (pip) {
4145 		ph = MDI_PI(pip)->pi_phci;
4146 		if (ph)
4147 			dip = ph->ph_dip;
4148 	}
4149 	return (dip);
4150 }
4151 
4152 /*
4153  * mdi_pi_get_client_private():
4154  *		Get the client private information associated with the
4155  *		mdi_pathinfo node
4156  */
4157 void *
4158 mdi_pi_get_client_private(mdi_pathinfo_t *pip)
4159 {
4160 	void *cprivate = NULL;
4161 	if (pip) {
4162 		cprivate = MDI_PI(pip)->pi_cprivate;
4163 	}
4164 	return (cprivate);
4165 }
4166 
4167 /*
4168  * mdi_pi_set_client_private():
4169  *		Set the client private information in the mdi_pathinfo node
4170  */
4171 void
4172 mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
4173 {
4174 	if (pip) {
4175 		MDI_PI(pip)->pi_cprivate = priv;
4176 	}
4177 }
4178 
4179 /*
4180  * mdi_pi_get_phci_private():
4181  *		Get the pHCI private information associated with the
4182  *		mdi_pathinfo node
4183  */
4184 caddr_t
4185 mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
4186 {
4187 	caddr_t	pprivate = NULL;
4188 
4189 	if (pip) {
4190 		pprivate = MDI_PI(pip)->pi_pprivate;
4191 	}
4192 	return (pprivate);
4193 }
4194 
4195 /*
4196  * mdi_pi_set_phci_private():
4197  *		Set the pHCI private information in the mdi_pathinfo node
4198  */
4199 void
4200 mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
4201 {
4202 	if (pip) {
4203 		MDI_PI(pip)->pi_pprivate = priv;
4204 	}
4205 }
4206 
4207 /*
4208  * mdi_pi_get_state():
4209  *		Get the mdi_pathinfo node state. Transient states are internal
4210  *		and not provided to the users
4211  */
4212 mdi_pathinfo_state_t
4213 mdi_pi_get_state(mdi_pathinfo_t *pip)
4214 {
4215 	mdi_pathinfo_state_t    state = MDI_PATHINFO_STATE_INIT;
4216 
4217 	if (pip) {
4218 		if (MDI_PI_IS_TRANSIENT(pip)) {
4219 			/*
4220 			 * mdi_pathinfo is in state transition.  Return the
4221 			 * last good state.
4222 			 */
4223 			state = MDI_PI_OLD_STATE(pip);
4224 		} else {
4225 			state = MDI_PI_STATE(pip);
4226 		}
4227 	}
4228 	return (state);
4229 }
4230 
4231 /*
4232  * mdi_pi_get_flags():
4233  *		Get the mdi_pathinfo node flags.
4234  */
4235 uint_t
4236 mdi_pi_get_flags(mdi_pathinfo_t *pip)
4237 {
4238 	return (pip ? MDI_PI(pip)->pi_flags : 0);
4239 }
4240 
4241 /*
4242  * Note that the following function needs to be the new interface for
4243  * mdi_pi_get_state when mpxio gets integrated to ON.
4244  */
4245 int
4246 mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
4247 		uint32_t *ext_state)
4248 {
4249 	*state = MDI_PATHINFO_STATE_INIT;
4250 
4251 	if (pip) {
4252 		if (MDI_PI_IS_TRANSIENT(pip)) {
4253 			/*
4254 			 * mdi_pathinfo is in state transition.  Return the
4255 			 * last good state.
4256 			 */
4257 			*state = MDI_PI_OLD_STATE(pip);
4258 			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
4259 		} else {
4260 			*state = MDI_PI_STATE(pip);
4261 			*ext_state = MDI_PI_EXT_STATE(pip);
4262 		}
4263 	}
4264 	return (MDI_SUCCESS);
4265 }
4266 
4267 /*
4268  * mdi_pi_get_preferred:
4269  *	Get the preferred path flag
4270  */
4271 int
4272 mdi_pi_get_preferred(mdi_pathinfo_t *pip)
4273 {
4274 	if (pip) {
4275 		return (MDI_PI(pip)->pi_preferred);
4276 	}
4277 	return (0);
4278 }
4279 
4280 /*
4281  * mdi_pi_set_preferred:
4282  *	Set the preferred path flag
4283  */
4284 void
4285 mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
4286 {
4287 	if (pip) {
4288 		MDI_PI(pip)->pi_preferred = preferred;
4289 	}
4290 }
4291 
4292 /*
4293  * mdi_pi_set_state():
4294  *		Set the mdi_pathinfo node state
4295  */
4296 void
4297 mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
4298 {
4299 	uint32_t	ext_state;
4300 
4301 	if (pip) {
4302 		ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
4303 		MDI_PI(pip)->pi_state = state;
4304 		MDI_PI(pip)->pi_state |= ext_state;
4305 
4306 		/* Path has changed state, invalidate DINFOCACHE snap shot. */
4307 		i_ddi_di_cache_invalidate();
4308 	}
4309 }
4310 
4311 /*
4312  * Property functions:
4313  */
4314 int
4315 i_map_nvlist_error_to_mdi(int val)
4316 {
4317 	int rv;
4318 
4319 	switch (val) {
4320 	case 0:
4321 		rv = DDI_PROP_SUCCESS;
4322 		break;
4323 	case EINVAL:
4324 	case ENOTSUP:
4325 		rv = DDI_PROP_INVAL_ARG;
4326 		break;
4327 	case ENOMEM:
4328 		rv = DDI_PROP_NO_MEMORY;
4329 		break;
4330 	default:
4331 		rv = DDI_PROP_NOT_FOUND;
4332 		break;
4333 	}
4334 	return (rv);
4335 }
4336 
4337 /*
4338  * mdi_pi_get_next_prop():
4339  * 		Property walk function.  The caller should hold mdi_pi_lock()
4340  *		and release by calling mdi_pi_unlock() at the end of walk to
4341  *		get a consistent value.
4342  */
4343 nvpair_t *
4344 mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
4345 {
4346 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4347 		return (NULL);
4348 	}
4349 	ASSERT(MDI_PI_LOCKED(pip));
4350 	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
4351 }
4352 
4353 /*
4354  * mdi_prop_remove():
4355  * 		Remove the named property from the named list.
4356  */
4357 int
4358 mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
4359 {
4360 	if (pip == NULL) {
4361 		return (DDI_PROP_NOT_FOUND);
4362 	}
4363 	ASSERT(!MDI_PI_LOCKED(pip));
4364 	MDI_PI_LOCK(pip);
4365 	if (MDI_PI(pip)->pi_prop == NULL) {
4366 		MDI_PI_UNLOCK(pip);
4367 		return (DDI_PROP_NOT_FOUND);
4368 	}
4369 	if (name) {
4370 		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
4371 	} else {
4372 		char		nvp_name[MAXNAMELEN];
4373 		nvpair_t	*nvp;
4374 		nvp = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
4375 		while (nvp) {
4376 			nvpair_t	*next;
4377 			next = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, nvp);
4378 			(void) snprintf(nvp_name, sizeof(nvp_name), "%s",
4379 			    nvpair_name(nvp));
4380 			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
4381 			    nvp_name);
4382 			nvp = next;
4383 		}
4384 	}
4385 	MDI_PI_UNLOCK(pip);
4386 	return (DDI_PROP_SUCCESS);
4387 }
4388 
4389 /*
4390  * mdi_prop_size():
4391  * 		Get buffer size needed to pack the property data.
4392  * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
4393  *		buffer size.
4394  */
4395 int
4396 mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
4397 {
4398 	int	rv;
4399 	size_t	bufsize;
4400 
4401 	*buflenp = 0;
4402 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4403 		return (DDI_PROP_NOT_FOUND);
4404 	}
4405 	ASSERT(MDI_PI_LOCKED(pip));
4406 	rv = nvlist_size(MDI_PI(pip)->pi_prop,
4407 	    &bufsize, NV_ENCODE_NATIVE);
4408 	*buflenp = bufsize;
4409 	return (i_map_nvlist_error_to_mdi(rv));
4410 }
4411 
4412 /*
4413  * mdi_prop_pack():
4414  * 		pack the property list.  The caller should hold the
4415  *		mdi_pathinfo_t node to get a consistent data
4416  */
4417 int
4418 mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
4419 {
4420 	int	rv;
4421 	size_t	bufsize;
4422 
4423 	if ((pip == NULL) || MDI_PI(pip)->pi_prop == NULL) {
4424 		return (DDI_PROP_NOT_FOUND);
4425 	}
4426 
4427 	ASSERT(MDI_PI_LOCKED(pip));
4428 
4429 	bufsize = buflen;
4430 	rv = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, (size_t *)&bufsize,
4431 	    NV_ENCODE_NATIVE, KM_SLEEP);
4432 
4433 	return (i_map_nvlist_error_to_mdi(rv));
4434 }
4435 
4436 /*
4437  * mdi_prop_update_byte():
4438  *		Create/Update a byte property
4439  */
4440 int
4441 mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
4442 {
4443 	int rv;
4444 
4445 	if (pip == NULL) {
4446 		return (DDI_PROP_INVAL_ARG);
4447 	}
4448 	ASSERT(!MDI_PI_LOCKED(pip));
4449 	MDI_PI_LOCK(pip);
4450 	if (MDI_PI(pip)->pi_prop == NULL) {
4451 		MDI_PI_UNLOCK(pip);
4452 		return (DDI_PROP_NOT_FOUND);
4453 	}
4454 	rv = nvlist_add_byte(MDI_PI(pip)->pi_prop, name, data);
4455 	MDI_PI_UNLOCK(pip);
4456 	return (i_map_nvlist_error_to_mdi(rv));
4457 }
4458 
4459 /*
4460  * mdi_prop_update_byte_array():
4461  *		Create/Update a byte array property
4462  */
4463 int
4464 mdi_prop_update_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t *data,
4465     uint_t nelements)
4466 {
4467 	int rv;
4468 
4469 	if (pip == NULL) {
4470 		return (DDI_PROP_INVAL_ARG);
4471 	}
4472 	ASSERT(!MDI_PI_LOCKED(pip));
4473 	MDI_PI_LOCK(pip);
4474 	if (MDI_PI(pip)->pi_prop == NULL) {
4475 		MDI_PI_UNLOCK(pip);
4476 		return (DDI_PROP_NOT_FOUND);
4477 	}
4478 	rv = nvlist_add_byte_array(MDI_PI(pip)->pi_prop, name, data, nelements);
4479 	MDI_PI_UNLOCK(pip);
4480 	return (i_map_nvlist_error_to_mdi(rv));
4481 }
4482 
4483 /*
4484  * mdi_prop_update_int():
4485  *		Create/Update a 32 bit integer property
4486  */
4487 int
4488 mdi_prop_update_int(mdi_pathinfo_t *pip, char *name, int data)
4489 {
4490 	int rv;
4491 
4492 	if (pip == NULL) {
4493 		return (DDI_PROP_INVAL_ARG);
4494 	}
4495 	ASSERT(!MDI_PI_LOCKED(pip));
4496 	MDI_PI_LOCK(pip);
4497 	if (MDI_PI(pip)->pi_prop == NULL) {
4498 		MDI_PI_UNLOCK(pip);
4499 		return (DDI_PROP_NOT_FOUND);
4500 	}
4501 	rv = nvlist_add_int32(MDI_PI(pip)->pi_prop, name, (int32_t)data);
4502 	MDI_PI_UNLOCK(pip);
4503 	return (i_map_nvlist_error_to_mdi(rv));
4504 }
4505 
4506 /*
4507  * mdi_prop_update_int64():
4508  *		Create/Update a 64 bit integer property
4509  */
4510 int
4511 mdi_prop_update_int64(mdi_pathinfo_t *pip, char *name, int64_t data)
4512 {
4513 	int rv;
4514 
4515 	if (pip == NULL) {
4516 		return (DDI_PROP_INVAL_ARG);
4517 	}
4518 	ASSERT(!MDI_PI_LOCKED(pip));
4519 	MDI_PI_LOCK(pip);
4520 	if (MDI_PI(pip)->pi_prop == NULL) {
4521 		MDI_PI_UNLOCK(pip);
4522 		return (DDI_PROP_NOT_FOUND);
4523 	}
4524 	rv = nvlist_add_int64(MDI_PI(pip)->pi_prop, name, data);
4525 	MDI_PI_UNLOCK(pip);
4526 	return (i_map_nvlist_error_to_mdi(rv));
4527 }
4528 
4529 /*
4530  * mdi_prop_update_int_array():
4531  *		Create/Update a int array property
4532  */
4533 int
4534 mdi_prop_update_int_array(mdi_pathinfo_t *pip, char *name, int *data,
4535 	    uint_t nelements)
4536 {
4537 	int rv;
4538 
4539 	if (pip == NULL) {
4540 		return (DDI_PROP_INVAL_ARG);
4541 	}
4542 	ASSERT(!MDI_PI_LOCKED(pip));
4543 	MDI_PI_LOCK(pip);
4544 	if (MDI_PI(pip)->pi_prop == NULL) {
4545 		MDI_PI_UNLOCK(pip);
4546 		return (DDI_PROP_NOT_FOUND);
4547 	}
4548 	rv = nvlist_add_int32_array(MDI_PI(pip)->pi_prop, name, (int32_t *)data,
4549 	    nelements);
4550 	MDI_PI_UNLOCK(pip);
4551 	return (i_map_nvlist_error_to_mdi(rv));
4552 }
4553 
4554 /*
4555  * mdi_prop_update_string():
4556  *		Create/Update a string property
4557  */
4558 int
4559 mdi_prop_update_string(mdi_pathinfo_t *pip, char *name, char *data)
4560 {
4561 	int rv;
4562 
4563 	if (pip == NULL) {
4564 		return (DDI_PROP_INVAL_ARG);
4565 	}
4566 	ASSERT(!MDI_PI_LOCKED(pip));
4567 	MDI_PI_LOCK(pip);
4568 	if (MDI_PI(pip)->pi_prop == NULL) {
4569 		MDI_PI_UNLOCK(pip);
4570 		return (DDI_PROP_NOT_FOUND);
4571 	}
4572 	rv = nvlist_add_string(MDI_PI(pip)->pi_prop, name, data);
4573 	MDI_PI_UNLOCK(pip);
4574 	return (i_map_nvlist_error_to_mdi(rv));
4575 }
4576 
4577 /*
4578  * mdi_prop_update_string_array():
4579  *		Create/Update a string array property
4580  */
4581 int
4582 mdi_prop_update_string_array(mdi_pathinfo_t *pip, char *name, char **data,
4583     uint_t nelements)
4584 {
4585 	int rv;
4586 
4587 	if (pip == NULL) {
4588 		return (DDI_PROP_INVAL_ARG);
4589 	}
4590 	ASSERT(!MDI_PI_LOCKED(pip));
4591 	MDI_PI_LOCK(pip);
4592 	if (MDI_PI(pip)->pi_prop == NULL) {
4593 		MDI_PI_UNLOCK(pip);
4594 		return (DDI_PROP_NOT_FOUND);
4595 	}
4596 	rv = nvlist_add_string_array(MDI_PI(pip)->pi_prop, name, data,
4597 	    nelements);
4598 	MDI_PI_UNLOCK(pip);
4599 	return (i_map_nvlist_error_to_mdi(rv));
4600 }
4601 
4602 /*
4603  * mdi_prop_lookup_byte():
4604  * 		Look for byte property identified by name.  The data returned
4605  *		is the actual property and valid as long as mdi_pathinfo_t node
4606  *		is alive.
4607  */
4608 int
4609 mdi_prop_lookup_byte(mdi_pathinfo_t *pip, char *name, uchar_t *data)
4610 {
4611 	int rv;
4612 
4613 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4614 		return (DDI_PROP_NOT_FOUND);
4615 	}
4616 	rv = nvlist_lookup_byte(MDI_PI(pip)->pi_prop, name, data);
4617 	return (i_map_nvlist_error_to_mdi(rv));
4618 }
4619 
4620 
4621 /*
4622  * mdi_prop_lookup_byte_array():
4623  * 		Look for byte array property identified by name.  The data
4624  *		returned is the actual property and valid as long as
4625  *		mdi_pathinfo_t node is alive.
4626  */
4627 int
4628 mdi_prop_lookup_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t **data,
4629     uint_t *nelements)
4630 {
4631 	int rv;
4632 
4633 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4634 		return (DDI_PROP_NOT_FOUND);
4635 	}
4636 	rv = nvlist_lookup_byte_array(MDI_PI(pip)->pi_prop, name, data,
4637 	    nelements);
4638 	return (i_map_nvlist_error_to_mdi(rv));
4639 }
4640 
4641 /*
4642  * mdi_prop_lookup_int():
4643  * 		Look for int property identified by name.  The data returned
4644  *		is the actual property and valid as long as mdi_pathinfo_t
4645  *		node is alive.
4646  */
4647 int
4648 mdi_prop_lookup_int(mdi_pathinfo_t *pip, char *name, int *data)
4649 {
4650 	int rv;
4651 
4652 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4653 		return (DDI_PROP_NOT_FOUND);
4654 	}
4655 	rv = nvlist_lookup_int32(MDI_PI(pip)->pi_prop, name, (int32_t *)data);
4656 	return (i_map_nvlist_error_to_mdi(rv));
4657 }
4658 
4659 /*
4660  * mdi_prop_lookup_int64():
4661  * 		Look for int64 property identified by name.  The data returned
4662  *		is the actual property and valid as long as mdi_pathinfo_t node
4663  *		is alive.
4664  */
4665 int
4666 mdi_prop_lookup_int64(mdi_pathinfo_t *pip, char *name, int64_t *data)
4667 {
4668 	int rv;
4669 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4670 		return (DDI_PROP_NOT_FOUND);
4671 	}
4672 	rv = nvlist_lookup_int64(MDI_PI(pip)->pi_prop, name, data);
4673 	return (i_map_nvlist_error_to_mdi(rv));
4674 }
4675 
4676 /*
4677  * mdi_prop_lookup_int_array():
4678  * 		Look for int array property identified by name.  The data
4679  *		returned is the actual property and valid as long as
4680  *		mdi_pathinfo_t node is alive.
4681  */
4682 int
4683 mdi_prop_lookup_int_array(mdi_pathinfo_t *pip, char *name, int **data,
4684     uint_t *nelements)
4685 {
4686 	int rv;
4687 
4688 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4689 		return (DDI_PROP_NOT_FOUND);
4690 	}
4691 	rv = nvlist_lookup_int32_array(MDI_PI(pip)->pi_prop, name,
4692 	    (int32_t **)data, nelements);
4693 	return (i_map_nvlist_error_to_mdi(rv));
4694 }
4695 
4696 /*
4697  * mdi_prop_lookup_string():
4698  * 		Look for string property identified by name.  The data
4699  *		returned is the actual property and valid as long as
4700  *		mdi_pathinfo_t node is alive.
4701  */
4702 int
4703 mdi_prop_lookup_string(mdi_pathinfo_t *pip, char *name, char **data)
4704 {
4705 	int rv;
4706 
4707 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4708 		return (DDI_PROP_NOT_FOUND);
4709 	}
4710 	rv = nvlist_lookup_string(MDI_PI(pip)->pi_prop, name, data);
4711 	return (i_map_nvlist_error_to_mdi(rv));
4712 }
4713 
4714 /*
4715  * mdi_prop_lookup_string_array():
4716  * 		Look for string array property identified by name.  The data
4717  *		returned is the actual property and valid as long as
4718  *		mdi_pathinfo_t node is alive.
4719  */
4720 int
4721 mdi_prop_lookup_string_array(mdi_pathinfo_t *pip, char *name, char ***data,
4722     uint_t *nelements)
4723 {
4724 	int rv;
4725 
4726 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4727 		return (DDI_PROP_NOT_FOUND);
4728 	}
4729 	rv = nvlist_lookup_string_array(MDI_PI(pip)->pi_prop, name, data,
4730 	    nelements);
4731 	return (i_map_nvlist_error_to_mdi(rv));
4732 }
4733 
4734 /*
4735  * mdi_prop_free():
4736  * 		Symmetrical function to ddi_prop_free(). nvlist_lookup_xx()
4737  *		functions return the pointer to actual property data and not a
4738  *		copy of it.  So the data returned is valid as long as
4739  *		mdi_pathinfo_t node is valid.
4740  */
4741 /*ARGSUSED*/
4742 int
4743 mdi_prop_free(void *data)
4744 {
4745 	return (DDI_PROP_SUCCESS);
4746 }
4747 
4748 /*ARGSUSED*/
4749 static void
4750 i_mdi_report_path_state(mdi_client_t *ct, mdi_pathinfo_t *pip)
4751 {
4752 	char		*ct_path;
4753 	char		*ct_status;
4754 	char		*status;
4755 	dev_info_t	*cdip = ct->ct_dip;
4756 	char		lb_buf[64];
4757 	int		report_lb_c = 0, report_lb_p = 0;
4758 
4759 	ASSERT(MDI_CLIENT_LOCKED(ct));
4760 	if ((cdip == NULL) || (ddi_get_instance(cdip) == -1) ||
4761 	    (MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) == 0)) {
4762 		return;
4763 	}
4764 	if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL) {
4765 		ct_status = "optimal";
4766 		report_lb_c = 1;
4767 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
4768 		ct_status = "degraded";
4769 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
4770 		ct_status = "failed";
4771 	} else {
4772 		ct_status = "unknown";
4773 	}
4774 
4775 	lb_buf[0] = 0;		/* not interested in load balancing config */
4776 
4777 	if (MDI_PI_FLAGS_IS_DEVICE_REMOVED(pip)) {
4778 		status = "removed";
4779 	} else if (MDI_PI_IS_OFFLINE(pip)) {
4780 		status = "offline";
4781 	} else if (MDI_PI_IS_ONLINE(pip)) {
4782 		status = "online";
4783 		report_lb_p = 1;
4784 	} else if (MDI_PI_IS_STANDBY(pip)) {
4785 		status = "standby";
4786 	} else if (MDI_PI_IS_FAULT(pip)) {
4787 		status = "faulted";
4788 	} else {
4789 		status = "unknown";
4790 	}
4791 
4792 	if (cdip) {
4793 		ct_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4794 
4795 		/*
4796 		 * NOTE: Keeping "multipath status: %s" and
4797 		 * "Load balancing: %s" format unchanged in case someone
4798 		 * scrubs /var/adm/messages looking for these messages.
4799 		 */
4800 		if (report_lb_c && report_lb_p) {
4801 			if (ct->ct_lb == LOAD_BALANCE_LBA) {
4802 				(void) snprintf(lb_buf, sizeof (lb_buf),
4803 				    "%s, region-size: %d", mdi_load_balance_lba,
4804 				    ct->ct_lb_args->region_size);
4805 			} else if (ct->ct_lb == LOAD_BALANCE_NONE) {
4806 				(void) snprintf(lb_buf, sizeof (lb_buf),
4807 				    "%s", mdi_load_balance_none);
4808 			} else {
4809 				(void) snprintf(lb_buf, sizeof (lb_buf), "%s",
4810 				    mdi_load_balance_rr);
4811 			}
4812 
4813 			cmn_err(mdi_debug_consoleonly ? CE_NOTE : CE_CONT,
4814 			    "?%s (%s%d) multipath status: %s: "
4815 			    "path %d %s is %s: Load balancing: %s\n",
4816 			    ddi_pathname(cdip, ct_path), ddi_driver_name(cdip),
4817 			    ddi_get_instance(cdip), ct_status,
4818 			    mdi_pi_get_path_instance(pip),
4819 			    mdi_pi_spathname(pip), status, lb_buf);
4820 		} else {
4821 			cmn_err(mdi_debug_consoleonly ? CE_NOTE : CE_CONT,
4822 			    "?%s (%s%d) multipath status: %s: "
4823 			    "path %d %s is %s\n",
4824 			    ddi_pathname(cdip, ct_path), ddi_driver_name(cdip),
4825 			    ddi_get_instance(cdip), ct_status,
4826 			    mdi_pi_get_path_instance(pip),
4827 			    mdi_pi_spathname(pip), status);
4828 		}
4829 
4830 		kmem_free(ct_path, MAXPATHLEN);
4831 		MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct);
4832 	}
4833 }
4834 
4835 #ifdef	DEBUG
/*
 * i_mdi_log():
 *		Utility function for error message management
 *
 *		Formats the caller's message, prefixes it with "mdi: ", the
 *		driver name/instance of dip (when dip is non-NULL) and the
 *		calling function name, and hands it to cmn_err().
 *
 *		level	CE_* severity; CE_NOTE is downgraded to CE_CONT
 *		func	name of the calling function, printed in the prefix
 *		dip	optional dev_info node used for the "<drv><inst>: " tag
 *		fmt	printf-style format; a leading '!', '?' or '^' is
 *			stripped and re-applied in front of the "mdi: " prefix
 *
 *		NOTE: Implementation takes care of trailing \n for cmn_err,
 *		MDI_DEBUG should not terminate fmt strings with \n.
 *
 *		NOTE: If the level is >= 2, and there is no leading !?^
 *		then a leading ! is implied (but can be overriden via
 *		mdi_debug_consoleonly). If you are using kmdb on the console,
 *		consider setting mdi_debug_consoleonly to 1 as an aid.
 */
/*PRINTFLIKE4*/
static void
i_mdi_log(int level, const char *func, dev_info_t *dip, const char *fmt, ...)
{
	char		name[MAXNAMELEN];
	char		buf[512];
	char		*bp;
	va_list		ap;
	int		log_only = 0;
	int		boot_only = 0;
	int		console_only = 0;

	/* Build the "<driver><instance>: " tag, or leave it empty. */
	if (dip) {
		(void) snprintf(name, sizeof(name), "%s%d: ",
		    ddi_driver_name(dip), ddi_get_instance(dip));
	} else {
		name[0] = 0;
	}

	/* Expand the caller's message; output is bounded by sizeof (buf). */
	va_start(ap, fmt);
	(void) vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	/*
	 * Peel off a leading destination character so it can be re-emitted
	 * ahead of the "mdi: " prefix below; bp points at the message proper.
	 */
	switch (buf[0]) {
	case '!':
		bp = &buf[1];
		log_only = 1;
		break;
	case '?':
		bp = &buf[1];
		boot_only = 1;
		break;
	case '^':
		bp = &buf[1];
		console_only = 1;
		break;
	default:
		if (level >= 2)
			log_only = 1;		/* ! implied */
		bp = buf;
		break;
	}
	/* Global overrides; consoleonly is tested last and so wins. */
	if (mdi_debug_logonly) {
		log_only = 1;
		boot_only = 0;
		console_only = 0;
	}
	if (mdi_debug_consoleonly) {
		log_only = 0;
		boot_only = 0;
		console_only = 1;
		level = CE_NOTE;
		/* Jump straight to the no-newline variants below. */
		goto console;
	}

	switch (level) {
	case CE_NOTE:
		level = CE_CONT;
		/* FALLTHROUGH */
	case CE_CONT:
		/* These variants append the trailing newline themselves. */
		if (boot_only) {
			cmn_err(level, "?mdi: %s%s: %s\n", name, func, bp);
		} else if (console_only) {
			cmn_err(level, "^mdi: %s%s: %s\n", name, func, bp);
		} else if (log_only) {
			cmn_err(level, "!mdi: %s%s: %s\n", name, func, bp);
		} else {
			cmn_err(level, "mdi: %s%s: %s\n", name, func, bp);
		}
		break;

	case CE_WARN:
	case CE_PANIC:
	console:
		if (boot_only) {
			cmn_err(level, "?mdi: %s%s: %s", name, func, bp);
		} else if (console_only) {
			cmn_err(level, "^mdi: %s%s: %s", name, func, bp);
		} else if (log_only) {
			cmn_err(level, "!mdi: %s%s: %s", name, func, bp);
		} else {
			cmn_err(level, "mdi: %s%s: %s", name, func, bp);
		}
		break;
	default:
		/* Any other level: no destination prefix and no func name. */
		cmn_err(level, "mdi: %s%s", name, bp);
		break;
	}
}
4937 #endif	/* DEBUG */
4938 
4939 void
4940 i_mdi_client_online(dev_info_t *ct_dip)
4941 {
4942 	mdi_client_t	*ct;
4943 
4944 	/*
4945 	 * Client online notification. Mark client state as online
4946 	 * restore our binding with dev_info node
4947 	 */
4948 	ct = i_devi_get_client(ct_dip);
4949 	ASSERT(ct != NULL);
4950 	MDI_CLIENT_LOCK(ct);
4951 	MDI_CLIENT_SET_ONLINE(ct);
4952 	/* catch for any memory leaks */
4953 	ASSERT((ct->ct_dip == NULL) || (ct->ct_dip == ct_dip));
4954 	ct->ct_dip = ct_dip;
4955 
4956 	if (ct->ct_power_cnt == 0)
4957 		(void) i_mdi_power_all_phci(ct);
4958 
4959 	MDI_DEBUG(4, (MDI_NOTE, ct_dip,
4960 	    "i_mdi_pm_hold_client %p", (void *)ct));
4961 	i_mdi_pm_hold_client(ct, 1);
4962 
4963 	MDI_CLIENT_UNLOCK(ct);
4964 }
4965 
4966 void
4967 i_mdi_phci_online(dev_info_t *ph_dip)
4968 {
4969 	mdi_phci_t	*ph;
4970 
4971 	/* pHCI online notification. Mark state accordingly */
4972 	ph = i_devi_get_phci(ph_dip);
4973 	ASSERT(ph != NULL);
4974 	MDI_PHCI_LOCK(ph);
4975 	MDI_PHCI_SET_ONLINE(ph);
4976 	MDI_PHCI_UNLOCK(ph);
4977 }
4978 
4979 /*
4980  * mdi_devi_online():
4981  * 		Online notification from NDI framework on pHCI/client
4982  *		device online.
4983  * Return Values:
4984  *		NDI_SUCCESS
4985  *		MDI_FAILURE
4986  */
4987 /*ARGSUSED*/
4988 int
4989 mdi_devi_online(dev_info_t *dip, uint_t flags)
4990 {
4991 	if (MDI_PHCI(dip)) {
4992 		i_mdi_phci_online(dip);
4993 	}
4994 
4995 	if (MDI_CLIENT(dip)) {
4996 		i_mdi_client_online(dip);
4997 	}
4998 	return (NDI_SUCCESS);
4999 }
5000 
5001 /*
5002  * mdi_devi_offline():
5003  * 		Offline notification from NDI framework on pHCI/Client device
5004  *		offline.
5005  *
5006  * Return Values:
5007  *		NDI_SUCCESS
5008  *		NDI_FAILURE
5009  */
5010 /*ARGSUSED*/
5011 int
5012 mdi_devi_offline(dev_info_t *dip, uint_t flags)
5013 {
5014 	int		rv = NDI_SUCCESS;
5015 
5016 	if (MDI_CLIENT(dip)) {
5017 		rv = i_mdi_client_offline(dip, flags);
5018 		if (rv != NDI_SUCCESS)
5019 			return (rv);
5020 	}
5021 
5022 	if (MDI_PHCI(dip)) {
5023 		rv = i_mdi_phci_offline(dip, flags);
5024 
5025 		if ((rv != NDI_SUCCESS) && MDI_CLIENT(dip)) {
5026 			/* set client back online */
5027 			i_mdi_client_online(dip);
5028 		}
5029 	}
5030 
5031 	return (rv);
5032 }
5033 
/*
 * i_mdi_phci_offline():
 *		Try to take the pHCI attached to dip offline, together with
 *		all of its mdi_pathinfo nodes.
 *
 *		The work is done in phases:
 *		  1. Refuse if the pHCI or any of its clients is unstable or
 *		     failing over.  For every client whose computed state
 *		     without this pHCI would be FAILED, offline the client
 *		     node first (with the pHCI lock dropped).
 *		  2. If one of those client offlines fails, walk the paths
 *		     visited so far and re-online/offline their clients
 *		     according to the current client state, then give up.
 *		  3. Otherwise mark the pHCI offline, flag every path as
 *		     offlining, pause briefly, and offline each path.
 *
 * Return Values:
 *		NDI_SUCCESS	no pHCI for dip, pHCI already offline, or all
 *				paths were offlined
 *		NDI_BUSY	pHCI/client in transient state, failover in
 *				progress, a client could not be offlined, or a
 *				path did not reach the offline state
 */
/*ARGSUSED*/
static int
i_mdi_phci_offline(dev_info_t *dip, uint_t flags)
{
	int		rv = NDI_SUCCESS;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	mdi_pathinfo_t	*failed_pip = NULL;
	dev_info_t	*cdip;

	/*
	 * pHCI component offline notification
	 * Make sure that this pHCI instance is free to be offlined.
	 * If it is OK to proceed, Offline and remove all the child
	 * mdi_pathinfo nodes.  This process automatically offlines
	 * corresponding client devices, for which this pHCI provides
	 * critical services.
	 */
	ph = i_devi_get_phci(dip);
	MDI_DEBUG(2, (MDI_NOTE, dip,
	    "called %p %p", (void *)dip, (void *)ph));
	if (ph == NULL) {
		return (rv);
	}

	MDI_PHCI_LOCK(ph);

	if (MDI_PHCI_IS_OFFLINE(ph)) {
		MDI_DEBUG(1, (MDI_WARN, dip,
		    "!pHCI already offlined: %p", (void *)dip));
		MDI_PHCI_UNLOCK(ph);
		return (NDI_SUCCESS);
	}

	/*
	 * Check to see if the pHCI can be offlined
	 */
	if (ph->ph_unstable) {
		MDI_DEBUG(1, (MDI_WARN, dip,
		    "!One or more target devices are in transient state. "
		    "This device can not be removed at this moment. "
		    "Please try again later."));
		MDI_PHCI_UNLOCK(ph);
		return (NDI_BUSY);
	}

	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		/* Sample the successor now; the pHCI lock may be dropped below. */
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;

		/*
		 * The mdi_pathinfo state is OK. Check the client state.
		 * If failover in progress fail the pHCI from offlining
		 */
		ct = MDI_PI(pip)->pi_client;
		i_mdi_client_lock(ct, pip);
		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
		    (ct->ct_unstable)) {
			/*
			 * Failover is in progress, Fail the DR
			 */
			MDI_DEBUG(1, (MDI_WARN, dip,
			    "!pHCI device is busy. "
			    "This device can not be removed at this moment. "
			    "Please try again later."));
			MDI_PI_UNLOCK(pip);
			i_mdi_client_unlock(ct);
			MDI_PHCI_UNLOCK(ph);
			return (NDI_BUSY);
		}
		MDI_PI_UNLOCK(pip);

		/*
		 * Check to see of we are removing the last path of this
		 * client device...
		 */
		cdip = ct->ct_dip;
		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
		    (i_mdi_client_compute_state(ct, ph) ==
		    MDI_CLIENT_STATE_FAILED)) {
			/* Drop both locks before calling into the NDI layer. */
			i_mdi_client_unlock(ct);
			MDI_PHCI_UNLOCK(ph);
			if (ndi_devi_offline(cdip,
			    NDI_DEVFS_CLEAN) != NDI_SUCCESS) {
				/*
				 * ndi_devi_offline() failed.
				 * This pHCI provides the critical path
				 * to one or more client devices.
				 * Return busy.
				 */
				MDI_PHCI_LOCK(ph);
				MDI_DEBUG(1, (MDI_WARN, dip,
				    "!pHCI device is busy. "
				    "This device can not be removed at this "
				    "moment. Please try again later."));
				/* Remember where to stop the rollback walk. */
				failed_pip = pip;
				break;
			} else {
				MDI_PHCI_LOCK(ph);
				pip = next;
			}
		} else {
			i_mdi_client_unlock(ct);
			pip = next;
		}
	}

	/*
	 * Rollback: revisit every path ahead of the one that failed and
	 * bring its client node in line with the current client state.
	 */
	if (failed_pip) {
		pip = ph->ph_path_head;
		while (pip != failed_pip) {
			MDI_PI_LOCK(pip);
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
			ct = MDI_PI(pip)->pi_client;
			i_mdi_client_lock(ct, pip);
			cdip = ct->ct_dip;
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				if (cdip) {
					MDI_PI_UNLOCK(pip);
					i_mdi_client_unlock(ct);
					MDI_PHCI_UNLOCK(ph);
					(void) ndi_devi_online(cdip, 0);
					MDI_PHCI_LOCK(ph);
					pip = next;
					continue;
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				if (cdip) {
					MDI_PI_UNLOCK(pip);
					i_mdi_client_unlock(ct);
					MDI_PHCI_UNLOCK(ph);
					(void) ndi_devi_offline(cdip,
						NDI_DEVFS_CLEAN);
					MDI_PHCI_LOCK(ph);
					pip = next;
					continue;
				}
				break;
			}
			/* No client node, or a state not handled above. */
			MDI_PI_UNLOCK(pip);
			i_mdi_client_unlock(ct);
			pip = next;
		}
		MDI_PHCI_UNLOCK(ph);
		return (NDI_BUSY);
	}

	/*
	 * Mark the pHCI as offline
	 */
	MDI_PHCI_SET_OFFLINE(ph);

	/*
	 * Mark the child mdi_pathinfo nodes as transient
	 */
	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		MDI_PI_SET_OFFLINING(pip);
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	MDI_PHCI_UNLOCK(ph);
	/*
	 * Give a chance for any pending commands to execute
	 */
	delay_random(5);
	MDI_PHCI_LOCK(ph);
	pip = ph->ph_path_head;
	while (pip != NULL) {
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		(void) i_mdi_pi_offline(pip, flags);
		MDI_PI_LOCK(pip);
		/* NOTE(review): ct is assigned here but not read afterwards. */
		ct = MDI_PI(pip)->pi_client;
		if (!MDI_PI_IS_OFFLINE(pip)) {
			MDI_DEBUG(1, (MDI_WARN, dip,
			    "!pHCI device is busy. "
			    "This device can not be removed at this moment. "
			    "Please try again later."));
			MDI_PI_UNLOCK(pip);
			/* Undo the offline mark set on the pHCI above. */
			MDI_PHCI_SET_ONLINE(ph);
			MDI_PHCI_UNLOCK(ph);
			return (NDI_BUSY);
		}
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	MDI_PHCI_UNLOCK(ph);

	return (rv);
}
5232 
5233 void
5234 mdi_phci_mark_retiring(dev_info_t *dip, char **cons_array)
5235 {
5236 	mdi_phci_t	*ph;
5237 	mdi_client_t	*ct;
5238 	mdi_pathinfo_t	*pip;
5239 	mdi_pathinfo_t	*next;
5240 	dev_info_t	*cdip;
5241 
5242 	if (!MDI_PHCI(dip))
5243 		return;
5244 
5245 	ph = i_devi_get_phci(dip);
5246 	if (ph == NULL) {
5247 		return;
5248 	}
5249 
5250 	MDI_PHCI_LOCK(ph);
5251 
5252 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5253 		/* has no last path */
5254 		MDI_PHCI_UNLOCK(ph);
5255 		return;
5256 	}
5257 
5258 	pip = ph->ph_path_head;
5259 	while (pip != NULL) {
5260 		MDI_PI_LOCK(pip);
5261 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5262 
5263 		ct = MDI_PI(pip)->pi_client;
5264 		i_mdi_client_lock(ct, pip);
5265 		MDI_PI_UNLOCK(pip);
5266 
5267 		cdip = ct->ct_dip;
5268 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5269 		    (i_mdi_client_compute_state(ct, ph) ==
5270 		    MDI_CLIENT_STATE_FAILED)) {
5271 			/* Last path. Mark client dip as retiring */
5272 			i_mdi_client_unlock(ct);
5273 			MDI_PHCI_UNLOCK(ph);
5274 			(void) e_ddi_mark_retiring(cdip, cons_array);
5275 			MDI_PHCI_LOCK(ph);
5276 			pip = next;
5277 		} else {
5278 			i_mdi_client_unlock(ct);
5279 			pip = next;
5280 		}
5281 	}
5282 
5283 	MDI_PHCI_UNLOCK(ph);
5284 
5285 	return;
5286 }
5287 
5288 void
5289 mdi_phci_retire_notify(dev_info_t *dip, int *constraint)
5290 {
5291 	mdi_phci_t	*ph;
5292 	mdi_client_t	*ct;
5293 	mdi_pathinfo_t	*pip;
5294 	mdi_pathinfo_t	*next;
5295 	dev_info_t	*cdip;
5296 
5297 	if (!MDI_PHCI(dip))
5298 		return;
5299 
5300 	ph = i_devi_get_phci(dip);
5301 	if (ph == NULL)
5302 		return;
5303 
5304 	MDI_PHCI_LOCK(ph);
5305 
5306 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5307 		MDI_PHCI_UNLOCK(ph);
5308 		/* not last path */
5309 		return;
5310 	}
5311 
5312 	if (ph->ph_unstable) {
5313 		MDI_PHCI_UNLOCK(ph);
5314 		/* can't check for constraints */
5315 		*constraint = 0;
5316 		return;
5317 	}
5318 
5319 	pip = ph->ph_path_head;
5320 	while (pip != NULL) {
5321 		MDI_PI_LOCK(pip);
5322 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5323 
5324 		/*
5325 		 * The mdi_pathinfo state is OK. Check the client state.
5326 		 * If failover in progress fail the pHCI from offlining
5327 		 */
5328 		ct = MDI_PI(pip)->pi_client;
5329 		i_mdi_client_lock(ct, pip);
5330 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5331 		    (ct->ct_unstable)) {
5332 			/*
5333 			 * Failover is in progress, can't check for constraints
5334 			 */
5335 			MDI_PI_UNLOCK(pip);
5336 			i_mdi_client_unlock(ct);
5337 			MDI_PHCI_UNLOCK(ph);
5338 			*constraint = 0;
5339 			return;
5340 		}
5341 		MDI_PI_UNLOCK(pip);
5342 
5343 		/*
5344 		 * Check to see of we are retiring the last path of this
5345 		 * client device...
5346 		 */
5347 		cdip = ct->ct_dip;
5348 		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5349 		    (i_mdi_client_compute_state(ct, ph) ==
5350 		    MDI_CLIENT_STATE_FAILED)) {
5351 			i_mdi_client_unlock(ct);
5352 			MDI_PHCI_UNLOCK(ph);
5353 			(void) e_ddi_retire_notify(cdip, constraint);
5354 			MDI_PHCI_LOCK(ph);
5355 			pip = next;
5356 		} else {
5357 			i_mdi_client_unlock(ct);
5358 			pip = next;
5359 		}
5360 	}
5361 
5362 	MDI_PHCI_UNLOCK(ph);
5363 
5364 	return;
5365 }
5366 
5367 /*
5368  * offline the path(s) hanging off the pHCI. If the
5369  * last path to any client, check that constraints
5370  * have been applied.
5371  */
5372 void
5373 mdi_phci_retire_finalize(dev_info_t *dip, int phci_only)
5374 {
5375 	mdi_phci_t	*ph;
5376 	mdi_client_t	*ct;
5377 	mdi_pathinfo_t	*pip;
5378 	mdi_pathinfo_t	*next;
5379 	dev_info_t	*cdip;
5380 	int		unstable = 0;
5381 	int		constraint;
5382 
5383 	if (!MDI_PHCI(dip))
5384 		return;
5385 
5386 	ph = i_devi_get_phci(dip);
5387 	if (ph == NULL) {
5388 		/* no last path and no pips */
5389 		return;
5390 	}
5391 
5392 	MDI_PHCI_LOCK(ph);
5393 
5394 	if (MDI_PHCI_IS_OFFLINE(ph)) {
5395 		MDI_PHCI_UNLOCK(ph);
5396 		/* no last path and no pips */
5397 		return;
5398 	}
5399 
5400 	/*
5401 	 * Check to see if the pHCI can be offlined
5402 	 */
5403 	if (ph->ph_unstable) {
5404 		unstable = 1;
5405 	}
5406 
5407 	pip = ph->ph_path_head;
5408 	while (pip != NULL) {
5409 		MDI_PI_LOCK(pip);
5410 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5411 
5412 		/*
5413 		 * if failover in progress fail the pHCI from offlining
5414 		 */
5415 		ct = MDI_PI(pip)->pi_client;
5416 		i_mdi_client_lock(ct, pip);
5417 		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5418 		    (ct->ct_unstable)) {
5419 			unstable = 1;
5420 		}
5421 		MDI_PI_UNLOCK(pip);
5422 
5423 		/*
5424 		 * Check to see of we are removing the last path of this
5425 		 * client device...
5426 		 */
5427 		cdip = ct->ct_dip;
5428 		if (!phci_only && cdip &&
5429 		    (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5430 		    (i_mdi_client_compute_state(ct, ph) ==
5431 		    MDI_CLIENT_STATE_FAILED)) {
5432 			i_mdi_client_unlock(ct);
5433 			MDI_PHCI_UNLOCK(ph);
5434 			/*
5435 			 * We don't retire clients we just retire the
5436 			 * path to a client. If it is the last path
5437 			 * to a client, constraints are checked and
5438 			 * if we pass the last path is offlined. MPXIO will
5439 			 * then fail all I/Os to the client. Since we don't
5440 			 * want to retire the client on a path error
5441 			 * set constraint = 0 so that the client dip
5442 			 * is not retired.
5443 			 */
5444 			constraint = 0;
5445 			(void) e_ddi_retire_finalize(cdip, &constraint);
5446 			MDI_PHCI_LOCK(ph);
5447 			pip = next;
5448 		} else {
5449 			i_mdi_client_unlock(ct);
5450 			pip = next;
5451 		}
5452 	}
5453 
5454 	/*
5455 	 * Cannot offline pip(s)
5456 	 */
5457 	if (unstable) {
5458 		cmn_err(CE_WARN, "%s%d: mdi_phci_retire_finalize: "
5459 		    "pHCI in transient state, cannot retire",
5460 		    ddi_driver_name(dip), ddi_get_instance(dip));
5461 		MDI_PHCI_UNLOCK(ph);
5462 		return;
5463 	}
5464 
5465 	/*
5466 	 * Mark the pHCI as offline
5467 	 */
5468 	MDI_PHCI_SET_OFFLINE(ph);
5469 
5470 	/*
5471 	 * Mark the child mdi_pathinfo nodes as transient
5472 	 */
5473 	pip = ph->ph_path_head;
5474 	while (pip != NULL) {
5475 		MDI_PI_LOCK(pip);
5476 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5477 		MDI_PI_SET_OFFLINING(pip);
5478 		MDI_PI_UNLOCK(pip);
5479 		pip = next;
5480 	}
5481 	MDI_PHCI_UNLOCK(ph);
5482 	/*
5483 	 * Give a chance for any pending commands to execute
5484 	 */
5485 	delay_random(5);
5486 	MDI_PHCI_LOCK(ph);
5487 	pip = ph->ph_path_head;
5488 	while (pip != NULL) {
5489 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5490 		(void) i_mdi_pi_offline(pip, 0);
5491 		MDI_PI_LOCK(pip);
5492 		ct = MDI_PI(pip)->pi_client;
5493 		if (!MDI_PI_IS_OFFLINE(pip)) {
5494 			cmn_err(CE_WARN, "mdi_phci_retire_finalize: "
5495 			    "path %d %s busy, cannot offline",
5496 			    mdi_pi_get_path_instance(pip),
5497 			    mdi_pi_spathname(pip));
5498 			MDI_PI_UNLOCK(pip);
5499 			MDI_PHCI_SET_ONLINE(ph);
5500 			MDI_PHCI_UNLOCK(ph);
5501 			return;
5502 		}
5503 		MDI_PI_UNLOCK(pip);
5504 		pip = next;
5505 	}
5506 	MDI_PHCI_UNLOCK(ph);
5507 
5508 	return;
5509 }
5510 
5511 void
5512 mdi_phci_unretire(dev_info_t *dip)
5513 {
5514 	ASSERT(MDI_PHCI(dip));
5515 
5516 	/*
5517 	 * Online the phci
5518 	 */
5519 	i_mdi_phci_online(dip);
5520 }
5521 
5522 /*ARGSUSED*/
5523 static int
5524 i_mdi_client_offline(dev_info_t *dip, uint_t flags)
5525 {
5526 	int		rv = NDI_SUCCESS;
5527 	mdi_client_t	*ct;
5528 
5529 	/*
5530 	 * Client component to go offline.  Make sure that we are
5531 	 * not in failing over state and update client state
5532 	 * accordingly
5533 	 */
5534 	ct = i_devi_get_client(dip);
5535 	MDI_DEBUG(2, (MDI_NOTE, dip,
5536 	    "called %p %p", (void *)dip, (void *)ct));
5537 	if (ct != NULL) {
5538 		MDI_CLIENT_LOCK(ct);
5539 		if (ct->ct_unstable) {
5540 			/*
5541 			 * One or more paths are in transient state,
5542 			 * Dont allow offline of a client device
5543 			 */
5544 			MDI_DEBUG(1, (MDI_WARN, dip,
5545 			    "!One or more paths to "
5546 			    "this device are in transient state. "
5547 			    "This device can not be removed at this moment. "
5548 			    "Please try again later."));
5549 			MDI_CLIENT_UNLOCK(ct);
5550 			return (NDI_BUSY);
5551 		}
5552 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
5553 			/*
5554 			 * Failover is in progress, Dont allow DR of
5555 			 * a client device
5556 			 */
5557 			MDI_DEBUG(1, (MDI_WARN, dip,
5558 			    "!Client device is Busy. "
5559 			    "This device can not be removed at this moment. "
5560 			    "Please try again later."));
5561 			MDI_CLIENT_UNLOCK(ct);
5562 			return (NDI_BUSY);
5563 		}
5564 		MDI_CLIENT_SET_OFFLINE(ct);
5565 
5566 		/*
5567 		 * Unbind our relationship with the dev_info node
5568 		 */
5569 		if (flags & NDI_DEVI_REMOVE) {
5570 			ct->ct_dip = NULL;
5571 		}
5572 		MDI_CLIENT_UNLOCK(ct);
5573 	}
5574 	return (rv);
5575 }
5576 
5577 /*
5578  * mdi_pre_attach():
5579  *		Pre attach() notification handler
5580  */
5581 /*ARGSUSED*/
5582 int
5583 mdi_pre_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5584 {
5585 	/* don't support old DDI_PM_RESUME */
5586 	if ((DEVI(dip)->devi_mdi_component != MDI_COMPONENT_NONE) &&
5587 	    (cmd == DDI_PM_RESUME))
5588 		return (DDI_FAILURE);
5589 
5590 	return (DDI_SUCCESS);
5591 }
5592 
5593 /*
5594  * mdi_post_attach():
5595  *		Post attach() notification handler
5596  */
5597 /*ARGSUSED*/
5598 void
5599 mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
5600 {
5601 	mdi_phci_t	*ph;
5602 	mdi_client_t	*ct;
5603 	mdi_vhci_t	*vh;
5604 
5605 	if (MDI_PHCI(dip)) {
5606 		ph = i_devi_get_phci(dip);
5607 		ASSERT(ph != NULL);
5608 
5609 		MDI_PHCI_LOCK(ph);
5610 		switch (cmd) {
5611 		case DDI_ATTACH:
5612 			MDI_DEBUG(2, (MDI_NOTE, dip,
5613 			    "phci post_attach called %p", (void *)ph));
5614 			if (error == DDI_SUCCESS) {
5615 				MDI_PHCI_SET_ATTACH(ph);
5616 			} else {
5617 				MDI_DEBUG(1, (MDI_NOTE, dip,
5618 				    "!pHCI post_attach failed: error %d",
5619 				    error));
5620 				MDI_PHCI_SET_DETACH(ph);
5621 			}
5622 			break;
5623 
5624 		case DDI_RESUME:
5625 			MDI_DEBUG(2, (MDI_NOTE, dip,
5626 			    "pHCI post_resume: called %p", (void *)ph));
5627 			if (error == DDI_SUCCESS) {
5628 				MDI_PHCI_SET_RESUME(ph);
5629 			} else {
5630 				MDI_DEBUG(1, (MDI_NOTE, dip,
5631 				    "!pHCI post_resume failed: error %d",
5632 				    error));
5633 				MDI_PHCI_SET_SUSPEND(ph);
5634 			}
5635 			break;
5636 		}
5637 		MDI_PHCI_UNLOCK(ph);
5638 	}
5639 
5640 	if (MDI_CLIENT(dip)) {
5641 		ct = i_devi_get_client(dip);
5642 		ASSERT(ct != NULL);
5643 
5644 		MDI_CLIENT_LOCK(ct);
5645 		switch (cmd) {
5646 		case DDI_ATTACH:
5647 			MDI_DEBUG(2, (MDI_NOTE, dip,
5648 			    "client post_attach called %p", (void *)ct));
5649 			if (error != DDI_SUCCESS) {
5650 				MDI_DEBUG(1, (MDI_NOTE, dip,
5651 				    "!client post_attach failed: error %d",
5652 				    error));
5653 				MDI_CLIENT_SET_DETACH(ct);
5654 				MDI_DEBUG(4, (MDI_WARN, dip,
5655 				    "i_mdi_pm_reset_client"));
5656 				i_mdi_pm_reset_client(ct);
5657 				break;
5658 			}
5659 
5660 			/*
5661 			 * Client device has successfully attached, inform
5662 			 * the vhci.
5663 			 */
5664 			vh = ct->ct_vhci;
5665 			if (vh->vh_ops->vo_client_attached)
5666 				(*vh->vh_ops->vo_client_attached)(dip);
5667 
5668 			MDI_CLIENT_SET_ATTACH(ct);
5669 			break;
5670 
5671 		case DDI_RESUME:
5672 			MDI_DEBUG(2, (MDI_NOTE, dip,
5673 			    "client post_attach: called %p", (void *)ct));
5674 			if (error == DDI_SUCCESS) {
5675 				MDI_CLIENT_SET_RESUME(ct);
5676 			} else {
5677 				MDI_DEBUG(1, (MDI_NOTE, dip,
5678 				    "!client post_resume failed: error %d",
5679 				    error));
5680 				MDI_CLIENT_SET_SUSPEND(ct);
5681 			}
5682 			break;
5683 		}
5684 		MDI_CLIENT_UNLOCK(ct);
5685 	}
5686 }
5687 
5688 /*
5689  * mdi_pre_detach():
5690  *		Pre detach notification handler
5691  */
5692 /*ARGSUSED*/
5693 int
5694 mdi_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5695 {
5696 	int rv = DDI_SUCCESS;
5697 
5698 	if (MDI_CLIENT(dip)) {
5699 		(void) i_mdi_client_pre_detach(dip, cmd);
5700 	}
5701 
5702 	if (MDI_PHCI(dip)) {
5703 		rv = i_mdi_phci_pre_detach(dip, cmd);
5704 	}
5705 
5706 	return (rv);
5707 }
5708 
5709 /*ARGSUSED*/
5710 static int
5711 i_mdi_phci_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5712 {
5713 	int		rv = DDI_SUCCESS;
5714 	mdi_phci_t	*ph;
5715 	mdi_client_t	*ct;
5716 	mdi_pathinfo_t	*pip;
5717 	mdi_pathinfo_t	*failed_pip = NULL;
5718 	mdi_pathinfo_t	*next;
5719 
5720 	ph = i_devi_get_phci(dip);
5721 	if (ph == NULL) {
5722 		return (rv);
5723 	}
5724 
5725 	MDI_PHCI_LOCK(ph);
5726 	switch (cmd) {
5727 	case DDI_DETACH:
5728 		MDI_DEBUG(2, (MDI_NOTE, dip,
5729 		    "pHCI pre_detach: called %p", (void *)ph));
5730 		if (!MDI_PHCI_IS_OFFLINE(ph)) {
5731 			/*
5732 			 * mdi_pathinfo nodes are still attached to
5733 			 * this pHCI. Fail the detach for this pHCI.
5734 			 */
5735 			MDI_DEBUG(2, (MDI_WARN, dip,
5736 			    "pHCI pre_detach: paths are still attached %p",
5737 			    (void *)ph));
5738 			rv = DDI_FAILURE;
5739 			break;
5740 		}
5741 		MDI_PHCI_SET_DETACH(ph);
5742 		break;
5743 
5744 	case DDI_SUSPEND:
5745 		/*
5746 		 * pHCI is getting suspended.  Since mpxio client
5747 		 * devices may not be suspended at this point, to avoid
5748 		 * a potential stack overflow, it is important to suspend
5749 		 * client devices before pHCI can be suspended.
5750 		 */
5751 
5752 		MDI_DEBUG(2, (MDI_NOTE, dip,
5753 		    "pHCI pre_suspend: called %p", (void *)ph));
5754 		/*
5755 		 * Suspend all the client devices accessible through this pHCI
5756 		 */
5757 		pip = ph->ph_path_head;
5758 		while (pip != NULL && rv == DDI_SUCCESS) {
5759 			dev_info_t *cdip;
5760 			MDI_PI_LOCK(pip);
5761 			next =
5762 			    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5763 			ct = MDI_PI(pip)->pi_client;
5764 			i_mdi_client_lock(ct, pip);
5765 			cdip = ct->ct_dip;
5766 			MDI_PI_UNLOCK(pip);
5767 			if ((MDI_CLIENT_IS_DETACHED(ct) == 0) &&
5768 			    MDI_CLIENT_IS_SUSPENDED(ct) == 0) {
5769 				i_mdi_client_unlock(ct);
5770 				if ((rv = devi_detach(cdip, DDI_SUSPEND)) !=
5771 				    DDI_SUCCESS) {
5772 					/*
5773 					 * Suspend of one of the client
5774 					 * device has failed.
5775 					 */
5776 					MDI_DEBUG(1, (MDI_WARN, dip,
5777 					    "!suspend of device (%s%d) failed.",
5778 					    ddi_driver_name(cdip),
5779 					    ddi_get_instance(cdip)));
5780 					failed_pip = pip;
5781 					break;
5782 				}
5783 			} else {
5784 				i_mdi_client_unlock(ct);
5785 			}
5786 			pip = next;
5787 		}
5788 
5789 		if (rv == DDI_SUCCESS) {
5790 			/*
5791 			 * Suspend of client devices is complete. Proceed
5792 			 * with pHCI suspend.
5793 			 */
5794 			MDI_PHCI_SET_SUSPEND(ph);
5795 		} else {
5796 			/*
5797 			 * Revert back all the suspended client device states
5798 			 * to converse.
5799 			 */
5800 			pip = ph->ph_path_head;
5801 			while (pip != failed_pip) {
5802 				dev_info_t *cdip;
5803 				MDI_PI_LOCK(pip);
5804 				next =
5805 				    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5806 				ct = MDI_PI(pip)->pi_client;
5807 				i_mdi_client_lock(ct, pip);
5808 				cdip = ct->ct_dip;
5809 				MDI_PI_UNLOCK(pip);
5810 				if (MDI_CLIENT_IS_SUSPENDED(ct)) {
5811 					i_mdi_client_unlock(ct);
5812 					(void) devi_attach(cdip, DDI_RESUME);
5813 				} else {
5814 					i_mdi_client_unlock(ct);
5815 				}
5816 				pip = next;
5817 			}
5818 		}
5819 		break;
5820 
5821 	default:
5822 		rv = DDI_FAILURE;
5823 		break;
5824 	}
5825 	MDI_PHCI_UNLOCK(ph);
5826 	return (rv);
5827 }
5828 
5829 /*ARGSUSED*/
5830 static int
5831 i_mdi_client_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5832 {
5833 	int		rv = DDI_SUCCESS;
5834 	mdi_client_t	*ct;
5835 
5836 	ct = i_devi_get_client(dip);
5837 	if (ct == NULL) {
5838 		return (rv);
5839 	}
5840 
5841 	MDI_CLIENT_LOCK(ct);
5842 	switch (cmd) {
5843 	case DDI_DETACH:
5844 		MDI_DEBUG(2, (MDI_NOTE, dip,
5845 		    "client pre_detach: called %p",
5846 		     (void *)ct));
5847 		MDI_CLIENT_SET_DETACH(ct);
5848 		break;
5849 
5850 	case DDI_SUSPEND:
5851 		MDI_DEBUG(2, (MDI_NOTE, dip,
5852 		    "client pre_suspend: called %p",
5853 		    (void *)ct));
5854 		MDI_CLIENT_SET_SUSPEND(ct);
5855 		break;
5856 
5857 	default:
5858 		rv = DDI_FAILURE;
5859 		break;
5860 	}
5861 	MDI_CLIENT_UNLOCK(ct);
5862 	return (rv);
5863 }
5864 
5865 /*
5866  * mdi_post_detach():
5867  *		Post detach notification handler
5868  */
5869 /*ARGSUSED*/
5870 void
5871 mdi_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5872 {
5873 	/*
5874 	 * Detach/Suspend of mpxio component failed. Update our state
5875 	 * too
5876 	 */
5877 	if (MDI_PHCI(dip))
5878 		i_mdi_phci_post_detach(dip, cmd, error);
5879 
5880 	if (MDI_CLIENT(dip))
5881 		i_mdi_client_post_detach(dip, cmd, error);
5882 }
5883 
5884 /*ARGSUSED*/
5885 static void
5886 i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5887 {
5888 	mdi_phci_t	*ph;
5889 
5890 	/*
5891 	 * Detach/Suspend of phci component failed. Update our state
5892 	 * too
5893 	 */
5894 	ph = i_devi_get_phci(dip);
5895 	if (ph == NULL) {
5896 		return;
5897 	}
5898 
5899 	MDI_PHCI_LOCK(ph);
5900 	/*
5901 	 * Detach of pHCI failed. Restore back converse
5902 	 * state
5903 	 */
5904 	switch (cmd) {
5905 	case DDI_DETACH:
5906 		MDI_DEBUG(2, (MDI_NOTE, dip,
5907 		    "pHCI post_detach: called %p",
5908 		    (void *)ph));
5909 		if (error != DDI_SUCCESS)
5910 			MDI_PHCI_SET_ATTACH(ph);
5911 		break;
5912 
5913 	case DDI_SUSPEND:
5914 		MDI_DEBUG(2, (MDI_NOTE, dip,
5915 		    "pHCI post_suspend: called %p",
5916 		    (void *)ph));
5917 		if (error != DDI_SUCCESS)
5918 			MDI_PHCI_SET_RESUME(ph);
5919 		break;
5920 	}
5921 	MDI_PHCI_UNLOCK(ph);
5922 }
5923 
5924 /*ARGSUSED*/
5925 static void
5926 i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5927 {
5928 	mdi_client_t	*ct;
5929 
5930 	ct = i_devi_get_client(dip);
5931 	if (ct == NULL) {
5932 		return;
5933 	}
5934 	MDI_CLIENT_LOCK(ct);
5935 	/*
5936 	 * Detach of Client failed. Restore back converse
5937 	 * state
5938 	 */
5939 	switch (cmd) {
5940 	case DDI_DETACH:
5941 		MDI_DEBUG(2, (MDI_NOTE, dip,
5942 		    "client post_detach: called %p", (void *)ct));
5943 		if (DEVI_IS_ATTACHING(ct->ct_dip)) {
5944 			MDI_DEBUG(4, (MDI_NOTE, dip,
5945 			    "i_mdi_pm_rele_client\n"));
5946 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
5947 		} else {
5948 			MDI_DEBUG(4, (MDI_NOTE, dip,
5949 			    "i_mdi_pm_reset_client\n"));
5950 			i_mdi_pm_reset_client(ct);
5951 		}
5952 		if (error != DDI_SUCCESS)
5953 			MDI_CLIENT_SET_ATTACH(ct);
5954 		break;
5955 
5956 	case DDI_SUSPEND:
5957 		MDI_DEBUG(2, (MDI_NOTE, dip,
5958 		    "called %p", (void *)ct));
5959 		if (error != DDI_SUCCESS)
5960 			MDI_CLIENT_SET_RESUME(ct);
5961 		break;
5962 	}
5963 	MDI_CLIENT_UNLOCK(ct);
5964 }
5965 
5966 int
5967 mdi_pi_kstat_exists(mdi_pathinfo_t *pip)
5968 {
5969 	return (MDI_PI(pip)->pi_kstats ? 1 : 0);
5970 }
5971 
5972 /*
5973  * create and install per-path (client - pHCI) statistics
5974  * I/O stats supported: nread, nwritten, reads, and writes
5975  * Error stats - hard errors, soft errors, & transport errors
5976  */
5977 int
5978 mdi_pi_kstat_create(mdi_pathinfo_t *pip, char *ksname)
5979 {
5980 	kstat_t			*kiosp, *kerrsp;
5981 	struct pi_errs		*nsp;
5982 	struct mdi_pi_kstats	*mdi_statp;
5983 
5984 	if (MDI_PI(pip)->pi_kstats != NULL)
5985 		return (MDI_SUCCESS);
5986 
5987 	if ((kiosp = kstat_create("mdi", 0, ksname, "iopath",
5988 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
5989 		return (MDI_FAILURE);
5990 	}
5991 
5992 	(void) strcat(ksname, ",err");
5993 	kerrsp = kstat_create("mdi", 0, ksname, "iopath_errors",
5994 	    KSTAT_TYPE_NAMED,
5995 	    sizeof (struct pi_errs) / sizeof (kstat_named_t), 0);
5996 	if (kerrsp == NULL) {
5997 		kstat_delete(kiosp);
5998 		return (MDI_FAILURE);
5999 	}
6000 
6001 	nsp = (struct pi_errs *)kerrsp->ks_data;
6002 	kstat_named_init(&nsp->pi_softerrs, "Soft Errors", KSTAT_DATA_UINT32);
6003 	kstat_named_init(&nsp->pi_harderrs, "Hard Errors", KSTAT_DATA_UINT32);
6004 	kstat_named_init(&nsp->pi_transerrs, "Transport Errors",
6005 	    KSTAT_DATA_UINT32);
6006 	kstat_named_init(&nsp->pi_icnt_busy, "Interconnect Busy",
6007 	    KSTAT_DATA_UINT32);
6008 	kstat_named_init(&nsp->pi_icnt_errors, "Interconnect Errors",
6009 	    KSTAT_DATA_UINT32);
6010 	kstat_named_init(&nsp->pi_phci_rsrc, "pHCI No Resources",
6011 	    KSTAT_DATA_UINT32);
6012 	kstat_named_init(&nsp->pi_phci_localerr, "pHCI Local Errors",
6013 	    KSTAT_DATA_UINT32);
6014 	kstat_named_init(&nsp->pi_phci_invstate, "pHCI Invalid State",
6015 	    KSTAT_DATA_UINT32);
6016 	kstat_named_init(&nsp->pi_failedfrom, "Failed From",
6017 	    KSTAT_DATA_UINT32);
6018 	kstat_named_init(&nsp->pi_failedto, "Failed To", KSTAT_DATA_UINT32);
6019 
6020 	mdi_statp = kmem_alloc(sizeof (*mdi_statp), KM_SLEEP);
6021 	mdi_statp->pi_kstat_ref = 1;
6022 	mdi_statp->pi_kstat_iostats = kiosp;
6023 	mdi_statp->pi_kstat_errstats = kerrsp;
6024 	kstat_install(kiosp);
6025 	kstat_install(kerrsp);
6026 	MDI_PI(pip)->pi_kstats = mdi_statp;
6027 	return (MDI_SUCCESS);
6028 }
6029 
6030 /*
6031  * destroy per-path properties
6032  */
6033 static void
6034 i_mdi_pi_kstat_destroy(mdi_pathinfo_t *pip)
6035 {
6036 
6037 	struct mdi_pi_kstats *mdi_statp;
6038 
6039 	if (MDI_PI(pip)->pi_kstats == NULL)
6040 		return;
6041 	if ((mdi_statp = MDI_PI(pip)->pi_kstats) == NULL)
6042 		return;
6043 
6044 	MDI_PI(pip)->pi_kstats = NULL;
6045 
6046 	/*
6047 	 * the kstat may be shared between multiple pathinfo nodes
6048 	 * decrement this pathinfo's usage, removing the kstats
6049 	 * themselves when the last pathinfo reference is removed.
6050 	 */
6051 	ASSERT(mdi_statp->pi_kstat_ref > 0);
6052 	if (--mdi_statp->pi_kstat_ref != 0)
6053 		return;
6054 
6055 	kstat_delete(mdi_statp->pi_kstat_iostats);
6056 	kstat_delete(mdi_statp->pi_kstat_errstats);
6057 	kmem_free(mdi_statp, sizeof (*mdi_statp));
6058 }
6059 
6060 /*
6061  * update I/O paths KSTATS
6062  */
6063 void
6064 mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp)
6065 {
6066 	kstat_t *iostatp;
6067 	size_t xfer_cnt;
6068 
6069 	ASSERT(pip != NULL);
6070 
6071 	/*
6072 	 * I/O can be driven across a path prior to having path
6073 	 * statistics available, i.e. probe(9e).
6074 	 */
6075 	if (bp != NULL && MDI_PI(pip)->pi_kstats != NULL) {
6076 		iostatp = MDI_PI(pip)->pi_kstats->pi_kstat_iostats;
6077 		xfer_cnt = bp->b_bcount - bp->b_resid;
6078 		if (bp->b_flags & B_READ) {
6079 			KSTAT_IO_PTR(iostatp)->reads++;
6080 			KSTAT_IO_PTR(iostatp)->nread += xfer_cnt;
6081 		} else {
6082 			KSTAT_IO_PTR(iostatp)->writes++;
6083 			KSTAT_IO_PTR(iostatp)->nwritten += xfer_cnt;
6084 		}
6085 	}
6086 }
6087 
6088 /*
6089  * Enable the path(specific client/target/initiator)
6090  * Enabling a path means that MPxIO may select the enabled path for routing
6091  * future I/O requests, subject to other path state constraints.
6092  */
6093 int
6094 mdi_pi_enable_path(mdi_pathinfo_t *pip, int flags)
6095 {
6096 	mdi_phci_t	*ph;
6097 
6098 	ph = MDI_PI(pip)->pi_phci;
6099 	if (ph == NULL) {
6100 		MDI_DEBUG(1, (MDI_NOTE, mdi_pi_get_phci(pip),
6101 		    "!failed: path %s %p: NULL ph",
6102 		    mdi_pi_spathname(pip), (void *)pip));
6103 		return (MDI_FAILURE);
6104 	}
6105 
6106 	(void) i_mdi_enable_disable_path(pip, ph->ph_vhci, flags,
6107 		MDI_ENABLE_OP);
6108 	MDI_DEBUG(5, (MDI_NOTE, ph->ph_dip,
6109 	    "!returning success pip = %p. ph = %p",
6110 	    (void *)pip, (void *)ph));
6111 	return (MDI_SUCCESS);
6112 
6113 }
6114 
6115 /*
6116  * Disable the path (specific client/target/initiator)
6117  * Disabling a path means that MPxIO will not select the disabled path for
6118  * routing any new I/O requests.
6119  */
6120 int
6121 mdi_pi_disable_path(mdi_pathinfo_t *pip, int flags)
6122 {
6123 	mdi_phci_t	*ph;
6124 
6125 	ph = MDI_PI(pip)->pi_phci;
6126 	if (ph == NULL) {
6127 		MDI_DEBUG(1, (MDI_NOTE, mdi_pi_get_phci(pip),
6128 		    "!failed: path %s %p: NULL ph",
6129 		    mdi_pi_spathname(pip), (void *)pip));
6130 		return (MDI_FAILURE);
6131 	}
6132 
6133 	(void) i_mdi_enable_disable_path(pip,
6134 	    ph->ph_vhci, flags, MDI_DISABLE_OP);
6135 	MDI_DEBUG(5, (MDI_NOTE, ph->ph_dip,
6136 	    "!returning success pip = %p. ph = %p",
6137 	    (void *)pip, (void *)ph));
6138 	return (MDI_SUCCESS);
6139 }
6140 
6141 /*
6142  * disable the path to a particular pHCI (pHCI specified in the phci_path
6143  * argument) for a particular client (specified in the client_path argument).
6144  * Disabling a path means that MPxIO will not select the disabled path for
6145  * routing any new I/O requests.
6146  * NOTE: this will be removed once the NWS files are changed to use the new
6147  * mdi_{enable,disable}_path interfaces
6148  */
6149 int
6150 mdi_pi_disable(dev_info_t *cdip, dev_info_t *pdip, int flags)
6151 {
6152 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_DISABLE_OP));
6153 }
6154 
6155 /*
6156  * Enable the path to a particular pHCI (pHCI specified in the phci_path
6157  * argument) for a particular client (specified in the client_path argument).
6158  * Enabling a path means that MPxIO may select the enabled path for routing
6159  * future I/O requests, subject to other path state constraints.
6160  * NOTE: this will be removed once the NWS files are changed to use the new
6161  * mdi_{enable,disable}_path interfaces
6162  */
6163 
6164 int
6165 mdi_pi_enable(dev_info_t *cdip, dev_info_t *pdip, int flags)
6166 {
6167 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_ENABLE_OP));
6168 }
6169 
6170 /*
6171  * Common routine for doing enable/disable.
6172  */
6173 static mdi_pathinfo_t *
6174 i_mdi_enable_disable_path(mdi_pathinfo_t *pip, mdi_vhci_t *vh, int flags,
6175 		int op)
6176 {
6177 	int		sync_flag = 0;
6178 	int		rv;
6179 	mdi_pathinfo_t 	*next;
6180 	int		(*f)() = NULL;
6181 
6182 	/*
6183 	 * Check to make sure the path is not already in the
6184 	 * requested state. If it is just return the next path
6185 	 * as we have nothing to do here.
6186 	 */
6187 	if ((MDI_PI_IS_DISABLE(pip) && op == MDI_DISABLE_OP) ||
6188 	    (!MDI_PI_IS_DISABLE(pip) && op == MDI_ENABLE_OP)) {
6189 		MDI_PI_LOCK(pip);
6190 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
6191 		MDI_PI_UNLOCK(pip);
6192 		return (next);
6193 	}
6194 
6195 	f = vh->vh_ops->vo_pi_state_change;
6196 
6197 	sync_flag = (flags << 8) & 0xf00;
6198 
6199 	/*
6200 	 * Do a callback into the mdi consumer to let it
6201 	 * know that path is about to get enabled/disabled.
6202 	 */
6203 	if (f != NULL) {
6204 		rv = (*f)(vh->vh_dip, pip, 0,
6205 			MDI_PI_EXT_STATE(pip),
6206 			MDI_EXT_STATE_CHANGE | sync_flag |
6207 			op | MDI_BEFORE_STATE_CHANGE);
6208 		if (rv != MDI_SUCCESS) {
6209 			MDI_DEBUG(2, (MDI_WARN, vh->vh_dip,
6210 			    "vo_pi_state_change: failed rv = %x", rv));
6211 		}
6212 	}
6213 	MDI_PI_LOCK(pip);
6214 	next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
6215 
6216 	switch (flags) {
6217 		case USER_DISABLE:
6218 			if (op == MDI_DISABLE_OP) {
6219 				MDI_PI_SET_USER_DISABLE(pip);
6220 			} else {
6221 				MDI_PI_SET_USER_ENABLE(pip);
6222 			}
6223 			break;
6224 		case DRIVER_DISABLE:
6225 			if (op == MDI_DISABLE_OP) {
6226 				MDI_PI_SET_DRV_DISABLE(pip);
6227 			} else {
6228 				MDI_PI_SET_DRV_ENABLE(pip);
6229 			}
6230 			break;
6231 		case DRIVER_DISABLE_TRANSIENT:
6232 			if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS) {
6233 				MDI_PI_SET_DRV_DISABLE_TRANS(pip);
6234 			} else {
6235 				MDI_PI_SET_DRV_ENABLE_TRANS(pip);
6236 			}
6237 			break;
6238 	}
6239 	MDI_PI_UNLOCK(pip);
6240 	/*
6241 	 * Do a callback into the mdi consumer to let it
6242 	 * know that path is now enabled/disabled.
6243 	 */
6244 	if (f != NULL) {
6245 		rv = (*f)(vh->vh_dip, pip, 0,
6246 			MDI_PI_EXT_STATE(pip),
6247 			MDI_EXT_STATE_CHANGE | sync_flag |
6248 			op | MDI_AFTER_STATE_CHANGE);
6249 		if (rv != MDI_SUCCESS) {
6250 			MDI_DEBUG(2, (MDI_WARN, vh->vh_dip,
6251 			    "vo_pi_state_change failed: rv = %x", rv));
6252 		}
6253 	}
6254 	return (next);
6255 }
6256 
6257 /*
6258  * Common routine for doing enable/disable.
6259  * NOTE: this will be removed once the NWS files are changed to use the new
6260  * mdi_{enable,disable}_path has been putback
6261  */
6262 int
6263 i_mdi_pi_enable_disable(dev_info_t *cdip, dev_info_t *pdip, int flags, int op)
6264 {
6265 
6266 	mdi_phci_t	*ph;
6267 	mdi_vhci_t	*vh = NULL;
6268 	mdi_client_t	*ct;
6269 	mdi_pathinfo_t	*next, *pip;
6270 	int		found_it;
6271 
6272 	ph = i_devi_get_phci(pdip);
6273 	MDI_DEBUG(5, (MDI_NOTE, cdip ? cdip : pdip,
6274 	    "!op = %d pdip = %p cdip = %p", op, (void *)pdip,
6275 	    (void *)cdip));
6276 	if (ph == NULL) {
6277 		MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6278 		    "!failed: operation %d: NULL ph", op));
6279 		return (MDI_FAILURE);
6280 	}
6281 
6282 	if ((op != MDI_ENABLE_OP) && (op != MDI_DISABLE_OP)) {
6283 		MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6284 		    "!failed: invalid operation %d", op));
6285 		return (MDI_FAILURE);
6286 	}
6287 
6288 	vh = ph->ph_vhci;
6289 
6290 	if (cdip == NULL) {
6291 		/*
6292 		 * Need to mark the Phci as enabled/disabled.
6293 		 */
6294 		MDI_DEBUG(4, (MDI_NOTE, cdip ? cdip : pdip,
6295 		    "op %d for the phci", op));
6296 		MDI_PHCI_LOCK(ph);
6297 		switch (flags) {
6298 			case USER_DISABLE:
6299 				if (op == MDI_DISABLE_OP) {
6300 					MDI_PHCI_SET_USER_DISABLE(ph);
6301 				} else {
6302 					MDI_PHCI_SET_USER_ENABLE(ph);
6303 				}
6304 				break;
6305 			case DRIVER_DISABLE:
6306 				if (op == MDI_DISABLE_OP) {
6307 					MDI_PHCI_SET_DRV_DISABLE(ph);
6308 				} else {
6309 					MDI_PHCI_SET_DRV_ENABLE(ph);
6310 				}
6311 				break;
6312 			case DRIVER_DISABLE_TRANSIENT:
6313 				if (op == MDI_DISABLE_OP) {
6314 					MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph);
6315 				} else {
6316 					MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph);
6317 				}
6318 				break;
6319 			default:
6320 				MDI_PHCI_UNLOCK(ph);
6321 				MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6322 				    "!invalid flag argument= %d", flags));
6323 		}
6324 
6325 		/*
6326 		 * Phci has been disabled. Now try to enable/disable
6327 		 * path info's to each client.
6328 		 */
6329 		pip = ph->ph_path_head;
6330 		while (pip != NULL) {
6331 			pip = i_mdi_enable_disable_path(pip, vh, flags, op);
6332 		}
6333 		MDI_PHCI_UNLOCK(ph);
6334 	} else {
6335 
6336 		/*
6337 		 * Disable a specific client.
6338 		 */
6339 		ct = i_devi_get_client(cdip);
6340 		if (ct == NULL) {
6341 			MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6342 			    "!failed: operation = %d: NULL ct", op));
6343 			return (MDI_FAILURE);
6344 		}
6345 
6346 		MDI_CLIENT_LOCK(ct);
6347 		pip = ct->ct_path_head;
6348 		found_it = 0;
6349 		while (pip != NULL) {
6350 			MDI_PI_LOCK(pip);
6351 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6352 			if (MDI_PI(pip)->pi_phci == ph) {
6353 				MDI_PI_UNLOCK(pip);
6354 				found_it = 1;
6355 				break;
6356 			}
6357 			MDI_PI_UNLOCK(pip);
6358 			pip = next;
6359 		}
6360 
6361 
6362 		MDI_CLIENT_UNLOCK(ct);
6363 		if (found_it == 0) {
6364 			MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6365 			    "!failed. Could not find corresponding pip\n"));
6366 			return (MDI_FAILURE);
6367 		}
6368 
6369 		(void) i_mdi_enable_disable_path(pip, vh, flags, op);
6370 	}
6371 
6372 	MDI_DEBUG(5, (MDI_NOTE, cdip ? cdip : pdip,
6373 	    "!op %d returning success pdip = %p cdip = %p",
6374 	    op, (void *)pdip, (void *)cdip));
6375 	return (MDI_SUCCESS);
6376 }
6377 
6378 /*
6379  * Ensure phci powered up
6380  */
6381 static void
6382 i_mdi_pm_hold_pip(mdi_pathinfo_t *pip)
6383 {
6384 	dev_info_t	*ph_dip;
6385 
6386 	ASSERT(pip != NULL);
6387 	ASSERT(MDI_PI_LOCKED(pip));
6388 
6389 	if (MDI_PI(pip)->pi_pm_held) {
6390 		return;
6391 	}
6392 
6393 	ph_dip = mdi_pi_get_phci(pip);
6394 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6395 	    "%s %p", mdi_pi_spathname(pip), (void *)pip));
6396 	if (ph_dip == NULL) {
6397 		return;
6398 	}
6399 
6400 	MDI_PI_UNLOCK(pip);
6401 	MDI_DEBUG(4, (MDI_NOTE, ph_dip, "kidsupcnt was %d",
6402 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6403 	pm_hold_power(ph_dip);
6404 	MDI_DEBUG(4, (MDI_NOTE, ph_dip, "kidsupcnt is %d",
6405 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6406 	MDI_PI_LOCK(pip);
6407 
6408 	/* If PM_GET_PM_INFO is NULL the pm_hold_power above was a noop */
6409 	if (DEVI(ph_dip)->devi_pm_info)
6410 		MDI_PI(pip)->pi_pm_held = 1;
6411 }
6412 
6413 /*
6414  * Allow phci powered down
6415  */
6416 static void
6417 i_mdi_pm_rele_pip(mdi_pathinfo_t *pip)
6418 {
6419 	dev_info_t	*ph_dip = NULL;
6420 
6421 	ASSERT(pip != NULL);
6422 	ASSERT(MDI_PI_LOCKED(pip));
6423 
6424 	if (MDI_PI(pip)->pi_pm_held == 0) {
6425 		return;
6426 	}
6427 
6428 	ph_dip = mdi_pi_get_phci(pip);
6429 	ASSERT(ph_dip != NULL);
6430 
6431 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6432 	    "%s %p", mdi_pi_spathname(pip), (void *)pip));
6433 
6434 	MDI_PI_UNLOCK(pip);
6435 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6436 	    "kidsupcnt was %d", DEVI(ph_dip)->devi_pm_kidsupcnt));
6437 	pm_rele_power(ph_dip);
6438 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6439 	    "kidsupcnt is %d", DEVI(ph_dip)->devi_pm_kidsupcnt));
6440 	MDI_PI_LOCK(pip);
6441 
6442 	MDI_PI(pip)->pi_pm_held = 0;
6443 }
6444 
6445 static void
6446 i_mdi_pm_hold_client(mdi_client_t *ct, int incr)
6447 {
6448 	ASSERT(MDI_CLIENT_LOCKED(ct));
6449 
6450 	ct->ct_power_cnt += incr;
6451 	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6452 	    "%p ct_power_cnt = %d incr = %d",
6453 	    (void *)ct, ct->ct_power_cnt, incr));
6454 	ASSERT(ct->ct_power_cnt >= 0);
6455 }
6456 
6457 static void
6458 i_mdi_rele_all_phci(mdi_client_t *ct)
6459 {
6460 	mdi_pathinfo_t  *pip;
6461 
6462 	ASSERT(MDI_CLIENT_LOCKED(ct));
6463 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
6464 	while (pip != NULL) {
6465 		mdi_hold_path(pip);
6466 		MDI_PI_LOCK(pip);
6467 		i_mdi_pm_rele_pip(pip);
6468 		MDI_PI_UNLOCK(pip);
6469 		mdi_rele_path(pip);
6470 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6471 	}
6472 }
6473 
6474 static void
6475 i_mdi_pm_rele_client(mdi_client_t *ct, int decr)
6476 {
6477 	ASSERT(MDI_CLIENT_LOCKED(ct));
6478 
6479 	if (i_ddi_devi_attached(ct->ct_dip)) {
6480 		ct->ct_power_cnt -= decr;
6481 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6482 		    "%p ct_power_cnt = %d decr = %d",
6483 		    (void *)ct, ct->ct_power_cnt, decr));
6484 	}
6485 
6486 	ASSERT(ct->ct_power_cnt >= 0);
6487 	if (ct->ct_power_cnt == 0) {
6488 		i_mdi_rele_all_phci(ct);
6489 		return;
6490 	}
6491 }
6492 
6493 static void
6494 i_mdi_pm_reset_client(mdi_client_t *ct)
6495 {
6496 	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6497 	    "%p ct_power_cnt = %d", (void *)ct, ct->ct_power_cnt));
6498 	ASSERT(MDI_CLIENT_LOCKED(ct));
6499 	ct->ct_power_cnt = 0;
6500 	i_mdi_rele_all_phci(ct);
6501 	ct->ct_powercnt_config = 0;
6502 	ct->ct_powercnt_unconfig = 0;
6503 	ct->ct_powercnt_reset = 1;
6504 }
6505 
6506 static int
6507 i_mdi_power_one_phci(mdi_pathinfo_t *pip)
6508 {
6509 	int		ret;
6510 	dev_info_t	*ph_dip;
6511 
6512 	MDI_PI_LOCK(pip);
6513 	i_mdi_pm_hold_pip(pip);
6514 
6515 	ph_dip = mdi_pi_get_phci(pip);
6516 	MDI_PI_UNLOCK(pip);
6517 
6518 	/* bring all components of phci to full power */
6519 	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6520 	    "pm_powerup for %s%d %p", ddi_driver_name(ph_dip),
6521 	    ddi_get_instance(ph_dip), (void *)pip));
6522 
6523 	ret = pm_powerup(ph_dip);
6524 
6525 	if (ret == DDI_FAILURE) {
6526 		MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6527 		    "pm_powerup FAILED for %s%d %p",
6528 		    ddi_driver_name(ph_dip), ddi_get_instance(ph_dip),
6529 		    (void *)pip));
6530 
6531 		MDI_PI_LOCK(pip);
6532 		i_mdi_pm_rele_pip(pip);
6533 		MDI_PI_UNLOCK(pip);
6534 		return (MDI_FAILURE);
6535 	}
6536 
6537 	return (MDI_SUCCESS);
6538 }
6539 
6540 static int
6541 i_mdi_power_all_phci(mdi_client_t *ct)
6542 {
6543 	mdi_pathinfo_t  *pip;
6544 	int		succeeded = 0;
6545 
6546 	ASSERT(MDI_CLIENT_LOCKED(ct));
6547 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
6548 	while (pip != NULL) {
6549 		/*
6550 		 * Don't power if MDI_PATHINFO_STATE_FAULT
6551 		 * or MDI_PATHINFO_STATE_OFFLINE.
6552 		 */
6553 		if (MDI_PI_IS_INIT(pip) ||
6554 		    MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip)) {
6555 			mdi_hold_path(pip);
6556 			MDI_CLIENT_UNLOCK(ct);
6557 			if (i_mdi_power_one_phci(pip) == MDI_SUCCESS)
6558 				succeeded = 1;
6559 
6560 			ASSERT(ct == MDI_PI(pip)->pi_client);
6561 			MDI_CLIENT_LOCK(ct);
6562 			mdi_rele_path(pip);
6563 		}
6564 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6565 	}
6566 
6567 	return (succeeded ? MDI_SUCCESS : MDI_FAILURE);
6568 }
6569 
6570 /*
6571  * mdi_bus_power():
6572  *		1. Place the phci(s) into powered up state so that
6573  *		   client can do power management
6574  *		2. Ensure phci powered up as client power managing
6575  * Return Values:
6576  *		MDI_SUCCESS
6577  *		MDI_FAILURE
6578  */
6579 int
6580 mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
6581     void *arg, void *result)
6582 {
6583 	int			ret = MDI_SUCCESS;
6584 	pm_bp_child_pwrchg_t	*bpc;
6585 	mdi_client_t		*ct;
6586 	dev_info_t		*cdip;
6587 	pm_bp_has_changed_t	*bphc;
6588 
6589 	/*
6590 	 * BUS_POWER_NOINVOL not supported
6591 	 */
6592 	if (op == BUS_POWER_NOINVOL)
6593 		return (MDI_FAILURE);
6594 
6595 	/*
6596 	 * ignore other OPs.
6597 	 * return quickly to save cou cycles on the ct processing
6598 	 */
6599 	switch (op) {
6600 	case BUS_POWER_PRE_NOTIFICATION:
6601 	case BUS_POWER_POST_NOTIFICATION:
6602 		bpc = (pm_bp_child_pwrchg_t *)arg;
6603 		cdip = bpc->bpc_dip;
6604 		break;
6605 	case BUS_POWER_HAS_CHANGED:
6606 		bphc = (pm_bp_has_changed_t *)arg;
6607 		cdip = bphc->bphc_dip;
6608 		break;
6609 	default:
6610 		return (pm_busop_bus_power(parent, impl_arg, op, arg, result));
6611 	}
6612 
6613 	ASSERT(MDI_CLIENT(cdip));
6614 
6615 	ct = i_devi_get_client(cdip);
6616 	if (ct == NULL)
6617 		return (MDI_FAILURE);
6618 
6619 	/*
6620 	 * wait till the mdi_pathinfo node state change are processed
6621 	 */
6622 	MDI_CLIENT_LOCK(ct);
6623 	switch (op) {
6624 	case BUS_POWER_PRE_NOTIFICATION:
6625 		MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6626 		    "BUS_POWER_PRE_NOTIFICATION:"
6627 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d",
6628 		    ddi_node_name(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
6629 		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp));
6630 
6631 		/* serialize power level change per client */
6632 		while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6633 			cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6634 
6635 		MDI_CLIENT_SET_POWER_TRANSITION(ct);
6636 
6637 		if (ct->ct_power_cnt == 0) {
6638 			ret = i_mdi_power_all_phci(ct);
6639 		}
6640 
6641 		/*
6642 		 * if new_level > 0:
6643 		 *	- hold phci(s)
6644 		 *	- power up phci(s) if not already
6645 		 * ignore power down
6646 		 */
6647 		if (bpc->bpc_nlevel > 0) {
6648 			if (!DEVI_IS_ATTACHING(ct->ct_dip)) {
6649 				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6650 				    "i_mdi_pm_hold_client\n"));
6651 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
6652 			}
6653 		}
6654 		break;
6655 	case BUS_POWER_POST_NOTIFICATION:
6656 		MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6657 		    "BUS_POWER_POST_NOTIFICATION:"
6658 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d result=%d",
6659 		    ddi_node_name(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
6660 		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp,
6661 		    *(int *)result));
6662 
6663 		if (*(int *)result == DDI_SUCCESS) {
6664 			if (bpc->bpc_nlevel > 0) {
6665 				MDI_CLIENT_SET_POWER_UP(ct);
6666 			} else {
6667 				MDI_CLIENT_SET_POWER_DOWN(ct);
6668 			}
6669 		}
6670 
6671 		/* release the hold we did in pre-notification */
6672 		if (bpc->bpc_nlevel > 0 && (*(int *)result != DDI_SUCCESS) &&
6673 		    !DEVI_IS_ATTACHING(ct->ct_dip)) {
6674 			MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6675 			    "i_mdi_pm_rele_client\n"));
6676 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6677 		}
6678 
6679 		if (bpc->bpc_nlevel == 0 && (*(int *)result == DDI_SUCCESS)) {
6680 			/* another thread might started attaching */
6681 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
6682 				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6683 				    "i_mdi_pm_rele_client\n"));
6684 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
6685 			/* detaching has been taken care in pm_post_unconfig */
6686 			} else if (!DEVI_IS_DETACHING(ct->ct_dip)) {
6687 				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
6688 				    "i_mdi_pm_reset_client\n"));
6689 				i_mdi_pm_reset_client(ct);
6690 			}
6691 		}
6692 
6693 		MDI_CLIENT_CLEAR_POWER_TRANSITION(ct);
6694 		cv_broadcast(&ct->ct_powerchange_cv);
6695 
6696 		break;
6697 
6698 	/* need to do more */
6699 	case BUS_POWER_HAS_CHANGED:
6700 		MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
6701 		    "BUS_POWER_HAS_CHANGED:"
6702 		    "%s@%s, olevel=%d, nlevel=%d, comp=%d",
6703 		    ddi_node_name(bphc->bphc_dip), PM_ADDR(bphc->bphc_dip),
6704 		    bphc->bphc_olevel, bphc->bphc_nlevel, bphc->bphc_comp));
6705 
6706 		if (bphc->bphc_nlevel > 0 &&
6707 		    bphc->bphc_nlevel > bphc->bphc_olevel) {
6708 			if (ct->ct_power_cnt == 0) {
6709 				ret = i_mdi_power_all_phci(ct);
6710 			}
6711 			MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
6712 			    "i_mdi_pm_hold_client\n"));
6713 			i_mdi_pm_hold_client(ct, ct->ct_path_count);
6714 		}
6715 
6716 		if (bphc->bphc_nlevel == 0 && bphc->bphc_olevel != -1) {
6717 			MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
6718 			    "i_mdi_pm_rele_client\n"));
6719 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6720 		}
6721 		break;
6722 	}
6723 
6724 	MDI_CLIENT_UNLOCK(ct);
6725 	return (ret);
6726 }
6727 
6728 static int
6729 i_mdi_pm_pre_config_one(dev_info_t *child)
6730 {
6731 	int		ret = MDI_SUCCESS;
6732 	mdi_client_t	*ct;
6733 
6734 	ct = i_devi_get_client(child);
6735 	if (ct == NULL)
6736 		return (MDI_FAILURE);
6737 
6738 	MDI_CLIENT_LOCK(ct);
6739 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6740 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6741 
6742 	if (!MDI_CLIENT_IS_FAILED(ct)) {
6743 		MDI_CLIENT_UNLOCK(ct);
6744 		MDI_DEBUG(4, (MDI_NOTE, child, "already configured\n"));
6745 		return (MDI_SUCCESS);
6746 	}
6747 
6748 	if (ct->ct_powercnt_config) {
6749 		MDI_CLIENT_UNLOCK(ct);
6750 		MDI_DEBUG(4, (MDI_NOTE, child, "already held\n"));
6751 		return (MDI_SUCCESS);
6752 	}
6753 
6754 	if (ct->ct_power_cnt == 0) {
6755 		ret = i_mdi_power_all_phci(ct);
6756 	}
6757 	MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_hold_client\n"));
6758 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6759 	ct->ct_powercnt_config = 1;
6760 	ct->ct_powercnt_reset = 0;
6761 	MDI_CLIENT_UNLOCK(ct);
6762 	return (ret);
6763 }
6764 
6765 static int
6766 i_mdi_pm_pre_config(dev_info_t *vdip, dev_info_t *child)
6767 {
6768 	int			ret = MDI_SUCCESS;
6769 	dev_info_t		*cdip;
6770 	int			circ;
6771 
6772 	ASSERT(MDI_VHCI(vdip));
6773 
6774 	/* ndi_devi_config_one */
6775 	if (child) {
6776 		ASSERT(DEVI_BUSY_OWNED(vdip));
6777 		return (i_mdi_pm_pre_config_one(child));
6778 	}
6779 
6780 	/* devi_config_common */
6781 	ndi_devi_enter(vdip, &circ);
6782 	cdip = ddi_get_child(vdip);
6783 	while (cdip) {
6784 		dev_info_t *next = ddi_get_next_sibling(cdip);
6785 
6786 		ret = i_mdi_pm_pre_config_one(cdip);
6787 		if (ret != MDI_SUCCESS)
6788 			break;
6789 		cdip = next;
6790 	}
6791 	ndi_devi_exit(vdip, circ);
6792 	return (ret);
6793 }
6794 
6795 static int
6796 i_mdi_pm_pre_unconfig_one(dev_info_t *child, int *held, int flags)
6797 {
6798 	int		ret = MDI_SUCCESS;
6799 	mdi_client_t	*ct;
6800 
6801 	ct = i_devi_get_client(child);
6802 	if (ct == NULL)
6803 		return (MDI_FAILURE);
6804 
6805 	MDI_CLIENT_LOCK(ct);
6806 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6807 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6808 
6809 	if (!i_ddi_devi_attached(ct->ct_dip)) {
6810 		MDI_DEBUG(4, (MDI_NOTE, child, "node detached already\n"));
6811 		MDI_CLIENT_UNLOCK(ct);
6812 		return (MDI_SUCCESS);
6813 	}
6814 
6815 	if (MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6816 	    (flags & NDI_AUTODETACH)) {
6817 		MDI_DEBUG(4, (MDI_NOTE, child, "auto-modunload\n"));
6818 		MDI_CLIENT_UNLOCK(ct);
6819 		return (MDI_FAILURE);
6820 	}
6821 
6822 	if (ct->ct_powercnt_unconfig) {
6823 		MDI_DEBUG(4, (MDI_NOTE, child, "ct_powercnt_held\n"));
6824 		MDI_CLIENT_UNLOCK(ct);
6825 		*held = 1;
6826 		return (MDI_SUCCESS);
6827 	}
6828 
6829 	if (ct->ct_power_cnt == 0) {
6830 		ret = i_mdi_power_all_phci(ct);
6831 	}
6832 	MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_hold_client\n"));
6833 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6834 	ct->ct_powercnt_unconfig = 1;
6835 	ct->ct_powercnt_reset = 0;
6836 	MDI_CLIENT_UNLOCK(ct);
6837 	if (ret == MDI_SUCCESS)
6838 		*held = 1;
6839 	return (ret);
6840 }
6841 
6842 static int
6843 i_mdi_pm_pre_unconfig(dev_info_t *vdip, dev_info_t *child, int *held,
6844     int flags)
6845 {
6846 	int			ret = MDI_SUCCESS;
6847 	dev_info_t		*cdip;
6848 	int			circ;
6849 
6850 	ASSERT(MDI_VHCI(vdip));
6851 	*held = 0;
6852 
6853 	/* ndi_devi_unconfig_one */
6854 	if (child) {
6855 		ASSERT(DEVI_BUSY_OWNED(vdip));
6856 		return (i_mdi_pm_pre_unconfig_one(child, held, flags));
6857 	}
6858 
6859 	/* devi_unconfig_common */
6860 	ndi_devi_enter(vdip, &circ);
6861 	cdip = ddi_get_child(vdip);
6862 	while (cdip) {
6863 		dev_info_t *next = ddi_get_next_sibling(cdip);
6864 
6865 		ret = i_mdi_pm_pre_unconfig_one(cdip, held, flags);
6866 		cdip = next;
6867 	}
6868 	ndi_devi_exit(vdip, circ);
6869 
6870 	if (*held)
6871 		ret = MDI_SUCCESS;
6872 
6873 	return (ret);
6874 }
6875 
6876 static void
6877 i_mdi_pm_post_config_one(dev_info_t *child)
6878 {
6879 	mdi_client_t	*ct;
6880 
6881 	ct = i_devi_get_client(child);
6882 	if (ct == NULL)
6883 		return;
6884 
6885 	MDI_CLIENT_LOCK(ct);
6886 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6887 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6888 
6889 	if (ct->ct_powercnt_reset || !ct->ct_powercnt_config) {
6890 		MDI_DEBUG(4, (MDI_NOTE, child, "not configured\n"));
6891 		MDI_CLIENT_UNLOCK(ct);
6892 		return;
6893 	}
6894 
6895 	/* client has not been updated */
6896 	if (MDI_CLIENT_IS_FAILED(ct)) {
6897 		MDI_DEBUG(4, (MDI_NOTE, child, "client failed\n"));
6898 		MDI_CLIENT_UNLOCK(ct);
6899 		return;
6900 	}
6901 
6902 	/* another thread might have powered it down or detached it */
6903 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6904 	    !DEVI_IS_ATTACHING(ct->ct_dip)) ||
6905 	    (!i_ddi_devi_attached(ct->ct_dip) &&
6906 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6907 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_reset_client\n"));
6908 		i_mdi_pm_reset_client(ct);
6909 	} else {
6910 		mdi_pathinfo_t  *pip, *next;
6911 		int	valid_path_count = 0;
6912 
6913 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_rele_client\n"));
6914 		pip = ct->ct_path_head;
6915 		while (pip != NULL) {
6916 			MDI_PI_LOCK(pip);
6917 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6918 			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
6919 				valid_path_count ++;
6920 			MDI_PI_UNLOCK(pip);
6921 			pip = next;
6922 		}
6923 		i_mdi_pm_rele_client(ct, valid_path_count);
6924 	}
6925 	ct->ct_powercnt_config = 0;
6926 	MDI_CLIENT_UNLOCK(ct);
6927 }
6928 
6929 static void
6930 i_mdi_pm_post_config(dev_info_t *vdip, dev_info_t *child)
6931 {
6932 	int		circ;
6933 	dev_info_t	*cdip;
6934 
6935 	ASSERT(MDI_VHCI(vdip));
6936 
6937 	/* ndi_devi_config_one */
6938 	if (child) {
6939 		ASSERT(DEVI_BUSY_OWNED(vdip));
6940 		i_mdi_pm_post_config_one(child);
6941 		return;
6942 	}
6943 
6944 	/* devi_config_common */
6945 	ndi_devi_enter(vdip, &circ);
6946 	cdip = ddi_get_child(vdip);
6947 	while (cdip) {
6948 		dev_info_t *next = ddi_get_next_sibling(cdip);
6949 
6950 		i_mdi_pm_post_config_one(cdip);
6951 		cdip = next;
6952 	}
6953 	ndi_devi_exit(vdip, circ);
6954 }
6955 
6956 static void
6957 i_mdi_pm_post_unconfig_one(dev_info_t *child)
6958 {
6959 	mdi_client_t	*ct;
6960 
6961 	ct = i_devi_get_client(child);
6962 	if (ct == NULL)
6963 		return;
6964 
6965 	MDI_CLIENT_LOCK(ct);
6966 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6967 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6968 
6969 	if (!ct->ct_powercnt_unconfig || ct->ct_powercnt_reset) {
6970 		MDI_DEBUG(4, (MDI_NOTE, child, "not held\n"));
6971 		MDI_CLIENT_UNLOCK(ct);
6972 		return;
6973 	}
6974 
6975 	/* failure detaching or another thread just attached it */
6976 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6977 	    i_ddi_devi_attached(ct->ct_dip)) ||
6978 	    (!i_ddi_devi_attached(ct->ct_dip) &&
6979 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6980 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_reset_client\n"));
6981 		i_mdi_pm_reset_client(ct);
6982 	} else {
6983 		mdi_pathinfo_t  *pip, *next;
6984 		int	valid_path_count = 0;
6985 
6986 		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_rele_client\n"));
6987 		pip = ct->ct_path_head;
6988 		while (pip != NULL) {
6989 			MDI_PI_LOCK(pip);
6990 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6991 			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
6992 				valid_path_count ++;
6993 			MDI_PI_UNLOCK(pip);
6994 			pip = next;
6995 		}
6996 		i_mdi_pm_rele_client(ct, valid_path_count);
6997 		ct->ct_powercnt_unconfig = 0;
6998 	}
6999 
7000 	MDI_CLIENT_UNLOCK(ct);
7001 }
7002 
7003 static void
7004 i_mdi_pm_post_unconfig(dev_info_t *vdip, dev_info_t *child, int held)
7005 {
7006 	int			circ;
7007 	dev_info_t		*cdip;
7008 
7009 	ASSERT(MDI_VHCI(vdip));
7010 
7011 	if (!held) {
7012 		MDI_DEBUG(4, (MDI_NOTE, vdip, "held = %d", held));
7013 		return;
7014 	}
7015 
7016 	if (child) {
7017 		ASSERT(DEVI_BUSY_OWNED(vdip));
7018 		i_mdi_pm_post_unconfig_one(child);
7019 		return;
7020 	}
7021 
7022 	ndi_devi_enter(vdip, &circ);
7023 	cdip = ddi_get_child(vdip);
7024 	while (cdip) {
7025 		dev_info_t *next = ddi_get_next_sibling(cdip);
7026 
7027 		i_mdi_pm_post_unconfig_one(cdip);
7028 		cdip = next;
7029 	}
7030 	ndi_devi_exit(vdip, circ);
7031 }
7032 
7033 int
7034 mdi_power(dev_info_t *vdip, mdi_pm_op_t op, void *args, char *devnm, int flags)
7035 {
7036 	int			circ, ret = MDI_SUCCESS;
7037 	dev_info_t		*client_dip = NULL;
7038 	mdi_client_t		*ct;
7039 
7040 	/*
7041 	 * Handling ndi_devi_config_one and ndi_devi_unconfig_one.
7042 	 * Power up pHCI for the named client device.
7043 	 * Note: Before the client is enumerated under vhci by phci,
7044 	 * client_dip can be NULL. Then proceed to power up all the
7045 	 * pHCIs.
7046 	 */
7047 	if (devnm != NULL) {
7048 		ndi_devi_enter(vdip, &circ);
7049 		client_dip = ndi_devi_findchild(vdip, devnm);
7050 	}
7051 
7052 	MDI_DEBUG(4, (MDI_NOTE, vdip,
7053 	    "op = %d %s %p", op, devnm ? devnm : "", (void *)client_dip));
7054 
7055 	switch (op) {
7056 	case MDI_PM_PRE_CONFIG:
7057 		ret = i_mdi_pm_pre_config(vdip, client_dip);
7058 		break;
7059 
7060 	case MDI_PM_PRE_UNCONFIG:
7061 		ret = i_mdi_pm_pre_unconfig(vdip, client_dip, (int *)args,
7062 		    flags);
7063 		break;
7064 
7065 	case MDI_PM_POST_CONFIG:
7066 		i_mdi_pm_post_config(vdip, client_dip);
7067 		break;
7068 
7069 	case MDI_PM_POST_UNCONFIG:
7070 		i_mdi_pm_post_unconfig(vdip, client_dip, *(int *)args);
7071 		break;
7072 
7073 	case MDI_PM_HOLD_POWER:
7074 	case MDI_PM_RELE_POWER:
7075 		ASSERT(args);
7076 
7077 		client_dip = (dev_info_t *)args;
7078 		ASSERT(MDI_CLIENT(client_dip));
7079 
7080 		ct = i_devi_get_client(client_dip);
7081 		MDI_CLIENT_LOCK(ct);
7082 
7083 		if (op == MDI_PM_HOLD_POWER) {
7084 			if (ct->ct_power_cnt == 0) {
7085 				(void) i_mdi_power_all_phci(ct);
7086 				MDI_DEBUG(4, (MDI_NOTE, client_dip,
7087 				    "i_mdi_pm_hold_client\n"));
7088 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
7089 			}
7090 		} else {
7091 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
7092 				MDI_DEBUG(4, (MDI_NOTE, client_dip,
7093 				    "i_mdi_pm_rele_client\n"));
7094 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
7095 			} else {
7096 				MDI_DEBUG(4, (MDI_NOTE, client_dip,
7097 				    "i_mdi_pm_reset_client\n"));
7098 				i_mdi_pm_reset_client(ct);
7099 			}
7100 		}
7101 
7102 		MDI_CLIENT_UNLOCK(ct);
7103 		break;
7104 
7105 	default:
7106 		break;
7107 	}
7108 
7109 	if (devnm)
7110 		ndi_devi_exit(vdip, circ);
7111 
7112 	return (ret);
7113 }
7114 
7115 int
7116 mdi_component_is_vhci(dev_info_t *dip, const char **mdi_class)
7117 {
7118 	mdi_vhci_t *vhci;
7119 
7120 	if (!MDI_VHCI(dip))
7121 		return (MDI_FAILURE);
7122 
7123 	if (mdi_class) {
7124 		vhci = DEVI(dip)->devi_mdi_xhci;
7125 		ASSERT(vhci);
7126 		*mdi_class = vhci->vh_class;
7127 	}
7128 
7129 	return (MDI_SUCCESS);
7130 }
7131 
7132 int
7133 mdi_component_is_phci(dev_info_t *dip, const char **mdi_class)
7134 {
7135 	mdi_phci_t *phci;
7136 
7137 	if (!MDI_PHCI(dip))
7138 		return (MDI_FAILURE);
7139 
7140 	if (mdi_class) {
7141 		phci = DEVI(dip)->devi_mdi_xhci;
7142 		ASSERT(phci);
7143 		*mdi_class = phci->ph_vhci->vh_class;
7144 	}
7145 
7146 	return (MDI_SUCCESS);
7147 }
7148 
7149 int
7150 mdi_component_is_client(dev_info_t *dip, const char **mdi_class)
7151 {
7152 	mdi_client_t *client;
7153 
7154 	if (!MDI_CLIENT(dip))
7155 		return (MDI_FAILURE);
7156 
7157 	if (mdi_class) {
7158 		client = DEVI(dip)->devi_mdi_client;
7159 		ASSERT(client);
7160 		*mdi_class = client->ct_vhci->vh_class;
7161 	}
7162 
7163 	return (MDI_SUCCESS);
7164 }
7165 
7166 void *
7167 mdi_client_get_vhci_private(dev_info_t *dip)
7168 {
7169 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
7170 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
7171 		mdi_client_t	*ct;
7172 		ct = i_devi_get_client(dip);
7173 		return (ct->ct_vprivate);
7174 	}
7175 	return (NULL);
7176 }
7177 
7178 void
7179 mdi_client_set_vhci_private(dev_info_t *dip, void *data)
7180 {
7181 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
7182 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
7183 		mdi_client_t	*ct;
7184 		ct = i_devi_get_client(dip);
7185 		ct->ct_vprivate = data;
7186 	}
7187 }
7188 /*
7189  * mdi_pi_get_vhci_private():
7190  *		Get the vhci private information associated with the
7191  *		mdi_pathinfo node
7192  */
7193 void *
7194 mdi_pi_get_vhci_private(mdi_pathinfo_t *pip)
7195 {
7196 	caddr_t	vprivate = NULL;
7197 	if (pip) {
7198 		vprivate = MDI_PI(pip)->pi_vprivate;
7199 	}
7200 	return (vprivate);
7201 }
7202 
7203 /*
7204  * mdi_pi_set_vhci_private():
7205  *		Set the vhci private information in the mdi_pathinfo node
7206  */
7207 void
7208 mdi_pi_set_vhci_private(mdi_pathinfo_t *pip, void *priv)
7209 {
7210 	if (pip) {
7211 		MDI_PI(pip)->pi_vprivate = priv;
7212 	}
7213 }
7214 
7215 /*
7216  * mdi_phci_get_vhci_private():
7217  *		Get the vhci private information associated with the
7218  *		mdi_phci node
7219  */
7220 void *
7221 mdi_phci_get_vhci_private(dev_info_t *dip)
7222 {
7223 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
7224 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
7225 		mdi_phci_t	*ph;
7226 		ph = i_devi_get_phci(dip);
7227 		return (ph->ph_vprivate);
7228 	}
7229 	return (NULL);
7230 }
7231 
7232 /*
7233  * mdi_phci_set_vhci_private():
7234  *		Set the vhci private information in the mdi_phci node
7235  */
7236 void
7237 mdi_phci_set_vhci_private(dev_info_t *dip, void *priv)
7238 {
7239 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
7240 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
7241 		mdi_phci_t	*ph;
7242 		ph = i_devi_get_phci(dip);
7243 		ph->ph_vprivate = priv;
7244 	}
7245 }
7246 
7247 int
7248 mdi_pi_ishidden(mdi_pathinfo_t *pip)
7249 {
7250 	return (MDI_PI_FLAGS_IS_HIDDEN(pip));
7251 }
7252 
7253 int
7254 mdi_pi_device_isremoved(mdi_pathinfo_t *pip)
7255 {
7256 	return (MDI_PI_FLAGS_IS_DEVICE_REMOVED(pip));
7257 }
7258 
7259 /*
7260  * When processing hotplug, if mdi_pi_offline-mdi_pi_free fails then this
7261  * interface is used to represent device removal.
7262  */
7263 int
7264 mdi_pi_device_remove(mdi_pathinfo_t *pip)
7265 {
7266 	MDI_PI_LOCK(pip);
7267 	if (mdi_pi_device_isremoved(pip)) {
7268 		MDI_PI_UNLOCK(pip);
7269 		return (0);
7270 	}
7271 	MDI_PI_FLAGS_SET_DEVICE_REMOVED(pip);
7272 	MDI_PI_FLAGS_SET_HIDDEN(pip);
7273 	MDI_PI_UNLOCK(pip);
7274 
7275 	i_ddi_di_cache_invalidate();
7276 
7277 	return (1);
7278 }
7279 
7280 /*
7281  * When processing hotplug, if a path marked mdi_pi_device_isremoved()
7282  * is now accessible then this interfaces is used to represent device insertion.
7283  */
7284 int
7285 mdi_pi_device_insert(mdi_pathinfo_t *pip)
7286 {
7287 	MDI_PI_LOCK(pip);
7288 	if (!mdi_pi_device_isremoved(pip)) {
7289 		MDI_PI_UNLOCK(pip);
7290 		return (0);
7291 	}
7292 	MDI_PI_FLAGS_CLR_DEVICE_REMOVED(pip);
7293 	MDI_PI_FLAGS_CLR_HIDDEN(pip);
7294 	MDI_PI_UNLOCK(pip);
7295 
7296 	i_ddi_di_cache_invalidate();
7297 
7298 	return (1);
7299 }
7300 
7301 /*
7302  * List of vhci class names:
7303  * A vhci class name must be in this list only if the corresponding vhci
7304  * driver intends to use the mdi provided bus config implementation
7305  * (i.e., mdi_vhci_bus_config()).
7306  */
7307 static char *vhci_class_list[] = { MDI_HCI_CLASS_SCSI, MDI_HCI_CLASS_IB };
7308 #define	N_VHCI_CLASSES	(sizeof (vhci_class_list) / sizeof (char *))
7309 
7310 /*
7311  * During boot time, the on-disk vhci cache for every vhci class is read
7312  * in the form of an nvlist and stored here.
7313  */
7314 static nvlist_t *vhcache_nvl[N_VHCI_CLASSES];
7315 
7316 /* nvpair names in vhci cache nvlist */
7317 #define	MDI_VHCI_CACHE_VERSION	1
7318 #define	MDI_NVPNAME_VERSION	"version"
7319 #define	MDI_NVPNAME_PHCIS	"phcis"
7320 #define	MDI_NVPNAME_CTADDRMAP	"clientaddrmap"
7321 
7322 /*
7323  * Given vhci class name, return its on-disk vhci cache filename.
7324  * Memory for the returned filename which includes the full path is allocated
7325  * by this function.
7326  */
7327 static char *
7328 vhclass2vhcache_filename(char *vhclass)
7329 {
7330 	char *filename;
7331 	int len;
7332 	static char *fmt = "/etc/devices/mdi_%s_cache";
7333 
7334 	/*
7335 	 * fmt contains the on-disk vhci cache file name format;
7336 	 * for scsi_vhci the filename is "/etc/devices/mdi_scsi_vhci_cache".
7337 	 */
7338 
7339 	/* the -1 below is to account for "%s" in the format string */
7340 	len = strlen(fmt) + strlen(vhclass) - 1;
7341 	filename = kmem_alloc(len, KM_SLEEP);
7342 	(void) snprintf(filename, len, fmt, vhclass);
7343 	ASSERT(len == (strlen(filename) + 1));
7344 	return (filename);
7345 }
7346 
7347 /*
7348  * initialize the vhci cache related data structures and read the on-disk
7349  * vhci cached data into memory.
7350  */
7351 static void
7352 setup_vhci_cache(mdi_vhci_t *vh)
7353 {
7354 	mdi_vhci_config_t *vhc;
7355 	mdi_vhci_cache_t *vhcache;
7356 	int i;
7357 	nvlist_t *nvl = NULL;
7358 
7359 	vhc = kmem_zalloc(sizeof (mdi_vhci_config_t), KM_SLEEP);
7360 	vh->vh_config = vhc;
7361 	vhcache = &vhc->vhc_vhcache;
7362 
7363 	vhc->vhc_vhcache_filename = vhclass2vhcache_filename(vh->vh_class);
7364 
7365 	mutex_init(&vhc->vhc_lock, NULL, MUTEX_DEFAULT, NULL);
7366 	cv_init(&vhc->vhc_cv, NULL, CV_DRIVER, NULL);
7367 
7368 	rw_init(&vhcache->vhcache_lock, NULL, RW_DRIVER, NULL);
7369 
7370 	/*
7371 	 * Create string hash; same as mod_hash_create_strhash() except that
7372 	 * we use NULL key destructor.
7373 	 */
7374 	vhcache->vhcache_client_hash = mod_hash_create_extended(vh->vh_class,
7375 	    mdi_bus_config_cache_hash_size,
7376 	    mod_hash_null_keydtor, mod_hash_null_valdtor,
7377 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
7378 
7379 	/*
7380 	 * The on-disk vhci cache is read during booting prior to the
7381 	 * lights-out period by mdi_read_devices_files().
7382 	 */
7383 	for (i = 0; i < N_VHCI_CLASSES; i++) {
7384 		if (strcmp(vhci_class_list[i], vh->vh_class) == 0) {
7385 			nvl = vhcache_nvl[i];
7386 			vhcache_nvl[i] = NULL;
7387 			break;
7388 		}
7389 	}
7390 
7391 	/*
7392 	 * this is to cover the case of some one manually causing unloading
7393 	 * (or detaching) and reloading (or attaching) of a vhci driver.
7394 	 */
7395 	if (nvl == NULL && modrootloaded)
7396 		nvl = read_on_disk_vhci_cache(vh->vh_class);
7397 
7398 	if (nvl != NULL) {
7399 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7400 		if (mainnvl_to_vhcache(vhcache, nvl) == MDI_SUCCESS)
7401 			vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
7402 		else  {
7403 			cmn_err(CE_WARN,
7404 			    "%s: data file corrupted, will recreate",
7405 			    vhc->vhc_vhcache_filename);
7406 		}
7407 		rw_exit(&vhcache->vhcache_lock);
7408 		nvlist_free(nvl);
7409 	}
7410 
7411 	vhc->vhc_cbid = callb_add(stop_vhcache_flush_thread, vhc,
7412 	    CB_CL_UADMIN_PRE_VFS, "mdi_vhcache_flush");
7413 
7414 	vhc->vhc_path_discovery_boot = mdi_path_discovery_boot;
7415 	vhc->vhc_path_discovery_postboot = mdi_path_discovery_postboot;
7416 }
7417 
7418 /*
7419  * free all vhci cache related resources
7420  */
7421 static int
7422 destroy_vhci_cache(mdi_vhci_t *vh)
7423 {
7424 	mdi_vhci_config_t *vhc = vh->vh_config;
7425 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7426 	mdi_vhcache_phci_t *cphci, *cphci_next;
7427 	mdi_vhcache_client_t *cct, *cct_next;
7428 	mdi_vhcache_pathinfo_t *cpi, *cpi_next;
7429 
7430 	if (stop_vhcache_async_threads(vhc) != MDI_SUCCESS)
7431 		return (MDI_FAILURE);
7432 
7433 	kmem_free(vhc->vhc_vhcache_filename,
7434 	    strlen(vhc->vhc_vhcache_filename) + 1);
7435 
7436 	mod_hash_destroy_strhash(vhcache->vhcache_client_hash);
7437 
7438 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7439 	    cphci = cphci_next) {
7440 		cphci_next = cphci->cphci_next;
7441 		free_vhcache_phci(cphci);
7442 	}
7443 
7444 	for (cct = vhcache->vhcache_client_head; cct != NULL; cct = cct_next) {
7445 		cct_next = cct->cct_next;
7446 		for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi_next) {
7447 			cpi_next = cpi->cpi_next;
7448 			free_vhcache_pathinfo(cpi);
7449 		}
7450 		free_vhcache_client(cct);
7451 	}
7452 
7453 	rw_destroy(&vhcache->vhcache_lock);
7454 
7455 	mutex_destroy(&vhc->vhc_lock);
7456 	cv_destroy(&vhc->vhc_cv);
7457 	kmem_free(vhc, sizeof (mdi_vhci_config_t));
7458 	return (MDI_SUCCESS);
7459 }
7460 
7461 /*
7462  * Stop all vhci cache related async threads and free their resources.
7463  */
7464 static int
7465 stop_vhcache_async_threads(mdi_vhci_config_t *vhc)
7466 {
7467 	mdi_async_client_config_t *acc, *acc_next;
7468 
7469 	mutex_enter(&vhc->vhc_lock);
7470 	vhc->vhc_flags |= MDI_VHC_EXIT;
7471 	ASSERT(vhc->vhc_acc_thrcount >= 0);
7472 	cv_broadcast(&vhc->vhc_cv);
7473 
7474 	while ((vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) ||
7475 	    vhc->vhc_acc_thrcount != 0) {
7476 		mutex_exit(&vhc->vhc_lock);
7477 		delay_random(5);
7478 		mutex_enter(&vhc->vhc_lock);
7479 	}
7480 
7481 	vhc->vhc_flags &= ~MDI_VHC_EXIT;
7482 
7483 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc_next) {
7484 		acc_next = acc->acc_next;
7485 		free_async_client_config(acc);
7486 	}
7487 	vhc->vhc_acc_list_head = NULL;
7488 	vhc->vhc_acc_list_tail = NULL;
7489 	vhc->vhc_acc_count = 0;
7490 
7491 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
7492 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
7493 		mutex_exit(&vhc->vhc_lock);
7494 		if (flush_vhcache(vhc, 0) != MDI_SUCCESS) {
7495 			vhcache_dirty(vhc);
7496 			return (MDI_FAILURE);
7497 		}
7498 	} else
7499 		mutex_exit(&vhc->vhc_lock);
7500 
7501 	if (callb_delete(vhc->vhc_cbid) != 0)
7502 		return (MDI_FAILURE);
7503 
7504 	return (MDI_SUCCESS);
7505 }
7506 
7507 /*
7508  * Stop vhci cache flush thread
7509  */
7510 /* ARGSUSED */
7511 static boolean_t
7512 stop_vhcache_flush_thread(void *arg, int code)
7513 {
7514 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
7515 
7516 	mutex_enter(&vhc->vhc_lock);
7517 	vhc->vhc_flags |= MDI_VHC_EXIT;
7518 	cv_broadcast(&vhc->vhc_cv);
7519 
7520 	while (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
7521 		mutex_exit(&vhc->vhc_lock);
7522 		delay_random(5);
7523 		mutex_enter(&vhc->vhc_lock);
7524 	}
7525 
7526 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
7527 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
7528 		mutex_exit(&vhc->vhc_lock);
7529 		(void) flush_vhcache(vhc, 1);
7530 	} else
7531 		mutex_exit(&vhc->vhc_lock);
7532 
7533 	return (B_TRUE);
7534 }
7535 
7536 /*
7537  * Enqueue the vhcache phci (cphci) at the tail of the list
7538  */
7539 static void
7540 enqueue_vhcache_phci(mdi_vhci_cache_t *vhcache, mdi_vhcache_phci_t *cphci)
7541 {
7542 	cphci->cphci_next = NULL;
7543 	if (vhcache->vhcache_phci_head == NULL)
7544 		vhcache->vhcache_phci_head = cphci;
7545 	else
7546 		vhcache->vhcache_phci_tail->cphci_next = cphci;
7547 	vhcache->vhcache_phci_tail = cphci;
7548 }
7549 
7550 /*
7551  * Enqueue the vhcache pathinfo (cpi) at the tail of the list
7552  */
7553 static void
7554 enqueue_tail_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7555     mdi_vhcache_pathinfo_t *cpi)
7556 {
7557 	cpi->cpi_next = NULL;
7558 	if (cct->cct_cpi_head == NULL)
7559 		cct->cct_cpi_head = cpi;
7560 	else
7561 		cct->cct_cpi_tail->cpi_next = cpi;
7562 	cct->cct_cpi_tail = cpi;
7563 }
7564 
7565 /*
7566  * Enqueue the vhcache pathinfo (cpi) at the correct location in the
7567  * ordered list. All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
7568  * flag set come at the beginning of the list. All cpis which have this
7569  * flag set come at the end of the list.
7570  */
7571 static void
7572 enqueue_vhcache_pathinfo(mdi_vhcache_client_t *cct,
7573     mdi_vhcache_pathinfo_t *newcpi)
7574 {
7575 	mdi_vhcache_pathinfo_t *cpi, *prev_cpi;
7576 
7577 	if (cct->cct_cpi_head == NULL ||
7578 	    (newcpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))
7579 		enqueue_tail_vhcache_pathinfo(cct, newcpi);
7580 	else {
7581 		for (cpi = cct->cct_cpi_head, prev_cpi = NULL; cpi != NULL &&
7582 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST);
7583 		    prev_cpi = cpi, cpi = cpi->cpi_next)
7584 			;
7585 
7586 		if (prev_cpi == NULL)
7587 			cct->cct_cpi_head = newcpi;
7588 		else
7589 			prev_cpi->cpi_next = newcpi;
7590 
7591 		newcpi->cpi_next = cpi;
7592 
7593 		if (cpi == NULL)
7594 			cct->cct_cpi_tail = newcpi;
7595 	}
7596 }
7597 
7598 /*
7599  * Enqueue the vhcache client (cct) at the tail of the list
7600  */
7601 static void
7602 enqueue_vhcache_client(mdi_vhci_cache_t *vhcache,
7603     mdi_vhcache_client_t *cct)
7604 {
7605 	cct->cct_next = NULL;
7606 	if (vhcache->vhcache_client_head == NULL)
7607 		vhcache->vhcache_client_head = cct;
7608 	else
7609 		vhcache->vhcache_client_tail->cct_next = cct;
7610 	vhcache->vhcache_client_tail = cct;
7611 }
7612 
7613 static void
7614 free_string_array(char **str, int nelem)
7615 {
7616 	int i;
7617 
7618 	if (str) {
7619 		for (i = 0; i < nelem; i++) {
7620 			if (str[i])
7621 				kmem_free(str[i], strlen(str[i]) + 1);
7622 		}
7623 		kmem_free(str, sizeof (char *) * nelem);
7624 	}
7625 }
7626 
7627 static void
7628 free_vhcache_phci(mdi_vhcache_phci_t *cphci)
7629 {
7630 	kmem_free(cphci->cphci_path, strlen(cphci->cphci_path) + 1);
7631 	kmem_free(cphci, sizeof (*cphci));
7632 }
7633 
7634 static void
7635 free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *cpi)
7636 {
7637 	kmem_free(cpi->cpi_addr, strlen(cpi->cpi_addr) + 1);
7638 	kmem_free(cpi, sizeof (*cpi));
7639 }
7640 
7641 static void
7642 free_vhcache_client(mdi_vhcache_client_t *cct)
7643 {
7644 	kmem_free(cct->cct_name_addr, strlen(cct->cct_name_addr) + 1);
7645 	kmem_free(cct, sizeof (*cct));
7646 }
7647 
7648 static char *
7649 vhcache_mknameaddr(char *ct_name, char *ct_addr, int *ret_len)
7650 {
7651 	char *name_addr;
7652 	int len;
7653 
7654 	len = strlen(ct_name) + strlen(ct_addr) + 2;
7655 	name_addr = kmem_alloc(len, KM_SLEEP);
7656 	(void) snprintf(name_addr, len, "%s@%s", ct_name, ct_addr);
7657 
7658 	if (ret_len)
7659 		*ret_len = len;
7660 	return (name_addr);
7661 }
7662 
7663 /*
7664  * Copy the contents of paddrnvl to vhci cache.
7665  * paddrnvl nvlist contains path information for a vhci client.
7666  * See the comment in mainnvl_to_vhcache() for the format of this nvlist.
7667  */
7668 static void
7669 paddrnvl_to_vhcache(nvlist_t *nvl, mdi_vhcache_phci_t *cphci_list[],
7670     mdi_vhcache_client_t *cct)
7671 {
7672 	nvpair_t *nvp = NULL;
7673 	mdi_vhcache_pathinfo_t *cpi;
7674 	uint_t nelem;
7675 	uint32_t *val;
7676 
7677 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7678 		ASSERT(nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY);
7679 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
7680 		cpi->cpi_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7681 		(void) nvpair_value_uint32_array(nvp, &val, &nelem);
7682 		ASSERT(nelem == 2);
7683 		cpi->cpi_cphci = cphci_list[val[0]];
7684 		cpi->cpi_flags = val[1];
7685 		enqueue_tail_vhcache_pathinfo(cct, cpi);
7686 	}
7687 }
7688 
7689 /*
7690  * Copy the contents of caddrmapnvl to vhci cache.
7691  * caddrmapnvl nvlist contains vhci client address to phci client address
7692  * mappings. See the comment in mainnvl_to_vhcache() for the format of
7693  * this nvlist.
7694  */
7695 static void
7696 caddrmapnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl,
7697     mdi_vhcache_phci_t *cphci_list[])
7698 {
7699 	nvpair_t *nvp = NULL;
7700 	nvlist_t *paddrnvl;
7701 	mdi_vhcache_client_t *cct;
7702 
7703 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
7704 		ASSERT(nvpair_type(nvp) == DATA_TYPE_NVLIST);
7705 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
7706 		cct->cct_name_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
7707 		(void) nvpair_value_nvlist(nvp, &paddrnvl);
7708 		paddrnvl_to_vhcache(paddrnvl, cphci_list, cct);
7709 		/* the client must contain at least one path */
7710 		ASSERT(cct->cct_cpi_head != NULL);
7711 
7712 		enqueue_vhcache_client(vhcache, cct);
7713 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
7714 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
7715 	}
7716 }
7717 
7718 /*
7719  * Copy the contents of the main nvlist to vhci cache.
7720  *
7721  * VHCI busconfig cached data is stored in the form of a nvlist on the disk.
7722  * The nvlist contains the mappings between the vhci client addresses and
7723  * their corresponding phci client addresses.
7724  *
7725  * The structure of the nvlist is as follows:
7726  *
7727  * Main nvlist:
7728  *	NAME		TYPE		DATA
7729  *	version		int32		version number
7730  *	phcis		string array	array of phci paths
7731  *	clientaddrmap	nvlist_t	c2paddrs_nvl (see below)
7732  *
7733  * structure of c2paddrs_nvl:
7734  *	NAME		TYPE		DATA
7735  *	caddr1		nvlist_t	paddrs_nvl1
7736  *	caddr2		nvlist_t	paddrs_nvl2
7737  *	...
7738  * where caddr1, caddr2, ... are vhci client name and addresses in the
7739  * form of "<clientname>@<clientaddress>".
7740  * (for example: "ssd@2000002037cd9f72");
7741  * paddrs_nvl1, paddrs_nvl2, .. are nvlists that contain path information.
7742  *
7743  * structure of paddrs_nvl:
7744  *	NAME		TYPE		DATA
7745  *	pi_addr1	uint32_array	(phci-id, cpi_flags)
7746  *	pi_addr2	uint32_array	(phci-id, cpi_flags)
7747  *	...
7748  * where pi_addr1, pi_addr2, ... are bus specific addresses of pathinfo nodes
7749  * (so called pi_addrs, for example: "w2100002037cd9f72,0");
7750  * phci-ids are integers that identify pHCIs to which the
7751  * the bus specific address belongs to. These integers are used as an index
7752  * into to the phcis string array in the main nvlist to get the pHCI path.
7753  */
7754 static int
7755 mainnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl)
7756 {
7757 	char **phcis, **phci_namep;
7758 	uint_t nphcis;
7759 	mdi_vhcache_phci_t *cphci, **cphci_list;
7760 	nvlist_t *caddrmapnvl;
7761 	int32_t ver;
7762 	int i;
7763 	size_t cphci_list_size;
7764 
7765 	ASSERT(RW_WRITE_HELD(&vhcache->vhcache_lock));
7766 
7767 	if (nvlist_lookup_int32(nvl, MDI_NVPNAME_VERSION, &ver) != 0 ||
7768 	    ver != MDI_VHCI_CACHE_VERSION)
7769 		return (MDI_FAILURE);
7770 
7771 	if (nvlist_lookup_string_array(nvl, MDI_NVPNAME_PHCIS, &phcis,
7772 	    &nphcis) != 0)
7773 		return (MDI_SUCCESS);
7774 
7775 	ASSERT(nphcis > 0);
7776 
7777 	cphci_list_size = sizeof (mdi_vhcache_phci_t *) * nphcis;
7778 	cphci_list = kmem_alloc(cphci_list_size, KM_SLEEP);
7779 	for (i = 0, phci_namep = phcis; i < nphcis; i++, phci_namep++) {
7780 		cphci = kmem_zalloc(sizeof (mdi_vhcache_phci_t), KM_SLEEP);
7781 		cphci->cphci_path = i_ddi_strdup(*phci_namep, KM_SLEEP);
7782 		enqueue_vhcache_phci(vhcache, cphci);
7783 		cphci_list[i] = cphci;
7784 	}
7785 
7786 	ASSERT(vhcache->vhcache_phci_head != NULL);
7787 
7788 	if (nvlist_lookup_nvlist(nvl, MDI_NVPNAME_CTADDRMAP, &caddrmapnvl) == 0)
7789 		caddrmapnvl_to_vhcache(vhcache, caddrmapnvl, cphci_list);
7790 
7791 	kmem_free(cphci_list, cphci_list_size);
7792 	return (MDI_SUCCESS);
7793 }
7794 
7795 /*
7796  * Build paddrnvl for the specified client using the information in the
7797  * vhci cache and add it to the caddrmapnnvl.
7798  * Returns 0 on success, errno on failure.
7799  */
7800 static int
7801 vhcache_to_paddrnvl(mdi_vhci_cache_t *vhcache, mdi_vhcache_client_t *cct,
7802     nvlist_t *caddrmapnvl)
7803 {
7804 	mdi_vhcache_pathinfo_t *cpi;
7805 	nvlist_t *nvl;
7806 	int err;
7807 	uint32_t val[2];
7808 
7809 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7810 
7811 	if ((err = nvlist_alloc(&nvl, 0, KM_SLEEP)) != 0)
7812 		return (err);
7813 
7814 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
7815 		val[0] = cpi->cpi_cphci->cphci_id;
7816 		val[1] = cpi->cpi_flags;
7817 		if ((err = nvlist_add_uint32_array(nvl, cpi->cpi_addr, val, 2))
7818 		    != 0)
7819 			goto out;
7820 	}
7821 
7822 	err = nvlist_add_nvlist(caddrmapnvl, cct->cct_name_addr, nvl);
7823 out:
7824 	nvlist_free(nvl);
7825 	return (err);
7826 }
7827 
7828 /*
7829  * Build caddrmapnvl using the information in the vhci cache
7830  * and add it to the mainnvl.
7831  * Returns 0 on success, errno on failure.
7832  */
7833 static int
7834 vhcache_to_caddrmapnvl(mdi_vhci_cache_t *vhcache, nvlist_t *mainnvl)
7835 {
7836 	mdi_vhcache_client_t *cct;
7837 	nvlist_t *nvl;
7838 	int err;
7839 
7840 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7841 
7842 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0)
7843 		return (err);
7844 
7845 	for (cct = vhcache->vhcache_client_head; cct != NULL;
7846 	    cct = cct->cct_next) {
7847 		if ((err = vhcache_to_paddrnvl(vhcache, cct, nvl)) != 0)
7848 			goto out;
7849 	}
7850 
7851 	err = nvlist_add_nvlist(mainnvl, MDI_NVPNAME_CTADDRMAP, nvl);
7852 out:
7853 	nvlist_free(nvl);
7854 	return (err);
7855 }
7856 
7857 /*
7858  * Build nvlist using the information in the vhci cache.
7859  * See the comment in mainnvl_to_vhcache() for the format of the nvlist.
7860  * Returns nvl on success, NULL on failure.
7861  */
7862 static nvlist_t *
7863 vhcache_to_mainnvl(mdi_vhci_cache_t *vhcache)
7864 {
7865 	mdi_vhcache_phci_t *cphci;
7866 	uint_t phci_count;
7867 	char **phcis;
7868 	nvlist_t *nvl;
7869 	int err, i;
7870 
7871 	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0) {
7872 		nvl = NULL;
7873 		goto out;
7874 	}
7875 
7876 	if ((err = nvlist_add_int32(nvl, MDI_NVPNAME_VERSION,
7877 	    MDI_VHCI_CACHE_VERSION)) != 0)
7878 		goto out;
7879 
7880 	rw_enter(&vhcache->vhcache_lock, RW_READER);
7881 	if (vhcache->vhcache_phci_head == NULL) {
7882 		rw_exit(&vhcache->vhcache_lock);
7883 		return (nvl);
7884 	}
7885 
7886 	phci_count = 0;
7887 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7888 	    cphci = cphci->cphci_next)
7889 		cphci->cphci_id = phci_count++;
7890 
7891 	/* build phci pathname list */
7892 	phcis = kmem_alloc(sizeof (char *) * phci_count, KM_SLEEP);
7893 	for (cphci = vhcache->vhcache_phci_head, i = 0; cphci != NULL;
7894 	    cphci = cphci->cphci_next, i++)
7895 		phcis[i] = i_ddi_strdup(cphci->cphci_path, KM_SLEEP);
7896 
7897 	err = nvlist_add_string_array(nvl, MDI_NVPNAME_PHCIS, phcis,
7898 	    phci_count);
7899 	free_string_array(phcis, phci_count);
7900 
7901 	if (err == 0 &&
7902 	    (err = vhcache_to_caddrmapnvl(vhcache, nvl)) == 0) {
7903 		rw_exit(&vhcache->vhcache_lock);
7904 		return (nvl);
7905 	}
7906 
7907 	rw_exit(&vhcache->vhcache_lock);
7908 out:
7909 	if (nvl)
7910 		nvlist_free(nvl);
7911 	return (NULL);
7912 }
7913 
7914 /*
7915  * Lookup vhcache phci structure for the specified phci path.
7916  */
7917 static mdi_vhcache_phci_t *
7918 lookup_vhcache_phci_by_name(mdi_vhci_cache_t *vhcache, char *phci_path)
7919 {
7920 	mdi_vhcache_phci_t *cphci;
7921 
7922 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7923 
7924 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7925 	    cphci = cphci->cphci_next) {
7926 		if (strcmp(cphci->cphci_path, phci_path) == 0)
7927 			return (cphci);
7928 	}
7929 
7930 	return (NULL);
7931 }
7932 
7933 /*
7934  * Lookup vhcache phci structure for the specified phci.
7935  */
7936 static mdi_vhcache_phci_t *
7937 lookup_vhcache_phci_by_addr(mdi_vhci_cache_t *vhcache, mdi_phci_t *ph)
7938 {
7939 	mdi_vhcache_phci_t *cphci;
7940 
7941 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
7942 
7943 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
7944 	    cphci = cphci->cphci_next) {
7945 		if (cphci->cphci_phci == ph)
7946 			return (cphci);
7947 	}
7948 
7949 	return (NULL);
7950 }
7951 
7952 /*
7953  * Add the specified phci to the vhci cache if not already present.
7954  */
7955 static void
7956 vhcache_phci_add(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
7957 {
7958 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
7959 	mdi_vhcache_phci_t *cphci;
7960 	char *pathname;
7961 	int cache_updated;
7962 
7963 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
7964 
7965 	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
7966 	(void) ddi_pathname(ph->ph_dip, pathname);
7967 	if ((cphci = lookup_vhcache_phci_by_name(vhcache, pathname))
7968 	    != NULL) {
7969 		cphci->cphci_phci = ph;
7970 		cache_updated = 0;
7971 	} else {
7972 		cphci = kmem_zalloc(sizeof (*cphci), KM_SLEEP);
7973 		cphci->cphci_path = i_ddi_strdup(pathname, KM_SLEEP);
7974 		cphci->cphci_phci = ph;
7975 		enqueue_vhcache_phci(vhcache, cphci);
7976 		cache_updated = 1;
7977 	}
7978 
7979 	rw_exit(&vhcache->vhcache_lock);
7980 
7981 	/*
7982 	 * Since a new phci has been added, reset
7983 	 * vhc_path_discovery_cutoff_time to allow for discovery of paths
7984 	 * during next vhcache_discover_paths().
7985 	 */
7986 	mutex_enter(&vhc->vhc_lock);
7987 	vhc->vhc_path_discovery_cutoff_time = 0;
7988 	mutex_exit(&vhc->vhc_lock);
7989 
7990 	kmem_free(pathname, MAXPATHLEN);
7991 	if (cache_updated)
7992 		vhcache_dirty(vhc);
7993 }
7994 
7995 /*
7996  * Remove the reference to the specified phci from the vhci cache.
7997  */
7998 static void
7999 vhcache_phci_remove(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
8000 {
8001 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8002 	mdi_vhcache_phci_t *cphci;
8003 
8004 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8005 	if ((cphci = lookup_vhcache_phci_by_addr(vhcache, ph)) != NULL) {
8006 		/* do not remove the actual mdi_vhcache_phci structure */
8007 		cphci->cphci_phci = NULL;
8008 	}
8009 	rw_exit(&vhcache->vhcache_lock);
8010 }
8011 
8012 static void
8013 init_vhcache_lookup_token(mdi_vhcache_lookup_token_t *dst,
8014     mdi_vhcache_lookup_token_t *src)
8015 {
8016 	if (src == NULL) {
8017 		dst->lt_cct = NULL;
8018 		dst->lt_cct_lookup_time = 0;
8019 	} else {
8020 		dst->lt_cct = src->lt_cct;
8021 		dst->lt_cct_lookup_time = src->lt_cct_lookup_time;
8022 	}
8023 }
8024 
8025 /*
8026  * Look up vhcache client for the specified client.
8027  */
8028 static mdi_vhcache_client_t *
8029 lookup_vhcache_client(mdi_vhci_cache_t *vhcache, char *ct_name, char *ct_addr,
8030     mdi_vhcache_lookup_token_t *token)
8031 {
8032 	mod_hash_val_t hv;
8033 	char *name_addr;
8034 	int len;
8035 
8036 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
8037 
8038 	/*
8039 	 * If no vhcache clean occurred since the last lookup, we can
8040 	 * simply return the cct from the last lookup operation.
8041 	 * It works because ccts are never freed except during the vhcache
8042 	 * cleanup operation.
8043 	 */
8044 	if (token != NULL &&
8045 	    vhcache->vhcache_clean_time < token->lt_cct_lookup_time)
8046 		return (token->lt_cct);
8047 
8048 	name_addr = vhcache_mknameaddr(ct_name, ct_addr, &len);
8049 	if (mod_hash_find(vhcache->vhcache_client_hash,
8050 	    (mod_hash_key_t)name_addr, &hv) == 0) {
8051 		if (token) {
8052 			token->lt_cct = (mdi_vhcache_client_t *)hv;
8053 			token->lt_cct_lookup_time = lbolt64;
8054 		}
8055 	} else {
8056 		if (token) {
8057 			token->lt_cct = NULL;
8058 			token->lt_cct_lookup_time = 0;
8059 		}
8060 		hv = NULL;
8061 	}
8062 	kmem_free(name_addr, len);
8063 	return ((mdi_vhcache_client_t *)hv);
8064 }
8065 
8066 /*
8067  * Add the specified path to the vhci cache if not already present.
8068  * Also add the vhcache client for the client corresponding to this path
8069  * if it doesn't already exist.
8070  */
8071 static void
8072 vhcache_pi_add(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
8073 {
8074 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8075 	mdi_vhcache_client_t *cct;
8076 	mdi_vhcache_pathinfo_t *cpi;
8077 	mdi_phci_t *ph = pip->pi_phci;
8078 	mdi_client_t *ct = pip->pi_client;
8079 	int cache_updated = 0;
8080 
8081 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8082 
8083 	/* if vhcache client for this pip doesn't already exist, add it */
8084 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
8085 	    NULL)) == NULL) {
8086 		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
8087 		cct->cct_name_addr = vhcache_mknameaddr(ct->ct_drvname,
8088 		    ct->ct_guid, NULL);
8089 		enqueue_vhcache_client(vhcache, cct);
8090 		(void) mod_hash_insert(vhcache->vhcache_client_hash,
8091 		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
8092 		cache_updated = 1;
8093 	}
8094 
8095 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8096 		if (cpi->cpi_cphci->cphci_phci == ph &&
8097 		    strcmp(cpi->cpi_addr, pip->pi_addr) == 0) {
8098 			cpi->cpi_pip = pip;
8099 			if (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST) {
8100 				cpi->cpi_flags &=
8101 				    ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8102 				sort_vhcache_paths(cct);
8103 				cache_updated = 1;
8104 			}
8105 			break;
8106 		}
8107 	}
8108 
8109 	if (cpi == NULL) {
8110 		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
8111 		cpi->cpi_addr = i_ddi_strdup(pip->pi_addr, KM_SLEEP);
8112 		cpi->cpi_cphci = lookup_vhcache_phci_by_addr(vhcache, ph);
8113 		ASSERT(cpi->cpi_cphci != NULL);
8114 		cpi->cpi_pip = pip;
8115 		enqueue_vhcache_pathinfo(cct, cpi);
8116 		cache_updated = 1;
8117 	}
8118 
8119 	rw_exit(&vhcache->vhcache_lock);
8120 
8121 	if (cache_updated)
8122 		vhcache_dirty(vhc);
8123 }
8124 
8125 /*
8126  * Remove the reference to the specified path from the vhci cache.
8127  */
8128 static void
8129 vhcache_pi_remove(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
8130 {
8131 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8132 	mdi_client_t *ct = pip->pi_client;
8133 	mdi_vhcache_client_t *cct;
8134 	mdi_vhcache_pathinfo_t *cpi;
8135 
8136 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8137 	if ((cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
8138 	    NULL)) != NULL) {
8139 		for (cpi = cct->cct_cpi_head; cpi != NULL;
8140 		    cpi = cpi->cpi_next) {
8141 			if (cpi->cpi_pip == pip) {
8142 				cpi->cpi_pip = NULL;
8143 				break;
8144 			}
8145 		}
8146 	}
8147 	rw_exit(&vhcache->vhcache_lock);
8148 }
8149 
8150 /*
8151  * Flush the vhci cache to disk.
8152  * Returns MDI_SUCCESS on success, MDI_FAILURE on failure.
8153  */
8154 static int
8155 flush_vhcache(mdi_vhci_config_t *vhc, int force_flag)
8156 {
8157 	nvlist_t *nvl;
8158 	int err;
8159 	int rv;
8160 
8161 	/*
8162 	 * It is possible that the system may shutdown before
8163 	 * i_ddi_io_initialized (during stmsboot for example). To allow for
8164 	 * flushing the cache in this case do not check for
8165 	 * i_ddi_io_initialized when force flag is set.
8166 	 */
8167 	if (force_flag == 0 && !i_ddi_io_initialized())
8168 		return (MDI_FAILURE);
8169 
8170 	if ((nvl = vhcache_to_mainnvl(&vhc->vhc_vhcache)) != NULL) {
8171 		err = fwrite_nvlist(vhc->vhc_vhcache_filename, nvl);
8172 		nvlist_free(nvl);
8173 	} else
8174 		err = EFAULT;
8175 
8176 	rv = MDI_SUCCESS;
8177 	mutex_enter(&vhc->vhc_lock);
8178 	if (err != 0) {
8179 		if (err == EROFS) {
8180 			vhc->vhc_flags |= MDI_VHC_READONLY_FS;
8181 			vhc->vhc_flags &= ~(MDI_VHC_VHCACHE_FLUSH_ERROR |
8182 			    MDI_VHC_VHCACHE_DIRTY);
8183 		} else {
8184 			if (!(vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR)) {
8185 				cmn_err(CE_CONT, "%s: update failed\n",
8186 				    vhc->vhc_vhcache_filename);
8187 				vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_ERROR;
8188 			}
8189 			rv = MDI_FAILURE;
8190 		}
8191 	} else if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR) {
8192 		cmn_err(CE_CONT,
8193 		    "%s: update now ok\n", vhc->vhc_vhcache_filename);
8194 		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_ERROR;
8195 	}
8196 	mutex_exit(&vhc->vhc_lock);
8197 
8198 	return (rv);
8199 }
8200 
8201 /*
8202  * Call flush_vhcache() to flush the vhci cache at the scheduled time.
8203  * Exits itself if left idle for the idle timeout period.
8204  */
8205 static void
8206 vhcache_flush_thread(void *arg)
8207 {
8208 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
8209 	clock_t idle_time, quit_at_ticks;
8210 	callb_cpr_t cprinfo;
8211 
8212 	/* number of seconds to sleep idle before exiting */
8213 	idle_time = mdi_vhcache_flush_daemon_idle_time * TICKS_PER_SECOND;
8214 
8215 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
8216 	    "mdi_vhcache_flush");
8217 	mutex_enter(&vhc->vhc_lock);
8218 	for (; ; ) {
8219 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8220 		    (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY)) {
8221 			if (ddi_get_lbolt() < vhc->vhc_flush_at_ticks) {
8222 				CALLB_CPR_SAFE_BEGIN(&cprinfo);
8223 				(void) cv_timedwait(&vhc->vhc_cv,
8224 				    &vhc->vhc_lock, vhc->vhc_flush_at_ticks);
8225 				CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8226 			} else {
8227 				vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
8228 				mutex_exit(&vhc->vhc_lock);
8229 
8230 				if (flush_vhcache(vhc, 0) != MDI_SUCCESS)
8231 					vhcache_dirty(vhc);
8232 
8233 				mutex_enter(&vhc->vhc_lock);
8234 			}
8235 		}
8236 
8237 		quit_at_ticks = ddi_get_lbolt() + idle_time;
8238 
8239 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8240 		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) &&
8241 		    ddi_get_lbolt() < quit_at_ticks) {
8242 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
8243 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
8244 			    quit_at_ticks);
8245 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8246 		}
8247 
8248 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
8249 		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY))
8250 			goto out;
8251 	}
8252 
8253 out:
8254 	vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_THREAD;
8255 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
8256 	CALLB_CPR_EXIT(&cprinfo);
8257 }
8258 
8259 /*
8260  * Make vhci cache dirty and schedule flushing by vhcache flush thread.
8261  */
8262 static void
8263 vhcache_dirty(mdi_vhci_config_t *vhc)
8264 {
8265 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8266 	int create_thread;
8267 
8268 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8269 	/* do not flush cache until the cache is fully built */
8270 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
8271 		rw_exit(&vhcache->vhcache_lock);
8272 		return;
8273 	}
8274 	rw_exit(&vhcache->vhcache_lock);
8275 
8276 	mutex_enter(&vhc->vhc_lock);
8277 	if (vhc->vhc_flags & MDI_VHC_READONLY_FS) {
8278 		mutex_exit(&vhc->vhc_lock);
8279 		return;
8280 	}
8281 
8282 	vhc->vhc_flags |= MDI_VHC_VHCACHE_DIRTY;
8283 	vhc->vhc_flush_at_ticks = ddi_get_lbolt() +
8284 	    mdi_vhcache_flush_delay * TICKS_PER_SECOND;
8285 	if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
8286 		cv_broadcast(&vhc->vhc_cv);
8287 		create_thread = 0;
8288 	} else {
8289 		vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_THREAD;
8290 		create_thread = 1;
8291 	}
8292 	mutex_exit(&vhc->vhc_lock);
8293 
8294 	if (create_thread)
8295 		(void) thread_create(NULL, 0, vhcache_flush_thread, vhc,
8296 		    0, &p0, TS_RUN, minclsyspri);
8297 }
8298 
8299 /*
8300  * phci bus config structure - one for for each phci bus config operation that
8301  * we initiate on behalf of a vhci.
8302  */
8303 typedef struct mdi_phci_bus_config_s {
8304 	char *phbc_phci_path;
8305 	struct mdi_vhci_bus_config_s *phbc_vhbusconfig;	/* vhci bus config */
8306 	struct mdi_phci_bus_config_s *phbc_next;
8307 } mdi_phci_bus_config_t;
8308 
8309 /* vhci bus config structure - one for each vhci bus config operation */
8310 typedef struct mdi_vhci_bus_config_s {
8311 	ddi_bus_config_op_t vhbc_op;	/* bus config op */
8312 	major_t vhbc_op_major;		/* bus config op major */
8313 	uint_t vhbc_op_flags;		/* bus config op flags */
8314 	kmutex_t vhbc_lock;
8315 	kcondvar_t vhbc_cv;
8316 	int vhbc_thr_count;
8317 } mdi_vhci_bus_config_t;
8318 
8319 /*
8320  * bus config the specified phci
8321  */
8322 static void
8323 bus_config_phci(void *arg)
8324 {
8325 	mdi_phci_bus_config_t *phbc = (mdi_phci_bus_config_t *)arg;
8326 	mdi_vhci_bus_config_t *vhbc = phbc->phbc_vhbusconfig;
8327 	dev_info_t *ph_dip;
8328 
8329 	/*
8330 	 * first configure all path components upto phci and then configure
8331 	 * the phci children.
8332 	 */
8333 	if ((ph_dip = e_ddi_hold_devi_by_path(phbc->phbc_phci_path, 0))
8334 	    != NULL) {
8335 		if (vhbc->vhbc_op == BUS_CONFIG_DRIVER ||
8336 		    vhbc->vhbc_op == BUS_UNCONFIG_DRIVER) {
8337 			(void) ndi_devi_config_driver(ph_dip,
8338 			    vhbc->vhbc_op_flags,
8339 			    vhbc->vhbc_op_major);
8340 		} else
8341 			(void) ndi_devi_config(ph_dip,
8342 			    vhbc->vhbc_op_flags);
8343 
8344 		/* release the hold that e_ddi_hold_devi_by_path() placed */
8345 		ndi_rele_devi(ph_dip);
8346 	}
8347 
8348 	kmem_free(phbc->phbc_phci_path, strlen(phbc->phbc_phci_path) + 1);
8349 	kmem_free(phbc, sizeof (*phbc));
8350 
8351 	mutex_enter(&vhbc->vhbc_lock);
8352 	vhbc->vhbc_thr_count--;
8353 	if (vhbc->vhbc_thr_count == 0)
8354 		cv_broadcast(&vhbc->vhbc_cv);
8355 	mutex_exit(&vhbc->vhbc_lock);
8356 }
8357 
8358 /*
8359  * Bus config all phcis associated with the vhci in parallel.
8360  * op must be BUS_CONFIG_DRIVER or BUS_CONFIG_ALL.
8361  */
8362 static void
8363 bus_config_all_phcis(mdi_vhci_cache_t *vhcache, uint_t flags,
8364     ddi_bus_config_op_t op, major_t maj)
8365 {
8366 	mdi_phci_bus_config_t *phbc_head = NULL, *phbc, *phbc_next;
8367 	mdi_vhci_bus_config_t *vhbc;
8368 	mdi_vhcache_phci_t *cphci;
8369 
8370 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8371 	if (vhcache->vhcache_phci_head == NULL) {
8372 		rw_exit(&vhcache->vhcache_lock);
8373 		return;
8374 	}
8375 
8376 	vhbc = kmem_zalloc(sizeof (*vhbc), KM_SLEEP);
8377 
8378 	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
8379 	    cphci = cphci->cphci_next) {
8380 		/* skip phcis that haven't attached before root is available */
8381 		if (!modrootloaded && (cphci->cphci_phci == NULL))
8382 			continue;
8383 		phbc = kmem_zalloc(sizeof (*phbc), KM_SLEEP);
8384 		phbc->phbc_phci_path = i_ddi_strdup(cphci->cphci_path,
8385 		    KM_SLEEP);
8386 		phbc->phbc_vhbusconfig = vhbc;
8387 		phbc->phbc_next = phbc_head;
8388 		phbc_head = phbc;
8389 		vhbc->vhbc_thr_count++;
8390 	}
8391 	rw_exit(&vhcache->vhcache_lock);
8392 
8393 	vhbc->vhbc_op = op;
8394 	vhbc->vhbc_op_major = maj;
8395 	vhbc->vhbc_op_flags = NDI_NO_EVENT |
8396 	    (flags & (NDI_CONFIG_REPROBE | NDI_DRV_CONF_REPROBE));
8397 	mutex_init(&vhbc->vhbc_lock, NULL, MUTEX_DEFAULT, NULL);
8398 	cv_init(&vhbc->vhbc_cv, NULL, CV_DRIVER, NULL);
8399 
8400 	/* now create threads to initiate bus config on all phcis in parallel */
8401 	for (phbc = phbc_head; phbc != NULL; phbc = phbc_next) {
8402 		phbc_next = phbc->phbc_next;
8403 		if (mdi_mtc_off)
8404 			bus_config_phci((void *)phbc);
8405 		else
8406 			(void) thread_create(NULL, 0, bus_config_phci, phbc,
8407 			    0, &p0, TS_RUN, minclsyspri);
8408 	}
8409 
8410 	mutex_enter(&vhbc->vhbc_lock);
8411 	/* wait until all threads exit */
8412 	while (vhbc->vhbc_thr_count > 0)
8413 		cv_wait(&vhbc->vhbc_cv, &vhbc->vhbc_lock);
8414 	mutex_exit(&vhbc->vhbc_lock);
8415 
8416 	mutex_destroy(&vhbc->vhbc_lock);
8417 	cv_destroy(&vhbc->vhbc_cv);
8418 	kmem_free(vhbc, sizeof (*vhbc));
8419 }
8420 
8421 /*
8422  * Single threaded version of bus_config_all_phcis()
8423  */
8424 static void
8425 st_bus_config_all_phcis(mdi_vhci_config_t *vhc, uint_t flags,
8426     ddi_bus_config_op_t op, major_t maj)
8427 {
8428 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8429 
8430 	single_threaded_vhconfig_enter(vhc);
8431 	bus_config_all_phcis(vhcache, flags, op, maj);
8432 	single_threaded_vhconfig_exit(vhc);
8433 }
8434 
8435 /*
8436  * Perform BUS_CONFIG_ONE on the specified child of the phci.
8437  * The path includes the child component in addition to the phci path.
8438  */
8439 static int
8440 bus_config_one_phci_child(char *path)
8441 {
8442 	dev_info_t *ph_dip, *child;
8443 	char *devnm;
8444 	int rv = MDI_FAILURE;
8445 
8446 	/* extract the child component of the phci */
8447 	devnm = strrchr(path, '/');
8448 	*devnm++ = '\0';
8449 
8450 	/*
8451 	 * first configure all path components upto phci and then
8452 	 * configure the phci child.
8453 	 */
8454 	if ((ph_dip = e_ddi_hold_devi_by_path(path, 0)) != NULL) {
8455 		if (ndi_devi_config_one(ph_dip, devnm, &child, NDI_NO_EVENT) ==
8456 		    NDI_SUCCESS) {
8457 			/*
8458 			 * release the hold that ndi_devi_config_one() placed
8459 			 */
8460 			ndi_rele_devi(child);
8461 			rv = MDI_SUCCESS;
8462 		}
8463 
8464 		/* release the hold that e_ddi_hold_devi_by_path() placed */
8465 		ndi_rele_devi(ph_dip);
8466 	}
8467 
8468 	devnm--;
8469 	*devnm = '/';
8470 	return (rv);
8471 }
8472 
8473 /*
8474  * Build a list of phci client paths for the specified vhci client.
8475  * The list includes only those phci client paths which aren't configured yet.
8476  */
8477 static mdi_phys_path_t *
8478 build_phclient_path_list(mdi_vhcache_client_t *cct, char *ct_name)
8479 {
8480 	mdi_vhcache_pathinfo_t *cpi;
8481 	mdi_phys_path_t *pp_head = NULL, *pp_tail = NULL, *pp;
8482 	int config_path, len;
8483 
8484 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8485 		/*
8486 		 * include only those paths that aren't configured.
8487 		 */
8488 		config_path = 0;
8489 		if (cpi->cpi_pip == NULL)
8490 			config_path = 1;
8491 		else {
8492 			MDI_PI_LOCK(cpi->cpi_pip);
8493 			if (MDI_PI_IS_INIT(cpi->cpi_pip))
8494 				config_path = 1;
8495 			MDI_PI_UNLOCK(cpi->cpi_pip);
8496 		}
8497 
8498 		if (config_path) {
8499 			pp = kmem_alloc(sizeof (*pp), KM_SLEEP);
8500 			len = strlen(cpi->cpi_cphci->cphci_path) +
8501 			    strlen(ct_name) + strlen(cpi->cpi_addr) + 3;
8502 			pp->phys_path = kmem_alloc(len, KM_SLEEP);
8503 			(void) snprintf(pp->phys_path, len, "%s/%s@%s",
8504 			    cpi->cpi_cphci->cphci_path, ct_name,
8505 			    cpi->cpi_addr);
8506 			pp->phys_path_next = NULL;
8507 
8508 			if (pp_head == NULL)
8509 				pp_head = pp;
8510 			else
8511 				pp_tail->phys_path_next = pp;
8512 			pp_tail = pp;
8513 		}
8514 	}
8515 
8516 	return (pp_head);
8517 }
8518 
8519 /*
8520  * Free the memory allocated for phci client path list.
8521  */
8522 static void
8523 free_phclient_path_list(mdi_phys_path_t *pp_head)
8524 {
8525 	mdi_phys_path_t *pp, *pp_next;
8526 
8527 	for (pp = pp_head; pp != NULL; pp = pp_next) {
8528 		pp_next = pp->phys_path_next;
8529 		kmem_free(pp->phys_path, strlen(pp->phys_path) + 1);
8530 		kmem_free(pp, sizeof (*pp));
8531 	}
8532 }
8533 
8534 /*
8535  * Allocated async client structure and initialize with the specified values.
8536  */
8537 static mdi_async_client_config_t *
8538 alloc_async_client_config(char *ct_name, char *ct_addr,
8539     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8540 {
8541 	mdi_async_client_config_t *acc;
8542 
8543 	acc = kmem_alloc(sizeof (*acc), KM_SLEEP);
8544 	acc->acc_ct_name = i_ddi_strdup(ct_name, KM_SLEEP);
8545 	acc->acc_ct_addr = i_ddi_strdup(ct_addr, KM_SLEEP);
8546 	acc->acc_phclient_path_list_head = pp_head;
8547 	init_vhcache_lookup_token(&acc->acc_token, tok);
8548 	acc->acc_next = NULL;
8549 	return (acc);
8550 }
8551 
8552 /*
8553  * Free the memory allocated for the async client structure and their members.
8554  */
8555 static void
8556 free_async_client_config(mdi_async_client_config_t *acc)
8557 {
8558 	if (acc->acc_phclient_path_list_head)
8559 		free_phclient_path_list(acc->acc_phclient_path_list_head);
8560 	kmem_free(acc->acc_ct_name, strlen(acc->acc_ct_name) + 1);
8561 	kmem_free(acc->acc_ct_addr, strlen(acc->acc_ct_addr) + 1);
8562 	kmem_free(acc, sizeof (*acc));
8563 }
8564 
8565 /*
8566  * Sort vhcache pathinfos (cpis) of the specified client.
8567  * All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
8568  * flag set come at the beginning of the list. All cpis which have this
8569  * flag set come at the end of the list.
8570  */
8571 static void
8572 sort_vhcache_paths(mdi_vhcache_client_t *cct)
8573 {
8574 	mdi_vhcache_pathinfo_t *cpi, *cpi_next, *cpi_head;
8575 
8576 	cpi_head = cct->cct_cpi_head;
8577 	cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
8578 	for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
8579 		cpi_next = cpi->cpi_next;
8580 		enqueue_vhcache_pathinfo(cct, cpi);
8581 	}
8582 }
8583 
8584 /*
8585  * Verify whether MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag setting is correct for
8586  * every vhcache pathinfo of the specified client. If not adjust the flag
8587  * setting appropriately.
8588  *
8589  * Note that MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag is persisted in the
8590  * on-disk vhci cache. So every time this flag is updated the cache must be
8591  * flushed.
8592  */
8593 static void
8594 adjust_sort_vhcache_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8595     mdi_vhcache_lookup_token_t *tok)
8596 {
8597 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8598 	mdi_vhcache_client_t *cct;
8599 	mdi_vhcache_pathinfo_t *cpi;
8600 
8601 	rw_enter(&vhcache->vhcache_lock, RW_READER);
8602 	if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, tok))
8603 	    == NULL) {
8604 		rw_exit(&vhcache->vhcache_lock);
8605 		return;
8606 	}
8607 
8608 	/*
8609 	 * to avoid unnecessary on-disk cache updates, first check if an
8610 	 * update is really needed. If no update is needed simply return.
8611 	 */
8612 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8613 		if ((cpi->cpi_pip != NULL &&
8614 		    (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST)) ||
8615 		    (cpi->cpi_pip == NULL &&
8616 		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))) {
8617 			break;
8618 		}
8619 	}
8620 	if (cpi == NULL) {
8621 		rw_exit(&vhcache->vhcache_lock);
8622 		return;
8623 	}
8624 
8625 	if (rw_tryupgrade(&vhcache->vhcache_lock) == 0) {
8626 		rw_exit(&vhcache->vhcache_lock);
8627 		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
8628 		if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr,
8629 		    tok)) == NULL) {
8630 			rw_exit(&vhcache->vhcache_lock);
8631 			return;
8632 		}
8633 	}
8634 
8635 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8636 		if (cpi->cpi_pip != NULL)
8637 			cpi->cpi_flags &= ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8638 		else
8639 			cpi->cpi_flags |= MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
8640 	}
8641 	sort_vhcache_paths(cct);
8642 
8643 	rw_exit(&vhcache->vhcache_lock);
8644 	vhcache_dirty(vhc);
8645 }
8646 
8647 /*
8648  * Configure all specified paths of the client.
8649  */
8650 static void
8651 config_client_paths_sync(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8652     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8653 {
8654 	mdi_phys_path_t *pp;
8655 
8656 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next)
8657 		(void) bus_config_one_phci_child(pp->phys_path);
8658 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, tok);
8659 }
8660 
8661 /*
8662  * Dequeue elements from vhci async client config list and bus configure
8663  * their corresponding phci clients.
8664  */
8665 static void
8666 config_client_paths_thread(void *arg)
8667 {
8668 	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
8669 	mdi_async_client_config_t *acc;
8670 	clock_t quit_at_ticks;
8671 	clock_t idle_time = mdi_async_config_idle_time * TICKS_PER_SECOND;
8672 	callb_cpr_t cprinfo;
8673 
8674 	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
8675 	    "mdi_config_client_paths");
8676 
8677 	for (; ; ) {
8678 		quit_at_ticks = ddi_get_lbolt() + idle_time;
8679 
8680 		mutex_enter(&vhc->vhc_lock);
8681 		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
8682 		    vhc->vhc_acc_list_head == NULL &&
8683 		    ddi_get_lbolt() < quit_at_ticks) {
8684 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
8685 			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
8686 			    quit_at_ticks);
8687 			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
8688 		}
8689 
8690 		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
8691 		    vhc->vhc_acc_list_head == NULL)
8692 			goto out;
8693 
8694 		acc = vhc->vhc_acc_list_head;
8695 		vhc->vhc_acc_list_head = acc->acc_next;
8696 		if (vhc->vhc_acc_list_head == NULL)
8697 			vhc->vhc_acc_list_tail = NULL;
8698 		vhc->vhc_acc_count--;
8699 		mutex_exit(&vhc->vhc_lock);
8700 
8701 		config_client_paths_sync(vhc, acc->acc_ct_name,
8702 		    acc->acc_ct_addr, acc->acc_phclient_path_list_head,
8703 		    &acc->acc_token);
8704 
8705 		free_async_client_config(acc);
8706 	}
8707 
8708 out:
8709 	vhc->vhc_acc_thrcount--;
8710 	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
8711 	CALLB_CPR_EXIT(&cprinfo);
8712 }
8713 
8714 /*
8715  * Arrange for all the phci client paths (pp_head) for the specified client
8716  * to be bus configured asynchronously by a thread.
8717  */
8718 static void
8719 config_client_paths_async(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
8720     mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
8721 {
8722 	mdi_async_client_config_t *acc, *newacc;
8723 	int create_thread;
8724 
8725 	if (pp_head == NULL)
8726 		return;
8727 
8728 	if (mdi_mtc_off) {
8729 		config_client_paths_sync(vhc, ct_name, ct_addr, pp_head, tok);
8730 		free_phclient_path_list(pp_head);
8731 		return;
8732 	}
8733 
8734 	newacc = alloc_async_client_config(ct_name, ct_addr, pp_head, tok);
8735 	ASSERT(newacc);
8736 
8737 	mutex_enter(&vhc->vhc_lock);
8738 	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc->acc_next) {
8739 		if (strcmp(ct_name, acc->acc_ct_name) == 0 &&
8740 		    strcmp(ct_addr, acc->acc_ct_addr) == 0) {
8741 			free_async_client_config(newacc);
8742 			mutex_exit(&vhc->vhc_lock);
8743 			return;
8744 		}
8745 	}
8746 
8747 	if (vhc->vhc_acc_list_head == NULL)
8748 		vhc->vhc_acc_list_head = newacc;
8749 	else
8750 		vhc->vhc_acc_list_tail->acc_next = newacc;
8751 	vhc->vhc_acc_list_tail = newacc;
8752 	vhc->vhc_acc_count++;
8753 	if (vhc->vhc_acc_count <= vhc->vhc_acc_thrcount) {
8754 		cv_broadcast(&vhc->vhc_cv);
8755 		create_thread = 0;
8756 	} else {
8757 		vhc->vhc_acc_thrcount++;
8758 		create_thread = 1;
8759 	}
8760 	mutex_exit(&vhc->vhc_lock);
8761 
8762 	if (create_thread)
8763 		(void) thread_create(NULL, 0, config_client_paths_thread, vhc,
8764 		    0, &p0, TS_RUN, minclsyspri);
8765 }
8766 
8767 /*
8768  * Return number of online paths for the specified client.
8769  */
8770 static int
8771 nonline_paths(mdi_vhcache_client_t *cct)
8772 {
8773 	mdi_vhcache_pathinfo_t *cpi;
8774 	int online_count = 0;
8775 
8776 	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
8777 		if (cpi->cpi_pip != NULL) {
8778 			MDI_PI_LOCK(cpi->cpi_pip);
8779 			if (cpi->cpi_pip->pi_state == MDI_PATHINFO_STATE_ONLINE)
8780 				online_count++;
8781 			MDI_PI_UNLOCK(cpi->cpi_pip);
8782 		}
8783 	}
8784 
8785 	return (online_count);
8786 }
8787 
8788 /*
8789  * Bus configure all paths for the specified vhci client.
8790  * If at least one path for the client is already online, the remaining paths
8791  * will be configured asynchronously. Otherwise, it synchronously configures
8792  * the paths until at least one path is online and then rest of the paths
8793  * will be configured asynchronously.
8794  */
8795 static void
8796 config_client_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr)
8797 {
8798 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
8799 	mdi_phys_path_t *pp_head, *pp;
8800 	mdi_vhcache_client_t *cct;
8801 	mdi_vhcache_lookup_token_t tok;
8802 
8803 	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));
8804 
8805 	init_vhcache_lookup_token(&tok, NULL);
8806 
8807 	if (ct_name == NULL || ct_addr == NULL ||
8808 	    (cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, &tok))
8809 	    == NULL ||
8810 	    (pp_head = build_phclient_path_list(cct, ct_name)) == NULL) {
8811 		rw_exit(&vhcache->vhcache_lock);
8812 		return;
8813 	}
8814 
8815 	/* if at least one path is online, configure the rest asynchronously */
8816 	if (nonline_paths(cct) > 0) {
8817 		rw_exit(&vhcache->vhcache_lock);
8818 		config_client_paths_async(vhc, ct_name, ct_addr, pp_head, &tok);
8819 		return;
8820 	}
8821 
8822 	rw_exit(&vhcache->vhcache_lock);
8823 
8824 	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next) {
8825 		if (bus_config_one_phci_child(pp->phys_path) == MDI_SUCCESS) {
8826 			rw_enter(&vhcache->vhcache_lock, RW_READER);
8827 
8828 			if ((cct = lookup_vhcache_client(vhcache, ct_name,
8829 			    ct_addr, &tok)) == NULL) {
8830 				rw_exit(&vhcache->vhcache_lock);
8831 				goto out;
8832 			}
8833 
8834 			if (nonline_paths(cct) > 0 &&
8835 			    pp->phys_path_next != NULL) {
8836 				rw_exit(&vhcache->vhcache_lock);
8837 				config_client_paths_async(vhc, ct_name, ct_addr,
8838 				    pp->phys_path_next, &tok);
8839 				pp->phys_path_next = NULL;
8840 				goto out;
8841 			}
8842 
8843 			rw_exit(&vhcache->vhcache_lock);
8844 		}
8845 	}
8846 
8847 	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, &tok);
8848 out:
8849 	free_phclient_path_list(pp_head);
8850 }
8851 
8852 static void
8853 single_threaded_vhconfig_enter(mdi_vhci_config_t *vhc)
8854 {
8855 	mutex_enter(&vhc->vhc_lock);
8856 	while (vhc->vhc_flags & MDI_VHC_SINGLE_THREADED)
8857 		cv_wait(&vhc->vhc_cv, &vhc->vhc_lock);
8858 	vhc->vhc_flags |= MDI_VHC_SINGLE_THREADED;
8859 	mutex_exit(&vhc->vhc_lock);
8860 }
8861 
8862 static void
8863 single_threaded_vhconfig_exit(mdi_vhci_config_t *vhc)
8864 {
8865 	mutex_enter(&vhc->vhc_lock);
8866 	vhc->vhc_flags &= ~MDI_VHC_SINGLE_THREADED;
8867 	cv_broadcast(&vhc->vhc_cv);
8868 	mutex_exit(&vhc->vhc_lock);
8869 }
8870 
/*
 * One entry of the built-in phci driver tables below.
 */
typedef struct mdi_phci_driver_info {
	char	*phdriver_name;	/* name of the phci driver */

	/* set to non zero if the phci driver supports root device */
	int	phdriver_root_support;
} mdi_phci_driver_info_t;

/*
 * vhci class and root support capability of a phci driver can be
 * specified using ddi-vhci-class and ddi-no-root-support properties in the
 * phci driver.conf file. The built-in tables below contain this information
 * for those phci drivers whose driver.conf files don't yet contain this info.
 *
 * All phci drivers except iscsi have root device support.
 */
static mdi_phci_driver_info_t scsi_phci_driver_list[] = {
	{ "fp", 1 },
	{ "iscsi", 0 },
	{ "ibsrp", 1 }
};

/* each element is individually braced, as in the scsi table above */
static mdi_phci_driver_info_t ib_phci_driver_list[] = {
	{ "tavor", 1 }
};
8893 
8894 static void *
8895 mdi_realloc(void *old_ptr, size_t old_size, size_t new_size)
8896 {
8897 	void *new_ptr;
8898 
8899 	new_ptr = kmem_zalloc(new_size, KM_SLEEP);
8900 	if (old_ptr) {
8901 		bcopy(old_ptr, new_ptr, MIN(old_size, new_size));
8902 		kmem_free(old_ptr, old_size);
8903 	}
8904 	return (new_ptr);
8905 }
8906 
8907 static void
8908 add_to_phci_list(char ***driver_list, int **root_support_list,
8909     int *cur_elements, int *max_elements, char *driver_name, int root_support)
8910 {
8911 	ASSERT(*cur_elements <= *max_elements);
8912 	if (*cur_elements == *max_elements) {
8913 		*max_elements += 10;
8914 		*driver_list = mdi_realloc(*driver_list,
8915 		    sizeof (char *) * (*cur_elements),
8916 		    sizeof (char *) * (*max_elements));
8917 		*root_support_list = mdi_realloc(*root_support_list,
8918 		    sizeof (int) * (*cur_elements),
8919 		    sizeof (int) * (*max_elements));
8920 	}
8921 	(*driver_list)[*cur_elements] = i_ddi_strdup(driver_name, KM_SLEEP);
8922 	(*root_support_list)[*cur_elements] = root_support;
8923 	(*cur_elements)++;
8924 }
8925 
8926 static void
8927 get_phci_driver_list(char *vhci_class, char ***driver_list,
8928     int **root_support_list, int *cur_elements, int *max_elements)
8929 {
8930 	mdi_phci_driver_info_t	*st_driver_list, *p;
8931 	int		st_ndrivers, root_support, i, j, driver_conf_count;
8932 	major_t		m;
8933 	struct devnames	*dnp;
8934 	ddi_prop_t	*propp;
8935 
8936 	*driver_list = NULL;
8937 	*root_support_list = NULL;
8938 	*cur_elements = 0;
8939 	*max_elements = 0;
8940 
8941 	/* add the phci drivers derived from the phci driver.conf files */
8942 	for (m = 0; m < devcnt; m++) {
8943 		dnp = &devnamesp[m];
8944 
8945 		if (dnp->dn_flags & DN_PHCI_DRIVER) {
8946 			LOCK_DEV_OPS(&dnp->dn_lock);
8947 			if (dnp->dn_global_prop_ptr != NULL &&
8948 			    (propp = i_ddi_prop_search(DDI_DEV_T_ANY,
8949 			    DDI_VHCI_CLASS, DDI_PROP_TYPE_STRING,
8950 			    &dnp->dn_global_prop_ptr->prop_list)) != NULL &&
8951 			    strcmp(propp->prop_val, vhci_class) == 0) {
8952 
8953 				root_support = (i_ddi_prop_search(DDI_DEV_T_ANY,
8954 				    DDI_NO_ROOT_SUPPORT, DDI_PROP_TYPE_INT,
8955 				    &dnp->dn_global_prop_ptr->prop_list)
8956 				    == NULL) ? 1 : 0;
8957 
8958 				add_to_phci_list(driver_list, root_support_list,
8959 				    cur_elements, max_elements, dnp->dn_name,
8960 				    root_support);
8961 
8962 				UNLOCK_DEV_OPS(&dnp->dn_lock);
8963 			} else
8964 				UNLOCK_DEV_OPS(&dnp->dn_lock);
8965 		}
8966 	}
8967 
8968 	driver_conf_count = *cur_elements;
8969 
8970 	/* add the phci drivers specified in the built-in tables */
8971 	if (strcmp(vhci_class, MDI_HCI_CLASS_SCSI) == 0) {
8972 		st_driver_list = scsi_phci_driver_list;
8973 		st_ndrivers = sizeof (scsi_phci_driver_list) /
8974 		    sizeof (mdi_phci_driver_info_t);
8975 	} else if (strcmp(vhci_class, MDI_HCI_CLASS_IB) == 0) {
8976 		st_driver_list = ib_phci_driver_list;
8977 		st_ndrivers = sizeof (ib_phci_driver_list) /
8978 		    sizeof (mdi_phci_driver_info_t);
8979 	} else {
8980 		st_driver_list = NULL;
8981 		st_ndrivers = 0;
8982 	}
8983 
8984 	for (i = 0, p = st_driver_list; i < st_ndrivers; i++, p++) {
8985 		/* add this phci driver if not already added before */
8986 		for (j = 0; j < driver_conf_count; j++) {
8987 			if (strcmp((*driver_list)[j], p->phdriver_name) == 0)
8988 				break;
8989 		}
8990 		if (j == driver_conf_count) {
8991 			add_to_phci_list(driver_list, root_support_list,
8992 			    cur_elements, max_elements, p->phdriver_name,
8993 			    p->phdriver_root_support);
8994 		}
8995 	}
8996 }
8997 
8998 /*
8999  * Attach the phci driver instances associated with the specified vhci class.
9000  * If root is mounted attach all phci driver instances.
9001  * If root is not mounted, attach the instances of only those phci
9002  * drivers that have the root support.
9003  */
9004 static void
9005 attach_phci_drivers(char *vhci_class)
9006 {
9007 	char	**driver_list, **p;
9008 	int	*root_support_list;
9009 	int	cur_elements, max_elements, i;
9010 	major_t	m;
9011 
9012 	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
9013 	    &cur_elements, &max_elements);
9014 
9015 	for (i = 0; i < cur_elements; i++) {
9016 		if (modrootloaded || root_support_list[i]) {
9017 			m = ddi_name_to_major(driver_list[i]);
9018 			if (m != DDI_MAJOR_T_NONE &&
9019 			    ddi_hold_installed_driver(m))
9020 				ddi_rele_driver(m);
9021 		}
9022 	}
9023 
9024 	if (driver_list) {
9025 		for (i = 0, p = driver_list; i < cur_elements; i++, p++)
9026 			kmem_free(*p, strlen(*p) + 1);
9027 		kmem_free(driver_list, sizeof (char *) * max_elements);
9028 		kmem_free(root_support_list, sizeof (int) * max_elements);
9029 	}
9030 }
9031 
9032 /*
9033  * Build vhci cache:
9034  *
9035  * Attach phci driver instances and then drive BUS_CONFIG_ALL on
9036  * the phci driver instances. During this process the cache gets built.
9037  *
9038  * Cache is built fully if the root is mounted.
9039  * If the root is not mounted, phci drivers that do not have root support
9040  * are not attached. As a result the cache is built partially. The entries
9041  * in the cache reflect only those phci drivers that have root support.
9042  */
9043 static int
9044 build_vhci_cache(mdi_vhci_t *vh)
9045 {
9046 	mdi_vhci_config_t *vhc = vh->vh_config;
9047 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9048 
9049 	single_threaded_vhconfig_enter(vhc);
9050 
9051 	rw_enter(&vhcache->vhcache_lock, RW_READER);
9052 	if (vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE) {
9053 		rw_exit(&vhcache->vhcache_lock);
9054 		single_threaded_vhconfig_exit(vhc);
9055 		return (0);
9056 	}
9057 	rw_exit(&vhcache->vhcache_lock);
9058 
9059 	attach_phci_drivers(vh->vh_class);
9060 	bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE | NDI_NO_EVENT,
9061 	    BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);
9062 
9063 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
9064 	vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
9065 	rw_exit(&vhcache->vhcache_lock);
9066 
9067 	single_threaded_vhconfig_exit(vhc);
9068 	vhcache_dirty(vhc);
9069 	return (1);
9070 }
9071 
9072 /*
9073  * Determine if discovery of paths is needed.
9074  */
9075 static int
9076 vhcache_do_discovery(mdi_vhci_config_t *vhc)
9077 {
9078 	int rv = 1;
9079 
9080 	mutex_enter(&vhc->vhc_lock);
9081 	if (i_ddi_io_initialized() == 0) {
9082 		if (vhc->vhc_path_discovery_boot > 0) {
9083 			vhc->vhc_path_discovery_boot--;
9084 			goto out;
9085 		}
9086 	} else {
9087 		if (vhc->vhc_path_discovery_postboot > 0) {
9088 			vhc->vhc_path_discovery_postboot--;
9089 			goto out;
9090 		}
9091 	}
9092 
9093 	/*
9094 	 * Do full path discovery at most once per mdi_path_discovery_interval.
9095 	 * This is to avoid a series of full path discoveries when opening
9096 	 * stale /dev/[r]dsk links.
9097 	 */
9098 	if (mdi_path_discovery_interval != -1 &&
9099 	    lbolt64 >= vhc->vhc_path_discovery_cutoff_time)
9100 		goto out;
9101 
9102 	rv = 0;
9103 out:
9104 	mutex_exit(&vhc->vhc_lock);
9105 	return (rv);
9106 }
9107 
9108 /*
9109  * Discover all paths:
9110  *
9111  * Attach phci driver instances and then drive BUS_CONFIG_ALL on all the phci
9112  * driver instances. During this process all paths will be discovered.
9113  */
9114 static int
9115 vhcache_discover_paths(mdi_vhci_t *vh)
9116 {
9117 	mdi_vhci_config_t *vhc = vh->vh_config;
9118 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9119 	int rv = 0;
9120 
9121 	single_threaded_vhconfig_enter(vhc);
9122 
9123 	if (vhcache_do_discovery(vhc)) {
9124 		attach_phci_drivers(vh->vh_class);
9125 		bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE |
9126 		    NDI_NO_EVENT, BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);
9127 
9128 		mutex_enter(&vhc->vhc_lock);
9129 		vhc->vhc_path_discovery_cutoff_time = lbolt64 +
9130 		    mdi_path_discovery_interval * TICKS_PER_SECOND;
9131 		mutex_exit(&vhc->vhc_lock);
9132 		rv = 1;
9133 	}
9134 
9135 	single_threaded_vhconfig_exit(vhc);
9136 	return (rv);
9137 }
9138 
9139 /*
9140  * Generic vhci bus config implementation:
9141  *
9142  * Parameters
9143  *	vdip	vhci dip
9144  *	flags	bus config flags
9145  *	op	bus config operation
9146  *	The remaining parameters are bus config operation specific
9147  *
9148  * for BUS_CONFIG_ONE
9149  *	arg	pointer to name@addr
9150  *	child	upon successful return from this function, *child will be
9151  *		set to the configured and held devinfo child node of vdip.
9152  *	ct_addr	pointer to client address (i.e. GUID)
9153  *
9154  * for BUS_CONFIG_DRIVER
9155  *	arg	major number of the driver
9156  *	child and ct_addr parameters are ignored
9157  *
9158  * for BUS_CONFIG_ALL
9159  *	arg, child, and ct_addr parameters are ignored
9160  *
9161  * Note that for the rest of the bus config operations, this function simply
9162  * calls the framework provided default bus config routine.
9163  */
9164 int
9165 mdi_vhci_bus_config(dev_info_t *vdip, uint_t flags, ddi_bus_config_op_t op,
9166     void *arg, dev_info_t **child, char *ct_addr)
9167 {
9168 	mdi_vhci_t *vh = i_devi_get_vhci(vdip);
9169 	mdi_vhci_config_t *vhc = vh->vh_config;
9170 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9171 	int rv = 0;
9172 	int params_valid = 0;
9173 	char *cp;
9174 
9175 	/*
9176 	 * To bus config vhcis we relay operation, possibly using another
9177 	 * thread, to phcis. The phci driver then interacts with MDI to cause
9178 	 * vhci child nodes to be enumerated under the vhci node.  Adding a
9179 	 * vhci child requires an ndi_devi_enter of the vhci. Since another
9180 	 * thread may be adding the child, to avoid deadlock we can't wait
9181 	 * for the relayed operations to complete if we have already entered
9182 	 * the vhci node.
9183 	 */
9184 	if (DEVI_BUSY_OWNED(vdip)) {
9185 		MDI_DEBUG(2, (MDI_NOTE, vdip,
9186 		    "vhci dip is busy owned %p", (void *)vdip));
9187 		goto default_bus_config;
9188 	}
9189 
9190 	rw_enter(&vhcache->vhcache_lock, RW_READER);
9191 	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
9192 		rw_exit(&vhcache->vhcache_lock);
9193 		rv = build_vhci_cache(vh);
9194 		rw_enter(&vhcache->vhcache_lock, RW_READER);
9195 	}
9196 
9197 	switch (op) {
9198 	case BUS_CONFIG_ONE:
9199 		if (arg != NULL && ct_addr != NULL) {
9200 			/* extract node name */
9201 			cp = (char *)arg;
9202 			while (*cp != '\0' && *cp != '@')
9203 				cp++;
9204 			if (*cp == '@') {
9205 				params_valid = 1;
9206 				*cp = '\0';
9207 				config_client_paths(vhc, (char *)arg, ct_addr);
9208 				/* config_client_paths() releases cache_lock */
9209 				*cp = '@';
9210 				break;
9211 			}
9212 		}
9213 
9214 		rw_exit(&vhcache->vhcache_lock);
9215 		break;
9216 
9217 	case BUS_CONFIG_DRIVER:
9218 		rw_exit(&vhcache->vhcache_lock);
9219 		if (rv == 0)
9220 			st_bus_config_all_phcis(vhc, flags, op,
9221 			    (major_t)(uintptr_t)arg);
9222 		break;
9223 
9224 	case BUS_CONFIG_ALL:
9225 		rw_exit(&vhcache->vhcache_lock);
9226 		if (rv == 0)
9227 			st_bus_config_all_phcis(vhc, flags, op, -1);
9228 		break;
9229 
9230 	default:
9231 		rw_exit(&vhcache->vhcache_lock);
9232 		break;
9233 	}
9234 
9235 
9236 default_bus_config:
9237 	/*
9238 	 * All requested child nodes are enumerated under the vhci.
9239 	 * Now configure them.
9240 	 */
9241 	if (ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
9242 	    NDI_SUCCESS) {
9243 		return (MDI_SUCCESS);
9244 	} else if (op == BUS_CONFIG_ONE && rv == 0 && params_valid) {
9245 		/* discover all paths and try configuring again */
9246 		if (vhcache_discover_paths(vh) &&
9247 		    ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
9248 		    NDI_SUCCESS)
9249 			return (MDI_SUCCESS);
9250 	}
9251 
9252 	return (MDI_FAILURE);
9253 }
9254 
9255 /*
9256  * Read the on-disk vhci cache into an nvlist for the specified vhci class.
9257  */
9258 static nvlist_t *
9259 read_on_disk_vhci_cache(char *vhci_class)
9260 {
9261 	nvlist_t *nvl;
9262 	int err;
9263 	char *filename;
9264 
9265 	filename = vhclass2vhcache_filename(vhci_class);
9266 
9267 	if ((err = fread_nvlist(filename, &nvl)) == 0) {
9268 		kmem_free(filename, strlen(filename) + 1);
9269 		return (nvl);
9270 	} else if (err == EIO)
9271 		cmn_err(CE_WARN, "%s: I/O error, will recreate", filename);
9272 	else if (err == EINVAL)
9273 		cmn_err(CE_WARN,
9274 		    "%s: data file corrupted, will recreate", filename);
9275 
9276 	kmem_free(filename, strlen(filename) + 1);
9277 	return (NULL);
9278 }
9279 
9280 /*
9281  * Read on-disk vhci cache into nvlists for all vhci classes.
9282  * Called during booting by i_ddi_read_devices_files().
9283  */
9284 void
9285 mdi_read_devices_files(void)
9286 {
9287 	int i;
9288 
9289 	for (i = 0; i < N_VHCI_CLASSES; i++)
9290 		vhcache_nvl[i] = read_on_disk_vhci_cache(vhci_class_list[i]);
9291 }
9292 
9293 /*
9294  * Remove all stale entries from vhci cache.
9295  */
9296 static void
9297 clean_vhcache(mdi_vhci_config_t *vhc)
9298 {
9299 	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
9300 	mdi_vhcache_phci_t *cphci, *cphci_head, *cphci_next;
9301 	mdi_vhcache_client_t *cct, *cct_head, *cct_next;
9302 	mdi_vhcache_pathinfo_t *cpi, *cpi_head, *cpi_next;
9303 
9304 	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
9305 
9306 	cct_head = vhcache->vhcache_client_head;
9307 	vhcache->vhcache_client_head = vhcache->vhcache_client_tail = NULL;
9308 	for (cct = cct_head; cct != NULL; cct = cct_next) {
9309 		cct_next = cct->cct_next;
9310 
9311 		cpi_head = cct->cct_cpi_head;
9312 		cct->cct_cpi_head = cct->cct_cpi_tail = NULL;
9313 		for (cpi = cpi_head; cpi != NULL; cpi = cpi_next) {
9314 			cpi_next = cpi->cpi_next;
9315 			if (cpi->cpi_pip != NULL) {
9316 				ASSERT(cpi->cpi_cphci->cphci_phci != NULL);
9317 				enqueue_tail_vhcache_pathinfo(cct, cpi);
9318 			} else
9319 				free_vhcache_pathinfo(cpi);
9320 		}
9321 
9322 		if (cct->cct_cpi_head != NULL)
9323 			enqueue_vhcache_client(vhcache, cct);
9324 		else {
9325 			(void) mod_hash_destroy(vhcache->vhcache_client_hash,
9326 			    (mod_hash_key_t)cct->cct_name_addr);
9327 			free_vhcache_client(cct);
9328 		}
9329 	}
9330 
9331 	cphci_head = vhcache->vhcache_phci_head;
9332 	vhcache->vhcache_phci_head = vhcache->vhcache_phci_tail = NULL;
9333 	for (cphci = cphci_head; cphci != NULL; cphci = cphci_next) {
9334 		cphci_next = cphci->cphci_next;
9335 		if (cphci->cphci_phci != NULL)
9336 			enqueue_vhcache_phci(vhcache, cphci);
9337 		else
9338 			free_vhcache_phci(cphci);
9339 	}
9340 
9341 	vhcache->vhcache_clean_time = lbolt64;
9342 	rw_exit(&vhcache->vhcache_lock);
9343 	vhcache_dirty(vhc);
9344 }
9345 
9346 /*
9347  * Remove all stale entries from vhci cache.
9348  * Called by i_ddi_clean_devices_files() during the execution of devfsadm -C
9349  */
9350 void
9351 mdi_clean_vhcache(void)
9352 {
9353 	mdi_vhci_t *vh;
9354 
9355 	mutex_enter(&mdi_mutex);
9356 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9357 		vh->vh_refcnt++;
9358 		mutex_exit(&mdi_mutex);
9359 		clean_vhcache(vh->vh_config);
9360 		mutex_enter(&mdi_mutex);
9361 		vh->vh_refcnt--;
9362 	}
9363 	mutex_exit(&mdi_mutex);
9364 }
9365 
9366 /*
9367  * mdi_vhci_walk_clients():
9368  *		Walker routine to traverse client dev_info nodes
9369  * ddi_walk_devs(ddi_get_child(vdip), f, arg) returns the entire tree
9370  * below the client, including nexus devices, which we dont want.
9371  * So we just traverse the immediate siblings, starting from 1st client.
9372  */
9373 void
9374 mdi_vhci_walk_clients(dev_info_t *vdip,
9375     int (*f)(dev_info_t *, void *), void *arg)
9376 {
9377 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9378 	dev_info_t	*cdip;
9379 	mdi_client_t	*ct;
9380 
9381 	MDI_VHCI_CLIENT_LOCK(vh);
9382 	cdip = ddi_get_child(vdip);
9383 	while (cdip) {
9384 		ct = i_devi_get_client(cdip);
9385 		MDI_CLIENT_LOCK(ct);
9386 
9387 		if (((*f)(cdip, arg)) == DDI_WALK_CONTINUE)
9388 			cdip = ddi_get_next_sibling(cdip);
9389 		else
9390 			cdip = NULL;
9391 
9392 		MDI_CLIENT_UNLOCK(ct);
9393 	}
9394 	MDI_VHCI_CLIENT_UNLOCK(vh);
9395 }
9396 
9397 /*
9398  * mdi_vhci_walk_phcis():
9399  *		Walker routine to traverse phci dev_info nodes
9400  */
9401 void
9402 mdi_vhci_walk_phcis(dev_info_t *vdip,
9403     int (*f)(dev_info_t *, void *), void *arg)
9404 {
9405 	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
9406 	mdi_phci_t	*ph, *next;
9407 
9408 	MDI_VHCI_PHCI_LOCK(vh);
9409 	ph = vh->vh_phci_head;
9410 	while (ph) {
9411 		MDI_PHCI_LOCK(ph);
9412 
9413 		if (((*f)(ph->ph_dip, arg)) == DDI_WALK_CONTINUE)
9414 			next = ph->ph_next;
9415 		else
9416 			next = NULL;
9417 
9418 		MDI_PHCI_UNLOCK(ph);
9419 		ph = next;
9420 	}
9421 	MDI_VHCI_PHCI_UNLOCK(vh);
9422 }
9423 
9424 
9425 /*
9426  * mdi_walk_vhcis():
9427  *		Walker routine to traverse vhci dev_info nodes
9428  */
9429 void
9430 mdi_walk_vhcis(int (*f)(dev_info_t *, void *), void *arg)
9431 {
9432 	mdi_vhci_t	*vh = NULL;
9433 
9434 	mutex_enter(&mdi_mutex);
9435 	/*
9436 	 * Scan for already registered vhci
9437 	 */
9438 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
9439 		vh->vh_refcnt++;
9440 		mutex_exit(&mdi_mutex);
9441 		if (((*f)(vh->vh_dip, arg)) != DDI_WALK_CONTINUE) {
9442 			mutex_enter(&mdi_mutex);
9443 			vh->vh_refcnt--;
9444 			break;
9445 		} else {
9446 			mutex_enter(&mdi_mutex);
9447 			vh->vh_refcnt--;
9448 		}
9449 	}
9450 
9451 	mutex_exit(&mdi_mutex);
9452 }
9453 
9454 /*
9455  * i_mdi_log_sysevent():
9456  *		Logs events for pickup by syseventd
9457  */
9458 static void
9459 i_mdi_log_sysevent(dev_info_t *dip, char *ph_vh_class, char *subclass)
9460 {
9461 	char		*path_name;
9462 	nvlist_t	*attr_list;
9463 
9464 	if (nvlist_alloc(&attr_list, NV_UNIQUE_NAME_TYPE,
9465 	    KM_SLEEP) != DDI_SUCCESS) {
9466 		goto alloc_failed;
9467 	}
9468 
9469 	path_name = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
9470 	(void) ddi_pathname(dip, path_name);
9471 
9472 	if (nvlist_add_string(attr_list, DDI_DRIVER_NAME,
9473 	    ddi_driver_name(dip)) != DDI_SUCCESS) {
9474 		goto error;
9475 	}
9476 
9477 	if (nvlist_add_int32(attr_list, DDI_DRIVER_MAJOR,
9478 	    (int32_t)ddi_driver_major(dip)) != DDI_SUCCESS) {
9479 		goto error;
9480 	}
9481 
9482 	if (nvlist_add_int32(attr_list, DDI_INSTANCE,
9483 	    (int32_t)ddi_get_instance(dip)) != DDI_SUCCESS) {
9484 		goto error;
9485 	}
9486 
9487 	if (nvlist_add_string(attr_list, DDI_PATHNAME,
9488 	    path_name) != DDI_SUCCESS) {
9489 		goto error;
9490 	}
9491 
9492 	if (nvlist_add_string(attr_list, DDI_CLASS,
9493 	    ph_vh_class) != DDI_SUCCESS) {
9494 		goto error;
9495 	}
9496 
9497 	(void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW, EC_DDI, subclass,
9498 	    attr_list, NULL, DDI_SLEEP);
9499 
9500 error:
9501 	kmem_free(path_name, MAXPATHLEN);
9502 	nvlist_free(attr_list);
9503 	return;
9504 
9505 alloc_failed:
9506 	MDI_DEBUG(1, (MDI_WARN, dip, "!unable to send sysevent"));
9507 }
9508 
9509 char **
9510 mdi_get_phci_driver_list(char *vhci_class, int	*ndrivers)
9511 {
9512 	char	**driver_list, **ret_driver_list = NULL;
9513 	int	*root_support_list;
9514 	int	cur_elements, max_elements;
9515 
9516 	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
9517 	    &cur_elements, &max_elements);
9518 
9519 
9520 	if (driver_list) {
9521 		kmem_free(root_support_list, sizeof (int) * max_elements);
9522 		ret_driver_list = mdi_realloc(driver_list, sizeof (char *)
9523 		    * max_elements, sizeof (char *) * cur_elements);
9524 	}
9525 	*ndrivers = cur_elements;
9526 
9527 	return (ret_driver_list);
9528 
9529 }
9530 
/*
 * Free a driver list of ndrivers entries: first each name string, then the
 * array itself. A NULL driver_list is a no-op.
 */
void
mdi_free_phci_driver_list(char **driver_list, int ndrivers)
{
	int	idx;

	if (driver_list == NULL)
		return;

	for (idx = 0; idx < ndrivers; idx++)
		kmem_free(driver_list[idx], strlen(driver_list[idx]) + 1);

	kmem_free(driver_list, sizeof (char *) * ndrivers);
}
9543 
9544 /*
9545  * mdi_is_dev_supported():
9546  *		function called by pHCI bus config operation to determine if a
9547  *		device should be represented as a child of the vHCI or the
9548  *		pHCI.  This decision is made by the vHCI, using cinfo idenity
9549  *		information passed by the pHCI - specifics of the cinfo
9550  *		representation are by agreement between the pHCI and vHCI.
9551  * Return Values:
9552  *		MDI_SUCCESS
9553  *		MDI_FAILURE
9554  */
9555 int
9556 mdi_is_dev_supported(char *class, dev_info_t *pdip, void *cinfo)
9557 {
9558 	mdi_vhci_t	*vh;
9559 
9560 	ASSERT(class && pdip);
9561 
9562 	/*
9563 	 * For dev_supported, mdi_phci_register() must have established pdip as
9564 	 * a pHCI.
9565 	 *
9566 	 * NOTE: mdi_phci_register() does "mpxio-disable" processing, and
9567 	 * MDI_PHCI(pdip) will return false if mpxio is disabled.
9568 	 */
9569 	if (!MDI_PHCI(pdip))
9570 		return (MDI_FAILURE);
9571 
9572 	/* Return MDI_FAILURE if vHCI does not support asking the question. */
9573 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
9574 	if ((vh == NULL) || (vh->vh_ops->vo_is_dev_supported == NULL)) {
9575 		return (MDI_FAILURE);
9576 	}
9577 
9578 	/* Return vHCI answer */
9579 	return (vh->vh_ops->vo_is_dev_supported(vh->vh_dip, pdip, cinfo));
9580 }
9581 
9582 int
9583 mdi_dc_return_dev_state(mdi_pathinfo_t *pip, struct devctl_iocdata *dcp)
9584 {
9585 	uint_t devstate = 0;
9586 	dev_info_t *cdip;
9587 
9588 	if ((pip == NULL) || (dcp == NULL))
9589 		return (MDI_FAILURE);
9590 
9591 	cdip = mdi_pi_get_client(pip);
9592 
9593 	switch (mdi_pi_get_state(pip)) {
9594 	case MDI_PATHINFO_STATE_INIT:
9595 		devstate = DEVICE_DOWN;
9596 		break;
9597 	case MDI_PATHINFO_STATE_ONLINE:
9598 		devstate = DEVICE_ONLINE;
9599 		if ((cdip) && (devi_stillreferenced(cdip) == DEVI_REFERENCED))
9600 			devstate |= DEVICE_BUSY;
9601 		break;
9602 	case MDI_PATHINFO_STATE_STANDBY:
9603 		devstate = DEVICE_ONLINE;
9604 		break;
9605 	case MDI_PATHINFO_STATE_FAULT:
9606 		devstate = DEVICE_DOWN;
9607 		break;
9608 	case MDI_PATHINFO_STATE_OFFLINE:
9609 		devstate = DEVICE_OFFLINE;
9610 		break;
9611 	default:
9612 		ASSERT(MDI_PI(pip)->pi_state);
9613 	}
9614 
9615 	if (copyout(&devstate, dcp->cpyout_buf, sizeof (uint_t)) != 0)
9616 		return (MDI_FAILURE);
9617 
9618 	return (MDI_SUCCESS);
9619 }
9620