xref: /titanic_50/usr/src/uts/common/os/sunmdi.c (revision 61961e0f20c7637a3846bb39786bb9dffa91dfb9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more
30  * detailed discussion of the overall mpxio architecture.
31  *
32  * Default locking order:
33  *
34  * _NOTE(LOCK_ORDER(mdi_mutex, mdi_phci::ph_mutex))
35  * _NOTE(LOCK_ORDER(mdi_mutex, mdi_client::ct_mutex))
36  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
37  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
38  * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
39  */
40 
41 #include <sys/note.h>
42 #include <sys/types.h>
43 #include <sys/varargs.h>
44 #include <sys/param.h>
45 #include <sys/errno.h>
46 #include <sys/uio.h>
47 #include <sys/buf.h>
48 #include <sys/modctl.h>
49 #include <sys/open.h>
50 #include <sys/kmem.h>
51 #include <sys/poll.h>
52 #include <sys/conf.h>
53 #include <sys/bootconf.h>
54 #include <sys/cmn_err.h>
55 #include <sys/stat.h>
56 #include <sys/ddi.h>
57 #include <sys/sunddi.h>
58 #include <sys/ddipropdefs.h>
59 #include <sys/sunndi.h>
60 #include <sys/ndi_impldefs.h>
61 #include <sys/promif.h>
62 #include <sys/sunmdi.h>
63 #include <sys/mdi_impldefs.h>
64 #include <sys/taskq.h>
65 #include <sys/epm.h>
66 #include <sys/sunpm.h>
67 
#ifdef	DEBUG
#include <sys/debug.h>
int	mdi_debug = 1;
/*
 * MDI_DEBUG(level, stmnt):
 *		Emit a debug message through i_mdi_log() when mdi_debug is at
 *		least 'level'.  'stmnt' is the complete, parenthesized
 *		i_mdi_log() argument list, e.g.
 *			MDI_DEBUG(1, (CE_NOTE, dip, "!message"));
 *
 *		The expansion carries its own 'else' so that an 'else' which
 *		follows an unbraced "if (x) MDI_DEBUG(...);" binds to the
 *		caller's 'if' and not to the test hidden inside the macro.
 *		This keeps DEBUG kernels consistent with the empty non-DEBUG
 *		definition below.
 */
#define	MDI_DEBUG(level, stmnt) \
	    if (mdi_debug < (level)) { } else i_mdi_log stmnt
static void i_mdi_log(int, dev_info_t *, const char *fmt, ...);
#else	/* !DEBUG */
/* Debug messages compile away entirely in non-DEBUG kernels. */
#define	MDI_DEBUG(level, stmnt)
#endif	/* DEBUG */
77 
78 extern pri_t	minclsyspri;
79 extern int	modrootloaded;
80 
81 /*
82  * Global mutex:
83  * Protects vHCI list and structure members, pHCI and Client lists.
84  */
85 kmutex_t	mdi_mutex;
86 
87 /*
88  * Registered vHCI class driver lists
89  */
90 int		mdi_vhci_count;
91 mdi_vhci_t	*mdi_vhci_head;
92 mdi_vhci_t	*mdi_vhci_tail;
93 
94 /*
95  * Client Hash Table size
96  */
97 static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;
98 
99 /*
100  * taskq interface definitions
101  */
102 #define	MDI_TASKQ_N_THREADS	8
103 #define	MDI_TASKQ_PRI		minclsyspri
104 #define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
105 #define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)
106 
107 taskq_t				*mdi_taskq;
108 static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;
109 
110 static int		mdi_max_bus_config_threads = 100;
111 /*
112  * To reduce unnecessary BUS_CONFIG_ALLs, do not BUS_CONFIG_ALL phcis in the
113  * context of a BUS_CONFIG_ONE if a BUS_CONFIG_ALL has already been performed
114  * in the last mdi_bus_config_timeout seconds.
115  */
116 static int		mdi_bus_config_timeout = 60;	/* in seconds */
117 
118 /*
119  * MDI component property name/value string definitions
120  */
121 const char 		*mdi_component_prop = "mpxio-component";
122 const char		*mdi_component_prop_vhci = "vhci";
123 const char		*mdi_component_prop_phci = "phci";
124 const char		*mdi_component_prop_client = "client";
125 
126 /*
127  * MDI client global unique identifier property name
128  */
129 const char		*mdi_client_guid_prop = "client-guid";
130 
131 /*
132  * MDI client load balancing property name/value string definitions
133  */
134 const char		*mdi_load_balance = "load-balance";
135 const char		*mdi_load_balance_none = "none";
136 const char		*mdi_load_balance_rr = "round-robin";
137 const char		*mdi_load_balance_lba = "logical-block";
138 
139 /*
140  * Obsolete vHCI class definition; to be removed after Leadville update
141  */
142 const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;
143 
144 static char vhci_greeting[] =
145 	"\tThere already exists one vHCI driver for class %s\n"
146 	"\tOnly one vHCI driver for each class is allowed\n";
147 
148 /*
149  * Static function prototypes
150  */
151 static int		i_mdi_phci_offline(dev_info_t *, uint_t);
152 static int		i_mdi_client_offline(dev_info_t *, uint_t);
153 static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
154 static void		i_mdi_phci_post_detach(dev_info_t *,
155 			    ddi_detach_cmd_t, int);
156 static int		i_mdi_client_pre_detach(dev_info_t *,
157 			    ddi_detach_cmd_t);
158 static void		i_mdi_client_post_detach(dev_info_t *,
159 			    ddi_detach_cmd_t, int);
160 static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
161 static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
162 static int 		i_mdi_lba_lb(mdi_client_t *ct,
163 			    mdi_pathinfo_t **ret_pip, struct buf *buf);
164 static void		i_mdi_pm_hold_client(mdi_client_t *, int);
165 static void		i_mdi_pm_rele_client(mdi_client_t *, int);
166 static void		i_mdi_pm_reset_client(mdi_client_t *);
167 static void		i_mdi_pm_hold_all_phci(mdi_client_t *);
168 static int		i_mdi_power_all_phci(mdi_client_t *);
169 
170 
171 /*
172  * Internal mdi_pathinfo node functions
173  */
174 static int		i_mdi_pi_kstat_create(mdi_pathinfo_t *);
175 static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);
176 
177 static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
178 static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
179 static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
180 static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
181 static void		i_mdi_phci_get_client_lock(mdi_phci_t *,
182 			    mdi_client_t *);
183 static void		i_mdi_phci_unlock(mdi_phci_t *);
184 static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *,
185 			    mdi_client_t *, int);
186 static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
187 static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
188 static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
189 			    mdi_client_t *);
190 static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
191 static void		i_mdi_client_remove_path(mdi_client_t *,
192 			    mdi_pathinfo_t *);
193 
194 static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
195 			    mdi_pathinfo_state_t, int);
196 static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
197 static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
198 			    char **, int, int);
199 static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
200 static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
201 static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
202 static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *, int);
203 static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
204 static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
205 static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *);
206 static void		i_mdi_client_update_state(mdi_client_t *);
207 static int		i_mdi_client_compute_state(mdi_client_t *,
208 			    mdi_phci_t *);
209 static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
210 static void		i_mdi_client_unlock(mdi_client_t *);
211 static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
212 static mdi_client_t	*i_devi_get_client(dev_info_t *);
213 static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *, int,
214 			int);
215 /*
216  * Failover related function prototypes
217  */
218 static int		i_mdi_failover(void *);
219 
220 /*
221  * misc internal functions
222  */
223 static int		i_mdi_get_hash_key(char *);
224 static int		i_map_nvlist_error_to_mdi(int);
225 static void		i_mdi_report_path_state(mdi_client_t *,
226 			    mdi_pathinfo_t *);
227 
228 /* called once when first vhci registers with mdi */
229 static void
230 i_mdi_init()
231 {
232 	static int initialized = 0;
233 
234 	if (initialized)
235 		return;
236 	initialized = 1;
237 
238 	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);
239 	/*
240 	 * Create our taskq resources
241 	 */
242 	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
243 	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
244 	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
245 	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */
246 }
247 
248 /*
249  * mdi_get_component_type():
250  *		Return mpxio component type
251  * Return Values:
252  *		MDI_COMPONENT_NONE
253  *		MDI_COMPONENT_VHCI
254  *		MDI_COMPONENT_PHCI
255  *		MDI_COMPONENT_CLIENT
256  * XXX This doesn't work under multi-level MPxIO and should be
257  *	removed when clients migrate mdi_is_*() interfaces.
258  */
259 int
260 mdi_get_component_type(dev_info_t *dip)
261 {
262 	return (DEVI(dip)->devi_mdi_component);
263 }
264 
265 /*
266  * mdi_vhci_register():
267  *		Register a vHCI module with the mpxio framework
268  *		mdi_vhci_register() is called by vHCI drivers to register the
269  *		'class_driver' vHCI driver and its MDI entrypoints with the
270  *		mpxio framework.  The vHCI driver must call this interface as
271  *		part of its attach(9e) handler.
272  *		Competing threads may try to attach mdi_vhci_register() as
273  *		the vHCI drivers are loaded and attached as a result of pHCI
274  *		driver instance registration (mdi_phci_register()) with the
275  *		framework.
276  * Return Values:
277  *		MDI_SUCCESS
278  *		MDI_FAILURE
279  */
280 
281 /*ARGSUSED*/
282 int
283 mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
284     int flags)
285 {
286 	mdi_vhci_t		*vh = NULL;
287 
288 	ASSERT(vops->vo_revision == MDI_VHCI_OPS_REV);
289 
290 	i_mdi_init();
291 
292 	mutex_enter(&mdi_mutex);
293 	/*
294 	 * Scan for already registered vhci
295 	 */
296 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
297 		if (strcmp(vh->vh_class, class) == 0) {
298 			/*
299 			 * vHCI has already been created.  Check for valid
300 			 * vHCI ops registration.  We only support one vHCI
301 			 * module per class
302 			 */
303 			if (vh->vh_ops != NULL) {
304 				mutex_exit(&mdi_mutex);
305 				cmn_err(CE_NOTE, vhci_greeting, class);
306 				return (MDI_FAILURE);
307 			}
308 			break;
309 		}
310 	}
311 
312 	/*
313 	 * if not yet created, create the vHCI component
314 	 */
315 	if (vh == NULL) {
316 		struct client_hash	*hash = NULL;
317 		char			*load_balance;
318 
319 		/*
320 		 * Allocate and initialize the mdi extensions
321 		 */
322 		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
323 		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
324 		    KM_SLEEP);
325 		vh->vh_client_table = hash;
326 		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
327 		(void) strcpy(vh->vh_class, class);
328 		vh->vh_lb = LOAD_BALANCE_RR;
329 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
330 		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
331 			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
332 				vh->vh_lb = LOAD_BALANCE_NONE;
333 			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
334 				    == 0) {
335 				vh->vh_lb = LOAD_BALANCE_LBA;
336 			}
337 			ddi_prop_free(load_balance);
338 		}
339 
340 		/*
341 		 * Store the vHCI ops vectors
342 		 */
343 		vh->vh_dip = vdip;
344 		vh->vh_ops = vops;
345 
346 		/*
347 		 * other members of vh_bus_config are initialized by
348 		 * the above kmem_zalloc of the vhci structure.
349 		 */
350 		cv_init(&vh->vh_bus_config.vhc_cv, NULL, CV_DRIVER, NULL);
351 
352 		if (mdi_vhci_head == NULL) {
353 			mdi_vhci_head = vh;
354 		}
355 		if (mdi_vhci_tail) {
356 			mdi_vhci_tail->vh_next = vh;
357 		}
358 		mdi_vhci_tail = vh;
359 		mdi_vhci_count++;
360 	}
361 
362 	/*
363 	 * Claim the devfs node as a vhci component
364 	 */
365 	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;
366 
367 	/*
368 	 * Initialize our back reference from dev_info node
369 	 */
370 	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
371 	mutex_exit(&mdi_mutex);
372 	return (MDI_SUCCESS);
373 }
374 
375 /*
376  * mdi_vhci_unregister():
377  *		Unregister a vHCI module from mpxio framework
378  *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
379  * 		of a vhci to unregister it from the framework.
380  * Return Values:
381  *		MDI_SUCCESS
382  *		MDI_FAILURE
383  */
384 
385 /*ARGSUSED*/
386 int
387 mdi_vhci_unregister(dev_info_t *vdip, int flags)
388 {
389 	mdi_vhci_t	*found, *vh, *prev = NULL;
390 	mdi_phci_config_t *phc, *next_phc;
391 
392 	/*
393 	 * Check for invalid VHCI
394 	 */
395 	if ((vh = i_devi_get_vhci(vdip)) == NULL)
396 		return (MDI_FAILURE);
397 
398 	mutex_enter(&mdi_mutex);
399 
400 	/*
401 	 * Scan the list of registered vHCIs for a match
402 	 */
403 	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
404 		if (found == vh)
405 			break;
406 		prev = found;
407 	}
408 
409 	if (found == NULL) {
410 		mutex_exit(&mdi_mutex);
411 		return (MDI_FAILURE);
412 	}
413 
414 	/*
415 	 * Check the pHCI and client count. All the pHCIs and clients
416 	 * should have been unregistered, before a vHCI can be
417 	 * unregistered.
418 	 */
419 	if (vh->vh_phci_count || vh->vh_client_count) {
420 		MDI_DEBUG(1, (CE_NOTE, NULL,
421 		    "!mdi_vhci_unregister: pHCI in registered state.\n"));
422 		mutex_exit(&mdi_mutex);
423 		return (MDI_FAILURE);
424 	}
425 
426 	/*
427 	 * Remove the vHCI from the global list
428 	 */
429 	if (vh == mdi_vhci_head) {
430 		mdi_vhci_head = vh->vh_next;
431 	} else {
432 		prev->vh_next = vh->vh_next;
433 	}
434 	if (vh == mdi_vhci_tail) {
435 		mdi_vhci_tail = prev;
436 	}
437 
438 	vh->vh_ops = NULL;
439 	mdi_vhci_count--;
440 	mutex_exit(&mdi_mutex);
441 	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
442 	DEVI(vdip)->devi_mdi_xhci = NULL;
443 	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
444 	kmem_free(vh->vh_client_table,
445 	    mdi_client_table_size * sizeof (struct client_hash));
446 
447 	/*
448 	 * there must be no more tasks on the bus config taskq as the vhci
449 	 * driver can not be detached while bus config is in progress.
450 	 */
451 	ASSERT(vh->vh_bus_config.vhc_start_time == 0);
452 
453 	if (vh->vh_bus_config.vhc_taskq != NULL)
454 		taskq_destroy(vh->vh_bus_config.vhc_taskq);
455 
456 	for (phc = vh->vh_bus_config.vhc_phc; phc != NULL; phc = next_phc) {
457 		next_phc = phc->phc_next;
458 		kmem_free(phc, sizeof (*phc));
459 	}
460 
461 	cv_destroy(&vh->vh_bus_config.vhc_cv);
462 
463 	kmem_free(vh, sizeof (mdi_vhci_t));
464 	return (MDI_SUCCESS);
465 }
466 
467 /*
468  * i_mdi_vhci_class2vhci():
469  *		Look for a matching vHCI module given a vHCI class name
470  * Return Values:
471  *		Handle to a vHCI component
472  *		NULL
473  */
474 static mdi_vhci_t *
475 i_mdi_vhci_class2vhci(char *class)
476 {
477 	mdi_vhci_t	*vh = NULL;
478 
479 	ASSERT(!MUTEX_HELD(&mdi_mutex));
480 
481 	mutex_enter(&mdi_mutex);
482 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
483 		if (strcmp(vh->vh_class, class) == 0) {
484 			break;
485 		}
486 	}
487 	mutex_exit(&mdi_mutex);
488 	return (vh);
489 }
490 
491 /*
492  * i_devi_get_vhci():
493  *		Utility function to get the handle to a vHCI component
494  * Return Values:
495  *		Handle to a vHCI component
496  *		NULL
497  */
498 mdi_vhci_t *
499 i_devi_get_vhci(dev_info_t *vdip)
500 {
501 	mdi_vhci_t	*vh = NULL;
502 	if (MDI_VHCI(vdip)) {
503 		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
504 	}
505 	return (vh);
506 }
507 
508 /*
509  * mdi_phci_register():
510  *		Register a pHCI module with mpxio framework
511  *		mdi_phci_register() is called by pHCI drivers to register with
512  *		the mpxio framework and a specific 'class_driver' vHCI.  The
513  *		pHCI driver must call this interface as part of its attach(9e)
514  *		handler.
515  * Return Values:
516  *		MDI_SUCCESS
517  *		MDI_FAILURE
518  */
519 
520 /*ARGSUSED*/
521 int
522 mdi_phci_register(char *class, dev_info_t *pdip, int flags)
523 {
524 	mdi_phci_t		*ph;
525 	mdi_vhci_t		*vh;
526 	char			*data;
527 	char			*pathname;
528 
529 	pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
530 	(void) ddi_pathname(pdip, pathname);
531 
532 	/*
533 	 * Check for mpxio-disable property. Enable mpxio if the property is
534 	 * missing or not set to "yes".
535 	 * If the property is set to "yes" then emit a brief message.
536 	 */
537 	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
538 	    &data) == DDI_SUCCESS)) {
539 		if (strcmp(data, "yes") == 0) {
540 			MDI_DEBUG(1, (CE_CONT, pdip,
541 			    "?%s (%s%d) multipath capabilities "
542 			    "disabled via %s.conf.\n", pathname,
543 			    ddi_driver_name(pdip), ddi_get_instance(pdip),
544 			    ddi_driver_name(pdip)));
545 			ddi_prop_free(data);
546 			kmem_free(pathname, MAXPATHLEN);
547 			return (MDI_FAILURE);
548 		}
549 		ddi_prop_free(data);
550 	}
551 
552 	kmem_free(pathname, MAXPATHLEN);
553 
554 	/*
555 	 * Search for a matching vHCI
556 	 */
557 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
558 	if (vh == NULL) {
559 		return (MDI_FAILURE);
560 	}
561 
562 	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
563 	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
564 	ph->ph_dip = pdip;
565 	ph->ph_vhci = vh;
566 	ph->ph_next = NULL;
567 	ph->ph_unstable = 0;
568 	ph->ph_vprivate = 0;
569 	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);
570 	cv_init(&ph->ph_powerchange_cv, NULL, CV_DRIVER, NULL);
571 
572 	MDI_PHCI_SET_POWER_UP(ph);
573 	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
574 	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;
575 
576 	mutex_enter(&mdi_mutex);
577 	if (vh->vh_phci_head == NULL) {
578 		vh->vh_phci_head = ph;
579 	}
580 	if (vh->vh_phci_tail) {
581 		vh->vh_phci_tail->ph_next = ph;
582 	}
583 	vh->vh_phci_tail = ph;
584 	vh->vh_phci_count++;
585 	/* to force discovery of all phci children during busconfig */
586 	vh->vh_bus_config.vhc_cutoff_time = -1;
587 	mutex_exit(&mdi_mutex);
588 	return (MDI_SUCCESS);
589 }
590 
591 /*
592  * mdi_phci_unregister():
593  *		Unregister a pHCI module from mpxio framework
594  *		mdi_phci_unregister() is called by the pHCI drivers from their
595  *		detach(9E) handler to unregister their instances from the
596  *		framework.
597  * Return Values:
598  *		MDI_SUCCESS
599  *		MDI_FAILURE
600  */
601 
602 /*ARGSUSED*/
603 int
604 mdi_phci_unregister(dev_info_t *pdip, int flags)
605 {
606 	mdi_vhci_t		*vh;
607 	mdi_phci_t		*ph;
608 	mdi_phci_t		*tmp;
609 	mdi_phci_t		*prev = NULL;
610 
611 	ph = i_devi_get_phci(pdip);
612 	if (ph == NULL) {
613 		MDI_DEBUG(1, (CE_WARN, pdip,
614 		    "!pHCI unregister: Not a valid pHCI"));
615 		return (MDI_FAILURE);
616 	}
617 
618 	vh = ph->ph_vhci;
619 	ASSERT(vh != NULL);
620 	if (vh == NULL) {
621 		MDI_DEBUG(1, (CE_WARN, pdip,
622 		    "!pHCI unregister: Not a valid vHCI"));
623 		return (MDI_FAILURE);
624 	}
625 
626 	mutex_enter(&mdi_mutex);
627 	tmp = vh->vh_phci_head;
628 	while (tmp) {
629 		if (tmp == ph) {
630 			break;
631 		}
632 		prev = tmp;
633 		tmp = tmp->ph_next;
634 	}
635 
636 	if (ph == vh->vh_phci_head) {
637 		vh->vh_phci_head = ph->ph_next;
638 	} else {
639 		prev->ph_next = ph->ph_next;
640 	}
641 
642 	if (ph == vh->vh_phci_tail) {
643 		vh->vh_phci_tail = prev;
644 	}
645 
646 	vh->vh_phci_count--;
647 
648 	/*
649 	 * If no busconfig is in progress, release the phci busconfig resources.
650 	 * We only need vh->vh_phci_count of busconfig resources.
651 	 */
652 	if (vh->vh_bus_config.vhc_start_time == 0 &&
653 	    vh->vh_bus_config.vhc_phc_cnt > vh->vh_phci_count) {
654 		int count;
655 
656 		count = vh->vh_bus_config.vhc_phc_cnt - vh->vh_phci_count;
657 		while (count--) {
658 			mdi_phci_config_t *phc;
659 
660 			phc = vh->vh_bus_config.vhc_phc;
661 			vh->vh_bus_config.vhc_phc = phc->phc_next;
662 			kmem_free(phc, sizeof (*phc));
663 		}
664 		vh->vh_bus_config.vhc_phc_cnt = vh->vh_phci_count;
665 	}
666 
667 	mutex_exit(&mdi_mutex);
668 
669 	cv_destroy(&ph->ph_unstable_cv);
670 	cv_destroy(&ph->ph_powerchange_cv);
671 	mutex_destroy(&ph->ph_mutex);
672 	kmem_free(ph, sizeof (mdi_phci_t));
673 	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
674 	DEVI(pdip)->devi_mdi_xhci = NULL;
675 	return (MDI_SUCCESS);
676 }
677 
678 /*
679  * i_devi_get_phci():
680  * 		Utility function to return the phci extensions.
681  */
682 static mdi_phci_t *
683 i_devi_get_phci(dev_info_t *pdip)
684 {
685 	mdi_phci_t	*ph = NULL;
686 	if (MDI_PHCI(pdip)) {
687 		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
688 	}
689 	return (ph);
690 }
691 
692 /*
693  * mdi_phci_path2devinfo():
694  * 		Utility function to search for a valid phci device given
695  *		the devfs pathname.
696  */
697 
698 dev_info_t *
699 mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
700 {
701 	char		*temp_pathname;
702 	mdi_vhci_t	*vh;
703 	mdi_phci_t	*ph;
704 	dev_info_t 	*pdip = NULL;
705 
706 	vh = i_devi_get_vhci(vdip);
707 	ASSERT(vh != NULL);
708 
709 	if (vh == NULL) {
710 		/*
711 		 * Invalid vHCI component, return failure
712 		 */
713 		return (NULL);
714 	}
715 
716 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
717 	mutex_enter(&mdi_mutex);
718 	ph = vh->vh_phci_head;
719 	while (ph != NULL) {
720 		pdip = ph->ph_dip;
721 		ASSERT(pdip != NULL);
722 		*temp_pathname = '\0';
723 		(void) ddi_pathname(pdip, temp_pathname);
724 		if (strcmp(temp_pathname, pathname) == 0) {
725 			break;
726 		}
727 		ph = ph->ph_next;
728 	}
729 	if (ph == NULL) {
730 		pdip = NULL;
731 	}
732 	mutex_exit(&mdi_mutex);
733 	kmem_free(temp_pathname, MAXPATHLEN);
734 	return (pdip);
735 }
736 
737 /*
738  * mdi_phci_get_path_count():
739  * 		get number of path information nodes associated with a given
740  *		pHCI device.
741  */
742 int
743 mdi_phci_get_path_count(dev_info_t *pdip)
744 {
745 	mdi_phci_t	*ph;
746 	int		count = 0;
747 
748 	ph = i_devi_get_phci(pdip);
749 	if (ph != NULL) {
750 		count = ph->ph_path_count;
751 	}
752 	return (count);
753 }
754 
755 /*
756  * i_mdi_phci_lock():
757  *		Lock a pHCI device
758  * Return Values:
759  *		None
760  * Note:
761  *		The default locking order is:
762  *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
763  *		But there are number of situations where locks need to be
764  *		grabbed in reverse order.  This routine implements try and lock
765  *		mechanism depending on the requested parameter option.
766  */
767 static void
768 i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
769 {
770 	if (pip) {
771 		/* Reverse locking is requested. */
772 		while (MDI_PHCI_TRYLOCK(ph) == 0) {
773 			/*
774 			 * tryenter failed. Try to grab again
775 			 * after a small delay
776 			 */
777 			MDI_PI_HOLD(pip);
778 			MDI_PI_UNLOCK(pip);
779 			delay(1);
780 			MDI_PI_LOCK(pip);
781 			MDI_PI_RELE(pip);
782 		}
783 	} else {
784 		MDI_PHCI_LOCK(ph);
785 	}
786 }
787 
788 /*
789  * i_mdi_phci_get_client_lock():
790  *		Lock a pHCI device
791  * Return Values:
792  *		None
793  * Note:
794  *		The default locking order is:
795  *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
796  *		But there are number of situations where locks need to be
797  *		grabbed in reverse order.  This routine implements try and lock
798  *		mechanism depending on the requested parameter option.
799  */
800 static void
801 i_mdi_phci_get_client_lock(mdi_phci_t *ph, mdi_client_t *ct)
802 {
803 	if (ct) {
804 		/* Reverse locking is requested. */
805 		while (MDI_PHCI_TRYLOCK(ph) == 0) {
806 			/*
807 			 * tryenter failed. Try to grab again
808 			 * after a small delay
809 			 */
810 			MDI_CLIENT_UNLOCK(ct);
811 			delay(1);
812 			MDI_CLIENT_LOCK(ct);
813 		}
814 	} else {
815 		MDI_PHCI_LOCK(ph);
816 	}
817 }
818 
819 /*
820  * i_mdi_phci_unlock():
821  *		Unlock the pHCI component
822  */
823 static void
824 i_mdi_phci_unlock(mdi_phci_t *ph)
825 {
826 	MDI_PHCI_UNLOCK(ph);
827 }
828 
829 /*
830  * i_mdi_devinfo_create():
831  *		create client device's devinfo node
832  * Return Values:
833  *		dev_info
834  *		NULL
835  * Notes:
836  */
837 static dev_info_t *
838 i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
839 	char **compatible, int ncompatible, int flags)
840 {
841 	dev_info_t *cdip = NULL;
842 
843 	ASSERT(MUTEX_HELD(&mdi_mutex));
844 
845 	/* Verify for duplicate entry */
846 	cdip = i_mdi_devinfo_find(vh, name, guid);
847 	ASSERT(cdip == NULL);
848 	if (cdip) {
849 		cmn_err(CE_WARN,
850 		    "i_mdi_devinfo_create: client dip %p already exists",
851 			(void *)cdip);
852 	}
853 
854 	if (flags == DDI_SLEEP) {
855 		ndi_devi_alloc_sleep(vh->vh_dip, name,
856 		    DEVI_SID_NODEID, &cdip);
857 	} else {
858 		(void) ndi_devi_alloc(vh->vh_dip, name,
859 		    DEVI_SID_NODEID, &cdip);
860 	}
861 	if (cdip == NULL)
862 		goto fail;
863 
864 	/*
865 	 * Create component type and Global unique identifier
866 	 * properties
867 	 */
868 	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
869 	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
870 		goto fail;
871 	}
872 
873 	/* Decorate the node with compatible property */
874 	if (compatible &&
875 	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
876 	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
877 		goto fail;
878 	}
879 
880 	return (cdip);
881 
882 fail:
883 	if (cdip) {
884 		(void) ndi_prop_remove_all(cdip);
885 		(void) ndi_devi_free(cdip);
886 	}
887 	return (NULL);
888 }
889 
890 /*
891  * i_mdi_devinfo_find():
892  *		Find a matching devinfo node for given client node name
893  *		and its guid.
894  * Return Values:
895  *		Handle to a dev_info node or NULL
896  */
897 
898 static dev_info_t *
899 i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
900 {
901 	char			*data;
902 	dev_info_t 		*cdip = NULL;
903 	dev_info_t 		*ndip = NULL;
904 	int			circular;
905 
906 	ndi_devi_enter(vh->vh_dip, &circular);
907 	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
908 	while ((cdip = ndip) != NULL) {
909 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
910 
911 		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
912 			continue;
913 		}
914 
915 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
916 		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
917 		    &data) != DDI_PROP_SUCCESS) {
918 			continue;
919 		}
920 
921 		if (strcmp(data, guid) != 0) {
922 			ddi_prop_free(data);
923 			continue;
924 		}
925 		ddi_prop_free(data);
926 		break;
927 	}
928 	ndi_devi_exit(vh->vh_dip, circular);
929 	return (cdip);
930 }
931 
932 /*
933  * i_mdi_devinfo_remove():
934  *		Remove a client device node
935  */
936 static int
937 i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
938 {
939 	int	rv = MDI_SUCCESS;
940 	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
941 	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
942 		rv = ndi_devi_offline(cdip, NDI_DEVI_REMOVE);
943 		if (rv != NDI_SUCCESS) {
944 			MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_devinfo_remove:"
945 			    " failed. cdip = %p\n", cdip));
946 		}
947 		/*
948 		 * Convert to MDI error code
949 		 */
950 		switch (rv) {
951 		case NDI_SUCCESS:
952 			rv = MDI_SUCCESS;
953 			break;
954 		case NDI_BUSY:
955 			rv = MDI_BUSY;
956 			break;
957 		default:
958 			rv = MDI_FAILURE;
959 			break;
960 		}
961 	}
962 	return (rv);
963 }
964 
965 /*
966  * i_devi_get_client()
967  *		Utility function to get mpxio component extensions
968  */
969 static mdi_client_t *
970 i_devi_get_client(dev_info_t *cdip)
971 {
972 	mdi_client_t	*ct = NULL;
973 	if (MDI_CLIENT(cdip)) {
974 		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
975 	}
976 	return (ct);
977 }
978 
979 /*
980  * i_mdi_is_child_present():
981  *		Search for the presence of client device dev_info node
982  */
983 
984 static int
985 i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
986 {
987 	int		rv = MDI_FAILURE;
988 	struct dev_info	*dip;
989 	int		circular;
990 
991 	ndi_devi_enter(vdip, &circular);
992 	dip = DEVI(vdip)->devi_child;
993 	while (dip) {
994 		if (dip == DEVI(cdip)) {
995 			rv = MDI_SUCCESS;
996 			break;
997 		}
998 		dip = dip->devi_sibling;
999 	}
1000 	ndi_devi_exit(vdip, circular);
1001 	return (rv);
1002 }
1003 
1004 
1005 /*
1006  * i_mdi_client_lock():
1007  *		Grab client component lock
1008  * Return Values:
1009  *		None
1010  * Note:
1011  *		The default locking order is:
1012  *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
1013  *		But there are number of situations where locks need to be
1014  *		grabbed in reverse order.  This routine implements try and lock
1015  *		mechanism depending on the requested parameter option.
1016  */
1017 
1018 static void
1019 i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
1020 {
1021 	if (pip) {
1022 		/*
1023 		 * Reverse locking is requested.
1024 		 */
1025 		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
1026 			/*
1027 			 * tryenter failed. Try to grab again
1028 			 * after a small delay
1029 			 */
1030 			MDI_PI_HOLD(pip);
1031 			MDI_PI_UNLOCK(pip);
1032 			delay(1);
1033 			MDI_PI_LOCK(pip);
1034 			MDI_PI_RELE(pip);
1035 		}
1036 	} else {
1037 		MDI_CLIENT_LOCK(ct);
1038 	}
1039 }
1040 
1041 /*
1042  * i_mdi_client_unlock():
1043  *		Unlock a client component
1044  */
1045 
1046 static void
1047 i_mdi_client_unlock(mdi_client_t *ct)
1048 {
1049 	MDI_CLIENT_UNLOCK(ct);
1050 }
1051 
1052 /*
1053  * i_mdi_client_alloc():
1054  * 		Allocate and initialize a client structure.  Caller should
1055  *		hold the global mdi_mutex.
1056  * Return Values:
1057  *		Handle to a client component
1058  */
1059 /*ARGSUSED*/
1060 static mdi_client_t *
1061 i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid, int flags)
1062 {
1063 	mdi_client_t	*ct;
1064 	char		*drvname = NULL;
1065 	char		*guid = NULL;
1066 	client_lb_args_t 	*lb_args = NULL;
1067 
1068 	ASSERT(MUTEX_HELD(&mdi_mutex));
1069 
1070 	/*
1071 	 * Allocate and initialize a component structure.
1072 	 */
1073 	ct = kmem_zalloc(sizeof (*ct),
1074 	    (flags == DDI_SLEEP) ? KM_SLEEP : KM_NOSLEEP);
1075 	if (ct == NULL)
1076 		goto fail;
1077 	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1078 	ct->ct_hnext = NULL;
1079 	ct->ct_hprev = NULL;
1080 	ct->ct_dip = NULL;
1081 	ct->ct_vhci = vh;
1082 	drvname = kmem_alloc(strlen(name) + 1,
1083 	    (flags == DDI_SLEEP) ? KM_SLEEP : KM_NOSLEEP);
1084 	if (drvname == NULL)
1085 		goto fail;
1086 	ct->ct_drvname = drvname;
1087 	(void) strcpy(ct->ct_drvname, name);
1088 	guid = kmem_alloc(strlen(lguid) + 1,
1089 	    (flags == DDI_SLEEP) ? KM_SLEEP : KM_NOSLEEP);
1090 	if (guid == NULL)
1091 		goto fail;
1092 	ct->ct_guid = guid;
1093 	(void) strcpy(ct->ct_guid, lguid);
1094 	ct->ct_cprivate = NULL;
1095 	ct->ct_vprivate = NULL;
1096 	ct->ct_flags = 0;
1097 	ct->ct_state = MDI_CLIENT_STATE_FAILED;
1098 	MDI_CLIENT_SET_OFFLINE(ct);
1099 	MDI_CLIENT_SET_DETACH(ct);
1100 	MDI_CLIENT_SET_POWER_UP(ct);
1101 	ct->ct_failover_flags = 0;
1102 	ct->ct_failover_status = 0;
1103 	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
1104 	ct->ct_unstable = 0;
1105 	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
1106 	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
1107 	ct->ct_lb = vh->vh_lb;
1108 	lb_args =  kmem_zalloc(sizeof (client_lb_args_t),
1109 		(flags == DDI_SLEEP) ? KM_SLEEP : KM_NOSLEEP);
1110 	if (lb_args == NULL)
1111 		goto fail;
1112 	ct->ct_lb_args = lb_args;
1113 	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
1114 	ct->ct_path_count = 0;
1115 	ct->ct_path_head = NULL;
1116 	ct->ct_path_tail = NULL;
1117 	ct->ct_path_last = NULL;
1118 
1119 
1120 	/*
1121 	 * Add this client component to our client hash queue
1122 	 */
1123 	i_mdi_client_enlist_table(vh, ct);
1124 	return (ct);
1125 
1126 fail:
1127 	if (guid)
1128 		kmem_free(guid, strlen(lguid) + 1);
1129 	if (drvname)
1130 		kmem_free(drvname, strlen(name) + 1);
1131 	if (lb_args)
1132 		kmem_free(lb_args, sizeof (client_lb_args_t));
1133 	kmem_free(ct, sizeof (*ct));
1134 	return (NULL);
1135 }
1136 
1137 /*
1138  * i_mdi_client_enlist_table():
1139  *		Attach the client device to the client hash table. Caller
1140  *		should hold the mdi_mutex
1141  */
1142 
1143 static void
1144 i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1145 {
1146 	int 			index;
1147 	struct client_hash	*head;
1148 
1149 	ASSERT(MUTEX_HELD(&mdi_mutex));
1150 	index = i_mdi_get_hash_key(ct->ct_guid);
1151 	head = &vh->vh_client_table[index];
1152 	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
1153 	head->ct_hash_head = ct;
1154 	head->ct_hash_count++;
1155 	vh->vh_client_count++;
1156 }
1157 
1158 /*
1159  * i_mdi_client_delist_table():
1160  *		Attach the client device to the client hash table.
1161  *		Caller should hold the mdi_mutex
1162  */
1163 
1164 static void
1165 i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1166 {
1167 	int			index;
1168 	char			*guid;
1169 	struct client_hash 	*head;
1170 	mdi_client_t		*next;
1171 	mdi_client_t		*last;
1172 
1173 	ASSERT(MUTEX_HELD(&mdi_mutex));
1174 	guid = ct->ct_guid;
1175 	index = i_mdi_get_hash_key(guid);
1176 	head = &vh->vh_client_table[index];
1177 
1178 	last = NULL;
1179 	next = (mdi_client_t *)head->ct_hash_head;
1180 	while (next != NULL) {
1181 		if (next == ct) {
1182 			break;
1183 		}
1184 		last = next;
1185 		next = next->ct_hnext;
1186 	}
1187 
1188 	if (next) {
1189 		head->ct_hash_count--;
1190 		if (last == NULL) {
1191 			head->ct_hash_head = ct->ct_hnext;
1192 		} else {
1193 			last->ct_hnext = ct->ct_hnext;
1194 		}
1195 		ct->ct_hnext = NULL;
1196 		vh->vh_client_count--;
1197 	}
1198 }
1199 
1200 
1201 /*
1202  * i_mdi_client_free():
1203  *		Free a client component
1204  */
1205 static int
1206 i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
1207 {
1208 	int		rv = MDI_SUCCESS;
1209 	int		flags = ct->ct_flags;
1210 	dev_info_t	*cdip;
1211 	dev_info_t	*vdip;
1212 
1213 	ASSERT(MUTEX_HELD(&mdi_mutex));
1214 	vdip = vh->vh_dip;
1215 	cdip = ct->ct_dip;
1216 
1217 	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
1218 	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
1219 	DEVI(cdip)->devi_mdi_client = NULL;
1220 
1221 	/*
1222 	 * Clear out back ref. to dev_info_t node
1223 	 */
1224 	ct->ct_dip = NULL;
1225 
1226 	/*
1227 	 * Remove this client from our hash queue
1228 	 */
1229 	i_mdi_client_delist_table(vh, ct);
1230 
1231 	/*
1232 	 * Uninitialize and free the component
1233 	 */
1234 	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
1235 	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
1236 	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
1237 	cv_destroy(&ct->ct_failover_cv);
1238 	cv_destroy(&ct->ct_unstable_cv);
1239 	cv_destroy(&ct->ct_powerchange_cv);
1240 	mutex_destroy(&ct->ct_mutex);
1241 	kmem_free(ct, sizeof (*ct));
1242 
1243 	if (cdip != NULL) {
1244 		mutex_exit(&mdi_mutex);
1245 		(void) i_mdi_devinfo_remove(vdip, cdip, flags);
1246 		mutex_enter(&mdi_mutex);
1247 	}
1248 	return (rv);
1249 }
1250 
1251 /*
1252  * i_mdi_client_find():
1253  * 		Find the client structure corresponding to a given guid
1254  *		Caller should hold the mdi_mutex
1255  */
1256 static mdi_client_t *
1257 i_mdi_client_find(mdi_vhci_t *vh, char *guid)
1258 {
1259 	int			index;
1260 	struct client_hash	*head;
1261 	mdi_client_t		*ct;
1262 
1263 	ASSERT(MUTEX_HELD(&mdi_mutex));
1264 	index = i_mdi_get_hash_key(guid);
1265 	head = &vh->vh_client_table[index];
1266 
1267 	ct = head->ct_hash_head;
1268 	while (ct != NULL) {
1269 		if (strcmp(ct->ct_guid, guid) == 0) {
1270 			break;
1271 		}
1272 		ct = ct->ct_hnext;
1273 	}
1274 	return (ct);
1275 }
1276 
1277 
1278 
1279 /*
1280  * i_mdi_client_update_state():
1281  *		Compute and update client device state
1282  * Notes:
1283  *		A client device can be in any of three possible states:
1284  *
1285  *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
1286  *		one online/standby paths. Can tolerate failures.
1287  *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
1288  *		no alternate paths available as standby. A failure on the online
1289  *		would result in loss of access to device data.
1290  *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
1291  *		no paths available to access the device.
1292  */
1293 static void
1294 i_mdi_client_update_state(mdi_client_t *ct)
1295 {
1296 	int state;
1297 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1298 	state = i_mdi_client_compute_state(ct, NULL);
1299 	MDI_CLIENT_SET_STATE(ct, state);
1300 }
1301 
1302 /*
1303  * i_mdi_client_compute_state():
1304  *		Compute client device state
1305  *
1306  *		mdi_phci_t *	Pointer to pHCI structure which should
1307  *				while computing the new value.  Used by
1308  *				i_mdi_phci_offline() to find the new
1309  *				client state after DR of a pHCI.
1310  */
1311 static int
1312 i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
1313 {
1314 	int		state;
1315 	int		online_count = 0;
1316 	int		standby_count = 0;
1317 	mdi_pathinfo_t	*pip, *next;
1318 
1319 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1320 	pip = ct->ct_path_head;
1321 	while (pip != NULL) {
1322 		MDI_PI_LOCK(pip);
1323 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1324 		if (MDI_PI(pip)->pi_phci == ph) {
1325 			MDI_PI_UNLOCK(pip);
1326 			pip = next;
1327 			continue;
1328 		}
1329 		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1330 				== MDI_PATHINFO_STATE_ONLINE)
1331 			online_count++;
1332 		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1333 				== MDI_PATHINFO_STATE_STANDBY)
1334 			standby_count++;
1335 		MDI_PI_UNLOCK(pip);
1336 		pip = next;
1337 	}
1338 
1339 	if (online_count == 0) {
1340 		if (standby_count == 0) {
1341 			state = MDI_CLIENT_STATE_FAILED;
1342 			MDI_DEBUG(2, (CE_NOTE, NULL, "!client state: failed"
1343 			    " ct = %p\n", ct));
1344 		} else if (standby_count == 1) {
1345 			state = MDI_CLIENT_STATE_DEGRADED;
1346 		} else {
1347 			state = MDI_CLIENT_STATE_OPTIMAL;
1348 		}
1349 	} else if (online_count == 1) {
1350 		if (standby_count == 0) {
1351 			state = MDI_CLIENT_STATE_DEGRADED;
1352 		} else {
1353 			state = MDI_CLIENT_STATE_OPTIMAL;
1354 		}
1355 	} else {
1356 		state = MDI_CLIENT_STATE_OPTIMAL;
1357 	}
1358 	return (state);
1359 }
1360 
1361 /*
1362  * i_mdi_client2devinfo():
1363  *		Utility function
1364  */
1365 dev_info_t *
1366 i_mdi_client2devinfo(mdi_client_t *ct)
1367 {
1368 	return (ct->ct_dip);
1369 }
1370 
1371 /*
1372  * mdi_client_path2_devinfo():
1373  * 		Given the parent devinfo and child devfs pathname, search for
1374  *		a valid devfs node handle.
1375  */
1376 dev_info_t *
1377 mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
1378 {
1379 	dev_info_t 	*cdip = NULL;
1380 	dev_info_t 	*ndip = NULL;
1381 	char		*temp_pathname;
1382 	int		circular;
1383 
1384 	/*
1385 	 * Allocate temp buffer
1386 	 */
1387 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1388 
1389 	/*
1390 	 * Lock parent against changes
1391 	 */
1392 	ndi_devi_enter(vdip, &circular);
1393 	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
1394 	while ((cdip = ndip) != NULL) {
1395 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1396 
1397 		*temp_pathname = '\0';
1398 		(void) ddi_pathname(cdip, temp_pathname);
1399 		if (strcmp(temp_pathname, pathname) == 0) {
1400 			break;
1401 		}
1402 	}
1403 	/*
1404 	 * Release devinfo lock
1405 	 */
1406 	ndi_devi_exit(vdip, circular);
1407 
1408 	/*
1409 	 * Free the temp buffer
1410 	 */
1411 	kmem_free(temp_pathname, MAXPATHLEN);
1412 	return (cdip);
1413 }
1414 
1415 
1416 /*
1417  * mdi_client_get_path_count():
1418  * 		Utility function to get number of path information nodes
1419  *		associated with a given client device.
1420  */
1421 int
1422 mdi_client_get_path_count(dev_info_t *cdip)
1423 {
1424 	mdi_client_t	*ct;
1425 	int		count = 0;
1426 
1427 	ct = i_devi_get_client(cdip);
1428 	if (ct != NULL) {
1429 		count = ct->ct_path_count;
1430 	}
1431 	return (count);
1432 }
1433 
1434 
/*
 * i_mdi_get_hash_key():
 *		Hash a NUL-terminated string by summing its characters
 *		(with 32-bit unsigned arithmetic) and reducing the sum
 *		modulo (CLIENT_HASH_TABLE_SIZE - 1).
 */
static int
i_mdi_get_hash_key(char *str)
{
	uint32_t	sum = 0;
	char		*cp = str;

	while (*cp != '\0') {
		sum += (uint32_t)*cp;
		cp++;
	}
	return (sum % (CLIENT_HASH_TABLE_SIZE - 1));
}
1452 
1453 /*
1454  * mdi_get_lb_policy():
1455  * 		Get current load balancing policy for a given client device
1456  */
1457 client_lb_t
1458 mdi_get_lb_policy(dev_info_t *cdip)
1459 {
1460 	client_lb_t	lb = LOAD_BALANCE_NONE;
1461 	mdi_client_t	*ct;
1462 
1463 	ct = i_devi_get_client(cdip);
1464 	if (ct != NULL) {
1465 		lb = ct->ct_lb;
1466 	}
1467 	return (lb);
1468 }
1469 
1470 /*
1471  * mdi_set_lb_region_size():
1472  * 		Set current region size for the load-balance
1473  */
1474 int
1475 mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
1476 {
1477 	mdi_client_t	*ct;
1478 	int		rv = MDI_FAILURE;
1479 
1480 	ct = i_devi_get_client(cdip);
1481 	if (ct != NULL && ct->ct_lb_args != NULL) {
1482 		ct->ct_lb_args->region_size = region_size;
1483 		rv = MDI_SUCCESS;
1484 	}
1485 	return (rv);
1486 }
1487 
1488 /*
1489  * mdi_Set_lb_policy():
1490  * 		Set current load balancing policy for a given client device
1491  */
1492 int
1493 mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
1494 {
1495 	mdi_client_t	*ct;
1496 	int		rv = MDI_FAILURE;
1497 
1498 	ct = i_devi_get_client(cdip);
1499 	if (ct != NULL) {
1500 		ct->ct_lb = lb;
1501 		rv = MDI_SUCCESS;
1502 	}
1503 	return (rv);
1504 }
1505 
1506 /*
1507  * mdi_failover():
1508  *		failover function called by the vHCI drivers to initiate
1509  *		a failover operation.  This is typically due to non-availability
1510  *		of online paths to route I/O requests.  Failover can be
1511  *		triggered through user application also.
1512  *
1513  *		The vHCI driver calls mdi_failover() to initiate a failover
1514  *		operation. mdi_failover() calls back into the vHCI driver's
1515  *		vo_failover() entry point to perform the actual failover
1516  *		operation.  The reason for requiring the vHCI driver to
1517  *		initiate failover by calling mdi_failover(), instead of directly
1518  *		executing vo_failover() itself, is to ensure that the mdi
1519  *		framework can keep track of the client state properly.
1520  *		Additionally, mdi_failover() provides as a convenience the
1521  *		option of performing the failover operation synchronously or
1522  *		asynchronously
1523  *
1524  *		Upon successful completion of the failover operation, the
1525  *		paths that were previously ONLINE will be in the STANDBY state,
1526  *		and the newly activated paths will be in the ONLINE state.
1527  *
1528  *		The flags modifier determines whether the activation is done
1529  *		synchronously: MDI_FAILOVER_SYNC
1530  * Return Values:
1531  *		MDI_SUCCESS
1532  *		MDI_FAILURE
1533  *		MDI_BUSY
1534  */
1535 /*ARGSUSED*/
1536 int
1537 mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
1538 {
1539 	int			rv;
1540 	mdi_client_t		*ct;
1541 
1542 	ct = i_devi_get_client(cdip);
1543 	ASSERT(ct != NULL);
1544 	if (ct == NULL) {
1545 		/* cdip is not a valid client device. Nothing more to do. */
1546 		return (MDI_FAILURE);
1547 	}
1548 
1549 	MDI_CLIENT_LOCK(ct);
1550 
1551 	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
1552 		/* A path to the client is being freed */
1553 		MDI_CLIENT_UNLOCK(ct);
1554 		return (MDI_BUSY);
1555 	}
1556 
1557 
1558 	if (MDI_CLIENT_IS_FAILED(ct)) {
1559 		/*
1560 		 * Client is in failed state. Nothing more to do.
1561 		 */
1562 		MDI_CLIENT_UNLOCK(ct);
1563 		return (MDI_FAILURE);
1564 	}
1565 
1566 	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1567 		/*
1568 		 * Failover is already in progress; return BUSY
1569 		 */
1570 		MDI_CLIENT_UNLOCK(ct);
1571 		return (MDI_BUSY);
1572 	}
1573 	/*
1574 	 * Make sure that mdi_pathinfo node state changes are processed.
1575 	 * We do not allow failovers to progress while client path state
1576 	 * changes are in progress
1577 	 */
1578 	if (ct->ct_unstable) {
1579 		if (flags == MDI_FAILOVER_ASYNC) {
1580 			MDI_CLIENT_UNLOCK(ct);
1581 			return (MDI_BUSY);
1582 		} else {
1583 			while (ct->ct_unstable)
1584 				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
1585 		}
1586 	}
1587 
1588 	/*
1589 	 * Client device is in stable state. Before proceeding, perform sanity
1590 	 * checks again.
1591 	 */
1592 	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
1593 	    (i_ddi_node_state(ct->ct_dip) < DS_READY)) {
1594 		/*
1595 		 * Client is in failed state. Nothing more to do.
1596 		 */
1597 		MDI_CLIENT_UNLOCK(ct);
1598 		return (MDI_FAILURE);
1599 	}
1600 
1601 	/*
1602 	 * Set the client state as failover in progress.
1603 	 */
1604 	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
1605 	ct->ct_failover_flags = flags;
1606 	MDI_CLIENT_UNLOCK(ct);
1607 
1608 	if (flags == MDI_FAILOVER_ASYNC) {
1609 		/*
1610 		 * Submit the initiate failover request via CPR safe
1611 		 * taskq threads.
1612 		 */
1613 		(void) taskq_dispatch(mdi_taskq, (task_func_t *)i_mdi_failover,
1614 		    ct, KM_SLEEP);
1615 		return (MDI_ACCEPT);
1616 	} else {
1617 		/*
1618 		 * Synchronous failover mode.  Typically invoked from the user
1619 		 * land.
1620 		 */
1621 		rv = i_mdi_failover(ct);
1622 	}
1623 	return (rv);
1624 }
1625 
1626 /*
1627  * i_mdi_failover():
1628  *		internal failover function. Invokes vHCI drivers failover
1629  *		callback function and process the failover status
1630  * Return Values:
1631  *		None
1632  *
1633  * Note: A client device in failover state can not be detached or freed.
1634  */
1635 static int
1636 i_mdi_failover(void *arg)
1637 {
1638 	int		rv = MDI_SUCCESS;
1639 	mdi_client_t	*ct = (mdi_client_t *)arg;
1640 	mdi_vhci_t	*vh = ct->ct_vhci;
1641 
1642 	ASSERT(!MUTEX_HELD(&ct->ct_mutex));
1643 
1644 	if (vh->vh_ops->vo_failover != NULL) {
1645 		/*
1646 		 * Call vHCI drivers callback routine
1647 		 */
1648 		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
1649 		    ct->ct_failover_flags);
1650 	}
1651 
1652 	MDI_CLIENT_LOCK(ct);
1653 	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);
1654 
1655 	/*
1656 	 * Save the failover return status
1657 	 */
1658 	ct->ct_failover_status = rv;
1659 
1660 	/*
1661 	 * As a result of failover, client status would have been changed.
1662 	 * Update the client state and wake up anyone waiting on this client
1663 	 * device.
1664 	 */
1665 	i_mdi_client_update_state(ct);
1666 
1667 	cv_broadcast(&ct->ct_failover_cv);
1668 	MDI_CLIENT_UNLOCK(ct);
1669 	return (rv);
1670 }
1671 
1672 /*
1673  * Load balancing is logical block.
1674  * IOs within the range described by region_size
1675  * would go on the same path. This would improve the
1676  * performance by cache-hit on some of the RAID devices.
1677  * Search only for online paths(At some point we
1678  * may want to balance across target ports).
1679  * If no paths are found then default to round-robin.
1680  */
1681 static int
1682 i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
1683 {
1684 	int		path_index = -1;
1685 	int		online_path_count = 0;
1686 	int		online_nonpref_path_count = 0;
1687 	int 		region_size = ct->ct_lb_args->region_size;
1688 	mdi_pathinfo_t	*pip;
1689 	mdi_pathinfo_t	*next;
1690 	int		preferred, path_cnt;
1691 
1692 	pip = ct->ct_path_head;
1693 	while (pip) {
1694 		MDI_PI_LOCK(pip);
1695 		if (MDI_PI(pip)->pi_state ==
1696 		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
1697 			online_path_count++;
1698 		} else if (MDI_PI(pip)->pi_state ==
1699 		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
1700 			online_nonpref_path_count++;
1701 		}
1702 		next = (mdi_pathinfo_t *)
1703 		    MDI_PI(pip)->pi_client_link;
1704 		MDI_PI_UNLOCK(pip);
1705 		pip = next;
1706 	}
1707 	/* if found any online/preferred then use this type */
1708 	if (online_path_count > 0) {
1709 		path_cnt = online_path_count;
1710 		preferred = 1;
1711 	} else if (online_nonpref_path_count > 0) {
1712 		path_cnt = online_nonpref_path_count;
1713 		preferred = 0;
1714 	} else {
1715 		path_cnt = 0;
1716 	}
1717 	if (path_cnt) {
1718 		path_index = (bp->b_blkno >> region_size) % path_cnt;
1719 		pip = ct->ct_path_head;
1720 		while (pip && path_index != -1) {
1721 			MDI_PI_LOCK(pip);
1722 			if (path_index == 0 &&
1723 			    (MDI_PI(pip)->pi_state ==
1724 			    MDI_PATHINFO_STATE_ONLINE) &&
1725 				MDI_PI(pip)->pi_preferred == preferred) {
1726 				MDI_PI_HOLD(pip);
1727 				MDI_PI_UNLOCK(pip);
1728 				*ret_pip = pip;
1729 				return (MDI_SUCCESS);
1730 			}
1731 			path_index --;
1732 			next = (mdi_pathinfo_t *)
1733 			    MDI_PI(pip)->pi_client_link;
1734 			MDI_PI_UNLOCK(pip);
1735 			pip = next;
1736 		}
1737 		if (pip == NULL) {
1738 			MDI_DEBUG(4, (CE_NOTE, NULL,
1739 			    "!lba %p, no pip !!\n",
1740 				bp->b_blkno));
1741 		} else {
1742 			MDI_DEBUG(4, (CE_NOTE, NULL,
1743 			    "!lba %p, no pip for path_index, "
1744 			    "pip %p\n", pip));
1745 		}
1746 	}
1747 	return (MDI_FAILURE);
1748 }
1749 
1750 /*
1751  * mdi_select_path():
1752  *		select a path to access a client device.
1753  *
1754  *		mdi_select_path() function is called by the vHCI drivers to
1755  *		select a path to route the I/O request to.  The caller passes
1756  *		the block I/O data transfer structure ("buf") as one of the
1757  *		parameters.  The mpxio framework uses the buf structure
1758  *		contents to maintain per path statistics (total I/O size /
1759  *		count pending).  If more than one online paths are available to
1760  *		select, the framework automatically selects a suitable path
1761  *		for routing I/O request. If a failover operation is active for
1762  *		this client device the call shall be failed with MDI_BUSY error
1763  *		code.
1764  *
1765  *		By default this function returns a suitable path in online
1766  *		state based on the current load balancing policy.  Currently
1767  *		we support LOAD_BALANCE_NONE (Previously selected online path
1768  *		will continue to be used till the path is usable) and
1769  *		LOAD_BALANCE_RR (Online paths will be selected in a round
1770  *		robin fashion), LOAD_BALANCE_LB(Online paths will be selected
1771  *		based on the logical block).  The load balancing
1772  *		through vHCI drivers configuration file (driver.conf).
1773  *
1774  *		vHCI drivers may override this default behavior by specifying
1775  *		appropriate flags.  If start_pip is specified (non NULL) is
1776  *		used as start point to walk and find the next appropriate path.
1777  *		The following values are currently defined:
1778  *		MDI_SELECT_ONLINE_PATH (to select an ONLINE path) and/or
1779  *		MDI_SELECT_STANDBY_PATH (to select an STANDBY path).
1780  *
1781  *		The non-standard behavior is used by the scsi_vhci driver,
1782  *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
1783  *		attach of client devices (to avoid an unnecessary failover
1784  *		when the STANDBY path comes up first), during failover
1785  *		(to activate a STANDBY path as ONLINE).
1786  *
1787  *		The selected path in returned in a held state (ref_cnt).
1788  *		Caller should release the hold by calling mdi_rele_path().
1789  *
1790  * Return Values:
1791  *		MDI_SUCCESS	- Completed successfully
1792  *		MDI_BUSY 	- Client device is busy failing over
1793  *		MDI_NOPATH	- Client device is online, but no valid path are
1794  *				  available to access this client device
1795  *		MDI_FAILURE	- Invalid client device or state
1796  *		MDI_DEVI_ONLINING
1797  *				- Client device (struct dev_info state) is in
1798  *				  onlining state.
1799  */
1800 
1801 /*ARGSUSED*/
1802 int
1803 mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
1804     mdi_pathinfo_t *start_pip, mdi_pathinfo_t **ret_pip)
1805 {
1806 	mdi_client_t	*ct;
1807 	mdi_pathinfo_t	*pip;
1808 	mdi_pathinfo_t	*next;
1809 	mdi_pathinfo_t	*head;
1810 	mdi_pathinfo_t	*start;
1811 	client_lb_t	lbp;	/* load balancing policy */
1812 	int		sb = 1;	/* standard behavior */
1813 	int		preferred = 1;	/* preferred path */
1814 	int		cond, cont = 1;
1815 	int		retry = 0;
1816 
1817 	if (flags != 0) {
1818 		/*
1819 		 * disable default behavior
1820 		 */
1821 		sb = 0;
1822 	}
1823 
1824 	*ret_pip = NULL;
1825 	ct = i_devi_get_client(cdip);
1826 	if (ct == NULL) {
1827 		/* mdi extensions are NULL, Nothing more to do */
1828 		return (MDI_FAILURE);
1829 	}
1830 
1831 	MDI_CLIENT_LOCK(ct);
1832 
1833 	if (sb) {
1834 		if (MDI_CLIENT_IS_FAILED(ct)) {
1835 			/*
1836 			 * Client is not ready to accept any I/O requests.
1837 			 * Fail this request.
1838 			 */
1839 			MDI_DEBUG(2, (CE_NOTE, cdip, "!mdi_select_path: "
1840 			    "client state offline ct = %p\n", ct));
1841 			MDI_CLIENT_UNLOCK(ct);
1842 			return (MDI_FAILURE);
1843 		}
1844 
1845 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1846 			/*
1847 			 * Check for Failover is in progress. If so tell the
1848 			 * caller that this device is busy.
1849 			 */
1850 			MDI_DEBUG(2, (CE_NOTE, cdip, "!mdi_select_path: "
1851 			    "client failover in progress ct = %p\n", ct));
1852 			MDI_CLIENT_UNLOCK(ct);
1853 			return (MDI_BUSY);
1854 		}
1855 
1856 		/*
1857 		 * Check to see whether the client device is attached.
1858 		 * If not so, let the vHCI driver manually select a path
1859 		 * (standby) and let the probe/attach process to continue.
1860 		 */
1861 		if ((MDI_CLIENT_IS_DETACHED(ct)) ||
1862 		    i_ddi_node_state(cdip) < DS_READY) {
1863 			MDI_DEBUG(4, (CE_NOTE, cdip, "!Devi is onlining\n"));
1864 			MDI_CLIENT_UNLOCK(ct);
1865 			return (MDI_DEVI_ONLINING);
1866 		}
1867 	}
1868 
1869 	/*
1870 	 * Cache in the client list head.  If head of the list is NULL
1871 	 * return MDI_NOPATH
1872 	 */
1873 	head = ct->ct_path_head;
1874 	if (head == NULL) {
1875 		MDI_CLIENT_UNLOCK(ct);
1876 		return (MDI_NOPATH);
1877 	}
1878 
1879 	/*
1880 	 * for non default behavior, bypass current
1881 	 * load balancing policy and always use LOAD_BALANCE_RR
1882 	 * except that the start point will be adjusted based
1883 	 * on the provided start_pip
1884 	 */
1885 	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;
1886 
1887 	switch (lbp) {
1888 	case LOAD_BALANCE_NONE:
1889 		/*
1890 		 * Load balancing is None  or Alternate path mode
1891 		 * Start looking for a online mdi_pathinfo node starting from
1892 		 * last known selected path
1893 		 */
1894 		preferred = 1;
1895 		pip = (mdi_pathinfo_t *)ct->ct_path_last;
1896 		if (pip == NULL) {
1897 			pip = head;
1898 		}
1899 		start = pip;
1900 		do {
1901 			MDI_PI_LOCK(pip);
1902 			/*
1903 			 * No need to explicitly check if the path is disabled.
1904 			 * Since we are checking for state == ONLINE and the
1905 			 * same veriable is used for DISABLE/ENABLE information.
1906 			 */
1907 			if (MDI_PI(pip)->pi_state  ==
1908 				MDI_PATHINFO_STATE_ONLINE &&
1909 				preferred == MDI_PI(pip)->pi_preferred) {
1910 				/*
1911 				 * Return the path in hold state. Caller should
1912 				 * release the lock by calling mdi_rele_path()
1913 				 */
1914 				MDI_PI_HOLD(pip);
1915 				MDI_PI_UNLOCK(pip);
1916 				ct->ct_path_last = pip;
1917 				*ret_pip = pip;
1918 				MDI_CLIENT_UNLOCK(ct);
1919 				return (MDI_SUCCESS);
1920 			}
1921 
1922 			/*
1923 			 * Path is busy.
1924 			 */
1925 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
1926 			    MDI_PI_IS_TRANSIENT(pip))
1927 				retry = 1;
1928 			/*
1929 			 * Keep looking for a next available online path
1930 			 */
1931 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1932 			if (next == NULL) {
1933 				next = head;
1934 			}
1935 			MDI_PI_UNLOCK(pip);
1936 			pip = next;
1937 			if (start == pip && preferred) {
1938 				preferred = 0;
1939 			} else if (start == pip && !preferred) {
1940 				cont = 0;
1941 			}
1942 		} while (cont);
1943 		break;
1944 
1945 	case LOAD_BALANCE_LBA:
1946 		/*
1947 		 * Make sure we are looking
1948 		 * for an online path. Otherwise, if it is for a STANDBY
1949 		 * path request, it will go through and fetch an ONLINE
1950 		 * path which is not desirable.
1951 		 */
1952 		if ((ct->ct_lb_args != NULL) &&
1953 			    (ct->ct_lb_args->region_size) && bp &&
1954 				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
1955 			if (i_mdi_lba_lb(ct, ret_pip, bp)
1956 				    == MDI_SUCCESS) {
1957 				MDI_CLIENT_UNLOCK(ct);
1958 				return (MDI_SUCCESS);
1959 			}
1960 		}
1961 		/*  FALLTHROUGH */
1962 	case LOAD_BALANCE_RR:
1963 		/*
1964 		 * Load balancing is Round Robin. Start looking for a online
1965 		 * mdi_pathinfo node starting from last known selected path
1966 		 * as the start point.  If override flags are specified,
1967 		 * process accordingly.
1968 		 * If the search is already in effect(start_pip not null),
1969 		 * then lets just use the same path preference to continue the
1970 		 * traversal.
1971 		 */
1972 
1973 		if (start_pip != NULL) {
1974 			preferred = MDI_PI(start_pip)->pi_preferred;
1975 		} else {
1976 			preferred = 1;
1977 		}
1978 
1979 		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
1980 		if (start == NULL) {
1981 			pip = head;
1982 		} else {
1983 			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
1984 			if (pip == NULL) {
1985 				if (!sb) {
1986 					if (preferred == 0) {
1987 						/*
1988 						 * Looks like we have completed
1989 						 * the traversal as preferred
1990 						 * value is 0. Time to bail out.
1991 						 */
1992 						*ret_pip = NULL;
1993 						MDI_CLIENT_UNLOCK(ct);
1994 						return (MDI_NOPATH);
1995 					} else {
1996 						/*
1997 						 * Looks like we reached the
1998 						 * end of the list. Lets enable
1999 						 * traversal of non preferred
2000 						 * paths.
2001 						 */
2002 						preferred = 0;
2003 					}
2004 				}
2005 				pip = head;
2006 			}
2007 		}
2008 		start = pip;
2009 		do {
2010 			MDI_PI_LOCK(pip);
2011 			if (sb) {
2012 				cond = ((MDI_PI(pip)->pi_state ==
2013 				    MDI_PATHINFO_STATE_ONLINE &&
2014 					MDI_PI(pip)->pi_preferred ==
2015 						preferred) ? 1 : 0);
2016 			} else {
2017 				if (flags == MDI_SELECT_ONLINE_PATH) {
2018 					cond = ((MDI_PI(pip)->pi_state ==
2019 					    MDI_PATHINFO_STATE_ONLINE &&
2020 						MDI_PI(pip)->pi_preferred ==
2021 						preferred) ? 1 : 0);
2022 				} else if (flags == MDI_SELECT_STANDBY_PATH) {
2023 					cond = ((MDI_PI(pip)->pi_state ==
2024 					    MDI_PATHINFO_STATE_STANDBY &&
2025 						MDI_PI(pip)->pi_preferred ==
2026 						preferred) ? 1 : 0);
2027 				} else if (flags == (MDI_SELECT_ONLINE_PATH |
2028 				    MDI_SELECT_STANDBY_PATH)) {
2029 					cond = (((MDI_PI(pip)->pi_state ==
2030 					    MDI_PATHINFO_STATE_ONLINE ||
2031 					    (MDI_PI(pip)->pi_state ==
2032 					    MDI_PATHINFO_STATE_STANDBY)) &&
2033 						MDI_PI(pip)->pi_preferred ==
2034 						preferred) ? 1 : 0);
2035 				} else {
2036 					cond = 0;
2037 				}
2038 			}
2039 			/*
2040 			 * No need to explicitly check if the path is disabled.
2041 			 * Since we are checking for state == ONLINE and the
2042 			 * same veriable is used for DISABLE/ENABLE information.
2043 			 */
2044 			if (cond) {
2045 				/*
2046 				 * Return the path in hold state. Caller should
2047 				 * release the lock by calling mdi_rele_path()
2048 				 */
2049 				MDI_PI_HOLD(pip);
2050 				MDI_PI_UNLOCK(pip);
2051 				if (sb)
2052 					ct->ct_path_last = pip;
2053 				*ret_pip = pip;
2054 				MDI_CLIENT_UNLOCK(ct);
2055 				return (MDI_SUCCESS);
2056 			}
2057 			/*
2058 			 * Path is busy.
2059 			 */
2060 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2061 			    MDI_PI_IS_TRANSIENT(pip))
2062 				retry = 1;
2063 
2064 			/*
2065 			 * Keep looking for a next available online path
2066 			 */
2067 do_again:
2068 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2069 			if (next == NULL) {
2070 				if (!sb) {
2071 					if (preferred == 1) {
2072 						/*
2073 						 * Looks like we reached the
2074 						 * end of the list. Lets enable
2075 						 * traversal of non preferred
2076 						 * paths.
2077 						 */
2078 						preferred = 0;
2079 						next = head;
2080 					} else {
2081 						/*
2082 						 * We have done both the passes
2083 						 * Preferred as well as for
2084 						 * Non-preferred. Bail out now.
2085 						 */
2086 						cont = 0;
2087 					}
2088 				} else {
2089 					/*
2090 					 * Standard behavior case.
2091 					 */
2092 					next = head;
2093 				}
2094 			}
2095 			MDI_PI_UNLOCK(pip);
2096 			if (cont == 0) {
2097 				break;
2098 			}
2099 			pip = next;
2100 
2101 			if (!sb) {
2102 				/*
2103 				 * We need to handle the selection of
2104 				 * non-preferred path in the following
2105 				 * case:
2106 				 *
2107 				 * +------+   +------+   +------+   +-----+
2108 				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
2109 				 * +------+   +------+   +------+   +-----+
2110 				 *
2111 				 * If we start the search with B, we need to
2112 				 * skip beyond B to pick C which is non -
2113 				 * preferred in the second pass. The following
2114 				 * test, if true, will allow us to skip over
2115 				 * the 'start'(B in the example) to select
2116 				 * other non preferred elements.
2117 				 */
2118 				if ((start_pip != NULL) && (start_pip == pip) &&
2119 				    (MDI_PI(start_pip)->pi_preferred
2120 				    != preferred)) {
2121 					/*
2122 					 * try again after going past the start
2123 					 * pip
2124 					 */
2125 					MDI_PI_LOCK(pip);
2126 					goto do_again;
2127 				}
2128 			} else {
2129 				/*
2130 				 * Standard behavior case
2131 				 */
2132 				if (start == pip && preferred) {
2133 					/* look for nonpreferred paths */
2134 					preferred = 0;
2135 				} else if (start == pip && !preferred) {
2136 					/*
2137 					 * Exit condition
2138 					 */
2139 					cont = 0;
2140 				}
2141 			}
2142 		} while (cont);
2143 		break;
2144 	}
2145 
2146 	MDI_CLIENT_UNLOCK(ct);
2147 	if (retry == 1) {
2148 		return (MDI_BUSY);
2149 	} else {
2150 		return (MDI_NOPATH);
2151 	}
2152 }
2153 
2154 /*
2155  * For a client, return the next available path to any phci
2156  *
2157  * Note:
2158  *		Caller should hold the branch's devinfo node to get a consistent
2159  *		snap shot of the mdi_pathinfo nodes.
2160  *
2161  *		Please note that even the list is stable the mdi_pathinfo
2162  *		node state and properties are volatile.  The caller should lock
2163  *		and unlock the nodes by calling mdi_pi_lock() and
2164  *		mdi_pi_unlock() functions to get a stable properties.
2165  *
2166  *		If there is a need to use the nodes beyond the hold of the
2167  *		devinfo node period (For ex. I/O), then mdi_pathinfo node
2168  *		need to be held against unexpected removal by calling
2169  *		mdi_hold_path() and should be released by calling
2170  *		mdi_rele_path() on completion.
2171  */
2172 mdi_pathinfo_t *
2173 mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
2174 {
2175 	mdi_client_t *ct;
2176 
2177 	if (!MDI_CLIENT(ct_dip))
2178 		return (NULL);
2179 
2180 	/*
2181 	 * Walk through client link
2182 	 */
2183 	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
2184 	ASSERT(ct != NULL);
2185 
2186 	if (pip == NULL)
2187 		return ((mdi_pathinfo_t *)ct->ct_path_head);
2188 
2189 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link);
2190 }
2191 
2192 /*
2193  * For a phci, return the next available path to any client
2194  * Note: ditto mdi_get_next_phci_path()
2195  */
2196 mdi_pathinfo_t *
2197 mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
2198 {
2199 	mdi_phci_t *ph;
2200 
2201 	if (!MDI_PHCI(ph_dip))
2202 		return (NULL);
2203 
2204 	/*
2205 	 * Walk through pHCI link
2206 	 */
2207 	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
2208 	ASSERT(ph != NULL);
2209 
2210 	if (pip == NULL)
2211 		return ((mdi_pathinfo_t *)ph->ph_path_head);
2212 
2213 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
2214 }
2215 
2216 /*
2217  * mdi_get_nextpath():
2218  *		mdi_pathinfo node walker function.  Get the next node from the
2219  *		client or pHCI device list.
2220  *
2221  * XXX This is wrapper function for compatibility purposes only.
2222  *
2223  *	It doesn't work under Multi-level MPxIO, where a dip
2224  *	is both client and phci (which link should next_path follow?).
2225  *	Once Leadville is modified to call mdi_get_next_phci/client_path,
2226  *	this interface should be removed.
2227  */
2228 void
2229 mdi_get_next_path(dev_info_t *dip, mdi_pathinfo_t *pip,
2230     mdi_pathinfo_t **ret_pip)
2231 {
2232 	if (MDI_CLIENT(dip)) {
2233 		*ret_pip = mdi_get_next_phci_path(dip, pip);
2234 	} else if (MDI_PHCI(dip)) {
2235 		*ret_pip = mdi_get_next_client_path(dip, pip);
2236 	} else {
2237 		*ret_pip = NULL;
2238 	}
2239 }
2240 
2241 /*
2242  * mdi_hold_path():
2243  *		Hold the mdi_pathinfo node against unwanted unexpected free.
2244  * Return Values:
2245  *		None
2246  */
2247 void
2248 mdi_hold_path(mdi_pathinfo_t *pip)
2249 {
2250 	if (pip) {
2251 		MDI_PI_LOCK(pip);
2252 		MDI_PI_HOLD(pip);
2253 		MDI_PI_UNLOCK(pip);
2254 	}
2255 }
2256 
2257 
2258 /*
2259  * mdi_rele_path():
2260  *		Release the mdi_pathinfo node which was selected
2261  *		through mdi_select_path() mechanism or manually held by
2262  *		calling mdi_hold_path().
2263  * Return Values:
2264  *		None
2265  */
2266 void
2267 mdi_rele_path(mdi_pathinfo_t *pip)
2268 {
2269 	if (pip) {
2270 		MDI_PI_LOCK(pip);
2271 		MDI_PI_RELE(pip);
2272 		if (MDI_PI(pip)->pi_ref_cnt == 0) {
2273 			cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
2274 		}
2275 		MDI_PI_UNLOCK(pip);
2276 	}
2277 }
2278 
2279 
2280 /*
2281  * mdi_pi_lock():
2282  * 		Lock the mdi_pathinfo node.
2283  * Note:
2284  *		The caller should release the lock by calling mdi_pi_unlock()
2285  */
2286 void
2287 mdi_pi_lock(mdi_pathinfo_t *pip)
2288 {
2289 	ASSERT(pip != NULL);
2290 	if (pip) {
2291 		MDI_PI_LOCK(pip);
2292 	}
2293 }
2294 
2295 
2296 /*
2297  * mdi_pi_unlock():
2298  * 		Unlock the mdi_pathinfo node.
2299  * Note:
2300  *		The mdi_pathinfo node should have been locked with mdi_pi_lock()
2301  */
2302 void
2303 mdi_pi_unlock(mdi_pathinfo_t *pip)
2304 {
2305 	ASSERT(pip != NULL);
2306 	if (pip) {
2307 		MDI_PI_UNLOCK(pip);
2308 	}
2309 }
2310 
2311 /*
2312  * mdi_pi_find():
2313  *		Search the list of mdi_pathinfo nodes attached to the
2314  *		pHCI/Client device node whose path address matches "paddr".
2315  *		Returns a pointer to the mdi_pathinfo node if a matching node is
2316  *		found.
2317  * Return Values:
2318  *		mdi_pathinfo node handle
2319  *		NULL
2320  * Notes:
2321  *		Caller need not hold any locks to call this function.
2322  */
2323 mdi_pathinfo_t *
2324 mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
2325 {
2326 	mdi_phci_t		*ph;
2327 	mdi_vhci_t		*vh;
2328 	mdi_client_t		*ct;
2329 	mdi_pathinfo_t		*pip = NULL;
2330 
2331 	if ((pdip == NULL) || (paddr == NULL)) {
2332 		return (NULL);
2333 	}
2334 	ph = i_devi_get_phci(pdip);
2335 	if (ph == NULL) {
2336 		/*
2337 		 * Invalid pHCI device, Nothing more to do.
2338 		 */
2339 		MDI_DEBUG(2, (CE_WARN, NULL,
2340 		    "!mdi_pi_find: invalid phci"));
2341 		return (NULL);
2342 	}
2343 
2344 	vh = ph->ph_vhci;
2345 	if (vh == NULL) {
2346 		/*
2347 		 * Invalid vHCI device, Nothing more to do.
2348 		 */
2349 		MDI_DEBUG(2, (CE_WARN, NULL,
2350 		    "!mdi_pi_find: invalid phci"));
2351 		return (NULL);
2352 	}
2353 
2354 	/*
2355 	 * Look for client device identified by caddr (guid)
2356 	 */
2357 	if (caddr == NULL) {
2358 		/*
2359 		 * Find a mdi_pathinfo node under pHCI list for a matching
2360 		 * unit address.
2361 		 */
2362 		mutex_enter(&ph->ph_mutex);
2363 		pip = (mdi_pathinfo_t *)ph->ph_path_head;
2364 
2365 		while (pip != NULL) {
2366 			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2367 				break;
2368 			}
2369 			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
2370 		}
2371 		mutex_exit(&ph->ph_mutex);
2372 		return (pip);
2373 	}
2374 
2375 	/*
2376 	 * Find the client device corresponding to 'caddr'
2377 	 */
2378 	mutex_enter(&mdi_mutex);
2379 	ct = i_mdi_client_find(vh, caddr);
2380 	if (ct == NULL) {
2381 		/*
2382 		 * Client not found, Obviously mdi_pathinfo node has not been
2383 		 * created yet.
2384 		 */
2385 		mutex_exit(&mdi_mutex);
2386 		return (pip);
2387 	}
2388 
2389 	/*
2390 	 * Hold the client lock and look for a mdi_pathinfo node with matching
2391 	 * pHCI and paddr
2392 	 */
2393 	MDI_CLIENT_LOCK(ct);
2394 
2395 	/*
2396 	 * Release the global mutex as it is no more needed. Note: We always
2397 	 * respect the locking order while acquiring.
2398 	 */
2399 	mutex_exit(&mdi_mutex);
2400 
2401 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2402 	while (pip != NULL) {
2403 		/*
2404 		 * Compare the unit address
2405 		 */
2406 		if ((MDI_PI(pip)->pi_phci == ph) &&
2407 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2408 			break;
2409 		}
2410 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2411 	}
2412 	MDI_CLIENT_UNLOCK(ct);
2413 	return (pip);
2414 }
2415 
2416 /*
2417  * mdi_pi_alloc():
2418  *		Allocate and initialize a new instance of a mdi_pathinfo node.
2419  *		The mdi_pathinfo node returned by this function identifies a
2420  *		unique device path is capable of having properties attached
2421  *		and passed to mdi_pi_online() to fully attach and online the
2422  *		path and client device node.
2423  *		The mdi_pathinfo node returned by this function must be
2424  *		destroyed using mdi_pi_free() if the path is no longer
2425  *		operational or if the caller fails to attach a client device
2426  *		node when calling mdi_pi_online(). The framework will not free
2427  *		the resources allocated.
2428  *		This function can be called from both interrupt and kernel
2429  *		contexts.  DDI_NOSLEEP flag should be used while calling
2430  *		from interrupt contexts.
2431  * Return Values:
2432  *		MDI_SUCCESS
2433  *		MDI_FAILURE
2434  *		MDI_NOMEM
2435  */
2436 /*ARGSUSED*/
2437 int
2438 mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2439     char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
2440 {
2441 	mdi_vhci_t	*vh;
2442 	mdi_phci_t	*ph;
2443 	mdi_client_t	*ct;
2444 	mdi_pathinfo_t	*pip = NULL;
2445 	dev_info_t	*cdip;
2446 	int		rv = MDI_NOMEM;
2447 
2448 	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
2449 	    ret_pip == NULL) {
2450 		/* Nothing more to do */
2451 		return (MDI_FAILURE);
2452 	}
2453 
2454 	*ret_pip = NULL;
2455 	ph = i_devi_get_phci(pdip);
2456 	ASSERT(ph != NULL);
2457 	if (ph == NULL) {
2458 		/* Invalid pHCI device, return failure */
2459 		MDI_DEBUG(1, (CE_WARN, NULL,
2460 		    "!mdi_pi_alloc: invalid pHCI=%p", pdip));
2461 		return (MDI_FAILURE);
2462 	}
2463 
2464 	MDI_PHCI_LOCK(ph);
2465 	vh = ph->ph_vhci;
2466 	if (vh == NULL) {
2467 		/* Invalid vHCI device, return failure */
2468 		MDI_DEBUG(1, (CE_WARN, NULL,
2469 		    "!mdi_pi_alloc: invalid pHCI=%p", pdip));
2470 		MDI_PHCI_UNLOCK(ph);
2471 		return (MDI_FAILURE);
2472 	}
2473 
2474 	if (MDI_PHCI_IS_READY(ph) == 0) {
2475 		/*
2476 		 * Do not allow new node creation when pHCI is in
2477 		 * offline/suspended states
2478 		 */
2479 		MDI_DEBUG(1, (CE_WARN, NULL,
2480 		    "mdi_pi_alloc: pHCI=%p is not ready", ph));
2481 		MDI_PHCI_UNLOCK(ph);
2482 		return (MDI_BUSY);
2483 	}
2484 	MDI_PHCI_UNSTABLE(ph);
2485 	MDI_PHCI_UNLOCK(ph);
2486 
2487 	/*
2488 	 * Look for a client device with matching guid identified by caddr,
2489 	 * If not found create one
2490 	 */
2491 	mutex_enter(&mdi_mutex);
2492 	ct = i_mdi_client_find(vh, caddr);
2493 	if (ct == NULL) {
2494 		ct = i_mdi_client_alloc(vh, cname, caddr, flags);
2495 		if (ct == NULL)
2496 			goto fail;
2497 	}
2498 
2499 	if (ct->ct_dip == NULL) {
2500 		/*
2501 		 * Allocate a devinfo node
2502 		 */
2503 		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
2504 		    compatible, ncompatible, flags);
2505 		if (ct->ct_dip == NULL) {
2506 			(void) i_mdi_client_free(vh, ct);
2507 			goto fail;
2508 		}
2509 	}
2510 	cdip = ct->ct_dip;
2511 
2512 	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
2513 	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;
2514 
2515 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2516 	while (pip != NULL) {
2517 		/*
2518 		 * Compare the unit address
2519 		 */
2520 		if ((MDI_PI(pip)->pi_phci == ph) &&
2521 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2522 			break;
2523 		}
2524 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2525 	}
2526 
2527 	if (pip == NULL) {
2528 		/*
2529 		 * This is a new path for this client device.  Allocate and
2530 		 * initialize a new pathinfo node
2531 		 */
2532 		pip = i_mdi_pi_alloc(ph, paddr, ct, flags);
2533 		if (pip == NULL) {
2534 			(void) i_mdi_client_free(vh, ct);
2535 			goto fail;
2536 		}
2537 	}
2538 	rv = MDI_SUCCESS;
2539 
2540 fail:
2541 	/*
2542 	 * Release the global mutex.
2543 	 */
2544 	mutex_exit(&mdi_mutex);
2545 
2546 	/*
2547 	 * Mark the pHCI as stable
2548 	 */
2549 	MDI_PHCI_LOCK(ph);
2550 	MDI_PHCI_STABLE(ph);
2551 	MDI_PHCI_UNLOCK(ph);
2552 	*ret_pip = pip;
2553 	return (rv);
2554 }
2555 
2556 /*ARGSUSED*/
2557 int
2558 mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2559     int flags, mdi_pathinfo_t **ret_pip)
2560 {
2561 	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
2562 	    flags, ret_pip));
2563 }
2564 
2565 /*
2566  * i_mdi_pi_alloc():
2567  *		Allocate a mdi_pathinfo node and add to the pHCI path list
2568  * Return Values:
2569  *		mdi_pathinfo
2570  */
2571 
2572 /*ARGSUSED*/
2573 static mdi_pathinfo_t *
2574 i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct, int flags)
2575 {
2576 	mdi_pathinfo_t	*pip = NULL;
2577 	char		*pi_addr = NULL;
2578 	nvlist_t	*pi_prop = NULL;
2579 
2580 	int		ct_circular;
2581 	int		ph_circular;
2582 
2583 	pip = kmem_zalloc(sizeof (struct mdi_pathinfo),
2584 	    (flags == DDI_SLEEP) ? KM_SLEEP : KM_NOSLEEP);
2585 	if (pip == NULL)
2586 		goto fail;
2587 	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
2588 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
2589 	    MDI_PATHINFO_STATE_TRANSIENT;
2590 
2591 	if (MDI_PHCI_IS_USER_DISABLED(ph))
2592 		MDI_PI_SET_USER_DISABLE(pip);
2593 
2594 	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
2595 		MDI_PI_SET_DRV_DISABLE_TRANS(pip);
2596 
2597 	if (MDI_PHCI_IS_DRV_DISABLED(ph))
2598 		MDI_PI_SET_DRV_DISABLE(pip);
2599 
2600 	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
2601 	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
2602 	MDI_PI(pip)->pi_client = ct;
2603 	MDI_PI(pip)->pi_phci = ph;
2604 	pi_addr =
2605 	    MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1,
2606 	    (flags == DDI_SLEEP) ? KM_SLEEP : KM_NOSLEEP);
2607 	if (pi_addr == NULL)
2608 		goto fail;
2609 	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);
2610 	(void) nvlist_alloc(&pi_prop, NV_UNIQUE_NAME,
2611 	    (flags == DDI_SLEEP) ? KM_SLEEP : KM_NOSLEEP);
2612 	if (pi_prop == NULL)
2613 		goto fail;
2614 	MDI_PI(pip)->pi_prop = pi_prop;
2615 	MDI_PI(pip)->pi_pprivate = NULL;
2616 	MDI_PI(pip)->pi_cprivate = NULL;
2617 	MDI_PI(pip)->pi_vprivate = NULL;
2618 	MDI_PI(pip)->pi_client_link = NULL;
2619 	MDI_PI(pip)->pi_phci_link = NULL;
2620 	MDI_PI(pip)->pi_ref_cnt = 0;
2621 	MDI_PI(pip)->pi_kstats = NULL;
2622 	MDI_PI(pip)->pi_preferred = 1;
2623 	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);
2624 
2625 	/*
2626 	 * Lock both dev_info nodes against changes in parallel.
2627 	 */
2628 	ndi_devi_enter(ct->ct_dip, &ct_circular);
2629 	ndi_devi_enter(ph->ph_dip, &ph_circular);
2630 
2631 	i_mdi_phci_add_path(ph, pip);
2632 	i_mdi_client_add_path(ct, pip);
2633 
2634 	ndi_devi_exit(ph->ph_dip, ph_circular);
2635 	ndi_devi_exit(ct->ct_dip, ct_circular);
2636 
2637 	return (pip);
2638 
2639 fail:
2640 	if (pi_prop)
2641 		(void) nvlist_free(pi_prop);
2642 	if (pi_addr)
2643 		kmem_free(pi_addr, strlen(paddr) + 1);
2644 	kmem_free(pip, sizeof (struct mdi_pathinfo));
2645 	return (NULL);
2646 }
2647 
2648 /*
2649  * i_mdi_phci_add_path():
2650  * 		Add a mdi_pathinfo node to pHCI list.
2651  * Notes:
2652  *		Caller should per-pHCI mutex
2653  */
2654 
2655 static void
2656 i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
2657 {
2658 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
2659 
2660 	if (ph->ph_path_head == NULL) {
2661 		ph->ph_path_head = pip;
2662 	} else {
2663 		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
2664 	}
2665 	ph->ph_path_tail = pip;
2666 	ph->ph_path_count++;
2667 }
2668 
2669 /*
2670  * i_mdi_client_add_path():
2671  *		Add mdi_pathinfo node to client list
2672  */
2673 
2674 static void
2675 i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
2676 {
2677 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
2678 
2679 	if (ct->ct_path_head == NULL) {
2680 		ct->ct_path_head = pip;
2681 	} else {
2682 		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
2683 	}
2684 	ct->ct_path_tail = pip;
2685 	ct->ct_path_count++;
2686 }
2687 
2688 /*
2689  * mdi_pi_free():
2690  *		Free the mdi_pathinfo node and also client device node if this
2691  *		is the last path to the device
2692  * Return Values:
2693  *		MDI_SUCCESS
2694  *		MDI_FAILURE
2695  *		MDI_BUSY
2696  */
2697 
2698 /*ARGSUSED*/
2699 int
2700 mdi_pi_free(mdi_pathinfo_t *pip, int flags)
2701 {
2702 	int		rv = MDI_SUCCESS;
2703 	mdi_vhci_t	*vh;
2704 	mdi_phci_t	*ph;
2705 	mdi_client_t	*ct;
2706 	int		(*f)();
2707 	int		client_held = 0;
2708 
2709 	MDI_PI_LOCK(pip);
2710 	ph = MDI_PI(pip)->pi_phci;
2711 	ASSERT(ph != NULL);
2712 	if (ph == NULL) {
2713 		/*
2714 		 * Invalid pHCI device, return failure
2715 		 */
2716 		MDI_DEBUG(1, (CE_WARN, NULL,
2717 		    "!mdi_pi_free: invalid pHCI"));
2718 		MDI_PI_UNLOCK(pip);
2719 		return (MDI_FAILURE);
2720 	}
2721 
2722 	vh = ph->ph_vhci;
2723 	ASSERT(vh != NULL);
2724 	if (vh == NULL) {
2725 		/* Invalid pHCI device, return failure */
2726 		MDI_DEBUG(1, (CE_WARN, NULL,
2727 		    "!mdi_pi_free: invalid vHCI"));
2728 		MDI_PI_UNLOCK(pip);
2729 		return (MDI_FAILURE);
2730 	}
2731 
2732 	ct = MDI_PI(pip)->pi_client;
2733 	ASSERT(ct != NULL);
2734 	if (ct == NULL) {
2735 		/*
2736 		 * Invalid Client device, return failure
2737 		 */
2738 		MDI_DEBUG(1, (CE_WARN, NULL,
2739 		    "!mdi_pi_free: invalid client"));
2740 		MDI_PI_UNLOCK(pip);
2741 		return (MDI_FAILURE);
2742 	}
2743 
2744 	/*
2745 	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
2746 	 * if the node state is either offline or init and the reference count
2747 	 * is zero.
2748 	 */
2749 	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
2750 	    MDI_PI_IS_INITING(pip))) {
2751 		/*
2752 		 * Node is busy
2753 		 */
2754 		MDI_DEBUG(1, (CE_WARN, NULL,
2755 		    "!mdi_pi_free: pathinfo node is busy pip=%p", pip));
2756 		MDI_PI_UNLOCK(pip);
2757 		return (MDI_BUSY);
2758 	}
2759 
2760 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
2761 		/*
2762 		 * Give a chance for pending I/Os to complete.
2763 		 */
2764 		MDI_DEBUG(1, (CE_NOTE, ct->ct_vhci->vh_dip, "!i_mdi_pi_free: "
2765 		    "%d cmds still pending on path: %p\n",
2766 		    MDI_PI(pip)->pi_ref_cnt, pip));
2767 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
2768 		    &MDI_PI(pip)->pi_mutex,
2769 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
2770 			/*
2771 			 * The timeout time reached without ref_cnt being zero
2772 			 * being signaled.
2773 			 */
2774 			MDI_DEBUG(1, (CE_NOTE, ct->ct_vhci->vh_dip,
2775 			    "!i_mdi_pi_free: "
2776 			    "Timeout reached on path %p without the cond\n",
2777 			    pip));
2778 			MDI_DEBUG(1, (CE_NOTE, ct->ct_vhci->vh_dip,
2779 			    "!i_mdi_pi_free: "
2780 			    "%d cmds still pending on path: %p\n",
2781 			    MDI_PI(pip)->pi_ref_cnt, pip));
2782 			MDI_PI_UNLOCK(pip);
2783 			return (MDI_BUSY);
2784 		}
2785 	}
2786 	if (MDI_PI(pip)->pi_pm_held) {
2787 		client_held = 1;
2788 	}
2789 	MDI_PI_UNLOCK(pip);
2790 
2791 	MDI_CLIENT_LOCK(ct);
2792 
2793 	/* Prevent further failovers till mdi_mutex is held */
2794 	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);
2795 
2796 	/*
2797 	 * Wait till failover is complete before removing this node.
2798 	 */
2799 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
2800 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
2801 
2802 	MDI_CLIENT_UNLOCK(ct);
2803 	mutex_enter(&mdi_mutex);
2804 	MDI_CLIENT_LOCK(ct);
2805 	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);
2806 
2807 	if (!MDI_PI_IS_INITING(pip)) {
2808 		f = vh->vh_ops->vo_pi_uninit;
2809 		if (f != NULL) {
2810 			rv = (*f)(vh->vh_dip, pip, 0);
2811 		}
2812 	}
2813 	/*
2814 	 * If vo_pi_uninit() completed successfully.
2815 	 */
2816 	if (rv == MDI_SUCCESS) {
2817 		if (client_held) {
2818 			MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_free "
2819 			    "i_mdi_pm_rele_client\n"));
2820 			i_mdi_pm_rele_client(ct, 1);
2821 		}
2822 		i_mdi_pi_free(ph, pip, ct);
2823 		if (ct->ct_path_count == 0) {
2824 			/*
2825 			 * Client lost its last path.
2826 			 * Clean up the client device
2827 			 */
2828 			MDI_CLIENT_UNLOCK(ct);
2829 			(void) i_mdi_client_free(ct->ct_vhci, ct);
2830 			mutex_exit(&mdi_mutex);
2831 			return (rv);
2832 		}
2833 	}
2834 	MDI_CLIENT_UNLOCK(ct);
2835 	mutex_exit(&mdi_mutex);
2836 	return (rv);
2837 }
2838 
2839 /*
2840  * i_mdi_pi_free():
2841  *		Free the mdi_pathinfo node
2842  */
2843 static void
2844 i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
2845 {
2846 	int	ct_circular;
2847 	int	ph_circular;
2848 
2849 	/*
2850 	 * remove any per-path kstats
2851 	 */
2852 	i_mdi_pi_kstat_destroy(pip);
2853 
2854 	ndi_devi_enter(ct->ct_dip, &ct_circular);
2855 	ndi_devi_enter(ph->ph_dip, &ph_circular);
2856 
2857 	i_mdi_client_remove_path(ct, pip);
2858 	i_mdi_phci_remove_path(ph, pip);
2859 
2860 	ndi_devi_exit(ph->ph_dip, ph_circular);
2861 	ndi_devi_exit(ct->ct_dip, ct_circular);
2862 
2863 	mutex_destroy(&MDI_PI(pip)->pi_mutex);
2864 	cv_destroy(&MDI_PI(pip)->pi_state_cv);
2865 	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
2866 	if (MDI_PI(pip)->pi_addr) {
2867 		kmem_free(MDI_PI(pip)->pi_addr,
2868 		    strlen(MDI_PI(pip)->pi_addr) + 1);
2869 		MDI_PI(pip)->pi_addr = NULL;
2870 	}
2871 
2872 	if (MDI_PI(pip)->pi_prop) {
2873 		(void) nvlist_free(MDI_PI(pip)->pi_prop);
2874 		MDI_PI(pip)->pi_prop = NULL;
2875 	}
2876 	kmem_free(pip, sizeof (struct mdi_pathinfo));
2877 }
2878 
2879 
2880 /*
2881  * i_mdi_phci_remove_path():
2882  * 		Remove a mdi_pathinfo node from pHCI list.
2883  * Notes:
2884  *		Caller should hold per-pHCI mutex
2885  */
2886 
2887 static void
2888 i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
2889 {
2890 	mdi_pathinfo_t	*prev = NULL;
2891 	mdi_pathinfo_t	*path = NULL;
2892 
2893 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
2894 
2895 	path = ph->ph_path_head;
2896 	while (path != NULL) {
2897 		if (path == pip) {
2898 			break;
2899 		}
2900 		prev = path;
2901 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
2902 	}
2903 
2904 	if (path) {
2905 		ph->ph_path_count--;
2906 		if (prev) {
2907 			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
2908 		} else {
2909 			ph->ph_path_head =
2910 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
2911 		}
2912 		if (ph->ph_path_tail == path) {
2913 			ph->ph_path_tail = prev;
2914 		}
2915 	}
2916 
2917 	/*
2918 	 * Clear the pHCI link
2919 	 */
2920 	MDI_PI(pip)->pi_phci_link = NULL;
2921 	MDI_PI(pip)->pi_phci = NULL;
2922 }
2923 
2924 /*
2925  * i_mdi_client_remove_path():
2926  * 		Remove a mdi_pathinfo node from client path list.
2927  */
2928 
2929 static void
2930 i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
2931 {
2932 	mdi_pathinfo_t	*prev = NULL;
2933 	mdi_pathinfo_t	*path;
2934 
2935 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
2936 
2937 	path = ct->ct_path_head;
2938 	while (path != NULL) {
2939 		if (path == pip) {
2940 			break;
2941 		}
2942 		prev = path;
2943 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
2944 	}
2945 
2946 	if (path) {
2947 		ct->ct_path_count--;
2948 		if (prev) {
2949 			MDI_PI(prev)->pi_client_link =
2950 			    MDI_PI(path)->pi_client_link;
2951 		} else {
2952 			ct->ct_path_head =
2953 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
2954 		}
2955 		if (ct->ct_path_tail == path) {
2956 			ct->ct_path_tail = prev;
2957 		}
2958 		if (ct->ct_path_last == path) {
2959 			ct->ct_path_last = ct->ct_path_head;
2960 		}
2961 	}
2962 	MDI_PI(pip)->pi_client_link = NULL;
2963 	MDI_PI(pip)->pi_client = NULL;
2964 }
2965 
2966 /*
 * i_mdi_pi_state_change():
 *		Common worker that moves a mdi_pathinfo node to the requested
 *		state (online, standby, fault or offline); it is the routine
 *		behind mdi_pi_online(), mdi_pi_standby(), mdi_pi_fault() and
 *		mdi_pi_offline().
 *
 *		In outline: run the vHCI's vo_pi_init() for a node that is
 *		still initing, refuse the request if the pHCI is not ready,
 *		wait out any transient state and any failover in progress,
 *		mark the node with the transient form of the target state,
 *		call the vHCI's vo_pi_state_change() entry point, then settle
 *		the node state and bring the client devinfo node online or
 *		offline as the resulting client state requires.
 *
 * Arguments:
 *		pip	path to change
 *		state	target MDI_PATHINFO_STATE_* value
 *		flag	passed through to vo_pi_state_change(); NDI_DEVI_REMOVE
 *			in it marks a user initiated request
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		MDI_BUSY	pHCI not ready, or the client devinfo node
 *				could not be onlined/offlined because it is
 *				busy
 *		A non-success value from vo_pi_state_change() is returned
 *		unchanged.
 */
2974 /*ARGSUSED*/
2975 static int
2976 i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
2977 {
2978 	int		rv = MDI_SUCCESS;
2979 	mdi_vhci_t	*vh;
2980 	mdi_phci_t	*ph;
2981 	mdi_client_t	*ct;
2982 	int		(*f)();
2983 	dev_info_t	*cdip;
2984 
2985 	MDI_PI_LOCK(pip);
2986 
2987 	ph = MDI_PI(pip)->pi_phci;
2988 	ASSERT(ph);
2989 	if (ph == NULL) {
2990 		/*
2991 		 * Invalid pHCI device, fail the request
2992 		 */
2993 		MDI_PI_UNLOCK(pip);
2994 		MDI_DEBUG(1, (CE_WARN, NULL,
2995 		    "!mdi_pi_state_change: invalid phci"));
2996 		return (MDI_FAILURE);
2997 	}
2998 
2999 	vh = ph->ph_vhci;
3000 	ASSERT(vh);
3001 	if (vh == NULL) {
3002 		/*
3003 		 * Invalid vHCI device, fail the request
3004 		 */
3005 		MDI_PI_UNLOCK(pip);
3006 		MDI_DEBUG(1, (CE_WARN, NULL,
3007 		    "!mdi_pi_state_change: invalid vhci"));
3008 		return (MDI_FAILURE);
3009 	}
3010 
3011 	ct = MDI_PI(pip)->pi_client;
3012 	ASSERT(ct != NULL);
3013 	if (ct == NULL) {
3014 		/*
3015 		 * Invalid client device, fail the request
3016 		 */
3017 		MDI_PI_UNLOCK(pip);
3018 		MDI_DEBUG(1, (CE_WARN, NULL,
3019 		    "!mdi_pi_state_change: invalid client"));
3020 		return (MDI_FAILURE);
3021 	}
3022 
3023 	/*
3024 	 * If this path has not been initialized yet, Callback vHCI driver's
3025 	 * pathinfo node initialize entry point
3026 	 */
3027 
3028 	if (MDI_PI_IS_INITING(pip)) {
		/*
		 * The callback runs with the pathinfo lock dropped.  A
		 * failure returns right here: the lock is not held and the
		 * pHCI has not been marked unstable yet, so there is nothing
		 * to undo.
		 */
3029 		MDI_PI_UNLOCK(pip);
3030 		f = vh->vh_ops->vo_pi_init;
3031 		if (f != NULL) {
3032 			rv = (*f)(vh->vh_dip, pip, 0);
3033 			if (rv != MDI_SUCCESS) {
3034 				MDI_DEBUG(1, (CE_WARN, vh->vh_dip,
3035 				    "!vo_pi_init: failed vHCI=0x%p, pip=0x%p",
3036 				    vh, pip));
3037 				return (MDI_FAILURE);
3038 			}
3039 		}
3040 		MDI_PI_LOCK(pip);
3041 		MDI_PI_CLEAR_TRANSIENT(pip);
3042 	}
3043 
3044 	/*
3045 	 * Do not allow state transition when pHCI is in offline/suspended
3046 	 * states
3047 	 */
	/*
	 * NOTE(review): i_mdi_phci_lock() is called with the pathinfo lock
	 * held, which is the reverse of the documented ph_mutex -> pi_mutex
	 * order; presumably the helper copes with that -- confirm against
	 * its definition.
	 */
3048 	i_mdi_phci_lock(ph, pip);
3049 	if (MDI_PHCI_IS_READY(ph) == 0) {
3050 		MDI_DEBUG(1, (CE_WARN, NULL,
3051 		    "!mdi_pi_state_change: pHCI not ready, pHCI=%p", ph));
3052 		MDI_PI_UNLOCK(pip);
3053 		i_mdi_phci_unlock(ph);
3054 		return (MDI_BUSY);
3055 	}
	/*
	 * From here on the pHCI is marked unstable; the paths below all end
	 * at state_change_exit, which marks it stable again.
	 */
3056 	MDI_PHCI_UNSTABLE(ph);
3057 	i_mdi_phci_unlock(ph);
3058 
3059 	/*
3060 	 * Check if mdi_pathinfo state is in transient state.
3061 	 * If yes, offlining is in progress and wait till transient state is
3062 	 * cleared.
3063 	 */
3064 	if (MDI_PI_IS_TRANSIENT(pip)) {
3065 		while (MDI_PI_IS_TRANSIENT(pip)) {
3066 			cv_wait(&MDI_PI(pip)->pi_state_cv,
3067 			    &MDI_PI(pip)->pi_mutex);
3068 		}
3069 	}
3070 
3071 	/*
3072 	 * Grab the client lock in reverse order sequence and release the
3073 	 * mdi_pathinfo mutex.
3074 	 */
3075 	i_mdi_client_lock(ct, pip);
3076 	MDI_PI_UNLOCK(pip);
3077 
3078 	/*
3079 	 * Wait till failover state is cleared
3080 	 */
3081 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3082 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3083 
3084 	/*
3085 	 * Mark the mdi_pathinfo node state as transient
3086 	 */
3087 	MDI_PI_LOCK(pip);
3088 	switch (state) {
3089 	case MDI_PATHINFO_STATE_ONLINE:
3090 		MDI_PI_SET_ONLINING(pip);
3091 		break;
3092 
3093 	case MDI_PATHINFO_STATE_STANDBY:
3094 		MDI_PI_SET_STANDBYING(pip);
3095 		break;
3096 
3097 	case MDI_PATHINFO_STATE_FAULT:
3098 		/*
3099 		 * Mark the pathinfo state as FAULTED
3100 		 */
3101 		MDI_PI_SET_FAULTING(pip);
3102 		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
3103 		break;
3104 
3105 	case MDI_PATHINFO_STATE_OFFLINE:
3106 		/*
3107 		 * ndi_devi_offline() cannot hold pip or ct locks.
3108 		 */
3109 		MDI_PI_UNLOCK(pip);
3110 		/*
3111 		 * Do not offline if path will become last path and path
3112 		 * is busy for user initiated events.
3113 		 */
3114 		cdip = ct->ct_dip;
3115 		if ((flag & NDI_DEVI_REMOVE) &&
3116 		    (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED)) {
3117 			i_mdi_client_unlock(ct);
3118 			rv = ndi_devi_offline(cdip, 0);
3119 			if (rv != NDI_SUCCESS) {
3120 				/*
3121 				 * Convert to MDI error code
3122 				 */
3123 				switch (rv) {
3124 				case NDI_BUSY:
3125 					rv = MDI_BUSY;
3126 					break;
3127 				default:
3128 					rv = MDI_FAILURE;
3129 					break;
3130 				}
				/*
				 * Neither the pathinfo lock nor the client
				 * lock is held at this point, and the path
				 * was never marked offlining; only the pHCI
				 * needs to be marked stable again.
				 */
3131 				goto state_change_exit;
3132 			} else {
3133 				i_mdi_client_lock(ct, NULL);
3134 			}
3135 		}
3136 		/*
3137 		 * Mark the mdi_pathinfo node state as transient
3138 		 */
3139 		MDI_PI_LOCK(pip);
3140 		MDI_PI_SET_OFFLINING(pip);
3141 		break;
3142 	}
	/*
	 * Both locks are dropped around the vHCI callback below; the client
	 * is marked unstable for its duration.
	 */
3143 	MDI_PI_UNLOCK(pip);
3144 	MDI_CLIENT_UNSTABLE(ct);
3145 	i_mdi_client_unlock(ct);
3146 
3147 	f = vh->vh_ops->vo_pi_state_change;
3148 	if (f != NULL) {
3149 		rv = (*f)(vh->vh_dip, pip, state, 0, flag);
3150 		if (rv == MDI_NOT_SUPPORTED) {
3151 			MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
3152 		}
3153 		if (rv != MDI_SUCCESS) {
3154 			MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
3155 			    "!vo_pi_state_change: failed rv = %x", rv));
3156 		}
3157 	}
3158 	MDI_CLIENT_LOCK(ct);
3159 	MDI_PI_LOCK(pip);
	/*
	 * Settle the transient state: commit it if the callback succeeded,
	 * otherwise fall back to the state the node had before.
	 */
3160 	if (MDI_PI_IS_TRANSIENT(pip)) {
3161 		if (rv == MDI_SUCCESS) {
3162 			MDI_PI_CLEAR_TRANSIENT(pip);
3163 		} else {
3164 			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
3165 		}
3166 	}
3167 
3168 	/*
3169 	 * Wake anyone waiting for this mdi_pathinfo node
3170 	 */
3171 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3172 	MDI_PI_UNLOCK(pip);
3173 
3174 	/*
3175 	 * Mark the client device as stable
3176 	 */
3177 	MDI_CLIENT_STABLE(ct);
3178 	if (rv == MDI_SUCCESS) {
		/*
		 * The client state is only re-evaluated once no other
		 * state change has the client marked unstable.
		 */
3179 		if (ct->ct_unstable == 0) {
3180 			cdip = ct->ct_dip;
3181 
3182 			/*
3183 			 * Onlining the mdi_pathinfo node will impact the
3184 			 * client state Update the client and dev_info node
3185 			 * state accordingly
3186 			 */
			/*
			 * Within this block rv carries an NDI_* code; it is
			 * converted back to an MDI_* code by the switch at
			 * the end of the block.
			 */
3187 			rv = NDI_SUCCESS;
3188 			i_mdi_client_update_state(ct);
3189 			switch (MDI_CLIENT_STATE(ct)) {
3190 			case MDI_CLIENT_STATE_OPTIMAL:
3191 			case MDI_CLIENT_STATE_DEGRADED:
3192 				if (cdip &&
3193 				    (i_ddi_node_state(cdip) < DS_READY) &&
3194 				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
3195 				    (state == MDI_PATHINFO_STATE_STANDBY))) {
3196 
3197 					i_mdi_client_unlock(ct);
3198 					/*
3199 					 * Must do ndi_devi_online() through
3200 					 * hotplug thread for deferred
3201 					 * attach mechanism to work
3202 					 */
3203 					rv = ndi_devi_online(cdip, 0);
3204 					i_mdi_client_lock(ct, NULL);
3205 					if ((rv != NDI_SUCCESS) &&
3206 					    (MDI_CLIENT_STATE(ct) ==
3207 					    MDI_CLIENT_STATE_DEGRADED)) {
3208 						/*
3209 						 * ndi_devi_online failed.
3210 						 * Reset client flags to
3211 						 * offline.
3212 						 */
3213 						MDI_DEBUG(1, (CE_WARN, cdip,
3214 						    "!ndi_devi_online: failed "
3215 						    " Error: %x", rv));
3216 						MDI_CLIENT_SET_OFFLINE(ct);
3217 					}
3218 					if (rv != NDI_SUCCESS) {
3219 						/* Reset the path state */
3220 						MDI_PI_LOCK(pip);
3221 						MDI_PI(pip)->pi_state =
3222 						    MDI_PI_OLD_STATE(pip);
3223 						MDI_PI_UNLOCK(pip);
3224 					}
3225 				}
3226 				break;
3227 
3228 			case MDI_CLIENT_STATE_FAILED:
3229 				/*
3230 				 * This is the last path case for
3231 				 * non-user initiated events.
3232 				 */
3233 				if (((flag & NDI_DEVI_REMOVE) == 0) &&
3234 				    cdip && (i_ddi_node_state(cdip) >=
3235 				    DS_INITIALIZED)) {
3236 					i_mdi_client_unlock(ct);
3237 					rv = ndi_devi_offline(cdip, 0);
3238 					i_mdi_client_lock(ct, NULL);
3239 
3240 					if (rv != NDI_SUCCESS) {
3241 						/*
3242 						 * ndi_devi_offline failed.
3243 						 * Reset client flags to
3244 						 * online as the path could not
3245 						 * be offlined.
3246 						 */
3247 						MDI_DEBUG(1, (CE_WARN, cdip,
3248 						    "!ndi_devi_offline: failed "
3249 						    " Error: %x", rv));
3250 						MDI_CLIENT_SET_ONLINE(ct);
3251 					}
3252 				}
3253 				break;
3254 			}
3255 			/*
3256 			 * Convert to MDI error code
3257 			 */
3258 			switch (rv) {
3259 			case NDI_SUCCESS:
3260 				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3261 				i_mdi_report_path_state(ct, pip);
3262 				rv = MDI_SUCCESS;
3263 				break;
3264 			case NDI_BUSY:
3265 				rv = MDI_BUSY;
3266 				break;
3267 			default:
3268 				rv = MDI_FAILURE;
3269 				break;
3270 			}
3271 		}
3272 	}
3273 	MDI_CLIENT_UNLOCK(ct);
3274 
3275 state_change_exit:
3276 	/*
3277 	 * Mark the pHCI as stable again.
3278 	 */
3279 	MDI_PHCI_LOCK(ph);
3280 	MDI_PHCI_STABLE(ph);
3281 	MDI_PHCI_UNLOCK(ph);
3282 	return (rv);
3283 }
3284 
3285 /*
3286  * mdi_pi_online():
3287  *		Place the path_info node in the online state.  The path is
3288  *		now available to be selected by mdi_select_path() for
3289  *		transporting I/O requests to client devices.
3290  * Return Values:
3291  *		MDI_SUCCESS
3292  *		MDI_FAILURE
3293  */
3294 int
3295 mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3296 {
3297 	mdi_client_t *ct = MDI_PI(pip)->pi_client;
3298 	dev_info_t *cdip;
3299 	int		client_held = 0;
3300 	int rv;
3301 
3302 	ASSERT(ct != NULL);
3303 	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
3304 	if (rv != MDI_SUCCESS)
3305 		return (rv);
3306 
3307 	MDI_PI_LOCK(pip);
3308 	if (MDI_PI(pip)->pi_pm_held == 0) {
3309 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_online "
3310 		    "i_mdi_pm_hold_pip\n"));
3311 		i_mdi_pm_hold_pip(pip);
3312 		client_held = 1;
3313 	}
3314 	MDI_PI_UNLOCK(pip);
3315 
3316 	if (client_held) {
3317 		MDI_CLIENT_LOCK(ct);
3318 		if (ct->ct_power_cnt == 0) {
3319 			rv = i_mdi_power_all_phci(ct);
3320 		}
3321 
3322 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "mdi_pi_online "
3323 		    "i_mdi_pm_hold_client\n"));
3324 		i_mdi_pm_hold_client(ct, 1);
3325 		MDI_CLIENT_UNLOCK(ct);
3326 	}
3327 
3328 	/*
3329 	 * Create the per-path (pathinfo) IO and error kstats which
3330 	 * are reported via iostat(1m).
3331 	 *
3332 	 * Defer creating the per-path kstats if device is not yet
3333 	 * attached;  the names of the kstats are constructed in part
3334 	 * using the devices instance number which is assigned during
3335 	 * process of attaching the client device.
3336 	 *
3337 	 * The framework post_attach handler, mdi_post_attach(), is
3338 	 * is responsible for initializing the client's pathinfo list
3339 	 * once successfully attached.
3340 	 */
3341 	cdip = ct->ct_dip;
3342 	ASSERT(cdip);
3343 	if (cdip == NULL || (i_ddi_node_state(cdip) < DS_ATTACHED))
3344 		return (rv);
3345 
3346 	MDI_CLIENT_LOCK(ct);
3347 	rv = i_mdi_pi_kstat_create(pip);
3348 	MDI_CLIENT_UNLOCK(ct);
3349 	return (rv);
3350 }
3351 
3352 /*
3353  * mdi_pi_standby():
3354  *		Place the mdi_pathinfo node in standby state
3355  *
3356  * Return Values:
3357  *		MDI_SUCCESS
3358  *		MDI_FAILURE
3359  */
3360 int
3361 mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
3362 {
3363 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
3364 }
3365 
3366 /*
3367  * mdi_pi_fault():
3368  *		Place the mdi_pathinfo node in fault'ed state
3369  * Return Values:
3370  *		MDI_SUCCESS
3371  *		MDI_FAILURE
3372  */
3373 int
3374 mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
3375 {
3376 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
3377 }
3378 
3379 /*
3380  * mdi_pi_offline():
3381  *		Offline a mdi_pathinfo node.
3382  * Return Values:
3383  *		MDI_SUCCESS
3384  *		MDI_FAILURE
3385  */
3386 int
3387 mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3388 {
3389 	int	ret, client_held = 0;
3390 	mdi_client_t	*ct;
3391 
3392 	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
3393 
3394 	if (ret == MDI_SUCCESS) {
3395 		MDI_PI_LOCK(pip);
3396 		if (MDI_PI(pip)->pi_pm_held) {
3397 			client_held = 1;
3398 		}
3399 		MDI_PI_UNLOCK(pip);
3400 
3401 		if (client_held) {
3402 			ct = MDI_PI(pip)->pi_client;
3403 			MDI_CLIENT_LOCK(ct);
3404 			MDI_DEBUG(4, (CE_NOTE, ct->ct_dip,
3405 			    "mdi_pi_offline i_mdi_pm_rele_client\n"));
3406 			i_mdi_pm_rele_client(ct, 1);
3407 			MDI_CLIENT_UNLOCK(ct);
3408 		}
3409 	}
3410 
3411 	return (ret);
3412 }
3413 
3414 /*
3415  * i_mdi_pi_offline():
3416  *		Offline a mdi_pathinfo node and call the vHCI driver's callback
3417  */
3418 static int
3419 i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3420 {
3421 	dev_info_t	*vdip = NULL;
3422 	mdi_vhci_t	*vh = NULL;
3423 	mdi_client_t	*ct = NULL;
3424 	int		(*f)();
3425 	int		rv;
3426 
3427 	MDI_PI_LOCK(pip);
3428 	ct = MDI_PI(pip)->pi_client;
3429 	ASSERT(ct != NULL);
3430 
3431 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3432 		/*
3433 		 * Give a chance for pending I/Os to complete.
3434 		 */
3435 		MDI_DEBUG(1, (CE_NOTE, vdip, "!i_mdi_pi_offline: "
3436 		    "%d cmds still pending on path: %p\n",
3437 		    MDI_PI(pip)->pi_ref_cnt, pip));
3438 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
3439 		    &MDI_PI(pip)->pi_mutex,
3440 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
3441 			/*
3442 			 * The timeout time reached without ref_cnt being zero
3443 			 * being signaled.
3444 			 */
3445 			MDI_DEBUG(1, (CE_NOTE, vdip, "!i_mdi_pi_offline: "
3446 			    "Timeout reached on path %p without the cond\n",
3447 			    pip));
3448 			MDI_DEBUG(1, (CE_NOTE, vdip, "!i_mdi_pi_offline: "
3449 			    "%d cmds still pending on path: %p\n",
3450 			    MDI_PI(pip)->pi_ref_cnt, pip));
3451 		}
3452 	}
3453 	vh = ct->ct_vhci;
3454 	vdip = vh->vh_dip;
3455 
3456 	/*
3457 	 * Notify vHCI that has registered this event
3458 	 */
3459 	ASSERT(vh->vh_ops);
3460 	f = vh->vh_ops->vo_pi_state_change;
3461 
3462 	if (f != NULL) {
3463 		MDI_PI_UNLOCK(pip);
3464 		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
3465 		    flags)) != MDI_SUCCESS) {
3466 			MDI_DEBUG(1, (CE_WARN, vdip, "!vo_path_offline failed "
3467 			    "vdip 0x%x, pip 0x%x", vdip, pip));
3468 		}
3469 		MDI_PI_LOCK(pip);
3470 	}
3471 
3472 	/*
3473 	 * Set the mdi_pathinfo node state and clear the transient condition
3474 	 */
3475 	MDI_PI_SET_OFFLINE(pip);
3476 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3477 	MDI_PI_UNLOCK(pip);
3478 
3479 	MDI_CLIENT_LOCK(ct);
3480 	if (rv == MDI_SUCCESS) {
3481 		if (ct->ct_unstable == 0) {
3482 			dev_info_t	*cdip = ct->ct_dip;
3483 
3484 			/*
3485 			 * Onlining the mdi_pathinfo node will impact the
3486 			 * client state Update the client and dev_info node
3487 			 * state accordingly
3488 			 */
3489 			i_mdi_client_update_state(ct);
3490 			rv = NDI_SUCCESS;
3491 			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
3492 				if (cdip &&
3493 				    (i_ddi_node_state(cdip) >=
3494 				    DS_INITIALIZED)) {
3495 					MDI_CLIENT_UNLOCK(ct);
3496 					rv = ndi_devi_offline(cdip, 0);
3497 					MDI_CLIENT_LOCK(ct);
3498 					if (rv != NDI_SUCCESS) {
3499 						/*
3500 						 * ndi_devi_offline failed.
3501 						 * Reset client flags to
3502 						 * online.
3503 						 */
3504 						MDI_DEBUG(4, (CE_WARN, cdip,
3505 						    "!ndi_devi_offline: failed "
3506 						    " Error: %x", rv));
3507 						MDI_CLIENT_SET_ONLINE(ct);
3508 					}
3509 				}
3510 			}
3511 			/*
3512 			 * Convert to MDI error code
3513 			 */
3514 			switch (rv) {
3515 			case NDI_SUCCESS:
3516 				rv = MDI_SUCCESS;
3517 				break;
3518 			case NDI_BUSY:
3519 				rv = MDI_BUSY;
3520 				break;
3521 			default:
3522 				rv = MDI_FAILURE;
3523 				break;
3524 			}
3525 		}
3526 		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3527 		i_mdi_report_path_state(ct, pip);
3528 	}
3529 
3530 	MDI_CLIENT_UNLOCK(ct);
3531 
3532 	/*
3533 	 * Change in the mdi_pathinfo node state will impact the client state
3534 	 */
3535 	MDI_DEBUG(2, (CE_NOTE, NULL, "!i_mdi_pi_offline ct = %p pip = %p",
3536 	    ct, pip));
3537 	return (rv);
3538 }
3539 
3540 
3541 /*
3542  * mdi_pi_get_addr():
3543  *		Get the unit address associated with a mdi_pathinfo node
3544  *
3545  * Return Values:
3546  *		char *
3547  */
3548 char *
3549 mdi_pi_get_addr(mdi_pathinfo_t *pip)
3550 {
3551 	if (pip == NULL)
3552 		return (NULL);
3553 
3554 	return (MDI_PI(pip)->pi_addr);
3555 }
3556 
3557 /*
3558  * mdi_pi_get_client():
3559  *		Get the client devinfo associated with a mdi_pathinfo node
3560  *
3561  * Return Values:
3562  *		Handle to client device dev_info node
3563  */
3564 dev_info_t *
3565 mdi_pi_get_client(mdi_pathinfo_t *pip)
3566 {
3567 	dev_info_t	*dip = NULL;
3568 	if (pip) {
3569 		dip = MDI_PI(pip)->pi_client->ct_dip;
3570 	}
3571 	return (dip);
3572 }
3573 
3574 /*
3575  * mdi_pi_get_phci():
3576  *		Get the pHCI devinfo associated with the mdi_pathinfo node
3577  * Return Values:
3578  *		Handle to dev_info node
3579  */
3580 dev_info_t *
3581 mdi_pi_get_phci(mdi_pathinfo_t *pip)
3582 {
3583 	dev_info_t	*dip = NULL;
3584 	if (pip) {
3585 		dip = MDI_PI(pip)->pi_phci->ph_dip;
3586 	}
3587 	return (dip);
3588 }
3589 
3590 /*
3591  * mdi_pi_get_client_private():
3592  *		Get the client private information associated with the
3593  *		mdi_pathinfo node
3594  */
3595 void *
3596 mdi_pi_get_client_private(mdi_pathinfo_t *pip)
3597 {
3598 	void *cprivate = NULL;
3599 	if (pip) {
3600 		cprivate = MDI_PI(pip)->pi_cprivate;
3601 	}
3602 	return (cprivate);
3603 }
3604 
3605 /*
3606  * mdi_pi_set_client_private():
3607  *		Set the client private information in the mdi_pathinfo node
3608  */
3609 void
3610 mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
3611 {
3612 	if (pip) {
3613 		MDI_PI(pip)->pi_cprivate = priv;
3614 	}
3615 }
3616 
3617 /*
3618  * mdi_pi_get_phci_private():
3619  *		Get the pHCI private information associated with the
3620  *		mdi_pathinfo node
3621  */
3622 caddr_t
3623 mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
3624 {
3625 	caddr_t	pprivate = NULL;
3626 	if (pip) {
3627 		pprivate = MDI_PI(pip)->pi_pprivate;
3628 	}
3629 	return (pprivate);
3630 }
3631 
3632 /*
3633  * mdi_pi_set_phci_private():
3634  *		Set the pHCI private information in the mdi_pathinfo node
3635  */
3636 void
3637 mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
3638 {
3639 	if (pip) {
3640 		MDI_PI(pip)->pi_pprivate = priv;
3641 	}
3642 }
3643 
3644 /*
3645  * mdi_pi_get_state():
3646  *		Get the mdi_pathinfo node state. Transient states are internal
3647  *		and not provided to the users
3648  */
3649 mdi_pathinfo_state_t
3650 mdi_pi_get_state(mdi_pathinfo_t *pip)
3651 {
3652 	mdi_pathinfo_state_t    state = MDI_PATHINFO_STATE_INIT;
3653 
3654 	if (pip) {
3655 		if (MDI_PI_IS_TRANSIENT(pip)) {
3656 			/*
3657 			 * mdi_pathinfo is in state transition.  Return the
3658 			 * last good state.
3659 			 */
3660 			state = MDI_PI_OLD_STATE(pip);
3661 		} else {
3662 			state = MDI_PI_STATE(pip);
3663 		}
3664 	}
3665 	return (state);
3666 }
3667 
3668 /*
3669  * Note that the following function needs to be the new interface for
3670  * mdi_pi_get_state when mpxio gets integrated to ON.
3671  */
3672 int
3673 mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
3674 		uint32_t *ext_state)
3675 {
3676 	*state = MDI_PATHINFO_STATE_INIT;
3677 
3678 	if (pip) {
3679 		if (MDI_PI_IS_TRANSIENT(pip)) {
3680 			/*
3681 			 * mdi_pathinfo is in state transition.  Return the
3682 			 * last good state.
3683 			 */
3684 			*state = MDI_PI_OLD_STATE(pip);
3685 			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
3686 		} else {
3687 			*state = MDI_PI_STATE(pip);
3688 			*ext_state = MDI_PI_EXT_STATE(pip);
3689 		}
3690 	}
3691 	return (MDI_SUCCESS);
3692 }
3693 
3694 /*
3695  * mdi_pi_get_preferred:
3696  *	Get the preferred path flag
3697  */
3698 int
3699 mdi_pi_get_preferred(mdi_pathinfo_t *pip)
3700 {
3701 	if (pip) {
3702 		return (MDI_PI(pip)->pi_preferred);
3703 	}
3704 	return (0);
3705 }
3706 
3707 /*
3708  * mdi_pi_set_preferred:
3709  *	Set the preferred path flag
3710  */
3711 void
3712 mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
3713 {
3714 	if (pip) {
3715 		MDI_PI(pip)->pi_preferred = preferred;
3716 	}
3717 }
3718 
3719 
3720 /*
3721  * mdi_pi_set_state():
3722  *		Set the mdi_pathinfo node state
3723  */
3724 void
3725 mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
3726 {
3727 	uint32_t	ext_state;
3728 
3729 	if (pip) {
3730 		ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
3731 		MDI_PI(pip)->pi_state = state;
3732 		MDI_PI(pip)->pi_state |= ext_state;
3733 	}
3734 }
3735 
3736 /*
3737  * Property functions:
3738  */
3739 
3740 int
3741 i_map_nvlist_error_to_mdi(int val)
3742 {
3743 	int rv;
3744 
3745 	switch (val) {
3746 	case 0:
3747 		rv = DDI_PROP_SUCCESS;
3748 		break;
3749 	case EINVAL:
3750 	case ENOTSUP:
3751 		rv = DDI_PROP_INVAL_ARG;
3752 		break;
3753 	case ENOMEM:
3754 		rv = DDI_PROP_NO_MEMORY;
3755 		break;
3756 	default:
3757 		rv = DDI_PROP_NOT_FOUND;
3758 		break;
3759 	}
3760 	return (rv);
3761 }
3762 
3763 /*
3764  * mdi_pi_get_next_prop():
3765  * 		Property walk function.  The caller should hold mdi_pi_lock()
3766  *		and release by calling mdi_pi_unlock() at the end of walk to
3767  *		get a consistent value.
3768  */
3769 
3770 nvpair_t *
3771 mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
3772 {
3773 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
3774 		return (NULL);
3775 	}
3776 	ASSERT(MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3777 	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
3778 }
3779 
3780 /*
3781  * mdi_prop_remove():
3782  * 		Remove the named property from the named list.
3783  */
3784 
3785 int
3786 mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
3787 {
3788 	if (pip == NULL) {
3789 		return (DDI_PROP_NOT_FOUND);
3790 	}
3791 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3792 	MDI_PI_LOCK(pip);
3793 	if (MDI_PI(pip)->pi_prop == NULL) {
3794 		MDI_PI_UNLOCK(pip);
3795 		return (DDI_PROP_NOT_FOUND);
3796 	}
3797 	if (name) {
3798 		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
3799 	} else {
3800 		char		nvp_name[MAXNAMELEN];
3801 		nvpair_t	*nvp;
3802 		nvp = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
3803 		while (nvp) {
3804 			nvpair_t	*next;
3805 			next = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, nvp);
3806 			(void) snprintf(nvp_name, MAXNAMELEN, "%s",
3807 			    nvpair_name(nvp));
3808 			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
3809 			    nvp_name);
3810 			nvp = next;
3811 		}
3812 	}
3813 	MDI_PI_UNLOCK(pip);
3814 	return (DDI_PROP_SUCCESS);
3815 }
3816 
3817 /*
3818  * mdi_prop_size():
3819  * 		Get buffer size needed to pack the property data.
3820  * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
3821  *		buffer size.
3822  */
3823 
3824 int
3825 mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
3826 {
3827 	int	rv;
3828 	size_t	bufsize;
3829 
3830 	*buflenp = 0;
3831 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
3832 		return (DDI_PROP_NOT_FOUND);
3833 	}
3834 	ASSERT(MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3835 	rv = nvlist_size(MDI_PI(pip)->pi_prop,
3836 	    &bufsize, NV_ENCODE_NATIVE);
3837 	*buflenp = bufsize;
3838 	return (i_map_nvlist_error_to_mdi(rv));
3839 }
3840 
3841 /*
3842  * mdi_prop_pack():
3843  * 		pack the property list.  The caller should hold the
3844  *		mdi_pathinfo_t node to get a consistent data
3845  */
3846 
3847 int
3848 mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
3849 {
3850 	int	rv;
3851 	size_t	bufsize;
3852 
3853 	if ((pip == NULL) || MDI_PI(pip)->pi_prop == NULL) {
3854 		return (DDI_PROP_NOT_FOUND);
3855 	}
3856 
3857 	ASSERT(MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3858 
3859 	bufsize = buflen;
3860 	rv = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, (size_t *)&bufsize,
3861 	    NV_ENCODE_NATIVE, KM_SLEEP);
3862 
3863 	return (i_map_nvlist_error_to_mdi(rv));
3864 }
3865 
3866 /*
3867  * mdi_prop_update_byte():
3868  *		Create/Update a byte property
3869  */
3870 int
3871 mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
3872 {
3873 	int rv;
3874 
3875 	if (pip == NULL) {
3876 		return (DDI_PROP_INVAL_ARG);
3877 	}
3878 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3879 	MDI_PI_LOCK(pip);
3880 	if (MDI_PI(pip)->pi_prop == NULL) {
3881 		MDI_PI_UNLOCK(pip);
3882 		return (DDI_PROP_NOT_FOUND);
3883 	}
3884 	rv = nvlist_add_byte(MDI_PI(pip)->pi_prop, name, data);
3885 	MDI_PI_UNLOCK(pip);
3886 	return (i_map_nvlist_error_to_mdi(rv));
3887 }
3888 
3889 /*
3890  * mdi_prop_update_byte_array():
3891  *		Create/Update a byte array property
3892  */
3893 int
3894 mdi_prop_update_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t *data,
3895     uint_t nelements)
3896 {
3897 	int rv;
3898 
3899 	if (pip == NULL) {
3900 		return (DDI_PROP_INVAL_ARG);
3901 	}
3902 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3903 	MDI_PI_LOCK(pip);
3904 	if (MDI_PI(pip)->pi_prop == NULL) {
3905 		MDI_PI_UNLOCK(pip);
3906 		return (DDI_PROP_NOT_FOUND);
3907 	}
3908 	rv = nvlist_add_byte_array(MDI_PI(pip)->pi_prop, name, data, nelements);
3909 	MDI_PI_UNLOCK(pip);
3910 	return (i_map_nvlist_error_to_mdi(rv));
3911 }
3912 
3913 /*
3914  * mdi_prop_update_int():
3915  *		Create/Update a 32 bit integer property
3916  */
3917 int
3918 mdi_prop_update_int(mdi_pathinfo_t *pip, char *name, int data)
3919 {
3920 	int rv;
3921 
3922 	if (pip == NULL) {
3923 		return (DDI_PROP_INVAL_ARG);
3924 	}
3925 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3926 	MDI_PI_LOCK(pip);
3927 	if (MDI_PI(pip)->pi_prop == NULL) {
3928 		MDI_PI_UNLOCK(pip);
3929 		return (DDI_PROP_NOT_FOUND);
3930 	}
3931 	rv = nvlist_add_int32(MDI_PI(pip)->pi_prop, name, (int32_t)data);
3932 	MDI_PI_UNLOCK(pip);
3933 	return (i_map_nvlist_error_to_mdi(rv));
3934 }
3935 
3936 /*
3937  * mdi_prop_update_int64():
3938  *		Create/Update a 64 bit integer property
3939  */
3940 int
3941 mdi_prop_update_int64(mdi_pathinfo_t *pip, char *name, int64_t data)
3942 {
3943 	int rv;
3944 
3945 	if (pip == NULL) {
3946 		return (DDI_PROP_INVAL_ARG);
3947 	}
3948 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3949 	MDI_PI_LOCK(pip);
3950 	if (MDI_PI(pip)->pi_prop == NULL) {
3951 		MDI_PI_UNLOCK(pip);
3952 		return (DDI_PROP_NOT_FOUND);
3953 	}
3954 	rv = nvlist_add_int64(MDI_PI(pip)->pi_prop, name, data);
3955 	MDI_PI_UNLOCK(pip);
3956 	return (i_map_nvlist_error_to_mdi(rv));
3957 }
3958 
3959 /*
3960  * mdi_prop_update_int_array():
3961  *		Create/Update a int array property
3962  */
3963 int
3964 mdi_prop_update_int_array(mdi_pathinfo_t *pip, char *name, int *data,
3965 	    uint_t nelements)
3966 {
3967 	int rv;
3968 
3969 	if (pip == NULL) {
3970 		return (DDI_PROP_INVAL_ARG);
3971 	}
3972 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3973 	MDI_PI_LOCK(pip);
3974 	if (MDI_PI(pip)->pi_prop == NULL) {
3975 		MDI_PI_UNLOCK(pip);
3976 		return (DDI_PROP_NOT_FOUND);
3977 	}
3978 	rv = nvlist_add_int32_array(MDI_PI(pip)->pi_prop, name, (int32_t *)data,
3979 	    nelements);
3980 	MDI_PI_UNLOCK(pip);
3981 	return (i_map_nvlist_error_to_mdi(rv));
3982 }
3983 
3984 /*
3985  * mdi_prop_update_string():
3986  *		Create/Update a string property
3987  */
3988 int
3989 mdi_prop_update_string(mdi_pathinfo_t *pip, char *name, char *data)
3990 {
3991 	int rv;
3992 
3993 	if (pip == NULL) {
3994 		return (DDI_PROP_INVAL_ARG);
3995 	}
3996 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
3997 	MDI_PI_LOCK(pip);
3998 	if (MDI_PI(pip)->pi_prop == NULL) {
3999 		MDI_PI_UNLOCK(pip);
4000 		return (DDI_PROP_NOT_FOUND);
4001 	}
4002 	rv = nvlist_add_string(MDI_PI(pip)->pi_prop, name, data);
4003 	MDI_PI_UNLOCK(pip);
4004 	return (i_map_nvlist_error_to_mdi(rv));
4005 }
4006 
4007 /*
4008  * mdi_prop_update_string_array():
4009  *		Create/Update a string array property
4010  */
4011 int
4012 mdi_prop_update_string_array(mdi_pathinfo_t *pip, char *name, char **data,
4013     uint_t nelements)
4014 {
4015 	int rv;
4016 
4017 	if (pip == NULL) {
4018 		return (DDI_PROP_INVAL_ARG);
4019 	}
4020 	ASSERT(!MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
4021 	MDI_PI_LOCK(pip);
4022 	if (MDI_PI(pip)->pi_prop == NULL) {
4023 		MDI_PI_UNLOCK(pip);
4024 		return (DDI_PROP_NOT_FOUND);
4025 	}
4026 	rv = nvlist_add_string_array(MDI_PI(pip)->pi_prop, name, data,
4027 	    nelements);
4028 	MDI_PI_UNLOCK(pip);
4029 	return (i_map_nvlist_error_to_mdi(rv));
4030 }
4031 
4032 /*
4033  * mdi_prop_lookup_byte():
4034  * 		Look for byte property identified by name.  The data returned
4035  *		is the actual property and valid as long as mdi_pathinfo_t node
4036  *		is alive.
4037  */
4038 int
4039 mdi_prop_lookup_byte(mdi_pathinfo_t *pip, char *name, uchar_t *data)
4040 {
4041 	int rv;
4042 
4043 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4044 		return (DDI_PROP_NOT_FOUND);
4045 	}
4046 	rv = nvlist_lookup_byte(MDI_PI(pip)->pi_prop, name, data);
4047 	return (i_map_nvlist_error_to_mdi(rv));
4048 }
4049 
4050 
4051 /*
4052  * mdi_prop_lookup_byte_array():
4053  * 		Look for byte array property identified by name.  The data
4054  *		returned is the actual property and valid as long as
4055  *		mdi_pathinfo_t node is alive.
4056  */
4057 int
4058 mdi_prop_lookup_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t **data,
4059     uint_t *nelements)
4060 {
4061 	int rv;
4062 
4063 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4064 		return (DDI_PROP_NOT_FOUND);
4065 	}
4066 	rv = nvlist_lookup_byte_array(MDI_PI(pip)->pi_prop, name, data,
4067 	    nelements);
4068 	return (i_map_nvlist_error_to_mdi(rv));
4069 }
4070 
4071 /*
4072  * mdi_prop_lookup_int():
4073  * 		Look for int property identified by name.  The data returned
4074  *		is the actual property and valid as long as mdi_pathinfo_t
4075  *		node is alive.
4076  */
4077 int
4078 mdi_prop_lookup_int(mdi_pathinfo_t *pip, char *name, int *data)
4079 {
4080 	int rv;
4081 
4082 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4083 		return (DDI_PROP_NOT_FOUND);
4084 	}
4085 	rv = nvlist_lookup_int32(MDI_PI(pip)->pi_prop, name, (int32_t *)data);
4086 	return (i_map_nvlist_error_to_mdi(rv));
4087 }
4088 
4089 /*
4090  * mdi_prop_lookup_int64():
4091  * 		Look for int64 property identified by name.  The data returned
4092  *		is the actual property and valid as long as mdi_pathinfo_t node
4093  *		is alive.
4094  */
4095 int
4096 mdi_prop_lookup_int64(mdi_pathinfo_t *pip, char *name, int64_t *data)
4097 {
4098 	int rv;
4099 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4100 		return (DDI_PROP_NOT_FOUND);
4101 	}
4102 	rv = nvlist_lookup_int64(MDI_PI(pip)->pi_prop, name, data);
4103 	return (i_map_nvlist_error_to_mdi(rv));
4104 }
4105 
4106 /*
4107  * mdi_prop_lookup_int_array():
4108  * 		Look for int array property identified by name.  The data
4109  *		returned is the actual property and valid as long as
4110  *		mdi_pathinfo_t node is alive.
4111  */
4112 int
4113 mdi_prop_lookup_int_array(mdi_pathinfo_t *pip, char *name, int **data,
4114     uint_t *nelements)
4115 {
4116 	int rv;
4117 
4118 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4119 		return (DDI_PROP_NOT_FOUND);
4120 	}
4121 	rv = nvlist_lookup_int32_array(MDI_PI(pip)->pi_prop, name,
4122 	    (int32_t **)data, nelements);
4123 	return (i_map_nvlist_error_to_mdi(rv));
4124 }
4125 
4126 /*
4127  * mdi_prop_lookup_string():
4128  * 		Look for string property identified by name.  The data
4129  *		returned is the actual property and valid as long as
4130  *		mdi_pathinfo_t node is alive.
4131  */
4132 int
4133 mdi_prop_lookup_string(mdi_pathinfo_t *pip, char *name, char **data)
4134 {
4135 	int rv;
4136 
4137 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4138 		return (DDI_PROP_NOT_FOUND);
4139 	}
4140 	rv = nvlist_lookup_string(MDI_PI(pip)->pi_prop, name, data);
4141 	return (i_map_nvlist_error_to_mdi(rv));
4142 }
4143 
4144 /*
4145  * mdi_prop_lookup_string_array():
4146  * 		Look for string array property identified by name.  The data
4147  *		returned is the actual property and valid as long as
4148  *		mdi_pathinfo_t node is alive.
4149  */
4150 
4151 int
4152 mdi_prop_lookup_string_array(mdi_pathinfo_t *pip, char *name, char ***data,
4153     uint_t *nelements)
4154 {
4155 	int rv;
4156 
4157 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4158 		return (DDI_PROP_NOT_FOUND);
4159 	}
4160 	rv = nvlist_lookup_string_array(MDI_PI(pip)->pi_prop, name, data,
4161 	    nelements);
4162 	return (i_map_nvlist_error_to_mdi(rv));
4163 }
4164 
4165 /*
4166  * mdi_prop_free():
4167  * 		Symmetrical function to ddi_prop_free(). nvlist_lookup_xx()
4168  *		functions return the pointer to actual property data and not a
4169  *		copy of it.  So the data returned is valid as long as
4170  *		mdi_pathinfo_t node is valid.
4171  */
4172 
4173 /*ARGSUSED*/
4174 int
4175 mdi_prop_free(void *data)
4176 {
4177 	return (DDI_PROP_SUCCESS);
4178 }
4179 
4180 /*ARGSUSED*/
4181 static void
4182 i_mdi_report_path_state(mdi_client_t *ct, mdi_pathinfo_t *pip)
4183 {
4184 	char		*phci_path, *ct_path;
4185 	char		*ct_status;
4186 	char		*status;
4187 	dev_info_t	*dip = ct->ct_dip;
4188 	char		lb_buf[64];
4189 
4190 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
4191 	if ((dip == NULL) || (ddi_get_instance(dip) == -1) ||
4192 	    (MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) == 0)) {
4193 		return;
4194 	}
4195 	if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL) {
4196 		ct_status = "optimal";
4197 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
4198 		ct_status = "degraded";
4199 	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
4200 		ct_status = "failed";
4201 	} else {
4202 		ct_status = "unknown";
4203 	}
4204 
4205 	if (MDI_PI_IS_OFFLINE(pip)) {
4206 		status = "offline";
4207 	} else if (MDI_PI_IS_ONLINE(pip)) {
4208 		status = "online";
4209 	} else if (MDI_PI_IS_STANDBY(pip)) {
4210 		status = "standby";
4211 	} else if (MDI_PI_IS_FAULT(pip)) {
4212 		status = "faulted";
4213 	} else {
4214 		status = "unknown";
4215 	}
4216 
4217 	if (ct->ct_lb == LOAD_BALANCE_LBA) {
4218 		(void) snprintf(lb_buf, sizeof (lb_buf),
4219 		    "%s, region-size: %d", mdi_load_balance_lba,
4220 			ct->ct_lb_args->region_size);
4221 	} else if (ct->ct_lb == LOAD_BALANCE_NONE) {
4222 		(void) snprintf(lb_buf, sizeof (lb_buf),
4223 		    "%s", mdi_load_balance_none);
4224 	} else {
4225 		(void) snprintf(lb_buf, sizeof (lb_buf), "%s",
4226 		    mdi_load_balance_rr);
4227 	}
4228 
4229 	if (dip) {
4230 		ct_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4231 		phci_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4232 		cmn_err(CE_CONT, "?%s (%s%d) multipath status: %s, "
4233 		    "path %s (%s%d) to target address: %s is %s"
4234 		    " Load balancing: %s\n",
4235 		    ddi_pathname(dip, ct_path), ddi_driver_name(dip),
4236 		    ddi_get_instance(dip), ct_status,
4237 		    ddi_pathname(MDI_PI(pip)->pi_phci->ph_dip, phci_path),
4238 		    ddi_driver_name(MDI_PI(pip)->pi_phci->ph_dip),
4239 		    ddi_get_instance(MDI_PI(pip)->pi_phci->ph_dip),
4240 		    MDI_PI(pip)->pi_addr, status, lb_buf);
4241 		kmem_free(phci_path, MAXPATHLEN);
4242 		kmem_free(ct_path, MAXPATHLEN);
4243 		MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct);
4244 	}
4245 }
4246 
4247 #ifdef	DEBUG
4248 /*
4249  * i_mdi_log():
4250  *		Utility function for error message management
4251  *
4252  */
4253 
4254 /*VARARGS3*/
4255 static void
4256 i_mdi_log(int level, dev_info_t *dip, const char *fmt, ...)
4257 {
4258 	char		buf[MAXNAMELEN];
4259 	char		name[MAXNAMELEN];
4260 	va_list		ap;
4261 	int		log_only = 0;
4262 	int		boot_only = 0;
4263 	int		console_only = 0;
4264 
4265 	if (dip) {
4266 		if (level == CE_PANIC || level == CE_WARN || level == CE_NOTE) {
4267 			(void) snprintf(name, MAXNAMELEN, "%s%d:\n",
4268 			    ddi_node_name(dip), ddi_get_instance(dip));
4269 		} else {
4270 			(void) snprintf(name, MAXNAMELEN, "%s%d:",
4271 			    ddi_node_name(dip), ddi_get_instance(dip));
4272 		}
4273 	} else {
4274 		name[0] = '\0';
4275 	}
4276 
4277 	va_start(ap, fmt);
4278 	(void) vsnprintf(buf, MAXNAMELEN, fmt, ap);
4279 	va_end(ap);
4280 
4281 	switch (buf[0]) {
4282 	case '!':
4283 		log_only = 1;
4284 		break;
4285 	case '?':
4286 		boot_only = 1;
4287 		break;
4288 	case '^':
4289 		console_only = 1;
4290 		break;
4291 	}
4292 
4293 	switch (level) {
4294 	case CE_NOTE:
4295 		level = CE_CONT;
4296 		/* FALLTHROUGH */
4297 	case CE_CONT:
4298 	case CE_WARN:
4299 	case CE_PANIC:
4300 		if (boot_only) {
4301 			cmn_err(level, "?%s\t%s", name, &buf[1]);
4302 		} else if (console_only) {
4303 			cmn_err(level, "^%s\t%s", name, &buf[1]);
4304 		} else if (log_only) {
4305 			cmn_err(level, "!%s\t%s", name, &buf[1]);
4306 		} else {
4307 			cmn_err(level, "%s\t%s", name, buf);
4308 		}
4309 		break;
4310 	default:
4311 		cmn_err(level, "%s\t%s", name, buf);
4312 		break;
4313 	}
4314 }
4315 #endif	/* DEBUG */
4316 
4317 void
4318 i_mdi_client_online(dev_info_t *ct_dip)
4319 {
4320 	mdi_client_t	*ct;
4321 
4322 	/*
4323 	 * Client online notification. Mark client state as online
4324 	 * restore our binding with dev_info node
4325 	 */
4326 	ct = i_devi_get_client(ct_dip);
4327 	ASSERT(ct != NULL);
4328 	MDI_CLIENT_LOCK(ct);
4329 	MDI_CLIENT_SET_ONLINE(ct);
4330 	/* catch for any memory leaks */
4331 	ASSERT((ct->ct_dip == NULL) || (ct->ct_dip == ct_dip));
4332 	ct->ct_dip = ct_dip;
4333 
4334 	if (ct->ct_power_cnt == 0)
4335 		(void) i_mdi_power_all_phci(ct);
4336 
4337 	MDI_DEBUG(4, (CE_NOTE, ct_dip, "i_mdi_client_online "
4338 	    "i_mdi_pm_hold_client\n"));
4339 	i_mdi_pm_hold_client(ct, 1);
4340 
4341 	MDI_CLIENT_UNLOCK(ct);
4342 }
4343 
4344 void
4345 i_mdi_phci_online(dev_info_t *ph_dip)
4346 {
4347 	mdi_phci_t	*ph;
4348 
4349 	/* pHCI online notification. Mark state accordingly */
4350 	ph = i_devi_get_phci(ph_dip);
4351 	ASSERT(ph != NULL);
4352 	MDI_PHCI_LOCK(ph);
4353 	MDI_PHCI_SET_ONLINE(ph);
4354 	MDI_PHCI_UNLOCK(ph);
4355 }
4356 
4357 /*
4358  * mdi_devi_online():
4359  * 		Online notification from NDI framework on pHCI/client
4360  *		device online.
4361  * Return Values:
4362  *		NDI_SUCCESS
4363  *		MDI_FAILURE
4364  */
4365 
4366 /*ARGSUSED*/
4367 int
4368 mdi_devi_online(dev_info_t *dip, uint_t flags)
4369 {
4370 	if (MDI_PHCI(dip)) {
4371 		i_mdi_phci_online(dip);
4372 	}
4373 
4374 	if (MDI_CLIENT(dip)) {
4375 		i_mdi_client_online(dip);
4376 	}
4377 	return (NDI_SUCCESS);
4378 }
4379 
4380 /*
4381  * mdi_devi_offline():
4382  * 		Offline notification from NDI framework on pHCI/Client device
4383  *		offline.
4384  *
4385  * Return Values:
4386  *		NDI_SUCCESS
4387  *		NDI_FAILURE
4388  */
4389 
4390 /*ARGSUSED*/
4391 int
4392 mdi_devi_offline(dev_info_t *dip, uint_t flags)
4393 {
4394 	int		rv = NDI_SUCCESS;
4395 
4396 	if (MDI_CLIENT(dip)) {
4397 		rv = i_mdi_client_offline(dip, flags);
4398 		if (rv != NDI_SUCCESS)
4399 			return (rv);
4400 	}
4401 
4402 	if (MDI_PHCI(dip)) {
4403 		rv = i_mdi_phci_offline(dip, flags);
4404 		if ((rv != NDI_SUCCESS) && MDI_CLIENT(dip)) {
4405 			/* set client back online */
4406 			i_mdi_client_online(dip);
4407 		}
4408 	}
4409 
4410 	return (rv);
4411 }
4412 
/*
 * i_mdi_phci_offline():
 *		Offline the pHCI bound to dip together with all of its child
 *		mdi_pathinfo nodes.
 *
 *		The work is done in passes over the pHCI's path list:
 *		  1. Refuse (NDI_BUSY) if any client is failing over or
 *		     unstable; offline every client whose computed state
 *		     would be FAILED without this pHCI.
 *		  2. If one of those client offlines failed, walk the paths
 *		     visited so far, re-online or offline their clients
 *		     according to the current client state, and return
 *		     NDI_BUSY.
 *		  3. Otherwise mark the pHCI offline, flag every path as
 *		     offlining, wait one tick, and offline each path through
 *		     i_mdi_pi_offline().
 *
 * Return Values:
 *		NDI_SUCCESS	pHCI offlined, already offline, or dip has no
 *				pHCI structure
 *		NDI_BUSY	the pHCI cannot be offlined at this time
 */
/*ARGSUSED*/
static int
i_mdi_phci_offline(dev_info_t *dip, uint_t flags)
{
	int		rv = NDI_SUCCESS;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	mdi_pathinfo_t	*failed_pip = NULL;
	dev_info_t	*cdip;

	/*
	 * pHCI component offline notification
	 * Make sure that this pHCI instance is free to be offlined.
	 * If it is OK to proceed, Offline and remove all the child
	 * mdi_pathinfo nodes.  This process automatically offlines
	 * corresponding client devices, for which this pHCI provides
	 * critical services.
	 */
	MDI_DEBUG(2, (CE_NOTE, dip, "!mdi_phci_offline called %p\n",
	    dip));

	ph = i_devi_get_phci(dip);
	if (ph == NULL) {
		/* Nothing to do; rv is still NDI_SUCCESS here. */
		return (rv);
	}

	MDI_PHCI_LOCK(ph);

	if (MDI_PHCI_IS_OFFLINE(ph)) {
		MDI_DEBUG(1, (CE_WARN, dip, "!pHCI %p already offlined", ph));
		MDI_PHCI_UNLOCK(ph);
		return (NDI_SUCCESS);
	}

	/*
	 * Check to see if the pHCI can be offlined
	 */
	if (ph->ph_unstable) {
		MDI_DEBUG(1, (CE_WARN, dip,
		    "!One or more target devices are in transient "
		    "state. This device can not be removed at "
		    "this moment. Please try again later."));
		MDI_PHCI_UNLOCK(ph);
		return (NDI_BUSY);
	}

	/*
	 * Pass 1: check every client reachable through this pHCI and
	 * offline those that would lose their last usable path.
	 */
	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		/* Remember the successor before any lock is dropped. */
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		/*
		 * The mdi_pathinfo state is OK. Check the client state.
		 * If failover in progress fail the pHCI from offlining
		 */
		ct = MDI_PI(pip)->pi_client;
		/*
		 * NOTE(review): i_mdi_client_lock() is defined elsewhere;
		 * presumably it acquires the client lock while the path
		 * lock is already held -- confirm against its definition.
		 */
		i_mdi_client_lock(ct, pip);
		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
		    (ct->ct_unstable)) {
			/*
			 * Failover is in progress, Fail the DR
			 */
			MDI_DEBUG(1, (CE_WARN, dip,
			    "!pHCI device (%s%d) is Busy. %s",
			    ddi_driver_name(dip), ddi_get_instance(dip),
			    "This device can not be removed at "
			    "this moment. Please try again later."));
			MDI_PI_UNLOCK(pip);
			MDI_CLIENT_UNLOCK(ct);
			MDI_PHCI_UNLOCK(ph);
			return (NDI_BUSY);
		}
		MDI_PI_UNLOCK(pip);

		/*
		 * Check to see of we are removing the last path of this
		 * client device...
		 */
		cdip = ct->ct_dip;
		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
		    (i_mdi_client_compute_state(ct, ph) ==
		    MDI_CLIENT_STATE_FAILED)) {
			/*
			 * Both the client and the pHCI lock are released
			 * around the NDI offline request.
			 */
			i_mdi_client_unlock(ct);
			MDI_PHCI_UNLOCK(ph);
			if (ndi_devi_offline(cdip, 0) != NDI_SUCCESS) {
				/*
				 * ndi_devi_offline() failed.
				 * This pHCI provides the critical path
				 * to one or more client devices.
				 * Return busy.
				 */
				MDI_PHCI_LOCK(ph);
				MDI_DEBUG(1, (CE_WARN, dip,
				    "!pHCI device (%s%d) is Busy. %s",
				    ddi_driver_name(dip), ddi_get_instance(dip),
				    "This device can not be removed at "
				    "this moment. Please try again later."));
				/* Leave the loop with the pHCI lock held. */
				failed_pip = pip;
				break;
			} else {
				MDI_PHCI_LOCK(ph);
				pip = next;
			}
		} else {
			i_mdi_client_unlock(ct);
			pip = next;
		}
	}

	/*
	 * Pass 2 (failure only): revisit the paths that precede failed_pip
	 * and bring their client devinfo nodes in line with the current
	 * client state, then report the pHCI as busy.
	 */
	if (failed_pip) {
		pip = ph->ph_path_head;
		while (pip != failed_pip) {
			MDI_PI_LOCK(pip);
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
			ct = MDI_PI(pip)->pi_client;
			i_mdi_client_lock(ct, pip);
			cdip = ct->ct_dip;
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				if (cdip) {
					/*
					 * All locks are dropped around the
					 * NDI online request.
					 */
					MDI_PI_UNLOCK(pip);
					i_mdi_client_unlock(ct);
					MDI_PHCI_UNLOCK(ph);
					(void) ndi_devi_online(cdip, 0);
					MDI_PHCI_LOCK(ph);
					pip = next;
					continue;
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				if (cdip) {
					/*
					 * All locks are dropped around the
					 * NDI offline request.
					 */
					MDI_PI_UNLOCK(pip);
					i_mdi_client_unlock(ct);
					MDI_PHCI_UNLOCK(ph);
					(void) ndi_devi_offline(cdip, 0);
					MDI_PHCI_LOCK(ph);
					pip = next;
					continue;
				}
				break;
			}
			/*
			 * Reached when the client has no devinfo node or is
			 * in a state not handled above: just unlock and
			 * move on.
			 */
			MDI_PI_UNLOCK(pip);
			i_mdi_client_unlock(ct);
			pip = next;
		}
		MDI_PHCI_UNLOCK(ph);
		return (NDI_BUSY);
	}

	/*
	 * Mark the pHCI as offline
	 */
	MDI_PHCI_SET_OFFLINE(ph);

	/*
	 * Mark the child mdi_pathinfo nodes as transient
	 */
	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		MDI_PI_SET_OFFLINING(pip);
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	MDI_PHCI_UNLOCK(ph);
	/*
	 * Give a chance for any pending commands to execute
	 */
	delay(1);
	MDI_PHCI_LOCK(ph);
	/*
	 * Pass 3: offline each path.  The return value of
	 * i_mdi_pi_offline() is ignored; success is judged by whether the
	 * path ended up in the offline state.
	 */
	pip = ph->ph_path_head;
	while (pip != NULL) {
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		(void) i_mdi_pi_offline(pip, flags);
		MDI_PI_LOCK(pip);
		/* ct is assigned here but not used in this loop. */
		ct = MDI_PI(pip)->pi_client;
		if (!MDI_PI_IS_OFFLINE(pip)) {
			MDI_DEBUG(1, (CE_WARN, dip,
			    "!pHCI device (%s%d) is Busy. %s",
			    ddi_driver_name(dip), ddi_get_instance(dip),
			    "This device can not be removed at "
			    "this moment. Please try again later."));
			MDI_PI_UNLOCK(pip);
			/*
			 * NOTE(review): only the pHCI flag is restored; no
			 * code here reverts the paths already processed
			 * above -- confirm that this is intended.
			 */
			MDI_PHCI_SET_ONLINE(ph);
			MDI_PHCI_UNLOCK(ph);
			return (NDI_BUSY);
		}
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	MDI_PHCI_UNLOCK(ph);

	return (rv);
}
4611 
4612 /*ARGSUSED*/
4613 static int
4614 i_mdi_client_offline(dev_info_t *dip, uint_t flags)
4615 {
4616 	int		rv = NDI_SUCCESS;
4617 	mdi_client_t	*ct;
4618 
4619 	/*
4620 	 * Client component to go offline.  Make sure that we are
4621 	 * not in failing over state and update client state
4622 	 * accordingly
4623 	 */
4624 	MDI_DEBUG(2, (CE_NOTE, dip, "!i_mdi_client_offline called %p\n",
4625 	    dip));
4626 	ct = i_devi_get_client(dip);
4627 	if (ct != NULL) {
4628 		MDI_CLIENT_LOCK(ct);
4629 		if (ct->ct_unstable) {
4630 			/*
4631 			 * One or more paths are in transient state,
4632 			 * Dont allow offline of a client device
4633 			 */
4634 			MDI_DEBUG(1, (CE_WARN, dip,
4635 			    "!One or more paths to this device is "
4636 			    "in transient state. This device can not "
4637 			    "be removed at this moment. "
4638 			    "Please try again later."));
4639 			MDI_CLIENT_UNLOCK(ct);
4640 			return (NDI_BUSY);
4641 		}
4642 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
4643 			/*
4644 			 * Failover is in progress, Dont allow DR of
4645 			 * a client device
4646 			 */
4647 			MDI_DEBUG(1, (CE_WARN, dip,
4648 			    "!Client device (%s%d) is Busy. %s",
4649 			    ddi_driver_name(dip), ddi_get_instance(dip),
4650 			    "This device can not be removed at "
4651 			    "this moment. Please try again later."));
4652 			MDI_CLIENT_UNLOCK(ct);
4653 			return (NDI_BUSY);
4654 		}
4655 		MDI_CLIENT_SET_OFFLINE(ct);
4656 
4657 		/*
4658 		 * Unbind our relationship with the dev_info node
4659 		 */
4660 		if (flags & NDI_DEVI_REMOVE) {
4661 			ct->ct_dip = NULL;
4662 		}
4663 		MDI_CLIENT_UNLOCK(ct);
4664 	}
4665 	return (rv);
4666 }
4667 
4668 /*
4669  * mdi_pre_attach():
4670  *		Pre attach() notification handler
4671  */
4672 
4673 /*ARGSUSED*/
4674 int
4675 mdi_pre_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
4676 {
4677 	/* don't support old DDI_PM_RESUME */
4678 	if ((DEVI(dip)->devi_mdi_component != MDI_COMPONENT_NONE) &&
4679 	    (cmd == DDI_PM_RESUME))
4680 		return (DDI_FAILURE);
4681 
4682 	return (DDI_SUCCESS);
4683 }
4684 
4685 /*
4686  * mdi_post_attach():
4687  *		Post attach() notification handler
4688  */
4689 
4690 /*ARGSUSED*/
4691 void
4692 mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
4693 {
4694 	mdi_phci_t	*ph;
4695 	mdi_client_t	*ct;
4696 	mdi_pathinfo_t	*pip;
4697 
4698 	if (MDI_PHCI(dip)) {
4699 		ph = i_devi_get_phci(dip);
4700 		ASSERT(ph != NULL);
4701 
4702 		MDI_PHCI_LOCK(ph);
4703 		switch (cmd) {
4704 		case DDI_ATTACH:
4705 			MDI_DEBUG(2, (CE_NOTE, dip,
4706 			    "!pHCI post_attach: called %p\n", ph));
4707 			if (error == DDI_SUCCESS) {
4708 				MDI_PHCI_SET_ATTACH(ph);
4709 			} else {
4710 				MDI_DEBUG(1, (CE_NOTE, dip,
4711 				    "!pHCI post_attach: failed error=%d\n",
4712 				    error));
4713 				MDI_PHCI_SET_DETACH(ph);
4714 			}
4715 			break;
4716 
4717 		case DDI_RESUME:
4718 			MDI_DEBUG(2, (CE_NOTE, dip,
4719 			    "!pHCI post_resume: called %p\n", ph));
4720 			if (error == DDI_SUCCESS) {
4721 				MDI_PHCI_SET_RESUME(ph);
4722 			} else {
4723 				MDI_DEBUG(1, (CE_NOTE, dip,
4724 				    "!pHCI post_resume: failed error=%d\n",
4725 				    error));
4726 				MDI_PHCI_SET_SUSPEND(ph);
4727 			}
4728 			break;
4729 		}
4730 		MDI_PHCI_UNLOCK(ph);
4731 	}
4732 
4733 	if (MDI_CLIENT(dip)) {
4734 		ct = i_devi_get_client(dip);
4735 		ASSERT(ct != NULL);
4736 
4737 		MDI_CLIENT_LOCK(ct);
4738 		switch (cmd) {
4739 		case DDI_ATTACH:
4740 			MDI_DEBUG(2, (CE_NOTE, dip,
4741 			    "!Client post_attach: called %p\n", ct));
4742 			if (error != DDI_SUCCESS) {
4743 				MDI_DEBUG(1, (CE_NOTE, dip,
4744 				    "!Client post_attach: failed error=%d\n",
4745 				    error));
4746 				MDI_CLIENT_SET_DETACH(ct);
4747 				MDI_DEBUG(4, (CE_WARN, dip,
4748 				    "mdi_post_attach i_mdi_pm_reset_client\n"));
4749 				i_mdi_pm_reset_client(ct);
4750 				break;
4751 			}
4752 
4753 			/*
4754 			 * Client device has successfully attached.
4755 			 * Create kstats for any pathinfo structures
4756 			 * initially associated with this client.
4757 			 */
4758 			for (pip = ct->ct_path_head; pip != NULL;
4759 			    pip = (mdi_pathinfo_t *)
4760 			    MDI_PI(pip)->pi_client_link) {
4761 				(void) i_mdi_pi_kstat_create(pip);
4762 				i_mdi_report_path_state(ct, pip);
4763 			}
4764 			MDI_CLIENT_SET_ATTACH(ct);
4765 			break;
4766 
4767 		case DDI_RESUME:
4768 			MDI_DEBUG(2, (CE_NOTE, dip,
4769 			    "!Client post_attach: called %p\n", ct));
4770 			if (error == DDI_SUCCESS) {
4771 				MDI_CLIENT_SET_RESUME(ct);
4772 			} else {
4773 				MDI_DEBUG(1, (CE_NOTE, dip,
4774 				    "!Client post_resume: failed error=%d\n",
4775 				    error));
4776 				MDI_CLIENT_SET_SUSPEND(ct);
4777 			}
4778 			break;
4779 		}
4780 		MDI_CLIENT_UNLOCK(ct);
4781 	}
4782 }
4783 
4784 /*
4785  * mdi_pre_detach():
4786  *		Pre detach notification handler
4787  */
4788 
4789 /*ARGSUSED*/
4790 int
4791 mdi_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
4792 {
4793 	int rv = DDI_SUCCESS;
4794 
4795 	if (MDI_CLIENT(dip)) {
4796 		(void) i_mdi_client_pre_detach(dip, cmd);
4797 	}
4798 
4799 	if (MDI_PHCI(dip)) {
4800 		rv = i_mdi_phci_pre_detach(dip, cmd);
4801 	}
4802 
4803 	return (rv);
4804 }
4805 
/*
 * i_mdi_phci_pre_detach():
 *		pHCI side of the pre detach notification.
 *
 *		DDI_DETACH:  allowed only when the pHCI is already marked
 *		offline; the pHCI is then marked detached.  Otherwise
 *		DDI_FAILURE is returned.
 *
 *		DDI_SUSPEND: every client reachable through this pHCI that is
 *		neither detached nor already suspended is suspended first via
 *		devi_detach(DDI_SUSPEND).  If any of those suspends fails, the
 *		clients on the paths preceding the failing one that are in the
 *		suspended state are resumed again and the failure is returned.
 *
 *		Any other command returns DDI_FAILURE.  DDI_SUCCESS is
 *		returned when no pHCI is associated with 'dip'.
 *
 *		The pHCI lock is taken on entry and released on exit; it is
 *		not dropped around the devi_detach()/devi_attach() calls.
 */
/*ARGSUSED*/
static int
i_mdi_phci_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int		rv = DDI_SUCCESS;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*failed_pip = NULL;
	mdi_pathinfo_t	*next;

	ph = i_devi_get_phci(dip);
	if (ph == NULL) {
		return (rv);
	}

	MDI_PHCI_LOCK(ph);
	switch (cmd) {
	case DDI_DETACH:
		MDI_DEBUG(2, (CE_NOTE, dip,
		    "!pHCI pre_detach: called %p\n", ph));
		if (!MDI_PHCI_IS_OFFLINE(ph)) {
			/*
			 * mdi_pathinfo nodes are still attached to
			 * this pHCI. Fail the detach for this pHCI.
			 */
			MDI_DEBUG(2, (CE_WARN, dip,
			    "!pHCI pre_detach: "
			    "mdi_pathinfo nodes are still attached "
			    "%p\n", ph));
			rv = DDI_FAILURE;
			break;
		}
		MDI_PHCI_SET_DETACH(ph);
		break;

	case DDI_SUSPEND:
		/*
		 * pHCI is getting suspended.  Since mpxio client
		 * devices may not be suspended at this point, to avoid
		 * a potential stack overflow, it is important to suspend
		 * client devices before pHCI can be suspended.
		 */

		MDI_DEBUG(2, (CE_NOTE, dip,
		    "!pHCI pre_suspend: called %p\n", ph));
		/*
		 * Suspend all the client devices accessible through this pHCI
		 */
		pip = ph->ph_path_head;
		while (pip != NULL && rv == DDI_SUCCESS) {
			dev_info_t *cdip;
			MDI_PI_LOCK(pip);
			/* capture the successor while the path is locked */
			next =
			    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
			ct = MDI_PI(pip)->pi_client;
			i_mdi_client_lock(ct, pip);
			cdip = ct->ct_dip;
			MDI_PI_UNLOCK(pip);
			if ((MDI_CLIENT_IS_DETACHED(ct) == 0) &&
			    MDI_CLIENT_IS_SUSPENDED(ct) == 0) {
				/* client lock is dropped before suspending */
				i_mdi_client_unlock(ct);
				if ((rv = devi_detach(cdip, DDI_SUSPEND)) !=
				    DDI_SUCCESS) {
					/*
					 * Suspend of one of the client
					 * device has failed.
					 */
					MDI_DEBUG(1, (CE_WARN, dip,
					    "!Suspend of device (%s%d) failed.",
					    ddi_driver_name(cdip),
					    ddi_get_instance(cdip)));
					/* remember where the rollback stops */
					failed_pip = pip;
					break;
				}
			} else {
				i_mdi_client_unlock(ct);
			}
			pip = next;
		}

		if (rv == DDI_SUCCESS) {
			/*
			 * Suspend of client devices is complete. Proceed
			 * with pHCI suspend.
			 */
			MDI_PHCI_SET_SUSPEND(ph);
		} else {
			/*
			 * A client suspend failed.  Walk the paths that
			 * precede the failing one and resume every client
			 * that is currently in the suspended state.
			 */
			pip = ph->ph_path_head;
			while (pip != failed_pip) {
				dev_info_t *cdip;
				MDI_PI_LOCK(pip);
				next =
				    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
				ct = MDI_PI(pip)->pi_client;
				i_mdi_client_lock(ct, pip);
				cdip = ct->ct_dip;
				MDI_PI_UNLOCK(pip);
				if (MDI_CLIENT_IS_SUSPENDED(ct)) {
					i_mdi_client_unlock(ct);
					(void) devi_attach(cdip, DDI_RESUME);
				} else {
					i_mdi_client_unlock(ct);
				}
				pip = next;
			}
		}
		break;

	default:
		rv = DDI_FAILURE;
		break;
	}
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}
4926 
4927 /*ARGSUSED*/
4928 static int
4929 i_mdi_client_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
4930 {
4931 	int		rv = DDI_SUCCESS;
4932 	mdi_client_t	*ct;
4933 
4934 	ct = i_devi_get_client(dip);
4935 	if (ct == NULL) {
4936 		return (rv);
4937 	}
4938 
4939 	MDI_CLIENT_LOCK(ct);
4940 	switch (cmd) {
4941 	case DDI_DETACH:
4942 		MDI_DEBUG(2, (CE_NOTE, dip,
4943 		    "!Client pre_detach: called %p\n", ct));
4944 		MDI_CLIENT_SET_DETACH(ct);
4945 		break;
4946 
4947 	case DDI_SUSPEND:
4948 		MDI_DEBUG(2, (CE_NOTE, dip,
4949 		    "!Client pre_suspend: called %p\n", ct));
4950 		MDI_CLIENT_SET_SUSPEND(ct);
4951 		break;
4952 
4953 	default:
4954 		rv = DDI_FAILURE;
4955 		break;
4956 	}
4957 	MDI_CLIENT_UNLOCK(ct);
4958 	return (rv);
4959 }
4960 
4961 /*
4962  * mdi_post_detach():
4963  *		Post detach notification handler
4964  */
4965 
4966 /*ARGSUSED*/
4967 void
4968 mdi_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
4969 {
4970 	/*
4971 	 * Detach/Suspend of mpxio component failed. Update our state
4972 	 * too
4973 	 */
4974 	if (MDI_PHCI(dip))
4975 		i_mdi_phci_post_detach(dip, cmd, error);
4976 
4977 	if (MDI_CLIENT(dip))
4978 		i_mdi_client_post_detach(dip, cmd, error);
4979 }
4980 
4981 /*ARGSUSED*/
4982 static void
4983 i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
4984 {
4985 	mdi_phci_t	*ph;
4986 
4987 	/*
4988 	 * Detach/Suspend of phci component failed. Update our state
4989 	 * too
4990 	 */
4991 	ph = i_devi_get_phci(dip);
4992 	if (ph == NULL) {
4993 		return;
4994 	}
4995 
4996 	MDI_PHCI_LOCK(ph);
4997 	/*
4998 	 * Detach of pHCI failed. Restore back converse
4999 	 * state
5000 	 */
5001 	switch (cmd) {
5002 	case DDI_DETACH:
5003 		MDI_DEBUG(2, (CE_NOTE, dip,
5004 		    "!pHCI post_detach: called %p\n", ph));
5005 		if (error != DDI_SUCCESS)
5006 			MDI_PHCI_SET_ATTACH(ph);
5007 		break;
5008 
5009 	case DDI_SUSPEND:
5010 		MDI_DEBUG(2, (CE_NOTE, dip,
5011 		    "!pHCI post_suspend: called %p\n", ph));
5012 		if (error != DDI_SUCCESS)
5013 			MDI_PHCI_SET_RESUME(ph);
5014 		break;
5015 	}
5016 	MDI_PHCI_UNLOCK(ph);
5017 }
5018 
5019 /*ARGSUSED*/
5020 static void
5021 i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5022 {
5023 	mdi_client_t	*ct;
5024 
5025 	ct = i_devi_get_client(dip);
5026 	if (ct == NULL) {
5027 		return;
5028 	}
5029 	MDI_CLIENT_LOCK(ct);
5030 	/*
5031 	 * Detach of Client failed. Restore back converse
5032 	 * state
5033 	 */
5034 	switch (cmd) {
5035 	case DDI_DETACH:
5036 		MDI_DEBUG(2, (CE_NOTE, dip,
5037 		    "!Client post_detach: called %p\n", ct));
5038 		if (DEVI_IS_ATTACHING(ct->ct_dip)) {
5039 			MDI_DEBUG(4, (CE_NOTE, dip, "i_mdi_client_post_detach "
5040 			    "i_mdi_pm_rele_client\n"));
5041 			i_mdi_pm_rele_client(ct, ct->ct_path_count);
5042 		} else {
5043 			MDI_DEBUG(4, (CE_NOTE, dip, "i_mdi_client_post_detach "
5044 			    "i_mdi_pm_reset_client\n"));
5045 			i_mdi_pm_reset_client(ct);
5046 		}
5047 		if (error != DDI_SUCCESS)
5048 			MDI_CLIENT_SET_ATTACH(ct);
5049 		break;
5050 
5051 	case DDI_SUSPEND:
5052 		MDI_DEBUG(2, (CE_NOTE, dip,
5053 		    "!Client post_suspend: called %p\n", ct));
5054 		if (error != DDI_SUCCESS)
5055 			MDI_CLIENT_SET_RESUME(ct);
5056 		break;
5057 	}
5058 	MDI_CLIENT_UNLOCK(ct);
5059 }
5060 
5061 /*
5062  * create and install per-path (client - pHCI) statistics
5063  * I/O stats supported: nread, nwritten, reads, and writes
5064  * Error stats - hard errors, soft errors, & transport errors
5065  */
5066 static int
5067 i_mdi_pi_kstat_create(mdi_pathinfo_t *pip)
5068 {
5069 
5070 	dev_info_t *client = MDI_PI(pip)->pi_client->ct_dip;
5071 	dev_info_t *ppath = MDI_PI(pip)->pi_phci->ph_dip;
5072 	char ksname[KSTAT_STRLEN];
5073 	mdi_pathinfo_t *cpip;
5074 	const char *err_postfix = ",err";
5075 	kstat_t	*kiosp, *kerrsp;
5076 	struct pi_errs	*nsp;
5077 	struct mdi_pi_kstats *mdi_statp;
5078 
5079 	ASSERT(client != NULL && ppath != NULL);
5080 
5081 	ASSERT(mutex_owned(&(MDI_PI(pip)->pi_client->ct_mutex)));
5082 
5083 	if (MDI_PI(pip)->pi_kstats != NULL)
5084 		return (MDI_SUCCESS);
5085 
5086 	for (cpip = MDI_PI(pip)->pi_client->ct_path_head; cpip != NULL;
5087 	    cpip = (mdi_pathinfo_t *)(MDI_PI(cpip)->pi_client_link)) {
5088 		if (cpip == pip)
5089 			continue;
5090 		/*
5091 		 * We have found a different path with same parent
5092 		 * kstats for a given client-pHCI are common
5093 		 */
5094 		if ((MDI_PI(cpip)->pi_phci->ph_dip == ppath) &&
5095 		    (MDI_PI(cpip)->pi_kstats != NULL)) {
5096 			MDI_PI(cpip)->pi_kstats->pi_kstat_ref++;
5097 			MDI_PI(pip)->pi_kstats = MDI_PI(cpip)->pi_kstats;
5098 			return (MDI_SUCCESS);
5099 		}
5100 	}
5101 
5102 	/*
5103 	 * stats are named as follows: TGTx.HBAy, e.g. "ssd0.fp0"
5104 	 * clamp length of name against max length of error kstat name
5105 	 */
5106 	if (snprintf(ksname, KSTAT_STRLEN, "%s%d.%s%d",
5107 	    ddi_driver_name(client), ddi_get_instance(client),
5108 	    ddi_driver_name(ppath), ddi_get_instance(ppath)) >
5109 	    (KSTAT_STRLEN - strlen(err_postfix))) {
5110 		return (MDI_FAILURE);
5111 	}
5112 	if ((kiosp = kstat_create("mdi", 0, ksname, "iopath",
5113 	    KSTAT_TYPE_IO, 1, 0)) == NULL) {
5114 		return (MDI_FAILURE);
5115 	}
5116 
5117 	(void) strcat(ksname, err_postfix);
5118 	kerrsp = kstat_create("mdi", 0, ksname, "iopath_errors",
5119 	    KSTAT_TYPE_NAMED,
5120 	    sizeof (struct pi_errs) / sizeof (kstat_named_t), 0);
5121 
5122 	if (kerrsp == NULL) {
5123 		kstat_delete(kiosp);
5124 		return (MDI_FAILURE);
5125 	}
5126 
5127 	nsp = (struct pi_errs *)kerrsp->ks_data;
5128 	kstat_named_init(&nsp->pi_softerrs, "Soft Errors", KSTAT_DATA_UINT32);
5129 	kstat_named_init(&nsp->pi_harderrs, "Hard Errors", KSTAT_DATA_UINT32);
5130 	kstat_named_init(&nsp->pi_transerrs, "Transport Errors",
5131 	    KSTAT_DATA_UINT32);
5132 	kstat_named_init(&nsp->pi_icnt_busy, "Interconnect Busy",
5133 	    KSTAT_DATA_UINT32);
5134 	kstat_named_init(&nsp->pi_icnt_errors, "Interconnect Errors",
5135 	    KSTAT_DATA_UINT32);
5136 	kstat_named_init(&nsp->pi_phci_rsrc, "pHCI No Resources",
5137 	    KSTAT_DATA_UINT32);
5138 	kstat_named_init(&nsp->pi_phci_localerr, "pHCI Local Errors",
5139 	    KSTAT_DATA_UINT32);
5140 	kstat_named_init(&nsp->pi_phci_invstate, "pHCI Invalid State",
5141 	    KSTAT_DATA_UINT32);
5142 	kstat_named_init(&nsp->pi_failedfrom, "Failed From",
5143 	    KSTAT_DATA_UINT32);
5144 	kstat_named_init(&nsp->pi_failedto, "Failed To", KSTAT_DATA_UINT32);
5145 
5146 	mdi_statp = kmem_alloc(sizeof (*mdi_statp), KM_SLEEP);
5147 	mdi_statp->pi_kstat_ref = 1;
5148 	mdi_statp->pi_kstat_iostats = kiosp;
5149 	mdi_statp->pi_kstat_errstats = kerrsp;
5150 	kstat_install(kiosp);
5151 	kstat_install(kerrsp);
5152 	MDI_PI(pip)->pi_kstats = mdi_statp;
5153 	return (MDI_SUCCESS);
5154 }
5155 
5156 /*
5157  * destroy per-path properties
5158  */
5159 static void
5160 i_mdi_pi_kstat_destroy(mdi_pathinfo_t *pip)
5161 {
5162 
5163 	struct mdi_pi_kstats *mdi_statp;
5164 
5165 	if ((mdi_statp = MDI_PI(pip)->pi_kstats) == NULL)
5166 		return;
5167 
5168 	MDI_PI(pip)->pi_kstats = NULL;
5169 
5170 	/*
5171 	 * the kstat may be shared between multiple pathinfo nodes
5172 	 * decrement this pathinfo's usage, removing the kstats
5173 	 * themselves when the last pathinfo reference is removed.
5174 	 */
5175 	ASSERT(mdi_statp->pi_kstat_ref > 0);
5176 	if (--mdi_statp->pi_kstat_ref != 0)
5177 		return;
5178 
5179 	kstat_delete(mdi_statp->pi_kstat_iostats);
5180 	kstat_delete(mdi_statp->pi_kstat_errstats);
5181 	kmem_free(mdi_statp, sizeof (*mdi_statp));
5182 }
5183 
5184 /*
5185  * update I/O paths KSTATS
5186  */
5187 void
5188 mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp)
5189 {
5190 	kstat_t *iostatp;
5191 	size_t xfer_cnt;
5192 
5193 	ASSERT(pip != NULL);
5194 
5195 	/*
5196 	 * I/O can be driven across a path prior to having path
5197 	 * statistics available, i.e. probe(9e).
5198 	 */
5199 	if (bp != NULL && MDI_PI(pip)->pi_kstats != NULL) {
5200 		iostatp = MDI_PI(pip)->pi_kstats->pi_kstat_iostats;
5201 		xfer_cnt = bp->b_bcount - bp->b_resid;
5202 		if (bp->b_flags & B_READ) {
5203 			KSTAT_IO_PTR(iostatp)->reads++;
5204 			KSTAT_IO_PTR(iostatp)->nread += xfer_cnt;
5205 		} else {
5206 			KSTAT_IO_PTR(iostatp)->writes++;
5207 			KSTAT_IO_PTR(iostatp)->nwritten += xfer_cnt;
5208 		}
5209 	}
5210 }
5211 
5212 /*
5213  * disable the path to a particular pHCI (pHCI specified in the phci_path
5214  * argument) for a particular client (specified in the client_path argument).
5215  * Disabling a path means that MPxIO will not select the disabled path for
5216  * routing any new I/O requests.
5217  */
5218 int
5219 mdi_pi_disable(dev_info_t *cdip, dev_info_t *pdip, int flags)
5220 {
5221 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_DISABLE_OP));
5222 }
5223 
5224 /*
5225  * Enable the path to a particular pHCI (pHCI specified in the phci_path
5226  * argument) for a particular client (specified in the client_path argument).
5227  * Enabling a path means that MPxIO may select the enabled path for routing
5228  * future I/O requests, subject to other path state constraints.
5229  */
5230 
5231 int
5232 mdi_pi_enable(dev_info_t *cdip, dev_info_t *pdip, int flags)
5233 {
5234 	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_ENABLE_OP));
5235 }
5236 
5237 
5238 /*
5239  * Common routine for doing enable/disable.
5240  */
5241 int
5242 i_mdi_pi_enable_disable(dev_info_t *cdip, dev_info_t *pdip, int flags, int op)
5243 {
5244 
5245 	mdi_phci_t	*ph;
5246 	mdi_vhci_t	*vh = NULL;
5247 	mdi_client_t	*ct;
5248 	mdi_pathinfo_t	*next, *pip;
5249 	int		found_it;
5250 	int		(*f)() = NULL;
5251 	int		rv;
5252 	int		sync_flag = 0;
5253 
5254 	ph = i_devi_get_phci(pdip);
5255 	MDI_DEBUG(5, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
5256 		" Operation = %d pdip = %p cdip = %p\n", op, pdip, cdip));
5257 	if (ph == NULL) {
5258 		MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
5259 			" failed. ph = NULL operation = %d\n", op));
5260 		return (MDI_FAILURE);
5261 	}
5262 
5263 	if ((op != MDI_ENABLE_OP) && (op != MDI_DISABLE_OP)) {
5264 		MDI_DEBUG(1, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
5265 			" Invalid operation = %d\n", op));
5266 		return (MDI_FAILURE);
5267 	}
5268 
5269 	sync_flag = (flags << 8) & 0xf00;
5270 
5271 	vh = ph->ph_vhci;
5272 	f = vh->vh_ops->vo_pi_state_change;
5273 
5274 	if (cdip == NULL) {
5275 		/*
5276 		 * Need to mark the Phci as enabled/disabled.
5277 		 */
5278 		MDI_DEBUG(3, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
5279 		"Operation %d for the phci\n", op));
5280 		MDI_PHCI_LOCK(ph);
5281 		switch (flags) {
5282 			case USER_DISABLE:
5283 				if (op == MDI_DISABLE_OP)
5284 					MDI_PHCI_SET_USER_DISABLE(ph);
5285 				else
5286 					MDI_PHCI_SET_USER_ENABLE(ph);
5287 				break;
5288 			case DRIVER_DISABLE:
5289 				if (op == MDI_DISABLE_OP)
5290 					MDI_PHCI_SET_DRV_DISABLE(ph);
5291 				else
5292 					MDI_PHCI_SET_DRV_ENABLE(ph);
5293 				break;
5294 			case DRIVER_DISABLE_TRANSIENT:
5295 				if (op == MDI_DISABLE_OP)
5296 					MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph);
5297 				else
5298 					MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph);
5299 				break;
5300 			default:
5301 				MDI_PHCI_UNLOCK(ph);
5302 				MDI_DEBUG(1, (CE_NOTE, NULL,
5303 				"!i_mdi_pi_enable_disable:"
5304 				" Invalid flag argument= %d\n", flags));
5305 		}
5306 
5307 		/*
5308 		 * Phci has been disabled. Now try to enable/disable
5309 		 * path info's to each client.
5310 		 */
5311 		pip = ph->ph_path_head;
5312 		while (pip != NULL) {
5313 			/*
5314 			 * Do a callback into the mdi consumer to let it
5315 			 * know that path is about to be enabled/disabled.
5316 			 */
5317 			if (f != NULL) {
5318 				rv = (*f)(vh->vh_dip, pip, 0,
5319 					MDI_PI_EXT_STATE(pip),
5320 					MDI_EXT_STATE_CHANGE | sync_flag |
5321 					op | MDI_BEFORE_STATE_CHANGE);
5322 				if (rv != MDI_SUCCESS) {
5323 				MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5324 				"!vo_pi_state_change: failed rv = %x", rv));
5325 				}
5326 			}
5327 
5328 			MDI_PI_LOCK(pip);
5329 			next =
5330 				(mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5331 			switch (flags) {
5332 			case USER_DISABLE:
5333 				if (op == MDI_DISABLE_OP)
5334 					MDI_PI_SET_USER_DISABLE(pip);
5335 				else
5336 					MDI_PI_SET_USER_ENABLE(pip);
5337 				break;
5338 			case DRIVER_DISABLE:
5339 				if (op == MDI_DISABLE_OP)
5340 					MDI_PI_SET_DRV_DISABLE(pip);
5341 				else
5342 					MDI_PI_SET_DRV_ENABLE(pip);
5343 				break;
5344 			case DRIVER_DISABLE_TRANSIENT:
5345 				if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS)
5346 					MDI_PI_SET_DRV_DISABLE_TRANS(pip);
5347 				else
5348 					MDI_PI_SET_DRV_ENABLE_TRANS(pip);
5349 				break;
5350 			}
5351 			MDI_PI_UNLOCK(pip);
5352 			/*
5353 			 * Do a callback into the mdi consumer to let it
5354 			 * know that path is now enabled/disabled.
5355 			 */
5356 			if (f != NULL) {
5357 				rv = (*f)(vh->vh_dip, pip, 0,
5358 					MDI_PI_EXT_STATE(pip),
5359 					MDI_EXT_STATE_CHANGE | sync_flag |
5360 					op | MDI_AFTER_STATE_CHANGE);
5361 				if (rv != MDI_SUCCESS) {
5362 				MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5363 				"!vo_pi_state_change: failed rv = %x", rv));
5364 				}
5365 			}
5366 			pip = next;
5367 		}
5368 		MDI_PHCI_UNLOCK(ph);
5369 	} else {
5370 
5371 		/*
5372 		 * Disable a specific client.
5373 		 */
5374 		ct = i_devi_get_client(cdip);
5375 		if (ct == NULL) {
5376 			MDI_DEBUG(1, (CE_NOTE, NULL,
5377 			"!i_mdi_pi_enable_disable:"
5378 			" failed. ct = NULL operation = %d\n", op));
5379 			return (MDI_FAILURE);
5380 		}
5381 
5382 		MDI_CLIENT_LOCK(ct);
5383 		pip = ct->ct_path_head;
5384 		found_it = 0;
5385 		while (pip != NULL) {
5386 			MDI_PI_LOCK(pip);
5387 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
5388 			if (MDI_PI(pip)->pi_phci == ph) {
5389 				MDI_PI_UNLOCK(pip);
5390 				found_it = 1;
5391 				break;
5392 			}
5393 			MDI_PI_UNLOCK(pip);
5394 			pip = next;
5395 		}
5396 
5397 		MDI_CLIENT_UNLOCK(ct);
5398 		if (found_it == 0) {
5399 			MDI_DEBUG(1, (CE_NOTE, NULL,
5400 			"!i_mdi_pi_enable_disable:"
5401 			" failed. Could not find corresponding pip\n"));
5402 			return (MDI_FAILURE);
5403 		}
5404 		/*
5405 		 * Do a callback into the mdi consumer to let it
5406 		 * know that path is about to get enabled/disabled.
5407 		 */
5408 		if (f != NULL) {
5409 			rv = (*f)(vh->vh_dip, pip, 0,
5410 				MDI_PI_EXT_STATE(pip),
5411 				MDI_EXT_STATE_CHANGE | sync_flag |
5412 				op | MDI_BEFORE_STATE_CHANGE);
5413 			if (rv != MDI_SUCCESS) {
5414 				MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5415 				"!vo_pi_state_change: failed rv = %x", rv));
5416 			}
5417 		}
5418 		MDI_PI_LOCK(pip);
5419 		switch (flags) {
5420 			case USER_DISABLE:
5421 				if (op == MDI_DISABLE_OP)
5422 					MDI_PI_SET_USER_DISABLE(pip);
5423 				else
5424 					MDI_PI_SET_USER_ENABLE(pip);
5425 				break;
5426 			case DRIVER_DISABLE:
5427 				if (op == MDI_DISABLE_OP)
5428 					MDI_PI_SET_DRV_DISABLE(pip);
5429 				else
5430 					MDI_PI_SET_DRV_ENABLE(pip);
5431 				break;
5432 			case DRIVER_DISABLE_TRANSIENT:
5433 				if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS)
5434 					MDI_PI_SET_DRV_DISABLE_TRANS(pip);
5435 				else
5436 					MDI_PI_SET_DRV_ENABLE_TRANS(pip);
5437 				break;
5438 		}
5439 		MDI_PI_UNLOCK(pip);
5440 		/*
5441 		 * Do a callback into the mdi consumer to let it
5442 		 * know that path is now enabled/disabled.
5443 		 */
5444 		if (f != NULL) {
5445 			rv = (*f)(vh->vh_dip, pip, 0,
5446 				MDI_PI_EXT_STATE(pip),
5447 				MDI_EXT_STATE_CHANGE | sync_flag |
5448 				op | MDI_AFTER_STATE_CHANGE);
5449 			if (rv != MDI_SUCCESS) {
5450 				MDI_DEBUG(2, (CE_WARN, vh->vh_dip,
5451 				"!vo_pi_state_change: failed rv = %x", rv));
5452 			}
5453 		}
5454 	}
5455 
5456 	MDI_DEBUG(5, (CE_NOTE, NULL, "!i_mdi_pi_enable_disable:"
5457 		" Returning success pdip = %p cdip = %p\n", op, pdip, cdip));
5458 	return (MDI_SUCCESS);
5459 }
5460 
5461 /*ARGSUSED3*/
5462 int
5463 mdi_devi_config_one(dev_info_t *pdip, char *devnm, dev_info_t **cdipp,
5464     int flags, clock_t timeout)
5465 {
5466 	mdi_pathinfo_t *pip;
5467 	dev_info_t *dip;
5468 	clock_t interval = drv_usectohz(100000);	/* 0.1 sec */
5469 	char *paddr;
5470 
5471 	MDI_DEBUG(2, (CE_NOTE, NULL, "configure device %s", devnm));
5472 
5473 	if (!MDI_PHCI(pdip))
5474 		return (MDI_FAILURE);
5475 
5476 	paddr = strchr(devnm, '@');
5477 	if (paddr == NULL)
5478 		return (MDI_FAILURE);
5479 
5480 	paddr++;	/* skip '@' */
5481 	pip = mdi_pi_find(pdip, NULL, paddr);
5482 	while (pip == NULL && timeout > 0) {
5483 		if (interval > timeout)
5484 			interval = timeout;
5485 		if (flags & NDI_DEVI_DEBUG) {
5486 			cmn_err(CE_CONT, "%s%d: %s timeout %ld %ld\n",
5487 			    ddi_driver_name(pdip), ddi_get_instance(pdip),
5488 			    paddr, interval, timeout);
5489 		}
5490 		delay(interval);
5491 		timeout -= interval;
5492 		interval += interval;
5493 		pip = mdi_pi_find(pdip, NULL, paddr);
5494 	}
5495 
5496 	if (pip == NULL)
5497 		return (MDI_FAILURE);
5498 	dip = mdi_pi_get_client(pip);
5499 	if (ndi_devi_online(dip, flags) != NDI_SUCCESS)
5500 		return (MDI_FAILURE);
5501 	*cdipp = dip;
5502 
5503 	/* TODO: holding should happen inside search functions */
5504 	ndi_hold_devi(dip);
5505 	return (MDI_SUCCESS);
5506 }
5507 
5508 /*
5509  * Ensure phci powered up
5510  */
5511 static void
5512 i_mdi_pm_hold_pip(mdi_pathinfo_t *pip)
5513 {
5514 	dev_info_t	*ph_dip;
5515 
5516 	ASSERT(pip != NULL);
5517 	ASSERT(MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
5518 
5519 	if (MDI_PI(pip)->pi_pm_held) {
5520 		return;
5521 	}
5522 
5523 	ph_dip = mdi_pi_get_phci(pip);
5524 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_pm_hold_pip for %s%d\n",
5525 	    ddi_get_name(ph_dip), ddi_get_instance(ph_dip)));
5526 	if (ph_dip == NULL) {
5527 		return;
5528 	}
5529 
5530 	MDI_PI_UNLOCK(pip);
5531 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt was %d\n",
5532 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5533 	pm_hold_power(ph_dip);
5534 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt is %d\n",
5535 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5536 	MDI_PI_LOCK(pip);
5537 
5538 	MDI_PI(pip)->pi_pm_held = 1;
5539 }
5540 
5541 /*
5542  * Allow phci powered down
5543  */
5544 static void
5545 i_mdi_pm_rele_pip(mdi_pathinfo_t *pip)
5546 {
5547 	dev_info_t	*ph_dip = NULL;
5548 
5549 	ASSERT(pip != NULL);
5550 	ASSERT(MUTEX_HELD(&MDI_PI(pip)->pi_mutex));
5551 
5552 	if (MDI_PI(pip)->pi_pm_held == 0) {
5553 		return;
5554 	}
5555 
5556 	ph_dip = mdi_pi_get_phci(pip);
5557 	ASSERT(ph_dip != NULL);
5558 
5559 	MDI_PI_UNLOCK(pip);
5560 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_pm_rele_pip for %s%d\n",
5561 	    ddi_get_name(ph_dip), ddi_get_instance(ph_dip)));
5562 
5563 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt was %d\n",
5564 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5565 	pm_rele_power(ph_dip);
5566 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "kidsupcnt is %d\n",
5567 	    DEVI(ph_dip)->devi_pm_kidsupcnt));
5568 
5569 	MDI_PI_LOCK(pip);
5570 	MDI_PI(pip)->pi_pm_held = 0;
5571 }
5572 
5573 static void
5574 i_mdi_pm_hold_client(mdi_client_t *ct, int incr)
5575 {
5576 	ASSERT(ct);
5577 
5578 	ct->ct_power_cnt += incr;
5579 	MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_hold_client "
5580 	    "ct_power_cnt = %d incr = %d\n", ct->ct_power_cnt, incr));
5581 	ASSERT(ct->ct_power_cnt >= 0);
5582 }
5583 
5584 static void
5585 i_mdi_rele_all_phci(mdi_client_t *ct)
5586 {
5587 	mdi_pathinfo_t  *pip;
5588 
5589 	ASSERT(mutex_owned(&ct->ct_mutex));
5590 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
5591 	while (pip != NULL) {
5592 		mdi_hold_path(pip);
5593 		MDI_PI_LOCK(pip);
5594 		i_mdi_pm_rele_pip(pip);
5595 		MDI_PI_UNLOCK(pip);
5596 		mdi_rele_path(pip);
5597 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
5598 	}
5599 }
5600 
5601 static void
5602 i_mdi_pm_rele_client(mdi_client_t *ct, int decr)
5603 {
5604 	ASSERT(ct);
5605 
5606 	if (i_ddi_node_state(ct->ct_dip) >= DS_READY) {
5607 		ct->ct_power_cnt -= decr;
5608 		MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_rele_client "
5609 		    "ct_power_cnt = %d decr = %d\n", ct->ct_power_cnt, decr));
5610 	}
5611 
5612 	ASSERT(ct->ct_power_cnt >= 0);
5613 	if (ct->ct_power_cnt == 0) {
5614 		i_mdi_rele_all_phci(ct);
5615 		return;
5616 	}
5617 }
5618 
5619 static void
5620 i_mdi_pm_reset_client(mdi_client_t *ct)
5621 {
5622 	MDI_DEBUG(4, (CE_NOTE, ct->ct_dip, "i_mdi_pm_reset_client "
5623 	    "ct_power_cnt = %d\n", ct->ct_power_cnt));
5624 	ct->ct_power_cnt = 0;
5625 	i_mdi_rele_all_phci(ct);
5626 	ct->ct_powercnt_reset = 1;
5627 	ct->ct_powercnt_held = 0;
5628 }
5629 
5630 static void
5631 i_mdi_pm_hold_all_phci(mdi_client_t *ct)
5632 {
5633 	mdi_pathinfo_t  *pip;
5634 	ASSERT(mutex_owned(&ct->ct_mutex));
5635 
5636 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
5637 	while (pip != NULL) {
5638 		mdi_hold_path(pip);
5639 		MDI_PI_LOCK(pip);
5640 		i_mdi_pm_hold_pip(pip);
5641 		MDI_PI_UNLOCK(pip);
5642 		mdi_rele_path(pip);
5643 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
5644 	}
5645 }
5646 
5647 static int
5648 i_mdi_power_one_phci(mdi_pathinfo_t *pip)
5649 {
5650 	int		ret;
5651 	dev_info_t	*ph_dip;
5652 
5653 	MDI_PI_LOCK(pip);
5654 	i_mdi_pm_hold_pip(pip);
5655 
5656 	ph_dip = mdi_pi_get_phci(pip);
5657 	MDI_PI_UNLOCK(pip);
5658 
5659 	/* bring all components of phci to full power */
5660 	MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_power_one_phci "
5661 	    "pm_powerup for %s%d\n", ddi_get_name(ph_dip),
5662 	    ddi_get_instance(ph_dip)));
5663 
5664 	ret = pm_powerup(ph_dip);
5665 
5666 	if (ret == DDI_FAILURE) {
5667 		MDI_DEBUG(4, (CE_NOTE, ph_dip, "i_mdi_power_one_phci "
5668 		    "pm_powerup FAILED for %s%d\n",
5669 		    ddi_get_name(ph_dip), ddi_get_instance(ph_dip)));
5670 
5671 		MDI_PI_LOCK(pip);
5672 		i_mdi_pm_rele_pip(pip);
5673 		MDI_PI_UNLOCK(pip);
5674 		return (MDI_FAILURE);
5675 	}
5676 
5677 	return (MDI_SUCCESS);
5678 }
5679 
5680 static int
5681 i_mdi_power_all_phci(mdi_client_t *ct)
5682 {
5683 	mdi_pathinfo_t  *pip;
5684 	int		succeeded = 0;
5685 
5686 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
5687 	while (pip != NULL) {
5688 		mdi_hold_path(pip);
5689 		MDI_CLIENT_UNLOCK(ct);
5690 		if (i_mdi_power_one_phci(pip) == MDI_SUCCESS)
5691 			succeeded = 1;
5692 
5693 		ASSERT(ct == MDI_PI(pip)->pi_client);
5694 		MDI_CLIENT_LOCK(ct);
5695 		mdi_rele_path(pip);
5696 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
5697 	}
5698 
5699 	return (succeeded ? MDI_SUCCESS : MDI_FAILURE);
5700 }
5701 
/*
 * mdi_bus_power():
 *		1. Place the phci(s) into powered up state so that
 *		   client can do power management
 *		2. Ensure phci powered up as client power managing
 *
 *		BUS_POWER_PRE_NOTIFICATION, BUS_POWER_POST_NOTIFICATION and
 *		BUS_POWER_HAS_CHANGED are handled here for the client named in
 *		'arg'. BUS_POWER_NOINVOL is rejected. Every other op is passed
 *		to pm_busop_bus_power() and its return value is returned.
 *
 *		'arg' is read as a pm_bp_child_pwrchg_t for the PRE/POST
 *		notifications and as a pm_bp_has_changed_t for HAS_CHANGED.
 *		'result' is read as an int for POST_NOTIFICATION.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 */
int
mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
    void *arg, void *result)
{
	int			ret = MDI_SUCCESS;
	pm_bp_child_pwrchg_t	*bpc;
	mdi_client_t		*ct;
	dev_info_t		*cdip;
	pm_bp_has_changed_t	*bphc;

	/*
	 * BUS_POWER_NOINVOL not supported
	 */
	if (op == BUS_POWER_NOINVOL)
		return (MDI_FAILURE);

	/*
	 * ignore other OPs.
	 * return quickly to save cpu cycles on the ct processing
	 */
	switch (op) {
	case BUS_POWER_PRE_NOTIFICATION:
	case BUS_POWER_POST_NOTIFICATION:
		bpc = (pm_bp_child_pwrchg_t *)arg;
		cdip = bpc->bpc_dip;
		break;
	case BUS_POWER_HAS_CHANGED:
		bphc = (pm_bp_has_changed_t *)arg;
		cdip = bphc->bphc_dip;
		break;
	default:
		return (pm_busop_bus_power(parent, impl_arg, op, arg, result));
	}

	ASSERT(MDI_CLIENT(cdip));

	ct = i_devi_get_client(cdip);
	if (ct == NULL)
		return (MDI_FAILURE);

	/*
	 * wait until the mdi_pathinfo node state changes are processed
	 */
	MDI_CLIENT_LOCK(ct);
	switch (op) {
	case BUS_POWER_PRE_NOTIFICATION:
		MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip, "mdi_bus_power "
		    "BUS_POWER_PRE_NOTIFICATION:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d\n",
		    PM_NAME(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp));

		/* serialize power level change per client */
		while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
			cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

		/* cleared again in the BUS_POWER_POST_NOTIFICATION case */
		MDI_CLIENT_SET_POWER_TRANSITION(ct);

		/* no holds on the client yet: try to power up its phci(s) */
		if (ct->ct_power_cnt == 0) {
			ret = i_mdi_power_all_phci(ct);
		}

		/*
		 * if new_level > 0:
		 *	- hold phci(s)
		 *	- power up phci(s) if not already
		 * ignore power down
		 */
		if (bpc->bpc_nlevel > 0) {
			if (!DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
				    "mdi_bus_power i_mdi_pm_hold_client\n"));
				i_mdi_pm_hold_client(ct, ct->ct_path_count);
			}
		}
		break;
	case BUS_POWER_POST_NOTIFICATION:
		MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip, "mdi_bus_power "
		    "BUS_POWER_POST_NOTIFICATION:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d result=%d\n",
		    PM_NAME(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp,
		    *(int *)result));

		/* record the new client power state on a successful change */
		if (*(int *)result == DDI_SUCCESS) {
			if (bpc->bpc_nlevel > 0) {
				MDI_CLIENT_SET_POWER_UP(ct);
			} else {
				MDI_CLIENT_SET_POWER_DOWN(ct);
			}
		}

		/* release the hold we did in pre-notification */
		if (bpc->bpc_nlevel > 0 && (*(int *)result != DDI_SUCCESS) &&
		    !DEVI_IS_ATTACHING(ct->ct_dip)) {
			MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
			    "mdi_bus_power i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		}

		/* successful power down to level 0 */
		if (bpc->bpc_nlevel == 0 && (*(int *)result == DDI_SUCCESS)) {
			/* another thread might have started attaching */
			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
				    "mdi_bus_power i_mdi_pm_rele_client\n"));
				i_mdi_pm_rele_client(ct, ct->ct_path_count);
			/* detaching has been taken care in pm_post_unconfig */
			} else if (!DEVI_IS_DETACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (CE_NOTE, bpc->bpc_dip,
				    "mdi_bus_power i_mdi_pm_reset_client\n"));
				i_mdi_pm_reset_client(ct);
			}
		}

		/* end of the transition: wake up threads waiting on the cv */
		MDI_CLIENT_CLEAR_POWER_TRANSITION(ct);
		cv_broadcast(&ct->ct_powerchange_cv);

		break;

	/* need to do more */
	case BUS_POWER_HAS_CHANGED:
		MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip, "mdi_bus_power "
		    "BUS_POWER_HAS_CHANGED:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d\n",
		    PM_NAME(bphc->bphc_dip), PM_ADDR(bphc->bphc_dip),
		    bphc->bphc_olevel, bphc->bphc_nlevel, bphc->bphc_comp));

		/* level was raised: power up phci(s) if needed and hold */
		if (bphc->bphc_nlevel > 0 &&
		    bphc->bphc_nlevel > bphc->bphc_olevel) {
			if (ct->ct_power_cnt == 0) {
				ret = i_mdi_power_all_phci(ct);
			}
			MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip,
			    "mdi_bus_power i_mdi_pm_hold_client\n"));
			i_mdi_pm_hold_client(ct, ct->ct_path_count);
		}

		/* level dropped to 0 from an old level other than -1 */
		if (bphc->bphc_nlevel == 0 && bphc->bphc_olevel != -1) {
			MDI_DEBUG(4, (CE_NOTE, bphc->bphc_dip,
			    "mdi_bus_power i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		}
		break;
	}

	MDI_CLIENT_UNLOCK(ct);
	return (ret);
}
5859 
5860 static int
5861 i_mdi_pm_pre_config_one(dev_info_t *child)
5862 {
5863 	int		ret = MDI_SUCCESS;
5864 	mdi_client_t	*ct;
5865 
5866 	ct = i_devi_get_client(child);
5867 	if (ct == NULL)
5868 		return (MDI_FAILURE);
5869 
5870 	MDI_CLIENT_LOCK(ct);
5871 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
5872 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
5873 
5874 	if (!MDI_CLIENT_IS_FAILED(ct)) {
5875 		MDI_CLIENT_UNLOCK(ct);
5876 		MDI_DEBUG(4, (CE_NOTE, child,
5877 		    "i_mdi_pm_pre_config_one already configured\n"));
5878 		return (MDI_SUCCESS);
5879 	}
5880 
5881 	if (ct->ct_powercnt_held) {
5882 		MDI_CLIENT_UNLOCK(ct);
5883 		MDI_DEBUG(4, (CE_NOTE, child,
5884 		    "i_mdi_pm_pre_config_one ALREADY held\n"));
5885 		return (MDI_SUCCESS);
5886 	}
5887 
5888 	if (ct->ct_power_cnt == 0) {
5889 		ret = i_mdi_power_all_phci(ct);
5890 	}
5891 	MDI_DEBUG(4, (CE_NOTE, child,
5892 	    "i_mdi_pm_pre_config_one i_mdi_pm_hold_client\n"));
5893 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
5894 	ct->ct_powercnt_held = 1;
5895 	ct->ct_powercnt_reset = 0;
5896 	MDI_CLIENT_UNLOCK(ct);
5897 	return (ret);
5898 }
5899 
5900 static int
5901 i_mdi_pm_pre_config(dev_info_t *parent, dev_info_t *child)
5902 {
5903 	int			ret = MDI_SUCCESS;
5904 	dev_info_t		*cdip;
5905 	int			circ;
5906 
5907 	ASSERT(MDI_VHCI(parent));
5908 
5909 	/* ndi_devi_config_one */
5910 	if (child) {
5911 		return (i_mdi_pm_pre_config_one(child));
5912 	}
5913 
5914 	/* devi_config_common */
5915 	ndi_devi_enter(parent, &circ);
5916 	cdip = ddi_get_child(parent);
5917 	while (cdip) {
5918 		dev_info_t *next = ddi_get_next_sibling(cdip);
5919 
5920 		ret = i_mdi_pm_pre_config_one(cdip);
5921 		if (ret != MDI_SUCCESS)
5922 			break;
5923 		cdip = next;
5924 	}
5925 	ndi_devi_exit(parent, circ);
5926 	return (ret);
5927 }
5928 
5929 static int
5930 i_mdi_pm_pre_unconfig_one(dev_info_t *child, int *held, int flags)
5931 {
5932 	int		ret = MDI_SUCCESS;
5933 	mdi_client_t	*ct;
5934 
5935 	ct = i_devi_get_client(child);
5936 	if (ct == NULL)
5937 		return (MDI_FAILURE);
5938 
5939 	MDI_CLIENT_LOCK(ct);
5940 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
5941 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
5942 
5943 	if (i_ddi_node_state(ct->ct_dip) < DS_READY) {
5944 		MDI_DEBUG(4, (CE_NOTE, child,
5945 		    "i_mdi_pm_pre_unconfig node detached already\n"));
5946 		MDI_CLIENT_UNLOCK(ct);
5947 		return (MDI_SUCCESS);
5948 	}
5949 
5950 	if (MDI_CLIENT_IS_POWERED_DOWN(ct) &&
5951 	    (flags & NDI_AUTODETACH)) {
5952 		MDI_DEBUG(4, (CE_NOTE, child,
5953 		    "i_mdi_pm_pre_unconfig auto-modunload\n"));
5954 		MDI_CLIENT_UNLOCK(ct);
5955 		return (MDI_FAILURE);
5956 	}
5957 
5958 	if (ct->ct_powercnt_held) {
5959 		MDI_DEBUG(4, (CE_NOTE, child,
5960 		    "i_mdi_pm_pre_unconfig ct_powercnt_held\n"));
5961 		MDI_CLIENT_UNLOCK(ct);
5962 		*held = 1;
5963 		return (MDI_SUCCESS);
5964 	}
5965 
5966 	if (ct->ct_power_cnt == 0) {
5967 		ret = i_mdi_power_all_phci(ct);
5968 	}
5969 	MDI_DEBUG(4, (CE_NOTE, child,
5970 	    "i_mdi_pm_pre_unconfig i_mdi_pm_hold_client\n"));
5971 	i_mdi_pm_hold_client(ct, ct->ct_path_count);
5972 	ct->ct_powercnt_held = 1;
5973 	ct->ct_powercnt_reset = 0;
5974 	MDI_CLIENT_UNLOCK(ct);
5975 	if (ret == MDI_SUCCESS)
5976 		*held = 1;
5977 	return (ret);
5978 }
5979 
5980 static int
5981 i_mdi_pm_pre_unconfig(dev_info_t *parent, dev_info_t *child, int *held,
5982     int flags)
5983 {
5984 	int			ret = MDI_SUCCESS;
5985 	dev_info_t		*cdip;
5986 	int			circ;
5987 
5988 	ASSERT(MDI_VHCI(parent));
5989 	*held = 0;
5990 
5991 	/* ndi_devi_unconfig_one */
5992 	if (child) {
5993 		return (i_mdi_pm_pre_unconfig_one(child, held, flags));
5994 	}
5995 
5996 	/* devi_unconfig_common */
5997 	ndi_devi_enter(parent, &circ);
5998 	cdip = ddi_get_child(parent);
5999 	while (cdip) {
6000 		dev_info_t *next = ddi_get_next_sibling(cdip);
6001 
6002 		ret = i_mdi_pm_pre_unconfig_one(cdip, held, flags);
6003 		cdip = next;
6004 	}
6005 	ndi_devi_exit(parent, circ);
6006 
6007 	if (*held)
6008 		ret = MDI_SUCCESS;
6009 
6010 	return (ret);
6011 }
6012 
6013 static void
6014 i_mdi_pm_post_config_one(dev_info_t *child)
6015 {
6016 	mdi_client_t	*ct;
6017 
6018 	ct = i_devi_get_client(child);
6019 	if (ct == NULL)
6020 		return;
6021 
6022 	MDI_CLIENT_LOCK(ct);
6023 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6024 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6025 
6026 	if (ct->ct_powercnt_reset || !ct->ct_powercnt_held) {
6027 		MDI_DEBUG(4, (CE_NOTE, child,
6028 		    "i_mdi_pm_post_config_one NOT held\n"));
6029 		MDI_CLIENT_UNLOCK(ct);
6030 		return;
6031 	}
6032 
6033 	/* client has not been updated */
6034 	if (MDI_CLIENT_IS_FAILED(ct)) {
6035 		MDI_DEBUG(4, (CE_NOTE, child,
6036 		    "i_mdi_pm_post_config_one NOT configured\n"));
6037 		MDI_CLIENT_UNLOCK(ct);
6038 		return;
6039 	}
6040 
6041 	/* another thread might have powered it down or detached it */
6042 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6043 	    !DEVI_IS_ATTACHING(ct->ct_dip)) ||
6044 	    (i_ddi_node_state(ct->ct_dip) < DS_READY &&
6045 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6046 		MDI_DEBUG(4, (CE_NOTE, child,
6047 		    "i_mdi_pm_post_config i_mdi_pm_reset_client\n"));
6048 		i_mdi_pm_reset_client(ct);
6049 	} else {
6050 		mdi_pathinfo_t	*pip, *next;
6051 		int	valid_path_count = 0;
6052 
6053 		MDI_DEBUG(4, (CE_NOTE, child,
6054 		    "i_mdi_pm_post_config i_mdi_pm_rele_client\n"));
6055 		pip = ct->ct_path_head;
6056 		while (pip != NULL) {
6057 			MDI_PI_LOCK(pip);
6058 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6059 			if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
6060 				== MDI_PATHINFO_STATE_ONLINE ||
6061 			    (MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
6062 				== MDI_PATHINFO_STATE_STANDBY)
6063 				valid_path_count ++;
6064 			MDI_PI_UNLOCK(pip);
6065 			pip = next;
6066 		}
6067 		i_mdi_pm_rele_client(ct, valid_path_count);
6068 	}
6069 	ct->ct_powercnt_held = 0;
6070 	MDI_CLIENT_UNLOCK(ct);
6071 }
6072 
6073 static void
6074 i_mdi_pm_post_config(dev_info_t *parent, dev_info_t *child)
6075 {
6076 	int		circ;
6077 	dev_info_t	*cdip;
6078 	ASSERT(MDI_VHCI(parent));
6079 
6080 	/* ndi_devi_config_one */
6081 	if (child) {
6082 		i_mdi_pm_post_config_one(child);
6083 		return;
6084 	}
6085 
6086 	/* devi_config_common */
6087 	ndi_devi_enter(parent, &circ);
6088 	cdip = ddi_get_child(parent);
6089 	while (cdip) {
6090 		dev_info_t *next = ddi_get_next_sibling(cdip);
6091 
6092 		i_mdi_pm_post_config_one(cdip);
6093 		cdip = next;
6094 	}
6095 	ndi_devi_exit(parent, circ);
6096 }
6097 
6098 static void
6099 i_mdi_pm_post_unconfig_one(dev_info_t *child)
6100 {
6101 	mdi_client_t	*ct;
6102 
6103 	ct = i_devi_get_client(child);
6104 	if (ct == NULL)
6105 		return;
6106 
6107 	MDI_CLIENT_LOCK(ct);
6108 	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6109 		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6110 
6111 	if (!ct->ct_powercnt_held) {
6112 		MDI_DEBUG(4, (CE_NOTE, child,
6113 		    "i_mdi_pm_post_unconfig NOT held\n"));
6114 		MDI_CLIENT_UNLOCK(ct);
6115 		return;
6116 	}
6117 
6118 	/* failure detaching or another thread just attached it */
6119 	if ((MDI_CLIENT_IS_POWERED_DOWN(ct) &&
6120 	    i_ddi_node_state(ct->ct_dip) == DS_READY) ||
6121 	    (i_ddi_node_state(ct->ct_dip) != DS_READY &&
6122 	    !DEVI_IS_ATTACHING(ct->ct_dip))) {
6123 		MDI_DEBUG(4, (CE_NOTE, child,
6124 		    "i_mdi_pm_post_unconfig i_mdi_pm_reset_client\n"));
6125 		i_mdi_pm_reset_client(ct);
6126 	}
6127 
6128 	MDI_DEBUG(4, (CE_NOTE, child,
6129 	    "i_mdi_pm_post_unconfig not changed\n"));
6130 	MDI_CLIENT_UNLOCK(ct);
6131 }
6132 
6133 static void
6134 i_mdi_pm_post_unconfig(dev_info_t *parent, dev_info_t *child, int held)
6135 {
6136 	int			circ;
6137 	dev_info_t		*cdip;
6138 
6139 	ASSERT(MDI_VHCI(parent));
6140 
6141 	if (!held) {
6142 		MDI_DEBUG(4, (CE_NOTE, parent,
6143 		    "i_mdi_pm_post_unconfig held = %d\n", held));
6144 		return;
6145 	}
6146 
6147 	if (child) {
6148 		i_mdi_pm_post_unconfig_one(child);
6149 		return;
6150 	}
6151 
6152 	ndi_devi_enter(parent, &circ);
6153 	cdip = ddi_get_child(parent);
6154 	while (cdip) {
6155 		dev_info_t *next = ddi_get_next_sibling(cdip);
6156 
6157 		i_mdi_pm_post_unconfig_one(cdip);
6158 		cdip = next;
6159 	}
6160 	ndi_devi_exit(parent, circ);
6161 }
6162 
6163 int
6164 mdi_power(dev_info_t *vdip, mdi_pm_op_t op, void *args, char *devnm, int flags)
6165 {
6166 	int			circ, ret = MDI_SUCCESS;
6167 	dev_info_t		*client_dip = NULL;
6168 	mdi_client_t		*ct;
6169 
6170 	/*
6171 	 * Handling ndi_devi_config_one and ndi_devi_unconfig_one.
6172 	 * Power up pHCI for the named client device.
6173 	 * Note: Before the client is enumerated under vhci by phci,
6174 	 * client_dip can be NULL. Then proceed to power up all the
6175 	 * pHCIs.
6176 	 */
6177 	if (devnm != NULL) {
6178 		ndi_devi_enter(vdip, &circ);
6179 		client_dip = ndi_devi_findchild(vdip, devnm);
6180 		ndi_devi_exit(vdip, circ);
6181 	}
6182 
6183 	MDI_DEBUG(4, (CE_NOTE, vdip, "mdi_power op = %d\n", op));
6184 
6185 	switch (op) {
6186 	case MDI_PM_PRE_CONFIG:
6187 		ret = i_mdi_pm_pre_config(vdip, client_dip);
6188 
6189 		break;
6190 	case MDI_PM_PRE_UNCONFIG:
6191 		ret = i_mdi_pm_pre_unconfig(vdip, client_dip, (int *)args,
6192 		    flags);
6193 
6194 		break;
6195 	case MDI_PM_POST_CONFIG:
6196 		i_mdi_pm_post_config(vdip, client_dip);
6197 
6198 		break;
6199 	case MDI_PM_POST_UNCONFIG:
6200 		i_mdi_pm_post_unconfig(vdip, client_dip, *(int *)args);
6201 
6202 		break;
6203 	case MDI_PM_HOLD_POWER:
6204 	case MDI_PM_RELE_POWER:
6205 		ASSERT(args);
6206 
6207 		client_dip = (dev_info_t *)args;
6208 		ASSERT(MDI_CLIENT(client_dip));
6209 
6210 		ct = i_devi_get_client(client_dip);
6211 		MDI_CLIENT_LOCK(ct);
6212 
6213 		if (op == MDI_PM_HOLD_POWER) {
6214 			if (ct->ct_power_cnt == 0) {
6215 				(void) i_mdi_power_all_phci(ct);
6216 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6217 				    "mdi_power i_mdi_pm_hold_client\n"));
6218 				i_mdi_pm_hold_client(ct, ct->ct_path_count);
6219 			}
6220 		} else {
6221 			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
6222 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6223 				    "mdi_power i_mdi_pm_rele_client\n"));
6224 				i_mdi_pm_rele_client(ct, ct->ct_path_count);
6225 			} else {
6226 				MDI_DEBUG(4, (CE_NOTE, client_dip,
6227 				    "mdi_power i_mdi_pm_reset_client\n"));
6228 				i_mdi_pm_reset_client(ct);
6229 			}
6230 		}
6231 
6232 		MDI_CLIENT_UNLOCK(ct);
6233 		break;
6234 	default:
6235 		break;
6236 	}
6237 
6238 	return (ret);
6239 }
6240 
6241 int
6242 mdi_component_is_vhci(dev_info_t *dip, const char **mdi_class)
6243 {
6244 	mdi_vhci_t *vhci;
6245 
6246 	if (!MDI_VHCI(dip))
6247 		return (MDI_FAILURE);
6248 
6249 	if (mdi_class) {
6250 		vhci = DEVI(dip)->devi_mdi_xhci;
6251 		ASSERT(vhci);
6252 		*mdi_class = vhci->vh_class;
6253 	}
6254 
6255 	return (MDI_SUCCESS);
6256 }
6257 
6258 int
6259 mdi_component_is_phci(dev_info_t *dip, const char **mdi_class)
6260 {
6261 	mdi_phci_t *phci;
6262 
6263 	if (!MDI_PHCI(dip))
6264 		return (MDI_FAILURE);
6265 
6266 	if (mdi_class) {
6267 		phci = DEVI(dip)->devi_mdi_xhci;
6268 		ASSERT(phci);
6269 		*mdi_class = phci->ph_vhci->vh_class;
6270 	}
6271 
6272 	return (MDI_SUCCESS);
6273 }
6274 
6275 int
6276 mdi_component_is_client(dev_info_t *dip, const char **mdi_class)
6277 {
6278 	mdi_client_t *client;
6279 
6280 	if (!MDI_CLIENT(dip))
6281 		return (MDI_FAILURE);
6282 
6283 	if (mdi_class) {
6284 		client = DEVI(dip)->devi_mdi_client;
6285 		ASSERT(client);
6286 		*mdi_class = client->ct_vhci->vh_class;
6287 	}
6288 
6289 	return (MDI_SUCCESS);
6290 }
6291 
/*
 * XXX This list should include all phci drivers needed during boot time,
 * though it currently contains "fp" only.
 * Hopefully, the mechanism provided here will be replaced with a better
 * mechanism by the vhci driven enumeration project.
 */
static char *phci_driver_list[] = {
	"fp"
};

/* number of entries in phci_driver_list[] */
#define	N_PHCI_DRIVERS	\
	(sizeof (phci_driver_list) / sizeof (phci_driver_list[0]))
6300 
6301 static void
6302 i_mdi_attach_phci_drivers()
6303 {
6304 	int  i;
6305 	major_t m;
6306 
6307 	for (i = 0; i < N_PHCI_DRIVERS; i++) {
6308 		m = ddi_name_to_major(phci_driver_list[i]);
6309 		if (m != (major_t)-1) {
6310 			if (ddi_hold_installed_driver(m) != NULL)
6311 				ddi_rele_driver(m);
6312 		}
6313 	}
6314 }
6315 
6316 /* bus config the specified phci */
6317 static void
6318 i_mdi_phci_bus_config(void *arg)
6319 {
6320 	mdi_phci_config_t *phc = (mdi_phci_config_t *)arg;
6321 	mdi_vhci_config_t *vhc;
6322 	dev_info_t	*ph_dip;
6323 	int		rv;
6324 
6325 	ASSERT(phc);
6326 	vhc = phc->phc_vhc;
6327 	ASSERT(vhc->vhc_op == BUS_CONFIG_ALL ||
6328 	    vhc->vhc_op == BUS_CONFIG_DRIVER);
6329 
6330 	/*
6331 	 * Must have already held the phci parent in
6332 	 * i_mdi_bus_config_all_phcis().
6333 	 * First configure the phci itself.
6334 	 */
6335 	rv = ndi_devi_config_one(phc->phc_parent_dip, phc->phc_devnm + 1,
6336 	    &ph_dip, vhc->vhc_flags);
6337 
6338 	/* release the hold that i_mdi_bus_config_all_phcis() placed */
6339 	ndi_rele_devi(phc->phc_parent_dip);
6340 
6341 	if (rv == NDI_SUCCESS) {
6342 		/* now bus config the phci */
6343 		if (vhc->vhc_op == BUS_CONFIG_DRIVER) {
6344 			(void) ndi_devi_config_driver(ph_dip, vhc->vhc_flags,
6345 				vhc->vhc_major);
6346 		} else
6347 			(void) ndi_devi_config(ph_dip, vhc->vhc_flags);
6348 
6349 		/* release the hold that ndi_devi_config_one() placed */
6350 		ndi_rele_devi(ph_dip);
6351 	}
6352 }
6353 
/*
 * Bus config all registered phcis associated with the vhci in parallel.
 * This process guarantees that the child nodes are enumerated under the vhci,
 * but not necessarily attached.
 * op must be BUS_CONFIG_DRIVER or BUS_CONFIG_ALL.
 *
 * flags, op and maj are recorded in the vhci's bus config structure for use
 * by the taskq threads running i_mdi_phci_bus_config().
 * When optimize is set, the request returns MDI_SUCCESS without doing any
 * work if it arrived before the recorded cutoff time, or once a
 * BUS_CONFIG_ALL that was in progress completes.
 *
 * Must not be called with the vhci dip busy owned by the caller.
 *
 * Returns MDI_SUCCESS, or MDI_FAILURE when the taskq can not be created.
 */
static int
i_mdi_bus_config_all_phcis(dev_info_t *vdip, uint_t flags,
    ddi_bus_config_op_t op, major_t maj, int optimize)
{
	mdi_vhci_t		*vh;
	mdi_phci_t		*ph;
	mdi_phci_config_t	*phc;
	int64_t			req_time;
	int			phci_count, rv;
	static int		first_time = 1;

	ASSERT(op == BUS_CONFIG_ALL || op == BUS_CONFIG_DRIVER);
	ASSERT(!DEVI_BUSY_OWNED(vdip));

	MDI_DEBUG(2, (CE_NOTE, vdip,
	    "!MDI: %s on all phcis: major = %d, flags = 0x%x, optimize = %d\n",
	    (op == BUS_CONFIG_DRIVER) ? "BUS_CONFIG_DRIVER" : "BUS_CONFIG_ALL",
	    (int)maj, flags, optimize));

	vh = i_devi_get_vhci(vdip);
	ASSERT(vh);

	mutex_enter(&mdi_mutex);

	/* arrival time of this request, compared with start/cutoff times */
	req_time = lbolt64;

	/*
	 * Reduce unnecessary BUS_CONFIG_ALLs when opening stale
	 * /dev/[r]dsk links.
	 */
	if (optimize && (req_time < vh->vh_bus_config.vhc_cutoff_time)) {
		mutex_exit(&mdi_mutex);
		return (MDI_SUCCESS);
	}

	/*
	 * To initiate bus configs on all phcis in parallel, create a taskq
	 * with multiple threads. Since creation of a taskq is a heavy weight
	 * operation, taskq is created once per vhci and destroyed only when
	 * vhci unregisters with mdi.
	 *
	 * If multiple bus config requests arrive at a time, bus configs on
	 * phcis are initiated on behalf of one of the requests. Other requests
	 * wait until the bus configs on phcis is done.
	 *
	 * When a BUS_CONFIG_ALL on phcis completes, the following is done
	 * to avoid more of unnecessary bus configs.
	 *
	 *	o all BUS_CONFIG_ALL requests currently waiting with optimize
	 *	flag set are returned, i.e., no new BUS_CONFIG_ALL is initiated
	 *	on phcis on behalf of these requests.
	 *
	 *	o all BUS_CONFIG_ALL or BUS_CONFIG_DRIVER requests currently
	 *	waiting but have arrived prior to initiating BUS_CONFIG_ALL on
	 *	phcis are also returned.
	 *
	 * In other cases a new BUS_CONFIG_ALL or BUS_CONFIG_DRIVER is
	 * initiated on phcis on behalf of a new request.
	 */

	/* check if a bus config on phcis is in progress */
	while (vh->vh_bus_config.vhc_start_time != 0) {
		ddi_bus_config_op_t current_op;
		int64_t start_time;

		/* snapshot the in-progress request before sleeping */
		current_op = vh->vh_bus_config.vhc_op;
		start_time = vh->vh_bus_config.vhc_start_time;

		/* wait until the current bus configs on phcis are done */
		while (vh->vh_bus_config.vhc_start_time == start_time)
			cv_wait(&vh->vh_bus_config.vhc_cv, &mdi_mutex);

		/* the completed BUS_CONFIG_ALL covers this request too */
		if (current_op == BUS_CONFIG_ALL &&
		    vh->vh_bus_config.vhc_cutoff_time > 0 && (optimize ||
		    req_time < start_time)) {
			mutex_exit(&mdi_mutex);
			return (MDI_SUCCESS);
		}
	}

	/*
	 * At this point we are single threaded until
	 * vh_bus_config.vhc_start_time is reset to 0 at the end of this
	 * function.
	 */

	vh->vh_bus_config.vhc_op = op;
	vh->vh_bus_config.vhc_major = maj;
	vh->vh_bus_config.vhc_flags = flags;
	vh->vh_bus_config.vhc_start_time = lbolt64;

	/*
	 * One time only, for a SCSI class vhci: attach the phci drivers
	 * named in phci_driver_list[]. mdi_mutex is dropped around the call.
	 */
	if (first_time && strcmp(vh->vh_class, MDI_HCI_CLASS_SCSI) == 0) {
		mutex_exit(&mdi_mutex);
		i_mdi_attach_phci_drivers();
		mutex_enter(&mdi_mutex);
		first_time = 0;
	}

	/* no phcis registered: nothing to configure */
	ASSERT(vh->vh_phci_count >= 0);
	if (vh->vh_phci_count == 0) {
		rv = MDI_SUCCESS;
		goto out1;
	}

	/*
	 * Create a taskq to initiate bus configs in parallel on phcis.
	 * Taskq allocation can be done in mdi_vhci_register() routine
	 * instead of here. For most systems, doing it here on demand saves
	 * resources as this code path is never called most of the times.
	 */
	if (vh->vh_bus_config.vhc_taskq == NULL) {
		/*
		 * it is ok even if vh->vh_phci_count changes after we release
		 * the mdi_mutex as phci_count is used just as an
		 * advisory number to taskq_create.
		 */
		phci_count = vh->vh_phci_count;
		mutex_exit(&mdi_mutex);

		/*
		 * As we are single threaded, it is ok to access the
		 * vh_bus_config.vhc_taskq member of vh outside of mdi_mutex
		 */
		if ((vh->vh_bus_config.vhc_taskq = taskq_create(
		    "mdi_bus_config_taskq", mdi_max_bus_config_threads,
		    MDI_TASKQ_PRI, phci_count, INT_MAX,
		    TASKQ_PREPOPULATE | TASKQ_DYNAMIC)) == NULL) {
			rv = MDI_FAILURE;
			/* mdi_mutex is not held here; "out" retakes it */
			goto out;
		}

		mutex_enter(&mdi_mutex);
	}

	/* allocate at least vh->vh_phci_count phci bus config structures */
	while (vh->vh_bus_config.vhc_phc_cnt < vh->vh_phci_count) {
		int count;

		count = vh->vh_phci_count - vh->vh_bus_config.vhc_phc_cnt;
		mutex_exit(&mdi_mutex);
		while (count--) {
			phc = kmem_alloc(sizeof (*phc), KM_SLEEP);
			phc->phc_vhc = &vh->vh_bus_config;
			/*
			 * there is no need to hold a lock here as we
			 * are single threaded and no one else manipulates
			 * the list while we are here.
			 */
			phc->phc_next = vh->vh_bus_config.vhc_phc;
			vh->vh_bus_config.vhc_phc = phc;
			vh->vh_bus_config.vhc_phc_cnt++;
		}
		mutex_enter(&mdi_mutex);
		/*
		 * as new phcis could register with mdi after we dropped
		 * the mdi_mutex, we need to recheck the vh->vh_phci_count.
		 * Hence the while loop.
		 */
	}

	/* pair each registered phci with one phci bus config structure */
	for (ph = vh->vh_phci_head, phc = vh->vh_bus_config.vhc_phc;
	    ph != NULL; ph = ph->ph_next, phc = phc->phc_next) {

		ASSERT(phc != NULL);

		/* build a phci config handle to be passed to a taskq thread */
		MDI_PHCI_LOCK(ph);
		ASSERT(ph->ph_dip);

		/*
		 * We need to hold the phci dip before bus configuring the phci.
		 * But placing a hold on the phci dip is not safe here due to
		 * the race with phci detach. To get around this race,
		 * we place a hold on the phci dip's parent and note down
		 * the phci's name@addr. Later, in i_mdi_phci_bus_config(),
		 * we'll first configure the phci itself before bus
		 * configuring the phci.
		 */
		phc->phc_parent_dip = ddi_get_parent(ph->ph_dip);
		ndi_hold_devi(phc->phc_parent_dip);
		(void) ddi_deviname(ph->ph_dip, phc->phc_devnm);
		MDI_PHCI_UNLOCK(ph);
	}

	/* number of handles built above, i.e. number of tasks to dispatch */
	phci_count = vh->vh_phci_count;
	if (vh->vh_bus_config.vhc_cutoff_time == -1)
		vh->vh_bus_config.vhc_cutoff_time = 0;
	mutex_exit(&mdi_mutex);

	MDI_DEBUG(2, (CE_NOTE, vdip,
	    "!MDI: initiating %s on all phcis, major = %d, flags = 0x%x\n",
	    (op == BUS_CONFIG_DRIVER) ? "BUS_CONFIG_DRIVER" : "BUS_CONFIG_ALL",
	    (int)maj, flags));

	/*
	 * again, no need to hold a lock here as we are single threaded and
	 * no one else manipulates the list while we are here.
	 */
	for (phc = vh->vh_bus_config.vhc_phc; phci_count--;
	    phc = phc->phc_next) {
		(void) taskq_dispatch(vh->vh_bus_config.vhc_taskq,
		    i_mdi_phci_bus_config, phc, TQ_SLEEP);
	}

	/* wait until all phci bus configs are done */
	taskq_wait(vh->vh_bus_config.vhc_taskq);
	rv = MDI_SUCCESS;

out:
	mutex_enter(&mdi_mutex);
out1:
	/* mark the bus config as finished and wake up waiting requests */
	vh->vh_bus_config.vhc_start_time = 0;
	if (op == BUS_CONFIG_ALL && vh->vh_bus_config.vhc_cutoff_time != -1) {
		vh->vh_bus_config.vhc_cutoff_time = lbolt64 +
		    (int64_t)drv_usectohz(mdi_bus_config_timeout * 1000000);
	}
	cv_broadcast(&vh->vh_bus_config.vhc_cv);
	mutex_exit(&mdi_mutex);

	MDI_DEBUG(2, (CE_NOTE, vdip, "!MDI: %s on all phcis %s\n",
	    (op == BUS_CONFIG_DRIVER) ? "BUS_CONFIG_DRIVER" : "BUS_CONFIG_ALL",
	    (rv == MDI_SUCCESS) ? "successful" : "failed"));

	return (rv);
}
6584 
6585 /*
6586  * A simple bus config implementation for vhcis with the assumption that all
6587  * phcis are always registered with MDI.
6588  *
6589  * BUS_CONFIG_ALL
6590  *
6591  * 	Do BUS_CONFIG_ALL on all phcis associated with the vhci.
6592  *
6593  * BUS_CONFIG_DRIVER
6594  *
6595  * 	Do BUS_CONFIG_DRIVER on all phcis associated with the vhci.
6596  *
6597  * BUS_CONFIG_ONE
6598  *
6599  *	If the requested child has already been enumerated under the vhci
6600  *	configure the child and return. Otherwise do BUS_CONFIG_ALL on all
6601  *	phcis associated with the vhci.
6602  */
6603 int
6604 mdi_vhci_bus_config(dev_info_t *vdip, uint_t flags, ddi_bus_config_op_t op,
6605     void *arg, dev_info_t **child)
6606 {
6607 	int rv = MDI_SUCCESS;
6608 
6609 	/*
6610 	 * While bus configuring phcis, the phci driver interactions with MDI
6611 	 * cause child nodes to be enumerated under the vhci node for which
6612 	 * they need to ndi_devi_enter the vhci node.
6613 	 *
6614 	 * Unfortunately, to avoid the deadlock, we ourself can not wait for
6615 	 * for the bus config operations on phcis to finish while holding the
6616 	 * ndi_devi_enter lock. To avoid this deadlock, skip bus configs on
6617 	 * phcis and call the default framework provided bus config function
6618 	 * if we are called with ndi_devi_enter lock held.
6619 	 */
6620 	if (DEVI_BUSY_OWNED(vdip)) {
6621 		MDI_DEBUG(2, (CE_NOTE, vdip,
6622 		    "!MDI: vhci bus config: vhci dip is busy owned\n"));
6623 		goto default_bus_config;
6624 	}
6625 
6626 	switch (op) {
6627 	case BUS_CONFIG_ONE:
6628 		/*
6629 		 * First try to directly configure the requested child.
6630 		 * This will work only if the requested child has already
6631 		 * been enumerated under vhci, which is usually the most common
6632 		 * case.
6633 		 */
6634 		if (ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
6635 		    NDI_SUCCESS) {
6636 			return (MDI_SUCCESS);
6637 		}
6638 
6639 		MDI_DEBUG(2, (CE_NOTE, vdip, "!MDI: BUS_CONFIG_ONE on %s: "
6640 		    "will do BUS_CONFIG_ALL on all phcis\n", (char *)arg));
6641 
6642 		/* now do BUS_CONFIG_ALL on all phcis */
6643 		rv = i_mdi_bus_config_all_phcis(vdip, flags,
6644 		    BUS_CONFIG_ALL, -1, 1);
6645 		break;
6646 
6647 	case BUS_CONFIG_DRIVER:
6648 		rv = i_mdi_bus_config_all_phcis(vdip, flags, op,
6649 		    (major_t)(uintptr_t)arg, 0);
6650 		break;
6651 
6652 	case BUS_CONFIG_ALL:
6653 		rv = i_mdi_bus_config_all_phcis(vdip, flags, op, -1, 0);
6654 		break;
6655 
6656 	default:
6657 		break;
6658 	}
6659 
6660 default_bus_config:
6661 	/*
6662 	 * i_mdi_bus_config_all_phcis() guarantees that child nodes are
6663 	 * enumerated under the vhci, but not necessarily attached.
6664 	 * Now configure the appropriate child nodes.
6665 	 */
6666 	if (rv == MDI_SUCCESS &&
6667 	    ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
6668 	    NDI_SUCCESS) {
6669 		return (MDI_SUCCESS);
6670 	}
6671 
6672 	return (MDI_FAILURE);
6673 }
6674 
6675 
6676 void *
6677 mdi_client_get_vhci_private(dev_info_t *dip)
6678 {
6679 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
6680 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
6681 		mdi_client_t	*ct;
6682 		ct = i_devi_get_client(dip);
6683 		return (ct->ct_vprivate);
6684 	}
6685 	return (NULL);
6686 }
6687 
6688 void
6689 mdi_client_set_vhci_private(dev_info_t *dip, void *data)
6690 {
6691 	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
6692 	if (mdi_component_is_client(dip, NULL) == MDI_SUCCESS) {
6693 		mdi_client_t	*ct;
6694 		ct = i_devi_get_client(dip);
6695 		ct->ct_vprivate = data;
6696 	}
6697 }
6698 /*
6699  * mdi_pi_get_vhci_private():
6700  *		Get the vhci private information associated with the
6701  *		mdi_pathinfo node
6702  */
6703 void *
6704 mdi_pi_get_vhci_private(mdi_pathinfo_t *pip)
6705 {
6706 	caddr_t	vprivate = NULL;
6707 	if (pip) {
6708 		vprivate = MDI_PI(pip)->pi_vprivate;
6709 	}
6710 	return (vprivate);
6711 }
6712 
6713 /*
6714  * mdi_pi_set_vhci_private():
6715  *		Set the vhci private information in the mdi_pathinfo node
6716  */
6717 void
6718 mdi_pi_set_vhci_private(mdi_pathinfo_t *pip, void *priv)
6719 {
6720 	if (pip) {
6721 		MDI_PI(pip)->pi_vprivate = priv;
6722 	}
6723 }
6724 
6725 /*
6726  * mdi_phci_get_vhci_private():
6727  *		Get the vhci private information associated with the
6728  *		mdi_phci node
6729  */
6730 void *
6731 mdi_phci_get_vhci_private(dev_info_t *dip)
6732 {
6733 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
6734 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
6735 		mdi_phci_t	*ph;
6736 		ph = i_devi_get_phci(dip);
6737 		return (ph->ph_vprivate);
6738 	}
6739 	return (NULL);
6740 }
6741 
6742 /*
6743  * mdi_phci_set_vhci_private():
6744  *		Set the vhci private information in the mdi_phci node
6745  */
6746 void
6747 mdi_phci_set_vhci_private(dev_info_t *dip, void *priv)
6748 {
6749 	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
6750 	if (mdi_component_is_phci(dip, NULL) == MDI_SUCCESS) {
6751 		mdi_phci_t	*ph;
6752 		ph = i_devi_get_phci(dip);
6753 		ph->ph_vprivate = priv;
6754 	}
6755 }
6756