xref: /illumos-gate/usr/src/uts/common/sys/mdi_impldefs.h (revision bdad7b9cb5784df1403f5f3d188edea03f0fb7cb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #ifndef	_SYS_MDI_IMPLDEFS_H
27 #define	_SYS_MDI_IMPLDEFS_H
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 #include <sys/note.h>
32 #include <sys/types.h>
33 #include <sys/sunmdi.h>
34 #include <sys/modhash.h>
35 #include <sys/callb.h>
36 
37 #ifdef	__cplusplus
38 extern "C" {
39 #endif
40 
41 #ifdef _KERNEL
42 
43 /*
44  * Multipath Driver Interfaces
45  *
46  * The multipathing framework is provided in two modules.  The 'mpxio' misc.
47  * module provides the core multipath framework and the 'scsi_vhci' nexus
48  * driver provides the SCSI-III command set driver functionality for
49  * managing Fibre-Channel storage devices.
50  *
51  * As in any multipathing solution there are three major problems to solve:
52  *
53  * 1) Identification and enumeration of multipath client devices.
54  * 2) Optimal path selection when routing I/O requests.
55  * 3) Observability interfaces to snapshot the multipath configuration,
56  *    and infrastructure to provide performance and error statistics.
57  *
58  * The mpxio framework consists of several major components:
59  *
60  * 1) The MDI is the Multiplexed Device Interface; this is the core glue which
61  *    holds the following components together.
62  * 2) vHCI (Virtual Host Controller Interconnect) drivers provide multipathing
63  *    services for a given bus technology (example: 'scsi_vhci' provides
64  *    multipathing support for SCSI-III fibre-channel devices).
65  * 3) pHCI (Physical Host Controller Interconnect) drivers provide transport
66  *    services for a given host controller (example: 'fcp' provides transport
67  *    for fibre-channel devices).
68  * 4) Client Devices are standard Solaris target (or leaf) drivers
69  *    (example: 'ssd' is the standard disk driver for fibre-channel arrays).
70  * 5) Multipath information nodes ('pathinfo' nodes) connect client device
71  *    nodes and pHCI device nodes in the device tree.
72  *
73  * With the scsi_vhci, a QLC card, and mpxio enabled, the device tree might
74  * look like this:
75  *
76  *              /\
77  *             /  ............
78  *     <vHCI>:/               \
79  *      +-----------+   +-----------+
80  *      | scsi_vhci |   |  pci@1f,0 |
81  *      +-----------+   +-----------+
82  *            /   \               \
83  * <Client>: /     \ :<Client>     \ :parent(pHCI)
84  *  +----------+ +-----------+    +-------------+
85  *  | ssd 1    | | ssd 2     |    | qlc@0,0     |
86  *  +----------+ +-----------+    +-------------+
87  *   |            |                /        \
88  *   |            |       <pHCI>: /          \ :<pHCI>
89  *   |            |      +-------------+   +-------------+
90  *   |            |      | pHCI 1 (fp) |   | pHCI 2 (fp) |
91  *   |            |      +-------------+   +-------------+
92  *   |            |          /        |      /          |
93  *   |            |    +------+       |    +------+     |
94  *   |            |    | ssd 3|       |    | ssd  |     |
95  *   |            |    |!mpxio|       |    | (OBP)|     |
96  *   |            |    +------+       |    +------+     |
97  *   |            |                   |                 |
98  *   |            |       <pathinfo>: |                 |
99  *   |            |               +-------+         +--------+
100  *   |            +-------------->| path  |-------->| path   |
101  *   |                            | info  |         | info   |
102  *   |                            | node 1|         | node 3 |
103  *   |                            +-------+         +--------+
104  *   |                                |               |
105  *   |                                |            +~~~~~~~~+
106  *   |                            +-------+        :+--------+
107  *   +--------------------------->| path  |-------->| path   |
108  *                                | info  |        :| info   |
109  *                                | node 2|        +| node 4 |
110  *                                +-------+         +--------+
111  *
112  * The multipath information nodes (mdi_pathinfo nodes) establish the
113  * relationship between the pseudo client driver instance nodes (children
114  * of the vHCI) and the physical host controller interconnect (pHCI
115  * drivers) forming a matrix structure.
116  *
117  * The mpxio module implements locking at multiple granularity levels to
118  * support the needs of various consumers.  The multipath matrix can be
119  * column locked, or row locked depending on the consumer. The intention
120  * is to balance simplicity and performance.
121  *
122  * Locking:
123  *
124  * The devinfo locking still applies:
125  *
126  *   1) An ndi_devi_enter of a parent protects linkage/state of children.
127  *   2) state >= DS_INITIALIZED adds devi_ref of parent
128  *   3) devi_ref at state >= DS_ATTACHED prevents detach(9E).
129  *
130  * The ordering of 1) is (vHCI, pHCI). For a DEBUG kernel this ordering
131  * is asserted by the ndi_devi_enter() implementation.  There is also an
132  * ndi_devi_enter(Client), which is atypical since the client is a leaf.
133  * This is done to synchronize pathinfo nodes during devinfo snapshot (see
134  * di_register_pip) by pretending that the pathinfo nodes are children
135  * of the client.
136  *
137  * In addition to devinfo locking the current implementation utilizes
138  * the following locks:
139  *
140  *   mdi_mutex: protects the global list of vHCIs.
141  *
142  *   vh_phci_mutex: per-vHCI (mutex) lock: protects list of pHCIs registered
143  *   with vHCI.
144  *
145  *   vh_client_mutex: per-vHCI (mutex) lock: protects list/hash of Clients
146  *   associated with vHCI.
147  *
148  *   ph_mutex: per-pHCI (mutex) lock: protects the column (pHCI-mdi_pathinfo
149  *   node list) and per-pHCI structure fields.  mdi_pathinfo node creation,
150  *   deletion and child mdi_pathinfo node state changes are serialized on per
151  *   pHCI basis (Protection against DR).
152  *
153  *   ct_mutex: per-client (mutex) lock: protects the row (client-mdi_pathinfo
154  *   node list) and per-client structure fields.  The client-mdi_pathinfo node
155  *   list is typically walked to select an optimal path when routing I/O
156  *   requests.
157  *
158  *   pi_mutex: per-mdi_pathinfo (mutex) lock: protects the mdi_pathinfo node
159  *   structure fields.
160  *
161  * Note that per-Client structure and per-pHCI fields are freely readable when
162  * corresponding mdi_pathinfo locks are held, since holding an mdi_pathinfo
163  * node guarantees that its corresponding client and pHCI devices will not be
164  * freed.
165  */
166 
167 /*
168  * MDI Client global unique identifier property name string definition
169  */
/* Property name string; the object itself is defined outside this header. */
extern const char			*mdi_client_guid_prop;

/*
 * 'char *' view of mdi_client_guid_prop (the cast drops the const qualifier).
 * The expansion is fully parenthesized so that the cast applies to the pointer
 * itself even when the macro is followed by a postfix operator such as [],
 * which binds more tightly than a cast.
 */
#define	MDI_CLIENT_GUID_PROP		((char *)mdi_client_guid_prop)
172 
173 /*
174  * MDI Client load balancing policy definitions
175  *
176  * Load balancing policies are determined on a per-vHCI basis and are
177  * configurable via the vHCI's driver.conf file.
178  */
typedef enum {
	LOAD_BALANCE_NONE = 0,		/* alternate pathing */
	LOAD_BALANCE_RR = 1,		/* round robin */
	LOAD_BALANCE_LBA = 2		/* logical block addressing */
} client_lb_t;
184 
/*
 * Load balancing arguments.  NOTE(review): the unit of region_size is not
 * stated in this header; LOAD_BALANCE_DEFAULT_REGION_SIZE is its default.
 */
typedef struct {
	int	region_size;
} client_lb_args_t;
188 
189 /*
190  * MDI client load balancing property name/value string definitions
191  */
/* String objects are defined outside this header. */
extern const char			*mdi_load_balance;
extern const char			*mdi_load_balance_none;
extern const char			*mdi_load_balance_ap;
extern const char			*mdi_load_balance_rr;
extern const char			*mdi_load_balance_lba;

/*
 * 'char *' views of the strings above (the casts drop the const qualifier).
 * Each expansion is fully parenthesized so that the cast applies to the
 * pointer itself even when the macro is followed by a postfix operator such
 * as [], which binds more tightly than a cast.
 */
#define	LOAD_BALANCE_PROP		((char *)mdi_load_balance)
#define	LOAD_BALANCE_PROP_NONE		((char *)mdi_load_balance_none)
#define	LOAD_BALANCE_PROP_AP		((char *)mdi_load_balance_ap)
#define	LOAD_BALANCE_PROP_RR		((char *)mdi_load_balance_rr)
#define	LOAD_BALANCE_PROP_LBA		((char *)mdi_load_balance_lba)

/* default for region size */
#define	LOAD_BALANCE_DEFAULT_REGION_SIZE	18
206 
207 /*
208  * vHCI drivers:
209  *
210  * vHCI drivers are pseudo nexus drivers which implement multipath services
211  * for a specific command set or bus architecture ('class').  There is a
212  * single instance of the vHCI driver for each command set which supports
213  * multipath devices.
214  *
215  * Each vHCI driver registers the following callbacks from attach(9e).
216  */
/* vHCI ops vector revision numbers */
#define	MDI_VHCI_OPS_REV_1		1

/*
 * Current revision and its string form.  These two must be changed together:
 * MDI_VHCI_OPS_REV_NAME has to spell the value of MDI_VHCI_OPS_REV.
 */
#define	MDI_VHCI_OPS_REV	MDI_VHCI_OPS_REV_1
#define	MDI_VHCI_OPS_REV_NAME	"1"
223 
224 typedef struct mdi_vhci_ops {
225 	/* revision management */
226 	int	vo_revision;
227 
228 	/* mdi_pathinfo node init callback */
229 	int	(*vo_pi_init)(dev_info_t *vdip, mdi_pathinfo_t *pip, int flags);
230 
231 	/* mdi_pathinfo node uninit callback */
232 	int	(*vo_pi_uninit)(dev_info_t *vdip, mdi_pathinfo_t *pip,
233 		    int flags);
234 
235 	/* mdi_pathinfo node state change callback */
236 	int	(*vo_pi_state_change)(dev_info_t *vdip, mdi_pathinfo_t *pip,
237 		    mdi_pathinfo_state_t state, uint32_t, int flags);
238 
239 	/* Client path failover callback */
240 	int	(*vo_failover)(dev_info_t *vdip, dev_info_t *cdip, int flags);
241 
242 	/* Client attached callback */
243 	void	(*vo_client_attached)(dev_info_t *cdip);
244 } mdi_vhci_ops_t;
245 
246 /*
247  * An mdi_vhci structure is created and bound to the devinfo node of every
248  * registered vHCI class driver; this happens when a vHCI registers itself from
249  * attach(9e).  This structure is unbound and freed when the vHCI unregisters
250  * at detach(9e) time;
251  *
252  * Each vHCI driver is associated with a vHCI class name; this is the handle
253  * used to register and unregister pHCI drivers for a given transport.
254  *
255  * Locking: Different parts of this structure are guarded by different
256  * locks: global threading of multiple vHCIs and initialization is protected
257  * by mdi_mutex, the list of pHCIs associated with a vHCI is protected by
258  * vh_phci_mutex, and Clients are protected by vh_client_mutex.
259  *
260  * XXX Depending on the context, some of the fields can be freely read without
261  * holding any locks (ex. holding vh_client_mutex lock also guarantees that
262  * the vHCI (parent) cannot be unexpectedly freed).
263  */
264 typedef struct mdi_vhci {
265 	/* protected by mdi_mutex... */
266 	struct mdi_vhci		*vh_next;	/* next vHCI link	*/
267 	struct mdi_vhci		*vh_prev;	/* prev vHCI link	*/
268 	char			*vh_class;	/* vHCI class name	*/
269 	dev_info_t		*vh_dip;	/* vHCI devi handle	*/
270 	int			vh_refcnt;	/* vHCI reference count	*/
271 	struct mdi_vhci_config	*vh_config;	/* vHCI config		*/
272 	client_lb_t		vh_lb;		/* vHCI load-balancing	*/
273 	struct mdi_vhci_ops	*vh_ops;	/* vHCI callback vectors */
274 
275 	/* protected by MDI_VHCI_PHCI_LOCK vh_phci_mutex... */
276 	kmutex_t		vh_phci_mutex;	/* pHCI mutex		*/
277 	int			vh_phci_count;	/* pHCI device count	*/
278 	struct mdi_phci		*vh_phci_head;	/* pHCI list head	*/
279 	struct mdi_phci		*vh_phci_tail;	/* pHCI list tail	*/
280 
281 	/* protected by MDI_VHCI_CLIENT_LOCK vh_client_mutex... */
282 	kmutex_t		vh_client_mutex; /* Client mutex	*/
283 	int			vh_client_count; /* Client count	*/
284 	struct client_hash	*vh_client_table; /* Client hash	*/
285 } mdi_vhci_t;
286 
287 /*
288  * per-vHCI lock macros
289  */
/* vh_phci_mutex: guards the list of pHCIs registered with the vHCI */
#define	MDI_VHCI_PHCI_LOCK(vh)		mutex_enter(&(vh)->vh_phci_mutex)
#define	MDI_VHCI_PHCI_TRYLOCK(vh)	mutex_tryenter(&(vh)->vh_phci_mutex)
#define	MDI_VHCI_PHCI_UNLOCK(vh)	mutex_exit(&(vh)->vh_phci_mutex)
#ifdef	DEBUG
#define	MDI_VHCI_PHCI_LOCKED(vh)	MUTEX_HELD(&(vh)->vh_phci_mutex)
/*
 * Historical misspelling ("PCHI") of MDI_VHCI_PHCI_LOCKED; kept as an alias
 * so that code written against the old name continues to build.
 */
#define	MDI_VHCI_PCHI_LOCKED(vh)	MDI_VHCI_PHCI_LOCKED(vh)
#endif	/* DEBUG */

/* vh_client_mutex: guards the clients associated with the vHCI */
#define	MDI_VHCI_CLIENT_LOCK(vh)	mutex_enter(&(vh)->vh_client_mutex)
#define	MDI_VHCI_CLIENT_TRYLOCK(vh)	mutex_tryenter(&(vh)->vh_client_mutex)
#define	MDI_VHCI_CLIENT_UNLOCK(vh)	mutex_exit(&(vh)->vh_client_mutex)
#ifdef	DEBUG
#define	MDI_VHCI_CLIENT_LOCKED(vh)	MUTEX_HELD(&(vh)->vh_client_mutex)
#endif	/* DEBUG */
302 
303 
304 /*
305  * GUID Hash definitions
306  *
307  * Since all the mpxio managed devices for a given class are enumerated under
308  * the single vHCI instance for that class, sequentially walking through the
309  * client device link to find a client would be prohibitively slow.
310  */
311 
/* Size of the client GUID hash table */
#define	CLIENT_HASH_TABLE_SIZE	(32)

/*
 * One entry of the client hash table: the head of a list of clients plus a
 * count for that entry.
 */
struct client_hash {
	struct mdi_client	*ct_hash_head;	/* first client */
	int			ct_hash_count;	/* client count */
};
321 
322 
323 /*
324  * pHCI Drivers:
325  *
326  * Physical HBA drivers provide transport services for mpxio-managed devices.
327  * As each pHCI instance is attached, it must register itself with the mpxio
328  * framework using mdi_phci_register().  When the pHCI is detached it must
329  * similarly call mdi_phci_unregister().
330  *
331  * The framework maintains a list of registered pHCI device instances for each
332  * vHCI.  This list involves (vh_phci_count, vh_phci_head, vh_phci_tail) and
333  * (ph_next, ph_prev, ph_vhci) and is protected by vh_phci_mutex.
334  *
335  * Locking order:
336  *
337  * _NOTE(LOCK_ORDER(mdi_mutex, mdi_phci::ph_mutex))		XXX
338  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex devinfo_tree_lock))		XXX
339  */
340 typedef struct mdi_phci {
341 	/* protected by MDI_VHCI_PHCI_LOCK vh_phci_mutex... */
342 	struct mdi_phci		*ph_next;	/* next pHCI link	*/
343 	struct mdi_phci		*ph_prev;	/* prev pHCI link	*/
344 	dev_info_t		*ph_dip;	/* pHCI devi handle	*/
345 	struct mdi_vhci 	*ph_vhci;	/* pHCI back ref. to vHCI */
346 
347 	/* protected by MDI_PHCI_LOCK ph_mutex... */
348 	kmutex_t		ph_mutex;	/* per-pHCI mutex	*/
349 	int			ph_path_count;	/* pi count		*/
350 	mdi_pathinfo_t		*ph_path_head;	/* pi list head		*/
351 	mdi_pathinfo_t		*ph_path_tail;	/* pi list tail		*/
352 	int			ph_flags;	/* pHCI operation flags	*/
353 	int			ph_unstable;	/* Paths in transient state */
354 	kcondvar_t		ph_unstable_cv;	/* Paths in transient state */
355 
356 	/* protected by mdi_phci_[gs]et_vhci_private caller... */
357 	void			*ph_vprivate;	/* vHCI driver private	*/
358 } mdi_phci_t;
359 
360 /*
361  * A pHCI device is 'unstable' while one or more paths are in a transitional
362  * state.  Hotplugging is prevented during this state.
363  */
/*
 * MDI_PHCI_UNSTABLE(ph) bumps the count of transitional paths;
 * MDI_PHCI_STABLE(ph) drops it and, when it reaches zero, broadcasts on
 * ph_unstable_cv.  ph_unstable is listed among the fields protected by
 * ph_mutex.
 *
 * Both macros expand to exactly one statement once the caller supplies the
 * terminating ';', so they can be used as the unbraced body of an if/else.
 */
#define	MDI_PHCI_UNSTABLE(ph)		((ph)->ph_unstable++)
#define	MDI_PHCI_STABLE(ph)						\
	do {								\
		(ph)->ph_unstable--;					\
		if ((ph)->ph_unstable == 0) {				\
			cv_broadcast(&(ph)->ph_unstable_cv);		\
		}							\
	_NOTE(CONSTCOND) } while (0)
371 
372 /*
373  * per-pHCI lock macros
374  */
/* ph_mutex: guards the pHCI's pathinfo list and per-pHCI fields */
#define	MDI_PHCI_LOCK(ph)		mutex_enter(&(ph)->ph_mutex)
#define	MDI_PHCI_TRYLOCK(ph)		mutex_tryenter(&(ph)->ph_mutex)
#define	MDI_PHCI_UNLOCK(ph)		mutex_exit(&(ph)->ph_mutex)
#ifdef	DEBUG
/*
 * The parameter must be named 'ph' to match the expansion; otherwise the
 * argument is ignored and the macro silently tests whatever object happens to
 * be called 'ph' at the point of use.
 */
#define	MDI_PHCI_LOCKED(ph)		MUTEX_HELD(&(ph)->ph_mutex)
#endif	/* DEBUG */
381 
382 /*
383  * pHCI state definitions and macros to track the pHCI driver instance state
384  */
/* Bits kept in ph_flags */
#define	MDI_PHCI_FLAGS_OFFLINE		0x1	/* pHCI is offline */
#define	MDI_PHCI_FLAGS_SUSPEND		0x2	/* pHCI is suspended */
#define	MDI_PHCI_FLAGS_POWER_DOWN	0x4	/* pHCI is power down */
#define	MDI_PHCI_FLAGS_DETACH		0x8	/* pHCI is detached */
#define	MDI_PHCI_FLAGS_USER_DISABLE	0x10	/* pHCI is disabled,user */
#define	MDI_PHCI_FLAGS_D_DISABLE	0x20	/* pHCI is disabled,driver */
#define	MDI_PHCI_FLAGS_D_DISABLE_TRANS	0x40	/* pHCI is disabled,transient */
#define	MDI_PHCI_FLAGS_POWER_TRANSITION	0x80	/* pHCI is power transition */

/* Any of the three "disabled" bits */
#define	MDI_PHCI_DISABLE_MASK						\
	    (MDI_PHCI_FLAGS_USER_DISABLE | MDI_PHCI_FLAGS_D_DISABLE |	\
	    MDI_PHCI_FLAGS_D_DISABLE_TRANS)

/*
 * Helpers behind the SET/CLEAR macros below.  Each asserts that ph_mutex is
 * held (the ASSERT argument is MDI_PHCI_LOCKED(ph)) and then sets or clears
 * the given bit(s) in ph_flags.  They are written as do/while(0) so that every
 * macro built on them expands to exactly one statement once the caller adds
 * the terminating ';', which makes them safe as an unbraced if/else body.
 */
#define	MDI_PHCI_FLAG_SET(ph, flag)					\
	do {								\
		ASSERT(MDI_PHCI_LOCKED(ph));				\
		(ph)->ph_flags |= (flag);				\
	_NOTE(CONSTCOND) } while (0)
#define	MDI_PHCI_FLAG_CLEAR(ph, flag)					\
	do {								\
		ASSERT(MDI_PHCI_LOCKED(ph));				\
		(ph)->ph_flags &= ~(flag);				\
	_NOTE(CONSTCOND) } while (0)

/* True when none of the disable bits is set */
#define	MDI_PHCI_IS_READY(ph)						\
	    (((ph)->ph_flags & MDI_PHCI_DISABLE_MASK) == 0)

/*
 * The MDI_PHCI_IS_* tests below yield the masked flag bits (non-zero when
 * set), not a normalized 0/1 value.
 */
#define	MDI_PHCI_SET_OFFLINE(ph)					\
	    MDI_PHCI_FLAG_SET(ph, MDI_PHCI_FLAGS_OFFLINE)
#define	MDI_PHCI_SET_ONLINE(ph)						\
	    MDI_PHCI_FLAG_CLEAR(ph, MDI_PHCI_FLAGS_OFFLINE)
#define	MDI_PHCI_IS_OFFLINE(ph)						\
	    ((ph)->ph_flags & MDI_PHCI_FLAGS_OFFLINE)

#define	MDI_PHCI_SET_SUSPEND(ph)					\
	    MDI_PHCI_FLAG_SET(ph, MDI_PHCI_FLAGS_SUSPEND)
#define	MDI_PHCI_SET_RESUME(ph)						\
	    MDI_PHCI_FLAG_CLEAR(ph, MDI_PHCI_FLAGS_SUSPEND)
#define	MDI_PHCI_IS_SUSPENDED(ph)					\
	    ((ph)->ph_flags & MDI_PHCI_FLAGS_SUSPEND)

#define	MDI_PHCI_SET_DETACH(ph)						\
	    MDI_PHCI_FLAG_SET(ph, MDI_PHCI_FLAGS_DETACH)
#define	MDI_PHCI_SET_ATTACH(ph)						\
	    MDI_PHCI_FLAG_CLEAR(ph, MDI_PHCI_FLAGS_DETACH)

#define	MDI_PHCI_SET_POWER_DOWN(ph)					\
	    MDI_PHCI_FLAG_SET(ph, MDI_PHCI_FLAGS_POWER_DOWN)
#define	MDI_PHCI_SET_POWER_UP(ph)					\
	    MDI_PHCI_FLAG_CLEAR(ph, MDI_PHCI_FLAGS_POWER_DOWN)
#define	MDI_PHCI_IS_POWERED_DOWN(ph)					\
	    ((ph)->ph_flags & MDI_PHCI_FLAGS_POWER_DOWN)

#define	MDI_PHCI_SET_USER_ENABLE(ph)					\
	    MDI_PHCI_FLAG_CLEAR(ph, MDI_PHCI_FLAGS_USER_DISABLE)
#define	MDI_PHCI_SET_USER_DISABLE(ph)					\
	    MDI_PHCI_FLAG_SET(ph, MDI_PHCI_FLAGS_USER_DISABLE)
#define	MDI_PHCI_IS_USER_DISABLED(ph)					\
	    ((ph)->ph_flags & MDI_PHCI_FLAGS_USER_DISABLE)

#define	MDI_PHCI_SET_DRV_ENABLE(ph)					\
	    MDI_PHCI_FLAG_CLEAR(ph, MDI_PHCI_FLAGS_D_DISABLE)
#define	MDI_PHCI_SET_DRV_DISABLE(ph)					\
	    MDI_PHCI_FLAG_SET(ph, MDI_PHCI_FLAGS_D_DISABLE)
#define	MDI_PHCI_IS_DRV_DISABLED(ph)					\
	    ((ph)->ph_flags & MDI_PHCI_FLAGS_D_DISABLE)

#define	MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph)				\
	    MDI_PHCI_FLAG_CLEAR(ph, MDI_PHCI_FLAGS_D_DISABLE_TRANS)
#define	MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph)				\
	    MDI_PHCI_FLAG_SET(ph, MDI_PHCI_FLAGS_D_DISABLE_TRANS)
#define	MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph)				\
	    ((ph)->ph_flags & MDI_PHCI_FLAGS_D_DISABLE_TRANS)

#define	MDI_PHCI_SET_POWER_TRANSITION(ph)				\
	    MDI_PHCI_FLAG_SET(ph, MDI_PHCI_FLAGS_POWER_TRANSITION)
#define	MDI_PHCI_CLEAR_POWER_TRANSITION(ph)				\
	    MDI_PHCI_FLAG_CLEAR(ph, MDI_PHCI_FLAGS_POWER_TRANSITION)
#define	MDI_PHCI_IS_POWER_TRANSITION(ph)				\
	    ((ph)->ph_flags & MDI_PHCI_FLAGS_POWER_TRANSITION)
470 
471 /*
472  * mpxio Managed Clients:
473  *
474  * This framework creates a struct mdi_client for every client device created
475  * by the framework as a result of self-enumeration of target devices by the
476  * registered pHCI devices.  This structure is bound to client device dev_info
477  * node at the time of client device allocation (ndi_devi_alloc(9e)). This
478  * structure is unbound from the dev_info node when mpxio framework removes a
479  * client device node from the system.
480  *
481  * This structure is created when a first path is enumerated and removed when
482  * last path is de-enumerated from the system.
483  *
484  * Multipath client devices are instantiated as children of corresponding vHCI
485  * driver instance. Each client device is uniquely identified by a GUID
486  * provided by target device itself.  The parent vHCI device also maintains a
487  * hashed list of client devices, protected by vh_client_mutex.
488  *
489  * Typically pHCI devices self-enumerate their child devices using taskq,
490  * resulting in multiple paths to the same client device to be enumerated by
491  * competing threads.
492  *
493  * Currently this framework supports three kinds of load-balancing policy
494  * (see client_lb_t), configurable through the vHCI driver configuration files.
495  *
496  * NONE		- Legacy AP mode
497  * Round Robin	- Balance the pHCI load in a Round Robin fashion.
 * LBA		- Logical Block Addressing
498  *
499  * This framework identifies the client device in three distinct states:
500  *
501  * OPTIMAL	- Client device has at least one redundant path.
502  * DEGRADED	- No redundant paths (critical).  Failure in the current active
503  *		  path would result in data access failures.
504  * FAILED 	- No paths are available to access this device.
505  *
506  * Locking order:
507  *
508  * _NOTE(LOCK_ORDER(mdi_mutex, mdi_client::ct_mutex))			XXX
509  * _NOTE(LOCK_ORDER(mdi_client::ct_mutex devinfo_tree_lock))		XXX
510  */
511 typedef struct mdi_client {
512 	/* protected by MDI_VHCI_CLIENT_LOCK vh_client_mutex... */
513 	struct mdi_client	*ct_hnext;	/* next client		*/
514 	struct mdi_client	*ct_hprev;	/* prev client		*/
515 	dev_info_t		*ct_dip;	/* client devi handle	*/
516 	struct mdi_vhci		*ct_vhci;	/* vHCI back ref	*/
517 	char			*ct_drvname;	/* client driver name	*/
518 	char			*ct_guid;	/* client guid		*/
519 	client_lb_t		ct_lb;		/* load balancing scheme */
520 	client_lb_args_t	*ct_lb_args; 	/* load balancing args */
521 
522 
523 	/* protected by MDI_CLIENT_LOCK ct_mutex... */
524 	kmutex_t		ct_mutex;	/* per-client mutex	*/
525 	int			ct_path_count;	/* multi path count	*/
526 	mdi_pathinfo_t		*ct_path_head;	/* multi path list head	*/
527 	mdi_pathinfo_t		*ct_path_tail;	/* multi path list tail	*/
528 	mdi_pathinfo_t		*ct_path_last;	/* last path used for i/o */
529 	int			ct_state;	/* state information	*/
530 	int			ct_flags;	/* Driver op. flags	*/
531 	int			ct_failover_flags;	/* Failover args */
532 	int			ct_failover_status;	/* last fo status */
533 	kcondvar_t		ct_failover_cv;	/* Failover status cv	*/
534 	int			ct_unstable;	/* Paths in transient state */
535 	kcondvar_t		ct_unstable_cv;	/* Paths in transient state */
536 
537 	int			ct_power_cnt;	/* Hold count on parent power */
538 	kcondvar_t		ct_powerchange_cv;
539 					/* Paths in power transient state */
540 	short			ct_powercnt_config;
541 					/* held in pre/post config */
542 	short			ct_powercnt_unconfig;
543 					/* held in pre/post unconfig */
544 	int			ct_powercnt_reset;
545 					/* ct_power_cnt was resetted */
546 
547 	void			*ct_cprivate;	/* client driver private */
548 	void			*ct_vprivate;	/* vHCI driver private	*/
549 } mdi_client_t;
550 
551 /*
552  * per-Client device locking definitions
553  */
/* ct_mutex: guards the client's path list and per-client fields */
#define	MDI_CLIENT_LOCK(ct)		mutex_enter(&(ct)->ct_mutex)
#define	MDI_CLIENT_TRYLOCK(ct)		mutex_tryenter(&(ct)->ct_mutex)
#define	MDI_CLIENT_UNLOCK(ct)		mutex_exit(&(ct)->ct_mutex)
#ifdef	DEBUG
/* lock-held predicate, available in DEBUG builds only */
#define	MDI_CLIENT_LOCKED(ct)		MUTEX_HELD(&(ct)->ct_mutex)
#endif	/* DEBUG */
560 
561 /*
562  * A Client device is unstable while one or more paths are in transitional
563  * state.  We do not allow failover to take place while paths are in transient
564  * state. Similarly we do not allow state transition while client device
565  * failover is in progress.
566  */
/*
 * MDI_CLIENT_UNSTABLE(ct) bumps the count of transitional paths;
 * MDI_CLIENT_STABLE(ct) drops it and, when it reaches zero, broadcasts on
 * ct_unstable_cv.  ct_unstable is listed among the fields protected by
 * ct_mutex.
 *
 * Both macros expand to exactly one statement once the caller supplies the
 * terminating ';', so they can be used as the unbraced body of an if/else.
 */
#define	MDI_CLIENT_UNSTABLE(ct)		((ct)->ct_unstable++)
#define	MDI_CLIENT_STABLE(ct)						\
	do {								\
		(ct)->ct_unstable--;					\
		if ((ct)->ct_unstable == 0) {				\
			cv_broadcast(&(ct)->ct_unstable_cv);		\
		}							\
	_NOTE(CONSTCOND) } while (0)
574 
575 /*
576  * Client driver instance state definitions:
577  */
/* Bits kept in ct_flags */
#define	MDI_CLIENT_FLAGS_OFFLINE		0x00000001
#define	MDI_CLIENT_FLAGS_SUSPEND		0x00000002
#define	MDI_CLIENT_FLAGS_POWER_DOWN		0x00000004
#define	MDI_CLIENT_FLAGS_DETACH			0x00000008
#define	MDI_CLIENT_FLAGS_FAILOVER		0x00000010
#define	MDI_CLIENT_FLAGS_REPORT_DEV		0x00000020
#define	MDI_CLIENT_FLAGS_PATH_FREE_IN_PROGRESS	0x00000040
#define	MDI_CLIENT_FLAGS_ASYNC_FREE		0x00000080
#define	MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED	0x00000100
#define	MDI_CLIENT_FLAGS_POWER_TRANSITION	0x00000200

/*
 * Helpers behind the SET/CLEAR macros below.  Each asserts that ct_mutex is
 * held (the ASSERT argument is MDI_CLIENT_LOCKED(ct)) and then sets or clears
 * the given bit(s) in ct_flags.  They are written as do/while(0) so that every
 * macro built on them expands to exactly one statement once the caller adds
 * the terminating ';', which makes them safe as an unbraced if/else body.
 */
#define	MDI_CLIENT_FLAG_SET(ct, flag)					\
	do {								\
		ASSERT(MDI_CLIENT_LOCKED(ct));				\
		(ct)->ct_flags |= (flag);				\
	_NOTE(CONSTCOND) } while (0)
#define	MDI_CLIENT_FLAG_CLEAR(ct, flag)					\
	do {								\
		ASSERT(MDI_CLIENT_LOCKED(ct));				\
		(ct)->ct_flags &= ~(flag);				\
	_NOTE(CONSTCOND) } while (0)

/*
 * The MDI_CLIENT_IS_* tests below yield the masked flag bits (non-zero when
 * set), not a normalized 0/1 value.
 */
#define	MDI_CLIENT_SET_OFFLINE(ct)					\
	    MDI_CLIENT_FLAG_SET(ct, MDI_CLIENT_FLAGS_OFFLINE)
#define	MDI_CLIENT_SET_ONLINE(ct)					\
	    MDI_CLIENT_FLAG_CLEAR(ct, MDI_CLIENT_FLAGS_OFFLINE)
#define	MDI_CLIENT_IS_OFFLINE(ct) \
	    ((ct)->ct_flags & MDI_CLIENT_FLAGS_OFFLINE)

#define	MDI_CLIENT_SET_SUSPEND(ct)					\
	    MDI_CLIENT_FLAG_SET(ct, MDI_CLIENT_FLAGS_SUSPEND)
#define	MDI_CLIENT_SET_RESUME(ct)					\
	    MDI_CLIENT_FLAG_CLEAR(ct, MDI_CLIENT_FLAGS_SUSPEND)
#define	MDI_CLIENT_IS_SUSPENDED(ct) \
	    ((ct)->ct_flags & MDI_CLIENT_FLAGS_SUSPEND)

#define	MDI_CLIENT_SET_POWER_DOWN(ct)					\
	    MDI_CLIENT_FLAG_SET(ct, MDI_CLIENT_FLAGS_POWER_DOWN)
#define	MDI_CLIENT_SET_POWER_UP(ct)					\
	    MDI_CLIENT_FLAG_CLEAR(ct, MDI_CLIENT_FLAGS_POWER_DOWN)
#define	MDI_CLIENT_IS_POWERED_DOWN(ct) \
	    ((ct)->ct_flags & MDI_CLIENT_FLAGS_POWER_DOWN)

#define	MDI_CLIENT_SET_POWER_TRANSITION(ct)				\
	    MDI_CLIENT_FLAG_SET(ct, MDI_CLIENT_FLAGS_POWER_TRANSITION)
#define	MDI_CLIENT_CLEAR_POWER_TRANSITION(ct)				\
	    MDI_CLIENT_FLAG_CLEAR(ct, MDI_CLIENT_FLAGS_POWER_TRANSITION)
#define	MDI_CLIENT_IS_POWER_TRANSITION(ct) \
	    ((ct)->ct_flags & MDI_CLIENT_FLAGS_POWER_TRANSITION)

#define	MDI_CLIENT_SET_DETACH(ct)					\
	    MDI_CLIENT_FLAG_SET(ct, MDI_CLIENT_FLAGS_DETACH)
#define	MDI_CLIENT_SET_ATTACH(ct)					\
	    MDI_CLIENT_FLAG_CLEAR(ct, MDI_CLIENT_FLAGS_DETACH)
#define	MDI_CLIENT_IS_DETACHED(ct) \
	    ((ct)->ct_flags & MDI_CLIENT_FLAGS_DETACH)

#define	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct)				\
	    MDI_CLIENT_FLAG_SET(ct, MDI_CLIENT_FLAGS_FAILOVER)
#define	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct)			\
	    MDI_CLIENT_FLAG_CLEAR(ct, MDI_CLIENT_FLAGS_FAILOVER)
#define	MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct) \
	    ((ct)->ct_flags & MDI_CLIENT_FLAGS_FAILOVER)

#define	MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct)				\
	    MDI_CLIENT_FLAG_SET(ct, MDI_CLIENT_FLAGS_REPORT_DEV)
#define	MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct)				\
	    MDI_CLIENT_FLAG_CLEAR(ct, MDI_CLIENT_FLAGS_REPORT_DEV)
#define	MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) \
	    ((ct)->ct_flags & MDI_CLIENT_FLAGS_REPORT_DEV)

#define	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct)			\
	    MDI_CLIENT_FLAG_SET(ct, MDI_CLIENT_FLAGS_PATH_FREE_IN_PROGRESS)
#define	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct)			\
	    MDI_CLIENT_FLAG_CLEAR(ct, MDI_CLIENT_FLAGS_PATH_FREE_IN_PROGRESS)
#define	MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct) \
	    ((ct)->ct_flags & MDI_CLIENT_FLAGS_PATH_FREE_IN_PROGRESS)

#define	MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct)				\
	    MDI_CLIENT_FLAG_SET(ct, MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)
#define	MDI_CLIENT_IS_DEV_NOT_SUPPORTED(ct) \
	    ((ct)->ct_flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)
666 
667 /*
668  * Client operating states.
669  */
/* Values stored in ct_state */
#define	MDI_CLIENT_STATE_OPTIMAL	1
#define	MDI_CLIENT_STATE_DEGRADED	2
#define	MDI_CLIENT_STATE_FAILED		3

/* Read ct_state / store a new value into it (the store yields that value) */
#define	MDI_CLIENT_STATE(ct) ((ct)->ct_state)
#define	MDI_CLIENT_SET_STATE(ct, state) ((ct)->ct_state = (state))

/* True when ct_state is MDI_CLIENT_STATE_FAILED */
#define	MDI_CLIENT_IS_FAILED(ct) \
	    ((ct)->ct_state == MDI_CLIENT_STATE_FAILED)
679 
680 /*
681  * mdi_pathinfo nodes:
682  *
683  * From this framework's perspective, a 'path' is a tuple consisting of a
684  * client or end device, a host controller which provides device
685  * identification and transport services (pHCI), and bus specific unit
686  * addressing information.  A path may be decorated with properties which
687  * describe the capabilities of the path; such properties are analogous to
688  * device node and minor node properties.
689  *
690  * The framework maintains a linked list of mdi_pathinfo nodes created by every
691  * pHCI driver instance via the pi_phci_link linkage; this is used (for example)
692  * to make sure that all relevant pathinfo nodes are freed before the pHCI
693  * is unregistered.
694  *
695  * Locking order:
696  *
697  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))		XXX
698  * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))	XXX
699  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))		XXX
700  * _NOTE(LOCK_ORDER(devinfo_tree_lock mdi_pathinfo::pi_mutex))		XXX
701  *
702  * mdi_pathinfo node structure definition
703  */
704 struct mdi_pathinfo {
705 	/* protected by MDI_PHCI_LOCK ph_mutex... */
706 	struct mdi_pathinfo	*pi_phci_link;	 /* next path in phci list */
707 	mdi_phci_t		*pi_phci;	/* pHCI dev_info node	*/
708 
709 	/* protected by MDI_CLIENT_LOCK ct_mutex... */
710 	struct mdi_pathinfo	*pi_client_link; /* next path in client list */
711 	mdi_client_t		*pi_client;	/* client		*/
712 
713 	/* protected by MDI_VHCI_CLIENT_LOCK vh_client_mutex... */
714 	char			*pi_addr;	/* path unit address	*/
715 	int			pi_path_instance; /* path instance */
716 
717 	/* protected by MDI_PI_LOCK pi_mutex... */
718 	kmutex_t		pi_mutex;	/* per path mutex	*/
719 	mdi_pathinfo_state_t	pi_state;	/* path state		*/
720 	mdi_pathinfo_state_t	pi_old_state;	/* path state		*/
721 	kcondvar_t		pi_state_cv;	/* path state condvar	*/
722 	nvlist_t		*pi_prop;	/* Properties		*/
723 	void			*pi_cprivate;	/* client private info	*/
724 	void			*pi_pprivate;	/* phci private info	*/
725 	int			pi_ref_cnt;	/* pi reference count	*/
726 	kcondvar_t		pi_ref_cv;	/* condition variable	*/
727 	struct mdi_pi_kstats	*pi_kstats;	/* aggregate kstats */
728 	int			pi_pm_held;	/* phci's kidsup incremented */
729 	int			pi_preferred;	/* Preferred path 	*/
730 	void			*pi_vprivate;	/* vhci private info	*/
731 };
732 
733 /*
734  * pathinfo statistics:
735  *
736  * The mpxio architecture allows for multiple pathinfo nodes for each
737  * client-pHCI combination.  For statistics purposes, these statistics are
738  * aggregated into a single client-pHCI set of kstats.
739  */
740 struct mdi_pi_kstats {
741 	int	pi_kstat_ref;		/* # paths aggregated, also a ref cnt */
742 	kstat_t	*pi_kstat_iostats;	/* mdi:iopath statistic set */
743 	kstat_t *pi_kstat_errstats;	/* error stats, data: struct pi_errs */
744 };
745 
746 /*
747  * pathinfo error kstat data; one counter per MDI_PI_ERRSTAT error code
748  */
749 struct pi_errs {
750 	struct kstat_named pi_softerrs;		/* "Soft" Error */
751 	struct kstat_named pi_harderrs;		/* "Hard" Error */
752 	struct kstat_named pi_transerrs;	/* Transport Errors */
753 	struct kstat_named pi_icnt_busy;	/* Interconnect Busy */
754 	struct kstat_named pi_icnt_errors;	/* Interconnect Errors */
755 	struct kstat_named pi_phci_rsrc;	/* pHCI No Resources */
756 	struct kstat_named pi_phci_localerr;	/* pHCI Local Errors */
757 	struct kstat_named pi_phci_invstate;	/* pHCI Invalid State */
758 	struct kstat_named pi_failedfrom;	/* Failover: Failed From */
759 	struct kstat_named pi_failedto;		/* Failover: Failed To */
760 };
761 
762 /*
763  * Increment error counter 'x' in pip's error kstat; no-op if pi_kstats is NULL.
764  */
765 #define	MDI_PI_ERRSTAT(pip, x) { \
766 	if (MDI_PI((pip))->pi_kstats != NULL) { \
767 		struct pi_errs *pep; \
768 		pep = MDI_PI(pip)->pi_kstats->pi_kstat_errstats->ks_data; \
769 		pep->x.value.ui32++; /* x names a struct pi_errs member */ \
770 	} \
771 }
772 
773 /*
774  * error codes for MDI_PI_ERRSTAT's 'x' argument (struct pi_errs member names)
775  */
776 #define	MDI_PI_SOFTERR	pi_softerrs
777 #define	MDI_PI_HARDERR	pi_harderrs
778 #define	MDI_PI_TRANSERR	pi_transerrs
779 #define	MDI_PI_ICNTBUSY	pi_icnt_busy
780 #define	MDI_PI_ICNTERR	pi_icnt_errors
781 #define	MDI_PI_PHCIRSRC	pi_phci_rsrc
782 #define	MDI_PI_PHCILOCL	pi_phci_localerr
783 #define	MDI_PI_PHCIINVS	pi_phci_invstate
784 #define	MDI_PI_FAILFROM	pi_failedfrom
785 #define	MDI_PI_FAILTO	pi_failedto
786 
787 #define	MDI_PI(type)			((struct mdi_pathinfo *)(type))
788 
789 #define	MDI_PI_LOCK(pip)		mutex_enter(&MDI_PI(pip)->pi_mutex)
790 #define	MDI_PI_TRYLOCK(pip)		mutex_tryenter(&MDI_PI(pip)->pi_mutex)
791 #define	MDI_PI_UNLOCK(pip)		mutex_exit(&MDI_PI(pip)->pi_mutex)
792 #ifdef	DEBUG	/* MDI_PI_LOCKED exists on DEBUG only; used in ASSERT()s below */
793 #define	MDI_PI_LOCKED(pip)		MUTEX_HELD(&MDI_PI(pip)->pi_mutex)
794 #endif	/* DEBUG */
795 /* plain ++/--: pi_ref_cnt is a pi_mutex-protected field (see the struct) */
796 #define	MDI_PI_HOLD(pip)		(++MDI_PI(pip)->pi_ref_cnt)
797 #define	MDI_PI_RELE(pip)		(--MDI_PI(pip)->pi_ref_cnt)
798 
799 #define	MDI_EXT_STATE_CHANGE		0x10000000
800 
801 
802 #define	MDI_DISABLE_OP			0x1
803 #define	MDI_ENABLE_OP			0x2
804 #define	MDI_BEFORE_STATE_CHANGE		0x4
805 #define	MDI_AFTER_STATE_CHANGE		0x8
806 #define	MDI_SYNC_FLAG			0x10
807 /* accessors: base state vs. extended state bits of pi_state/pi_old_state */
808 #define	MDI_PI_STATE(pip)						\
809 	(MDI_PI((pip))->pi_state & MDI_PATHINFO_STATE_MASK)
810 #define	MDI_PI_OLD_STATE(pip)						\
811 	(MDI_PI((pip))->pi_old_state & MDI_PATHINFO_STATE_MASK)
812 
813 #define	MDI_PI_EXT_STATE(pip)						\
814 	(MDI_PI((pip))->pi_state & MDI_PATHINFO_EXT_STATE_MASK)
815 #define	MDI_PI_OLD_EXT_STATE(pip)					\
816 	(MDI_PI((pip))->pi_old_state & MDI_PATHINFO_EXT_STATE_MASK)
817 
818 #define	MDI_PI_SET_TRANSIENT(pip)					{\
819 	ASSERT(MDI_PI_LOCKED(pip));	/* caller holds pi_mutex */	\
820 	MDI_PI(pip)->pi_state |= MDI_PATHINFO_STATE_TRANSIENT;		}
821 #define	MDI_PI_CLEAR_TRANSIENT(pip)					{\
822 	ASSERT(MDI_PI_LOCKED(pip));	/* caller holds pi_mutex */	\
823 	MDI_PI(pip)->pi_state &= ~MDI_PATHINFO_STATE_TRANSIENT;		}
824 #define	MDI_PI_IS_TRANSIENT(pip) \
825 	(MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_TRANSIENT)
826 
827 #define	MDI_PI_SET_USER_DISABLE(pip)					{\
828 	ASSERT(MDI_PI_LOCKED(pip));	/* caller holds pi_mutex */	\
829 	MDI_PI(pip)->pi_state |= MDI_PATHINFO_STATE_USER_DISABLE;	}
830 #define	MDI_PI_SET_DRV_DISABLE(pip)					{\
831 	ASSERT(MDI_PI_LOCKED(pip));					\
832 	MDI_PI(pip)->pi_state |= MDI_PATHINFO_STATE_DRV_DISABLE;	}
833 #define	MDI_PI_SET_DRV_DISABLE_TRANS(pip)				{\
834 	ASSERT(MDI_PI_LOCKED(pip));					\
835 	MDI_PI(pip)->pi_state |= MDI_PATHINFO_STATE_DRV_DISABLE_TRANSIENT; }
836 
837 #define	MDI_PI_SET_USER_ENABLE(pip)					{\
838 	ASSERT(MDI_PI_LOCKED(pip));	/* "enable" clears the bit */	\
839 	MDI_PI(pip)->pi_state &= ~MDI_PATHINFO_STATE_USER_DISABLE;	}
840 #define	MDI_PI_SET_DRV_ENABLE(pip)					{\
841 	ASSERT(MDI_PI_LOCKED(pip));					\
842 	MDI_PI(pip)->pi_state &= ~MDI_PATHINFO_STATE_DRV_DISABLE;	}
843 #define	MDI_PI_SET_DRV_ENABLE_TRANS(pip)				{\
844 	ASSERT(MDI_PI_LOCKED(pip));					\
845 	MDI_PI(pip)->pi_state &= ~MDI_PATHINFO_STATE_DRV_DISABLE_TRANSIENT; }
846 
847 #define	MDI_PI_IS_USER_DISABLE(pip)					\
848 	(MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_USER_DISABLE)
849 #define	MDI_PI_IS_DRV_DISABLE(pip)					\
850 	(MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_DRV_DISABLE)
851 #define	MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip)				\
852 	(MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_DRV_DISABLE_TRANSIENT)
853 
854 #define	MDI_PI_IS_DISABLE(pip)	/* any of the three disable bits */	\
855 	(MDI_PI_IS_USER_DISABLE(pip) ||					\
856 	MDI_PI_IS_DRV_DISABLE(pip) ||					\
857 	MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip))
858 
859 #define	MDI_PI_IS_INIT(pip)						\
860 	((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK) ==		\
861 		MDI_PATHINFO_STATE_INIT)
862 
863 #define	MDI_PI_IS_INITING(pip)	/* INIT with TRANSIENT set */		\
864 	((MDI_PI(pip)->pi_state & ~MDI_PATHINFO_EXT_STATE_MASK) ==	\
865 		(MDI_PATHINFO_STATE_INIT | MDI_PATHINFO_STATE_TRANSIENT))
866 
867 #define	MDI_PI_SET_INIT(pip)						{\
868 	ASSERT(MDI_PI_LOCKED(pip));	/* caller holds pi_mutex */	\
869 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT; /* drops ext bits */ }
870 
871 #define	MDI_PI_SET_ONLINING(pip)					{\
872 	uint32_t	ext_state;					\
873 	ASSERT(MDI_PI_LOCKED(pip));	/* caller holds pi_mutex */	\
874 	ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK; \
875 	MDI_PI(pip)->pi_old_state = MDI_PI_STATE(pip);	/* save base */	\
876 	MDI_PI(pip)->pi_state =						\
877 	(MDI_PATHINFO_STATE_ONLINE | MDI_PATHINFO_STATE_TRANSIENT);	\
878 	MDI_PI(pip)->pi_state |= ext_state;	/* keep ext bits */	}
879 
880 #define	MDI_PI_IS_ONLINING(pip)						\
881 	((MDI_PI(pip)->pi_state & ~MDI_PATHINFO_EXT_STATE_MASK) ==	\
882 	(MDI_PATHINFO_STATE_ONLINE | MDI_PATHINFO_STATE_TRANSIENT))
883 
884 #define	MDI_PI_SET_ONLINE(pip)	/* pi_old_state is not updated */	{\
885 	uint32_t	ext_state;					\
886 	ASSERT(MDI_PI_LOCKED(pip));					\
887 	ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK; \
888 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_ONLINE;		\
889 	MDI_PI(pip)->pi_state |= ext_state;	/* keep ext bits */	}
890 
891 #define	MDI_PI_IS_ONLINE(pip)						\
892 	((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK) ==		\
893 	MDI_PATHINFO_STATE_ONLINE)
894 
895 #define	MDI_PI_SET_OFFLINING(pip)					{\
896 	uint32_t	ext_state;					\
897 	ASSERT(MDI_PI_LOCKED(pip));	/* caller holds pi_mutex */	\
898 	ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK; \
899 	MDI_PI(pip)->pi_old_state = MDI_PI_STATE(pip);	/* save base */	\
900 	MDI_PI(pip)->pi_state =						\
901 	(MDI_PATHINFO_STATE_OFFLINE | MDI_PATHINFO_STATE_TRANSIENT);	\
902 	MDI_PI(pip)->pi_state |= ext_state;	/* keep ext bits */	}
903 
904 #define	MDI_PI_IS_OFFLINING(pip)					\
905 	((MDI_PI(pip)->pi_state & ~MDI_PATHINFO_EXT_STATE_MASK) ==	\
906 	(MDI_PATHINFO_STATE_OFFLINE | MDI_PATHINFO_STATE_TRANSIENT))
907 
908 #define	MDI_PI_SET_OFFLINE(pip)	/* pi_old_state is not updated */	{\
909 	uint32_t	ext_state;					\
910 	ASSERT(MDI_PI_LOCKED(pip));					\
911 	ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK; \
912 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_OFFLINE;		\
913 	MDI_PI(pip)->pi_state |= ext_state;	/* keep ext bits */	}
914 
915 #define	MDI_PI_IS_OFFLINE(pip)						\
916 	((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK) ==		\
917 	MDI_PATHINFO_STATE_OFFLINE)
918 
919 #define	MDI_PI_SET_STANDBYING(pip)					{\
920 	uint32_t	ext_state;					\
921 	ASSERT(MDI_PI_LOCKED(pip));	/* caller holds pi_mutex */	\
922 	ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK; \
923 	MDI_PI(pip)->pi_old_state = MDI_PI_STATE(pip);	/* save base */	\
924 	MDI_PI(pip)->pi_state =						\
925 	(MDI_PATHINFO_STATE_STANDBY | MDI_PATHINFO_STATE_TRANSIENT);	\
926 	MDI_PI(pip)->pi_state |= ext_state;	/* keep ext bits */	}
927 
928 #define	MDI_PI_SET_STANDBY(pip)	/* pi_old_state is not updated */	{\
929 	uint32_t	ext_state;					\
930 	ASSERT(MDI_PI_LOCKED(pip));					\
931 	ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK; \
932 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_STANDBY;		\
933 	MDI_PI(pip)->pi_state |= ext_state;	/* keep ext bits */	}
934 
935 #define	MDI_PI_IS_STANDBY(pip)						\
936 	((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK) ==		\
937 	MDI_PATHINFO_STATE_STANDBY)
938 
939 #define	MDI_PI_SET_FAULTING(pip)					{\
940 	uint32_t	ext_state;					\
941 	ASSERT(MDI_PI_LOCKED(pip));	/* caller holds pi_mutex */	\
942 	ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK; \
943 	MDI_PI(pip)->pi_old_state = MDI_PI_STATE(pip);	/* save base */	\
944 	MDI_PI(pip)->pi_state =						\
945 	    (MDI_PATHINFO_STATE_FAULT | MDI_PATHINFO_STATE_TRANSIENT);	\
946 	MDI_PI(pip)->pi_state |= ext_state;	/* keep ext bits */	}
947 
948 #define	MDI_PI_SET_FAULT(pip)	/* pi_old_state is not updated */	{\
949 	uint32_t	ext_state;					\
950 	ASSERT(MDI_PI_LOCKED(pip));					\
951 	ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK; \
952 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_FAULT;		\
953 	MDI_PI(pip)->pi_state |= ext_state;	/* keep ext bits */	}
954 
955 #define	MDI_PI_IS_FAULT(pip)						\
956 	((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK) ==		\
957 	MDI_PATHINFO_STATE_FAULT)
958 
959 #define	MDI_PI_IS_SUSPENDED(pip)	/* tests the path's pHCI flags */ \
960 	((MDI_PI(pip))->pi_phci->ph_flags & MDI_PHCI_FLAGS_SUSPEND)
961 
962 /*
963  * mdi_vhcache_client, mdi_vhcache_pathinfo, and mdi_vhcache_phci structures
964  * hold the vhci to phci client mappings of the on-disk vhci busconfig cache.
965  */
966 
967 /* phci structure of vhci cache; entries are chained via cphci_next */
968 typedef struct mdi_vhcache_phci {
969 	char			*cphci_path;	/* phci path name */
970 	uint32_t		cphci_id;	/* used when building nvlist */
971 	mdi_phci_t		*cphci_phci;	/* pointer to actual phci */
972 	struct mdi_vhcache_phci	*cphci_next;	/* next in vhci phci list */
973 } mdi_vhcache_phci_t;
974 
975 /* pathinfo structure of vhci cache; chained per client via cpi_next */
976 typedef struct mdi_vhcache_pathinfo {
977 	char			*cpi_addr;	/* path address */
978 	mdi_vhcache_phci_t	*cpi_cphci;	/* phci the path belongs to */
979 	struct mdi_pathinfo	*cpi_pip;	/* ptr to actual pathinfo */
980 	uint32_t		cpi_flags;	/* MDI_CPI_*, see below */
981 	struct mdi_vhcache_pathinfo *cpi_next;	/* next path for the client */
982 } mdi_vhcache_pathinfo_t;
983 
984 /*
985  * cpi_flags
986  *
987  * MDI_CPI_HINT_PATH_DOES_NOT_EXIST - set when configuration of the path has
988  * failed.
989  */
990 #define	MDI_CPI_HINT_PATH_DOES_NOT_EXIST	0x0001
991 
992 /* client structure of vhci cache; owns a head/tail list of cached paths */
993 typedef struct mdi_vhcache_client {
994 	char			*cct_name_addr;	/* client address */
995 	mdi_vhcache_pathinfo_t	*cct_cpi_head;	/* client's path list head */
996 	mdi_vhcache_pathinfo_t	*cct_cpi_tail;	/* client's path list tail */
997 	struct mdi_vhcache_client *cct_next;	/* next in vhci client list */
998 } mdi_vhcache_client_t;
999 
1000 /* vhci cache structure - one per vhci instance */
1001 typedef struct mdi_vhci_cache {
1002 	mdi_vhcache_phci_t	*vhcache_phci_head;	/* phci list head */
1003 	mdi_vhcache_phci_t	*vhcache_phci_tail;	/* phci list tail */
1004 	mdi_vhcache_client_t	*vhcache_client_head;	/* client list head */
1005 	mdi_vhcache_client_t	*vhcache_client_tail;	/* client list tail */
1006 	mod_hash_t		*vhcache_client_hash;	/* client hash */
1007 	int			vhcache_flags;		/* MDI_VHCI_CACHE_* */
1008 	int64_t			vhcache_clean_time;	/* last clean time */
1009 	krwlock_t		vhcache_lock;		/* cache lock */
1010 } mdi_vhci_cache_t;
1011 
1012 /* vhcache_flags */
1013 #define	MDI_VHCI_CACHE_SETUP_DONE	0x0001	/* cache setup completed */
1014 
1015 /* vhci bus config structure - one per vhci instance */
1016 typedef struct mdi_vhci_config {
1017 	char			*vhc_vhcache_filename;	/* on-disk file name */
1018 	mdi_vhci_cache_t	vhc_vhcache;		/* vhci cache */
1019 	kmutex_t		vhc_lock;		/* vhci config lock */
1020 	kcondvar_t		vhc_cv;
1021 	int			vhc_flags;		/* MDI_VHC_*, see below */
1022 
1023 	/* flush vhci cache when lbolt reaches vhc_flush_at_ticks */
1024 	clock_t			vhc_flush_at_ticks;
1025 
1026 	/*
1027 	 * Head and tail of the client list whose paths are being configured
1028 	 * asynchronously. vhc_acc_count is the number of clients on this list.
1029 	 * vhc_acc_thrcount is the number of threads running to configure
1030 	 * the paths for these clients.
1031 	 */
1032 	struct mdi_async_client_config *vhc_acc_list_head;
1033 	struct mdi_async_client_config *vhc_acc_list_tail;
1034 	int			vhc_acc_count;
1035 	int			vhc_acc_thrcount;
1036 
1037 	/* callback id - for flushing the cache during system shutdown */
1038 	callb_id_t		vhc_cbid;
1039 
1040 	/*
1041 	 * vhc_path_discovery_boot -	number of times path discovery will be
1042 	 *				attempted during early boot.
1043 	 * vhc_path_discovery_postboot - number of times path discovery will be
1044 	 *				attempted during late boot.
1045 	 * vhc_path_discovery_cutoff_time - time at which paths were last
1046 	 *				discovered + some timeout
1047 	 */
1048 	int			vhc_path_discovery_boot;
1049 	int			vhc_path_discovery_postboot;
1050 	int64_t			vhc_path_discovery_cutoff_time;
1051 } mdi_vhci_config_t;
1052 
1053 /* vhc_flags */
1054 #define	MDI_VHC_SINGLE_THREADED		0x0001	/* config single threaded */
1055 #define	MDI_VHC_EXIT			0x0002	/* exit all config activity */
1056 #define	MDI_VHC_VHCACHE_DIRTY		0x0004	/* cache dirty */
1057 #define	MDI_VHC_VHCACHE_FLUSH_THREAD	0x0008	/* cache flush thread running */
1058 #define	MDI_VHC_VHCACHE_FLUSH_ERROR	0x0010	/* failed to flush cache */
1059 #define	MDI_VHC_READONLY_FS		0x0020	/* filesys is readonly */
1060 
1061 typedef struct mdi_phys_path {
1062 	char			*phys_path;	/* path string */
1063 	struct mdi_phys_path	*phys_path_next; /* next entry in the list */
1064 } mdi_phys_path_t;
1065 
1066 /*
1067  * Lookup tokens are used to cache the result of the vhci cache client lookup
1068  * operations (to reduce the number of real lookup operations).
1069  */
1070 typedef struct mdi_vhcache_lookup_token {
1071 	mdi_vhcache_client_t	*lt_cct;		/* cached lookup result */
1072 	int64_t			lt_cct_lookup_time;	/* last lookup time */
1073 } mdi_vhcache_lookup_token_t;
1074 
1075 /* asynchronous configuration of client paths; one entry per client */
1076 typedef struct mdi_async_client_config {
1077 	char			*acc_ct_name;	/* client name */
1078 	char			*acc_ct_addr;	/* client address */
1079 	mdi_phys_path_t		*acc_phclient_path_list_head;	/* path head */
1080 	mdi_vhcache_lookup_token_t acc_token;	/* lookup token */
1081 	struct mdi_async_client_config *acc_next; /* next in vhci acc list */
1082 } mdi_async_client_config_t;
1083 
1084 /*
1085  * vHCI driver instance registration/unregistration
1086  *
1087  * mdi_vhci_register() is called by a vHCI driver to register itself as the
1088  * manager of devices from a particular 'class'.  This should be called from
1089  * attach(9E).
1090  *
1091  * mdi_vhci_unregister() is called from detach(9E) to unregister a vHCI
1092  * instance from the framework.
1093  */
1094 int		mdi_vhci_register(char *, dev_info_t *, mdi_vhci_ops_t *, int);
1095 int		mdi_vhci_unregister(dev_info_t *, int);
1096 
1097 /*
1098  * pHCI utility functions
1099  */
1100 int		mdi_phci_get_path_count(dev_info_t *);
1101 dev_info_t	*mdi_phci_path2devinfo(dev_info_t *, caddr_t);
1102 
1103 
1104 /*
1105  * Path Selection Functions:
1106  *
1107  * mdi_select_path() is called by a vHCI driver to select to which path an
1108  * I/O request should be routed.  The caller passes the 'buf' structure as
1109  * one of the parameters.  The mpxio framework uses the buf's contents to
1110  * maintain per path statistics (total I/O size / count pending).  If more
1111  * than one online path is available, the framework automatically selects
1112  * a suitable one.  If a failover operation is active for this client device
1113  * the call fails, returning MDI_BUSY.
1114  *
1115  * By default this function returns a suitable path in the 'online' state,
1116  * based on the current load balancing policy.  Currently we support
1117  * LOAD_BALANCE_NONE (Previously selected online path will continue to be
1118  * used as long as the path is usable) and LOAD_BALANCE_RR (Online paths
1119  * will be selected in a round robin fashion).  The load balancing scheme
1120  * can be configured in the vHCI driver's configuration file (driver.conf).
1121  *
1122  * vHCI drivers may override this default behaviour by specifying appropriate
1123  * flags.  If start_pip is specified (non NULL), it is used as the routine's
1124  * starting point; it starts walking from there to find the next appropriate
1125  * path.
1126  *
1127  * The following values for 'flags' are currently defined; the fourth argument
1128  * ('arg', a void *) to mdi_select_path() depends on the flags used.
1129  *
1130  *   <none>:				default, arg is pip
1131  *   MDI_SELECT_ONLINE_PATH:		select an ONLINE path, arg is pip
1132  *   MDI_SELECT_STANDBY_PATH:		select a STANDBY path, arg is pip
1133  *   MDI_SELECT_USER_DISABLE_PATH:	select user disable for failover and
1134  *					auto_failback
1135  *   MDI_SELECT_PATH_INSTANCE:		select a specific path, arg is
1136  *					path instance
1137  *
1138  * The selected paths are returned in an mdi_hold_path() state (pi_ref_cnt),
1139  * caller should release the hold by calling mdi_rele_path() at the end of
1140  * operation.
1141  */
1142 int		mdi_select_path(dev_info_t *, struct buf *, int,
1143 		    void *, mdi_pathinfo_t **);
1144 int		mdi_set_lb_policy(dev_info_t *, client_lb_t);
1145 int		mdi_set_lb_region_size(dev_info_t *, int);
1146 client_lb_t	mdi_get_lb_policy(dev_info_t *);
1147 
1148 /*
1149  * flags for mdi_select_path() routine (its third argument)
1150  */
1151 #define	MDI_SELECT_ONLINE_PATH		0x0001	/* arg is pip */
1152 #define	MDI_SELECT_STANDBY_PATH		0x0002	/* arg is pip */
1153 #define	MDI_SELECT_USER_DISABLE_PATH	0x0004
1154 #define	MDI_SELECT_PATH_INSTANCE	0x0008	/* arg is path instance */
1155 
1156 /*
1157  * MDI client device utility functions (cf. the mdi_phci_*() utilities)
1158  */
1159 int		mdi_client_get_path_count(dev_info_t *);
1160 dev_info_t	*mdi_client_path2devinfo(dev_info_t *, caddr_t);
1161 
1162 /*
1163  * Failover:
1164  *
1165  * The vHCI driver calls mdi_failover() to initiate a failover operation.
1166  * mdi_failover() calls back into the vHCI driver's vo_failover()
1167  * entry point to perform the actual failover operation.  The reason
1168  * for requiring the vHCI driver to initiate failover by calling
1169  * mdi_failover(), instead of directly executing vo_failover() itself,
1170  * is to ensure that the mdi framework can keep track of the client
1171  * state properly.  Additionally, mdi_failover() provides as a
1172  * convenience the option of performing the failover operation
1173  * synchronously or asynchronously.
1174  *
1175  * Upon successful completion of the failover operation, the paths that were
1176  * previously ONLINE will be in the STANDBY state, and the newly activated
1177  * paths will be in the ONLINE state.
1178  *
1179  * The flags argument selects synchronous or asynchronous failover (see below).
1180  */
1181 int mdi_failover(dev_info_t *, dev_info_t *, int);
1182 
1183 /*
1184  * Client device failover mode of operation
1185  */
1186 #define	MDI_FAILOVER_SYNC	1	/* Synchronous Failover		*/
1187 #define	MDI_FAILOVER_ASYNC	2	/* Asynchronous Failover	*/
1188 
1189 /*
1190  * mdi_pathinfo node kstat functions; each takes the pathinfo node as 'pip'.
1191  */
1192 int mdi_pi_kstat_exists(mdi_pathinfo_t *pip);
1193 int mdi_pi_kstat_create(mdi_pathinfo_t *pip, char *ks_name);
1194 void mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp);
1195 
1196 /*
1197  * mdi_pathinfo node extended state and preferred-path accessors.
1198  */
1199 int mdi_pi_get_state2(mdi_pathinfo_t *, mdi_pathinfo_state_t *, uint32_t *);
1200 int mdi_pi_get_preferred(mdi_pathinfo_t *);
1201 
1202 /*
1203  * mdi_pathinfo node member functions
1204  */
1205 void *mdi_pi_get_client_private(mdi_pathinfo_t *);
1206 void mdi_pi_set_client_private(mdi_pathinfo_t *, void *);
1207 void mdi_pi_set_state(mdi_pathinfo_t *, mdi_pathinfo_state_t);
1208 void mdi_pi_set_preferred(mdi_pathinfo_t *, int);
1209 
1210 /* get/set vhci private data of a client, pHCI or pathinfo node */
1211 void *mdi_client_get_vhci_private(dev_info_t *);
1212 void mdi_client_set_vhci_private(dev_info_t *, void *);
1213 void *mdi_phci_get_vhci_private(dev_info_t *);
1214 void mdi_phci_set_vhci_private(dev_info_t *, void *);
1215 void *mdi_pi_get_vhci_private(mdi_pathinfo_t *);
1216 void mdi_pi_set_vhci_private(mdi_pathinfo_t *, void *);
1217 
1218 /*
1219  * mdi_pathinfo property utilities
1220  */
1221 int mdi_prop_size(mdi_pathinfo_t *, size_t *);
1222 int mdi_prop_pack(mdi_pathinfo_t *, char **, uint_t);
1223 
1224 /* obsolete interface, to be removed */
1225 void mdi_get_next_path(dev_info_t *, mdi_pathinfo_t *, mdi_pathinfo_t **);
1226 int mdi_get_component_type(dev_info_t *);
1227 
1228 #endif	/* _KERNEL */
1229 
1230 #ifdef	__cplusplus
1231 }
1232 #endif
1233 
1234 #endif	/* _SYS_MDI_IMPLDEFS_H */
1235