xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision f6da83d4178694e7113b71d1e452f15b296f73d8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * An implementation of the IPoIB standard based on PSARC 2001/289.
28  */
29 
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/modctl.h>
35 #include <sys/stropts.h>
36 #include <sys/stream.h>
37 #include <sys/strsun.h>
38 #include <sys/strsubr.h>
39 #include <sys/dlpi.h>
40 #include <sys/mac_provider.h>
41 
42 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
43 #include <sys/sysmacros.h>	/* for offsetof */
44 #include <sys/disp.h>		/* for async thread pri */
45 #include <sys/atomic.h>		/* for atomic_add*() */
46 #include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
47 #include <netinet/in.h>		/* for netinet/ip.h below */
48 #include <netinet/ip.h>		/* for struct ip */
49 #include <netinet/udp.h>	/* for struct udphdr */
50 #include <inet/common.h>	/* for inet/ip.h below */
51 #include <inet/ip.h>		/* for ipha_t */
52 #include <inet/ip6.h>		/* for ip6_t */
53 #include <inet/tcp.h>		/* for tcph_t */
54 #include <netinet/icmp6.h>	/* for icmp6_t */
55 #include <sys/callb.h>
56 #include <sys/modhash.h>
57 
58 #include <sys/ib/clients/ibd/ibd.h>
59 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
60 #include <sys/note.h>
61 #include <sys/multidata.h>
62 
63 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
64 
65 #include <sys/priv_names.h>
66 #include <sys/dls.h>
67 #include <sys/dld_ioc.h>
68 #include <sys/policy.h>
69 #include <sys/ibpart.h>
70 #include <sys/file.h>
71 
72 /*
73  * The write-up below includes details on the following:
74  * 1. The dladm administrative model.
75  * 2. Late HCA initialization feature.
76  * 3. Brussels support and its implications to the current architecture.
77  *
78  * 1. The dladm administrative model.
79  * ------------------------------------------
80  * With the dladm model, ibnex will create one ibd instance per port. These
81  * instances will be created independent of the port state.
82  *
83  * The ibd driver is two faceted: One side of it working as the port driver and
84  * the other as the partition object driver.
85  *
86  * The port instance is a child of the HCA, and will have an entry in the devfs.
87  * A DDI attach only happens for the port driver, and its attach is
88  * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is
89  * handled in ibd_port_unattach().
90  *
91  * The partition object is only a registrant to the mac layer via mac_register()
92  * and does not have an entry in the device tree. There is no DDI softstate
93  * managed by the DDI framework for the partition objects. However, the state is
94  * managed inside the ibd driver, and every partition object hangs off the
95  * "ibd_objlist_head".
96  *
97  * The partition object first comes into existence when a user runs the
98  * 'create-part' subcommand of dladm. This is like invoking the attach entry
99  * point of the partition object. The partition object goes away with the
100  * 'delete-part' subcommand of dladm. This is like invoking the detach entry
101  * point of the partition object.
102  *
103  * The create-part and delete-part subcommands result in dld ioctls that end up
104  * calling ibd_create_partition() and ibd_delete_partition() respectively.
105  * These ioctls are registered with the dld layer in _init() via a call to
106  * dld_ioc_register().
107  *
108  * The port instance by itself cannot be plumbed. It is only the partition
109  * objects that can be plumbed and they alone participate in I/O and not the
110  * port driver.
111  *
112  * There are some info ioctls supported in ibd which are used by dladm(1M) to
113  * display useful information. The info entry point for ibd is
114  * ibd_get_partition_info().
115  *
116  * 2. Late HCA initialization feature.
117  * ------------------------------------
118  * As mentioned in section 1, the user creates the partition objects via
119  * dladm(1M). It is possible that:
120  * a) The physical port itself is down and the SM cannot be reached.
121  * b) The PKEY specified by the user has not been created in the SM yet.
122  * c) An IPoIB broadcast group for the specified PKEY is not present.
123  *
124  * In all of the above cases, complete initialization of the partition object is
125  * not possible. However, the new model allows the creation of partition
126  * objects even in such cases but will defer the initialization for later.
127  * When such a partition object is plumbed, the link state will be displayed as
128  * "down".
129  * The driver, at this point, is listening to events that herald the
130  * availability of resources -
131  * i)   LINK_UP when the link becomes available
132  * ii)  PORT_CHANGE when the PKEY has been created
133  * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
134  * created
135  * via ibd_async_handler() for events i) and ii), and via
136  * ibd_snet_notices_handler() for iii).
137  * The driver handles these events (as and when they arrive) and completes the
138  * initialization of the partition object and transitions it to a usable state.
139  *
140  * 3. Brussels support and its implications to the current architecture.
141  * ---------------------------------------------------------------------
142  * The brussels support introduces two new interfaces to the ibd driver -
143  * ibd_m_getprop() and ibd_m_setprop().
144  * These interfaces allow setting and retrieval of certain properties.
145  * Some of them are public properties while most others are private properties
146  * meant to be used by developers. Tuning the latter kind can cause
147  * performance issues and should not be used without understanding the
148  * implications. All properties are specific to an instance of either the
149  * partition object or the port driver.
150  *
151  * The public properties are : mtu and linkmode.
152  * mtu is a read-only property.
153  * linkmode can take two values - UD and CM.
154  *
155  * Changing the linkmode requires some bookkeeping in the driver. The
156  * capabilities need to be re-reported to the mac layer. This is done by
157  * calling mac_capab_update().  The maxsdu is updated by calling
158  * mac_maxsdu_update().
159  * The private properties retain their values across the change of linkmode.
160  * NOTE:
161  * - The port driver does not support any property apart from mtu.
162  * - All other properties are only meant for the partition object.
163  * - The properties cannot be set when an instance is plumbed. The
164  * instance has to be unplumbed to effect any setting.
165  */
166 
167 /*
168  * Driver wide tunables
169  *
170  * ibd_tx_softintr
171  * ibd_rx_softintr
172  *     The softintr mechanism allows ibd to avoid event queue overflows if
173  *     the receive/completion handlers are expensive. These are enabled
174  *     by default.
175  *
176  * ibd_log_sz
177  *     This specifies the size of the ibd log buffer in bytes. The buffer is
178  *     allocated and logging is enabled only when IBD_LOGGING is defined.
179  *
180  */
181 uint_t ibd_rx_softintr = 1;
182 uint_t ibd_tx_softintr = 1;
183 
184 #ifdef IBD_LOGGING
185 uint_t ibd_log_sz = 0x20000;
186 #endif
187 
188 #ifdef IBD_LOGGING
189 #define	IBD_LOG_SZ			ibd_log_sz
190 #endif
191 
192 /* Post IBD_RX_POST_CNT receive work requests at a time. */
193 #define	IBD_RX_POST_CNT			8
194 
195 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
196 #define	IBD_LOG_RX_POST			4
197 
198 /* Minimum number of receive work requests driver needs to always have */
199 #define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
200 
201 /*
202  * LSO parameters
203  */
204 #define	IBD_LSO_MAXLEN			65536
205 #define	IBD_LSO_BUFSZ			8192
206 
207 /*
208  * Async operation states
209  */
210 #define	IBD_OP_NOTSTARTED		0
211 #define	IBD_OP_ONGOING			1
212 #define	IBD_OP_COMPLETED		2
213 #define	IBD_OP_ERRORED			3
214 #define	IBD_OP_ROUTERED			4
215 
216 /*
217  * State of IBD driver initialization during attach/m_start
218  */
219 #define	IBD_DRV_STATE_INITIALIZED	0x000001
220 #define	IBD_DRV_RXINTR_ADDED		0x000002
221 #define	IBD_DRV_TXINTR_ADDED		0x000004
222 #define	IBD_DRV_IBTL_ATTACH_DONE	0x000008
223 #define	IBD_DRV_HCA_OPENED		0x000010
224 #define	IBD_DRV_PD_ALLOCD		0x000020
225 #define	IBD_DRV_MAC_REGISTERED		0x000040
226 #define	IBD_DRV_PORT_DETAILS_OBTAINED	0x000080
227 #define	IBD_DRV_BCAST_GROUP_FOUND	0x000100
228 #define	IBD_DRV_ACACHE_INITIALIZED	0x000200
229 #define	IBD_DRV_CQS_ALLOCD		0x000400
230 #define	IBD_DRV_UD_CHANNEL_SETUP	0x000800
231 #define	IBD_DRV_TXLIST_ALLOCD		0x001000
232 #define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x002000
233 #define	IBD_DRV_RXLIST_ALLOCD		0x004000
234 #define	IBD_DRV_BCAST_GROUP_JOINED	0x008000
235 #define	IBD_DRV_ASYNC_THR_CREATED	0x010000
236 #define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x020000
237 #define	IBD_DRV_SM_NOTICES_REGISTERED	0x040000
238 #define	IBD_DRV_STARTED			0x080000
239 #define	IBD_DRV_RC_SRQ_ALLOCD		0x100000
240 #define	IBD_DRV_RC_LARGEBUF_ALLOCD	0x200000
241 #define	IBD_DRV_RC_LISTEN		0x400000
242 #ifdef DEBUG
243 #define	IBD_DRV_RC_PRIVATE_STATE	0x800000
244 #endif
245 #define	IBD_DRV_IN_DELETION		0x1000000
246 #define	IBD_DRV_IN_LATE_HCA_INIT 	0x2000000
247 #define	IBD_DRV_REQ_LIST_INITED 	0x4000000
248 
249 /*
250  * Start/stop in-progress flags; note that restart must always remain
251  * the OR of start and stop flag values.
252  */
253 #define	IBD_DRV_START_IN_PROGRESS	0x10000000
254 #define	IBD_DRV_STOP_IN_PROGRESS	0x20000000
255 #define	IBD_DRV_RESTART_IN_PROGRESS	0x30000000
256 #define	IBD_DRV_DELETE_IN_PROGRESS	IBD_DRV_RESTART_IN_PROGRESS
257 
258 /*
259  * Miscellaneous constants
260  */
261 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
262 #define	IBD_DEF_MAX_SDU			2044
263 #define	IBD_DEF_MAX_MTU			(IBD_DEF_MAX_SDU + IPOIB_HDRSIZE)
264 #define	IBD_DEF_RC_MAX_SDU		65520
265 #define	IBD_DEF_RC_MAX_MTU		(IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE)
266 #define	IBD_DEFAULT_QKEY		0xB1B
267 #ifdef IBD_LOGGING
268 #define	IBD_DMAX_LINE			100
269 #endif
270 
271 /*
272  * Link state codes; with no explicit initializers the values are 0, 1 and 2
273  */
274 typedef enum {
275 	IBD_LINK_DOWN,		/* link is down */
276 	IBD_LINK_UP,		/* link is up */
277 	IBD_LINK_UP_ABSENT	/* NOTE(review): meaning not visible here; confirm at users */
278 } ibd_link_op_t;
279 
280 /*
281  * Driver State Pointer
282  */
283 void *ibd_list;
284 
285 /*
286  * Driver Global Data
287  */
288 ibd_global_state_t ibd_gstate;
289 
290 /*
291  * Partition object list
292  */
293 ibd_state_t	*ibd_objlist_head = NULL;
294 kmutex_t	ibd_objlist_lock;
295 
296 /*
297  * Logging
298  */
299 #ifdef IBD_LOGGING
300 kmutex_t ibd_lbuf_lock;
301 uint8_t *ibd_lbuf;
302 uint32_t ibd_lbuf_ndx;
303 #endif
304 
305 /*
306  * Required system entry points
307  */
308 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
309 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
310 
311 /*
312  * Required driver entry points for GLDv3
313  */
314 static int ibd_m_stat(void *, uint_t, uint64_t *);
315 static int ibd_m_start(void *);
316 static void ibd_m_stop(void *);
317 static int ibd_m_promisc(void *, boolean_t);
318 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
319 static int ibd_m_unicst(void *, const uint8_t *);
320 static mblk_t *ibd_m_tx(void *, mblk_t *);
321 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
322 
323 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
324     const void *);
325 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
326 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t,
327     mac_prop_info_handle_t);
328 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t,
329     const void *);
330 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *);
331 
332 /*
333  * Private driver entry points for GLDv3
334  */
335 
336 /*
337  * Initialization
338  */
339 static int ibd_state_init(ibd_state_t *, dev_info_t *);
340 static int ibd_init_txlist(ibd_state_t *);
341 static int ibd_init_rxlist(ibd_state_t *);
342 static int ibd_acache_init(ibd_state_t *);
343 #ifdef IBD_LOGGING
344 static void ibd_log_init(void);
345 #endif
346 
347 /*
348  * Termination/cleanup
349  */
350 static void ibd_state_fini(ibd_state_t *);
351 static void ibd_fini_txlist(ibd_state_t *);
352 static void ibd_fini_rxlist(ibd_state_t *);
353 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
354 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
355 static void ibd_acache_fini(ibd_state_t *);
356 #ifdef IBD_LOGGING
357 static void ibd_log_fini(void);
358 #endif
359 
360 /*
361  * Allocation/acquire/map routines
362  */
363 static int ibd_alloc_tx_copybufs(ibd_state_t *);
364 static int ibd_alloc_rx_copybufs(ibd_state_t *);
365 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
366 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
367 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
368     uint32_t *);
369 
370 /*
371  * Free/release/unmap routines
372  */
373 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
374 static void ibd_free_tx_copybufs(ibd_state_t *);
375 static void ibd_free_rx_copybufs(ibd_state_t *);
376 static void ibd_free_rx_rsrcs(ibd_state_t *);
377 static void ibd_free_tx_lsobufs(ibd_state_t *);
378 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
379 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
380 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
381 
382 /*
383  * Handlers/callback routines
384  */
385 static uint_t ibd_intr(caddr_t);
386 static uint_t ibd_tx_recycle(caddr_t);
387 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
388 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
389 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
390 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
391 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
392 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
393 static void ibd_freemsg_cb(char *);
394 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
395     ibt_async_event_t *);
396 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
397     ibt_async_event_t *);
398 static void ibd_snet_notices_handler(void *, ib_gid_t,
399     ibt_subnet_event_code_t, ibt_subnet_event_t *);
400 
401 /*
402  * Send/receive routines
403  */
404 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
405 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
406 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
407 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
408 
409 /*
410  * Threads
411  */
412 static void ibd_async_work(ibd_state_t *);
413 
414 /*
415  * Async tasks
416  */
417 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
418 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
419 static void ibd_async_setprom(ibd_state_t *);
420 static void ibd_async_unsetprom(ibd_state_t *);
421 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
422 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
423 static void ibd_async_txsched(ibd_state_t *);
424 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
425 
426 /*
427  * Async task helpers
428  */
429 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
430 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
431 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
432 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
433     ipoib_mac_t *, ipoib_mac_t *);
434 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
435 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
436 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
437 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
438 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
439 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
440 static uint64_t ibd_get_portspeed(ibd_state_t *);
441 static boolean_t ibd_async_safe(ibd_state_t *);
442 static void ibd_async_done(ibd_state_t *);
443 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
444 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
445 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
446 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
447 
448 /*
449  * Helpers for attach/start routines
450  */
451 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
452 static int ibd_record_capab(ibd_state_t *);
453 static int ibd_get_port_details(ibd_state_t *);
454 static int ibd_alloc_cqs(ibd_state_t *);
455 static int ibd_setup_ud_channel(ibd_state_t *);
456 static int ibd_start(ibd_state_t *);
457 static int ibd_undo_start(ibd_state_t *, link_state_t);
458 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
459 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
460 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip);
461 static int ibd_part_unattach(ibd_state_t *state);
462 static int ibd_port_attach(dev_info_t *);
463 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip);
464 static int ibd_get_port_state(ibd_state_t *, link_state_t *);
465 
466 
467 /*
468  * Miscellaneous helpers
469  */
470 static int ibd_sched_poll(ibd_state_t *, int, int);
471 static void ibd_resume_transmission(ibd_state_t *);
472 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
473 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
474 static void *list_get_head(list_t *);
475 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
476 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
477 
478 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *);
479 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *);
480 
481 #ifdef IBD_LOGGING
482 static void ibd_log(const char *, ...);
483 #endif
484 
485 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
486     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
487 
488 /* Module driver info: couples the description string with ibd_dev_ops */
489 static struct modldrv ibd_modldrv = {
490 	&mod_driverops,			/* This one is a driver */
491 	"InfiniBand GLDv3 Driver",	/* short description */
492 	&ibd_dev_ops			/* driver specific ops */
493 };
494 
495 /* Module linkage: MODREV_1, then ibd_modldrv as the only entry, then NULL */
496 static struct modlinkage ibd_modlinkage = {
497 	MODREV_1, (void *)&ibd_modldrv, NULL
498 };
499 
500 /*
501  * Module (static) info passed to IBTL during ibt_attach; uses ibd_async_handler
502  */
503 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
504 	IBTI_V_CURR,
505 	IBT_NETWORK,
506 	ibd_async_handler,	/* async event callback; prototype earlier in file */
507 	NULL,			/* left unset */
508 	"IBPART"		/* NOTE(review): presumably the client name; confirm */
509 };
510 
511 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = {
512 	IBTI_V_CURR,
513 	IBT_NETWORK,
514 	ibdpd_async_handler,	/* async event callback; prototype earlier in file */
515 	NULL,			/* left unset */
516 	"IPIB"			/* NOTE(review): presumably the client name; confirm */
517 };
518 
519 /*
520  * GLDv3 entry points: callback flag mask, then the positional callback table
521  */
522 #define	IBD_M_CALLBACK_FLAGS	\
523 	(MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO)
524 
525 static mac_callbacks_t ibd_m_callbacks = {
526 	IBD_M_CALLBACK_FLAGS,	/* flag mask defined just above */
527 	ibd_m_stat,
528 	ibd_m_start,
529 	ibd_m_stop,
530 	ibd_m_promisc,
531 	ibd_m_multicst,
532 	ibd_m_unicst,
533 	ibd_m_tx,
534 	NULL,			/* slot not provided */
535 	NULL,			/* slot not provided */
536 	ibd_m_getcapab,
537 	NULL,			/* slot not provided */
538 	NULL,			/* slot not provided */
539 	ibd_m_setprop,
540 	ibd_m_getprop,
541 	ibd_m_propinfo
542 };
543 
544 /* Names of the private ("_ibd_"-prefixed) properties; NULL-terminated list */
545 char *ibd_priv_props[] = {
546 	"_ibd_broadcast_group",
547 	"_ibd_coalesce_completions",
548 	"_ibd_create_broadcast_group",
549 	"_ibd_hash_size",
550 	"_ibd_lso_enable",
551 	"_ibd_num_ah",
552 	"_ibd_num_lso_bufs",
553 	"_ibd_rc_enable_srq",
554 	"_ibd_rc_num_rwqe",
555 	"_ibd_rc_num_srq",
556 	"_ibd_rc_num_swqe",
557 	"_ibd_rc_rx_comp_count",
558 	"_ibd_rc_rx_comp_usec",
559 	"_ibd_rc_rx_copy_thresh",
560 	"_ibd_rc_rx_rwqe_thresh",
561 	"_ibd_rc_tx_comp_count",
562 	"_ibd_rc_tx_comp_usec",
563 	"_ibd_rc_tx_copy_thresh",
564 	"_ibd_ud_num_rwqe",
565 	"_ibd_ud_num_swqe",
566 	"_ibd_ud_rx_comp_count",
567 	"_ibd_ud_rx_comp_usec",
568 	"_ibd_ud_tx_comp_count",
569 	"_ibd_ud_tx_comp_usec",
570 	"_ibd_ud_tx_copy_thresh",
571 	NULL
572 };
573 
574 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *);
575 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *);
576 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *);
577 
578 static dld_ioc_info_t ibd_dld_ioctl_list[] = {	/* dld ioctls handled by ibd */
579 	{IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t),
580 	    ibd_create_partition, secpolicy_dl_config},	/* dladm create-part */
581 	{IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t),
582 	    ibd_delete_partition, secpolicy_dl_config},	/* dladm delete-part */
583 	{IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t),
584 	    ibd_get_partition_info, NULL}	/* info; no policy callback given */
585 };
586 
587 /*
588  * Fill/clear <scope> and <p_key> in a multicast/broadcast address (maddr)
589  */
590 #define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
591 {							\
592 	*(uint32_t *)((char *)(maddr) + 4) |=		\
593 	    htonl((uint32_t)(scope) << 16);		\
594 	*(uint32_t *)((char *)(maddr) + 8) |=		\
595 	    htonl((uint32_t)(pkey) << 16);		\
596 }	/* ORs scope into the word at byte offset 4, pkey at 8, each << 16 */
597 
598 #define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
599 {							\
600 	*(uint32_t *)((char *)(maddr) + 4) &=		\
601 	    htonl(~((uint32_t)0xF << 16));		\
602 	*(uint32_t *)((char *)(maddr) + 8) &=		\
603 	    htonl(~((uint32_t)0xFFFF << 16));		\
604 }	/* masks off 0xF << 16 at byte offset 4 and 0xFFFF << 16 at offset 8 */
605 
606 /*
607  * Rudimentary debugging support: messages with l < ibd_debuglevel are dropped
608  */
609 #ifdef DEBUG
610 int ibd_debuglevel = 100;
611 void
612 debug_print(int l, char *fmt, ...)
613 {
614 	va_list args;
615 
616 	if (l >= ibd_debuglevel) {	/* at or above the threshold: print */
617 		va_start(args, fmt);
618 		vcmn_err(CE_CONT, fmt, args);
619 		va_end(args);
620 	}
621 }
622 #endif
623 
624 /*
625  * Shared warning printer: prefixes each message with the driver instance,
626  * HCA GUID, port number and pkey so the IBA interface can be identified.
627  */
628 void
629 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
630 {
631 	dev_info_t *dip = state->id_dip;
632 	char msg[256];
633 	va_list args;
634 	ib_guid_t guid;
635 	int off;
636 
637 	guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, "hca-guid", 0);
638 	off = snprintf(msg, sizeof (msg),
639 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
640 	    ddi_driver_name(dip), ddi_get_instance(dip),
641 	    (u_longlong_t)guid, state->id_port, state->id_pkey);
642 
643 	va_start(args, fmt);
644 	(void) vsnprintf(msg + off, sizeof (msg) - off, fmt, args);
645 	va_end(args);
646 	cmn_err(CE_NOTE, "!%s", msg);
647 }
648 
649 /*
650  * Warlock directives
651  */
652 
653 /*
654  * id_lso_lock
655  *
656  * state->id_lso->bkt_nfree may be accessed without a lock to
657  * determine the threshold at which we have to ask the nw layer
658  * to resume transmission (see ibd_resume_transmission()).
659  */
660 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
661     ibd_state_t::id_lso))
662 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
663 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
664 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
665 
666 /*
667  * id_scq_poll_lock
668  */
669 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
670     ibd_state_t::id_scq_poll_busy))
671 
672 /*
673  * id_txpost_lock
674  */
675 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
676     ibd_state_t::id_tx_head))
677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
678     ibd_state_t::id_tx_busy))
679 
680 /*
681  * id_acache_req_lock
682  */
683 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
684     ibd_state_t::id_acache_req_cv))
685 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
686     ibd_state_t::id_req_list))
687 _NOTE(SCHEME_PROTECTS_DATA("atomic",
688     ibd_acache_s::ac_ref))
689 
690 /*
691  * id_ac_mutex
692  *
693  * This mutex is actually supposed to protect id_ah_op as well,
694  * but this path of the code isn't clean (see update of id_ah_op
695  * in ibd_async_acache(), immediately after the call to
696  * ibd_async_mcache()). For now, we'll skip this check by
697  * declaring that id_ah_op is protected by some internal scheme
698  * that warlock isn't aware of.
699  */
700 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
701     ibd_state_t::id_ah_active))
702 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
703     ibd_state_t::id_ah_free))
704 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
705     ibd_state_t::id_ah_addr))
706 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
707     ibd_state_t::id_ah_op))
708 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
709     ibd_state_t::id_ah_error))
710 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
711     ibd_state_t::id_ac_hot_ace))
712 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
713 
714 /*
715  * id_mc_mutex
716  */
717 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
718     ibd_state_t::id_mc_full))
719 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
720     ibd_state_t::id_mc_non))
721 
722 /*
723  * id_trap_lock
724  */
725 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
726     ibd_state_t::id_trap_cv))
727 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
728     ibd_state_t::id_trap_stop))
729 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
730     ibd_state_t::id_trap_inprog))
731 
732 /*
733  * id_prom_op
734  */
735 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
736     ibd_state_t::id_prom_op))
737 
738 /*
739  * id_sched_lock
740  */
741 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
742     ibd_state_t::id_sched_needed))
743 
744 /*
745  * id_link_mutex
746  */
747 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
748     ibd_state_t::id_link_state))
749 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
750 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
751     ibd_state_t::id_link_speed))
752 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
753 
754 /*
755  * id_tx_list.dl_mutex
756  */
757 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
758     ibd_state_t::id_tx_list.dl_head))
759 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
760     ibd_state_t::id_tx_list.dl_pending_sends))
761 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
762     ibd_state_t::id_tx_list.dl_cnt))
763 
764 /*
765  * id_rx_list.dl_mutex
766  */
767 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
768     ibd_state_t::id_rx_list.dl_bufs_outstanding))
769 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
770     ibd_state_t::id_rx_list.dl_cnt))
771 
772 
773 /*
774  * Items protected by atomic updates
775  */
776 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
777     ibd_state_s::id_brd_rcv
778     ibd_state_s::id_brd_xmt
779     ibd_state_s::id_multi_rcv
780     ibd_state_s::id_multi_xmt
781     ibd_state_s::id_num_intrs
782     ibd_state_s::id_rcv_bytes
783     ibd_state_s::id_rcv_pkt
784     ibd_state_s::id_rx_post_queue_index
785     ibd_state_s::id_tx_short
786     ibd_state_s::id_xmt_bytes
787     ibd_state_s::id_xmt_pkt
788     ibd_state_s::rc_rcv_trans_byte
789     ibd_state_s::rc_rcv_trans_pkt
790     ibd_state_s::rc_rcv_copy_byte
791     ibd_state_s::rc_rcv_copy_pkt
792     ibd_state_s::rc_xmt_bytes
793     ibd_state_s::rc_xmt_small_pkt
794     ibd_state_s::rc_xmt_fragmented_pkt
795     ibd_state_s::rc_xmt_map_fail_pkt
796     ibd_state_s::rc_xmt_map_succ_pkt))
797 
798 /*
799  * Non-mutex protection schemes for data elements. Almost all of
800  * these are non-shared items.
801  */
802 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
803     callb_cpr
804     ib_gid_s
805     ib_header_info
806     ibd_acache_rq
807     ibd_acache_s::ac_mce
808     ibd_acache_s::ac_chan
809     ibd_mcache::mc_fullreap
810     ibd_mcache::mc_jstate
811     ibd_mcache::mc_req
812     ibd_rwqe_s
813     ibd_swqe_s
814     ibd_wqe_s
815     ibt_wr_ds_s::ds_va
816     ibt_wr_lso_s
817     ipoib_mac::ipoib_qpn
818     mac_capab_lso_s
819     msgb::b_next
820     msgb::b_cont
821     msgb::b_rptr
822     msgb::b_wptr
823     ibd_state_s::id_bgroup_created
824     ibd_state_s::id_mac_state
825     ibd_state_s::id_mtu
826     ibd_state_s::id_ud_num_rwqe
827     ibd_state_s::id_ud_num_swqe
828     ibd_state_s::id_qpnum
829     ibd_state_s::id_rcq_hdl
830     ibd_state_s::id_rx_buf_sz
831     ibd_state_s::id_rx_bufs
832     ibd_state_s::id_rx_mr_hdl
833     ibd_state_s::id_rx_wqes
834     ibd_state_s::id_rxwcs
835     ibd_state_s::id_rxwcs_size
836     ibd_state_s::id_rx_nqueues
837     ibd_state_s::id_rx_queues
838     ibd_state_s::id_scope
839     ibd_state_s::id_scq_hdl
840     ibd_state_s::id_tx_buf_sz
841     ibd_state_s::id_tx_bufs
842     ibd_state_s::id_tx_mr_hdl
843     ibd_state_s::id_tx_rel_list.dl_cnt
844     ibd_state_s::id_tx_wqes
845     ibd_state_s::id_txwcs
846     ibd_state_s::id_txwcs_size
847     ibd_state_s::rc_listen_hdl
848     ibd_state_s::rc_listen_hdl_OFED_interop
849     ibd_state_s::rc_srq_size
850     ibd_state_s::rc_srq_rwqes
851     ibd_state_s::rc_srq_rx_bufs
852     ibd_state_s::rc_srq_rx_mr_hdl
853     ibd_state_s::rc_tx_largebuf_desc_base
854     ibd_state_s::rc_tx_mr_bufs
855     ibd_state_s::rc_tx_mr_hdl
856     ipha_s
857     icmph_s
858     ibt_path_info_s::pi_sid
859     ibd_rc_chan_s::ace
860     ibd_rc_chan_s::chan_hdl
861     ibd_rc_chan_s::state
862     ibd_rc_chan_s::chan_state
863     ibd_rc_chan_s::is_tx_chan
864     ibd_rc_chan_s::rcq_hdl
865     ibd_rc_chan_s::rcq_size
866     ibd_rc_chan_s::scq_hdl
867     ibd_rc_chan_s::scq_size
868     ibd_rc_chan_s::requester_gid
869     ibd_rc_chan_s::requester_pkey
870     ibd_rc_chan_s::rx_bufs
871     ibd_rc_chan_s::rx_mr_hdl
872     ibd_rc_chan_s::rx_rwqes
873     ibd_rc_chan_s::tx_wqes
874     ibd_rc_chan_s::tx_mr_bufs
875     ibd_rc_chan_s::tx_mr_hdl
876     ibd_rc_chan_s::tx_rel_list.dl_cnt
877     ibd_rc_chan_s::tx_trans_error_cnt
878     ibd_rc_tx_largebuf_s::lb_buf
879     ibd_rc_msg_hello_s
880     ibt_cm_return_args_s))
881 
882 /*
883  * ibd_rc_chan_s::next is protected by two mutexes:
884  * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
885  * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
886  */
887 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
888     ibd_rc_chan_s::next))
889 
890 /*
891  * ibd_state_s.rc_tx_large_bufs_lock
892  */
893 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
894     ibd_state_s::rc_tx_largebuf_free_head))
895 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
896     ibd_state_s::rc_tx_largebuf_nfree))
897 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
898     ibd_rc_tx_largebuf_s::lb_next))
899 
900 /*
901  * ibd_acache_s.tx_too_big_mutex
902  */
903 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
904     ibd_acache_s::tx_too_big_ongoing))
905 
906 /*
907  * tx_wqe_list.dl_mutex
908  */
909 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
910     ibd_rc_chan_s::tx_wqe_list.dl_head))
911 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
912     ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
913 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
914     ibd_rc_chan_s::tx_wqe_list.dl_cnt))
915 
916 /*
917  * ibd_state_s.rc_ace_recycle_lock
918  */
919 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
920     ibd_state_s::rc_ace_recycle))
921 
922 /*
923  * rc_srq_rwqe_list.dl_mutex
924  */
925 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
926     ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
927 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
928     ibd_state_t::rc_srq_rwqe_list.dl_cnt))
929 
/*
 * Non-mutex protection schemes for data elements. These are counters
 * used for problem diagnosis; they do not need to be protected.
 */
934 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
935     ibd_state_s::rc_rcv_alloc_fail
936     ibd_state_s::rc_rcq_invoke
937     ibd_state_s::rc_rcq_err
938     ibd_state_s::rc_ace_not_found
939     ibd_state_s::rc_xmt_drop_too_long_pkt
940     ibd_state_s::rc_xmt_icmp_too_long_pkt
941     ibd_state_s::rc_xmt_reenter_too_long_pkt
942     ibd_state_s::rc_swqe_short
943     ibd_state_s::rc_swqe_mac_update
944     ibd_state_s::rc_xmt_buf_short
945     ibd_state_s::rc_xmt_buf_mac_update
946     ibd_state_s::rc_scq_no_swqe
947     ibd_state_s::rc_scq_no_largebuf
948     ibd_state_s::rc_scq_invoke
949     ibd_state_s::rc_conn_succ
950     ibd_state_s::rc_conn_fail
951     ibd_state_s::rc_null_conn
952     ibd_state_s::rc_no_estab_conn
953     ibd_state_s::rc_act_close
954     ibd_state_s::rc_pas_close
955     ibd_state_s::rc_delay_ace_recycle
956     ibd_state_s::rc_act_close_simultaneous
957     ibd_state_s::rc_reset_cnt))
958 
959 #ifdef DEBUG
/*
 * Non-mutex protection schemes for data elements. These are counters
 * used for problem diagnosis; they do not need to be protected.
 */
964 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
965     ibd_state_s::rc_rwqe_short
966     ibd_rc_stat_s::rc_rcv_trans_byte
967     ibd_rc_stat_s::rc_rcv_trans_pkt
968     ibd_rc_stat_s::rc_rcv_copy_byte
969     ibd_rc_stat_s::rc_rcv_copy_pkt
970     ibd_rc_stat_s::rc_rcv_alloc_fail
971     ibd_rc_stat_s::rc_rcq_invoke
972     ibd_rc_stat_s::rc_rcq_err
973     ibd_rc_stat_s::rc_scq_invoke
974     ibd_rc_stat_s::rc_rwqe_short
975     ibd_rc_stat_s::rc_xmt_bytes
976     ibd_rc_stat_s::rc_xmt_small_pkt
977     ibd_rc_stat_s::rc_xmt_fragmented_pkt
978     ibd_rc_stat_s::rc_xmt_map_fail_pkt
979     ibd_rc_stat_s::rc_xmt_map_succ_pkt
980     ibd_rc_stat_s::rc_ace_not_found
981     ibd_rc_stat_s::rc_scq_no_swqe
982     ibd_rc_stat_s::rc_scq_no_largebuf
983     ibd_rc_stat_s::rc_swqe_short
984     ibd_rc_stat_s::rc_swqe_mac_update
985     ibd_rc_stat_s::rc_xmt_buf_short
986     ibd_rc_stat_s::rc_xmt_buf_mac_update
987     ibd_rc_stat_s::rc_conn_succ
988     ibd_rc_stat_s::rc_conn_fail
989     ibd_rc_stat_s::rc_null_conn
990     ibd_rc_stat_s::rc_no_estab_conn
991     ibd_rc_stat_s::rc_act_close
992     ibd_rc_stat_s::rc_pas_close
993     ibd_rc_stat_s::rc_delay_ace_recycle
994     ibd_rc_stat_s::rc_act_close_simultaneous
995     ibd_rc_stat_s::rc_reset_cnt))
996 #endif
997 
998 int
999 _init()
1000 {
1001 	int status;
1002 
1003 	status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
1004 	    PAGESIZE), 0);
1005 	if (status != 0) {
1006 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
1007 		return (status);
1008 	}
1009 
1010 	mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL);
1011 
1012 	mac_init_ops(&ibd_dev_ops, "ibp");
1013 	status = mod_install(&ibd_modlinkage);
1014 	if (status != 0) {
1015 		DPRINT(10, "_init:failed in mod_install()");
1016 		ddi_soft_state_fini(&ibd_list);
1017 		mac_fini_ops(&ibd_dev_ops);
1018 		return (status);
1019 	}
1020 
1021 	mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
1022 	mutex_enter(&ibd_gstate.ig_mutex);
1023 	ibd_gstate.ig_ibt_hdl = NULL;
1024 	ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
1025 	ibd_gstate.ig_service_list = NULL;
1026 	mutex_exit(&ibd_gstate.ig_mutex);
1027 
1028 	if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list,
1029 	    DLDIOCCNT(ibd_dld_ioctl_list)) != 0) {
1030 		return (EIO);
1031 	}
1032 
1033 	ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr);
1034 
1035 #ifdef IBD_LOGGING
1036 	ibd_log_init();
1037 #endif
1038 	return (0);
1039 }
1040 
1041 int
1042 _info(struct modinfo *modinfop)
1043 {
1044 	return (mod_info(&ibd_modlinkage, modinfop));
1045 }
1046 
1047 int
1048 _fini()
1049 {
1050 	int status;
1051 
1052 	status = mod_remove(&ibd_modlinkage);
1053 	if (status != 0)
1054 		return (status);
1055 
1056 	ibt_unregister_part_attr_cb();
1057 
1058 	mac_fini_ops(&ibd_dev_ops);
1059 	mutex_destroy(&ibd_objlist_lock);
1060 	ddi_soft_state_fini(&ibd_list);
1061 	mutex_destroy(&ibd_gstate.ig_mutex);
1062 #ifdef IBD_LOGGING
1063 	ibd_log_fini();
1064 #endif
1065 	return (0);
1066 }
1067 
1068 /*
1069  * Convert the GID part of the mac address from network byte order
1070  * to host order.
1071  */
1072 static void
1073 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
1074 {
1075 	ib_sn_prefix_t nbopref;
1076 	ib_guid_t nboguid;
1077 
1078 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
1079 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
1080 	dgid->gid_prefix = b2h64(nbopref);
1081 	dgid->gid_guid = b2h64(nboguid);
1082 }
1083 
1084 /*
1085  * Create the IPoIB address in network byte order from host order inputs.
1086  */
1087 static void
1088 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
1089     ib_guid_t guid)
1090 {
1091 	ib_sn_prefix_t nbopref;
1092 	ib_guid_t nboguid;
1093 
1094 	mac->ipoib_qpn = htonl(qpn);
1095 	nbopref = h2b64(prefix);
1096 	nboguid = h2b64(guid);
1097 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
1098 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
1099 }
1100 
1101 /*
1102  * Send to the appropriate all-routers group when the IBA multicast group
1103  * does not exist, based on whether the target group is v4 or v6.
1104  */
1105 static boolean_t
1106 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
1107     ipoib_mac_t *rmac)
1108 {
1109 	boolean_t retval = B_TRUE;
1110 	uint32_t adjscope = state->id_scope << 16;
1111 	uint32_t topword;
1112 
1113 	/*
1114 	 * Copy the first 4 bytes in without assuming any alignment of
1115 	 * input mac address; this will have IPoIB signature, flags and
1116 	 * scope bits.
1117 	 */
1118 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
1119 	topword = ntohl(topword);
1120 
1121 	/*
1122 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
1123 	 */
1124 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
1125 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
1126 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
1127 		    ((uint32_t)(state->id_pkey << 16))),
1128 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
1129 	else
1130 		/*
1131 		 * Does not have proper bits in the mgid address.
1132 		 */
1133 		retval = B_FALSE;
1134 
1135 	return (retval);
1136 }
1137 
1138 /*
1139  * Membership states for different mcg's are tracked by two lists:
1140  * the "non" list is used for promiscuous mode, when all mcg traffic
1141  * needs to be inspected. This type of membership is never used for
1142  * transmission, so there can not be an AH in the active list
1143  * corresponding to a member in this list. This list does not need
1144  * any protection, since all operations are performed by the async
1145  * thread.
1146  *
1147  * "Full" and "SendOnly" membership is tracked using a single list,
1148  * the "full" list. This is because this single list can then be
1149  * searched during transmit to a multicast group (if an AH for the
1150  * mcg is not found in the active list), since at least one type
1151  * of membership must be present before initiating the transmit.
1152  * This list is also emptied during driver detach, since sendonly
1153  * membership acquired during transmit is dropped at detach time
1154  * along with ipv4 broadcast full membership. Insert/deletes to
1155  * this list are done only by the async thread, but it is also
1156  * searched in program context (see multicast disable case), thus
1157  * the id_mc_mutex protects the list. The driver detach path also
1158  * deconstructs the "full" list, but it ensures that the async
1159  * thread will not be accessing the list (by blocking out mcg
1160  * trap handling and making sure no more Tx reaping will happen).
1161  *
1162  * Currently, an IBA attach is done in the SendOnly case too,
1163  * although this is not required.
1164  */
/*
 * Accessors for the two per-interface mcg membership lists, id_mc_full
 * and id_mc_non. Arguments are parenthesized so that any expression may
 * be passed for "state", "mce" or "mgid".
 */
#define	IBD_MCACHE_INSERT_FULL(state, mce) \
	list_insert_head(&(state)->id_mc_full, (mce))
#define	IBD_MCACHE_INSERT_NON(state, mce) \
	list_insert_head(&(state)->id_mc_non, (mce))
#define	IBD_MCACHE_FIND_FULL(state, mgid) \
	ibd_mcache_find((mgid), &(state)->id_mc_full)
#define	IBD_MCACHE_FIND_NON(state, mgid) \
	ibd_mcache_find((mgid), &(state)->id_mc_non)
#define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
	list_remove(&(state)->id_mc_full, (mce))
#define	IBD_MCACHE_PULLOUT_NON(state, mce) \
	list_remove(&(state)->id_mc_non, (mce))
1177 
1178 static void *
1179 list_get_head(list_t *list)
1180 {
1181 	list_node_t *lhead = list_head(list);
1182 
1183 	if (lhead != NULL)
1184 		list_remove(list, lhead);
1185 	return (lhead);
1186 }
1187 
1188 /*
1189  * This is always guaranteed to be able to queue the work.
1190  */
1191 void
1192 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1193 {
1194 	/* Initialize request */
1195 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1196 	ptr->rq_op = op;
1197 
1198 	/*
1199 	 * Queue provided slot onto request pool.
1200 	 */
1201 	mutex_enter(&state->id_acache_req_lock);
1202 	list_insert_tail(&state->id_req_list, ptr);
1203 
1204 	/* Go, fetch, async thread */
1205 	cv_signal(&state->id_acache_req_cv);
1206 	mutex_exit(&state->id_acache_req_lock);
1207 }
1208 
/*
 * Main body of the per interface async thread.
 *
 * Loops forever pulling requests off state->id_req_list and dispatching
 * on rq_op. id_acache_req_lock is held while the list is examined and
 * while waiting on id_acache_req_cv; it is dropped for the duration of
 * each request's processing and re-acquired afterwards.
 *
 * After processing, the request is returned to state->id_req_kmc unless
 * the handler path cleared "ptr". The function only returns when an
 * IBD_ASYNC_EXIT request is dequeued.
 */
static void
ibd_async_work(ibd_state_t *state)
{
	ibd_req_t *ptr;
	callb_cpr_t cprinfo;

	mutex_enter(&state->id_acache_req_lock);
	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
	    callb_generic_cpr, "ibd_async_work");

	for (;;) {
		/* The request lock is held here on every iteration. */
		ptr = list_get_head(&state->id_req_list);
		if (ptr != NULL) {
			/* Process the request without holding the lock. */
			mutex_exit(&state->id_acache_req_lock);

			/*
			 * If we are in late hca initialization mode, do not
			 * process any other async request other than TRAP. TRAP
			 * is used for indicating creation of a broadcast group;
			 * in which case, we need to join/create the group.
			 */
			if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
			    (ptr->rq_op != IBD_ASYNC_TRAP)) {
				/* The request is dropped, not deferred. */
				goto free_req_and_continue;
			}

			/*
			 * Once we have done the operation, there is no
			 * guarantee the request slot is going to be valid,
			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
			 * TRAP).
			 *
			 * Perform the request.
			 */
			switch (ptr->rq_op) {
				case IBD_ASYNC_GETAH:
					ibd_async_acache(state, &ptr->rq_mac);
					break;
				case IBD_ASYNC_JOIN:
				case IBD_ASYNC_LEAVE:
					ibd_async_multicast(state,
					    ptr->rq_gid, ptr->rq_op);
					break;
				case IBD_ASYNC_PROMON:
					ibd_async_setprom(state);
					break;
				case IBD_ASYNC_PROMOFF:
					ibd_async_unsetprom(state);
					break;
				case IBD_ASYNC_REAP:
					ibd_async_reap_group(state,
					    ptr->rq_ptr, ptr->rq_gid,
					    IB_MC_JSTATE_FULL);
					/*
					 * The request buffer is embedded in
					 * the mce structure, so we do not
					 * need to free it here.
					 */
					ptr = NULL;
					break;
				case IBD_ASYNC_TRAP:
					ibd_async_trap(state, ptr);
					break;
				case IBD_ASYNC_SCHED:
					ibd_async_txsched(state);
					break;
				case IBD_ASYNC_LINK:
					ibd_async_link(state, ptr);
					break;
				case IBD_ASYNC_EXIT:
					/*
					 * Thread termination: re-take the
					 * request lock and leave through
					 * CALLB_CPR_EXIT(). The EXIT request
					 * itself is not freed on this path.
					 */
					mutex_enter(&state->id_acache_req_lock);
#ifndef __lock_lint
					CALLB_CPR_EXIT(&cprinfo);
#else
					mutex_exit(&state->id_acache_req_lock);
#endif
					return;
				case IBD_ASYNC_RC_TOO_BIG:
					ibd_async_rc_process_too_big(state,
					    ptr);
					break;
				case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
					ibd_async_rc_close_act_chan(state, ptr);
					break;
				case IBD_ASYNC_RC_RECYCLE_ACE:
					ibd_async_rc_recycle_ace(state, ptr);
					break;
			}
free_req_and_continue:
			/* ptr is NULL when the slot must not be freed here. */
			if (ptr != NULL)
				kmem_cache_free(state->id_req_kmc, ptr);

			/* Re-take the lock before looking at the list again. */
			mutex_enter(&state->id_acache_req_lock);
		} else {
#ifndef __lock_lint
			/*
			 * Nothing to do: wait till new request arrives.
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&state->id_acache_req_cv,
			    &state->id_acache_req_lock);
			CALLB_CPR_SAFE_END(&cprinfo,
			    &state->id_acache_req_lock);
#endif
		}
	}

	/*NOTREACHED*/
	_NOTE(NOT_REACHED)
}
1322 
1323 /*
1324  * Return when it is safe to queue requests to the async daemon; primarily
1325  * for subnet trap and async event handling. Disallow requests before the
1326  * daemon is created, and when interface deinitilization starts.
1327  */
1328 static boolean_t
1329 ibd_async_safe(ibd_state_t *state)
1330 {
1331 	mutex_enter(&state->id_trap_lock);
1332 	if (state->id_trap_stop) {
1333 		mutex_exit(&state->id_trap_lock);
1334 		return (B_FALSE);
1335 	}
1336 	state->id_trap_inprog++;
1337 	mutex_exit(&state->id_trap_lock);
1338 	return (B_TRUE);
1339 }
1340 
1341 /*
1342  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1343  * trap or event handling to complete to kill the async thread and deconstruct
1344  * the mcg/ace list.
1345  */
1346 static void
1347 ibd_async_done(ibd_state_t *state)
1348 {
1349 	mutex_enter(&state->id_trap_lock);
1350 	if (--state->id_trap_inprog == 0)
1351 		cv_signal(&state->id_trap_cv);
1352 	mutex_exit(&state->id_trap_lock);
1353 }
1354 
/*
 * Hash functions:
 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
 * ibd_hash_key_cmp: Compares two keys; returns 0 if they are equal,
 * 1 otherwise.
 * These operate on mac addresses input into ibd_send, but there is no
 * guarantee on the alignment of the ipoib_mac_t structure.
 */
1362 /*ARGSUSED*/
1363 static uint_t
1364 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1365 {
1366 	ulong_t ptraddr = (ulong_t)key;
1367 	uint_t hval;
1368 
1369 	/*
1370 	 * If the input address is 4 byte aligned, we can just dereference
1371 	 * it. This is most common, since IP will send in a 4 byte aligned
1372 	 * IP header, which implies the 24 byte IPoIB psuedo header will be
1373 	 * 4 byte aligned too.
1374 	 */
1375 	if ((ptraddr & 3) == 0)
1376 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1377 
1378 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1379 	return (hval);
1380 }
1381 
1382 static int
1383 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1384 {
1385 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1386 		return (0);
1387 	else
1388 		return (1);
1389 }
1390 
1391 /*
1392  * Initialize all the per interface caches and lists; AH cache,
1393  * MCG list etc.
1394  */
1395 static int
1396 ibd_acache_init(ibd_state_t *state)
1397 {
1398 	ibd_ace_t *ce;
1399 	int i;
1400 
1401 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1402 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1403 	mutex_enter(&state->id_ac_mutex);
1404 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1405 	    offsetof(ibd_ace_t, ac_list));
1406 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1407 	    offsetof(ibd_ace_t, ac_list));
1408 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1409 	    state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
1410 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1411 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1412 	    offsetof(ibd_mce_t, mc_list));
1413 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1414 	    offsetof(ibd_mce_t, mc_list));
1415 	state->id_ac_hot_ace = NULL;
1416 
1417 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1418 	    state->id_num_ah, KM_SLEEP);
1419 	for (i = 0; i < state->id_num_ah; i++, ce++) {
1420 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1421 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1422 			mutex_exit(&state->id_ac_mutex);
1423 			ibd_acache_fini(state);
1424 			return (DDI_FAILURE);
1425 		} else {
1426 			CLEAR_REFCYCLE(ce);
1427 			ce->ac_mce = NULL;
1428 			mutex_init(&ce->tx_too_big_mutex, NULL,
1429 			    MUTEX_DRIVER, NULL);
1430 			IBD_ACACHE_INSERT_FREE(state, ce);
1431 		}
1432 	}
1433 	mutex_exit(&state->id_ac_mutex);
1434 	return (DDI_SUCCESS);
1435 }
1436 
1437 static void
1438 ibd_acache_fini(ibd_state_t *state)
1439 {
1440 	ibd_ace_t *ptr;
1441 
1442 	mutex_enter(&state->id_ac_mutex);
1443 
1444 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1445 		ASSERT(GET_REF(ptr) == 0);
1446 		mutex_destroy(&ptr->tx_too_big_mutex);
1447 		(void) ibt_free_ud_dest(ptr->ac_dest);
1448 	}
1449 
1450 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1451 		ASSERT(GET_REF(ptr) == 0);
1452 		mutex_destroy(&ptr->tx_too_big_mutex);
1453 		(void) ibt_free_ud_dest(ptr->ac_dest);
1454 	}
1455 
1456 	list_destroy(&state->id_ah_free);
1457 	list_destroy(&state->id_ah_active);
1458 	list_destroy(&state->id_mc_full);
1459 	list_destroy(&state->id_mc_non);
1460 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah);
1461 	mutex_exit(&state->id_ac_mutex);
1462 	mutex_destroy(&state->id_ac_mutex);
1463 	mutex_destroy(&state->id_mc_mutex);
1464 }
1465 
1466 /*
1467  * Search AH active hash list for a cached path to input destination.
1468  * If we are "just looking", hold == F. When we are in the Tx path,
1469  * we set hold == T to grab a reference on the AH so that it can not
1470  * be recycled to a new destination while the Tx request is posted.
1471  */
1472 ibd_ace_t *
1473 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1474 {
1475 	ibd_ace_t *ptr;
1476 
1477 	ASSERT(mutex_owned(&state->id_ac_mutex));
1478 
1479 	/*
1480 	 * Do hash search.
1481 	 */
1482 	if (mod_hash_find(state->id_ah_active_hash,
1483 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1484 		if (hold)
1485 			INC_REF(ptr, num);
1486 		return (ptr);
1487 	}
1488 	return (NULL);
1489 }
1490 
1491 /*
1492  * This is called by the tx side; if an initialized AH is found in
1493  * the active list, it is locked down and can be used; if no entry
1494  * is found, an async request is queued to do path resolution.
1495  */
1496 static ibd_ace_t *
1497 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1498 {
1499 	ibd_ace_t *ptr;
1500 	ibd_req_t *req;
1501 
1502 	/*
1503 	 * Only attempt to print when we can; in the mdt pattr case, the
1504 	 * address is not aligned properly.
1505 	 */
1506 	if (((ulong_t)mac & 3) == 0) {
1507 		DPRINT(4,
1508 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1509 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1510 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1511 		    htonl(mac->ipoib_gidsuff[1]));
1512 	}
1513 
1514 	mutex_enter(&state->id_ac_mutex);
1515 
1516 	if (((ptr = state->id_ac_hot_ace) != NULL) &&
1517 	    (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1518 		INC_REF(ptr, numwqe);
1519 		mutex_exit(&state->id_ac_mutex);
1520 		return (ptr);
1521 	}
1522 	if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1523 		state->id_ac_hot_ace = ptr;
1524 		mutex_exit(&state->id_ac_mutex);
1525 		return (ptr);
1526 	}
1527 
1528 	/*
1529 	 * Implementation of a single outstanding async request; if
1530 	 * the operation is not started yet, queue a request and move
1531 	 * to ongoing state. Remember in id_ah_addr for which address
1532 	 * we are queueing the request, in case we need to flag an error;
1533 	 * Any further requests, for the same or different address, until
1534 	 * the operation completes, is sent back to GLDv3 to be retried.
1535 	 * The async thread will update id_ah_op with an error indication
1536 	 * or will set it to indicate the next look up can start; either
1537 	 * way, it will mac_tx_update() so that all blocked requests come
1538 	 * back here.
1539 	 */
1540 	*err = EAGAIN;
1541 	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1542 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1543 		if (req != NULL) {
1544 			/*
1545 			 * We did not even find the entry; queue a request
1546 			 * for it.
1547 			 */
1548 			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1549 			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1550 			state->id_ah_op = IBD_OP_ONGOING;
1551 			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1552 		}
1553 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1554 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1555 		/*
1556 		 * Check the status of the pathrecord lookup request
1557 		 * we had queued before.
1558 		 */
1559 		if (state->id_ah_op == IBD_OP_ERRORED) {
1560 			*err = EFAULT;
1561 			state->id_ah_error++;
1562 		} else {
1563 			/*
1564 			 * IBD_OP_ROUTERED case: We need to send to the
1565 			 * all-router MCG. If we can find the AH for
1566 			 * the mcg, the Tx will be attempted. If we
1567 			 * do not find the AH, we return NORESOURCES
1568 			 * to retry.
1569 			 */
1570 			ipoib_mac_t routermac;
1571 
1572 			(void) ibd_get_allroutergroup(state, mac, &routermac);
1573 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
1574 			    numwqe);
1575 		}
1576 		state->id_ah_op = IBD_OP_NOTSTARTED;
1577 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1578 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1579 		/*
1580 		 * This case can happen when we get a higher band
1581 		 * packet. The easiest way is to reset the state machine
1582 		 * to accommodate the higher priority packet.
1583 		 */
1584 		state->id_ah_op = IBD_OP_NOTSTARTED;
1585 	}
1586 	mutex_exit(&state->id_ac_mutex);
1587 
1588 	return (ptr);
1589 }
1590 
1591 /*
1592  * Grab a not-currently-in-use AH/PathRecord from the active
1593  * list to recycle to a new destination. Only the async thread
1594  * executes this code.
1595  */
1596 static ibd_ace_t *
1597 ibd_acache_get_unref(ibd_state_t *state)
1598 {
1599 	ibd_ace_t *ptr = list_tail(&state->id_ah_active);
1600 	boolean_t try_rc_chan_recycle = B_FALSE;
1601 
1602 	ASSERT(mutex_owned(&state->id_ac_mutex));
1603 
1604 	/*
1605 	 * Do plain linear search.
1606 	 */
1607 	while (ptr != NULL) {
1608 		/*
1609 		 * Note that it is possible that the "cycle" bit
1610 		 * is set on the AH w/o any reference count. The
1611 		 * mcg must have been deleted, and the tx cleanup
1612 		 * just decremented the reference count to 0, but
1613 		 * hasn't gotten around to grabbing the id_ac_mutex
1614 		 * to move the AH into the free list.
1615 		 */
1616 		if (GET_REF(ptr) == 0) {
1617 			if (ptr->ac_chan != NULL) {
1618 				ASSERT(state->id_enable_rc == B_TRUE);
1619 				if (!try_rc_chan_recycle) {
1620 					try_rc_chan_recycle = B_TRUE;
1621 					ibd_rc_signal_ace_recycle(state, ptr);
1622 				}
1623 			} else {
1624 				IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1625 				break;
1626 			}
1627 		}
1628 		ptr = list_prev(&state->id_ah_active, ptr);
1629 	}
1630 	return (ptr);
1631 }
1632 
1633 /*
1634  * Invoked to clean up AH from active list in case of multicast
1635  * disable and to handle sendonly memberships during mcg traps.
1636  * And for port up processing for multicast and unicast AHs.
1637  * Normally, the AH is taken off the active list, and put into
1638  * the free list to be recycled for a new destination. In case
1639  * Tx requests on the AH have not completed yet, the AH is marked
1640  * for reaping (which will put the AH on the free list) once the Tx's
1641  * complete; in this case, depending on the "force" input, we take
1642  * out the AH from the active list right now, or leave it also for
1643  * the reap operation. Returns TRUE if the AH is taken off the active
1644  * list (and either put into the free list right now, or arranged for
1645  * later), FALSE otherwise.
1646  */
1647 boolean_t
1648 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1649 {
1650 	ibd_ace_t *acactive;
1651 	boolean_t ret = B_TRUE;
1652 
1653 	ASSERT(mutex_owned(&state->id_ac_mutex));
1654 
1655 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1656 
1657 		/*
1658 		 * Note that the AH might already have the cycle bit set
1659 		 * on it; this might happen if sequences of multicast
1660 		 * enables and disables are coming so fast, that posted
1661 		 * Tx's to the mcg have not completed yet, and the cycle
1662 		 * bit is set successively by each multicast disable.
1663 		 */
1664 		if (SET_CYCLE_IF_REF(acactive)) {
1665 			if (!force) {
1666 				/*
1667 				 * The ace is kept on the active list, further
1668 				 * Tx's can still grab a reference on it; the
1669 				 * ace is reaped when all pending Tx's
1670 				 * referencing the AH complete.
1671 				 */
1672 				ret = B_FALSE;
1673 			} else {
1674 				/*
1675 				 * In the mcg trap case, we always pull the
1676 				 * AH from the active list. And also the port
1677 				 * up multi/unicast case.
1678 				 */
1679 				ASSERT(acactive->ac_chan == NULL);
1680 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1681 				acactive->ac_mce = NULL;
1682 			}
1683 		} else {
1684 			/*
1685 			 * Determined the ref count is 0, thus reclaim
1686 			 * immediately after pulling out the ace from
1687 			 * the active list.
1688 			 */
1689 			ASSERT(acactive->ac_chan == NULL);
1690 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1691 			acactive->ac_mce = NULL;
1692 			IBD_ACACHE_INSERT_FREE(state, acactive);
1693 		}
1694 
1695 	}
1696 	return (ret);
1697 }
1698 
1699 /*
1700  * Helper function for async path record lookup. If we are trying to
1701  * Tx to a MCG, check our membership, possibly trying to join the
1702  * group if required. If that fails, try to send the packet to the
1703  * all router group (indicated by the redirect output), pointing
1704  * the input mac address to the router mcg address.
1705  */
1706 static ibd_mce_t *
1707 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1708 {
1709 	ib_gid_t mgid;
1710 	ibd_mce_t *mce;
1711 	ipoib_mac_t routermac;
1712 
1713 	*redirect = B_FALSE;
1714 	ibd_n2h_gid(mac, &mgid);
1715 
1716 	/*
1717 	 * Check the FullMember+SendOnlyNonMember list.
1718 	 * Since we are the only one who manipulates the
1719 	 * id_mc_full list, no locks are needed.
1720 	 */
1721 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1722 	if (mce != NULL) {
1723 		DPRINT(4, "ibd_async_mcache : already joined to group");
1724 		return (mce);
1725 	}
1726 
1727 	/*
1728 	 * Not found; try to join(SendOnlyNonMember) and attach.
1729 	 */
1730 	DPRINT(4, "ibd_async_mcache : not joined to group");
1731 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1732 	    NULL) {
1733 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1734 		return (mce);
1735 	}
1736 
1737 	/*
1738 	 * MCGroup not present; try to join the all-router group. If
1739 	 * any of the following steps succeed, we will be redirecting
1740 	 * to the all router group.
1741 	 */
1742 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1743 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1744 		return (NULL);
1745 	*redirect = B_TRUE;
1746 	ibd_n2h_gid(&routermac, &mgid);
1747 	bcopy(&routermac, mac, IPOIB_ADDRL);
1748 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1749 	    mgid.gid_prefix, mgid.gid_guid);
1750 
1751 	/*
1752 	 * Are we already joined to the router group?
1753 	 */
1754 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1755 		DPRINT(4, "ibd_async_mcache : using already joined router"
1756 		    "group\n");
1757 		return (mce);
1758 	}
1759 
1760 	/*
1761 	 * Can we join(SendOnlyNonMember) the router group?
1762 	 */
1763 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1764 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1765 	    NULL) {
1766 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1767 		return (mce);
1768 	}
1769 
1770 	return (NULL);
1771 }
1772 
1773 /*
1774  * Async path record lookup code.
1775  */
1776 static void
1777 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1778 {
1779 	ibd_ace_t *ce;
1780 	ibd_mce_t *mce = NULL;
1781 	ibt_path_attr_t path_attr;
1782 	ibt_path_info_t path_info;
1783 	ib_gid_t destgid;
1784 	char ret = IBD_OP_NOTSTARTED;
1785 
1786 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1787 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1788 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1789 	    htonl(mac->ipoib_gidsuff[1]));
1790 
1791 	/*
1792 	 * Check whether we are trying to transmit to a MCG.
1793 	 * In that case, we need to make sure we are a member of
1794 	 * the MCG.
1795 	 */
1796 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1797 		boolean_t redirected;
1798 
1799 		/*
1800 		 * If we can not find or join the group or even
1801 		 * redirect, error out.
1802 		 */
1803 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1804 		    NULL) {
1805 			state->id_ah_op = IBD_OP_ERRORED;
1806 			return;
1807 		}
1808 
1809 		/*
1810 		 * If we got redirected, we need to determine whether
1811 		 * the AH for the new mcg is in the cache already, and
1812 		 * not pull it in then; otherwise proceed to get the
1813 		 * path for the new mcg. There is no guarantee that
1814 		 * if the AH is currently in the cache, it will still be
1815 		 * there when we look in ibd_acache_lookup(), but that's
1816 		 * okay, we will come back here.
1817 		 */
1818 		if (redirected) {
1819 			ret = IBD_OP_ROUTERED;
1820 			DPRINT(4, "ibd_async_acache :  redirected to "
1821 			    "%08X:%08X:%08X:%08X:%08X",
1822 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1823 			    htonl(mac->ipoib_gidpref[1]),
1824 			    htonl(mac->ipoib_gidsuff[0]),
1825 			    htonl(mac->ipoib_gidsuff[1]));
1826 
1827 			mutex_enter(&state->id_ac_mutex);
1828 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1829 				state->id_ah_op = IBD_OP_ROUTERED;
1830 				mutex_exit(&state->id_ac_mutex);
1831 				DPRINT(4, "ibd_async_acache : router AH found");
1832 				return;
1833 			}
1834 			mutex_exit(&state->id_ac_mutex);
1835 		}
1836 	}
1837 
1838 	/*
1839 	 * Get an AH from the free list.
1840 	 */
1841 	mutex_enter(&state->id_ac_mutex);
1842 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1843 		/*
1844 		 * No free ones; try to grab an unreferenced active
1845 		 * one. Maybe we need to make the active list LRU,
1846 		 * but that will create more work for Tx callbacks.
1847 		 * Is there a way of not having to pull out the
1848 		 * entry from the active list, but just indicate it
1849 		 * is being recycled? Yes, but that creates one more
1850 		 * check in the fast lookup path.
1851 		 */
1852 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1853 			/*
1854 			 * Pretty serious shortage now.
1855 			 */
1856 			state->id_ah_op = IBD_OP_NOTSTARTED;
1857 			mutex_exit(&state->id_ac_mutex);
1858 			DPRINT(10, "ibd_async_acache : failed to find AH "
1859 			    "slot\n");
1860 			return;
1861 		}
1862 		/*
1863 		 * We could check whether ac_mce points to a SendOnly
1864 		 * member and drop that membership now. Or do it lazily
1865 		 * at detach time.
1866 		 */
1867 		ce->ac_mce = NULL;
1868 	}
1869 	mutex_exit(&state->id_ac_mutex);
1870 	ASSERT(ce->ac_mce == NULL);
1871 
1872 	/*
1873 	 * Update the entry.
1874 	 */
1875 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1876 
1877 	bzero(&path_info, sizeof (path_info));
1878 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1879 	path_attr.pa_sgid = state->id_sgid;
1880 	path_attr.pa_num_dgids = 1;
1881 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1882 	path_attr.pa_dgids = &destgid;
1883 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1884 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
1885 	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
1886 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1887 		goto error;
1888 	}
1889 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1890 	    ntohl(ce->ac_mac.ipoib_qpn),
1891 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1892 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1893 		goto error;
1894 	}
1895 
1896 	/*
1897 	 * mce is set whenever an AH is being associated with a
1898 	 * MCG; this will come in handy when we leave the MCG. The
1899 	 * lock protects Tx fastpath from scanning the active list.
1900 	 */
1901 	if (mce != NULL)
1902 		ce->ac_mce = mce;
1903 
1904 	/*
1905 	 * initiate a RC mode connection for unicast address
1906 	 */
1907 	if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
1908 	    (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
1909 		ASSERT(ce->ac_chan == NULL);
1910 		DPRINT(10, "ibd_async_acache: call "
1911 		    "ibd_rc_try_connect(ace=%p)", ce);
1912 		ibd_rc_try_connect(state, ce, &path_info);
1913 		if (ce->ac_chan == NULL) {
1914 			DPRINT(10, "ibd_async_acache: fail to setup RC"
1915 			    " channel");
1916 			state->rc_conn_fail++;
1917 			goto error;
1918 		}
1919 	}
1920 
1921 	mutex_enter(&state->id_ac_mutex);
1922 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1923 	state->id_ah_op = ret;
1924 	mutex_exit(&state->id_ac_mutex);
1925 	return;
1926 error:
1927 	/*
1928 	 * We might want to drop SendOnly membership here if we
1929 	 * joined above. The lock protects Tx callbacks inserting
1930 	 * into the free list.
1931 	 */
1932 	mutex_enter(&state->id_ac_mutex);
1933 	state->id_ah_op = IBD_OP_ERRORED;
1934 	IBD_ACACHE_INSERT_FREE(state, ce);
1935 	mutex_exit(&state->id_ac_mutex);
1936 }
1937 
1938 /*
1939  * While restoring port's presence on the subnet on a port up, it is possible
1940  * that the port goes down again.
1941  */
1942 static void
1943 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1944 {
1945 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1946 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1947 	    LINK_STATE_UP;
1948 	ibd_mce_t *mce, *pmce;
1949 	ibd_ace_t *ace, *pace;
1950 
1951 	DPRINT(10, "ibd_async_link(): %d", opcode);
1952 
1953 	/*
1954 	 * On a link up, revalidate the link speed/width. No point doing
1955 	 * this on a link down, since we will be unable to do SA operations,
1956 	 * defaulting to the lowest speed. Also notice that we update our
1957 	 * notion of speed before calling mac_link_update(), which will do
1958 	 * necessary higher level notifications for speed changes.
1959 	 */
1960 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1961 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1962 		state->id_link_speed = ibd_get_portspeed(state);
1963 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1964 	}
1965 
1966 	/*
1967 	 * Do all the work required to establish our presence on
1968 	 * the subnet.
1969 	 */
1970 	if (opcode == IBD_LINK_UP_ABSENT) {
1971 		/*
1972 		 * If in promiscuous mode ...
1973 		 */
1974 		if (state->id_prom_op == IBD_OP_COMPLETED) {
1975 			/*
1976 			 * Drop all nonmembership.
1977 			 */
1978 			ibd_async_unsetprom(state);
1979 
1980 			/*
1981 			 * Then, try to regain nonmembership to all mcg's.
1982 			 */
1983 			ibd_async_setprom(state);
1984 
1985 		}
1986 
1987 		/*
1988 		 * Drop all sendonly membership (which also gets rid of the
1989 		 * AHs); try to reacquire all full membership.
1990 		 */
1991 		mce = list_head(&state->id_mc_full);
1992 		while ((pmce = mce) != NULL) {
1993 			mce = list_next(&state->id_mc_full, mce);
1994 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1995 				ibd_leave_group(state,
1996 				    pmce->mc_info.mc_adds_vect.av_dgid,
1997 				    IB_MC_JSTATE_SEND_ONLY_NON);
1998 			else
1999 				ibd_reacquire_group(state, pmce);
2000 		}
2001 
2002 		/*
2003 		 * Recycle all active AHs to free list (and if there are
2004 		 * pending posts, make sure they will go into the free list
2005 		 * once the Tx's complete). Grab the lock to prevent
2006 		 * concurrent Tx's as well as Tx cleanups.
2007 		 */
2008 		mutex_enter(&state->id_ac_mutex);
2009 		ace = list_head(&state->id_ah_active);
2010 		while ((pace = ace) != NULL) {
2011 			boolean_t cycled;
2012 
2013 			ace = list_next(&state->id_ah_active, ace);
2014 			mce = pace->ac_mce;
2015 			if (pace->ac_chan != NULL) {
2016 				ASSERT(mce == NULL);
2017 				ASSERT(state->id_enable_rc == B_TRUE);
2018 				if (pace->ac_chan->chan_state ==
2019 				    IBD_RC_STATE_ACT_ESTAB) {
2020 					INC_REF(pace, 1);
2021 					IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
2022 					pace->ac_chan->chan_state =
2023 					    IBD_RC_STATE_ACT_CLOSING;
2024 					ibd_rc_signal_act_close(state, pace);
2025 				} else {
2026 					state->rc_act_close_simultaneous++;
2027 					DPRINT(40, "ibd_async_link: other "
2028 					    "thread is closing it, ace=%p, "
2029 					    "ac_chan=%p, chan_state=%d",
2030 					    pace, pace->ac_chan,
2031 					    pace->ac_chan->chan_state);
2032 				}
2033 			} else {
2034 				cycled = ibd_acache_recycle(state,
2035 				    &pace->ac_mac, B_TRUE);
2036 			}
2037 			/*
2038 			 * If this is for an mcg, it must be for a fullmember,
2039 			 * since we got rid of send-only members above when
2040 			 * processing the mce list.
2041 			 */
2042 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
2043 			    IB_MC_JSTATE_FULL)));
2044 
2045 			/*
2046 			 * Check if the fullmember mce needs to be torn down,
2047 			 * ie whether the DLPI disable has already been done.
2048 			 * If so, do some of the work of tx_cleanup, namely
2049 			 * causing leave (which will fail), detach and
2050 			 * mce-freeing. tx_cleanup will put the AH into free
2051 			 * list. The reason to duplicate some of this
2052 			 * tx_cleanup work is because we want to delete the
2053 			 * AH right now instead of waiting for tx_cleanup, to
2054 			 * force subsequent Tx's to reacquire an AH.
2055 			 */
2056 			if ((mce != NULL) && (mce->mc_fullreap))
2057 				ibd_async_reap_group(state, mce,
2058 				    mce->mc_info.mc_adds_vect.av_dgid,
2059 				    mce->mc_jstate);
2060 		}
2061 		mutex_exit(&state->id_ac_mutex);
2062 	}
2063 
2064 	/*
2065 	 * mac handle is guaranteed to exist since driver does ibt_close_hca()
2066 	 * (which stops further events from being delivered) before
2067 	 * mac_unregister(). At this point, it is guaranteed that mac_register
2068 	 * has already been done.
2069 	 */
2070 	mutex_enter(&state->id_link_mutex);
2071 	state->id_link_state = lstate;
2072 	mac_link_update(state->id_mh, lstate);
2073 	mutex_exit(&state->id_link_mutex);
2074 
2075 	ibd_async_done(state);
2076 }
2077 
2078 /*
2079  * Check the pkey table to see if we can find the pkey we're looking for.
2080  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
2081  * failure.
2082  */
2083 static int
2084 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
2085     uint16_t *pkix)
2086 {
2087 	uint16_t ndx;
2088 
2089 	ASSERT(pkix != NULL);
2090 
2091 	for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
2092 		if (pkey_tbl[ndx] == pkey) {
2093 			*pkix = ndx;
2094 			return (0);
2095 		}
2096 	}
2097 	return (-1);
2098 }
2099 
2100 /*
2101  * Late HCA Initialization:
2102  * If plumb had succeeded without the availability of an active port or the
2103  * pkey, and either of their availability is now being indicated via PORT_UP
2104  * or PORT_CHANGE respectively, try a start of the interface.
2105  *
2106  * Normal Operation:
2107  * When the link is notified up, we need to do a few things, based
2108  * on the port's current p_init_type_reply claiming a reinit has been
2109  * done or not. The reinit steps are:
2110  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2111  *    the old Pkey and GID0 are correct.
2112  * 2. Register for mcg traps (already done by ibmf).
2113  * 3. If PreservePresenceReply indicates the SM has restored port's presence
2114  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
2115  * 4. Give up all sendonly memberships.
2116  * 5. Acquire all full memberships.
2117  * 6. In promiscuous mode, acquire all non memberships.
2118  * 7. Recycle all AHs to free list.
2119  */
2120 static void
2121 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2122 {
2123 	ibt_hca_portinfo_t *port_infop = NULL;
2124 	ibt_status_t ibt_status;
2125 	uint_t psize, port_infosz;
2126 	ibd_link_op_t opcode;
2127 	ibd_req_t *req;
2128 	link_state_t new_link_state = LINK_STATE_UP;
2129 	uint8_t itreply;
2130 	uint16_t pkix;
2131 	int ret;
2132 
2133 	/*
2134 	 * Let's not race with a plumb or an unplumb; if we detect a
2135 	 * pkey relocation event later on here, we may have to restart.
2136 	 */
2137 	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2138 
2139 	mutex_enter(&state->id_link_mutex);
2140 
2141 	/*
2142 	 * If the link state is unknown, a plumb has not yet been attempted
2143 	 * on the interface. Nothing to do.
2144 	 */
2145 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
2146 		mutex_exit(&state->id_link_mutex);
2147 		goto link_mod_return;
2148 	}
2149 
2150 	/*
2151 	 * If link state is down because of plumb failure, and we are not in
2152 	 * late HCA init, and we were not successfully plumbed, nothing to do.
2153 	 */
2154 	if ((state->id_link_state == LINK_STATE_DOWN) &&
2155 	    ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) &&
2156 	    ((state->id_mac_state & IBD_DRV_STARTED) == 0)) {
2157 		mutex_exit(&state->id_link_mutex);
2158 		goto link_mod_return;
2159 	}
2160 
2161 	/*
2162 	 * If this routine was called in response to a port down event,
2163 	 * we just need to see if this should be informed.
2164 	 */
2165 	if (code == IBT_ERROR_PORT_DOWN) {
2166 		new_link_state = LINK_STATE_DOWN;
2167 		goto update_link_state;
2168 	}
2169 
2170 	/*
2171 	 * If it's not a port down event we've received, try to get the port
2172 	 * attributes first. If we fail here, the port is as good as down.
2173 	 * Otherwise, if the link went down by the time the handler gets
2174 	 * here, give up - we cannot even validate the pkey/gid since those
2175 	 * are not valid and this is as bad as a port down anyway.
2176 	 */
2177 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2178 	    &port_infop, &psize, &port_infosz);
2179 	if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2180 	    (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2181 		new_link_state = LINK_STATE_DOWN;
2182 		goto update_link_state;
2183 	}
2184 
2185 	/*
2186 	 * If in the previous attempt, the pkey was not found either due to the
2187 	 * port state being down, or due to it's absence in the pkey table,
2188 	 * look for it now and try to start the interface.
2189 	 */
2190 	if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) {
2191 		mutex_exit(&state->id_link_mutex);
2192 		if ((ret = ibd_start(state)) != 0) {
2193 			DPRINT(10, "ibd_linkmod: cannot start from late HCA "
2194 			    "init, ret=%d", ret);
2195 		}
2196 		ibt_free_portinfo(port_infop, port_infosz);
2197 		goto link_mod_return;
2198 	}
2199 
2200 	/*
2201 	 * Check the SM InitTypeReply flags. If both NoLoadReply and
2202 	 * PreserveContentReply are 0, we don't know anything about the
2203 	 * data loaded into the port attributes, so we need to verify
2204 	 * if gid0 and pkey are still valid.
2205 	 */
2206 	itreply = port_infop->p_init_type_reply;
2207 	if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2208 	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2209 		/*
2210 		 * Check to see if the subnet part of GID0 has changed. If
2211 		 * not, check the simple case first to see if the pkey
2212 		 * index is the same as before; finally check to see if the
2213 		 * pkey has been relocated to a different index in the table.
2214 		 */
2215 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
2216 		if (bcmp(port_infop->p_sgid_tbl,
2217 		    &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2218 
2219 			new_link_state = LINK_STATE_DOWN;
2220 
2221 		} else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2222 		    state->id_pkey) {
2223 
2224 			new_link_state = LINK_STATE_UP;
2225 
2226 		} else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2227 		    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2228 
2229 			ibt_free_portinfo(port_infop, port_infosz);
2230 			mutex_exit(&state->id_link_mutex);
2231 
2232 			/*
2233 			 * Currently a restart is required if our pkey has moved
2234 			 * in the pkey table. If we get the ibt_recycle_ud() to
2235 			 * work as documented (expected), we may be able to
2236 			 * avoid a complete restart.  Note that we've already
2237 			 * marked both the start and stop 'in-progress' flags,
2238 			 * so it is ok to go ahead and do this restart.
2239 			 */
2240 			(void) ibd_undo_start(state, LINK_STATE_DOWN);
2241 			if ((ret = ibd_start(state)) != 0) {
2242 				DPRINT(10, "ibd_restart: cannot restart, "
2243 				    "ret=%d", ret);
2244 			}
2245 
2246 			goto link_mod_return;
2247 		} else {
2248 			new_link_state = LINK_STATE_DOWN;
2249 		}
2250 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
2251 	}
2252 
2253 update_link_state:
2254 	if (port_infop) {
2255 		ibt_free_portinfo(port_infop, port_infosz);
2256 	}
2257 
2258 	/*
2259 	 * If we're reporting a link up, check InitTypeReply to see if
2260 	 * the SM has ensured that the port's presence in mcg, traps,
2261 	 * etc. is intact.
2262 	 */
2263 	if (new_link_state == LINK_STATE_DOWN) {
2264 		opcode = IBD_LINK_DOWN;
2265 	} else {
2266 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2267 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2268 			opcode = IBD_LINK_UP;
2269 		} else {
2270 			opcode = IBD_LINK_UP_ABSENT;
2271 		}
2272 	}
2273 
2274 	/*
2275 	 * If the old state is the same as the new state, and the SM indicated
2276 	 * no change in the port parameters, nothing to do.
2277 	 */
2278 	if ((state->id_link_state == new_link_state) && (opcode !=
2279 	    IBD_LINK_UP_ABSENT)) {
2280 		mutex_exit(&state->id_link_mutex);
2281 		goto link_mod_return;
2282 	}
2283 
2284 	/*
2285 	 * Ok, so there was a link state change; see if it's safe to ask
2286 	 * the async thread to do the work
2287 	 */
2288 	if (!ibd_async_safe(state)) {
2289 		state->id_link_state = new_link_state;
2290 		mutex_exit(&state->id_link_mutex);
2291 		goto link_mod_return;
2292 	}
2293 
2294 	mutex_exit(&state->id_link_mutex);
2295 
2296 	/*
2297 	 * Queue up a request for ibd_async_link() to handle this link
2298 	 * state change event
2299 	 */
2300 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2301 	req->rq_ptr = (void *)opcode;
2302 	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2303 
2304 link_mod_return:
2305 	ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2306 }
2307 
2308 /*
2309  * For the port up/down events, IBTL guarantees there will not be concurrent
2310  * invocations of the handler. IBTL might coalesce link transition events,
2311  * and not invoke the handler for _each_ up/down transition, but it will
2312  * invoke the handler with last known state
2313  */
2314 static void
2315 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2316     ibt_async_code_t code, ibt_async_event_t *event)
2317 {
2318 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2319 
2320 	switch (code) {
2321 	case IBT_ERROR_CATASTROPHIC_CHAN:
2322 		ibd_print_warn(state, "catastrophic channel error");
2323 		break;
2324 	case IBT_ERROR_CQ:
2325 		ibd_print_warn(state, "completion queue error");
2326 		break;
2327 	case IBT_PORT_CHANGE_EVENT:
2328 		/*
2329 		 * Events will be delivered to all instances that have
2330 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2331 		 * Only need to do work for our port; IBTF will deliver
2332 		 * events for other ports on the hca we have ibt_open_hca'ed
2333 		 * too. Note that id_port is initialized in ibd_attach()
2334 		 * before we do an ibt_open_hca() in ibd_attach().
2335 		 */
2336 		ASSERT(state->id_hca_hdl == hca_hdl);
2337 		if (state->id_port != event->ev_port)
2338 			break;
2339 
2340 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2341 		    IBT_PORT_CHANGE_PKEY) {
2342 			ibd_link_mod(state, code);
2343 		}
2344 		break;
2345 	case IBT_ERROR_PORT_DOWN:
2346 	case IBT_CLNT_REREG_EVENT:
2347 	case IBT_EVENT_PORT_UP:
2348 		/*
2349 		 * Events will be delivered to all instances that have
2350 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2351 		 * Only need to do work for our port; IBTF will deliver
2352 		 * events for other ports on the hca we have ibt_open_hca'ed
2353 		 * too. Note that id_port is initialized in ibd_attach()
2354 		 * before we do an ibt_open_hca() in ibd_attach().
2355 		 */
2356 		ASSERT(state->id_hca_hdl == hca_hdl);
2357 		if (state->id_port != event->ev_port)
2358 			break;
2359 
2360 		ibd_link_mod(state, code);
2361 		break;
2362 
2363 	case IBT_HCA_ATTACH_EVENT:
2364 	case IBT_HCA_DETACH_EVENT:
2365 		/*
2366 		 * When a new card is plugged to the system, attach_event is
2367 		 * invoked. Additionally, a cfgadm needs to be run to make the
2368 		 * card known to the system, and an ifconfig needs to be run to
2369 		 * plumb up any ibd interfaces on the card. In the case of card
2370 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2371 		 * unplumb the ibd interfaces on the card; when the card is
2372 		 * actually unplugged, the detach_event is invoked;
2373 		 * additionally, if any ibd instances are still active on the
2374 		 * card (eg there were no associated RCM scripts), driver's
2375 		 * detach routine is invoked.
2376 		 */
2377 		break;
2378 	default:
2379 		break;
2380 	}
2381 }
2382 
2383 static int
2384 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2385 {
2386 	mac_register_t *macp;
2387 	int ret;
2388 
2389 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2390 		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2391 		return (DDI_FAILURE);
2392 	}
2393 
2394 	/*
2395 	 * Note that when we register with mac during attach, we don't
2396 	 * have the id_macaddr yet, so we'll simply be registering a
2397 	 * zero macaddr that we'll overwrite later during plumb (in
2398 	 * ibd_m_start()). Similar is the case with id_mtu - we'll
2399 	 * update the mac layer with the correct mtu during plumb.
2400 	 */
2401 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2402 	macp->m_driver = state;
2403 	macp->m_dip = dip;
2404 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2405 	macp->m_callbacks = &ibd_m_callbacks;
2406 	macp->m_min_sdu = 0;
2407 	if (state->id_type == IBD_PORT_DRIVER) {
2408 		macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
2409 	} else if (state->id_enable_rc) {
2410 		macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
2411 	} else {
2412 		macp->m_max_sdu = IBD_DEF_MAX_SDU;
2413 	}
2414 	macp->m_priv_props = ibd_priv_props;
2415 
2416 	/*
2417 	 *  Register ourselves with the GLDv3 interface
2418 	 */
2419 	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2420 		mac_free(macp);
2421 		DPRINT(10,
2422 		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
2423 		return (DDI_FAILURE);
2424 	}
2425 
2426 	mac_free(macp);
2427 	return (DDI_SUCCESS);
2428 }
2429 
2430 static int
2431 ibd_record_capab(ibd_state_t *state)
2432 {
2433 	ibt_hca_attr_t hca_attrs;
2434 	ibt_status_t ibt_status;
2435 
2436 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2437 
2438 	/*
2439 	 * Query the HCA and fetch its attributes
2440 	 */
2441 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2442 	ASSERT(ibt_status == IBT_SUCCESS);
2443 
2444 	/*
2445 	 * 1. Set the Hardware Checksum capability. Currently we only consider
2446 	 *    full checksum offload.
2447 	 */
2448 	if (state->id_enable_rc) {
2449 			state->id_hwcksum_capab = 0;
2450 	} else {
2451 		if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
2452 		    == IBT_HCA_CKSUM_FULL) {
2453 			state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2454 		}
2455 	}
2456 
2457 	/*
2458 	 * 2. Set LSO policy, capability and maximum length
2459 	 */
2460 	if (state->id_enable_rc) {
2461 		state->id_lso_capable = B_FALSE;
2462 		state->id_lso_maxlen = 0;
2463 	} else {
2464 		if (hca_attrs.hca_max_lso_size > 0) {
2465 			state->id_lso_capable = B_TRUE;
2466 			if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2467 				state->id_lso_maxlen = IBD_LSO_MAXLEN;
2468 			else
2469 				state->id_lso_maxlen =
2470 				    hca_attrs.hca_max_lso_size;
2471 		} else {
2472 			state->id_lso_capable = B_FALSE;
2473 			state->id_lso_maxlen = 0;
2474 		}
2475 	}
2476 
2477 	/*
2478 	 * 3. Set Reserved L_Key capability
2479 	 */
2480 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2481 		state->id_hca_res_lkey_capab = 1;
2482 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2483 		state->rc_enable_iov_map = B_TRUE;
2484 	} else {
2485 		/* If no reserved lkey, we will not use ibt_map_mem_iov */
2486 		state->rc_enable_iov_map = B_FALSE;
2487 	}
2488 
2489 	/*
2490 	 * 4. Set maximum sqseg value after checking to see if extended sgl
2491 	 *    size information is provided by the hca
2492 	 */
2493 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2494 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2495 		state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
2496 	} else {
2497 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
2498 		state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
2499 	}
2500 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2501 		state->id_max_sqseg = IBD_MAX_SQSEG;
2502 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2503 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2504 		    state->id_max_sqseg, IBD_MAX_SQSEG);
2505 	}
2506 	if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
2507 		state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
2508 	} else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
2509 		ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
2510 		    "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
2511 	}
2512 
2513 	/*
2514 	 * Translating the virtual address regions into physical regions
2515 	 * for using the Reserved LKey feature results in a wr sgl that
2516 	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
2517 	 * we'll fix a high-water mark (65%) for when we should stop.
2518 	 */
2519 	state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2520 	state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;
2521 
2522 	/*
2523 	 * 5. Set number of recv and send wqes after checking hca maximum
2524 	 *    channel size. Store the max channel size in the state so that it
2525 	 *    can be referred to when the swqe/rwqe change is requested via
2526 	 *    dladm.
2527 	 */
2528 
2529 	state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz;
2530 
2531 	if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe)
2532 		state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz;
2533 
2534 	state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe -
2535 	    IBD_RWQE_MIN;
2536 
2537 	if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe)
2538 		state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz;
2539 
2540 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2541 
2542 	return (DDI_SUCCESS);
2543 }
2544 
2545 static int
2546 ibd_part_unattach(ibd_state_t *state)
2547 {
2548 	uint32_t progress = state->id_mac_state;
2549 	ibt_status_t ret;
2550 
2551 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
2552 		cmn_err(CE_CONT, "ibd_detach: failed: rx bufs outstanding\n");
2553 		return (DDI_FAILURE);
2554 	}
2555 
2556 	if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) {
2557 		cmn_err(CE_CONT, "ibd_detach: failed: srq bufs outstanding\n");
2558 		return (DDI_FAILURE);
2559 	}
2560 
2561 	/* make sure rx resources are freed */
2562 	ibd_free_rx_rsrcs(state);
2563 
2564 	if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
2565 		ASSERT(state->id_enable_rc);
2566 		ibd_rc_fini_srq_list(state);
2567 		state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
2568 	}
2569 
2570 	if (progress & IBD_DRV_MAC_REGISTERED) {
2571 		(void) mac_unregister(state->id_mh);
2572 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2573 	}
2574 
2575 	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
2576 		/*
2577 		 * No new async requests will be posted since the device
2578 		 * link state has been marked as unknown; completion handlers
2579 		 * have been turned off, so Tx handler will not cause any
2580 		 * more IBD_ASYNC_REAP requests.
2581 		 *
2582 		 * Queue a request for the async thread to exit, which will
2583 		 * be serviced after any pending ones. This can take a while,
2584 		 * specially if the SM is unreachable, since IBMF will slowly
2585 		 * timeout each SM request issued by the async thread.  Reap
2586 		 * the thread before continuing on, we do not want it to be
2587 		 * lingering in modunloaded code.
2588 		 */
2589 		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
2590 		thread_join(state->id_async_thrid);
2591 
2592 		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
2593 	}
2594 
2595 	if (progress & IBD_DRV_REQ_LIST_INITED) {
2596 		list_destroy(&state->id_req_list);
2597 		mutex_destroy(&state->id_acache_req_lock);
2598 		cv_destroy(&state->id_acache_req_cv);
2599 		state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED;
2600 	}
2601 
2602 	if (progress & IBD_DRV_PD_ALLOCD) {
2603 		if ((ret = ibt_free_pd(state->id_hca_hdl,
2604 		    state->id_pd_hdl)) != IBT_SUCCESS) {
2605 			ibd_print_warn(state, "failed to free "
2606 			    "protection domain, ret=%d", ret);
2607 		}
2608 		state->id_pd_hdl = NULL;
2609 		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2610 	}
2611 
2612 	if (progress & IBD_DRV_HCA_OPENED) {
2613 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2614 		    IBT_SUCCESS) {
2615 			ibd_print_warn(state, "failed to close "
2616 			    "HCA device, ret=%d", ret);
2617 		}
2618 		state->id_hca_hdl = NULL;
2619 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2620 	}
2621 
2622 	mutex_enter(&ibd_gstate.ig_mutex);
2623 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2624 		if ((ret = ibt_detach(state->id_ibt_hdl)) !=
2625 		    IBT_SUCCESS) {
2626 			ibd_print_warn(state,
2627 			    "ibt_detach() failed, ret=%d", ret);
2628 		}
2629 		state->id_ibt_hdl = NULL;
2630 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2631 		ibd_gstate.ig_ibt_hdl_ref_cnt--;
2632 	}
2633 	if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
2634 	    (ibd_gstate.ig_ibt_hdl != NULL)) {
2635 		if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
2636 		    IBT_SUCCESS) {
2637 			ibd_print_warn(state, "ibt_detach(): global "
2638 			    "failed, ret=%d", ret);
2639 		}
2640 		ibd_gstate.ig_ibt_hdl = NULL;
2641 	}
2642 	mutex_exit(&ibd_gstate.ig_mutex);
2643 
2644 	if (progress & IBD_DRV_TXINTR_ADDED) {
2645 		ddi_remove_softintr(state->id_tx);
2646 		state->id_tx = NULL;
2647 		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2648 	}
2649 
2650 	if (progress & IBD_DRV_RXINTR_ADDED) {
2651 		ddi_remove_softintr(state->id_rx);
2652 		state->id_rx = NULL;
2653 		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2654 	}
2655 
2656 #ifdef DEBUG
2657 	if (progress & IBD_DRV_RC_PRIVATE_STATE) {
2658 		kstat_delete(state->rc_ksp);
2659 		state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
2660 	}
2661 #endif
2662 
2663 	if (progress & IBD_DRV_STATE_INITIALIZED) {
2664 		ibd_state_fini(state);
2665 		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2666 	}
2667 
2668 	return (DDI_SUCCESS);
2669 }
2670 
2671 int
2672 ibd_part_attach(ibd_state_t *state, dev_info_t *dip)
2673 {
2674 	ibt_status_t ret;
2675 	int rv;
2676 	kthread_t *kht;
2677 
2678 	/*
2679 	 * Initialize mutexes and condition variables
2680 	 */
2681 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2682 		DPRINT(10, "ibd_attach: failed in ibd_state_init()");
2683 		return (DDI_FAILURE);
2684 	}
2685 	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2686 
2687 	/*
2688 	 * Allocate rx,tx softintr
2689 	 */
2690 	if (ibd_rx_softintr == 1) {
2691 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2692 		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2693 			DPRINT(10, "ibd_attach: failed in "
2694 			    "ddi_add_softintr(id_rx),  ret=%d", rv);
2695 			return (DDI_FAILURE);
2696 		}
2697 		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2698 	}
2699 	if (ibd_tx_softintr == 1) {
2700 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2701 		    NULL, NULL, ibd_tx_recycle,
2702 		    (caddr_t)state)) != DDI_SUCCESS) {
2703 			DPRINT(10, "ibd_attach: failed in "
2704 			    "ddi_add_softintr(id_tx), ret=%d", rv);
2705 			return (DDI_FAILURE);
2706 		}
2707 		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2708 	}
2709 
2710 	/*
2711 	 * Attach to IBTL
2712 	 */
2713 	mutex_enter(&ibd_gstate.ig_mutex);
2714 	if (ibd_gstate.ig_ibt_hdl == NULL) {
2715 		if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2716 		    &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
2717 			DPRINT(10, "ibd_attach: global: failed in "
2718 			    "ibt_attach(), ret=%d", ret);
2719 			mutex_exit(&ibd_gstate.ig_mutex);
2720 			return (DDI_FAILURE);
2721 		}
2722 	}
2723 	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2724 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
2725 		DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret);
2726 		mutex_exit(&ibd_gstate.ig_mutex);
2727 		return (DDI_FAILURE);
2728 	}
2729 	ibd_gstate.ig_ibt_hdl_ref_cnt++;
2730 	mutex_exit(&ibd_gstate.ig_mutex);
2731 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2732 
2733 	/*
2734 	 * Open the HCA
2735 	 */
2736 	if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
2737 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
2738 		DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret);
2739 		return (DDI_FAILURE);
2740 	}
2741 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
2742 
2743 #ifdef DEBUG
2744 	/* Initialize Driver Counters for Reliable Connected Mode */
2745 	if (state->id_enable_rc) {
2746 		if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
2747 			DPRINT(10, "ibd_attach: failed in ibd_rc_init_stats");
2748 			return (DDI_FAILURE);
2749 		}
2750 		state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
2751 	}
2752 #endif
2753 
2754 	/*
2755 	 * Record capabilities
2756 	 */
2757 	(void) ibd_record_capab(state);
2758 
2759 	/*
2760 	 * Allocate a protection domain on the HCA
2761 	 */
2762 	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2763 	    &state->id_pd_hdl)) != IBT_SUCCESS) {
2764 		DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret);
2765 		return (DDI_FAILURE);
2766 	}
2767 	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2768 
2769 
2770 	/*
2771 	 * We need to initialise the req_list that is required for the
2772 	 * operation of the async_thread.
2773 	 */
2774 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
2775 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
2776 	list_create(&state->id_req_list, sizeof (ibd_req_t),
2777 	    offsetof(ibd_req_t, rq_list));
2778 	state->id_mac_state |= IBD_DRV_REQ_LIST_INITED;
2779 
2780 	/*
2781 	 * Create the async thread; thread_create never fails.
2782 	 */
2783 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
2784 	    TS_RUN, minclsyspri);
2785 	state->id_async_thrid = kht->t_did;
2786 	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
2787 
2788 	return (DDI_SUCCESS);
2789 }
2790 
2791 /*
2792  * Attach device to the IO framework.
2793  */
2794 static int
2795 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2796 {
2797 	int ret;
2798 
2799 	switch (cmd) {
2800 		case DDI_ATTACH:
2801 			ret = ibd_port_attach(dip);
2802 			break;
2803 		default:
2804 			ret = DDI_FAILURE;
2805 			break;
2806 	}
2807 	return (ret);
2808 }
2809 
2810 /*
2811  * Detach device from the IO framework.
2812  */
2813 static int
2814 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2815 {
2816 	ibd_state_t *state;
2817 	int instance;
2818 
2819 	/*
2820 	 * IBD doesn't support suspend/resume
2821 	 */
2822 	if (cmd != DDI_DETACH)
2823 		return (DDI_FAILURE);
2824 
2825 	/*
2826 	 * Get the instance softstate
2827 	 */
2828 	instance = ddi_get_instance(dip);
2829 	state = ddi_get_soft_state(ibd_list, instance);
2830 
2831 	/*
2832 	 * Release all resources we're holding still.  Note that if we'd
2833 	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2834 	 * so far, we should find all the flags we need in id_mac_state.
2835 	 */
2836 	return (ibd_port_unattach(state, dip));
2837 }
2838 
2839 /*
2840  * Pre ibt_attach() driver initialization
2841  */
2842 static int
2843 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2844 {
2845 	char buf[64];
2846 
2847 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2848 	state->id_link_state = LINK_STATE_UNKNOWN;
2849 
2850 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2851 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2852 	state->id_trap_stop = B_TRUE;
2853 	state->id_trap_inprog = 0;
2854 
2855 	mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2856 	mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2857 	state->id_dip = dip;
2858 
2859 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2860 
2861 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2862 	mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2863 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2864 	state->id_tx_busy = 0;
2865 	mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2866 
2867 	state->id_rx_list.dl_bufs_outstanding = 0;
2868 	state->id_rx_list.dl_cnt = 0;
2869 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2870 	mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2871 	(void) sprintf(buf, "ibd_req%d_%x", ddi_get_instance(dip),
2872 	    state->id_pkey);
2873 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2874 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2875 
2876 	/* For Reliable Connected Mode */
2877 	mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
2878 	mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
2879 	mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2880 	mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2881 	mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
2882 	    MUTEX_DRIVER, NULL);
2883 
2884 	/*
2885 	 * Make the default link mode as RC. If this fails during connection
2886 	 * setup, the link mode is automatically transitioned to UD.
2887 	 * Also set the RC MTU.
2888 	 */
2889 	state->id_enable_rc = IBD_DEF_LINK_MODE;
2890 	state->rc_mtu = IBD_DEF_RC_MAX_MTU;
2891 	state->id_mtu = IBD_DEF_MAX_MTU;
2892 
2893 	/* Iniatialize all tunables to default */
2894 	state->id_lso_policy = IBD_DEF_LSO_POLICY;
2895 	state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS;
2896 	state->id_num_ah = IBD_DEF_NUM_AH;
2897 	state->id_hash_size = IBD_DEF_HASH_SIZE;
2898 	state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP;
2899 	state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS;
2900 	state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT;
2901 	state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC;
2902 	state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT;
2903 	state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC;
2904 	state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT;
2905 	state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC;
2906 	state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT;
2907 	state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC;
2908 	state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH;
2909 	state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH;
2910 	state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH;
2911 	state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE;
2912 	state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE;
2913 	state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE;
2914 	state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE;
2915 	state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ;
2916 	state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ;
2917 	state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH;
2918 
2919 	return (DDI_SUCCESS);
2920 }
2921 
2922 /*
2923  * Post ibt_detach() driver deconstruction
2924  */
2925 static void
2926 ibd_state_fini(ibd_state_t *state)
2927 {
2928 	kmem_cache_destroy(state->id_req_kmc);
2929 
2930 	mutex_destroy(&state->id_rx_list.dl_mutex);
2931 	mutex_destroy(&state->id_rx_free_list.dl_mutex);
2932 
2933 	mutex_destroy(&state->id_txpost_lock);
2934 	mutex_destroy(&state->id_tx_list.dl_mutex);
2935 	mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2936 	mutex_destroy(&state->id_lso_lock);
2937 
2938 	mutex_destroy(&state->id_sched_lock);
2939 	mutex_destroy(&state->id_scq_poll_lock);
2940 	mutex_destroy(&state->id_rcq_poll_lock);
2941 
2942 	cv_destroy(&state->id_trap_cv);
2943 	mutex_destroy(&state->id_trap_lock);
2944 	mutex_destroy(&state->id_link_mutex);
2945 
2946 	/* For Reliable Connected Mode */
2947 	mutex_destroy(&state->rc_srq_free_list.dl_mutex);
2948 	mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
2949 	mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
2950 	mutex_destroy(&state->rc_tx_large_bufs_lock);
2951 	mutex_destroy(&state->rc_rx_lock);
2952 }
2953 
2954 /*
2955  * Fetch link speed from SA for snmp ifspeed reporting.
2956  */
2957 static uint64_t
2958 ibd_get_portspeed(ibd_state_t *state)
2959 {
2960 	int			ret;
2961 	ibt_path_info_t		path;
2962 	ibt_path_attr_t		path_attr;
2963 	uint8_t			num_paths;
2964 	uint64_t		ifspeed;
2965 
2966 	/*
2967 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2968 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2969 	 * 2000000000. Start with that as default.
2970 	 */
2971 	ifspeed = 2000000000;
2972 
2973 	bzero(&path_attr, sizeof (path_attr));
2974 
2975 	/*
2976 	 * Get the port speed from Loopback path information.
2977 	 */
2978 	path_attr.pa_dgids = &state->id_sgid;
2979 	path_attr.pa_num_dgids = 1;
2980 	path_attr.pa_sgid = state->id_sgid;
2981 
2982 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2983 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2984 		goto earlydone;
2985 
2986 	if (num_paths < 1)
2987 		goto earlydone;
2988 
2989 	/*
2990 	 * In case SA does not return an expected value, report the default
2991 	 * speed as 1X.
2992 	 */
2993 	ret = 1;
2994 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
2995 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
2996 			ret = 1;
2997 			break;
2998 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
2999 			ret = 4;
3000 			break;
3001 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
3002 			ret = 12;
3003 			break;
3004 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
3005 			ret = 2;
3006 			break;
3007 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
3008 			ret = 8;
3009 			break;
3010 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
3011 			ret = 16;
3012 			break;
3013 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
3014 			ret = 24;
3015 			break;
3016 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
3017 			ret = 32;
3018 			break;
3019 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
3020 			ret = 48;
3021 			break;
3022 	}
3023 
3024 	ifspeed *= ret;
3025 
3026 earlydone:
3027 	return (ifspeed);
3028 }
3029 
3030 /*
3031  * Search input mcg list (id_mc_full or id_mc_non) for an entry
3032  * representing the input mcg mgid.
3033  */
3034 static ibd_mce_t *
3035 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
3036 {
3037 	ibd_mce_t *ptr = list_head(mlist);
3038 
3039 	/*
3040 	 * Do plain linear search.
3041 	 */
3042 	while (ptr != NULL) {
3043 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
3044 		    sizeof (ib_gid_t)) == 0)
3045 			return (ptr);
3046 		ptr = list_next(mlist, ptr);
3047 	}
3048 	return (NULL);
3049 }
3050 
3051 /*
3052  * Execute IBA JOIN.
3053  */
3054 static ibt_status_t
3055 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
3056 {
3057 	ibt_mcg_attr_t mcg_attr;
3058 
3059 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3060 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
3061 	mcg_attr.mc_mgid = mgid;
3062 	mcg_attr.mc_join_state = mce->mc_jstate;
3063 	mcg_attr.mc_scope = state->id_scope;
3064 	mcg_attr.mc_pkey = state->id_pkey;
3065 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
3066 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
3067 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
3068 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
3069 	    NULL, NULL));
3070 }
3071 
3072 /*
3073  * This code JOINs the port in the proper way (depending on the join
3074  * state) so that IBA fabric will forward mcg packets to/from the port.
3075  * It also attaches the QPN to the mcg so it can receive those mcg
3076  * packets. This code makes sure not to attach the mcg to the QP if
3077  * that has been previously done due to the mcg being joined with a
3078  * different join state, even though this is not required by SWG_0216,
3079  * refid 3610.
3080  */
3081 static ibd_mce_t *
3082 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3083 {
3084 	ibt_status_t ibt_status;
3085 	ibd_mce_t *mce, *tmce, *omce = NULL;
3086 	boolean_t do_attach = B_TRUE;
3087 
3088 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
3089 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3090 
3091 	/*
3092 	 * For enable_multicast Full member joins, we need to do some
3093 	 * extra work. If there is already an mce on the list that
3094 	 * indicates full membership, that means the membership has
3095 	 * not yet been dropped (since the disable_multicast was issued)
3096 	 * because there are pending Tx's to the mcg; in that case, just
3097 	 * mark the mce not to be reaped when the Tx completion queues
3098 	 * an async reap operation.
3099 	 *
3100 	 * If there is already an mce on the list indicating sendonly
3101 	 * membership, try to promote to full membership. Be careful
3102 	 * not to deallocate the old mce, since there might be an AH
3103 	 * pointing to it; instead, update the old mce with new data
3104 	 * that tracks the full membership.
3105 	 */
3106 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
3107 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
3108 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
3109 			ASSERT(omce->mc_fullreap);
3110 			omce->mc_fullreap = B_FALSE;
3111 			return (omce);
3112 		} else {
3113 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
3114 		}
3115 	}
3116 
3117 	/*
3118 	 * Allocate the ibd_mce_t to track this JOIN.
3119 	 */
3120 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
3121 	mce->mc_fullreap = B_FALSE;
3122 	mce->mc_jstate = jstate;
3123 
3124 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
3125 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
3126 		    ibt_status);
3127 		kmem_free(mce, sizeof (ibd_mce_t));
3128 		return (NULL);
3129 	}
3130 
3131 	/*
3132 	 * Is an IBA attach required? Not if the interface is already joined
3133 	 * to the mcg in a different appropriate join state.
3134 	 */
3135 	if (jstate == IB_MC_JSTATE_NON) {
3136 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3137 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3138 			do_attach = B_FALSE;
3139 	} else if (jstate == IB_MC_JSTATE_FULL) {
3140 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3141 			do_attach = B_FALSE;
3142 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3143 		do_attach = B_FALSE;
3144 	}
3145 
3146 	if (do_attach) {
3147 		/*
3148 		 * Do the IBA attach.
3149 		 */
3150 		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
3151 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
3152 		    &mce->mc_info)) != IBT_SUCCESS) {
3153 			DPRINT(10, "ibd_join_group : failed qp attachment "
3154 			    "%d\n", ibt_status);
3155 			/*
3156 			 * NOTE that we should probably preserve the join info
3157 			 * in the list and later try to leave again at detach
3158 			 * time.
3159 			 */
3160 			(void) ibt_leave_mcg(state->id_sgid, mgid,
3161 			    state->id_sgid, jstate);
3162 			kmem_free(mce, sizeof (ibd_mce_t));
3163 			return (NULL);
3164 		}
3165 	}
3166 
3167 	/*
3168 	 * Insert the ibd_mce_t in the proper list.
3169 	 */
3170 	if (jstate == IB_MC_JSTATE_NON) {
3171 		IBD_MCACHE_INSERT_NON(state, mce);
3172 	} else {
3173 		/*
3174 		 * Set up the mc_req fields used for reaping the
3175 		 * mcg in case of delayed tx completion (see
3176 		 * ibd_tx_cleanup()). Also done for sendonly join in
3177 		 * case we are promoted to fullmembership later and
3178 		 * keep using the same mce.
3179 		 */
3180 		mce->mc_req.rq_gid = mgid;
3181 		mce->mc_req.rq_ptr = mce;
3182 		/*
3183 		 * Check whether this is the case of trying to join
3184 		 * full member, and we were already joined send only.
3185 		 * We try to drop our SendOnly membership, but it is
3186 		 * possible that the mcg does not exist anymore (and
3187 		 * the subnet trap never reached us), so the leave
3188 		 * operation might fail.
3189 		 */
3190 		if (omce != NULL) {
3191 			(void) ibt_leave_mcg(state->id_sgid, mgid,
3192 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
3193 			omce->mc_jstate = IB_MC_JSTATE_FULL;
3194 			bcopy(&mce->mc_info, &omce->mc_info,
3195 			    sizeof (ibt_mcg_info_t));
3196 			kmem_free(mce, sizeof (ibd_mce_t));
3197 			return (omce);
3198 		}
3199 		mutex_enter(&state->id_mc_mutex);
3200 		IBD_MCACHE_INSERT_FULL(state, mce);
3201 		mutex_exit(&state->id_mc_mutex);
3202 	}
3203 
3204 	return (mce);
3205 }
3206 
3207 /*
3208  * Called during port up event handling to attempt to reacquire full
3209  * membership to an mcg. Stripped down version of ibd_join_group().
3210  * Note that it is possible that the mcg might have gone away, and
3211  * gets recreated at this point.
3212  */
3213 static void
3214 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
3215 {
3216 	ib_gid_t mgid;
3217 
3218 	/*
3219 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
3220 	 * reap/leave is going to try to leave the group. We could prevent
3221 	 * that by adding a boolean flag into ibd_mce_t, if required.
3222 	 */
3223 	if (mce->mc_fullreap)
3224 		return;
3225 
3226 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
3227 
3228 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
3229 	    mgid.gid_guid);
3230 
3231 	/* While reacquiring, leave and then join the MCG */
3232 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid,
3233 	    mce->mc_jstate);
3234 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
3235 		ibd_print_warn(state, "Failure on port up to rejoin "
3236 		    "multicast gid %016llx:%016llx",
3237 		    (u_longlong_t)mgid.gid_prefix,
3238 		    (u_longlong_t)mgid.gid_guid);
3239 }
3240 
3241 /*
3242  * This code handles delayed Tx completion cleanups for mcg's to which
3243  * disable_multicast has been issued, regular mcg related cleanups during
3244  * disable_multicast, disable_promiscuous and mcg traps, as well as
3245  * cleanups during driver detach time. Depending on the join state,
3246  * it deletes the mce from the appropriate list and issues the IBA
3247  * leave/detach; except in the disable_multicast case when the mce
3248  * is left on the active list for a subsequent Tx completion cleanup.
3249  */
3250 static void
3251 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
3252     uint8_t jstate)
3253 {
3254 	ibd_mce_t *tmce;
3255 	boolean_t do_detach = B_TRUE;
3256 
3257 	/*
3258 	 * Before detaching, we must check whether the other list
3259 	 * contains the mcg; if we detach blindly, the consumer
3260 	 * who set up the other list will also stop receiving
3261 	 * traffic.
3262 	 */
3263 	if (jstate == IB_MC_JSTATE_FULL) {
3264 		/*
3265 		 * The following check is only relevant while coming
3266 		 * from the Tx completion path in the reap case.
3267 		 */
3268 		if (!mce->mc_fullreap)
3269 			return;
3270 		mutex_enter(&state->id_mc_mutex);
3271 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3272 		mutex_exit(&state->id_mc_mutex);
3273 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3274 			do_detach = B_FALSE;
3275 	} else if (jstate == IB_MC_JSTATE_NON) {
3276 		IBD_MCACHE_PULLOUT_NON(state, mce);
3277 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3278 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3279 			do_detach = B_FALSE;
3280 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3281 		mutex_enter(&state->id_mc_mutex);
3282 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3283 		mutex_exit(&state->id_mc_mutex);
3284 		do_detach = B_FALSE;
3285 	}
3286 
3287 	/*
3288 	 * If we are reacting to a mcg trap and leaving our sendonly or
3289 	 * non membership, the mcg is possibly already gone, so attempting
3290 	 * to leave might fail. On the other hand, we must try to leave
3291 	 * anyway, since this might be a trap from long ago, and we could
3292 	 * have potentially sendonly joined to a recent incarnation of
3293 	 * the mcg and are about to loose track of this information.
3294 	 */
3295 	if (do_detach) {
3296 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3297 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3298 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3299 	}
3300 
3301 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3302 	kmem_free(mce, sizeof (ibd_mce_t));
3303 }
3304 
3305 /*
3306  * Async code executed due to multicast and promiscuous disable requests
3307  * and mcg trap handling; also executed during driver detach. Mostly, a
3308  * leave and detach is done; except for the fullmember case when Tx
3309  * requests are pending, whence arrangements are made for subsequent
3310  * cleanup on Tx completion.
3311  */
3312 static void
3313 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3314 {
3315 	ipoib_mac_t mcmac;
3316 	boolean_t recycled;
3317 	ibd_mce_t *mce;
3318 
3319 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3320 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3321 
3322 	if (jstate == IB_MC_JSTATE_NON) {
3323 		recycled = B_TRUE;
3324 		mce = IBD_MCACHE_FIND_NON(state, mgid);
3325 		/*
3326 		 * In case we are handling a mcg trap, we might not find
3327 		 * the mcg in the non list.
3328 		 */
3329 		if (mce == NULL) {
3330 			return;
3331 		}
3332 	} else {
3333 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
3334 
3335 		/*
3336 		 * In case we are handling a mcg trap, make sure the trap
3337 		 * is not arriving late; if we have an mce that indicates
3338 		 * that we are already a fullmember, that would be a clear
3339 		 * indication that the trap arrived late (ie, is for a
3340 		 * previous incarnation of the mcg).
3341 		 */
3342 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3343 			if ((mce == NULL) || (mce->mc_jstate ==
3344 			    IB_MC_JSTATE_FULL)) {
3345 				return;
3346 			}
3347 		} else {
3348 			ASSERT(jstate == IB_MC_JSTATE_FULL);
3349 
3350 			/*
3351 			 * If join group failed, mce will be NULL here.
3352 			 * This is because in GLDv3 driver, set multicast
3353 			 *  will always return success.
3354 			 */
3355 			if (mce == NULL) {
3356 				return;
3357 			}
3358 
3359 			mce->mc_fullreap = B_TRUE;
3360 		}
3361 
3362 		/*
3363 		 * If no pending Tx's remain that reference the AH
3364 		 * for the mcg, recycle it from active to free list.
3365 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3366 		 * so the last completing Tx will cause an async reap
3367 		 * operation to be invoked, at which time we will drop our
3368 		 * membership to the mcg so that the pending Tx's complete
3369 		 * successfully. Refer to comments on "AH and MCE active
3370 		 * list manipulation" at top of this file. The lock protects
3371 		 * against Tx fast path and Tx cleanup code.
3372 		 */
3373 		mutex_enter(&state->id_ac_mutex);
3374 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3375 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3376 		    IB_MC_JSTATE_SEND_ONLY_NON));
3377 		mutex_exit(&state->id_ac_mutex);
3378 	}
3379 
3380 	if (recycled) {
3381 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
3382 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3383 		ibd_async_reap_group(state, mce, mgid, jstate);
3384 	}
3385 }
3386 
3387 /*
3388  * Find the broadcast address as defined by IPoIB; implicitly
3389  * determines the IBA scope, mtu, tclass etc of the link the
3390  * interface is going to be a member of.
3391  */
3392 static ibt_status_t
3393 ibd_find_bgroup(ibd_state_t *state)
3394 {
3395 	ibt_mcg_attr_t mcg_attr;
3396 	uint_t numg;
3397 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3398 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3399 	    IB_MC_SCOPE_GLOBAL };
3400 	int i, mcgmtu;
3401 	boolean_t found = B_FALSE;
3402 	int ret;
3403 	ibt_mcg_info_t mcg_info;
3404 
3405 	state->id_bgroup_created = B_FALSE;
3406 	state->id_bgroup_present = B_FALSE;
3407 
3408 query_bcast_grp:
3409 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3410 	mcg_attr.mc_pkey = state->id_pkey;
3411 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3412 	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3413 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3414 
3415 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3416 		state->id_scope = mcg_attr.mc_scope = scopes[i];
3417 
3418 		/*
3419 		 * Look for the IPoIB broadcast group.
3420 		 */
3421 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3422 		state->id_mgid.gid_prefix =
3423 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3424 		    ((uint64_t)state->id_scope << 48) |
3425 		    ((uint32_t)(state->id_pkey << 16)));
3426 		mcg_attr.mc_mgid = state->id_mgid;
3427 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3428 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3429 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3430 			found = B_TRUE;
3431 			break;
3432 		}
3433 	}
3434 
3435 	if (!found) {
3436 		if (state->id_create_broadcast_group) {
3437 			/*
3438 			 * If we created the broadcast group, but failed to
3439 			 * find it, we can't do anything except leave the
3440 			 * one we created and return failure.
3441 			 */
3442 			if (state->id_bgroup_created) {
3443 				ibd_print_warn(state, "IPoIB broadcast group "
3444 				    "absent. Unable to query after create.");
3445 				goto find_bgroup_fail;
3446 			}
3447 
3448 			/*
3449 			 * Create the ipoib broadcast group if it didn't exist
3450 			 */
3451 			bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3452 			mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3453 			mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3454 			mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3455 			mcg_attr.mc_pkey = state->id_pkey;
3456 			mcg_attr.mc_flow = 0;
3457 			mcg_attr.mc_sl = 0;
3458 			mcg_attr.mc_tclass = 0;
3459 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3460 			state->id_mgid.gid_prefix =
3461 			    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3462 			    ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3463 			    ((uint32_t)(state->id_pkey << 16)));
3464 			mcg_attr.mc_mgid = state->id_mgid;
3465 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3466 
3467 			if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3468 			    &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3469 				ibd_print_warn(state, "IPoIB broadcast group "
3470 				    "absent, create failed: ret = %d\n", ret);
3471 				state->id_bgroup_created = B_FALSE;
3472 				return (IBT_FAILURE);
3473 			}
3474 			state->id_bgroup_created = B_TRUE;
3475 			goto query_bcast_grp;
3476 		} else {
3477 			ibd_print_warn(state, "IPoIB broadcast group absent");
3478 			return (IBT_FAILURE);
3479 		}
3480 	}
3481 
3482 	/*
3483 	 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
3484 	 */
3485 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3486 	if (state->id_mtu < mcgmtu) {
3487 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3488 		    "greater than port's maximum MTU %d", mcgmtu,
3489 		    state->id_mtu);
3490 		ibt_free_mcg_info(state->id_mcinfo, 1);
3491 		goto find_bgroup_fail;
3492 	}
3493 	state->id_mtu = mcgmtu;
3494 	state->id_bgroup_present = B_TRUE;
3495 
3496 	return (IBT_SUCCESS);
3497 
3498 find_bgroup_fail:
3499 	if (state->id_bgroup_created) {
3500 		(void) ibt_leave_mcg(state->id_sgid,
3501 		    mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3502 		    IB_MC_JSTATE_FULL);
3503 	}
3504 
3505 	return (IBT_FAILURE);
3506 }
3507 
3508 static int
3509 ibd_alloc_tx_copybufs(ibd_state_t *state)
3510 {
3511 	ibt_mr_attr_t mem_attr;
3512 
3513 	/*
3514 	 * Allocate one big chunk for all regular tx copy bufs
3515 	 */
3516 	state->id_tx_buf_sz = state->id_mtu;
3517 	if (state->id_lso_policy && state->id_lso_capable &&
3518 	    (state->id_ud_tx_copy_thresh > state->id_mtu)) {
3519 		state->id_tx_buf_sz = state->id_ud_tx_copy_thresh;
3520 	}
3521 
3522 	state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe *
3523 	    state->id_tx_buf_sz, KM_SLEEP);
3524 
3525 	state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe *
3526 	    sizeof (ibd_swqe_t), KM_SLEEP);
3527 
3528 	/*
3529 	 * Do one memory registration on the entire txbuf area
3530 	 */
3531 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3532 	mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz;
3533 	mem_attr.mr_as = NULL;
3534 	mem_attr.mr_flags = IBT_MR_SLEEP;
3535 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3536 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3537 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3538 		kmem_free(state->id_tx_wqes,
3539 		    state->id_ud_num_swqe * sizeof (ibd_swqe_t));
3540 		kmem_free(state->id_tx_bufs,
3541 		    state->id_ud_num_swqe * state->id_tx_buf_sz);
3542 		state->id_tx_bufs = NULL;
3543 		return (DDI_FAILURE);
3544 	}
3545 
3546 	return (DDI_SUCCESS);
3547 }
3548 
3549 static int
3550 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3551 {
3552 	ibt_mr_attr_t mem_attr;
3553 	ibd_lsobuf_t *buflist;
3554 	ibd_lsobuf_t *lbufp;
3555 	ibd_lsobuf_t *tail;
3556 	ibd_lsobkt_t *bktp;
3557 	uint8_t *membase;
3558 	uint8_t *memp;
3559 	uint_t memsz;
3560 	int i;
3561 
3562 	/*
3563 	 * Allocate the lso bucket
3564 	 */
3565 	bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3566 
3567 	/*
3568 	 * Allocate the entire lso memory and register it
3569 	 */
3570 	memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ;
3571 	membase = kmem_zalloc(memsz, KM_SLEEP);
3572 
3573 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3574 	mem_attr.mr_len = memsz;
3575 	mem_attr.mr_as = NULL;
3576 	mem_attr.mr_flags = IBT_MR_SLEEP;
3577 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3578 	    &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3579 		DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3580 		kmem_free(membase, memsz);
3581 		kmem_free(bktp, sizeof (ibd_lsobkt_t));
3582 		return (DDI_FAILURE);
3583 	}
3584 
3585 	mutex_enter(&state->id_lso_lock);
3586 
3587 	/*
3588 	 * Now allocate the buflist.  Note that the elements in the buflist and
3589 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
3590 	 * can always derive the address of a buflist entry from the address of
3591 	 * an lso buffer.
3592 	 */
3593 	buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t),
3594 	    KM_SLEEP);
3595 
3596 	/*
3597 	 * Set up the lso buf chain
3598 	 */
3599 	memp = membase;
3600 	lbufp = buflist;
3601 	for (i = 0; i < state->id_num_lso_bufs; i++) {
3602 		lbufp->lb_isfree = 1;
3603 		lbufp->lb_buf = memp;
3604 		lbufp->lb_next = lbufp + 1;
3605 
3606 		tail = lbufp;
3607 
3608 		memp += IBD_LSO_BUFSZ;
3609 		lbufp++;
3610 	}
3611 	tail->lb_next = NULL;
3612 
3613 	/*
3614 	 * Set up the LSO buffer information in ibd state
3615 	 */
3616 	bktp->bkt_bufl = buflist;
3617 	bktp->bkt_free_head = buflist;
3618 	bktp->bkt_mem = membase;
3619 	bktp->bkt_nelem = state->id_num_lso_bufs;
3620 	bktp->bkt_nfree = bktp->bkt_nelem;
3621 
3622 	state->id_lso = bktp;
3623 	mutex_exit(&state->id_lso_lock);
3624 
3625 	return (DDI_SUCCESS);
3626 }
3627 
3628 /*
3629  * Statically allocate Tx buffer list(s).
3630  */
3631 static int
3632 ibd_init_txlist(ibd_state_t *state)
3633 {
3634 	ibd_swqe_t *swqe;
3635 	ibt_lkey_t lkey;
3636 	int i;
3637 	uint_t len;
3638 	uint8_t *bufaddr;
3639 
3640 	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3641 		return (DDI_FAILURE);
3642 
3643 	if (state->id_lso_policy && state->id_lso_capable) {
3644 		if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3645 			state->id_lso_capable = B_FALSE;
3646 	}
3647 
3648 	mutex_enter(&state->id_tx_list.dl_mutex);
3649 	state->id_tx_list.dl_head = NULL;
3650 	state->id_tx_list.dl_pending_sends = B_FALSE;
3651 	state->id_tx_list.dl_cnt = 0;
3652 	mutex_exit(&state->id_tx_list.dl_mutex);
3653 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3654 	state->id_tx_rel_list.dl_head = NULL;
3655 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3656 	state->id_tx_rel_list.dl_cnt = 0;
3657 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3658 
3659 	/*
3660 	 * Allocate and setup the swqe list
3661 	 */
3662 	lkey = state->id_tx_mr_desc.md_lkey;
3663 	bufaddr = state->id_tx_bufs;
3664 	len = state->id_tx_buf_sz;
3665 	swqe = state->id_tx_wqes;
3666 	mutex_enter(&state->id_tx_list.dl_mutex);
3667 	for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) {
3668 		swqe->swqe_next = NULL;
3669 		swqe->swqe_im_mblk = NULL;
3670 
3671 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3672 		    bufaddr;
3673 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3674 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3675 
3676 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3677 		swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
3678 		swqe->w_swr.wr_trans = IBT_UD_SRV;
3679 
3680 		/* These are set in send */
3681 		swqe->w_swr.wr_nds = 0;
3682 		swqe->w_swr.wr_sgl = NULL;
3683 		swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3684 
3685 		/* add to list */
3686 		state->id_tx_list.dl_cnt++;
3687 		swqe->swqe_next = state->id_tx_list.dl_head;
3688 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3689 	}
3690 	mutex_exit(&state->id_tx_list.dl_mutex);
3691 
3692 	return (DDI_SUCCESS);
3693 }
3694 
3695 static int
3696 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3697     uint32_t *nds_p)
3698 {
3699 	ibd_lsobkt_t *bktp;
3700 	ibd_lsobuf_t *lbufp;
3701 	ibd_lsobuf_t *nextp;
3702 	ibt_lkey_t lso_lkey;
3703 	uint_t frag_sz;
3704 	uint_t num_needed;
3705 	int i;
3706 
3707 	ASSERT(sgl_p != NULL);
3708 	ASSERT(nds_p != NULL);
3709 	ASSERT(req_sz != 0);
3710 
3711 	/*
3712 	 * Determine how many bufs we'd need for the size requested
3713 	 */
3714 	num_needed = req_sz / IBD_LSO_BUFSZ;
3715 	if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3716 		num_needed++;
3717 
3718 	mutex_enter(&state->id_lso_lock);
3719 
3720 	/*
3721 	 * If we don't have enough lso bufs, return failure
3722 	 */
3723 	ASSERT(state->id_lso != NULL);
3724 	bktp = state->id_lso;
3725 	if (bktp->bkt_nfree < num_needed) {
3726 		mutex_exit(&state->id_lso_lock);
3727 		return (-1);
3728 	}
3729 
3730 	/*
3731 	 * Pick the first 'num_needed' bufs from the free list
3732 	 */
3733 	lso_lkey = bktp->bkt_mr_desc.md_lkey;
3734 	lbufp = bktp->bkt_free_head;
3735 	for (i = 0; i < num_needed; i++) {
3736 		ASSERT(lbufp->lb_isfree != 0);
3737 		ASSERT(lbufp->lb_buf != NULL);
3738 
3739 		nextp = lbufp->lb_next;
3740 
3741 		sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3742 		sgl_p[i].ds_key = lso_lkey;
3743 		sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3744 
3745 		lbufp->lb_isfree = 0;
3746 		lbufp->lb_next = NULL;
3747 
3748 		lbufp = nextp;
3749 	}
3750 	bktp->bkt_free_head = lbufp;
3751 
3752 	/*
3753 	 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3754 	 * to adjust the last sgl entry's length. Since we know we need atleast
3755 	 * one, the i-1 use below is ok.
3756 	 */
3757 	if (frag_sz) {
3758 		sgl_p[i-1].ds_len = frag_sz;
3759 	}
3760 
3761 	/*
3762 	 * Update nfree count and return
3763 	 */
3764 	bktp->bkt_nfree -= num_needed;
3765 
3766 	mutex_exit(&state->id_lso_lock);
3767 
3768 	*nds_p = num_needed;
3769 
3770 	return (0);
3771 }
3772 
3773 static void
3774 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3775 {
3776 	ibd_lsobkt_t *bktp;
3777 	ibd_lsobuf_t *lbufp;
3778 	uint8_t *lso_mem_end;
3779 	uint_t ndx;
3780 	int i;
3781 
3782 	mutex_enter(&state->id_lso_lock);
3783 
3784 	bktp = state->id_lso;
3785 	ASSERT(bktp != NULL);
3786 
3787 	lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3788 	for (i = 0; i < nds; i++) {
3789 		uint8_t *va;
3790 
3791 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3792 		ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3793 
3794 		/*
3795 		 * Figure out the buflist element this sgl buffer corresponds
3796 		 * to and put it back at the head
3797 		 */
3798 		ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3799 		lbufp = bktp->bkt_bufl + ndx;
3800 
3801 		ASSERT(lbufp->lb_isfree == 0);
3802 		ASSERT(lbufp->lb_buf == va);
3803 
3804 		lbufp->lb_isfree = 1;
3805 		lbufp->lb_next = bktp->bkt_free_head;
3806 		bktp->bkt_free_head = lbufp;
3807 	}
3808 	bktp->bkt_nfree += nds;
3809 
3810 	mutex_exit(&state->id_lso_lock);
3811 }
3812 
3813 static void
3814 ibd_free_tx_copybufs(ibd_state_t *state)
3815 {
3816 	/*
3817 	 * Unregister txbuf mr
3818 	 */
3819 	if (ibt_deregister_mr(state->id_hca_hdl,
3820 	    state->id_tx_mr_hdl) != IBT_SUCCESS) {
3821 		DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3822 	}
3823 	state->id_tx_mr_hdl = NULL;
3824 
3825 	/*
3826 	 * Free txbuf memory
3827 	 */
3828 	kmem_free(state->id_tx_wqes, state->id_ud_num_swqe *
3829 	    sizeof (ibd_swqe_t));
3830 	kmem_free(state->id_tx_bufs, state->id_ud_num_swqe *
3831 	    state->id_tx_buf_sz);
3832 	state->id_tx_wqes = NULL;
3833 	state->id_tx_bufs = NULL;
3834 }
3835 
3836 static void
3837 ibd_free_tx_lsobufs(ibd_state_t *state)
3838 {
3839 	ibd_lsobkt_t *bktp;
3840 
3841 	mutex_enter(&state->id_lso_lock);
3842 
3843 	if ((bktp = state->id_lso) == NULL) {
3844 		mutex_exit(&state->id_lso_lock);
3845 		return;
3846 	}
3847 
3848 	/*
3849 	 * First, free the buflist
3850 	 */
3851 	ASSERT(bktp->bkt_bufl != NULL);
3852 	kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3853 
3854 	/*
3855 	 * Unregister the LSO memory and free it
3856 	 */
3857 	ASSERT(bktp->bkt_mr_hdl != NULL);
3858 	if (ibt_deregister_mr(state->id_hca_hdl,
3859 	    bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3860 		DPRINT(10,
3861 		    "ibd_free_lsobufs: ibt_deregister_mr failed");
3862 	}
3863 	ASSERT(bktp->bkt_mem);
3864 	kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3865 
3866 	/*
3867 	 * Finally free the bucket
3868 	 */
3869 	kmem_free(bktp, sizeof (ibd_lsobkt_t));
3870 	state->id_lso = NULL;
3871 
3872 	mutex_exit(&state->id_lso_lock);
3873 }
3874 
3875 /*
3876  * Free the statically allocated Tx buffer list.
3877  */
3878 static void
3879 ibd_fini_txlist(ibd_state_t *state)
3880 {
3881 	/*
3882 	 * Free the allocated swqes
3883 	 */
3884 	mutex_enter(&state->id_tx_list.dl_mutex);
3885 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3886 	state->id_tx_list.dl_head = NULL;
3887 	state->id_tx_list.dl_pending_sends = B_FALSE;
3888 	state->id_tx_list.dl_cnt = 0;
3889 	state->id_tx_rel_list.dl_head = NULL;
3890 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3891 	state->id_tx_rel_list.dl_cnt = 0;
3892 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3893 	mutex_exit(&state->id_tx_list.dl_mutex);
3894 
3895 	ibd_free_tx_lsobufs(state);
3896 	ibd_free_tx_copybufs(state);
3897 }
3898 
3899 /*
3900  * post a list of rwqes, NULL terminated.
3901  */
3902 static void
3903 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
3904 {
3905 	uint_t		i;
3906 	uint_t		num_posted;
3907 	ibt_status_t	ibt_status;
3908 	ibt_recv_wr_t	wrs[IBD_RX_POST_CNT];
3909 
3910 	while (rwqe) {
3911 		/* Post up to IBD_RX_POST_CNT receive work requests */
3912 		for (i = 0; i < IBD_RX_POST_CNT; i++) {
3913 			wrs[i] = rwqe->w_rwr;
3914 			rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
3915 			if (rwqe == NULL) {
3916 				i++;
3917 				break;
3918 			}
3919 		}
3920 
3921 		/*
3922 		 * If posting fails for some reason, we'll never receive
3923 		 * completion intimation, so we'll need to cleanup. But
3924 		 * we need to make sure we don't clean up nodes whose
3925 		 * wrs have been successfully posted. We assume that the
3926 		 * hca driver returns on the first failure to post and
3927 		 * therefore the first 'num_posted' entries don't need
3928 		 * cleanup here.
3929 		 */
3930 		atomic_add_32(&state->id_rx_list.dl_cnt, i);
3931 
3932 		num_posted = 0;
3933 		ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
3934 		    &num_posted);
3935 		if (ibt_status != IBT_SUCCESS) {
3936 			/* This cannot happen unless the device has an error. */
3937 			ibd_print_warn(state, "ibd_post_recv: FATAL: "
3938 			    "posting multiple wrs failed: "
3939 			    "requested=%d, done=%d, ret=%d",
3940 			    IBD_RX_POST_CNT, num_posted, ibt_status);
3941 			atomic_add_32(&state->id_rx_list.dl_cnt,
3942 			    num_posted - i);
3943 		}
3944 	}
3945 }
3946 
3947 /*
3948  * Grab a list of rwqes from the array of lists, and post the list.
3949  */
3950 static void
3951 ibd_post_recv_intr(ibd_state_t *state)
3952 {
3953 	ibd_rx_queue_t	*rxp;
3954 	ibd_rwqe_t *list;
3955 
3956 	/* rotate through the rx_queue array, expecting an adequate number */
3957 	state->id_rx_post_queue_index =
3958 	    (state->id_rx_post_queue_index + 1) &
3959 	    (state->id_rx_nqueues - 1);
3960 
3961 	rxp = state->id_rx_queues + state->id_rx_post_queue_index;
3962 	mutex_enter(&rxp->rx_post_lock);
3963 	list = WQE_TO_RWQE(rxp->rx_head);
3964 	rxp->rx_head = NULL;
3965 	rxp->rx_cnt = 0;
3966 	mutex_exit(&rxp->rx_post_lock);
3967 	ibd_post_recv_list(state, list);
3968 }
3969 
3970 /* macro explained below */
3971 #define	RX_QUEUE_HASH(rwqe) \
3972 	(((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
3973 
3974 /*
3975  * Add a rwqe to one of the the Rx lists.  If the list is large enough
3976  * (exactly IBD_RX_POST_CNT), post the list to the hardware.
3977  *
3978  * Note: one of 2^N lists is chosen via a hash.  This is done
3979  * because using one list is contentious.  If the first list is busy
3980  * (mutex_tryenter fails), use a second list (just call mutex_enter).
3981  *
3982  * The number 8 in RX_QUEUE_HASH is a random choice that provides
3983  * even distribution of mapping rwqes to the 2^N queues.
3984  */
3985 static void
3986 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
3987 {
3988 	ibd_rx_queue_t	*rxp;
3989 
3990 	rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
3991 
3992 	if (!mutex_tryenter(&rxp->rx_post_lock)) {
3993 		/* Failed.  Try a different queue ("ptr + 16" ensures that). */
3994 		rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
3995 		mutex_enter(&rxp->rx_post_lock);
3996 	}
3997 	rwqe->rwqe_next = rxp->rx_head;
3998 	if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) {
3999 		uint_t active = atomic_inc_32_nv(&state->id_rx_post_active);
4000 
4001 		/* only call ibt_post_recv() every Nth time through here */
4002 		if ((active & (state->id_rx_nqueues - 1)) == 0) {
4003 			rxp->rx_head = NULL;
4004 			rxp->rx_cnt = 0;
4005 			mutex_exit(&rxp->rx_post_lock);
4006 			ibd_post_recv_list(state, rwqe);
4007 			return;
4008 		}
4009 	}
4010 	rxp->rx_head = RWQE_TO_WQE(rwqe);
4011 	mutex_exit(&rxp->rx_post_lock);
4012 }
4013 
4014 static int
4015 ibd_alloc_rx_copybufs(ibd_state_t *state)
4016 {
4017 	ibt_mr_attr_t mem_attr;
4018 	int i;
4019 
4020 	/*
4021 	 * Allocate one big chunk for all regular rx copy bufs
4022 	 */
4023 	state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
4024 
4025 	state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe *
4026 	    state->id_rx_buf_sz, KM_SLEEP);
4027 
4028 	state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe *
4029 	    sizeof (ibd_rwqe_t), KM_SLEEP);
4030 
4031 	state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
4032 	state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
4033 	    sizeof (ibd_rx_queue_t), KM_SLEEP);
4034 	for (i = 0; i < state->id_rx_nqueues; i++) {
4035 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4036 		mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
4037 	}
4038 
4039 	/*
4040 	 * Do one memory registration on the entire rxbuf area
4041 	 */
4042 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
4043 	mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz;
4044 	mem_attr.mr_as = NULL;
4045 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
4046 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
4047 	    &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
4048 		DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
4049 		kmem_free(state->id_rx_wqes,
4050 		    state->id_ud_num_rwqe * sizeof (ibd_rwqe_t));
4051 		kmem_free(state->id_rx_bufs,
4052 		    state->id_ud_num_rwqe * state->id_rx_buf_sz);
4053 		state->id_rx_bufs = NULL;
4054 		state->id_rx_wqes = NULL;
4055 		return (DDI_FAILURE);
4056 	}
4057 
4058 	return (DDI_SUCCESS);
4059 }
4060 
4061 /*
4062  * Allocate the statically allocated Rx buffer list.
4063  */
4064 static int
4065 ibd_init_rxlist(ibd_state_t *state)
4066 {
4067 	ibd_rwqe_t *rwqe, *next;
4068 	ibd_wqe_t *list;
4069 	ibt_lkey_t lkey;
4070 	int i;
4071 	uint_t len;
4072 	uint8_t *bufaddr;
4073 
4074 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4075 	if (state->id_rx_free_list.dl_head != NULL) {
4076 		/* rx rsrcs were never freed.  Just repost them */
4077 		len = state->id_rx_buf_sz;
4078 		list = state->id_rx_free_list.dl_head;
4079 		state->id_rx_free_list.dl_head = NULL;
4080 		state->id_rx_free_list.dl_cnt = 0;
4081 		mutex_exit(&state->id_rx_free_list.dl_mutex);
4082 		for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4083 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4084 			if ((rwqe->rwqe_im_mblk = desballoc(
4085 			    rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
4086 			    &rwqe->w_freemsg_cb)) == NULL) {
4087 				/* allow freemsg_cb to free the rwqes */
4088 				if (atomic_dec_32_nv(&state->id_running) != 0) {
4089 					cmn_err(CE_WARN, "ibd_init_rxlist: "
4090 					    "id_running was not 1\n");
4091 				}
4092 				DPRINT(10, "ibd_init_rxlist : "
4093 				    "failed in desballoc()");
4094 				for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4095 				    rwqe = next) {
4096 					next = WQE_TO_RWQE(rwqe->rwqe_next);
4097 					if (rwqe->rwqe_im_mblk) {
4098 						atomic_inc_32(&state->
4099 						    id_rx_list.
4100 						    dl_bufs_outstanding);
4101 						freemsg(rwqe->rwqe_im_mblk);
4102 					} else
4103 						ibd_free_rwqe(state, rwqe);
4104 				}
4105 				atomic_inc_32(&state->id_running);
4106 				return (DDI_FAILURE);
4107 			}
4108 		}
4109 		ibd_post_recv_list(state, WQE_TO_RWQE(list));
4110 		return (DDI_SUCCESS);
4111 	}
4112 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4113 
4114 	if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
4115 		return (DDI_FAILURE);
4116 
4117 	/*
4118 	 * Allocate and setup the rwqe list
4119 	 */
4120 	len = state->id_rx_buf_sz;
4121 	lkey = state->id_rx_mr_desc.md_lkey;
4122 	rwqe = state->id_rx_wqes;
4123 	bufaddr = state->id_rx_bufs;
4124 	list = NULL;
4125 	for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) {
4126 		rwqe->w_state = state;
4127 		rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
4128 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
4129 
4130 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
4131 
4132 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
4133 		    &rwqe->w_freemsg_cb)) == NULL) {
4134 			DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
4135 			/* allow freemsg_cb to free the rwqes */
4136 			if (atomic_dec_32_nv(&state->id_running) != 0) {
4137 				cmn_err(CE_WARN, "ibd_init_rxlist: "
4138 				    "id_running was not 1\n");
4139 			}
4140 			DPRINT(10, "ibd_init_rxlist : "
4141 			    "failed in desballoc()");
4142 			for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4143 			    rwqe = next) {
4144 				next = WQE_TO_RWQE(rwqe->rwqe_next);
4145 				freemsg(rwqe->rwqe_im_mblk);
4146 			}
4147 			atomic_inc_32(&state->id_running);
4148 
4149 			/* remove reference to free'd rwqes */
4150 			mutex_enter(&state->id_rx_free_list.dl_mutex);
4151 			state->id_rx_free_list.dl_head = NULL;
4152 			state->id_rx_free_list.dl_cnt = 0;
4153 			mutex_exit(&state->id_rx_free_list.dl_mutex);
4154 
4155 			ibd_fini_rxlist(state);
4156 			return (DDI_FAILURE);
4157 		}
4158 
4159 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
4160 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
4161 		    (ib_vaddr_t)(uintptr_t)bufaddr;
4162 		rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
4163 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
4164 		rwqe->w_rwr.wr_nds = 1;
4165 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
4166 
4167 		rwqe->rwqe_next = list;
4168 		list = RWQE_TO_WQE(rwqe);
4169 	}
4170 	ibd_post_recv_list(state, WQE_TO_RWQE(list));
4171 
4172 	return (DDI_SUCCESS);
4173 }
4174 
4175 static void
4176 ibd_free_rx_copybufs(ibd_state_t *state)
4177 {
4178 	int i;
4179 
4180 	/*
4181 	 * Unregister rxbuf mr
4182 	 */
4183 	if (ibt_deregister_mr(state->id_hca_hdl,
4184 	    state->id_rx_mr_hdl) != IBT_SUCCESS) {
4185 		DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
4186 	}
4187 	state->id_rx_mr_hdl = NULL;
4188 
4189 	/*
4190 	 * Free rxbuf memory
4191 	 */
4192 	for (i = 0; i < state->id_rx_nqueues; i++) {
4193 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4194 		mutex_destroy(&rxp->rx_post_lock);
4195 	}
4196 	kmem_free(state->id_rx_queues, state->id_rx_nqueues *
4197 	    sizeof (ibd_rx_queue_t));
4198 	kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe *
4199 	    sizeof (ibd_rwqe_t));
4200 	kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe *
4201 	    state->id_rx_buf_sz);
4202 	state->id_rx_queues = NULL;
4203 	state->id_rx_wqes = NULL;
4204 	state->id_rx_bufs = NULL;
4205 }
4206 
4207 static void
4208 ibd_free_rx_rsrcs(ibd_state_t *state)
4209 {
4210 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4211 	if (state->id_rx_free_list.dl_head == NULL) {
4212 		/* already freed */
4213 		mutex_exit(&state->id_rx_free_list.dl_mutex);
4214 		return;
4215 	}
4216 	ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe);
4217 	ibd_free_rx_copybufs(state);
4218 	state->id_rx_free_list.dl_cnt = 0;
4219 	state->id_rx_free_list.dl_head = NULL;
4220 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4221 }
4222 
4223 /*
4224  * Free the statically allocated Rx buffer list.
4225  */
4226 static void
4227 ibd_fini_rxlist(ibd_state_t *state)
4228 {
4229 	ibd_rwqe_t *rwqe;
4230 	int i;
4231 
4232 	/* run through the rx_queue's, calling freemsg() */
4233 	for (i = 0; i < state->id_rx_nqueues; i++) {
4234 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4235 		mutex_enter(&rxp->rx_post_lock);
4236 		for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe;
4237 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4238 			freemsg(rwqe->rwqe_im_mblk);
4239 			rxp->rx_cnt--;
4240 		}
4241 		rxp->rx_head = NULL;
4242 		mutex_exit(&rxp->rx_post_lock);
4243 	}
4244 
4245 	/* cannot free rx resources unless gld returned everything */
4246 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0)
4247 		ibd_free_rx_rsrcs(state);
4248 }
4249 
4250 /*
4251  * Free an allocated recv wqe.
4252  */
4253 /* ARGSUSED */
4254 static void
4255 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
4256 {
4257 	/*
4258 	 * desballoc() failed (no memory).
4259 	 *
4260 	 * This rwqe is placed on a free list so that it
4261 	 * can be reinstated when memory is available.
4262 	 *
4263 	 * NOTE: no code currently exists to reinstate
4264 	 * these "lost" rwqes.
4265 	 */
4266 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4267 	state->id_rx_free_list.dl_cnt++;
4268 	rwqe->rwqe_next = state->id_rx_free_list.dl_head;
4269 	state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
4270 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4271 }
4272 
4273 /*
4274  * IBA Rx completion queue handler. Guaranteed to be single
4275  * threaded and nonreentrant for this CQ.
4276  */
4277 /* ARGSUSED */
4278 static void
4279 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4280 {
4281 	ibd_state_t *state = (ibd_state_t *)arg;
4282 
4283 	atomic_inc_64(&state->id_num_intrs);
4284 
4285 	if (ibd_rx_softintr == 1) {
4286 		mutex_enter(&state->id_rcq_poll_lock);
4287 		if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
4288 			state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
4289 			mutex_exit(&state->id_rcq_poll_lock);
4290 			return;
4291 		} else {
4292 			mutex_exit(&state->id_rcq_poll_lock);
4293 			ddi_trigger_softintr(state->id_rx);
4294 		}
4295 	} else
4296 		(void) ibd_intr((caddr_t)state);
4297 }
4298 
4299 /*
4300  * CQ handler for Tx completions, when the Tx CQ is in
4301  * interrupt driven mode.
4302  */
4303 /* ARGSUSED */
4304 static void
4305 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4306 {
4307 	ibd_state_t *state = (ibd_state_t *)arg;
4308 
4309 	atomic_inc_64(&state->id_num_intrs);
4310 
4311 	if (ibd_tx_softintr == 1) {
4312 		mutex_enter(&state->id_scq_poll_lock);
4313 		if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
4314 			state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
4315 			mutex_exit(&state->id_scq_poll_lock);
4316 			return;
4317 		} else {
4318 			mutex_exit(&state->id_scq_poll_lock);
4319 			ddi_trigger_softintr(state->id_tx);
4320 		}
4321 	} else
4322 		(void) ibd_tx_recycle((caddr_t)state);
4323 }
4324 
4325 /*
4326  * Multicast group create/delete trap handler. These will be delivered
4327  * on a kernel thread (handling can thus block) and can be invoked
4328  * concurrently. The handler can be invoked anytime after it is
4329  * registered and before ibt_detach().
4330  */
4331 /* ARGSUSED */
4332 static void
4333 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
4334     ibt_subnet_event_t *event)
4335 {
4336 	ibd_state_t *state = (ibd_state_t *)arg;
4337 	ibd_req_t *req;
4338 
4339 	/*
4340 	 * The trap handler will get invoked once for every event for
4341 	 * every port. The input "gid" is the GID0 of the port the
4342 	 * trap came in on; we just need to act on traps that came
4343 	 * to our port, meaning the port on which the ipoib interface
4344 	 * resides. Since ipoib uses GID0 of the port, we just match
4345 	 * the gids to check whether we need to handle the trap.
4346 	 */
4347 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4348 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4349 		return;
4350 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4351 
4352 	DPRINT(10, "ibd_notices_handler : %d\n", code);
4353 
4354 	switch (code) {
4355 		case IBT_SM_EVENT_UNAVAILABLE:
4356 			/*
4357 			 * If we are in promiscuous mode or have
4358 			 * sendnonmembers, we need to print a warning
4359 			 * message right now. Else, just store the
4360 			 * information, print when we enter promiscuous
4361 			 * mode or attempt nonmember send. We might
4362 			 * also want to stop caching sendnonmember.
4363 			 */
4364 			ibd_print_warn(state, "IBA multicast support "
4365 			    "degraded due to unavailability of multicast "
4366 			    "traps");
4367 			break;
4368 		case IBT_SM_EVENT_AVAILABLE:
4369 			/*
4370 			 * If we printed a warning message above or
4371 			 * while trying to nonmember send or get into
4372 			 * promiscuous mode, print an okay message.
4373 			 */
4374 			ibd_print_warn(state, "IBA multicast support "
4375 			    "restored due to availability of multicast "
4376 			    "traps");
4377 			break;
4378 		case IBT_SM_EVENT_MCG_CREATED:
4379 		case IBT_SM_EVENT_MCG_DELETED:
4380 			/*
4381 			 * If it is a "deleted" event and we are in late hca
4382 			 * init, nothing to do.
4383 			 */
4384 			if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4385 			    IBD_DRV_IN_LATE_HCA_INIT) && (code ==
4386 			    IBT_SM_EVENT_MCG_DELETED)) {
4387 				break;
4388 			}
4389 			/*
4390 			 * Common processing of creation/deletion traps.
4391 			 * First check if the instance is being
4392 			 * [de]initialized; back off then, without doing
4393 			 * anything more, since we are not sure if the
4394 			 * async thread is around, or whether we might
4395 			 * be racing with the detach code in ibd_m_stop()
4396 			 * that scans the mcg list.
4397 			 */
4398 			if (!ibd_async_safe(state))
4399 				return;
4400 
4401 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4402 			req->rq_gid = event->sm_notice_gid;
4403 			req->rq_ptr = (void *)code;
4404 			ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4405 			break;
4406 	}
4407 }
4408 
4409 static void
4410 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4411 {
4412 	ib_gid_t mgid = req->rq_gid;
4413 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4414 	int ret;
4415 	ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff;
4416 
4417 	DPRINT(10, "ibd_async_trap : %d\n", code);
4418 
4419 	/*
4420 	 * Check if we have already joined the IPoIB broadcast group for our
4421 	 * PKEY. If joined, perform the rest of the operation.
4422 	 * Else, the interface is not initialised. Do the initialisation here
4423 	 * by calling ibd_start() and return.
4424 	 */
4425 
4426 	if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4427 	    IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) &&
4428 	    (code == IBT_SM_EVENT_MCG_CREATED)) {
4429 		/*
4430 		 * If we are in late HCA init and a notification for the
4431 		 * creation of a MCG came in, check if it is the IPoIB MCG for
4432 		 * this pkey. If not, return.
4433 		 */
4434 		if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey !=
4435 		    state->id_pkey)) {
4436 			ibd_async_done(state);
4437 			return;
4438 		}
4439 		ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4440 		/*
4441 		 * Check if there is still a necessity to start the interface.
4442 		 * It is possible that the user attempted unplumb at just about
4443 		 * the same time, and if unplumb succeeded, we have nothing to
4444 		 * do.
4445 		 */
4446 		if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4447 		    IBD_DRV_IN_LATE_HCA_INIT) &&
4448 		    ((ret = ibd_start(state)) != 0)) {
4449 			DPRINT(10, "ibd_async_trap: cannot start from late HCA "
4450 			    "init, ret=%d", ret);
4451 		}
4452 		ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4453 		ibd_async_done(state);
4454 		return;
4455 	}
4456 
4457 	/*
4458 	 * Atomically search the nonmember and sendonlymember lists and
4459 	 * delete.
4460 	 */
4461 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4462 
4463 	if (state->id_prom_op == IBD_OP_COMPLETED) {
4464 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4465 
4466 		/*
4467 		 * If in promiscuous mode, try to join/attach to the new
4468 		 * mcg. Given the unreliable out-of-order mode of trap
4469 		 * delivery, we can never be sure whether it is a problem
4470 		 * if the join fails. Thus, we warn the admin of a failure
4471 		 * if this was a creation trap. Note that the trap might
4472 		 * actually be reporting a long past event, and the mcg
4473 		 * might already have been deleted, thus we might be warning
4474 		 * in vain.
4475 		 */
4476 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4477 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4478 			ibd_print_warn(state, "IBA promiscuous mode missed "
4479 			    "new multicast gid %016llx:%016llx",
4480 			    (u_longlong_t)mgid.gid_prefix,
4481 			    (u_longlong_t)mgid.gid_guid);
4482 	}
4483 
4484 	/*
4485 	 * Free the request slot allocated by the subnet event thread.
4486 	 */
4487 	ibd_async_done(state);
4488 }
4489 
4490 /*
4491  * GLDv3 entry point to get capabilities.
4492  */
4493 static boolean_t
4494 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4495 {
4496 	ibd_state_t *state = arg;
4497 
4498 	if (state->id_type == IBD_PORT_DRIVER)
4499 		return (B_FALSE);
4500 
4501 	switch (cap) {
4502 	case MAC_CAPAB_HCKSUM: {
4503 		uint32_t *txflags = cap_data;
4504 
4505 		/*
4506 		 * We either do full checksum or not do it at all
4507 		 */
4508 		if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4509 			*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4510 		else
4511 			return (B_FALSE);
4512 		break;
4513 	}
4514 
4515 	case MAC_CAPAB_LSO: {
4516 		mac_capab_lso_t *cap_lso = cap_data;
4517 
4518 		/*
4519 		 * In addition to the capability and policy, since LSO
4520 		 * relies on hw checksum, we'll not enable LSO if we
4521 		 * don't have hw checksum.  Of course, if the HCA doesn't
4522 		 * provide the reserved lkey capability, enabling LSO will
4523 		 * actually affect performance adversely, so we'll disable
4524 		 * LSO even for that case.
4525 		 */
4526 		if (!state->id_lso_policy || !state->id_lso_capable)
4527 			return (B_FALSE);
4528 
4529 		if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4530 			return (B_FALSE);
4531 
4532 		if (state->id_hca_res_lkey_capab == 0) {
4533 			ibd_print_warn(state, "no reserved-lkey capability, "
4534 			    "disabling LSO");
4535 			return (B_FALSE);
4536 		}
4537 
4538 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4539 		cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4540 		break;
4541 	}
4542 
4543 	default:
4544 		return (B_FALSE);
4545 	}
4546 
4547 	return (B_TRUE);
4548 }
4549 
4550 /*
4551  * callback function for set/get of properties
4552  */
4553 static int
4554 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4555     uint_t pr_valsize, const void *pr_val)
4556 {
4557 	ibd_state_t *state = arg;
4558 	int err = 0;
4559 	uint32_t link_mode;
4560 
4561 	/* Cannot set properties on a port driver */
4562 	if (state->id_type == IBD_PORT_DRIVER) {
4563 		return (ENOTSUP);
4564 	}
4565 
4566 	switch (pr_num) {
4567 		case MAC_PROP_IB_LINKMODE:
4568 			if (state->id_mac_state & IBD_DRV_STARTED) {
4569 				err = EBUSY;
4570 				break;
4571 			}
4572 			if (pr_val == NULL) {
4573 				err = EINVAL;
4574 				break;
4575 			}
4576 			bcopy(pr_val, &link_mode, sizeof (link_mode));
4577 			if (link_mode != IBD_LINK_MODE_UD &&
4578 			    link_mode != IBD_LINK_MODE_RC) {
4579 				err = EINVAL;
4580 			} else {
4581 				if (link_mode == IBD_LINK_MODE_RC) {
4582 					if (state->id_enable_rc) {
4583 						return (0);
4584 					}
4585 					state->id_enable_rc = 1;
4586 					/* inform MAC framework of new MTU */
4587 					err = mac_maxsdu_update(state->id_mh,
4588 					    state->rc_mtu - IPOIB_HDRSIZE);
4589 				} else {
4590 					if (!state->id_enable_rc) {
4591 						return (0);
4592 					}
4593 					state->id_enable_rc = 0;
4594 					err = mac_maxsdu_update(state->id_mh,
4595 					    state->id_mtu - IPOIB_HDRSIZE);
4596 				}
4597 				(void) ibd_record_capab(state);
4598 				mac_capab_update(state->id_mh);
4599 			}
4600 			break;
4601 		case MAC_PROP_PRIVATE:
4602 			err = ibd_set_priv_prop(state, pr_name,
4603 			    pr_valsize, pr_val);
4604 			break;
4605 		default:
4606 			err = ENOTSUP;
4607 			break;
4608 	}
4609 	return (err);
4610 }
4611 
4612 static int
4613 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4614     uint_t pr_valsize, void *pr_val)
4615 {
4616 	ibd_state_t *state = arg;
4617 	int err = 0;
4618 
4619 	switch (pr_num) {
4620 		case MAC_PROP_MTU:
4621 			break;
4622 		default:
4623 			if (state->id_type == IBD_PORT_DRIVER) {
4624 				return (ENOTSUP);
4625 			}
4626 			break;
4627 	}
4628 
4629 	switch (pr_num) {
4630 		case MAC_PROP_IB_LINKMODE:
4631 			*(uint_t *)pr_val = state->id_enable_rc;
4632 			break;
4633 		case MAC_PROP_PRIVATE:
4634 			err = ibd_get_priv_prop(state, pr_name, pr_valsize,
4635 			    pr_val);
4636 			break;
4637 		default:
4638 			err = ENOTSUP;
4639 			break;
4640 	}
4641 	return (err);
4642 }
4643 
4644 static void
4645 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4646     mac_prop_info_handle_t prh)
4647 {
4648 	ibd_state_t *state = arg;
4649 
4650 	switch (pr_num) {
4651 	case MAC_PROP_IB_LINKMODE: {
4652 		mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE);
4653 		break;
4654 	}
4655 	case MAC_PROP_MTU: {
4656 		uint32_t min, max;
4657 		if (state->id_type == IBD_PORT_DRIVER) {
4658 			min = 1500;
4659 			max = IBD_DEF_RC_MAX_SDU;
4660 		} else if (state->id_enable_rc) {
4661 			min = max = IBD_DEF_RC_MAX_SDU;
4662 		} else {
4663 			min = max = state->id_mtu - IPOIB_HDRSIZE;
4664 		}
4665 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4666 		mac_prop_info_set_range_uint32(prh, min, max);
4667 		break;
4668 	}
4669 	case MAC_PROP_PRIVATE: {
4670 		char valstr[64];
4671 		int value;
4672 
4673 		if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
4674 			mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4675 			return;
4676 		} else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4677 			value = IBD_DEF_COALESCE_COMPLETIONS;
4678 		} else if (strcmp(pr_name,
4679 		    "_ibd_create_broadcast_group") == 0) {
4680 			value = IBD_DEF_CREATE_BCAST_GROUP;
4681 		} else if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4682 			value = IBD_DEF_HASH_SIZE;
4683 		} else if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4684 			value = IBD_DEF_LSO_POLICY;
4685 		} else if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4686 			value = IBD_DEF_NUM_AH;
4687 		} else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4688 			value = IBD_DEF_NUM_LSO_BUFS;
4689 		} else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4690 			value = IBD_DEF_RC_ENABLE_SRQ;
4691 		} else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4692 			value = IBD_DEF_RC_NUM_RWQE;
4693 		} else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4694 			value = IBD_DEF_RC_NUM_SRQ;
4695 		} else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4696 			value = IBD_DEF_RC_NUM_SWQE;
4697 		} else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4698 			value = IBD_DEF_RC_RX_COMP_COUNT;
4699 		} else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4700 			value = IBD_DEF_RC_RX_COMP_USEC;
4701 		} else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4702 			value = IBD_DEF_RC_RX_COPY_THRESH;
4703 		} else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4704 			value = IBD_DEF_RC_RX_RWQE_THRESH;
4705 		} else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4706 			value = IBD_DEF_RC_TX_COMP_COUNT;
4707 		} else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
4708 			value = IBD_DEF_RC_TX_COMP_USEC;
4709 		} else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
4710 			value = IBD_DEF_RC_TX_COPY_THRESH;
4711 		} else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
4712 			value = IBD_DEF_UD_NUM_RWQE;
4713 		} else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
4714 			value = IBD_DEF_UD_NUM_SWQE;
4715 		} else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
4716 			value = IBD_DEF_UD_RX_COMP_COUNT;
4717 		} else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
4718 			value = IBD_DEF_UD_RX_COMP_USEC;
4719 		} else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
4720 			value = IBD_DEF_UD_TX_COMP_COUNT;
4721 		} else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
4722 			value = IBD_DEF_UD_TX_COMP_USEC;
4723 		} else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
4724 			value = IBD_DEF_UD_TX_COPY_THRESH;
4725 		} else {
4726 			return;
4727 		}
4728 
4729 		(void) snprintf(valstr, sizeof (valstr), "%d", value);
4730 		mac_prop_info_set_default_str(prh, valstr);
4731 		break;
4732 	}
4733 	} /* switch (pr_num) */
4734 }
4735 
4736 /* ARGSUSED2 */
4737 static int
4738 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name,
4739     uint_t pr_valsize, const void *pr_val)
4740 {
4741 	int err = 0;
4742 	long result;
4743 
4744 	if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4745 		if (pr_val == NULL) {
4746 			return (EINVAL);
4747 		}
4748 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4749 		if (result < 0 || result > 1) {
4750 			err = EINVAL;
4751 		} else {
4752 			state->id_allow_coalesce_comp_tuning = (result == 1) ?
4753 			    B_TRUE: B_FALSE;
4754 		}
4755 		return (err);
4756 	}
4757 	if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
4758 		if (state->id_mac_state & IBD_DRV_STARTED) {
4759 			return (EBUSY);
4760 		}
4761 		if (pr_val == NULL) {
4762 			return (EINVAL);
4763 		}
4764 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4765 		if (result < 0 || result > 1) {
4766 			err = EINVAL;
4767 		} else {
4768 			state->id_create_broadcast_group = (result == 1) ?
4769 			    B_TRUE: B_FALSE;
4770 		}
4771 		return (err);
4772 	}
4773 	if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4774 		if (state->id_mac_state & IBD_DRV_STARTED) {
4775 			return (EBUSY);
4776 		}
4777 		if (pr_val == NULL) {
4778 			return (EINVAL);
4779 		}
4780 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4781 		if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) {
4782 			err = EINVAL;
4783 		} else {
4784 			state->id_hash_size = (uint32_t)result;
4785 		}
4786 		return (err);
4787 	}
4788 	if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4789 		if (state->id_mac_state & IBD_DRV_STARTED) {
4790 			return (EBUSY);
4791 		}
4792 		if (pr_val == NULL) {
4793 			return (EINVAL);
4794 		}
4795 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4796 		if (result < 0 || result > 1) {
4797 			err = EINVAL;
4798 		} else {
4799 			state->id_lso_policy = (result == 1) ?
4800 			    B_TRUE: B_FALSE;
4801 		}
4802 		mac_capab_update(state->id_mh);
4803 		return (err);
4804 	}
4805 	if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4806 		if (state->id_mac_state & IBD_DRV_STARTED) {
4807 			return (EBUSY);
4808 		}
4809 		if (pr_val == NULL) {
4810 			return (EINVAL);
4811 		}
4812 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4813 		if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) {
4814 			err = EINVAL;
4815 		} else {
4816 			state->id_num_ah = (uint32_t)result;
4817 		}
4818 		return (err);
4819 	}
4820 	if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4821 		if (state->id_mac_state & IBD_DRV_STARTED) {
4822 			return (EBUSY);
4823 		}
4824 		if (!state->id_lso_policy || !state->id_lso_capable) {
4825 			return (EINVAL);
4826 		}
4827 		if (pr_val == NULL) {
4828 			return (EINVAL);
4829 		}
4830 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4831 		if (result < IBD_MIN_NUM_LSO_BUFS ||
4832 		    result > IBD_MAX_NUM_LSO_BUFS) {
4833 			err = EINVAL;
4834 		} else {
4835 			state->id_num_lso_bufs = (uint32_t)result;
4836 		}
4837 		return (err);
4838 	}
4839 	if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4840 		if (state->id_mac_state & IBD_DRV_STARTED) {
4841 			return (EBUSY);
4842 		}
4843 		if (pr_val == NULL) {
4844 			return (EINVAL);
4845 		}
4846 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4847 		if (result < 0 || result > 1) {
4848 			err = EINVAL;
4849 		} else {
4850 			state->rc_enable_srq = (result == 1) ?
4851 			    B_TRUE: B_FALSE;
4852 		}
4853 		if (!state->rc_enable_srq) {
4854 			state->id_rc_num_srq = 0;
4855 		}
4856 		return (err);
4857 	}
4858 	if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4859 		if (state->id_mac_state & IBD_DRV_STARTED) {
4860 			return (EBUSY);
4861 		}
4862 		if (pr_val == NULL) {
4863 			return (EINVAL);
4864 		}
4865 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4866 		if (result < IBD_MIN_RC_NUM_RWQE ||
4867 		    result > IBD_MAX_RC_NUM_RWQE) {
4868 			err = EINVAL;
4869 		} else {
4870 			state->id_rc_num_rwqe = (uint32_t)result;
4871 			if (state->id_allow_coalesce_comp_tuning &&
4872 			    state->id_rc_rx_comp_count > state->id_rc_num_rwqe)
4873 				state->id_rc_rx_comp_count =
4874 				    state->id_rc_num_rwqe;
4875 			if (state->id_rc_num_srq > state->id_rc_num_rwqe)
4876 				state->id_rc_num_srq =
4877 				    state->id_rc_num_rwqe - 1;
4878 			/*
4879 			 * If rx_rwqe_threshold is greater than the number of
4880 			 * rwqes, pull it back to 25% of number of rwqes.
4881 			 */
4882 			if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe)
4883 				state->id_rc_rx_rwqe_thresh =
4884 				    (state->id_rc_num_rwqe >> 2);
4885 
4886 		}
4887 		return (err);
4888 	}
4889 	if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4890 		if (state->id_mac_state & IBD_DRV_STARTED) {
4891 			return (EBUSY);
4892 		}
4893 		if (pr_val == NULL) {
4894 			return (EINVAL);
4895 		}
4896 		if (!state->rc_enable_srq)
4897 			return (EINVAL);
4898 
4899 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4900 		if (result < IBD_MIN_RC_NUM_SRQ ||
4901 		    result >= state->id_rc_num_rwqe) {
4902 			err = EINVAL;
4903 		} else
4904 			state->id_rc_num_srq = (uint32_t)result;
4905 		return (err);
4906 	}
4907 	if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4908 		if (state->id_mac_state & IBD_DRV_STARTED) {
4909 			return (EBUSY);
4910 		}
4911 		if (pr_val == NULL) {
4912 			return (EINVAL);
4913 		}
4914 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4915 		if (result < IBD_MIN_RC_NUM_SWQE ||
4916 		    result > IBD_MAX_RC_NUM_SWQE) {
4917 			err = EINVAL;
4918 		} else {
4919 			state->id_rc_num_swqe = (uint32_t)result;
4920 			if (state->id_allow_coalesce_comp_tuning &&
4921 			    state->id_rc_tx_comp_count > state->id_rc_num_swqe)
4922 				state->id_rc_tx_comp_count =
4923 				    state->id_rc_num_swqe;
4924 		}
4925 		return (err);
4926 	}
4927 	if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4928 		if (!state->id_allow_coalesce_comp_tuning) {
4929 			return (ENOTSUP);
4930 		}
4931 		if (pr_val == NULL) {
4932 			return (EINVAL);
4933 		}
4934 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4935 		if (result < 1 || result > state->id_rc_num_rwqe) {
4936 			err = EINVAL;
4937 		} else {
4938 			state->id_rc_rx_comp_count = (uint32_t)result;
4939 		}
4940 		return (err);
4941 	}
4942 	if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4943 		if (!state->id_allow_coalesce_comp_tuning) {
4944 			return (ENOTSUP);
4945 		}
4946 		if (pr_val == NULL) {
4947 			return (EINVAL);
4948 		}
4949 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4950 		if (result < 1) {
4951 			err = EINVAL;
4952 		} else {
4953 			state->id_rc_rx_comp_usec = (uint32_t)result;
4954 		}
4955 		return (err);
4956 	}
4957 	if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4958 		if (state->id_mac_state & IBD_DRV_STARTED) {
4959 			return (EBUSY);
4960 		}
4961 		if (pr_val == NULL) {
4962 			return (EINVAL);
4963 		}
4964 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4965 		if (result < IBD_MIN_RC_RX_COPY_THRESH ||
4966 		    result > state->rc_mtu) {
4967 			err = EINVAL;
4968 		} else {
4969 			state->id_rc_rx_copy_thresh = (uint32_t)result;
4970 		}
4971 		return (err);
4972 	}
4973 	if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4974 		if (state->id_mac_state & IBD_DRV_STARTED) {
4975 			return (EBUSY);
4976 		}
4977 		if (pr_val == NULL) {
4978 			return (EINVAL);
4979 		}
4980 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4981 		if (result < IBD_MIN_RC_RX_RWQE_THRESH ||
4982 		    result >= state->id_rc_num_rwqe) {
4983 			err = EINVAL;
4984 		} else {
4985 			state->id_rc_rx_rwqe_thresh = (uint32_t)result;
4986 		}
4987 		return (err);
4988 	}
4989 	if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4990 		if (!state->id_allow_coalesce_comp_tuning) {
4991 			return (ENOTSUP);
4992 		}
4993 		if (pr_val == NULL) {
4994 			return (EINVAL);
4995 		}
4996 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4997 		if (result < 1 || result > state->id_rc_num_swqe) {
4998 			err = EINVAL;
4999 		} else {
5000 			state->id_rc_tx_comp_count = (uint32_t)result;
5001 		}
5002 		return (err);
5003 	}
5004 	if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5005 		if (!state->id_allow_coalesce_comp_tuning) {
5006 			return (ENOTSUP);
5007 		}
5008 		if (pr_val == NULL) {
5009 			return (EINVAL);
5010 		}
5011 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5012 		if (result < 1)
5013 			err = EINVAL;
5014 		else {
5015 			state->id_rc_tx_comp_usec = (uint32_t)result;
5016 		}
5017 		return (err);
5018 	}
5019 	if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5020 		if (state->id_mac_state & IBD_DRV_STARTED) {
5021 			return (EBUSY);
5022 		}
5023 		if (pr_val == NULL) {
5024 			return (EINVAL);
5025 		}
5026 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5027 		if (result < IBD_MIN_RC_TX_COPY_THRESH ||
5028 		    result > state->rc_mtu) {
5029 			err = EINVAL;
5030 		} else {
5031 			state->id_rc_tx_copy_thresh = (uint32_t)result;
5032 		}
5033 		return (err);
5034 	}
5035 	if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5036 		if (state->id_mac_state & IBD_DRV_STARTED) {
5037 			return (EBUSY);
5038 		}
5039 		if (pr_val == NULL) {
5040 			return (EINVAL);
5041 		}
5042 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5043 		if (result < IBD_MIN_UD_NUM_RWQE ||
5044 		    result > IBD_MAX_UD_NUM_RWQE) {
5045 			err = EINVAL;
5046 		} else {
5047 			if (result > state->id_hca_max_chan_sz) {
5048 				state->id_ud_num_rwqe =
5049 				    state->id_hca_max_chan_sz;
5050 			} else {
5051 				state->id_ud_num_rwqe = (uint32_t)result;
5052 			}
5053 			if (state->id_allow_coalesce_comp_tuning &&
5054 			    state->id_ud_rx_comp_count > state->id_ud_num_rwqe)
5055 				state->id_ud_rx_comp_count =
5056 				    state->id_ud_num_rwqe;
5057 		}
5058 		return (err);
5059 	}
5060 	if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5061 		if (state->id_mac_state & IBD_DRV_STARTED) {
5062 			return (EBUSY);
5063 		}
5064 		if (pr_val == NULL) {
5065 			return (EINVAL);
5066 		}
5067 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5068 		if (result < IBD_MIN_UD_NUM_SWQE ||
5069 		    result > IBD_MAX_UD_NUM_SWQE) {
5070 			err = EINVAL;
5071 		} else {
5072 			if (result > state->id_hca_max_chan_sz) {
5073 				state->id_ud_num_swqe =
5074 				    state->id_hca_max_chan_sz;
5075 			} else {
5076 				state->id_ud_num_swqe = (uint32_t)result;
5077 			}
5078 			if (state->id_allow_coalesce_comp_tuning &&
5079 			    state->id_ud_tx_comp_count > state->id_ud_num_swqe)
5080 				state->id_ud_tx_comp_count =
5081 				    state->id_ud_num_swqe;
5082 		}
5083 		return (err);
5084 	}
5085 	if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5086 		if (!state->id_allow_coalesce_comp_tuning) {
5087 			return (ENOTSUP);
5088 		}
5089 		if (pr_val == NULL) {
5090 			return (EINVAL);
5091 		}
5092 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5093 		if (result < 1 || result > state->id_ud_num_rwqe) {
5094 			err = EINVAL;
5095 		} else {
5096 			state->id_ud_rx_comp_count = (uint32_t)result;
5097 		}
5098 		return (err);
5099 	}
5100 	if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5101 		if (!state->id_allow_coalesce_comp_tuning) {
5102 			return (ENOTSUP);
5103 		}
5104 		if (pr_val == NULL) {
5105 			return (EINVAL);
5106 		}
5107 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5108 		if (result < 1) {
5109 			err = EINVAL;
5110 		} else {
5111 			state->id_ud_rx_comp_usec = (uint32_t)result;
5112 		}
5113 		return (err);
5114 	}
5115 	if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5116 		if (!state->id_allow_coalesce_comp_tuning) {
5117 			return (ENOTSUP);
5118 		}
5119 		if (pr_val == NULL) {
5120 			return (EINVAL);
5121 		}
5122 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5123 		if (result < 1 || result > state->id_ud_num_swqe) {
5124 			err = EINVAL;
5125 		} else {
5126 			state->id_ud_tx_comp_count = (uint32_t)result;
5127 		}
5128 		return (err);
5129 	}
5130 	if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5131 		if (!state->id_allow_coalesce_comp_tuning) {
5132 			return (ENOTSUP);
5133 		}
5134 		if (pr_val == NULL) {
5135 			return (EINVAL);
5136 		}
5137 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5138 		if (result < 1) {
5139 			err = EINVAL;
5140 		} else {
5141 			state->id_ud_tx_comp_usec = (uint32_t)result;
5142 		}
5143 		return (err);
5144 	}
5145 	if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5146 		if (state->id_mac_state & IBD_DRV_STARTED) {
5147 			return (EBUSY);
5148 		}
5149 		if (pr_val == NULL) {
5150 			return (EINVAL);
5151 		}
5152 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5153 		if (result < IBD_MIN_UD_TX_COPY_THRESH ||
5154 		    result > IBD_MAX_UD_TX_COPY_THRESH) {
5155 			err = EINVAL;
5156 		} else {
5157 			state->id_ud_tx_copy_thresh = (uint32_t)result;
5158 		}
5159 		return (err);
5160 	}
5161 	return (ENOTSUP);
5162 }
5163 
5164 static int
5165 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize,
5166     void *pr_val)
5167 {
5168 	int err = ENOTSUP;
5169 	int value;
5170 
5171 	if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
5172 		value = state->id_bgroup_present;
5173 		err = 0;
5174 		goto done;
5175 	}
5176 	if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
5177 		value = state->id_allow_coalesce_comp_tuning;
5178 		err = 0;
5179 		goto done;
5180 	}
5181 	if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
5182 		value = state->id_create_broadcast_group;
5183 		err = 0;
5184 		goto done;
5185 	}
5186 	if (strcmp(pr_name, "_ibd_hash_size") == 0) {
5187 		value = state->id_hash_size;
5188 		err = 0;
5189 		goto done;
5190 	}
5191 	if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
5192 		value = state->id_lso_policy;
5193 		err = 0;
5194 		goto done;
5195 	}
5196 	if (strcmp(pr_name, "_ibd_num_ah") == 0) {
5197 		value = state->id_num_ah;
5198 		err = 0;
5199 		goto done;
5200 	}
5201 	if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
5202 		value = state->id_num_lso_bufs;
5203 		err = 0;
5204 		goto done;
5205 	}
5206 	if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
5207 		value = state->rc_enable_srq;
5208 		err = 0;
5209 		goto done;
5210 	}
5211 	if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
5212 		value = state->id_rc_num_rwqe;
5213 		err = 0;
5214 		goto done;
5215 	}
5216 	if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
5217 		value = state->id_rc_num_srq;
5218 		err = 0;
5219 		goto done;
5220 	}
5221 	if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
5222 		value = state->id_rc_num_swqe;
5223 		err = 0;
5224 		goto done;
5225 	}
5226 	if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
5227 		value = state->id_rc_rx_comp_count;
5228 		err = 0;
5229 		goto done;
5230 	}
5231 	if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
5232 		value = state->id_rc_rx_comp_usec;
5233 		err = 0;
5234 		goto done;
5235 	}
5236 	if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
5237 		value = state->id_rc_rx_copy_thresh;
5238 		err = 0;
5239 		goto done;
5240 	}
5241 	if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
5242 		value = state->id_rc_rx_rwqe_thresh;
5243 		err = 0;
5244 		goto done;
5245 	}
5246 	if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
5247 		value = state->id_rc_tx_comp_count;
5248 		err = 0;
5249 		goto done;
5250 	}
5251 	if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5252 		value = state->id_rc_tx_comp_usec;
5253 		err = 0;
5254 		goto done;
5255 	}
5256 	if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5257 		value = state->id_rc_tx_copy_thresh;
5258 		err = 0;
5259 		goto done;
5260 	}
5261 	if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5262 		value = state->id_ud_num_rwqe;
5263 		err = 0;
5264 		goto done;
5265 	}
5266 	if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5267 		value = state->id_ud_num_swqe;
5268 		err = 0;
5269 		goto done;
5270 	}
5271 	if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5272 		value = state->id_ud_rx_comp_count;
5273 		err = 0;
5274 		goto done;
5275 	}
5276 	if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5277 		value = state->id_ud_rx_comp_usec;
5278 		err = 0;
5279 		goto done;
5280 	}
5281 	if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5282 		value = state->id_ud_tx_comp_count;
5283 		err = 0;
5284 		goto done;
5285 	}
5286 	if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5287 		value = state->id_ud_tx_comp_usec;
5288 		err = 0;
5289 		goto done;
5290 	}
5291 	if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5292 		value = state->id_ud_tx_copy_thresh;
5293 		err = 0;
5294 		goto done;
5295 	}
5296 done:
5297 	if (err == 0) {
5298 		(void) snprintf(pr_val, pr_valsize, "%d", value);
5299 	}
5300 	return (err);
5301 }
5302 
5303 static int
5304 ibd_get_port_details(ibd_state_t *state)
5305 {
5306 	ibt_hca_portinfo_t *port_infop;
5307 	ibt_status_t ret;
5308 	uint_t psize, port_infosz;
5309 
5310 	mutex_enter(&state->id_link_mutex);
5311 
5312 	/*
5313 	 * Query for port information
5314 	 */
5315 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
5316 	    &port_infop, &psize, &port_infosz);
5317 	if ((ret != IBT_SUCCESS) || (psize != 1)) {
5318 		mutex_exit(&state->id_link_mutex);
5319 		DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
5320 		    "failed, ret=%d", ret);
5321 		return (ENETDOWN);
5322 	}
5323 
5324 	/*
5325 	 * If the link is active, verify the pkey
5326 	 */
5327 	if (port_infop->p_linkstate == IBT_PORT_ACTIVE) {
5328 		if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
5329 		    state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
5330 			state->id_link_state = LINK_STATE_DOWN;
5331 		} else {
5332 			state->id_link_state = LINK_STATE_UP;
5333 		}
5334 		state->id_mtu = (128 << port_infop->p_mtu);
5335 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
5336 		state->id_sgid = *port_infop->p_sgid_tbl;
5337 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
5338 		/*
5339 		 * Now that the port is active, record the port speed
5340 		 */
5341 		state->id_link_speed = ibd_get_portspeed(state);
5342 	} else {
5343 		/* Make sure that these are handled in PORT_UP/CHANGE */
5344 		state->id_mtu = 0;
5345 		state->id_link_state = LINK_STATE_DOWN;
5346 		state->id_link_speed = 0;
5347 	}
5348 	mutex_exit(&state->id_link_mutex);
5349 	ibt_free_portinfo(port_infop, port_infosz);
5350 
5351 	return (0);
5352 }
5353 
5354 static int
5355 ibd_alloc_cqs(ibd_state_t *state)
5356 {
5357 	ibt_hca_attr_t hca_attrs;
5358 	ibt_cq_attr_t cq_attr;
5359 	ibt_status_t ret;
5360 	uint32_t real_size;
5361 	uint_t num_rwqe_change = 0;
5362 	uint_t num_swqe_change = 0;
5363 
5364 	ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
5365 	ASSERT(ret == IBT_SUCCESS);
5366 
5367 	/*
5368 	 * Allocate Rx/combined CQ:
5369 	 * Theoretically, there is no point in having more than #rwqe
5370 	 * plus #swqe cqe's, except that the CQ will be signaled for
5371 	 * overflow when the last wqe completes, if none of the previous
5372 	 * cqe's have been polled. Thus, we allocate just a few less wqe's
5373 	 * to make sure such overflow does not occur.
5374 	 */
5375 	cq_attr.cq_sched = NULL;
5376 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
5377 
5378 	/*
5379 	 * Allocate Receive CQ.
5380 	 */
5381 	if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) {
5382 		cq_attr.cq_size = state->id_ud_num_rwqe + 1;
5383 	} else {
5384 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5385 		num_rwqe_change = state->id_ud_num_rwqe;
5386 		state->id_ud_num_rwqe = cq_attr.cq_size - 1;
5387 	}
5388 
5389 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5390 	    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
5391 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
5392 		    "failed, ret=%d\n", ret);
5393 		return (DDI_FAILURE);
5394 	}
5395 
5396 	if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count,
5397 	    state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) {
5398 		DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
5399 		    "moderation failed, ret=%d\n", ret);
5400 	}
5401 
5402 	/* make the #rx wc's the same as max rx chain size */
5403 	state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
5404 	state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
5405 	    state->id_rxwcs_size, KM_SLEEP);
5406 
5407 	/*
5408 	 * Allocate Send CQ.
5409 	 */
5410 	if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) {
5411 		cq_attr.cq_size = state->id_ud_num_swqe + 1;
5412 	} else {
5413 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5414 		num_swqe_change = state->id_ud_num_swqe;
5415 		state->id_ud_num_swqe = cq_attr.cq_size - 1;
5416 	}
5417 
5418 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5419 	    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
5420 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
5421 		    "failed, ret=%d\n", ret);
5422 		kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
5423 		    state->id_rxwcs_size);
5424 		(void) ibt_free_cq(state->id_rcq_hdl);
5425 		return (DDI_FAILURE);
5426 	}
5427 	if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count,
5428 	    state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) {
5429 		DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
5430 		    "moderation failed, ret=%d\n", ret);
5431 	}
5432 
5433 	state->id_txwcs_size = IBD_TX_POLL_THRESH;
5434 	state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
5435 	    state->id_txwcs_size, KM_SLEEP);
5436 
5437 	/*
5438 	 * Print message in case we could not allocate as many wqe's
5439 	 * as was requested.
5440 	 */
5441 	if (num_rwqe_change) {
5442 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
5443 		    "%d", state->id_ud_num_rwqe, num_rwqe_change);
5444 	}
5445 	if (num_swqe_change) {
5446 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
5447 		    "%d", state->id_ud_num_swqe, num_swqe_change);
5448 	}
5449 
5450 	return (DDI_SUCCESS);
5451 }
5452 
5453 static int
5454 ibd_setup_ud_channel(ibd_state_t *state)
5455 {
5456 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
5457 	ibt_ud_chan_query_attr_t ud_chan_attr;
5458 	ibt_status_t ret;
5459 
5460 	ud_alloc_attr.ud_flags  = IBT_ALL_SIGNALED;
5461 	if (state->id_hca_res_lkey_capab)
5462 		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
5463 	if (state->id_lso_policy && state->id_lso_capable)
5464 		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
5465 
5466 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
5467 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
5468 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
5469 	ud_alloc_attr.ud_sizes.cs_sq    = state->id_ud_num_swqe;
5470 	ud_alloc_attr.ud_sizes.cs_rq    = state->id_ud_num_rwqe;
5471 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
5472 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
5473 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
5474 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
5475 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
5476 	ud_alloc_attr.ud_clone_chan	= NULL;
5477 
5478 	if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
5479 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
5480 		DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
5481 		    "failed, ret=%d\n", ret);
5482 		return (DDI_FAILURE);
5483 	}
5484 
5485 	if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
5486 	    &ud_chan_attr)) != IBT_SUCCESS) {
5487 		DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
5488 		    "failed, ret=%d\n", ret);
5489 		(void) ibt_free_channel(state->id_chnl_hdl);
5490 		return (DDI_FAILURE);
5491 	}
5492 
5493 	state->id_qpnum = ud_chan_attr.ud_qpn;
5494 
5495 	return (DDI_SUCCESS);
5496 }
5497 
5498 static int
5499 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
5500 {
5501 	uint32_t progress = state->id_mac_state;
5502 	uint_t attempts;
5503 	ibt_status_t ret;
5504 	ib_gid_t mgid;
5505 	ibd_mce_t *mce;
5506 	uint8_t jstate;
5507 
5508 	if (atomic_dec_32_nv(&state->id_running) != 0)
5509 		cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n");
5510 
5511 	/*
5512 	 * Before we try to stop/undo whatever we did in ibd_start(),
5513 	 * we need to mark the link state appropriately to prevent the
5514 	 * ip layer from using this instance for any new transfers. Note
5515 	 * that if the original state of the link was "up" when we're
5516 	 * here, we'll set the final link state to "unknown", to behave
5517 	 * in the same fashion as other ethernet drivers.
5518 	 */
5519 	mutex_enter(&state->id_link_mutex);
5520 	if (cur_link_state == LINK_STATE_DOWN) {
5521 		state->id_link_state = cur_link_state;
5522 	} else {
5523 		state->id_link_state = LINK_STATE_UNKNOWN;
5524 	}
5525 	mutex_exit(&state->id_link_mutex);
5526 	bzero(&state->id_macaddr, sizeof (ipoib_mac_t));
5527 	mac_link_update(state->id_mh, state->id_link_state);
5528 
5529 	state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
5530 	if (progress & IBD_DRV_STARTED) {
5531 		state->id_mac_state &= (~IBD_DRV_STARTED);
5532 	}
5533 
5534 	if (progress & IBD_DRV_IN_LATE_HCA_INIT) {
5535 		state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT);
5536 	}
5537 
5538 	/* Stop listen under Reliable Connected Mode */
5539 	if (progress & IBD_DRV_RC_LISTEN) {
5540 		ASSERT(state->id_enable_rc);
5541 		if (state->rc_listen_hdl != NULL) {
5542 			ibd_rc_stop_listen(state);
5543 		}
5544 		state->id_mac_state &= (~IBD_DRV_RC_LISTEN);
5545 	}
5546 
5547 	if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) {
5548 		(void) ibd_rc_close_all_chan(state);
5549 	}
5550 
5551 	/*
5552 	 * First, stop receive interrupts; this stops the driver from
5553 	 * handing up buffers to higher layers.  Wait for receive buffers
5554 	 * to be returned and give up after 1 second.
5555 	 */
5556 	if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
5557 		attempts = 10;
5558 		while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding,
5559 		    0) > 0) {
5560 			delay(drv_usectohz(100000));
5561 			if (--attempts == 0) {
5562 				/*
5563 				 * There are pending bufs with the network
5564 				 * layer and we have no choice but to wait
5565 				 * for them to be done with. Reap all the
5566 				 * Tx/Rx completions that were posted since
5567 				 * we turned off the notification and
5568 				 * return failure.
5569 				 */
5570 				cmn_err(CE_CONT, "!ibd: bufs outstanding\n");
5571 				DPRINT(2, "ibd_undo_start: "
5572 				    "reclaiming failed");
5573 				break;
5574 			}
5575 		}
5576 		state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
5577 	}
5578 
5579 	if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) {
5580 		ibd_rc_fini_tx_largebuf_list(state);
5581 		state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD);
5582 	}
5583 
5584 	if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
5585 		ASSERT(state->id_enable_rc);
5586 		if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) {
5587 			ibd_rc_fini_srq_list(state);
5588 			state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
5589 		} else {
5590 			cmn_err(CE_CONT, "ibd_undo_start: srq bufs "
5591 			    "outstanding\n");
5592 		}
5593 	}
5594 
5595 	if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
5596 		ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
5597 
5598 		mutex_enter(&state->id_trap_lock);
5599 		state->id_trap_stop = B_TRUE;
5600 		while (state->id_trap_inprog > 0)
5601 			cv_wait(&state->id_trap_cv, &state->id_trap_lock);
5602 		mutex_exit(&state->id_trap_lock);
5603 
5604 		state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
5605 	}
5606 
5607 	if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
5608 		/*
5609 		 * Flushing the channel ensures that all pending WQE's
5610 		 * are marked with flush_error and handed to the CQ. It
5611 		 * does not guarantee the invocation of the CQ handler.
5612 		 * This call is guaranteed to return successfully for
5613 		 * UD QPNs.
5614 		 */
5615 		if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
5616 		    IBT_SUCCESS) {
5617 			DPRINT(10, "ibd_undo_start: flush_channel "
5618 			    "failed, ret=%d", ret);
5619 		}
5620 
5621 		/*
5622 		 * Give some time for the TX CQ handler to process the
5623 		 * completions.
5624 		 */
5625 		mutex_enter(&state->id_tx_list.dl_mutex);
5626 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
5627 		attempts = 10;
5628 		while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt
5629 		    != state->id_ud_num_swqe) {
5630 			if (--attempts == 0)
5631 				break;
5632 			mutex_exit(&state->id_tx_rel_list.dl_mutex);
5633 			mutex_exit(&state->id_tx_list.dl_mutex);
5634 			delay(drv_usectohz(100000));
5635 			mutex_enter(&state->id_tx_list.dl_mutex);
5636 			mutex_enter(&state->id_tx_rel_list.dl_mutex);
5637 		}
5638 		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
5639 		if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt !=
5640 		    state->id_ud_num_swqe) {
5641 			cmn_err(CE_WARN, "tx resources not freed\n");
5642 		}
5643 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5644 		mutex_exit(&state->id_tx_list.dl_mutex);
5645 
5646 		attempts = 10;
5647 		while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5648 			if (--attempts == 0)
5649 				break;
5650 			delay(drv_usectohz(100000));
5651 		}
5652 		ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
5653 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5654 			cmn_err(CE_WARN, "rx resources not freed\n");
5655 		}
5656 
5657 		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
5658 	}
5659 
5660 	if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
5661 		/*
5662 		 * Drop all residual full/non membership. This includes full
5663 		 * membership to the broadcast group, and any nonmembership
5664 		 * acquired during transmits. We do this after the Tx completion
5665 		 * handlers are done, since those might result in some late
5666 		 * leaves; this also eliminates a potential race with that
5667 		 * path wrt the mc full list insert/delete. Trap handling
5668 		 * has also been suppressed at this point. Thus, no locks
5669 		 * are required while traversing the mc full list.
5670 		 */
5671 		DPRINT(2, "ibd_undo_start: clear full cache entries");
5672 		mce = list_head(&state->id_mc_full);
5673 		while (mce != NULL) {
5674 			mgid = mce->mc_info.mc_adds_vect.av_dgid;
5675 			jstate = mce->mc_jstate;
5676 			mce = list_next(&state->id_mc_full, mce);
5677 			ibd_leave_group(state, mgid, jstate);
5678 		}
5679 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
5680 	}
5681 
5682 	if (progress & IBD_DRV_RXLIST_ALLOCD) {
5683 		ibd_fini_rxlist(state);
5684 		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
5685 	}
5686 
5687 	if (progress & IBD_DRV_TXLIST_ALLOCD) {
5688 		ibd_fini_txlist(state);
5689 		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
5690 	}
5691 
5692 	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
5693 		if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
5694 		    IBT_SUCCESS) {
5695 			DPRINT(10, "ibd_undo_start: free_channel "
5696 			    "failed, ret=%d", ret);
5697 		}
5698 
5699 		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
5700 	}
5701 
5702 	if (progress & IBD_DRV_CQS_ALLOCD) {
5703 		kmem_free(state->id_txwcs,
5704 		    sizeof (ibt_wc_t) * state->id_txwcs_size);
5705 		if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
5706 		    IBT_SUCCESS) {
5707 			DPRINT(10, "ibd_undo_start: free_cq(scq) "
5708 			    "failed, ret=%d", ret);
5709 		}
5710 
5711 		kmem_free(state->id_rxwcs,
5712 		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
5713 		if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
5714 			DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
5715 			    "ret=%d", ret);
5716 		}
5717 
5718 		state->id_txwcs = NULL;
5719 		state->id_rxwcs = NULL;
5720 		state->id_scq_hdl = NULL;
5721 		state->id_rcq_hdl = NULL;
5722 
5723 		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
5724 	}
5725 
5726 	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
5727 		mutex_enter(&state->id_ac_mutex);
5728 		mod_hash_destroy_hash(state->id_ah_active_hash);
5729 		mutex_exit(&state->id_ac_mutex);
5730 		ibd_acache_fini(state);
5731 
5732 		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
5733 	}
5734 
5735 	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
5736 		/*
5737 		 * If we'd created the ipoib broadcast group and had
5738 		 * successfully joined it, leave it now
5739 		 */
5740 		if (state->id_bgroup_created) {
5741 			mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
5742 			jstate = IB_MC_JSTATE_FULL;
5743 			(void) ibt_leave_mcg(state->id_sgid, mgid,
5744 			    state->id_sgid, jstate);
5745 		}
5746 		ibt_free_mcg_info(state->id_mcinfo, 1);
5747 
5748 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
5749 	}
5750 
5751 	return (DDI_SUCCESS);
5752 }
5753 
5754 /*
5755  * These pair of routines are used to set/clear the condition that
5756  * the caller is likely to do something to change the id_mac_state.
5757  * If there's already someone doing either a start or a stop (possibly
5758  * due to the async handler detecting a pkey relocation event, a plumb
5759  * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
5760  * that's done.
5761  */
5762 static void
5763 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
5764 {
5765 	mutex_enter(&state->id_macst_lock);
5766 	while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
5767 		cv_wait(&state->id_macst_cv, &state->id_macst_lock);
5768 
5769 	state->id_mac_state |= flag;
5770 	mutex_exit(&state->id_macst_lock);
5771 }
5772 
5773 static void
5774 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
5775 {
5776 	mutex_enter(&state->id_macst_lock);
5777 	state->id_mac_state &= (~flag);
5778 	cv_signal(&state->id_macst_cv);
5779 	mutex_exit(&state->id_macst_lock);
5780 }
5781 
5782 /*
5783  * GLDv3 entry point to start hardware.
5784  */
5785 /*ARGSUSED*/
5786 static int
5787 ibd_m_start(void *arg)
5788 {
5789 	ibd_state_t *state = arg;
5790 	int	ret;
5791 
5792 	if (state->id_type == IBD_PORT_DRIVER)
5793 		return (EINVAL);
5794 
5795 	ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5796 	if (state->id_mac_state & IBD_DRV_IN_DELETION) {
5797 		ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5798 		return (EIO);
5799 	}
5800 
5801 	ret = ibd_start(state);
5802 	ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5803 	return (ret);
5804 }
5805 
/*
 * Bring the interface up.
 *
 * Each initialization step that succeeds records a progress bit in
 * id_mac_state. On any hard failure we jump to start_fail, which calls
 * ibd_undo_start() and returns the errno-style value saved in 'err'.
 *
 * If the link is down, or the IPoIB broadcast group cannot be found, the
 * routine does not fail; instead it enters "late HCA initialization":
 * IBD_DRV_IN_LATE_HCA_INIT is set, the link is reported down to GLDv3 and
 * DDI_SUCCESS is returned. In the broadcast-group-not-found case the subnet
 * notices handler is registered first (unless already registered).
 *
 * Returns DDI_SUCCESS if the interface is already started, was started
 * now, or was left in late HCA initialization; otherwise an error value.
 */
static int
ibd_start(ibd_state_t *state)
{
	int err;
	ibt_status_t ret;
	int late_hca_init = 0;

	/* Already fully started; nothing to do */
	if (state->id_mac_state & IBD_DRV_STARTED)
		return (DDI_SUCCESS);

	/*
	 * We do not increment the running flag when calling ibd_start() as
	 * a result of some event which moves the state away from late HCA
	 * initialization viz. MCG_CREATED, PORT_CHANGE or link availability.
	 */
	if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
	    (atomic_inc_32_nv(&state->id_running) != 1)) {
		DPRINT(10, "ibd_start: id_running is non-zero");
		cmn_err(CE_WARN, "ibd_start: id_running was not 0\n");
		/* Back out our own increment before bailing */
		atomic_dec_32(&state->id_running);
		return (EINVAL);
	}

	/*
	 * Get port details; if we fail here, something bad happened.
	 * Fail plumb.
	 */
	if ((err = ibd_get_port_details(state)) != 0) {
		DPRINT(10, "ibd_start: ibd_get_port_details() failed");
		goto start_fail;
	}
	/*
	 * If state->id_link_state is DOWN, it indicates that either the port
	 * is down, or the pkey is not available. In both cases, resort to late
	 * initialization. Register for subnet notices, and return success.
	 */
	state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
	if (state->id_link_state == LINK_STATE_DOWN) {
		late_hca_init = 1;
		goto late_hca_init_return;
	}

	/*
	 * Find the IPoIB broadcast group
	 */
	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
		/* Resort to late initialization */
		late_hca_init = 1;
		goto reg_snet_notices;
	}
	state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;

	/*
	 * Initialize per-interface caches and lists; if we fail here,
	 * it is most likely due to a lack of resources
	 */
	if (ibd_acache_init(state) != DDI_SUCCESS) {
		DPRINT(10, "ibd_start: ibd_acache_init() failed");
		err = ENOMEM;
		goto start_fail;
	}
	state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;

	/*
	 * Allocate send and receive completion queues
	 */
	if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
		DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
		err = ENOMEM;
		goto start_fail;
	}
	state->id_mac_state |= IBD_DRV_CQS_ALLOCD;

	/*
	 * Setup a UD channel
	 */
	if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
		err = ENOMEM;
		DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
		goto start_fail;
	}
	state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;

	/*
	 * Allocate and initialize the tx buffer list
	 */
	if (ibd_init_txlist(state) != DDI_SUCCESS) {
		DPRINT(10, "ibd_start: ibd_init_txlist() failed");
		err = ENOMEM;
		goto start_fail;
	}
	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;

	/*
	 * Create the send cq handler here
	 */
	ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
	if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
		DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
		    "failed, ret=%d", ret);
		err = EINVAL;
		goto start_fail;
	}
	state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;

	/*
	 * Allocate and initialize the rx buffer list
	 */
	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
		DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
		err = ENOMEM;
		goto start_fail;
	}
	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;

	/*
	 * Join IPoIB broadcast group
	 */
	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
		DPRINT(10, "ibd_start: ibd_join_group() failed");
		err = ENOTACTIVE;
		goto start_fail;
	}
	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;

	/*
	 * When we did mac_register() in ibd_attach(), we didn't register
	 * the real macaddr and we didn't have the true port mtu. Now that
	 * we're almost ready, set the local mac address and broadcast
	 * addresses and update gldv3 about the real values of these
	 * parameters.
	 */
	if (state->id_enable_rc) {
		ibd_h2n_mac(&state->id_macaddr,
		    IBD_MAC_ADDR_RC + state->id_qpnum,
		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
		ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum,
		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
	} else {
		ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
	}
	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);

	/* The max SDU is only refreshed from the port MTU in non-RC mode */
	if (!state->id_enable_rc) {
		(void) mac_maxsdu_update(state->id_mh, state->id_mtu
		    - IPOIB_HDRSIZE);
	}
	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);

	/*
	 * Setup the receive cq handler
	 */
	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
	if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
		DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
		    "failed, ret=%d", ret);
		err = EINVAL;
		goto start_fail;
	}
	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;

reg_snet_notices:
	/*
	 * In case of normal initialization sequence,
	 * Setup the subnet notices handler after we've initialized the acache/
	 * mcache and started the async thread, both of which are required for
	 * the trap handler to function properly.
	 *
	 * Now that the async thread has been started (and we've already done
	 * a mac_register() during attach so mac_tx_update() can be called
	 * if necessary without any problem), we can enable the trap handler
	 * to queue requests to the async thread.
	 *
	 * In case of late hca initialization, the subnet notices handler will
	 * only handle MCG created/deleted event. The action performed as part
	 * of handling these events is to start the interface. So, the
	 * acache/mcache initialization is not a necessity in such cases for
	 * registering the subnet notices handler. Also, if we are in
	 * ibd_start() as a result of, say, some event handling after entering
	 * late hca initialization phase no need to register again.
	 */
	if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) {
		ibt_register_subnet_notices(state->id_ibt_hdl,
		    ibd_snet_notices_handler, state);
		mutex_enter(&state->id_trap_lock);
		state->id_trap_stop = B_FALSE;
		mutex_exit(&state->id_trap_lock);
		state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
	}

late_hca_init_return:
	if (late_hca_init == 1) {
		state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT;
		/*
		 * In case of late initialization, mark the link state as down,
		 * immaterial of the actual link state as reported in the
		 * port_info.
		 */
		state->id_link_state = LINK_STATE_DOWN;
		mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
		mac_link_update(state->id_mh, state->id_link_state);
		return (DDI_SUCCESS);
	}

	/*
	 * RC mode only: set up (or repost) the SRQ if enabled, allocate the
	 * large tx buffers, and finally start listening.
	 */
	if (state->id_enable_rc) {
		if (state->rc_enable_srq) {
			if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) {
				/* SRQ already allocated; just repost it */
				if (ibd_rc_repost_srq_free_list(state) !=
				    IBT_SUCCESS) {
					err = ENOMEM;
					goto start_fail;
				}
			} else {
				/* Allocate SRQ resource */
				if (ibd_rc_init_srq_list(state) !=
				    IBT_SUCCESS) {
					err = ENOMEM;
					goto start_fail;
				}
				state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD;
			}
		}

		if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) {
			DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() "
			    "failed");
			err = ENOMEM;
			goto start_fail;
		}
		state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD;

		/* RC: begin to listen only after everything is available */
		if (ibd_rc_listen(state) != IBT_SUCCESS) {
			DPRINT(10, "ibd_start: ibd_rc_listen() failed");
			err = EINVAL;
			goto start_fail;
		}
		state->id_mac_state |= IBD_DRV_RC_LISTEN;
	}

	/*
	 * Indicate link status to GLDv3 and higher layers. By default,
	 * we assume we are in up state (which must have been true at
	 * least at the time the broadcast mcg's were probed); if there
	 * were any up/down transitions till the time we come here, the
	 * async handler will have updated last known state, which we
	 * use to tell GLDv3. The async handler will not send any
	 * notifications to GLDv3 till we reach here in the initialization
	 * sequence.
	 */
	mac_link_update(state->id_mh, state->id_link_state);
	state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT;
	state->id_mac_state |= IBD_DRV_STARTED;

	return (DDI_SUCCESS);

start_fail:
	/*
	 * If we ran into a problem during ibd_start() and ran into
	 * some other problem during undoing our partial work, we can't
	 * do anything about it.  Ignore any errors we might get from
	 * ibd_undo_start() and just return the original error we got.
	 */
	(void) ibd_undo_start(state, LINK_STATE_DOWN);
	return (err);
}
6076 
6077 /*
6078  * GLDv3 entry point to stop hardware from receiving packets.
6079  */
6080 /*ARGSUSED*/
6081 static void
6082 ibd_m_stop(void *arg)
6083 {
6084 	ibd_state_t *state = (ibd_state_t *)arg;
6085 
6086 	if (state->id_type == IBD_PORT_DRIVER)
6087 		return;
6088 
6089 	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6090 
6091 	(void) ibd_undo_start(state, state->id_link_state);
6092 
6093 	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6094 }
6095 
6096 /*
6097  * GLDv3 entry point to modify device's mac address. We do not
6098  * allow address modifications.
6099  */
6100 static int
6101 ibd_m_unicst(void *arg, const uint8_t *macaddr)
6102 {
6103 	ibd_state_t *state = arg;
6104 
6105 	if (state->id_type == IBD_PORT_DRIVER)
6106 		return (EINVAL);
6107 
6108 	/*
6109 	 * Don't bother even comparing the macaddr if we haven't
6110 	 * completed ibd_m_start().
6111 	 */
6112 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6113 		return (0);
6114 
6115 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
6116 		return (0);
6117 	else
6118 		return (EINVAL);
6119 }
6120 
6121 /*
6122  * The blocking part of the IBA join/leave operations are done out
6123  * of here on the async thread.
6124  */
6125 static void
6126 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
6127 {
6128 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
6129 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
6130 
6131 	if (op == IBD_ASYNC_JOIN) {
6132 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
6133 			ibd_print_warn(state, "Join multicast group failed :"
6134 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6135 		}
6136 	} else {
6137 		/*
6138 		 * Here, we must search for the proper mcg_info and
6139 		 * use that to leave the group.
6140 		 */
6141 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
6142 	}
6143 }
6144 
6145 /*
6146  * GLDv3 entry point for multicast enable/disable requests.
6147  * This function queues the operation to the async thread and
6148  * return success for a valid multicast address.
6149  */
6150 static int
6151 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
6152 {
6153 	ibd_state_t *state = (ibd_state_t *)arg;
6154 	ipoib_mac_t maddr, *mcast;
6155 	ib_gid_t mgid;
6156 	ibd_req_t *req;
6157 
6158 	if (state->id_type == IBD_PORT_DRIVER)
6159 		return (EINVAL);
6160 
6161 	/*
6162 	 * If we haven't completed ibd_m_start(), async thread wouldn't
6163 	 * have been started and id_bcaddr wouldn't be set, so there's
6164 	 * no point in continuing.
6165 	 */
6166 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6167 		return (0);
6168 
6169 	/*
6170 	 * The incoming multicast address might not be aligned properly
6171 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
6172 	 * it to look like one though, to get the offsets of the mc gid,
6173 	 * since we know we are not going to dereference any values with
6174 	 * the ipoib_mac_t pointer.
6175 	 */
6176 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
6177 	mcast = &maddr;
6178 
6179 	/*
6180 	 * Check validity of MCG address. We could additionally check
6181 	 * that a enable/disable is not being issued on the "broadcast"
6182 	 * mcg, but since this operation is only invokable by privileged
6183 	 * programs anyway, we allow the flexibility to those dlpi apps.
6184 	 * Note that we do not validate the "scope" of the IBA mcg.
6185 	 */
6186 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
6187 		return (EINVAL);
6188 
6189 	/*
6190 	 * fill in multicast pkey and scope
6191 	 */
6192 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
6193 
6194 	/*
6195 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
6196 	 * nothing (i.e. we stay JOINed to the broadcast group done in
6197 	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
6198 	 * requires to be joined to broadcast groups at all times.
6199 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
6200 	 * depends on this.
6201 	 */
6202 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6203 		return (0);
6204 
6205 	ibd_n2h_gid(mcast, &mgid);
6206 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6207 	if (req == NULL)
6208 		return (ENOMEM);
6209 
6210 	req->rq_gid = mgid;
6211 
6212 	if (add) {
6213 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
6214 		    mgid.gid_prefix, mgid.gid_guid);
6215 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
6216 	} else {
6217 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
6218 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6219 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
6220 	}
6221 	return (0);
6222 }
6223 
6224 /*
6225  * The blocking part of the IBA promiscuous operations are done
6226  * out of here on the async thread. The dlpireq parameter indicates
6227  * whether this invocation is due to a dlpi request or due to
6228  * a port up/down event.
6229  */
6230 static void
6231 ibd_async_unsetprom(ibd_state_t *state)
6232 {
6233 	ibd_mce_t *mce = list_head(&state->id_mc_non);
6234 	ib_gid_t mgid;
6235 
6236 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
6237 
6238 	while (mce != NULL) {
6239 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
6240 		mce = list_next(&state->id_mc_non, mce);
6241 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
6242 	}
6243 	state->id_prom_op = IBD_OP_NOTSTARTED;
6244 }
6245 
6246 /*
6247  * The blocking part of the IBA promiscuous operations are done
6248  * out of here on the async thread. The dlpireq parameter indicates
6249  * whether this invocation is due to a dlpi request or due to
6250  * a port up/down event.
6251  */
6252 static void
6253 ibd_async_setprom(ibd_state_t *state)
6254 {
6255 	ibt_mcg_attr_t mcg_attr;
6256 	ibt_mcg_info_t *mcg_info;
6257 	ib_gid_t mgid;
6258 	uint_t numg;
6259 	int i;
6260 	char ret = IBD_OP_COMPLETED;
6261 
6262 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
6263 
6264 	/*
6265 	 * Obtain all active MC groups on the IB fabric with
6266 	 * specified criteria (scope + Pkey + Qkey + mtu).
6267 	 */
6268 	bzero(&mcg_attr, sizeof (mcg_attr));
6269 	mcg_attr.mc_pkey = state->id_pkey;
6270 	mcg_attr.mc_scope = state->id_scope;
6271 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
6272 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
6273 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
6274 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
6275 	    IBT_SUCCESS) {
6276 		ibd_print_warn(state, "Could not get list of IBA multicast "
6277 		    "groups");
6278 		ret = IBD_OP_ERRORED;
6279 		goto done;
6280 	}
6281 
6282 	/*
6283 	 * Iterate over the returned mcg's and join as NonMember
6284 	 * to the IP mcg's.
6285 	 */
6286 	for (i = 0; i < numg; i++) {
6287 		/*
6288 		 * Do a NonMember JOIN on the MC group.
6289 		 */
6290 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
6291 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
6292 			ibd_print_warn(state, "IBA promiscuous mode missed "
6293 			    "multicast gid %016llx:%016llx",
6294 			    (u_longlong_t)mgid.gid_prefix,
6295 			    (u_longlong_t)mgid.gid_guid);
6296 	}
6297 
6298 	ibt_free_mcg_info(mcg_info, numg);
6299 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
6300 done:
6301 	state->id_prom_op = ret;
6302 }
6303 
6304 /*
6305  * GLDv3 entry point for multicast promiscuous enable/disable requests.
6306  * GLDv3 assumes phys state receives more packets than multi state,
6307  * which is not true for IPoIB. Thus, treat the multi and phys
6308  * promiscuous states the same way to work with GLDv3's assumption.
6309  */
6310 static int
6311 ibd_m_promisc(void *arg, boolean_t on)
6312 {
6313 	ibd_state_t *state = (ibd_state_t *)arg;
6314 	ibd_req_t *req;
6315 
6316 	if (state->id_type == IBD_PORT_DRIVER)
6317 		return (EINVAL);
6318 
6319 	/*
6320 	 * Async thread wouldn't have been started if we haven't
6321 	 * passed ibd_m_start()
6322 	 */
6323 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6324 		return (0);
6325 
6326 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6327 	if (req == NULL)
6328 		return (ENOMEM);
6329 	if (on) {
6330 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
6331 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
6332 	} else {
6333 		DPRINT(1, "ibd_m_promisc : unset_promisc");
6334 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
6335 	}
6336 
6337 	return (0);
6338 }
6339 
6340 /*
6341  * GLDv3 entry point for gathering statistics.
6342  */
6343 static int
6344 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
6345 {
6346 	ibd_state_t *state = (ibd_state_t *)arg;
6347 
6348 	switch (stat) {
6349 	case MAC_STAT_IFSPEED:
6350 		*val = state->id_link_speed;
6351 		break;
6352 	case MAC_STAT_MULTIRCV:
6353 		*val = state->id_multi_rcv;
6354 		break;
6355 	case MAC_STAT_BRDCSTRCV:
6356 		*val = state->id_brd_rcv;
6357 		break;
6358 	case MAC_STAT_MULTIXMT:
6359 		*val = state->id_multi_xmt;
6360 		break;
6361 	case MAC_STAT_BRDCSTXMT:
6362 		*val = state->id_brd_xmt;
6363 		break;
6364 	case MAC_STAT_RBYTES:
6365 		*val = state->id_rcv_bytes + state->rc_rcv_trans_byte
6366 		    + state->rc_rcv_copy_byte;
6367 		break;
6368 	case MAC_STAT_IPACKETS:
6369 		*val = state->id_rcv_pkt + state->rc_rcv_trans_pkt
6370 		    + state->rc_rcv_copy_pkt;
6371 		break;
6372 	case MAC_STAT_OBYTES:
6373 		*val = state->id_xmt_bytes + state->rc_xmt_bytes;
6374 		break;
6375 	case MAC_STAT_OPACKETS:
6376 		*val = state->id_xmt_pkt + state->rc_xmt_small_pkt +
6377 		    state->rc_xmt_fragmented_pkt +
6378 		    state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt;
6379 		break;
6380 	case MAC_STAT_OERRORS:
6381 		*val = state->id_ah_error;	/* failed AH translation */
6382 		break;
6383 	case MAC_STAT_IERRORS:
6384 		*val = 0;
6385 		break;
6386 	case MAC_STAT_NOXMTBUF:
6387 		*val = state->id_tx_short + state->rc_swqe_short +
6388 		    state->rc_xmt_buf_short;
6389 		break;
6390 	case MAC_STAT_NORCVBUF:
6391 	default:
6392 		return (ENOTSUP);
6393 	}
6394 
6395 	return (0);
6396 }
6397 
/*
 * Tx reschedule request handler; simply delegates to
 * ibd_resume_transmission().
 * NOTE(review): the name suggests this runs on the async thread --
 * confirm against the async request dispatcher.
 */
static void
ibd_async_txsched(ibd_state_t *state)
{
	ibd_resume_transmission(state);
}
6403 
6404 static void
6405 ibd_resume_transmission(ibd_state_t *state)
6406 {
6407 	int flag;
6408 	int met_thresh = 0;
6409 	int thresh = 0;
6410 	int ret = -1;
6411 
6412 	mutex_enter(&state->id_sched_lock);
6413 	if (state->id_sched_needed & IBD_RSRC_SWQE) {
6414 		mutex_enter(&state->id_tx_list.dl_mutex);
6415 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
6416 		met_thresh = state->id_tx_list.dl_cnt +
6417 		    state->id_tx_rel_list.dl_cnt;
6418 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6419 		mutex_exit(&state->id_tx_list.dl_mutex);
6420 		thresh = IBD_FREE_SWQES_THRESH;
6421 		flag = IBD_RSRC_SWQE;
6422 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
6423 		ASSERT(state->id_lso != NULL);
6424 		mutex_enter(&state->id_lso_lock);
6425 		met_thresh = state->id_lso->bkt_nfree;
6426 		thresh = IBD_FREE_LSOS_THRESH;
6427 		mutex_exit(&state->id_lso_lock);
6428 		flag = IBD_RSRC_LSOBUF;
6429 		if (met_thresh > thresh)
6430 			state->id_sched_lso_cnt++;
6431 	}
6432 	if (met_thresh > thresh) {
6433 		state->id_sched_needed &= ~flag;
6434 		state->id_sched_cnt++;
6435 		ret = 0;
6436 	}
6437 	mutex_exit(&state->id_sched_lock);
6438 
6439 	if (ret == 0)
6440 		mac_tx_update(state->id_mh);
6441 }
6442 
6443 /*
6444  * Release the send wqe back into free list.
6445  */
6446 static void
6447 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
6448 {
6449 	/*
6450 	 * Add back on Tx list for reuse.
6451 	 */
6452 	ASSERT(tail->swqe_next == NULL);
6453 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
6454 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
6455 	tail->swqe_next = state->id_tx_rel_list.dl_head;
6456 	state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
6457 	state->id_tx_rel_list.dl_cnt += n;
6458 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
6459 }
6460 
6461 /*
6462  * Acquire a send wqe from free list.
6463  * Returns error number and send wqe pointer.
6464  */
6465 static ibd_swqe_t *
6466 ibd_acquire_swqe(ibd_state_t *state)
6467 {
6468 	ibd_swqe_t *wqe;
6469 
6470 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
6471 	if (state->id_tx_rel_list.dl_head != NULL) {
6472 		/* transfer id_tx_rel_list to id_tx_list */
6473 		state->id_tx_list.dl_head =
6474 		    state->id_tx_rel_list.dl_head;
6475 		state->id_tx_list.dl_cnt =
6476 		    state->id_tx_rel_list.dl_cnt;
6477 		state->id_tx_list.dl_pending_sends = B_FALSE;
6478 
6479 		/* clear id_tx_rel_list */
6480 		state->id_tx_rel_list.dl_head = NULL;
6481 		state->id_tx_rel_list.dl_cnt = 0;
6482 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6483 
6484 		wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
6485 		state->id_tx_list.dl_cnt -= 1;
6486 		state->id_tx_list.dl_head = wqe->swqe_next;
6487 	} else {	/* no free swqe */
6488 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6489 		state->id_tx_list.dl_pending_sends = B_TRUE;
6490 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
6491 		state->id_tx_short++;
6492 		wqe = NULL;
6493 	}
6494 	return (wqe);
6495 }
6496 
6497 static int
6498 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
6499     ibt_ud_dest_hdl_t ud_dest)
6500 {
6501 	mblk_t	*nmp;
6502 	int iph_len, tcph_len;
6503 	ibt_wr_lso_t *lso;
6504 	uintptr_t ip_start, tcp_start;
6505 	uint8_t *dst;
6506 	uint_t pending, mblen;
6507 
6508 	/*
6509 	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
6510 	 * we need to adjust it here for lso.
6511 	 */
6512 	lso = &(node->w_swr.wr.ud_lso);
6513 	lso->lso_ud_dest = ud_dest;
6514 	lso->lso_mss = mss;
6515 
6516 	/*
6517 	 * Calculate the LSO header size and set it in the UD LSO structure.
6518 	 * Note that the only assumption we make is that each of the IPoIB,
6519 	 * IP and TCP headers will be contained in a single mblk fragment;
6520 	 * together, the headers may span multiple mblk fragments.
6521 	 */
6522 	nmp = mp;
6523 	ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
6524 	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
6525 		ip_start = (uintptr_t)nmp->b_cont->b_rptr
6526 		    + (ip_start - (uintptr_t)(nmp->b_wptr));
6527 		nmp = nmp->b_cont;
6528 
6529 	}
6530 	iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
6531 
6532 	tcp_start = ip_start + iph_len;
6533 	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
6534 		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
6535 		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
6536 		nmp = nmp->b_cont;
6537 	}
6538 	tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
6539 	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
6540 
6541 	/*
6542 	 * If the lso header fits entirely within a single mblk fragment,
6543 	 * we'll avoid an additional copy of the lso header here and just
6544 	 * pass the b_rptr of the mblk directly.
6545 	 *
6546 	 * If this isn't true, we'd have to allocate for it explicitly.
6547 	 */
6548 	if (lso->lso_hdr_sz <= MBLKL(mp)) {
6549 		lso->lso_hdr = mp->b_rptr;
6550 	} else {
6551 		/* On work completion, remember to free this allocated hdr */
6552 		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
6553 		if (lso->lso_hdr == NULL) {
6554 			DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
6555 			    "sz = %d", lso->lso_hdr_sz);
6556 			lso->lso_hdr_sz = 0;
6557 			lso->lso_mss = 0;
6558 			return (-1);
6559 		}
6560 	}
6561 
6562 	/*
6563 	 * Copy in the lso header only if we need to
6564 	 */
6565 	if (lso->lso_hdr != mp->b_rptr) {
6566 		dst = lso->lso_hdr;
6567 		pending = lso->lso_hdr_sz;
6568 
6569 		for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
6570 			mblen = MBLKL(nmp);
6571 			if (pending > mblen) {
6572 				bcopy(nmp->b_rptr, dst, mblen);
6573 				dst += mblen;
6574 				pending -= mblen;
6575 			} else {
6576 				bcopy(nmp->b_rptr, dst, pending);
6577 				break;
6578 			}
6579 		}
6580 	}
6581 
6582 	return (0);
6583 }
6584 
6585 static void
6586 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
6587 {
6588 	ibt_wr_lso_t *lso;
6589 
6590 	if ((!node) || (!mp))
6591 		return;
6592 
6593 	/*
6594 	 * Free any header space that we might've allocated if we
6595 	 * did an LSO
6596 	 */
6597 	if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
6598 		lso = &(node->w_swr.wr.ud_lso);
6599 		if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
6600 			kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
6601 			lso->lso_hdr = NULL;
6602 			lso->lso_hdr_sz = 0;
6603 		}
6604 	}
6605 }
6606 
/*
 * Post the send work request in 'node' on the UD channel, then keep
 * draining the pending chain at state->id_tx_head, posting up to
 * IBD_MAX_TX_POST_MULTIPLE requests per ibt_post_send() call.
 *
 * The routine returns only once id_tx_head is found empty under
 * id_txpost_lock, at which point id_tx_busy is cleared.
 * NOTE(review): presumably the caller set id_tx_busy before calling and
 * other senders queue onto id_tx_head while it is set -- confirm against
 * the send path.
 *
 * Any request that fails to post is handed to ibd_tx_cleanup(), since
 * no completion will arrive for it.
 */
static void
ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
{
	uint_t		i;
	uint_t		num_posted;
	uint_t		n_wrs;
	ibt_status_t	ibt_status;
	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
	ibd_swqe_t	*tx_head, *elem;
	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];

	/* post the one request, then check for more */
	ibt_status = ibt_post_send(state->id_chnl_hdl,
	    &node->w_swr, 1, NULL);
	if (ibt_status != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_post_send: "
		    "posting one wr failed: ret=%d", ibt_status);
		ibd_tx_cleanup(state, node);
	}

	tx_head = NULL;
	for (;;) {
		/*
		 * Our private chain is exhausted: detach whatever has been
		 * queued at id_tx_head in the meantime, or stop if there
		 * is nothing left.
		 */
		if (tx_head == NULL) {
			mutex_enter(&state->id_txpost_lock);
			tx_head = state->id_tx_head;
			if (tx_head == NULL) {
				state->id_tx_busy = 0;
				mutex_exit(&state->id_txpost_lock);
				return;
			}
			state->id_tx_head = NULL;
			mutex_exit(&state->id_txpost_lock);
		}

		/*
		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
		 * at a time if possible, and keep posting them.
		 */
		for (n_wrs = 0, elem = tx_head;
		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
			nodes[n_wrs] = elem;
			wrs[n_wrs] = elem->w_swr;
		}
		/* Whatever did not fit in this batch is handled next pass */
		tx_head = elem;

		ASSERT(n_wrs != 0);

		/*
		 * If posting fails for some reason, we'll never receive
		 * completion intimation, so we'll need to cleanup. But
		 * we need to make sure we don't clean up nodes whose
		 * wrs have been successfully posted. We assume that the
		 * hca driver returns on the first failure to post and
		 * therefore the first 'num_posted' entries don't need
		 * cleanup here.
		 */
		num_posted = 0;
		ibt_status = ibt_post_send(state->id_chnl_hdl,
		    wrs, n_wrs, &num_posted);
		if (ibt_status != IBT_SUCCESS) {
			ibd_print_warn(state, "ibd_post_send: "
			    "posting multiple wrs failed: "
			    "requested=%d, done=%d, ret=%d",
			    n_wrs, num_posted, ibt_status);

			for (i = num_posted; i < n_wrs; i++)
				ibd_tx_cleanup(state, nodes[i]);
		}
	}
}
6678 
6679 static int
6680 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
6681     uint_t lsohdr_sz)
6682 {
6683 	ibt_wr_ds_t *sgl;
6684 	ibt_status_t ibt_status;
6685 	mblk_t *nmp;
6686 	mblk_t *data_mp;
6687 	uchar_t *bufp;
6688 	size_t blksize;
6689 	size_t skip;
6690 	size_t avail;
6691 	uint_t pktsize;
6692 	uint_t frag_len;
6693 	uint_t pending_hdr;
6694 	int nmblks;
6695 	int i;
6696 
6697 	/*
6698 	 * Let's skip ahead to the data if this is LSO
6699 	 */
6700 	data_mp = mp;
6701 	pending_hdr = 0;
6702 	if (lsohdr_sz) {
6703 		pending_hdr = lsohdr_sz;
6704 		for (nmp = mp; nmp; nmp = nmp->b_cont) {
6705 			frag_len = nmp->b_wptr - nmp->b_rptr;
6706 			if (frag_len > pending_hdr)
6707 				break;
6708 			pending_hdr -= frag_len;
6709 		}
6710 		data_mp = nmp;	/* start of data past lso header */
6711 		ASSERT(data_mp != NULL);
6712 	}
6713 
6714 	/*
6715 	 * Calculate the size of message data and number of msg blocks
6716 	 */
6717 	pktsize = 0;
6718 	for (nmblks = 0, nmp = data_mp; nmp != NULL;
6719 	    nmp = nmp->b_cont, nmblks++) {
6720 		pktsize += MBLKL(nmp);
6721 	}
6722 	pktsize -= pending_hdr;
6723 
6724 	/*
6725 	 * We only do ibt_map_mem_iov() if the pktsize is above the
6726 	 * "copy-threshold", and if the number of mp fragments is less than
6727 	 * the maximum acceptable.
6728 	 */
6729 	if ((state->id_hca_res_lkey_capab) &&
6730 	    (pktsize > state->id_ud_tx_copy_thresh) &&
6731 	    (nmblks < state->id_max_sqseg_hiwm)) {
6732 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
6733 		ibt_iov_attr_t iov_attr;
6734 
6735 		iov_attr.iov_as = NULL;
6736 		iov_attr.iov = iov_arr;
6737 		iov_attr.iov_buf = NULL;
6738 		iov_attr.iov_list_len = nmblks;
6739 		iov_attr.iov_wr_nds = state->id_max_sqseg;
6740 		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
6741 		iov_attr.iov_flags = IBT_IOV_SLEEP;
6742 
6743 		for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
6744 			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
6745 			iov_arr[i].iov_len = MBLKL(nmp);
6746 			if (i == 0) {
6747 				iov_arr[i].iov_addr += pending_hdr;
6748 				iov_arr[i].iov_len -= pending_hdr;
6749 			}
6750 		}
6751 
6752 		node->w_buftype = IBD_WQE_MAPPED;
6753 		node->w_swr.wr_sgl = node->w_sgl;
6754 
6755 		ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
6756 		    (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
6757 		if (ibt_status != IBT_SUCCESS) {
6758 			ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
6759 			    "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
6760 			goto ibd_copy_path;
6761 		}
6762 
6763 		return (0);
6764 	}
6765 
6766 ibd_copy_path:
6767 	if (pktsize <= state->id_tx_buf_sz) {
6768 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
6769 		node->w_swr.wr_nds = 1;
6770 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
6771 		node->w_buftype = IBD_WQE_TXBUF;
6772 
6773 		/*
6774 		 * Even though this is the copy path for transfers less than
6775 		 * id_tx_buf_sz, it could still be an LSO packet.  If so, it
6776 		 * is possible the first data mblk fragment (data_mp) still
6777 		 * contains part of the LSO header that we need to skip.
6778 		 */
6779 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
6780 		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
6781 			blksize = MBLKL(nmp) - pending_hdr;
6782 			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
6783 			bufp += blksize;
6784 			pending_hdr = 0;
6785 		}
6786 
6787 		return (0);
6788 	}
6789 
6790 	/*
6791 	 * Copy path for transfers greater than id_tx_buf_sz
6792 	 */
6793 	node->w_swr.wr_sgl = node->w_sgl;
6794 	if (ibd_acquire_lsobufs(state, pktsize,
6795 	    node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
6796 		DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
6797 		return (-1);
6798 	}
6799 	node->w_buftype = IBD_WQE_LSOBUF;
6800 
6801 	/*
6802 	 * Copy the larger-than-id_tx_buf_sz packet into a set of
6803 	 * fixed-sized, pre-mapped LSO buffers. Note that we might
6804 	 * need to skip part of the LSO header in the first fragment
6805 	 * as before.
6806 	 */
6807 	nmp = data_mp;
6808 	skip = pending_hdr;
6809 	for (i = 0; i < node->w_swr.wr_nds; i++) {
6810 		sgl = node->w_swr.wr_sgl + i;
6811 		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
6812 		avail = IBD_LSO_BUFSZ;
6813 		while (nmp && avail) {
6814 			blksize = MBLKL(nmp) - skip;
6815 			if (blksize > avail) {
6816 				bcopy(nmp->b_rptr + skip, bufp, avail);
6817 				skip += avail;
6818 				avail = 0;
6819 			} else {
6820 				bcopy(nmp->b_rptr + skip, bufp, blksize);
6821 				skip = 0;
6822 				avail -= blksize;
6823 				bufp += blksize;
6824 				nmp = nmp->b_cont;
6825 			}
6826 		}
6827 	}
6828 
6829 	return (0);
6830 }
6831 
6832 /*
6833  * Schedule a completion queue polling to reap the resource we're
6834  * short on.  If we implement the change to reap tx completions
6835  * in a separate thread, we'll need to wake up that thread here.
6836  */
6837 static int
6838 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
6839 {
6840 	ibd_req_t *req;
6841 
6842 	mutex_enter(&state->id_sched_lock);
6843 	state->id_sched_needed |= resource_type;
6844 	mutex_exit(&state->id_sched_lock);
6845 
6846 	/*
6847 	 * If we are asked to queue a work entry, we need to do it
6848 	 */
6849 	if (q_flag) {
6850 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6851 		if (req == NULL)
6852 			return (-1);
6853 
6854 		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
6855 	}
6856 
6857 	return (0);
6858 }
6859 
6860 /*
6861  * The passed in packet has this format:
6862  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
6863  */
6864 static boolean_t
6865 ibd_send(ibd_state_t *state, mblk_t *mp)
6866 {
6867 	ibd_ace_t *ace;
6868 	ibd_swqe_t *node;
6869 	ipoib_mac_t *dest;
6870 	ib_header_info_t *ipibp;
6871 	ip6_t *ip6h;
6872 	uint_t pktsize;
6873 	uint32_t mss;
6874 	uint32_t hckflags;
6875 	uint32_t lsoflags = 0;
6876 	uint_t lsohdr_sz = 0;
6877 	int ret, len;
6878 	boolean_t dofree = B_FALSE;
6879 	boolean_t rc;
6880 	/* if (rc_chan == NULL) send by UD; else send by RC; */
6881 	ibd_rc_chan_t *rc_chan;
6882 	int nmblks;
6883 	mblk_t *nmp;
6884 
6885 	/*
6886 	 * If we aren't done with the device initialization and start,
6887 	 * we shouldn't be here.
6888 	 */
6889 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6890 		return (B_FALSE);
6891 
6892 	/*
6893 	 * Obtain an address handle for the destination.
6894 	 */
6895 	ipibp = (ib_header_info_t *)mp->b_rptr;
6896 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
6897 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6898 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
6899 
6900 	rc_chan = NULL;
6901 	ace = ibd_acache_lookup(state, dest, &ret, 1);
6902 	if (state->id_enable_rc && (ace != NULL) &&
6903 	    (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) {
6904 		if (ace->ac_chan == NULL) {
6905 			state->rc_null_conn++;
6906 		} else {
6907 			if (ace->ac_chan->chan_state ==
6908 			    IBD_RC_STATE_ACT_ESTAB) {
6909 				rc_chan = ace->ac_chan;
6910 				mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
6911 				node = WQE_TO_SWQE(
6912 				    rc_chan->tx_wqe_list.dl_head);
6913 				if (node != NULL) {
6914 					rc_chan->tx_wqe_list.dl_cnt -= 1;
6915 					rc_chan->tx_wqe_list.dl_head =
6916 					    node->swqe_next;
6917 				} else {
6918 					node = ibd_rc_acquire_swqes(rc_chan);
6919 				}
6920 				mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
6921 
6922 				if (node == NULL) {
6923 					state->rc_swqe_short++;
6924 					mutex_enter(&state->id_sched_lock);
6925 					state->id_sched_needed |=
6926 					    IBD_RSRC_RC_SWQE;
6927 					mutex_exit(&state->id_sched_lock);
6928 					ibd_dec_ref_ace(state, ace);
6929 					return (B_FALSE);
6930 				}
6931 			} else {
6932 				state->rc_no_estab_conn++;
6933 			}
6934 		}
6935 	}
6936 
6937 	if (rc_chan == NULL) {
6938 		mutex_enter(&state->id_tx_list.dl_mutex);
6939 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
6940 		if (node != NULL) {
6941 			state->id_tx_list.dl_cnt -= 1;
6942 			state->id_tx_list.dl_head = node->swqe_next;
6943 		} else {
6944 			node = ibd_acquire_swqe(state);
6945 		}
6946 		mutex_exit(&state->id_tx_list.dl_mutex);
6947 		if (node == NULL) {
6948 			/*
6949 			 * If we don't have an swqe available, schedule a
6950 			 * transmit completion queue cleanup and hold off on
6951 			 * sending more packets until we have some free swqes
6952 			 */
6953 			if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) {
6954 				if (ace != NULL) {
6955 					ibd_dec_ref_ace(state, ace);
6956 				}
6957 				return (B_FALSE);
6958 			}
6959 
6960 			/*
6961 			 * If a poll cannot be scheduled, we have no choice but
6962 			 * to drop this packet
6963 			 */
6964 			ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
6965 			if (ace != NULL) {
6966 				ibd_dec_ref_ace(state, ace);
6967 			}
6968 			return (B_TRUE);
6969 		}
6970 	}
6971 
6972 	/*
6973 	 * Initialize the commonly used fields in swqe to NULL to protect
6974 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
6975 	 * failure.
6976 	 */
6977 	node->swqe_im_mblk = NULL;
6978 	node->w_swr.wr_nds = 0;
6979 	node->w_swr.wr_sgl = NULL;
6980 	node->w_swr.wr_opcode = IBT_WRC_SEND;
6981 
6982 	/*
6983 	 * Calculate the size of message data and number of msg blocks
6984 	 */
6985 	pktsize = 0;
6986 	for (nmblks = 0, nmp = mp; nmp != NULL;
6987 	    nmp = nmp->b_cont, nmblks++) {
6988 		pktsize += MBLKL(nmp);
6989 	}
6990 
6991 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6992 		atomic_inc_64(&state->id_brd_xmt);
6993 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6994 		atomic_inc_64(&state->id_multi_xmt);
6995 
6996 	if (ace != NULL) {
6997 		node->w_ahandle = ace;
6998 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
6999 	} else {
7000 		DPRINT(5,
7001 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
7002 		    ((ret == EFAULT) ? "failed" : "queued"),
7003 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
7004 		    htonl(dest->ipoib_gidpref[1]),
7005 		    htonl(dest->ipoib_gidsuff[0]),
7006 		    htonl(dest->ipoib_gidsuff[1]));
7007 		state->rc_ace_not_found++;
7008 		node->w_ahandle = NULL;
7009 
7010 		/*
7011 		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
7012 		 * can not find a path for the specific dest address. We
7013 		 * should get rid of this kind of packet.  We also should get
7014 		 * rid of the packet if we cannot schedule a poll via the
7015 		 * async thread.  For the normal case, ibd will return the
7016 		 * packet to upper layer and wait for AH creating.
7017 		 *
7018 		 * Note that we always queue a work slot entry for the async
7019 		 * thread when we fail AH lookup (even in intr mode); this is
7020 		 * due to the convoluted way the code currently looks for AH.
7021 		 */
7022 		if (ret == EFAULT) {
7023 			dofree = B_TRUE;
7024 			rc = B_TRUE;
7025 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
7026 			dofree = B_TRUE;
7027 			rc = B_TRUE;
7028 		} else {
7029 			dofree = B_FALSE;
7030 			rc = B_FALSE;
7031 		}
7032 		goto ibd_send_fail;
7033 	}
7034 
7035 	/*
7036 	 * For ND6 packets, padding is at the front of the source lladdr.
7037 	 * Insert the padding at front.
7038 	 */
7039 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
7040 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
7041 			if (!pullupmsg(mp, IPV6_HDR_LEN +
7042 			    sizeof (ib_header_info_t))) {
7043 				DPRINT(10, "ibd_send: pullupmsg failure ");
7044 				dofree = B_TRUE;
7045 				rc = B_TRUE;
7046 				goto ibd_send_fail;
7047 			}
7048 			ipibp = (ib_header_info_t *)mp->b_rptr;
7049 		}
7050 		ip6h = (ip6_t *)((uchar_t *)ipibp +
7051 		    sizeof (ib_header_info_t));
7052 		len = ntohs(ip6h->ip6_plen);
7053 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
7054 			mblk_t	*pad;
7055 
7056 			pad = allocb(4, 0);
7057 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
7058 			linkb(mp, pad);
7059 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
7060 			    IPV6_HDR_LEN + len + 4) {
7061 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
7062 				    IPV6_HDR_LEN + len + 4)) {
7063 					DPRINT(10, "ibd_send: pullupmsg "
7064 					    "failure ");
7065 					dofree = B_TRUE;
7066 					rc = B_TRUE;
7067 					goto ibd_send_fail;
7068 				}
7069 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
7070 				    sizeof (ib_header_info_t));
7071 			}
7072 
7073 			/* LINTED: E_CONSTANT_CONDITION */
7074 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
7075 		}
7076 	}
7077 
7078 	ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
7079 	mp->b_rptr += sizeof (ib_addrs_t);
7080 	pktsize -= sizeof (ib_addrs_t);
7081 
7082 	if (rc_chan) {	/* send in RC mode */
7083 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
7084 		ibt_iov_attr_t iov_attr;
7085 		uint_t		i;
7086 		size_t	blksize;
7087 		uchar_t *bufp;
7088 		ibd_rc_tx_largebuf_t *lbufp;
7089 
7090 		atomic_add_64(&state->rc_xmt_bytes, pktsize);
7091 
7092 		/*
7093 		 * Upper layer does Tx checksum, we don't need do any
7094 		 * checksum here.
7095 		 */
7096 		ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
7097 
7098 		/*
7099 		 * We only do ibt_map_mem_iov() if the pktsize is above
7100 		 * the "copy-threshold", and if the number of mp
7101 		 * fragments is less than the maximum acceptable.
7102 		 */
7103 		if (pktsize <= state->id_rc_tx_copy_thresh) {
7104 			atomic_inc_64(&state->rc_xmt_small_pkt);
7105 			/*
7106 			 * Only process unicast packet in Reliable Connected
7107 			 * mode.
7108 			 */
7109 			node->swqe_copybuf.ic_sgl.ds_len = pktsize;
7110 			node->w_swr.wr_nds = 1;
7111 			node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
7112 			node->w_buftype = IBD_WQE_TXBUF;
7113 
7114 			bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
7115 			for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7116 				blksize = MBLKL(nmp);
7117 				bcopy(nmp->b_rptr, bufp, blksize);
7118 				bufp += blksize;
7119 			}
7120 			freemsg(mp);
7121 			ASSERT(node->swqe_im_mblk == NULL);
7122 		} else {
7123 			if ((state->rc_enable_iov_map) &&
7124 			    (nmblks < state->rc_max_sqseg_hiwm)) {
7125 
7126 				/* do ibt_map_mem_iov() */
7127 				iov_attr.iov_as = NULL;
7128 				iov_attr.iov = iov_arr;
7129 				iov_attr.iov_buf = NULL;
7130 				iov_attr.iov_wr_nds = state->rc_tx_max_sqseg;
7131 				iov_attr.iov_lso_hdr_sz = 0;
7132 				iov_attr.iov_flags = IBT_IOV_SLEEP;
7133 
7134 				i = 0;
7135 				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7136 					iov_arr[i].iov_len = MBLKL(nmp);
7137 					if (iov_arr[i].iov_len != 0) {
7138 						iov_arr[i].iov_addr = (caddr_t)
7139 						    (void *)nmp->b_rptr;
7140 						i++;
7141 					}
7142 				}
7143 				iov_attr.iov_list_len = i;
7144 				node->w_swr.wr_sgl = node->w_sgl;
7145 
7146 				ret = ibt_map_mem_iov(state->id_hca_hdl,
7147 				    &iov_attr, (ibt_all_wr_t *)&node->w_swr,
7148 				    &node->w_mi_hdl);
7149 				if (ret != IBT_SUCCESS) {
7150 					atomic_inc_64(
7151 					    &state->rc_xmt_map_fail_pkt);
7152 					DPRINT(30, "ibd_send: ibt_map_mem_iov("
7153 					    ") failed, nmblks=%d, real_nmblks"
7154 					    "=%d, ret=0x%x", nmblks, i, ret);
7155 					goto ibd_rc_large_copy;
7156 				}
7157 
7158 				atomic_inc_64(&state->rc_xmt_map_succ_pkt);
7159 				node->w_buftype = IBD_WQE_MAPPED;
7160 				node->swqe_im_mblk = mp;
7161 			} else {
7162 				atomic_inc_64(&state->rc_xmt_fragmented_pkt);
7163 ibd_rc_large_copy:
7164 				mutex_enter(&state->rc_tx_large_bufs_lock);
7165 				if (state->rc_tx_largebuf_nfree == 0) {
7166 					state->rc_xmt_buf_short++;
7167 					mutex_exit
7168 					    (&state->rc_tx_large_bufs_lock);
7169 					mutex_enter(&state->id_sched_lock);
7170 					state->id_sched_needed |=
7171 					    IBD_RSRC_RC_TX_LARGEBUF;
7172 					mutex_exit(&state->id_sched_lock);
7173 					dofree = B_FALSE;
7174 					rc = B_FALSE;
7175 					/*
7176 					 * If we don't have Tx large bufs,
7177 					 * return failure. node->w_buftype
7178 					 * should not be IBD_WQE_RC_COPYBUF,
7179 					 * otherwise it will cause problem
7180 					 * in ibd_rc_tx_cleanup()
7181 					 */
7182 					node->w_buftype = IBD_WQE_TXBUF;
7183 					goto ibd_send_fail;
7184 				}
7185 
7186 				lbufp = state->rc_tx_largebuf_free_head;
7187 				ASSERT(lbufp->lb_buf != NULL);
7188 				state->rc_tx_largebuf_free_head =
7189 				    lbufp->lb_next;
7190 				lbufp->lb_next = NULL;
7191 				/* Update nfree count */
7192 				state->rc_tx_largebuf_nfree --;
7193 				mutex_exit(&state->rc_tx_large_bufs_lock);
7194 				bufp = lbufp->lb_buf;
7195 				node->w_sgl[0].ds_va =
7196 				    (ib_vaddr_t)(uintptr_t)bufp;
7197 				node->w_sgl[0].ds_key =
7198 				    state->rc_tx_mr_desc.md_lkey;
7199 				node->w_sgl[0].ds_len = pktsize;
7200 				node->w_swr.wr_sgl = node->w_sgl;
7201 				node->w_swr.wr_nds = 1;
7202 				node->w_buftype = IBD_WQE_RC_COPYBUF;
7203 				node->w_rc_tx_largebuf = lbufp;
7204 
7205 				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7206 					blksize = MBLKL(nmp);
7207 					if (blksize != 0) {
7208 						bcopy(nmp->b_rptr, bufp,
7209 						    blksize);
7210 						bufp += blksize;
7211 					}
7212 				}
7213 				freemsg(mp);
7214 				ASSERT(node->swqe_im_mblk == NULL);
7215 			}
7216 		}
7217 
7218 		node->swqe_next = NULL;
7219 		mutex_enter(&rc_chan->tx_post_lock);
7220 		if (rc_chan->tx_busy) {
7221 			if (rc_chan->tx_head) {
7222 				rc_chan->tx_tail->swqe_next =
7223 				    SWQE_TO_WQE(node);
7224 			} else {
7225 				rc_chan->tx_head = node;
7226 			}
7227 			rc_chan->tx_tail = node;
7228 			mutex_exit(&rc_chan->tx_post_lock);
7229 		} else {
7230 			rc_chan->tx_busy = 1;
7231 			mutex_exit(&rc_chan->tx_post_lock);
7232 			ibd_rc_post_send(rc_chan, node);
7233 		}
7234 
7235 		return (B_TRUE);
7236 	} /* send by RC */
7237 
7238 	if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
7239 		/*
7240 		 * Too long pktsize. The packet size from GLD should <=
7241 		 * state->id_mtu + sizeof (ib_addrs_t)
7242 		 */
7243 		if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
7244 			ibd_req_t *req;
7245 
7246 			mutex_enter(&ace->tx_too_big_mutex);
7247 			if (ace->tx_too_big_ongoing) {
7248 				mutex_exit(&ace->tx_too_big_mutex);
7249 				state->rc_xmt_reenter_too_long_pkt++;
7250 				dofree = B_TRUE;
7251 			} else {
7252 				ace->tx_too_big_ongoing = B_TRUE;
7253 				mutex_exit(&ace->tx_too_big_mutex);
7254 				state->rc_xmt_icmp_too_long_pkt++;
7255 
7256 				req = kmem_cache_alloc(state->id_req_kmc,
7257 				    KM_NOSLEEP);
7258 				if (req == NULL) {
7259 					ibd_print_warn(state, "ibd_send: alloc "
7260 					    "ibd_req_t fail");
7261 					/* Drop it. */
7262 					dofree = B_TRUE;
7263 				} else {
7264 					req->rq_ptr = mp;
7265 					req->rq_ptr2 = ace;
7266 					ibd_queue_work_slot(state, req,
7267 					    IBD_ASYNC_RC_TOO_BIG);
7268 					dofree = B_FALSE;
7269 				}
7270 			}
7271 		} else {
7272 			ibd_print_warn(state, "Reliable Connected mode is on. "
7273 			    "Multicast packet length %d > %d is too long to "
7274 			    "send packet (%d > %d), drop it",
7275 			    pktsize, state->id_mtu);
7276 			state->rc_xmt_drop_too_long_pkt++;
7277 			/* Drop it. */
7278 			dofree = B_TRUE;
7279 		}
7280 		rc = B_TRUE;
7281 		goto ibd_send_fail;
7282 	}
7283 
7284 	atomic_add_64(&state->id_xmt_bytes, pktsize);
7285 	atomic_inc_64(&state->id_xmt_pkt);
7286 
7287 	/*
7288 	 * Do LSO and checksum related work here.  For LSO send, adjust the
7289 	 * ud destination, the opcode and the LSO header information to the
7290 	 * work request.
7291 	 */
7292 	mac_lso_get(mp, &mss, &lsoflags);
7293 	if ((lsoflags & HW_LSO) != HW_LSO) {
7294 		node->w_swr.wr_opcode = IBT_WRC_SEND;
7295 		lsohdr_sz = 0;
7296 	} else {
7297 		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
7298 			/*
7299 			 * The routine can only fail if there's no memory; we
7300 			 * can only drop the packet if this happens
7301 			 */
7302 			ibd_print_warn(state,
7303 			    "ibd_send: no memory, lso posting failed");
7304 			dofree = B_TRUE;
7305 			rc = B_TRUE;
7306 			goto ibd_send_fail;
7307 		}
7308 
7309 		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
7310 		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
7311 	}
7312 
7313 	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
7314 	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
7315 		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
7316 	else
7317 		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
7318 
7319 	/*
7320 	 * Prepare the sgl for posting; the routine can only fail if there's
7321 	 * no lso buf available for posting. If this is the case, we should
7322 	 * probably resched for lso bufs to become available and then try again.
7323 	 */
7324 	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
7325 		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
7326 			dofree = B_TRUE;
7327 			rc = B_TRUE;
7328 		} else {
7329 			dofree = B_FALSE;
7330 			rc = B_FALSE;
7331 		}
7332 		goto ibd_send_fail;
7333 	}
7334 	node->swqe_im_mblk = mp;
7335 
7336 	/*
7337 	 * Queue the wqe to hardware; since we can now simply queue a
7338 	 * post instead of doing it serially, we cannot assume anything
7339 	 * about the 'node' after ibd_post_send() returns.
7340 	 */
7341 	node->swqe_next = NULL;
7342 
7343 	mutex_enter(&state->id_txpost_lock);
7344 	if (state->id_tx_busy) {
7345 		if (state->id_tx_head) {
7346 			state->id_tx_tail->swqe_next =
7347 			    SWQE_TO_WQE(node);
7348 		} else {
7349 			state->id_tx_head = node;
7350 		}
7351 		state->id_tx_tail = node;
7352 		mutex_exit(&state->id_txpost_lock);
7353 	} else {
7354 		state->id_tx_busy = 1;
7355 		mutex_exit(&state->id_txpost_lock);
7356 		ibd_post_send(state, node);
7357 	}
7358 
7359 	return (B_TRUE);
7360 
7361 ibd_send_fail:
7362 	if (node && mp)
7363 		ibd_free_lsohdr(node, mp);
7364 
7365 	if (dofree)
7366 		freemsg(mp);
7367 
7368 	if (node != NULL) {
7369 		if (rc_chan) {
7370 			ibd_rc_tx_cleanup(node);
7371 		} else {
7372 			ibd_tx_cleanup(state, node);
7373 		}
7374 	}
7375 
7376 	return (rc);
7377 }
7378 
7379 /*
7380  * GLDv3 entry point for transmitting datagram.
7381  */
7382 static mblk_t *
7383 ibd_m_tx(void *arg, mblk_t *mp)
7384 {
7385 	ibd_state_t *state = (ibd_state_t *)arg;
7386 	mblk_t *next;
7387 
7388 	if (state->id_type == IBD_PORT_DRIVER) {
7389 		freemsgchain(mp);
7390 		return (NULL);
7391 	}
7392 
7393 	if ((state->id_link_state != LINK_STATE_UP) ||
7394 	    !(state->id_mac_state & IBD_DRV_STARTED)) {
7395 		freemsgchain(mp);
7396 		mp = NULL;
7397 	}
7398 
7399 	while (mp != NULL) {
7400 		next = mp->b_next;
7401 		mp->b_next = NULL;
7402 		if (ibd_send(state, mp) == B_FALSE) {
7403 			/* Send fail */
7404 			mp->b_next = next;
7405 			break;
7406 		}
7407 		mp = next;
7408 	}
7409 
7410 	return (mp);
7411 }
7412 
7413 /*
7414  * this handles Tx and Rx completions. With separate CQs, this handles
7415  * only Rx completions.
7416  */
7417 static uint_t
7418 ibd_intr(caddr_t arg)
7419 {
7420 	ibd_state_t *state = (ibd_state_t *)arg;
7421 
7422 	ibd_poll_rcq(state, state->id_rcq_hdl);
7423 
7424 	return (DDI_INTR_CLAIMED);
7425 }
7426 
7427 /*
7428  * Poll and fully drain the send cq
7429  */
7430 static void
7431 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7432 {
7433 	ibt_wc_t *wcs = state->id_txwcs;
7434 	uint_t numwcs = state->id_txwcs_size;
7435 	ibd_wqe_t *wqe;
7436 	ibd_swqe_t *head, *tail;
7437 	ibt_wc_t *wc;
7438 	uint_t num_polled;
7439 	int i;
7440 
7441 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7442 		head = tail = NULL;
7443 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7444 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
7445 			if (wc->wc_status != IBT_WC_SUCCESS) {
7446 				/*
7447 				 * Channel being torn down.
7448 				 */
7449 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7450 					DPRINT(5, "ibd_drain_scq: flush error");
7451 					DPRINT(10, "ibd_drain_scq: Bad "
7452 					    "status %d", wc->wc_status);
7453 				} else {
7454 					DPRINT(10, "ibd_drain_scq: "
7455 					    "unexpected wc_status %d",
7456 					    wc->wc_status);
7457 				}
7458 				/*
7459 				 * Fallthrough to invoke the Tx handler to
7460 				 * release held resources, e.g., AH refcount.
7461 				 */
7462 			}
7463 			/*
7464 			 * Add this swqe to the list to be cleaned up.
7465 			 */
7466 			if (head)
7467 				tail->swqe_next = wqe;
7468 			else
7469 				head = WQE_TO_SWQE(wqe);
7470 			tail = WQE_TO_SWQE(wqe);
7471 		}
7472 		tail->swqe_next = NULL;
7473 		ibd_tx_cleanup_list(state, head, tail);
7474 
7475 		/*
7476 		 * Resume any blocked transmissions if possible
7477 		 */
7478 		ibd_resume_transmission(state);
7479 	}
7480 }
7481 
7482 /*
7483  * Poll and fully drain the receive cq
7484  */
7485 static void
7486 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7487 {
7488 	ibt_wc_t *wcs = state->id_rxwcs;
7489 	uint_t numwcs = state->id_rxwcs_size;
7490 	ibd_rwqe_t *rwqe;
7491 	ibt_wc_t *wc;
7492 	uint_t num_polled;
7493 	int i;
7494 	mblk_t *head, *tail, *mp;
7495 
7496 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7497 		head = tail = NULL;
7498 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7499 			rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
7500 			if (wc->wc_status != IBT_WC_SUCCESS) {
7501 				/*
7502 				 * Channel being torn down.
7503 				 */
7504 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7505 					DPRINT(5, "ibd_drain_rcq: "
7506 					    "expected flushed rwqe");
7507 				} else {
7508 					DPRINT(5, "ibd_drain_rcq: "
7509 					    "unexpected wc_status %d",
7510 					    wc->wc_status);
7511 				}
7512 				atomic_inc_32(
7513 				    &state->id_rx_list.dl_bufs_outstanding);
7514 				freemsg(rwqe->rwqe_im_mblk);
7515 				continue;
7516 			}
7517 			mp = ibd_process_rx(state, rwqe, wc);
7518 			if (mp == NULL)
7519 				continue;
7520 
7521 			/*
7522 			 * Add this mp to the list to send to the nw layer.
7523 			 */
7524 			if (head)
7525 				tail->b_next = mp;
7526 			else
7527 				head = mp;
7528 			tail = mp;
7529 		}
7530 		if (head)
7531 			mac_rx(state->id_mh, state->id_rh, head);
7532 
7533 		/*
7534 		 * Account for #rwqes polled.
7535 		 * Post more here, if less than one fourth full.
7536 		 */
7537 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
7538 		    (state->id_ud_num_rwqe / 4))
7539 			ibd_post_recv_intr(state);
7540 	}
7541 }
7542 
7543 /*
7544  * Common code for interrupt handling as well as for polling
7545  * for all completed wqe's while detaching.
7546  */
7547 static void
7548 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7549 {
7550 	int flag, redo_flag;
7551 	int redo = 1;
7552 
7553 	flag = IBD_CQ_POLLING;
7554 	redo_flag = IBD_REDO_CQ_POLLING;
7555 
7556 	mutex_enter(&state->id_scq_poll_lock);
7557 	if (state->id_scq_poll_busy & flag) {
7558 		ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
7559 		state->id_scq_poll_busy |= redo_flag;
7560 		mutex_exit(&state->id_scq_poll_lock);
7561 		return;
7562 	}
7563 	state->id_scq_poll_busy |= flag;
7564 	mutex_exit(&state->id_scq_poll_lock);
7565 
7566 	/*
7567 	 * In some cases (eg detaching), this code can be invoked on
7568 	 * any cpu after disabling cq notification (thus no concurrency
7569 	 * exists). Apart from that, the following applies normally:
7570 	 * Transmit completion handling could be from any cpu if
7571 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
7572 	 * is interrupt driven.
7573 	 */
7574 
7575 	/*
7576 	 * Poll and drain the CQ
7577 	 */
7578 	ibd_drain_scq(state, cq_hdl);
7579 
7580 	/*
7581 	 * Enable CQ notifications and redrain the cq to catch any
7582 	 * completions we might have missed after the ibd_drain_scq()
7583 	 * above and before the ibt_enable_cq_notify() that follows.
7584 	 * Finally, service any new requests to poll the cq that
7585 	 * could've come in after the ibt_enable_cq_notify().
7586 	 */
7587 	do {
7588 		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
7589 		    IBT_SUCCESS) {
7590 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7591 		}
7592 
7593 		ibd_drain_scq(state, cq_hdl);
7594 
7595 		mutex_enter(&state->id_scq_poll_lock);
7596 		if (state->id_scq_poll_busy & redo_flag)
7597 			state->id_scq_poll_busy &= ~redo_flag;
7598 		else {
7599 			state->id_scq_poll_busy &= ~flag;
7600 			redo = 0;
7601 		}
7602 		mutex_exit(&state->id_scq_poll_lock);
7603 
7604 	} while (redo);
7605 }
7606 
7607 /*
7608  * Common code for interrupt handling as well as for polling
7609  * for all completed wqe's while detaching.
7610  */
7611 static void
7612 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
7613 {
7614 	int flag, redo_flag;
7615 	int redo = 1;
7616 
7617 	flag = IBD_CQ_POLLING;
7618 	redo_flag = IBD_REDO_CQ_POLLING;
7619 
7620 	mutex_enter(&state->id_rcq_poll_lock);
7621 	if (state->id_rcq_poll_busy & flag) {
7622 		ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
7623 		state->id_rcq_poll_busy |= redo_flag;
7624 		mutex_exit(&state->id_rcq_poll_lock);
7625 		return;
7626 	}
7627 	state->id_rcq_poll_busy |= flag;
7628 	mutex_exit(&state->id_rcq_poll_lock);
7629 
7630 	/*
7631 	 * Poll and drain the CQ
7632 	 */
7633 	ibd_drain_rcq(state, rcq);
7634 
7635 	/*
7636 	 * Enable CQ notifications and redrain the cq to catch any
7637 	 * completions we might have missed after the ibd_drain_cq()
7638 	 * above and before the ibt_enable_cq_notify() that follows.
7639 	 * Finally, service any new requests to poll the cq that
7640 	 * could've come in after the ibt_enable_cq_notify().
7641 	 */
7642 	do {
7643 		if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
7644 		    IBT_SUCCESS) {
7645 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7646 		}
7647 
7648 		ibd_drain_rcq(state, rcq);
7649 
7650 		mutex_enter(&state->id_rcq_poll_lock);
7651 		if (state->id_rcq_poll_busy & redo_flag)
7652 			state->id_rcq_poll_busy &= ~redo_flag;
7653 		else {
7654 			state->id_rcq_poll_busy &= ~flag;
7655 			redo = 0;
7656 		}
7657 		mutex_exit(&state->id_rcq_poll_lock);
7658 
7659 	} while (redo);
7660 }
7661 
7662 /*
7663  * Unmap the memory area associated with a given swqe.
7664  */
7665 void
7666 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
7667 {
7668 	ibt_status_t stat;
7669 
7670 	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
7671 
7672 	if (swqe->w_mi_hdl) {
7673 		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
7674 		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
7675 			DPRINT(10,
7676 			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
7677 		}
7678 		swqe->w_mi_hdl = NULL;
7679 	}
7680 	swqe->w_swr.wr_nds = 0;
7681 }
7682 
7683 void
7684 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
7685 {
7686 	/*
7687 	 * The recycling logic can be eliminated from here
7688 	 * and put into the async thread if we create another
7689 	 * list to hold ACE's for unjoined mcg's.
7690 	 */
7691 	if (DEC_REF_DO_CYCLE(ace)) {
7692 		ibd_mce_t *mce;
7693 
7694 		/*
7695 		 * Check with the lock taken: we decremented
7696 		 * reference count without the lock, and some
7697 		 * transmitter might already have bumped the
7698 		 * reference count (possible in case of multicast
7699 		 * disable when we leave the AH on the active
7700 		 * list). If not still 0, get out, leaving the
7701 		 * recycle bit intact.
7702 		 *
7703 		 * Atomically transition the AH from active
7704 		 * to free list, and queue a work request to
7705 		 * leave the group and destroy the mce. No
7706 		 * transmitter can be looking at the AH or
7707 		 * the MCE in between, since we have the
7708 		 * ac_mutex lock. In the SendOnly reap case,
7709 		 * it is not necessary to hold the ac_mutex
7710 		 * and recheck the ref count (since the AH was
7711 		 * taken off the active list), we just do it
7712 		 * to have uniform processing with the Full
7713 		 * reap case.
7714 		 */
7715 		mutex_enter(&state->id_ac_mutex);
7716 		mce = ace->ac_mce;
7717 		if (GET_REF_CYCLE(ace) == 0) {
7718 			CLEAR_REFCYCLE(ace);
7719 			/*
7720 			 * Identify the case of fullmember reap as
7721 			 * opposed to mcg trap reap. Also, port up
7722 			 * might set ac_mce to NULL to indicate Tx
7723 			 * cleanup should do no more than put the
7724 			 * AH in the free list (see ibd_async_link).
7725 			 */
7726 			if (mce != NULL) {
7727 				ace->ac_mce = NULL;
7728 				IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
7729 				/*
7730 				 * mc_req was initialized at mce
7731 				 * creation time.
7732 				 */
7733 				ibd_queue_work_slot(state,
7734 				    &mce->mc_req, IBD_ASYNC_REAP);
7735 			}
7736 			IBD_ACACHE_INSERT_FREE(state, ace);
7737 		}
7738 		mutex_exit(&state->id_ac_mutex);
7739 	}
7740 }
7741 
7742 /*
7743  * Common code that deals with clean ups after a successful or
7744  * erroneous transmission attempt.
7745  */
7746 static void
7747 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
7748 {
7749 	ibd_ace_t *ace = swqe->w_ahandle;
7750 
7751 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
7752 
7753 	/*
7754 	 * If this was a dynamic mapping in ibd_send(), we need to
7755 	 * unmap here. If this was an lso buffer we'd used for sending,
7756 	 * we need to release the lso buf to the pool, since the resource
7757 	 * is scarce. However, if this was simply a normal send using
7758 	 * the copybuf (present in each swqe), we don't need to release it.
7759 	 */
7760 	if (swqe->swqe_im_mblk != NULL) {
7761 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
7762 			ibd_unmap_mem(state, swqe);
7763 		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7764 			ibd_release_lsobufs(state,
7765 			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7766 		}
7767 		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7768 		freemsg(swqe->swqe_im_mblk);
7769 		swqe->swqe_im_mblk = NULL;
7770 	}
7771 
7772 	/*
7773 	 * Drop the reference count on the AH; it can be reused
7774 	 * now for a different destination if there are no more
7775 	 * posted sends that will use it. This can be eliminated
7776 	 * if we can always associate each Tx buffer with an AH.
7777 	 * The ace can be null if we are cleaning up from the
7778 	 * ibd_send() error path.
7779 	 */
7780 	if (ace != NULL) {
7781 		ibd_dec_ref_ace(state, ace);
7782 	}
7783 
7784 	/*
7785 	 * Release the send wqe for reuse.
7786 	 */
7787 	swqe->swqe_next = NULL;
7788 	ibd_release_swqe(state, swqe, swqe, 1);
7789 }
7790 
7791 static void
7792 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
7793 {
7794 	ibd_ace_t *ace;
7795 	ibd_swqe_t *swqe;
7796 	int n = 0;
7797 
7798 	DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
7799 
7800 	for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
7801 
7802 		/*
7803 		 * If this was a dynamic mapping in ibd_send(), we need to
7804 		 * unmap here. If this was an lso buffer we'd used for sending,
7805 		 * we need to release the lso buf to the pool, since the
7806 		 * resource is scarce. However, if this was simply a normal
7807 		 * send using the copybuf (present in each swqe), we don't need
7808 		 * to release it.
7809 		 */
7810 		if (swqe->swqe_im_mblk != NULL) {
7811 			if (swqe->w_buftype == IBD_WQE_MAPPED) {
7812 				ibd_unmap_mem(state, swqe);
7813 			} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7814 				ibd_release_lsobufs(state,
7815 				    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7816 			}
7817 			ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7818 			freemsg(swqe->swqe_im_mblk);
7819 			swqe->swqe_im_mblk = NULL;
7820 		}
7821 
7822 		/*
7823 		 * Drop the reference count on the AH; it can be reused
7824 		 * now for a different destination if there are no more
7825 		 * posted sends that will use it. This can be eliminated
7826 		 * if we can always associate each Tx buffer with an AH.
7827 		 * The ace can be null if we are cleaning up from the
7828 		 * ibd_send() error path.
7829 		 */
7830 		ace = swqe->w_ahandle;
7831 		if (ace != NULL) {
7832 			ibd_dec_ref_ace(state, ace);
7833 		}
7834 		n++;
7835 	}
7836 
7837 	/*
7838 	 * Release the send wqes for reuse.
7839 	 */
7840 	ibd_release_swqe(state, head, tail, n);
7841 }
7842 
/*
 * Processing to be done after receipt of a packet; hand off to GLD
 * in the format expected by GLD.  The received packet has this
 * format: 2b sap :: 00 :: data.
 *
 * state: driver instance the packet arrived on.
 * rwqe:  receive wqe whose buffer holds the packet.
 * wc:    work completion describing the receive (length, flags, source
 *        QPN and checksum are read from it).
 *
 * Returns the mblk to pass up, or NULL when the packet is dropped
 * (allocb failure in the copy path, or a looped-back packet).
 *
 * NOTE(review): the headers are parsed without any visible check that
 * wc_bytes_xfer covers them; presumably that is guaranteed elsewhere -
 * confirm.
 */
static mblk_t *
ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
{
	ib_header_info_t *phdr;
	mblk_t *mp;
	ipoib_hdr_t *ipibp;
	ipha_t *iphap;
	ip6_t *ip6h;
	int len;
	ib_msglen_t pkt_len = wc->wc_bytes_xfer;
	uint32_t bufs;

	/*
	 * Track number handed to upper layer that need to be returned.
	 */
	bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);

	/* Never run out of rwqes, use allocb when running low */
	if (bufs >= state->id_rx_bufs_outstanding_limit) {
		/*
		 * Too many buffers are loaned out: undo the increment,
		 * copy the packet into a freshly allocated mblk and
		 * repost the rwqe right away instead of loaning it.
		 */
		atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
		atomic_inc_32(&state->id_rx_allocb);
		mp = allocb(pkt_len, BPRI_HI);
		if (mp) {
			bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
			ibd_post_recv(state, rwqe);
		} else {	/* no memory */
			atomic_inc_32(&state->id_rx_allocb_failed);
			ibd_post_recv(state, rwqe);
			return (NULL);
		}
	} else {
		/* Loan the rwqe's own mblk to the upper layer. */
		mp = rwqe->rwqe_im_mblk;
	}


	/*
	 * Adjust write pointer depending on how much data came in.
	 */
	mp->b_wptr = mp->b_rptr + pkt_len;

	/*
	 * Make sure this is NULL or we're in trouble.
	 */
	if (mp->b_next != NULL) {
		ibd_print_warn(state,
		    "ibd_process_rx: got duplicate mp from rcq?");
		mp->b_next = NULL;
	}

	/*
	 * The IB link delivers one of the IB link layer headers, the
	 * Global Routing Header (GRH).  The ibd driver uses the
	 * information in the GRH to build the Header_info structure and
	 * passes it with the datagram up to GLDv3.
	 * If the GRH is not valid, indicate to GLDv3 by setting
	 * the VerTcFlow field to 0.
	 */
	phdr = (ib_header_info_t *)mp->b_rptr;
	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
		/* Record the sender's QPN in the header (network order). */
		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);

		/*
		 * If it is a loop back packet, just drop it.
		 * NOTE(review): when mp is the rwqe's own mblk, the
		 * freemsg() below presumably recycles the rwqe through
		 * its free callback - confirm.
		 */
		if (state->id_enable_rc) {
			if (bcmp(&phdr->ib_grh.ipoib_sqpn,
			    &state->rc_macaddr_loopback,
			    IPOIB_ADDRL) == 0) {
				freemsg(mp);
				return (NULL);
			}
		} else {
			if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
			    IPOIB_ADDRL) == 0) {
				freemsg(mp);
				return (NULL);
			}
		}

		/*
		 * Build the source address in place (ovbcopy tolerates
		 * overlapping regions), then the destination address:
		 * a dgid whose first byte is 0xFF is treated as
		 * multicast, anything else as unicast to this interface.
		 */
		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
		    sizeof (ipoib_mac_t));
		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
		} else {
			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
		}
	} else {
		/*
		 * It can not be a IBA multicast packet. Must have been
		 * unicast for us. Just copy the interface address to dst.
		 */
		phdr->ib_grh.ipoib_vertcflow = 0;
		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
		    sizeof (ipoib_mac_t));
	}

	/*
	 * For ND6 packets, padding is at the front of the source/target
	 * lladdr. However the inet6 layer is not aware of it, hence remove
	 * the padding from such packets.
	 */
	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
		len = ntohs(ip6h->ip6_plen);
		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
			/* LINTED: E_CONSTANT_CONDITION */
			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
		}
	}

	/*
	 * Update statistics
	 */
	atomic_add_64(&state->id_rcv_bytes, pkt_len);
	atomic_inc_64(&state->id_rcv_pkt);
	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
		atomic_inc_64(&state->id_brd_rcv);
	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
		atomic_inc_64(&state->id_multi_rcv);

	/* The payload after the IPoIB header, viewed as an IPv4 header. */
	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
	/*
	 * Set receive checksum status in mp
	 * Hardware checksumming can be considered valid only if:
	 * 1. CQE.IP_OK bit is set
	 * 2. CQE.CKSUM = 0xffff
	 * 3. IPv6 routing header is not present in the packet
	 * 4. If there are no IP_OPTIONS in the IP HEADER
	 *
	 * The test below checks the completion's checksum flag and value
	 * and that the first header byte equals IP_SIMPLE_HDR_VERSION; it
	 * does not look at the IPoIB type field.
	 */

	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
	    (wc->wc_cksum == 0xFFFF) &&
	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
		mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
	}

	return (mp);
}
7987 
7988 /*
7989  * Callback code invoked from STREAMs when the receive data buffer is
7990  * free for recycling.
7991  */
7992 static void
7993 ibd_freemsg_cb(char *arg)
7994 {
7995 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
7996 	ibd_state_t *state = rwqe->w_state;
7997 
7998 	atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
7999 
8000 	/*
8001 	 * If the driver is stopped, just free the rwqe.
8002 	 */
8003 	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
8004 		DPRINT(6, "ibd_freemsg: wqe being freed");
8005 		rwqe->rwqe_im_mblk = NULL;
8006 		ibd_free_rwqe(state, rwqe);
8007 		return;
8008 	}
8009 
8010 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
8011 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
8012 	if (rwqe->rwqe_im_mblk == NULL) {
8013 		ibd_free_rwqe(state, rwqe);
8014 		DPRINT(6, "ibd_freemsg: desballoc failed");
8015 		return;
8016 	}
8017 
8018 	ibd_post_recv(state, rwqe);
8019 }
8020 
8021 static uint_t
8022 ibd_tx_recycle(caddr_t arg)
8023 {
8024 	ibd_state_t *state = (ibd_state_t *)arg;
8025 
8026 	/*
8027 	 * Poll for completed entries
8028 	 */
8029 	ibd_poll_scq(state, state->id_scq_hdl);
8030 
8031 	return (DDI_INTR_CLAIMED);
8032 }
8033 
8034 #ifdef IBD_LOGGING
8035 static void
8036 ibd_log_init(void)
8037 {
8038 	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
8039 	ibd_lbuf_ndx = 0;
8040 
8041 	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
8042 }
8043 
8044 static void
8045 ibd_log_fini(void)
8046 {
8047 	if (ibd_lbuf)
8048 		kmem_free(ibd_lbuf, IBD_LOG_SZ);
8049 	ibd_lbuf_ndx = 0;
8050 	ibd_lbuf = NULL;
8051 
8052 	mutex_destroy(&ibd_lbuf_lock);
8053 }
8054 
8055 static void
8056 ibd_log(const char *fmt, ...)
8057 {
8058 	va_list	ap;
8059 	uint32_t off;
8060 	uint32_t msglen;
8061 	char tmpbuf[IBD_DMAX_LINE];
8062 
8063 	if (ibd_lbuf == NULL)
8064 		return;
8065 
8066 	va_start(ap, fmt);
8067 	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
8068 	va_end(ap);
8069 
8070 	if (msglen >= IBD_DMAX_LINE)
8071 		msglen = IBD_DMAX_LINE - 1;
8072 
8073 	mutex_enter(&ibd_lbuf_lock);
8074 
8075 	off = ibd_lbuf_ndx;		/* current msg should go here */
8076 	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
8077 		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
8078 
8079 	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
8080 	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */
8081 
8082 	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
8083 		ibd_lbuf_ndx = 0;
8084 
8085 	mutex_exit(&ibd_lbuf_lock);
8086 
8087 	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
8088 }
8089 #endif
8090 
8091 /* ARGSUSED */
8092 static int
8093 ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8094     int *rvalp)
8095 {
8096 	ibd_create_ioctl_t	*cmd = karg;
8097 	ibd_state_t		*state, *port_state, *p;
8098 	int			i, err, rval = 0;
8099 	mac_register_t		*macp;
8100 	ibt_hca_portinfo_t 	*pinfop = NULL;
8101 	ibt_status_t 		ibt_status;
8102 	uint_t 			psize, pinfosz;
8103 	boolean_t		force_create = B_FALSE;
8104 
8105 	cmd->ibdioc.ioc_status = 0;
8106 
8107 	if (cmd->ibdioc.ioc_port_inst < 0) {
8108 		cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8109 		return (EINVAL);
8110 	}
8111 	port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst);
8112 	if (port_state == NULL) {
8113 		DPRINT(10, "ibd_create_partition: failed to get state %d",
8114 		    cmd->ibdioc.ioc_port_inst);
8115 		cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8116 		return (EINVAL);
8117 	}
8118 
8119 	/* Limited PKeys not supported */
8120 	if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) {
8121 		rval = EINVAL;
8122 		goto part_create_return;
8123 	}
8124 
8125 	if (cmd->ioc_force_create == 0) {
8126 		/*
8127 		 * Check if the port pkey table contains the pkey for which
8128 		 * this partition is being created.
8129 		 */
8130 		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8131 		    port_state->id_port, &pinfop, &psize, &pinfosz);
8132 
8133 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8134 			rval = EINVAL;
8135 			goto part_create_return;
8136 		}
8137 
8138 		if (pinfop->p_linkstate != IBT_PORT_ACTIVE) {
8139 			rval = ENETDOWN;
8140 			cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN;
8141 			goto part_create_return;
8142 		}
8143 
8144 		for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) {
8145 			if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) {
8146 				break;
8147 			}
8148 		}
8149 		if (i == pinfop->p_pkey_tbl_sz) {
8150 			rval = EINVAL;
8151 			cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT;
8152 			goto part_create_return;
8153 		}
8154 	} else {
8155 		force_create = B_TRUE;
8156 	}
8157 
8158 	mutex_enter(&ibd_objlist_lock);
8159 	for (p = ibd_objlist_head; p; p = p->id_next) {
8160 		if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) &&
8161 		    (p->id_pkey == cmd->ioc_pkey)) {
8162 			mutex_exit(&ibd_objlist_lock);
8163 			rval = EEXIST;
8164 			cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS;
8165 			goto part_create_return;
8166 		}
8167 	}
8168 	mutex_exit(&ibd_objlist_lock);
8169 
8170 	state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP);
8171 
8172 	state->id_type		= IBD_PARTITION_OBJ;
8173 
8174 	state->id_plinkid	= cmd->ioc_partid;
8175 	state->id_dlinkid	= cmd->ibdioc.ioc_linkid;
8176 	state->id_port_inst	= cmd->ibdioc.ioc_port_inst;
8177 
8178 	state->id_dip		= port_state->id_dip;
8179 	state->id_port		= port_state->id_port;
8180 	state->id_pkey		= cmd->ioc_pkey;
8181 	state->id_hca_guid	= port_state->id_hca_guid;
8182 	state->id_port_guid	= port_state->id_port_guid;
8183 	state->id_force_create	= force_create;
8184 
8185 	mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
8186 	cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
8187 
8188 	if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) {
8189 		rval = EIO;
8190 		cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE;
8191 		goto fail;
8192 	}
8193 
8194 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
8195 		rval = EAGAIN;
8196 		goto fail;
8197 	}
8198 
8199 	macp->m_type_ident	= MAC_PLUGIN_IDENT_IB;
8200 	macp->m_dip		= port_state->id_dip;
8201 	macp->m_instance	= (uint_t)-1;
8202 	macp->m_driver		= state;
8203 	macp->m_src_addr	= (uint8_t *)&state->id_macaddr;
8204 	macp->m_callbacks	= &ibd_m_callbacks;
8205 	macp->m_min_sdu		= 0;
8206 	if (state->id_enable_rc) {
8207 		macp->m_max_sdu		= IBD_DEF_RC_MAX_SDU;
8208 	} else {
8209 		macp->m_max_sdu		= IBD_DEF_MAX_SDU;
8210 	}
8211 	macp->m_priv_props = ibd_priv_props;
8212 
8213 	err = mac_register(macp, &state->id_mh);
8214 	mac_free(macp);
8215 
8216 	if (err != 0) {
8217 		DPRINT(10, "ibd_create_partition: mac_register() failed %d",
8218 		    err);
8219 		rval = err;
8220 		goto fail;
8221 	}
8222 
8223 	err = dls_devnet_create(state->id_mh,
8224 	    cmd->ioc_partid, crgetzoneid(credp));
8225 	if (err != 0) {
8226 		DPRINT(10, "ibd_create_partition: dls_devnet_create() failed "
8227 		    "%d", err);
8228 		rval = err;
8229 		(void) mac_unregister(state->id_mh);
8230 		goto fail;
8231 	}
8232 
8233 	/*
8234 	 * Add the new partition state structure to the list
8235 	 */
8236 	mutex_enter(&ibd_objlist_lock);
8237 	if (ibd_objlist_head)
8238 		state->id_next = ibd_objlist_head;
8239 
8240 	ibd_objlist_head = state;
8241 	mutex_exit(&ibd_objlist_lock);
8242 
8243 part_create_return:
8244 	if (pinfop) {
8245 		ibt_free_portinfo(pinfop, pinfosz);
8246 	}
8247 	return (rval);
8248 
8249 fail:
8250 	if (pinfop) {
8251 		ibt_free_portinfo(pinfop, pinfosz);
8252 	}
8253 	(void) ibd_part_unattach(state);
8254 	kmem_free(state, sizeof (ibd_state_t));
8255 	return (rval);
8256 }
8257 
/*
 * ioctl handler that deletes the partition object whose partition link
 * id matches cmd->ioc_partid.
 *
 * The object is looked up on the global list, its devnet link is
 * destroyed, MAC is disabled, the partition is unattached and unlinked
 * from the list, and finally its MAC registration, synchronization
 * objects and memory are released.
 *
 * Returns 0 on success, ENOENT if no such partition exists, EBUSY if
 * the instance is started, in late HCA init, or cannot be unattached,
 * or the error from dls_devnet_destroy()/mac_disable().
 */
/* ARGSUSED */
static int
ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	int err;
	datalink_id_t tmpid;
	ibd_state_t *node, *prev;
	ibd_delete_ioctl_t *cmd = karg;

	prev = NULL;

	mutex_enter(&ibd_objlist_lock);
	node = ibd_objlist_head;

	/* Find the ibd state structure corresponding to the partition */
	while (node != NULL) {
		if (node->id_plinkid == cmd->ioc_partid)
			break;
		prev = node;
		node = node->id_next;
	}

	if (node == NULL) {
		mutex_exit(&ibd_objlist_lock);
		return (ENOENT);
	}

	if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) {
		DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed "
		    "%d", err);
		mutex_exit(&ibd_objlist_lock);
		return (err);
	}

	/* If MAC cannot be disabled, put the devnet link back. */
	if ((err = mac_disable(node->id_mh)) != 0) {
		(void) dls_devnet_create(node->id_mh, cmd->ioc_partid,
		    crgetzoneid(credp));
		mutex_exit(&ibd_objlist_lock);
		return (err);
	}

	/*
	 * Call ibd_part_unattach() only after making sure that the instance has
	 * not been started yet and is also not in late hca init mode.
	 *
	 * NOTE(review): the EBUSY return below does not recreate the devnet
	 * link or undo mac_disable() - confirm this is intended.
	 */
	ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
	if ((node->id_mac_state & IBD_DRV_STARTED) ||
	    (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ||
	    (ibd_part_unattach(node) != DDI_SUCCESS)) {
		ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
		mutex_exit(&ibd_objlist_lock);
		return (EBUSY);
	}
	node->id_mac_state |= IBD_DRV_IN_DELETION;
	ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);

	/* Remove the partition state structure from the linked list */
	if (prev == NULL)
		ibd_objlist_head = node->id_next;
	else
		prev->id_next = node->id_next;
	mutex_exit(&ibd_objlist_lock);

	/* A failure here is only logged; the state is freed regardless. */
	if ((err = mac_unregister(node->id_mh)) != 0) {
		DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d",
		    err);
	}

	cv_destroy(&node->id_macst_cv);
	mutex_destroy(&node->id_macst_lock);

	kmem_free(node, sizeof (ibd_state_t));

	return (0);
}
8334 
8335 /* ARGSUSED */
8336 static int
8337 ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred,
8338     int *rvalp)
8339 {
8340 	ibd_ioctl_t		cmd;
8341 	ibpart_ioctl_t		partioc;
8342 	ibport_ioctl_t		portioc;
8343 #ifdef _MULTI_DATAMODEL
8344 	ibport_ioctl32_t	portioc32;
8345 #endif
8346 	ibd_state_t		*state, *port_state;
8347 	int			size;
8348 	ibt_hca_portinfo_t 	*pinfop = NULL;
8349 	ibt_status_t 		ibt_status;
8350 	uint_t 			psize, pinfosz;
8351 	int			rval = 0;
8352 
8353 	size = sizeof (ibd_ioctl_t);
8354 	if (ddi_copyin((void *)arg, &cmd, size, mode)) {
8355 		return (EFAULT);
8356 	}
8357 	cmd.ioc_status = 0;
8358 	switch (cmd.ioc_info_cmd) {
8359 	case IBD_INFO_CMD_IBPART:
8360 		size = sizeof (ibpart_ioctl_t);
8361 		if (ddi_copyin((void *)arg, &partioc, size, mode)) {
8362 			return (EFAULT);
8363 		}
8364 
8365 		mutex_enter(&ibd_objlist_lock);
8366 		/* Find the ibd state structure corresponding the partition */
8367 		for (state = ibd_objlist_head; state; state = state->id_next) {
8368 			if (state->id_plinkid == cmd.ioc_linkid) {
8369 				break;
8370 			}
8371 		}
8372 
8373 		if (state == NULL) {
8374 			mutex_exit(&ibd_objlist_lock);
8375 			return (ENOENT);
8376 		}
8377 
8378 		partioc.ibdioc.ioc_linkid = state->id_dlinkid;
8379 		partioc.ibdioc.ioc_port_inst = state->id_port_inst;
8380 		partioc.ibdioc.ioc_portnum = state->id_port;
8381 		partioc.ibdioc.ioc_hcaguid = state->id_hca_guid;
8382 		partioc.ibdioc.ioc_portguid = state->id_port_guid;
8383 		partioc.ibdioc.ioc_status = 0;
8384 		partioc.ioc_partid = state->id_plinkid;
8385 		partioc.ioc_pkey = state->id_pkey;
8386 		partioc.ioc_force_create = state->id_force_create;
8387 		if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) {
8388 			mutex_exit(&ibd_objlist_lock);
8389 			return (EFAULT);
8390 		}
8391 		mutex_exit(&ibd_objlist_lock);
8392 
8393 		break;
8394 
8395 	case IBD_INFO_CMD_IBPORT:
8396 		if ((cmd.ioc_port_inst < 0) || ((port_state =
8397 		    ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8398 			DPRINT(10, "ibd_create_partition: failed to get"
8399 			    " state %d", cmd.ioc_port_inst);
8400 			size = sizeof (ibd_ioctl_t);
8401 			cmd.ioc_status = IBD_INVALID_PORT_INST;
8402 			if (ddi_copyout((void *)&cmd, (void *)arg, size,
8403 			    mode)) {
8404 				return (EFAULT);
8405 			}
8406 			return (EINVAL);
8407 		}
8408 		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8409 		    port_state->id_port, &pinfop, &psize, &pinfosz);
8410 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8411 			return (EINVAL);
8412 		}
8413 #ifdef _MULTI_DATAMODEL
8414 		switch (ddi_model_convert_from(mode & FMODELS)) {
8415 		case DDI_MODEL_ILP32: {
8416 			size = sizeof (ibport_ioctl32_t);
8417 			if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8418 				rval = EFAULT;
8419 				goto fail;
8420 			}
8421 			portioc32.ibdioc.ioc_status = 0;
8422 			portioc32.ibdioc.ioc_portnum = port_state->id_port;
8423 			portioc32.ibdioc.ioc_hcaguid =
8424 			    port_state->id_hca_guid;
8425 			portioc32.ibdioc.ioc_portguid =
8426 			    port_state->id_port_guid;
8427 			if (portioc32.ioc_pkey_tbl_sz !=
8428 			    pinfop->p_pkey_tbl_sz) {
8429 				rval = EINVAL;
8430 				size = sizeof (ibd_ioctl_t);
8431 				portioc32.ibdioc.ioc_status =
8432 				    IBD_INVALID_PKEY_TBL_SIZE;
8433 				if (ddi_copyout((void *)&portioc32.ibdioc,
8434 				    (void *)arg, size, mode)) {
8435 					rval = EFAULT;
8436 					goto fail;
8437 				}
8438 				goto fail;
8439 			}
8440 			size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8441 			if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8442 			    (void *)(uintptr_t)portioc32.ioc_pkeys, size,
8443 			    mode)) {
8444 				rval = EFAULT;
8445 				goto fail;
8446 			}
8447 			size = sizeof (ibport_ioctl32_t);
8448 			if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8449 			    mode)) {
8450 				rval = EFAULT;
8451 				goto fail;
8452 			}
8453 			break;
8454 		}
8455 		case DDI_MODEL_NONE:
8456 			size = sizeof (ibport_ioctl_t);
8457 			if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8458 				rval = EFAULT;
8459 				goto fail;
8460 			}
8461 			portioc.ibdioc.ioc_status = 0;
8462 			portioc.ibdioc.ioc_portnum = port_state->id_port;
8463 			portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8464 			portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8465 			if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8466 				rval = EINVAL;
8467 				size = sizeof (ibd_ioctl_t);
8468 				portioc.ibdioc.ioc_status =
8469 				    IBD_INVALID_PKEY_TBL_SIZE;
8470 				if (ddi_copyout((void *)&portioc.ibdioc,
8471 				    (void *)arg, size, mode)) {
8472 					rval = EFAULT;
8473 					goto fail;
8474 				}
8475 				goto fail;
8476 			}
8477 			size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8478 			if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8479 			    (void *)(portioc.ioc_pkeys), size, mode)) {
8480 				rval = EFAULT;
8481 				goto fail;
8482 			}
8483 			size = sizeof (ibport_ioctl_t);
8484 			if (ddi_copyout((void *)&portioc, (void *)arg, size,
8485 			    mode)) {
8486 				rval = EFAULT;
8487 				goto fail;
8488 			}
8489 			break;
8490 		}
8491 #else /* ! _MULTI_DATAMODEL */
8492 		size = sizeof (ibport_ioctl_t);
8493 		if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8494 			rval = EFAULT;
8495 			goto fail;
8496 		}
8497 		portioc.ibdioc.ioc_status = 0;
8498 		portioc.ibdioc.ioc_portnum = port_state->id_port;
8499 		portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8500 		portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8501 		if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8502 			rval = EINVAL;
8503 			size = sizeof (ibd_ioctl_t);
8504 			portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE;
8505 			if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg,
8506 			    size, mode)) {
8507 				rval = EFAULT;
8508 				goto fail;
8509 			}
8510 			goto fail;
8511 		}
8512 		size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8513 		if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8514 		    (void *)(portioc.ioc_pkeys), size, mode)) {
8515 			rval = EFAULT;
8516 			goto fail;
8517 		}
8518 		size = sizeof (ibport_ioctl_t);
8519 		if (ddi_copyout((void *)&portioc, (void *)arg, size,
8520 		    mode)) {
8521 			rval = EFAULT;
8522 			goto fail;
8523 		}
8524 #endif /* _MULTI_DATAMODEL */
8525 
8526 		break;
8527 
8528 	case IBD_INFO_CMD_PKEYTBLSZ:
8529 		if ((cmd.ioc_port_inst < 0) || ((port_state =
8530 		    ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8531 			DPRINT(10, "ibd_create_partition: failed to get"
8532 			    " state %d", cmd.ioc_port_inst);
8533 			size = sizeof (ibd_ioctl_t);
8534 			cmd.ioc_status = IBD_INVALID_PORT_INST;
8535 			if (ddi_copyout((void *)&cmd, (void *)arg, size,
8536 			    mode)) {
8537 				return (EFAULT);
8538 			}
8539 			return (EINVAL);
8540 		}
8541 		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8542 		    port_state->id_port, &pinfop, &psize, &pinfosz);
8543 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8544 			return (EINVAL);
8545 		}
8546 #ifdef _MULTI_DATAMODEL
8547 		switch (ddi_model_convert_from(mode & FMODELS)) {
8548 		case DDI_MODEL_ILP32: {
8549 			size = sizeof (ibport_ioctl32_t);
8550 			if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8551 				rval = EFAULT;
8552 				goto fail;
8553 			}
8554 			portioc32.ibdioc.ioc_status = 0;
8555 			portioc32.ibdioc.ioc_portnum = port_state->id_port;
8556 			portioc32.ibdioc.ioc_hcaguid =
8557 			    port_state->id_hca_guid;
8558 			portioc32.ibdioc.ioc_portguid =
8559 			    port_state->id_port_guid;
8560 			portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8561 			if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8562 			    mode)) {
8563 				rval = EFAULT;
8564 				goto fail;
8565 			}
8566 			break;
8567 		}
8568 		case DDI_MODEL_NONE:
8569 			size = sizeof (ibport_ioctl_t);
8570 			if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8571 				rval = EFAULT;
8572 				goto fail;
8573 			}
8574 			portioc.ibdioc.ioc_status = 0;
8575 			portioc.ibdioc.ioc_portnum = port_state->id_port;
8576 			portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8577 			portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8578 			portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8579 			if (ddi_copyout((void *)&portioc, (void *)arg, size,
8580 			    mode)) {
8581 				rval = EFAULT;
8582 				goto fail;
8583 			}
8584 			break;
8585 		}
8586 #else /* ! _MULTI_DATAMODEL */
8587 		size = sizeof (ibport_ioctl_t);
8588 		if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8589 			rval = EFAULT;
8590 			goto fail;
8591 		}
8592 		portioc.ibdioc.ioc_status = 0;
8593 		portioc.ibdioc.ioc_portnum = port_state->id_port;
8594 		portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8595 		portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8596 		portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8597 		if (ddi_copyout((void *)&portioc, (void *)arg, size,
8598 		    mode)) {
8599 			rval = EFAULT;
8600 			goto fail;
8601 		}
8602 #endif /* _MULTI_DATAMODEL */
8603 		break;
8604 
8605 	default:
8606 		return (EINVAL);
8607 
8608 	} /* switch (cmd.ioc_info_cmd) */
8609 fail:
8610 	if (pinfop) {
8611 		ibt_free_portinfo(pinfop, pinfosz);
8612 	}
8613 	return (rval);
8614 }
8615 
8616 /* ARGSUSED */
8617 static void
8618 ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl,
8619     ibt_async_code_t code, ibt_async_event_t *event)
8620 {
8621 	ibd_state_t *state = (ibd_state_t *)arg;
8622 	link_state_t	lstate;
8623 
8624 	switch (code) {
8625 	case IBT_EVENT_PORT_UP:
8626 	case IBT_ERROR_PORT_DOWN:
8627 		if (ibd_get_port_state(state, &lstate) != 0)
8628 			break;
8629 
8630 		if (state->id_link_state != lstate) {
8631 			state->id_link_state = lstate;
8632 			mac_link_update(state->id_mh, lstate);
8633 		}
8634 		break;
8635 	default:
8636 		break;
8637 	}
8638 }
8639 
8640 static int
8641 ibd_get_port_state(ibd_state_t *state, link_state_t *lstate)
8642 {
8643 	ibt_hca_portinfo_t *port_infop;
8644 	uint_t psize, port_infosz;
8645 	ibt_status_t	ret;
8646 
8647 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
8648 	    &port_infop, &psize, &port_infosz);
8649 	if ((ret != IBT_SUCCESS) || (psize != 1))
8650 		return (-1);
8651 
8652 	state->id_sgid = *port_infop->p_sgid_tbl;
8653 	state->id_link_speed = ibd_get_portspeed(state);
8654 
8655 	if (port_infop->p_linkstate == IBT_PORT_ACTIVE)
8656 		*lstate = LINK_STATE_UP;
8657 	else
8658 		*lstate = LINK_STATE_DOWN;
8659 
8660 	ibt_free_portinfo(port_infop, port_infosz);
8661 	return (0);
8662 }
8663 
8664 static int
8665 ibd_port_attach(dev_info_t *dip)
8666 {
8667 	ibd_state_t		*state;
8668 	link_state_t		lstate;
8669 	int			instance;
8670 	ibt_status_t		ret;
8671 
8672 	/*
8673 	 * Allocate softstate structure
8674 	 */
8675 	instance = ddi_get_instance(dip);
8676 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) {
8677 		DPRINT(10, "ibd_attach: ddi_soft_state_zalloc() failed");
8678 		return (DDI_FAILURE);
8679 	}
8680 
8681 	state = ddi_get_soft_state(ibd_list, instance);
8682 
8683 	state->id_dip = dip;
8684 	state->id_type = IBD_PORT_DRIVER;
8685 
8686 	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
8687 	    "port-number", 0)) == 0) {
8688 		DPRINT(10, "ibd_attach: invalid port number (%d)",
8689 		    state->id_port);
8690 		return (DDI_FAILURE);
8691 	}
8692 	if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8693 	    "hca-guid", 0)) == 0) {
8694 		DPRINT(10, "ibd_attach: hca has invalid guid (0x%llx)",
8695 		    state->id_hca_guid);
8696 		return (DDI_FAILURE);
8697 	}
8698 	if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8699 	    "port-guid", 0)) == 0) {
8700 		DPRINT(10, "ibd_attach: port has invalid guid (0x%llx)",
8701 		    state->id_port_guid);
8702 		return (DDI_FAILURE);
8703 	}
8704 
8705 	/*
8706 	 * Attach to IBTL
8707 	 */
8708 	if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state,
8709 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
8710 		DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret);
8711 		goto done;
8712 	}
8713 
8714 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
8715 
8716 	if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
8717 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
8718 		DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret);
8719 		goto done;
8720 	}
8721 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
8722 
8723 	/* Update link status */
8724 
8725 	if (ibd_get_port_state(state, &lstate) != 0) {
8726 		DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret);
8727 		goto done;
8728 	}
8729 	state->id_link_state = lstate;
8730 	/*
8731 	 * Register ibd interfaces with the Nemo framework
8732 	 */
8733 	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
8734 		DPRINT(10, "ibd_attach: failed in ibd_register_mac()");
8735 		goto done;
8736 	}
8737 	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
8738 
8739 	mac_link_update(state->id_mh, lstate);
8740 
8741 	return (DDI_SUCCESS);
8742 done:
8743 	(void) ibd_port_unattach(state, dip);
8744 	return (DDI_FAILURE);
8745 }
8746 
8747 static int
8748 ibd_port_unattach(ibd_state_t *state, dev_info_t *dip)
8749 {
8750 	int instance;
8751 	uint32_t progress = state->id_mac_state;
8752 	ibt_status_t ret;
8753 
8754 	if (progress & IBD_DRV_MAC_REGISTERED) {
8755 		(void) mac_unregister(state->id_mh);
8756 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
8757 	}
8758 
8759 	if (progress & IBD_DRV_HCA_OPENED) {
8760 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
8761 		    IBT_SUCCESS) {
8762 			ibd_print_warn(state, "failed to close "
8763 			    "HCA device, ret=%d", ret);
8764 		}
8765 		state->id_hca_hdl = NULL;
8766 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
8767 	}
8768 
8769 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
8770 		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
8771 			ibd_print_warn(state,
8772 			    "ibt_detach() failed, ret=%d", ret);
8773 		}
8774 		state->id_ibt_hdl = NULL;
8775 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
8776 	}
8777 	instance = ddi_get_instance(dip);
8778 	ddi_soft_state_free(ibd_list, instance);
8779 
8780 	return (DDI_SUCCESS);
8781 }
8782 
8783 ibt_status_t
8784 ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr)
8785 {
8786 	ibd_state_t	*state;
8787 
8788 	mutex_enter(&ibd_objlist_lock);
8789 
8790 	/* Find the ibd state structure corresponding the partition */
8791 	for (state = ibd_objlist_head; state; state = state->id_next) {
8792 		if (state->id_plinkid == linkid) {
8793 			break;
8794 		}
8795 	}
8796 
8797 	if (state == NULL) {
8798 		mutex_exit(&ibd_objlist_lock);
8799 		return (IBT_NO_SUCH_OBJECT);
8800 	}
8801 
8802 	attr->pa_dlinkid = state->id_dlinkid;
8803 	attr->pa_plinkid = state->id_plinkid;
8804 	attr->pa_port = state->id_port;
8805 	attr->pa_hca_guid = state->id_hca_guid;
8806 	attr->pa_port_guid = state->id_port_guid;
8807 	attr->pa_pkey = state->id_pkey;
8808 
8809 	mutex_exit(&ibd_objlist_lock);
8810 
8811 	return (IBT_SUCCESS);
8812 }
8813 
8814 ibt_status_t
8815 ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts)
8816 {
8817 	ibd_state_t	*state;
8818 	int		n = 0;
8819 	ibt_part_attr_t	*attr;
8820 
8821 	mutex_enter(&ibd_objlist_lock);
8822 
8823 	for (state = ibd_objlist_head; state; state = state->id_next)
8824 		n++;
8825 
8826 	*nparts = n;
8827 	if (n == 0) {
8828 		*attr_list = NULL;
8829 		mutex_exit(&ibd_objlist_lock);
8830 		return (IBT_SUCCESS);
8831 	}
8832 
8833 	*attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP);
8834 	attr = *attr_list;
8835 	for (state = ibd_objlist_head; state; state = state->id_next) {
8836 #ifdef DEBUG
8837 		ASSERT(n > 0);
8838 		n--;
8839 #endif
8840 		attr->pa_dlinkid = state->id_dlinkid;
8841 		attr->pa_plinkid = state->id_plinkid;
8842 		attr->pa_port = state->id_port;
8843 		attr->pa_hca_guid = state->id_hca_guid;
8844 		attr->pa_port_guid = state->id_port_guid;
8845 		attr->pa_pkey = state->id_pkey;
8846 		attr++;
8847 	}
8848 
8849 	mutex_exit(&ibd_objlist_lock);
8850 	return (IBT_SUCCESS);
8851 }
8852