xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision 4e567b4443d7a1680a7319275e5288eef2c92319)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * An implementation of the IPoIB standard based on PSARC 2001/289.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/conf.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/modctl.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 #include <sys/dlpi.h>
41 #include <sys/mac_provider.h>
42 
43 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
44 #include <sys/sysmacros.h>	/* for offsetof */
45 #include <sys/disp.h>		/* for async thread pri */
46 #include <sys/atomic.h>		/* for atomic_add*() */
47 #include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
48 #include <netinet/in.h>		/* for netinet/ip.h below */
49 #include <netinet/ip.h>		/* for struct ip */
50 #include <netinet/udp.h>	/* for struct udphdr */
51 #include <inet/common.h>	/* for inet/ip.h below */
52 #include <inet/ip.h>		/* for ipha_t */
53 #include <inet/ip6.h>		/* for ip6_t */
54 #include <inet/tcp.h>		/* for tcph_t */
55 #include <netinet/icmp6.h>	/* for icmp6_t */
56 #include <sys/callb.h>
57 #include <sys/modhash.h>
58 
59 #include <sys/ib/clients/ibd/ibd.h>
60 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
61 #include <sys/note.h>
62 #include <sys/multidata.h>
63 
64 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
65 
66 /*
67  * Per-interface tunables (for developers)
68  *
69  * ibd_tx_copy_thresh
70  *     This sets the threshold at which ibd will attempt to do a bcopy of the
71  *     outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior
72  *     is restricted by various parameters, so setting of this value must be
73  *     made after careful considerations only.  For instance, IB HCAs currently
74  *     impose a relatively small limit (when compared to ethernet NICs) on the
75  *     length of the SGL for transmit. On the other hand, the ip stack could
76  *     send down mp chains that are quite long when LSO is enabled.
77  *
78  * ibd_num_swqe
79  *     Number of "send WQE" elements that will be allocated and used by ibd.
80  *     When tuning this parameter, the size of pre-allocated, pre-mapped copy
81  *     buffer in each of these send wqes must be taken into account. This
82  *     copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is
83  *     currently set to the same value of ibd_tx_copy_thresh, but may be
84  *     changed independently if needed).
85  *
86  * ibd_num_rwqe
87  *     Number of "receive WQE" elements that will be allocated and used by
88  *     ibd. This parameter is limited by the maximum channel size of the HCA.
89  *     Each buffer in the receive wqe will be of MTU size.
90  *
91  * ibd_num_lso_bufs
92  *     Number of "larger-than-MTU" copy buffers to use for cases when the
93  *     outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
94  *     and too large to be used with regular MTU-sized copy buffers. It is
95  *     not recommended to tune this variable without understanding the
96  *     application environment and/or memory resources. The size of each of
97  *     these lso buffers is determined by the value of IBD_LSO_BUFSZ.
98  *
99  * ibd_num_ah
100  *     Number of AH cache entries to allocate
101  *
102  * ibd_hash_size
103  *     Hash table size for the active AH list
104  *
105  * ibd_tx_softintr
106  * ibd_rx_softintr
107  *     The softintr mechanism allows ibd to avoid event queue overflows if
108  *     the receive/completion handlers are to be expensive. These are enabled
109  *     by default.
110  *
111  * ibd_log_sz
112  *     This specifies the size of the ibd log buffer in bytes. The buffer is
113  *     allocated and logging is enabled only when IBD_LOGGING is defined.
114  *
115  */
116 uint_t ibd_tx_copy_thresh = 0x1000;
117 uint_t ibd_num_swqe = 4000;
118 uint_t ibd_num_rwqe = 4000;
119 uint_t ibd_num_lso_bufs = 0x400;
120 uint_t ibd_num_ah = 256;
121 uint_t ibd_hash_size = 32;
122 uint_t ibd_rx_softintr = 1;
123 uint_t ibd_tx_softintr = 1;
124 uint_t ibd_create_broadcast_group = 1;
125 #ifdef IBD_LOGGING
126 uint_t ibd_log_sz = 0x20000;
127 #endif
128 
129 #define	IBD_TX_COPY_THRESH		ibd_tx_copy_thresh
130 #define	IBD_TX_BUF_SZ			ibd_tx_copy_thresh
131 #define	IBD_NUM_SWQE			ibd_num_swqe
132 #define	IBD_NUM_RWQE			ibd_num_rwqe
133 #define	IBD_NUM_LSO_BUFS		ibd_num_lso_bufs
134 #define	IBD_NUM_AH			ibd_num_ah
135 #define	IBD_HASH_SIZE			ibd_hash_size
136 #ifdef IBD_LOGGING
137 #define	IBD_LOG_SZ			ibd_log_sz
138 #endif
139 
140 /*
141  * ibd_rc_tx_copy_thresh
142  *     This sets the threshold upto which ibd will attempt to do a bcopy of the
143  *     outgoing data into a pre-mapped buffer.
144  */
145 uint_t ibd_rc_tx_copy_thresh = 0x1000;
146 
147 /*
148  * Receive CQ moderation parameters: tunable (for developers)
149  */
150 uint_t ibd_rxcomp_count = 4;
151 uint_t ibd_rxcomp_usec = 10;
152 
153 /*
154  * Send CQ moderation parameters: tunable (for developers)
155  */
156 uint_t ibd_txcomp_count = 16;
157 uint_t ibd_txcomp_usec = 300;
158 
159 /* Post IBD_RX_POST_CNT receive work requests at a time. */
160 #define	IBD_RX_POST_CNT			8
161 
162 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
163 #define	IBD_LOG_RX_POST			4
164 
165 /* Minimum number of receive work requests driver needs to always have */
166 #define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
167 
168 /*
169  * LSO parameters
170  */
171 #define	IBD_LSO_MAXLEN			65536
172 #define	IBD_LSO_BUFSZ			8192
173 #define	IBD_PROP_LSO_POLICY		"lso-policy"
174 
175 /*
176  * Async operation states
177  */
178 #define	IBD_OP_NOTSTARTED		0
179 #define	IBD_OP_ONGOING			1
180 #define	IBD_OP_COMPLETED		2
181 #define	IBD_OP_ERRORED			3
182 #define	IBD_OP_ROUTERED			4
183 
184 /*
185  * State of IBD driver initialization during attach/m_start
186  */
187 #define	IBD_DRV_STATE_INITIALIZED	0x00001
188 #define	IBD_DRV_RXINTR_ADDED		0x00002
189 #define	IBD_DRV_TXINTR_ADDED		0x00004
190 #define	IBD_DRV_IBTL_ATTACH_DONE	0x00008
191 #define	IBD_DRV_HCA_OPENED		0x00010
192 #define	IBD_DRV_PD_ALLOCD		0x00020
193 #define	IBD_DRV_MAC_REGISTERED		0x00040
194 #define	IBD_DRV_PORT_DETAILS_OBTAINED	0x00080
195 #define	IBD_DRV_BCAST_GROUP_FOUND	0x00100
196 #define	IBD_DRV_ACACHE_INITIALIZED	0x00200
197 #define	IBD_DRV_CQS_ALLOCD		0x00400
198 #define	IBD_DRV_UD_CHANNEL_SETUP	0x00800
199 #define	IBD_DRV_TXLIST_ALLOCD		0x01000
200 #define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x02000
201 #define	IBD_DRV_RXLIST_ALLOCD		0x04000
202 #define	IBD_DRV_BCAST_GROUP_JOINED	0x08000
203 #define	IBD_DRV_ASYNC_THR_CREATED	0x10000
204 #define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x20000
205 #define	IBD_DRV_SM_NOTICES_REGISTERED	0x40000
206 #define	IBD_DRV_STARTED			0x80000
207 #define	IBD_DRV_RC_SRQ_ALLOCD		0x100000
208 #define	IBD_DRV_RC_LARGEBUF_ALLOCD	0x200000
209 #define	IBD_DRV_RC_LISTEN		0x400000
210 #ifdef DEBUG
211 #define	IBD_DRV_RC_PRIVATE_STATE	0x800000
212 #endif
213 
214 /*
215  * Start/stop in-progress flags; note that restart must always remain
216  * the OR of start and stop flag values.
217  */
218 #define	IBD_DRV_START_IN_PROGRESS	0x10000000
219 #define	IBD_DRV_STOP_IN_PROGRESS	0x20000000
220 #define	IBD_DRV_RESTART_IN_PROGRESS	0x30000000
221 
222 /*
223  * Miscellaneous constants
224  */
225 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
226 #define	IBD_DEF_MAX_SDU			2044
227 #define	IBD_DEFAULT_QKEY		0xB1B
228 #ifdef IBD_LOGGING
229 #define	IBD_DMAX_LINE			100
230 #endif
231 
232 /*
233  * Enumerations for link states
234  */
235 typedef enum {
236 	IBD_LINK_DOWN,
237 	IBD_LINK_UP,
238 	IBD_LINK_UP_ABSENT
239 } ibd_link_op_t;
240 
241 /*
242  * Driver State Pointer
243  */
244 void *ibd_list;
245 
246 /*
247  * Driver Global Data
248  */
249 ibd_global_state_t ibd_gstate;
250 
251 /*
252  * Logging
253  */
254 #ifdef IBD_LOGGING
255 kmutex_t ibd_lbuf_lock;
256 uint8_t *ibd_lbuf;
257 uint32_t ibd_lbuf_ndx;
258 #endif
259 
260 /*
261  * Required system entry points
262  */
263 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
264 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
265 
266 /*
267  * Required driver entry points for GLDv3
268  */
269 static int ibd_m_stat(void *, uint_t, uint64_t *);
270 static int ibd_m_start(void *);
271 static void ibd_m_stop(void *);
272 static int ibd_m_promisc(void *, boolean_t);
273 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
274 static int ibd_m_unicst(void *, const uint8_t *);
275 static mblk_t *ibd_m_tx(void *, mblk_t *);
276 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
277 
278 /*
279  * Private driver entry points for GLDv3
280  */
281 
282 /*
283  * Initialization
284  */
285 static int ibd_state_init(ibd_state_t *, dev_info_t *);
286 static int ibd_init_txlist(ibd_state_t *);
287 static int ibd_init_rxlist(ibd_state_t *);
288 static int ibd_acache_init(ibd_state_t *);
289 #ifdef IBD_LOGGING
290 static void ibd_log_init(void);
291 #endif
292 
293 /*
294  * Termination/cleanup
295  */
296 static void ibd_state_fini(ibd_state_t *);
297 static void ibd_fini_txlist(ibd_state_t *);
298 static void ibd_fini_rxlist(ibd_state_t *);
299 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
300 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
301 static void ibd_acache_fini(ibd_state_t *);
302 #ifdef IBD_LOGGING
303 static void ibd_log_fini(void);
304 #endif
305 
306 /*
307  * Allocation/acquire/map routines
308  */
309 static int ibd_alloc_tx_copybufs(ibd_state_t *);
310 static int ibd_alloc_rx_copybufs(ibd_state_t *);
311 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
312 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
313 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
314     uint32_t *);
315 
316 /*
317  * Free/release/unmap routines
318  */
319 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
320 static void ibd_free_tx_copybufs(ibd_state_t *);
321 static void ibd_free_rx_copybufs(ibd_state_t *);
322 static void ibd_free_rx_rsrcs(ibd_state_t *);
323 static void ibd_free_tx_lsobufs(ibd_state_t *);
324 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
325 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
326 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
327 
328 /*
329  * Handlers/callback routines
330  */
331 static uint_t ibd_intr(caddr_t);
332 static uint_t ibd_tx_recycle(caddr_t);
333 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
334 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
335 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
336 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
337 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
338 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
339 static void ibd_freemsg_cb(char *);
340 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
341     ibt_async_event_t *);
342 static void ibd_snet_notices_handler(void *, ib_gid_t,
343     ibt_subnet_event_code_t, ibt_subnet_event_t *);
344 
345 /*
346  * Send/receive routines
347  */
348 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
349 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
350 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
351 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
352 
353 /*
354  * Threads
355  */
356 static void ibd_async_work(ibd_state_t *);
357 
358 /*
359  * Async tasks
360  */
361 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
362 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
363 static void ibd_async_setprom(ibd_state_t *);
364 static void ibd_async_unsetprom(ibd_state_t *);
365 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
366 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
367 static void ibd_async_txsched(ibd_state_t *);
368 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
369 
370 /*
371  * Async task helpers
372  */
373 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
374 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
375 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
376 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
377     ipoib_mac_t *, ipoib_mac_t *);
378 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
379 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
380 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
381 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
382 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
383 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
384 static uint64_t ibd_get_portspeed(ibd_state_t *);
385 static boolean_t ibd_async_safe(ibd_state_t *);
386 static void ibd_async_done(ibd_state_t *);
387 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
388 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
389 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
390 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
391 
392 /*
393  * Helpers for attach/start routines
394  */
395 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
396 static int ibd_record_capab(ibd_state_t *, dev_info_t *);
397 static int ibd_unattach(ibd_state_t *, dev_info_t *);
398 static int ibd_get_port_details(ibd_state_t *);
399 static int ibd_alloc_cqs(ibd_state_t *);
400 static int ibd_setup_ud_channel(ibd_state_t *);
401 static int ibd_start(ibd_state_t *);
402 static int ibd_undo_start(ibd_state_t *, link_state_t);
403 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
404 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
405 
406 
407 /*
408  * Miscellaneous helpers
409  */
410 static int ibd_sched_poll(ibd_state_t *, int, int);
411 static void ibd_resume_transmission(ibd_state_t *);
412 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
413 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
414 static void *list_get_head(list_t *);
415 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
416 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
417 #ifdef IBD_LOGGING
418 static void ibd_log(const char *, ...);
419 #endif
420 
421 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
422     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
423 
424 /* Module Driver Info */
425 static struct modldrv ibd_modldrv = {
426 	&mod_driverops,			/* This one is a driver */
427 	"InfiniBand GLDv3 Driver",	/* short description */
428 	&ibd_dev_ops			/* driver specific ops */
429 };
430 
431 /* Module Linkage */
432 static struct modlinkage ibd_modlinkage = {
433 	MODREV_1, (void *)&ibd_modldrv, NULL
434 };
435 
436 /*
437  * Module (static) info passed to IBTL during ibt_attach
438  */
439 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
440 	IBTI_V_CURR,
441 	IBT_NETWORK,
442 	ibd_async_handler,
443 	NULL,
444 	"IPIB"
445 };
446 
447 /*
448  * GLDv3 entry points
449  */
450 #define	IBD_M_CALLBACK_FLAGS	(MC_GETCAPAB)
451 static mac_callbacks_t ibd_m_callbacks = {
452 	IBD_M_CALLBACK_FLAGS,
453 	ibd_m_stat,
454 	ibd_m_start,
455 	ibd_m_stop,
456 	ibd_m_promisc,
457 	ibd_m_multicst,
458 	ibd_m_unicst,
459 	ibd_m_tx,
460 	NULL,
461 	NULL,
462 	ibd_m_getcapab
463 };
464 
465 /*
466  * Fill/clear <scope> and <p_key> in multicast/broadcast address
467  */
468 #define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
469 {							\
470 	*(uint32_t *)((char *)(maddr) + 4) |=		\
471 	    htonl((uint32_t)(scope) << 16);		\
472 	*(uint32_t *)((char *)(maddr) + 8) |=		\
473 	    htonl((uint32_t)(pkey) << 16);		\
474 }
475 
476 #define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
477 {							\
478 	*(uint32_t *)((char *)(maddr) + 4) &=		\
479 	    htonl(~((uint32_t)0xF << 16));		\
480 	*(uint32_t *)((char *)(maddr) + 8) &=		\
481 	    htonl(~((uint32_t)0xFFFF << 16));		\
482 }
483 
484 /*
485  * Rudimentary debugging support
486  */
487 #ifdef DEBUG
488 int ibd_debuglevel = 100;
489 void
490 debug_print(int l, char *fmt, ...)
491 {
492 	va_list ap;
493 
494 	if (l < ibd_debuglevel)
495 		return;
496 	va_start(ap, fmt);
497 	vcmn_err(CE_CONT, fmt, ap);
498 	va_end(ap);
499 }
500 #endif
501 
502 /*
503  * Common routine to print warning messages; adds in hca guid, port number
504  * and pkey to be able to identify the IBA interface.
505  */
506 void
507 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
508 {
509 	ib_guid_t hca_guid;
510 	char ibd_print_buf[256];
511 	int len;
512 	va_list ap;
513 
514 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
515 	    0, "hca-guid", 0);
516 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
517 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
518 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
519 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
520 	va_start(ap, fmt);
521 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
522 	    fmt, ap);
523 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
524 	va_end(ap);
525 }
526 
527 /*
528  * Warlock directives
529  */
530 
531 /*
532  * id_lso_lock
533  *
534  * state->id_lso->bkt_nfree may be accessed without a lock to
535  * determine the threshold at which we have to ask the nw layer
536  * to resume transmission (see ibd_resume_transmission()).
537  */
538 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
539     ibd_state_t::id_lso))
540 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
541 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
542 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
543 
544 /*
545  * id_scq_poll_lock
546  */
547 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
548     ibd_state_t::id_scq_poll_busy))
549 
550 /*
551  * id_txpost_lock
552  */
553 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
554     ibd_state_t::id_tx_head))
555 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
556     ibd_state_t::id_tx_busy))
557 
558 /*
559  * id_acache_req_lock
560  */
561 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
562     ibd_state_t::id_acache_req_cv))
563 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
564     ibd_state_t::id_req_list))
565 _NOTE(SCHEME_PROTECTS_DATA("atomic",
566     ibd_acache_s::ac_ref))
567 
568 /*
569  * id_ac_mutex
570  *
571  * This mutex is actually supposed to protect id_ah_op as well,
572  * but this path of the code isn't clean (see update of id_ah_op
573  * in ibd_async_acache(), immediately after the call to
574  * ibd_async_mcache()). For now, we'll skip this check by
575  * declaring that id_ah_op is protected by some internal scheme
576  * that warlock isn't aware of.
577  */
578 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
579     ibd_state_t::id_ah_active))
580 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
581     ibd_state_t::id_ah_free))
582 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
583     ibd_state_t::id_ah_addr))
584 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
585     ibd_state_t::id_ah_op))
586 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
587     ibd_state_t::id_ah_error))
588 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
589     ibd_state_t::id_ac_hot_ace))
590 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
591 
592 /*
593  * id_mc_mutex
594  */
595 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
596     ibd_state_t::id_mc_full))
597 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
598     ibd_state_t::id_mc_non))
599 
600 /*
601  * id_trap_lock
602  */
603 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
604     ibd_state_t::id_trap_cv))
605 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
606     ibd_state_t::id_trap_stop))
607 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
608     ibd_state_t::id_trap_inprog))
609 
610 /*
611  * id_prom_op
612  */
613 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
614     ibd_state_t::id_prom_op))
615 
616 /*
617  * id_sched_lock
618  */
619 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
620     ibd_state_t::id_sched_needed))
621 
622 /*
623  * id_link_mutex
624  */
625 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
626     ibd_state_t::id_link_state))
627 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
628 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
629     ibd_state_t::id_link_speed))
630 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
631 
632 /*
633  * id_tx_list.dl_mutex
634  */
635 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
636     ibd_state_t::id_tx_list.dl_head))
637 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
638     ibd_state_t::id_tx_list.dl_pending_sends))
639 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
640     ibd_state_t::id_tx_list.dl_cnt))
641 
642 /*
643  * id_rx_list.dl_mutex
644  */
645 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
646     ibd_state_t::id_rx_list.dl_bufs_outstanding))
647 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
648     ibd_state_t::id_rx_list.dl_cnt))
649 
650 
651 /*
652  * Items protected by atomic updates
653  */
654 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
655     ibd_state_s::id_brd_rcv
656     ibd_state_s::id_brd_xmt
657     ibd_state_s::id_multi_rcv
658     ibd_state_s::id_multi_xmt
659     ibd_state_s::id_num_intrs
660     ibd_state_s::id_rcv_bytes
661     ibd_state_s::id_rcv_pkt
662     ibd_state_s::id_rx_post_queue_index
663     ibd_state_s::id_tx_short
664     ibd_state_s::id_xmt_bytes
665     ibd_state_s::id_xmt_pkt
666     ibd_state_s::rc_rcv_trans_byte
667     ibd_state_s::rc_rcv_trans_pkt
668     ibd_state_s::rc_rcv_copy_byte
669     ibd_state_s::rc_rcv_copy_pkt
670     ibd_state_s::rc_xmt_bytes
671     ibd_state_s::rc_xmt_small_pkt
672     ibd_state_s::rc_xmt_fragmented_pkt
673     ibd_state_s::rc_xmt_map_fail_pkt
674     ibd_state_s::rc_xmt_map_succ_pkt))
675 
676 /*
677  * Non-mutex protection schemes for data elements. Almost all of
678  * these are non-shared items.
679  */
680 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
681     callb_cpr
682     ib_gid_s
683     ib_header_info
684     ibd_acache_rq
685     ibd_acache_s::ac_mce
686     ibd_acache_s::ac_chan
687     ibd_mcache::mc_fullreap
688     ibd_mcache::mc_jstate
689     ibd_mcache::mc_req
690     ibd_rwqe_s
691     ibd_swqe_s
692     ibd_wqe_s
693     ibt_wr_ds_s::ds_va
694     ibt_wr_lso_s
695     ipoib_mac::ipoib_qpn
696     mac_capab_lso_s
697     msgb::b_next
698     msgb::b_cont
699     msgb::b_rptr
700     msgb::b_wptr
701     ibd_state_s::id_bgroup_created
702     ibd_state_s::id_mac_state
703     ibd_state_s::id_mtu
704     ibd_state_s::id_num_rwqe
705     ibd_state_s::id_num_swqe
706     ibd_state_s::id_qpnum
707     ibd_state_s::id_rcq_hdl
708     ibd_state_s::id_rx_buf_sz
709     ibd_state_s::id_rx_bufs
710     ibd_state_s::id_rx_mr_hdl
711     ibd_state_s::id_rx_wqes
712     ibd_state_s::id_rxwcs
713     ibd_state_s::id_rxwcs_size
714     ibd_state_s::id_rx_nqueues
715     ibd_state_s::id_rx_queues
716     ibd_state_s::id_scope
717     ibd_state_s::id_scq_hdl
718     ibd_state_s::id_tx_buf_sz
719     ibd_state_s::id_tx_bufs
720     ibd_state_s::id_tx_mr_hdl
721     ibd_state_s::id_tx_rel_list.dl_cnt
722     ibd_state_s::id_tx_wqes
723     ibd_state_s::id_txwcs
724     ibd_state_s::id_txwcs_size
725     ibd_state_s::rc_listen_hdl
726     ibd_state_s::rc_listen_hdl_OFED_interop
727     ibd_state_s::rc_srq_size
728     ibd_state_s::rc_srq_rwqes
729     ibd_state_s::rc_srq_rx_bufs
730     ibd_state_s::rc_srq_rx_mr_hdl
731     ibd_state_s::rc_tx_largebuf_desc_base
732     ibd_state_s::rc_tx_mr_bufs
733     ibd_state_s::rc_tx_mr_hdl
734     ipha_s
735     icmph_s
736     ibt_path_info_s::pi_sid
737     ibd_rc_chan_s::ace
738     ibd_rc_chan_s::chan_hdl
739     ibd_rc_chan_s::state
740     ibd_rc_chan_s::chan_state
741     ibd_rc_chan_s::is_tx_chan
742     ibd_rc_chan_s::rcq_hdl
743     ibd_rc_chan_s::rcq_size
744     ibd_rc_chan_s::scq_hdl
745     ibd_rc_chan_s::scq_size
746     ibd_rc_chan_s::requester_gid
747     ibd_rc_chan_s::requester_pkey
748     ibd_rc_chan_s::rx_bufs
749     ibd_rc_chan_s::rx_mr_hdl
750     ibd_rc_chan_s::rx_rwqes
751     ibd_rc_chan_s::tx_wqes
752     ibd_rc_chan_s::tx_mr_bufs
753     ibd_rc_chan_s::tx_mr_hdl
754     ibd_rc_chan_s::tx_rel_list.dl_cnt
755     ibd_rc_chan_s::tx_trans_error_cnt
756     ibd_rc_tx_largebuf_s::lb_buf
757     ibd_rc_msg_hello_s
758     ibt_cm_return_args_s))
759 
760 /*
761  * ibd_rc_chan_s::next is protected by two mutexes:
762  * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
763  * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
764  */
765 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
766     ibd_rc_chan_s::next))
767 
768 /*
769  * ibd_state_s.rc_tx_large_bufs_lock
770  */
771 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
772     ibd_state_s::rc_tx_largebuf_free_head))
773 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
774     ibd_state_s::rc_tx_largebuf_nfree))
775 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
776     ibd_rc_tx_largebuf_s::lb_next))
777 
778 /*
779  * ibd_acache_s.tx_too_big_mutex
780  */
781 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
782     ibd_acache_s::tx_too_big_ongoing))
783 
784 /*
785  * tx_wqe_list.dl_mutex
786  */
787 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
788     ibd_rc_chan_s::tx_wqe_list.dl_head))
789 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
790     ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
791 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
792     ibd_rc_chan_s::tx_wqe_list.dl_cnt))
793 
794 /*
795  * ibd_state_s.rc_ace_recycle_lock
796  */
797 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
798     ibd_state_s::rc_ace_recycle))
799 
800 /*
801  * rc_srq_rwqe_list.dl_mutex
802  */
803 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
804     ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
805 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
806     ibd_state_t::rc_srq_rwqe_list.dl_cnt))
807 
808 /*
809  * Non-mutex protection schemes for data elements. They are counters
810  * for problem diagnosis. Don't need be protected.
811  */
812 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
813     ibd_state_s::rc_rcv_alloc_fail
814     ibd_state_s::rc_rcq_invoke
815     ibd_state_s::rc_rcq_err
816     ibd_state_s::rc_ace_not_found
817     ibd_state_s::rc_xmt_drop_too_long_pkt
818     ibd_state_s::rc_xmt_icmp_too_long_pkt
819     ibd_state_s::rc_xmt_reenter_too_long_pkt
820     ibd_state_s::rc_swqe_short
821     ibd_state_s::rc_swqe_mac_update
822     ibd_state_s::rc_xmt_buf_short
823     ibd_state_s::rc_xmt_buf_mac_update
824     ibd_state_s::rc_scq_no_swqe
825     ibd_state_s::rc_scq_no_largebuf
826     ibd_state_s::rc_scq_invoke
827     ibd_state_s::rc_conn_succ
828     ibd_state_s::rc_conn_fail
829     ibd_state_s::rc_null_conn
830     ibd_state_s::rc_no_estab_conn
831     ibd_state_s::rc_act_close
832     ibd_state_s::rc_pas_close
833     ibd_state_s::rc_delay_ace_recycle
834     ibd_state_s::rc_act_close_simultaneous
835     ibd_state_s::rc_reset_cnt))
836 
837 #ifdef DEBUG
838 /*
839  * Non-mutex protection schemes for data elements. They are counters
840  * for problem diagnosis. Don't need be protected.
841  */
842 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
843     ibd_state_s::rc_rwqe_short
844     ibd_rc_stat_s::rc_rcv_trans_byte
845     ibd_rc_stat_s::rc_rcv_trans_pkt
846     ibd_rc_stat_s::rc_rcv_copy_byte
847     ibd_rc_stat_s::rc_rcv_copy_pkt
848     ibd_rc_stat_s::rc_rcv_alloc_fail
849     ibd_rc_stat_s::rc_rcq_invoke
850     ibd_rc_stat_s::rc_rcq_err
851     ibd_rc_stat_s::rc_scq_invoke
852     ibd_rc_stat_s::rc_rwqe_short
853     ibd_rc_stat_s::rc_xmt_bytes
854     ibd_rc_stat_s::rc_xmt_small_pkt
855     ibd_rc_stat_s::rc_xmt_fragmented_pkt
856     ibd_rc_stat_s::rc_xmt_map_fail_pkt
857     ibd_rc_stat_s::rc_xmt_map_succ_pkt
858     ibd_rc_stat_s::rc_ace_not_found
859     ibd_rc_stat_s::rc_scq_no_swqe
860     ibd_rc_stat_s::rc_scq_no_largebuf
861     ibd_rc_stat_s::rc_swqe_short
862     ibd_rc_stat_s::rc_swqe_mac_update
863     ibd_rc_stat_s::rc_xmt_buf_short
864     ibd_rc_stat_s::rc_xmt_buf_mac_update
865     ibd_rc_stat_s::rc_conn_succ
866     ibd_rc_stat_s::rc_conn_fail
867     ibd_rc_stat_s::rc_null_conn
868     ibd_rc_stat_s::rc_no_estab_conn
869     ibd_rc_stat_s::rc_act_close
870     ibd_rc_stat_s::rc_pas_close
871     ibd_rc_stat_s::rc_delay_ace_recycle
872     ibd_rc_stat_s::rc_act_close_simultaneous
873     ibd_rc_stat_s::rc_reset_cnt))
874 #endif
875 
876 int
877 _init()
878 {
879 	int status;
880 
881 	status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
882 	    PAGESIZE), 0);
883 	if (status != 0) {
884 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
885 		return (status);
886 	}
887 
888 	mac_init_ops(&ibd_dev_ops, "ibd");
889 	status = mod_install(&ibd_modlinkage);
890 	if (status != 0) {
891 		DPRINT(10, "_init:failed in mod_install()");
892 		ddi_soft_state_fini(&ibd_list);
893 		mac_fini_ops(&ibd_dev_ops);
894 		return (status);
895 	}
896 
897 	mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
898 	mutex_enter(&ibd_gstate.ig_mutex);
899 	ibd_gstate.ig_ibt_hdl = NULL;
900 	ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
901 	ibd_gstate.ig_service_list = NULL;
902 	mutex_exit(&ibd_gstate.ig_mutex);
903 
904 #ifdef IBD_LOGGING
905 	ibd_log_init();
906 #endif
907 	return (0);
908 }
909 
910 int
911 _info(struct modinfo *modinfop)
912 {
913 	return (mod_info(&ibd_modlinkage, modinfop));
914 }
915 
916 int
917 _fini()
918 {
919 	int status;
920 
921 	status = mod_remove(&ibd_modlinkage);
922 	if (status != 0)
923 		return (status);
924 
925 	mac_fini_ops(&ibd_dev_ops);
926 	ddi_soft_state_fini(&ibd_list);
927 	mutex_destroy(&ibd_gstate.ig_mutex);
928 #ifdef IBD_LOGGING
929 	ibd_log_fini();
930 #endif
931 	return (0);
932 }
933 
934 /*
935  * Convert the GID part of the mac address from network byte order
936  * to host order.
937  */
938 static void
939 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
940 {
941 	ib_sn_prefix_t nbopref;
942 	ib_guid_t nboguid;
943 
944 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
945 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
946 	dgid->gid_prefix = b2h64(nbopref);
947 	dgid->gid_guid = b2h64(nboguid);
948 }
949 
950 /*
951  * Create the IPoIB address in network byte order from host order inputs.
952  */
953 static void
954 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
955     ib_guid_t guid)
956 {
957 	ib_sn_prefix_t nbopref;
958 	ib_guid_t nboguid;
959 
960 	mac->ipoib_qpn = htonl(qpn);
961 	nbopref = h2b64(prefix);
962 	nboguid = h2b64(guid);
963 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
964 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
965 }
966 
967 /*
968  * Send to the appropriate all-routers group when the IBA multicast group
969  * does not exist, based on whether the target group is v4 or v6.
970  */
971 static boolean_t
972 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
973     ipoib_mac_t *rmac)
974 {
975 	boolean_t retval = B_TRUE;
976 	uint32_t adjscope = state->id_scope << 16;
977 	uint32_t topword;
978 
979 	/*
980 	 * Copy the first 4 bytes in without assuming any alignment of
981 	 * input mac address; this will have IPoIB signature, flags and
982 	 * scope bits.
983 	 */
984 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
985 	topword = ntohl(topword);
986 
987 	/*
988 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
989 	 */
990 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
991 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
992 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
993 		    ((uint32_t)(state->id_pkey << 16))),
994 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
995 	else
996 		/*
997 		 * Does not have proper bits in the mgid address.
998 		 */
999 		retval = B_FALSE;
1000 
1001 	return (retval);
1002 }
1003 
1004 /*
1005  * Membership states for different mcg's are tracked by two lists:
1006  * the "non" list is used for promiscuous mode, when all mcg traffic
1007  * needs to be inspected. This type of membership is never used for
1008  * transmission, so there can not be an AH in the active list
1009  * corresponding to a member in this list. This list does not need
1010  * any protection, since all operations are performed by the async
1011  * thread.
1012  *
1013  * "Full" and "SendOnly" membership is tracked using a single list,
1014  * the "full" list. This is because this single list can then be
1015  * searched during transmit to a multicast group (if an AH for the
1016  * mcg is not found in the active list), since at least one type
1017  * of membership must be present before initiating the transmit.
1018  * This list is also emptied during driver detach, since sendonly
1019  * membership acquired during transmit is dropped at detach time
1020  * along with ipv4 broadcast full membership. Insert/deletes to
1021  * this list are done only by the async thread, but it is also
1022  * searched in program context (see multicast disable case), thus
1023  * the id_mc_mutex protects the list. The driver detach path also
1024  * deconstructs the "full" list, but it ensures that the async
1025  * thread will not be accessing the list (by blocking out mcg
1026  * trap handling and making sure no more Tx reaping will happen).
1027  *
1028  * Currently, an IBA attach is done in the SendOnly case too,
1029  * although this is not required.
1030  */
1031 #define	IBD_MCACHE_INSERT_FULL(state, mce) \
1032 	list_insert_head(&state->id_mc_full, mce)
1033 #define	IBD_MCACHE_INSERT_NON(state, mce) \
1034 	list_insert_head(&state->id_mc_non, mce)
1035 #define	IBD_MCACHE_FIND_FULL(state, mgid) \
1036 	ibd_mcache_find(mgid, &state->id_mc_full)
1037 #define	IBD_MCACHE_FIND_NON(state, mgid) \
1038 	ibd_mcache_find(mgid, &state->id_mc_non)
1039 #define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
1040 	list_remove(&state->id_mc_full, mce)
1041 #define	IBD_MCACHE_PULLOUT_NON(state, mce) \
1042 	list_remove(&state->id_mc_non, mce)
1043 
1044 static void *
1045 list_get_head(list_t *list)
1046 {
1047 	list_node_t *lhead = list_head(list);
1048 
1049 	if (lhead != NULL)
1050 		list_remove(list, lhead);
1051 	return (lhead);
1052 }
1053 
1054 /*
1055  * This is always guaranteed to be able to queue the work.
1056  */
1057 void
1058 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1059 {
1060 	/* Initialize request */
1061 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1062 	ptr->rq_op = op;
1063 
1064 	/*
1065 	 * Queue provided slot onto request pool.
1066 	 */
1067 	mutex_enter(&state->id_acache_req_lock);
1068 	list_insert_tail(&state->id_req_list, ptr);
1069 
1070 	/* Go, fetch, async thread */
1071 	cv_signal(&state->id_acache_req_cv);
1072 	mutex_exit(&state->id_acache_req_lock);
1073 }
1074 
1075 /*
1076  * Main body of the per interface async thread.
1077  */
1078 static void
1079 ibd_async_work(ibd_state_t *state)
1080 {
1081 	ibd_req_t *ptr;
1082 	callb_cpr_t cprinfo;
1083 
1084 	mutex_enter(&state->id_acache_req_lock);
1085 	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1086 	    callb_generic_cpr, "ibd_async_work");
1087 
1088 	for (;;) {
1089 		ptr = list_get_head(&state->id_req_list);
1090 		if (ptr != NULL) {
1091 			mutex_exit(&state->id_acache_req_lock);
1092 
1093 			/*
1094 			 * Once we have done the operation, there is no
1095 			 * guarantee the request slot is going to be valid,
1096 			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1097 			 * TRAP).
1098 			 *
1099 			 * Perform the request.
1100 			 */
1101 			switch (ptr->rq_op) {
1102 				case IBD_ASYNC_GETAH:
1103 					ibd_async_acache(state, &ptr->rq_mac);
1104 					break;
1105 				case IBD_ASYNC_JOIN:
1106 				case IBD_ASYNC_LEAVE:
1107 					ibd_async_multicast(state,
1108 					    ptr->rq_gid, ptr->rq_op);
1109 					break;
1110 				case IBD_ASYNC_PROMON:
1111 					ibd_async_setprom(state);
1112 					break;
1113 				case IBD_ASYNC_PROMOFF:
1114 					ibd_async_unsetprom(state);
1115 					break;
1116 				case IBD_ASYNC_REAP:
1117 					ibd_async_reap_group(state,
1118 					    ptr->rq_ptr, ptr->rq_gid,
1119 					    IB_MC_JSTATE_FULL);
1120 					/*
1121 					 * the req buf contains in mce
1122 					 * structure, so we do not need
1123 					 * to free it here.
1124 					 */
1125 					ptr = NULL;
1126 					break;
1127 				case IBD_ASYNC_TRAP:
1128 					ibd_async_trap(state, ptr);
1129 					break;
1130 				case IBD_ASYNC_SCHED:
1131 					ibd_async_txsched(state);
1132 					break;
1133 				case IBD_ASYNC_LINK:
1134 					ibd_async_link(state, ptr);
1135 					break;
1136 				case IBD_ASYNC_EXIT:
1137 					mutex_enter(&state->id_acache_req_lock);
1138 #ifndef __lock_lint
1139 					CALLB_CPR_EXIT(&cprinfo);
1140 #else
1141 					mutex_exit(&state->id_acache_req_lock);
1142 #endif
1143 					return;
1144 				case IBD_ASYNC_RC_TOO_BIG:
1145 					ibd_async_rc_process_too_big(state,
1146 					    ptr);
1147 					break;
1148 				case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
1149 					ibd_async_rc_close_act_chan(state, ptr);
1150 					break;
1151 				case IBD_ASYNC_RC_RECYCLE_ACE:
1152 					ibd_async_rc_recycle_ace(state, ptr);
1153 					break;
1154 			}
1155 			if (ptr != NULL)
1156 				kmem_cache_free(state->id_req_kmc, ptr);
1157 
1158 			mutex_enter(&state->id_acache_req_lock);
1159 		} else {
1160 #ifndef __lock_lint
1161 			/*
1162 			 * Nothing to do: wait till new request arrives.
1163 			 */
1164 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1165 			cv_wait(&state->id_acache_req_cv,
1166 			    &state->id_acache_req_lock);
1167 			CALLB_CPR_SAFE_END(&cprinfo,
1168 			    &state->id_acache_req_lock);
1169 #endif
1170 		}
1171 	}
1172 
1173 	/*NOTREACHED*/
1174 	_NOTE(NOT_REACHED)
1175 }
1176 
1177 /*
1178  * Return when it is safe to queue requests to the async daemon; primarily
1179  * for subnet trap and async event handling. Disallow requests before the
1180  * daemon is created, and when interface deinitilization starts.
1181  */
1182 static boolean_t
1183 ibd_async_safe(ibd_state_t *state)
1184 {
1185 	mutex_enter(&state->id_trap_lock);
1186 	if (state->id_trap_stop) {
1187 		mutex_exit(&state->id_trap_lock);
1188 		return (B_FALSE);
1189 	}
1190 	state->id_trap_inprog++;
1191 	mutex_exit(&state->id_trap_lock);
1192 	return (B_TRUE);
1193 }
1194 
1195 /*
1196  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1197  * trap or event handling to complete to kill the async thread and deconstruct
1198  * the mcg/ace list.
1199  */
1200 static void
1201 ibd_async_done(ibd_state_t *state)
1202 {
1203 	mutex_enter(&state->id_trap_lock);
1204 	if (--state->id_trap_inprog == 0)
1205 		cv_signal(&state->id_trap_cv);
1206 	mutex_exit(&state->id_trap_lock);
1207 }
1208 
1209 /*
1210  * Hash functions:
1211  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1212  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
1213  * These operate on mac addresses input into ibd_send, but there is no
1214  * guarantee on the alignment of the ipoib_mac_t structure.
1215  */
1216 /*ARGSUSED*/
1217 static uint_t
1218 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1219 {
1220 	ulong_t ptraddr = (ulong_t)key;
1221 	uint_t hval;
1222 
1223 	/*
1224 	 * If the input address is 4 byte aligned, we can just dereference
1225 	 * it. This is most common, since IP will send in a 4 byte aligned
1226 	 * IP header, which implies the 24 byte IPoIB psuedo header will be
1227 	 * 4 byte aligned too.
1228 	 */
1229 	if ((ptraddr & 3) == 0)
1230 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1231 
1232 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1233 	return (hval);
1234 }
1235 
1236 static int
1237 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1238 {
1239 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1240 		return (0);
1241 	else
1242 		return (1);
1243 }
1244 
1245 /*
1246  * Initialize all the per interface caches and lists; AH cache,
1247  * MCG list etc.
1248  */
1249 static int
1250 ibd_acache_init(ibd_state_t *state)
1251 {
1252 	ibd_ace_t *ce;
1253 	int i;
1254 
1255 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
1256 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
1257 
1258 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1259 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1260 	mutex_enter(&state->id_ac_mutex);
1261 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1262 	    offsetof(ibd_ace_t, ac_list));
1263 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1264 	    offsetof(ibd_ace_t, ac_list));
1265 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1266 	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
1267 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1268 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1269 	    offsetof(ibd_mce_t, mc_list));
1270 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1271 	    offsetof(ibd_mce_t, mc_list));
1272 	list_create(&state->id_req_list, sizeof (ibd_req_t),
1273 	    offsetof(ibd_req_t, rq_list));
1274 	state->id_ac_hot_ace = NULL;
1275 
1276 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1277 	    IBD_NUM_AH, KM_SLEEP);
1278 	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
1279 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1280 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1281 			mutex_exit(&state->id_ac_mutex);
1282 			ibd_acache_fini(state);
1283 			return (DDI_FAILURE);
1284 		} else {
1285 			CLEAR_REFCYCLE(ce);
1286 			ce->ac_mce = NULL;
1287 			mutex_init(&ce->tx_too_big_mutex, NULL,
1288 			    MUTEX_DRIVER, NULL);
1289 			IBD_ACACHE_INSERT_FREE(state, ce);
1290 		}
1291 	}
1292 	mutex_exit(&state->id_ac_mutex);
1293 	return (DDI_SUCCESS);
1294 }
1295 
1296 static void
1297 ibd_acache_fini(ibd_state_t *state)
1298 {
1299 	ibd_ace_t *ptr;
1300 
1301 	mutex_enter(&state->id_ac_mutex);
1302 
1303 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1304 		ASSERT(GET_REF(ptr) == 0);
1305 		mutex_destroy(&ptr->tx_too_big_mutex);
1306 		(void) ibt_free_ud_dest(ptr->ac_dest);
1307 	}
1308 
1309 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1310 		ASSERT(GET_REF(ptr) == 0);
1311 		mutex_destroy(&ptr->tx_too_big_mutex);
1312 		(void) ibt_free_ud_dest(ptr->ac_dest);
1313 	}
1314 
1315 	list_destroy(&state->id_ah_free);
1316 	list_destroy(&state->id_ah_active);
1317 	list_destroy(&state->id_mc_full);
1318 	list_destroy(&state->id_mc_non);
1319 	list_destroy(&state->id_req_list);
1320 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
1321 	mutex_exit(&state->id_ac_mutex);
1322 	mutex_destroy(&state->id_ac_mutex);
1323 	mutex_destroy(&state->id_mc_mutex);
1324 	mutex_destroy(&state->id_acache_req_lock);
1325 	cv_destroy(&state->id_acache_req_cv);
1326 }
1327 
1328 /*
1329  * Search AH active hash list for a cached path to input destination.
1330  * If we are "just looking", hold == F. When we are in the Tx path,
1331  * we set hold == T to grab a reference on the AH so that it can not
1332  * be recycled to a new destination while the Tx request is posted.
1333  */
1334 ibd_ace_t *
1335 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1336 {
1337 	ibd_ace_t *ptr;
1338 
1339 	ASSERT(mutex_owned(&state->id_ac_mutex));
1340 
1341 	/*
1342 	 * Do hash search.
1343 	 */
1344 	if (mod_hash_find(state->id_ah_active_hash,
1345 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1346 		if (hold)
1347 			INC_REF(ptr, num);
1348 		return (ptr);
1349 	}
1350 	return (NULL);
1351 }
1352 
1353 /*
1354  * This is called by the tx side; if an initialized AH is found in
1355  * the active list, it is locked down and can be used; if no entry
1356  * is found, an async request is queued to do path resolution.
1357  */
1358 static ibd_ace_t *
1359 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1360 {
1361 	ibd_ace_t *ptr;
1362 	ibd_req_t *req;
1363 
1364 	/*
1365 	 * Only attempt to print when we can; in the mdt pattr case, the
1366 	 * address is not aligned properly.
1367 	 */
1368 	if (((ulong_t)mac & 3) == 0) {
1369 		DPRINT(4,
1370 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1371 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1372 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1373 		    htonl(mac->ipoib_gidsuff[1]));
1374 	}
1375 
1376 	mutex_enter(&state->id_ac_mutex);
1377 
1378 	if (((ptr = state->id_ac_hot_ace) != NULL) &&
1379 	    (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1380 		INC_REF(ptr, numwqe);
1381 		mutex_exit(&state->id_ac_mutex);
1382 		return (ptr);
1383 	}
1384 	if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1385 		state->id_ac_hot_ace = ptr;
1386 		mutex_exit(&state->id_ac_mutex);
1387 		return (ptr);
1388 	}
1389 
1390 	/*
1391 	 * Implementation of a single outstanding async request; if
1392 	 * the operation is not started yet, queue a request and move
1393 	 * to ongoing state. Remember in id_ah_addr for which address
1394 	 * we are queueing the request, in case we need to flag an error;
1395 	 * Any further requests, for the same or different address, until
1396 	 * the operation completes, is sent back to GLDv3 to be retried.
1397 	 * The async thread will update id_ah_op with an error indication
1398 	 * or will set it to indicate the next look up can start; either
1399 	 * way, it will mac_tx_update() so that all blocked requests come
1400 	 * back here.
1401 	 */
1402 	*err = EAGAIN;
1403 	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1404 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1405 		if (req != NULL) {
1406 			/*
1407 			 * We did not even find the entry; queue a request
1408 			 * for it.
1409 			 */
1410 			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1411 			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1412 			state->id_ah_op = IBD_OP_ONGOING;
1413 			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1414 		}
1415 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1416 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1417 		/*
1418 		 * Check the status of the pathrecord lookup request
1419 		 * we had queued before.
1420 		 */
1421 		if (state->id_ah_op == IBD_OP_ERRORED) {
1422 			*err = EFAULT;
1423 			state->id_ah_error++;
1424 		} else {
1425 			/*
1426 			 * IBD_OP_ROUTERED case: We need to send to the
1427 			 * all-router MCG. If we can find the AH for
1428 			 * the mcg, the Tx will be attempted. If we
1429 			 * do not find the AH, we return NORESOURCES
1430 			 * to retry.
1431 			 */
1432 			ipoib_mac_t routermac;
1433 
1434 			(void) ibd_get_allroutergroup(state, mac, &routermac);
1435 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
1436 			    numwqe);
1437 		}
1438 		state->id_ah_op = IBD_OP_NOTSTARTED;
1439 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1440 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1441 		/*
1442 		 * This case can happen when we get a higher band
1443 		 * packet. The easiest way is to reset the state machine
1444 		 * to accommodate the higher priority packet.
1445 		 */
1446 		state->id_ah_op = IBD_OP_NOTSTARTED;
1447 	}
1448 	mutex_exit(&state->id_ac_mutex);
1449 
1450 	return (ptr);
1451 }
1452 
1453 /*
1454  * Grab a not-currently-in-use AH/PathRecord from the active
1455  * list to recycle to a new destination. Only the async thread
1456  * executes this code.
1457  */
1458 static ibd_ace_t *
1459 ibd_acache_get_unref(ibd_state_t *state)
1460 {
1461 	ibd_ace_t *ptr = list_tail(&state->id_ah_active);
1462 	boolean_t try_rc_chan_recycle = B_FALSE;
1463 
1464 	ASSERT(mutex_owned(&state->id_ac_mutex));
1465 
1466 	/*
1467 	 * Do plain linear search.
1468 	 */
1469 	while (ptr != NULL) {
1470 		/*
1471 		 * Note that it is possible that the "cycle" bit
1472 		 * is set on the AH w/o any reference count. The
1473 		 * mcg must have been deleted, and the tx cleanup
1474 		 * just decremented the reference count to 0, but
1475 		 * hasn't gotten around to grabbing the id_ac_mutex
1476 		 * to move the AH into the free list.
1477 		 */
1478 		if (GET_REF(ptr) == 0) {
1479 			if (ptr->ac_chan != NULL) {
1480 				ASSERT(state->id_enable_rc == B_TRUE);
1481 				if (!try_rc_chan_recycle) {
1482 					try_rc_chan_recycle = B_TRUE;
1483 					ibd_rc_signal_ace_recycle(state, ptr);
1484 				}
1485 			} else {
1486 				IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1487 				break;
1488 			}
1489 		}
1490 		ptr = list_prev(&state->id_ah_active, ptr);
1491 	}
1492 	return (ptr);
1493 }
1494 
1495 /*
1496  * Invoked to clean up AH from active list in case of multicast
1497  * disable and to handle sendonly memberships during mcg traps.
1498  * And for port up processing for multicast and unicast AHs.
1499  * Normally, the AH is taken off the active list, and put into
1500  * the free list to be recycled for a new destination. In case
1501  * Tx requests on the AH have not completed yet, the AH is marked
1502  * for reaping (which will put the AH on the free list) once the Tx's
1503  * complete; in this case, depending on the "force" input, we take
1504  * out the AH from the active list right now, or leave it also for
1505  * the reap operation. Returns TRUE if the AH is taken off the active
1506  * list (and either put into the free list right now, or arranged for
1507  * later), FALSE otherwise.
1508  */
1509 boolean_t
1510 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1511 {
1512 	ibd_ace_t *acactive;
1513 	boolean_t ret = B_TRUE;
1514 
1515 	ASSERT(mutex_owned(&state->id_ac_mutex));
1516 
1517 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1518 
1519 		/*
1520 		 * Note that the AH might already have the cycle bit set
1521 		 * on it; this might happen if sequences of multicast
1522 		 * enables and disables are coming so fast, that posted
1523 		 * Tx's to the mcg have not completed yet, and the cycle
1524 		 * bit is set successively by each multicast disable.
1525 		 */
1526 		if (SET_CYCLE_IF_REF(acactive)) {
1527 			if (!force) {
1528 				/*
1529 				 * The ace is kept on the active list, further
1530 				 * Tx's can still grab a reference on it; the
1531 				 * ace is reaped when all pending Tx's
1532 				 * referencing the AH complete.
1533 				 */
1534 				ret = B_FALSE;
1535 			} else {
1536 				/*
1537 				 * In the mcg trap case, we always pull the
1538 				 * AH from the active list. And also the port
1539 				 * up multi/unicast case.
1540 				 */
1541 				ASSERT(acactive->ac_chan == NULL);
1542 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1543 				acactive->ac_mce = NULL;
1544 			}
1545 		} else {
1546 			/*
1547 			 * Determined the ref count is 0, thus reclaim
1548 			 * immediately after pulling out the ace from
1549 			 * the active list.
1550 			 */
1551 			ASSERT(acactive->ac_chan == NULL);
1552 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1553 			acactive->ac_mce = NULL;
1554 			IBD_ACACHE_INSERT_FREE(state, acactive);
1555 		}
1556 
1557 	}
1558 	return (ret);
1559 }
1560 
1561 /*
1562  * Helper function for async path record lookup. If we are trying to
1563  * Tx to a MCG, check our membership, possibly trying to join the
1564  * group if required. If that fails, try to send the packet to the
1565  * all router group (indicated by the redirect output), pointing
1566  * the input mac address to the router mcg address.
1567  */
1568 static ibd_mce_t *
1569 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1570 {
1571 	ib_gid_t mgid;
1572 	ibd_mce_t *mce;
1573 	ipoib_mac_t routermac;
1574 
1575 	*redirect = B_FALSE;
1576 	ibd_n2h_gid(mac, &mgid);
1577 
1578 	/*
1579 	 * Check the FullMember+SendOnlyNonMember list.
1580 	 * Since we are the only one who manipulates the
1581 	 * id_mc_full list, no locks are needed.
1582 	 */
1583 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1584 	if (mce != NULL) {
1585 		DPRINT(4, "ibd_async_mcache : already joined to group");
1586 		return (mce);
1587 	}
1588 
1589 	/*
1590 	 * Not found; try to join(SendOnlyNonMember) and attach.
1591 	 */
1592 	DPRINT(4, "ibd_async_mcache : not joined to group");
1593 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1594 	    NULL) {
1595 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1596 		return (mce);
1597 	}
1598 
1599 	/*
1600 	 * MCGroup not present; try to join the all-router group. If
1601 	 * any of the following steps succeed, we will be redirecting
1602 	 * to the all router group.
1603 	 */
1604 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1605 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1606 		return (NULL);
1607 	*redirect = B_TRUE;
1608 	ibd_n2h_gid(&routermac, &mgid);
1609 	bcopy(&routermac, mac, IPOIB_ADDRL);
1610 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1611 	    mgid.gid_prefix, mgid.gid_guid);
1612 
1613 	/*
1614 	 * Are we already joined to the router group?
1615 	 */
1616 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1617 		DPRINT(4, "ibd_async_mcache : using already joined router"
1618 		    "group\n");
1619 		return (mce);
1620 	}
1621 
1622 	/*
1623 	 * Can we join(SendOnlyNonMember) the router group?
1624 	 */
1625 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1626 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1627 	    NULL) {
1628 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1629 		return (mce);
1630 	}
1631 
1632 	return (NULL);
1633 }
1634 
1635 /*
1636  * Async path record lookup code.
1637  */
1638 static void
1639 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1640 {
1641 	ibd_ace_t *ce;
1642 	ibd_mce_t *mce = NULL;
1643 	ibt_path_attr_t path_attr;
1644 	ibt_path_info_t path_info;
1645 	ib_gid_t destgid;
1646 	char ret = IBD_OP_NOTSTARTED;
1647 
1648 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1649 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1650 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1651 	    htonl(mac->ipoib_gidsuff[1]));
1652 
1653 	/*
1654 	 * Check whether we are trying to transmit to a MCG.
1655 	 * In that case, we need to make sure we are a member of
1656 	 * the MCG.
1657 	 */
1658 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1659 		boolean_t redirected;
1660 
1661 		/*
1662 		 * If we can not find or join the group or even
1663 		 * redirect, error out.
1664 		 */
1665 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1666 		    NULL) {
1667 			state->id_ah_op = IBD_OP_ERRORED;
1668 			return;
1669 		}
1670 
1671 		/*
1672 		 * If we got redirected, we need to determine whether
1673 		 * the AH for the new mcg is in the cache already, and
1674 		 * not pull it in then; otherwise proceed to get the
1675 		 * path for the new mcg. There is no guarantee that
1676 		 * if the AH is currently in the cache, it will still be
1677 		 * there when we look in ibd_acache_lookup(), but that's
1678 		 * okay, we will come back here.
1679 		 */
1680 		if (redirected) {
1681 			ret = IBD_OP_ROUTERED;
1682 			DPRINT(4, "ibd_async_acache :  redirected to "
1683 			    "%08X:%08X:%08X:%08X:%08X",
1684 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1685 			    htonl(mac->ipoib_gidpref[1]),
1686 			    htonl(mac->ipoib_gidsuff[0]),
1687 			    htonl(mac->ipoib_gidsuff[1]));
1688 
1689 			mutex_enter(&state->id_ac_mutex);
1690 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1691 				state->id_ah_op = IBD_OP_ROUTERED;
1692 				mutex_exit(&state->id_ac_mutex);
1693 				DPRINT(4, "ibd_async_acache : router AH found");
1694 				return;
1695 			}
1696 			mutex_exit(&state->id_ac_mutex);
1697 		}
1698 	}
1699 
1700 	/*
1701 	 * Get an AH from the free list.
1702 	 */
1703 	mutex_enter(&state->id_ac_mutex);
1704 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1705 		/*
1706 		 * No free ones; try to grab an unreferenced active
1707 		 * one. Maybe we need to make the active list LRU,
1708 		 * but that will create more work for Tx callbacks.
1709 		 * Is there a way of not having to pull out the
1710 		 * entry from the active list, but just indicate it
1711 		 * is being recycled? Yes, but that creates one more
1712 		 * check in the fast lookup path.
1713 		 */
1714 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1715 			/*
1716 			 * Pretty serious shortage now.
1717 			 */
1718 			state->id_ah_op = IBD_OP_NOTSTARTED;
1719 			mutex_exit(&state->id_ac_mutex);
1720 			DPRINT(10, "ibd_async_acache : failed to find AH "
1721 			    "slot\n");
1722 			return;
1723 		}
1724 		/*
1725 		 * We could check whether ac_mce points to a SendOnly
1726 		 * member and drop that membership now. Or do it lazily
1727 		 * at detach time.
1728 		 */
1729 		ce->ac_mce = NULL;
1730 	}
1731 	mutex_exit(&state->id_ac_mutex);
1732 	ASSERT(ce->ac_mce == NULL);
1733 
1734 	/*
1735 	 * Update the entry.
1736 	 */
1737 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1738 
1739 	bzero(&path_info, sizeof (path_info));
1740 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1741 	path_attr.pa_sgid = state->id_sgid;
1742 	path_attr.pa_num_dgids = 1;
1743 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1744 	path_attr.pa_dgids = &destgid;
1745 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1746 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
1747 	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
1748 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1749 		goto error;
1750 	}
1751 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1752 	    ntohl(ce->ac_mac.ipoib_qpn),
1753 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1754 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1755 		goto error;
1756 	}
1757 
1758 	/*
1759 	 * mce is set whenever an AH is being associated with a
1760 	 * MCG; this will come in handy when we leave the MCG. The
1761 	 * lock protects Tx fastpath from scanning the active list.
1762 	 */
1763 	if (mce != NULL)
1764 		ce->ac_mce = mce;
1765 
1766 	/*
1767 	 * initiate a RC mode connection for unicast address
1768 	 */
1769 	if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
1770 	    (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
1771 		ASSERT(ce->ac_chan == NULL);
1772 		DPRINT(10, "ibd_async_acache: call "
1773 		    "ibd_rc_try_connect(ace=%p)", ce);
1774 		ibd_rc_try_connect(state, ce, &path_info);
1775 		if (ce->ac_chan == NULL) {
1776 			DPRINT(10, "ibd_async_acache: fail to setup RC"
1777 			    " channel");
1778 			state->rc_conn_fail++;
1779 			goto error;
1780 		}
1781 	}
1782 
1783 	mutex_enter(&state->id_ac_mutex);
1784 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1785 	state->id_ah_op = ret;
1786 	mutex_exit(&state->id_ac_mutex);
1787 	return;
1788 error:
1789 	/*
1790 	 * We might want to drop SendOnly membership here if we
1791 	 * joined above. The lock protects Tx callbacks inserting
1792 	 * into the free list.
1793 	 */
1794 	mutex_enter(&state->id_ac_mutex);
1795 	state->id_ah_op = IBD_OP_ERRORED;
1796 	IBD_ACACHE_INSERT_FREE(state, ce);
1797 	mutex_exit(&state->id_ac_mutex);
1798 }
1799 
1800 /*
1801  * While restoring port's presence on the subnet on a port up, it is possible
1802  * that the port goes down again.
1803  */
1804 static void
1805 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1806 {
1807 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1808 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1809 	    LINK_STATE_UP;
1810 	ibd_mce_t *mce, *pmce;
1811 	ibd_ace_t *ace, *pace;
1812 
1813 	DPRINT(10, "ibd_async_link(): %d", opcode);
1814 
1815 	/*
1816 	 * On a link up, revalidate the link speed/width. No point doing
1817 	 * this on a link down, since we will be unable to do SA operations,
1818 	 * defaulting to the lowest speed. Also notice that we update our
1819 	 * notion of speed before calling mac_link_update(), which will do
1820 	 * necessary higher level notifications for speed changes.
1821 	 */
1822 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1823 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1824 		state->id_link_speed = ibd_get_portspeed(state);
1825 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1826 	}
1827 
1828 	/*
1829 	 * Do all the work required to establish our presence on
1830 	 * the subnet.
1831 	 */
1832 	if (opcode == IBD_LINK_UP_ABSENT) {
1833 		/*
1834 		 * If in promiscuous mode ...
1835 		 */
1836 		if (state->id_prom_op == IBD_OP_COMPLETED) {
1837 			/*
1838 			 * Drop all nonmembership.
1839 			 */
1840 			ibd_async_unsetprom(state);
1841 
1842 			/*
1843 			 * Then, try to regain nonmembership to all mcg's.
1844 			 */
1845 			ibd_async_setprom(state);
1846 
1847 		}
1848 
1849 		/*
1850 		 * Drop all sendonly membership (which also gets rid of the
1851 		 * AHs); try to reacquire all full membership.
1852 		 */
1853 		mce = list_head(&state->id_mc_full);
1854 		while ((pmce = mce) != NULL) {
1855 			mce = list_next(&state->id_mc_full, mce);
1856 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1857 				ibd_leave_group(state,
1858 				    pmce->mc_info.mc_adds_vect.av_dgid,
1859 				    IB_MC_JSTATE_SEND_ONLY_NON);
1860 			else
1861 				ibd_reacquire_group(state, pmce);
1862 		}
1863 
1864 		/*
1865 		 * Recycle all active AHs to free list (and if there are
1866 		 * pending posts, make sure they will go into the free list
1867 		 * once the Tx's complete). Grab the lock to prevent
1868 		 * concurrent Tx's as well as Tx cleanups.
1869 		 */
1870 		mutex_enter(&state->id_ac_mutex);
1871 		ace = list_head(&state->id_ah_active);
1872 		while ((pace = ace) != NULL) {
1873 			boolean_t cycled;
1874 
1875 			ace = list_next(&state->id_ah_active, ace);
1876 			mce = pace->ac_mce;
1877 			if (pace->ac_chan != NULL) {
1878 				ASSERT(mce == NULL);
1879 				ASSERT(state->id_enable_rc == B_TRUE);
1880 				if (pace->ac_chan->chan_state ==
1881 				    IBD_RC_STATE_ACT_ESTAB) {
1882 					INC_REF(pace, 1);
1883 					IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
1884 					pace->ac_chan->chan_state =
1885 					    IBD_RC_STATE_ACT_CLOSING;
1886 					ibd_rc_signal_act_close(state, pace);
1887 				} else {
1888 					state->rc_act_close_simultaneous++;
1889 					DPRINT(40, "ibd_async_link: other "
1890 					    "thread is closing it, ace=%p, "
1891 					    "ac_chan=%p, chan_state=%d",
1892 					    pace, pace->ac_chan,
1893 					    pace->ac_chan->chan_state);
1894 				}
1895 			} else {
1896 				cycled = ibd_acache_recycle(state,
1897 				    &pace->ac_mac, B_TRUE);
1898 			}
1899 			/*
1900 			 * If this is for an mcg, it must be for a fullmember,
1901 			 * since we got rid of send-only members above when
1902 			 * processing the mce list.
1903 			 */
1904 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
1905 			    IB_MC_JSTATE_FULL)));
1906 
1907 			/*
1908 			 * Check if the fullmember mce needs to be torn down,
1909 			 * ie whether the DLPI disable has already been done.
1910 			 * If so, do some of the work of tx_cleanup, namely
1911 			 * causing leave (which will fail), detach and
1912 			 * mce-freeing. tx_cleanup will put the AH into free
1913 			 * list. The reason to duplicate some of this
1914 			 * tx_cleanup work is because we want to delete the
1915 			 * AH right now instead of waiting for tx_cleanup, to
1916 			 * force subsequent Tx's to reacquire an AH.
1917 			 */
1918 			if ((mce != NULL) && (mce->mc_fullreap))
1919 				ibd_async_reap_group(state, mce,
1920 				    mce->mc_info.mc_adds_vect.av_dgid,
1921 				    mce->mc_jstate);
1922 		}
1923 		mutex_exit(&state->id_ac_mutex);
1924 	}
1925 
1926 	/*
1927 	 * mac handle is guaranteed to exist since driver does ibt_close_hca()
1928 	 * (which stops further events from being delivered) before
1929 	 * mac_unregister(). At this point, it is guaranteed that mac_register
1930 	 * has already been done.
1931 	 */
1932 	mutex_enter(&state->id_link_mutex);
1933 	state->id_link_state = lstate;
1934 	mac_link_update(state->id_mh, lstate);
1935 	mutex_exit(&state->id_link_mutex);
1936 
1937 	ibd_async_done(state);
1938 }
1939 
1940 /*
1941  * Check the pkey table to see if we can find the pkey we're looking for.
1942  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
1943  * failure.
1944  */
1945 static int
1946 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
1947     uint16_t *pkix)
1948 {
1949 	uint16_t ndx;
1950 
1951 	ASSERT(pkix != NULL);
1952 
1953 	for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
1954 		if (pkey_tbl[ndx] == pkey) {
1955 			*pkix = ndx;
1956 			return (0);
1957 		}
1958 	}
1959 	return (-1);
1960 }
1961 
1962 /*
1963  * When the link is notified up, we need to do a few things, based
1964  * on the port's current p_init_type_reply claiming a reinit has been
1965  * done or not. The reinit steps are:
1966  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
1967  *    the old Pkey and GID0 are correct.
1968  * 2. Register for mcg traps (already done by ibmf).
1969  * 3. If PreservePresenceReply indicates the SM has restored port's presence
1970  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
1971  * 4. Give up all sendonly memberships.
1972  * 5. Acquire all full memberships.
1973  * 6. In promiscuous mode, acquire all non memberships.
1974  * 7. Recycle all AHs to free list.
1975  */
1976 static void
1977 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
1978 {
1979 	ibt_hca_portinfo_t *port_infop = NULL;
1980 	ibt_status_t ibt_status;
1981 	uint_t psize, port_infosz;
1982 	ibd_link_op_t opcode;
1983 	ibd_req_t *req;
1984 	link_state_t new_link_state = LINK_STATE_UP;
1985 	uint8_t itreply;
1986 	uint16_t pkix;
1987 	int ret;
1988 
1989 	/*
1990 	 * Let's not race with a plumb or an unplumb; if we detect a
1991 	 * pkey relocation event later on here, we may have to restart.
1992 	 */
1993 	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
1994 
1995 	mutex_enter(&state->id_link_mutex);
1996 
1997 	/*
1998 	 * If the init code in ibd_m_start hasn't yet set up the
1999 	 * pkey/gid, nothing to do; that code will set the link state.
2000 	 */
2001 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
2002 		mutex_exit(&state->id_link_mutex);
2003 		goto link_mod_return;
2004 	}
2005 
2006 	/*
2007 	 * If this routine was called in response to a port down event,
2008 	 * we just need to see if this should be informed.
2009 	 */
2010 	if (code == IBT_ERROR_PORT_DOWN) {
2011 		new_link_state = LINK_STATE_DOWN;
2012 		goto update_link_state;
2013 	}
2014 
2015 	/*
2016 	 * If it's not a port down event we've received, try to get the port
2017 	 * attributes first. If we fail here, the port is as good as down.
2018 	 * Otherwise, if the link went down by the time the handler gets
2019 	 * here, give up - we cannot even validate the pkey/gid since those
2020 	 * are not valid and this is as bad as a port down anyway.
2021 	 */
2022 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2023 	    &port_infop, &psize, &port_infosz);
2024 	if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2025 	    (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2026 		new_link_state = LINK_STATE_DOWN;
2027 		goto update_link_state;
2028 	}
2029 
2030 	/*
2031 	 * Check the SM InitTypeReply flags. If both NoLoadReply and
2032 	 * PreserveContentReply are 0, we don't know anything about the
2033 	 * data loaded into the port attributes, so we need to verify
2034 	 * if gid0 and pkey are still valid.
2035 	 */
2036 	itreply = port_infop->p_init_type_reply;
2037 	if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2038 	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2039 		/*
2040 		 * Check to see if the subnet part of GID0 has changed. If
2041 		 * not, check the simple case first to see if the pkey
2042 		 * index is the same as before; finally check to see if the
2043 		 * pkey has been relocated to a different index in the table.
2044 		 */
2045 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
2046 		if (bcmp(port_infop->p_sgid_tbl,
2047 		    &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2048 
2049 			new_link_state = LINK_STATE_DOWN;
2050 
2051 		} else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2052 		    state->id_pkey) {
2053 
2054 			new_link_state = LINK_STATE_UP;
2055 
2056 		} else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2057 		    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2058 
2059 			ibt_free_portinfo(port_infop, port_infosz);
2060 			mutex_exit(&state->id_link_mutex);
2061 
2062 			/*
2063 			 * Currently a restart is required if our pkey has moved
2064 			 * in the pkey table. If we get the ibt_recycle_ud() to
2065 			 * work as documented (expected), we may be able to
2066 			 * avoid a complete restart.  Note that we've already
2067 			 * marked both the start and stop 'in-progress' flags,
2068 			 * so it is ok to go ahead and do this restart.
2069 			 */
2070 			(void) ibd_undo_start(state, LINK_STATE_DOWN);
2071 			if ((ret = ibd_start(state)) != 0) {
2072 				DPRINT(10, "ibd_restart: cannot restart, "
2073 				    "ret=%d", ret);
2074 			}
2075 
2076 			goto link_mod_return;
2077 		} else {
2078 			new_link_state = LINK_STATE_DOWN;
2079 		}
2080 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
2081 	}
2082 
2083 update_link_state:
2084 	if (port_infop) {
2085 		ibt_free_portinfo(port_infop, port_infosz);
2086 	}
2087 
2088 	/*
2089 	 * If we're reporting a link up, check InitTypeReply to see if
2090 	 * the SM has ensured that the port's presence in mcg, traps,
2091 	 * etc. is intact.
2092 	 */
2093 	if (new_link_state == LINK_STATE_DOWN) {
2094 		opcode = IBD_LINK_DOWN;
2095 	} else {
2096 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2097 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2098 			opcode = IBD_LINK_UP;
2099 		} else {
2100 			opcode = IBD_LINK_UP_ABSENT;
2101 		}
2102 	}
2103 
2104 	/*
2105 	 * If the old state is the same as the new state, and the SM indicated
2106 	 * no change in the port parameters, nothing to do.
2107 	 */
2108 	if ((state->id_link_state == new_link_state) && (opcode !=
2109 	    IBD_LINK_UP_ABSENT)) {
2110 		mutex_exit(&state->id_link_mutex);
2111 		goto link_mod_return;
2112 	}
2113 
2114 	/*
2115 	 * Ok, so there was a link state change; see if it's safe to ask
2116 	 * the async thread to do the work
2117 	 */
2118 	if (!ibd_async_safe(state)) {
2119 		state->id_link_state = new_link_state;
2120 		mutex_exit(&state->id_link_mutex);
2121 		goto link_mod_return;
2122 	}
2123 
2124 	mutex_exit(&state->id_link_mutex);
2125 
2126 	/*
2127 	 * Queue up a request for ibd_async_link() to handle this link
2128 	 * state change event
2129 	 */
2130 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2131 	req->rq_ptr = (void *)opcode;
2132 	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2133 
2134 link_mod_return:
2135 	ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2136 }
2137 
2138 /*
2139  * For the port up/down events, IBTL guarantees there will not be concurrent
2140  * invocations of the handler. IBTL might coalesce link transition events,
2141  * and not invoke the handler for _each_ up/down transition, but it will
2142  * invoke the handler with last known state
2143  */
2144 static void
2145 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2146     ibt_async_code_t code, ibt_async_event_t *event)
2147 {
2148 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2149 
2150 	switch (code) {
2151 	case IBT_ERROR_CATASTROPHIC_CHAN:
2152 		ibd_print_warn(state, "catastrophic channel error");
2153 		break;
2154 	case IBT_ERROR_CQ:
2155 		ibd_print_warn(state, "completion queue error");
2156 		break;
2157 	case IBT_PORT_CHANGE_EVENT:
2158 		/*
2159 		 * Events will be delivered to all instances that have
2160 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2161 		 * Only need to do work for our port; IBTF will deliver
2162 		 * events for other ports on the hca we have ibt_open_hca'ed
2163 		 * too. Note that id_port is initialized in ibd_attach()
2164 		 * before we do an ibt_open_hca() in ibd_attach().
2165 		 */
2166 		ASSERT(state->id_hca_hdl == hca_hdl);
2167 		if (state->id_port != event->ev_port)
2168 			break;
2169 
2170 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2171 		    IBT_PORT_CHANGE_PKEY) {
2172 			ibd_link_mod(state, code);
2173 		}
2174 		break;
2175 	case IBT_ERROR_PORT_DOWN:
2176 	case IBT_CLNT_REREG_EVENT:
2177 	case IBT_EVENT_PORT_UP:
2178 		/*
2179 		 * Events will be delivered to all instances that have
2180 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2181 		 * Only need to do work for our port; IBTF will deliver
2182 		 * events for other ports on the hca we have ibt_open_hca'ed
2183 		 * too. Note that id_port is initialized in ibd_attach()
2184 		 * before we do an ibt_open_hca() in ibd_attach().
2185 		 */
2186 		ASSERT(state->id_hca_hdl == hca_hdl);
2187 		if (state->id_port != event->ev_port)
2188 			break;
2189 
2190 		ibd_link_mod(state, code);
2191 		break;
2192 
2193 	case IBT_HCA_ATTACH_EVENT:
2194 	case IBT_HCA_DETACH_EVENT:
2195 		/*
2196 		 * When a new card is plugged to the system, attach_event is
2197 		 * invoked. Additionally, a cfgadm needs to be run to make the
2198 		 * card known to the system, and an ifconfig needs to be run to
2199 		 * plumb up any ibd interfaces on the card. In the case of card
2200 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2201 		 * unplumb the ibd interfaces on the card; when the card is
2202 		 * actually unplugged, the detach_event is invoked;
2203 		 * additionally, if any ibd instances are still active on the
2204 		 * card (eg there were no associated RCM scripts), driver's
2205 		 * detach routine is invoked.
2206 		 */
2207 		break;
2208 	default:
2209 		break;
2210 	}
2211 }
2212 
2213 static int
2214 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2215 {
2216 	mac_register_t *macp;
2217 	int ret;
2218 
2219 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2220 		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2221 		return (DDI_FAILURE);
2222 	}
2223 
2224 	/*
2225 	 * Note that when we register with mac during attach, we don't
2226 	 * have the id_macaddr yet, so we'll simply be registering a
2227 	 * zero macaddr that we'll overwrite later during plumb (in
2228 	 * ibd_m_start()). Similar is the case with id_mtu - we'll
2229 	 * update the mac layer with the correct mtu during plumb.
2230 	 */
2231 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2232 	macp->m_driver = state;
2233 	macp->m_dip = dip;
2234 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2235 	macp->m_callbacks = &ibd_m_callbacks;
2236 	macp->m_min_sdu = 0;
2237 	if (state->id_enable_rc) {
2238 		macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
2239 	} else {
2240 		macp->m_max_sdu = IBD_DEF_MAX_SDU;
2241 	}
2242 
2243 	/*
2244 	 *  Register ourselves with the GLDv3 interface
2245 	 */
2246 	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2247 		mac_free(macp);
2248 		DPRINT(10,
2249 		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
2250 		return (DDI_FAILURE);
2251 	}
2252 
2253 	mac_free(macp);
2254 	return (DDI_SUCCESS);
2255 }
2256 
2257 static int
2258 ibd_record_capab(ibd_state_t *state, dev_info_t *dip)
2259 {
2260 	ibt_hca_attr_t hca_attrs;
2261 	ibt_status_t ibt_status;
2262 
2263 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2264 
2265 	/*
2266 	 * Query the HCA and fetch its attributes
2267 	 */
2268 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2269 	ASSERT(ibt_status == IBT_SUCCESS);
2270 
2271 	/*
2272 	 * 1. Set the Hardware Checksum capability. Currently we only consider
2273 	 *    full checksum offload.
2274 	 */
2275 	if (state->id_enable_rc) {
2276 			state->id_hwcksum_capab = 0;
2277 	} else {
2278 		if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
2279 		    == IBT_HCA_CKSUM_FULL) {
2280 			state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2281 		}
2282 	}
2283 
2284 	/*
2285 	 * 2. Set LSO policy, capability and maximum length
2286 	 */
2287 	if (state->id_enable_rc) {
2288 		state->id_lso_policy = B_FALSE;
2289 		state->id_lso_capable = B_FALSE;
2290 		state->id_lso_maxlen = 0;
2291 	} else {
2292 		if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS
2293 		    |DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) {
2294 			state->id_lso_policy = B_TRUE;
2295 		} else {
2296 			state->id_lso_policy = B_FALSE;
2297 		}
2298 
2299 		if (hca_attrs.hca_max_lso_size > 0) {
2300 			state->id_lso_capable = B_TRUE;
2301 			if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2302 				state->id_lso_maxlen = IBD_LSO_MAXLEN;
2303 			else
2304 				state->id_lso_maxlen =
2305 				    hca_attrs.hca_max_lso_size;
2306 		} else {
2307 			state->id_lso_capable = B_FALSE;
2308 			state->id_lso_maxlen = 0;
2309 		}
2310 	}
2311 
2312 	/*
2313 	 * 3. Set Reserved L_Key capability
2314 	 */
2315 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2316 		state->id_hca_res_lkey_capab = 1;
2317 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2318 		state->rc_enable_iov_map = B_TRUE;
2319 	} else {
2320 		/* If no reserved lkey, we will not use ibt_map_mem_iov */
2321 		state->rc_enable_iov_map = B_FALSE;
2322 	}
2323 
2324 	/*
2325 	 * 4. Set maximum sqseg value after checking to see if extended sgl
2326 	 *    size information is provided by the hca
2327 	 */
2328 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2329 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2330 		state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
2331 	} else {
2332 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
2333 		state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
2334 	}
2335 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2336 		state->id_max_sqseg = IBD_MAX_SQSEG;
2337 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2338 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2339 		    state->id_max_sqseg, IBD_MAX_SQSEG);
2340 	}
2341 	if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
2342 		state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
2343 	} else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
2344 		ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
2345 		    "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
2346 	}
2347 
2348 	/*
2349 	 * Translating the virtual address regions into physical regions
2350 	 * for using the Reserved LKey feature results in a wr sgl that
2351 	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
2352 	 * we'll fix a high-water mark (65%) for when we should stop.
2353 	 */
2354 	state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2355 	state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;
2356 
2357 	/*
2358 	 * 5. Set number of recv and send wqes after checking hca maximum
2359 	 *    channel size
2360 	 */
2361 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) {
2362 		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
2363 	} else {
2364 		state->id_num_rwqe = IBD_NUM_RWQE;
2365 	}
2366 	state->id_rx_bufs_outstanding_limit = state->id_num_rwqe - IBD_RWQE_MIN;
2367 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) {
2368 		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
2369 	} else {
2370 		state->id_num_swqe = IBD_NUM_SWQE;
2371 	}
2372 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2373 
2374 	return (DDI_SUCCESS);
2375 }
2376 
2377 static int
2378 ibd_unattach(ibd_state_t *state, dev_info_t *dip)
2379 {
2380 	int instance;
2381 	uint32_t progress = state->id_mac_state;
2382 	ibt_status_t ret;
2383 
2384 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
2385 		cmn_err(CE_CONT, "ibd_detach: failed: rx bufs outstanding\n");
2386 		return (DDI_FAILURE);
2387 	}
2388 
2389 	if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) {
2390 		cmn_err(CE_CONT, "ibd_detach: failed: srq bufs outstanding\n");
2391 		return (DDI_FAILURE);
2392 	}
2393 
2394 	/* make sure rx resources are freed */
2395 	ibd_free_rx_rsrcs(state);
2396 
2397 	if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
2398 		ASSERT(state->id_enable_rc);
2399 		ibd_rc_fini_srq_list(state);
2400 		state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
2401 	}
2402 
2403 	if (progress & IBD_DRV_MAC_REGISTERED) {
2404 		(void) mac_unregister(state->id_mh);
2405 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2406 	}
2407 
2408 	if (progress & IBD_DRV_PD_ALLOCD) {
2409 		if ((ret = ibt_free_pd(state->id_hca_hdl,
2410 		    state->id_pd_hdl)) != IBT_SUCCESS) {
2411 			ibd_print_warn(state, "failed to free "
2412 			    "protection domain, ret=%d", ret);
2413 		}
2414 		state->id_pd_hdl = NULL;
2415 		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2416 	}
2417 
2418 	if (progress & IBD_DRV_HCA_OPENED) {
2419 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2420 		    IBT_SUCCESS) {
2421 			ibd_print_warn(state, "failed to close "
2422 			    "HCA device, ret=%d", ret);
2423 		}
2424 		state->id_hca_hdl = NULL;
2425 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2426 	}
2427 
2428 	mutex_enter(&ibd_gstate.ig_mutex);
2429 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2430 		if ((ret = ibt_detach(state->id_ibt_hdl)) !=
2431 		    IBT_SUCCESS) {
2432 			ibd_print_warn(state,
2433 			    "ibt_detach() failed, ret=%d", ret);
2434 		}
2435 		state->id_ibt_hdl = NULL;
2436 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2437 		ibd_gstate.ig_ibt_hdl_ref_cnt--;
2438 	}
2439 	if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
2440 	    (ibd_gstate.ig_ibt_hdl != NULL)) {
2441 		if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
2442 		    IBT_SUCCESS) {
2443 			ibd_print_warn(state, "ibt_detach(): global "
2444 			    "failed, ret=%d", ret);
2445 		}
2446 		ibd_gstate.ig_ibt_hdl = NULL;
2447 	}
2448 	mutex_exit(&ibd_gstate.ig_mutex);
2449 
2450 	if (progress & IBD_DRV_TXINTR_ADDED) {
2451 		ddi_remove_softintr(state->id_tx);
2452 		state->id_tx = NULL;
2453 		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2454 	}
2455 
2456 	if (progress & IBD_DRV_RXINTR_ADDED) {
2457 		ddi_remove_softintr(state->id_rx);
2458 		state->id_rx = NULL;
2459 		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2460 	}
2461 
2462 #ifdef DEBUG
2463 	if (progress & IBD_DRV_RC_PRIVATE_STATE) {
2464 		kstat_delete(state->rc_ksp);
2465 		state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
2466 	}
2467 #endif
2468 
2469 	if (progress & IBD_DRV_STATE_INITIALIZED) {
2470 		ibd_state_fini(state);
2471 		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2472 	}
2473 
2474 	instance = ddi_get_instance(dip);
2475 	ddi_soft_state_free(ibd_list, instance);
2476 
2477 	return (DDI_SUCCESS);
2478 }
2479 
2480 /*
2481  * Attach device to the IO framework.
2482  */
2483 static int
2484 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2485 {
2486 	ibd_state_t *state = NULL;
2487 	ib_guid_t hca_guid;
2488 	int instance;
2489 	ibt_status_t ret;
2490 	int rv;
2491 
2492 	/*
2493 	 * IBD doesn't support suspend/resume
2494 	 */
2495 	if (cmd != DDI_ATTACH)
2496 		return (DDI_FAILURE);
2497 
2498 	/*
2499 	 * Allocate softstate structure
2500 	 */
2501 	instance = ddi_get_instance(dip);
2502 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
2503 		return (DDI_FAILURE);
2504 	state = ddi_get_soft_state(ibd_list, instance);
2505 
2506 	/*
2507 	 * Initialize mutexes and condition variables
2508 	 */
2509 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2510 		DPRINT(10, "ibd_attach: failed in ibd_state_init()");
2511 		goto attach_fail;
2512 	}
2513 	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2514 
2515 	/*
2516 	 * Allocate rx,tx softintr
2517 	 */
2518 	if (ibd_rx_softintr == 1) {
2519 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2520 		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2521 			DPRINT(10, "ibd_attach: failed in "
2522 			    "ddi_add_softintr(id_rx),  ret=%d", rv);
2523 			goto attach_fail;
2524 		}
2525 		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2526 	}
2527 	if (ibd_tx_softintr == 1) {
2528 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2529 		    NULL, NULL, ibd_tx_recycle,
2530 		    (caddr_t)state)) != DDI_SUCCESS) {
2531 			DPRINT(10, "ibd_attach: failed in "
2532 			    "ddi_add_softintr(id_tx), ret=%d", rv);
2533 			goto attach_fail;
2534 		}
2535 		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2536 	}
2537 
2538 	/*
2539 	 * Obtain IBA P_Key, port number and HCA guid and validate
2540 	 * them (for P_Key, only full members are allowed as per
2541 	 * IPoIB specification; neither port number nor HCA guid
2542 	 * can be zero)
2543 	 */
2544 	if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2545 	    "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) {
2546 		DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)",
2547 		    state->id_pkey);
2548 		goto attach_fail;
2549 	}
2550 	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2551 	    "port-number", 0)) == 0) {
2552 		DPRINT(10, "ibd_attach: invalid port number (%d)",
2553 		    state->id_port);
2554 		goto attach_fail;
2555 	}
2556 	if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
2557 	    "hca-guid", 0)) == 0) {
2558 		DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)",
2559 		    hca_guid);
2560 		goto attach_fail;
2561 	}
2562 
2563 	/*
2564 	 * Attach to IBTL
2565 	 */
2566 	mutex_enter(&ibd_gstate.ig_mutex);
2567 	if (ibd_gstate.ig_ibt_hdl == NULL) {
2568 		if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2569 		    &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
2570 			DPRINT(10, "ibd_attach: global: failed in "
2571 			    "ibt_attach(), ret=%d", ret);
2572 			mutex_exit(&ibd_gstate.ig_mutex);
2573 			goto attach_fail;
2574 		}
2575 	}
2576 	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2577 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
2578 		DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d",
2579 		    ret);
2580 		mutex_exit(&ibd_gstate.ig_mutex);
2581 		goto attach_fail;
2582 	}
2583 	ibd_gstate.ig_ibt_hdl_ref_cnt++;
2584 	mutex_exit(&ibd_gstate.ig_mutex);
2585 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2586 
2587 	/*
2588 	 * Open the HCA
2589 	 */
2590 	if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid,
2591 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
2592 		DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret);
2593 		goto attach_fail;
2594 	}
2595 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
2596 
2597 	/* Get RC config before ibd_record_capab */
2598 	ibd_rc_get_conf(state);
2599 
2600 #ifdef DEBUG
2601 	/* Initialize Driver Counters for Reliable Connected Mode */
2602 	if (state->id_enable_rc) {
2603 		if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
2604 			DPRINT(10, "ibd_attach: failed in ibd_rc_init_stats");
2605 			goto attach_fail;
2606 		}
2607 		state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
2608 	}
2609 #endif
2610 
2611 	/*
2612 	 * Record capabilities
2613 	 */
2614 	(void) ibd_record_capab(state, dip);
2615 
2616 	/*
2617 	 * Allocate a protection domain on the HCA
2618 	 */
2619 	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2620 	    &state->id_pd_hdl)) != IBT_SUCCESS) {
2621 		DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret);
2622 		goto attach_fail;
2623 	}
2624 	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2625 
2626 
2627 	/*
2628 	 * Register ibd interfaces with the Nemo framework
2629 	 */
2630 	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
2631 		DPRINT(10, "ibd_attach: failed in ibd_register_mac()");
2632 		goto attach_fail;
2633 	}
2634 	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
2635 
2636 	/*
2637 	 * We're done with everything we could to make the attach
2638 	 * succeed.  All the buffer allocations and IPoIB broadcast
2639 	 * group joins are deferred to when the interface instance
2640 	 * is actually plumbed to avoid wasting memory.
2641 	 */
2642 	return (DDI_SUCCESS);
2643 
2644 attach_fail:
2645 	(void) ibd_unattach(state, dip);
2646 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2647 	return (DDI_FAILURE);
2648 }
2649 
2650 /*
2651  * Detach device from the IO framework.
2652  */
2653 static int
2654 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2655 {
2656 	ibd_state_t *state;
2657 	int instance;
2658 
2659 	/*
2660 	 * IBD doesn't support suspend/resume
2661 	 */
2662 	if (cmd != DDI_DETACH)
2663 		return (DDI_FAILURE);
2664 
2665 	/*
2666 	 * Get the instance softstate
2667 	 */
2668 	instance = ddi_get_instance(dip);
2669 	state = ddi_get_soft_state(ibd_list, instance);
2670 
2671 	/*
2672 	 * Release all resources we're holding still.  Note that if we'd
2673 	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2674 	 * so far, we should find all the flags we need in id_mac_state.
2675 	 */
2676 	return (ibd_unattach(state, dip));
2677 }
2678 
2679 /*
2680  * Pre ibt_attach() driver initialization
2681  */
2682 static int
2683 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2684 {
2685 	char buf[64];
2686 
2687 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2688 	state->id_link_state = LINK_STATE_UNKNOWN;
2689 
2690 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2691 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2692 	state->id_trap_stop = B_TRUE;
2693 	state->id_trap_inprog = 0;
2694 
2695 	mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2696 	mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2697 	state->id_dip = dip;
2698 
2699 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2700 
2701 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2702 	mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2703 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2704 	state->id_tx_busy = 0;
2705 	mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2706 
2707 	state->id_rx_list.dl_bufs_outstanding = 0;
2708 	state->id_rx_list.dl_cnt = 0;
2709 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2710 	mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2711 	(void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip));
2712 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2713 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2714 
2715 	mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
2716 	cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
2717 
2718 	/* For Reliable Connected Mode */
2719 	mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
2720 	mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
2721 	mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2722 	mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2723 	mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
2724 	    MUTEX_DRIVER, NULL);
2725 
2726 	return (DDI_SUCCESS);
2727 }
2728 
2729 /*
2730  * Post ibt_detach() driver deconstruction
2731  */
2732 static void
2733 ibd_state_fini(ibd_state_t *state)
2734 {
2735 	cv_destroy(&state->id_macst_cv);
2736 	mutex_destroy(&state->id_macst_lock);
2737 
2738 	kmem_cache_destroy(state->id_req_kmc);
2739 
2740 	mutex_destroy(&state->id_rx_list.dl_mutex);
2741 	mutex_destroy(&state->id_rx_free_list.dl_mutex);
2742 
2743 	mutex_destroy(&state->id_txpost_lock);
2744 	mutex_destroy(&state->id_tx_list.dl_mutex);
2745 	mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2746 	mutex_destroy(&state->id_lso_lock);
2747 
2748 	mutex_destroy(&state->id_sched_lock);
2749 	mutex_destroy(&state->id_scq_poll_lock);
2750 	mutex_destroy(&state->id_rcq_poll_lock);
2751 
2752 	cv_destroy(&state->id_trap_cv);
2753 	mutex_destroy(&state->id_trap_lock);
2754 	mutex_destroy(&state->id_link_mutex);
2755 
2756 	/* For Reliable Connected Mode */
2757 	mutex_destroy(&state->rc_srq_free_list.dl_mutex);
2758 	mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
2759 	mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
2760 	mutex_destroy(&state->rc_tx_large_bufs_lock);
2761 	mutex_destroy(&state->rc_rx_lock);
2762 }
2763 
2764 /*
2765  * Fetch link speed from SA for snmp ifspeed reporting.
2766  */
2767 static uint64_t
2768 ibd_get_portspeed(ibd_state_t *state)
2769 {
2770 	int			ret;
2771 	ibt_path_info_t		path;
2772 	ibt_path_attr_t		path_attr;
2773 	uint8_t			num_paths;
2774 	uint64_t		ifspeed;
2775 
2776 	/*
2777 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2778 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2779 	 * 2000000000. Start with that as default.
2780 	 */
2781 	ifspeed = 2000000000;
2782 
2783 	bzero(&path_attr, sizeof (path_attr));
2784 
2785 	/*
2786 	 * Get the port speed from Loopback path information.
2787 	 */
2788 	path_attr.pa_dgids = &state->id_sgid;
2789 	path_attr.pa_num_dgids = 1;
2790 	path_attr.pa_sgid = state->id_sgid;
2791 
2792 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2793 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2794 		goto earlydone;
2795 
2796 	if (num_paths < 1)
2797 		goto earlydone;
2798 
2799 	/*
2800 	 * In case SA does not return an expected value, report the default
2801 	 * speed as 1X.
2802 	 */
2803 	ret = 1;
2804 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
2805 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
2806 			ret = 1;
2807 			break;
2808 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
2809 			ret = 4;
2810 			break;
2811 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
2812 			ret = 12;
2813 			break;
2814 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
2815 			ret = 2;
2816 			break;
2817 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
2818 			ret = 8;
2819 			break;
2820 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
2821 			ret = 16;
2822 			break;
2823 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
2824 			ret = 24;
2825 			break;
2826 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
2827 			ret = 32;
2828 			break;
2829 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
2830 			ret = 48;
2831 			break;
2832 	}
2833 
2834 	ifspeed *= ret;
2835 
2836 earlydone:
2837 	return (ifspeed);
2838 }
2839 
2840 /*
2841  * Search input mcg list (id_mc_full or id_mc_non) for an entry
2842  * representing the input mcg mgid.
2843  */
2844 static ibd_mce_t *
2845 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
2846 {
2847 	ibd_mce_t *ptr = list_head(mlist);
2848 
2849 	/*
2850 	 * Do plain linear search.
2851 	 */
2852 	while (ptr != NULL) {
2853 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
2854 		    sizeof (ib_gid_t)) == 0)
2855 			return (ptr);
2856 		ptr = list_next(mlist, ptr);
2857 	}
2858 	return (NULL);
2859 }
2860 
2861 /*
2862  * Execute IBA JOIN.
2863  */
2864 static ibt_status_t
2865 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
2866 {
2867 	ibt_mcg_attr_t mcg_attr;
2868 
2869 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2870 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
2871 	mcg_attr.mc_mgid = mgid;
2872 	mcg_attr.mc_join_state = mce->mc_jstate;
2873 	mcg_attr.mc_scope = state->id_scope;
2874 	mcg_attr.mc_pkey = state->id_pkey;
2875 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
2876 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2877 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
2878 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
2879 	    NULL, NULL));
2880 }
2881 
2882 /*
2883  * This code JOINs the port in the proper way (depending on the join
2884  * state) so that IBA fabric will forward mcg packets to/from the port.
2885  * It also attaches the QPN to the mcg so it can receive those mcg
2886  * packets. This code makes sure not to attach the mcg to the QP if
2887  * that has been previously done due to the mcg being joined with a
2888  * different join state, even though this is not required by SWG_0216,
2889  * refid 3610.
2890  */
2891 static ibd_mce_t *
2892 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2893 {
2894 	ibt_status_t ibt_status;
2895 	ibd_mce_t *mce, *tmce, *omce = NULL;
2896 	boolean_t do_attach = B_TRUE;
2897 
2898 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
2899 	    jstate, mgid.gid_prefix, mgid.gid_guid);
2900 
2901 	/*
2902 	 * For enable_multicast Full member joins, we need to do some
2903 	 * extra work. If there is already an mce on the list that
2904 	 * indicates full membership, that means the membership has
2905 	 * not yet been dropped (since the disable_multicast was issued)
2906 	 * because there are pending Tx's to the mcg; in that case, just
2907 	 * mark the mce not to be reaped when the Tx completion queues
2908 	 * an async reap operation.
2909 	 *
2910 	 * If there is already an mce on the list indicating sendonly
2911 	 * membership, try to promote to full membership. Be careful
2912 	 * not to deallocate the old mce, since there might be an AH
2913 	 * pointing to it; instead, update the old mce with new data
2914 	 * that tracks the full membership.
2915 	 */
2916 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
2917 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
2918 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
2919 			ASSERT(omce->mc_fullreap);
2920 			omce->mc_fullreap = B_FALSE;
2921 			return (omce);
2922 		} else {
2923 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
2924 		}
2925 	}
2926 
2927 	/*
2928 	 * Allocate the ibd_mce_t to track this JOIN.
2929 	 */
2930 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
2931 	mce->mc_fullreap = B_FALSE;
2932 	mce->mc_jstate = jstate;
2933 
2934 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
2935 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
2936 		    ibt_status);
2937 		kmem_free(mce, sizeof (ibd_mce_t));
2938 		return (NULL);
2939 	}
2940 
2941 	/*
2942 	 * Is an IBA attach required? Not if the interface is already joined
2943 	 * to the mcg in a different appropriate join state.
2944 	 */
2945 	if (jstate == IB_MC_JSTATE_NON) {
2946 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2947 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2948 			do_attach = B_FALSE;
2949 	} else if (jstate == IB_MC_JSTATE_FULL) {
2950 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2951 			do_attach = B_FALSE;
2952 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2953 		do_attach = B_FALSE;
2954 	}
2955 
2956 	if (do_attach) {
2957 		/*
2958 		 * Do the IBA attach.
2959 		 */
2960 		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
2961 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
2962 		    &mce->mc_info)) != IBT_SUCCESS) {
2963 			DPRINT(10, "ibd_join_group : failed qp attachment "
2964 			    "%d\n", ibt_status);
2965 			/*
2966 			 * NOTE that we should probably preserve the join info
2967 			 * in the list and later try to leave again at detach
2968 			 * time.
2969 			 */
2970 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2971 			    state->id_sgid, jstate);
2972 			kmem_free(mce, sizeof (ibd_mce_t));
2973 			return (NULL);
2974 		}
2975 	}
2976 
2977 	/*
2978 	 * Insert the ibd_mce_t in the proper list.
2979 	 */
2980 	if (jstate == IB_MC_JSTATE_NON) {
2981 		IBD_MCACHE_INSERT_NON(state, mce);
2982 	} else {
2983 		/*
2984 		 * Set up the mc_req fields used for reaping the
2985 		 * mcg in case of delayed tx completion (see
2986 		 * ibd_tx_cleanup()). Also done for sendonly join in
2987 		 * case we are promoted to fullmembership later and
2988 		 * keep using the same mce.
2989 		 */
2990 		mce->mc_req.rq_gid = mgid;
2991 		mce->mc_req.rq_ptr = mce;
2992 		/*
2993 		 * Check whether this is the case of trying to join
2994 		 * full member, and we were already joined send only.
2995 		 * We try to drop our SendOnly membership, but it is
2996 		 * possible that the mcg does not exist anymore (and
2997 		 * the subnet trap never reached us), so the leave
2998 		 * operation might fail.
2999 		 */
3000 		if (omce != NULL) {
3001 			(void) ibt_leave_mcg(state->id_sgid, mgid,
3002 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
3003 			omce->mc_jstate = IB_MC_JSTATE_FULL;
3004 			bcopy(&mce->mc_info, &omce->mc_info,
3005 			    sizeof (ibt_mcg_info_t));
3006 			kmem_free(mce, sizeof (ibd_mce_t));
3007 			return (omce);
3008 		}
3009 		mutex_enter(&state->id_mc_mutex);
3010 		IBD_MCACHE_INSERT_FULL(state, mce);
3011 		mutex_exit(&state->id_mc_mutex);
3012 	}
3013 
3014 	return (mce);
3015 }
3016 
3017 /*
3018  * Called during port up event handling to attempt to reacquire full
3019  * membership to an mcg. Stripped down version of ibd_join_group().
3020  * Note that it is possible that the mcg might have gone away, and
3021  * gets recreated at this point.
3022  */
3023 static void
3024 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
3025 {
3026 	ib_gid_t mgid;
3027 
3028 	/*
3029 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
3030 	 * reap/leave is going to try to leave the group. We could prevent
3031 	 * that by adding a boolean flag into ibd_mce_t, if required.
3032 	 */
3033 	if (mce->mc_fullreap)
3034 		return;
3035 
3036 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
3037 
3038 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
3039 	    mgid.gid_guid);
3040 
3041 	/* While reacquiring, leave and then join the MCG */
3042 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid,
3043 	    mce->mc_jstate);
3044 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
3045 		ibd_print_warn(state, "Failure on port up to rejoin "
3046 		    "multicast gid %016llx:%016llx",
3047 		    (u_longlong_t)mgid.gid_prefix,
3048 		    (u_longlong_t)mgid.gid_guid);
3049 }
3050 
3051 /*
3052  * This code handles delayed Tx completion cleanups for mcg's to which
3053  * disable_multicast has been issued, regular mcg related cleanups during
3054  * disable_multicast, disable_promiscuous and mcg traps, as well as
3055  * cleanups during driver detach time. Depending on the join state,
3056  * it deletes the mce from the appropriate list and issues the IBA
3057  * leave/detach; except in the disable_multicast case when the mce
3058  * is left on the active list for a subsequent Tx completion cleanup.
3059  */
3060 static void
3061 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
3062     uint8_t jstate)
3063 {
3064 	ibd_mce_t *tmce;
3065 	boolean_t do_detach = B_TRUE;
3066 
3067 	/*
3068 	 * Before detaching, we must check whether the other list
3069 	 * contains the mcg; if we detach blindly, the consumer
3070 	 * who set up the other list will also stop receiving
3071 	 * traffic.
3072 	 */
3073 	if (jstate == IB_MC_JSTATE_FULL) {
3074 		/*
3075 		 * The following check is only relevant while coming
3076 		 * from the Tx completion path in the reap case.
3077 		 */
3078 		if (!mce->mc_fullreap)
3079 			return;
3080 		mutex_enter(&state->id_mc_mutex);
3081 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3082 		mutex_exit(&state->id_mc_mutex);
3083 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3084 			do_detach = B_FALSE;
3085 	} else if (jstate == IB_MC_JSTATE_NON) {
3086 		IBD_MCACHE_PULLOUT_NON(state, mce);
3087 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3088 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3089 			do_detach = B_FALSE;
3090 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3091 		mutex_enter(&state->id_mc_mutex);
3092 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3093 		mutex_exit(&state->id_mc_mutex);
3094 		do_detach = B_FALSE;
3095 	}
3096 
3097 	/*
3098 	 * If we are reacting to a mcg trap and leaving our sendonly or
3099 	 * non membership, the mcg is possibly already gone, so attempting
3100 	 * to leave might fail. On the other hand, we must try to leave
3101 	 * anyway, since this might be a trap from long ago, and we could
3102 	 * have potentially sendonly joined to a recent incarnation of
3103 	 * the mcg and are about to loose track of this information.
3104 	 */
3105 	if (do_detach) {
3106 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3107 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3108 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3109 	}
3110 
3111 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3112 	kmem_free(mce, sizeof (ibd_mce_t));
3113 }
3114 
3115 /*
3116  * Async code executed due to multicast and promiscuous disable requests
3117  * and mcg trap handling; also executed during driver detach. Mostly, a
3118  * leave and detach is done; except for the fullmember case when Tx
3119  * requests are pending, whence arrangements are made for subsequent
3120  * cleanup on Tx completion.
3121  */
3122 static void
3123 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3124 {
3125 	ipoib_mac_t mcmac;
3126 	boolean_t recycled;
3127 	ibd_mce_t *mce;
3128 
3129 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3130 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3131 
3132 	if (jstate == IB_MC_JSTATE_NON) {
3133 		recycled = B_TRUE;
3134 		mce = IBD_MCACHE_FIND_NON(state, mgid);
3135 		/*
3136 		 * In case we are handling a mcg trap, we might not find
3137 		 * the mcg in the non list.
3138 		 */
3139 		if (mce == NULL) {
3140 			return;
3141 		}
3142 	} else {
3143 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
3144 
3145 		/*
3146 		 * In case we are handling a mcg trap, make sure the trap
3147 		 * is not arriving late; if we have an mce that indicates
3148 		 * that we are already a fullmember, that would be a clear
3149 		 * indication that the trap arrived late (ie, is for a
3150 		 * previous incarnation of the mcg).
3151 		 */
3152 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3153 			if ((mce == NULL) || (mce->mc_jstate ==
3154 			    IB_MC_JSTATE_FULL)) {
3155 				return;
3156 			}
3157 		} else {
3158 			ASSERT(jstate == IB_MC_JSTATE_FULL);
3159 
3160 			/*
3161 			 * If join group failed, mce will be NULL here.
3162 			 * This is because in GLDv3 driver, set multicast
3163 			 *  will always return success.
3164 			 */
3165 			if (mce == NULL) {
3166 				return;
3167 			}
3168 
3169 			mce->mc_fullreap = B_TRUE;
3170 		}
3171 
3172 		/*
3173 		 * If no pending Tx's remain that reference the AH
3174 		 * for the mcg, recycle it from active to free list.
3175 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3176 		 * so the last completing Tx will cause an async reap
3177 		 * operation to be invoked, at which time we will drop our
3178 		 * membership to the mcg so that the pending Tx's complete
3179 		 * successfully. Refer to comments on "AH and MCE active
3180 		 * list manipulation" at top of this file. The lock protects
3181 		 * against Tx fast path and Tx cleanup code.
3182 		 */
3183 		mutex_enter(&state->id_ac_mutex);
3184 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3185 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3186 		    IB_MC_JSTATE_SEND_ONLY_NON));
3187 		mutex_exit(&state->id_ac_mutex);
3188 	}
3189 
3190 	if (recycled) {
3191 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
3192 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3193 		ibd_async_reap_group(state, mce, mgid, jstate);
3194 	}
3195 }
3196 
3197 /*
3198  * Find the broadcast address as defined by IPoIB; implicitly
3199  * determines the IBA scope, mtu, tclass etc of the link the
3200  * interface is going to be a member of.
3201  */
3202 static ibt_status_t
3203 ibd_find_bgroup(ibd_state_t *state)
3204 {
3205 	ibt_mcg_attr_t mcg_attr;
3206 	uint_t numg;
3207 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3208 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3209 	    IB_MC_SCOPE_GLOBAL };
3210 	int i, mcgmtu;
3211 	boolean_t found = B_FALSE;
3212 	int ret;
3213 	ibt_mcg_info_t mcg_info;
3214 
3215 	state->id_bgroup_created = B_FALSE;
3216 
3217 query_bcast_grp:
3218 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3219 	mcg_attr.mc_pkey = state->id_pkey;
3220 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3221 	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3222 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3223 
3224 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3225 		state->id_scope = mcg_attr.mc_scope = scopes[i];
3226 
3227 		/*
3228 		 * Look for the IPoIB broadcast group.
3229 		 */
3230 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3231 		state->id_mgid.gid_prefix =
3232 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3233 		    ((uint64_t)state->id_scope << 48) |
3234 		    ((uint32_t)(state->id_pkey << 16)));
3235 		mcg_attr.mc_mgid = state->id_mgid;
3236 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3237 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3238 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3239 			found = B_TRUE;
3240 			break;
3241 		}
3242 	}
3243 
3244 	if (!found) {
3245 		if (ibd_create_broadcast_group) {
3246 			/*
3247 			 * If we created the broadcast group, but failed to
3248 			 * find it, we can't do anything except leave the
3249 			 * one we created and return failure.
3250 			 */
3251 			if (state->id_bgroup_created) {
3252 				ibd_print_warn(state, "IPoIB broadcast group "
3253 				    "absent. Unable to query after create.");
3254 				goto find_bgroup_fail;
3255 			}
3256 
3257 			/*
3258 			 * Create the ipoib broadcast group if it didn't exist
3259 			 */
3260 			bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3261 			mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3262 			mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3263 			mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3264 			mcg_attr.mc_pkey = state->id_pkey;
3265 			mcg_attr.mc_flow = 0;
3266 			mcg_attr.mc_sl = 0;
3267 			mcg_attr.mc_tclass = 0;
3268 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3269 			state->id_mgid.gid_prefix =
3270 			    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3271 			    ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3272 			    ((uint32_t)(state->id_pkey << 16)));
3273 			mcg_attr.mc_mgid = state->id_mgid;
3274 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3275 
3276 			if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3277 			    &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3278 				ibd_print_warn(state, "IPoIB broadcast group "
3279 				    "absent, create failed: ret = %d\n", ret);
3280 				state->id_bgroup_created = B_FALSE;
3281 				return (IBT_FAILURE);
3282 			}
3283 			state->id_bgroup_created = B_TRUE;
3284 			goto query_bcast_grp;
3285 		} else {
3286 			ibd_print_warn(state, "IPoIB broadcast group absent");
3287 			return (IBT_FAILURE);
3288 		}
3289 	}
3290 
3291 	/*
3292 	 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
3293 	 */
3294 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3295 	if (state->id_mtu < mcgmtu) {
3296 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3297 		    "greater than port's maximum MTU %d", mcgmtu,
3298 		    state->id_mtu);
3299 		ibt_free_mcg_info(state->id_mcinfo, 1);
3300 		goto find_bgroup_fail;
3301 	}
3302 	state->id_mtu = mcgmtu;
3303 
3304 	return (IBT_SUCCESS);
3305 
3306 find_bgroup_fail:
3307 	if (state->id_bgroup_created) {
3308 		(void) ibt_leave_mcg(state->id_sgid,
3309 		    mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3310 		    IB_MC_JSTATE_FULL);
3311 	}
3312 
3313 	return (IBT_FAILURE);
3314 }
3315 
3316 static int
3317 ibd_alloc_tx_copybufs(ibd_state_t *state)
3318 {
3319 	ibt_mr_attr_t mem_attr;
3320 
3321 	/*
3322 	 * Allocate one big chunk for all regular tx copy bufs
3323 	 */
3324 	state->id_tx_buf_sz = state->id_mtu;
3325 	if (state->id_lso_policy && state->id_lso_capable &&
3326 	    (IBD_TX_BUF_SZ > state->id_mtu)) {
3327 		state->id_tx_buf_sz = IBD_TX_BUF_SZ;
3328 	}
3329 
3330 	state->id_tx_bufs = kmem_zalloc(state->id_num_swqe *
3331 	    state->id_tx_buf_sz, KM_SLEEP);
3332 
3333 	state->id_tx_wqes = kmem_zalloc(state->id_num_swqe *
3334 	    sizeof (ibd_swqe_t), KM_SLEEP);
3335 
3336 	/*
3337 	 * Do one memory registration on the entire txbuf area
3338 	 */
3339 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3340 	mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz;
3341 	mem_attr.mr_as = NULL;
3342 	mem_attr.mr_flags = IBT_MR_SLEEP;
3343 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3344 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3345 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3346 		kmem_free(state->id_tx_wqes,
3347 		    state->id_num_swqe * sizeof (ibd_swqe_t));
3348 		kmem_free(state->id_tx_bufs,
3349 		    state->id_num_swqe * state->id_tx_buf_sz);
3350 		state->id_tx_bufs = NULL;
3351 		return (DDI_FAILURE);
3352 	}
3353 
3354 	return (DDI_SUCCESS);
3355 }
3356 
3357 static int
3358 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3359 {
3360 	ibt_mr_attr_t mem_attr;
3361 	ibd_lsobuf_t *buflist;
3362 	ibd_lsobuf_t *lbufp;
3363 	ibd_lsobuf_t *tail;
3364 	ibd_lsobkt_t *bktp;
3365 	uint8_t *membase;
3366 	uint8_t *memp;
3367 	uint_t memsz;
3368 	int i;
3369 
3370 	/*
3371 	 * Allocate the lso bucket
3372 	 */
3373 	bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3374 
3375 	/*
3376 	 * Allocate the entire lso memory and register it
3377 	 */
3378 	memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ;
3379 	membase = kmem_zalloc(memsz, KM_SLEEP);
3380 
3381 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3382 	mem_attr.mr_len = memsz;
3383 	mem_attr.mr_as = NULL;
3384 	mem_attr.mr_flags = IBT_MR_SLEEP;
3385 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3386 	    &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3387 		DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3388 		kmem_free(membase, memsz);
3389 		kmem_free(bktp, sizeof (ibd_lsobkt_t));
3390 		return (DDI_FAILURE);
3391 	}
3392 
3393 	mutex_enter(&state->id_lso_lock);
3394 
3395 	/*
3396 	 * Now allocate the buflist.  Note that the elements in the buflist and
3397 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
3398 	 * can always derive the address of a buflist entry from the address of
3399 	 * an lso buffer.
3400 	 */
3401 	buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t),
3402 	    KM_SLEEP);
3403 
3404 	/*
3405 	 * Set up the lso buf chain
3406 	 */
3407 	memp = membase;
3408 	lbufp = buflist;
3409 	for (i = 0; i < IBD_NUM_LSO_BUFS; i++) {
3410 		lbufp->lb_isfree = 1;
3411 		lbufp->lb_buf = memp;
3412 		lbufp->lb_next = lbufp + 1;
3413 
3414 		tail = lbufp;
3415 
3416 		memp += IBD_LSO_BUFSZ;
3417 		lbufp++;
3418 	}
3419 	tail->lb_next = NULL;
3420 
3421 	/*
3422 	 * Set up the LSO buffer information in ibd state
3423 	 */
3424 	bktp->bkt_bufl = buflist;
3425 	bktp->bkt_free_head = buflist;
3426 	bktp->bkt_mem = membase;
3427 	bktp->bkt_nelem = IBD_NUM_LSO_BUFS;
3428 	bktp->bkt_nfree = bktp->bkt_nelem;
3429 
3430 	state->id_lso = bktp;
3431 	mutex_exit(&state->id_lso_lock);
3432 
3433 	return (DDI_SUCCESS);
3434 }
3435 
3436 /*
3437  * Statically allocate Tx buffer list(s).
3438  */
3439 static int
3440 ibd_init_txlist(ibd_state_t *state)
3441 {
3442 	ibd_swqe_t *swqe;
3443 	ibt_lkey_t lkey;
3444 	int i;
3445 	uint_t len;
3446 	uint8_t *bufaddr;
3447 
3448 	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3449 		return (DDI_FAILURE);
3450 
3451 	if (state->id_lso_policy && state->id_lso_capable) {
3452 		if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3453 			state->id_lso_policy = B_FALSE;
3454 	}
3455 
3456 	mutex_enter(&state->id_tx_list.dl_mutex);
3457 	state->id_tx_list.dl_head = NULL;
3458 	state->id_tx_list.dl_pending_sends = B_FALSE;
3459 	state->id_tx_list.dl_cnt = 0;
3460 	mutex_exit(&state->id_tx_list.dl_mutex);
3461 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3462 	state->id_tx_rel_list.dl_head = NULL;
3463 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3464 	state->id_tx_rel_list.dl_cnt = 0;
3465 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3466 
3467 	/*
3468 	 * Allocate and setup the swqe list
3469 	 */
3470 	lkey = state->id_tx_mr_desc.md_lkey;
3471 	bufaddr = state->id_tx_bufs;
3472 	len = state->id_tx_buf_sz;
3473 	swqe = state->id_tx_wqes;
3474 	mutex_enter(&state->id_tx_list.dl_mutex);
3475 	for (i = 0; i < state->id_num_swqe; i++, swqe++, bufaddr += len) {
3476 		swqe->swqe_next = NULL;
3477 		swqe->swqe_im_mblk = NULL;
3478 
3479 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3480 		    bufaddr;
3481 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3482 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3483 
3484 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3485 		swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
3486 		swqe->w_swr.wr_trans = IBT_UD_SRV;
3487 
3488 		/* These are set in send */
3489 		swqe->w_swr.wr_nds = 0;
3490 		swqe->w_swr.wr_sgl = NULL;
3491 		swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3492 
3493 		/* add to list */
3494 		state->id_tx_list.dl_cnt++;
3495 		swqe->swqe_next = state->id_tx_list.dl_head;
3496 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3497 	}
3498 	mutex_exit(&state->id_tx_list.dl_mutex);
3499 
3500 	return (DDI_SUCCESS);
3501 }
3502 
3503 static int
3504 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3505     uint32_t *nds_p)
3506 {
3507 	ibd_lsobkt_t *bktp;
3508 	ibd_lsobuf_t *lbufp;
3509 	ibd_lsobuf_t *nextp;
3510 	ibt_lkey_t lso_lkey;
3511 	uint_t frag_sz;
3512 	uint_t num_needed;
3513 	int i;
3514 
3515 	ASSERT(sgl_p != NULL);
3516 	ASSERT(nds_p != NULL);
3517 	ASSERT(req_sz != 0);
3518 
3519 	/*
3520 	 * Determine how many bufs we'd need for the size requested
3521 	 */
3522 	num_needed = req_sz / IBD_LSO_BUFSZ;
3523 	if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3524 		num_needed++;
3525 
3526 	mutex_enter(&state->id_lso_lock);
3527 
3528 	/*
3529 	 * If we don't have enough lso bufs, return failure
3530 	 */
3531 	ASSERT(state->id_lso != NULL);
3532 	bktp = state->id_lso;
3533 	if (bktp->bkt_nfree < num_needed) {
3534 		mutex_exit(&state->id_lso_lock);
3535 		return (-1);
3536 	}
3537 
3538 	/*
3539 	 * Pick the first 'num_needed' bufs from the free list
3540 	 */
3541 	lso_lkey = bktp->bkt_mr_desc.md_lkey;
3542 	lbufp = bktp->bkt_free_head;
3543 	for (i = 0; i < num_needed; i++) {
3544 		ASSERT(lbufp->lb_isfree != 0);
3545 		ASSERT(lbufp->lb_buf != NULL);
3546 
3547 		nextp = lbufp->lb_next;
3548 
3549 		sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3550 		sgl_p[i].ds_key = lso_lkey;
3551 		sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3552 
3553 		lbufp->lb_isfree = 0;
3554 		lbufp->lb_next = NULL;
3555 
3556 		lbufp = nextp;
3557 	}
3558 	bktp->bkt_free_head = lbufp;
3559 
3560 	/*
3561 	 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3562 	 * to adjust the last sgl entry's length. Since we know we need atleast
3563 	 * one, the i-1 use below is ok.
3564 	 */
3565 	if (frag_sz) {
3566 		sgl_p[i-1].ds_len = frag_sz;
3567 	}
3568 
3569 	/*
3570 	 * Update nfree count and return
3571 	 */
3572 	bktp->bkt_nfree -= num_needed;
3573 
3574 	mutex_exit(&state->id_lso_lock);
3575 
3576 	*nds_p = num_needed;
3577 
3578 	return (0);
3579 }
3580 
3581 static void
3582 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3583 {
3584 	ibd_lsobkt_t *bktp;
3585 	ibd_lsobuf_t *lbufp;
3586 	uint8_t *lso_mem_end;
3587 	uint_t ndx;
3588 	int i;
3589 
3590 	mutex_enter(&state->id_lso_lock);
3591 
3592 	bktp = state->id_lso;
3593 	ASSERT(bktp != NULL);
3594 
3595 	lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3596 	for (i = 0; i < nds; i++) {
3597 		uint8_t *va;
3598 
3599 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3600 		ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3601 
3602 		/*
3603 		 * Figure out the buflist element this sgl buffer corresponds
3604 		 * to and put it back at the head
3605 		 */
3606 		ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3607 		lbufp = bktp->bkt_bufl + ndx;
3608 
3609 		ASSERT(lbufp->lb_isfree == 0);
3610 		ASSERT(lbufp->lb_buf == va);
3611 
3612 		lbufp->lb_isfree = 1;
3613 		lbufp->lb_next = bktp->bkt_free_head;
3614 		bktp->bkt_free_head = lbufp;
3615 	}
3616 	bktp->bkt_nfree += nds;
3617 
3618 	mutex_exit(&state->id_lso_lock);
3619 }
3620 
3621 static void
3622 ibd_free_tx_copybufs(ibd_state_t *state)
3623 {
3624 	/*
3625 	 * Unregister txbuf mr
3626 	 */
3627 	if (ibt_deregister_mr(state->id_hca_hdl,
3628 	    state->id_tx_mr_hdl) != IBT_SUCCESS) {
3629 		DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3630 	}
3631 	state->id_tx_mr_hdl = NULL;
3632 
3633 	/*
3634 	 * Free txbuf memory
3635 	 */
3636 	kmem_free(state->id_tx_wqes, state->id_num_swqe * sizeof (ibd_swqe_t));
3637 	kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz);
3638 	state->id_tx_wqes = NULL;
3639 	state->id_tx_bufs = NULL;
3640 }
3641 
3642 static void
3643 ibd_free_tx_lsobufs(ibd_state_t *state)
3644 {
3645 	ibd_lsobkt_t *bktp;
3646 
3647 	mutex_enter(&state->id_lso_lock);
3648 
3649 	if ((bktp = state->id_lso) == NULL) {
3650 		mutex_exit(&state->id_lso_lock);
3651 		return;
3652 	}
3653 
3654 	/*
3655 	 * First, free the buflist
3656 	 */
3657 	ASSERT(bktp->bkt_bufl != NULL);
3658 	kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3659 
3660 	/*
3661 	 * Unregister the LSO memory and free it
3662 	 */
3663 	ASSERT(bktp->bkt_mr_hdl != NULL);
3664 	if (ibt_deregister_mr(state->id_hca_hdl,
3665 	    bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3666 		DPRINT(10,
3667 		    "ibd_free_lsobufs: ibt_deregister_mr failed");
3668 	}
3669 	ASSERT(bktp->bkt_mem);
3670 	kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3671 
3672 	/*
3673 	 * Finally free the bucket
3674 	 */
3675 	kmem_free(bktp, sizeof (ibd_lsobkt_t));
3676 	state->id_lso = NULL;
3677 
3678 	mutex_exit(&state->id_lso_lock);
3679 }
3680 
3681 /*
3682  * Free the statically allocated Tx buffer list.
3683  */
3684 static void
3685 ibd_fini_txlist(ibd_state_t *state)
3686 {
3687 	/*
3688 	 * Free the allocated swqes
3689 	 */
3690 	mutex_enter(&state->id_tx_list.dl_mutex);
3691 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3692 	state->id_tx_list.dl_head = NULL;
3693 	state->id_tx_list.dl_pending_sends = B_FALSE;
3694 	state->id_tx_list.dl_cnt = 0;
3695 	state->id_tx_rel_list.dl_head = NULL;
3696 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3697 	state->id_tx_rel_list.dl_cnt = 0;
3698 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3699 	mutex_exit(&state->id_tx_list.dl_mutex);
3700 
3701 	ibd_free_tx_lsobufs(state);
3702 	ibd_free_tx_copybufs(state);
3703 }
3704 
3705 /*
3706  * post a list of rwqes, NULL terminated.
3707  */
3708 static void
3709 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
3710 {
3711 	uint_t		i;
3712 	uint_t		num_posted;
3713 	ibt_status_t	ibt_status;
3714 	ibt_recv_wr_t	wrs[IBD_RX_POST_CNT];
3715 
3716 	while (rwqe) {
3717 		/* Post up to IBD_RX_POST_CNT receive work requests */
3718 		for (i = 0; i < IBD_RX_POST_CNT; i++) {
3719 			wrs[i] = rwqe->w_rwr;
3720 			rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
3721 			if (rwqe == NULL) {
3722 				i++;
3723 				break;
3724 			}
3725 		}
3726 
3727 		/*
3728 		 * If posting fails for some reason, we'll never receive
3729 		 * completion intimation, so we'll need to cleanup. But
3730 		 * we need to make sure we don't clean up nodes whose
3731 		 * wrs have been successfully posted. We assume that the
3732 		 * hca driver returns on the first failure to post and
3733 		 * therefore the first 'num_posted' entries don't need
3734 		 * cleanup here.
3735 		 */
3736 		atomic_add_32(&state->id_rx_list.dl_cnt, i);
3737 
3738 		num_posted = 0;
3739 		ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
3740 		    &num_posted);
3741 		if (ibt_status != IBT_SUCCESS) {
3742 			/* This cannot happen unless the device has an error. */
3743 			ibd_print_warn(state, "ibd_post_recv: FATAL: "
3744 			    "posting multiple wrs failed: "
3745 			    "requested=%d, done=%d, ret=%d",
3746 			    IBD_RX_POST_CNT, num_posted, ibt_status);
3747 			atomic_add_32(&state->id_rx_list.dl_cnt,
3748 			    num_posted - i);
3749 		}
3750 	}
3751 }
3752 
3753 /*
3754  * Grab a list of rwqes from the array of lists, and post the list.
3755  */
3756 static void
3757 ibd_post_recv_intr(ibd_state_t *state)
3758 {
3759 	ibd_rx_queue_t	*rxp;
3760 	ibd_rwqe_t *list;
3761 
3762 	/* rotate through the rx_queue array, expecting an adequate number */
3763 	state->id_rx_post_queue_index =
3764 	    (state->id_rx_post_queue_index + 1) &
3765 	    (state->id_rx_nqueues - 1);
3766 
3767 	rxp = state->id_rx_queues + state->id_rx_post_queue_index;
3768 	mutex_enter(&rxp->rx_post_lock);
3769 	list = WQE_TO_RWQE(rxp->rx_head);
3770 	rxp->rx_head = NULL;
3771 	rxp->rx_cnt = 0;
3772 	mutex_exit(&rxp->rx_post_lock);
3773 	ibd_post_recv_list(state, list);
3774 }
3775 
3776 /* macro explained below */
3777 #define	RX_QUEUE_HASH(rwqe) \
3778 	(((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
3779 
3780 /*
3781  * Add a rwqe to one of the the Rx lists.  If the list is large enough
3782  * (exactly IBD_RX_POST_CNT), post the list to the hardware.
3783  *
3784  * Note: one of 2^N lists is chosen via a hash.  This is done
3785  * because using one list is contentious.  If the first list is busy
3786  * (mutex_tryenter fails), use a second list (just call mutex_enter).
3787  *
3788  * The number 8 in RX_QUEUE_HASH is a random choice that provides
3789  * even distribution of mapping rwqes to the 2^N queues.
3790  */
3791 static void
3792 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
3793 {
3794 	ibd_rx_queue_t	*rxp;
3795 
3796 	rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
3797 
3798 	if (!mutex_tryenter(&rxp->rx_post_lock)) {
3799 		/* Failed.  Try a different queue ("ptr + 16" ensures that). */
3800 		rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
3801 		mutex_enter(&rxp->rx_post_lock);
3802 	}
3803 	rwqe->rwqe_next = rxp->rx_head;
3804 	if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) {
3805 		uint_t active = atomic_inc_32_nv(&state->id_rx_post_active);
3806 
3807 		/* only call ibt_post_recv() every Nth time through here */
3808 		if ((active & (state->id_rx_nqueues - 1)) == 0) {
3809 			rxp->rx_head = NULL;
3810 			rxp->rx_cnt = 0;
3811 			mutex_exit(&rxp->rx_post_lock);
3812 			ibd_post_recv_list(state, rwqe);
3813 			return;
3814 		}
3815 	}
3816 	rxp->rx_head = RWQE_TO_WQE(rwqe);
3817 	mutex_exit(&rxp->rx_post_lock);
3818 }
3819 
3820 static int
3821 ibd_alloc_rx_copybufs(ibd_state_t *state)
3822 {
3823 	ibt_mr_attr_t mem_attr;
3824 	int i;
3825 
3826 	/*
3827 	 * Allocate one big chunk for all regular rx copy bufs
3828 	 */
3829 	state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
3830 
3831 	state->id_rx_bufs = kmem_zalloc(state->id_num_rwqe *
3832 	    state->id_rx_buf_sz, KM_SLEEP);
3833 
3834 	state->id_rx_wqes = kmem_zalloc(state->id_num_rwqe *
3835 	    sizeof (ibd_rwqe_t), KM_SLEEP);
3836 
3837 	state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
3838 	state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
3839 	    sizeof (ibd_rx_queue_t), KM_SLEEP);
3840 	for (i = 0; i < state->id_rx_nqueues; i++) {
3841 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
3842 		mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
3843 	}
3844 
3845 	/*
3846 	 * Do one memory registration on the entire rxbuf area
3847 	 */
3848 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
3849 	mem_attr.mr_len = state->id_num_rwqe * state->id_rx_buf_sz;
3850 	mem_attr.mr_as = NULL;
3851 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3852 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3853 	    &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
3854 		DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
3855 		kmem_free(state->id_rx_wqes,
3856 		    state->id_num_rwqe * sizeof (ibd_rwqe_t));
3857 		kmem_free(state->id_rx_bufs,
3858 		    state->id_num_rwqe * state->id_rx_buf_sz);
3859 		state->id_rx_bufs = NULL;
3860 		state->id_rx_wqes = NULL;
3861 		return (DDI_FAILURE);
3862 	}
3863 
3864 	return (DDI_SUCCESS);
3865 }
3866 
3867 /*
3868  * Allocate the statically allocated Rx buffer list.
3869  */
3870 static int
3871 ibd_init_rxlist(ibd_state_t *state)
3872 {
3873 	ibd_rwqe_t *rwqe, *next;
3874 	ibd_wqe_t *list;
3875 	ibt_lkey_t lkey;
3876 	int i;
3877 	uint_t len;
3878 	uint8_t *bufaddr;
3879 
3880 	mutex_enter(&state->id_rx_free_list.dl_mutex);
3881 	if (state->id_rx_free_list.dl_head != NULL) {
3882 		/* rx rsrcs were never freed.  Just repost them */
3883 		len = state->id_rx_buf_sz;
3884 		list = state->id_rx_free_list.dl_head;
3885 		state->id_rx_free_list.dl_head = NULL;
3886 		state->id_rx_free_list.dl_cnt = 0;
3887 		mutex_exit(&state->id_rx_free_list.dl_mutex);
3888 		for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3889 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
3890 			if ((rwqe->rwqe_im_mblk = desballoc(
3891 			    rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
3892 			    &rwqe->w_freemsg_cb)) == NULL) {
3893 				/* allow freemsg_cb to free the rwqes */
3894 				if (atomic_dec_32_nv(&state->id_running) != 0) {
3895 					cmn_err(CE_WARN, "ibd_init_rxlist: "
3896 					    "id_running was not 1\n");
3897 				}
3898 				DPRINT(10, "ibd_init_rxlist : "
3899 				    "failed in desballoc()");
3900 				for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3901 				    rwqe = next) {
3902 					next = WQE_TO_RWQE(rwqe->rwqe_next);
3903 					if (rwqe->rwqe_im_mblk) {
3904 						atomic_inc_32(&state->
3905 						    id_rx_list.
3906 						    dl_bufs_outstanding);
3907 						freemsg(rwqe->rwqe_im_mblk);
3908 					} else
3909 						ibd_free_rwqe(state, rwqe);
3910 				}
3911 				atomic_inc_32(&state->id_running);
3912 				return (DDI_FAILURE);
3913 			}
3914 		}
3915 		ibd_post_recv_list(state, WQE_TO_RWQE(list));
3916 		return (DDI_SUCCESS);
3917 	}
3918 	mutex_exit(&state->id_rx_free_list.dl_mutex);
3919 
3920 	if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
3921 		return (DDI_FAILURE);
3922 
3923 	/*
3924 	 * Allocate and setup the rwqe list
3925 	 */
3926 	len = state->id_rx_buf_sz;
3927 	lkey = state->id_rx_mr_desc.md_lkey;
3928 	rwqe = state->id_rx_wqes;
3929 	bufaddr = state->id_rx_bufs;
3930 	list = NULL;
3931 	for (i = 0; i < state->id_num_rwqe; i++, rwqe++, bufaddr += len) {
3932 		rwqe->w_state = state;
3933 		rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
3934 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
3935 
3936 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
3937 
3938 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
3939 		    &rwqe->w_freemsg_cb)) == NULL) {
3940 			DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
3941 			/* allow freemsg_cb to free the rwqes */
3942 			if (atomic_dec_32_nv(&state->id_running) != 0) {
3943 				cmn_err(CE_WARN, "ibd_init_rxlist: "
3944 				    "id_running was not 1\n");
3945 			}
3946 			DPRINT(10, "ibd_init_rxlist : "
3947 			    "failed in desballoc()");
3948 			for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3949 			    rwqe = next) {
3950 				next = WQE_TO_RWQE(rwqe->rwqe_next);
3951 				freemsg(rwqe->rwqe_im_mblk);
3952 			}
3953 			atomic_inc_32(&state->id_running);
3954 
3955 			/* remove reference to free'd rwqes */
3956 			mutex_enter(&state->id_rx_free_list.dl_mutex);
3957 			state->id_rx_free_list.dl_head = NULL;
3958 			state->id_rx_free_list.dl_cnt = 0;
3959 			mutex_exit(&state->id_rx_free_list.dl_mutex);
3960 
3961 			ibd_fini_rxlist(state);
3962 			return (DDI_FAILURE);
3963 		}
3964 
3965 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
3966 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
3967 		    (ib_vaddr_t)(uintptr_t)bufaddr;
3968 		rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
3969 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
3970 		rwqe->w_rwr.wr_nds = 1;
3971 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
3972 
3973 		rwqe->rwqe_next = list;
3974 		list = RWQE_TO_WQE(rwqe);
3975 	}
3976 	ibd_post_recv_list(state, WQE_TO_RWQE(list));
3977 
3978 	return (DDI_SUCCESS);
3979 }
3980 
3981 static void
3982 ibd_free_rx_copybufs(ibd_state_t *state)
3983 {
3984 	int i;
3985 
3986 	/*
3987 	 * Unregister rxbuf mr
3988 	 */
3989 	if (ibt_deregister_mr(state->id_hca_hdl,
3990 	    state->id_rx_mr_hdl) != IBT_SUCCESS) {
3991 		DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
3992 	}
3993 	state->id_rx_mr_hdl = NULL;
3994 
3995 	/*
3996 	 * Free rxbuf memory
3997 	 */
3998 	for (i = 0; i < state->id_rx_nqueues; i++) {
3999 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4000 		mutex_destroy(&rxp->rx_post_lock);
4001 	}
4002 	kmem_free(state->id_rx_queues, state->id_rx_nqueues *
4003 	    sizeof (ibd_rx_queue_t));
4004 	kmem_free(state->id_rx_wqes, state->id_num_rwqe * sizeof (ibd_rwqe_t));
4005 	kmem_free(state->id_rx_bufs, state->id_num_rwqe * state->id_rx_buf_sz);
4006 	state->id_rx_queues = NULL;
4007 	state->id_rx_wqes = NULL;
4008 	state->id_rx_bufs = NULL;
4009 }
4010 
4011 static void
4012 ibd_free_rx_rsrcs(ibd_state_t *state)
4013 {
4014 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4015 	if (state->id_rx_free_list.dl_head == NULL) {
4016 		/* already freed */
4017 		mutex_exit(&state->id_rx_free_list.dl_mutex);
4018 		return;
4019 	}
4020 	ASSERT(state->id_rx_free_list.dl_cnt == state->id_num_rwqe);
4021 	ibd_free_rx_copybufs(state);
4022 	state->id_rx_free_list.dl_cnt = 0;
4023 	state->id_rx_free_list.dl_head = NULL;
4024 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4025 }
4026 
4027 /*
4028  * Free the statically allocated Rx buffer list.
4029  */
4030 static void
4031 ibd_fini_rxlist(ibd_state_t *state)
4032 {
4033 	ibd_rwqe_t *rwqe;
4034 	int i;
4035 
4036 	/* run through the rx_queue's, calling freemsg() */
4037 	for (i = 0; i < state->id_rx_nqueues; i++) {
4038 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4039 		mutex_enter(&rxp->rx_post_lock);
4040 		for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe;
4041 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4042 			freemsg(rwqe->rwqe_im_mblk);
4043 			rxp->rx_cnt--;
4044 		}
4045 		rxp->rx_head = NULL;
4046 		mutex_exit(&rxp->rx_post_lock);
4047 	}
4048 
4049 	/* cannot free rx resources unless gld returned everything */
4050 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0)
4051 		ibd_free_rx_rsrcs(state);
4052 }
4053 
4054 /*
4055  * Free an allocated recv wqe.
4056  */
4057 /* ARGSUSED */
4058 static void
4059 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
4060 {
4061 	/*
4062 	 * desballoc() failed (no memory).
4063 	 *
4064 	 * This rwqe is placed on a free list so that it
4065 	 * can be reinstated when memory is available.
4066 	 *
4067 	 * NOTE: no code currently exists to reinstate
4068 	 * these "lost" rwqes.
4069 	 */
4070 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4071 	state->id_rx_free_list.dl_cnt++;
4072 	rwqe->rwqe_next = state->id_rx_free_list.dl_head;
4073 	state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
4074 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4075 }
4076 
4077 /*
4078  * IBA Rx completion queue handler. Guaranteed to be single
4079  * threaded and nonreentrant for this CQ.
4080  */
4081 /* ARGSUSED */
4082 static void
4083 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4084 {
4085 	ibd_state_t *state = (ibd_state_t *)arg;
4086 
4087 	atomic_inc_64(&state->id_num_intrs);
4088 
4089 	if (ibd_rx_softintr == 1) {
4090 		mutex_enter(&state->id_rcq_poll_lock);
4091 		if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
4092 			state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
4093 			mutex_exit(&state->id_rcq_poll_lock);
4094 			return;
4095 		} else {
4096 			mutex_exit(&state->id_rcq_poll_lock);
4097 			ddi_trigger_softintr(state->id_rx);
4098 		}
4099 	} else
4100 		(void) ibd_intr((caddr_t)state);
4101 }
4102 
4103 /*
4104  * CQ handler for Tx completions, when the Tx CQ is in
4105  * interrupt driven mode.
4106  */
4107 /* ARGSUSED */
4108 static void
4109 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4110 {
4111 	ibd_state_t *state = (ibd_state_t *)arg;
4112 
4113 	atomic_inc_64(&state->id_num_intrs);
4114 
4115 	if (ibd_tx_softintr == 1) {
4116 		mutex_enter(&state->id_scq_poll_lock);
4117 		if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
4118 			state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
4119 			mutex_exit(&state->id_scq_poll_lock);
4120 			return;
4121 		} else {
4122 			mutex_exit(&state->id_scq_poll_lock);
4123 			ddi_trigger_softintr(state->id_tx);
4124 		}
4125 	} else
4126 		(void) ibd_tx_recycle((caddr_t)state);
4127 }
4128 
4129 /*
4130  * Multicast group create/delete trap handler. These will be delivered
4131  * on a kernel thread (handling can thus block) and can be invoked
4132  * concurrently. The handler can be invoked anytime after it is
4133  * registered and before ibt_detach().
4134  */
4135 /* ARGSUSED */
4136 static void
4137 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
4138     ibt_subnet_event_t *event)
4139 {
4140 	ibd_state_t *state = (ibd_state_t *)arg;
4141 	ibd_req_t *req;
4142 
4143 	/*
4144 	 * The trap handler will get invoked once for every event for
4145 	 * every port. The input "gid" is the GID0 of the port the
4146 	 * trap came in on; we just need to act on traps that came
4147 	 * to our port, meaning the port on which the ipoib interface
4148 	 * resides. Since ipoib uses GID0 of the port, we just match
4149 	 * the gids to check whether we need to handle the trap.
4150 	 */
4151 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4152 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4153 		return;
4154 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4155 
4156 	DPRINT(10, "ibd_notices_handler : %d\n", code);
4157 
4158 	switch (code) {
4159 		case IBT_SM_EVENT_UNAVAILABLE:
4160 			/*
4161 			 * If we are in promiscuous mode or have
4162 			 * sendnonmembers, we need to print a warning
4163 			 * message right now. Else, just store the
4164 			 * information, print when we enter promiscuous
4165 			 * mode or attempt nonmember send. We might
4166 			 * also want to stop caching sendnonmember.
4167 			 */
4168 			ibd_print_warn(state, "IBA multicast support "
4169 			    "degraded due to unavailability of multicast "
4170 			    "traps");
4171 			break;
4172 		case IBT_SM_EVENT_AVAILABLE:
4173 			/*
4174 			 * If we printed a warning message above or
4175 			 * while trying to nonmember send or get into
4176 			 * promiscuous mode, print an okay message.
4177 			 */
4178 			ibd_print_warn(state, "IBA multicast support "
4179 			    "restored due to availability of multicast "
4180 			    "traps");
4181 			break;
4182 		case IBT_SM_EVENT_MCG_CREATED:
4183 		case IBT_SM_EVENT_MCG_DELETED:
4184 			/*
4185 			 * Common processing of creation/deletion traps.
4186 			 * First check if the instance is being
4187 			 * [de]initialized; back off then, without doing
4188 			 * anything more, since we are not sure if the
4189 			 * async thread is around, or whether we might
4190 			 * be racing with the detach code in ibd_m_stop()
4191 			 * that scans the mcg list.
4192 			 */
4193 			if (!ibd_async_safe(state))
4194 				return;
4195 
4196 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4197 			req->rq_gid = event->sm_notice_gid;
4198 			req->rq_ptr = (void *)code;
4199 			ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4200 			break;
4201 	}
4202 }
4203 
4204 static void
4205 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4206 {
4207 	ib_gid_t mgid = req->rq_gid;
4208 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4209 
4210 	DPRINT(10, "ibd_async_trap : %d\n", code);
4211 
4212 	/*
4213 	 * Atomically search the nonmember and sendonlymember lists and
4214 	 * delete.
4215 	 */
4216 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4217 
4218 	if (state->id_prom_op == IBD_OP_COMPLETED) {
4219 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4220 
4221 		/*
4222 		 * If in promiscuous mode, try to join/attach to the new
4223 		 * mcg. Given the unreliable out-of-order mode of trap
4224 		 * delivery, we can never be sure whether it is a problem
4225 		 * if the join fails. Thus, we warn the admin of a failure
4226 		 * if this was a creation trap. Note that the trap might
4227 		 * actually be reporting a long past event, and the mcg
4228 		 * might already have been deleted, thus we might be warning
4229 		 * in vain.
4230 		 */
4231 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4232 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4233 			ibd_print_warn(state, "IBA promiscuous mode missed "
4234 			    "new multicast gid %016llx:%016llx",
4235 			    (u_longlong_t)mgid.gid_prefix,
4236 			    (u_longlong_t)mgid.gid_guid);
4237 	}
4238 
4239 	/*
4240 	 * Free the request slot allocated by the subnet event thread.
4241 	 */
4242 	ibd_async_done(state);
4243 }
4244 
4245 /*
4246  * GLDv3 entry point to get capabilities.
4247  */
4248 static boolean_t
4249 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4250 {
4251 	ibd_state_t *state = arg;
4252 
4253 	switch (cap) {
4254 	case MAC_CAPAB_HCKSUM: {
4255 		uint32_t *txflags = cap_data;
4256 
4257 		/*
4258 		 * We either do full checksum or not do it at all
4259 		 */
4260 		if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4261 			*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4262 		else
4263 			return (B_FALSE);
4264 		break;
4265 	}
4266 
4267 	case MAC_CAPAB_LSO: {
4268 		mac_capab_lso_t *cap_lso = cap_data;
4269 
4270 		/*
4271 		 * In addition to the capability and policy, since LSO
4272 		 * relies on hw checksum, we'll not enable LSO if we
4273 		 * don't have hw checksum.  Of course, if the HCA doesn't
4274 		 * provide the reserved lkey capability, enabling LSO will
4275 		 * actually affect performance adversely, so we'll disable
4276 		 * LSO even for that case.
4277 		 */
4278 		if (!state->id_lso_policy || !state->id_lso_capable)
4279 			return (B_FALSE);
4280 
4281 		if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4282 			return (B_FALSE);
4283 
4284 		if (state->id_hca_res_lkey_capab == 0) {
4285 			ibd_print_warn(state, "no reserved-lkey capability, "
4286 			    "disabling LSO");
4287 			return (B_FALSE);
4288 		}
4289 
4290 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4291 		cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4292 		break;
4293 	}
4294 
4295 	default:
4296 		return (B_FALSE);
4297 	}
4298 
4299 	return (B_TRUE);
4300 }
4301 
4302 static int
4303 ibd_get_port_details(ibd_state_t *state)
4304 {
4305 	ibt_hca_portinfo_t *port_infop;
4306 	ibt_status_t ret;
4307 	uint_t psize, port_infosz;
4308 
4309 	mutex_enter(&state->id_link_mutex);
4310 
4311 	/*
4312 	 * Query for port information
4313 	 */
4314 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
4315 	    &port_infop, &psize, &port_infosz);
4316 	if ((ret != IBT_SUCCESS) || (psize != 1)) {
4317 		mutex_exit(&state->id_link_mutex);
4318 		DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
4319 		    "failed, ret=%d", ret);
4320 		return (ENETDOWN);
4321 	}
4322 
4323 	/*
4324 	 * If the link already went down by the time we get here,
4325 	 * give up
4326 	 */
4327 	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
4328 		mutex_exit(&state->id_link_mutex);
4329 		ibt_free_portinfo(port_infop, port_infosz);
4330 		DPRINT(10, "ibd_get_port_details: port is not active");
4331 		return (ENETDOWN);
4332 	}
4333 
4334 	/*
4335 	 * If the link is active, verify the pkey
4336 	 */
4337 	if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
4338 	    state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
4339 		mutex_exit(&state->id_link_mutex);
4340 		ibt_free_portinfo(port_infop, port_infosz);
4341 		DPRINT(10, "ibd_get_port_details: ibt_pkey2index "
4342 		    "failed, ret=%d", ret);
4343 		return (ENONET);
4344 	}
4345 
4346 	state->id_mtu = (128 << port_infop->p_mtu);
4347 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4348 	state->id_sgid = *port_infop->p_sgid_tbl;
4349 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4350 	state->id_link_state = LINK_STATE_UP;
4351 
4352 	mutex_exit(&state->id_link_mutex);
4353 	ibt_free_portinfo(port_infop, port_infosz);
4354 
4355 	/*
4356 	 * Now that the port is active, record the port speed
4357 	 */
4358 	state->id_link_speed = ibd_get_portspeed(state);
4359 
4360 	return (0);
4361 }
4362 
4363 static int
4364 ibd_alloc_cqs(ibd_state_t *state)
4365 {
4366 	ibt_hca_attr_t hca_attrs;
4367 	ibt_cq_attr_t cq_attr;
4368 	ibt_status_t ret;
4369 	uint32_t real_size;
4370 
4371 	ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
4372 	ASSERT(ret == IBT_SUCCESS);
4373 
4374 	/*
4375 	 * Allocate Rx/combined CQ:
4376 	 * Theoretically, there is no point in having more than #rwqe
4377 	 * plus #swqe cqe's, except that the CQ will be signaled for
4378 	 * overflow when the last wqe completes, if none of the previous
4379 	 * cqe's have been polled. Thus, we allocate just a few less wqe's
4380 	 * to make sure such overflow does not occur.
4381 	 */
4382 	cq_attr.cq_sched = NULL;
4383 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
4384 
4385 	/*
4386 	 * Allocate Receive CQ.
4387 	 */
4388 	if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
4389 		cq_attr.cq_size = state->id_num_rwqe + 1;
4390 	} else {
4391 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
4392 		state->id_num_rwqe = cq_attr.cq_size - 1;
4393 	}
4394 
4395 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
4396 	    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
4397 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
4398 		    "failed, ret=%d\n", ret);
4399 		return (DDI_FAILURE);
4400 	}
4401 
4402 	if ((ret = ibt_modify_cq(state->id_rcq_hdl,
4403 	    ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) {
4404 		DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
4405 		    "moderation failed, ret=%d\n", ret);
4406 	}
4407 
4408 	/* make the #rx wc's the same as max rx chain size */
4409 	state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
4410 	state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
4411 	    state->id_rxwcs_size, KM_SLEEP);
4412 
4413 	/*
4414 	 * Allocate Send CQ.
4415 	 */
4416 	if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
4417 		cq_attr.cq_size = state->id_num_swqe + 1;
4418 	} else {
4419 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
4420 		state->id_num_swqe = cq_attr.cq_size - 1;
4421 	}
4422 
4423 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
4424 	    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
4425 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
4426 		    "failed, ret=%d\n", ret);
4427 		kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
4428 		    state->id_rxwcs_size);
4429 		(void) ibt_free_cq(state->id_rcq_hdl);
4430 		return (DDI_FAILURE);
4431 	}
4432 	if ((ret = ibt_modify_cq(state->id_scq_hdl,
4433 	    ibd_txcomp_count, ibd_txcomp_usec, 0)) != IBT_SUCCESS) {
4434 		DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
4435 		    "moderation failed, ret=%d\n", ret);
4436 	}
4437 
4438 	state->id_txwcs_size = IBD_TX_POLL_THRESH;
4439 	state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
4440 	    state->id_txwcs_size, KM_SLEEP);
4441 
4442 	/*
4443 	 * Print message in case we could not allocate as many wqe's
4444 	 * as was requested.
4445 	 */
4446 	if (state->id_num_rwqe != IBD_NUM_RWQE) {
4447 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
4448 		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
4449 	}
4450 	if (state->id_num_swqe != IBD_NUM_SWQE) {
4451 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
4452 		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
4453 	}
4454 
4455 	return (DDI_SUCCESS);
4456 }
4457 
4458 static int
4459 ibd_setup_ud_channel(ibd_state_t *state)
4460 {
4461 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
4462 	ibt_ud_chan_query_attr_t ud_chan_attr;
4463 	ibt_status_t ret;
4464 
4465 	ud_alloc_attr.ud_flags  = IBT_ALL_SIGNALED;
4466 	if (state->id_hca_res_lkey_capab)
4467 		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
4468 	if (state->id_lso_policy && state->id_lso_capable)
4469 		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
4470 
4471 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
4472 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
4473 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
4474 	ud_alloc_attr.ud_sizes.cs_sq    = state->id_num_swqe;
4475 	ud_alloc_attr.ud_sizes.cs_rq    = state->id_num_rwqe;
4476 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
4477 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
4478 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
4479 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
4480 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
4481 	ud_alloc_attr.ud_clone_chan	= NULL;
4482 
4483 	if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
4484 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
4485 		DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
4486 		    "failed, ret=%d\n", ret);
4487 		return (DDI_FAILURE);
4488 	}
4489 
4490 	if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
4491 	    &ud_chan_attr)) != IBT_SUCCESS) {
4492 		DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
4493 		    "failed, ret=%d\n", ret);
4494 		(void) ibt_free_channel(state->id_chnl_hdl);
4495 		return (DDI_FAILURE);
4496 	}
4497 
4498 	state->id_qpnum = ud_chan_attr.ud_qpn;
4499 
4500 	return (DDI_SUCCESS);
4501 }
4502 
4503 static int
4504 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
4505 {
4506 	uint32_t progress = state->id_mac_state;
4507 	uint_t attempts;
4508 	ibt_status_t ret;
4509 	ib_gid_t mgid;
4510 	ibd_mce_t *mce;
4511 	uint8_t jstate;
4512 
4513 	if (atomic_dec_32_nv(&state->id_running) != 0)
4514 		cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n");
4515 
4516 	/*
4517 	 * Before we try to stop/undo whatever we did in ibd_start(),
4518 	 * we need to mark the link state appropriately to prevent the
4519 	 * ip layer from using this instance for any new transfers. Note
4520 	 * that if the original state of the link was "up" when we're
4521 	 * here, we'll set the final link state to "unknown", to behave
4522 	 * in the same fashion as other ethernet drivers.
4523 	 */
4524 	mutex_enter(&state->id_link_mutex);
4525 	if (cur_link_state == LINK_STATE_DOWN) {
4526 		state->id_link_state = cur_link_state;
4527 	} else {
4528 		state->id_link_state = LINK_STATE_UNKNOWN;
4529 	}
4530 	mutex_exit(&state->id_link_mutex);
4531 	mac_link_update(state->id_mh, state->id_link_state);
4532 
4533 	state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
4534 	if (progress & IBD_DRV_STARTED) {
4535 		state->id_mac_state &= (~IBD_DRV_STARTED);
4536 	}
4537 
4538 	/* Stop listen under Reliable Connected Mode */
4539 	if (progress & IBD_DRV_RC_LISTEN) {
4540 		ASSERT(state->id_enable_rc);
4541 		if (state->rc_listen_hdl != NULL) {
4542 			ibd_rc_stop_listen(state);
4543 		}
4544 		state->id_mac_state &= (~IBD_DRV_RC_LISTEN);
4545 	}
4546 
4547 	if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) {
4548 		(void) ibd_rc_close_all_chan(state);
4549 	}
4550 
4551 	/*
4552 	 * First, stop receive interrupts; this stops the driver from
4553 	 * handing up buffers to higher layers.  Wait for receive buffers
4554 	 * to be returned and give up after 1 second.
4555 	 */
4556 	if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
4557 		attempts = 10;
4558 		while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding,
4559 		    0) > 0) {
4560 			delay(drv_usectohz(100000));
4561 			if (--attempts == 0) {
4562 				/*
4563 				 * There are pending bufs with the network
4564 				 * layer and we have no choice but to wait
4565 				 * for them to be done with. Reap all the
4566 				 * Tx/Rx completions that were posted since
4567 				 * we turned off the notification and
4568 				 * return failure.
4569 				 */
4570 				cmn_err(CE_CONT, "!ibd: bufs outstanding\n");
4571 				DPRINT(2, "ibd_undo_start: "
4572 				    "reclaiming failed");
4573 				break;
4574 			}
4575 		}
4576 		state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
4577 	}
4578 
4579 	if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) {
4580 		ibd_rc_fini_tx_largebuf_list(state);
4581 		state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD);
4582 	}
4583 
4584 	if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
4585 		ASSERT(state->id_enable_rc);
4586 		if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) {
4587 			ibd_rc_fini_srq_list(state);
4588 			state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
4589 		} else {
4590 			cmn_err(CE_CONT, "ibd_undo_start: srq bufs "
4591 			    "outstanding\n");
4592 		}
4593 	}
4594 
4595 	if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
4596 		ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
4597 
4598 		mutex_enter(&state->id_trap_lock);
4599 		state->id_trap_stop = B_TRUE;
4600 		while (state->id_trap_inprog > 0)
4601 			cv_wait(&state->id_trap_cv, &state->id_trap_lock);
4602 		mutex_exit(&state->id_trap_lock);
4603 
4604 		state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
4605 	}
4606 
4607 	if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
4608 		/*
4609 		 * Flushing the channel ensures that all pending WQE's
4610 		 * are marked with flush_error and handed to the CQ. It
4611 		 * does not guarantee the invocation of the CQ handler.
4612 		 * This call is guaranteed to return successfully for
4613 		 * UD QPNs.
4614 		 */
4615 		if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
4616 		    IBT_SUCCESS) {
4617 			DPRINT(10, "ibd_undo_start: flush_channel "
4618 			    "failed, ret=%d", ret);
4619 		}
4620 
4621 		/*
4622 		 * Give some time for the TX CQ handler to process the
4623 		 * completions.
4624 		 */
4625 		mutex_enter(&state->id_tx_list.dl_mutex);
4626 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
4627 		attempts = 10;
4628 		while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt
4629 		    != state->id_num_swqe) {
4630 			if (--attempts == 0)
4631 				break;
4632 			mutex_exit(&state->id_tx_rel_list.dl_mutex);
4633 			mutex_exit(&state->id_tx_list.dl_mutex);
4634 			delay(drv_usectohz(100000));
4635 			mutex_enter(&state->id_tx_list.dl_mutex);
4636 			mutex_enter(&state->id_tx_rel_list.dl_mutex);
4637 		}
4638 		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
4639 		if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt !=
4640 		    state->id_num_swqe) {
4641 			cmn_err(CE_WARN, "tx resources not freed\n");
4642 		}
4643 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
4644 		mutex_exit(&state->id_tx_list.dl_mutex);
4645 
4646 		attempts = 10;
4647 		while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
4648 			if (--attempts == 0)
4649 				break;
4650 			delay(drv_usectohz(100000));
4651 		}
4652 		ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
4653 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
4654 			cmn_err(CE_WARN, "rx resources not freed\n");
4655 		}
4656 
4657 		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
4658 	}
4659 
4660 	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
4661 		/*
4662 		 * No new async requests will be posted since the device
4663 		 * link state has been marked as unknown; completion handlers
4664 		 * have been turned off, so Tx handler will not cause any
4665 		 * more IBD_ASYNC_REAP requests.
4666 		 *
4667 		 * Queue a request for the async thread to exit, which will
4668 		 * be serviced after any pending ones. This can take a while,
4669 		 * specially if the SM is unreachable, since IBMF will slowly
4670 		 * timeout each SM request issued by the async thread.  Reap
4671 		 * the thread before continuing on, we do not want it to be
4672 		 * lingering in modunloaded code (or we could move the reap
4673 		 * to ibd_detach(), provided we keep track of the current
4674 		 * id_async_thrid somewhere safe).
4675 		 */
4676 		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
4677 		thread_join(state->id_async_thrid);
4678 
4679 		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
4680 	}
4681 
4682 	if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
4683 		/*
4684 		 * Drop all residual full/non membership. This includes full
4685 		 * membership to the broadcast group, and any nonmembership
4686 		 * acquired during transmits. We do this after the Tx completion
4687 		 * handlers are done, since those might result in some late
4688 		 * leaves; this also eliminates a potential race with that
4689 		 * path wrt the mc full list insert/delete. Trap handling
4690 		 * has also been suppressed at this point. Thus, no locks
4691 		 * are required while traversing the mc full list.
4692 		 */
4693 		DPRINT(2, "ibd_undo_start: clear full cache entries");
4694 		mce = list_head(&state->id_mc_full);
4695 		while (mce != NULL) {
4696 			mgid = mce->mc_info.mc_adds_vect.av_dgid;
4697 			jstate = mce->mc_jstate;
4698 			mce = list_next(&state->id_mc_full, mce);
4699 			ibd_leave_group(state, mgid, jstate);
4700 		}
4701 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
4702 	}
4703 
4704 	if (progress & IBD_DRV_RXLIST_ALLOCD) {
4705 		ibd_fini_rxlist(state);
4706 		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
4707 	}
4708 
4709 	if (progress & IBD_DRV_TXLIST_ALLOCD) {
4710 		ibd_fini_txlist(state);
4711 		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
4712 	}
4713 
4714 	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
4715 		if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
4716 		    IBT_SUCCESS) {
4717 			DPRINT(10, "ibd_undo_start: free_channel "
4718 			    "failed, ret=%d", ret);
4719 		}
4720 
4721 		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
4722 	}
4723 
4724 	if (progress & IBD_DRV_CQS_ALLOCD) {
4725 		kmem_free(state->id_txwcs,
4726 		    sizeof (ibt_wc_t) * state->id_txwcs_size);
4727 		if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
4728 		    IBT_SUCCESS) {
4729 			DPRINT(10, "ibd_undo_start: free_cq(scq) "
4730 			    "failed, ret=%d", ret);
4731 		}
4732 
4733 		kmem_free(state->id_rxwcs,
4734 		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
4735 		if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
4736 			DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
4737 			    "ret=%d", ret);
4738 		}
4739 
4740 		state->id_txwcs = NULL;
4741 		state->id_rxwcs = NULL;
4742 		state->id_scq_hdl = NULL;
4743 		state->id_rcq_hdl = NULL;
4744 
4745 		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
4746 	}
4747 
4748 	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
4749 		mutex_enter(&state->id_ac_mutex);
4750 		mod_hash_destroy_hash(state->id_ah_active_hash);
4751 		mutex_exit(&state->id_ac_mutex);
4752 		ibd_acache_fini(state);
4753 
4754 		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
4755 	}
4756 
4757 	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
4758 		/*
4759 		 * If we'd created the ipoib broadcast group and had
4760 		 * successfully joined it, leave it now
4761 		 */
4762 		if (state->id_bgroup_created) {
4763 			mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
4764 			jstate = IB_MC_JSTATE_FULL;
4765 			(void) ibt_leave_mcg(state->id_sgid, mgid,
4766 			    state->id_sgid, jstate);
4767 		}
4768 		ibt_free_mcg_info(state->id_mcinfo, 1);
4769 
4770 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
4771 	}
4772 
4773 	return (DDI_SUCCESS);
4774 }
4775 
4776 /*
4777  * These pair of routines are used to set/clear the condition that
4778  * the caller is likely to do something to change the id_mac_state.
4779  * If there's already someone doing either a start or a stop (possibly
4780  * due to the async handler detecting a pkey relocation event, a plumb
4781  * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
4782  * that's done.
4783  */
4784 static void
4785 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
4786 {
4787 	mutex_enter(&state->id_macst_lock);
4788 	while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
4789 		cv_wait(&state->id_macst_cv, &state->id_macst_lock);
4790 
4791 	state->id_mac_state |= flag;
4792 	mutex_exit(&state->id_macst_lock);
4793 }
4794 
4795 static void
4796 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
4797 {
4798 	mutex_enter(&state->id_macst_lock);
4799 	state->id_mac_state &= (~flag);
4800 	cv_signal(&state->id_macst_cv);
4801 	mutex_exit(&state->id_macst_lock);
4802 }
4803 
4804 /*
4805  * GLDv3 entry point to start hardware.
4806  */
4807 /*ARGSUSED*/
4808 static int
4809 ibd_m_start(void *arg)
4810 {
4811 	ibd_state_t *state = arg;
4812 	int	ret;
4813 
4814 	ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
4815 
4816 	ret = ibd_start(state);
4817 
4818 	ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
4819 
4820 	return (ret);
4821 }
4822 
4823 static int
4824 ibd_start(ibd_state_t *state)
4825 {
4826 	kthread_t *kht;
4827 	int err;
4828 	ibt_status_t ret;
4829 
4830 	if (state->id_mac_state & IBD_DRV_STARTED)
4831 		return (DDI_SUCCESS);
4832 
4833 	if (atomic_inc_32_nv(&state->id_running) != 1) {
4834 		DPRINT(10, "ibd_start: id_running is non-zero");
4835 		cmn_err(CE_WARN, "ibd_start: id_running was not 0\n");
4836 		atomic_dec_32(&state->id_running);
4837 		return (EINVAL);
4838 	}
4839 
4840 	/*
4841 	 * Get port details; if we fail here, very likely the port
4842 	 * state is inactive or the pkey can't be verified.
4843 	 */
4844 	if ((err = ibd_get_port_details(state)) != 0) {
4845 		DPRINT(10, "ibd_start: ibd_get_port_details() failed");
4846 		goto start_fail;
4847 	}
4848 	state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
4849 
4850 	/*
4851 	 * Find the IPoIB broadcast group
4852 	 */
4853 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
4854 		DPRINT(10, "ibd_start: ibd_find_bgroup() failed");
4855 		err = ENOTACTIVE;
4856 		goto start_fail;
4857 	}
4858 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
4859 
4860 	/*
4861 	 * Initialize per-interface caches and lists; if we fail here,
4862 	 * it is most likely due to a lack of resources
4863 	 */
4864 	if (ibd_acache_init(state) != DDI_SUCCESS) {
4865 		DPRINT(10, "ibd_start: ibd_acache_init() failed");
4866 		err = ENOMEM;
4867 		goto start_fail;
4868 	}
4869 	state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
4870 
4871 	/*
4872 	 * Allocate send and receive completion queues
4873 	 */
4874 	if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
4875 		DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
4876 		err = ENOMEM;
4877 		goto start_fail;
4878 	}
4879 	state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
4880 
4881 	/*
4882 	 * Setup a UD channel
4883 	 */
4884 	if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
4885 		err = ENOMEM;
4886 		DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
4887 		goto start_fail;
4888 	}
4889 	state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
4890 
4891 	/*
4892 	 * Allocate and initialize the tx buffer list
4893 	 */
4894 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
4895 		DPRINT(10, "ibd_start: ibd_init_txlist() failed");
4896 		err = ENOMEM;
4897 		goto start_fail;
4898 	}
4899 	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
4900 
4901 	/*
4902 	 * Create the send cq handler here
4903 	 */
4904 	ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
4905 	if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
4906 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
4907 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
4908 		    "failed, ret=%d", ret);
4909 		err = EINVAL;
4910 		goto start_fail;
4911 	}
4912 	state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
4913 
4914 	/*
4915 	 * Allocate and initialize the rx buffer list
4916 	 */
4917 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
4918 		DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
4919 		err = ENOMEM;
4920 		goto start_fail;
4921 	}
4922 	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
4923 
4924 	/*
4925 	 * Join IPoIB broadcast group
4926 	 */
4927 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
4928 		DPRINT(10, "ibd_start: ibd_join_group() failed");
4929 		err = ENOTACTIVE;
4930 		goto start_fail;
4931 	}
4932 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
4933 
4934 	/*
4935 	 * Create the async thread; thread_create never fails.
4936 	 */
4937 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
4938 	    TS_RUN, minclsyspri);
4939 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_async_thrid))
4940 	state->id_async_thrid = kht->t_did;
4941 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_async_thrid))
4942 	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
4943 
4944 	/*
4945 	 * When we did mac_register() in ibd_attach(), we didn't register
4946 	 * the real macaddr and we didn't have the true port mtu. Now that
4947 	 * we're almost ready, set the local mac address and broadcast
4948 	 * addresses and update gldv3 about the real values of these
4949 	 * parameters.
4950 	 */
4951 	if (state->id_enable_rc) {
4952 		ibd_h2n_mac(&state->id_macaddr,
4953 		    IBD_MAC_ADDR_RC + state->id_qpnum,
4954 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
4955 		ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum,
4956 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
4957 	} else {
4958 		ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
4959 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
4960 	}
4961 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
4962 	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
4963 
4964 	if (!state->id_enable_rc) {
4965 		(void) mac_maxsdu_update(state->id_mh, state->id_mtu
4966 		    - IPOIB_HDRSIZE);
4967 	}
4968 	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
4969 
4970 	/*
4971 	 * Setup the receive cq handler
4972 	 */
4973 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
4974 	if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
4975 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
4976 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
4977 		    "failed, ret=%d", ret);
4978 		err = EINVAL;
4979 		goto start_fail;
4980 	}
4981 	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
4982 
4983 	/*
4984 	 * Setup the subnet notices handler after we've initialized the acache/
4985 	 * mcache and started the async thread, both of which are required for
4986 	 * the trap handler to function properly.
4987 	 *
4988 	 * Now that the async thread has been started (and we've already done
4989 	 * a mac_register() during attach so mac_tx_update() can be called
4990 	 * if necessary without any problem), we can enable the trap handler
4991 	 * to queue requests to the async thread.
4992 	 */
4993 	ibt_register_subnet_notices(state->id_ibt_hdl,
4994 	    ibd_snet_notices_handler, state);
4995 	mutex_enter(&state->id_trap_lock);
4996 	state->id_trap_stop = B_FALSE;
4997 	mutex_exit(&state->id_trap_lock);
4998 	state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
4999 
5000 	if (state->id_enable_rc) {
5001 		if (state->rc_enable_srq) {
5002 			if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) {
5003 				if (ibd_rc_repost_srq_free_list(state) !=
5004 				    IBT_SUCCESS) {
5005 					err = ENOMEM;
5006 					goto start_fail;
5007 				}
5008 			} else {
5009 				/* Allocate SRQ resource */
5010 				if (ibd_rc_init_srq_list(state) !=
5011 				    IBT_SUCCESS) {
5012 					err = ENOMEM;
5013 					goto start_fail;
5014 				}
5015 				state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD;
5016 			}
5017 		}
5018 
5019 		if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) {
5020 			DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() "
5021 			    "failed");
5022 			err = ENOMEM;
5023 			goto start_fail;
5024 		}
5025 		state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD;
5026 
5027 		/* RC: begin to listen only after everything is available */
5028 		if (ibd_rc_listen(state) != IBT_SUCCESS) {
5029 			DPRINT(10, "ibd_start: ibd_rc_listen() failed");
5030 			err = EINVAL;
5031 			goto start_fail;
5032 		}
5033 		state->id_mac_state |= IBD_DRV_RC_LISTEN;
5034 	}
5035 
5036 	/*
5037 	 * Indicate link status to GLDv3 and higher layers. By default,
5038 	 * we assume we are in up state (which must have been true at
5039 	 * least at the time the broadcast mcg's were probed); if there
5040 	 * were any up/down transitions till the time we come here, the
5041 	 * async handler will have updated last known state, which we
5042 	 * use to tell GLDv3. The async handler will not send any
5043 	 * notifications to GLDv3 till we reach here in the initialization
5044 	 * sequence.
5045 	 */
5046 	state->id_mac_state |= IBD_DRV_STARTED;
5047 	mac_link_update(state->id_mh, state->id_link_state);
5048 
5049 	return (DDI_SUCCESS);
5050 
5051 start_fail:
5052 	/*
5053 	 * If we ran into a problem during ibd_start() and ran into
5054 	 * some other problem during undoing our partial work, we can't
5055 	 * do anything about it.  Ignore any errors we might get from
5056 	 * ibd_undo_start() and just return the original error we got.
5057 	 */
5058 	(void) ibd_undo_start(state, LINK_STATE_DOWN);
5059 	return (err);
5060 }
5061 
5062 /*
5063  * GLDv3 entry point to stop hardware from receiving packets.
5064  */
5065 /*ARGSUSED*/
5066 static void
5067 ibd_m_stop(void *arg)
5068 {
5069 	ibd_state_t *state = (ibd_state_t *)arg;
5070 
5071 	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
5072 
5073 	(void) ibd_undo_start(state, state->id_link_state);
5074 
5075 	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
5076 }
5077 
5078 /*
5079  * GLDv3 entry point to modify device's mac address. We do not
5080  * allow address modifications.
5081  */
5082 static int
5083 ibd_m_unicst(void *arg, const uint8_t *macaddr)
5084 {
5085 	ibd_state_t *state = arg;
5086 
5087 	/*
5088 	 * Don't bother even comparing the macaddr if we haven't
5089 	 * completed ibd_m_start().
5090 	 */
5091 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5092 		return (0);
5093 
5094 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
5095 		return (0);
5096 	else
5097 		return (EINVAL);
5098 }
5099 
5100 /*
5101  * The blocking part of the IBA join/leave operations are done out
5102  * of here on the async thread.
5103  */
5104 static void
5105 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
5106 {
5107 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
5108 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
5109 
5110 	if (op == IBD_ASYNC_JOIN) {
5111 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
5112 			ibd_print_warn(state, "Join multicast group failed :"
5113 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
5114 		}
5115 	} else {
5116 		/*
5117 		 * Here, we must search for the proper mcg_info and
5118 		 * use that to leave the group.
5119 		 */
5120 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
5121 	}
5122 }
5123 
5124 /*
5125  * GLDv3 entry point for multicast enable/disable requests.
5126  * This function queues the operation to the async thread and
5127  * return success for a valid multicast address.
5128  */
5129 static int
5130 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
5131 {
5132 	ibd_state_t *state = (ibd_state_t *)arg;
5133 	ipoib_mac_t maddr, *mcast;
5134 	ib_gid_t mgid;
5135 	ibd_req_t *req;
5136 
5137 	/*
5138 	 * If we haven't completed ibd_m_start(), async thread wouldn't
5139 	 * have been started and id_bcaddr wouldn't be set, so there's
5140 	 * no point in continuing.
5141 	 */
5142 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5143 		return (0);
5144 
5145 	/*
5146 	 * The incoming multicast address might not be aligned properly
5147 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
5148 	 * it to look like one though, to get the offsets of the mc gid,
5149 	 * since we know we are not going to dereference any values with
5150 	 * the ipoib_mac_t pointer.
5151 	 */
5152 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
5153 	mcast = &maddr;
5154 
5155 	/*
5156 	 * Check validity of MCG address. We could additionally check
5157 	 * that a enable/disable is not being issued on the "broadcast"
5158 	 * mcg, but since this operation is only invokable by privileged
5159 	 * programs anyway, we allow the flexibility to those dlpi apps.
5160 	 * Note that we do not validate the "scope" of the IBA mcg.
5161 	 */
5162 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
5163 		return (EINVAL);
5164 
5165 	/*
5166 	 * fill in multicast pkey and scope
5167 	 */
5168 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
5169 
5170 	/*
5171 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
5172 	 * nothing (i.e. we stay JOINed to the broadcast group done in
5173 	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
5174 	 * requires to be joined to broadcast groups at all times.
5175 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
5176 	 * depends on this.
5177 	 */
5178 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5179 		return (0);
5180 
5181 	ibd_n2h_gid(mcast, &mgid);
5182 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5183 	if (req == NULL)
5184 		return (ENOMEM);
5185 
5186 	req->rq_gid = mgid;
5187 
5188 	if (add) {
5189 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
5190 		    mgid.gid_prefix, mgid.gid_guid);
5191 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
5192 	} else {
5193 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
5194 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
5195 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
5196 	}
5197 	return (0);
5198 }
5199 
5200 /*
5201  * The blocking part of the IBA promiscuous operations are done
5202  * out of here on the async thread. The dlpireq parameter indicates
5203  * whether this invocation is due to a dlpi request or due to
5204  * a port up/down event.
5205  */
5206 static void
5207 ibd_async_unsetprom(ibd_state_t *state)
5208 {
5209 	ibd_mce_t *mce = list_head(&state->id_mc_non);
5210 	ib_gid_t mgid;
5211 
5212 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
5213 
5214 	while (mce != NULL) {
5215 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
5216 		mce = list_next(&state->id_mc_non, mce);
5217 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
5218 	}
5219 	state->id_prom_op = IBD_OP_NOTSTARTED;
5220 }
5221 
5222 /*
5223  * The blocking part of the IBA promiscuous operations are done
5224  * out of here on the async thread. The dlpireq parameter indicates
5225  * whether this invocation is due to a dlpi request or due to
5226  * a port up/down event.
5227  */
5228 static void
5229 ibd_async_setprom(ibd_state_t *state)
5230 {
5231 	ibt_mcg_attr_t mcg_attr;
5232 	ibt_mcg_info_t *mcg_info;
5233 	ib_gid_t mgid;
5234 	uint_t numg;
5235 	int i;
5236 	char ret = IBD_OP_COMPLETED;
5237 
5238 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
5239 
5240 	/*
5241 	 * Obtain all active MC groups on the IB fabric with
5242 	 * specified criteria (scope + Pkey + Qkey + mtu).
5243 	 */
5244 	bzero(&mcg_attr, sizeof (mcg_attr));
5245 	mcg_attr.mc_pkey = state->id_pkey;
5246 	mcg_attr.mc_scope = state->id_scope;
5247 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
5248 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
5249 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
5250 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
5251 	    IBT_SUCCESS) {
5252 		ibd_print_warn(state, "Could not get list of IBA multicast "
5253 		    "groups");
5254 		ret = IBD_OP_ERRORED;
5255 		goto done;
5256 	}
5257 
5258 	/*
5259 	 * Iterate over the returned mcg's and join as NonMember
5260 	 * to the IP mcg's.
5261 	 */
5262 	for (i = 0; i < numg; i++) {
5263 		/*
5264 		 * Do a NonMember JOIN on the MC group.
5265 		 */
5266 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
5267 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
5268 			ibd_print_warn(state, "IBA promiscuous mode missed "
5269 			    "multicast gid %016llx:%016llx",
5270 			    (u_longlong_t)mgid.gid_prefix,
5271 			    (u_longlong_t)mgid.gid_guid);
5272 	}
5273 
5274 	ibt_free_mcg_info(mcg_info, numg);
5275 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
5276 done:
5277 	state->id_prom_op = ret;
5278 }
5279 
5280 /*
5281  * GLDv3 entry point for multicast promiscuous enable/disable requests.
5282  * GLDv3 assumes phys state receives more packets than multi state,
5283  * which is not true for IPoIB. Thus, treat the multi and phys
5284  * promiscuous states the same way to work with GLDv3's assumption.
5285  */
5286 static int
5287 ibd_m_promisc(void *arg, boolean_t on)
5288 {
5289 	ibd_state_t *state = (ibd_state_t *)arg;
5290 	ibd_req_t *req;
5291 
5292 	/*
5293 	 * Async thread wouldn't have been started if we haven't
5294 	 * passed ibd_m_start()
5295 	 */
5296 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5297 		return (0);
5298 
5299 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5300 	if (req == NULL)
5301 		return (ENOMEM);
5302 	if (on) {
5303 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
5304 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
5305 	} else {
5306 		DPRINT(1, "ibd_m_promisc : unset_promisc");
5307 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
5308 	}
5309 
5310 	return (0);
5311 }
5312 
5313 /*
5314  * GLDv3 entry point for gathering statistics.
5315  */
5316 static int
5317 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
5318 {
5319 	ibd_state_t *state = (ibd_state_t *)arg;
5320 
5321 	switch (stat) {
5322 	case MAC_STAT_IFSPEED:
5323 		*val = state->id_link_speed;
5324 		break;
5325 	case MAC_STAT_MULTIRCV:
5326 		*val = state->id_multi_rcv;
5327 		break;
5328 	case MAC_STAT_BRDCSTRCV:
5329 		*val = state->id_brd_rcv;
5330 		break;
5331 	case MAC_STAT_MULTIXMT:
5332 		*val = state->id_multi_xmt;
5333 		break;
5334 	case MAC_STAT_BRDCSTXMT:
5335 		*val = state->id_brd_xmt;
5336 		break;
5337 	case MAC_STAT_RBYTES:
5338 		*val = state->id_rcv_bytes + state->rc_rcv_trans_byte
5339 		    + state->rc_rcv_copy_byte;
5340 		break;
5341 	case MAC_STAT_IPACKETS:
5342 		*val = state->id_rcv_pkt + state->rc_rcv_trans_pkt
5343 		    + state->rc_rcv_copy_pkt;
5344 		break;
5345 	case MAC_STAT_OBYTES:
5346 		*val = state->id_xmt_bytes + state->rc_xmt_bytes;
5347 		break;
5348 	case MAC_STAT_OPACKETS:
5349 		*val = state->id_xmt_pkt + state->rc_xmt_small_pkt +
5350 		    state->rc_xmt_fragmented_pkt +
5351 		    state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt;
5352 		break;
5353 	case MAC_STAT_OERRORS:
5354 		*val = state->id_ah_error;	/* failed AH translation */
5355 		break;
5356 	case MAC_STAT_IERRORS:
5357 		*val = 0;
5358 		break;
5359 	case MAC_STAT_NOXMTBUF:
5360 		*val = state->id_tx_short + state->rc_swqe_short +
5361 		    state->rc_xmt_buf_short;
5362 		break;
5363 	case MAC_STAT_NORCVBUF:
5364 	default:
5365 		return (ENOTSUP);
5366 	}
5367 
5368 	return (0);
5369 }
5370 
5371 static void
5372 ibd_async_txsched(ibd_state_t *state)
5373 {
5374 	ibd_resume_transmission(state);
5375 }
5376 
5377 static void
5378 ibd_resume_transmission(ibd_state_t *state)
5379 {
5380 	int flag;
5381 	int met_thresh = 0;
5382 	int thresh = 0;
5383 	int ret = -1;
5384 
5385 	mutex_enter(&state->id_sched_lock);
5386 	if (state->id_sched_needed & IBD_RSRC_SWQE) {
5387 		mutex_enter(&state->id_tx_list.dl_mutex);
5388 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
5389 		met_thresh = state->id_tx_list.dl_cnt +
5390 		    state->id_tx_rel_list.dl_cnt;
5391 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5392 		mutex_exit(&state->id_tx_list.dl_mutex);
5393 		thresh = IBD_FREE_SWQES_THRESH;
5394 		flag = IBD_RSRC_SWQE;
5395 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
5396 		ASSERT(state->id_lso != NULL);
5397 		mutex_enter(&state->id_lso_lock);
5398 		met_thresh = state->id_lso->bkt_nfree;
5399 		thresh = IBD_FREE_LSOS_THRESH;
5400 		mutex_exit(&state->id_lso_lock);
5401 		flag = IBD_RSRC_LSOBUF;
5402 		if (met_thresh > thresh)
5403 			state->id_sched_lso_cnt++;
5404 	}
5405 	if (met_thresh > thresh) {
5406 		state->id_sched_needed &= ~flag;
5407 		state->id_sched_cnt++;
5408 		ret = 0;
5409 	}
5410 	mutex_exit(&state->id_sched_lock);
5411 
5412 	if (ret == 0)
5413 		mac_tx_update(state->id_mh);
5414 }
5415 
5416 /*
5417  * Release the send wqe back into free list.
5418  */
5419 static void
5420 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
5421 {
5422 	/*
5423 	 * Add back on Tx list for reuse.
5424 	 */
5425 	ASSERT(tail->swqe_next == NULL);
5426 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
5427 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
5428 	tail->swqe_next = state->id_tx_rel_list.dl_head;
5429 	state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
5430 	state->id_tx_rel_list.dl_cnt += n;
5431 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
5432 }
5433 
5434 /*
5435  * Acquire a send wqe from free list.
5436  * Returns error number and send wqe pointer.
5437  */
5438 static ibd_swqe_t *
5439 ibd_acquire_swqe(ibd_state_t *state)
5440 {
5441 	ibd_swqe_t *wqe;
5442 
5443 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
5444 	if (state->id_tx_rel_list.dl_head != NULL) {
5445 		/* transfer id_tx_rel_list to id_tx_list */
5446 		state->id_tx_list.dl_head =
5447 		    state->id_tx_rel_list.dl_head;
5448 		state->id_tx_list.dl_cnt =
5449 		    state->id_tx_rel_list.dl_cnt;
5450 		state->id_tx_list.dl_pending_sends = B_FALSE;
5451 
5452 		/* clear id_tx_rel_list */
5453 		state->id_tx_rel_list.dl_head = NULL;
5454 		state->id_tx_rel_list.dl_cnt = 0;
5455 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5456 
5457 		wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
5458 		state->id_tx_list.dl_cnt -= 1;
5459 		state->id_tx_list.dl_head = wqe->swqe_next;
5460 	} else {	/* no free swqe */
5461 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5462 		state->id_tx_list.dl_pending_sends = B_TRUE;
5463 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
5464 		state->id_tx_short++;
5465 		wqe = NULL;
5466 	}
5467 	return (wqe);
5468 }
5469 
5470 static int
5471 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
5472     ibt_ud_dest_hdl_t ud_dest)
5473 {
5474 	mblk_t	*nmp;
5475 	int iph_len, tcph_len;
5476 	ibt_wr_lso_t *lso;
5477 	uintptr_t ip_start, tcp_start;
5478 	uint8_t *dst;
5479 	uint_t pending, mblen;
5480 
5481 	/*
5482 	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
5483 	 * we need to adjust it here for lso.
5484 	 */
5485 	lso = &(node->w_swr.wr.ud_lso);
5486 	lso->lso_ud_dest = ud_dest;
5487 	lso->lso_mss = mss;
5488 
5489 	/*
5490 	 * Calculate the LSO header size and set it in the UD LSO structure.
5491 	 * Note that the only assumption we make is that each of the IPoIB,
5492 	 * IP and TCP headers will be contained in a single mblk fragment;
5493 	 * together, the headers may span multiple mblk fragments.
5494 	 */
5495 	nmp = mp;
5496 	ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
5497 	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
5498 		ip_start = (uintptr_t)nmp->b_cont->b_rptr
5499 		    + (ip_start - (uintptr_t)(nmp->b_wptr));
5500 		nmp = nmp->b_cont;
5501 
5502 	}
5503 	iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
5504 
5505 	tcp_start = ip_start + iph_len;
5506 	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
5507 		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
5508 		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
5509 		nmp = nmp->b_cont;
5510 	}
5511 	tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
5512 	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
5513 
5514 	/*
5515 	 * If the lso header fits entirely within a single mblk fragment,
5516 	 * we'll avoid an additional copy of the lso header here and just
5517 	 * pass the b_rptr of the mblk directly.
5518 	 *
5519 	 * If this isn't true, we'd have to allocate for it explicitly.
5520 	 */
5521 	if (lso->lso_hdr_sz <= MBLKL(mp)) {
5522 		lso->lso_hdr = mp->b_rptr;
5523 	} else {
5524 		/* On work completion, remember to free this allocated hdr */
5525 		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
5526 		if (lso->lso_hdr == NULL) {
5527 			DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
5528 			    "sz = %d", lso->lso_hdr_sz);
5529 			lso->lso_hdr_sz = 0;
5530 			lso->lso_mss = 0;
5531 			return (-1);
5532 		}
5533 	}
5534 
5535 	/*
5536 	 * Copy in the lso header only if we need to
5537 	 */
5538 	if (lso->lso_hdr != mp->b_rptr) {
5539 		dst = lso->lso_hdr;
5540 		pending = lso->lso_hdr_sz;
5541 
5542 		for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
5543 			mblen = MBLKL(nmp);
5544 			if (pending > mblen) {
5545 				bcopy(nmp->b_rptr, dst, mblen);
5546 				dst += mblen;
5547 				pending -= mblen;
5548 			} else {
5549 				bcopy(nmp->b_rptr, dst, pending);
5550 				break;
5551 			}
5552 		}
5553 	}
5554 
5555 	return (0);
5556 }
5557 
5558 static void
5559 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
5560 {
5561 	ibt_wr_lso_t *lso;
5562 
5563 	if ((!node) || (!mp))
5564 		return;
5565 
5566 	/*
5567 	 * Free any header space that we might've allocated if we
5568 	 * did an LSO
5569 	 */
5570 	if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
5571 		lso = &(node->w_swr.wr.ud_lso);
5572 		if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
5573 			kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
5574 			lso->lso_hdr = NULL;
5575 			lso->lso_hdr_sz = 0;
5576 		}
5577 	}
5578 }
5579 
5580 static void
5581 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
5582 {
5583 	uint_t		i;
5584 	uint_t		num_posted;
5585 	uint_t		n_wrs;
5586 	ibt_status_t	ibt_status;
5587 	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
5588 	ibd_swqe_t	*tx_head, *elem;
5589 	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];
5590 
5591 	/* post the one request, then check for more */
5592 	ibt_status = ibt_post_send(state->id_chnl_hdl,
5593 	    &node->w_swr, 1, NULL);
5594 	if (ibt_status != IBT_SUCCESS) {
5595 		ibd_print_warn(state, "ibd_post_send: "
5596 		    "posting one wr failed: ret=%d", ibt_status);
5597 		ibd_tx_cleanup(state, node);
5598 	}
5599 
5600 	tx_head = NULL;
5601 	for (;;) {
5602 		if (tx_head == NULL) {
5603 			mutex_enter(&state->id_txpost_lock);
5604 			tx_head = state->id_tx_head;
5605 			if (tx_head == NULL) {
5606 				state->id_tx_busy = 0;
5607 				mutex_exit(&state->id_txpost_lock);
5608 				return;
5609 			}
5610 			state->id_tx_head = NULL;
5611 			mutex_exit(&state->id_txpost_lock);
5612 		}
5613 
5614 		/*
5615 		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
5616 		 * at a time if possible, and keep posting them.
5617 		 */
5618 		for (n_wrs = 0, elem = tx_head;
5619 		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
5620 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
5621 			nodes[n_wrs] = elem;
5622 			wrs[n_wrs] = elem->w_swr;
5623 		}
5624 		tx_head = elem;
5625 
5626 		ASSERT(n_wrs != 0);
5627 
5628 		/*
5629 		 * If posting fails for some reason, we'll never receive
5630 		 * completion intimation, so we'll need to cleanup. But
5631 		 * we need to make sure we don't clean up nodes whose
5632 		 * wrs have been successfully posted. We assume that the
5633 		 * hca driver returns on the first failure to post and
5634 		 * therefore the first 'num_posted' entries don't need
5635 		 * cleanup here.
5636 		 */
5637 		num_posted = 0;
5638 		ibt_status = ibt_post_send(state->id_chnl_hdl,
5639 		    wrs, n_wrs, &num_posted);
5640 		if (ibt_status != IBT_SUCCESS) {
5641 			ibd_print_warn(state, "ibd_post_send: "
5642 			    "posting multiple wrs failed: "
5643 			    "requested=%d, done=%d, ret=%d",
5644 			    n_wrs, num_posted, ibt_status);
5645 
5646 			for (i = num_posted; i < n_wrs; i++)
5647 				ibd_tx_cleanup(state, nodes[i]);
5648 		}
5649 	}
5650 }
5651 
5652 static int
5653 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
5654     uint_t lsohdr_sz)
5655 {
5656 	ibt_wr_ds_t *sgl;
5657 	ibt_status_t ibt_status;
5658 	mblk_t *nmp;
5659 	mblk_t *data_mp;
5660 	uchar_t *bufp;
5661 	size_t blksize;
5662 	size_t skip;
5663 	size_t avail;
5664 	uint_t pktsize;
5665 	uint_t frag_len;
5666 	uint_t pending_hdr;
5667 	int nmblks;
5668 	int i;
5669 
5670 	/*
5671 	 * Let's skip ahead to the data if this is LSO
5672 	 */
5673 	data_mp = mp;
5674 	pending_hdr = 0;
5675 	if (lsohdr_sz) {
5676 		pending_hdr = lsohdr_sz;
5677 		for (nmp = mp; nmp; nmp = nmp->b_cont) {
5678 			frag_len = nmp->b_wptr - nmp->b_rptr;
5679 			if (frag_len > pending_hdr)
5680 				break;
5681 			pending_hdr -= frag_len;
5682 		}
5683 		data_mp = nmp;	/* start of data past lso header */
5684 		ASSERT(data_mp != NULL);
5685 	}
5686 
5687 	/*
5688 	 * Calculate the size of message data and number of msg blocks
5689 	 */
5690 	pktsize = 0;
5691 	for (nmblks = 0, nmp = data_mp; nmp != NULL;
5692 	    nmp = nmp->b_cont, nmblks++) {
5693 		pktsize += MBLKL(nmp);
5694 	}
5695 	pktsize -= pending_hdr;
5696 
5697 	/*
5698 	 * We only do ibt_map_mem_iov() if the pktsize is above the
5699 	 * "copy-threshold", and if the number of mp fragments is less than
5700 	 * the maximum acceptable.
5701 	 */
5702 	if ((state->id_hca_res_lkey_capab) &&
5703 	    (pktsize > IBD_TX_COPY_THRESH) &&
5704 	    (nmblks < state->id_max_sqseg_hiwm)) {
5705 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
5706 		ibt_iov_attr_t iov_attr;
5707 
5708 		iov_attr.iov_as = NULL;
5709 		iov_attr.iov = iov_arr;
5710 		iov_attr.iov_buf = NULL;
5711 		iov_attr.iov_list_len = nmblks;
5712 		iov_attr.iov_wr_nds = state->id_max_sqseg;
5713 		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
5714 		iov_attr.iov_flags = IBT_IOV_SLEEP;
5715 
5716 		for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
5717 			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
5718 			iov_arr[i].iov_len = MBLKL(nmp);
5719 			if (i == 0) {
5720 				iov_arr[i].iov_addr += pending_hdr;
5721 				iov_arr[i].iov_len -= pending_hdr;
5722 			}
5723 		}
5724 
5725 		node->w_buftype = IBD_WQE_MAPPED;
5726 		node->w_swr.wr_sgl = node->w_sgl;
5727 
5728 		ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
5729 		    (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
5730 		if (ibt_status != IBT_SUCCESS) {
5731 			ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
5732 			    "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
5733 			goto ibd_copy_path;
5734 		}
5735 
5736 		return (0);
5737 	}
5738 
5739 ibd_copy_path:
5740 	if (pktsize <= state->id_tx_buf_sz) {
5741 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
5742 		node->w_swr.wr_nds = 1;
5743 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
5744 		node->w_buftype = IBD_WQE_TXBUF;
5745 
5746 		/*
5747 		 * Even though this is the copy path for transfers less than
5748 		 * id_tx_buf_sz, it could still be an LSO packet.  If so, it
5749 		 * is possible the first data mblk fragment (data_mp) still
5750 		 * contains part of the LSO header that we need to skip.
5751 		 */
5752 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
5753 		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
5754 			blksize = MBLKL(nmp) - pending_hdr;
5755 			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
5756 			bufp += blksize;
5757 			pending_hdr = 0;
5758 		}
5759 
5760 		return (0);
5761 	}
5762 
5763 	/*
5764 	 * Copy path for transfers greater than id_tx_buf_sz
5765 	 */
5766 	node->w_swr.wr_sgl = node->w_sgl;
5767 	if (ibd_acquire_lsobufs(state, pktsize,
5768 	    node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
5769 		DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
5770 		return (-1);
5771 	}
5772 	node->w_buftype = IBD_WQE_LSOBUF;
5773 
5774 	/*
5775 	 * Copy the larger-than-id_tx_buf_sz packet into a set of
5776 	 * fixed-sized, pre-mapped LSO buffers. Note that we might
5777 	 * need to skip part of the LSO header in the first fragment
5778 	 * as before.
5779 	 */
5780 	nmp = data_mp;
5781 	skip = pending_hdr;
5782 	for (i = 0; i < node->w_swr.wr_nds; i++) {
5783 		sgl = node->w_swr.wr_sgl + i;
5784 		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
5785 		avail = IBD_LSO_BUFSZ;
5786 		while (nmp && avail) {
5787 			blksize = MBLKL(nmp) - skip;
5788 			if (blksize > avail) {
5789 				bcopy(nmp->b_rptr + skip, bufp, avail);
5790 				skip += avail;
5791 				avail = 0;
5792 			} else {
5793 				bcopy(nmp->b_rptr + skip, bufp, blksize);
5794 				skip = 0;
5795 				avail -= blksize;
5796 				bufp += blksize;
5797 				nmp = nmp->b_cont;
5798 			}
5799 		}
5800 	}
5801 
5802 	return (0);
5803 }
5804 
5805 /*
5806  * Schedule a completion queue polling to reap the resource we're
5807  * short on.  If we implement the change to reap tx completions
5808  * in a separate thread, we'll need to wake up that thread here.
5809  */
5810 static int
5811 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
5812 {
5813 	ibd_req_t *req;
5814 
5815 	mutex_enter(&state->id_sched_lock);
5816 	state->id_sched_needed |= resource_type;
5817 	mutex_exit(&state->id_sched_lock);
5818 
5819 	/*
5820 	 * If we are asked to queue a work entry, we need to do it
5821 	 */
5822 	if (q_flag) {
5823 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5824 		if (req == NULL)
5825 			return (-1);
5826 
5827 		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
5828 	}
5829 
5830 	return (0);
5831 }
5832 
5833 /*
5834  * The passed in packet has this format:
5835  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
5836  */
5837 static boolean_t
5838 ibd_send(ibd_state_t *state, mblk_t *mp)
5839 {
5840 	ibd_ace_t *ace;
5841 	ibd_swqe_t *node;
5842 	ipoib_mac_t *dest;
5843 	ib_header_info_t *ipibp;
5844 	ip6_t *ip6h;
5845 	uint_t pktsize;
5846 	uint32_t mss;
5847 	uint32_t hckflags;
5848 	uint32_t lsoflags = 0;
5849 	uint_t lsohdr_sz = 0;
5850 	int ret, len;
5851 	boolean_t dofree = B_FALSE;
5852 	boolean_t rc;
5853 	/* if (rc_chan == NULL) send by UD; else send by RC; */
5854 	ibd_rc_chan_t *rc_chan;
5855 	int nmblks;
5856 	mblk_t *nmp;
5857 
5858 	/*
5859 	 * If we aren't done with the device initialization and start,
5860 	 * we shouldn't be here.
5861 	 */
5862 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5863 		return (B_FALSE);
5864 
5865 	/*
5866 	 * Obtain an address handle for the destination.
5867 	 */
5868 	ipibp = (ib_header_info_t *)mp->b_rptr;
5869 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
5870 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5871 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
5872 
5873 	rc_chan = NULL;
5874 	ace = ibd_acache_lookup(state, dest, &ret, 1);
5875 	if (state->id_enable_rc && (ace != NULL) &&
5876 	    (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) {
5877 		if (ace->ac_chan == NULL) {
5878 			state->rc_null_conn++;
5879 		} else {
5880 			if (ace->ac_chan->chan_state ==
5881 			    IBD_RC_STATE_ACT_ESTAB) {
5882 				rc_chan = ace->ac_chan;
5883 				mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
5884 				node = WQE_TO_SWQE(
5885 				    rc_chan->tx_wqe_list.dl_head);
5886 				if (node != NULL) {
5887 					rc_chan->tx_wqe_list.dl_cnt -= 1;
5888 					rc_chan->tx_wqe_list.dl_head =
5889 					    node->swqe_next;
5890 				} else {
5891 					node = ibd_rc_acquire_swqes(rc_chan);
5892 				}
5893 				mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
5894 
5895 				if (node == NULL) {
5896 					state->rc_swqe_short++;
5897 					mutex_enter(&state->id_sched_lock);
5898 					state->id_sched_needed |=
5899 					    IBD_RSRC_RC_SWQE;
5900 					mutex_exit(&state->id_sched_lock);
5901 					ibd_dec_ref_ace(state, ace);
5902 					return (B_FALSE);
5903 				}
5904 			} else {
5905 				state->rc_no_estab_conn++;
5906 			}
5907 		}
5908 	}
5909 
5910 	if (rc_chan == NULL) {
5911 		mutex_enter(&state->id_tx_list.dl_mutex);
5912 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
5913 		if (node != NULL) {
5914 			state->id_tx_list.dl_cnt -= 1;
5915 			state->id_tx_list.dl_head = node->swqe_next;
5916 		} else {
5917 			node = ibd_acquire_swqe(state);
5918 		}
5919 		mutex_exit(&state->id_tx_list.dl_mutex);
5920 		if (node == NULL) {
5921 			/*
5922 			 * If we don't have an swqe available, schedule a
5923 			 * transmit completion queue cleanup and hold off on
5924 			 * sending more packets until we have some free swqes
5925 			 */
5926 			if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) {
5927 				if (ace != NULL) {
5928 					ibd_dec_ref_ace(state, ace);
5929 				}
5930 				return (B_FALSE);
5931 			}
5932 
5933 			/*
5934 			 * If a poll cannot be scheduled, we have no choice but
5935 			 * to drop this packet
5936 			 */
5937 			ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
5938 			if (ace != NULL) {
5939 				ibd_dec_ref_ace(state, ace);
5940 			}
5941 			return (B_TRUE);
5942 		}
5943 	}
5944 
5945 	/*
5946 	 * Initialize the commonly used fields in swqe to NULL to protect
5947 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
5948 	 * failure.
5949 	 */
5950 	node->swqe_im_mblk = NULL;
5951 	node->w_swr.wr_nds = 0;
5952 	node->w_swr.wr_sgl = NULL;
5953 	node->w_swr.wr_opcode = IBT_WRC_SEND;
5954 
5955 	/*
5956 	 * Calculate the size of message data and number of msg blocks
5957 	 */
5958 	pktsize = 0;
5959 	for (nmblks = 0, nmp = mp; nmp != NULL;
5960 	    nmp = nmp->b_cont, nmblks++) {
5961 		pktsize += MBLKL(nmp);
5962 	}
5963 
5964 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5965 		atomic_inc_64(&state->id_brd_xmt);
5966 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5967 		atomic_inc_64(&state->id_multi_xmt);
5968 
5969 	if (ace != NULL) {
5970 		node->w_ahandle = ace;
5971 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
5972 	} else {
5973 		DPRINT(5,
5974 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
5975 		    ((ret == EFAULT) ? "failed" : "queued"),
5976 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
5977 		    htonl(dest->ipoib_gidpref[1]),
5978 		    htonl(dest->ipoib_gidsuff[0]),
5979 		    htonl(dest->ipoib_gidsuff[1]));
5980 		state->rc_ace_not_found++;
5981 		node->w_ahandle = NULL;
5982 
5983 		/*
5984 		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
5985 		 * can not find a path for the specific dest address. We
5986 		 * should get rid of this kind of packet.  We also should get
5987 		 * rid of the packet if we cannot schedule a poll via the
5988 		 * async thread.  For the normal case, ibd will return the
5989 		 * packet to upper layer and wait for AH creating.
5990 		 *
5991 		 * Note that we always queue a work slot entry for the async
5992 		 * thread when we fail AH lookup (even in intr mode); this is
5993 		 * due to the convoluted way the code currently looks for AH.
5994 		 */
5995 		if (ret == EFAULT) {
5996 			dofree = B_TRUE;
5997 			rc = B_TRUE;
5998 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
5999 			dofree = B_TRUE;
6000 			rc = B_TRUE;
6001 		} else {
6002 			dofree = B_FALSE;
6003 			rc = B_FALSE;
6004 		}
6005 		goto ibd_send_fail;
6006 	}
6007 
6008 	/*
6009 	 * For ND6 packets, padding is at the front of the source lladdr.
6010 	 * Insert the padding at front.
6011 	 */
6012 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
6013 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
6014 			if (!pullupmsg(mp, IPV6_HDR_LEN +
6015 			    sizeof (ib_header_info_t))) {
6016 				DPRINT(10, "ibd_send: pullupmsg failure ");
6017 				dofree = B_TRUE;
6018 				rc = B_TRUE;
6019 				goto ibd_send_fail;
6020 			}
6021 			ipibp = (ib_header_info_t *)mp->b_rptr;
6022 		}
6023 		ip6h = (ip6_t *)((uchar_t *)ipibp +
6024 		    sizeof (ib_header_info_t));
6025 		len = ntohs(ip6h->ip6_plen);
6026 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
6027 			mblk_t	*pad;
6028 
6029 			pad = allocb(4, 0);
6030 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
6031 			linkb(mp, pad);
6032 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
6033 			    IPV6_HDR_LEN + len + 4) {
6034 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
6035 				    IPV6_HDR_LEN + len + 4)) {
6036 					DPRINT(10, "ibd_send: pullupmsg "
6037 					    "failure ");
6038 					dofree = B_TRUE;
6039 					rc = B_TRUE;
6040 					goto ibd_send_fail;
6041 				}
6042 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
6043 				    sizeof (ib_header_info_t));
6044 			}
6045 
6046 			/* LINTED: E_CONSTANT_CONDITION */
6047 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
6048 		}
6049 	}
6050 
6051 	ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
6052 	mp->b_rptr += sizeof (ib_addrs_t);
6053 	pktsize -= sizeof (ib_addrs_t);
6054 
6055 	if (rc_chan) {	/* send in RC mode */
6056 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
6057 		ibt_iov_attr_t iov_attr;
6058 		uint_t		i;
6059 		size_t	blksize;
6060 		uchar_t *bufp;
6061 		ibd_rc_tx_largebuf_t *lbufp;
6062 
6063 		atomic_add_64(&state->rc_xmt_bytes, pktsize);
6064 
6065 		/*
6066 		 * Upper layer does Tx checksum, we don't need do any
6067 		 * checksum here.
6068 		 */
6069 		ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
6070 
6071 		/*
6072 		 * We only do ibt_map_mem_iov() if the pktsize is above
6073 		 * the "copy-threshold", and if the number of mp
6074 		 * fragments is less than the maximum acceptable.
6075 		 */
6076 		if (pktsize <= ibd_rc_tx_copy_thresh) {
6077 			atomic_inc_64(&state->rc_xmt_small_pkt);
6078 			/*
6079 			 * Only process unicast packet in Reliable Connected
6080 			 * mode.
6081 			 */
6082 			node->swqe_copybuf.ic_sgl.ds_len = pktsize;
6083 			node->w_swr.wr_nds = 1;
6084 			node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
6085 			node->w_buftype = IBD_WQE_TXBUF;
6086 
6087 			bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
6088 			for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
6089 				blksize = MBLKL(nmp);
6090 				bcopy(nmp->b_rptr, bufp, blksize);
6091 				bufp += blksize;
6092 			}
6093 			freemsg(mp);
6094 			ASSERT(node->swqe_im_mblk == NULL);
6095 		} else {
6096 			if ((state->rc_enable_iov_map) &&
6097 			    (nmblks < state->rc_max_sqseg_hiwm)) {
6098 
6099 				/* do ibt_map_mem_iov() */
6100 				iov_attr.iov_as = NULL;
6101 				iov_attr.iov = iov_arr;
6102 				iov_attr.iov_buf = NULL;
6103 				iov_attr.iov_wr_nds = state->rc_tx_max_sqseg;
6104 				iov_attr.iov_lso_hdr_sz = 0;
6105 				iov_attr.iov_flags = IBT_IOV_SLEEP;
6106 
6107 				i = 0;
6108 				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
6109 					iov_arr[i].iov_len = MBLKL(nmp);
6110 					if (iov_arr[i].iov_len != 0) {
6111 						iov_arr[i].iov_addr = (caddr_t)
6112 						    (void *)nmp->b_rptr;
6113 						i++;
6114 					}
6115 				}
6116 				iov_attr.iov_list_len = i;
6117 				node->w_swr.wr_sgl = node->w_sgl;
6118 
6119 				ret = ibt_map_mem_iov(state->id_hca_hdl,
6120 				    &iov_attr, (ibt_all_wr_t *)&node->w_swr,
6121 				    &node->w_mi_hdl);
6122 				if (ret != IBT_SUCCESS) {
6123 					atomic_inc_64(
6124 					    &state->rc_xmt_map_fail_pkt);
6125 					DPRINT(30, "ibd_send: ibt_map_mem_iov("
6126 					    ") failed, nmblks=%d, real_nmblks"
6127 					    "=%d, ret=0x%x", nmblks, i, ret);
6128 					goto ibd_rc_large_copy;
6129 				}
6130 
6131 				atomic_inc_64(&state->rc_xmt_map_succ_pkt);
6132 				node->w_buftype = IBD_WQE_MAPPED;
6133 				node->swqe_im_mblk = mp;
6134 			} else {
6135 				atomic_inc_64(&state->rc_xmt_fragmented_pkt);
6136 ibd_rc_large_copy:
6137 				mutex_enter(&state->rc_tx_large_bufs_lock);
6138 				if (state->rc_tx_largebuf_nfree == 0) {
6139 					state->rc_xmt_buf_short++;
6140 					mutex_exit
6141 					    (&state->rc_tx_large_bufs_lock);
6142 					mutex_enter(&state->id_sched_lock);
6143 					state->id_sched_needed |=
6144 					    IBD_RSRC_RC_TX_LARGEBUF;
6145 					mutex_exit(&state->id_sched_lock);
6146 					dofree = B_FALSE;
6147 					rc = B_FALSE;
6148 					/*
6149 					 * If we don't have Tx large bufs,
6150 					 * return failure. node->w_buftype
6151 					 * should not be IBD_WQE_RC_COPYBUF,
6152 					 * otherwise it will cause problem
6153 					 * in ibd_rc_tx_cleanup()
6154 					 */
6155 					node->w_buftype = IBD_WQE_TXBUF;
6156 					goto ibd_send_fail;
6157 				}
6158 
6159 				lbufp = state->rc_tx_largebuf_free_head;
6160 				ASSERT(lbufp->lb_buf != NULL);
6161 				state->rc_tx_largebuf_free_head =
6162 				    lbufp->lb_next;
6163 				lbufp->lb_next = NULL;
6164 				/* Update nfree count */
6165 				state->rc_tx_largebuf_nfree --;
6166 				mutex_exit(&state->rc_tx_large_bufs_lock);
6167 				bufp = lbufp->lb_buf;
6168 				node->w_sgl[0].ds_va =
6169 				    (ib_vaddr_t)(uintptr_t)bufp;
6170 				node->w_sgl[0].ds_key =
6171 				    state->rc_tx_mr_desc.md_lkey;
6172 				node->w_sgl[0].ds_len = pktsize;
6173 				node->w_swr.wr_sgl = node->w_sgl;
6174 				node->w_swr.wr_nds = 1;
6175 				node->w_buftype = IBD_WQE_RC_COPYBUF;
6176 				node->w_rc_tx_largebuf = lbufp;
6177 
6178 				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
6179 					blksize = MBLKL(nmp);
6180 					if (blksize != 0) {
6181 						bcopy(nmp->b_rptr, bufp,
6182 						    blksize);
6183 						bufp += blksize;
6184 					}
6185 				}
6186 				freemsg(mp);
6187 				ASSERT(node->swqe_im_mblk == NULL);
6188 			}
6189 		}
6190 
6191 		node->swqe_next = NULL;
6192 		mutex_enter(&rc_chan->tx_post_lock);
6193 		if (rc_chan->tx_busy) {
6194 			if (rc_chan->tx_head) {
6195 				rc_chan->tx_tail->swqe_next =
6196 				    SWQE_TO_WQE(node);
6197 			} else {
6198 				rc_chan->tx_head = node;
6199 			}
6200 			rc_chan->tx_tail = node;
6201 			mutex_exit(&rc_chan->tx_post_lock);
6202 		} else {
6203 			rc_chan->tx_busy = 1;
6204 			mutex_exit(&rc_chan->tx_post_lock);
6205 			ibd_rc_post_send(rc_chan, node);
6206 		}
6207 
6208 		return (B_TRUE);
6209 	} /* send by RC */
6210 
6211 	if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
6212 		/*
6213 		 * Too long pktsize. The packet size from GLD should <=
6214 		 * state->id_mtu + sizeof (ib_addrs_t)
6215 		 */
6216 		if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
6217 			ibd_req_t *req;
6218 
6219 			mutex_enter(&ace->tx_too_big_mutex);
6220 			if (ace->tx_too_big_ongoing) {
6221 				mutex_exit(&ace->tx_too_big_mutex);
6222 				state->rc_xmt_reenter_too_long_pkt++;
6223 				dofree = B_TRUE;
6224 			} else {
6225 				ace->tx_too_big_ongoing = B_TRUE;
6226 				mutex_exit(&ace->tx_too_big_mutex);
6227 				state->rc_xmt_icmp_too_long_pkt++;
6228 
6229 				req = kmem_cache_alloc(state->id_req_kmc,
6230 				    KM_NOSLEEP);
6231 				if (req == NULL) {
6232 					ibd_print_warn(state, "ibd_send: alloc "
6233 					    "ibd_req_t fail");
6234 					/* Drop it. */
6235 					dofree = B_TRUE;
6236 				} else {
6237 					req->rq_ptr = mp;
6238 					req->rq_ptr2 = ace;
6239 					ibd_queue_work_slot(state, req,
6240 					    IBD_ASYNC_RC_TOO_BIG);
6241 					dofree = B_FALSE;
6242 				}
6243 			}
6244 		} else {
6245 			ibd_print_warn(state, "Reliable Connected mode is on. "
6246 			    "Multicast packet length %d > %d is too long to "
6247 			    "send packet (%d > %d), drop it",
6248 			    pktsize, state->id_mtu);
6249 			state->rc_xmt_drop_too_long_pkt++;
6250 			/* Drop it. */
6251 			dofree = B_TRUE;
6252 		}
6253 		rc = B_TRUE;
6254 		goto ibd_send_fail;
6255 	}
6256 
6257 	atomic_add_64(&state->id_xmt_bytes, pktsize);
6258 	atomic_inc_64(&state->id_xmt_pkt);
6259 
6260 	/*
6261 	 * Do LSO and checksum related work here.  For LSO send, adjust the
6262 	 * ud destination, the opcode and the LSO header information to the
6263 	 * work request.
6264 	 */
6265 	mac_lso_get(mp, &mss, &lsoflags);
6266 	if ((lsoflags & HW_LSO) != HW_LSO) {
6267 		node->w_swr.wr_opcode = IBT_WRC_SEND;
6268 		lsohdr_sz = 0;
6269 	} else {
6270 		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
6271 			/*
6272 			 * The routine can only fail if there's no memory; we
6273 			 * can only drop the packet if this happens
6274 			 */
6275 			ibd_print_warn(state,
6276 			    "ibd_send: no memory, lso posting failed");
6277 			dofree = B_TRUE;
6278 			rc = B_TRUE;
6279 			goto ibd_send_fail;
6280 		}
6281 
6282 		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
6283 		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
6284 	}
6285 
6286 	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
6287 	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
6288 		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
6289 	else
6290 		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
6291 
6292 	/*
6293 	 * Prepare the sgl for posting; the routine can only fail if there's
6294 	 * no lso buf available for posting. If this is the case, we should
6295 	 * probably resched for lso bufs to become available and then try again.
6296 	 */
6297 	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
6298 		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
6299 			dofree = B_TRUE;
6300 			rc = B_TRUE;
6301 		} else {
6302 			dofree = B_FALSE;
6303 			rc = B_FALSE;
6304 		}
6305 		goto ibd_send_fail;
6306 	}
6307 	node->swqe_im_mblk = mp;
6308 
6309 	/*
6310 	 * Queue the wqe to hardware; since we can now simply queue a
6311 	 * post instead of doing it serially, we cannot assume anything
6312 	 * about the 'node' after ibd_post_send() returns.
6313 	 */
6314 	node->swqe_next = NULL;
6315 
6316 	mutex_enter(&state->id_txpost_lock);
6317 	if (state->id_tx_busy) {
6318 		if (state->id_tx_head) {
6319 			state->id_tx_tail->swqe_next =
6320 			    SWQE_TO_WQE(node);
6321 		} else {
6322 			state->id_tx_head = node;
6323 		}
6324 		state->id_tx_tail = node;
6325 		mutex_exit(&state->id_txpost_lock);
6326 	} else {
6327 		state->id_tx_busy = 1;
6328 		mutex_exit(&state->id_txpost_lock);
6329 		ibd_post_send(state, node);
6330 	}
6331 
6332 	return (B_TRUE);
6333 
6334 ibd_send_fail:
6335 	if (node && mp)
6336 		ibd_free_lsohdr(node, mp);
6337 
6338 	if (dofree)
6339 		freemsg(mp);
6340 
6341 	if (node != NULL) {
6342 		if (rc_chan) {
6343 			ibd_rc_tx_cleanup(node);
6344 		} else {
6345 			ibd_tx_cleanup(state, node);
6346 		}
6347 	}
6348 
6349 	return (rc);
6350 }
6351 
6352 /*
6353  * GLDv3 entry point for transmitting datagram.
6354  */
6355 static mblk_t *
6356 ibd_m_tx(void *arg, mblk_t *mp)
6357 {
6358 	ibd_state_t *state = (ibd_state_t *)arg;
6359 	mblk_t *next;
6360 
6361 	if (state->id_link_state != LINK_STATE_UP) {
6362 		freemsgchain(mp);
6363 		mp = NULL;
6364 	}
6365 
6366 	while (mp != NULL) {
6367 		next = mp->b_next;
6368 		mp->b_next = NULL;
6369 		if (ibd_send(state, mp) == B_FALSE) {
6370 			/* Send fail */
6371 			mp->b_next = next;
6372 			break;
6373 		}
6374 		mp = next;
6375 	}
6376 
6377 	return (mp);
6378 }
6379 
6380 /*
6381  * this handles Tx and Rx completions. With separate CQs, this handles
6382  * only Rx completions.
6383  */
6384 static uint_t
6385 ibd_intr(caddr_t arg)
6386 {
6387 	ibd_state_t *state = (ibd_state_t *)arg;
6388 
6389 	ibd_poll_rcq(state, state->id_rcq_hdl);
6390 
6391 	return (DDI_INTR_CLAIMED);
6392 }
6393 
6394 /*
6395  * Poll and fully drain the send cq
6396  */
6397 static void
6398 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
6399 {
6400 	ibt_wc_t *wcs = state->id_txwcs;
6401 	uint_t numwcs = state->id_txwcs_size;
6402 	ibd_wqe_t *wqe;
6403 	ibd_swqe_t *head, *tail;
6404 	ibt_wc_t *wc;
6405 	uint_t num_polled;
6406 	int i;
6407 
6408 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
6409 		head = tail = NULL;
6410 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
6411 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
6412 			if (wc->wc_status != IBT_WC_SUCCESS) {
6413 				/*
6414 				 * Channel being torn down.
6415 				 */
6416 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
6417 					DPRINT(5, "ibd_drain_scq: flush error");
6418 					DPRINT(10, "ibd_drain_scq: Bad "
6419 					    "status %d", wc->wc_status);
6420 				} else {
6421 					DPRINT(10, "ibd_drain_scq: "
6422 					    "unexpected wc_status %d",
6423 					    wc->wc_status);
6424 				}
6425 				/*
6426 				 * Fallthrough to invoke the Tx handler to
6427 				 * release held resources, e.g., AH refcount.
6428 				 */
6429 			}
6430 			/*
6431 			 * Add this swqe to the list to be cleaned up.
6432 			 */
6433 			if (head)
6434 				tail->swqe_next = wqe;
6435 			else
6436 				head = WQE_TO_SWQE(wqe);
6437 			tail = WQE_TO_SWQE(wqe);
6438 		}
6439 		tail->swqe_next = NULL;
6440 		ibd_tx_cleanup_list(state, head, tail);
6441 
6442 		/*
6443 		 * Resume any blocked transmissions if possible
6444 		 */
6445 		ibd_resume_transmission(state);
6446 	}
6447 }
6448 
6449 /*
6450  * Poll and fully drain the receive cq
6451  */
6452 static void
6453 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
6454 {
6455 	ibt_wc_t *wcs = state->id_rxwcs;
6456 	uint_t numwcs = state->id_rxwcs_size;
6457 	ibd_rwqe_t *rwqe;
6458 	ibt_wc_t *wc;
6459 	uint_t num_polled;
6460 	int i;
6461 	mblk_t *head, *tail, *mp;
6462 
6463 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
6464 		head = tail = NULL;
6465 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
6466 			rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
6467 			if (wc->wc_status != IBT_WC_SUCCESS) {
6468 				/*
6469 				 * Channel being torn down.
6470 				 */
6471 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
6472 					DPRINT(5, "ibd_drain_rcq: "
6473 					    "expected flushed rwqe");
6474 				} else {
6475 					DPRINT(5, "ibd_drain_rcq: "
6476 					    "unexpected wc_status %d",
6477 					    wc->wc_status);
6478 				}
6479 				atomic_inc_32(
6480 				    &state->id_rx_list.dl_bufs_outstanding);
6481 				freemsg(rwqe->rwqe_im_mblk);
6482 				continue;
6483 			}
6484 			mp = ibd_process_rx(state, rwqe, wc);
6485 			if (mp == NULL)
6486 				continue;
6487 
6488 			/*
6489 			 * Add this mp to the list to send to the nw layer.
6490 			 */
6491 			if (head)
6492 				tail->b_next = mp;
6493 			else
6494 				head = mp;
6495 			tail = mp;
6496 		}
6497 		if (head)
6498 			mac_rx(state->id_mh, state->id_rh, head);
6499 
6500 		/*
6501 		 * Account for #rwqes polled.
6502 		 * Post more here, if less than one fourth full.
6503 		 */
6504 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
6505 		    (state->id_num_rwqe / 4))
6506 			ibd_post_recv_intr(state);
6507 	}
6508 }
6509 
6510 /*
6511  * Common code for interrupt handling as well as for polling
6512  * for all completed wqe's while detaching.
6513  */
6514 static void
6515 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
6516 {
6517 	int flag, redo_flag;
6518 	int redo = 1;
6519 
6520 	flag = IBD_CQ_POLLING;
6521 	redo_flag = IBD_REDO_CQ_POLLING;
6522 
6523 	mutex_enter(&state->id_scq_poll_lock);
6524 	if (state->id_scq_poll_busy & flag) {
6525 		ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
6526 		state->id_scq_poll_busy |= redo_flag;
6527 		mutex_exit(&state->id_scq_poll_lock);
6528 		return;
6529 	}
6530 	state->id_scq_poll_busy |= flag;
6531 	mutex_exit(&state->id_scq_poll_lock);
6532 
6533 	/*
6534 	 * In some cases (eg detaching), this code can be invoked on
6535 	 * any cpu after disabling cq notification (thus no concurrency
6536 	 * exists). Apart from that, the following applies normally:
6537 	 * Transmit completion handling could be from any cpu if
6538 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
6539 	 * is interrupt driven.
6540 	 */
6541 
6542 	/*
6543 	 * Poll and drain the CQ
6544 	 */
6545 	ibd_drain_scq(state, cq_hdl);
6546 
6547 	/*
6548 	 * Enable CQ notifications and redrain the cq to catch any
6549 	 * completions we might have missed after the ibd_drain_scq()
6550 	 * above and before the ibt_enable_cq_notify() that follows.
6551 	 * Finally, service any new requests to poll the cq that
6552 	 * could've come in after the ibt_enable_cq_notify().
6553 	 */
6554 	do {
6555 		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
6556 		    IBT_SUCCESS) {
6557 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
6558 		}
6559 
6560 		ibd_drain_scq(state, cq_hdl);
6561 
6562 		mutex_enter(&state->id_scq_poll_lock);
6563 		if (state->id_scq_poll_busy & redo_flag)
6564 			state->id_scq_poll_busy &= ~redo_flag;
6565 		else {
6566 			state->id_scq_poll_busy &= ~flag;
6567 			redo = 0;
6568 		}
6569 		mutex_exit(&state->id_scq_poll_lock);
6570 
6571 	} while (redo);
6572 }
6573 
6574 /*
6575  * Common code for interrupt handling as well as for polling
6576  * for all completed wqe's while detaching.
6577  */
6578 static void
6579 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
6580 {
6581 	int flag, redo_flag;
6582 	int redo = 1;
6583 
6584 	flag = IBD_CQ_POLLING;
6585 	redo_flag = IBD_REDO_CQ_POLLING;
6586 
6587 	mutex_enter(&state->id_rcq_poll_lock);
6588 	if (state->id_rcq_poll_busy & flag) {
6589 		ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
6590 		state->id_rcq_poll_busy |= redo_flag;
6591 		mutex_exit(&state->id_rcq_poll_lock);
6592 		return;
6593 	}
6594 	state->id_rcq_poll_busy |= flag;
6595 	mutex_exit(&state->id_rcq_poll_lock);
6596 
6597 	/*
6598 	 * Poll and drain the CQ
6599 	 */
6600 	ibd_drain_rcq(state, rcq);
6601 
6602 	/*
6603 	 * Enable CQ notifications and redrain the cq to catch any
6604 	 * completions we might have missed after the ibd_drain_cq()
6605 	 * above and before the ibt_enable_cq_notify() that follows.
6606 	 * Finally, service any new requests to poll the cq that
6607 	 * could've come in after the ibt_enable_cq_notify().
6608 	 */
6609 	do {
6610 		if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
6611 		    IBT_SUCCESS) {
6612 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
6613 		}
6614 
6615 		ibd_drain_rcq(state, rcq);
6616 
6617 		mutex_enter(&state->id_rcq_poll_lock);
6618 		if (state->id_rcq_poll_busy & redo_flag)
6619 			state->id_rcq_poll_busy &= ~redo_flag;
6620 		else {
6621 			state->id_rcq_poll_busy &= ~flag;
6622 			redo = 0;
6623 		}
6624 		mutex_exit(&state->id_rcq_poll_lock);
6625 
6626 	} while (redo);
6627 }
6628 
6629 /*
6630  * Unmap the memory area associated with a given swqe.
6631  */
6632 void
6633 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
6634 {
6635 	ibt_status_t stat;
6636 
6637 	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
6638 
6639 	if (swqe->w_mi_hdl) {
6640 		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
6641 		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
6642 			DPRINT(10,
6643 			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
6644 		}
6645 		swqe->w_mi_hdl = NULL;
6646 	}
6647 	swqe->w_swr.wr_nds = 0;
6648 }
6649 
6650 void
6651 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
6652 {
6653 	/*
6654 	 * The recycling logic can be eliminated from here
6655 	 * and put into the async thread if we create another
6656 	 * list to hold ACE's for unjoined mcg's.
6657 	 */
6658 	if (DEC_REF_DO_CYCLE(ace)) {
6659 		ibd_mce_t *mce;
6660 
6661 		/*
6662 		 * Check with the lock taken: we decremented
6663 		 * reference count without the lock, and some
6664 		 * transmitter might already have bumped the
6665 		 * reference count (possible in case of multicast
6666 		 * disable when we leave the AH on the active
6667 		 * list). If not still 0, get out, leaving the
6668 		 * recycle bit intact.
6669 		 *
6670 		 * Atomically transition the AH from active
6671 		 * to free list, and queue a work request to
6672 		 * leave the group and destroy the mce. No
6673 		 * transmitter can be looking at the AH or
6674 		 * the MCE in between, since we have the
6675 		 * ac_mutex lock. In the SendOnly reap case,
6676 		 * it is not necessary to hold the ac_mutex
6677 		 * and recheck the ref count (since the AH was
6678 		 * taken off the active list), we just do it
6679 		 * to have uniform processing with the Full
6680 		 * reap case.
6681 		 */
6682 		mutex_enter(&state->id_ac_mutex);
6683 		mce = ace->ac_mce;
6684 		if (GET_REF_CYCLE(ace) == 0) {
6685 			CLEAR_REFCYCLE(ace);
6686 			/*
6687 			 * Identify the case of fullmember reap as
6688 			 * opposed to mcg trap reap. Also, port up
6689 			 * might set ac_mce to NULL to indicate Tx
6690 			 * cleanup should do no more than put the
6691 			 * AH in the free list (see ibd_async_link).
6692 			 */
6693 			if (mce != NULL) {
6694 				ace->ac_mce = NULL;
6695 				IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
6696 				/*
6697 				 * mc_req was initialized at mce
6698 				 * creation time.
6699 				 */
6700 				ibd_queue_work_slot(state,
6701 				    &mce->mc_req, IBD_ASYNC_REAP);
6702 			}
6703 			IBD_ACACHE_INSERT_FREE(state, ace);
6704 		}
6705 		mutex_exit(&state->id_ac_mutex);
6706 	}
6707 }
6708 
6709 /*
6710  * Common code that deals with clean ups after a successful or
6711  * erroneous transmission attempt.
6712  */
6713 static void
6714 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
6715 {
6716 	ibd_ace_t *ace = swqe->w_ahandle;
6717 
6718 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
6719 
6720 	/*
6721 	 * If this was a dynamic mapping in ibd_send(), we need to
6722 	 * unmap here. If this was an lso buffer we'd used for sending,
6723 	 * we need to release the lso buf to the pool, since the resource
6724 	 * is scarce. However, if this was simply a normal send using
6725 	 * the copybuf (present in each swqe), we don't need to release it.
6726 	 */
6727 	if (swqe->swqe_im_mblk != NULL) {
6728 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
6729 			ibd_unmap_mem(state, swqe);
6730 		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
6731 			ibd_release_lsobufs(state,
6732 			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
6733 		}
6734 		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
6735 		freemsg(swqe->swqe_im_mblk);
6736 		swqe->swqe_im_mblk = NULL;
6737 	}
6738 
6739 	/*
6740 	 * Drop the reference count on the AH; it can be reused
6741 	 * now for a different destination if there are no more
6742 	 * posted sends that will use it. This can be eliminated
6743 	 * if we can always associate each Tx buffer with an AH.
6744 	 * The ace can be null if we are cleaning up from the
6745 	 * ibd_send() error path.
6746 	 */
6747 	if (ace != NULL) {
6748 		ibd_dec_ref_ace(state, ace);
6749 	}
6750 
6751 	/*
6752 	 * Release the send wqe for reuse.
6753 	 */
6754 	swqe->swqe_next = NULL;
6755 	ibd_release_swqe(state, swqe, swqe, 1);
6756 }
6757 
6758 static void
6759 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
6760 {
6761 	ibd_ace_t *ace;
6762 	ibd_swqe_t *swqe;
6763 	int n = 0;
6764 
6765 	DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
6766 
6767 	for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
6768 
6769 		/*
6770 		 * If this was a dynamic mapping in ibd_send(), we need to
6771 		 * unmap here. If this was an lso buffer we'd used for sending,
6772 		 * we need to release the lso buf to the pool, since the
6773 		 * resource is scarce. However, if this was simply a normal
6774 		 * send using the copybuf (present in each swqe), we don't need
6775 		 * to release it.
6776 		 */
6777 		if (swqe->swqe_im_mblk != NULL) {
6778 			if (swqe->w_buftype == IBD_WQE_MAPPED) {
6779 				ibd_unmap_mem(state, swqe);
6780 			} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
6781 				ibd_release_lsobufs(state,
6782 				    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
6783 			}
6784 			ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
6785 			freemsg(swqe->swqe_im_mblk);
6786 			swqe->swqe_im_mblk = NULL;
6787 		}
6788 
6789 		/*
6790 		 * Drop the reference count on the AH; it can be reused
6791 		 * now for a different destination if there are no more
6792 		 * posted sends that will use it. This can be eliminated
6793 		 * if we can always associate each Tx buffer with an AH.
6794 		 * The ace can be null if we are cleaning up from the
6795 		 * ibd_send() error path.
6796 		 */
6797 		ace = swqe->w_ahandle;
6798 		if (ace != NULL) {
6799 			ibd_dec_ref_ace(state, ace);
6800 		}
6801 		n++;
6802 	}
6803 
6804 	/*
6805 	 * Release the send wqes for reuse.
6806 	 */
6807 	ibd_release_swqe(state, head, tail, n);
6808 }
6809 
6810 /*
6811  * Processing to be done after receipt of a packet; hand off to GLD
6812  * in the format expected by GLD.  The received packet has this
6813  * format: 2b sap :: 00 :: data.
6814  */
6815 static mblk_t *
6816 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
6817 {
6818 	ib_header_info_t *phdr;
6819 	mblk_t *mp;
6820 	ipoib_hdr_t *ipibp;
6821 	ipha_t *iphap;
6822 	ip6_t *ip6h;
6823 	int len;
6824 	ib_msglen_t pkt_len = wc->wc_bytes_xfer;
6825 	uint32_t bufs;
6826 
6827 	/*
6828 	 * Track number handed to upper layer that need to be returned.
6829 	 */
6830 	bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);
6831 
6832 	/* Never run out of rwqes, use allocb when running low */
6833 	if (bufs >= state->id_rx_bufs_outstanding_limit) {
6834 		atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
6835 		atomic_inc_32(&state->id_rx_allocb);
6836 		mp = allocb(pkt_len, BPRI_HI);
6837 		if (mp) {
6838 			bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
6839 			ibd_post_recv(state, rwqe);
6840 		} else {	/* no memory */
6841 			atomic_inc_32(&state->id_rx_allocb_failed);
6842 			ibd_post_recv(state, rwqe);
6843 			return (NULL);
6844 		}
6845 	} else {
6846 		mp = rwqe->rwqe_im_mblk;
6847 	}
6848 
6849 
6850 	/*
6851 	 * Adjust write pointer depending on how much data came in.
6852 	 */
6853 	mp->b_wptr = mp->b_rptr + pkt_len;
6854 
6855 	/*
6856 	 * Make sure this is NULL or we're in trouble.
6857 	 */
6858 	if (mp->b_next != NULL) {
6859 		ibd_print_warn(state,
6860 		    "ibd_process_rx: got duplicate mp from rcq?");
6861 		mp->b_next = NULL;
6862 	}
6863 
6864 	/*
6865 	 * the IB link will deliver one of the IB link layer
6866 	 * headers called, the Global Routing Header (GRH).
6867 	 * ibd driver uses the information in GRH to build the
6868 	 * Header_info structure and pass it with the datagram up
6869 	 * to GLDv3.
6870 	 * If the GRH is not valid, indicate to GLDv3 by setting
6871 	 * the VerTcFlow field to 0.
6872 	 */
6873 	phdr = (ib_header_info_t *)mp->b_rptr;
6874 	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
6875 		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
6876 
6877 		/* if it is loop back packet, just drop it. */
6878 		if (state->id_enable_rc) {
6879 			if (bcmp(&phdr->ib_grh.ipoib_sqpn,
6880 			    &state->rc_macaddr_loopback,
6881 			    IPOIB_ADDRL) == 0) {
6882 				freemsg(mp);
6883 				return (NULL);
6884 			}
6885 		} else {
6886 			if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
6887 			    IPOIB_ADDRL) == 0) {
6888 				freemsg(mp);
6889 				return (NULL);
6890 			}
6891 		}
6892 
6893 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
6894 		    sizeof (ipoib_mac_t));
6895 		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
6896 			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
6897 			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
6898 		} else {
6899 			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
6900 		}
6901 	} else {
6902 		/*
6903 		 * It can not be a IBA multicast packet. Must have been
6904 		 * unicast for us. Just copy the interface address to dst.
6905 		 */
6906 		phdr->ib_grh.ipoib_vertcflow = 0;
6907 		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
6908 		    sizeof (ipoib_mac_t));
6909 	}
6910 
6911 	/*
6912 	 * For ND6 packets, padding is at the front of the source/target
6913 	 * lladdr. However the inet6 layer is not aware of it, hence remove
6914 	 * the padding from such packets.
6915 	 */
6916 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
6917 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
6918 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6919 		len = ntohs(ip6h->ip6_plen);
6920 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
6921 			/* LINTED: E_CONSTANT_CONDITION */
6922 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
6923 		}
6924 	}
6925 
6926 	/*
6927 	 * Update statistics
6928 	 */
6929 	atomic_add_64(&state->id_rcv_bytes, pkt_len);
6930 	atomic_inc_64(&state->id_rcv_pkt);
6931 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6932 		atomic_inc_64(&state->id_brd_rcv);
6933 	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6934 		atomic_inc_64(&state->id_multi_rcv);
6935 
6936 	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6937 	/*
6938 	 * Set receive checksum status in mp
6939 	 * Hardware checksumming can be considered valid only if:
6940 	 * 1. CQE.IP_OK bit is set
6941 	 * 2. CQE.CKSUM = 0xffff
6942 	 * 3. IPv6 routing header is not present in the packet
6943 	 * 4. If there are no IP_OPTIONS in the IP HEADER
6944 	 */
6945 
6946 	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
6947 	    (wc->wc_cksum == 0xFFFF) &&
6948 	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
6949 		mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
6950 	}
6951 
6952 	return (mp);
6953 }
6954 
6955 /*
6956  * Callback code invoked from STREAMs when the receive data buffer is
6957  * free for recycling.
6958  */
6959 static void
6960 ibd_freemsg_cb(char *arg)
6961 {
6962 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
6963 	ibd_state_t *state = rwqe->w_state;
6964 
6965 	atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
6966 
6967 	/*
6968 	 * If the driver is stopped, just free the rwqe.
6969 	 */
6970 	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
6971 		DPRINT(6, "ibd_freemsg: wqe being freed");
6972 		rwqe->rwqe_im_mblk = NULL;
6973 		ibd_free_rwqe(state, rwqe);
6974 		return;
6975 	}
6976 
6977 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
6978 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
6979 	if (rwqe->rwqe_im_mblk == NULL) {
6980 		ibd_free_rwqe(state, rwqe);
6981 		DPRINT(6, "ibd_freemsg: desballoc failed");
6982 		return;
6983 	}
6984 
6985 	ibd_post_recv(state, rwqe);
6986 }
6987 
6988 static uint_t
6989 ibd_tx_recycle(caddr_t arg)
6990 {
6991 	ibd_state_t *state = (ibd_state_t *)arg;
6992 
6993 	/*
6994 	 * Poll for completed entries
6995 	 */
6996 	ibd_poll_scq(state, state->id_scq_hdl);
6997 
6998 	return (DDI_INTR_CLAIMED);
6999 }
7000 
7001 #ifdef IBD_LOGGING
7002 static void
7003 ibd_log_init(void)
7004 {
7005 	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
7006 	ibd_lbuf_ndx = 0;
7007 
7008 	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
7009 }
7010 
7011 static void
7012 ibd_log_fini(void)
7013 {
7014 	if (ibd_lbuf)
7015 		kmem_free(ibd_lbuf, IBD_LOG_SZ);
7016 	ibd_lbuf_ndx = 0;
7017 	ibd_lbuf = NULL;
7018 
7019 	mutex_destroy(&ibd_lbuf_lock);
7020 }
7021 
7022 static void
7023 ibd_log(const char *fmt, ...)
7024 {
7025 	va_list	ap;
7026 	uint32_t off;
7027 	uint32_t msglen;
7028 	char tmpbuf[IBD_DMAX_LINE];
7029 
7030 	if (ibd_lbuf == NULL)
7031 		return;
7032 
7033 	va_start(ap, fmt);
7034 	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
7035 	va_end(ap);
7036 
7037 	if (msglen >= IBD_DMAX_LINE)
7038 		msglen = IBD_DMAX_LINE - 1;
7039 
7040 	mutex_enter(&ibd_lbuf_lock);
7041 
7042 	off = ibd_lbuf_ndx;		/* current msg should go here */
7043 	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
7044 		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
7045 
7046 	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
7047 	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */
7048 
7049 	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
7050 		ibd_lbuf_ndx = 0;
7051 
7052 	mutex_exit(&ibd_lbuf_lock);
7053 
7054 	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
7055 }
7056 #endif
7057